LUCENE-4095: remove deprecations from trunk (just the easy ones for now)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344531 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-05-31 02:07:11 +00:00
parent 8deb16bcf9
commit bc3a3dc5d4
288 changed files with 817 additions and 20160 deletions

View File

@ -137,8 +137,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ?
new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
// the order here is important: the stopword list is not normalized!
result = new StopFilter( matchVersion, result, stopwords);

View File

@ -1,96 +0,0 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/**
* Tokenizer that breaks text into runs of letters and diacritics.
* <p>
* The problem with the standard Letter tokenizer is that it fails on diacritics.
* Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc.
* </p>
* <p>
* <a name="version"/>
* You must specify the required {@link Version} compatibility when creating
* {@link ArabicLetterTokenizer}:
* <ul>
* <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
* detect token characters. See {@link #isTokenChar(int)} and
* {@link #normalize(int)} for details.</li>
* </ul>
* @deprecated (3.1) Use {@link StandardTokenizer} instead.
*/
@Deprecated
public class ArabicLetterTokenizer extends LetterTokenizer {
/**
* Construct a new ArabicLetterTokenizer.
* @param matchVersion Lucene version
* to match See {@link <a href="#version">above</a>}
*
* @param in
* the input to split up into tokens
*/
public ArabicLetterTokenizer(Version matchVersion, Reader in) {
super(matchVersion, in);
}
/**
* Construct a new ArabicLetterTokenizer using a given {@link AttributeSource}.
*
* @param matchVersion
* Lucene version to match See {@link <a href="#version">above</a>}
* @param source
* the attribute source to use for this Tokenizer
* @param in
* the input to split up into tokens
*/
public ArabicLetterTokenizer(Version matchVersion, AttributeSource source, Reader in) {
super(matchVersion, source, in);
}
/**
* Construct a new ArabicLetterTokenizer using a given
* {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
* @param matchVersion
* Lucene version to match. See {@link <a href="#version">above</a>}
*
* @param factory
* the attribute factory to use for this Tokenizer
* @param in
* the input to split up into tokens
*/
public ArabicLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
super(matchVersion, factory, in);
}
/**
* Allows for Letter category or NonspacingMark category
* @see org.apache.lucene.analysis.core.LetterTokenizer#isTokenChar(int)
*/
@Override
protected boolean isTokenChar(int c) {
return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK;
}
}
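
The @deprecated note above points at StandardTokenizer as the replacement, which is the same swap the ArabicAnalyzer hunk earlier in this commit makes. A minimal, hypothetical migration sketch (the class name and the LUCENE_50 constant are illustrative, mirroring the updated ArabicAnalyzer chain):

import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

class ArabicTokenizerMigrationSketch {
  // Build the front of the chain the way the updated ArabicAnalyzer does:
  // StandardTokenizer instead of the removed ArabicLetterTokenizer.
  TokenStream build(Reader reader) {
    Tokenizer source = new StandardTokenizer(Version.LUCENE_50, reader);
    return new LowerCaseFilter(Version.LUCENE_50, source);
  }
}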

View File

@ -38,14 +38,6 @@ import org.tartarus.snowball.ext.CatalanStemmer;
/**
* {@link Analyzer} for Catalan.
* <p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating CatalanAnalyzer:
* <ul>
* <li> As of 3.6, ElisionFilter with a set of Catalan
* contractions is used by default.
* </ul>
*/
public final class CatalanAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
@ -126,8 +118,8 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
* @return A
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -135,9 +127,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
}
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -89,16 +89,11 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
// run the widthfilter first before bigramming, it sometimes combines characters.
TokenStream result = new CJKWidthFilter(source);
result = new LowerCaseFilter(matchVersion, result);
result = new CJKBigramFilter(result);
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
} else {
final Tokenizer source = new CJKTokenizer(reader);
return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
}
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
// run the widthfilter first before bigramming, it sometimes combines characters.
TokenStream result = new CJKWidthFilter(source);
result = new LowerCaseFilter(matchVersion, result);
result = new CJKBigramFilter(result);
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
}

View File

@ -1,317 +0,0 @@
package org.apache.lucene.analysis.cjk;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
* <p>
* The tokens returned are every two adjacent characters with overlap match.
* </p>
* <p>
* Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
* </p>
* Additionally, the following is applied to Latin text (such as English):
* <ul>
* <li>Text is converted to lowercase.
* <li>Numeric digits, '+', '#', and '_' are tokenized as letters.
* <li>Full-width forms are converted to half-width forms.
* </ul>
* For more info on Asian language (Chinese, Japanese, and Korean) text segmentation:
* please search <a
* href="http://www.google.com/search?q=word+chinese+segment">google</a>
*
* @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.
*/
@Deprecated
public final class CJKTokenizer extends Tokenizer {
//~ Static fields/initializers ---------------------------------------------
/** Word token type */
static final int WORD_TYPE = 0;
/** Single byte token type */
static final int SINGLE_TOKEN_TYPE = 1;
/** Double byte token type */
static final int DOUBLE_TOKEN_TYPE = 2;
/** Names for token types */
static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
/** Max word length */
private static final int MAX_WORD_LEN = 255;
/** buffer size: */
private static final int IO_BUFFER_SIZE = 256;
//~ Instance fields --------------------------------------------------------
/** word offset, used to imply which character(in ) is parsed */
private int offset = 0;
/** the index used only for ioBuffer */
private int bufferIndex = 0;
/** data length */
private int dataLen = 0;
/**
* character buffer, store the characters which are used to compose <br>
* the returned Token
*/
private final char[] buffer = new char[MAX_WORD_LEN];
/**
* I/O buffer, used to store the content of the input(one of the <br>
* members of Tokenizer)
*/
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
/** word type: single=>ASCII double=>non-ASCII word=>default */
private int tokenType = WORD_TYPE;
/**
* tag: previous character is a cached double-byte character "C1C2C3C4"
* ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
* C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
*/
private boolean preIsTokened = false;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
//~ Constructors -----------------------------------------------------------
/**
* Construct a token stream processing the given input.
*
* @param in I/O reader
*/
public CJKTokenizer(Reader in) {
super(in);
}
public CJKTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
public CJKTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
//~ Methods ----------------------------------------------------------------
/**
* Returns true for the next token in the stream, or false at EOS.
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
* for detail.
*
* @return false for end of stream, true otherwise
*
* @throws java.io.IOException - throw IOException when read error <br>
* happened in the InputStream
*
*/
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
/** how many character(s) has been stored in buffer */
while(true) { // loop until we find a non-empty token
int length = 0;
/** the position used to create Token */
int start = offset;
while (true) { // loop until we've found a full token
/** current character */
char c;
/** unicode block of current character for detail */
Character.UnicodeBlock ub;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
}
if (dataLen == -1) {
if (length > 0) {
if (preIsTokened == true) {
length = 0;
preIsTokened = false;
}
else{
offset--;
}
break;
} else {
offset--;
return false;
}
} else {
//get current character
c = ioBuffer[bufferIndex++];
//get the UnicodeBlock of the current character
ub = Character.UnicodeBlock.of(c);
}
//if the current character is ASCII or Extend ASCII
if ((ub == Character.UnicodeBlock.BASIC_LATIN)
|| (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
) {
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
int i = (int) c;
if (i >= 65281 && i <= 65374) {
// convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
i = i - 65248;
c = (char) i;
}
}
// if the current character is a letter or "_" "+" "#"
if (Character.isLetterOrDigit(c)
|| ((c == '_') || (c == '+') || (c == '#'))
) {
if (length == 0) {
// "javaC1C2C3C4linux" <br>
// ^--: the current character begin to token the ASCII
// letter
start = offset - 1;
} else if (tokenType == DOUBLE_TOKEN_TYPE) {
// "javaC1C2C3C4linux" <br>
// ^--: the previous non-ASCII
// : the current character
offset--;
bufferIndex--;
if (preIsTokened == true) {
// only one non-ASCII character has been stored
length = 0;
preIsTokened = false;
break;
} else {
break;
}
}
// store the LowerCase(c) in the buffer
buffer[length++] = Character.toLowerCase(c);
tokenType = SINGLE_TOKEN_TYPE;
// break the procedure if buffer overflowed!
if (length == MAX_WORD_LEN) {
break;
}
} else if (length > 0) {
if (preIsTokened == true) {
length = 0;
preIsTokened = false;
} else {
break;
}
}
} else {
// non-ASCII letter, e.g."C1C2C3C4"
if (Character.isLetter(c)) {
if (length == 0) {
start = offset - 1;
buffer[length++] = c;
tokenType = DOUBLE_TOKEN_TYPE;
} else {
if (tokenType == SINGLE_TOKEN_TYPE) {
offset--;
bufferIndex--;
//return the previous ASCII characters
break;
} else {
buffer[length++] = c;
tokenType = DOUBLE_TOKEN_TYPE;
if (length == 2) {
offset--;
bufferIndex--;
preIsTokened = true;
break;
}
}
}
} else if (length > 0) {
if (preIsTokened == true) {
// empty the buffer
length = 0;
preIsTokened = false;
} else {
break;
}
}
}
}
if (length > 0) {
termAtt.copyBuffer(buffer, 0, length);
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
return true;
} else if (dataLen == -1) {
offset--;
return false;
}
// Cycle back and try for the next token (don't
// return an empty string)
}
}
@Override
public final void end() {
// set final offset
final int finalOffset = correctOffset(offset);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset() throws IOException {
super.reset();
offset = bufferIndex = dataLen = 0;
preIsTokened = false;
tokenType = WORD_TYPE;
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
reset();
}
}
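
The deprecation note for CJKTokenizer names its replacement chain; a small, hypothetical sketch (class name and LUCENE_50 are illustrative) wiring those pieces in the same order the updated CJKAnalyzer above uses, with width folding before bigramming:

import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

class CJKTokenizerMigrationSketch {
  // Same component order as CJKAnalyzer.createComponents after this commit:
  // the width filter runs first because it can combine characters,
  // then lowercasing, then the overlapping bigrams.
  TokenStream build(Reader reader) {
    Tokenizer source = new StandardTokenizer(Version.LUCENE_50, reader);
    TokenStream result = new CJKWidthFilter(source);
    result = new LowerCaseFilter(Version.LUCENE_50, result);
    return new CJKBigramFilter(result);
  }
}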

View File

@ -1,50 +0,0 @@
package org.apache.lucene.analysis.cn;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
/**
* An {@link Analyzer} that tokenizes text with {@link ChineseTokenizer} and
* filters with {@link ChineseFilter}
* @deprecated (3.1) Use {@link StandardAnalyzer} instead, which has the same functionality.
* This analyzer will be removed in Lucene 5.0
*/
@Deprecated
public final class ChineseAnalyzer extends Analyzer {
/**
* Creates
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link ChineseTokenizer} filtered with
* {@link ChineseFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new ChineseTokenizer(reader);
return new TokenStreamComponents(source, new ChineseFilter(source));
}
}

View File

@ -1,104 +0,0 @@
package org.apache.lucene.analysis.cn;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
* A {@link TokenFilter} with a stop word table.
* <ul>
* <li>Numeric tokens are removed.
* <li>English tokens must be larger than 1 character.
* <li>One Chinese character as one Chinese word.
* </ul>
* TO DO:
* <ol>
* <li>Add Chinese stop words, such as \ue400
* <li>Dictionary based Chinese word extraction
* <li>Intelligent Chinese word extraction
* </ol>
*
* @deprecated (3.1) Use {@link StopFilter} instead, which has the same functionality.
* This filter will be removed in Lucene 5.0
*/
@Deprecated
public final class ChineseFilter extends TokenFilter {
// Only English now, Chinese to be added later.
public static final String[] STOP_WORDS = {
"and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
private CharArraySet stopTable;
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public ChineseFilter(TokenStream in) {
super(in);
stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false);
}
@Override
public boolean incrementToken() throws IOException {
while (input.incrementToken()) {
char text[] = termAtt.buffer();
int termLength = termAtt.length();
// why not key off token type here assuming ChineseTokenizer comes first?
if (!stopTable.contains(text, 0, termLength)) {
switch (Character.getType(text[0])) {
case Character.LOWERCASE_LETTER:
case Character.UPPERCASE_LETTER:
// English word/token should be larger than 1 character.
if (termLength>1) {
return true;
}
break;
case Character.OTHER_LETTER:
// One Chinese character as one Chinese word.
// Chinese word extraction to be added later here.
return true;
}
}
}
return false;
}
}

View File

@ -1,175 +0,0 @@
package org.apache.lucene.analysis.cn;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Tokenize Chinese text as individual chinese characters.
*
* <p>
* The difference between ChineseTokenizer and
* CJKTokenizer is that they have different
* token parsing logic.
* </p>
* <p>
* For example, if the Chinese text
* "C1C2C3C4" is to be indexed:
* <ul>
* <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
* <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
* </ul>
* </p>
* <p>
* Therefore the index created by CJKTokenizer is much larger.
* </p>
* <p>
* The problem is that when searching for C1, C1C2, C1C3,
* C4C2, C1C2C3 ... the ChineseTokenizer works, but the
* CJKTokenizer will not work.
* </p>
* @deprecated (3.1) Use {@link StandardTokenizer} instead, which has the same functionality.
* This filter will be removed in Lucene 5.0
*/
@Deprecated
public final class ChineseTokenizer extends Tokenizer {
public ChineseTokenizer(Reader in) {
super(in);
}
public ChineseTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
public ChineseTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
private int offset = 0, bufferIndex=0, dataLen=0;
private final static int MAX_WORD_LEN = 255;
private final static int IO_BUFFER_SIZE = 1024;
private final char[] buffer = new char[MAX_WORD_LEN];
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
private int length;
private int start;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final void push(char c) {
if (length == 0) start = offset-1; // start of token
buffer[length++] = Character.toLowerCase(c); // buffer it
}
private final boolean flush() {
if (length>0) {
//System.out.println(new String(buffer, 0,
//length));
termAtt.copyBuffer(buffer, 0, length);
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
return true;
}
else
return false;
}
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
length = 0;
start = offset;
while (true) {
final char c;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
}
if (dataLen == -1) {
offset--;
return flush();
} else
c = ioBuffer[bufferIndex++];
switch(Character.getType(c)) {
case Character.DECIMAL_DIGIT_NUMBER:
case Character.LOWERCASE_LETTER:
case Character.UPPERCASE_LETTER:
push(c);
if (length == MAX_WORD_LEN) return flush();
break;
case Character.OTHER_LETTER:
if (length>0) {
bufferIndex--;
offset--;
return flush();
}
push(c);
return flush();
default:
if (length>0) return flush();
break;
}
}
}
@Override
public final void end() {
// set final offset
final int finalOffset = correctOffset(offset);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset() throws IOException {
super.reset();
offset = bufferIndex = dataLen = 0;
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
reset();
}
}

View File

@ -1,41 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
Analyzer for Chinese, which indexes unigrams (individual chinese characters).
<p>
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
<ul>
<li>StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
<li>CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
<li>SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment Chinese text into words) as tokens.
</ul>
Example phrase "我是中国人"
<ol>
<li>StandardAnalyzer: 我-是-中-国-人</li>
<li>CJKAnalyzer: 我是-是中-中国-国人</li>
<li>SmartChineseAnalyzer: 我-是-中国-人</li>
</ol>
</p>
</body>
</html>
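
To see the token streams this page describes, here is a short, hypothetical demo (the class name, the field name "f", and LUCENE_50 are placeholders) that prints what an analyzer emits for the example phrase; swapping in StandardAnalyzer or SmartChineseAnalyzer would show the unigram and word-based outputs instead:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

class ChineseAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_50);
    // Expect the overlapping bigrams listed above: 我是 是中 中国 国人
    TokenStream ts = analyzer.tokenStream("f", new StringReader("我是中国人"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}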

View File

@ -40,17 +40,6 @@ import java.io.*;
* all). A default set of stopwords is used unless an alternative list is
* specified.
* </p>
*
* <a name="version"/>
* <p>
* You must specify the required {@link Version} compatibility when creating
* CzechAnalyzer:
* <ul>
* <li>As of 3.1, words are stemmed with {@link CzechStemFilter}
* <li>As of 2.9, StopFilter preserves position increments
* <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
* <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* </ul>
*/
public final class CzechAnalyzer extends StopwordAnalyzerBase {
/** File containing default Czech stopwords. */
@ -86,8 +75,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
*
* @param matchVersion Lucene version to match See
* {@link <a href="#version">above</a>}
* @param matchVersion Lucene version to match
*/
public CzechAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
@ -96,8 +84,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words.
*
* @param matchVersion Lucene version to match See
* {@link <a href="#version">above</a>}
* @param matchVersion Lucene version to match
* @param stopwords a stopword set
*/
public CzechAnalyzer(Version matchVersion, CharArraySet stopwords) {
@ -108,8 +95,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
* Builds an analyzer with the given stop words and a set of words to be
* excluded from the {@link CzechStemFilter}.
*
* @param matchVersion Lucene version to match See
* {@link <a href="#version">above</a>}
* @param matchVersion Lucene version to match
* @param stopwords a stopword set
* @param stemExclusionTable a stemming exclusion set
*/
@ -127,7 +113,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
* a version is >= LUCENE_31 and a stem exclusion set is provided via
* a stem exclusion set is provided via
* {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a
* {@link KeywordMarkerFilter} is added before
* {@link CzechStemFilter}.
@ -139,11 +125,9 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
if(!this.stemExclusionTable.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionTable);
result = new CzechStemFilter(result);
}
if(!this.stemExclusionTable.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionTable);
result = new CzechStemFilter(result);
return new TokenStreamComponents(source, result);
}
}

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.de;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -37,7 +36,6 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.German2Stemmer;
/**
* {@link Analyzer} for German language.
@ -49,39 +47,11 @@ import org.tartarus.snowball.ext.German2Stemmer;
* exclusion list is empty by default.
* </p>
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating GermanAnalyzer:
* <ul>
* <li> As of 3.6, GermanLightStemFilter is used for less aggressive stemming.
* <li> As of 3.1, Snowball stemming is done with SnowballFilter, and
* Snowball stopwords are used by default.
* <li> As of 2.9, StopFilter preserves position
* increments
* </ul>
*
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public final class GermanAnalyzer extends StopwordAnalyzerBase {
/** @deprecated in 3.1, remove in Lucene 5.0 (index bw compat) */
@Deprecated
private final static String[] GERMAN_STOP_WORDS = {
"einer", "eine", "eines", "einem", "einen",
"der", "die", "das", "dass", "daß",
"du", "er", "sie", "es",
"was", "wer", "wie", "wir",
"und", "oder", "ohne", "mit",
"am", "im", "in", "aus", "auf",
"ist", "sein", "war", "wird",
"ihr", "ihre", "ihres",
"als", "für", "von", "mit",
"dich", "dir", "mich", "mir",
"mein", "sein", "kein",
"durch", "wegen", "wird"
};
/** File containing default German stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "german_stop.txt";
@ -94,10 +64,6 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
}
private static class DefaultSetHolder {
/** @deprecated in 3.1, remove in Lucene 5.0 (index bw compat) */
@Deprecated
private static final CharArraySet DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
private static final CharArraySet DEFAULT_SET;
static {
try {
@ -125,9 +91,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
* {@link #getDefaultStopSet()}.
*/
public GermanAnalyzer(Version matchVersion) {
this(matchVersion,
matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET
: DefaultSetHolder.DEFAULT_SET_30);
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
/**
@ -176,14 +140,8 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
result = new KeywordMarkerFilter(result, exclusionSet);
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
result = new GermanNormalizationFilter(result);
result = new GermanLightStemFilter(result);
} else if (matchVersion.onOrAfter(Version.LUCENE_31)) {
result = new SnowballFilter(result, new German2Stemmer());
} else {
result = new GermanStemFilter(result);
}
result = new GermanNormalizationFilter(result);
result = new GermanLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
}

View File

@ -37,15 +37,6 @@ import org.apache.lucene.util.Version;
* that will not be indexed at all).
* A default set of stopwords is used unless an alternative list is specified.
* </p>
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating GreekAnalyzer:
* <ul>
* <li> As of 3.1, StandardFilter and GreekStemmer are used by default.
* <li> As of 2.9, StopFilter preserves position
* increments
* </ul>
*
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
@ -78,8 +69,7 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the default stop words.
* @param matchVersion Lucene compatibility version,
* See <a href="#version">above</a>
* @param matchVersion Lucene compatibility version
*/
public GreekAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
@ -91,8 +81,7 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
* <b>NOTE:</b> The stopwords set should be pre-processed with the logic of
* {@link GreekLowerCaseFilter} for best results.
*
* @param matchVersion Lucene compatibility version,
* See <a href="#version">above</a>
* @param matchVersion Lucene compatibility version
* @param stopwords a stopword set
*/
public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) {
@ -114,11 +103,9 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new StandardFilter(matchVersion, result);
result = new StandardFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new GreekStemFilter(result);
result = new GreekStemFilter(result);
return new TokenStreamComponents(source, result);
}
}

View File

@ -196,7 +196,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_50,
Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"),
false);
@ -222,7 +222,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_50,
Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
"αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
"μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",
@ -247,7 +247,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_50,
Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
"πεθ", "πικρ", "ποτ", "σιχ", "χ"),
false);
@ -274,11 +274,11 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_50,
Arrays.asList("τρ", "τσ"),
false);
private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_50,
Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
"καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
"π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",
@ -337,7 +337,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_50,
Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
"βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
"σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"),
@ -425,11 +425,11 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_50,
Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"),
false);
private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_50,
Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"),
false);
@ -449,7 +449,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_50,
Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"),
false);
@ -483,7 +483,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_50,
Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
"λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
"ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε",
@ -521,7 +521,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_50,
Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
"αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
"ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",
@ -530,7 +530,7 @@ public class GreekStemmer {
"ουλαμ", "ουρ", "π", "τρ", "μ"),
false);
private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_50,
Arrays.asList("ψοφ", "ναυλοχ"),
false);
@ -567,7 +567,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_50,
Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"),
false);
@ -587,7 +587,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_50,
Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"),
false);
@ -601,7 +601,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_50,
Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων"),
false);
@ -625,7 +625,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_31,
private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_50,
Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"),
false);

View File

@ -94,7 +94,8 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
* @return A
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* {@link StandardFilter}, {@link EnglishPossessiveFilter},
* {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* provided and {@link PorterStemFilter}.
*/
@ -103,9 +104,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
// prior to this we get the classic behavior, standardfilter does it for us.
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new EnglishPossessiveFilter(matchVersion, result);
result = new EnglishPossessiveFilter(matchVersion, result);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -26,30 +26,13 @@ import org.apache.lucene.util.Version;
/**
* TokenFilter that removes possessives (trailing 's) from words.
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating EnglishPossessiveFilter:
* <ul>
* <li> As of 3.6, U+2019 RIGHT SINGLE QUOTATION MARK and
* U+FF07 FULLWIDTH APOSTROPHE are also treated as
* quotation marks.
* </ul>
*/
public final class EnglishPossessiveFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private Version matchVersion;
/**
* @deprecated Use {@link #EnglishPossessiveFilter(Version, TokenStream)} instead.
*/
@Deprecated
public EnglishPossessiveFilter(TokenStream input) {
this(Version.LUCENE_35, input);
}
// NOTE: version now unused
public EnglishPossessiveFilter(Version version, TokenStream input) {
super(input);
this.matchVersion = version;
}
@Override
@ -63,7 +46,8 @@ public final class EnglishPossessiveFilter extends TokenFilter {
if (bufferLength >= 2 &&
(buffer[bufferLength-2] == '\'' ||
(matchVersion.onOrAfter(Version.LUCENE_36) && (buffer[bufferLength-2] == '\u2019' || buffer[bufferLength-2] == '\uFF07'))) &&
buffer[bufferLength-2] == '\u2019' ||
buffer[bufferLength-2] == '\uFF07') &&
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
termAtt.setLength(bufferLength - 2); // Strip last 2 characters off
}

View File

@ -281,9 +281,9 @@ public class KStemmer {
DictEntry entry;
CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(
Version.LUCENE_31, 1000, false);
Version.LUCENE_50, 1000, false);
d = new CharArrayMap<DictEntry>(Version.LUCENE_31, 1000, false);
d = new CharArrayMap<DictEntry>(Version.LUCENE_50, 1000, false);
for (int i = 0; i < exceptionWords.length; i++) {
if (!d.containsKey(exceptionWords[i])) {
entry = new DictEntry(exceptionWords[i], true);

View File

@ -34,17 +34,9 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.SpanishStemmer;
/**
* {@link Analyzer} for Spanish.
* <p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating SpanishAnalyzer:
* <ul>
* <li> As of 3.6, SpanishLightStemFilter is used for less aggressive stemming.
* </ul>
*/
public final class SpanishAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
@ -132,11 +124,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
result = new SpanishLightStemFilter(result);
} else {
result = new SnowballFilter(result, new SpanishStemmer());
}
result = new SpanishLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
}

View File

@ -24,7 +24,6 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
@ -36,7 +35,7 @@ import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Persian.
* <p>
* This Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around
* This Analyzer uses {@link PersianCharFilter} which implies tokenizing around
* zero-width non-joiner in addition to whitespace. Some persian-specific variant forms (such as farsi
* yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
* </p>
@ -118,12 +117,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source;
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
source = new StandardTokenizer(matchVersion, reader);
} else {
source = new ArabicLetterTokenizer(matchVersion, reader);
}
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
@ -140,8 +134,6 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
*/
@Override
protected Reader initReader(Reader reader) {
return matchVersion.onOrAfter(Version.LUCENE_31) ?
new PersianCharFilter(CharReader.get(reader)) :
reader;
return new PersianCharFilter(CharReader.get(reader));
}
}

View File

@ -35,7 +35,6 @@ import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
/**
* {@link Analyzer} for French language.
@ -47,53 +46,11 @@ import java.util.Arrays;
* exclusion list is empty by default.
* </p>
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating FrenchAnalyzer:
* <ul>
* <li> As of 3.6, FrenchLightStemFilter is used for less aggressive stemming.
* <li> As of 3.1, Snowball stemming is done with SnowballFilter,
* LowerCaseFilter is used prior to StopFilter, and ElisionFilter and
* Snowball stopwords are used by default.
* <li> As of 2.9, StopFilter preserves position
* increments
* </ul>
*
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public final class FrenchAnalyzer extends StopwordAnalyzerBase {
/**
* Extended list of typical French stopwords.
* @deprecated (3.1) remove in Lucene 5.0 (index bw compat)
*/
@Deprecated
private final static String[] FRENCH_STOP_WORDS = {
"a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
"autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
"c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
"certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
"combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
"dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles",
"desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
"diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
"en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
"hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
"le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "",
"ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
"moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
"nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "", "par", "parmi",
"partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
"proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
"qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
"seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
"son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
"tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
"voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès",
"été", "être", "ô"
};
/** File containing default French stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
@ -111,11 +68,6 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
}
private static class DefaultSetHolder {
/** @deprecated (3.1) remove this in Lucene 5.0, index bw compat */
@Deprecated
static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
false));
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
@ -133,9 +85,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet}).
*/
public FrenchAnalyzer(Version matchVersion) {
this(matchVersion,
matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
: DefaultSetHolder.DEFAULT_STOP_SET_30);
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
@ -182,30 +132,15 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new ElisionFilter(matchVersion, result);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
result = new FrenchLightStemFilter(result);
} else {
result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
}
return new TokenStreamComponents(source, result);
} else {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new StopFilter(matchVersion, result, stopwords);
if(!excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
result = new FrenchStemFilter(result);
// Convert to lowercase after stemming!
return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
}
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new ElisionFilter(matchVersion, result);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
result = new FrenchLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
}

View File

@ -1,90 +0,0 @@
package org.apache.lucene.analysis.fr;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
/**
* A {@link TokenFilter} that stems french words.
* <p>
* The used stemmer can be changed at runtime after the
* filter object is created (as long as it is a {@link FrenchStemmer}).
* </p>
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerFilter
* @deprecated (3.1) Use {@link SnowballFilter} with
* {@link org.tartarus.snowball.ext.FrenchStemmer} instead, which has the
* same functionality. This filter will be removed in Lucene 5.0
*/
@Deprecated
public final class FrenchStemFilter extends TokenFilter {
/**
* The actual token in the input stream.
*/
private FrenchStemmer stemmer = new FrenchStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
public FrenchStemFilter( TokenStream in ) {
super(in);
}
/**
* @return Returns true for the next token in the stream, or false at EOS
*/
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String term = termAtt.toString();
// Check the exclusion table
if (!keywordAttr.isKeyword()) {
String s = stemmer.stem( term );
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals( term ) )
termAtt.setEmpty().append(s);
}
return true;
} else {
return false;
}
}
/**
* Set an alternative/custom {@link FrenchStemmer} for this filter.
*/
public void setStemmer( FrenchStemmer stemmer ) {
if ( stemmer != null ) {
this.stemmer = stemmer;
}
}
}
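
The deprecation note points at SnowballFilter with the Snowball FrenchStemmer, the same combination the pre-3.6 FrenchAnalyzer code above used. A one-method, hypothetical sketch (class name is illustrative) of that swap:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;

class FrenchStemMigrationSketch {
  // Replace the removed FrenchStemFilter with the Snowball-based filter.
  TokenStream stem(TokenStream in) {
    return new SnowballFilter(in, new org.tartarus.snowball.ext.FrenchStemmer());
  }
}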

View File

@ -1,712 +0,0 @@
package org.apache.lucene.analysis.fr;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A stemmer for French words.
* <p>
* The algorithm is based on the work of
* Dr Martin Porter on his snowball project<br>
* refer to http://snowball.sourceforge.net/french/stemmer.html<br>
* (French stemming algorithm) for details
* </p>
* @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead,
* which has the same functionality. This filter will be removed in Lucene 4.0
*/
@Deprecated
public class FrenchStemmer {
/**
* Buffer for the terms while stemming them.
*/
private StringBuilder sb = new StringBuilder();
/**
* A temporary buffer, used to reconstruct R2
*/
private StringBuilder tb = new StringBuilder();
/**
* Region R0 is equal to the whole buffer
*/
private String R0;
/**
* Region RV
* "If the word begins with two vowels, RV is the region after the third letter,
* otherwise the region after the first vowel not at the beginning of the word,
* or the end of the word if these positions cannot be found."
*/
private String RV;
/**
* Region R1
* "R1 is the region after the first non-vowel following a vowel
* or is the null region at the end of the word if there is no such non-vowel"
*/
private String R1;
/**
* Region R2
* "R2 is the region after the first non-vowel in R1 following a vowel
* or is the null region at the end of the word if there is no such non-vowel"
*/
private String R2;
/**
* Set to true if we need to perform step 2
*/
private boolean suite;
/**
* Set to true if the buffer was modified
*/
private boolean modified;
/**
* Stems the given term to a unique <tt>discriminator</tt>.
*
* @param term java.lang.String The term that should be stemmed
* @return java.lang.String Discriminator for <tt>term</tt>
*/
protected String stem( String term ) {
if ( !isStemmable( term ) ) {
return term;
}
// Use lowercase for medium stemming.
term = term.toLowerCase();
// Reset the StringBuilder.
sb.delete( 0, sb.length() );
sb.insert( 0, term );
// reset the booleans
modified = false;
suite = false;
sb = treatVowels( sb );
setStrings();
step1();
if (!modified || suite)
{
if (RV != null)
{
suite = step2a();
if (!suite)
step2b();
}
}
if (modified || suite)
step3();
else
step4();
step5();
step6();
return sb.toString();
}
/**
* Sets the search region Strings<br>
* it needs to be done each time the buffer was modified
*/
private void setStrings() {
// set the strings
R0 = sb.toString();
RV = retrieveRV( sb );
R1 = retrieveR( sb );
if ( R1 != null )
{
tb.delete( 0, tb.length() );
tb.insert( 0, R1 );
R2 = retrieveR( tb );
}
else
R2 = null;
}
/**
* First step of the Porter Algorithm<br>
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
*/
private void step1( ) {
String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
deleteFrom( R2, suffix );
replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" );
deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
deleteFrom( RV, new String[] { "ements", "ement" } );
deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
String[] autre = { "ifs", "ives", "if", "ive" };
deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
replaceFrom( R0, new String[] { "eaux" }, "eau" );
replaceFrom( R1, new String[] { "aux" }, "al" );
deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );
deleteFrom( R2, new String[] { "eux" } );
// if one of the next steps is performed, we will need to perform step2a
boolean temp = false;
temp = replaceFrom( RV, new String[] { "amment" }, "ant" );
if (temp == true)
suite = true;
temp = replaceFrom( RV, new String[] { "emment" }, "ent" );
if (temp == true)
suite = true;
temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
if (temp == true)
suite = true;
}
/**
* Second step (A) of the Porter Algorithm<br>
* Will be performed if nothing changed from the first step
* or changes were made to the amment, emment, ments or ment suffixes<br>
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
*
* @return boolean - true if something changed in the StringBuilder
*/
private boolean step2a() {
String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
"irent", "iriez", "irez", "irions", "irons", "iront",
"issaIent", "issais", "issantes", "issante", "issants", "issant",
"issait", "issais", "issions", "issons", "issiez", "issez", "issent",
"isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
}
/**
* Second step (B) of the Porter Algorithm<br>
* Will be performed if step 2 A was performed unsuccessfully<br>
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
*/
private void step2b() {
String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
"erons", "eront","erez", "èrent", "era", "ées", "iez",
"ée", "és", "er", "ez", "é" };
deleteFrom( RV, suffix );
String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
"antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
"ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
deleteButSuffixFrom( RV, search, "e", true );
deleteFrom( R2, new String[] { "ions" } );
}
/**
* Third step of the Porter Algorithm<br>
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
*/
private void step3() {
if (sb.length()>0)
{
char ch = sb.charAt( sb.length()-1 );
if (ch == 'Y')
{
sb.setCharAt( sb.length()-1, 'i' );
setStrings();
}
else if (ch == 'ç')
{
sb.setCharAt( sb.length()-1, 'c' );
setStrings();
}
}
}
/**
* Fourth step of the Porter Algorithm<br>
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
*/
private void step4() {
if (sb.length() > 1)
{
char ch = sb.charAt( sb.length()-1 );
if (ch == 's')
{
char b = sb.charAt( sb.length()-2 );
if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
{
sb.delete( sb.length() - 1, sb.length());
setStrings();
}
}
}
boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
if (!found)
found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
deleteFrom( RV, new String[] { "e" } );
deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
}
/**
* Fifth step of the Porter Algorithm<br>
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
*/
private void step5() {
if (R0 != null)
{
if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
{
sb.delete( sb.length() - 1, sb.length() );
setStrings();
}
}
}
/**
* Sixth (and last!) step of the Porter Algorithm<br>
* refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
*/
private void step6() {
if (R0!=null && R0.length()>0)
{
boolean seenVowel = false;
boolean seenConson = false;
int pos = -1;
for (int i = R0.length()-1; i > -1; i--)
{
char ch = R0.charAt(i);
if (isVowel(ch))
{
if (!seenVowel)
{
if (ch == 'é' || ch == 'è')
{
pos = i;
break;
}
}
seenVowel = true;
}
else
{
if (seenVowel)
break;
else
seenConson = true;
}
}
if (pos > -1 && seenConson && !seenVowel)
sb.setCharAt(pos, 'e');
}
}
/**
* Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
*
* @param source java.lang.String - the primary source zone for search
* @param search java.lang.String[] - the strings to search for suppression
* @param from java.lang.String - the secondary source zone for search
* @param prefix java.lang.String - the prefix to add to the search string to test
* @return boolean - true if modified
*/
private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
boolean found = false;
if (source!=null )
{
for (int i = 0; i < search.length; i++) {
if ( source.endsWith( search[i] ))
{
if (from!=null && from.endsWith( prefix + search[i] ))
{
sb.delete( sb.length() - search[i].length(), sb.length());
found = true;
setStrings();
break;
}
}
}
}
return found;
}
/**
* Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
*
* @param source java.lang.String - the primary source zone for search
* @param search java.lang.String[] - the strings to search for suppression
* @param vowel boolean - true if we need a vowel before the search string
* @param from java.lang.String - the secondary source zone for search (where vowel could be)
* @return boolean - true if modified
*/
private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
boolean found = false;
if (source!=null && from!=null)
{
for (int i = 0; i < search.length; i++) {
if ( source.endsWith( search[i] ))
{
if ((search[i].length() + 1) <= from.length())
{
boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
if (test == vowel)
{
sb.delete( sb.length() - search[i].length(), sb.length());
modified = true;
found = true;
setStrings();
break;
}
}
}
}
}
return found;
}
/**
* Delete a suffix searched in zone "source" if preceded by the prefix
*
* @param source java.lang.String - the primary source zone for search
* @param search java.lang.String[] - the strings to search for suppression
* @param prefix java.lang.String - the prefix to add to the search string to test
* @param without boolean - true if it will be deleted even without prefix found
*/
private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
if (source!=null)
{
for (int i = 0; i < search.length; i++) {
if ( source.endsWith( prefix + search[i] ))
{
sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
modified = true;
setStrings();
break;
}
else if ( without && source.endsWith( search[i] ))
{
sb.delete( sb.length() - search[i].length(), sb.length() );
modified = true;
setStrings();
break;
}
}
}
}
/**
* Delete a suffix searched in zone "source" if preceded by prefix<br>
* or replace it with the replace string if preceded by the prefix in the zone "from"<br>
* or delete the suffix if specified
*
* @param source java.lang.String - the primary source zone for search
* @param search java.lang.String[] - the strings to search for suppression
* @param prefix java.lang.String - the prefix to add to the search string to test
* @param without boolean - true if it will be deleted even without prefix found
* @param from java.lang.String - the secondary source zone for search
* @param replace java.lang.String - the replacement to use when the suffix is found in "from"
*/
private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
if (source!=null)
{
for (int i = 0; i < search.length; i++) {
if ( source.endsWith( prefix + search[i] ))
{
sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
modified = true;
setStrings();
break;
}
else if ( from!=null && from.endsWith( prefix + search[i] ))
{
sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
modified = true;
setStrings();
break;
}
else if ( without && source.endsWith( search[i] ))
{
sb.delete( sb.length() - search[i].length(), sb.length() );
modified = true;
setStrings();
break;
}
}
}
}
/**
* Replace a search string with another within the source zone
*
* @param source java.lang.String - the source zone for search
* @param search java.lang.String[] - the strings to search for replacement
* @param replace java.lang.String - the replacement string
*/
private boolean replaceFrom( String source, String[] search, String replace ) {
boolean found = false;
if (source!=null)
{
for (int i = 0; i < search.length; i++) {
if ( source.endsWith( search[i] ))
{
sb.replace( sb.length() - search[i].length(), sb.length(), replace );
modified = true;
found = true;
setStrings();
break;
}
}
}
return found;
}
/**
* Delete a search string within the source zone
*
* @param source the source zone for search
* @param suffix the strings to search for suppression
*/
private void deleteFrom(String source, String[] suffix ) {
if (source!=null)
{
for (int i = 0; i < suffix.length; i++) {
if (source.endsWith( suffix[i] ))
{
sb.delete( sb.length() - suffix[i].length(), sb.length());
modified = true;
setStrings();
break;
}
}
}
}
/**
* Test if a char is a french vowel, including accentuated ones
*
* @param ch the char to test
* @return boolean - true if the char is a vowel
*/
private boolean isVowel(char ch) {
switch (ch)
{
case 'a':
case 'e':
case 'i':
case 'o':
case 'u':
case 'y':
case 'â':
case 'à':
case 'ë':
case 'é':
case 'ê':
case 'è':
case 'ï':
case 'î':
case 'ô':
case 'ü':
case 'ù':
case 'û':
return true;
default:
return false;
}
}
/**
* Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
* "R is the region after the first non-vowel following a vowel
* or is the null region at the end of the word if there is no such non-vowel"<br>
* @param buffer java.lang.StringBuilder - the input buffer
* @return java.lang.String - the resulting string
*/
private String retrieveR( StringBuilder buffer ) {
int len = buffer.length();
int pos = -1;
for (int c = 0; c < len; c++) {
if (isVowel( buffer.charAt( c )))
{
pos = c;
break;
}
}
if (pos > -1)
{
int consonne = -1;
for (int c = pos; c < len; c++) {
if (!isVowel(buffer.charAt( c )))
{
consonne = c;
break;
}
}
if (consonne > -1 && (consonne+1) < len)
return buffer.substring( consonne+1, len );
else
return null;
}
else
return null;
}
/**
* Retrieve the "RV zone" from a buffer an return the corresponding string<br>
* "If the word begins with two vowels, RV is the region after the third letter,
* otherwise the region after the first vowel not at the beginning of the word,
* or the end of the word if these positions cannot be found."<br>
* @param buffer java.lang.StringBuilder - the input buffer
* @return java.lang.String - the resulting string
*/
private String retrieveRV( StringBuilder buffer ) {
int len = buffer.length();
if ( buffer.length() > 3)
{
if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
return buffer.substring(3,len);
}
else
{
int pos = 0;
for (int c = 1; c < len; c++) {
if (isVowel( buffer.charAt( c )))
{
pos = c;
break;
}
}
if ( pos+1 < len )
return buffer.substring( pos+1, len );
else
return null;
}
}
else
return null;
}
/**
* Turns u and i preceded AND followed by a vowel to UpperCase<br>
* Turns y preceded OR followed by a vowel to UpperCase<br>
* Turns u preceded by q to UpperCase<br>
*
* @param buffer java.lang.StringBuilder - the buffer to treat
* @return java.lang.StringBuilder - the treated buffer
*/
private StringBuilder treatVowels( StringBuilder buffer ) {
for ( int c = 0; c < buffer.length(); c++ ) {
char ch = buffer.charAt( c );
if (c == 0) // first char
{
if (buffer.length()>1)
{
if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
buffer.setCharAt( c, 'Y' );
}
}
else if (c == buffer.length()-1) // last char
{
if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
buffer.setCharAt( c, 'U' );
if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
buffer.setCharAt( c, 'Y' );
}
else // other cases
{
if (ch == 'u')
{
if (buffer.charAt( c - 1) == 'q')
buffer.setCharAt( c, 'U' );
else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
buffer.setCharAt( c, 'U' );
}
if (ch == 'i')
{
if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
buffer.setCharAt( c, 'I' );
}
if (ch == 'y')
{
if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
buffer.setCharAt( c, 'Y' );
}
}
}
return buffer;
}
/**
* Checks a term if it can be processed correctly.
*
* @return boolean - true if, and only if, the given term consists only of letters.
*/
private boolean isStemmable( String term ) {
boolean upper = false;
int first = -1;
for ( int c = 0; c < term.length(); c++ ) {
// Discard terms that contain non-letter characters.
if ( !Character.isLetter( term.charAt( c ) ) ) {
return false;
}
// Discard terms that contain multiple uppercase letters.
if ( Character.isUpperCase( term.charAt( c ) ) ) {
if ( upper ) {
return false;
}
// First encountered uppercase letter, set flag and save
// position.
else {
first = c;
upper = true;
}
}
}
// Discard the term if it contains a single uppercase letter that
// is not starting the term.
if ( first > 0 ) {
return false;
}
return true;
}
}
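The @deprecated tag above names the generated snowball class as the drop-in replacement; driving it directly takes three calls (a sketch, assuming the org.tartarus.snowball API bundled with Lucene: setCurrent, stem, getCurrent):

import org.tartarus.snowball.ext.FrenchStemmer;

class FrenchSnowballStemmerSketch {
  public static void main(String[] args) {
    FrenchStemmer stemmer = new FrenchStemmer(); // snowball-generated French stemmer
    stemmer.setCurrent("continuité");            // load the word to stem
    stemmer.stem();                              // run the snowball program
    System.out.println(stemmer.getCurrent());    // print the stemmed form
  }
}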

View File

@@ -29,18 +29,10 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.in.IndicTokenizer;
import org.apache.lucene.util.Version;
/**
* Analyzer for Hindi.
* <p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating HindiAnalyzer:
* <ul>
* <li> As of 3.6, StandardTokenizer is used for tokenization
* </ul>
*/
public final class HindiAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
@@ -126,12 +118,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source;
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
source = new StandardTokenizer(matchVersion, reader);
} else {
source = new IndicTokenizer(matchVersion, reader);
}
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
if (!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
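The simplified chain above is consumed through the standard TokenStream contract; a minimal consumption sketch (Lucene 4.0-era API; the field name and input text are placeholders):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

class HindiAnalyzerSketch {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new HindiAnalyzer(Version.LUCENE_50);
    TokenStream ts = analyzer.tokenStream("body", new StringReader("sample text"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                        // required before the first incrementToken()
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}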

View File

@@ -1,53 +0,0 @@
package org.apache.lucene.analysis.in;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer; // javadocs
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/**
* Simple Tokenizer for text in Indian Languages.
* @deprecated (3.6) Use {@link StandardTokenizer} instead.
*/
@Deprecated
public final class IndicTokenizer extends CharTokenizer {
public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
super(matchVersion, factory, input);
}
public IndicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
super(matchVersion, source, input);
}
public IndicTokenizer(Version matchVersion, Reader input) {
super(matchVersion, input);
}
@Override
protected boolean isTokenChar(int c) {
return Character.isLetter(c)
|| Character.getType(c) == Character.NON_SPACING_MARK
|| Character.getType(c) == Character.FORMAT
|| Character.getType(c) == Character.COMBINING_SPACING_MARK;
}
}
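The isTokenChar test above accepts plain letters plus the combining categories that Indic scripts depend on; a small check of what those java.lang.Character calls return (U+0915 and U+093E are example code points chosen here for illustration):

class IndicCharClassSketch {
  public static void main(String[] args) {
    int ka = 0x0915;     // DEVANAGARI LETTER KA
    int signAa = 0x093E; // DEVANAGARI VOWEL SIGN AA
    System.out.println(Character.isLetter(ka));          // true: accepted as a letter
    System.out.println(Character.getType(signAa)
        == Character.COMBINING_SPACING_MARK);            // true: accepted as a dependent vowel sign
  }
}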

View File

@@ -36,19 +36,9 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.ItalianStemmer;
/**
* {@link Analyzer} for Italian.
* <p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating ItalianAnalyzer:
* <ul>
* <li> As of 3.6, ItalianLightStemFilter is used for less aggressive stemming.
* <li> As of 3.2, ElisionFilter with a set of Italian
* contractions is used by default.
* </ul>
*/
public final class ItalianAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
@@ -139,18 +129,12 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
if (matchVersion.onOrAfter(Version.LUCENE_32)) {
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
}
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
result = new ItalianLightStemFilter(result);
} else {
result = new SnowballFilter(result, new ItalianStemmer());
}
result = new ItalianLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
}

View File

@@ -1,518 +0,0 @@
package org.apache.lucene.analysis.miscellaneous;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
* Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
* {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern}
* (with behaviour identical to {@link String#split(String)}),
* and that combines the functionality of
* {@link org.apache.lucene.analysis.core.LetterTokenizer},
* {@link org.apache.lucene.analysis.core.LowerCaseTokenizer},
* {@link org.apache.lucene.analysis.core.WhitespaceTokenizer},
* {@link org.apache.lucene.analysis.core.StopFilter} into a single efficient
* multi-purpose class.
* <p>
* If you are unsure how exactly a regular expression should look, consider
* prototyping by simply trying various expressions on some test texts via
* {@link String#split(String)}. Once you are satisfied, give that regex to
* PatternAnalyzer. Also see <a target="_blank"
* href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>.
* <p>
* This class can be considerably faster than the "normal" Lucene tokenizers.
* It can also serve as a building block in a compound Lucene
* {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this
* stemming example:
* <pre>
* PatternAnalyzer pat = ...
* TokenStream tokenStream = new SnowballFilter(
* pat.tokenStream("content", "James is running round in the woods"),
* "English"));
* </pre>
* @deprecated (4.0) use the pattern-based analysis in the analysis/pattern package instead.
*/
@Deprecated
public final class PatternAnalyzer extends Analyzer {
/** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
/** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
private static final CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(
"a", "about", "above", "across", "adj", "after", "afterwards",
"again", "against", "albeit", "all", "almost", "alone", "along",
"already", "also", "although", "always", "among", "amongst", "an",
"and", "another", "any", "anyhow", "anyone", "anything",
"anywhere", "are", "around", "as", "at", "be", "became", "because",
"become", "becomes", "becoming", "been", "before", "beforehand",
"behind", "being", "below", "beside", "besides", "between",
"beyond", "both", "but", "by", "can", "cannot", "co", "could",
"down", "during", "each", "eg", "either", "else", "elsewhere",
"enough", "etc", "even", "ever", "every", "everyone", "everything",
"everywhere", "except", "few", "first", "for", "former",
"formerly", "from", "further", "had", "has", "have", "he", "hence",
"her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
"herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
"in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
"latter", "latterly", "least", "less", "ltd", "many", "may", "me",
"meanwhile", "might", "more", "moreover", "most", "mostly", "much",
"must", "my", "myself", "namely", "neither", "never",
"nevertheless", "next", "no", "nobody", "none", "noone", "nor",
"not", "nothing", "now", "nowhere", "of", "off", "often", "on",
"once one", "only", "onto", "or", "other", "others", "otherwise",
"our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
"rather", "s", "same", "seem", "seemed", "seeming", "seems",
"several", "she", "should", "since", "so", "some", "somehow",
"someone", "something", "sometime", "sometimes", "somewhere",
"still", "such", "t", "than", "that", "the", "their", "them",
"themselves", "then", "thence", "there", "thereafter", "thereby",
"therefor", "therein", "thereupon", "these", "they", "this",
"those", "though", "through", "throughout", "thru", "thus", "to",
"together", "too", "toward", "towards", "under", "until", "up",
"upon", "us", "very", "via", "was", "we", "well", "were", "what",
"whatever", "whatsoever", "when", "whence", "whenever",
"whensoever", "where", "whereafter", "whereas", "whereat",
"whereby", "wherefrom", "wherein", "whereinto", "whereof",
"whereon", "whereto", "whereunto", "whereupon", "wherever",
"wherewith", "whether", "which", "whichever", "whichsoever",
"while", "whilst", "whither", "who", "whoever", "whole", "whom",
"whomever", "whomsoever", "whose", "whosoever", "why", "will",
"with", "within", "without", "would", "xsubj", "xcal", "xauthor",
"xother ", "xnote", "yet", "you", "your", "yours", "yourself",
"yourselves"
), true));
/**
* A lower-casing word analyzer with English stop words (can be shared
* freely across threads without harm); global per class loader.
*/
public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
/**
* A lower-casing word analyzer with <b>extended </b> English stop words
* (can be shared freely across threads without harm); global per class
* loader. The stop words are borrowed from
* http://thomas.loc.gov/home/stopwords.html, see
* http://thomas.loc.gov/home/all.about.inquery.html
*/
public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
private final Pattern pattern;
private final boolean toLowerCase;
private final CharArraySet stopWords;
private final Version matchVersion;
/**
* Constructs a new instance with the given parameters.
*
* @param matchVersion currently does nothing
* @param pattern
* a regular expression delimiting tokens
* @param toLowerCase
* if <code>true</code> returns tokens after applying
* String.toLowerCase()
* @param stopWords
* if non-null, ignores all tokens that are contained in the
* given stop set (after previously having applied toLowerCase()
* if applicable). For example, created via
* {@link StopFilter#makeStopSet(Version, String[])} and/or
* {@link org.apache.lucene.analysis.util.WordlistLoader} as in
* <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))</code>
* or <a href="http://www.unine.ch/info/clef/">other stop word
* lists</a>.
*/
public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, CharArraySet stopWords) {
if (pattern == null)
throw new IllegalArgumentException("pattern must not be null");
if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
if (stopWords != null && stopWords.size() == 0) stopWords = null;
this.pattern = pattern;
this.toLowerCase = toLowerCase;
this.stopWords = stopWords;
this.matchVersion = matchVersion;
}
/**
* Creates a token stream that tokenizes the given string into token terms
* (aka words).
*
* @param fieldName
* the name of the field to tokenize (currently ignored).
* @param reader
* reader (e.g. charfilter) of the original text. can be null.
* @param text
* the string to tokenize
* @return a new token stream
*/
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
// Ideally the Analyzer superclass should have a method with the same signature,
// with a default impl that simply delegates to the StringReader flavour.
if (text == null)
throw new IllegalArgumentException("text must not be null");
if (pattern == NON_WORD_PATTERN) { // fast path
return new TokenStreamComponents(new FastStringTokenizer(reader, text, true, toLowerCase, stopWords));
} else if (pattern == WHITESPACE_PATTERN) { // fast path
return new TokenStreamComponents(new FastStringTokenizer(reader, text, false, toLowerCase, stopWords));
}
Tokenizer tokenizer = new PatternTokenizer(reader, text, pattern, toLowerCase);
TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
return new TokenStreamComponents(tokenizer, result);
}
/**
* Creates a token stream that tokenizes all the text in the given Reader;
* This implementation forwards to <code>createComponents(String, Reader, String)</code> and is
* less efficient than calling that method directly.
*
* @param fieldName
* the name of the field to tokenize (currently ignored).
* @param reader
* the reader delivering the text
* @return a new token stream
*/
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
try {
String text = toString(reader);
return createComponents(fieldName, reader, text);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Indicates whether some other object is "equal to" this one.
*
* @param other
* the reference object with which to compare.
* @return true if equal, false otherwise
*/
@Override
public boolean equals(Object other) {
if (this == other) return true;
if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;
if (other instanceof PatternAnalyzer) {
PatternAnalyzer p2 = (PatternAnalyzer) other;
return
toLowerCase == p2.toLowerCase &&
eqPattern(pattern, p2.pattern) &&
eq(stopWords, p2.stopWords);
}
return false;
}
/**
* Returns a hash code value for the object.
*
* @return the hash code.
*/
@Override
public int hashCode() {
if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
int h = 1;
h = 31*h + pattern.pattern().hashCode();
h = 31*h + pattern.flags();
h = 31*h + (toLowerCase ? 1231 : 1237);
h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
return h;
}
/** equality where o1 and/or o2 can be null */
private static boolean eq(Object o1, Object o2) {
return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
}
/** assumes p1 and p2 are not null */
private static boolean eqPattern(Pattern p1, Pattern p2) {
return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
}
/**
* Reads until end-of-stream and returns all read chars, finally closes the stream.
*
* @param input the input stream
* @throws IOException if an I/O error occurs while reading the stream
*/
private static String toString(Reader input) throws IOException {
if (input instanceof FastStringReader) { // fast path
return ((FastStringReader) input).getString();
}
try {
int len = 256;
char[] buffer = new char[len];
char[] output = new char[len];
len = 0;
int n;
while ((n = input.read(buffer)) >= 0) {
if (len + n > output.length) { // grow capacity
char[] tmp = new char[Math.max(output.length << 1, len + n)];
System.arraycopy(output, 0, tmp, 0, len);
System.arraycopy(buffer, 0, tmp, len, n);
buffer = output; // use larger buffer for future larger bulk reads
output = tmp;
} else {
System.arraycopy(buffer, 0, output, len, n);
}
len += n;
}
return new String(output, 0, len);
} finally {
input.close();
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
* The work horse; performance isn't fantastic, but it's not nearly as bad
* as one might think - kudos to the Sun regex developers.
*/
private static final class PatternTokenizer extends Tokenizer {
private final Pattern pattern;
private String str;
private final boolean toLowerCase;
private Matcher matcher;
private int pos = 0;
private static final Locale locale = Locale.getDefault();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public PatternTokenizer(Reader input, String str, Pattern pattern, boolean toLowerCase) {
super(input);
this.pattern = pattern;
this.str = str;
this.matcher = pattern.matcher(str);
this.toLowerCase = toLowerCase;
}
@Override
public final boolean incrementToken() {
if (matcher == null) return false;
clearAttributes();
while (true) { // loop takes care of leading and trailing boundary cases
int start = pos;
int end;
boolean isMatch = matcher.find();
if (isMatch) {
end = matcher.start();
pos = matcher.end();
} else {
end = str.length();
matcher = null; // we're finished
}
if (start != end) { // non-empty match (header/trailer)
String text = str.substring(start, end);
if (toLowerCase) text = text.toLowerCase(locale);
termAtt.setEmpty().append(text);
offsetAtt.setOffset(correctOffset(start), correctOffset(end));
return true;
}
if (!isMatch) return false;
}
}
@Override
public final void end() {
// set final offset
final int finalOffset = correctOffset(str.length());
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
this.str = PatternAnalyzer.toString(input);
this.matcher = pattern.matcher(this.str);
}
@Override
public void reset() throws IOException {
super.reset();
this.pos = 0;
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
* Special-case class for best performance in common cases; this class is
* otherwise unnecessary.
*/
private static final class FastStringTokenizer extends Tokenizer {
private String str;
private int pos;
private final boolean isLetter;
private final boolean toLowerCase;
private final CharArraySet stopWords;
private static final Locale locale = Locale.getDefault();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, CharArraySet stopWords) {
super(input);
this.str = str;
this.isLetter = isLetter;
this.toLowerCase = toLowerCase;
this.stopWords = stopWords;
}
@Override
public boolean incrementToken() {
clearAttributes();
// cache loop instance vars (performance)
String s = str;
int len = s.length();
int i = pos;
boolean letter = isLetter;
int start = 0;
String text;
do {
// find beginning of token
text = null;
while (i < len && !isTokenChar(s.charAt(i), letter)) {
i++;
}
if (i < len) { // found beginning; now find end of token
start = i;
while (i < len && isTokenChar(s.charAt(i), letter)) {
i++;
}
text = s.substring(start, i);
if (toLowerCase) text = text.toLowerCase(locale);
// if (toLowerCase) {
//// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
//// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
// text = s.substring(start, i).toLowerCase();
//// char[] chars = new char[i-start];
//// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
//// text = new String(chars);
// } else {
// text = s.substring(start, i);
// }
}
} while (text != null && isStopWord(text));
pos = i;
if (text == null)
{
return false;
}
termAtt.setEmpty().append(text);
offsetAtt.setOffset(correctOffset(start), correctOffset(i));
return true;
}
@Override
public final void end() {
// set final offset
final int finalOffset = str.length();
this.offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
}
private boolean isTokenChar(char c, boolean isLetter) {
return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
}
private boolean isStopWord(String text) {
return stopWords != null && stopWords.contains(text);
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
this.str = PatternAnalyzer.toString(input);
}
@Override
public void reset() throws IOException {
super.reset();
this.pos = 0;
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
* A StringReader that exposes its contained string for fast direct access.
* Might make sense to generalize this to CharSequence and make it public?
*/
static final class FastStringReader extends StringReader {
private final String s;
FastStringReader(String s) {
super(s);
this.s = s;
}
String getString() {
return s;
}
}
}
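The @deprecated tag points at the analysis/pattern package; a rough equivalent of the NON_WORD_PATTERN fast path built from that package might look like this (a sketch against Lucene 4.0-era constructors; the reader is a placeholder and the stop set is optional):

import java.io.Reader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.util.Version;

class PatternPackageSketch {
  // Split on non-word runs, lower-case, then drop English stop words.
  static TokenStream nonWordChain(Reader reader) {
    Pattern nonWord = Pattern.compile("\\W+");
    Tokenizer source = new PatternTokenizer(reader, nonWord, -1); // group -1 = split on matches
    TokenStream result = new LowerCaseFilter(Version.LUCENE_50, source);
    return new StopFilter(Version.LUCENE_50, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  }
}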

View File

@@ -35,7 +35,7 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
// use a fixed version, as we don't care about case sensitivity.
private final CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false);
private final CharArraySet previous = new CharArraySet(Version.LUCENE_50, 8, false);
/**
* Creates a new RemoveDuplicatesTokenFilter

View File

@@ -47,20 +47,6 @@ import java.io.Reader;
* A default set of stopwords is used unless an alternative list is specified, but the
* exclusion list is empty by default.
* </p>
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating DutchAnalyzer:
* <ul>
* <li> As of 3.6, {@link #DutchAnalyzer(Version, CharArraySet)} and
* {@link #DutchAnalyzer(Version, CharArraySet, CharArraySet)} also populate
* the default entries for the stem override dictionary
* <li> As of 3.1, Snowball stemming is done with SnowballFilter,
* LowerCaseFilter is used prior to StopFilter, and Snowball
* stopwords are used by default.
* <li> As of 2.9, StopFilter preserves position
* increments
* </ul>
*
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
@@ -119,26 +105,15 @@ public final class DutchAnalyzer extends Analyzer {
*
*/
public DutchAnalyzer(Version matchVersion) {
// historically, only this ctor populated the stem dict!!!!!
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
}
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords){
// historically, this ctor never populated the stem dict!!!!!
// so we populate it only for >= 3.6
this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
matchVersion.onOrAfter(Version.LUCENE_36)
? DefaultSetHolder.DEFAULT_STEM_DICT
: CharArrayMap.<String>emptyMap());
this(matchVersion, stopwords, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
}
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable){
// historically, this ctor never populated the stem dict!!!!!
// so we populate it only for >= 3.6
this(matchVersion, stopwords, stemExclusionTable,
matchVersion.onOrAfter(Version.LUCENE_36)
? DefaultSetHolder.DEFAULT_STEM_DICT
: CharArrayMap.<String>emptyMap());
this(matchVersion, stopwords, stemExclusionTable, DefaultSetHolder.DEFAULT_STEM_DICT);
}
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
@@ -160,25 +135,15 @@ public final class DutchAnalyzer extends Analyzer {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader aReader) {
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stoptable);
if (!excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
if (!stemdict.isEmpty())
result = new StemmerOverrideFilter(matchVersion, result, stemdict);
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
return new TokenStreamComponents(source, result);
} else {
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new StopFilter(matchVersion, result, stoptable);
if (!excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
result = new DutchStemFilter(result, stemdict);
return new TokenStreamComponents(source, result);
}
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stoptable);
if (!excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
if (!stemdict.isEmpty())
result = new StemmerOverrideFilter(matchVersion, result, stemdict);
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
return new TokenStreamComponents(source, result);
}
}
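With the version checks gone, every constructor above feeds the default stem-override entries into the chain; a custom map can still be passed through the four-argument constructor (a sketch; the entries shown are illustrative, and getDefaultStopSet() is assumed to be the usual stop-set accessor):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

class DutchOverrideSketch {
  static Analyzer withOverrides() {
    // Map a surface form directly to the stem we want, bypassing the snowball algorithm.
    CharArrayMap<String> overrides = new CharArrayMap<String>(Version.LUCENE_50, 2, false);
    overrides.put("fiets", "fiets");
    overrides.put("bromfiets", "bromfiets");
    return new DutchAnalyzer(Version.LUCENE_50,
        DutchAnalyzer.getDefaultStopSet(), CharArraySet.EMPTY_SET, overrides);
  }
}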

View File

@@ -1,108 +0,0 @@
package org.apache.lucene.analysis.nl;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* A {@link TokenFilter} that stems Dutch words.
* <p>
* It supports a table of words that should
* not be stemmed at all. The stemmer used can be changed at runtime after the
* filter object is created (as long as it is a {@link DutchStemmer}).
* </p>
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerFilter
* @deprecated (3.1) Use {@link SnowballFilter} with
* {@link org.tartarus.snowball.ext.DutchStemmer} instead, which has the
* same functionality. This filter will be removed in Lucene 5.0
*/
@Deprecated
public final class DutchStemFilter extends TokenFilter {
/**
* The actual token in the input stream.
*/
private DutchStemmer stemmer = new DutchStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
public DutchStemFilter(TokenStream _in) {
super(_in);
}
/**
* @param stemdictionary Dictionary of word/stem pairs that overrules the algorithm
*/
public DutchStemFilter(TokenStream _in, Map<?,?> stemdictionary) {
this(_in);
stemmer.setStemDictionary(stemdictionary);
}
/**
* Returns the next token in the stream, or null at EOS
*/
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
final String term = termAtt.toString();
// Check the exclusion table.
if (!keywordAttr.isKeyword()) {
final String s = stemmer.stem(term);
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals(term))
termAtt.setEmpty().append(s);
}
return true;
} else {
return false;
}
}
/**
* Set an alternative/custom {@link DutchStemmer} for this filter.
*/
public void setStemmer(DutchStemmer stemmer) {
if (stemmer != null) {
this.stemmer = stemmer;
}
}
/**
* Set the dictionary for stemming; this dictionary overrules the algorithm,
* so you can correct for a particular unwanted word-stem pair.
*/
public void setStemDictionary(HashMap<?,?> dict) {
if (stemmer != null)
stemmer.setStemDictionary(dict);
}
}
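The exclusion mechanism described in the class javadoc survives the removal: marking terms with the KeywordAttribute before the stemmer keeps them untouched (a sketch against Lucene 4.0-era constructors; the reader and the protected term are placeholders):

import java.io.Reader;
import java.util.Arrays;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

class DutchKeywordSketch {
  static TokenStream protectedDutchChain(Reader reader) {
    CharArraySet noStem = new CharArraySet(Version.LUCENE_50, Arrays.asList("lucene"), false);
    TokenStream ts = new StandardTokenizer(Version.LUCENE_50, reader);
    ts = new KeywordMarkerFilter(ts, noStem);  // marks "lucene" via KeywordAttribute
    return new SnowballFilter(ts, new org.tartarus.snowball.ext.DutchStemmer()); // keywords pass through unstemmed
  }
}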

View File

@@ -1,409 +0,0 @@
package org.apache.lucene.analysis.nl;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
/**
* A stemmer for Dutch words.
* <p>
* The algorithm is an implementation of
* the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
* algorithm in Martin Porter's snowball project.
* </p>
* @deprecated (3.1) Use {@link org.tartarus.snowball.ext.DutchStemmer} instead,
* which has the same functionality. This filter will be removed in Lucene 5.0
*/
@Deprecated
public class DutchStemmer {
/**
* Buffer for the terms while stemming them.
*/
private StringBuilder sb = new StringBuilder();
private boolean _removedE;
private Map _stemDict;
private int _R1;
private int _R2;
//TODO convert to internal
/*
* Stems the given term to a unique <tt>discriminator</tt>.
*
* @param term The term that should be stemmed.
* @return Discriminator for <tt>term</tt>
*/
public String stem(String term) {
term = term.toLowerCase();
if (!isStemmable(term))
return term;
if (_stemDict != null && _stemDict.containsKey(term))
if (_stemDict.get(term) instanceof String)
return (String) _stemDict.get(term);
else
return null;
// Reset the StringBuilder.
sb.delete(0, sb.length());
sb.insert(0, term);
// Stemming starts here...
substitute(sb);
storeYandI(sb);
_R1 = getRIndex(sb, 0);
_R1 = Math.max(3, _R1);
step1(sb);
step2(sb);
_R2 = getRIndex(sb, _R1);
step3a(sb);
step3b(sb);
step4(sb);
reStoreYandI(sb);
return sb.toString();
}
private boolean enEnding(StringBuilder sb) {
String[] enend = new String[]{"ene", "en"};
for (int i = 0; i < enend.length; i++) {
String end = enend[i];
String s = sb.toString();
int index = s.length() - end.length();
if (s.endsWith(end) &&
index >= _R1 &&
isValidEnEnding(sb, index - 1)
) {
sb.delete(index, index + end.length());
unDouble(sb, index);
return true;
}
}
return false;
}
private void step1(StringBuilder sb) {
if (_R1 >= sb.length())
return;
String s = sb.toString();
int lengthR1 = sb.length() - _R1;
int index;
if (s.endsWith("heden")) {
sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
return;
}
if (enEnding(sb))
return;
if (s.endsWith("se") &&
(index = s.length() - 2) >= _R1 &&
isValidSEnding(sb, index - 1)
) {
sb.delete(index, index + 2);
return;
}
if (s.endsWith("s") &&
(index = s.length() - 1) >= _R1 &&
isValidSEnding(sb, index - 1)) {
sb.delete(index, index + 1);
}
}
/**
* Delete suffix e if in R1 and
* preceded by a non-vowel, and then undouble the ending
*
* @param sb String being stemmed
*/
private void step2(StringBuilder sb) {
_removedE = false;
if (_R1 >= sb.length())
return;
String s = sb.toString();
int index = s.length() - 1;
if (index >= _R1 &&
s.endsWith("e") &&
!isVowel(sb.charAt(index - 1))) {
sb.delete(index, index + 1);
unDouble(sb);
_removedE = true;
}
}
/**
* Delete "heid"
*
* @param sb String being stemmed
*/
private void step3a(StringBuilder sb) {
if (_R2 >= sb.length())
return;
String s = sb.toString();
int index = s.length() - 4;
if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
sb.delete(index, index + 4); //remove heid
enEnding(sb);
}
}
/**
* <p>A d-suffix, or derivational suffix, enables a new word,
* often with a different grammatical category, or with a different
* sense, to be built from another word. Whether a d-suffix can be
* attached is discovered not from the rules of grammar, but by
* referring to a dictionary. So in English, ness can be added to
* certain adjectives to form corresponding nouns (littleness,
* kindness, foolishness ...) but not to all adjectives
* (not for example, to big, cruel, wise ...) d-suffixes can be
* used to change meaning, often in rather exotic ways.</p>
* Remove "ing", "end", "ig", "lijk", "baar" and "bar"
*
* @param sb String being stemmed
*/
private void step3b(StringBuilder sb) {
if (_R2 >= sb.length())
return;
String s = sb.toString();
int index = 0;
if ((s.endsWith("end") || s.endsWith("ing")) &&
(index = s.length() - 3) >= _R2) {
sb.delete(index, index + 3);
if (sb.charAt(index - 2) == 'i' &&
sb.charAt(index - 1) == 'g') {
if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) {
index -= 2;
sb.delete(index, index + 2);
}
} else {
unDouble(sb, index);
}
return;
}
if (s.endsWith("ig") &&
(index = s.length() - 2) >= _R2
) {
if (sb.charAt(index - 1) != 'e')
sb.delete(index, index + 2);
return;
}
if (s.endsWith("lijk") &&
(index = s.length() - 4) >= _R2
) {
sb.delete(index, index + 4);
step2(sb);
return;
}
if (s.endsWith("baar") &&
(index = s.length() - 4) >= _R2
) {
sb.delete(index, index + 4);
return;
}
if (s.endsWith("bar") &&
(index = s.length() - 3) >= _R2
) {
if (_removedE)
sb.delete(index, index + 3);
return;
}
}
/**
* undouble vowel
* If the word ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is a doubled a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
*
* @param sb String being stemmed
*/
private void step4(StringBuilder sb) {
if (sb.length() < 4)
return;
String end = sb.substring(sb.length() - 4, sb.length());
char c = end.charAt(0);
char v1 = end.charAt(1);
char v2 = end.charAt(2);
char d = end.charAt(3);
if (v1 == v2 &&
d != 'I' &&
v1 != 'i' &&
isVowel(v1) &&
!isVowel(d) &&
!isVowel(c)) {
sb.delete(sb.length() - 2, sb.length() - 1);
}
}
/**
* Checks if a term could be stemmed.
*
* @return true if, and only if, the given term consists only of letters.
*/
private boolean isStemmable(String term) {
for (int c = 0; c < term.length(); c++) {
if (!Character.isLetter(term.charAt(c))) return false;
}
return true;
}
/**
* Substitute ä, ë, ï, ö, ü, á, é, í, ó, ú
*/
private void substitute(StringBuilder buffer) {
for (int i = 0; i < buffer.length(); i++) {
switch (buffer.charAt(i)) {
case 'ä':
case 'á':
{
buffer.setCharAt(i, 'a');
break;
}
case 'ë':
case 'é':
{
buffer.setCharAt(i, 'e');
break;
}
case 'ü':
case 'ú':
{
buffer.setCharAt(i, 'u');
break;
}
case 'ï':
case 'i':
{
buffer.setCharAt(i, 'i');
break;
}
case 'ö':
case 'ó':
{
buffer.setCharAt(i, 'o');
break;
}
}
}
}
/*private boolean isValidSEnding(StringBuilder sb) {
return isValidSEnding(sb, sb.length() - 1);
}*/
private boolean isValidSEnding(StringBuilder sb, int index) {
char c = sb.charAt(index);
if (isVowel(c) || c == 'j')
return false;
return true;
}
/*private boolean isValidEnEnding(StringBuilder sb) {
return isValidEnEnding(sb, sb.length() - 1);
}*/
private boolean isValidEnEnding(StringBuilder sb, int index) {
char c = sb.charAt(index);
if (isVowel(c))
return false;
if (c < 3)
return false;
// ends with "gem"?
if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e')
return false;
return true;
}
private void unDouble(StringBuilder sb) {
unDouble(sb, sb.length());
}
private void unDouble(StringBuilder sb, int endIndex) {
String s = sb.substring(0, endIndex);
if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
sb.delete(endIndex - 1, endIndex);
}
}
private int getRIndex(StringBuilder sb, int start) {
if (start == 0)
start = 1;
int i = start;
for (; i < sb.length(); i++) {
//first non-vowel preceded by a vowel
if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
return i + 1;
}
}
return i + 1;
}
private void storeYandI(StringBuilder sb) {
if (sb.charAt(0) == 'y')
sb.setCharAt(0, 'Y');
int last = sb.length() - 1;
for (int i = 1; i < last; i++) {
switch (sb.charAt(i)) {
case 'i':
{
if (isVowel(sb.charAt(i - 1)) &&
isVowel(sb.charAt(i + 1))
)
sb.setCharAt(i, 'I');
break;
}
case 'y':
{
if (isVowel(sb.charAt(i - 1)))
sb.setCharAt(i, 'Y');
break;
}
}
}
if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1)))
sb.setCharAt(last, 'Y');
}
private void reStoreYandI(StringBuilder sb) {
String tmp = sb.toString();
sb.delete(0, sb.length());
sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y"));
}
private boolean isVowel(char c) {
switch (c) {
case 'e':
case 'a':
case 'o':
case 'i':
case 'u':
case 'y':
case 'è':
{
return true;
}
}
return false;
}
void setStemDictionary(Map dict) {
_stemDict = dict;
}
}

View File

@@ -49,8 +49,6 @@ import org.apache.lucene.analysis.charfilter.BaseCharFilter;
* @since Solr 1.5
*/
public class PatternReplaceCharFilter extends BaseCharFilter {
@Deprecated
public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
private final Pattern pattern;
private final String replacement;
@@ -62,12 +60,6 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
this.replacement = replacement;
}
@Deprecated
public PatternReplaceCharFilter(Pattern pattern, String replacement,
int maxBlockChars, String blockDelimiter, CharStream in) {
this(pattern, replacement, in);
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
// Buffer all input on the first call.

View File

@@ -34,17 +34,9 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.PortugueseStemmer;
/**
* {@link Analyzer} for Portuguese.
* <p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating PortugueseAnalyzer:
* <ul>
* <li> As of 3.6, PortugueseLightStemFilter is used for less aggressive stemming.
* </ul>
*/
public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
@@ -132,11 +124,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
result = new PortugueseLightStemFilter(result);
} else {
result = new SnowballFilter(result, new PortugueseStemmer());
}
result = new PortugueseLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
}

View File

@@ -134,7 +134,7 @@ public abstract class RSLPStemmerBase {
if (!exceptions[i].endsWith(suffix))
System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
}
this.exceptions = new CharArraySet(Version.LUCENE_31,
this.exceptions = new CharArraySet(Version.LUCENE_50,
Arrays.asList(exceptions), false);
}

View File

@@ -31,14 +31,6 @@ import java.io.IOException;
* that character. For example, with a marker of &#x5C;u0001, "country" =>
* "&#x5C;u0001yrtnuoc". This is useful when implementing efficient leading
* wildcards search.
* </p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating ReverseStringFilter, or when using any of
* its static methods:
* <ul>
* <li> As of 3.1, supplementary characters are handled correctly
* </ul>
*/
public final class ReverseStringFilter extends TokenFilter {
@@ -74,7 +66,7 @@ public final class ReverseStringFilter extends TokenFilter {
* The reversed tokens will not be marked.
* </p>
*
* @param matchVersion See <a href="#version">above</a>
* @param matchVersion Lucene compatibility version
* @param in {@link TokenStream} to filter
*/
public ReverseStringFilter(Version matchVersion, TokenStream in) {
@@ -89,7 +81,7 @@ public final class ReverseStringFilter extends TokenFilter {
* character.
* </p>
*
* @param matchVersion See <a href="#version">above</a>
* @param matchVersion compatibility version
* @param in {@link TokenStream} to filter
* @param marker A character used to mark reversed tokens
*/
@@ -119,7 +111,7 @@ public final class ReverseStringFilter extends TokenFilter {
/**
* Reverses the given input string
*
* @param matchVersion See <a href="#version">above</a>
* @param matchVersion compatibility version
* @param input the string to reverse
* @return the given input string in reversed order
*/
@@ -131,7 +123,7 @@ public final class ReverseStringFilter extends TokenFilter {
/**
* Reverses the given input buffer in-place
* @param matchVersion See <a href="#version">above</a>
* @param matchVersion compatibility version
* @param buffer the input char array to reverse
*/
public static void reverse(Version matchVersion, final char[] buffer) {
@@ -141,7 +133,7 @@ public final class ReverseStringFilter extends TokenFilter {
/**
* Partially reverses the given input buffer in-place from offset 0
* up to the given length.
* @param matchVersion See <a href="#version">above</a>
* @param matchVersion compatibility version
* @param buffer the input char array to reverse
* @param len the length in the buffer up to where the
* buffer should be reversed
@@ -151,24 +143,10 @@ public final class ReverseStringFilter extends TokenFilter {
reverse( matchVersion, buffer, 0, len );
}
/**
* @deprecated (3.1) Remove this when support for 3.0 indexes is no longer needed.
*/
@Deprecated
private static void reverseUnicode3( char[] buffer, int start, int len ){
if( len <= 1 ) return;
int num = len>>1;
for( int i = start; i < ( start + num ); i++ ){
char c = buffer[i];
buffer[i] = buffer[start * 2 + len - i - 1];
buffer[start * 2 + len - i - 1] = c;
}
}
/**
* Partially reverses the given input buffer in-place from the given offset
* up to the given length.
* @param matchVersion See <a href="#version">above</a>
* @param matchVersion compatibility version
* @param buffer the input char array to reverse
* @param start the offset from where to reverse the buffer
* @param len the length in the buffer up to where the
@@ -176,10 +154,6 @@ public final class ReverseStringFilter extends TokenFilter {
*/
public static void reverse(Version matchVersion, final char[] buffer,
final int start, final int len) {
if (!matchVersion.onOrAfter(Version.LUCENE_31)) {
reverseUnicode3(buffer, start, len);
return;
}
/* modified version of Apache Harmony AbstractStringBuilder reverse0() */
if (len < 2)
return;
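As a quick illustration of the static helper retained above (a sketch, not part of this diff; Version.LUCENE_50 mirrors its use elsewhere in this commit):

import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.util.Version;

public class ReverseDemo {
  public static void main(String[] args) {
    char[] buffer = "country".toCharArray();
    // Reverse the whole buffer in place; supplementary characters stay intact.
    ReverseStringFilter.reverse(Version.LUCENE_50, buffer, 0, buffer.length);
    System.out.println(new String(buffer)); // prints "yrtnuoc"
  }
}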

View File

@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ru;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -42,44 +41,13 @@ import org.apache.lucene.util.Version;
* Supports an external list of stopwords (words that
* will not be indexed at all).
* A default set of stopwords is used unless an alternative list is specified.
* </p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating RussianAnalyzer:
* <ul>
* <li> As of 3.1, StandardTokenizer is used, Snowball stemming is done with
* SnowballFilter, and Snowball stopwords are used by default.
* </ul>
*/
public final class RussianAnalyzer extends StopwordAnalyzerBase
{
/**
* List of typical Russian stopwords. (for backwards compatibility)
* @deprecated (3.1) Remove this for LUCENE 5.0
*/
@Deprecated
private static final String[] RUSSIAN_STOP_WORDS_30 = {
"а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
"вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
"да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",
"еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как",
"ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо",
"наш", "не", "него", "нее", "нет", "ни", "них", "но", "ну", "о", "об",
"однако", "он", "она", "они", "оно", "от", "очень", "по", "под", "при",
"с", "со", "так", "также", "такой", "там", "те", "тем", "то", "того",
"тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей",
"чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
};
public final class RussianAnalyzer extends StopwordAnalyzerBase {
/** File containing default Russian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "russian_stop.txt";
private static class DefaultSetHolder {
/** @deprecated (3.1) remove this for Lucene 5.0 */
@Deprecated
static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
static final CharArraySet DEFAULT_STOP_SET;
static {
@@ -106,9 +74,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
}
public RussianAnalyzer(Version matchVersion) {
this(matchVersion,
matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
: DefaultSetHolder.DEFAULT_STOP_SET_30);
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
@@ -151,23 +117,13 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(
result, stemExclusionSet);
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
return new TokenStreamComponents(source, result);
} else {
final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(
result, stemExclusionSet);
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
return new TokenStreamComponents(source, result);
}
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
return new TokenStreamComponents(source, result);
}
}
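A short sketch (not from this commit) of supplying a custom stopword set now that the pre-3.1 default list is gone; the words are arbitrary, the (Version, CharArraySet) constructor mirrors the delegation shown above, and getStopwordSet() is assumed from StopwordAnalyzerBase.

import java.util.Arrays;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class RussianAnalyzerDemo {
  public static void main(String[] args) {
    CharArraySet stopWords = new CharArraySet(Version.LUCENE_50,
        Arrays.asList("и", "в", "на"), false); // ignoreCase = false
    RussianAnalyzer analyzer = new RussianAnalyzer(Version.LUCENE_50, stopWords);
    // Used like any other Analyzer; its chain is the StandardTokenizer-based
    // one shown in createComponents() above.
    System.out.println(analyzer.getStopwordSet().size()); // prints 3
  }
}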

View File

@@ -1,97 +0,0 @@
package org.apache.lucene.analysis.ru;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer; // for javadocs
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/**
* A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
* by also allowing the basic Latin digits 0-9.
* <p>
* <a name="version"/>
* You must specify the required {@link Version} compatibility when creating
* {@link RussianLetterTokenizer}:
* <ul>
* <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
* detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
* {@link CharTokenizer#normalize(int)} for details.</li>
* </ul>
* @deprecated (3.1) Use {@link StandardTokenizer} instead, which has the same functionality.
* This filter will be removed in Lucene 5.0
*/
@Deprecated
public class RussianLetterTokenizer extends CharTokenizer
{
private static final int DIGIT_0 = '0';
private static final int DIGIT_9 = '9';
/**
     * Construct a new RussianLetterTokenizer.
     * @param matchVersion
     *          Lucene version to match; see <a href="#version">above</a>
*
* @param in
* the input to split up into tokens
*/
public RussianLetterTokenizer(Version matchVersion, Reader in) {
super(matchVersion, in);
}
/**
* Construct a new RussianLetterTokenizer using a given {@link AttributeSource}.
*
     * @param matchVersion
     *          Lucene version to match; see <a href="#version">above</a>
* @param source
* the attribute source to use for this {@link Tokenizer}
* @param in
* the input to split up into tokens
*/
public RussianLetterTokenizer(Version matchVersion, AttributeSource source, Reader in) {
super(matchVersion, source, in);
}
/**
     * Construct a new RussianLetterTokenizer using a given
     * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
     * @param matchVersion
     *          Lucene version to match; see <a href="#version">above</a>
*
* @param factory
* the attribute factory to use for this {@link Tokenizer}
* @param in
* the input to split up into tokens
*/
public RussianLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
super(matchVersion, factory, in);
}
/**
     * Collects only characters which satisfy
     * {@link Character#isLetter(int)}, plus the basic Latin digits 0-9.
*/
@Override
protected boolean isTokenChar(int c) {
return Character.isLetter(c) || (c >= DIGIT_0 && c <= DIGIT_9);
}
}

View File

@@ -1,88 +0,0 @@
package org.apache.lucene.analysis.snowball;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import java.io.Reader;
/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
* LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
*
* Available stemmers are listed in org.tartarus.snowball.ext. The name of a
* stemmer is the part of the class name before "Stemmer", e.g., the stemmer in
* {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
*
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}, with the following addition:
* <ul>
* <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
* </ul>
* </p>
* @deprecated (3.1) Use the language-specific analyzer in modules/analysis instead.
* This analyzer will be removed in Lucene 5.0
*/
@Deprecated
public final class SnowballAnalyzer extends Analyzer {
private String name;
private CharArraySet stopSet;
private final Version matchVersion;
/** Builds the named analyzer with no stop words. */
public SnowballAnalyzer(Version matchVersion, String name) {
this.name = name;
this.matchVersion = matchVersion;
}
/** Builds the named analyzer with the given stop words. */
public SnowballAnalyzer(Version matchVersion, String name, CharArraySet stopWords) {
this(matchVersion, name);
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
stopWords));
}
/** Constructs a {@link StandardTokenizer} filtered by a {@link
StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter},
and a {@link SnowballFilter} */
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, tokenizer);
// remove the possessive 's for english stemmers
if (matchVersion.onOrAfter(Version.LUCENE_31) &&
(name.equals("English") || name.equals("Porter") || name.equals("Lovins")))
result = new EnglishPossessiveFilter(result);
// Use a special lowercase filter for turkish, the stemmer expects it.
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
result = new TurkishLowerCaseFilter(result);
else
result = new LowerCaseFilter(matchVersion, result);
if (stopSet != null)
result = new StopFilter(matchVersion,
result, stopSet);
result = new SnowballFilter(result, name);
return new TokenStreamComponents(tokenizer, result);
}
}
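Since SnowballAnalyzer is deleted here, a hedged sketch of rebuilding its English chain by hand, mirroring the createComponents() body above; the class name is invented and StandardAnalyzer.STOP_WORDS_SET is only an example stop set.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

public final class EnglishSnowballAnalyzer extends Analyzer {
  private final Version matchVersion = Version.LUCENE_50;

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new EnglishPossessiveFilter(result);       // strip possessive 's, as the old analyzer did
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, StandardAnalyzer.STOP_WORDS_SET);
    result = new SnowballFilter(result, "English");     // stemmer name, per the javadoc above
    return new TokenStreamComponents(source, result);
  }
}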

View File

@@ -21,61 +21,19 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
/**
* Normalizes tokens extracted with {@link StandardTokenizer}.
*/
public class StandardFilter extends TokenFilter {
private final Version matchVersion;
public StandardFilter(Version matchVersion, TokenStream in) {
super(in);
this.matchVersion = matchVersion;
}
private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
  // this filter uses the type attribute
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@Override
public final boolean incrementToken() throws IOException {
if (matchVersion.onOrAfter(Version.LUCENE_31))
return input.incrementToken(); // TODO: add some niceties for the new grammar
else
return incrementTokenClassic();
}
public final boolean incrementTokenClassic() throws IOException {
if (!input.incrementToken()) {
return false;
}
final char[] buffer = termAtt.buffer();
final int bufferLength = termAtt.length();
final String type = typeAtt.type();
if (type == APOSTROPHE_TYPE && // remove 's
bufferLength >= 2 &&
buffer[bufferLength-2] == '\'' &&
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
// Strip last 2 characters off
termAtt.setLength(bufferLength - 2);
} else if (type == ACRONYM_TYPE) { // remove dots
int upto = 0;
for(int i=0;i<bufferLength;i++) {
char c = buffer[i];
if (c != '.')
buffer[upto++] = c;
}
termAtt.setLength(upto);
}
return true;
return input.incrementToken(); // TODO: add some niceties for the new grammar
}
}

View File

@@ -21,7 +21,6 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.std31.StandardTokenizerImpl31;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -31,31 +30,20 @@ import org.apache.lucene.util.Version;
/** A grammar-based tokenizer constructed with JFlex.
* <p>
* As of Lucene version 3.1, this class implements the Word Break rules from the
* This class implements the Word Break rules from the
* Unicode Text Segmentation algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
* <p/>
* <p>Many applications have specific tokenizer needs. If this tokenizer does
* not suit your application, please consider copying this source code
* directory to your project and maintaining your own grammar-based tokenizer.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating StandardTokenizer:
* <ul>
* <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
* from their combining characters. If you use a previous version number,
* you get the exact broken behavior for backwards compatibility.
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation.
* If you use a previous version number, you get the exact behavior of
* {@link ClassicTokenizer} for backwards compatibility.
* </ul>
*/
public final class StandardTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private StandardTokenizerInterface scanner;
// TODO: how can we remove these old types?!
public static final int ALPHANUM = 0;
/** @deprecated (3.1) */
@Deprecated
@@ -146,13 +134,7 @@ public final class StandardTokenizer extends Tokenizer {
}
private final void init(Version matchVersion) {
if (matchVersion.onOrAfter(Version.LUCENE_34)) {
this.scanner = new StandardTokenizerImpl(input);
} else if (matchVersion.onOrAfter(Version.LUCENE_31)) {
this.scanner = new StandardTokenizerImpl31(input);
} else {
this.scanner = new ClassicTokenizerImpl(input);
}
this.scanner = new StandardTokenizerImpl(input);
}
// this tokenizer generates three attributes:
@@ -184,15 +166,7 @@ public final class StandardTokenizer extends Tokenizer {
scanner.getText(termAtt);
final int start = scanner.yychar();
offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
if (tokenType == StandardTokenizer.ACRONYM_DEP) {
typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
termAtt.setLength(termAtt.length() - 1); // remove extra '.'
} else {
typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
}
typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
return true;
} else
// When we skip a too-long term, we still increment the
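A small usage sketch (not part of this change), now that the scanner above is always StandardTokenizerImpl; the sample text is arbitrary and the printed type names are only examples.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class StandardTokenizerDemo {
  public static void main(String[] args) throws IOException {
    StandardTokenizer tokenizer =
        new StandardTokenizer(Version.LUCENE_50, new StringReader("Lucene 4.0 日本語"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // e.g. "Lucene" -> <ALPHANUM>, "4.0" -> <NUM>, each CJK character -> <IDEOGRAPHIC>
      System.out.println(term.toString() + " -> " + type.type());
    }
    tokenizer.end();
    tokenizer.close();
  }
}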

View File

@@ -23,8 +23,6 @@ import java.io.InputStreamReader;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.std31.UAX29URLEmailTokenizerImpl31;
import org.apache.lucene.analysis.standard.std34.UAX29URLEmailTokenizerImpl34;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -50,14 +48,6 @@ import org.apache.lucene.util.AttributeSource.AttributeFactory;
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* </ul>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating UAX29URLEmailTokenizer:
* <ul>
* <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
* from their combining characters. If you use a previous version number,
* you get the exact broken behavior for backwards compatibility.
* </ul>
*/
public final class UAX29URLEmailTokenizer extends Tokenizer {
@@ -128,13 +118,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
}
private static StandardTokenizerInterface getScannerFor(Version matchVersion, Reader input) {
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
return new UAX29URLEmailTokenizerImpl(input);
} else if (matchVersion.onOrAfter(Version.LUCENE_34)) {
return new UAX29URLEmailTokenizerImpl34(input);
} else {
return new UAX29URLEmailTokenizerImpl31(input);
}
return new UAX29URLEmailTokenizerImpl(input);
}
// this tokenizer generates three attributes:
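And a similar hedged sketch for this tokenizer's extra URL/EMAIL token types; the (Version, Reader) constructor is assumed to mirror StandardTokenizer's, and the addresses are made up.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class UAX29URLEmailTokenizerDemo {
  public static void main(String[] args) throws IOException {
    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_50,
        new StringReader("mail dev@lucene.apache.org or visit http://lucene.apache.org"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // the address and the URL come back as single tokens typed <EMAIL> and <URL>
      System.out.println(type.type() + ": " + term.toString());
    }
    tokenizer.end();
    tokenizer.close();
  }
}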

View File

@@ -1,330 +0,0 @@
/*
* Copyright 2001-2005 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Wednesday, February 9, 2011 12:34:10 PM UTC
// generated on Wednesday, February 9, 2011 4:45:18 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
[aA][cC]
| [aA][dD]
| [aA][eE]
| [aA][eE][rR][oO]
| [aA][fF]
| [aA][gG]
| [aA][iI]
| [aA][lL]
| [aA][mM]
| [aA][nN]
| [aA][oO]
| [aA][qQ]
| [aA][rR]
| [aA][rR][pP][aA]
| [aA][sS]
| [aA][sS][iI][aA]
| [aA][tT]
| [aA][uU]
| [aA][wW]
| [aA][xX]
| [aA][zZ]
| [bB][aA]
| [bB][bB]
| [bB][dD]
| [bB][eE]
| [bB][fF]
| [bB][gG]
| [bB][hH]
| [bB][iI]
| [bB][iI][zZ]
| [bB][jJ]
| [bB][mM]
| [bB][nN]
| [bB][oO]
| [bB][rR]
| [bB][sS]
| [bB][tT]
| [bB][vV]
| [bB][wW]
| [bB][yY]
| [bB][zZ]
| [cC][aA]
| [cC][aA][tT]
| [cC][cC]
| [cC][dD]
| [cC][fF]
| [cC][gG]
| [cC][hH]
| [cC][iI]
| [cC][kK]
| [cC][lL]
| [cC][mM]
| [cC][nN]
| [cC][oO]
| [cC][oO][mM]
| [cC][oO][oO][pP]
| [cC][rR]
| [cC][uU]
| [cC][vV]
| [cC][xX]
| [cC][yY]
| [cC][zZ]
| [dD][eE]
| [dD][jJ]
| [dD][kK]
| [dD][mM]
| [dD][oO]
| [dD][zZ]
| [eE][cC]
| [eE][dD][uU]
| [eE][eE]
| [eE][gG]
| [eE][rR]
| [eE][sS]
| [eE][tT]
| [eE][uU]
| [fF][iI]
| [fF][jJ]
| [fF][kK]
| [fF][mM]
| [fF][oO]
| [fF][rR]
| [gG][aA]
| [gG][bB]
| [gG][dD]
| [gG][eE]
| [gG][fF]
| [gG][gG]
| [gG][hH]
| [gG][iI]
| [gG][lL]
| [gG][mM]
| [gG][nN]
| [gG][oO][vV]
| [gG][pP]
| [gG][qQ]
| [gG][rR]
| [gG][sS]
| [gG][tT]
| [gG][uU]
| [gG][wW]
| [gG][yY]
| [hH][kK]
| [hH][mM]
| [hH][nN]
| [hH][rR]
| [hH][tT]
| [hH][uU]
| [iI][dD]
| [iI][eE]
| [iI][lL]
| [iI][mM]
| [iI][nN]
| [iI][nN][fF][oO]
| [iI][nN][tT]
| [iI][oO]
| [iI][qQ]
| [iI][rR]
| [iI][sS]
| [iI][tT]
| [jJ][eE]
| [jJ][mM]
| [jJ][oO]
| [jJ][oO][bB][sS]
| [jJ][pP]
| [kK][eE]
| [kK][gG]
| [kK][hH]
| [kK][iI]
| [kK][mM]
| [kK][nN]
| [kK][pP]
| [kK][rR]
| [kK][wW]
| [kK][yY]
| [kK][zZ]
| [lL][aA]
| [lL][bB]
| [lL][cC]
| [lL][iI]
| [lL][kK]
| [lL][rR]
| [lL][sS]
| [lL][tT]
| [lL][uU]
| [lL][vV]
| [lL][yY]
| [mM][aA]
| [mM][cC]
| [mM][dD]
| [mM][eE]
| [mM][gG]
| [mM][hH]
| [mM][iI][lL]
| [mM][kK]
| [mM][lL]
| [mM][mM]
| [mM][nN]
| [mM][oO]
| [mM][oO][bB][iI]
| [mM][pP]
| [mM][qQ]
| [mM][rR]
| [mM][sS]
| [mM][tT]
| [mM][uU]
| [mM][uU][sS][eE][uU][mM]
| [mM][vV]
| [mM][wW]
| [mM][xX]
| [mM][yY]
| [mM][zZ]
| [nN][aA]
| [nN][aA][mM][eE]
| [nN][cC]
| [nN][eE]
| [nN][eE][tT]
| [nN][fF]
| [nN][gG]
| [nN][iI]
| [nN][lL]
| [nN][oO]
| [nN][pP]
| [nN][rR]
| [nN][uU]
| [nN][zZ]
| [oO][mM]
| [oO][rR][gG]
| [pP][aA]
| [pP][eE]
| [pP][fF]
| [pP][gG]
| [pP][hH]
| [pP][kK]
| [pP][lL]
| [pP][mM]
| [pP][nN]
| [pP][rR]
| [pP][rR][oO]
| [pP][sS]
| [pP][tT]
| [pP][wW]
| [pP][yY]
| [qQ][aA]
| [rR][eE]
| [rR][oO]
| [rR][sS]
| [rR][uU]
| [rR][wW]
| [sS][aA]
| [sS][bB]
| [sS][cC]
| [sS][dD]
| [sS][eE]
| [sS][gG]
| [sS][hH]
| [sS][iI]
| [sS][jJ]
| [sS][kK]
| [sS][lL]
| [sS][mM]
| [sS][nN]
| [sS][oO]
| [sS][rR]
| [sS][tT]
| [sS][uU]
| [sS][vV]
| [sS][yY]
| [sS][zZ]
| [tT][cC]
| [tT][dD]
| [tT][eE][lL]
| [tT][fF]
| [tT][gG]
| [tT][hH]
| [tT][jJ]
| [tT][kK]
| [tT][lL]
| [tT][mM]
| [tT][nN]
| [tT][oO]
| [tT][pP]
| [tT][rR]
| [tT][rR][aA][vV][eE][lL]
| [tT][tT]
| [tT][vV]
| [tT][wW]
| [tT][zZ]
| [uU][aA]
| [uU][gG]
| [uU][kK]
| [uU][sS]
| [uU][yY]
| [uU][zZ]
| [vV][aA]
| [vV][cC]
| [vV][eE]
| [vV][gG]
| [vV][iI]
| [vV][nN]
| [vV][uU]
| [wW][fF]
| [wW][sS]
| [xX][nN]--0[zZ][wW][mM]56[dD]
| [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
| [xX][nN]--3[eE]0[bB]707[eE]
| [xX][nN]--45[bB][rR][jJ]9[cC]
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
| [xX][nN]--[fF][iI][qQ][sS]8[sS]
| [xX][nN]--[fF][iI][qQ][zZ]9[sS]
| [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
| [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
| [xX][nN]--[gG]6[wW]251[dD]
| [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
| [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
| [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
| [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
| [xX][nN]--[jJ]6[wW]193[gG]
| [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
| [xX][nN]--[oO]3[cC][wW]4[hH]
| [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
| [xX][nN]--[pP]1[aA][iI]
| [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
| [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
| [xX][nN]--[wW][gG][bB][hH]1[cC]
| [xX][nN]--[wW][gG][bB][lL]6[aA]
| [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
| [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
| [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
| [yY][eE]
| [yY][tT]
| [zZ][aA]
| [zZ][mM]
| [zZ][wW]
) "."? // Accept trailing root (empty) domain

View File

@@ -1,125 +0,0 @@
/*
* Copyright 2010 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 4.6.0.0 on Wednesday, February 9, 2011 4:45:11 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
ALetterSupp = (
([\ud80d][\uDC00-\uDC2E])
| ([\ud80c][\uDC00-\uDFFF])
| ([\ud809][\uDC00-\uDC62])
| ([\ud808][\uDC00-\uDF6E])
| ([\ud81a][\uDC00-\uDE38])
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
| ([\ud801][\uDC00-\uDC9D])
| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
| ([\ud803][\uDC00-\uDC48])
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
)
FormatSupp = (
([\ud804][\uDCBD])
| ([\ud834][\uDD73-\uDD7A])
| ([\udb40][\uDC01\uDC20-\uDC7F])
)
ExtendSupp = (
([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
| ([\ud800][\uDDFD])
| ([\udb40][\uDD00-\uDDEF])
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
NumericSupp = (
([\ud804][\uDC66-\uDC6F])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)
KatakanaSupp = (
([\ud82c][\uDC00])
)
MidLetterSupp = (
[]
)
MidNumSupp = (
[]
)
MidNumLetSupp = (
[]
)
ExtendNumLetSupp = (
[]
)
ExtendNumLetSupp = (
[]
)
ComplexContextSupp = (
[]
)
HanSupp = (
([\ud87e][\uDC00-\uDE1D])
| ([\ud86b][\uDC00-\uDFFF])
| ([\ud86a][\uDC00-\uDFFF])
| ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
| ([\ud868][\uDC00-\uDFFF])
| ([\ud86e][\uDC00-\uDC1D])
| ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
| ([\ud86c][\uDC00-\uDFFF])
| ([\ud863][\uDC00-\uDFFF])
| ([\ud862][\uDC00-\uDFFF])
| ([\ud861][\uDC00-\uDFFF])
| ([\ud860][\uDC00-\uDFFF])
| ([\ud867][\uDC00-\uDFFF])
| ([\ud866][\uDC00-\uDFFF])
| ([\ud865][\uDC00-\uDFFF])
| ([\ud864][\uDC00-\uDFFF])
| ([\ud858][\uDC00-\uDFFF])
| ([\ud859][\uDC00-\uDFFF])
| ([\ud85a][\uDC00-\uDFFF])
| ([\ud85b][\uDC00-\uDFFF])
| ([\ud85c][\uDC00-\uDFFF])
| ([\ud85d][\uDC00-\uDFFF])
| ([\ud85e][\uDC00-\uDFFF])
| ([\ud85f][\uDC00-\uDFFF])
| ([\ud850][\uDC00-\uDFFF])
| ([\ud851][\uDC00-\uDFFF])
| ([\ud852][\uDC00-\uDFFF])
| ([\ud853][\uDC00-\uDFFF])
| ([\ud854][\uDC00-\uDFFF])
| ([\ud855][\uDC00-\uDFFF])
| ([\ud856][\uDC00-\uDFFF])
| ([\ud857][\uDC00-\uDFFF])
| ([\ud849][\uDC00-\uDFFF])
| ([\ud848][\uDC00-\uDFFF])
| ([\ud84b][\uDC00-\uDFFF])
| ([\ud84a][\uDC00-\uDFFF])
| ([\ud84d][\uDC00-\uDFFF])
| ([\ud84c][\uDC00-\uDFFF])
| ([\ud84f][\uDC00-\uDFFF])
| ([\ud84e][\uDC00-\uDFFF])
| ([\ud841][\uDC00-\uDFFF])
| ([\ud840][\uDC00-\uDFFF])
| ([\ud843][\uDC00-\uDFFF])
| ([\ud842][\uDC00-\uDFFF])
| ([\ud845][\uDC00-\uDFFF])
| ([\ud844][\uDC00-\uDFFF])
| ([\ud847][\uDC00-\uDFFF])
| ([\ud846][\uDC00-\uDFFF])
)
HiraganaSupp = (
([\ud83c][\uDE00])
| ([\ud82c][\uDC01])
)

View File

@@ -1,184 +0,0 @@
package org.apache.lucene.analysis.standard.std31;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class implements StandardTokenizer, except with a bug
* (https://issues.apache.org/jira/browse/LUCENE-3358) where Han and Hiragana
 * characters would be split from combining characters.
* @deprecated This class is only for exact backwards compatibility
*/
@Deprecated
%%
%unicode 6.0
%integer
%final
%public
%class StandardTokenizerImpl31
%implements StandardTokenizerInterface
%function getNextToken
%char
%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
// Script=Hangul & Aletter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
%{
/** Alphanumeric sequences */
public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
/** Numbers */
public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
 * together as a single token rather than broken up, because the logic
* required to break them at word boundaries is too complex for UAX#29.
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
public final int yychar()
{
return yychar;
}
/**
* Fills CharTermAttribute with the current token text.
*/
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
{HangulEx}+
{ return HANGUL_TYPE; }
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together. This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
// http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//
{Han} { return IDEOGRAPHIC_TYPE; }
{Hiragana} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

View File

@@ -1,269 +0,0 @@
package org.apache.lucene.analysis.standard.std31;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class implements UAX29URLEmailTokenizer, except with a bug
* (https://issues.apache.org/jira/browse/LUCENE-3358) where Han and Hiragana
 * characters would be split from combining characters.
* @deprecated This class is only for exact backwards compatibility
*/
@Deprecated
%%
%unicode 6.0
%integer
%final
%public
%class UAX29URLEmailTokenizerImpl31
%implements StandardTokenizerInterface
%function getNextToken
%char
%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
// Script=Hangul & Aletter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
// URL and E-mail syntax specifications:
//
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
// RFC-1123: Requirements for Internet Hosts - Application and Support
// RFC-1738: Uniform Resource Locators (URL)
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
// RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format
%include src/java/org/apache/lucene/analysis/standard/std31/ASCIITLD.jflex-macro
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
URL = {HTTPurl} | {FTPurl} | {FILEurl}
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%{
/** Alphanumeric sequences */
public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;
/** Numbers */
public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
 * together as a single token rather than broken up, because the logic
* required to break them at word boundaries is too complex for UAX#29.
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
public final int yychar()
{
return yychar;
}
/**
* Fills CharTermAttribute with the current token text.
*/
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
{URL} { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
{HangulEx}+
{ return HANGUL_TYPE; }
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together. This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
// http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//
{Han} { return IDEOGRAPHIC_TYPE; }
{Hiragana} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

View File

@@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Backwards-compatible implementation to match {@link org.apache.lucene.util.Version#LUCENE_31}
</body>
</html>

View File

@@ -1,334 +0,0 @@
/*
* Copyright 2001-2005 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Thursday, August 4, 2011 11:34:20 AM UTC
// generated on Thursday, August 4, 2011 11:46:19 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
[aA][cC]
| [aA][dD]
| [aA][eE]
| [aA][eE][rR][oO]
| [aA][fF]
| [aA][gG]
| [aA][iI]
| [aA][lL]
| [aA][mM]
| [aA][nN]
| [aA][oO]
| [aA][qQ]
| [aA][rR]
| [aA][rR][pP][aA]
| [aA][sS]
| [aA][sS][iI][aA]
| [aA][tT]
| [aA][uU]
| [aA][wW]
| [aA][xX]
| [aA][zZ]
| [bB][aA]
| [bB][bB]
| [bB][dD]
| [bB][eE]
| [bB][fF]
| [bB][gG]
| [bB][hH]
| [bB][iI]
| [bB][iI][zZ]
| [bB][jJ]
| [bB][mM]
| [bB][nN]
| [bB][oO]
| [bB][rR]
| [bB][sS]
| [bB][tT]
| [bB][vV]
| [bB][wW]
| [bB][yY]
| [bB][zZ]
| [cC][aA]
| [cC][aA][tT]
| [cC][cC]
| [cC][dD]
| [cC][fF]
| [cC][gG]
| [cC][hH]
| [cC][iI]
| [cC][kK]
| [cC][lL]
| [cC][mM]
| [cC][nN]
| [cC][oO]
| [cC][oO][mM]
| [cC][oO][oO][pP]
| [cC][rR]
| [cC][uU]
| [cC][vV]
| [cC][xX]
| [cC][yY]
| [cC][zZ]
| [dD][eE]
| [dD][jJ]
| [dD][kK]
| [dD][mM]
| [dD][oO]
| [dD][zZ]
| [eE][cC]
| [eE][dD][uU]
| [eE][eE]
| [eE][gG]
| [eE][rR]
| [eE][sS]
| [eE][tT]
| [eE][uU]
| [fF][iI]
| [fF][jJ]
| [fF][kK]
| [fF][mM]
| [fF][oO]
| [fF][rR]
| [gG][aA]
| [gG][bB]
| [gG][dD]
| [gG][eE]
| [gG][fF]
| [gG][gG]
| [gG][hH]
| [gG][iI]
| [gG][lL]
| [gG][mM]
| [gG][nN]
| [gG][oO][vV]
| [gG][pP]
| [gG][qQ]
| [gG][rR]
| [gG][sS]
| [gG][tT]
| [gG][uU]
| [gG][wW]
| [gG][yY]
| [hH][kK]
| [hH][mM]
| [hH][nN]
| [hH][rR]
| [hH][tT]
| [hH][uU]
| [iI][dD]
| [iI][eE]
| [iI][lL]
| [iI][mM]
| [iI][nN]
| [iI][nN][fF][oO]
| [iI][nN][tT]
| [iI][oO]
| [iI][qQ]
| [iI][rR]
| [iI][sS]
| [iI][tT]
| [jJ][eE]
| [jJ][mM]
| [jJ][oO]
| [jJ][oO][bB][sS]
| [jJ][pP]
| [kK][eE]
| [kK][gG]
| [kK][hH]
| [kK][iI]
| [kK][mM]
| [kK][nN]
| [kK][pP]
| [kK][rR]
| [kK][wW]
| [kK][yY]
| [kK][zZ]
| [lL][aA]
| [lL][bB]
| [lL][cC]
| [lL][iI]
| [lL][kK]
| [lL][rR]
| [lL][sS]
| [lL][tT]
| [lL][uU]
| [lL][vV]
| [lL][yY]
| [mM][aA]
| [mM][cC]
| [mM][dD]
| [mM][eE]
| [mM][gG]
| [mM][hH]
| [mM][iI][lL]
| [mM][kK]
| [mM][lL]
| [mM][mM]
| [mM][nN]
| [mM][oO]
| [mM][oO][bB][iI]
| [mM][pP]
| [mM][qQ]
| [mM][rR]
| [mM][sS]
| [mM][tT]
| [mM][uU]
| [mM][uU][sS][eE][uU][mM]
| [mM][vV]
| [mM][wW]
| [mM][xX]
| [mM][yY]
| [mM][zZ]
| [nN][aA]
| [nN][aA][mM][eE]
| [nN][cC]
| [nN][eE]
| [nN][eE][tT]
| [nN][fF]
| [nN][gG]
| [nN][iI]
| [nN][lL]
| [nN][oO]
| [nN][pP]
| [nN][rR]
| [nN][uU]
| [nN][zZ]
| [oO][mM]
| [oO][rR][gG]
| [pP][aA]
| [pP][eE]
| [pP][fF]
| [pP][gG]
| [pP][hH]
| [pP][kK]
| [pP][lL]
| [pP][mM]
| [pP][nN]
| [pP][rR]
| [pP][rR][oO]
| [pP][sS]
| [pP][tT]
| [pP][wW]
| [pP][yY]
| [qQ][aA]
| [rR][eE]
| [rR][oO]
| [rR][sS]
| [rR][uU]
| [rR][wW]
| [sS][aA]
| [sS][bB]
| [sS][cC]
| [sS][dD]
| [sS][eE]
| [sS][gG]
| [sS][hH]
| [sS][iI]
| [sS][jJ]
| [sS][kK]
| [sS][lL]
| [sS][mM]
| [sS][nN]
| [sS][oO]
| [sS][rR]
| [sS][tT]
| [sS][uU]
| [sS][vV]
| [sS][yY]
| [sS][zZ]
| [tT][cC]
| [tT][dD]
| [tT][eE][lL]
| [tT][fF]
| [tT][gG]
| [tT][hH]
| [tT][jJ]
| [tT][kK]
| [tT][lL]
| [tT][mM]
| [tT][nN]
| [tT][oO]
| [tT][pP]
| [tT][rR]
| [tT][rR][aA][vV][eE][lL]
| [tT][tT]
| [tT][vV]
| [tT][wW]
| [tT][zZ]
| [uU][aA]
| [uU][gG]
| [uU][kK]
| [uU][sS]
| [uU][yY]
| [uU][zZ]
| [vV][aA]
| [vV][cC]
| [vV][eE]
| [vV][gG]
| [vV][iI]
| [vV][nN]
| [vV][uU]
| [wW][fF]
| [wW][sS]
| [xX][nN]--0[zZ][wW][mM]56[dD]
| [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
| [xX][nN]--3[eE]0[bB]707[eE]
| [xX][nN]--45[bB][rR][jJ]9[cC]
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
| [xX][nN]--90[aA]3[aA][cC]
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
| [xX][nN]--[fF][iI][qQ][sS]8[sS]
| [xX][nN]--[fF][iI][qQ][zZ]9[sS]
| [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
| [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
| [xX][nN]--[gG]6[wW]251[dD]
| [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
| [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
| [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
| [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
| [xX][nN]--[jJ]6[wW]193[gG]
| [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
| [xX][nN]--[mM][gG][bB][cC]0[aA]9[aA][zZ][cC][gG]
| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
| [xX][nN]--[oO]3[cC][wW]4[hH]
| [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
| [xX][nN]--[pP]1[aA][iI]
| [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
| [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
| [xX][nN]--[wW][gG][bB][hH]1[cC]
| [xX][nN]--[wW][gG][bB][lL]6[aA]
| [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
| [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
| [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
| [xX][xX][xX]
| [yY][eE]
| [yY][tT]
| [zZ][aA]
| [zZ][mM]
| [zZ][wW]
) "."? // Accept trailing root (empty) domain

View File

@@ -1,125 +0,0 @@
/*
* Copyright 2010 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 4.8.0.0 on Friday, September 30, 2011 4:10:42 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
ALetterSupp = (
([\ud80d][\uDC00-\uDC2E])
| ([\ud80c][\uDC00-\uDFFF])
| ([\ud809][\uDC00-\uDC62])
| ([\ud808][\uDC00-\uDF6E])
| ([\ud81a][\uDC00-\uDE38])
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
| ([\ud801][\uDC00-\uDC9D])
| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
| ([\ud803][\uDC00-\uDC48])
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
)
FormatSupp = (
([\ud804][\uDCBD])
| ([\ud834][\uDD73-\uDD7A])
| ([\udb40][\uDC01\uDC20-\uDC7F])
)
ExtendSupp = (
([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
| ([\ud800][\uDDFD])
| ([\udb40][\uDD00-\uDDEF])
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
NumericSupp = (
([\ud804][\uDC66-\uDC6F])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)
KatakanaSupp = (
([\ud82c][\uDC00])
)
MidLetterSupp = (
[]
)
MidNumSupp = (
[]
)
MidNumLetSupp = (
[]
)
ExtendNumLetSupp = (
[]
)
ComplexContextSupp = (
[]
)
HanSupp = (
([\ud87e][\uDC00-\uDE1D])
| ([\ud86b][\uDC00-\uDFFF])
| ([\ud86a][\uDC00-\uDFFF])
| ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
| ([\ud868][\uDC00-\uDFFF])
| ([\ud86e][\uDC00-\uDC1D])
| ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
| ([\ud86c][\uDC00-\uDFFF])
| ([\ud863][\uDC00-\uDFFF])
| ([\ud862][\uDC00-\uDFFF])
| ([\ud861][\uDC00-\uDFFF])
| ([\ud860][\uDC00-\uDFFF])
| ([\ud867][\uDC00-\uDFFF])
| ([\ud866][\uDC00-\uDFFF])
| ([\ud865][\uDC00-\uDFFF])
| ([\ud864][\uDC00-\uDFFF])
| ([\ud858][\uDC00-\uDFFF])
| ([\ud859][\uDC00-\uDFFF])
| ([\ud85a][\uDC00-\uDFFF])
| ([\ud85b][\uDC00-\uDFFF])
| ([\ud85c][\uDC00-\uDFFF])
| ([\ud85d][\uDC00-\uDFFF])
| ([\ud85e][\uDC00-\uDFFF])
| ([\ud85f][\uDC00-\uDFFF])
| ([\ud850][\uDC00-\uDFFF])
| ([\ud851][\uDC00-\uDFFF])
| ([\ud852][\uDC00-\uDFFF])
| ([\ud853][\uDC00-\uDFFF])
| ([\ud854][\uDC00-\uDFFF])
| ([\ud855][\uDC00-\uDFFF])
| ([\ud856][\uDC00-\uDFFF])
| ([\ud857][\uDC00-\uDFFF])
| ([\ud849][\uDC00-\uDFFF])
| ([\ud848][\uDC00-\uDFFF])
| ([\ud84b][\uDC00-\uDFFF])
| ([\ud84a][\uDC00-\uDFFF])
| ([\ud84d][\uDC00-\uDFFF])
| ([\ud84c][\uDC00-\uDFFF])
| ([\ud84f][\uDC00-\uDFFF])
| ([\ud84e][\uDC00-\uDFFF])
| ([\ud841][\uDC00-\uDFFF])
| ([\ud840][\uDC00-\uDFFF])
| ([\ud843][\uDC00-\uDFFF])
| ([\ud842][\uDC00-\uDFFF])
| ([\ud845][\uDC00-\uDFFF])
| ([\ud844][\uDC00-\uDFFF])
| ([\ud847][\uDC00-\uDFFF])
| ([\ud846][\uDC00-\uDFFF])
)
HiraganaSupp = (
([\ud83c][\uDE00])
| ([\ud82c][\uDC01])
)

View File

@@ -1,272 +0,0 @@
package org.apache.lucene.analysis.standard.std34;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class implements UAX29URLEmailTokenizer, except with a bug
* (https://issues.apache.org/jira/browse/LUCENE-3880) where a "mailto:"
* URI scheme prepended to an email address disrupts recognition
* of the email address.
* @deprecated This class is only for exact backwards compatibility
*/
@Deprecated
%%
%unicode 6.0
%integer
%final
%public
%class UAX29URLEmailTokenizerImpl34
%implements StandardTokenizerInterface
%function getNextToken
%char
%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
// Script=Hangul & Aletter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
// URL and E-mail syntax specifications:
//
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
// RFC-1123: Requirements for Internet Hosts - Application and Support
// RFC-1738: Uniform Resource Locators (URL)
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
// RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
URL = {HTTPurl} | {FTPurl} | {FILEurl}
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%{
/** Alphanumeric sequences */
public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;
/** Numbers */
public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
* together as a single token rather than broken up, because the logic
* required to break them at word boundaries is too complex for UAX#29.
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
public final int yychar()
{
return yychar;
}
/**
* Fills CharTermAttribute with the current token text.
*/
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
{URL} { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
{HangulEx}+
{ return HANGUL_TYPE; }
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together. This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
// http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//
{HanEx} { return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

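The grammar removed above existed only to reproduce the LUCENE-3880 "mailto:" bug for exact backwards compatibility. A minimal sketch of the behavior the current tokenizer is expected to have instead, assuming the usual BaseTokenStreamTestCase helpers (the expected tokens are an assumption about the fixed grammar, not taken from this commit):

import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;

public class TestMailtoSketch extends BaseTokenStreamTestCase {
  // Hedged sketch: the expected tokens assume the post-LUCENE-3880 grammar,
  // where the "mailto:" scheme no longer breaks up the e-mail address.
  public void testMailtoIsNotSplit() throws Exception {
    Tokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT,
        new StringReader("mailto:test@example.org"));
    assertTokenStreamContents(tokenizer,
        new String[] { "mailto", "test@example.org" });
  }
}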
View File

@@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Backwards-compatible implementation to match {@link org.apache.lucene.util.Version#LUCENE_34}
</body>
</html>

View File

@@ -33,13 +33,6 @@ import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Thai language. It uses {@link java.text.BreakIterator} to break words.
* <p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating ThaiAnalyzer:
* <ul>
* <li> As of 3.6, a set of Thai stopwords is used by default
* </ul>
*/
public final class ThaiAnalyzer extends StopwordAnalyzerBase {
@@ -84,7 +77,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version
*/
public ThaiAnalyzer(Version matchVersion) {
this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_36) ? DefaultSetHolder.DEFAULT_STOP_SET : StopAnalyzer.ENGLISH_STOP_WORDS_SET);
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
@@ -112,8 +105,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new LowerCaseFilter(matchVersion, result);
result = new LowerCaseFilter(matchVersion, result);
result = new ThaiWordFilter(matchVersion, result);
return new TokenStreamComponents(source, new StopFilter(matchVersion,
result, stopwords));

View File

@@ -23,7 +23,6 @@ import java.util.Locale;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -34,10 +33,6 @@ import org.apache.lucene.util.Version;
/**
* {@link TokenFilter} that uses {@link java.text.BreakIterator} to break each
* Token that is Thai into separate Token(s) for each Thai word.
* <p>Please note: Since matchVersion 3.1 on, this filter no longer lowercases non-thai text.
* {@link ThaiAnalyzer} will insert a {@link LowerCaseFilter} before this filter
* so the behaviour of the Analyzer does not change. With version 3.1, the filter handles
* position increments correctly.
* <p>WARNING: this filter may not be supported by all JREs.
* It is known to work with Sun/Oracle and Harmony JREs.
* If your application needs to be fully portable, consider using ICUTokenizer instead,
@@ -58,8 +53,6 @@ public final class ThaiWordFilter extends TokenFilter {
private final BreakIterator breaker = (BreakIterator) proto.clone();
private final CharArrayIterator charIterator = CharArrayIterator.newWordInstance();
private final boolean handlePosIncr;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
@@ -72,11 +65,9 @@ public final class ThaiWordFilter extends TokenFilter {
/** Creates a new ThaiWordFilter with the specified match version. */
public ThaiWordFilter(Version matchVersion, TokenStream input) {
super(matchVersion.onOrAfter(Version.LUCENE_31) ?
input : new LowerCaseFilter(matchVersion, input));
super(input);
if (!DBBI_AVAILABLE)
throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
}
@Override
@@ -92,7 +83,7 @@ public final class ThaiWordFilter extends TokenFilter {
} else {
offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
}
if (handlePosIncr) posAtt.setPositionIncrement(1);
posAtt.setPositionIncrement(1);
return true;
}
hasMoreTokensInClone = false;

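With the version checks gone, ThaiAnalyzer unconditionally builds StandardTokenizer, StandardFilter, LowerCaseFilter, ThaiWordFilter and StopFilter, and ThaiWordFilter always handles position increments. A small usage sketch (the Thai sample string and field name are illustrative only):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ThaiAnalyzerSketch {
  public static void main(String[] args) throws Exception {
    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_40);
    TokenStream stream = analyzer.tokenStream("body",
        new StringReader("การที่ได้ต้องแสดงว่างานดี"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      // one Thai word per token, lowercased, with Thai stopwords removed
      System.out.println(term.toString());
    }
    stream.end();
    stream.close();
  }
}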
View File

@@ -30,40 +30,6 @@ import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
/**
* An abstract base class for simple, character-oriented tokenizers.
* <p>
* <a name="version">You must specify the required {@link Version} compatibility
* when creating {@link CharTokenizer}:
* <ul>
* <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
* detect token codepoints. See {@link #isTokenChar(int)} and
* {@link #normalize(int)} for details.</li>
* </ul>
* <p>
* A new {@link CharTokenizer} API has been introduced with Lucene 3.1. This API
* moved from UTF-16 code units to UTF-32 codepoints to eventually add support
* for <a href=
* "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
* >supplementary characters</a>. The old <i>char</i> based API has been
* deprecated and should be replaced with the <i>int</i> based methods
* {@link #isTokenChar(int)} and {@link #normalize(int)}.
* </p>
* <p>
* As of Lucene 3.1 each {@link CharTokenizer} - constructor expects a
* {@link Version} argument. Based on the given {@link Version} either the new
* API or a backwards compatibility layer is used at runtime. For
* {@link Version} < 3.1 the backwards compatibility layer ensures correct
* behavior even for indexes build with previous versions of Lucene. If a
* {@link Version} >= 3.1 is used {@link CharTokenizer} requires the new API to
* be implemented by the instantiated class. Yet, the old <i>char</i> based API
* is not required anymore even if backwards compatibility must be preserved.
* {@link CharTokenizer} subclasses implementing the new API are fully backwards
* compatible if instantiated with {@link Version} < 3.1.
* </p>
* <p>
* <strong>Note:</strong> If you use a subclass of {@link CharTokenizer} with {@link Version} >=
* 3.1 on an index build with a version < 3.1, created tokens might not be
* compatible with the terms in your index.
* </p>
**/
public abstract class CharTokenizer extends Tokenizer {
@@ -71,7 +37,7 @@ public abstract class CharTokenizer extends Tokenizer {
* Creates a new {@link CharTokenizer} instance
*
* @param matchVersion
* Lucene version to match See {@link <a href="#version">above</a>}
* Lucene version to match
* @param input
* the input to split up into tokens
*/
@@ -84,7 +50,7 @@ public abstract class CharTokenizer extends Tokenizer {
* Creates a new {@link CharTokenizer} instance
*
* @param matchVersion
* Lucene version to match See {@link <a href="#version">above</a>}
* Lucene version to match
* @param source
* the attribute source to use for this {@link Tokenizer}
* @param input
@@ -100,7 +66,7 @@ public abstract class CharTokenizer extends Tokenizer {
* Creates a new {@link CharTokenizer} instance
*
* @param matchVersion
* Lucene version to match See {@link <a href="#version">above</a>}
* Lucene version to match
* @param factory
* the attribute factory to use for this {@link Tokenizer}
* @param input

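After this change every CharTokenizer subclass implements only the int (codepoint) based API. A minimal sketch of a custom subclass, with hypothetical names, relying only on the isTokenChar(int)/normalize(int) contract described above:

import java.io.Reader;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;

/** Hypothetical tokenizer: emits runs of letters or digits, lowercased. */
public final class LetterOrDigitTokenizer extends CharTokenizer {
  public LetterOrDigitTokenizer(Version matchVersion, Reader in) {
    super(matchVersion, in);
  }

  @Override
  protected boolean isTokenChar(int c) {
    // codepoint-based check, so supplementary characters are handled correctly
    return Character.isLetterOrDigit(c);
  }

  @Override
  protected int normalize(int c) {
    return Character.toLowerCase(c);
  }
}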
View File

@@ -43,7 +43,12 @@ public abstract class CharacterUtils {
* {@link Version} instance.
*/
public static CharacterUtils getInstance(final Version matchVersion) {
return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4;
return JAVA_5;
}
/** explicitly returns a version matching java 4 semantics */
public static CharacterUtils getJava4Instance() {
return JAVA_4;
}
/**

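getInstance(Version) now always returns the Java 5 (codepoint-aware) implementation, while getJava4Instance() keeps the old UTF-16 code-unit semantics available explicitly. A hedged sketch of the difference on a supplementary character (the codePointAt call and the printed values are assumed for illustration):

import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

public class CharacterUtilsSketch {
  public static void main(String[] args) {
    String s = "\uD801\uDC1C"; // one supplementary codepoint, two UTF-16 code units
    CharacterUtils java5 = CharacterUtils.getInstance(Version.LUCENE_40);
    CharacterUtils java4 = CharacterUtils.getJava4Instance();
    // java5 reads the surrogate pair as a single codepoint; java4 sees only the first code unit
    System.out.println(Integer.toHexString(java5.codePointAt(s, 0))); // 1041c (assumed)
    System.out.println(Integer.toHexString(java4.codePointAt(s, 0))); // d801 (assumed)
  }
}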
View File

@@ -98,7 +98,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
Reader reader = null;
try {
reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), IOUtils.CHARSET_UTF_8);
return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_31, 16, ignoreCase));
return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_CURRENT, 16, ignoreCase));
} finally {
IOUtils.close(reader);
}

View File

@@ -20,7 +20,6 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link
import org.apache.lucene.util.Version;
import java.text.Collator;
@@ -28,12 +27,11 @@ import java.io.Reader;
/**
* <p>
* Filters {@link KeywordTokenizer} with {@link CollationKeyFilter}.
* Configures {@link KeywordTokenizer} with {@link CollationAttributeFactory}.
* </p>
* <p>
* Converts the token into its {@link java.text.CollationKey}, and then
* encodes the CollationKey either directly or with
* {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow
* encodes the CollationKey directly to allow
* it to be stored as an index term.
* </p>
* <p>
@@ -74,49 +72,24 @@ import java.io.Reader;
* CollationKeyAnalyzer to generate index terms, do not use
* ICUCollationKeyAnalyzer on the query side, or vice versa.
* </p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating CollationKeyAnalyzer:
* <ul>
* <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
* versions will encode the bytes with {@link IndexableBinaryStringTools}.
* </ul>
*/
public final class CollationKeyAnalyzer extends Analyzer {
private final Collator collator;
private final CollationAttributeFactory factory;
private final Version matchVersion;
/**
* Create a new CollationKeyAnalyzer, using the specified collator.
*
* @param matchVersion See <a href="#version">above</a>
* @param matchVersion compatibility version
* @param collator CollationKey generator
*/
public CollationKeyAnalyzer(Version matchVersion, Collator collator) {
this.matchVersion = matchVersion;
this.collator = collator;
this.factory = new CollationAttributeFactory(collator);
}
/**
* @deprecated Use {@link CollationKeyAnalyzer#CollationKeyAnalyzer(Version, Collator)}
* and specify a version instead. This ctor will be removed in Lucene 5.0
*/
@Deprecated
public CollationKeyAnalyzer(Collator collator) {
this(Version.LUCENE_31, collator);
}
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_40)) {
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
return new TokenStreamComponents(tokenizer, tokenizer);
} else {
KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator));
}
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
return new TokenStreamComponents(tokenizer, tokenizer);
}
}

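CollationKeyAnalyzer now always wraps a KeywordTokenizer with the CollationAttributeFactory, so each value becomes a single token whose term bytes are the raw CollationKey. A construction sketch (the locale and helper name are illustrative only):

import java.text.Collator;
import java.util.Locale;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.util.Version;

public class CollationKeyAnalyzerSketch {
  /** Hypothetical factory for a German-collated keyword field. */
  public static CollationKeyAnalyzer newGermanCollationAnalyzer() {
    // The same Collator (JVM vendor, JVM version, locale, strength) must be
    // used at index and query time, as the javadoc above warns.
    Collator collator = Collator.getInstance(new Locale("de", "DE"));
    return new CollationKeyAnalyzer(Version.LUCENE_40, collator);
  }
}

The returned analyzer would typically be set on the sort/range field at index time and reused unchanged on the query side.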
View File

@@ -1,108 +0,0 @@
package org.apache.lucene.collation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.IndexableBinaryStringTools;
import java.io.IOException;
import java.text.Collator;
/**
* <p>
* Converts each token into its {@link java.text.CollationKey}, and then
* encodes the CollationKey with {@link IndexableBinaryStringTools}, to allow
* it to be stored as an index term.
* </p>
* <p>
* <strong>WARNING:</strong> Make sure you use exactly the same Collator at
* index and query time -- CollationKeys are only comparable when produced by
* the same Collator. Since {@link java.text.RuleBasedCollator}s are not
* independently versioned, it is unsafe to search against stored
* CollationKeys unless the following are exactly the same (best practice is
* to store this information with the index and check that they remain the
* same at query time):
* </p>
* <ol>
* <li>JVM vendor</li>
* <li>JVM version, including patch version</li>
* <li>
* The language (and country and variant, if specified) of the Locale
* used when constructing the collator via
* {@link Collator#getInstance(java.util.Locale)}.
* </li>
* <li>
* The collation strength used - see {@link Collator#setStrength(int)}
* </li>
* </ol>
* <p>
* The <code>ICUCollationKeyFilter</code> in the analysis-icu package
* uses ICU4J's Collator, which makes its
* version available, thus allowing collation to be versioned independently
* from the JVM. ICUCollationKeyFilter is also significantly faster and
* generates significantly shorter keys than CollationKeyFilter. See
* <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
* >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
* <p>
* CollationKeys generated by java.text.Collators are not compatible
* with those generated by ICU Collators. Specifically, if you use
* CollationKeyFilter to generate index terms, do not use
* ICUCollationKeyFilter on the query side, or vice versa.
* </p>
* @deprecated Use {@link CollationAttributeFactory} instead, which encodes
* terms directly as bytes. This filter will be removed in Lucene 5.0
*/
@Deprecated
public final class CollationKeyFilter extends TokenFilter {
private final Collator collator;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
* @param input Source token stream
* @param collator CollationKey generator
*/
public CollationKeyFilter(TokenStream input, Collator collator) {
super(input);
// clone in case JRE doesn't properly sync,
// or to reduce contention in case they do
this.collator = (Collator) collator.clone();
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
byte[] collationKey = collator.getCollationKey(termAtt.toString()).toByteArray();
int encodedLength = IndexableBinaryStringTools.getEncodedLength(
collationKey, 0, collationKey.length);
termAtt.resizeBuffer(encodedLength);
termAtt.setLength(encodedLength);
IndexableBinaryStringTools.encode(collationKey, 0, collationKey.length,
termAtt.buffer(), 0, encodedLength);
return true;
} else {
return false;
}
}
}

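For reference, the encoding step the removed filter performed can be reproduced in isolation; this sketch uses the same IndexableBinaryStringTools calls shown above on an arbitrary input string, only to illustrate what the old char-encoded index terms contained:

import java.text.Collator;
import java.util.Locale;
import org.apache.lucene.util.IndexableBinaryStringTools;

public class BinaryCollationKeySketch {
  public static void main(String[] args) {
    byte[] key = Collator.getInstance(Locale.ENGLISH)
        .getCollationKey("Lucene").toByteArray();
    // encode the raw collation key bytes into chars, as CollationKeyFilter did
    int encodedLength = IndexableBinaryStringTools.getEncodedLength(key, 0, key.length);
    char[] encoded = new char[encodedLength];
    IndexableBinaryStringTools.encode(key, 0, key.length, encoded, 0, encodedLength);
    System.out.println(new String(encoded)); // the term the old filter would have indexed
  }
}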
View File

@@ -1,47 +0,0 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
/**
* Testcase for {@link ArabicLetterTokenizer}
* @deprecated (3.1) Remove in Lucene 5.0
*/
@Deprecated
public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase {
public void testArabicLetterTokenizer() throws IOException {
StringReader reader = new StringReader("1234567890 Tokenizer \ud801\udc1c\u0300test");
ArabicLetterTokenizer tokenizer = new ArabicLetterTokenizer(Version.LUCENE_31,
reader);
assertTokenStreamContents(tokenizer, new String[] {"Tokenizer",
"\ud801\udc1c\u0300test"});
}
public void testArabicLetterTokenizerBWCompat() throws IOException {
StringReader reader = new StringReader("1234567890 Tokenizer \ud801\udc1c\u0300test");
ArabicLetterTokenizer tokenizer = new ArabicLetterTokenizer(Version.LUCENE_30,
reader);
assertTokenStreamContents(tokenizer, new String[] {"Tokenizer", "\u0300test"});
}
}

View File

@@ -23,6 +23,7 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
@@ -88,7 +89,7 @@ public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase {
}
private void check(final String input, final String expected) throws IOException {
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
MockTokenizer tokenStream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
assertTokenStreamContents(filter, new String[]{expected});
}

View File

@@ -23,6 +23,7 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
@@ -121,14 +122,14 @@ public class TestArabicStemFilter extends BaseTokenStreamTestCase {
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("ساهدهات");
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT, new StringReader("ساهدهات"));
MockTokenizer tokenStream = new MockTokenizer(new StringReader("ساهدهات"), MockTokenizer.WHITESPACE, false);
ArabicStemFilter filter = new ArabicStemFilter(new KeywordMarkerFilter(tokenStream, set));
assertTokenStreamContents(filter, new String[]{"ساهدهات"});
}
private void check(final String input, final String expected) throws IOException {
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
MockTokenizer tokenStream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
assertTokenStreamContents(filter, new String[]{expected});
}

View File

@@ -68,7 +68,7 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
}
public void testWithStemExclusionSet() throws IOException {
CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("строеве");
Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesTo(a, "строевете строеве", new String[] { "строй", "строеве" });

View File

@@ -217,7 +217,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
}
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("строеве");
MockTokenizer tokenStream = new MockTokenizer(new StringReader("строевете строеве"), MockTokenizer.WHITESPACE, false);

View File

@@ -1,281 +0,0 @@
package org.apache.lucene.analysis.cjk;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.Version;
/** @deprecated Remove when CJKTokenizer is removed (5.0) */
@Deprecated
public class TestCJKTokenizer extends BaseTokenStreamTestCase {
class TestToken {
String termText;
int start;
int end;
String type;
}
public TestToken newToken(String termText, int start, int end, int type) {
TestToken token = new TestToken();
token.termText = termText;
token.type = CJKTokenizer.TOKEN_TYPE_NAMES[type];
token.start = start;
token.end = end;
return token;
}
public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
String terms[] = new String[out_tokens.length];
int startOffsets[] = new int[out_tokens.length];
int endOffsets[] = new int[out_tokens.length];
String types[] = new String[out_tokens.length];
for (int i = 0; i < out_tokens.length; i++) {
terms[i] = out_tokens[i].termText;
startOffsets[i] = out_tokens[i].start;
endOffsets[i] = out_tokens[i].end;
types[i] = out_tokens[i].type;
}
assertAnalyzesTo(analyzer, str, terms, startOffsets, endOffsets, types, null);
}
public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
String terms[] = new String[out_tokens.length];
int startOffsets[] = new int[out_tokens.length];
int endOffsets[] = new int[out_tokens.length];
String types[] = new String[out_tokens.length];
for (int i = 0; i < out_tokens.length; i++) {
terms[i] = out_tokens[i].termText;
startOffsets[i] = out_tokens[i].start;
endOffsets[i] = out_tokens[i].end;
types[i] = out_tokens[i].type;
}
assertAnalyzesToReuse(analyzer, str, terms, startOffsets, endOffsets, types, null);
}
public void testJa1() throws IOException {
String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
TestToken[] out_tokens = {
newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u56db\u4e94", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e94\u516d", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u516d\u4e03", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e03\u516b", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u516b\u4e5d", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e5d\u5341", 8,10, CJKTokenizer.DOUBLE_TOKEN_TYPE)
};
checkCJKToken(str, out_tokens);
}
public void testJa2() throws IOException {
String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";
TestToken[] out_tokens = {
newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e94\u516d", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u516d\u4e03", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e03\u516b", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u516b\u4e5d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u5341", 12,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
};
checkCJKToken(str, out_tokens);
}
public void testC() throws IOException {
String str = "abc defgh ijklmn opqrstu vwxy z";
TestToken[] out_tokens = {
newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("opqrstu", 17, 24, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("vwxy", 25, 29, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("z", 30, 31, CJKTokenizer.SINGLE_TOKEN_TYPE),
};
checkCJKToken(str, out_tokens);
}
public void testMix() throws IOException {
String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
TestToken[] out_tokens = {
newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
};
checkCJKToken(str, out_tokens);
}
public void testMix2() throws IOException {
String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
TestToken[] out_tokens = {
newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
};
checkCJKToken(str, out_tokens);
}
public void testSingleChar() throws IOException {
String str = "\u4e00";
TestToken[] out_tokens = {
newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
};
checkCJKToken(str, out_tokens);
}
/*
* Full-width text is normalized to half-width
*/
public void testFullWidth() throws Exception {
String str = "ｔｅｓｔ　１２３４";
TestToken[] out_tokens = {
newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("1234", 5, 9, CJKTokenizer.SINGLE_TOKEN_TYPE)
};
checkCJKToken(str, out_tokens);
}
/*
* Non-english text (not just CJK) is treated the same as CJK: C1C2 C2C3
*/
public void testNonIdeographic() throws Exception {
String str = "\u4e00 روبرت موير";
TestToken[] out_tokens = {
newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("رو", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("وب", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("بر", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("رت", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("مو", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("وي", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("ير", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE)
};
checkCJKToken(str, out_tokens);
}
/*
* Non-English text with non-letters (non-spacing marks, etc.) is treated as C1C2 C2C3,
* except that words are split around non-letters.
*/
public void testNonIdeographicNonLetter() throws Exception {
String str = "\u4e00 رُوبرت موير";
TestToken[] out_tokens = {
newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("ر", 2, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("وب", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("بر", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("رت", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("مو", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("وي", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("ير", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
};
checkCJKToken(str, out_tokens);
}
public void testTokenStream() throws Exception {
Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
}
public void testReusableTokenStream() throws Exception {
Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
TestToken[] out_tokens = {
newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
};
checkCJKTokenReusable(analyzer, str, out_tokens);
str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
TestToken[] out_tokens2 = {
newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
};
checkCJKTokenReusable(analyzer, str, out_tokens2);
}
/**
* LUCENE-2207: wrong offset calculated by end()
*/
public void testFinalOffset() throws IOException {
checkCJKToken("あい", new TestToken[] {
newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
checkCJKToken("あい ", new TestToken[] {
newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
checkCJKToken("test", new TestToken[] {
newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
checkCJKToken("test ", new TestToken[] {
newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
checkCJKToken("あいtest", new TestToken[] {
newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("test", 2, 6, CJKTokenizer.SINGLE_TOKEN_TYPE) });
checkCJKToken("testあい ", new TestToken[] {
newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("あい", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new CJKAnalyzer(Version.LUCENE_30), 10000*RANDOM_MULTIPLIER);
}
}

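The removed test targets the deprecated CJKTokenizer, but the overlapping C1C2/C2C3 bigrams it checks are the same shape the current CJKAnalyzer produces via StandardTokenizer plus CJKBigramFilter. A minimal sketch, assuming the current analyzer still bigrams a run of three Han characters the same way:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;

public class TestCJKBigramSketch extends BaseTokenStreamTestCase {
  public void testHanBigrams() throws Exception {
    Analyzer a = new CJKAnalyzer(TEST_VERSION_CURRENT);
    // "一二三" is emitted as the overlapping bigrams "一二" and "二三"
    assertAnalyzesTo(a, "\u4e00\u4e8c\u4e09",
        new String[] { "\u4e00\u4e8c", "\u4e8c\u4e09" });
  }
}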
View File

@@ -1,126 +0,0 @@
package org.apache.lucene.analysis.cn;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
/** @deprecated Remove this test when ChineseAnalyzer is removed. */
@Deprecated
public class TestChineseTokenizer extends BaseTokenStreamTestCase
{
public void testOtherLetterOffset() throws IOException
{
String s = "a天b";
ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
int correctStartOffset = 0;
int correctEndOffset = 1;
OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
while (tokenizer.incrementToken()) {
assertEquals(correctStartOffset, offsetAtt.startOffset());
assertEquals(correctEndOffset, offsetAtt.endOffset());
correctStartOffset++;
correctEndOffset++;
}
}
public void testReusableTokenStream() throws Exception
{
Analyzer a = new ChineseAnalyzer();
assertAnalyzesToReuse(a, "中华人民共和国",
new String[] { "中", "华", "人", "民", "共", "和", "国" },
new int[] { 0, 1, 2, 3, 4, 5, 6 },
new int[] { 1, 2, 3, 4, 5, 6, 7 });
assertAnalyzesToReuse(a, "北京市",
new String[] { "北", "京", "市" },
new int[] { 0, 1, 2 },
new int[] { 1, 2, 3 });
}
/*
* Analyzer that just uses ChineseTokenizer, not ChineseFilter.
* convenience to show the behavior of the tokenizer
*/
private class JustChineseTokenizerAnalyzer extends Analyzer {
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new ChineseTokenizer(reader));
}
}
/*
* Analyzer that just uses ChineseFilter, not ChineseTokenizer.
* convenience to show the behavior of the filter.
*/
private class JustChineseFilterAnalyzer extends Analyzer {
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
return new TokenStreamComponents(tokenizer, new ChineseFilter(tokenizer));
}
}
/*
* ChineseTokenizer tokenizes numbers as one token, but they are filtered by ChineseFilter
*/
public void testNumerics() throws Exception
{
Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
assertAnalyzesTo(justTokenizer, "中1234", new String[] { "中", "1234" });
// in this case the ChineseAnalyzer (which applies ChineseFilter) will remove the numeric token.
Analyzer a = new ChineseAnalyzer();
assertAnalyzesTo(a, "中1234", new String[] { "中" });
}
/*
* ChineseTokenizer tokenizes english similar to SimpleAnalyzer.
* it will lowercase terms automatically.
*
* ChineseFilter has an english stopword list, it also removes any single character tokens.
* the stopword list is case-sensitive.
*/
public void testEnglish() throws Exception
{
Analyzer chinese = new ChineseAnalyzer();
assertAnalyzesTo(chinese, "This is a Test. b c d",
new String[] { "test" });
Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
assertAnalyzesTo(justTokenizer, "This is a Test. b c d",
new String[] { "this", "is", "a", "test", "b", "c", "d" });
Analyzer justFilter = new JustChineseFilterAnalyzer();
assertAnalyzesTo(justFilter, "This is a Test. b c d",
new String[] { "This", "Test." });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new ChineseAnalyzer(), 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -27,7 +27,6 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Version;
public class TestAnalyzers extends BaseTokenStreamTestCase {
@@ -182,15 +181,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
"\ud801\udc44test" });
}
/** @deprecated (3.1) */
@Deprecated
public void testLowerCaseTokenizerBWCompat() throws IOException {
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_30,
reader);
assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test" });
}
public void testWhitespaceTokenizer() throws IOException {
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
@@ -198,16 +188,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
"\ud801\udc1ctest" });
}
/** @deprecated (3.1) */
@Deprecated
public void testWhitespaceTokenizerBWCompat() throws IOException {
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_30,
reader);
assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
"\ud801\udc1ctest" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {

View File

@@ -5,8 +5,8 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -15,7 +15,6 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.util.Arrays;
@@ -137,7 +136,7 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
// 2.4 should not show the bug. But, alas, it's also obsolete,
// so we check latest released (Robert's gonna break this on 4.0 soon :) )
a2 = new ClassicAnalyzer(Version.LUCENE_31);
a2 = new ClassicAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
}
@ -244,7 +243,7 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
}
public void testJava14BWCompatibility() throws Exception {
ClassicAnalyzer sa = new ClassicAnalyzer(Version.LUCENE_30);
ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
}
@ -272,7 +271,7 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
writer.addDocument(doc);
writer.close();
IndexReader reader = IndexReader.open(dir);
IndexReader reader = DirectoryReader.open(dir);
// Make sure all terms < max size were indexed
assertEquals(2, reader.docFreq(new Term("content", "abc")));
@ -306,7 +305,7 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
writer.addDocument(doc);
writer.close();
reader = IndexReader.open(dir);
reader = DirectoryReader.open(dir);
assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
reader.close();

View File

@ -26,6 +26,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
@ -58,7 +59,7 @@ public class TestKeywordAnalyzer extends BaseTokenStreamTestCase {
writer.close();
reader = IndexReader.open(directory);
reader = DirectoryReader.open(directory);
searcher = new IndexSearcher(reader);
}
@ -95,7 +96,7 @@ public class TestKeywordAnalyzer extends BaseTokenStreamTestCase {
writer.addDocument(doc);
writer.close();
IndexReader reader = IndexReader.open(dir);
IndexReader reader = DirectoryReader.open(dir);
DocsEnum td = _TestUtil.docs(random(),
reader,
"partnum",

View File

@ -230,16 +230,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
checkOneTerm(a, "壹゙", "壹゙"); // ideographic
checkOneTerm(a, "아゙", "아゙"); // hangul
}
/** @deprecated remove this and sophisticated backwards layer in 5.0 */
@Deprecated
public void testCombiningMarksBackwards() throws Exception {
Analyzer a = new StandardAnalyzer(Version.LUCENE_33);
checkOneTerm(a, "ざ", ""); // hiragana Bug
checkOneTerm(a, "ザ", "ザ"); // katakana Works
checkOneTerm(a, "壹゙", ""); // ideographic Bug
checkOneTerm(a, "아゙", "아゙"); // hangul Works
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {

View File

@ -209,16 +209,6 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
checkOneTerm(a, "壹゙", "壹゙"); // ideographic
checkOneTerm(a, "아゙", "아゙"); // hangul
}
/** @deprecated remove this and sophisticated backwards layer in 5.0 */
@Deprecated
public void testCombiningMarksBackwards() throws Exception {
Analyzer a = new UAX29URLEmailAnalyzer(Version.LUCENE_33);
checkOneTerm(a, "ざ", ""); // hiragana Bug
checkOneTerm(a, "ザ", "ザ"); // katakana Works
checkOneTerm(a, "壹゙", ""); // ideographic Bug
checkOneTerm(a, "아゙", "아゙"); // hangul Works
}
public void testBasicEmails() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a,

View File

@ -453,39 +453,6 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
checkOneTerm(a, "아゙", "아゙"); // hangul
}
/** @deprecated remove this and sophisticated backwards layer in 5.0 */
@Deprecated
public void testCombiningMarksBackwards() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents
(String fieldName, Reader reader) {
Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_31, reader);
return new TokenStreamComponents(tokenizer);
}
};
checkOneTerm(a, "ざ", ""); // hiragana Bug
checkOneTerm(a, "ザ", "ザ"); // katakana Works
checkOneTerm(a, "壹゙", ""); // ideographic Bug
checkOneTerm(a, "아゙", "아゙"); // hangul Works
}
// LUCENE-3880
/** @deprecated remove this and sophisticated backwards layer in 5.0 */
@Deprecated
public void testMailtoBackwards() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_34, reader);
return new TokenStreamComponents(tokenizer);
}
};
assertAnalyzesTo(a, "mailto:test@example.org",
new String[] { "mailto:test", "example.org" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);

View File

@@ -31,30 +31,12 @@ import org.apache.lucene.util.Version;
*
*/
public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
/**
* @deprecated (3.1) Remove this test when support for 3.0 indexes is no longer needed.
*/
@Deprecated
public void testStopWordLegacy() throws Exception {
assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_30), "Pokud mluvime o volnem",
new String[] { "mluvime", "volnem" });
}
public void testStopWord() throws Exception {
assertAnalyzesTo(new CzechAnalyzer(TEST_VERSION_CURRENT), "Pokud mluvime o volnem",
new String[] { "mluvim", "voln" });
}
/**
* @deprecated (3.1) Remove this test when support for 3.0 indexes is no longer needed.
*/
@Deprecated
public void testReusableTokenStreamLegacy() throws Exception {
Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_30);
assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
}
public void testReusableTokenStream() throws Exception {
Analyzer analyzer = new CzechAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" });

View File

@@ -25,7 +25,6 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
public void testReusableTokenStream() throws Exception {
@@ -58,10 +57,6 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
// a/o/u + e is equivalent to the umlaut form
checkOneTermReuse(a, "Schaltflächen", "schaltflach");
checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
// here they are with the old stemmer
a = new GermanAnalyzer(Version.LUCENE_30);
checkOneTermReuse(a, "Schaltflächen", "schaltflach");
checkOneTermReuse(a, "Schaltflaechen", "schaltflaech");
}
/** blast some random strings through the analyzer */

View File

@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.el;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
/**
* A unit test class for verifying the correct operation of the GreekAnalyzer.
@@ -47,29 +46,6 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" });
}
/**
* Test the analysis of various greek strings.
*
* @throws Exception in case an error occurs
* @deprecated (3.1) Remove this test when support for 3.0 is no longer needed
*/
@Deprecated
public void testAnalyzerBWCompat() throws Exception {
Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
// Verify the correct analysis of capitals and small accented letters
assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
new String[] { "μια", "εξαιρετικα", "καλη", "πλουσια", "σειρα", "χαρακτηρων",
"ελληνικησ", "γλωσσασ" });
// Verify the correct analysis of small letters with diaeresis and the elimination
// of punctuation marks
assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
new String[] { "προιοντα", "πολλαπλεσ", "αναγκεσ" });
// Verify the correct analysis of capital accented letters and capital letters with diaeresis,
// as well as the elimination of stop words
assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
new String[] { "προυποθεσεισ", "αψογοσ", "μεστοσ", "αλλοι" });
}
public void testReusableTokenStream() throws Exception {
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);

View File

@@ -23,8 +23,8 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
@@ -58,8 +58,7 @@ public class TestPersianNormalizationFilter extends BaseTokenStreamTestCase {
}
private void check(final String input, final String expected) throws IOException {
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT,
new StringReader(input));
MockTokenizer tokenStream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
PersianNormalizationFilter filter = new PersianNormalizationFilter(
tokenStream);
assertTokenStreamContents(filter, new String[]{expected});

View File

@@ -115,94 +115,6 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
}
/**
* @deprecated (3.1) remove this test for Lucene 5.0
*/
@Deprecated
public void testAnalyzer30() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(fa, "", new String[] {
});
assertAnalyzesTo(
fa,
"chien chat cheval",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
"chien CHAT CHEVAL",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
" chien ,? + = - CHAT /: > CHEVAL",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
assertAnalyzesTo(
fa,
"mot \"entreguillemet\"",
new String[] { "mot", "entreguillemet" });
// let's do some french specific tests now
/* 1. couldn't resist
I would expect this to stay one term as in French the minus
sign is often used for composing words */
assertAnalyzesTo(
fa,
"Jean-François",
new String[] { "jean", "françois" });
// 2. stopwords
assertAnalyzesTo(
fa,
"le la chien les aux chat du des à cheval",
new String[] { "chien", "chat", "cheval" });
// some nouns and adjectives
assertAnalyzesTo(
fa,
"lances chismes habitable chiste éléments captifs",
new String[] {
"lanc",
"chism",
"habit",
"chist",
"élément",
"captif" });
// some verbs
assertAnalyzesTo(
fa,
"finissions souffrirent rugissante",
new String[] { "fin", "souffr", "rug" });
// some everything else
// aujourd'hui stays one term which is OK
assertAnalyzesTo(
fa,
"C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
new String[] {
"c3po",
"aujourd'hui",
"oeuf",
"ïâöûàä",
"anticonstitutionnel",
"jav" });
// some more everything else
// here 1940-1945 stays as one term, 1940:1945 not ?
assertAnalyzesTo(
fa,
"33Bis 1940-1945 1940:1945 (---i+++)*",
new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
}
public void testReusableTokenStream() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
// stopwords
@@ -242,22 +154,11 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouil" });
}
/**
* Prior to 3.1, this analyzer had no lowercase filter.
* stopwords were case sensitive. Preserve this for back compat.
* @deprecated (3.1) Remove this test in Lucene 5.0
*/
@Deprecated
public void testBuggyStopwordsCasing() throws IOException {
FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(a, "Votre", new String[] { "votr" });
}
/**
* Test that stopwords are not case sensitive
*/
public void testStopwordsCasing() throws IOException {
FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
FrenchAnalyzer a = new FrenchAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "Votre", new String[] { });
}

View File

@@ -63,11 +63,4 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
assertAnalyzesTo(a, "l'Italiano", new String[] { "italian" });
}
/** test that we don't enable this before 3.2*/
public void testContractionsBackwards() throws IOException {
Analyzer a = new ItalianAnalyzer(Version.LUCENE_31);
assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" });
assertAnalyzesTo(a, "l'Italiano", new String[] { "l'ital" });
}
}

View File

@@ -1,181 +0,0 @@
package org.apache.lucene.analysis.miscellaneous;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import java.lang.Thread.UncaughtExceptionHandler;
import java.util.Arrays;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
/**
* Verifies the behavior of PatternAnalyzer.
*/
public class PatternAnalyzerTest extends BaseTokenStreamTestCase {
/**
* Test PatternAnalyzer when it is configured with a non-word pattern.
* Behavior can be similar to SimpleAnalyzer (depending upon options)
*/
public void testNonWordPattern() throws IOException {
// Split on non-letter pattern, do not lowercase, no stopwords
PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN,
false, null);
check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
"The", "quick", "brown", "Fox", "the", "abcd", "dc" });
// split on non-letter pattern, lowercase, english stopwords
PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN,
true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
"quick", "brown", "fox", "abcd", "dc" });
}
/**
* Test PatternAnalyzer when it is configured with a whitespace pattern.
* Behavior can be similar to WhitespaceAnalyzer (depending upon options)
*/
public void testWhitespacePattern() throws IOException {
// Split on whitespace patterns, do not lowercase, no stopwords
PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
false, null);
check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
"The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });
// Split on whitespace patterns, lowercase, english stopwords
PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
"quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
}
/**
* Test PatternAnalyzer when it is configured with a custom pattern. In this
* case, text is tokenized on the comma ","
*/
public void testCustomPattern() throws IOException {
// Split on comma, do not lowercase, no stopwords
PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), false, null);
check(a, "Here,Are,some,Comma,separated,words,", new String[] { "Here",
"Are", "some", "Comma", "separated", "words" });
// split on comma, lowercase, english stopwords
PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true,
StopAnalyzer.ENGLISH_STOP_WORDS_SET);
check(b, "Here,Are,some,Comma,separated,words,", new String[] { "here",
"some", "comma", "separated", "words" });
}
/**
* Test PatternAnalyzer against a large document.
*/
public void testHugeDocument() throws IOException {
StringBuilder document = new StringBuilder();
// 5000 a's
char largeWord[] = new char[5000];
Arrays.fill(largeWord, 'a');
document.append(largeWord);
// a space
document.append(' ');
// 2000 b's
char largeWord2[] = new char[2000];
Arrays.fill(largeWord2, 'b');
document.append(largeWord2);
// Split on whitespace patterns, do not lowercase, no stopwords
PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
false, null);
check(a, document.toString(), new String[] { new String(largeWord),
new String(largeWord2) });
}
/**
* Verify the analyzer analyzes to the expected contents. For PatternAnalyzer,
* several methods are verified:
* <ul>
* <li>Analysis with a normal Reader
* <li>Analysis with a FastStringReader
* <li>Analysis with a String
* </ul>
*/
private void check(PatternAnalyzer analyzer, String document,
String expected[]) throws IOException {
// ordinary analysis of a Reader
assertAnalyzesTo(analyzer, document, expected);
// analysis with a "FastStringReader"
TokenStream ts = analyzer.tokenStream("dummy",
new PatternAnalyzer.FastStringReader(document));
assertTokenStreamContents(ts, expected);
// analysis of a String, uses PatternAnalyzer.tokenStream(String, String)
TokenStream ts2 = analyzer.tokenStream("dummy", new StringReader(document));
assertTokenStreamContents(ts2, expected);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
// dodge jre bug http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7104012
final UncaughtExceptionHandler savedHandler = Thread.getDefaultUncaughtExceptionHandler();
Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
@Override
public void uncaughtException(Thread thread, Throwable throwable) {
assumeTrue("not failing due to jre bug ", !isJREBug7104012(throwable));
// otherwise its some other bug, pass to default handler
savedHandler.uncaughtException(thread, throwable);
}
});
try {
Thread.getDefaultUncaughtExceptionHandler();
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
} catch (ArrayIndexOutOfBoundsException ex) {
assumeTrue("not failing due to jre bug ", !isJREBug7104012(ex));
throw ex; // otherwise rethrow
} finally {
Thread.setDefaultUncaughtExceptionHandler(savedHandler);
}
}
static boolean isJREBug7104012(Throwable t) {
if (!(t instanceof ArrayIndexOutOfBoundsException)) {
// BaseTokenStreamTestCase now wraps exc in a new RuntimeException:
t = t.getCause();
if (!(t instanceof ArrayIndexOutOfBoundsException)) {
return false;
}
}
StackTraceElement trace[] = t.getStackTrace();
for (StackTraceElement st : trace) {
if ("java.text.RuleBasedBreakIterator".equals(st.getClassName())
&& "lookupBackwardState".equals(st.getMethodName())) {
return true;
}
}
return false;
}
}

View File

@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -61,7 +62,7 @@ public class TestLimitTokenCountAnalyzer extends BaseTokenStreamTestCase {
writer.addDocument(doc);
writer.close();
IndexReader reader = IndexReader.open(dir);
IndexReader reader = DirectoryReader.open(dir);
Term t = new Term("field", "x");
assertEquals(1, reader.docFreq(t));
reader.close();

View File

@@ -112,17 +112,6 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
check("ophouden", "ophoud");
}
/**
* @deprecated (3.1) remove this test in Lucene 5.0
*/
@Deprecated
public void testOldBuggyStemmer() throws Exception {
Analyzer a = new DutchAnalyzer(Version.LUCENE_30);
checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff'
checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff'
checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff'
}
public void testSnowballCorrectness() throws Exception {
Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
checkOneTermReuse(a, "opheffen", "opheff");
@@ -139,7 +128,7 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
}
public void testExclusionTableViaCtor() throws IOException {
CharArraySet set = new CharArraySet(Version.LUCENE_30, 1, true);
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("lichamelijk");
DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
@@ -158,33 +147,11 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
checkOneTerm(a, "fiets", "fiets");
}
/**
* prior to 3.6, this confusingly did not happen if
* you specified your own stoplist!!!!
* @deprecated (3.6) Remove this test in Lucene 5.0
*/
@Deprecated
public void testBuggyStemOverrides() throws IOException {
DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_35, CharArraySet.EMPTY_SET);
checkOneTerm(a, "fiets", "fiet");
}
/**
* Prior to 3.1, this analyzer had no lowercase filter.
* stopwords were case sensitive. Preserve this for back compat.
* @deprecated (3.1) Remove this test in Lucene 5.0
*/
@Deprecated
public void testBuggyStopwordsCasing() throws IOException {
DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(a, "Zelf", new String[] { "zelf" });
}
/**
* Test that stopwords are not case sensitive
*/
public void testStopwordsCasing() throws IOException {
DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31);
DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "Zelf", new String[] { });
}

View File

@@ -20,6 +20,7 @@ import org.apache.lucene.analysis.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -53,7 +54,7 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
writer.addDocument(doc);
}
writer.close();
reader = IndexReader.open(dir);
reader = DirectoryReader.open(dir);
}
@Override

View File

@@ -57,15 +57,6 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
assertEquals( "ABEDCF", new String( buffer ) );
}
/**
* Test the broken 3.0 behavior, for back compat
* @deprecated (3.1) Remove in Lucene 5.0
*/
@Deprecated
public void testBackCompat() throws Exception {
assertEquals("\uDF05\uD866\uDF05\uD866", ReverseStringFilter.reverse(Version.LUCENE_30, "𩬅𩬅"));
}
public void testReverseSupplementary() throws Exception {
// supplementary at end
assertEquals("𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(TEST_VERSION_CURRENT, "瀛愯䇹鍟艱𩬅"));

View File

@@ -37,16 +37,6 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesTo(ra, "text 1000", new String[] { "text", "1000" });
}
/** @deprecated (3.1) remove this test in Lucene 5.0: stopwords changed */
@Deprecated
public void testReusableTokenStream30() throws Exception {
Analyzer a = new RussianAnalyzer(Version.LUCENE_30);
assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
new String[] { "знан", "хран", "тайн" });
}
public void testReusableTokenStream() throws Exception {
Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",

View File

@@ -1,47 +0,0 @@
package org.apache.lucene.analysis.ru;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
/**
* Testcase for {@link RussianLetterTokenizer}
* @deprecated (3.1) Remove this test class in Lucene 5.0
*/
@Deprecated
public class TestRussianLetterTokenizer extends BaseTokenStreamTestCase {
public void testRussianLetterTokenizer() throws IOException {
StringReader reader = new StringReader("1234567890 Вместе \ud801\udc1ctest");
RussianLetterTokenizer tokenizer = new RussianLetterTokenizer(Version.LUCENE_CURRENT,
reader);
assertTokenStreamContents(tokenizer, new String[] {"1234567890", "Вместе",
"\ud801\udc1ctest"});
}
public void testRussianLetterTokenizerBWCompat() throws IOException {
StringReader reader = new StringReader("1234567890 Вместе \ud801\udc1ctest");
RussianLetterTokenizer tokenizer = new RussianLetterTokenizer(Version.LUCENE_30,
reader);
assertTokenStreamContents(tokenizer, new String[] {"1234567890", "Вместе", "test"});
}
}

View File

@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -74,7 +75,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
writer.close();
reader = IndexReader.open(directory);
reader = DirectoryReader.open(directory);
searcher = new IndexSearcher(reader);
}

View File

@@ -29,6 +29,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
@@ -103,7 +104,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
IndexReader r = DirectoryReader.open(dir);
Terms vector = r.getTermVectors(0).terms("field");
assertEquals(1, vector.size());
TermsEnum termsEnum = vector.iterator(null);

View File

@@ -22,6 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.index.Payload;
import org.apache.lucene.analysis.TokenStream;
@@ -38,65 +39,18 @@ import org.apache.lucene.util.Version;
public class TestSnowball extends BaseTokenStreamTestCase {
public void testEnglish() throws Exception {
Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader);
return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, "English"));
}
};
assertAnalyzesTo(a, "he abhorred accents",
new String[]{"he", "abhor", "accent"});
}
public void testStopwords() throws Exception {
Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English",
StandardAnalyzer.STOP_WORDS_SET);
assertAnalyzesTo(a, "the quick brown fox jumped",
new String[]{"quick", "brown", "fox", "jump"});
}
/**
* Test english lowercasing. Test both cases (pre-3.1 and post-3.1) to ensure
* we lowercase I correct for non-Turkish languages in either case.
*/
public void testEnglishLowerCase() throws Exception {
Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
assertAnalyzesTo(a, "cryogenic", new String[] { "cryogen" });
assertAnalyzesTo(a, "CRYOGENIC", new String[] { "cryogen" });
Analyzer b = new SnowballAnalyzer(Version.LUCENE_30, "English");
assertAnalyzesTo(b, "cryogenic", new String[] { "cryogen" });
assertAnalyzesTo(b, "CRYOGENIC", new String[] { "cryogen" });
}
/**
* Test turkish lowercasing
*/
public void testTurkish() throws Exception {
Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "Turkish");
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaç" });
}
/**
* Test turkish lowercasing (old buggy behavior)
* @deprecated (3.1) Remove this when support for 3.0 indexes is no longer required (5.0)
*/
@Deprecated
public void testTurkishBWComp() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_30, "Turkish");
// AĞACI in turkish lowercases to ağacı, but with lowercase filter ağaci.
// this fails due to wrong casing, because the stemmer
// will only remove -ı, not -i
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaci" });
}
public void testReusableTokenStream() throws Exception {
Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
assertAnalyzesToReuse(a, "he abhorred accents",
new String[]{"he", "abhor", "accent"});
assertAnalyzesToReuse(a, "she abhorred him",
new String[]{"she", "abhor", "him"});
}
public void testFilterTokens() throws Exception {
SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
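
The tests above now build their analysis chains by hand around MockTokenizer. For code that previously relied on the removed SnowballAnalyzer, one possible replacement outside of tests is sketched below. This is an illustration only, not part of this change: the class name is made up, and treating a StandardTokenizer/LowerCaseFilter/StopFilter/SnowballFilter chain (with the StandardAnalyzer stopword set) as a stand-in for the old analyzer is an assumption.

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

public final class EnglishSnowballSketchAnalyzer extends Analyzer {
  private final Version matchVersion = Version.LUCENE_40;

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // Tokenize, lowercase, drop English stopwords, then apply the Snowball stemmer:
    // roughly the chain the removed SnowballAnalyzer used to assemble internally.
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new LowerCaseFilter(matchVersion, source);
    result = new StopFilter(matchVersion, result, StandardAnalyzer.STOP_WORDS_SET);
    result = new SnowballFilter(result, "English");
    return new TokenStreamComponents(source, result);
  }
}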

View File

@@ -62,13 +62,6 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
new int[] { 5, 2, 1 });
}
public void testBackwardsStopWords() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_35), "การที่ได้ต้องแสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
}
public void testTokenType() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
@@ -79,43 +72,6 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
"<NUM>" });
}
/**
* Thai numeric tokens were typed as <ALPHANUM> instead of <NUM>.
* @deprecated (3.1) testing backwards behavior
*/
@Deprecated
public void testBuggyTokenType30() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_30), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
}
/** @deprecated (3.1) testing backwards behavior */
@Deprecated
public void testAnalyzer30() throws Exception {
ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(analyzer, "", new String[] {});
assertAnalyzesTo(
analyzer,
"การที่ได้ต้องแสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
assertAnalyzesTo(
analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
// English stop words
assertAnalyzesTo(
analyzer,
"ประโยคว่า The quick brown fox jumped over the lazy dogs",
new String[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
}
/*
* Test that position increments are adjusted correctly for stopwords.
*/
@@ -151,23 +107,6 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
}
/** @deprecated (3.1) for version back compat */
@Deprecated
public void testReusableTokenStream30() throws Exception {
ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
assertAnalyzesToReuse(analyzer, "", new String[] {});
assertAnalyzesToReuse(
analyzer,
"การที่ได้ต้องแสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
assertAnalyzesToReuse(
analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new ThaiAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
@@ -181,7 +120,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
// LUCENE-3044
public void testAttributeReuse() throws Exception {
ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
// just consume
TokenStream ts = analyzer.tokenStream("dummy", new StringReader("ภาษาไทย"));
assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });

View File

@@ -250,77 +250,6 @@ public class TestCharArraySet extends LuceneTestCase {
}
}
/**
* @deprecated (3.1) remove this test when lucene 3.0 "broken unicode 4" support is
* no longer needed.
*/
@Deprecated
public void testSupplementaryCharsBWCompat() {
String missing = "Term %s is missing in the set";
String falsePos = "Term %s is in the set but shouldn't";
// for reference see
// http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
String[] upperArr = new String[] {"Abc\ud801\udc1c",
"\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
String[] lowerArr = new String[] {"abc\ud801\udc44",
"\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
CharArraySet set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS), true);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
}
set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS), false);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
}
}
/**
* @deprecated (3.1) remove this test when lucene 3.0 "broken unicode 4" support is
* no longer needed.
*/
@Deprecated
public void testSingleHighSurrogateBWComapt() {
String missing = "Term %s is missing in the set";
String falsePos = "Term %s is in the set but shouldn't";
String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
"\uD800EfG", "\uD800\ud801\udc1cB" };
String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
"\uD800efg", "\uD800\ud801\udc44b" };
CharArraySet set = new CharArraySet(Version.LUCENE_30, Arrays
.asList(TEST_STOP_WORDS), true);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
if (i == lowerArr.length - 1)
assertFalse(String.format(falsePos, lowerArr[i]), set
.contains(lowerArr[i]));
else
assertTrue(String.format(missing, lowerArr[i]), set
.contains(lowerArr[i]));
}
set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS),
false);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
assertFalse(String.format(falsePos, lowerArr[i]), set
.contains(lowerArr[i]));
}
}
@SuppressWarnings("deprecated")
public void testCopyCharArraySetBWCompat() {
CharArraySet setIngoreCase = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
@@ -499,10 +428,5 @@ public class TestCharArraySet extends LuceneTestCase {
assertEquals("[test]", set.toString());
set.add("test2");
assertTrue(set.toString().contains(", "));
set = CharArraySet.copy(Version.LUCENE_30, Collections.singleton("test"));
assertEquals("[test]", set.toString());
set.add("test2");
assertTrue(set.toString().contains(", "));
}
}

View File

@@ -33,7 +33,7 @@ public class TestCharacterUtils extends LuceneTestCase {
@Test
public void testCodePointAtCharArrayInt() {
CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
CharacterUtils java4 = CharacterUtils.getJava4Instance();
char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
@@ -59,7 +59,7 @@ public class TestCharacterUtils extends LuceneTestCase {
@Test
public void testCodePointAtCharSequenceInt() {
CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
CharacterUtils java4 = CharacterUtils.getJava4Instance();
String cpAt3 = "Abc\ud801\udc1c";
String highSurrogateAt3 = "Abc\ud801";
assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
@@ -86,7 +86,7 @@ public class TestCharacterUtils extends LuceneTestCase {
@Test
public void testCodePointAtCharArrayIntInt() {
CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
CharacterUtils java4 = CharacterUtils.getJava4Instance();
char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
assertEquals((int) 'A', java4.codePointAt(cpAt3, 0, 2));
@@ -122,9 +122,10 @@ public class TestCharacterUtils extends LuceneTestCase {
@Test
public void testFillNoHighSurrogate() throws IOException {
Version[] versions = new Version[] { Version.LUCENE_30, TEST_VERSION_CURRENT };
for (Version version : versions) {
CharacterUtils instance = CharacterUtils.getInstance(version);
CharacterUtils versions[] = new CharacterUtils[] {
CharacterUtils.getInstance(TEST_VERSION_CURRENT),
CharacterUtils.getJava4Instance() };
for (CharacterUtils instance : versions) {
Reader reader = new StringReader("helloworld");
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
assertTrue(instance.fill(buffer,reader));
@@ -172,7 +173,7 @@ public class TestCharacterUtils extends LuceneTestCase {
@Test
public void testFillJava14() throws IOException {
String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
CharacterUtils instance = CharacterUtils.getInstance(Version.LUCENE_30);
CharacterUtils instance = CharacterUtils.getJava4Instance();
Reader reader = new StringReader(input);
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
assertTrue(instance.fill(buffer, reader));

View File

@@ -1,102 +0,0 @@
package org.apache.lucene.collation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
import java.io.Reader;
/**
* @deprecated remove when CollationKeyFilter is removed.
*/
@Deprecated
public class TestCollationKeyFilter extends CollationTestBase {
// the sort order of Ø versus U depends on the version of the rules being used
// for the inherited root locale: Ø's order isnt specified in Locale.US since
// its not used in english.
boolean oStrokeFirst = Collator.getInstance(new Locale("")).compare("Ø", "U") < 0;
// Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
private Collator collator = Collator.getInstance(new Locale("ar"));
private Analyzer analyzer = new TestAnalyzer(collator);
private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
(collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {
private Collator _collator;
TestAnalyzer(Collator collator) {
_collator = collator;
}
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer result = new KeywordTokenizer(reader);
return new TokenStreamComponents(result, new CollationKeyFilter(result, _collator));
}
}
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating
(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
}
public void testFarsiRangeQueryCollating() throws Exception {
testFarsiRangeQueryCollating
(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
}
public void testFarsiTermRangeQuery() throws Exception {
testFarsiTermRangeQuery
(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
}
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer = new TestAnalyzer(Collator.getInstance(Locale.US));
Analyzer franceAnalyzer
= new TestAnalyzer(Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer
= new TestAnalyzer(Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer
= new TestAnalyzer(Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and Sun java.text.Collator implementations differ in their
// orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.
testCollationKeySort
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
oStrokeFirst ? "BFJHD" : "BFJDH", "EACGI", "BJDFH", "BJDHF");
}
}

View File

@@ -22,18 +22,16 @@ import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.collation.CollationKeyAnalyzer; // javadocs
import org.apache.lucene.util.IndexableBinaryStringTools; // javadocs
import org.apache.lucene.util.Version;
import java.io.Reader;
/**
* <p>
* Filters {@link KeywordTokenizer} with {@link ICUCollationKeyFilter}.
* Configures {@link KeywordTokenizer} with {@link ICUCollationAttributeFactory}.
* <p>
* Converts the token into its {@link com.ibm.icu.text.CollationKey}, and
* then encodes the CollationKey either directly or with
* {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow it to
* then encodes the CollationKey directly to allow it to
* be stored as an index term.
* </p>
* <p>
@@ -67,48 +65,24 @@ import java.io.Reader;
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating ICUCollationKeyAnalyzer:
* <ul>
* <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
* versions will encode the bytes with {@link IndexableBinaryStringTools}.
* </ul>
*/
public final class ICUCollationKeyAnalyzer extends Analyzer {
private final Collator collator;
private final ICUCollationAttributeFactory factory;
private final Version matchVersion;
/**
* Create a new ICUCollationKeyAnalyzer, using the specified collator.
*
* @param matchVersion See <a href="#version">above</a>
* @param matchVersion compatibility version
* @param collator CollationKey generator
*/
public ICUCollationKeyAnalyzer(Version matchVersion, Collator collator) {
this.matchVersion = matchVersion;
this.collator = collator;
this.factory = new ICUCollationAttributeFactory(collator);
}
/**
* @deprecated Use {@link ICUCollationKeyAnalyzer#ICUCollationKeyAnalyzer(Version, Collator)}
* and specify a version instead. This ctor will be removed in Lucene 5.0
*/
@Deprecated
public ICUCollationKeyAnalyzer(Collator collator) {
this(Version.LUCENE_31, collator);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_40)) {
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
return new TokenStreamComponents(tokenizer, tokenizer);
} else {
KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new ICUCollationKeyFilter(tokenizer, collator));
}
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
return new TokenStreamComponents(tokenizer, tokenizer);
}
}
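
With the pre-4.0 code path removed, ICUCollationKeyAnalyzer always encodes raw ICU collation-key bytes. A minimal usage sketch, illustrative only and not part of this change (the locale, the field usage and the Version constant passed to the constructor are assumptions):

import java.util.Locale;

import com.ibm.icu.text.Collator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.collation.ICUCollationKeyAnalyzer;
import org.apache.lucene.util.Version;

public class CollationSortFieldSketch {
  public static void main(String[] args) {
    // Build a collator for the target locale; the same Collator settings must be
    // used at index and query time, because collation keys are only comparable
    // when they are produced by the same Collator.
    Collator collator = Collator.getInstance(new Locale("de", "DE"));
    // The analyzer now always encodes each collation key directly as bytes.
    Analyzer sortAnalyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
    System.out.println("Analyzer for a dedicated sort field: " + sortAnalyzer);
  }
}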

View File

@@ -1,114 +0,0 @@
package org.apache.lucene.collation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.IndexableBinaryStringTools;
import org.apache.lucene.collation.CollationKeyFilter; // javadocs
import java.io.IOException;
/**
* <p>
* Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
* then encodes the CollationKey with {@link IndexableBinaryStringTools}, to
* allow it to be stored as an index term.
* </p>
* <p>
* <strong>WARNING:</strong> Make sure you use exactly the same Collator at
* index and query time -- CollationKeys are only comparable when produced by
* the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
* independently versioned, so it is safe to search against stored
* CollationKeys if the following are exactly the same (best practice is
* to store this information with the index and check that they remain the
* same at query time):
* </p>
* <ol>
* <li>
* Collator version - see {@link Collator#getVersion()}
* </li>
* <li>
* The collation strength used - see {@link Collator#setStrength(int)}
* </li>
* </ol>
* <p>
* CollationKeys generated by ICU Collators are not compatible with those
* generated by java.text.Collators. Specifically, if you use
* ICUCollationKeyFilter to generate index terms, do not use
* {@link CollationKeyFilter} on the query side, or vice versa.
* </p>
* <p>
* ICUCollationKeyFilter is significantly faster and generates significantly
* shorter keys than CollationKeyFilter. See
* <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
* >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
* @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
* terms directly as bytes. This filter will be removed in Lucene 5.0
*/
@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
private Collator collator = null;
private RawCollationKey reusableKey = new RawCollationKey();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
*
* @param input Source token stream
* @param collator CollationKey generator
*/
public ICUCollationKeyFilter(TokenStream input, Collator collator) {
super(input);
// clone the collator: see http://userguide.icu-project.org/collation/architecture
try {
this.collator = (Collator) collator.clone();
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
char[] termBuffer = termAtt.buffer();
String termText = new String(termBuffer, 0, termAtt.length());
collator.getRawCollationKey(termText, reusableKey);
int encodedLength = IndexableBinaryStringTools.getEncodedLength(
reusableKey.bytes, 0, reusableKey.size);
if (encodedLength > termBuffer.length) {
termAtt.resizeBuffer(encodedLength);
}
termAtt.setLength(encodedLength);
IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size,
termAtt.buffer(), 0, encodedLength);
return true;
} else {
return false;
}
}
}
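
The deprecation note above points to ICUCollationAttributeFactory, which encodes terms directly as bytes. Below is a sketch of the factory-based replacement for a chain that previously wrapped a KeywordTokenizer in this filter; it is illustrative only, and the analyzer class name and locale are assumptions:

import java.io.Reader;
import java.util.Locale;

import com.ibm.icu.text.Collator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.collation.ICUCollationAttributeFactory;

public final class CollationKeyFieldSketchAnalyzer extends Analyzer {
  // The factory takes the place of the filter: it makes the tokenizer's term
  // attribute encode ICU collation keys directly as bytes.
  private final ICUCollationAttributeFactory factory =
      new ICUCollationAttributeFactory(Collator.getInstance(new Locale("fa")));

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    KeywordTokenizer tokenizer =
        new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
    return new TokenStreamComponents(tokenizer, tokenizer);
  }
}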

View File

@@ -1,98 +0,0 @@
package org.apache.lucene.collation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.BytesRef;
import java.io.Reader;
import java.util.Locale;
/** @deprecated remove this when ICUCollationKeyFilter is removed */
@Deprecated
public class TestICUCollationKeyFilter extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("fa"));
private Analyzer analyzer = new TestAnalyzer(collator);
private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
(collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {
private Collator _collator;
TestAnalyzer(Collator collator) {
_collator = collator;
}
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer result = new KeywordTokenizer(reader);
return new TokenStreamComponents(result, new ICUCollationKeyFilter(result, _collator));
}
}
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
}
public void testFarsiRangeQueryCollating() throws Exception {
testFarsiRangeQueryCollating(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
}
public void testFarsiTermRangeQuery() throws Exception {
testFarsiTermRangeQuery
(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
}
// Test using various international locales with accented characters (which
// sort differently depending on locale)
//
// Copied (and slightly modified) from
// org.apache.lucene.search.TestSort.testInternationalSort()
//
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer = new TestAnalyzer(Collator.getInstance(Locale.US));
Analyzer franceAnalyzer
= new TestAnalyzer(Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer
= new TestAnalyzer(Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer
= new TestAnalyzer(Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and java.text.Collator implementations differ in their
// orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US.
testCollationKeySort
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
"BFJHD", "ECAGI", "BJDFH", "BJDHF");
}
}

View File

@@ -60,7 +60,7 @@ public class NearRealtimeReaderTask extends PerfTask {
}
long t = System.currentTimeMillis();
DirectoryReader r = IndexReader.open(w, true);
DirectoryReader r = DirectoryReader.open(w, true);
runData.setIndexReader(r);
// Transfer our reference to runData
r.decRef();

View File

@@ -45,9 +45,9 @@ public class OpenReaderTask extends PerfTask {
Directory dir = getRunData().getDirectory();
DirectoryReader r = null;
if (commitUserData != null) {
r = IndexReader.open(OpenReaderTask.findIndexCommit(dir, commitUserData));
r = DirectoryReader.open(OpenReaderTask.findIndexCommit(dir, commitUserData));
} else {
r = IndexReader.open(dir);
r = DirectoryReader.open(dir);
}
getRunData().setIndexReader(r);
// We transfer reference to the run data

View File

@@ -18,6 +18,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
*/
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
@@ -47,9 +48,9 @@ public class PrintReaderTask extends PerfTask {
Directory dir = getRunData().getDirectory();
IndexReader r = null;
if (userData == null)
r = IndexReader.open(dir);
r = DirectoryReader.open(dir);
else
r = IndexReader.open(OpenReaderTask.findIndexCommit(dir, userData));
r = DirectoryReader.open(OpenReaderTask.findIndexCommit(dir, userData));
System.out.println("--> numDocs:"+r.numDocs()+" dels:"+r.numDeletedDocs());
r.close();
return 1;

View File

@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;
@@ -84,7 +85,7 @@ public abstract class ReadTask extends PerfTask {
if (searcher == null) {
// open our own reader
Directory dir = getRunData().getDirectory();
reader = IndexReader.open(dir);
reader = DirectoryReader.open(dir);
searcher = new IndexSearcher(reader);
closeSearcher = true;
} else {

View File

@@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.quality.trec;
import org.apache.lucene.benchmark.quality.utils.SimpleQQParser;
import org.apache.lucene.benchmark.quality.utils.SubmissionReport;
import org.apache.lucene.benchmark.quality.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;
@@ -53,7 +54,7 @@ public class QueryDriver {
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2]), "lucene");
FSDirectory dir = FSDirectory.open(new File(args[3]));
String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
IndexReader reader = IndexReader.open(dir);
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
int maxResults = 1000;

View File

@@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.quality.utils;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Terms;
@@ -86,7 +87,7 @@ private String [] bestTerms(String field,int numTerms) throws IOException {
private String [] bestTerms(String field,int numTerms) throws IOException {
PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
IndexReader ir = IndexReader.open(dir);
IndexReader ir = DirectoryReader.open(dir);
try {
int threshold = ir.maxDoc() / 10; // ignore words too common.
Terms terms = MultiFields.getTerms(ir, field);

Some files were not shown because too many files have changed in this diff.