mirror of https://github.com/apache/lucene.git
LUCENE-1692: Additional tests and javadocs for contrib/analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@805400 13f79535-47bb-0310-9956-ffa450edef68
parent 524c9ceb70
commit 3887cf9419
@@ -34,7 +34,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;

 /**
- * Analyzer for Arabic.
+ * {@link Analyzer} for Arabic.
 * <p>
 * This analyzer implements light-stemming as specified by:
 * <i>
@@ -108,10 +108,11 @@ public final class ArabicAnalyzer extends Analyzer {


 /**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
 *
- * @return A TokenStream built from an ArabicTokenizer filtered with
+ * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
- * StopFilter, LowerCaseFilter, ArabicNormalizationFilter and ArabicStemFilter.
+ * {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
+ * and {@link ArabicStemFilter}.
 */
 public final TokenStream tokenStream(String fieldName, Reader reader) {
 TokenStream result = new ArabicLetterTokenizer( reader );
@@ -129,12 +130,12 @@ public final class ArabicAnalyzer extends Analyzer {
 };

 /**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided Reader.
+ * in the provided {@link Reader}.
 *
- * @return A TokenStream built from an ArabicTokenizer filtered with
+ * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
- * StopFilter, LowerCaseFilter, ArabicNormalizationFilter and
+ * {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
- * ArabicStemFilter.
+ * and {@link ArabicStemFilter}.
 */
 public TokenStream reusableTokenStream(String fieldName, Reader reader)
 throws IOException {
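
The hunks above rewrite the ArabicAnalyzer javadoc around tokenStream() and reusableTokenStream(). As a hedged illustration only (not part of this commit), a minimal consumer of that analyzer under the Lucene 2.9-era attribute API, with a hypothetical field name "body" and placeholder text, could look like:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ar.ArabicAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ArabicAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        ArabicAnalyzer analyzer = new ArabicAnalyzer();
        // tokenStream() chains ArabicLetterTokenizer with the stop, lower-case,
        // normalization and stem filters named in the javadoc above.
        TokenStream ts = analyzer.tokenStream("body", new StringReader("..."));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // normalized, light-stemmed terms
        }
      }
    }
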
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography.
+ * A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography.
 *
 */


@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words..
+ * A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words..
 *
 */

@@ -34,15 +34,17 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

 /**
- * Analyzer for Brazilian language. Supports an external list of stopwords (words that
+ * {@link Analyzer} for Brazilian Portuguese language.
- * will not be indexed at all) and an external list of exclusions (word that will
+ * <p>
+ * Supports an external list of stopwords (words that
+ * will not be indexed at all) and an external list of exclusions (words that will
 * not be stemmed, but indexed).
- *
+ * </p>
 */
 public final class BrazilianAnalyzer extends Analyzer {

 /**
- * List of typical Brazilian stopwords.
+ * List of typical Brazilian Portuguese stopwords.
 */
 public final static String[] BRAZILIAN_STOP_WORDS = {
 "a","ainda","alem","ambas","ambos","antes",
@@ -67,7 +69,7 @@ public final class BrazilianAnalyzer extends Analyzer {


 /**
- * Contains the stopwords used with the StopFilter.
+ * Contains the stopwords used with the {@link StopFilter}.
 */
 private Set stoptable = new HashSet();

@@ -111,7 +113,7 @@ public final class BrazilianAnalyzer extends Analyzer {
 excltable = StopFilter.makeStopSet( exclusionlist );
 }
 /**
- * Builds an exclusionlist from a Hashtable.
+ * Builds an exclusionlist from a {@link Map}.
 */
 public void setStemExclusionTable( Map exclusionlist ) {
 excltable = new HashSet(exclusionlist.keySet());
@@ -124,11 +126,11 @@ public final class BrazilianAnalyzer extends Analyzer {
 }

 /**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
 *
- * @return A TokenStream built from a StandardTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * LowerCaseFilter, StandardFilter, StopFilter, and
+ * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
- * BrazilianStemFilter.
+ * {@link BrazilianStemFilter}.
 */
 public final TokenStream tokenStream(String fieldName, Reader reader) {
 TokenStream result = new StandardTokenizer( reader );
@@ -145,12 +147,12 @@ public final class BrazilianAnalyzer extends Analyzer {
 };

 /**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided Reader.
+ * in the provided {@link Reader}.
 *
- * @return A TokenStream built from a StandardTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * LowerCaseFilter, StandardFilter, StopFilter, and
+ * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
- * BrazilianStemFilter.
+ * {@link BrazilianStemFilter}.
 */
 public TokenStream reusableTokenStream(String fieldName, Reader reader)
 throws IOException {
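
The BrazilianAnalyzer hunks above document its stopword list and the exclusion table that keeps selected words from being stemmed. A sketch of that behaviour (assumed, not taken from this commit; it relies on a String[] overload of setStemExclusionTable existing alongside the Map overload shown above, and the sample words are arbitrary):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.br.BrazilianAnalyzer;

    public class BrazilianAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        BrazilianAnalyzer analyzer = new BrazilianAnalyzer();
        // Words in the exclusion table are indexed as-is instead of being stemmed.
        analyzer.setStemExclusionTable(new String[] { "quilombo" });
        TokenStream ts = analyzer.tokenStream("body",
            new StringReader("um exemplo de texto em portugues do Brasil"));
        // consume ts with TermAttribute/incrementToken() as in the earlier sketch
      }
    }
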
@@ -25,13 +25,13 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * Based on GermanStemFilter
+ * A {@link TokenFilter} that applies {@link BrazilianStemmer}.
 *
 */
 public final class BrazilianStemFilter extends TokenFilter {

 /**
- * The actual token in the input stream.
+ * {@link BrazilianStemmer} in use by this filter.
 */
 private BrazilianStemmer stemmer = null;
 private Set exclusions = null;

@@ -18,7 +18,7 @@ package org.apache.lucene.analysis.br;
 */

 /**
- * A stemmer for Brazilian words.
+ * A stemmer for Brazilian Portuguese words.
 */
 public class BrazilianStemmer {


@@ -1,5 +1,5 @@
 <html><head></head>
 <body>
-Analyzer for Brazilian.
+Analyzer for Brazilian Portuguese.
 </body>
 </html>
@@ -28,7 +28,8 @@ import java.util.Set;


 /**
- * Filters CJKTokenizer with StopFilter.
+ * An {@link Analyzer} that tokenizes text with {@link CJKTokenizer} and
+ * filters with {@link StopFilter}
 *
 */
 public class CJKAnalyzer extends Analyzer {
@@ -77,11 +78,12 @@ public class CJKAnalyzer extends Analyzer {
 //~ Methods ----------------------------------------------------------------

 /**
- * get token stream from input
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
 *
 * @param fieldName lucene field name
- * @param reader input reader
+ * @param reader input {@link Reader}
- * @return TokenStream
+ * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
+ * {@link StopFilter}
 */
 public final TokenStream tokenStream(String fieldName, Reader reader) {
 return new StopFilter(new CJKTokenizer(reader), stopTable);
@@ -93,11 +95,13 @@ public class CJKAnalyzer extends Analyzer {
 };

 /**
- * get (possibly reused) token stream from input
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+ * in the provided {@link Reader}.
 *
 * @param fieldName lucene field name
- * @param reader input reader
+ * @param reader Input {@link Reader}
- * @return TokenStream
+ * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
+ * {@link StopFilter}
 */
 public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
 /* tokenStream() is final, no back compat issue */
@@ -27,13 +27,20 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;


 /**
- * CJKTokenizer was modified from StopTokenizer which does a decent job for
+ * CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
- * most European languages. It performs other token methods for double-byte
+ * <p>
- * Characters: the token will return at each two characters with overlap match.<br>
+ * The tokens returned are every two adjacent characters with overlap match.
- * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
+ * </p>
- * also need filter filter zero length token ""<br>
+ * <p>
- * for Digit: digit, '+', '#' will token as letter<br>
+ * Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
- * for more info on Asia language(Chinese Japanese Korean) text segmentation:
+ * </p>
+ * Additionally, the following is applied to Latin text (such as English):
+ * <ul>
+ * <li>Text is converted to lowercase.
+ * <li>Numeric digits, '+', '#', and '_' are tokenized as letters.
+ * <li>Full-width forms are converted to half-width forms.
+ * </ul>
+ * For more info on Asian language (Chinese, Japanese, and Korean) text segmentation:
 * please search <a
 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
 *
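
The rewritten CJKTokenizer javadoc above promises that "java C1C2C3C4" is segmented into "java" plus the overlapping bigrams "C1C2" "C2C3" "C3C4". A small sketch that prints exactly that (hypothetical, using U+4E00..U+56DB as the C1..C4 characters and the 2.9-era attribute API):

    import java.io.StringReader;
    import org.apache.lucene.analysis.cjk.CJKTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class CJKTokenizerDemo {
      public static void main(String[] args) throws Exception {
        CJKTokenizer tokenizer = new CJKTokenizer(new StringReader("java \u4e00\u4e8c\u4e09\u56db"));
        TermAttribute term = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
        while (tokenizer.incrementToken()) {
          System.out.println(term.term()); // "java", then the overlapping CJK bigrams
        }
      }
    }
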
@@ -24,13 +24,8 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;

 /**
- * Title: ChineseAnalyzer
+ * An {@link Analyzer} that tokenizes text with {@link ChineseTokenizer} and
- * Description:
+ * filters with {@link ChineseFilter}
- * Subclass of org.apache.lucene.analysis.Analyzer
- * build from a ChineseTokenizer, filtered with ChineseFilter.
- * Copyright: Copyright (c) 2001
- * Company:
- * @version 1.0
 *
 */

@@ -40,9 +35,10 @@ public class ChineseAnalyzer extends Analyzer {
 }

 /**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
 *
- * @return A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.
+ * @return A {@link TokenStream} built from a {@link ChineseTokenizer}
+ * filtered with {@link ChineseFilter}.
 */
 public final TokenStream tokenStream(String fieldName, Reader reader) {
 TokenStream result = new ChineseTokenizer(reader);
@@ -56,11 +52,11 @@ public class ChineseAnalyzer extends Analyzer {
 };

 /**
- * Returns a (possibly reused) TokenStream which tokenizes all the text in the
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the
- * provided Reader.
+ * provided {@link Reader}.
 *
- * @return A TokenStream build from a ChineseTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link ChineseTokenizer}
- * ChineseFilter.
+ * filtered with {@link ChineseFilter}.
 */
 public final TokenStream reusableTokenStream(String fieldName, Reader reader)
 throws IOException {

@@ -26,18 +26,19 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * Title: ChineseFilter
+ * A {@link TokenFilter} with a stop word table.
- * Description: Filter with a stop word table
+ * <ul>
- * Rule: No digital is allowed.
+ * <li>Numeric tokens are removed.
- * English word/token should larger than 1 character.
+ * <li>English tokens must be larger than 1 character.
- * One Chinese character as one Chinese word.
+ * <li>One Chinese character as one Chinese word.
+ * </ul>
 * TO DO:
- * 1. Add Chinese stop words, such as \ue400
+ * <ol>
- * 2. Dictionary based Chinese word extraction
+ * <li>Add Chinese stop words, such as \ue400
- * 3. Intelligent Chinese word extraction
+ * <li>Dictionary based Chinese word extraction
+ * <li>Intelligent Chinese word extraction
+ * </ol>
 *
- * Copyright: Copyright (c) 2001
- * Company:
 * @version 1.0
 *
 */
@@ -27,28 +27,29 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;


 /**
- * Title: ChineseTokenizer
+ * Tokenize Chinese text as individual chinese characters.
- * Description: Extract tokens from the Stream using Character.getType()
- * Rule: A Chinese character as a single token
- * Copyright: Copyright (c) 2001
- * Company:
 *
- * The difference between thr ChineseTokenizer and the
+ * <p>
- * CJKTokenizer (id=23545) is that they have different
+ * The difference between ChineseTokenizer and
+ * CJKTokenizer is that they have different
 * token parsing logic.
- *
+ * </p>
- * Let me use an example. If having a Chinese text
+ * <p>
- * "C1C2C3C4" to be indexed, the tokens returned from the
+ * For example, if the Chinese text
- * ChineseTokenizer are C1, C2, C3, C4. And the tokens
+ * "C1C2C3C4" is to be indexed:
- * returned from the CJKTokenizer are C1C2, C2C3, C3C4.
+ * <ul>
- *
+ * <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
- * Therefore the index the CJKTokenizer created is much
+ * <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
- * larger.
+ * </ul>
- *
+ * </p>
+ * <p>
+ * Therefore the index created by CJKTokenizer is much larger.
+ * </p>
+ * <p>
 * The problem is that when searching for C1, C1C2, C1C3,
 * C4C2, C1C2C3 ... the ChineseTokenizer works, but the
 * CJKTokenizer will not work.
- *
+ * </p>
 * @version 1.0
 *
 */
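
The ChineseTokenizer javadoc above contrasts its unigram output with CJKTokenizer's overlapping bigrams for the same "C1C2C3C4" text. A side-by-side sketch of that difference (illustrative only, not part of this commit):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKTokenizer;
    import org.apache.lucene.analysis.cn.ChineseTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ChineseVsCjkDemo {
      static void dump(TokenStream ts) throws Exception {
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.print("[" + term.term() + "] ");
        }
        System.out.println();
      }

      public static void main(String[] args) throws Exception {
        String text = "\u4e00\u4e8c\u4e09\u56db";           // stands in for C1C2C3C4
        dump(new ChineseTokenizer(new StringReader(text)));  // unigrams: C1, C2, C3, C4
        dump(new CJKTokenizer(new StringReader(text)));      // bigrams: C1C2, C2C3, C3C4
      }
    }
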
@@ -3,7 +3,7 @@
 <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
 </head>
 <body>
-Analyzer for Chinese, which indexes unigrams (individuals chinese characters).
+Analyzer for Chinese, which indexes unigrams (individual chinese characters).
 <p>
 Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
 <ul>

@@ -119,9 +119,10 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
 * The resulting Set does case insensitive matching
 * TODO We should look for a faster dictionary lookup approach.
 * @param dictionary
- * @return
+ * @return {@link Set} of lowercased terms
 */
 public static final Set makeDictionary(final String[] dictionary) {
+ // is the below really case insensitive?
 CharArraySet dict = new CharArraySet(dictionary.length, false);
 addAllLowerCase(dict, Arrays.asList(dictionary));
 return dict;
@@ -21,18 +21,21 @@ package org.apache.lucene.analysis.compound;
 import java.util.Set;

 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter; // for javadocs
 import org.apache.lucene.analysis.TokenStream;

 /**
- * A TokenFilter that decomposes compound words found in many germanic languages
+ * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
+ * <p>
 * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
 * "Donaudampfschiff" even when you only enter "schiff".
 * It uses a brute-force algorithm to achieve this.
+ * </p>
 */
 public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
 /**
 *
- * @param input the token stream to process
+ * @param input the {@link TokenStream} to process
 * @param dictionary the word dictionary to match against
 * @param minWordSize only words longer than this get processed
 * @param minSubwordSize only subwords longer than this get to the output stream
@@ -46,7 +49,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa

 /**
 *
- * @param input the token stream to process
+ * @param input the {@link TokenStream} to process
 * @param dictionary the word dictionary to match against
 */
 public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
@@ -55,7 +58,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa

 /**
 *
- * @param input the token stream to process
+ * @param input the {@link TokenStream} to process
 * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
 * lower case strings.
 */
@@ -65,7 +68,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa

 /**
 *
- * @param input the token stream to process
+ * @param input the {@link TokenStream} to process
 * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
 * lower case strings.
 * @param minWordSize only words longer than this get processed
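
The DictionaryCompoundWordTokenFilter javadoc above uses "Donaudampfschiff" as its example. A decompounding sketch built on the (TokenStream, String[]) constructor shown in these hunks (dictionary contents and input are illustrative; lowercase input sidesteps any question of dictionary case handling):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class DecompounderDemo {
      public static void main(String[] args) throws Exception {
        String[] dictionary = { "donau", "dampf", "schiff" };
        TokenStream ts = new DictionaryCompoundWordTokenFilter(
            new WhitespaceTokenizer(new StringReader("donaudampfschiff")), dictionary);
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // the original token plus matched subwords
        }
      }
    }
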
@@ -24,16 +24,19 @@ import java.io.Reader;
 import java.util.Set;

 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter; // for javadocs
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.xml.sax.InputSource;

 /**
- * A TokenFilter that decomposes compound words found in many germanic languages
+ * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
+ * <p>
 * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
- * "Donaudampfschiff" even when you only enter "schiff" It uses a hyphenation
+ * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
 * grammar and a word dictionary to achieve this.
+ * </p>
 */
 public class HyphenationCompoundWordTokenFilter extends
 CompoundWordTokenFilterBase {
@@ -41,7 +44,7 @@ public class HyphenationCompoundWordTokenFilter extends

 /**
 *
- * @param input the token stream to process
+ * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against
 * @param minWordSize only words longer than this get processed
@@ -60,7 +63,7 @@ public class HyphenationCompoundWordTokenFilter extends

 /**
 *
- * @param input the token stream to process
+ * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against
 */
@@ -72,7 +75,7 @@ public class HyphenationCompoundWordTokenFilter extends

 /**
 *
- * @param input the token stream to process
+ * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
 * lower case strings.
@@ -85,7 +88,7 @@ public class HyphenationCompoundWordTokenFilter extends

 /**
 *
- * @param input the token stream to process
+ * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
 * lower case strings.

@@ -110,7 +110,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer,
 /**
 * Read hyphenation patterns from an XML file.
 *
- * @param filename the filename
+ * @param f the filename
 * @throws HyphenationException In case the parsing fails
 */
 public void loadPatterns(File f) throws HyphenationException {

@@ -5,7 +5,7 @@
 </head>
 <body>
 A filter that decomposes compound words you find in many Germanic
-languages to the word parts. This example shows what it does:
+languages into the word parts. This example shows what it does:
 <table border="1">
 <tr>
 <th>Input token stream</th>
@@ -31,11 +31,12 @@ import java.util.HashSet;
 import java.util.Set;

 /**
- * Analyzer for Czech language. Supports an external list of stopwords (words that
+ * {@link Analyzer} for Czech language.
+ * <p>
+ * Supports an external list of stopwords (words that
 * will not be indexed at all).
- * A default set of stopwords is used unless an alternative list is specified, the
+ * A default set of stopwords is used unless an alternative list is specified.
- * exclusion list is empty by default.
+ * </p>
- *
 */
 public final class CzechAnalyzer extends Analyzer {

@@ -64,7 +65,7 @@ public final class CzechAnalyzer extends Analyzer {
 };

 /**
- * Contains the stopwords used with the StopFilter.
+ * Contains the stopwords used with the {@link StopFilter}.
 */
 private Set stoptable;

@@ -125,10 +126,10 @@ public final class CzechAnalyzer extends Analyzer {
 }

 /**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
 *
- * @return A TokenStream built from a StandardTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * StandardFilter, LowerCaseFilter, and StopFilter
+ * {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
 */
 public final TokenStream tokenStream( String fieldName, Reader reader ) {
 TokenStream result = new StandardTokenizer( reader );
@@ -144,11 +145,11 @@ public final class CzechAnalyzer extends Analyzer {
 };

 /**
- * Returns a (possibly reused) TokenStream which tokenizes all the text in
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in
- * the provided Reader.
+ * the provided {@link Reader}.
 *
- * @return A TokenStream built from a StandardTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * StandardFilter, LowerCaseFilter, and StopFilter
+ * {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
 */
 public TokenStream reusableTokenStream(String fieldName, Reader reader)
 throws IOException {
@@ -35,12 +35,14 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

 /**
- * Analyzer for German language. Supports an external list of stopwords (words that
+ * {@link Analyzer} for German language.
+ * <p>
+ * Supports an external list of stopwords (words that
 * will not be indexed at all) and an external list of exclusions (word that will
 * not be stemmed, but indexed).
- * A default set of stopwords is used unless an alternative list is specified, the
+ * A default set of stopwords is used unless an alternative list is specified, but the
 * exclusion list is empty by default.
- *
+ * </p>
 *
 * @version $Id$
 */
@@ -65,7 +67,7 @@ public class GermanAnalyzer extends Analyzer {
 };

 /**
- * Contains the stopwords used with the StopFilter.
+ * Contains the stopwords used with the {@link StopFilter}.
 */
 private Set stopSet = new HashSet();

@@ -75,8 +77,8 @@ public class GermanAnalyzer extends Analyzer {
 private Set exclusionSet = new HashSet();

 /**
- * Builds an analyzer with the default stop words
+ * Builds an analyzer with the default stop words:
- * (<code>GERMAN_STOP_WORDS</code>).
+ * {@link #GERMAN_STOP_WORDS}.
 */
 public GermanAnalyzer() {
 stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
@@ -115,7 +117,7 @@ public class GermanAnalyzer extends Analyzer {
 }

 /**
- * Builds an exclusionlist from a Hashtable.
+ * Builds an exclusionlist from a {@link Map}
 */
 public void setStemExclusionTable(Map exclusionlist) {
 exclusionSet = new HashSet(exclusionlist.keySet());
@@ -129,10 +131,11 @@ public class GermanAnalyzer extends Analyzer {
 }

 /**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
 *
- * @return A TokenStream built from a StandardTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
+ * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
+ * {@link GermanStemFilter}
 */
 public TokenStream tokenStream(String fieldName, Reader reader) {
 TokenStream result = new StandardTokenizer(reader);
@@ -149,11 +152,12 @@ public class GermanAnalyzer extends Analyzer {
 };

 /**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided Reader.
+ * in the provided {@link Reader}.
 *
- * @return A TokenStream built from a StandardTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
+ * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
+ * {@link GermanStemFilter}
 */
 public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
 if (overridesTokenStreamMethod) {
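
The GermanAnalyzer hunks above describe the default GERMAN_STOP_WORDS set and the stem exclusion list. A hedged sketch of configuring both (a String[] overload of setStemExclusionTable is assumed to exist next to the Map overload shown above; the sample words are arbitrary):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.de.GermanAnalyzer;

    public class GermanAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        GermanAnalyzer analyzer = new GermanAnalyzer();   // uses GERMAN_STOP_WORDS by default
        analyzer.setStemExclusionTable(new String[] { "Fischen" }); // indexed as-is, not stemmed
        TokenStream ts = analyzer.tokenStream("body",
            new StringReader("die Fischer und die Fischen"));
        // consume ts with TermAttribute/incrementToken() as in the earlier sketches
      }
    }
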
@@ -25,10 +25,12 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * A filter that stems German words. It supports a table of words that should
+ * A {@link TokenFilter} that stems German words.
+ * <p>
+ * It supports a table of words that should
 * not be stemmed at all. The stemmer used can be changed at runtime after the
- * filter object is created (as long as it is a GermanStemmer).
+ * filter object is created (as long as it is a {@link GermanStemmer}).
- *
+ * </p>
 *
 * @version $Id$
 */
@@ -78,7 +80,7 @@ public final class GermanStemFilter extends TokenFilter
 }

 /**
- * Set a alternative/custom GermanStemmer for this filter.
+ * Set a alternative/custom {@link GermanStemmer} for this filter.
 */
 public void setStemmer( GermanStemmer stemmer )
 {

@@ -19,10 +19,12 @@ package org.apache.lucene.analysis.de;
 */

 /**
- * A stemmer for German words. The algorithm is based on the report
+ * A stemmer for German words.
+ * <p>
+ * The algorithm is based on the report
 * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
 * Caumanns (joerg.caumanns at isst.fhg.de).
- *
+ * </p>
 *
 * @version $Id$
 */
@@ -30,10 +30,12 @@ import java.util.Map;
 import java.util.Set;

 /**
- * Analyzer for the Greek language. Supports an external list of stopwords (words
+ * {@link Analyzer} for the Greek language.
+ * <p>
+ * Supports an external list of stopwords (words
 * that will not be indexed at all).
 * A default set of stopwords is used unless an alternative list is specified.
- *
+ * </p>
 */
 public final class GreekAnalyzer extends Analyzer
 {
@@ -145,14 +147,14 @@ public final class GreekAnalyzer extends Analyzer
 };

 /**
- * Contains the stopwords used with the StopFilter.
+ * Contains the stopwords used with the {@link StopFilter}.
 */
 private Set stopSet = new HashSet();

 /**
 * Charset for Greek letters.
 * Represents encoding for 24 lowercase Greek letters.
- * Predefined charsets can be taken from GreekCharSets class
+ * Predefined charsets can be taken from {@link GreekCharsets} class
 */
 private char[] charset;

@@ -209,10 +211,10 @@ public final class GreekAnalyzer extends Analyzer
 }

 /**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
 *
- * @return A TokenStream built from a StandardTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * GreekLowerCaseFilter and StopFilter
+ * {@link GreekLowerCaseFilter} and {@link StopFilter}
 */
 public TokenStream tokenStream(String fieldName, Reader reader)
 {
@@ -228,11 +230,11 @@ public final class GreekAnalyzer extends Analyzer
 };

 /**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided Reader.
+ * in the provided {@link Reader}.
 *
- * @return A TokenStream built from a StandardTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * GreekLowerCaseFilter and StopFilter
+ * {@link GreekLowerCaseFilter} and {@link StopFilter}
 */
 public TokenStream reusableTokenStream(String fieldName, Reader reader)
 throws IOException {

@@ -19,10 +19,11 @@ package org.apache.lucene.analysis.el;
 /**
 * GreekCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
 * for greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
+ * <p>
 * Each encoding scheme contains lowercase (positions 0-35) and uppercase (position 36-68) characters,
 * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
 * the definition of a new charset as well as the required logic in the toLowerCase() method.
- *
+ * </p>
 */
 public class GreekCharsets
 {
@@ -36,12 +36,12 @@ import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;

 /**
- * Analyzer for Persian.
+ * {@link Analyzer} for Persian.
- *
+ * <p>
- * Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around
+ * This Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around
- * ZWNJ in addition to space. Some persian-specific variant forms (such as farsi
+ * zero-width non-joiner in addition to whitespace. Some persian-specific variant forms (such as farsi
 * yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
- *
+ * </p>
 */
 public final class PersianAnalyzer extends Analyzer {

@@ -107,11 +107,13 @@ public final class PersianAnalyzer extends Analyzer {
 }

 /**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * {@link Reader}.
 *
- * @return A TokenStream build from a ArabicLetterTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
- * LowerCaseFilter, ArabicNormalizationFilter,
+ * filtered with {@link LowerCaseFilter},
- * PersianNormalizationFilter and Persian Stop words
+ * {@link ArabicNormalizationFilter},
+ * {@link PersianNormalizationFilter} and Persian Stop words
 */
 public TokenStream tokenStream(String fieldName, Reader reader) {
 TokenStream result = new ArabicLetterTokenizer(reader);
@@ -134,12 +136,13 @@ public final class PersianAnalyzer extends Analyzer {
 }

 /**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided Reader.
+ * in the provided {@link Reader}.
 *
- * @return A TokenStream build from a ArabicLetterTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
- * LowerCaseFilter, ArabicNormalizationFilter,
+ * filtered with {@link LowerCaseFilter},
- * PersianNormalizationFilter and Persian Stop words
+ * {@link ArabicNormalizationFilter},
+ * {@link PersianNormalizationFilter} and Persian Stop words
 */
 public TokenStream reusableTokenStream(String fieldName, Reader reader)
 throws IOException {
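
The PersianAnalyzer javadoc above notes that ArabicLetterTokenizer also breaks tokens at the zero-width non-joiner (U+200C). A small, assumption-laden sketch that feeds such text through the analyzer (the sample word is arbitrary and some parts may be dropped as stopwords):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.fa.PersianAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class PersianAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        PersianAnalyzer analyzer = new PersianAnalyzer();
        // "\u200c" is the zero-width non-joiner; the tokenizer treats it like a space.
        TokenStream ts = analyzer.tokenStream("body",
            new StringReader("\u0645\u06cc\u200c\u062e\u0648\u0631\u062f"));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term());
        }
      }
    }
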
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * A TokenFilter that applies {@link PersianNormalizer} to normalize the
+ * A {@link TokenFilter} that applies {@link PersianNormalizer} to normalize the
 * orthography.
 *
 */
@@ -22,16 +22,17 @@ import java.util.Set;
 import java.util.HashSet;
 import java.util.Arrays;
 import java.util.Iterator;
+import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * Removes elisions from a token stream. For example, "l'avion" (the plane) will be
+ * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
 * tokenized as "avion" (plane).
 * <p>
- * Note that StandardTokenizer sees " ' " as a space, and cuts it out.
+ * Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out.
 *
 * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
 */
@@ -78,7 +79,7 @@ public class ElisionFilter extends TokenFilter {
 }

 /**
- * Returns the next input Token with term() without elisioned start
+ * Increments the {@link TokenStream} with a {@link TermAttribute} without elisioned start
 */
 public final boolean incrementToken() throws IOException {
 if (input.incrementToken()) {
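
The ElisionFilter javadoc above promises that "l'avion" comes out as "avion". A sketch of that pipeline (assuming the single-argument ElisionFilter constructor with its default article list, and the plain StandardTokenizer(Reader) constructor used elsewhere in these hunks):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.fr.ElisionFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ElisionDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new ElisionFilter(new StandardTokenizer(new StringReader("l'avion")));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // "avion", per the javadoc example above
        }
      }
    }
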
@@ -34,12 +34,14 @@ import java.util.Map;
 import java.util.Set;

 /**
- * Analyzer for French language. Supports an external list of stopwords (words that
+ * {@link Analyzer} for French language.
+ * <p>
+ * Supports an external list of stopwords (words that
 * will not be indexed at all) and an external list of exclusions (word that will
 * not be stemmed, but indexed).
- * A default set of stopwords is used unless an alternative list is specified, the
+ * A default set of stopwords is used unless an alternative list is specified, but the
 * exclusion list is empty by default.
- *
+ * </p>
 *
 * @version $Id$
 */
@@ -74,7 +76,7 @@ public final class FrenchAnalyzer extends Analyzer {
 };

 /**
- * Contains the stopwords used with the StopFilter.
+ * Contains the stopwords used with the {@link StopFilter}.
 */
 private Set stoptable = new HashSet();
 /**
@@ -127,10 +129,12 @@ public final class FrenchAnalyzer extends Analyzer {
 }

 /**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * {@link Reader}.
 *
- * @return A TokenStream built from a StandardTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
- * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * {@link FrenchStemFilter} and {@link LowerCaseFilter}
 */
 public final TokenStream tokenStream(String fieldName, Reader reader) {

@@ -152,11 +156,12 @@ public final class FrenchAnalyzer extends Analyzer {
 };

 /**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
- * in the provided Reader.
+ * text in the provided {@link Reader}.
 *
- * @return A TokenStream built from a StandardTokenizer filtered with
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
- * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * {@link FrenchStemFilter} and {@link LowerCaseFilter}
 */
 public TokenStream reusableTokenStream(String fieldName, Reader reader)
 throws IOException {

@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.fr;
 * limitations under the License.
 */

-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -28,10 +27,12 @@ import java.util.Map;
 import java.util.Set;

 /**
- * A filter that stemms french words. It supports a table of words that should
+ * A {@link TokenFilter} that stems french words.
+ * <p>
+ * It supports a table of words that should
 * not be stemmed at all. The used stemmer can be changed at runtime after the
- * filter object is created (as long as it is a FrenchStemmer).
+ * filter object is created (as long as it is a {@link FrenchStemmer}).
- *
+ * </p>
 */
 public final class FrenchStemFilter extends TokenFilter {

@@ -75,7 +76,7 @@ public final class FrenchStemFilter extends TokenFilter {
 }
 }
 /**
- * Set a alternative/custom FrenchStemmer for this filter.
+ * Set a alternative/custom {@link FrenchStemmer} for this filter.
 */
 public void setStemmer( FrenchStemmer stemmer ) {
 if ( stemmer != null ) {

@@ -18,11 +18,13 @@ package org.apache.lucene.analysis.fr;
 */

 /**
- * A stemmer for French words. The algorithm is based on the work of
+ * A stemmer for French words.
+ * <p>
+ * The algorithm is based on the work of
 * Dr Martin Porter on his snowball project<br>
 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
 * (French stemming algorithm) for details
- *
+ * </p>
 */

 public class FrenchStemmer {
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.TokenStream;
 import java.io.IOException;

 /**
- * Links two PrefixAwareTokenFilter
+ * Links two {@link PrefixAwareTokenFilter}.
 * <p/>
 * <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
 * the ones located in org.apache.lucene.analysis.tokenattributes.

@@ -29,7 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

 /**
- * A token stream containing a single token.
+ * A {@link TokenStream} containing a single token.
 */
 public class SingleTokenTokenStream extends TokenStream {

@@ -0,0 +1,5 @@
+<html><head></head>
+<body>
+Miscellaneous TokenStreams
+</body>
+</html>
@ -27,9 +27,9 @@ import java.io.IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tokenizes the given token into n-grams of given size(s).
|
* Tokenizes the given token into n-grams of given size(s).
|
||||||
*
|
* <p>
|
||||||
* This filter create n-grams from the beginning edge or ending edge of a input token.
|
* This {@link TokenFilter} create n-grams from the beginning edge or ending edge of a input token.
|
||||||
*
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class EdgeNGramTokenFilter extends TokenFilter {
|
public class EdgeNGramTokenFilter extends TokenFilter {
|
||||||
public static final Side DEFAULT_SIDE = Side.FRONT;
|
public static final Side DEFAULT_SIDE = Side.FRONT;
|
||||||
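A usage sketch based on the constructor documented in the next hunk (not part of the patch):

    // Front-edge n-grams of sizes 1..3, e.g. "lucene" -> "l", "lu", "luc".
    TokenStream edges = new EdgeNGramTokenFilter(
        new WhitespaceTokenizer(new StringReader("lucene")),
        EdgeNGramTokenFilter.Side.FRONT, 1, 3);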
|
@ -84,7 +84,7 @@ public class EdgeNGramTokenFilter extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
|
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
|
||||||
*
|
*
|
||||||
* @param input TokenStream holding the input to be tokenized
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
* @param side the {@link Side} from which to chop off an n-gram
|
* @param side the {@link Side} from which to chop off an n-gram
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the smallest n-gram to generate
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the largest n-gram to generate
|
||||||
|
@ -114,7 +114,7 @@ public class EdgeNGramTokenFilter extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
|
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
|
||||||
*
|
*
|
||||||
* @param input TokenStream holding the input to be tokenized
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
|
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the smallest n-gram to generate
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the largest n-gram to generate
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ngram;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
@ -28,10 +27,10 @@ import java.io.Reader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tokenizes the input from an edge into n-grams of given size(s).
|
* Tokenizes the input from an edge into n-grams of given size(s).
|
||||||
*
|
* <p>
|
||||||
* This tokenizer create n-grams from the beginning edge or ending edge of a input token.
|
* This {@link Tokenizer} creates n-grams from the beginning edge or ending edge of an input token.
|
||||||
* MaxGram can't be larger than 1024 because of an implementation limitation.
|
* MaxGram can't be larger than 1024 because of an implementation limitation.
|
||||||
*
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class EdgeNGramTokenizer extends Tokenizer {
|
public class EdgeNGramTokenizer extends Tokenizer {
|
||||||
public static final Side DEFAULT_SIDE = Side.FRONT;
|
public static final Side DEFAULT_SIDE = Side.FRONT;
|
||||||
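A usage sketch based on the constructor documented in the next hunk (not part of the patch; the Side constants are taken from EdgeNGramTokenFilter):

    // Back-edge n-grams of sizes 2..4 read straight from a Reader,
    // e.g. "lucene" -> "ne", "ene", "cene".
    Tokenizer edges = new EdgeNGramTokenizer(
        new StringReader("lucene"), EdgeNGramTokenFilter.Side.BACK, 2, 4);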
|
@ -82,7 +81,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
|
||||||
/**
|
/**
|
||||||
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
|
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
|
||||||
*
|
*
|
||||||
* @param input Reader holding the input to be tokenized
|
* @param input {@link Reader} holding the input to be tokenized
|
||||||
* @param side the {@link Side} from which to chop off an n-gram
|
* @param side the {@link Side} from which to chop off an n-gram
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the smallest n-gram to generate
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the largest n-gram to generate
|
||||||
|
@ -112,7 +111,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
|
||||||
/**
|
/**
|
||||||
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
|
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
|
||||||
*
|
*
|
||||||
* @param input Reader holding the input to be tokenized
|
* @param input {@link Reader} holding the input to be tokenized
|
||||||
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
|
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the smallest n-gram to generate
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the largest n-gram to generate
|
||||||
|
|
|
@ -44,7 +44,7 @@ public class NGramTokenFilter extends TokenFilter {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenFilter with given min and max n-grams.
|
* Creates NGramTokenFilter with given min and max n-grams.
|
||||||
* @param input TokenStream holding the input to be tokenized
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the smallest n-gram to generate
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the largest n-gram to generate
|
||||||
*/
|
*/
|
||||||
|
@ -65,7 +65,7 @@ public class NGramTokenFilter extends TokenFilter {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenFilter with default min and max n-grams.
|
* Creates NGramTokenFilter with default min and max n-grams.
|
||||||
* @param input TokenStream holding the input to be tokenized
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
*/
|
*/
|
||||||
public NGramTokenFilter(TokenStream input) {
|
public NGramTokenFilter(TokenStream input) {
|
||||||
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
||||||
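A usage sketch for the constructors documented above (not part of the patch):

    // All 2- and 3-grams of each incoming token, e.g. "abc" -> "ab", "bc", "abc".
    TokenStream grams = new NGramTokenFilter(
        new WhitespaceTokenizer(new StringReader("abc")), 2, 3);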
|
|
|
@ -44,7 +44,7 @@ public class NGramTokenizer extends Tokenizer {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenizer with given min and max n-grams.
|
* Creates NGramTokenizer with given min and max n-grams.
|
||||||
* @param input Reader holding the input to be tokenized
|
* @param input {@link Reader} holding the input to be tokenized
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the smallest n-gram to generate
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the largest n-gram to generate
|
||||||
*/
|
*/
|
||||||
|
@ -64,7 +64,7 @@ public class NGramTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenizer with default min and max n-grams.
|
* Creates NGramTokenizer with default min and max n-grams.
|
||||||
* @param input Reader holding the input to be tokenized
|
* @param input {@link Reader} holding the input to be tokenized
|
||||||
*/
|
*/
|
||||||
public NGramTokenizer(Reader input) {
|
public NGramTokenizer(Reader input) {
|
||||||
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
||||||
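A usage sketch for the constructors documented above (not part of the patch; the default gram sizes are assumed to be the contrib defaults of 1 and 2):

    // Default-size grams read straight from a Reader,
    // e.g. "abc" -> "a", "b", "c", "ab", "bc".
    Tokenizer grams = new NGramTokenizer(new StringReader("abc"));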
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Character n-gram tokenizers and filters.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -33,13 +33,15 @@ import java.util.Set;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Analyzer for Dutch language. Supports an external list of stopwords (words that
|
* {@link Analyzer} for Dutch language.
|
||||||
|
* <p>
|
||||||
|
* Supports an external list of stopwords (words that
|
||||||
* will not be indexed at all), an external list of exclusions (word that will
|
* will not be indexed at all), an external list of exclusions (word that will
|
||||||
* not be stemmed, but indexed) and an external list of word-stem pairs that overrule
|
* not be stemmed, but indexed) and an external list of word-stem pairs that overrule
|
||||||
* the algorithm (dictionary stemming).
|
* the algorithm (dictionary stemming).
|
||||||
* A default set of stopwords is used unless an alternative list is specified, the
|
* A default set of stopwords is used unless an alternative list is specified, but the
|
||||||
* exclusion list is empty by default.
|
* exclusion list is empty by default.
|
||||||
*
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class DutchAnalyzer extends Analyzer {
|
public class DutchAnalyzer extends Analyzer {
|
||||||
/**
|
/**
|
||||||
|
@ -165,10 +167,12 @@ public class DutchAnalyzer extends Analyzer {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a TokenStream which tokenizes all the text in the provided TextReader.
|
* Creates a {@link TokenStream} which tokenizes all the text in the
|
||||||
|
* provided {@link Reader}.
|
||||||
*
|
*
|
||||||
* @return A TokenStream built from a StandardTokenizer filtered with StandardFilter,
|
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
|
||||||
* StopFilter, DutchStemFilter
|
* filtered with {@link StandardFilter}, {@link StopFilter},
|
||||||
|
* and {@link DutchStemFilter}
|
||||||
*/
|
*/
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
TokenStream result = new StandardTokenizer(reader);
|
TokenStream result = new StandardTokenizer(reader);
|
||||||
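A consumption sketch for the pipeline described above, mirroring the pattern the new tests in this patch use (not part of the patch itself):

    // Iterate the StandardTokenizer -> StandardFilter -> StopFilter -> DutchStemFilter chain.
    TokenStream ts = new DutchAnalyzer().tokenStream("field", new StringReader("lichamelijkheden"));
    TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());   // stemmed Dutch terms
    }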
|
@ -184,11 +188,12 @@ public class DutchAnalyzer extends Analyzer {
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a (possibly reused) TokenStream which tokenizes all the text
|
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the
|
||||||
* in the provided Reader.
|
* text in the provided {@link Reader}.
|
||||||
*
|
*
|
||||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
|
||||||
* StandardFilter, StopFilter, DutchStemFilter
|
* filtered with {@link StandardFilter}, {@link StopFilter},
|
||||||
|
* and {@link DutchStemFilter}
|
||||||
*/
|
*/
|
||||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
|
@ -28,10 +28,12 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A filter that stems Dutch words. It supports a table of words that should
|
* A {@link TokenFilter} that stems Dutch words.
|
||||||
|
* <p>
|
||||||
|
* It supports a table of words that should
|
||||||
* not be stemmed at all. The stemmer used can be changed at runtime after the
|
* not be stemmed at all. The stemmer used can be changed at runtime after the
|
||||||
* filter object is created (as long as it is a DutchStemmer).
|
* filter object is created (as long as it is a {@link DutchStemmer}).
|
||||||
*
|
* </p>
|
||||||
*/
|
*/
|
||||||
public final class DutchStemFilter extends TokenFilter {
|
public final class DutchStemFilter extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
|
@ -85,7 +87,7 @@ public final class DutchStemFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set a alternative/custom DutchStemmer for this filter.
|
* Set an alternative/custom {@link DutchStemmer} for this filter.
|
||||||
*/
|
*/
|
||||||
public void setStemmer(DutchStemmer stemmer) {
|
public void setStemmer(DutchStemmer stemmer) {
|
||||||
if (stemmer != null) {
|
if (stemmer != null) {
|
||||||
|
|
|
@ -20,11 +20,12 @@ package org.apache.lucene.analysis.nl;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* A stemmer for Dutch words.
|
||||||
* A stemmer for Dutch words. The algorithm is an implementation of
|
* <p>
|
||||||
|
* The algorithm is an implementation of
|
||||||
* the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
|
* the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
|
||||||
* algorithm in Martin Porter's snowball project.
|
* algorithm in Martin Porter's snowball project.
|
||||||
*
|
* </p>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class DutchStemmer {
|
public class DutchStemmer {
|
||||||
|
|
|
@ -4,7 +4,7 @@ import org.apache.lucene.index.Payload;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* Base class for payload encoders.
|
||||||
*
|
*
|
||||||
**/
|
**/
|
||||||
public abstract class AbstractEncoder implements PayloadEncoder{
|
public abstract class AbstractEncoder implements PayloadEncoder{
|
||||||
|
|
|
@ -22,7 +22,7 @@ import org.apache.lucene.index.Payload;
|
||||||
/**
|
/**
|
||||||
* Encode a character array Float as a {@link org.apache.lucene.index.Payload}.
|
* Encode a character array Float as a {@link org.apache.lucene.index.Payload}.
|
||||||
* <p/>
|
* <p/>
|
||||||
* @see {@link org.apache.lucene.analysis.payloads.PayloadHelper#encodeFloat(float, byte[], int)}
|
* @see org.apache.lucene.analysis.payloads.PayloadHelper#encodeFloat(float, byte[], int)
|
||||||
*
|
*
|
||||||
**/
|
**/
|
||||||
public class FloatEncoder extends AbstractEncoder implements PayloadEncoder {
|
public class FloatEncoder extends AbstractEncoder implements PayloadEncoder {
|
||||||
|
|
|
@ -20,7 +20,7 @@ import org.apache.lucene.index.Payload;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Mainly for use with the DelimitedPayloadTokenFilter, converts char buffers to Payload
|
* Mainly for use with the DelimitedPayloadTokenFilter, converts char buffers to Payload.
|
||||||
* <p/>
|
* <p/>
|
||||||
* NOTE: This interface is subject to change
|
* NOTE: This interface is subject to change
|
||||||
*
|
*
|
||||||
|
@ -34,7 +34,7 @@ public interface PayloadEncoder {
|
||||||
* @param buffer
|
* @param buffer
|
||||||
* @param offset
|
* @param offset
|
||||||
* @param length
|
* @param length
|
||||||
* @return
|
* @return encoded {@link Payload}
|
||||||
*/
|
*/
|
||||||
Payload encode(char [] buffer, int offset, int length);
|
Payload encode(char [] buffer, int offset, int length);
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,7 +18,7 @@ package org.apache.lucene.analysis.payloads;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* Utility methods for encoding payloads.
|
||||||
*
|
*
|
||||||
**/
|
**/
|
||||||
public class PayloadHelper {
|
public class PayloadHelper {
|
||||||
|
@ -60,7 +60,7 @@ public class PayloadHelper {
|
||||||
* @param offset The offset into the array.
|
* @param offset The offset into the array.
|
||||||
* @return The float that was encoded
|
* @return The float that was encoded
|
||||||
*
|
*
|
||||||
* @see # encodeFloat (float)
|
* @see #encodeFloat(float)
|
||||||
*/
|
*/
|
||||||
public static final float decodeFloat(byte [] bytes, int offset){
|
public static final float decodeFloat(byte [] bytes, int offset){
|
||||||
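A round-trip sketch (not part of the patch; encodeFloat(float) is taken from the @see reference above and assumed to return the payload bytes, decodeFloat is shown in this hunk):

    byte[] bytes = PayloadHelper.encodeFloat(2.5f);       // float -> payload bytes
    float weight = PayloadHelper.decodeFloat(bytes, 0);   // back to 2.5f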
|
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Filter for assigning position increments.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -28,19 +28,19 @@ import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* An analyzer used primarily at query time to wrap another analyzer and provide a layer of protection
|
* An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
|
||||||
* which prevents very common words from being passed into queries. For very large indexes the cost
|
* which prevents very common words from being passed into queries.
|
||||||
|
* <p>
|
||||||
|
* For very large indexes the cost
|
||||||
* of reading TermDocs for a very common word can be high. This analyzer was created after experience with
|
* of reading TermDocs for a very common word can be high. This analyzer was created after experience with
|
||||||
* a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
|
* a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
|
||||||
* this term to take 2 seconds.
|
* this term to take 2 seconds.
|
||||||
*
|
* </p>
|
||||||
|
* <p>
|
||||||
* Use the various "addStopWords" methods in this class to automate the identification and addition of
|
* Use the various "addStopWords" methods in this class to automate the identification and addition of
|
||||||
* stop words found in an already existing index.
|
* stop words found in an already existing index.
|
||||||
*
|
* </p>
|
||||||
*
|
|
||||||
*
|
|
||||||
|
|
||||||
*/
|
*/
|
||||||
public class QueryAutoStopWordAnalyzer extends Analyzer {
|
public class QueryAutoStopWordAnalyzer extends Analyzer {
|
||||||
Analyzer delegate;
|
Analyzer delegate;
|
||||||
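A usage sketch matching the addStopWords variants documented below (not part of the patch; indexReader stands for an already-open IndexReader):

    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(new WhitespaceAnalyzer());
    a.addStopWords(indexReader, 0.4f);   // terms appearing in more than 40% of docs become stop words
    // pass 'a' to the query parser so those terms are dropped from parsed queries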
|
@ -50,9 +50,9 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
|
||||||
public static final float defaultMaxDocFreqPercent = 0.4f;
|
public static final float defaultMaxDocFreqPercent = 0.4f;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initializes this analyzer with the Analyzer object that actual produces the tokens
|
* Initializes this analyzer with the Analyzer object that actually produces the tokens
|
||||||
*
|
*
|
||||||
* @param delegate The choice of analyzer that is used to produce the token stream which needs filtering
|
* @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
|
||||||
*/
|
*/
|
||||||
public QueryAutoStopWordAnalyzer(Analyzer delegate) {
|
public QueryAutoStopWordAnalyzer(Analyzer delegate) {
|
||||||
this.delegate = delegate;
|
this.delegate = delegate;
|
||||||
|
@ -62,7 +62,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
|
||||||
/**
|
/**
|
||||||
* Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
|
* Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
|
||||||
*
|
*
|
||||||
* @param reader The IndexReader class which will be consulted to identify potential stop words that
|
* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
|
||||||
* exceed the required document frequency
|
* exceed the required document frequency
|
||||||
* @return The number of stop words identified.
|
* @return The number of stop words identified.
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
|
@ -74,7 +74,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
|
||||||
/**
|
/**
|
||||||
* Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
|
* Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
|
||||||
*
|
*
|
||||||
* @param reader The IndexReader class which will be consulted to identify potential stop words that
|
* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
|
||||||
* exceed the required document frequency
|
* exceed the required document frequency
|
||||||
* @param maxDocFreq The maximum number of index documents which can contain a term, after which
|
* @param maxDocFreq The maximum number of index documents which can contain a term, after which
|
||||||
* the term is considered to be a stop word
|
* the term is considered to be a stop word
|
||||||
|
@ -94,7 +94,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
|
||||||
/**
|
/**
|
||||||
* Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
|
* Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
|
||||||
*
|
*
|
||||||
* @param reader The IndexReader class which will be consulted to identify potential stop words that
|
* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
|
||||||
* exceed the required document frequency
|
* exceed the required document frequency
|
||||||
* @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
|
* @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
|
||||||
* contain a term, after which the word is considered to be a stop word.
|
* contain a term, after which the word is considered to be a stop word.
|
||||||
|
@ -114,7 +114,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
|
||||||
/**
|
/**
|
||||||
* Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
|
* Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
|
||||||
*
|
*
|
||||||
* @param reader The IndexReader class which will be consulted to identify potential stop words that
|
* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
|
||||||
* exceed the required document frequency
|
* exceed the required document frequency
|
||||||
* @param fieldName The field for which stopwords will be added
|
* @param fieldName The field for which stopwords will be added
|
||||||
* @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
|
* @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
|
||||||
|
@ -129,7 +129,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
|
||||||
/**
|
/**
|
||||||
* Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
|
* Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
|
||||||
*
|
*
|
||||||
* @param reader The IndexReader class which will be consulted to identify potential stop words that
|
* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
|
||||||
* exceed the required document frequency
|
* exceed the required document frequency
|
||||||
* @param fieldName The field for which stopwords will be added
|
* @param fieldName The field for which stopwords will be added
|
||||||
* @param maxDocFreq The maximum number of index documents which
|
* @param maxDocFreq The maximum number of index documents which
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Automatically filter high-frequency stopwords.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Filter to reverse token text.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -29,10 +29,12 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Analyzer for Russian language. Supports an external list of stopwords (words that
|
* {@link Analyzer} for Russian language.
|
||||||
|
* <p>
|
||||||
|
* Supports an external list of stopwords (words that
|
||||||
* will not be indexed at all).
|
* will not be indexed at all).
|
||||||
* A default set of stopwords is used unless an alternative list is specified.
|
* A default set of stopwords is used unless an alternative list is specified.
|
||||||
*
|
* </p>
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
|
@ -246,10 +248,13 @@ public final class RussianAnalyzer extends Analyzer
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
* Creates a {@link TokenStream} which tokenizes all the text in the
|
||||||
|
* provided {@link Reader}.
|
||||||
*
|
*
|
||||||
* @return A TokenStream built from a RussianLetterTokenizer filtered with
|
* @return A {@link TokenStream} built from a
|
||||||
* RussianLowerCaseFilter, StopFilter, and RussianStemFilter
|
* {@link RussianLetterTokenizer} filtered with
|
||||||
|
* {@link RussianLowerCaseFilter}, {@link StopFilter},
|
||||||
|
* and {@link RussianStemFilter}
|
||||||
*/
|
*/
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader)
|
public TokenStream tokenStream(String fieldName, Reader reader)
|
||||||
{
|
{
|
||||||
|
@ -266,11 +271,13 @@ public final class RussianAnalyzer extends Analyzer
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a (possibly reused) TokenStream which tokenizes all the text
|
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
|
||||||
* in the provided Reader.
|
* in the provided {@link Reader}.
|
||||||
*
|
*
|
||||||
* @return A TokenStream built from a RussianLetterTokenizer filtered with
|
* @return A {@link TokenStream} built from a
|
||||||
* RussianLowerCaseFilter, StopFilter, and RussianStemFilter
|
* {@link RussianLetterTokenizer} filtered with
|
||||||
|
* {@link RussianLowerCaseFilter}, {@link StopFilter},
|
||||||
|
* and {@link RussianStemFilter}
|
||||||
*/
|
*/
|
||||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
|
@ -19,10 +19,11 @@ package org.apache.lucene.analysis.ru;
|
||||||
/**
|
/**
|
||||||
* RussianCharsets class contains encoding schemes (charsets) and toLowerCase() method implementation
|
* RussianCharsets class contains encoding schemes (charsets) and toLowerCase() method implementation
|
||||||
* for russian characters in Unicode, KOI8 and CP1252.
|
* for russian characters in Unicode, KOI8 and CP1252.
|
||||||
|
* <p>
|
||||||
* Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
|
* Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
|
||||||
* One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
|
* One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
|
||||||
* and adding logic to toLowerCase() method for that charset.
|
* and adding logic to toLowerCase() method for that charset.
|
||||||
*
|
* </p>
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -19,13 +19,18 @@ package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import org.apache.lucene.analysis.CharTokenizer;
|
import org.apache.lucene.analysis.CharTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer; // for javadocs
|
||||||
|
import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
|
* A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
|
||||||
* in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter() method,
|
* by additionally looking up letters in a given "russian charset".
|
||||||
|
* <p>
|
||||||
|
* The problem with
|
||||||
|
* {@link LetterTokenizer} is that it uses {@link Character#isLetter(char)} method,
|
||||||
* which doesn't know how to detect letters in encodings like CP1252 and KOI8
|
* which doesn't know how to detect letters in encodings like CP1252 and KOI8
|
||||||
* (well-known problems with 0xD7 and 0xF7 chars)
|
* (well-known problems with 0xD7 and 0xF7 chars)
|
||||||
*
|
* </p>
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ru;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,6 @@ package org.apache.lucene.analysis.ru;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
@ -25,10 +24,12 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
|
* A {@link TokenFilter} that stems Russian words.
|
||||||
* The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter ,
|
* <p>
|
||||||
|
* The implementation was inspired by GermanStemFilter.
|
||||||
|
* The input should be filtered by {@link RussianLowerCaseFilter} before passing it to RussianStemFilter,
|
||||||
* because RussianStemFilter only works with lowercase part of any "russian" charset.
|
* because RussianStemFilter only works with lowercase part of any "russian" charset.
|
||||||
*
|
* </p>
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
|
@ -66,7 +67,7 @@ public final class RussianStemFilter extends TokenFilter
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set a alternative/custom RussianStemmer for this filter.
|
* Set an alternative/custom {@link RussianStemmer} for this filter.
|
||||||
*/
|
*/
|
||||||
public void setStemmer(RussianStemmer stemmer)
|
public void setStemmer(RussianStemmer stemmer)
|
||||||
{
|
{
|
||||||
|
|
|
@ -25,8 +25,10 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A ShingleAnalyzerWrapper wraps a ShingleFilter around another analyzer. A
|
* A ShingleAnalyzerWrapper wraps a {@link ShingleFilter} around another {@link Analyzer}.
|
||||||
* shingle is another namefor a token based n-gram.
|
* <p>
|
||||||
|
* A shingle is another name for a token based n-gram.
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class ShingleAnalyzerWrapper extends Analyzer {
|
public class ShingleAnalyzerWrapper extends Analyzer {
|
||||||
|
|
||||||
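A usage sketch (not part of the patch; the (Analyzer, int) constructor is an assumption about the contrib API):

    // Shingles up to size 2: "please divide this" ->
    // "please", "please divide", "divide", "divide this", "this"
    Analyzer shingles = new ShingleAnalyzerWrapper(new StandardAnalyzer(), 2);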
|
|
|
@ -76,7 +76,7 @@ public class ShingleFilter extends TokenFilter {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructs a ShingleFilter with the specified shingle size from the
|
* Constructs a ShingleFilter with the specified shingle size from the
|
||||||
* TokenStream <code>input</code>
|
* {@link TokenStream} <code>input</code>
|
||||||
*
|
*
|
||||||
* @param input input stream
|
* @param input input stream
|
||||||
* @param maxShingleSize maximum shingle size produced by the filter.
|
* @param maxShingleSize maximum shingle size produced by the filter.
|
||||||
|
|
|
@ -129,7 +129,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
/**
|
/**
|
||||||
* Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
|
* Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
|
||||||
* @param token
|
* @param token
|
||||||
* @return
|
* @return {@link ShingleMatrixFilter.TokenPositioner}
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public abstract TokenPositioner getTokenPositioner(Token token) throws IOException;
|
public abstract TokenPositioner getTokenPositioner(Token token) throws IOException;
|
||||||
|
@ -1014,7 +1014,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
* Returns a 32 bit float from the payload, or 1f if it is null.
|
* Returns a 32 bit float from the payload, or 1f if it is null.
|
||||||
*
|
*
|
||||||
* @param token
|
* @param token
|
||||||
* @return
|
* @return 32 bit float
|
||||||
*/
|
*/
|
||||||
public float getWeight(Token token) {
|
public float getWeight(Token token) {
|
||||||
if (token.getPayload() == null || token.getPayload().getData() == null) {
|
if (token.getPayload() == null || token.getPayload().getData() == null) {
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Word n-gram filters
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -27,7 +27,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Analyzer for Thai language. It uses java.text.BreakIterator to break words.
|
* {@link Analyzer} for Thai language. It uses {@link java.text.BreakIterator} to break words.
|
||||||
* @version 0.2
|
* @version 0.2
|
||||||
*/
|
*/
|
||||||
public class ThaiAnalyzer extends Analyzer {
|
public class ThaiAnalyzer extends Analyzer {
|
||||||
|
|
|
@ -28,7 +28,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
import java.text.BreakIterator;
|
import java.text.BreakIterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TokenFilter that use java.text.BreakIterator to break each
|
* {@link TokenFilter} that uses {@link java.text.BreakIterator} to break each
|
||||||
* Token that is Thai into separate Token(s) for each Thai word.
|
* Token that is Thai into separate Token(s) for each Thai word.
|
||||||
* @version 0.2
|
* @version 0.2
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Thai.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -118,6 +118,14 @@ public class TestBrazilianStemmer extends TestCase {
|
||||||
check("quiosque", "quiosqu");
|
check("quiosque", "quiosqu");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testNormalization() throws Exception {
|
||||||
|
check("Brasil", "brasil"); // lowercase by default
|
||||||
|
check("Brasília", "brasil"); // remove diacritics
|
||||||
|
check("quimio5terápicos", "quimio5terapicos"); // contains non-letter, diacritic will still be removed
|
||||||
|
check("áá", "áá"); // token is too short: diacritics are not removed
|
||||||
|
check("ááá", "aaa"); // normally, diacritics are removed
|
||||||
|
}
|
||||||
|
|
||||||
public void testReusableTokenStream() throws Exception {
|
public void testReusableTokenStream() throws Exception {
|
||||||
Analyzer a = new BrazilianAnalyzer();
|
Analyzer a = new BrazilianAnalyzer();
|
||||||
checkReuse(a, "boa", "boa");
|
checkReuse(a, "boa", "boa");
|
||||||
|
@ -126,6 +134,11 @@ public class TestBrazilianStemmer extends TestCase {
|
||||||
checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portugese
|
checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portugese
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testStemExclusionTable() throws Exception {
|
||||||
|
BrazilianAnalyzer a = new BrazilianAnalyzer();
|
||||||
|
a.setStemExclusionTable(new String[] { "quintessência" });
|
||||||
|
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
|
||||||
|
}
|
||||||
|
|
||||||
private void check(final String input, final String expected) throws IOException {
|
private void check(final String input, final String expected) throws IOException {
|
||||||
Analyzer analyzer = new BrazilianAnalyzer();
|
Analyzer analyzer = new BrazilianAnalyzer();
|
||||||
|
|
|
@ -169,6 +169,66 @@ public class TestCJKTokenizer extends TestCase{
|
||||||
checkCJKToken(str, out_tokens);
|
checkCJKToken(str, out_tokens);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Full-width text is normalized to half-width
|
||||||
|
*/
|
||||||
|
public void testFullWidth() throws Exception {
|
||||||
|
String str = "Test 1234";
|
||||||
|
TestToken[] out_tokens = {
|
||||||
|
newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||||
|
newToken("1234", 5, 9, CJKTokenizer.SINGLE_TOKEN_TYPE)
|
||||||
|
};
|
||||||
|
checkCJKToken(str, out_tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Non-English text (not just CJK) is treated the same as CJK: C1C2 C2C3
|
||||||
|
*/
|
||||||
|
public void testNonIdeographic() throws Exception {
|
||||||
|
String str = "\u4e00 روبرت موير";
|
||||||
|
TestToken[] out_tokens = {
|
||||||
|
newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("رو", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("وب", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("بر", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("رت", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("مو", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("وي", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("ير", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE)
|
||||||
|
};
|
||||||
|
checkCJKToken(str, out_tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Non-English text with non-letters (non-spacing marks, etc.) is treated as C1C2 C2C3,
|
||||||
|
* except that words are split around non-letters.
|
||||||
|
*/
|
||||||
|
public void testNonIdeographicNonLetter() throws Exception {
|
||||||
|
String str = "\u4e00 رُوبرت موير";
|
||||||
|
TestToken[] out_tokens = {
|
||||||
|
newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("ر", 2, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("وب", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("بر", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("رت", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("مو", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("وي", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||||
|
newToken("ير", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
|
||||||
|
};
|
||||||
|
checkCJKToken(str, out_tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTokenStream() throws Exception {
|
||||||
|
Analyzer analyzer = new CJKAnalyzer();
|
||||||
|
TokenStream ts = analyzer.tokenStream("dummy", new StringReader("\u4e00\u4e01\u4e02"));
|
||||||
|
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||||
|
assertTrue(ts.incrementToken());
|
||||||
|
assertEquals("\u4e00\u4e01", termAtt.term());
|
||||||
|
assertTrue(ts.incrementToken());
|
||||||
|
assertEquals("\u4e01\u4e02", termAtt.term());
|
||||||
|
assertFalse(ts.incrementToken());
|
||||||
|
}
|
||||||
|
|
||||||
public void testReusableTokenStream() throws Exception {
|
public void testReusableTokenStream() throws Exception {
|
||||||
Analyzer analyzer = new CJKAnalyzer();
|
Analyzer analyzer = new CJKAnalyzer();
|
||||||
String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
|
String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
|
||||||
|
|
|
@ -18,12 +18,15 @@ package org.apache.lucene.analysis.cn;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
@ -59,6 +62,76 @@ public class TestChineseTokenizer extends TestCase
|
||||||
new int[] { 1, 2, 3 });
|
new int[] { 1, 2, 3 });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Analyzer that just uses ChineseTokenizer, not ChineseFilter.
|
||||||
|
* convenience to show the behavior of the tokenizer
|
||||||
|
*/
|
||||||
|
private class JustChineseTokenizerAnalyzer extends Analyzer {
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
return new ChineseTokenizer(reader);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Analyzer that just uses ChineseFilter, not ChineseTokenizer.
|
||||||
|
* convenience to show the behavior of the filter.
|
||||||
|
*/
|
||||||
|
private class JustChineseFilterAnalyzer extends Analyzer {
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
return new ChineseFilter(new WhitespaceTokenizer(reader));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ChineseTokenizer tokenizes numbers as one token, but they are filtered by ChineseFilter
|
||||||
|
*/
|
||||||
|
public void testNumerics() throws Exception
|
||||||
|
{
|
||||||
|
Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
|
||||||
|
assertAnalyzesTo(justTokenizer, "中1234", new String[] { "中", "1234" });
|
||||||
|
|
||||||
|
// in this case the ChineseAnalyzer (which applies ChineseFilter) will remove the numeric token.
|
||||||
|
Analyzer a = new ChineseAnalyzer();
|
||||||
|
assertAnalyzesTo(a, "中1234", new String[] { "中" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ChineseTokenizer tokenizes English similarly to SimpleAnalyzer.
|
||||||
|
* It will lowercase terms automatically.
|
||||||
|
*
|
||||||
|
* ChineseFilter has an English stopword list; it also removes any single-character tokens.
|
||||||
|
* The stopword list is case-sensitive.
|
||||||
|
*/
|
||||||
|
public void testEnglish() throws Exception
|
||||||
|
{
|
||||||
|
Analyzer chinese = new ChineseAnalyzer();
|
||||||
|
assertAnalyzesTo(chinese, "This is a Test. b c d",
|
||||||
|
new String[] { "test" });
|
||||||
|
|
||||||
|
Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
|
||||||
|
assertAnalyzesTo(justTokenizer, "This is a Test. b c d",
|
||||||
|
new String[] { "this", "is", "a", "test", "b", "c", "d" });
|
||||||
|
|
||||||
|
Analyzer justFilter = new JustChineseFilterAnalyzer();
|
||||||
|
assertAnalyzesTo(justFilter, "This is a Test. b c d",
|
||||||
|
new String[] { "This", "Test." });
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertAnalyzesTo(Analyzer a, String input, String[] output)
|
||||||
|
throws Exception {
|
||||||
|
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||||
|
TermAttribute termAtt = (TermAttribute) ts
|
||||||
|
.getAttribute(TermAttribute.class);
|
||||||
|
|
||||||
|
for (int i = 0; i < output.length; i++) {
|
||||||
|
assertTrue(ts.incrementToken());
|
||||||
|
assertEquals(output[i], termAtt.term());
|
||||||
|
}
|
||||||
|
|
||||||
|
assertFalse(ts.incrementToken());
|
||||||
|
ts.close();
|
||||||
|
}
|
||||||
|
|
||||||
private void assertAnalyzesToReuse(Analyzer a, String input, String[] output,
|
private void assertAnalyzesToReuse(Analyzer a, String input, String[] output,
|
||||||
int startOffsets[], int endOffsets[])
|
int startOffsets[], int endOffsets[])
|
||||||
throws Exception {
|
throws Exception {
|
||||||
|
|
|
@ -90,12 +90,12 @@ public class TestGermanStemFilter extends TestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void check(final String input, final String expected) throws IOException {
|
private void check(final String input, final String expected) throws IOException {
|
||||||
StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
|
Analyzer a = new GermanAnalyzer();
|
||||||
GermanStemFilter filter = new GermanStemFilter(tokenStream);
|
TokenStream tokenStream = a.tokenStream("dummy", new StringReader(input));
|
||||||
TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
|
TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
|
||||||
assertTrue(filter.incrementToken());
|
assertTrue(tokenStream.incrementToken());
|
||||||
assertEquals(expected, termAtt.term());
|
assertEquals(expected, termAtt.term());
|
||||||
filter.close();
|
tokenStream.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkReuse(Analyzer a, String input, String expected) throws IOException {
|
private void checkReuse(Analyzer a, String input, String expected) throws IOException {
|
||||||
|
|
|
@ -18,9 +18,11 @@ package org.apache.lucene.analysis.query;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.LetterTokenizer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
@ -35,6 +37,7 @@ import org.apache.lucene.store.RAMDirectory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
public class QueryAutoStopWordAnalyzerTest extends TestCase {
|
public class QueryAutoStopWordAnalyzerTest extends TestCase {
|
||||||
String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
|
String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
|
||||||
|
@ -162,4 +165,37 @@ public class QueryAutoStopWordAnalyzerTest extends TestCase {
|
||||||
Hits h = search(a, "repetitiveField:boring");
|
Hits h = search(a, "repetitiveField:boring");
|
||||||
assertFalse(h.length() == 0);
|
assertFalse(h.length() == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* analyzer that does not support reuse
|
||||||
|
* it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even.
|
||||||
|
*/
|
||||||
|
private class NonreusableAnalyzer extends Analyzer {
|
||||||
|
int invocationCount = 0;
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
if (++invocationCount % 2 == 0)
|
||||||
|
return new WhitespaceTokenizer(reader);
|
||||||
|
else
|
||||||
|
return new LetterTokenizer(reader);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWrappingNonReusableAnalyzer() throws Exception {
|
||||||
|
QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(new NonreusableAnalyzer());
|
||||||
|
a.addStopWords(reader, 10);
|
||||||
|
Hits h = search(a, "repetitiveField:boring");
|
||||||
|
assertTrue(h.length() == 0);
|
||||||
|
h = search(a, "repetitiveField:vaguelyboring");
|
||||||
|
assertTrue(h.length() == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTokenStream() throws Exception {
|
||||||
|
QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(new WhitespaceAnalyzer());
|
||||||
|
a.addStopWords(reader, 10);
|
||||||
|
TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
|
||||||
|
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||||
|
assertTrue(ts.incrementToken());
|
||||||
|
assertEquals("this", termAtt.term());
|
||||||
|
assertFalse(ts.incrementToken());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -336,7 +336,9 @@ public class TestShingleMatrixFilter extends TestCase {
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public void testMatrix() throws IOException {
|
public void testMatrix() throws IOException {
|
||||||
|
// some other tests set this to null.
|
||||||
|
// set it here in case tests are run out of the usual order.
|
||||||
|
ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
|
||||||
Matrix matrix = new Matrix();
|
Matrix matrix = new Matrix();
|
||||||
|
|
||||||
matrix.new Column(tokenFactory("no", 1));
|
matrix.new Column(tokenFactory("no", 1));
|
||||||
|
|
|
@ -57,7 +57,7 @@ public class TestSmartChineseAnalyzer extends TestCase {
|
||||||
* This test is the same as the above, except using an ideographic space as a separator.
|
* This test is the same as the above, except using an ideographic space as a separator.
|
||||||
* This tests to ensure the stopwords are working correctly.
|
* This tests to ensure the stopwords are working correctly.
|
||||||
*/
|
*/
|
||||||
public void testChineseStopWordsDefaultTwoPhrasesIdeoSpache() throws Exception {
|
public void testChineseStopWordsDefaultTwoPhrasesIdeoSpace() throws Exception {
|
||||||
Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
|
Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
|
||||||
String sentence = "我购买了道具和服装 我购买了道具和服装。";
|
String sentence = "我购买了道具和服装 我购买了道具和服装。";
|
||||||
String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
|
String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
|
||||||
|
@ -101,6 +101,52 @@ public class TestSmartChineseAnalyzer extends TestCase {
|
||||||
new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
|
new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Numerics are parsed as their own tokens
|
||||||
|
*/
|
||||||
|
public void testNumerics() throws Exception {
|
||||||
|
assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Tests 了道具和服装1234",
|
||||||
|
new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Full width alphas and numerics are folded to half-width
|
||||||
|
*/
|
||||||
|
public void testFullWidth() throws Exception {
|
||||||
|
assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Tests 了道具和服装1234",
|
||||||
|
new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Presentation form delimiters are removed
|
||||||
|
*/
|
||||||
|
public void testDelimiters() throws Exception {
|
||||||
|
assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买︱ Tests 了道具和服装",
|
||||||
|
new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Text from writing systems other than Chinese and Latin is parsed as individual characters.
|
||||||
|
* (regardless of Unicode category)
|
||||||
|
*/
|
||||||
|
public void testNonChinese() throws Exception {
|
||||||
|
assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 روبرتTests 了道具和服装",
|
||||||
|
new String[] { "我", "购买", "ر", "و", "ب", "ر", "ت", "test", "了", "道具", "和", "服装"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Test what the analyzer does with out-of-vocabulary words.
|
||||||
|
* In this case the name is Yousaf Raza Gillani.
|
||||||
|
* Currently it is being analyzed into single characters...
|
||||||
|
*/
|
||||||
|
public void testOOV() throws Exception {
|
||||||
|
assertAnalyzesTo(new SmartChineseAnalyzer(true), "优素福·拉扎·吉拉尼",
|
||||||
|
new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
|
||||||
|
|
||||||
|
assertAnalyzesTo(new SmartChineseAnalyzer(true), "优素福拉扎吉拉尼",
|
||||||
|
new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
|
||||||
|
}
|
||||||
|
|
||||||
public void testOffsets() throws Exception {
|
public void testOffsets() throws Exception {
|
||||||
assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买了道具和服装",
|
assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买了道具和服装",
|
||||||
new String[] { "我", "购买", "了", "道具", "和", "服装" },
|
new String[] { "我", "购买", "了", "道具", "和", "服装" },
|
||||||
|
|
|
@ -109,6 +109,7 @@ public class TestSynonymTokenFilter extends TestCase {
|
||||||
streams.source = new WhitespaceTokenizer(reader);
|
streams.source = new WhitespaceTokenizer(reader);
|
||||||
streams.result = new LowerCaseFilter(streams.source);
|
streams.result = new LowerCaseFilter(streams.source);
|
||||||
streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms);
|
streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms);
|
||||||
|
setPreviousTokenStream(streams);
|
||||||
} else {
|
} else {
|
||||||
streams.source.reset(reader);
|
streams.source.reset(reader);
|
||||||
streams.result.reset(); // reset the SynonymTokenFilter
|
streams.result.reset(); // reset the SynonymTokenFilter
|
||||||
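For context on the one-line fix above: setPreviousTokenStream is what caches the SavedStreams holder, so the reusable pattern in this test reads roughly as follows (a sketch, not the verbatim test source; SavedStreams is the test's own holder class):

    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
      if (streams == null) {
        streams = new SavedStreams();
        streams.source = new WhitespaceTokenizer(reader);
        streams.result = new LowerCaseFilter(streams.source);
        streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms);
        setPreviousTokenStream(streams);   // without this call the chain is rebuilt on every invocation
      } else {
        streams.source.reset(reader);
        streams.result.reset();            // reset the SynonymTokenFilter
      }
      return streams.result;
    }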
|
|