mirror of https://github.com/apache/lucene.git
Fixed Bug 4555. Possible NullPointerException when a short term with
substitutions was being checked for a particle denotion. Length checking corrected, and sequence of resubstitution and removing particle denotion changed to prevent denoted term to pass through remove because of the reduced length. Corrected and improved documentation. Fix in WordlistLoader, files are not read correct, loosing a line. Fix in GermanStemFilter, typo in constructor with custom exclusion table as parameter, parameter was ignored. GermanStemFilter has two new Methods for setting stemmer and exclusion list after creating the filter object. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149630 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c05b8e334b
commit
89a554ffab
|
@ -14,6 +14,8 @@ import java.util.Hashtable;
|
||||||
* Analyzer for german language. Supports an external list of stopwords (words that
|
* Analyzer for german language. Supports an external list of stopwords (words that
|
||||||
* will not be indexed at all) and an external list of exclusions (word that will
|
* will not be indexed at all) and an external list of exclusions (word that will
|
||||||
* not be stemmed, but indexed).
|
* not be stemmed, but indexed).
|
||||||
|
* A default set of stopwords is used unless an other list is specified, the
|
||||||
|
* exclusionlist is empty by default.
|
||||||
*
|
*
|
||||||
* @author Gerhard Schwarz
|
* @author Gerhard Schwarz
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
|
@ -35,8 +37,8 @@ public final class GermanAnalyzer extends Analyzer {
|
||||||
"als", "für", "von", "mit",
|
"als", "für", "von", "mit",
|
||||||
"dich", "dir", "mich", "mir",
|
"dich", "dir", "mich", "mir",
|
||||||
"mein", "sein", "kein",
|
"mein", "sein", "kein",
|
||||||
"durch", "wegen"
|
"durch", "wegen", "wird"
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Contains the stopwords used with the StopFilter.
|
* Contains the stopwords used with the StopFilter.
|
||||||
|
@ -98,9 +100,9 @@ public final class GermanAnalyzer extends Analyzer {
|
||||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||||
*
|
*
|
||||||
* @return A TokenStream build from a StandardTokenizer filtered with
|
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||||
* StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.
|
* StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter
|
||||||
*/
|
*/
|
||||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
public final TokenStream tokenStream( String fieldName, Reader reader ) {
|
||||||
TokenStream result = new StandardTokenizer( reader );
|
TokenStream result = new StandardTokenizer( reader );
|
||||||
result = new StandardFilter( result );
|
result = new StandardFilter( result );
|
||||||
result = new StopFilter( result, stoptable );
|
result = new StopFilter( result, stoptable );
|
||||||
|
|
|
@ -8,7 +8,8 @@ import java.util.Hashtable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A filter that stemms german words. It supports a table of words that should
|
* A filter that stemms german words. It supports a table of words that should
|
||||||
* not be stemmed at all.
|
* not be stemmed at all. The used stemmer can be changed at runtime after the
|
||||||
|
* filter object is created (as long as it is a GermanStemmer).
|
||||||
*
|
*
|
||||||
* @author Gerhard Schwarz
|
* @author Gerhard Schwarz
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
|
@ -32,30 +33,44 @@ public final class GermanStemFilter extends TokenFilter {
|
||||||
*/
|
*/
|
||||||
public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) {
|
public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) {
|
||||||
this( in );
|
this( in );
|
||||||
this.exclusions = exclusions;
|
exclusions = exclusiontable;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return Returns the next token in the stream, or null at EOS.
|
* @return Returns the next token in the stream, or null at EOS
|
||||||
*/
|
*/
|
||||||
public final Token next()
|
public final Token next()
|
||||||
throws IOException {
|
throws IOException {
|
||||||
if ( ( token = input.next() ) == null ) {
|
if ( ( token = input.next() ) == null ) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
// Check the exclusiontable.
|
// Check the exclusiontable
|
||||||
else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
|
else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
|
||||||
return token;
|
return token;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
String s = stemmer.stem( token.termText() );
|
String s = stemmer.stem( token.termText() );
|
||||||
// If not stemmed, dont waste the time creating a new token.
|
// If not stemmed, dont waste the time creating a new token
|
||||||
if ( !s.equals( token.termText() ) ) {
|
if ( !s.equals( token.termText() ) ) {
|
||||||
return new Token( s, 0, s.length(), token.type() );
|
return new Token( s, 0, s.length(), token.type() );
|
||||||
}
|
}
|
||||||
return token;
|
return token;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Set a alternative/custom GermanStemmer for this filter.
|
||||||
|
*/
|
||||||
|
public void setStemmer( GermanStemmer stemmer ) {
|
||||||
|
if ( stemmer != null ) {
|
||||||
|
this.stemmer = stemmer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Set an alternative exclusion list for this filter.
|
||||||
|
*/
|
||||||
|
public void setExclusionTable( Hashtable exclusiontable ) {
|
||||||
|
exclusions = exclusiontable;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,7 @@ public class GermanStemmer {
|
||||||
/**
|
/**
|
||||||
* Stemms the given term to an unique <tt>discriminator</tt>.
|
* Stemms the given term to an unique <tt>discriminator</tt>.
|
||||||
*
|
*
|
||||||
* @param word The term that should be stemmed.
|
* @param term The term that should be stemmed.
|
||||||
* @return Discriminator for <tt>term</tt>
|
* @return Discriminator for <tt>term</tt>
|
||||||
*/
|
*/
|
||||||
protected String stem( String term ) {
|
protected String stem( String term ) {
|
||||||
|
@ -41,6 +41,9 @@ public class GermanStemmer {
|
||||||
if ( Character.isUpperCase( term.charAt( 0 ) ) ) {
|
if ( Character.isUpperCase( term.charAt( 0 ) ) ) {
|
||||||
uppercase = true;
|
uppercase = true;
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
uppercase = false;
|
||||||
|
}
|
||||||
// Use lowercase for medium stemming.
|
// Use lowercase for medium stemming.
|
||||||
term = term.toLowerCase();
|
term = term.toLowerCase();
|
||||||
// Reset the StringBuffer.
|
// Reset the StringBuffer.
|
||||||
|
@ -79,9 +82,10 @@ public class GermanStemmer {
|
||||||
sb.setCharAt( sb.length() - 1, 'x' );
|
sb.setCharAt( sb.length() - 1, 'x' );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Check the 7 "base" suffixes: "e", "s", "n", "t", "em", "er", "nd" for all
|
// Strip the 7 "base" suffixes: "e", "s", "n", "t", "em", "er", "nd" from all
|
||||||
// other terms. Adjectives, Verbs and Adverbs have a total of 52 different
|
// other terms. Adjectives, Verbs and Adverbs have a total of 52 different
|
||||||
// possible suffixes.
|
// possible suffixes, stripping only the characters from they are build
|
||||||
|
// does mostly the same
|
||||||
else {
|
else {
|
||||||
// Strip base suffixes as long as enough characters remain.
|
// Strip base suffixes as long as enough characters remain.
|
||||||
boolean doMore = true;
|
boolean doMore = true;
|
||||||
|
@ -112,10 +116,10 @@ public class GermanStemmer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
sb = resubstitute( sb );
|
||||||
if ( !uppercase ) {
|
if ( !uppercase ) {
|
||||||
sb = removeParticleDenotion( sb );
|
sb = removeParticleDenotion( sb );
|
||||||
}
|
}
|
||||||
sb = resubstitute( sb );
|
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -127,8 +131,8 @@ public class GermanStemmer {
|
||||||
*/
|
*/
|
||||||
private StringBuffer removeParticleDenotion( StringBuffer buffer ) {
|
private StringBuffer removeParticleDenotion( StringBuffer buffer ) {
|
||||||
for ( int c = 0; c < buffer.length(); c++ ) {
|
for ( int c = 0; c < buffer.length(); c++ ) {
|
||||||
// Strip from the beginning of the string to the "ge" inclusive.
|
// Strip from the beginning of the string to the "ge" inclusive
|
||||||
if ( c < ( sb.length() - 3 ) && buffer.charAt( c ) == 'g' && buffer.charAt ( c + 1 ) == 'e' ) {
|
if ( c < ( buffer.length() - 4 ) && buffer.charAt( c ) == 'g' && buffer.charAt ( c + 1 ) == 'e' ) {
|
||||||
buffer.delete( 0, c + 2 );
|
buffer.delete( 0, c + 2 );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -140,7 +144,7 @@ public class GermanStemmer {
|
||||||
*
|
*
|
||||||
* - Substitute Umlauts with their corresponding vowel: äöü -> aou,
|
* - Substitute Umlauts with their corresponding vowel: äöü -> aou,
|
||||||
* "ß" is substituted by "ss"
|
* "ß" is substituted by "ss"
|
||||||
* - Substitute an second char of an pair of equal characters with
|
* - Substitute a second char of an pair of equal characters with
|
||||||
* an asterisk: ?? -> ?*
|
* an asterisk: ?? -> ?*
|
||||||
* - Substitute some common character combinations with a token:
|
* - Substitute some common character combinations with a token:
|
||||||
* sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
|
* sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
|
||||||
|
@ -149,7 +153,7 @@ public class GermanStemmer {
|
||||||
*/
|
*/
|
||||||
private StringBuffer substitute( StringBuffer buffer ) {
|
private StringBuffer substitute( StringBuffer buffer ) {
|
||||||
for ( int c = 0; c < buffer.length(); c++ ) {
|
for ( int c = 0; c < buffer.length(); c++ ) {
|
||||||
// Replace the second char of a pair of the equal characters with an asterisk.
|
// Replace the second char of a pair of the equal characters with an asterisk
|
||||||
if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
|
if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
|
||||||
buffer.setCharAt( c, '*' );
|
buffer.setCharAt( c, '*' );
|
||||||
}
|
}
|
||||||
|
@ -163,14 +167,14 @@ public class GermanStemmer {
|
||||||
else if ( buffer.charAt( c ) == 'ü' ) {
|
else if ( buffer.charAt( c ) == 'ü' ) {
|
||||||
buffer.setCharAt( c, 'u' );
|
buffer.setCharAt( c, 'u' );
|
||||||
}
|
}
|
||||||
// Take care that enough characters at left for search.
|
// Take care that at least one character is left left side from the current one
|
||||||
if ( c < buffer.length() - 1 ) {
|
if ( c < buffer.length() - 1 ) {
|
||||||
if ( buffer.charAt( c ) == 'ß' ) {
|
if ( buffer.charAt( c ) == 'ß' ) {
|
||||||
buffer.setCharAt( c, 's' );
|
buffer.setCharAt( c, 's' );
|
||||||
buffer.insert( c + 1, 's' );
|
buffer.insert( c + 1, 's' );
|
||||||
substCount++;
|
substCount++;
|
||||||
}
|
}
|
||||||
// Masking several common character combinations with an token.
|
// Masking several common character combinations with an token
|
||||||
else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) {
|
else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) {
|
||||||
buffer.setCharAt( c, '$' );
|
buffer.setCharAt( c, '$' );
|
||||||
buffer.delete( c + 1, c + 3 );
|
buffer.delete( c + 1, c + 3 );
|
||||||
|
@ -240,10 +244,11 @@ public class GermanStemmer {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
* Undoes some changes made by substitute(). That are character pairs and
|
* Undoes the changes made by substitute(). That are character pairs and
|
||||||
* character combinations.
|
* character combinations. Umlauts will remain as their corresponding vowel,
|
||||||
|
* as "ß" remains as "ss".
|
||||||
*
|
*
|
||||||
* @return The term without the not human reaqdable substitutions.
|
* @return The term without the not human readable substitutions.
|
||||||
*/
|
*/
|
||||||
private StringBuffer resubstitute( StringBuffer buffer ) {
|
private StringBuffer resubstitute( StringBuffer buffer ) {
|
||||||
for ( int c = 0; c < buffer.length(); c++ ) {
|
for ( int c = 0; c < buffer.length(); c++ ) {
|
||||||
|
|
|
@ -7,8 +7,9 @@ import java.io.LineNumberReader;
|
||||||
import java.util.Hashtable;
|
import java.util.Hashtable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a textfile and adds every entry to a Hashtable. If a file is not found
|
* Loads a textfile and adds every line as an entry to a Hashtable. Every line
|
||||||
* or on any error, an empty table is returned.
|
* should contain only one word. If a file is not found or on any error, an
|
||||||
|
* empty table is returned.
|
||||||
*
|
*
|
||||||
* @author Gerhard Schwarz
|
* @author Gerhard Schwarz
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
|
@ -16,15 +17,14 @@ import java.util.Hashtable;
|
||||||
public class WordlistLoader {
|
public class WordlistLoader {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param path Path to the wordlist.
|
* @param path Path to the wordlist
|
||||||
* @param wordfile Name of the wordlist.
|
* @param wordfile Name of the wordlist
|
||||||
*/
|
*/
|
||||||
public static Hashtable getWordtable( String path, String wordfile ) {
|
public static Hashtable getWordtable( String path, String wordfile ) {
|
||||||
if ( path == null || wordfile == null ) {
|
if ( path == null || wordfile == null ) {
|
||||||
return new Hashtable();
|
return new Hashtable();
|
||||||
}
|
}
|
||||||
File absoluteName = new File( path, wordfile );
|
return getWordtable( new File( path, wordfile ) );
|
||||||
return getWordtable( absoluteName );
|
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
* @param wordfile Complete path to the wordlist
|
* @param wordfile Complete path to the wordlist
|
||||||
|
@ -33,12 +33,11 @@ public class WordlistLoader {
|
||||||
if ( wordfile == null ) {
|
if ( wordfile == null ) {
|
||||||
return new Hashtable();
|
return new Hashtable();
|
||||||
}
|
}
|
||||||
File absoluteName = new File( wordfile );
|
return getWordtable( new File( wordfile ) );
|
||||||
return getWordtable( absoluteName );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param wordfile File containing the wordlist.
|
* @param wordfile File containing the wordlist
|
||||||
*/
|
*/
|
||||||
public static Hashtable getWordtable( File wordfile ) {
|
public static Hashtable getWordtable( File wordfile ) {
|
||||||
if ( wordfile == null ) {
|
if ( wordfile == null ) {
|
||||||
|
@ -57,11 +56,11 @@ public class WordlistLoader {
|
||||||
System.arraycopy( stopwords, 0, tmp, 0, wordcount );
|
System.arraycopy( stopwords, 0, tmp, 0, wordcount );
|
||||||
stopwords = tmp;
|
stopwords = tmp;
|
||||||
}
|
}
|
||||||
stopwords[wordcount] = word;
|
stopwords[wordcount-1] = word;
|
||||||
}
|
}
|
||||||
result = makeWordTable( stopwords, wordcount );
|
result = makeWordTable( stopwords, wordcount );
|
||||||
}
|
}
|
||||||
// On error, use an empty table.
|
// On error, use an empty table
|
||||||
catch ( IOException e ) {
|
catch ( IOException e ) {
|
||||||
result = new Hashtable();
|
result = new Hashtable();
|
||||||
}
|
}
|
||||||
|
@ -71,8 +70,8 @@ public class WordlistLoader {
|
||||||
/**
|
/**
|
||||||
* Builds the wordlist table.
|
* Builds the wordlist table.
|
||||||
*
|
*
|
||||||
* @param words Word that where read.
|
* @param words Word that where read
|
||||||
* @param length Amount of words that where read into <tt>words</tt>.
|
* @param length Amount of words that where read into <tt>words</tt>
|
||||||
*/
|
*/
|
||||||
private static Hashtable makeWordTable( String[] words, int length ) {
|
private static Hashtable makeWordTable( String[] words, int length ) {
|
||||||
Hashtable table = new Hashtable( length );
|
Hashtable table = new Hashtable( length );
|
||||||
|
|
Loading…
Reference in New Issue