mirror of https://github.com/apache/lucene.git
LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be used with custom attributes. All those attributes are preserved and set on all added decompounded tokens
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1188597 13f79535-47bb-0310-9956-ffa450edef68
parent 51d010010c
commit ec186e7280
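In practice the change means that a custom attribute set by an upstream filter is still present on every decompounded token, because the filter now captures the whole attribute state of the original token and restores it before emitting each part. The sketch below condenses the testRetainMockAttribute test added further down in this commit; the fixture classes (MockRetainAttribute, MockRetainAttributeFilter) are defined in that test, TEST_VERSION_CURRENT is the usual BaseTokenStreamTestCase constant, and the rest is assumed test scaffolding rather than part of the commit itself.

// Sketch only: mirrors the testRetainMockAttribute test added by this commit.
public void customAttributeSurvivesDecompounding() throws IOException {
  Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg"));
  TokenStream stream = new MockRetainAttributeFilter(tokenizer); // marks every token it passes through
  stream = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, stream,
      new String[] { "abc", "d", "efg" },
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
  MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    // restoreState() re-applies the captured attribute state to each decompounded token,
    // so the custom attribute set by the upstream filter is still visible here
    assertTrue("Custom attribute value was lost", retAtt.getRetain());
  }
}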
@@ -82,6 +82,9 @@ Changes in backwards compatibility policy
 * LUCENE-3446: Removed BooleanFilter.finalResult() due to change to
   FixedBitSet. (Uwe Schindler)
 
+* LUCENE-3508: Changed some method signatures in decompounding TokenFilters
+  to make them no longer use the Token class. (Uwe Schindler)
+
 New Features
 
 * LUCENE-1824: Add BoundaryScanner interface and its implementation classes,
@@ -129,6 +132,10 @@ Bug Fixes
   Java that crash on certain inputs containing supplementary characters.
   (Robert Muir)
 
+* LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be
+  used with custom attributes. All those attributes are preserved and set on all
+  added decompounded tokens. (Spyros Kapnissis, Uwe Schindler)
+
 API Changes
 
 * LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
@@ -24,20 +24,17 @@ import java.util.LinkedList;
 import java.util.Locale;
 import java.util.Set;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
 /**
- * Base class for decomposition token filters. <a name="version"/>
+ * Base class for decomposition token filters.
  * <p>
  * You must specify the required {@link Version} compatibility when creating
  * CompoundWordTokenFilterBase:
@@ -46,6 +43,13 @@ import org.apache.lucene.util.Version;
  *   supplementary characters in strings and char arrays provided as compound word
  *   dictionaries.
  * </ul>
+ * <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups to the dictionary,
+ * you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
+ * transformed to case-insensitive!
  */
 public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   /**
@@ -64,37 +68,22 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
 
   protected final CharArraySet dictionary;
-  protected final LinkedList<Token> tokens;
+  protected final LinkedList<CompoundToken> tokens;
   protected final int minWordSize;
   protected final int minSubwordSize;
   protected final int maxSubwordSize;
   protected final boolean onlyLongestMatch;
 
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
 
-  private final Token wrapper = new Token();
-
-  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
-  }
-
-  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
-    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
-  }
+  private AttributeSource.State current;
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
     this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
   }
 
-  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
-    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
-  }
-
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
     this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
   }
@@ -102,7 +91,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(input);
 
-    this.tokens=new LinkedList<Token>();
+    this.tokens=new LinkedList<CompoundToken>();
     this.minWordSize=minWordSize;
     this.minSubwordSize=minSubwordSize;
     this.maxSubwordSize=maxSubwordSize;
@@ -111,113 +100,95 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
     if (dictionary==null || dictionary instanceof CharArraySet) {
       this.dictionary = (CharArraySet) dictionary;
     } else {
-      this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
-      addAllLowerCase(this.dictionary, dictionary);
+      this.dictionary = new CharArraySet(matchVersion, dictionary, true);
     }
   }
 
-  /**
-   * Create a set of words from an array
-   * The resulting Set does case insensitive matching
-   * TODO We should look for a faster dictionary lookup approach.
-   * @param dictionary
-   * @return {@link Set} of lowercased terms
-   */
-  public static Set<?> makeDictionary(final String[] dictionary) {
-    return makeDictionary(Version.LUCENE_30, dictionary);
+  /** @deprecated Use the constructors taking {@link Set} */
+  @Deprecated
+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
+    this(matchVersion, input,makeDictionary(matchVersion,dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+  }
+
+  /** @deprecated Use the constructors taking {@link Set} */
+  @Deprecated
+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    this(matchVersion, input,makeDictionary(matchVersion,dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
   }
 
-  public static Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
+  /** @deprecated Use the constructors taking {@link Set} */
+  @Deprecated
+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
+    this(matchVersion, input,makeDictionary(matchVersion,dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+  }
+
+  /** @deprecated Only available for backwards compatibility. */
+  @Deprecated
+  public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) {
     if (dictionary == null) {
       return null;
     }
-    // is the below really case insensitive?
-    CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
-    addAllLowerCase(dict, Arrays.asList(dictionary));
-    return dict;
-  }
-
-  private void setToken(final Token token) throws IOException {
-    clearAttributes();
-    termAtt.copyBuffer(token.buffer(), 0, token.length());
-    flagsAtt.setFlags(token.getFlags());
-    typeAtt.setType(token.type());
-    offsetAtt.setOffset(token.startOffset(), token.endOffset());
-    posIncAtt.setPositionIncrement(token.getPositionIncrement());
-    payloadAtt.setPayload(token.getPayload());
+    return new CharArraySet(matchVersion, Arrays.asList(dictionary), true);
   }
 
   @Override
   public final boolean incrementToken() throws IOException {
-    if (tokens.size() > 0) {
-      setToken(tokens.removeFirst());
+    if (!tokens.isEmpty()) {
+      assert current != null;
+      CompoundToken token = tokens.removeFirst();
+      restoreState(current); // keep all other attributes untouched
+      termAtt.setEmpty().append(token.txt);
+      offsetAtt.setOffset(token.startOffset, token.endOffset);
+      posIncAtt.setPositionIncrement(0);
       return true;
     }
 
-    if (!input.incrementToken())
-      return false;
-
-    wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
-    wrapper.setStartOffset(offsetAtt.startOffset());
-    wrapper.setEndOffset(offsetAtt.endOffset());
-    wrapper.setFlags(flagsAtt.getFlags());
-    wrapper.setType(typeAtt.type());
-    wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
-    wrapper.setPayload(payloadAtt.getPayload());
-
-    decompose(wrapper);
-
-    if (tokens.size() > 0) {
-      setToken(tokens.removeFirst());
+    current = null; // not really needed, but for safety
+    if (input.incrementToken()) {
+      // Only words longer than minWordSize get processed
+      if (termAtt.length() >= this.minWordSize) {
+        decompose();
+        // only capture the state if we really need it for producing new tokens
+        if (!tokens.isEmpty()) {
+          current = captureState();
+        }
+      }
+      // return original token:
       return true;
     } else {
       return false;
     }
   }
 
-  protected static void addAllLowerCase(CharArraySet target, Collection<?> col) {
-    for (Object obj : col) {
-      String string = (String) obj;
-      target.add(string.toLowerCase(Locale.ENGLISH));
-    }
-  }
-
-  protected static char[] makeLowerCaseCopy(final char[] buffer) {
-    char[] result=new char[buffer.length];
-    System.arraycopy(buffer, 0, result, 0, buffer.length);
-
-    for (int i=0;i<buffer.length;++i) {
-      result[i]=Character.toLowerCase(buffer[i]);
-    }
-
-    return result;
-  }
-
-  protected final Token createToken(final int offset, final int length,
-      final Token prototype) {
-    int newStart = prototype.startOffset() + offset;
-    Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart+length);
-    t.setPositionIncrement(0);
-    return t;
-  }
-
-  protected void decompose(final Token token) {
-    // In any case we give the original token back
-    tokens.add((Token) token.clone());
-
-    // Only words longer than minWordSize get processed
-    if (token.length() < this.minWordSize) {
-      return;
-    }
-
-    decomposeInternal(token);
-  }
-
-  protected abstract void decomposeInternal(final Token token);
+  /** Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the {@link #tokens} list.
+   * The original token may not be placed in the list, as it is automatically passed through this filter.
+   */
+  protected abstract void decompose();
 
   @Override
   public void reset() throws IOException {
     super.reset();
     tokens.clear();
+    current = null;
   }
+
+  /**
+   * Helper class to hold decompounded token information
+   */
+  protected class CompoundToken {
+    public final CharSequence txt;
+    public final int startOffset, endOffset;
+
+    /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
+    public CompoundToken(int offset, int length) {
+      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
+      this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
+      // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
+      // chars from the term, offsets may not match correctly (other filters producing tokens
+      // may also have this problem):
+      this.startOffset = newStart;
+      this.endOffset = newStart + length;
+    }
+
+  }
 }
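The CHANGES entry about changed method signatures refers to the subclass contract shown above: decompose(Token) and decomposeInternal(Token) are gone, and implementations now override a no-argument decompose() that reads the protected termAtt and queues CompoundToken slices. A minimal sketch of a custom decompounder against the new contract could look as follows; the class name and its halving rule are invented for illustration, and only the overridden method, the protected fields and CompoundToken come from the code above.

// Sketch only: a toy decompounder against the new base-class contract.
// Same package as the base class keeps the protected CompoundToken helper trivially accessible.
package org.apache.lucene.analysis.compound;

import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

public final class HalvingDecompounder extends CompoundWordTokenFilterBase {

  public HalvingDecompounder(Version matchVersion, TokenStream input, Set<?> dictionary) {
    super(matchVersion, input, dictionary);
  }

  @Override
  protected void decompose() {
    // termAtt holds the current surface form; the original token is passed through
    // automatically, so only the extra sub-tokens are queued here.
    final int len = termAtt.length();
    if (len < 2 * minSubwordSize) {
      return;
    }
    final int mid = len / 2;
    tokens.add(new CompoundToken(0, mid));         // first half of the term
    tokens.add(new CompoundToken(mid, len - mid)); // second half of the term
  }
}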
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.compound;
 
 import java.util.Set;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
@@ -31,12 +30,25 @@ import org.apache.lucene.util.Version;
  * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
  * "Donaudampfschiff" even when you only enter "schiff".
  *  It uses a brute-force algorithm to achieve this.
- * </p>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CompoundWordTokenFilterBase:
+ * <ul>
+ * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ * supplementary characters in strings and char arrays provided as compound word
+ * dictionaries.
+ * </ul>
+ * <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups to the dictionary,
+ * you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
+ * transformed to case-insensitive!
  */
 public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
   /**
-   * Creates a new {@link DictionaryCompoundWordTokenFilter}
-   *
+   * Creates a new {@link DictionaryCompoundWordTokenFilter}.
    * @param matchVersion
    *          Lucene version to enable correct Unicode 4.0 behavior in the
    *          dictionaries if Version > 3.0. See <a
@@ -54,7 +66,9 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
    *          only subwords shorter than this get to the output stream
    * @param onlyLongestMatch
    *          Add only the longest matching subword to the stream
+   * @deprecated Use the constructors taking {@link Set}
    */
+  @Deprecated
   public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
@@ -73,7 +87,9 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
    *          the {@link TokenStream} to process
    * @param dictionary
    *          the word dictionary to match against
+   * @deprecated Use the constructors taking {@link Set}
    */
+  @Deprecated
   public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
     super(matchVersion, input, dictionary);
   }
@@ -89,12 +105,9 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
    * @param input
    *          the {@link TokenStream} to process
    * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
    */
-  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary) {
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary) {
     super(matchVersion, input, dictionary);
   }
 
@@ -109,10 +122,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
    * @param input
    *          the {@link TokenStream} to process
    * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
    * @param minWordSize
    *          only words longer than this get processed
    * @param minSubwordSize
@@ -122,37 +132,31 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
    * @param onlyLongestMatch
    *          Add only the longest matching subword to the stream
    */
-  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary,
       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
   }
 
   @Override
-  protected void decomposeInternal(final Token token) {
-    // Only words longer than minWordSize get processed
-    if (token.length() < this.minWordSize) {
-      return;
-    }
-
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
-
-    for (int i=0;i<=token.length()-this.minSubwordSize;++i) {
-      Token longestMatchToken=null;
+  protected void decompose() {
+    final int len = termAtt.length();
+    for (int i=0;i<=len-this.minSubwordSize;++i) {
+      CompoundToken longestMatchToken=null;
       for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
-        if(i+j>token.length()) {
+        if(i+j>len) {
          break;
        }
-        if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
+        if(dictionary.contains(termAtt.buffer(), i, j)) {
          if (this.onlyLongestMatch) {
            if (longestMatchToken!=null) {
-             if (longestMatchToken.length()<j) {
-               longestMatchToken=createToken(i,j,token);
+             if (longestMatchToken.txt.length()<j) {
+               longestMatchToken=new CompoundToken(i,j);
              }
            } else {
-             longestMatchToken=createToken(i,j,token);
+             longestMatchToken=new CompoundToken(i,j);
            }
          } else {
-           tokens.add(createToken(i,j,token));
+           tokens.add(new CompoundToken(i,j));
          }
        }
      }
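The updated javadoc above recommends lowercasing tokens before this filter and supplying an already lowercased, case-sensitive CharArraySet, so that dictionary lookups can skip per-character case folding. The standalone example below is an assumed illustration of that recommendation, not code from this commit; the class name, sample words and the Version constant are placeholders.

// Sketch only: assumed analysis chain following the javadoc recommendation.
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class DecompoundDemo {
  public static void main(String[] args) throws Exception {
    // lowercased entries + ignoreCase=false: the set never has to fold case during lookups
    CharArraySet dict = new CharArraySet(Version.LUCENE_40,
        Arrays.asList("donau", "dampf", "schiff"), false);

    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("Donaudampfschiff"));
    TokenStream chain = new LowerCaseFilter(Version.LUCENE_40, source); // lowercase before decompounding
    chain = new DictionaryCompoundWordTokenFilter(Version.LUCENE_40, chain, dict);

    CharTermAttribute term = chain.addAttribute(CharTermAttribute.class);
    chain.reset();
    while (chain.incrementToken()) {
      System.out.println(term.toString()); // donaudampfschiff, donau, dampf, schiff
    }
    chain.end();
    chain.close();
  }
}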
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.compound;
 import java.io.File;
 import java.util.Set;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
@@ -34,7 +33,21 @@ import org.xml.sax.InputSource;
  * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
  * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
  * grammar and a word dictionary to achieve this.
- * </p>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CompoundWordTokenFilterBase:
+ * <ul>
+ * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ * supplementary characters in strings and char arrays provided as compound word
+ * dictionaries.
+ * </ul>
+ * <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups to the dictionary,
+ * you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
+ * transformed to case-insensitive!
  */
 public class HyphenationCompoundWordTokenFilter extends
     CompoundWordTokenFilterBase {
@@ -62,7 +75,9 @@ public class HyphenationCompoundWordTokenFilter extends
    *          only subwords shorter than this get to the output stream
    * @param onlyLongestMatch
    *          Add only the longest matching subword to the stream
+   * @deprecated Use the constructors taking {@link Set}
    */
+  @Deprecated
   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
       HyphenationTree hyphenator, String[] dictionary, int minWordSize,
       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
@@ -86,10 +101,12 @@ public class HyphenationCompoundWordTokenFilter extends
    *          the hyphenation pattern tree to use for hyphenation
    * @param dictionary
    *          the word dictionary to match against
+   * @deprecated Use the constructors taking {@link Set}
    */
+  @Deprecated
   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
       HyphenationTree hyphenator, String[] dictionary) {
-    this(matchVersion, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
+    this(matchVersion, input, hyphenator, makeDictionary(matchVersion,dictionary), DEFAULT_MIN_WORD_SIZE,
         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
   }
 
@@ -106,10 +123,7 @@ public class HyphenationCompoundWordTokenFilter extends
    * @param hyphenator
    *          the hyphenation pattern tree to use for hyphenation
    * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
    */
   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
       HyphenationTree hyphenator, Set<?> dictionary) {
@@ -130,10 +144,7 @@ public class HyphenationCompoundWordTokenFilter extends
    * @param hyphenator
    *          the hyphenation pattern tree to use for hyphenation
    * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
    * @param minWordSize
    *          only words longer than this get processed
    * @param minSubwordSize
@@ -218,22 +229,20 @@ public class HyphenationCompoundWordTokenFilter extends
   }
 
   @Override
-  protected void decomposeInternal(final Token token) {
+  protected void decompose() {
     // get the hyphenation points
-    Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token
-        .length(), 1, 1);
+    Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
     // No hyphen points found -> exit
     if (hyphens == null) {
       return;
     }
 
     final int[] hyp = hyphens.getHyphenationPoints();
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 
     for (int i = 0; i < hyp.length; ++i) {
       int remaining = hyp.length - i;
       int start = hyp[i];
-      Token longestMatchToken = null;
+      CompoundToken longestMatchToken = null;
       for (int j = 1; j < remaining; j++) {
         int partLength = hyp[i + j] - start;
 
@@ -250,34 +259,33 @@ public class HyphenationCompoundWordTokenFilter extends
         }
 
         // check the dictionary
-        if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+        if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
           if (this.onlyLongestMatch) {
             if (longestMatchToken != null) {
-              if (longestMatchToken.length() < partLength) {
-                longestMatchToken = createToken(start, partLength, token);
+              if (longestMatchToken.txt.length() < partLength) {
+                longestMatchToken = new CompoundToken(start, partLength);
              }
            } else {
-             longestMatchToken = createToken(start, partLength, token);
+             longestMatchToken = new CompoundToken(start, partLength);
            }
          } else {
-           tokens.add(createToken(start, partLength, token));
+           tokens.add(new CompoundToken(start, partLength));
          }
-        } else if (dictionary.contains(lowerCaseTermBuffer, start,
-            partLength - 1)) {
+        } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
           // check the dictionary again with a word that is one character
           // shorter
           // to avoid problems with genitive 's characters and other binding
           // characters
           if (this.onlyLongestMatch) {
             if (longestMatchToken != null) {
-              if (longestMatchToken.length() < partLength - 1) {
-                longestMatchToken = createToken(start, partLength - 1, token);
+              if (longestMatchToken.txt.length() < partLength - 1) {
+                longestMatchToken = new CompoundToken(start, partLength - 1);
              }
            } else {
-             longestMatchToken = createToken(start, partLength - 1, token);
+             longestMatchToken = new CompoundToken(start, partLength - 1);
            }
          } else {
-           tokens.add(createToken(start, partLength - 1, token));
+           tokens.add(new CompoundToken(start, partLength - 1));
          }
        }
      }
@@ -17,15 +17,20 @@ package org.apache.lucene.analysis.compound;
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.StringReader;
-import org.xml.sax.InputSource;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.xml.sax.InputSource;
 
 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
   public void testHyphenationCompoundWordsDA() throws Exception {
@@ -166,45 +171,45 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     String[] dict = {"ab", "cd", "ef"};
 
     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-            new StringReader(
-                "abcdef")
-            ),
-        dict,
-        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
-        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
-        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader(
+          "abcdef")
+        ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
     assertTokenStreamContents(tf,
-        new String[] { "abcdef", "ab", "cd", "ef" },
-        new int[] { 0, 0, 2, 4},
-        new int[] { 6, 2, 4, 6},
-        new int[] { 1, 0, 0, 0}
-        );
+      new String[] { "abcdef", "ab", "cd", "ef" },
+      new int[] { 0, 0, 2, 4},
+      new int[] { 6, 2, 4, 6},
+      new int[] { 1, 0, 0, 0}
+      );
   }
 
   public void testWordComponentWithLessThanMinimumLength() throws Exception {
     String[] dict = {"abc", "d", "efg"};
 
     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-            new StringReader(
-                "abcdefg")
-            ),
-        dict,
-        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
-        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
-        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader(
+          "abcdefg")
+        ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
-    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
+    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
     assertTokenStreamContents(tf,
-        new String[] { "abcdefg", "abc", "efg" },
-        new int[] { 0, 0, 4},
-        new int[] { 7, 3, 7},
-        new int[] { 1, 0, 0}
-        );
+      new String[] { "abcdefg", "abc", "efg" },
+      new int[] { 0, 0, 4},
+      new int[] { 7, 3, 7},
+      new int[] { 1, 0, 0}
+      );
   }
 
 
   public void testReset() throws Exception {
     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
         "Aufgabe", "Überwachung" };
@@ -228,4 +233,64 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
   }
 
+  public void testRetainMockAttribute() throws Exception {
+    String[] dict = { "abc", "d", "efg" };
+    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader("abcdefg"));
+    TokenStream stream = new MockRetainAttributeFilter(tokenizer);
+    stream = new DictionaryCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT, stream, dict,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+    MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+    while (stream.incrementToken()) {
+      assertTrue("Custom attribute value was lost", retAtt.getRetain());
+    }
+
+  }
+
+  public static interface MockRetainAttribute extends Attribute {
+    void setRetain(boolean attr);
+    boolean getRetain();
+  }
+
+  public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
+    private boolean retain = false;
+    @Override
+    public void clear() {
+      retain = false;
+    }
+    public boolean getRetain() {
+      return retain;
+    }
+    public void setRetain(boolean retain) {
+      this.retain = retain;
+    }
+    @Override
+    public void copyTo(AttributeImpl target) {
+      MockRetainAttribute t = (MockRetainAttribute) target;
+      t.setRetain(retain);
+    }
+  }
+
+  private static class MockRetainAttributeFilter extends TokenFilter {
+
+    MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
+
+    MockRetainAttributeFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()){
+        retainAtt.setRetain(true);
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
 }