mirror of https://github.com/apache/lucene.git
LUCENE-4817: Add KeywordRepeatFilter to emit each token twice: once as a keyword and once as a non-keyword
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1454313 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
14c9f28f57
commit
5a3ec2d457
|
@ -32,6 +32,10 @@ New Features
|
|||
* LUCENE-4815: DrillSideways now allows more than one FacetRequest per
|
||||
dimension (Mike McCandless)
|
||||
|
||||
* LUCENE-4817: Added KeywordRepeatFilter that allows to emit a token twice
|
||||
once as a keyword and once as an ordinary token, allowing stemmers to emit
|
||||
a stemmed version along with the un-stemmed version. (Simon Willnauer)
|
||||
|
||||
======================= Lucene 4.2.0 =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
|
||||
* This TokenFilter emits each incoming token twice once as keyword and once non-keyword, in other words once with
|
||||
* {@link KeywordAttribute#setKeyword(boolean)} set to <code>true</code> and once set to <code>false</code>.
|
||||
* This is useful if used with a stem filter that respects the {@link KeywordAttribute} to index the stemmed and the
|
||||
* un-stemmed version of a term into the same field.
|
||||
*/
|
||||
public final class KeywordRepeatFilter extends TokenFilter {
|
||||
|
||||
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);
|
||||
private State state;
|
||||
|
||||
/**
|
||||
* Construct a token stream filtering the given input.
|
||||
*/
|
||||
public KeywordRepeatFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (state != null) {
|
||||
restoreState(state);
|
||||
posIncAttr.setPositionIncrement(0);
|
||||
keywordAttribute.setKeyword(false);
|
||||
state = null;
|
||||
return true;
|
||||
}
|
||||
if (input.incrementToken()) {
|
||||
state = captureState();
|
||||
keywordAttribute.setKeyword(true);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
state = null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link KeywordRepeatFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.KeywordRepeatFilter"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public final class KeywordRepeatFilterFactory extends TokenFilterFactory {
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new KeywordRepeatFilter(input);
|
||||
}
|
||||
}
|
|
@ -58,6 +58,7 @@ org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
|
|||
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class TestKeywordRepeatFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testBasic() throws IOException {
|
||||
TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter(
|
||||
new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false)), "English"));
|
||||
assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0});
|
||||
}
|
||||
|
||||
|
||||
public void testComposition() throws IOException {
|
||||
TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter(new KeywordRepeatFilter(
|
||||
new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false))), "English"));
|
||||
assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0});
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue