LUCENE-4817: Add KeywordRepeaterFilter to emit tokens twice once as keyword and once not as keyword

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1454313 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2013-03-08 10:33:23 +00:00
parent 14c9f28f57
commit 5a3ec2d457
5 changed files with 157 additions and 0 deletions

View File

@ -32,6 +32,10 @@ New Features
* LUCENE-4815: DrillSideways now allows more than one FacetRequest per
dimension (Mike McCandless)
* LUCENE-4817: Added KeywordRepeatFilter that allows to emit a token twice
once as a keyword and once as an ordinary token allow stemmers to emit
a stemmed version along with the un-stemmed version. (Simon Willnauer)
======================= Lucene 4.2.0 =======================
Changes in backwards compatibility policy

View File

@ -0,0 +1,69 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException;
/**
* This TokenFilter emits each incoming token twice once as keyword and once non-keyword, in other words once with
* {@link KeywordAttribute#setKeyword(boolean)} set to <code>true</code> and once set to <code>false</code>.
* This is useful if used with a stem filter that respects the {@link KeywordAttribute} to index the stemmed and the
* un-stemmed version of a term into the same field.
*/
public final class KeywordRepeatFilter extends TokenFilter {
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);
private State state;
/**
* Construct a token stream filtering the given input.
*/
public KeywordRepeatFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (state != null) {
restoreState(state);
posIncAttr.setPositionIncrement(0);
keywordAttribute.setKeyword(false);
state = null;
return true;
}
if (input.incrementToken()) {
state = captureState();
keywordAttribute.setKeyword(true);
return true;
}
return false;
}
@Override
public void reset() throws IOException {
super.reset();
state = null;
}
}

View File

@ -0,0 +1,38 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link KeywordRepeatFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.KeywordRepeatFilter"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public final class KeywordRepeatFilterFactory extends TokenFilterFactory {
@Override
public TokenStream create(TokenStream input) {
return new KeywordRepeatFilter(input);
}
}

View File

@ -58,6 +58,7 @@ org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory

View File

@ -0,0 +1,45 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import java.io.IOException;
import java.io.StringReader;
public class TestKeywordRepeatFilter extends BaseTokenStreamTestCase {
public void testBasic() throws IOException {
TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter(
new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false)), "English"));
assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0});
}
public void testComposition() throws IOException {
TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter(new KeywordRepeatFilter(
new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false))), "English"));
assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0});
}
}