diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a32dee3ae1f..a3a75a60e9b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -32,6 +32,10 @@ New Features
* LUCENE-4815: DrillSideways now allows more than one FacetRequest per
dimension (Mike McCandless)
+* LUCENE-4817: Added KeywordRepeatFilter that allows to emit a token twice
+ once as a keyword and once as an ordinary token allow stemmers to emit
+ a stemmed version along with the un-stemmed version. (Simon Willnauer)
+
======================= Lucene 4.2.0 =======================
Changes in backwards compatibility policy
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java
new file mode 100644
index 00000000000..d62998fe5a6
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java
@@ -0,0 +1,69 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+import java.io.IOException;
+
+
+/**
+ * This TokenFilterĀ emits each incoming token twice once as keyword and once non-keyword, in other words once with
+ * {@link KeywordAttribute#setKeyword(boolean)} set to true
and once set to false
.
+ * This is useful if used with a stem filter that respects the {@link KeywordAttribute} to index the stemmed and the
+ * un-stemmed version of a term into the same field.
+ */
+public final class KeywordRepeatFilter extends TokenFilter {
+
+ private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
+ private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);
+ private State state;
+
+ /**
+ * Construct a token stream filtering the given input.
+ */
+ public KeywordRepeatFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (state != null) {
+ restoreState(state);
+ posIncAttr.setPositionIncrement(0);
+ keywordAttribute.setKeyword(false);
+ state = null;
+ return true;
+ }
+ if (input.incrementToken()) {
+ state = captureState();
+ keywordAttribute.setKeyword(true);
+ return true;
+ }
+ return false;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ state = null;
+ }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java
new file mode 100644
index 00000000000..fa5b566a4b7
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java
@@ -0,0 +1,38 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link KeywordRepeatFilter}.
+ *
+ * <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100"> + * <analyzer> + * <tokenizer class="solr.WhitespaceTokenizerFactory"/> + * <filter class="solr.KeywordRepeatFilter"/> + * </analyzer> + * </fieldType>+ */ +public final class KeywordRepeatFilterFactory extends TokenFilterFactory { + @Override + public TokenStream create(TokenStream input) { + return new KeywordRepeatFilter(input); + } +} diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index f790d02d3e4..84886d82d65 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -58,6 +58,7 @@ org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory +org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory org.apache.lucene.analysis.miscellaneous.LengthFilterFactory org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java new file mode 100644 index 00000000000..eec35b463ed --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java @@ -0,0 +1,45 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.snowball.SnowballFilter; + +import java.io.IOException; +import java.io.StringReader; + +public class TestKeywordRepeatFilter extends BaseTokenStreamTestCase { + + public void testBasic() throws IOException { + TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter( + new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false)), "English")); + assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0}); + } + + + public void testComposition() throws IOException { + TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter(new KeywordRepeatFilter( + new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false))), "English")); + assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0}); + } + + + +}