LUCENE-4817: Add KeywordRepeaterFilter to emit tokens twice once as keyword and once not as keyword

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1454313 13f79535-47bb-0310-9956-ffa450edef68
2025-02-17 15:35:20 +00:00 · 2013-03-08 10:33:23 +00:00 · 2013-03-08 10:33:23 +00:00 · 5a3ec2d457
commit 5a3ec2d457
parent 14c9f28f57
5 changed files with 157 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -32,6 +32,10 @@ New Features
 * LUCENE-4815: DrillSideways now allows more than one FacetRequest per
  dimension (Mike McCandless)

+* LUCENE-4817: Added KeywordRepeatFilter that allows to emit a token twice
+  once as a keyword and once as an ordinary token allow stemmers to emit
+  a stemmed version along with the un-stemmed version. (Simon Willnauer)
+
 ======================= Lucene 4.2.0 =======================

 Changes in backwards compatibility policy
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java
@ -0,0 +1,69 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+import java.io.IOException;
+
+
+/**
+ * This TokenFilter emits each incoming token twice once as keyword and once non-keyword, in other words once with
+ * {@link KeywordAttribute#setKeyword(boolean)} set to <code>true</code> and once set to <code>false</code>.
+ * This is useful if used with a stem filter that respects the {@link KeywordAttribute} to index the stemmed and the
+ * un-stemmed version of a term into the same field.
+ */
+public final class KeywordRepeatFilter extends TokenFilter {
+
+  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
+  private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);
+  private State state;
+
+  /**
+   * Construct a token stream filtering the given input.
+   */
+  public KeywordRepeatFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (state != null) {
+      restoreState(state);
+      posIncAttr.setPositionIncrement(0);
+      keywordAttribute.setKeyword(false);
+      state = null;
+      return true;
+    }
+    if (input.incrementToken()) {
+      state = captureState();
+      keywordAttribute.setKeyword(true);
+      return true;
+    }
+    return false;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    state = null;
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java
@ -0,0 +1,38 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link KeywordRepeatFilter}.
+ * <pre class="prettyprint" >
+ * &lt;fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.KeywordRepeatFilter"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public final class KeywordRepeatFilterFactory extends TokenFilterFactory {
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new KeywordRepeatFilter(input);
+  }
+}
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@ -58,6 +58,7 @@ org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
 org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
+org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
 org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
 org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
 org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java
@ -0,0 +1,45 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+public class TestKeywordRepeatFilter extends BaseTokenStreamTestCase {
+
+  public void testBasic() throws IOException {
+    TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter(
+        new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false)), "English"));
+    assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0});
+  }
+
+
+  public void testComposition() throws IOException {
+    TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter(new KeywordRepeatFilter(
+        new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false))), "English"));
+    assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0});
+  }
+
+
+
+}