LUCENE-7854: Add a new DelimitedTermFrequencyTokenFilter that allows to mark tokens with a custom term frequency

2017-06-09 23:52:19 +02:00 · 2017-06-09 23:52:19 +02:00 · 5844ed4ac9
parent c37b377438
commit 5844ed4ac9
5 changed files with 212 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -18,6 +18,12 @@ New Features
  with a custom token stream allows indexing custom term frequencies
  (Mike McCandless)

+* LUCENE-7866: Add a new DelimitedTermFrequencyTokenFilter that allows to
+  mark tokens with a custom term frequency (LUCENE-7854). It parses a numeric
+  value after a separator char ('|') at the end of each token and changes
+  the term frequency to this value.  (Uwe Schindler, Robert Muir,
+  Mike McCandless)
+
 API Changes

 * LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java
@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+
+/**
+ * Characters before the delimiter are the "token", the textual integer after is the term frequency.
+ * To use this {@code TokenFilter} the field must be indexed with
+ * {@link IndexOptions#DOCS_AND_FREQS} but no positions or offsets.
+ * <p>
+ * For example, if the delimiter is '|', then for the string "foo|5", "foo" is the token
+ * and "5" is a term frequency. If there is no delimiter, the TokenFilter does not modify
+ * the term frequency.
+ * <p>
+ * Note: make sure your Tokenizer doesn't split on the delimiter, or this won't work.
+ */
+public final class DelimitedTermFrequencyTokenFilter extends TokenFilter {
+  public static final char DEFAULT_DELIMITER = '|';
+  
+  private final char delimiter;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final TermFrequencyAttribute tfAtt = addAttribute(TermFrequencyAttribute.class);
+
+
+  public DelimitedTermFrequencyTokenFilter(TokenStream input) {
+    this(input, DEFAULT_DELIMITER);
+  }
+
+  public DelimitedTermFrequencyTokenFilter(TokenStream input, char delimiter) {
+    super(input);
+    this.delimiter = delimiter;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      final char[] buffer = termAtt.buffer();
+      final int length = termAtt.length();
+      for (int i = 0; i < length; i++) {
+        if (buffer[i] == delimiter) {
+          termAtt.setLength(i); // simply set a new length
+          i++;
+          tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, length - i));
+          return true;
+        }
+      }
+      return true;
+    }
+    return false;
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterFactory.java
@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link DelimitedTermFrequencyTokenFilter}. The field must have {@code omitPositions=true}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_tfdl" class="solr.TextField" omitPositions="true"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.DelimitedTermFrequencyTokenFilterFactory" delimiter="|"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class DelimitedTermFrequencyTokenFilterFactory extends TokenFilterFactory {
+  /** Name of the factory argument that selects the delimiter character. */
+  public static final String DELIMITER_ATTR = "delimiter";
+
+  private final char delimiter;
+
+  /**
+   * Creates a new DelimitedTermFrequencyTokenFilterFactory.
+   *
+   * @param args factory arguments; the delimiter is read from {@link #DELIMITER_ATTR} and falls
+   *     back to {@link DelimitedTermFrequencyTokenFilter#DEFAULT_DELIMITER}
+   * @throws IllegalArgumentException if {@code args} is not empty after the delimiter has been read
+   */
+  public DelimitedTermFrequencyTokenFilterFactory(Map<String, String> args) {
+    super(args);
+    this.delimiter = getChar(args, DELIMITER_ATTR, DelimitedTermFrequencyTokenFilter.DEFAULT_DELIMITER);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Wraps {@code input} in a {@link DelimitedTermFrequencyTokenFilter} using the configured delimiter. */
+  @Override
+  public DelimitedTermFrequencyTokenFilter create(TokenStream input) {
+    return new DelimitedTermFrequencyTokenFilter(input, delimiter);
+  }
+}
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@ -63,6 +63,7 @@ org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
 org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
 org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
 org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
+org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory
 org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
 org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
 org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterTest.java
@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
+
+public class DelimitedTermFrequencyTokenFilterTest extends BaseTokenStreamTestCase {
+  // Every test tokenizes its input with whitespaceMockTokenizer and uses the single-argument filter constructor.
+  public void testTermFrequency() throws Exception {
+    String test = "The quick|40 red|4 fox|06 jumped|1 over the lazy|2 brown|123 dogs|1024";
+    DelimitedTermFrequencyTokenFilter filter =
+        new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
+    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+    TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
+    filter.reset();
+    assertTermEquals("The", filter, termAtt, tfAtt, 1); // no delimiter in the token: a frequency of 1 is expected
+    assertTermEquals("quick", filter, termAtt, tfAtt, 40);
+    assertTermEquals("red", filter, termAtt, tfAtt, 4);
+    assertTermEquals("fox", filter, termAtt, tfAtt, 6); // "06": the leading zero is expected to be ignored
+    assertTermEquals("jumped", filter, termAtt, tfAtt, 1);
+    assertTermEquals("over", filter, termAtt, tfAtt, 1);
+    assertTermEquals("the", filter, termAtt, tfAtt, 1);
+    assertTermEquals("lazy", filter, termAtt, tfAtt, 2);
+    assertTermEquals("brown", filter, termAtt, tfAtt, 123);
+    assertTermEquals("dogs", filter, termAtt, tfAtt, 1024);
+    assertFalse(filter.incrementToken()); // no tokens may remain after "dogs"
+    filter.end();
+    filter.close();
+  }
+  // "bar|-20" must fail with an IllegalArgumentException carrying exactly the message asserted below.
+  public void testInvalidNegativeTf() throws Exception {
+    String test = "foo bar|-20";
+    DelimitedTermFrequencyTokenFilter filter =
+        new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
+    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+    TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
+    filter.reset();
+    assertTermEquals("foo", filter, termAtt, tfAtt, 1); // the first token is valid and is consumed normally
+    IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, filter::incrementToken);
+    assertEquals("Term frequency must be 1 or greater; got -20", iae.getMessage());
+  }
+  // "bar|1.2" is not an integer, so advancing to it is expected to throw a NumberFormatException.
+  public void testInvalidFloatTf() throws Exception {
+    String test = "foo bar|1.2";
+    DelimitedTermFrequencyTokenFilter filter =
+        new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
+    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+    TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
+    filter.reset();
+    assertTermEquals("foo", filter, termAtt, tfAtt, 1); // the first token is valid and is consumed normally
+    expectThrows(NumberFormatException.class, filter::incrementToken);
+  }
+  // Advances the stream by one token, then checks both the term text and the term frequency.
+  void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf) throws Exception {
+    assertTrue(stream.incrementToken());
+    assertEquals(expected, termAtt.toString());
+    assertEquals(expectedTf, tfAtt.getTermFrequency());
+  }
+}