LUCENE-5558: Add TruncateTokenFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1583525 13f79535-47bb-0310-9956-ffa450edef68
2014-04-01 04:05:14 +00:00 · 2014-04-01 04:05:14 +00:00 · e34bd74aac
parent 002f3d1b6b
commit e34bd74aac
6 changed files with 233 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -135,6 +135,9 @@ New Features
  resort the hits from a first pass search using a Sort or an
  Expression. (Simon Willnauer, Robert Muir, Mike McCandless)

+* LUCENE-5558: Add TruncateTokenFilter which truncates terms to
+  the specified length.  (Ahmet Arslan via Robert Muir)
+
 API Changes

 * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java
@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+import java.io.IOException;
+
+/**
+ * A token filter for truncating the terms into a specific length.
+ * Fixed prefix truncation, as a stemming method, produces good results on Turkish language.
+ * It is reported that F5, using first 5 characters, produced best results in
+ * <a href="http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf">
+ * Information Retrieval on Turkish Texts</a>
+ */
+public final class TruncateTokenFilter extends TokenFilter {
+
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  private final int length;
+
+  public TruncateTokenFilter(TokenStream input, int length) {
+    super(input);
+    if (length < 1)
+      throw new IllegalArgumentException("length parameter must be a positive number: " + length);
+    this.length = length;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAttr.isKeyword() && termAttribute.length() > length)
+        termAttribute.setLength(length);
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java
@ -0,0 +1,59 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter}. The following type is recommended for "<i>diacritics-insensitive search</i>" for Turkish.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_tr_ascii_f5" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.ApostropheFilterFactory"/&gt;
+ *     &lt;filter class="solr.TurkishLowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/&gt;
+ *     &lt;filter class="solr.KeywordRepeatFilterFactory"/&gt;
+ *     &lt;filter class="solr.TruncateTokenFilterFactory" prefixLength="5"/&gt;
+ *     &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class TruncateTokenFilterFactory extends TokenFilterFactory {
+
+  public static final String PREFIX_LENGTH_KEY = "prefixLength";
+  private final byte prefixLength;
+
+  public TruncateTokenFilterFactory(Map<String, String> args) {
+    super(args);
+    prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5"));
+    if (prefixLength < 1)
+      throw new IllegalArgumentException(PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameter(s): " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new TruncateTokenFilter(input, prefixLength);
+  }
+}
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@ -69,6 +69,7 @@ org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
 org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
 org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
 org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
+org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
 org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
 org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory
 org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilter.java
@ -0,0 +1,39 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.junit.Test;
+
+/**
+ * Test the truncate token filter.
+ */
+public class TestTruncateTokenFilter extends BaseTokenStreamTestCase {
+
+  public void testTruncating() throws Exception {
+    TokenStream stream = whitespaceMockTokenizer("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
+    stream = new TruncateTokenFilter(stream, 5);
+    assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"});
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testNonPositiveLength() throws Exception {
+    new TruncateTokenFilter(whitespaceMockTokenizer("length must be a positive number"), -48);
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java
@ -0,0 +1,73 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+/**
+ * Simple tests to ensure the simple truncation filter factory is working.
+ */
+public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
+  /**
+   * Ensure the filter actually truncates text.
+   */
+  public void testTruncating() throws Exception {
+    Reader reader = new StringReader("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
+    TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    ((Tokenizer) stream).setReader(reader);
+    stream = tokenFilterFactory("Truncate",
+        TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5").create(stream);
+    assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"});
+  }
+
+  /**
+   * Test that bogus arguments result in exception
+   */
+  public void testBogusArguments() throws Exception {
+    try {
+      tokenFilterFactory("Truncate",
+          TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5",
+          "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Unknown parameter(s):"));
+    }
+  }
+
+  /**
+   * Test that negative prefix length result in exception
+   */
+  public void testNonPositivePrefixLengthArgument() throws Exception {
+    try {
+      tokenFilterFactory("Truncate",
+          TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "-5"
+      );
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains(TruncateTokenFilterFactory.PREFIX_LENGTH_KEY + " parameter must be a positive number: -5"));
+    }
+  }
+}
+
+