LUCENE-5558: Add TruncateTokenFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1583525 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-04-01 04:05:14 +00:00
parent 002f3d1b6b
commit e34bd74aac
6 changed files with 233 additions and 0 deletions

View File

@ -135,6 +135,9 @@ New Features
resort the hits from a first pass search using a Sort or an
Expression. (Simon Willnauer, Robert Muir, Mike McCandless)
* LUCENE-5558: Add TruncateTokenFilter which truncates terms to
the specified length. (Ahmet Arslan via Robert Muir)
API Changes
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import java.io.IOException;
/**
* A token filter for truncating the terms into a specific length.
* Fixed prefix truncation, as a stemming method, produces good results on Turkish language.
* It is reported that F5, using first 5 characters, produced best results in
* <a href="http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf">
* Information Retrieval on Turkish Texts</a>
*/
public final class TruncateTokenFilter extends TokenFilter {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final int length;
public TruncateTokenFilter(TokenStream input, int length) {
super(input);
if (length < 1)
throw new IllegalArgumentException("length parameter must be a positive number: " + length);
this.length = length;
}
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (!keywordAttr.isKeyword() && termAttribute.length() > length)
termAttribute.setLength(length);
return true;
} else {
return false;
}
}
}

View File

@ -0,0 +1,59 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
/**
* Factory for {@link org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter}. The following type is recommended for "<i>diacritics-insensitive search</i>" for Turkish.
* <pre class="prettyprint">
* &lt;fieldType name="text_tr_ascii_f5" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.ApostropheFilterFactory"/&gt;
* &lt;filter class="solr.TurkishLowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/&gt;
* &lt;filter class="solr.KeywordRepeatFilterFactory"/&gt;
* &lt;filter class="solr.TruncateTokenFilterFactory" prefixLength="5"/&gt;
* &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class TruncateTokenFilterFactory extends TokenFilterFactory {
public static final String PREFIX_LENGTH_KEY = "prefixLength";
private final byte prefixLength;
public TruncateTokenFilterFactory(Map<String, String> args) {
super(args);
prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5"));
if (prefixLength < 1)
throw new IllegalArgumentException(PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameter(s): " + args);
}
}
@Override
public TokenStream create(TokenStream input) {
return new TruncateTokenFilter(input, prefixLength);
}
}

View File

@ -69,6 +69,7 @@ org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory
org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilterFactory

View File

@ -0,0 +1,39 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;
/**
* Test the truncate token filter.
*/
public class TestTruncateTokenFilter extends BaseTokenStreamTestCase {
public void testTruncating() throws Exception {
TokenStream stream = whitespaceMockTokenizer("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
stream = new TruncateTokenFilter(stream, 5);
assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"});
}
@Test(expected = IllegalArgumentException.class)
public void testNonPositiveLength() throws Exception {
new TruncateTokenFilter(whitespaceMockTokenizer("length must be a positive number"), -48);
}
}

View File

@ -0,0 +1,73 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import java.io.Reader;
import java.io.StringReader;
/**
* Simple tests to ensure the simple truncation filter factory is working.
*/
public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
/**
* Ensure the filter actually truncates text.
*/
public void testTruncating() throws Exception {
Reader reader = new StringReader("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
((Tokenizer) stream).setReader(reader);
stream = tokenFilterFactory("Truncate",
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5").create(stream);
assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"});
}
/**
* Test that bogus arguments result in exception
*/
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("Truncate",
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5",
"bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameter(s):"));
}
}
/**
* Test that negative prefix length result in exception
*/
public void testNonPositivePrefixLengthArgument() throws Exception {
try {
tokenFilterFactory("Truncate",
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "-5"
);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains(TruncateTokenFilterFactory.PREFIX_LENGTH_KEY + " parameter must be a positive number: -5"));
}
}
}