mirror of https://github.com/apache/lucene.git
LUCENE-5558: Add TruncateTokenFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1583525 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
002f3d1b6b
commit
e34bd74aac
|
@ -135,6 +135,9 @@ New Features
|
|||
resort the hits from a first pass search using a Sort or an
|
||||
Expression. (Simon Willnauer, Robert Muir, Mike McCandless)
|
||||
|
||||
* LUCENE-5558: Add TruncateTokenFilter which truncates terms to
|
||||
the specified length. (Ahmet Arslan via Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* A token filter for truncating the terms into a specific length.
|
||||
* Fixed prefix truncation, as a stemming method, produces good results on Turkish language.
|
||||
* It is reported that F5, using first 5 characters, produced best results in
|
||||
* <a href="http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf">
|
||||
* Information Retrieval on Turkish Texts</a>
|
||||
*/
|
||||
public final class TruncateTokenFilter extends TokenFilter {
|
||||
|
||||
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
private final int length;
|
||||
|
||||
public TruncateTokenFilter(TokenStream input, int length) {
|
||||
super(input);
|
||||
if (length < 1)
|
||||
throw new IllegalArgumentException("length parameter must be a positive number: " + length);
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAttr.isKeyword() && termAttribute.length() > length)
|
||||
termAttribute.setLength(length);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory for {@link org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter}. The following type is recommended for "<i>diacritics-insensitive search</i>" for Turkish.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_tr_ascii_f5" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.ApostropheFilterFactory"/>
|
||||
* <filter class="solr.TurkishLowerCaseFilterFactory"/>
|
||||
* <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/>
|
||||
* <filter class="solr.KeywordRepeatFilterFactory"/>
|
||||
* <filter class="solr.TruncateTokenFilterFactory" prefixLength="5"/>
|
||||
* <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class TruncateTokenFilterFactory extends TokenFilterFactory {
|
||||
|
||||
public static final String PREFIX_LENGTH_KEY = "prefixLength";
|
||||
private final byte prefixLength;
|
||||
|
||||
public TruncateTokenFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5"));
|
||||
if (prefixLength < 1)
|
||||
throw new IllegalArgumentException(PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameter(s): " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new TruncateTokenFilter(input, prefixLength);
|
||||
}
|
||||
}
|
|
@ -69,6 +69,7 @@ org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
|
|||
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilterFactory
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Test the truncate token filter.
|
||||
*/
|
||||
public class TestTruncateTokenFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testTruncating() throws Exception {
|
||||
TokenStream stream = whitespaceMockTokenizer("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
|
||||
stream = new TruncateTokenFilter(stream, 5);
|
||||
assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"});
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testNonPositiveLength() throws Exception {
|
||||
new TruncateTokenFilter(whitespaceMockTokenizer("length must be a positive number"), -48);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,73 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the simple truncation filter factory is working.
|
||||
*/
|
||||
public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
/**
|
||||
* Ensure the filter actually truncates text.
|
||||
*/
|
||||
public void testTruncating() throws Exception {
|
||||
Reader reader = new StringReader("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
|
||||
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
((Tokenizer) stream).setReader(reader);
|
||||
stream = tokenFilterFactory("Truncate",
|
||||
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5").create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that bogus arguments result in exception
|
||||
*/
|
||||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
tokenFilterFactory("Truncate",
|
||||
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5",
|
||||
"bogusArg", "bogusValue");
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("Unknown parameter(s):"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that negative prefix length result in exception
|
||||
*/
|
||||
public void testNonPositivePrefixLengthArgument() throws Exception {
|
||||
try {
|
||||
tokenFilterFactory("Truncate",
|
||||
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "-5"
|
||||
);
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains(TruncateTokenFilterFactory.PREFIX_LENGTH_KEY + " parameter must be a positive number: -5"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue