mirror of https://github.com/apache/lucene.git
LUCENE-7854: Add a new DelimitedTermFrequencyTokenFilter that allows marking tokens with a custom term frequency
This commit is contained in:
parent
c37b377438
commit
5844ed4ac9
|
@ -18,6 +18,12 @@ New Features
|
|||
with a custom token stream allows indexing custom term frequencies
|
||||
(Mike McCandless)
|
||||
|
||||
* LUCENE-7866: Add a new DelimitedTermFrequencyTokenFilter that allows users to
|
||||
mark tokens with a custom term frequency (LUCENE-7854). It parses a numeric
|
||||
value after a separator char ('|') at the end of each token and changes
|
||||
the term frequency to this value. (Uwe Schindler, Robert Muir,
|
||||
Mike McCandless)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
|
||||
/**
|
||||
* Characters before the delimiter are the "token", the textual integer after is the term frequency.
|
||||
* To use this {@code TokenFilter} the field must be indexed with
|
||||
* {@link IndexOptions#DOCS_AND_FREQS} but no positions or offsets.
|
||||
* <p>
|
||||
* For example, if the delimiter is '|', then for the string "foo|5", "foo" is the token
|
||||
* and "5" is a term frequency. If there is no delimiter, the TokenFilter does not modify
|
||||
* the term frequency.
|
||||
* <p>
|
||||
* Note make sure your Tokenizer doesn't split on the delimiter, or this won't work
|
||||
*/
|
||||
public final class DelimitedTermFrequencyTokenFilter extends TokenFilter {
|
||||
public static final char DEFAULT_DELIMITER = '|';
|
||||
|
||||
private final char delimiter;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final TermFrequencyAttribute tfAtt = addAttribute(TermFrequencyAttribute.class);
|
||||
|
||||
|
||||
public DelimitedTermFrequencyTokenFilter(TokenStream input) {
|
||||
this(input, DEFAULT_DELIMITER);
|
||||
}
|
||||
|
||||
public DelimitedTermFrequencyTokenFilter(TokenStream input, char delimiter) {
|
||||
super(input);
|
||||
this.delimiter = delimiter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
final char[] buffer = termAtt.buffer();
|
||||
final int length = termAtt.length();
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (buffer[i] == delimiter) {
|
||||
termAtt.setLength(i); // simply set a new length
|
||||
i++;
|
||||
tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, length - i));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link DelimitedTermFrequencyTokenFilter}. The field must have {@code omitPositions=true}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_tfdl" class="solr.TextField" omitPositions="true">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.DelimitedTermFrequencyTokenFilterFactory" delimiter="|"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class DelimitedTermFrequencyTokenFilterFactory extends TokenFilterFactory {
|
||||
public static final String DELIMITER_ATTR = "delimiter";
|
||||
|
||||
private final char delimiter;
|
||||
|
||||
/** Creates a new DelimitedPayloadTokenFilterFactory */
|
||||
public DelimitedTermFrequencyTokenFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
delimiter = getChar(args, DELIMITER_ATTR, DelimitedTermFrequencyTokenFilter.DEFAULT_DELIMITER);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public DelimitedTermFrequencyTokenFilter create(TokenStream input) {
|
||||
return new DelimitedTermFrequencyTokenFilter(input, delimiter);
|
||||
}
|
||||
}
|
|
@ -63,6 +63,7 @@ org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
|
|||
org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
|
||||
|
||||
public class DelimitedTermFrequencyTokenFilterTest extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testTermFrequency() throws Exception {
|
||||
String test = "The quick|40 red|4 fox|06 jumped|1 over the lazy|2 brown|123 dogs|1024";
|
||||
DelimitedTermFrequencyTokenFilter filter =
|
||||
new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
|
||||
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
|
||||
TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
|
||||
filter.reset();
|
||||
assertTermEquals("The", filter, termAtt, tfAtt, 1);
|
||||
assertTermEquals("quick", filter, termAtt, tfAtt, 40);
|
||||
assertTermEquals("red", filter, termAtt, tfAtt, 4);
|
||||
assertTermEquals("fox", filter, termAtt, tfAtt, 6);
|
||||
assertTermEquals("jumped", filter, termAtt, tfAtt, 1);
|
||||
assertTermEquals("over", filter, termAtt, tfAtt, 1);
|
||||
assertTermEquals("the", filter, termAtt, tfAtt, 1);
|
||||
assertTermEquals("lazy", filter, termAtt, tfAtt, 2);
|
||||
assertTermEquals("brown", filter, termAtt, tfAtt, 123);
|
||||
assertTermEquals("dogs", filter, termAtt, tfAtt, 1024);
|
||||
assertFalse(filter.incrementToken());
|
||||
filter.end();
|
||||
filter.close();
|
||||
}
|
||||
|
||||
public void testInvalidNegativeTf() throws Exception {
|
||||
String test = "foo bar|-20";
|
||||
DelimitedTermFrequencyTokenFilter filter =
|
||||
new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
|
||||
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
|
||||
TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
|
||||
filter.reset();
|
||||
assertTermEquals("foo", filter, termAtt, tfAtt, 1);
|
||||
IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, filter::incrementToken);
|
||||
assertEquals("Term frequency must be 1 or greater; got -20", iae.getMessage());
|
||||
}
|
||||
|
||||
public void testInvalidFloatTf() throws Exception {
|
||||
String test = "foo bar|1.2";
|
||||
DelimitedTermFrequencyTokenFilter filter =
|
||||
new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
|
||||
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
|
||||
TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
|
||||
filter.reset();
|
||||
assertTermEquals("foo", filter, termAtt, tfAtt, 1);
|
||||
expectThrows(NumberFormatException.class, filter::incrementToken);
|
||||
}
|
||||
|
||||
void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf) throws Exception {
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals(expected, termAtt.toString());
|
||||
assertEquals(expectedTf, tfAtt.getTermFrequency());
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue