LUCENE-7866: Add a new DelimitedTermFrequencyTokenFilter that allows marking tokens with a custom term frequency

This commit is contained in:
Uwe Schindler 2017-06-09 23:52:19 +02:00
parent c37b377438
commit 5844ed4ac9
5 changed files with 212 additions and 0 deletions

View File

@ -18,6 +18,12 @@ New Features
with a custom token stream allows indexing custom term frequencies
(Mike McCandless)
* LUCENE-7866: Add a new DelimitedTermFrequencyTokenFilter that allows
marking tokens with a custom term frequency (LUCENE-7854). It parses a numeric
value after a separator char ('|') at the end of each token and changes
the term frequency to this value. (Uwe Schindler, Robert Muir,
Mike McCandless)
API Changes
* LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.

View File

@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * Splits every incoming term at the first occurrence of a delimiter character: the characters
 * in front of the delimiter remain the term text, and the textual integer that follows it is
 * stored as the term frequency.
 * To use this {@code TokenFilter} the field must be indexed with
 * {@link IndexOptions#DOCS_AND_FREQS} but no positions or offsets.
 * <p>
 * For example, with the delimiter '|' the input "foo|5" yields the term "foo" with a term
 * frequency of "5". A term that does not contain the delimiter is passed through unchanged and
 * its term frequency is left untouched.
 * <p>
 * Note: make sure the Tokenizer in front of this filter does not split on the delimiter,
 * otherwise the frequency suffix never reaches this filter.
 */
public final class DelimitedTermFrequencyTokenFilter extends TokenFilter {
  /** Delimiter used when the single-argument constructor is called. */
  public static final char DEFAULT_DELIMITER = '|';

  private final char delimiter;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TermFrequencyAttribute tfAtt = addAttribute(TermFrequencyAttribute.class);

  /**
   * Creates a filter that splits terms at {@link #DEFAULT_DELIMITER}.
   *
   * @param input the stream whose terms carry the frequency suffix
   */
  public DelimitedTermFrequencyTokenFilter(TokenStream input) {
    this(input, DEFAULT_DELIMITER);
  }

  /**
   * Creates a filter that splits terms at the given delimiter.
   *
   * @param input the stream whose terms carry the frequency suffix
   * @param delimiter the character separating the term text from its frequency
   */
  public DelimitedTermFrequencyTokenFilter(TokenStream input, char delimiter) {
    super(input);
    this.delimiter = delimiter;
  }

  /**
   * Advances the wrapped stream and, if the new term contains the delimiter, truncates the term
   * to the text before it and parses everything after it as the term frequency.
   *
   * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }

    final char[] chars = termAtt.buffer();
    final int end = termAtt.length();

    // Locate the first delimiter; pos == end means the term has none.
    int pos = 0;
    while (pos < end && chars[pos] != delimiter) {
      pos++;
    }

    if (pos < end) {
      // Shortening the term only changes its length; the buffer captured above still holds
      // the frequency digits, so they can be parsed after the truncation.
      termAtt.setLength(pos);
      final int digitsStart = pos + 1;
      tfAtt.setTermFrequency(ArrayUtil.parseInt(chars, digitsStart, end - digitsStart));
    }
    return true;
  }
}

View File

@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
 * Factory for {@link DelimitedTermFrequencyTokenFilter}. The field must have {@code omitPositions=true}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_tfdl" class="solr.TextField" omitPositions="true"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 *     &lt;filter class="solr.DelimitedTermFrequencyTokenFilterFactory" delimiter="|"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class DelimitedTermFrequencyTokenFilterFactory extends TokenFilterFactory {
  /** Name of the optional argument that selects the delimiter character. */
  public static final String DELIMITER_ATTR = "delimiter";

  private final char delimiter;

  /**
   * Creates a new DelimitedTermFrequencyTokenFilterFactory.
   *
   * @param args factory arguments; only {@link #DELIMITER_ATTR} is recognized, and it falls back
   *     to {@link DelimitedTermFrequencyTokenFilter#DEFAULT_DELIMITER} when absent
   * @throws IllegalArgumentException if any argument is left over after the delimiter was read
   */
  public DelimitedTermFrequencyTokenFilterFactory(Map<String, String> args) {
    super(args);
    final char fallback = DelimitedTermFrequencyTokenFilter.DEFAULT_DELIMITER;
    this.delimiter = getChar(args, DELIMITER_ATTR, fallback);

    // Whatever remains in the map at this point was not understood by this factory
    // (presumably getChar consumes the entry it reads).
    final boolean hasLeftovers = !args.isEmpty();
    if (hasLeftovers) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /** Wraps {@code input} in a filter configured with this factory's delimiter. */
  @Override
  public DelimitedTermFrequencyTokenFilter create(TokenStream input) {
    return new DelimitedTermFrequencyTokenFilter(input, delimiter);
  }
}

View File

@ -63,6 +63,7 @@ org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory

View File

@ -0,0 +1,77 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
/** Tests for {@link DelimitedTermFrequencyTokenFilter} using the default '|' delimiter. */
public class DelimitedTermFrequencyTokenFilterTest extends BaseTokenStreamTestCase {

  /** Terms with and without a frequency suffix come out with the expected text and frequency. */
  public void testTermFrequency() throws Exception {
    String test = "The quick|40 red|4 fox|06 jumped|1 over the lazy|2 brown|123 dogs|1024";
    DelimitedTermFrequencyTokenFilter filter =
        new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
    filter.reset();

    // Expected output, position by position; terms without a suffix keep frequency 1.
    final String[] expectedTerms = {
      "The", "quick", "red", "fox", "jumped", "over", "the", "lazy", "brown", "dogs"
    };
    final int[] expectedFreqs = {1, 40, 4, 6, 1, 1, 1, 2, 123, 1024};
    for (int idx = 0; idx < expectedTerms.length; idx++) {
      assertTermEquals(expectedTerms[idx], filter, termAtt, tfAtt, expectedFreqs[idx]);
    }

    assertFalse(filter.incrementToken());
    filter.end();
    filter.close();
  }

  /** A negative frequency suffix is rejected with an IllegalArgumentException. */
  public void testInvalidNegativeTf() throws Exception {
    String test = "foo bar|-20";
    DelimitedTermFrequencyTokenFilter filter =
        new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
    filter.reset();

    assertTermEquals("foo", filter, termAtt, tfAtt, 1);
    IllegalArgumentException iae =
        expectThrows(IllegalArgumentException.class, () -> filter.incrementToken());
    assertEquals("Term frequency must be 1 or greater; got -20", iae.getMessage());
  }

  /** A non-integer frequency suffix is rejected with a NumberFormatException. */
  public void testInvalidFloatTf() throws Exception {
    String test = "foo bar|1.2";
    DelimitedTermFrequencyTokenFilter filter =
        new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
    filter.reset();

    assertTermEquals("foo", filter, termAtt, tfAtt, 1);
    expectThrows(NumberFormatException.class, () -> filter.incrementToken());
  }

  /**
   * Advances {@code stream} by one token and checks the term text and term frequency that the
   * given attributes expose afterwards.
   */
  void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf) throws Exception {
    final boolean advanced = stream.incrementToken();
    assertTrue(advanced);
    assertEquals(expected, termAtt.toString());
    assertEquals(expectedTf, tfAtt.getTermFrequency());
  }
}