LUCENE-7854: enable indexing custom term frequencies

This commit is contained in:
Mike McCandless 2017-06-06 13:37:31 -04:00
parent 09a9fdab6d
commit d276acfbbc
14 changed files with 803 additions and 20 deletions

View File

@ -14,6 +14,10 @@ New Features
well as the oldest Lucene version that contributed to the segment. well as the oldest Lucene version that contributed to the segment.
(Adrien Grand) (Adrien Grand)
* LUCENE-7854: The new TermFrequencyAttribute used during analysis
with a custom token stream allows indexing custom term frequencies
(Mike McCandless)
API Changes API Changes
* LUCENE-2605: Classic QueryParser no longer splits on whitespace by default. * LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.

View File

@ -26,15 +26,18 @@ import org.apache.lucene.util.AttributeReflector;
* <li>{@link PositionIncrementAttribute} * <li>{@link PositionIncrementAttribute}
* <li>{@link PositionLengthAttribute} * <li>{@link PositionLengthAttribute}
* <li>{@link OffsetAttribute} * <li>{@link OffsetAttribute}
* <li>{@link TermFrequencyAttribute}
* </ul>*/ * </ul>*/
public class PackedTokenAttributeImpl extends CharTermAttributeImpl public class PackedTokenAttributeImpl extends CharTermAttributeImpl
implements TypeAttribute, PositionIncrementAttribute, implements TypeAttribute, PositionIncrementAttribute,
PositionLengthAttribute, OffsetAttribute { PositionLengthAttribute, OffsetAttribute,
TermFrequencyAttribute {
private int startOffset,endOffset; private int startOffset,endOffset;
private String type = DEFAULT_TYPE; private String type = DEFAULT_TYPE;
private int positionIncrement = 1; private int positionIncrement = 1;
private int positionLength = 1; private int positionLength = 1;
private int termFrequency = 1;
/** Constructs the attribute implementation. */ /** Constructs the attribute implementation. */
public PackedTokenAttributeImpl() { public PackedTokenAttributeImpl() {
@ -132,12 +135,26 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
this.type = type; this.type = type;
} }
/**
 * Stores the custom term frequency carried by this token.
 *
 * @param termFrequency the frequency to record; must be at least 1
 * @throws IllegalArgumentException if {@code termFrequency} is less than 1
 */
@Override
public final void setTermFrequency(int termFrequency) {
  if (termFrequency >= 1) {
    this.termFrequency = termFrequency;
    return;
  }
  // Anything below 1 is rejected and the stored value is left untouched.
  throw new IllegalArgumentException("Term frequency must be 1 or greater; got " + termFrequency);
}
/** Returns the term frequency currently held by this token. */
@Override
public final int getTermFrequency() {
  return this.termFrequency;
}
/** Resets the attributes /** Resets the attributes
*/ */
@Override @Override
public void clear() { public void clear() {
super.clear(); super.clear();
positionIncrement = positionLength = 1; positionIncrement = positionLength = 1;
termFrequency = 1;
startOffset = endOffset = 0; startOffset = endOffset = 0;
type = DEFAULT_TYPE; type = DEFAULT_TYPE;
} }
@ -147,10 +164,8 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
@Override @Override
public void end() { public void end() {
super.end(); super.end();
// super.end already calls this.clear, so we only set values that are different from clear:
positionIncrement = 0; positionIncrement = 0;
positionLength = 1;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
} }
@Override @Override
@ -170,6 +185,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
positionIncrement == other.positionIncrement && positionIncrement == other.positionIncrement &&
positionLength == other.positionLength && positionLength == other.positionLength &&
(type == null ? other.type == null : type.equals(other.type)) && (type == null ? other.type == null : type.equals(other.type)) &&
termFrequency == other.termFrequency &&
super.equals(obj) super.equals(obj)
); );
} else } else
@ -185,6 +201,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
code = code * 31 + positionLength; code = code * 31 + positionLength;
if (type != null) if (type != null)
code = code * 31 + type.hashCode(); code = code * 31 + type.hashCode();
code = code * 31 + termFrequency;;
return code; return code;
} }
@ -198,12 +215,14 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
to.startOffset = startOffset; to.startOffset = startOffset;
to.endOffset = endOffset; to.endOffset = endOffset;
to.type = type; to.type = type;
to.termFrequency = termFrequency;
} else { } else {
super.copyTo(target); super.copyTo(target);
((OffsetAttribute) target).setOffset(startOffset, endOffset); ((OffsetAttribute) target).setOffset(startOffset, endOffset);
((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement); ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
((PositionLengthAttribute) target).setPositionLength(positionLength); ((PositionLengthAttribute) target).setPositionLength(positionLength);
((TypeAttribute) target).setType(type); ((TypeAttribute) target).setType(type);
((TermFrequencyAttribute) target).setTermFrequency(termFrequency);
} }
} }
@ -215,6 +234,6 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement); reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
reflector.reflect(PositionLengthAttribute.class, "positionLength", positionLength); reflector.reflect(PositionLengthAttribute.class, "positionLength", positionLength);
reflector.reflect(TypeAttribute.class, "type", type); reflector.reflect(TypeAttribute.class, "type", type);
reflector.reflect(TermFrequencyAttribute.class, "termFrequency", termFrequency);
} }
} }

View File

@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.tokenattributes;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.util.Attribute;
/**
 * Carries a custom term frequency for a term within one document. If this attribute
 * is present in your analysis chain for a given field, that field must be indexed with
 * {@link IndexOptions#DOCS_AND_FREQS}.
 */
public interface TermFrequencyAttribute extends Attribute {

  /**
   * Set the custom term frequency of the current term within one document.
   *
   * @param termFrequency the frequency to assign to the current term
   */
  void setTermFrequency(int termFrequency);

  /**
   * Returns the custom term frequency.
   *
   * @return the frequency assigned to the current term
   */
  int getTermFrequency();
}

View File

@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.tokenattributes;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/**
 * Default implementation of {@link TermFrequencyAttribute}: a single {@code int}
 * that starts at 1 and returns to 1 on {@link #clear()} and {@link #end()}.
 */
public class TermFrequencyAttributeImpl extends AttributeImpl implements TermFrequencyAttribute, Cloneable {

  /** The frequency held for the current term. */
  private int termFrequency = 1;

  /** Initialize this attribute with term frequency of 1 */
  public TermFrequencyAttributeImpl() {}

  /**
   * Stores a new term frequency.
   *
   * @throws IllegalArgumentException if {@code termFrequency} is less than 1
   */
  @Override
  public void setTermFrequency(int termFrequency) {
    if (termFrequency >= 1) {
      this.termFrequency = termFrequency;
      return;
    }
    throw new IllegalArgumentException("Term frequency must be 1 or greater; got " + termFrequency);
  }

  /** Returns the stored term frequency. */
  @Override
  public int getTermFrequency() {
    return this.termFrequency;
  }

  /** Puts the frequency back to 1. */
  @Override
  public void clear() {
    termFrequency = 1;
  }

  /** Puts the frequency back to 1, exactly as {@link #clear()} does. */
  @Override
  public void end() {
    termFrequency = 1;
  }

  /** Two instances are equal when they hold the same term frequency. */
  @Override
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }
    if (!(other instanceof TermFrequencyAttributeImpl)) {
      return false;
    }
    final TermFrequencyAttributeImpl that = (TermFrequencyAttributeImpl) other;
    return that.termFrequency == this.termFrequency;
  }

  /** Hash code derived from the term frequency alone. */
  @Override
  public int hashCode() {
    // Integer.hashCode(int) is defined to return its argument, so this is the same value.
    return termFrequency;
  }

  /** Copies the stored frequency into {@code target} through its setter. */
  @Override
  public void copyTo(AttributeImpl target) {
    ((TermFrequencyAttribute) target).setTermFrequency(termFrequency);
  }

  /** Reports the frequency to {@code reflector} under the key {@code termFrequency}. */
  @Override
  public void reflectWith(AttributeReflector reflector) {
    reflector.reflect(TermFrequencyAttribute.class, "termFrequency", termFrequency);
  }
}

View File

@ -770,10 +770,8 @@ final class DefaultIndexingChain extends DocConsumer {
} }
invertState.lastStartOffset = startOffset; invertState.lastStartOffset = startOffset;
invertState.length++; invertState.length = Math.addExact(invertState.length, invertState.termFreqAttribute.getTermFrequency());
if (invertState.length < 0) {
throw new IllegalArgumentException("too many tokens in field '" + field.name() + "'");
}
//System.out.println(" term=" + invertState.termAttribute); //System.out.println(" term=" + invertState.termAttribute);
// If we hit an exception in here, we abort // If we hit an exception in here, we abort

View File

@ -20,6 +20,7 @@ import org.apache.lucene.analysis.TokenStream; // javadocs
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
@ -48,6 +49,7 @@ public final class FieldInvertState {
PositionIncrementAttribute posIncrAttribute; PositionIncrementAttribute posIncrAttribute;
PayloadAttribute payloadAttribute; PayloadAttribute payloadAttribute;
TermToBytesRefAttribute termAttribute; TermToBytesRefAttribute termAttribute;
TermFrequencyAttribute termFreqAttribute;
/** Creates {@code FieldInvertState} for the specified /** Creates {@code FieldInvertState} for the specified
* field name. */ * field name. */
@ -88,6 +90,7 @@ public final class FieldInvertState {
if (this.attributeSource != attributeSource) { if (this.attributeSource != attributeSource) {
this.attributeSource = attributeSource; this.attributeSource = attributeSource;
termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class); termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class); posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class); offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class); payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);

View File

@ -113,9 +113,10 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
if (!hasFreq) { if (!hasFreq) {
assert postings.termFreqs == null; assert postings.termFreqs == null;
postings.lastDocCodes[termID] = docState.docID; postings.lastDocCodes[termID] = docState.docID;
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
} else { } else {
postings.lastDocCodes[termID] = docState.docID << 1; postings.lastDocCodes[termID] = docState.docID << 1;
postings.termFreqs[termID] = 1; postings.termFreqs[termID] = getTermFreq();
if (hasProx) { if (hasProx) {
writeProx(termID, fieldState.position); writeProx(termID, fieldState.position);
if (hasOffsets) { if (hasOffsets) {
@ -124,19 +125,21 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
} else { } else {
assert !hasOffsets; assert !hasOffsets;
} }
fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
} }
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
fieldState.uniqueTermCount++; fieldState.uniqueTermCount++;
} }
@Override @Override
void addTerm(final int termID) { void addTerm(final int termID) {
final FreqProxPostingsArray postings = freqProxPostingsArray; final FreqProxPostingsArray postings = freqProxPostingsArray;
assert !hasFreq || postings.termFreqs[termID] > 0; assert !hasFreq || postings.termFreqs[termID] > 0;
if (!hasFreq) { if (!hasFreq) {
assert postings.termFreqs == null; assert postings.termFreqs == null;
if (termFreqAtt.getTermFrequency() != 1) {
throw new IllegalStateException("field \"" + fieldInfo.name + "\": must index term freq while using custom TermFrequencyAttribute");
}
if (docState.docID != postings.lastDocIDs[termID]) { if (docState.docID != postings.lastDocIDs[termID]) {
// New document; now encode docCode for previous doc: // New document; now encode docCode for previous doc:
assert docState.docID > postings.lastDocIDs[termID]; assert docState.docID > postings.lastDocIDs[termID];
@ -160,8 +163,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
} }
// Init freq for the current document // Init freq for the current document
postings.termFreqs[termID] = 1; postings.termFreqs[termID] = getTermFreq();
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency); fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1; postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID; postings.lastDocIDs[termID] = docState.docID;
if (hasProx) { if (hasProx) {
@ -175,7 +178,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
} }
fieldState.uniqueTermCount++; fieldState.uniqueTermCount++;
} else { } else {
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.termFreqs[termID]); postings.termFreqs[termID] = Math.addExact(postings.termFreqs[termID], getTermFreq());
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, postings.termFreqs[termID]);
if (hasProx) { if (hasProx) {
writeProx(termID, fieldState.position-postings.lastPositions[termID]); writeProx(termID, fieldState.position-postings.lastPositions[termID]);
if (hasOffsets) { if (hasOffsets) {
@ -185,6 +189,17 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
} }
} }
/**
 * Reads the current token's frequency from {@code termFreqAtt}.
 *
 * @return the frequency reported by the attribute
 * @throws IllegalStateException if the frequency is not 1 while {@code hasProx} is set
 */
private int getTermFreq() {
  final int customFreq = termFreqAtt.getTermFrequency();
  // A frequency other than 1 is refused when positions are being written.
  if (hasProx && customFreq != 1) {
    throw new IllegalStateException("field \"" + fieldInfo.name + "\": cannot index positions while using custom TermFrequencyAttribute");
  }
  return customFreq;
}
@Override @Override
public void newPostingsArray() { public void newPostingsArray() {
freqProxPostingsArray = (FreqProxPostingsArray) postingsArray; freqProxPostingsArray = (FreqProxPostingsArray) postingsArray;

View File

@ -109,6 +109,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
@Override @Override
boolean start(IndexableField field, boolean first) { boolean start(IndexableField field, boolean first) {
super.start(field, first);
assert field.fieldType().indexOptions() != IndexOptions.NONE; assert field.fieldType().indexOptions() != IndexOptions.NONE;
if (first) { if (first) {
@ -224,7 +225,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
void newTerm(final int termID) { void newTerm(final int termID) {
TermVectorsPostingsArray postings = termVectorsPostingsArray; TermVectorsPostingsArray postings = termVectorsPostingsArray;
postings.freqs[termID] = 1; postings.freqs[termID] = getTermFreq();
postings.lastOffsets[termID] = 0; postings.lastOffsets[termID] = 0;
postings.lastPositions[termID] = 0; postings.lastPositions[termID] = 0;
@ -235,11 +236,25 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
void addTerm(final int termID) { void addTerm(final int termID) {
TermVectorsPostingsArray postings = termVectorsPostingsArray; TermVectorsPostingsArray postings = termVectorsPostingsArray;
postings.freqs[termID]++; postings.freqs[termID] += getTermFreq();
writeProx(postings, termID); writeProx(postings, termID);
} }
/**
 * Reads the current token's frequency from {@code termFreqAtt}.
 *
 * @return the frequency reported by the attribute
 * @throws IllegalArgumentException if the frequency is not 1 while term vector
 *         positions or term vector offsets are enabled
 */
private int getTermFreq() {
  final int customFreq = termFreqAtt.getTermFrequency();
  if (customFreq == 1) {
    // The default frequency needs no further checks.
    return customFreq;
  }
  // Positions are checked before offsets, so positions win when both are enabled.
  if (doVectorPositions) {
    throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector positions while using custom TermFrequencyAttribute");
  }
  if (doVectorOffsets) {
    throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector offsets while using custom TermFrequencyAttribute");
  }
  return customFreq;
}
@Override @Override
public void newPostingsArray() { public void newPostingsArray() {
termVectorsPostingsArray = (TermVectorsPostingsArray) postingsArray; termVectorsPostingsArray = (TermVectorsPostingsArray) postingsArray;

View File

@ -19,12 +19,13 @@ package org.apache.lucene.index;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter; import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IntBlockPool; import org.apache.lucene.util.IntBlockPool;
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
abstract class TermsHashPerField implements Comparable<TermsHashPerField> { abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
private static final int HASH_INIT_SIZE = 4; private static final int HASH_INIT_SIZE = 4;
@ -35,6 +36,7 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
protected final DocumentsWriterPerThread.DocState docState; protected final DocumentsWriterPerThread.DocState docState;
protected final FieldInvertState fieldState; protected final FieldInvertState fieldState;
TermToBytesRefAttribute termAtt; TermToBytesRefAttribute termAtt;
protected TermFrequencyAttribute termFreqAtt;
// Copied from our perThread // Copied from our perThread
final IntBlockPool intPool; final IntBlockPool intPool;
@ -287,6 +289,7 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
* document. */ * document. */
boolean start(IndexableField field, boolean first) { boolean start(IndexableField field, boolean first) {
termAtt = fieldState.termAttribute; termAtt = fieldState.termAttribute;
termFreqAtt = fieldState.termFreqAttribute;
if (nextPerField != null) { if (nextPerField != null) {
doNextCall = nextPerField.start(field, first); doNextCall = nextPerField.start(field, first);
} }

View File

@ -125,6 +125,7 @@ public class TestToken extends LuceneTestCase {
t.setFlags(8); t.setFlags(8);
t.setPositionIncrement(3); t.setPositionIncrement(3);
t.setPositionLength(11); t.setPositionLength(11);
t.setTermFrequency(42);
TestUtil.assertAttributeReflection(t, TestUtil.assertAttributeReflection(t,
new HashMap<String, Object>() {{ new HashMap<String, Object>() {{
put(CharTermAttribute.class.getName() + "#term", "foobar"); put(CharTermAttribute.class.getName() + "#term", "foobar");
@ -136,6 +137,7 @@ public class TestToken extends LuceneTestCase {
put(PayloadAttribute.class.getName() + "#payload", null); put(PayloadAttribute.class.getName() + "#payload", null);
put(TypeAttribute.class.getName() + "#type", TypeAttribute.DEFAULT_TYPE); put(TypeAttribute.class.getName() + "#type", TypeAttribute.DEFAULT_TYPE);
put(FlagsAttribute.class.getName() + "#flags", 8); put(FlagsAttribute.class.getName() + "#flags", 8);
put(TermFrequencyAttribute.class.getName() + "#termFrequency", 42);
}}); }});
} }
} }

View File

@ -82,6 +82,7 @@ public class TestPackedTokenAttributeImpl extends LuceneTestCase {
t.setPositionIncrement(3); t.setPositionIncrement(3);
t.setPositionLength(11); t.setPositionLength(11);
t.setType("foobar"); t.setType("foobar");
t.setTermFrequency(42);
TestUtil.assertAttributeReflection(t, TestUtil.assertAttributeReflection(t,
new HashMap<String, Object>() {{ new HashMap<String, Object>() {{
put(CharTermAttribute.class.getName() + "#term", "foobar"); put(CharTermAttribute.class.getName() + "#term", "foobar");
@ -91,6 +92,7 @@ public class TestPackedTokenAttributeImpl extends LuceneTestCase {
put(PositionIncrementAttribute.class.getName() + "#positionIncrement", 3); put(PositionIncrementAttribute.class.getName() + "#positionIncrement", 3);
put(PositionLengthAttribute.class.getName() + "#positionLength", 11); put(PositionLengthAttribute.class.getName() + "#positionLength", 11);
put(TypeAttribute.class.getName() + "#type", "foobar"); put(TypeAttribute.class.getName() + "#type", "foobar");
put(TermFrequencyAttribute.class.getName() + "#termFrequency", 42);
}}); }});
} }
} }

View File

@ -0,0 +1,468 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import static org.apache.lucene.index.PostingsEnum.NO_MORE_DOCS;
public class TestCustomTermFreq extends LuceneTestCase {
/**
 * Token stream that replays a fixed list of terms, giving each one the
 * term frequency found at the same index of a parallel array.
 */
private static final class CannedTermFreqs extends TokenStream {
  private final String[] terms;
  private final int[] termFreqs;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TermFrequencyAttribute termFreqAtt = addAttribute(TermFrequencyAttribute.class);
  /** Index of the next entry to emit. */
  private int upto;

  /**
   * @param terms the terms to emit, in order
   * @param termFreqs the frequency for each term; asserted to match {@code terms} in length
   */
  public CannedTermFreqs(String[] terms, int[] termFreqs) {
    this.terms = terms;
    this.termFreqs = termFreqs;
    assert terms.length == termFreqs.length;
  }

  /** Emits the next term and its frequency, or returns {@code false} once all are consumed. */
  @Override
  public boolean incrementToken() {
    if (upto != terms.length) {
      clearAttributes();
      termAtt.append(terms[upto]);
      termFreqAtt.setTermFrequency(termFreqs[upto]);
      upto++;
      return true;
    }
    return false;
  }

  /** Rewinds to the first entry. */
  @Override
  public void reset() {
    upto = 0;
  }
}
/** One document with two distinct terms: each term's postings report its custom frequency. */
public void testSingletonTermsOneDoc() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new MockAnalyzer(random())));

  FieldType freqsOnly = new FieldType(TextField.TYPE_NOT_STORED);
  freqsOnly.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  String[] terms = {"foo", "bar"};
  int[] freqs = {42, 128};
  Document document = new Document();
  document.add(new Field("field", new CannedTermFreqs(terms, freqs), freqsOnly));
  writer.addDocument(document);

  IndexReader reader = DirectoryReader.open(writer);

  PostingsEnum barPostings = MultiFields.getTermDocsEnum(reader, "field", new BytesRef("bar"));
  assertNotNull(barPostings);
  assertEquals(0, barPostings.nextDoc());
  assertEquals(128, barPostings.freq());
  assertEquals(NO_MORE_DOCS, barPostings.nextDoc());

  PostingsEnum fooPostings = MultiFields.getTermDocsEnum(reader, "field", new BytesRef("foo"));
  assertNotNull(fooPostings);
  assertEquals(0, fooPostings.nextDoc());
  assertEquals(42, fooPostings.freq());
  assertEquals(NO_MORE_DOCS, fooPostings.nextDoc());

  IOUtils.close(reader, writer, directory);
}
/** Two documents with two distinct terms each: postings report the per-document custom frequency. */
public void testSingletonTermsTwoDocs() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new MockAnalyzer(random())));

  FieldType freqsOnly = new FieldType(TextField.TYPE_NOT_STORED);
  freqsOnly.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  // First document: foo=42, bar=128.
  Document firstDoc = new Document();
  firstDoc.add(new Field("field",
                         new CannedTermFreqs(new String[] {"foo", "bar"}, new int[] {42, 128}),
                         freqsOnly));
  writer.addDocument(firstDoc);

  // Second document: foo=50, bar=50.
  Document secondDoc = new Document();
  secondDoc.add(new Field("field",
                          new CannedTermFreqs(new String[] {"foo", "bar"}, new int[] {50, 50}),
                          freqsOnly));
  writer.addDocument(secondDoc);

  IndexReader reader = DirectoryReader.open(writer);

  PostingsEnum barPostings = MultiFields.getTermDocsEnum(reader, "field", new BytesRef("bar"));
  assertNotNull(barPostings);
  assertEquals(0, barPostings.nextDoc());
  assertEquals(128, barPostings.freq());
  assertEquals(1, barPostings.nextDoc());
  assertEquals(50, barPostings.freq());
  assertEquals(NO_MORE_DOCS, barPostings.nextDoc());

  PostingsEnum fooPostings = MultiFields.getTermDocsEnum(reader, "field", new BytesRef("foo"));
  assertNotNull(fooPostings);
  assertEquals(0, fooPostings.nextDoc());
  assertEquals(42, fooPostings.freq());
  assertEquals(1, fooPostings.nextDoc());
  assertEquals(50, fooPostings.freq());
  assertEquals(NO_MORE_DOCS, fooPostings.nextDoc());

  IOUtils.close(reader, writer, directory);
}
/**
 * One document in which each term occurs twice: the indexed frequency is the
 * sum of the custom frequencies (bar: 128 + 100 = 228, foo: 42 + 17 = 59).
 */
public void testRepeatTermsOneDoc() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new MockAnalyzer(random())));

  FieldType freqsOnly = new FieldType(TextField.TYPE_NOT_STORED);
  freqsOnly.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  String[] terms = {"foo", "bar", "foo", "bar"};
  int[] freqs = {42, 128, 17, 100};
  Document document = new Document();
  document.add(new Field("field", new CannedTermFreqs(terms, freqs), freqsOnly));
  writer.addDocument(document);

  IndexReader reader = DirectoryReader.open(writer);

  PostingsEnum barPostings = MultiFields.getTermDocsEnum(reader, "field", new BytesRef("bar"));
  assertNotNull(barPostings);
  assertEquals(0, barPostings.nextDoc());
  assertEquals(228, barPostings.freq());
  assertEquals(NO_MORE_DOCS, barPostings.nextDoc());

  PostingsEnum fooPostings = MultiFields.getTermDocsEnum(reader, "field", new BytesRef("foo"));
  assertNotNull(fooPostings);
  assertEquals(0, fooPostings.nextDoc());
  assertEquals(59, fooPostings.freq());
  assertEquals(NO_MORE_DOCS, fooPostings.nextDoc());

  IOUtils.close(reader, writer, directory);
}
/**
 * Two documents in which each term occurs twice: per document, the indexed
 * frequency is the sum of that document's custom frequencies
 * (doc 0: bar=228, foo=59; doc 1: bar=140, foo=120).
 */
public void testRepeatTermsTwoDocs() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

  // Configured once and shared by both documents.
  FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
  fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  Document doc = new Document();
  Field field = new Field("field",
                          new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                              new int[] {42, 128, 17, 100}),
                          fieldType);
  doc.add(field);
  w.addDocument(doc);

  doc = new Document();
  field = new Field("field",
                    new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                        new int[] {50, 60, 70, 80}),
                    fieldType);
  doc.add(field);
  w.addDocument(doc);

  IndexReader r = DirectoryReader.open(w);

  PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
  assertNotNull(postings);
  assertEquals(0, postings.nextDoc());
  assertEquals(228, postings.freq());
  assertEquals(1, postings.nextDoc());
  assertEquals(140, postings.freq());
  assertEquals(NO_MORE_DOCS, postings.nextDoc());

  postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
  assertNotNull(postings);
  assertEquals(0, postings.nextDoc());
  assertEquals(59, postings.freq());
  assertEquals(1, postings.nextDoc());
  assertEquals(120, postings.freq());
  assertEquals(NO_MORE_DOCS, postings.nextDoc());

  IOUtils.close(r, w, dir);
}
/**
 * Checks that {@code totalTermFreq} adds up the custom frequencies across
 * both documents (foo: 42 + 17 + 50 + 70 = 179, bar: 128 + 100 + 60 + 80 = 368).
 */
public void testTotalTermFreq() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

  // Configured once and shared by both documents.
  FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
  fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  Document doc = new Document();
  Field field = new Field("field",
                          new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                              new int[] {42, 128, 17, 100}),
                          fieldType);
  doc.add(field);
  w.addDocument(doc);

  doc = new Document();
  field = new Field("field",
                    new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                        new int[] {50, 60, 70, 80}),
                    fieldType);
  doc.add(field);
  w.addDocument(doc);

  IndexReader r = DirectoryReader.open(w);

  TermsEnum termsEnum = MultiFields.getTerms(r, "field").iterator();
  assertTrue(termsEnum.seekExact(new BytesRef("foo")));
  assertEquals(179, termsEnum.totalTermFreq());
  assertTrue(termsEnum.seekExact(new BytesRef("bar")));
  assertEquals(368, termsEnum.totalTermFreq());

  IOUtils.close(r, w, dir);
}
/**
 * You can't index proximity with custom term freqs: the field type is left at
 * the index options of {@code TextField.TYPE_NOT_STORED}, and adding the
 * document must fail with the "cannot index positions" message.
 */
public void testInvalidProx() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new MockAnalyzer(random())));

  FieldType defaultTextType = new FieldType(TextField.TYPE_NOT_STORED);
  String[] terms = {"foo", "bar", "foo", "bar"};
  int[] freqs = {42, 128, 17, 100};

  Document document = new Document();
  document.add(new Field("field", new CannedTermFreqs(terms, freqs), defaultTextType));

  IllegalStateException thrown =
      expectThrows(IllegalStateException.class, () -> writer.addDocument(document));
  assertEquals("field \"field\": cannot index positions while using custom TermFrequencyAttribute", thrown.getMessage());

  IOUtils.close(writer, directory);
}
/**
 * You can't index DOCS_ONLY with custom term freq: with
 * {@code IndexOptions.DOCS} and repeated terms, adding the document must
 * fail with the "must index term freq" message.
 */
public void testInvalidDocsOnly() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new MockAnalyzer(random())));

  FieldType docsOnly = new FieldType(TextField.TYPE_NOT_STORED);
  docsOnly.setIndexOptions(IndexOptions.DOCS);

  String[] terms = {"foo", "bar", "foo", "bar"};
  int[] freqs = {42, 128, 17, 100};
  Document document = new Document();
  document.add(new Field("field", new CannedTermFreqs(terms, freqs), docsOnly));

  IllegalStateException thrown =
      expectThrows(IllegalStateException.class, () -> writer.addDocument(document));
  assertEquals("field \"field\": must index term freq while using custom TermFrequencyAttribute", thrown.getMessage());

  IOUtils.close(writer, directory);
}
/**
 * Sum of term freqs must fit in an int: a document whose custom frequencies
 * are 3 and {@code Integer.MAX_VALUE} is expected to fail with an
 * {@link ArithmeticException}, leaving only the earlier document visible.
 */
public void testOverflowInt() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new MockAnalyzer(random())));

  FieldType docsOnly = new FieldType(TextField.TYPE_NOT_STORED);
  docsOnly.setIndexOptions(IndexOptions.DOCS);

  // An ordinary document that is expected to be indexed.
  Document goodDoc = new Document();
  goodDoc.add(new Field("field", "this field should be indexed", docsOnly));
  writer.addDocument(goodDoc);

  // A document whose frequencies add up past Integer.MAX_VALUE.
  Document overflowDoc = new Document();
  overflowDoc.add(new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar"},
                                                new int[] {3, Integer.MAX_VALUE}),
                            docsOnly));
  expectThrows(ArithmeticException.class, () -> writer.addDocument(overflowDoc));

  IndexReader reader = DirectoryReader.open(writer);
  assertEquals(1, reader.numDocs());

  IOUtils.close(reader, writer, directory);
}
// Term vector positions cannot be stored together with custom term frequencies:
// adding the document is expected to fail with an IllegalArgumentException.
public void testInvalidTermVectorPositions() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new MockAnalyzer(random())));

  // Postings carry docs and freqs only; the term vectors additionally ask for positions.
  FieldType vectorType = new FieldType(TextField.TYPE_NOT_STORED);
  vectorType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  vectorType.setStoreTermVectors(true);
  vectorType.setStoreTermVectorPositions(true);

  String[] terms = {"foo", "bar", "foo", "bar"};
  int[] freqs = {42, 128, 17, 100};

  Document document = new Document();
  document.add(new Field("field", new CannedTermFreqs(terms, freqs), vectorType));

  Exception failure = expectThrows(IllegalArgumentException.class, () -> writer.addDocument(document));
  assertEquals("field \"field\": cannot index term vector positions while using custom TermFrequencyAttribute", failure.getMessage());

  IOUtils.close(writer, directory);
}
// Term vector offsets cannot be stored together with custom term frequencies:
// adding the document is expected to fail with an IllegalArgumentException.
public void testInvalidTermVectorOffsets() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new MockAnalyzer(random())));

  // Postings carry docs and freqs only; the term vectors additionally ask for offsets.
  FieldType vectorType = new FieldType(TextField.TYPE_NOT_STORED);
  vectorType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  vectorType.setStoreTermVectors(true);
  vectorType.setStoreTermVectorOffsets(true);

  String[] terms = {"foo", "bar", "foo", "bar"};
  int[] freqs = {42, 128, 17, 100};

  Document document = new Document();
  document.add(new Field("field", new CannedTermFreqs(terms, freqs), vectorType));

  Exception failure = expectThrows(IllegalArgumentException.class, () -> writer.addDocument(document));
  assertEquals("field \"field\": cannot index term vector offsets while using custom TermFrequencyAttribute", failure.getMessage());

  IOUtils.close(writer, directory);
}
// Term vectors (without positions or offsets) must report the summed custom term
// frequencies per term, separately for each document.
public void testTermVectors() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new MockAnalyzer(random())));

  FieldType vectorType = new FieldType(TextField.TYPE_NOT_STORED);
  vectorType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  vectorType.setStoreTermVectors(true);

  // First document: foo = 42 + 17, bar = 128 + 100.
  Document document = new Document();
  document.add(new Field("field",
                         new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                             new int[] {42, 128, 17, 100}),
                         vectorType));
  writer.addDocument(document);

  // Second document: foo = 50 + 70, bar = 60 + 80.
  document = new Document();
  // Same value as already configured above; the call is redundant but kept as-is.
  vectorType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  document.add(new Field("field",
                         new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                             new int[] {50, 60, 70, 80}),
                         vectorType));
  writer.addDocument(document);

  IndexReader reader = DirectoryReader.open(writer);

  // Terms are checked in this order for every document; expectedFreqs[docID][i] is the
  // summed frequency of checkedTerms[i] in that document.
  String[] checkedTerms = {"bar", "foo"};
  int[][] expectedFreqs = {
    {228, 59},
    {140, 120},
  };

  for (int docID = 0; docID < expectedFreqs.length; docID++) {
    Fields vectors = reader.getTermVectors(docID);
    TermsEnum termsEnum = vectors.terms("field").iterator();
    for (int i = 0; i < checkedTerms.length; i++) {
      int expectedFreq = expectedFreqs[docID][i];
      assertTrue(termsEnum.seekExact(new BytesRef(checkedTerms[i])));
      assertEquals(expectedFreq, termsEnum.totalTermFreq());

      // The postings of this term vector must hold exactly one entry, doc 0, then be exhausted.
      PostingsEnum postings = termsEnum.postings(null);
      assertNotNull(postings);
      assertEquals(0, postings.nextDoc());
      assertEquals(expectedFreq, postings.freq());
      assertEquals(NO_MORE_DOCS, postings.nextDoc());
    }
  }

  IOUtils.close(reader, writer, directory);
}
/**
 * A {@link Similarity} that remembers the most recent {@link FieldInvertState} passed to
 * {@link #computeNorm(FieldInvertState)} so a test can inspect it afterwards.
 * The scoring entry points are not supported and always throw.
 */
private static class NeverForgetsSimilarity extends Similarity {

  /** The one shared instance; the constructor is private. */
  private static final NeverForgetsSimilarity INSTANCE = new NeverForgetsSimilarity();

  /** State handed to the latest {@code computeNorm} call; unset until the first call. */
  public FieldInvertState lastState;

  private NeverForgetsSimilarity() {
    // use INSTANCE
  }

  /** Records {@code state} in {@link #lastState} and returns a constant norm of 1. */
  @Override
  public long computeNorm(FieldInvertState state) {
    lastState = state;
    return 1;
  }

  /** Not supported. */
  @Override
  public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    throw new UnsupportedOperationException();
  }

  /** Not supported. */
  @Override
  public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
    throw new UnsupportedOperationException();
  }
}
// The FieldInvertState seen by the similarity must reflect the custom term frequencies:
// the largest per-term sum as max term frequency and the total of all frequencies as length.
public void testFieldInvertState() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
  config.setSimilarity(NeverForgetsSimilarity.INSTANCE);
  IndexWriter writer = new IndexWriter(directory, config);

  FieldType freqsType = new FieldType(TextField.TYPE_NOT_STORED);
  freqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  String[] terms = {"foo", "bar", "foo", "bar"};
  int[] freqs = {42, 128, 17, 100};

  Document document = new Document();
  document.add(new Field("field", new CannedTermFreqs(terms, freqs), freqsType));
  writer.addDocument(document);

  FieldInvertState state = NeverForgetsSimilarity.INSTANCE.lastState;
  // "bar" has the largest summed frequency: 128 + 100.
  assertEquals(228, state.getMaxTermFrequency());
  assertEquals(2, state.getUniqueTermCount());
  assertEquals(0, state.getNumOverlap());
  // 42 + 128 + 17 + 100
  assertEquals(287, state.getLength());

  IOUtils.close(writer, directory);
}
}

View File

@ -0,0 +1,139 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
 * Checks the statistics exposed by {@link FieldInvertState} after a field has been inverted,
 * by capturing the state through a custom {@link Similarity}.
 */
public class TestFieldInvertState extends LuceneTestCase {

  /**
   * A {@link Similarity} that remembers the most recent {@link FieldInvertState} passed to
   * {@link #computeNorm(FieldInvertState)} so a test can inspect it afterwards.
   * The scoring entry points are not supported and always throw.
   */
  private static class NeverForgetsSimilarity extends Similarity {

    /** The one shared instance; the constructor is private. */
    private static final NeverForgetsSimilarity INSTANCE = new NeverForgetsSimilarity();

    /** State handed to the latest {@code computeNorm} call; unset until the first call. */
    public FieldInvertState lastState;

    private NeverForgetsSimilarity() {
      // use INSTANCE
    }

    /** Records {@code state} in {@link #lastState} and returns a constant norm of 1. */
    @Override
    public long computeNorm(FieldInvertState state) {
      lastState = state;
      return 1;
    }

    /** Not supported. */
    @Override
    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }

    /** Not supported. */
    @Override
    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
      throw new UnsupportedOperationException();
    }
  }

  /** Three distinct, non-overlapping tokens: every statistic can be stated by hand. */
  public void testBasic() throws Exception {
    Directory directory = newDirectory();
    IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
    config.setSimilarity(NeverForgetsSimilarity.INSTANCE);
    IndexWriter writer = new IndexWriter(directory, config);

    Token[] tokens = {
      new Token("a", 0, 1),
      new Token("b", 2, 3),
      new Token("c", 4, 5),
    };

    Document document = new Document();
    document.add(new Field("field", new CannedTokenStream(tokens), TextField.TYPE_NOT_STORED));
    writer.addDocument(document);

    FieldInvertState state = NeverForgetsSimilarity.INSTANCE.lastState;
    assertEquals(1, state.getMaxTermFrequency());
    assertEquals(3, state.getUniqueTermCount());
    assertEquals(0, state.getNumOverlap());
    assertEquals(3, state.getLength());

    IOUtils.close(writer, directory);
  }

  /**
   * Indexes a long random token stream, with some tokens stacked on the previous position,
   * and compares the inverted state against statistics tracked while generating the tokens.
   */
  public void testRandom() throws Exception {
    int numUniqueTokens = TestUtil.nextInt(random(), 1, 25);

    Directory directory = newDirectory();
    IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
    config.setSimilarity(NeverForgetsSimilarity.INSTANCE);
    IndexWriter writer = new IndexWriter(directory, config);
    Document document = new Document();

    int numTokens = atLeast(10000);
    Token[] tokens = new Token[numTokens];

    // Expected statistics, maintained alongside token generation.
    Map<Character,Integer> counts = new HashMap<>();
    int numStacked = 0;
    int maxTermFreq = 0;
    int lastPosition = -1;

    for (int index = 0; index < numTokens; index++) {
      char tokenChar = (char) ('a' + random().nextInt(numUniqueTokens));

      // Bump this token's occurrence count and track the largest count seen so far.
      int updatedCount = counts.merge(tokenChar, 1, Integer::sum);
      maxTermFreq = Math.max(maxTermFreq, updatedCount);

      Token token = new Token(Character.toString(tokenChar), 2*index, 2*index+1);

      // The first token always advances the position; later ones are stacked
      // (position increment 0) with probability 1/7.
      boolean stacked = index > 0 && random().nextInt(7) == 3;
      if (stacked) {
        token.setPositionIncrement(0);
        numStacked++;
      } else {
        lastPosition++;
      }
      tokens[index] = token;
    }

    document.add(new Field("field", new CannedTokenStream(tokens), TextField.TYPE_NOT_STORED));
    writer.addDocument(document);

    FieldInvertState state = NeverForgetsSimilarity.INSTANCE.lastState;
    assertEquals(maxTermFreq, state.getMaxTermFrequency());
    assertEquals(counts.size(), state.getUniqueTermCount());
    assertEquals(numStacked, state.getNumOverlap());
    assertEquals(numTokens, state.getLength());
    assertEquals(lastPosition, state.getPosition());

    IOUtils.close(writer, directory);
  }
}

View File

@ -2676,11 +2676,11 @@ public abstract class LuceneTestCase extends Assert {
if (expectedType.isInstance(e)) { if (expectedType.isInstance(e)) {
return expectedType.cast(e); return expectedType.cast(e);
} }
AssertionFailedError assertion = new AssertionFailedError("Unexpected exception type, expected " + expectedType.getSimpleName()); AssertionFailedError assertion = new AssertionFailedError("Unexpected exception type, expected " + expectedType.getSimpleName() + " but got " + e);
assertion.initCause(e); assertion.initCause(e);
throw assertion; throw assertion;
} }
throw new AssertionFailedError("Expected exception " + expectedType.getSimpleName()); throw new AssertionFailedError("Expected exception " + expectedType.getSimpleName() + " but no exception was thrown");
} }
/** /**