mirror of https://github.com/apache/lucene.git
LUCENE-7854: enable indexing custom term frequencies
parent 09a9fdab6d
commit d276acfbbc
lucene/CHANGES.txt
@@ -14,6 +14,10 @@ New Features
   well as the oldest Lucene version that contributed to the segment.
   (Adrien Grand)
 
+* LUCENE-7854: The new TermFrequencyAttribute used during analysis
+  with a custom token stream allows indexing custom term frequencies
+  (Mike McCandless)
+
 API Changes
 
 * LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.
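How the new attribute is used in practice: a custom TokenStream adds TermFrequencyAttribute alongside its term attribute and sets a per-token frequency, and the field is then indexed with frequencies but without positions or offsets. A minimal sketch, modeled on the CannedTermFreqs test stream added later in this commit (the class and member names here are illustrative, not part of the commit):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;

// Emits a single token whose term frequency is supplied by the caller.
final class SingleTokenWithFreq extends TokenStream {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TermFrequencyAttribute termFreqAtt = addAttribute(TermFrequencyAttribute.class);
  private final String term;
  private final int freq;
  private boolean done;

  SingleTokenWithFreq(String term, int freq) {
    this.term = term;
    this.freq = freq;
  }

  @Override
  public boolean incrementToken() {
    if (done) {
      return false;
    }
    clearAttributes();
    termAtt.append(term);                // the indexed term
    termFreqAtt.setTermFrequency(freq);  // its custom frequency within this document
    done = true;
    return true;
  }

  @Override
  public void reset() {
    done = false;
  }
}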
lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
@@ -26,15 +26,18 @@ import org.apache.lucene.util.AttributeReflector;
  * <li>{@link PositionIncrementAttribute}
  * <li>{@link PositionLengthAttribute}
  * <li>{@link OffsetAttribute}
+ * <li>{@link TermFrequencyAttribute}
  * </ul>*/
 public class PackedTokenAttributeImpl extends CharTermAttributeImpl
                    implements TypeAttribute, PositionIncrementAttribute,
-                              PositionLengthAttribute, OffsetAttribute {
+                              PositionLengthAttribute, OffsetAttribute,
+                              TermFrequencyAttribute {
 
   private int startOffset,endOffset;
   private String type = DEFAULT_TYPE;
   private int positionIncrement = 1;
   private int positionLength = 1;
+  private int termFrequency = 1;
 
   /** Constructs the attribute implementation. */
   public PackedTokenAttributeImpl() {
@@ -132,12 +135,26 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
     this.type = type;
   }
 
+  @Override
+  public final void setTermFrequency(int termFrequency) {
+    if (termFrequency < 1) {
+      throw new IllegalArgumentException("Term frequency must be 1 or greater; got " + termFrequency);
+    }
+    this.termFrequency = termFrequency;
+  }
+
+  @Override
+  public final int getTermFrequency() {
+    return termFrequency;
+  }
+
   /** Resets the attributes
    */
   @Override
   public void clear() {
     super.clear();
     positionIncrement = positionLength = 1;
+    termFrequency = 1;
     startOffset = endOffset = 0;
     type = DEFAULT_TYPE;
   }
@@ -147,10 +164,8 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
   @Override
   public void end() {
     super.end();
+    // super.end already calls this.clear, so we only set values that are different from clear:
     positionIncrement = 0;
-    positionLength = 1;
-    startOffset = endOffset = 0;
-    type = DEFAULT_TYPE;
   }
 
   @Override
@@ -170,6 +185,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
             positionIncrement == other.positionIncrement &&
             positionLength == other.positionLength &&
             (type == null ? other.type == null : type.equals(other.type)) &&
+            termFrequency == other.termFrequency &&
             super.equals(obj)
             );
     } else
@@ -185,6 +201,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
     code = code * 31 + positionLength;
     if (type != null)
       code = code * 31 + type.hashCode();
+    code = code * 31 + termFrequency;
     return code;
   }
 
@@ -198,12 +215,14 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
       to.startOffset = startOffset;
       to.endOffset = endOffset;
       to.type = type;
+      to.termFrequency = termFrequency;
     } else {
       super.copyTo(target);
       ((OffsetAttribute) target).setOffset(startOffset, endOffset);
       ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
       ((PositionLengthAttribute) target).setPositionLength(positionLength);
       ((TypeAttribute) target).setType(type);
+      ((TermFrequencyAttribute) target).setTermFrequency(termFrequency);
     }
   }
 
@@ -215,6 +234,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
     reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
     reflector.reflect(PositionLengthAttribute.class, "positionLength", positionLength);
     reflector.reflect(TypeAttribute.class, "type", type);
+    reflector.reflect(TermFrequencyAttribute.class, "termFrequency", termFrequency);
   }
 
 }
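The extra bookkeeping above is not optional: attribute state is recycled across tokens, clearAttributes() calls clear() on every implementation, captureState()/restoreState() go through copyTo(), and equals()/hashCode() must stay consistent with the new field. A small sketch of the round-trip these overrides have to survive (a fragment, e.g. inside a JUnit test method):

PackedTokenAttributeImpl a = new PackedTokenAttributeImpl();
a.append("foo");
a.setTermFrequency(42);

PackedTokenAttributeImpl b = new PackedTokenAttributeImpl();
a.copyTo(b);                         // must carry termFrequency across
assertEquals(42, b.getTermFrequency());
assertEquals(a, b);
assertEquals(a.hashCode(), b.hashCode());

a.clear();                           // back to the default
assertEquals(1, a.getTermFrequency());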
lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermFrequencyAttribute.java (new file)
@@ -0,0 +1,33 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.tokenattributes;

import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.util.Attribute;

/** Sets the custom term frequency of a term within one document. If this attribute
 *  is present in your analysis chain for a given field, that field must be indexed with
 *  {@link IndexOptions#DOCS_AND_FREQS}. */
public interface TermFrequencyAttribute extends Attribute {

  /** Set the custom term frequency of the current term within one document. */
  public void setTermFrequency(int termFrequency);

  /** Returns the custom term frequency. */
  public int getTermFrequency();
}
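The DOCS_AND_FREQS requirement in the javadoc above translates into field setup like the following (a fragment reusing the illustrative SingleTokenWithFreq stream from the earlier note; only the IndexOptions line is the hard requirement):

FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);  // freqs yes; positions/offsets are rejected
Field field = new Field("pageviews", new SingleTokenWithFreq("home", 1234), ft);
doc.add(field);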
lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermFrequencyAttributeImpl.java (new file)
@@ -0,0 +1,82 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.tokenattributes;


import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;

/** Default implementation of {@link TermFrequencyAttribute}. */
public class TermFrequencyAttributeImpl extends AttributeImpl implements TermFrequencyAttribute, Cloneable {
  private int termFrequency = 1;

  /** Initialize this attribute with term frequency of 1 */
  public TermFrequencyAttributeImpl() {}

  @Override
  public void setTermFrequency(int termFrequency) {
    if (termFrequency < 1) {
      throw new IllegalArgumentException("Term frequency must be 1 or greater; got " + termFrequency);
    }
    this.termFrequency = termFrequency;
  }

  @Override
  public int getTermFrequency() {
    return termFrequency;
  }

  @Override
  public void clear() {
    this.termFrequency = 1;
  }

  @Override
  public void end() {
    this.termFrequency = 1;
  }

  @Override
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof TermFrequencyAttributeImpl) {
      TermFrequencyAttributeImpl _other = (TermFrequencyAttributeImpl) other;
      return termFrequency == _other.termFrequency;
    }

    return false;
  }

  @Override
  public int hashCode() {
    return Integer.hashCode(termFrequency);
  }

  @Override
  public void copyTo(AttributeImpl target) {
    TermFrequencyAttribute t = (TermFrequencyAttribute) target;
    t.setTermFrequency(termFrequency);
  }

  @Override
  public void reflectWith(AttributeReflector reflector) {
    reflector.reflect(TermFrequencyAttribute.class, "termFrequency", termFrequency);
  }
}
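Nothing registers this implementation anywhere: Lucene's default AttributeFactory resolves an attribute interface to its implementation by appending Impl to the interface name, so addAttribute(TermFrequencyAttribute.class) on an ordinary TokenStream instantiates exactly this class, and the frequency stays at 1 until someone calls setTermFrequency. Sketched (the stream placeholder is hypothetical):

TokenStream stream = ...;  // any stream built with AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
TermFrequencyAttribute termFreq = stream.addAttribute(TermFrequencyAttribute.class);
// termFreq is backed by TermFrequencyAttributeImpl; getTermFrequency() == 1 by default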
lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -770,10 +770,8 @@ final class DefaultIndexingChain extends DocConsumer {
       }
       invertState.lastStartOffset = startOffset;
 
-      invertState.length++;
-      if (invertState.length < 0) {
-        throw new IllegalArgumentException("too many tokens in field '" + field.name() + "'");
-      }
+      invertState.length = Math.addExact(invertState.length, invertState.termFreqAttribute.getTermFrequency());
+
       //System.out.println("  term=" + invertState.termAttribute);
 
       // If we hit an exception in here, we abort
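The switch to Math.addExact changes two things: the field's length now grows by each token's custom frequency rather than by one, and overflow now surfaces as an ArithmeticException instead of relying on the old wrap-around-to-negative check. For instance:

int length = Integer.MAX_VALUE - 1;
length = Math.addExact(length, 1);  // ok: Integer.MAX_VALUE
length = Math.addExact(length, 1);  // throws java.lang.ArithmeticException: integer overflow

The testOverflowInt case added later in this commit exercises exactly this path.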
lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
@@ -20,6 +20,7 @@ import org.apache.lucene.analysis.TokenStream; // javadocs
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.util.AttributeSource;
 
@@ -48,6 +49,7 @@ public final class FieldInvertState {
   PositionIncrementAttribute posIncrAttribute;
   PayloadAttribute payloadAttribute;
   TermToBytesRefAttribute termAttribute;
+  TermFrequencyAttribute termFreqAttribute;
 
   /** Creates {@code FieldInvertState} for the specified
    *  field name. */
@@ -88,6 +90,7 @@ public final class FieldInvertState {
     if (this.attributeSource != attributeSource) {
       this.attributeSource = attributeSource;
       termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
+      termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
       posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
       offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
       payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
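Note the deliberate asymmetry above: payloadAttribute uses getAttribute and may be null when the stream carries no payloads, while termFreqAttribute uses addAttribute, so the indexing chain always has a frequency to read, defaulting to 1 for streams that never set one. Sketched:

// getAttribute: returns null when the stream never added the attribute.
PayloadAttribute payload = attributeSource.getAttribute(PayloadAttribute.class);
// addAttribute: creates the attribute (default termFrequency == 1) if absent.
TermFrequencyAttribute freq = attributeSource.addAttribute(TermFrequencyAttribute.class);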
lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
@@ -113,9 +113,10 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
     if (!hasFreq) {
       assert postings.termFreqs == null;
       postings.lastDocCodes[termID] = docState.docID;
+      fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
     } else {
       postings.lastDocCodes[termID] = docState.docID << 1;
-      postings.termFreqs[termID] = 1;
+      postings.termFreqs[termID] = getTermFreq();
       if (hasProx) {
         writeProx(termID, fieldState.position);
         if (hasOffsets) {
@@ -124,19 +125,21 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
       } else {
         assert !hasOffsets;
       }
+      fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
     }
-    fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
     fieldState.uniqueTermCount++;
   }
 
   @Override
   void addTerm(final int termID) {
     final FreqProxPostingsArray postings = freqProxPostingsArray;
 
     assert !hasFreq || postings.termFreqs[termID] > 0;
 
     if (!hasFreq) {
       assert postings.termFreqs == null;
+      if (termFreqAtt.getTermFrequency() != 1) {
+        throw new IllegalStateException("field \"" + fieldInfo.name + "\": must index term freq while using custom TermFrequencyAttribute");
+      }
       if (docState.docID != postings.lastDocIDs[termID]) {
         // New document; now encode docCode for previous doc:
         assert docState.docID > postings.lastDocIDs[termID];
@@ -160,8 +163,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
       }
 
       // Init freq for the current document
-      postings.termFreqs[termID] = 1;
-      fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
+      postings.termFreqs[termID] = getTermFreq();
+      fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
       postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
       postings.lastDocIDs[termID] = docState.docID;
       if (hasProx) {
@@ -175,7 +178,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
       }
       fieldState.uniqueTermCount++;
     } else {
-      fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.termFreqs[termID]);
+      postings.termFreqs[termID] = Math.addExact(postings.termFreqs[termID], getTermFreq());
+      fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, postings.termFreqs[termID]);
       if (hasProx) {
         writeProx(termID, fieldState.position-postings.lastPositions[termID]);
         if (hasOffsets) {
@@ -185,6 +189,17 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
       }
     }
 
+  private int getTermFreq() {
+    int freq = termFreqAtt.getTermFrequency();
+    if (freq != 1) {
+      if (hasProx) {
+        throw new IllegalStateException("field \"" + fieldInfo.name + "\": cannot index positions while using custom TermFrequencyAttribute");
+      }
+    }
+
+    return freq;
+  }
+
   @Override
   public void newPostingsArray() {
     freqProxPostingsArray = (FreqProxPostingsArray) postingsArray;
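A worked example of the accumulation above, using the freqs from testRepeatTermsOneDoc later in this commit: "foo" occurs twice in one document with custom freqs 42 and 17, so newTerm stores 42 and addTerm then computes Math.addExact(42, 17); likewise for "bar":

int fooFreq = Math.addExact(42, 17);   // 59, the value postings.freq() reports
int barFreq = Math.addExact(128, 100); // 228

Positions are rejected whenever freq != 1 because a single token cannot say where its 42 "occurrences" sit in the document; only the document-level count is being asserted.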
lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
@@ -109,6 +109,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
 
   @Override
   boolean start(IndexableField field, boolean first) {
+    super.start(field, first);
     assert field.fieldType().indexOptions() != IndexOptions.NONE;
 
     if (first) {
@@ -224,7 +225,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
   void newTerm(final int termID) {
     TermVectorsPostingsArray postings = termVectorsPostingsArray;
 
-    postings.freqs[termID] = 1;
+    postings.freqs[termID] = getTermFreq();
     postings.lastOffsets[termID] = 0;
     postings.lastPositions[termID] = 0;
 
@@ -235,11 +236,25 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
   void addTerm(final int termID) {
     TermVectorsPostingsArray postings = termVectorsPostingsArray;
 
-    postings.freqs[termID]++;
+    postings.freqs[termID] += getTermFreq();
 
     writeProx(postings, termID);
   }
 
+  private int getTermFreq() {
+    int freq = termFreqAtt.getTermFrequency();
+    if (freq != 1) {
+      if (doVectorPositions) {
+        throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector positions while using custom TermFrequencyAttribute");
+      }
+      if (doVectorOffsets) {
+        throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector offsets while using custom TermFrequencyAttribute");
+      }
+    }
+
+    return freq;
+  }
+
   @Override
   public void newPostingsArray() {
     termVectorsPostingsArray = (TermVectorsPostingsArray) postingsArray;
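Term vectors apply the same rule per document: frequencies may be summed, but vector positions and offsets cannot be combined with a custom frequency (note this path throws IllegalArgumentException where the postings writer above throws IllegalStateException). A field type the guard rejects, mirroring testInvalidTermVectorPositions below:

FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true); // with custom term freqs: IllegalArgumentException at addDocument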
lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
@@ -19,12 +19,13 @@ package org.apache.lucene.index;
 
 import java.io.IOException;
 
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRefHash.BytesStartArray;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.IntBlockPool;
-import org.apache.lucene.util.BytesRefHash.BytesStartArray;
 
 abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
   private static final int HASH_INIT_SIZE = 4;
@@ -35,6 +36,7 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
   protected final DocumentsWriterPerThread.DocState docState;
   protected final FieldInvertState fieldState;
   TermToBytesRefAttribute termAtt;
+  protected TermFrequencyAttribute termFreqAtt;
 
   // Copied from our perThread
   final IntBlockPool intPool;
@@ -287,6 +289,7 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
    *  document. */
   boolean start(IndexableField field, boolean first) {
     termAtt = fieldState.termAttribute;
+    termFreqAtt = fieldState.termFreqAttribute;
     if (nextPerField != null) {
       doNextCall = nextPerField.start(field, first);
     }
lucene/core/src/test/org/apache/lucene/analysis/TestToken.java
@@ -125,6 +125,7 @@ public class TestToken extends LuceneTestCase {
     t.setFlags(8);
     t.setPositionIncrement(3);
     t.setPositionLength(11);
+    t.setTermFrequency(42);
     TestUtil.assertAttributeReflection(t,
       new HashMap<String, Object>() {{
         put(CharTermAttribute.class.getName() + "#term", "foobar");
@@ -136,6 +137,7 @@ public class TestToken extends LuceneTestCase {
         put(PayloadAttribute.class.getName() + "#payload", null);
         put(TypeAttribute.class.getName() + "#type", TypeAttribute.DEFAULT_TYPE);
         put(FlagsAttribute.class.getName() + "#flags", 8);
+        put(TermFrequencyAttribute.class.getName() + "#termFrequency", 42);
       }});
   }
 }
lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestPackedTokenAttributeImpl.java
@@ -82,6 +82,7 @@ public class TestPackedTokenAttributeImpl extends LuceneTestCase {
     t.setPositionIncrement(3);
     t.setPositionLength(11);
     t.setType("foobar");
+    t.setTermFrequency(42);
     TestUtil.assertAttributeReflection(t,
       new HashMap<String, Object>() {{
         put(CharTermAttribute.class.getName() + "#term", "foobar");
@@ -91,6 +92,7 @@ public class TestPackedTokenAttributeImpl extends LuceneTestCase {
         put(PositionIncrementAttribute.class.getName() + "#positionIncrement", 3);
         put(PositionLengthAttribute.class.getName() + "#positionLength", 11);
         put(TypeAttribute.class.getName() + "#type", "foobar");
+        put(TermFrequencyAttribute.class.getName() + "#termFrequency", 42);
       }});
   }
 }
lucene/core/src/test/org/apache/lucene/index/TestCustomTermFreq.java (new file)
@@ -0,0 +1,468 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.index;

import java.io.IOException;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;

import static org.apache.lucene.index.PostingsEnum.NO_MORE_DOCS;

public class TestCustomTermFreq extends LuceneTestCase {

  private static final class CannedTermFreqs extends TokenStream {
    private final String[] terms;
    private final int[] termFreqs;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final TermFrequencyAttribute termFreqAtt = addAttribute(TermFrequencyAttribute.class);
    private int upto;

    public CannedTermFreqs(String[] terms, int[] termFreqs) {
      this.terms = terms;
      this.termFreqs = termFreqs;
      assert terms.length == termFreqs.length;
    }

    @Override
    public boolean incrementToken() {
      if (upto == terms.length) {
        return false;
      }

      clearAttributes();

      termAtt.append(terms[upto]);
      termFreqAtt.setTermFrequency(termFreqs[upto]);

      upto++;
      return true;
    }

    @Override
    public void reset() {
      upto = 0;
    }
  }

  public void testSingletonTermsOneDoc() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar"},
                                                new int[] {42, 128}),
                            fieldType);
    doc.add(field);
    w.addDocument(doc);
    IndexReader r = DirectoryReader.open(w);
    PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(128, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(42, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    IOUtils.close(r, w, dir);
  }

  public void testSingletonTermsTwoDocs() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar"},
                                                new int[] {42, 128}),
                            fieldType);
    doc.add(field);
    w.addDocument(doc);

    doc = new Document();
    field = new Field("field",
                      new CannedTermFreqs(new String[] {"foo", "bar"},
                                          new int[] {50, 50}),
                      fieldType);
    doc.add(field);
    w.addDocument(doc);

    IndexReader r = DirectoryReader.open(w);
    PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(128, postings.freq());
    assertEquals(1, postings.nextDoc());
    assertEquals(50, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(42, postings.freq());
    assertEquals(1, postings.nextDoc());
    assertEquals(50, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    IOUtils.close(r, w, dir);
  }

  public void testRepeatTermsOneDoc() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                                new int[] {42, 128, 17, 100}),
                            fieldType);
    doc.add(field);
    w.addDocument(doc);
    IndexReader r = DirectoryReader.open(w);
    PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(228, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(59, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    IOUtils.close(r, w, dir);
  }

  public void testRepeatTermsTwoDocs() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                                new int[] {42, 128, 17, 100}),
                            fieldType);
    doc.add(field);
    w.addDocument(doc);

    doc = new Document();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    field = new Field("field",
                      new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                          new int[] {50, 60, 70, 80}),
                      fieldType);
    doc.add(field);
    w.addDocument(doc);

    IndexReader r = DirectoryReader.open(w);
    PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(228, postings.freq());
    assertEquals(1, postings.nextDoc());
    assertEquals(140, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(59, postings.freq());
    assertEquals(1, postings.nextDoc());
    assertEquals(120, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    IOUtils.close(r, w, dir);
  }

  public void testTotalTermFreq() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                                new int[] {42, 128, 17, 100}),
                            fieldType);
    doc.add(field);
    w.addDocument(doc);

    doc = new Document();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    field = new Field("field",
                      new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                          new int[] {50, 60, 70, 80}),
                      fieldType);
    doc.add(field);
    w.addDocument(doc);

    IndexReader r = DirectoryReader.open(w);

    TermsEnum termsEnum = MultiFields.getTerms(r, "field").iterator();
    assertTrue(termsEnum.seekExact(new BytesRef("foo")));
    assertEquals(179, termsEnum.totalTermFreq());
    assertTrue(termsEnum.seekExact(new BytesRef("bar")));
    assertEquals(368, termsEnum.totalTermFreq());

    IOUtils.close(r, w, dir);
  }

  // you can't index proximity with custom term freqs:
  public void testInvalidProx() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                                new int[] {42, 128, 17, 100}),
                            fieldType);
    doc.add(field);
    Exception e = expectThrows(IllegalStateException.class, () -> {w.addDocument(doc);});
    assertEquals("field \"field\": cannot index positions while using custom TermFrequencyAttribute", e.getMessage());
    IOUtils.close(w, dir);
  }

  // you can't index DOCS_ONLY with custom term freq
  public void testInvalidDocsOnly() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                                new int[] {42, 128, 17, 100}),
                            fieldType);
    doc.add(field);
    Exception e = expectThrows(IllegalStateException.class, () -> {w.addDocument(doc);});
    assertEquals("field \"field\": must index term freq while using custom TermFrequencyAttribute", e.getMessage());
    IOUtils.close(w, dir);
  }

  // sum of term freqs must fit in an int
  public void testOverflowInt() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS);

    Document doc = new Document();
    doc.add(new Field("field", "this field should be indexed", fieldType));
    w.addDocument(doc);

    Document doc2 = new Document();
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar"},
                                                new int[] {3, Integer.MAX_VALUE}),
                            fieldType);
    doc2.add(field);
    expectThrows(ArithmeticException.class, () -> {w.addDocument(doc2);});

    IndexReader r = DirectoryReader.open(w);
    assertEquals(1, r.numDocs());

    IOUtils.close(r, w, dir);
  }

  public void testInvalidTermVectorPositions() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                                new int[] {42, 128, 17, 100}),
                            fieldType);
    doc.add(field);
    Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
    assertEquals("field \"field\": cannot index term vector positions while using custom TermFrequencyAttribute", e.getMessage());
    IOUtils.close(w, dir);
  }

  public void testInvalidTermVectorOffsets() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorOffsets(true);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                                new int[] {42, 128, 17, 100}),
                            fieldType);
    doc.add(field);
    Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
    assertEquals("field \"field\": cannot index term vector offsets while using custom TermFrequencyAttribute", e.getMessage());
    IOUtils.close(w, dir);
  }

  public void testTermVectors() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    fieldType.setStoreTermVectors(true);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                                new int[] {42, 128, 17, 100}),
                            fieldType);
    doc.add(field);
    w.addDocument(doc);

    doc = new Document();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    field = new Field("field",
                      new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                          new int[] {50, 60, 70, 80}),
                      fieldType);
    doc.add(field);
    w.addDocument(doc);

    IndexReader r = DirectoryReader.open(w);

    Fields fields = r.getTermVectors(0);
    TermsEnum termsEnum = fields.terms("field").iterator();
    assertTrue(termsEnum.seekExact(new BytesRef("bar")));
    assertEquals(228, termsEnum.totalTermFreq());
    PostingsEnum postings = termsEnum.postings(null);
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(228, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    assertTrue(termsEnum.seekExact(new BytesRef("foo")));
    assertEquals(59, termsEnum.totalTermFreq());
    postings = termsEnum.postings(null);
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(59, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    fields = r.getTermVectors(1);
    termsEnum = fields.terms("field").iterator();
    assertTrue(termsEnum.seekExact(new BytesRef("bar")));
    assertEquals(140, termsEnum.totalTermFreq());
    postings = termsEnum.postings(null);
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(140, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    assertTrue(termsEnum.seekExact(new BytesRef("foo")));
    assertEquals(120, termsEnum.totalTermFreq());
    postings = termsEnum.postings(null);
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(120, postings.freq());
    assertEquals(NO_MORE_DOCS, postings.nextDoc());

    IOUtils.close(r, w, dir);
  }

  /**
   * Similarity holds onto the FieldInvertState for subsequent verification.
   */
  private static class NeverForgetsSimilarity extends Similarity {
    public FieldInvertState lastState;
    private final static NeverForgetsSimilarity INSTANCE = new NeverForgetsSimilarity();

    private NeverForgetsSimilarity() {
      // no
    }

    @Override
    public long computeNorm(FieldInvertState state) {
      this.lastState = state;
      return 1;
    }

    @Override
    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }

    @Override
    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
      throw new UnsupportedOperationException();
    }
  }

  public void testFieldInvertState() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    iwc.setSimilarity(NeverForgetsSimilarity.INSTANCE);
    IndexWriter w = new IndexWriter(dir, iwc);

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    Field field = new Field("field",
                            new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                                new int[] {42, 128, 17, 100}),
                            fieldType);
    doc.add(field);
    w.addDocument(doc);
    FieldInvertState fis = NeverForgetsSimilarity.INSTANCE.lastState;
    assertEquals(228, fis.getMaxTermFrequency());
    assertEquals(2, fis.getUniqueTermCount());
    assertEquals(0, fis.getNumOverlap());
    assertEquals(287, fis.getLength());

    IOUtils.close(w, dir);
  }
}
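The arithmetic behind the expected values in these tests follows directly from summing the custom freqs, e.g. for testTotalTermFreq (two documents, each containing "foo" and "bar" twice):

int fooTotal = (42 + 17) + (50 + 70);   // 179 == totalTermFreq("foo")
int barTotal = (128 + 100) + (60 + 80); // 368 == totalTermFreq("bar")

and testFieldInvertState's expected length of 287 is just 42 + 128 + 17 + 100.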
lucene/core/src/test/org/apache/lucene/index/TestFieldInvertState.java (new file)
@@ -0,0 +1,139 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.index;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

public class TestFieldInvertState extends LuceneTestCase {
  /**
   * Similarity holds onto the FieldInvertState for subsequent verification.
   */
  private static class NeverForgetsSimilarity extends Similarity {
    public FieldInvertState lastState;
    private final static NeverForgetsSimilarity INSTANCE = new NeverForgetsSimilarity();

    private NeverForgetsSimilarity() {
      // no
    }

    @Override
    public long computeNorm(FieldInvertState state) {
      this.lastState = state;
      return 1;
    }

    @Override
    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }

    @Override
    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
      throw new UnsupportedOperationException();
    }
  }

  public void testBasic() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    iwc.setSimilarity(NeverForgetsSimilarity.INSTANCE);
    IndexWriter w = new IndexWriter(dir, iwc);
    Document doc = new Document();
    Field field = new Field("field",
                            new CannedTokenStream(new Token("a", 0, 1),
                                                  new Token("b", 2, 3),
                                                  new Token("c", 4, 5)),
                            TextField.TYPE_NOT_STORED);
    doc.add(field);
    w.addDocument(doc);
    FieldInvertState fis = NeverForgetsSimilarity.INSTANCE.lastState;
    assertEquals(1, fis.getMaxTermFrequency());
    assertEquals(3, fis.getUniqueTermCount());
    assertEquals(0, fis.getNumOverlap());
    assertEquals(3, fis.getLength());
    IOUtils.close(w, dir);
  }

  public void testRandom() throws Exception {
    int numUniqueTokens = TestUtil.nextInt(random(), 1, 25);
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    iwc.setSimilarity(NeverForgetsSimilarity.INSTANCE);
    IndexWriter w = new IndexWriter(dir, iwc);
    Document doc = new Document();

    int numTokens = atLeast(10000);
    Token[] tokens = new Token[numTokens];
    Map<Character,Integer> counts = new HashMap<>();
    int numStacked = 0;
    int maxTermFreq = 0;
    int pos = -1;
    for (int i=0;i<numTokens;i++) {
      char tokenChar = (char) ('a' + random().nextInt(numUniqueTokens));
      Integer oldCount = counts.get(tokenChar);
      int newCount;
      if (oldCount == null) {
        newCount = 1;
      } else {
        newCount = 1 + oldCount;
      }
      counts.put(tokenChar, newCount);
      maxTermFreq = Math.max(maxTermFreq, newCount);

      Token token = new Token(Character.toString(tokenChar), 2*i, 2*i+1);

      if (i > 0 && random().nextInt(7) == 3) {
        token.setPositionIncrement(0);
        numStacked++;
      } else {
        pos++;
      }
      tokens[i] = token;
    }

    Field field = new Field("field",
                            new CannedTokenStream(tokens),
                            TextField.TYPE_NOT_STORED);
    doc.add(field);
    w.addDocument(doc);
    FieldInvertState fis = NeverForgetsSimilarity.INSTANCE.lastState;
    assertEquals(maxTermFreq, fis.getMaxTermFrequency());
    assertEquals(counts.size(), fis.getUniqueTermCount());
    assertEquals(numStacked, fis.getNumOverlap());
    assertEquals(numTokens, fis.getLength());
    assertEquals(pos, fis.getPosition());

    IOUtils.close(w, dir);
  }
}
lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
@@ -2676,11 +2676,11 @@ public abstract class LuceneTestCase extends Assert {
       if (expectedType.isInstance(e)) {
         return expectedType.cast(e);
       }
-      AssertionFailedError assertion = new AssertionFailedError("Unexpected exception type, expected " + expectedType.getSimpleName());
+      AssertionFailedError assertion = new AssertionFailedError("Unexpected exception type, expected " + expectedType.getSimpleName() + " but got " + e);
       assertion.initCause(e);
       throw assertion;
     }
-    throw new AssertionFailedError("Expected exception " + expectedType.getSimpleName());
+    throw new AssertionFailedError("Expected exception " + expectedType.getSimpleName() + " but no exception was thrown");
   }
 
   /**