mirror of https://github.com/apache/lucene.git
LUCENE-7854: enable indexing custom term frequencies
This commit is contained in:
parent
09a9fdab6d
commit
d276acfbbc
|
@ -14,6 +14,10 @@ New Features
|
||||||
well as the oldest Lucene version that contributed to the segment.
|
well as the oldest Lucene version that contributed to the segment.
|
||||||
(Adrien Grand)
|
(Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-7854: The new TermFrequencyAttribute used during analysis
|
||||||
|
with a custom token stream allows indexing custom term frequencies
|
||||||
|
(Mike McCandless)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.
|
* LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.
|
||||||
|
|
|
@ -26,15 +26,18 @@ import org.apache.lucene.util.AttributeReflector;
|
||||||
* <li>{@link PositionIncrementAttribute}
|
* <li>{@link PositionIncrementAttribute}
|
||||||
* <li>{@link PositionLengthAttribute}
|
* <li>{@link PositionLengthAttribute}
|
||||||
* <li>{@link OffsetAttribute}
|
* <li>{@link OffsetAttribute}
|
||||||
|
* <li>{@link TermFrequencyAttribute}
|
||||||
* </ul>*/
|
* </ul>*/
|
||||||
public class PackedTokenAttributeImpl extends CharTermAttributeImpl
|
public class PackedTokenAttributeImpl extends CharTermAttributeImpl
|
||||||
implements TypeAttribute, PositionIncrementAttribute,
|
implements TypeAttribute, PositionIncrementAttribute,
|
||||||
PositionLengthAttribute, OffsetAttribute {
|
PositionLengthAttribute, OffsetAttribute,
|
||||||
|
TermFrequencyAttribute {
|
||||||
|
|
||||||
private int startOffset,endOffset;
|
private int startOffset,endOffset;
|
||||||
private String type = DEFAULT_TYPE;
|
private String type = DEFAULT_TYPE;
|
||||||
private int positionIncrement = 1;
|
private int positionIncrement = 1;
|
||||||
private int positionLength = 1;
|
private int positionLength = 1;
|
||||||
|
private int termFrequency = 1;
|
||||||
|
|
||||||
/** Constructs the attribute implementation. */
|
/** Constructs the attribute implementation. */
|
||||||
public PackedTokenAttributeImpl() {
|
public PackedTokenAttributeImpl() {
|
||||||
|
@ -132,12 +135,26 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
|
||||||
this.type = type;
|
this.type = type;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public final void setTermFrequency(int termFrequency) {
|
||||||
|
if (termFrequency < 1) {
|
||||||
|
throw new IllegalArgumentException("Term frequency must be 1 or greater; got " + termFrequency);
|
||||||
|
}
|
||||||
|
this.termFrequency = termFrequency;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public final int getTermFrequency() {
|
||||||
|
return termFrequency;
|
||||||
|
}
|
||||||
|
|
||||||
/** Resets the attributes
|
/** Resets the attributes
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void clear() {
|
public void clear() {
|
||||||
super.clear();
|
super.clear();
|
||||||
positionIncrement = positionLength = 1;
|
positionIncrement = positionLength = 1;
|
||||||
|
termFrequency = 1;
|
||||||
startOffset = endOffset = 0;
|
startOffset = endOffset = 0;
|
||||||
type = DEFAULT_TYPE;
|
type = DEFAULT_TYPE;
|
||||||
}
|
}
|
||||||
|
@ -147,10 +164,8 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
|
||||||
@Override
|
@Override
|
||||||
public void end() {
|
public void end() {
|
||||||
super.end();
|
super.end();
|
||||||
|
// super.end already calls this.clear, so we only set values that are different from clear:
|
||||||
positionIncrement = 0;
|
positionIncrement = 0;
|
||||||
positionLength = 1;
|
|
||||||
startOffset = endOffset = 0;
|
|
||||||
type = DEFAULT_TYPE;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -170,6 +185,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
|
||||||
positionIncrement == other.positionIncrement &&
|
positionIncrement == other.positionIncrement &&
|
||||||
positionLength == other.positionLength &&
|
positionLength == other.positionLength &&
|
||||||
(type == null ? other.type == null : type.equals(other.type)) &&
|
(type == null ? other.type == null : type.equals(other.type)) &&
|
||||||
|
termFrequency == other.termFrequency &&
|
||||||
super.equals(obj)
|
super.equals(obj)
|
||||||
);
|
);
|
||||||
} else
|
} else
|
||||||
|
@ -185,6 +201,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
|
||||||
code = code * 31 + positionLength;
|
code = code * 31 + positionLength;
|
||||||
if (type != null)
|
if (type != null)
|
||||||
code = code * 31 + type.hashCode();
|
code = code * 31 + type.hashCode();
|
||||||
|
code = code * 31 + termFrequency;;
|
||||||
return code;
|
return code;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -198,12 +215,14 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
|
||||||
to.startOffset = startOffset;
|
to.startOffset = startOffset;
|
||||||
to.endOffset = endOffset;
|
to.endOffset = endOffset;
|
||||||
to.type = type;
|
to.type = type;
|
||||||
|
to.termFrequency = termFrequency;
|
||||||
} else {
|
} else {
|
||||||
super.copyTo(target);
|
super.copyTo(target);
|
||||||
((OffsetAttribute) target).setOffset(startOffset, endOffset);
|
((OffsetAttribute) target).setOffset(startOffset, endOffset);
|
||||||
((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
|
((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
|
||||||
((PositionLengthAttribute) target).setPositionLength(positionLength);
|
((PositionLengthAttribute) target).setPositionLength(positionLength);
|
||||||
((TypeAttribute) target).setType(type);
|
((TypeAttribute) target).setType(type);
|
||||||
|
((TermFrequencyAttribute) target).setTermFrequency(termFrequency);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -215,6 +234,6 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
|
||||||
reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
|
reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
|
||||||
reflector.reflect(PositionLengthAttribute.class, "positionLength", positionLength);
|
reflector.reflect(PositionLengthAttribute.class, "positionLength", positionLength);
|
||||||
reflector.reflect(TypeAttribute.class, "type", type);
|
reflector.reflect(TypeAttribute.class, "type", type);
|
||||||
|
reflector.reflect(TermFrequencyAttribute.class, "termFrequency", termFrequency);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.tokenattributes;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.IndexOptions;
|
||||||
|
import org.apache.lucene.util.Attribute;
|
||||||
|
|
||||||
|
/** Sets the custom term frequency of a term within one document. If this attribute
|
||||||
|
* is present in your analysis chain for a given field, that field must be indexed with
|
||||||
|
* {@link IndexOptions#DOCS_AND_FREQS}. */
|
||||||
|
public interface TermFrequencyAttribute extends Attribute {
|
||||||
|
|
||||||
|
/** Set the custom term frequency of the current term within one document. */
|
||||||
|
public void setTermFrequency(int termFrequency);
|
||||||
|
|
||||||
|
/** Returns the custom term frequency. */
|
||||||
|
public int getTermFrequency();
|
||||||
|
}
|
|
@ -0,0 +1,82 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.tokenattributes;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.util.AttributeImpl;
|
||||||
|
import org.apache.lucene.util.AttributeReflector;
|
||||||
|
|
||||||
|
/** Default implementation of {@link TermFrequencyAttribute}. */
|
||||||
|
public class TermFrequencyAttributeImpl extends AttributeImpl implements TermFrequencyAttribute, Cloneable {
|
||||||
|
private int termFrequency = 1;
|
||||||
|
|
||||||
|
/** Initialize this attribute with a term frequency of 1 */
|
||||||
|
public TermFrequencyAttributeImpl() {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setTermFrequency(int termFrequency) {
|
||||||
|
if (termFrequency < 1) {
|
||||||
|
throw new IllegalArgumentException("Term frequency must be 1 or greater; got " + termFrequency);
|
||||||
|
}
|
||||||
|
this.termFrequency = termFrequency;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getTermFrequency() {
|
||||||
|
return termFrequency;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void clear() {
|
||||||
|
this.termFrequency = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void end() {
|
||||||
|
this.termFrequency = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other == this) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (other instanceof TermFrequencyAttributeImpl) {
|
||||||
|
TermFrequencyAttributeImpl _other = (TermFrequencyAttributeImpl) other;
|
||||||
|
return termFrequency == _other.termFrequency;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return Integer.hashCode(termFrequency);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void copyTo(AttributeImpl target) {
|
||||||
|
TermFrequencyAttribute t = (TermFrequencyAttribute) target;
|
||||||
|
t.setTermFrequency(termFrequency);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reflectWith(AttributeReflector reflector) {
|
||||||
|
reflector.reflect(TermFrequencyAttribute.class, "termFrequency", termFrequency);
|
||||||
|
}
|
||||||
|
}
|
|
@ -770,10 +770,8 @@ final class DefaultIndexingChain extends DocConsumer {
|
||||||
}
|
}
|
||||||
invertState.lastStartOffset = startOffset;
|
invertState.lastStartOffset = startOffset;
|
||||||
|
|
||||||
invertState.length++;
|
invertState.length = Math.addExact(invertState.length, invertState.termFreqAttribute.getTermFrequency());
|
||||||
if (invertState.length < 0) {
|
|
||||||
throw new IllegalArgumentException("too many tokens in field '" + field.name() + "'");
|
|
||||||
}
|
|
||||||
//System.out.println(" term=" + invertState.termAttribute);
|
//System.out.println(" term=" + invertState.termAttribute);
|
||||||
|
|
||||||
// If we hit an exception in here, we abort
|
// If we hit an exception in here, we abort
|
||||||
|
|
|
@ -20,6 +20,7 @@ import org.apache.lucene.analysis.TokenStream; // javadocs
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
||||||
import org.apache.lucene.util.AttributeSource;
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
|
||||||
|
@ -48,6 +49,7 @@ public final class FieldInvertState {
|
||||||
PositionIncrementAttribute posIncrAttribute;
|
PositionIncrementAttribute posIncrAttribute;
|
||||||
PayloadAttribute payloadAttribute;
|
PayloadAttribute payloadAttribute;
|
||||||
TermToBytesRefAttribute termAttribute;
|
TermToBytesRefAttribute termAttribute;
|
||||||
|
TermFrequencyAttribute termFreqAttribute;
|
||||||
|
|
||||||
/** Creates {@code FieldInvertState} for the specified
|
/** Creates {@code FieldInvertState} for the specified
|
||||||
* field name. */
|
* field name. */
|
||||||
|
@ -88,6 +90,7 @@ public final class FieldInvertState {
|
||||||
if (this.attributeSource != attributeSource) {
|
if (this.attributeSource != attributeSource) {
|
||||||
this.attributeSource = attributeSource;
|
this.attributeSource = attributeSource;
|
||||||
termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
|
termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
|
||||||
|
termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
|
||||||
posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
|
posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
|
||||||
offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
|
offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
|
||||||
payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
|
payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
|
||||||
|
|
|
@ -113,9 +113,10 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
if (!hasFreq) {
|
if (!hasFreq) {
|
||||||
assert postings.termFreqs == null;
|
assert postings.termFreqs == null;
|
||||||
postings.lastDocCodes[termID] = docState.docID;
|
postings.lastDocCodes[termID] = docState.docID;
|
||||||
|
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
||||||
} else {
|
} else {
|
||||||
postings.lastDocCodes[termID] = docState.docID << 1;
|
postings.lastDocCodes[termID] = docState.docID << 1;
|
||||||
postings.termFreqs[termID] = 1;
|
postings.termFreqs[termID] = getTermFreq();
|
||||||
if (hasProx) {
|
if (hasProx) {
|
||||||
writeProx(termID, fieldState.position);
|
writeProx(termID, fieldState.position);
|
||||||
if (hasOffsets) {
|
if (hasOffsets) {
|
||||||
|
@ -124,19 +125,21 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
} else {
|
} else {
|
||||||
assert !hasOffsets;
|
assert !hasOffsets;
|
||||||
}
|
}
|
||||||
|
fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
|
||||||
}
|
}
|
||||||
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
|
||||||
fieldState.uniqueTermCount++;
|
fieldState.uniqueTermCount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
void addTerm(final int termID) {
|
void addTerm(final int termID) {
|
||||||
final FreqProxPostingsArray postings = freqProxPostingsArray;
|
final FreqProxPostingsArray postings = freqProxPostingsArray;
|
||||||
|
|
||||||
assert !hasFreq || postings.termFreqs[termID] > 0;
|
assert !hasFreq || postings.termFreqs[termID] > 0;
|
||||||
|
|
||||||
if (!hasFreq) {
|
if (!hasFreq) {
|
||||||
assert postings.termFreqs == null;
|
assert postings.termFreqs == null;
|
||||||
|
if (termFreqAtt.getTermFrequency() != 1) {
|
||||||
|
throw new IllegalStateException("field \"" + fieldInfo.name + "\": must index term freq while using custom TermFrequencyAttribute");
|
||||||
|
}
|
||||||
if (docState.docID != postings.lastDocIDs[termID]) {
|
if (docState.docID != postings.lastDocIDs[termID]) {
|
||||||
// New document; now encode docCode for previous doc:
|
// New document; now encode docCode for previous doc:
|
||||||
assert docState.docID > postings.lastDocIDs[termID];
|
assert docState.docID > postings.lastDocIDs[termID];
|
||||||
|
@ -160,8 +163,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Init freq for the current document
|
// Init freq for the current document
|
||||||
postings.termFreqs[termID] = 1;
|
postings.termFreqs[termID] = getTermFreq();
|
||||||
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
|
||||||
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
|
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
|
||||||
postings.lastDocIDs[termID] = docState.docID;
|
postings.lastDocIDs[termID] = docState.docID;
|
||||||
if (hasProx) {
|
if (hasProx) {
|
||||||
|
@ -175,7 +178,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
}
|
}
|
||||||
fieldState.uniqueTermCount++;
|
fieldState.uniqueTermCount++;
|
||||||
} else {
|
} else {
|
||||||
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.termFreqs[termID]);
|
postings.termFreqs[termID] = Math.addExact(postings.termFreqs[termID], getTermFreq());
|
||||||
|
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, postings.termFreqs[termID]);
|
||||||
if (hasProx) {
|
if (hasProx) {
|
||||||
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
|
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
|
||||||
if (hasOffsets) {
|
if (hasOffsets) {
|
||||||
|
@ -185,6 +189,17 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private int getTermFreq() {
|
||||||
|
int freq = termFreqAtt.getTermFrequency();
|
||||||
|
if (freq != 1) {
|
||||||
|
if (hasProx) {
|
||||||
|
throw new IllegalStateException("field \"" + fieldInfo.name + "\": cannot index positions while using custom TermFrequencyAttribute");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return freq;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void newPostingsArray() {
|
public void newPostingsArray() {
|
||||||
freqProxPostingsArray = (FreqProxPostingsArray) postingsArray;
|
freqProxPostingsArray = (FreqProxPostingsArray) postingsArray;
|
||||||
|
|
|
@ -109,6 +109,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
boolean start(IndexableField field, boolean first) {
|
boolean start(IndexableField field, boolean first) {
|
||||||
|
super.start(field, first);
|
||||||
assert field.fieldType().indexOptions() != IndexOptions.NONE;
|
assert field.fieldType().indexOptions() != IndexOptions.NONE;
|
||||||
|
|
||||||
if (first) {
|
if (first) {
|
||||||
|
@ -224,7 +225,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
void newTerm(final int termID) {
|
void newTerm(final int termID) {
|
||||||
TermVectorsPostingsArray postings = termVectorsPostingsArray;
|
TermVectorsPostingsArray postings = termVectorsPostingsArray;
|
||||||
|
|
||||||
postings.freqs[termID] = 1;
|
postings.freqs[termID] = getTermFreq();
|
||||||
postings.lastOffsets[termID] = 0;
|
postings.lastOffsets[termID] = 0;
|
||||||
postings.lastPositions[termID] = 0;
|
postings.lastPositions[termID] = 0;
|
||||||
|
|
||||||
|
@ -235,11 +236,25 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
void addTerm(final int termID) {
|
void addTerm(final int termID) {
|
||||||
TermVectorsPostingsArray postings = termVectorsPostingsArray;
|
TermVectorsPostingsArray postings = termVectorsPostingsArray;
|
||||||
|
|
||||||
postings.freqs[termID]++;
|
postings.freqs[termID] += getTermFreq();
|
||||||
|
|
||||||
writeProx(postings, termID);
|
writeProx(postings, termID);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private int getTermFreq() {
|
||||||
|
int freq = termFreqAtt.getTermFrequency();
|
||||||
|
if (freq != 1) {
|
||||||
|
if (doVectorPositions) {
|
||||||
|
throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector positions while using custom TermFrequencyAttribute");
|
||||||
|
}
|
||||||
|
if (doVectorOffsets) {
|
||||||
|
throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector offsets while using custom TermFrequencyAttribute");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return freq;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void newPostingsArray() {
|
public void newPostingsArray() {
|
||||||
termVectorsPostingsArray = (TermVectorsPostingsArray) postingsArray;
|
termVectorsPostingsArray = (TermVectorsPostingsArray) postingsArray;
|
||||||
|
|
|
@ -19,12 +19,13 @@ package org.apache.lucene.index;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
||||||
import org.apache.lucene.util.ByteBlockPool;
|
import org.apache.lucene.util.ByteBlockPool;
|
||||||
|
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
|
||||||
import org.apache.lucene.util.BytesRefHash;
|
import org.apache.lucene.util.BytesRefHash;
|
||||||
import org.apache.lucene.util.Counter;
|
import org.apache.lucene.util.Counter;
|
||||||
import org.apache.lucene.util.IntBlockPool;
|
import org.apache.lucene.util.IntBlockPool;
|
||||||
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
|
|
||||||
|
|
||||||
abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
||||||
private static final int HASH_INIT_SIZE = 4;
|
private static final int HASH_INIT_SIZE = 4;
|
||||||
|
@ -35,6 +36,7 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
||||||
protected final DocumentsWriterPerThread.DocState docState;
|
protected final DocumentsWriterPerThread.DocState docState;
|
||||||
protected final FieldInvertState fieldState;
|
protected final FieldInvertState fieldState;
|
||||||
TermToBytesRefAttribute termAtt;
|
TermToBytesRefAttribute termAtt;
|
||||||
|
protected TermFrequencyAttribute termFreqAtt;
|
||||||
|
|
||||||
// Copied from our perThread
|
// Copied from our perThread
|
||||||
final IntBlockPool intPool;
|
final IntBlockPool intPool;
|
||||||
|
@ -287,6 +289,7 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
||||||
* document. */
|
* document. */
|
||||||
boolean start(IndexableField field, boolean first) {
|
boolean start(IndexableField field, boolean first) {
|
||||||
termAtt = fieldState.termAttribute;
|
termAtt = fieldState.termAttribute;
|
||||||
|
termFreqAtt = fieldState.termFreqAttribute;
|
||||||
if (nextPerField != null) {
|
if (nextPerField != null) {
|
||||||
doNextCall = nextPerField.start(field, first);
|
doNextCall = nextPerField.start(field, first);
|
||||||
}
|
}
|
||||||
|
|
|
@ -125,6 +125,7 @@ public class TestToken extends LuceneTestCase {
|
||||||
t.setFlags(8);
|
t.setFlags(8);
|
||||||
t.setPositionIncrement(3);
|
t.setPositionIncrement(3);
|
||||||
t.setPositionLength(11);
|
t.setPositionLength(11);
|
||||||
|
t.setTermFrequency(42);
|
||||||
TestUtil.assertAttributeReflection(t,
|
TestUtil.assertAttributeReflection(t,
|
||||||
new HashMap<String, Object>() {{
|
new HashMap<String, Object>() {{
|
||||||
put(CharTermAttribute.class.getName() + "#term", "foobar");
|
put(CharTermAttribute.class.getName() + "#term", "foobar");
|
||||||
|
@ -136,6 +137,7 @@ public class TestToken extends LuceneTestCase {
|
||||||
put(PayloadAttribute.class.getName() + "#payload", null);
|
put(PayloadAttribute.class.getName() + "#payload", null);
|
||||||
put(TypeAttribute.class.getName() + "#type", TypeAttribute.DEFAULT_TYPE);
|
put(TypeAttribute.class.getName() + "#type", TypeAttribute.DEFAULT_TYPE);
|
||||||
put(FlagsAttribute.class.getName() + "#flags", 8);
|
put(FlagsAttribute.class.getName() + "#flags", 8);
|
||||||
|
put(TermFrequencyAttribute.class.getName() + "#termFrequency", 42);
|
||||||
}});
|
}});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -82,6 +82,7 @@ public class TestPackedTokenAttributeImpl extends LuceneTestCase {
|
||||||
t.setPositionIncrement(3);
|
t.setPositionIncrement(3);
|
||||||
t.setPositionLength(11);
|
t.setPositionLength(11);
|
||||||
t.setType("foobar");
|
t.setType("foobar");
|
||||||
|
t.setTermFrequency(42);
|
||||||
TestUtil.assertAttributeReflection(t,
|
TestUtil.assertAttributeReflection(t,
|
||||||
new HashMap<String, Object>() {{
|
new HashMap<String, Object>() {{
|
||||||
put(CharTermAttribute.class.getName() + "#term", "foobar");
|
put(CharTermAttribute.class.getName() + "#term", "foobar");
|
||||||
|
@ -91,6 +92,7 @@ public class TestPackedTokenAttributeImpl extends LuceneTestCase {
|
||||||
put(PositionIncrementAttribute.class.getName() + "#positionIncrement", 3);
|
put(PositionIncrementAttribute.class.getName() + "#positionIncrement", 3);
|
||||||
put(PositionLengthAttribute.class.getName() + "#positionLength", 11);
|
put(PositionLengthAttribute.class.getName() + "#positionLength", 11);
|
||||||
put(TypeAttribute.class.getName() + "#type", "foobar");
|
put(TypeAttribute.class.getName() + "#type", "foobar");
|
||||||
|
put(TermFrequencyAttribute.class.getName() + "#termFrequency", 42);
|
||||||
}});
|
}});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,468 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.FieldType;
|
||||||
|
import org.apache.lucene.document.TextField;
|
||||||
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
|
import org.apache.lucene.search.TermStatistics;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.index.PostingsEnum.NO_MORE_DOCS;
|
||||||
|
|
||||||
|
public class TestCustomTermFreq extends LuceneTestCase {
|
||||||
|
|
||||||
|
private static final class CannedTermFreqs extends TokenStream {
|
||||||
|
private final String[] terms;
|
||||||
|
private final int[] termFreqs;
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final TermFrequencyAttribute termFreqAtt = addAttribute(TermFrequencyAttribute.class);
|
||||||
|
private int upto;
|
||||||
|
|
||||||
|
public CannedTermFreqs(String[] terms, int[] termFreqs) {
|
||||||
|
this.terms = terms;
|
||||||
|
this.termFreqs = termFreqs;
|
||||||
|
assert terms.length == termFreqs.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() {
|
||||||
|
if (upto == terms.length) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
clearAttributes();
|
||||||
|
|
||||||
|
termAtt.append(terms[upto]);
|
||||||
|
termFreqAtt.setTermFrequency(termFreqs[upto]);
|
||||||
|
|
||||||
|
upto++;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset() {
|
||||||
|
upto = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSingletonTermsOneDoc() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar"},
|
||||||
|
new int[] {42, 128}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
IndexReader r = DirectoryReader.open(w);
|
||||||
|
PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(128, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(42, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
IOUtils.close(r, w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSingletonTermsTwoDocs() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar"},
|
||||||
|
new int[] {42, 128}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar"},
|
||||||
|
new int[] {50, 50}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader r = DirectoryReader.open(w);
|
||||||
|
PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(128, postings.freq());
|
||||||
|
assertEquals(1, postings.nextDoc());
|
||||||
|
assertEquals(50, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(42, postings.freq());
|
||||||
|
assertEquals(1, postings.nextDoc());
|
||||||
|
assertEquals(50, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
IOUtils.close(r, w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRepeatTermsOneDoc() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {42, 128, 17, 100}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
IndexReader r = DirectoryReader.open(w);
|
||||||
|
PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(228, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(59, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
IOUtils.close(r, w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRepeatTermsTwoDocs() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {42, 128, 17, 100}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {50, 60, 70, 80}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader r = DirectoryReader.open(w);
|
||||||
|
PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(228, postings.freq());
|
||||||
|
assertEquals(1, postings.nextDoc());
|
||||||
|
assertEquals(140, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(59, postings.freq());
|
||||||
|
assertEquals(1, postings.nextDoc());
|
||||||
|
assertEquals(120, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
IOUtils.close(r, w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTotalTermFreq() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {42, 128, 17, 100}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {50, 60, 70, 80}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader r = DirectoryReader.open(w);
|
||||||
|
|
||||||
|
TermsEnum termsEnum = MultiFields.getTerms(r, "field").iterator();
|
||||||
|
assertTrue(termsEnum.seekExact(new BytesRef("foo")));
|
||||||
|
assertEquals(179, termsEnum.totalTermFreq());
|
||||||
|
assertTrue(termsEnum.seekExact(new BytesRef("bar")));
|
||||||
|
assertEquals(368, termsEnum.totalTermFreq());
|
||||||
|
|
||||||
|
IOUtils.close(r, w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
// you can't index proximity with custom term freqs:
|
||||||
|
public void testInvalidProx() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {42, 128, 17, 100}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
Exception e = expectThrows(IllegalStateException.class, () -> {w.addDocument(doc);});
|
||||||
|
assertEquals("field \"field\": cannot index positions while using custom TermFrequencyAttribute", e.getMessage());
|
||||||
|
IOUtils.close(w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
// you can't index DOCS_ONLY with custom term freq
|
||||||
|
public void testInvalidDocsOnly() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {42, 128, 17, 100}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
Exception e = expectThrows(IllegalStateException.class, () -> {w.addDocument(doc);});
|
||||||
|
assertEquals("field \"field\": must index term freq while using custom TermFrequencyAttribute", e.getMessage());
|
||||||
|
IOUtils.close(w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
// sum of term freqs must fit in an int
|
||||||
|
public void testOverflowInt() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS);
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field("field", "this field should be indexed", fieldType));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
Document doc2 = new Document();
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar"},
|
||||||
|
new int[] {3, Integer.MAX_VALUE}),
|
||||||
|
fieldType);
|
||||||
|
doc2.add(field);
|
||||||
|
expectThrows(ArithmeticException.class, () -> {w.addDocument(doc2);});
|
||||||
|
|
||||||
|
IndexReader r = DirectoryReader.open(w);
|
||||||
|
assertEquals(1, r.numDocs());
|
||||||
|
|
||||||
|
IOUtils.close(r, w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testInvalidTermVectorPositions() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
fieldType.setStoreTermVectors(true);
|
||||||
|
fieldType.setStoreTermVectorPositions(true);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {42, 128, 17, 100}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
|
||||||
|
assertEquals("field \"field\": cannot index term vector positions while using custom TermFrequencyAttribute", e.getMessage());
|
||||||
|
IOUtils.close(w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testInvalidTermVectorOffsets() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
fieldType.setStoreTermVectors(true);
|
||||||
|
fieldType.setStoreTermVectorOffsets(true);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {42, 128, 17, 100}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
|
||||||
|
assertEquals("field \"field\": cannot index term vector offsets while using custom TermFrequencyAttribute", e.getMessage());
|
||||||
|
IOUtils.close(w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTermVectors() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
fieldType.setStoreTermVectors(true);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {42, 128, 17, 100}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {50, 60, 70, 80}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader r = DirectoryReader.open(w);
|
||||||
|
|
||||||
|
Fields fields = r.getTermVectors(0);
|
||||||
|
TermsEnum termsEnum = fields.terms("field").iterator();
|
||||||
|
assertTrue(termsEnum.seekExact(new BytesRef("bar")));
|
||||||
|
assertEquals(228, termsEnum.totalTermFreq());
|
||||||
|
PostingsEnum postings = termsEnum.postings(null);
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(228, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
assertTrue(termsEnum.seekExact(new BytesRef("foo")));
|
||||||
|
assertEquals(59, termsEnum.totalTermFreq());
|
||||||
|
postings = termsEnum.postings(null);
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(59, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
fields = r.getTermVectors(1);
|
||||||
|
termsEnum = fields.terms("field").iterator();
|
||||||
|
assertTrue(termsEnum.seekExact(new BytesRef("bar")));
|
||||||
|
assertEquals(140, termsEnum.totalTermFreq());
|
||||||
|
postings = termsEnum.postings(null);
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(140, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
assertTrue(termsEnum.seekExact(new BytesRef("foo")));
|
||||||
|
assertEquals(120, termsEnum.totalTermFreq());
|
||||||
|
postings = termsEnum.postings(null);
|
||||||
|
assertNotNull(postings);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(120, postings.freq());
|
||||||
|
assertEquals(NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
|
||||||
|
IOUtils.close(r, w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Similarity holds onto the FieldInvertState for subsequent verification.
|
||||||
|
*/
|
||||||
|
private static class NeverForgetsSimilarity extends Similarity {
|
||||||
|
public FieldInvertState lastState;
|
||||||
|
private final static NeverForgetsSimilarity INSTANCE = new NeverForgetsSimilarity();
|
||||||
|
|
||||||
|
private NeverForgetsSimilarity() {
|
||||||
|
// no
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long computeNorm(FieldInvertState state) {
|
||||||
|
this.lastState = state;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFieldInvertState() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||||
|
iwc.setSimilarity(NeverForgetsSimilarity.INSTANCE);
|
||||||
|
IndexWriter w = new IndexWriter(dir, iwc);
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
|
||||||
|
new int[] {42, 128, 17, 100}),
|
||||||
|
fieldType);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
FieldInvertState fis = NeverForgetsSimilarity.INSTANCE.lastState;
|
||||||
|
assertEquals(228, fis.getMaxTermFrequency());
|
||||||
|
assertEquals(2, fis.getUniqueTermCount());
|
||||||
|
assertEquals(0, fis.getNumOverlap());
|
||||||
|
assertEquals(287, fis.getLength());
|
||||||
|
|
||||||
|
IOUtils.close(w, dir);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,139 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CannedTokenStream;
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.TextField;
|
||||||
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
|
import org.apache.lucene.search.TermStatistics;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
|
public class TestFieldInvertState extends LuceneTestCase {
|
||||||
|
/**
|
||||||
|
* Similarity holds onto the FieldInvertState for subsequent verification.
|
||||||
|
*/
|
||||||
|
private static class NeverForgetsSimilarity extends Similarity {
|
||||||
|
public FieldInvertState lastState;
|
||||||
|
private final static NeverForgetsSimilarity INSTANCE = new NeverForgetsSimilarity();
|
||||||
|
|
||||||
|
private NeverForgetsSimilarity() {
|
||||||
|
// no
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long computeNorm(FieldInvertState state) {
|
||||||
|
this.lastState = state;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBasic() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||||
|
iwc.setSimilarity(NeverForgetsSimilarity.INSTANCE);
|
||||||
|
IndexWriter w = new IndexWriter(dir, iwc);
|
||||||
|
Document doc = new Document();
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTokenStream(new Token("a", 0, 1),
|
||||||
|
new Token("b", 2, 3),
|
||||||
|
new Token("c", 4, 5)),
|
||||||
|
TextField.TYPE_NOT_STORED);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
FieldInvertState fis = NeverForgetsSimilarity.INSTANCE.lastState;
|
||||||
|
assertEquals(1, fis.getMaxTermFrequency());
|
||||||
|
assertEquals(3, fis.getUniqueTermCount());
|
||||||
|
assertEquals(0, fis.getNumOverlap());
|
||||||
|
assertEquals(3, fis.getLength());
|
||||||
|
IOUtils.close(w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandom() throws Exception {
|
||||||
|
int numUniqueTokens = TestUtil.nextInt(random(), 1, 25);
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||||
|
iwc.setSimilarity(NeverForgetsSimilarity.INSTANCE);
|
||||||
|
IndexWriter w = new IndexWriter(dir, iwc);
|
||||||
|
Document doc = new Document();
|
||||||
|
|
||||||
|
int numTokens = atLeast(10000);
|
||||||
|
Token[] tokens = new Token[numTokens];
|
||||||
|
Map<Character,Integer> counts = new HashMap<>();
|
||||||
|
int numStacked = 0;
|
||||||
|
int maxTermFreq = 0;
|
||||||
|
int pos = -1;
|
||||||
|
for (int i=0;i<numTokens;i++) {
|
||||||
|
char tokenChar = (char) ('a' + random().nextInt(numUniqueTokens));
|
||||||
|
Integer oldCount = counts.get(tokenChar);
|
||||||
|
int newCount;
|
||||||
|
if (oldCount == null) {
|
||||||
|
newCount = 1;
|
||||||
|
} else {
|
||||||
|
newCount = 1 + oldCount;
|
||||||
|
}
|
||||||
|
counts.put(tokenChar, newCount);
|
||||||
|
maxTermFreq = Math.max(maxTermFreq, newCount);
|
||||||
|
|
||||||
|
Token token = new Token(Character.toString(tokenChar), 2*i, 2*i+1);
|
||||||
|
|
||||||
|
if (i > 0 && random().nextInt(7) == 3) {
|
||||||
|
token.setPositionIncrement(0);
|
||||||
|
numStacked++;
|
||||||
|
} else {
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
tokens[i] = token;
|
||||||
|
}
|
||||||
|
|
||||||
|
Field field = new Field("field",
|
||||||
|
new CannedTokenStream(tokens),
|
||||||
|
TextField.TYPE_NOT_STORED);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
FieldInvertState fis = NeverForgetsSimilarity.INSTANCE.lastState;
|
||||||
|
assertEquals(maxTermFreq, fis.getMaxTermFrequency());
|
||||||
|
assertEquals(counts.size(), fis.getUniqueTermCount());
|
||||||
|
assertEquals(numStacked, fis.getNumOverlap());
|
||||||
|
assertEquals(numTokens, fis.getLength());
|
||||||
|
assertEquals(pos, fis.getPosition());
|
||||||
|
|
||||||
|
IOUtils.close(w, dir);
|
||||||
|
}
|
||||||
|
}
|
|
@ -2676,11 +2676,11 @@ public abstract class LuceneTestCase extends Assert {
|
||||||
if (expectedType.isInstance(e)) {
|
if (expectedType.isInstance(e)) {
|
||||||
return expectedType.cast(e);
|
return expectedType.cast(e);
|
||||||
}
|
}
|
||||||
AssertionFailedError assertion = new AssertionFailedError("Unexpected exception type, expected " + expectedType.getSimpleName());
|
AssertionFailedError assertion = new AssertionFailedError("Unexpected exception type, expected " + expectedType.getSimpleName() + " but got " + e);
|
||||||
assertion.initCause(e);
|
assertion.initCause(e);
|
||||||
throw assertion;
|
throw assertion;
|
||||||
}
|
}
|
||||||
throw new AssertionFailedError("Expected exception " + expectedType.getSimpleName());
|
throw new AssertionFailedError("Expected exception " + expectedType.getSimpleName() + " but no exception was thrown");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in New Issue