Mirror of https://github.com/apache/lucene.git
Merging r1058717 through r1059431 into realtime branch
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/realtime_search@1059434 13f79535-47bb-0310-9956-ffa450edef68
Commit 11e9007442

dev-tools/idea/.idea/copyright/Apache_Software_Foundation.xml (generated file, 9 lines added)
@@ -0,0 +1,9 @@
<component name="CopyrightManager">
  <copyright>
    <option name="notice" value="Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the &quot;License&quot;); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an &quot;AS IS&quot; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." />
    <option name="keyword" value="http://www.apache.org/licenses/LICENSE-2.0" />
    <option name="allowReplaceKeyword" value="Copyright 20" />
    <option name="myName" value="Apache Software Foundation" />
    <option name="myLocal" value="true" />
  </copyright>
</component>
dev-tools/idea/.idea/copyright/profiles_settings.xml (generated file, 27 lines added)
@@ -0,0 +1,27 @@
<component name="CopyrightManager">
  <settings default="Apache Software Foundation">
    <module2copyright>
      <element module="All" copyright="Apache Software Foundation" />
    </module2copyright>
    <LanguageOptions name="HTML">
      <option name="fileTypeOverride" value="3" />
      <option name="prefixLines" value="false" />
    </LanguageOptions>
    <LanguageOptions name="JAVA">
      <option name="fileTypeOverride" value="3" />
      <option name="fileLocation" value="2" />
    </LanguageOptions>
    <LanguageOptions name="JSP">
      <option name="fileTypeOverride" value="3" />
      <option name="prefixLines" value="false" />
    </LanguageOptions>
    <LanguageOptions name="JSPX">
      <option name="fileTypeOverride" value="3" />
      <option name="prefixLines" value="false" />
    </LanguageOptions>
    <LanguageOptions name="XML">
      <option name="fileTypeOverride" value="3" />
      <option name="prefixLines" value="false" />
    </LanguageOptions>
  </settings>
</component>
@@ -359,6 +359,9 @@ New features
  terms dict. This impl stores the indexed terms in an FST, which is
  much more RAM efficient than FixedGapTermsIndex. (Mike McCandless)

* LUCENE-2862: Added TermsEnum.totalTermFreq() and
  Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir)

Optimizations

* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.
@@ -738,6 +741,10 @@ New features
  Query improves performance, as out-of-order collection is now supported.
  (Uwe Schindler)

* LUCENE-2864: Add getMaxTermFrequency (maximum within-document TF) to
  FieldInvertState so that it can be used in Similarity.computeNorm.
  (Robert Muir)

Optimizations

* LUCENE-2075: Terms dict cache is now shared across threads instead
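To illustrate the two CHANGES entries above, here is a minimal, hypothetical consumer of the new statistics (not part of this commit); the index path, the "body" field and the "lucene" term are placeholder values:

import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermStatsDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File(args[0]));    // placeholder index path
    IndexReader reader = IndexReader.open(dir, true);
    try {
      Terms terms = MultiFields.getTerms(reader, "body");   // placeholder field
      if (terms != null) {
        // Sum of totalTermFreq over all terms of the field, or -1 if the codec
        // does not store it (e.g. preflex, or omitTermFreqAndPositions fields).
        System.out.println("sumTotalTermFreq=" + terms.getSumTotalTermFreq());
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seek(new BytesRef("lucene")) == TermsEnum.SeekStatus.FOUND) {
          // Total occurrences of the term across all docs, or -1 if unsupported.
          System.out.println("totalTermFreq=" + termsEnum.totalTermFreq());
        }
      }
    } finally {
      reader.close();
    }
  }
}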
@@ -238,6 +238,10 @@ public class InstantiatedIndex
      while((text = termsEnum.next()) != null) {
        String termText = text.utf8ToString();
        InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText);
        final long totalTermFreq = termsEnum.totalTermFreq();
        if (totalTermFreq != -1) {
          instantiatedTerm.addPositionsCount(totalTermFreq);
        }
        getTermsByFieldAndText().get(field).put(termText, instantiatedTerm);
        instantiatedTerm.setTermIndex(terms.size());
        terms.add(instantiatedTerm);
@@ -398,18 +398,33 @@ public class InstantiatedIndexReader extends IndexReader {
    if (i < 0) {
      i = -i - 1;
    }
    if (i >= orderedTerms.length || !orderedTerms[i].field().equals(field)) {
    if (i >= orderedTerms.length || orderedTerms[i].field() != field) {
      // field does not exist
      return null;
    }
    final int startLoc = i;

    // TODO: heavy to do this here; would be better to
    // do it up front & cache
    long sum = 0;
    int upto = i;
    while(upto < orderedTerms.length && orderedTerms[upto].field() == field) {
      sum += orderedTerms[upto].getTotalTermFreq();
      upto++;
    }
    final long sumTotalTermFreq = sum;

    return new Terms() {
      @Override
      public TermsEnum iterator() {
        return new InstantiatedTermsEnum(orderedTerms, startLoc, field);
      }

      @Override
      public long getSumTotalTermFreq() {
        return sumTotalTermFreq;
      }

      @Override
      public Comparator<BytesRef> getComparator() {
        return BytesRef.getUTF8SortedAsUnicodeComparator();
@@ -315,6 +315,7 @@ public class InstantiatedIndexWriter implements Closeable {
        }
        associatedDocuments[associatedDocuments.length - 1] = info;
        term.setAssociatedDocuments(associatedDocuments);
        term.addPositionsCount(positions.length);

        // todo optimize, only if term vector?
        informationByTermOfCurrentDocument.put(term, info);
@@ -45,6 +45,8 @@ public class InstantiatedTerm

  private Term term;

  private long totalTermFreq;

  /**
   * index of term in InstantiatedIndex
   * @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */
@@ -92,6 +94,14 @@ public class InstantiatedTerm
    this.associatedDocuments = associatedDocuments;
  }

  void addPositionsCount(long count) {
    totalTermFreq += count;
  }

  public long getTotalTermFreq() {
    return totalTermFreq;
  }

  /**
   * Finds index to the first beyond the current whose document number is
   * greater than or equal to <i>target</i>, -1 if there is no such element.
@@ -24,7 +24,6 @@ import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.codecs.PrefixCodedTermState;

import java.io.IOException;
import java.util.Arrays;
@@ -110,6 +109,12 @@ public class InstantiatedTermsEnum extends TermsEnum {
    return terms[upto].getAssociatedDocuments().length;
  }

  @Override
  public long totalTermFreq() {
    final long v = terms[upto].getTotalTermFreq();
    return v == 0 ? -1 : v;
  }

  @Override
  public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
    if (reuse == null || !(reuse instanceof InstantiatedDocsEnum)) {
@@ -66,6 +66,7 @@ public class TestIndicesEquals extends LuceneTestCase {
    // create dir data
    IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer()));

    for (int i = 0; i < 20; i++) {
      Document document = new Document();
      assembleDocument(document, i);
@@ -395,6 +396,10 @@ public class TestIndicesEquals extends LuceneTestCase {
    }

    assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq());
    final long totalTermFreq = aprioriTermEnum.totalTermFreq();
    if (totalTermFreq != -1) {
      assertEquals(totalTermFreq, testTermEnum.totalTermFreq());
    }

    // compare termDocs seeking

@ -610,6 +610,8 @@ public class MemoryIndex implements Serializable {
|
||||
/** Term for this field's fieldName, lazily computed on demand */
|
||||
public transient Term template;
|
||||
|
||||
private final long sumTotalTermFreq;
|
||||
|
||||
private static final long serialVersionUID = 2882195016849084649L;
|
||||
|
||||
public Info(HashMap<BytesRef,ArrayIntList> terms, int numTokens, int numOverlapTokens, float boost) {
|
||||
@ -617,6 +619,15 @@ public class MemoryIndex implements Serializable {
|
||||
this.numTokens = numTokens;
|
||||
this.numOverlapTokens = numOverlapTokens;
|
||||
this.boost = boost;
|
||||
long sum = 0;
|
||||
for(Map.Entry<BytesRef,ArrayIntList> ent : terms.entrySet()) {
|
||||
sum += ent.getValue().size();
|
||||
}
|
||||
sumTotalTermFreq = sum;
|
||||
}
|
||||
|
||||
public long getSumTotalTermFreq() {
|
||||
return sumTotalTermFreq;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -826,6 +837,11 @@ public class MemoryIndex implements Serializable {
|
||||
public long getUniqueTermCount() {
|
||||
return info.sortedTerms.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSumTotalTermFreq() {
|
||||
return info.getSumTotalTermFreq();
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
@ -895,6 +911,11 @@ public class MemoryIndex implements Serializable {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long totalTermFreq() {
|
||||
return info.sortedTerms[termUpto].getValue().size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
|
||||
if (reuse == null || !(reuse instanceof MemoryDocsEnum)) {
|
||||
|
@@ -176,15 +176,34 @@ public class HighFreqTerms {
    return ts;
  }

  public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
    BytesRef br = termtext;
  public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termText) throws Exception {

    long totalTF = 0;
    Bits skipDocs = MultiFields.getDeletedDocs(reader);
    DocsEnum de = MultiFields.getTermDocsEnum(reader, skipDocs, field, br);
    // if term is not in index return totalTF of 0
    if (de == null) {

    Terms terms = MultiFields.getTerms(reader, field);
    if (terms == null) {
      return 0;
    }

    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seek(termText) != TermsEnum.SeekStatus.FOUND) {
      return 0;
    }

    Bits skipDocs = MultiFields.getDeletedDocs(reader);
    if (skipDocs == null) {
      // TODO: we could do this up front, during the scan
      // (next()), instead of after-the-fact here w/ seek,
      // if the codec supports it and there are no del
      // docs...
      final long totTF = termsEnum.totalTermFreq();
      if (totTF != -1) {
        return totTF;
      }
    }

    DocsEnum de = termsEnum.docs(skipDocs, null);

    // use DocsEnum.read() and BulkResult api
    final DocsEnum.BulkReadResult bulkresult = de.getBulkResult();
    int count;
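For reference, a hypothetical caller of the reworked getTotalTermFreq above might look like the following (field and term text are placeholders supplied by the caller, and the surrounding imports are assumed):

// Hypothetical helper; the field and term text are placeholders.
static long occurrencesOf(IndexReader reader, String field, String text) throws Exception {
  // Returns the summed freq() over all non-deleted docs containing the term,
  // using totalTermFreq() directly when the index has no deletions.
  return HighFreqTerms.getTotalTermFreq(reader, field, new BytesRef(text));
}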
@@ -41,4 +41,9 @@ public final class TermStats {
  String getTermText() {
    return termtext.utf8ToString();
  }

  @Override
  public String toString() {
    return("TermStats: term=" + termtext.utf8ToString() + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq);
  }
}
@ -17,15 +17,16 @@ package org.apache.lucene.misc;
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
@ -41,8 +42,10 @@ public class TestHighFreqTerms extends LuceneTestCase {
|
||||
writer = new IndexWriter(dir, newIndexWriterConfig(random,
|
||||
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))
|
||||
.setMaxBufferedDocs(2));
|
||||
writer.setInfoStream(VERBOSE ? System.out : null);
|
||||
indexDocs(writer);
|
||||
reader = IndexReader.open(dir, true);
|
||||
_TestUtil.checkIndex(dir);
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
@ -135,9 +138,10 @@ public class TestHighFreqTerms extends LuceneTestCase {
|
||||
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
|
||||
|
||||
for (int i = 0; i < termsWithTF.length; i++) {
|
||||
// check that they are sorted by descending termfreq order
|
||||
// check that they are sorted by descending termfreq
|
||||
// order
|
||||
if (i > 0) {
|
||||
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq > termsWithTF[i].totalTermFreq);
|
||||
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq >= termsWithTF[i].totalTermFreq);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -124,6 +124,10 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
|
||||
return fcsi.getTermsEnum();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSumTotalTermFreq() {
|
||||
return -1;
|
||||
}
|
||||
});
|
||||
|
||||
assert termsEnum != null;
|
||||
|
@ -610,6 +610,8 @@ public class CheckIndex {
|
||||
|
||||
Comparator<BytesRef> termComp = terms.getComparator();
|
||||
|
||||
long sumTotalTermFreq = 0;
|
||||
|
||||
while(true) {
|
||||
|
||||
final BytesRef term = terms.next();
|
||||
@ -660,6 +662,8 @@ public class CheckIndex {
|
||||
}
|
||||
|
||||
int lastDoc = -1;
|
||||
int docCount = 0;
|
||||
long totalTermFreq = 0;
|
||||
while(true) {
|
||||
final int doc = docs2.nextDoc();
|
||||
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
|
||||
@ -667,6 +671,8 @@ public class CheckIndex {
|
||||
}
|
||||
final int freq = docs2.freq();
|
||||
status.totPos += freq;
|
||||
totalTermFreq += freq;
|
||||
docCount++;
|
||||
|
||||
if (doc <= lastDoc) {
|
||||
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
|
||||
@ -698,18 +704,35 @@ public class CheckIndex {
|
||||
}
|
||||
}
|
||||
|
||||
// Now count how many deleted docs occurred in
|
||||
// this term:
|
||||
final long totalTermFreq2 = terms.totalTermFreq();
|
||||
final boolean hasTotalTermFreq = postings != null && totalTermFreq2 != -1;
|
||||
|
||||
// Re-count if there are deleted docs:
|
||||
if (reader.hasDeletions()) {
|
||||
final DocsEnum docsNoDel = terms.docs(null, docs);
|
||||
int count = 0;
|
||||
docCount = 0;
|
||||
totalTermFreq = 0;
|
||||
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
count++;
|
||||
docCount++;
|
||||
totalTermFreq += docsNoDel.freq();
|
||||
}
|
||||
if (count != docFreq) {
|
||||
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + count);
|
||||
}
|
||||
|
||||
if (docCount != docFreq) {
|
||||
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
|
||||
}
|
||||
if (hasTotalTermFreq) {
|
||||
sumTotalTermFreq += totalTermFreq;
|
||||
if (totalTermFreq != totalTermFreq2) {
|
||||
throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sumTotalTermFreq != 0) {
|
||||
final long v = fields.terms(field).getSumTotalTermFreq();
|
||||
if (v != -1 && sumTotalTermFreq != v) {
|
||||
throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -30,6 +30,7 @@ public final class FieldInvertState {
  int length;
  int numOverlap;
  int offset;
  int maxTermFrequency;
  float boost;
  AttributeSource attributeSource;

@@ -53,6 +54,7 @@ public final class FieldInvertState {
    length = 0;
    numOverlap = 0;
    offset = 0;
    maxTermFrequency = 0;
    boost = docBoost;
    attributeSource = null;
  }
@@ -111,6 +113,15 @@ public final class FieldInvertState {
    this.boost = boost;
  }

  /**
   * Get the maximum term-frequency encountered for any term in the field. A
   * field containing "the quick brown fox jumps over the lazy dog" would have
   * a value of 2, because "the" appears twice.
   */
  public int getMaxTermFrequency() {
    return maxTermFrequency;
  }

  public AttributeSource getAttributeSource() {
    return attributeSource;
  }
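As a sketch of the LUCENE-2864 use case, a custom Similarity could read the new statistic in computeNorm. This assumes the trunk-era computeNorm(String, FieldInvertState) signature; MaxTfSimilarity and its formula are made up for illustration and are not part of this commit:

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.DefaultSimilarity;

// Made-up example class; assumes the trunk-era computeNorm(String, FieldInvertState).
public class MaxTfSimilarity extends DefaultSimilarity {
  @Override
  public float computeNorm(String field, FieldInvertState state) {
    // Penalize fields whose highest within-document term frequency is large,
    // on top of the usual boost; the exact formula is only illustrative.
    final int maxTf = Math.max(1, state.getMaxTermFrequency());
    final int numTerms = state.getLength() - state.getNumOverlap();
    return state.getBoost() / (float) Math.sqrt((double) Math.max(1, numTerms) * maxTf);
  }
}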
@ -99,6 +99,11 @@ public class FilterIndexReader extends IndexReader {
|
||||
public long getUniqueTermCount() throws IOException {
|
||||
return in.getUniqueTermCount();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSumTotalTermFreq() throws IOException {
|
||||
return in.getSumTotalTermFreq();
|
||||
}
|
||||
}
|
||||
|
||||
/** Base class for filtering {@link TermsEnum} implementations. */
|
||||
@ -155,6 +160,11 @@ public class FilterIndexReader extends IndexReader {
|
||||
return in.docFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long totalTermFreq() {
|
||||
return in.totalTermFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
return in.docs(skipDocs, reuse);
|
||||
|
@ -24,6 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.index.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.index.codecs.PostingsConsumer;
|
||||
import org.apache.lucene.index.codecs.TermStats;
|
||||
import org.apache.lucene.index.codecs.TermsConsumer;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
@ -130,6 +131,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||
postings.docFreqs[termID] = 1;
|
||||
writeProx(termID, fieldState.position);
|
||||
}
|
||||
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -163,11 +165,12 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||
termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
|
||||
}
|
||||
postings.docFreqs[termID] = 1;
|
||||
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
||||
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
|
||||
postings.lastDocIDs[termID] = docState.docID;
|
||||
writeProx(termID, fieldState.position);
|
||||
} else {
|
||||
postings.docFreqs[termID]++;
|
||||
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
|
||||
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
|
||||
}
|
||||
}
|
||||
@ -237,7 +240,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||
final ByteSliceReader freq = new ByteSliceReader();
|
||||
final ByteSliceReader prox = new ByteSliceReader();
|
||||
|
||||
|
||||
long sumTotalTermFreq = 0;
|
||||
for (int i = 0; i < numTerms; i++) {
|
||||
final int termID = termIDs[i];
|
||||
// Get BytesRef
|
||||
@ -261,6 +264,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||
// which all share the same term. Now we must
|
||||
// interleave the docID streams.
|
||||
int numDocs = 0;
|
||||
long totTF = 0;
|
||||
int docID = 0;
|
||||
int termFreq = 0;
|
||||
|
||||
@ -305,6 +309,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||
// omitTermFreqAndPositions == false so we do write positions &
|
||||
// payload
|
||||
int position = 0;
|
||||
totTF += termDocFreq;
|
||||
for(int j=0;j<termDocFreq;j++) {
|
||||
final int code = prox.readVInt();
|
||||
position += code >> 1;
|
||||
@ -338,10 +343,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||
postingsConsumer.finishDoc();
|
||||
}
|
||||
}
|
||||
termsConsumer.finishTerm(text, numDocs);
|
||||
termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
|
||||
sumTotalTermFreq += totTF;
|
||||
}
|
||||
|
||||
termsConsumer.finish();
|
||||
termsConsumer.finish(sumTotalTermFreq);
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -997,6 +997,23 @@ public abstract class IndexReader implements Cloneable,Closeable {
    return terms.docFreq(term);
  }

  /** Returns the total number of occurrences of the given
   *  term across all documents (the sum of the freq() for
   *  each doc that has this term). This method returns 0 if
   *  the term or field does not exist. This method does not
   *  take into account deleted documents that have not yet
   *  been merged away. */
  public long totalTermFreq(String field, BytesRef term) throws IOException {
    final Fields fields = fields();
    if (fields == null) {
      return 0;
    }
    final Terms terms = fields.terms(field);
    if (terms == null) {
      return 0;
    }
    return terms.totalTermFreq(term);
  }

  /** This may return null if the field does not exist.*/
  public Terms terms(String field) throws IOException {
    final Fields fields = fields();
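A hypothetical caller of the new IndexReader convenience method, contrasting it with docFreq (field and term values are placeholders; imports for Term and BytesRef are assumed):

// Hypothetical helper contrasting document frequency with total occurrences.
static void printTermStats(IndexReader reader, String field, String text) throws IOException {
  int docs = reader.docFreq(new Term(field, text));                    // docs containing the term
  long occurrences = reader.totalTermFreq(field, new BytesRef(text));  // total occurrences across docs
  System.out.println(text + ": " + docs + " docs, " + occurrences + " occurrences");
}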
@@ -76,6 +76,19 @@ public final class MultiTerms extends Terms {
    }
  }

  @Override
  public long getSumTotalTermFreq() throws IOException {
    long sum = 0;
    for(Terms terms : subs) {
      final long v = terms.getSumTotalTermFreq();
      if (v == -1) {
        return -1;
      }
      sum += v;
    }
    return sum;
  }

  @Override
  public Comparator<BytesRef> getComparator() {
    return termComp;
@@ -265,6 +265,19 @@ public final class MultiTermsEnum extends TermsEnum {
      return sum;
    }

    @Override
    public long totalTermFreq() {
      long sum = 0;
      for(int i=0;i<numTop;i++) {
        final long v = top[i].terms.totalTermFreq();
        if (v == -1) {
          return v;
        }
        sum += v;
      }
      return sum;
    }

    @Override
    public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
      final MultiDocsEnum docsEnum;
@@ -57,6 +57,18 @@ public abstract class Terms {
    }
  }

  /** Returns the total number of occurrences of the
   *  specified term text across all documents. Returns 0
   *  if the term does not exist. */
  public long totalTermFreq(BytesRef text) throws IOException {
    final TermsEnum termsEnum = getThreadTermsEnum();
    if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) {
      return termsEnum.totalTermFreq();
    } else {
      return 0;
    }
  }

  /** Get {@link DocsEnum} for the specified term. This
   *  method may return null if the term does not exist. */
  public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
@@ -115,6 +127,14 @@ public abstract class Terms {
    throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
  }

  /** Returns the sum of {@link TermsEnum#totalTermFreq} for
   *  all terms in this field, or -1 if this measure isn't
   *  stored by the codec (or if this field omits term freq
   *  and positions). Note that, just like other term
   *  measures, this measure does not take deleted documents
   *  into account. */
  public abstract long getSumTotalTermFreq() throws IOException;

  /**
   * Returns a thread-private {@link TermsEnum} instance. Obtaining
   * {@link TermsEnum} from this method might be more efficient than using
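One way the new per-field statistic can be consumed (a sketch, not part of this commit) is to derive an average field length, falling back when the codec does not store the sum; the reader and field are supplied by the caller:

// Sketch: average number of indexed tokens per document for one field,
// or -1 when the codec does not store the sum (or the field is absent).
static double averageFieldLength(IndexReader reader, String field) throws IOException {
  Terms terms = MultiFields.getTerms(reader, field);
  if (terms == null) {
    return -1;
  }
  long sum = terms.getSumTotalTermFreq();
  if (sum == -1 || reader.maxDoc() == 0) {
    return -1;
  }
  return (double) sum / reader.maxDoc();
}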
@@ -126,6 +126,14 @@ public abstract class TermsEnum {
   *  {@link SeekStatus#END}.*/
  public abstract int docFreq();

  /** Returns the total number of occurrences of this term
   *  across all documents (the sum of the freq() for each
   *  doc that has this term). This will be -1 if the
   *  codec doesn't support this measure. Note that, like
   *  other term measures, this measure does not take
   *  deleted documents into account. */
  public abstract long totalTermFreq();

  /** Get {@link DocsEnum} for the current term. Do not
   *  call this before calling {@link #next} or {@link
   *  #seek} for the first time. This method will not
@@ -198,6 +206,11 @@ public abstract class TermsEnum {
      throw new IllegalStateException("this method should never be called");
    }

    @Override
    public long totalTermFreq() {
      throw new IllegalStateException("this method should never be called");
    }

    @Override
    public long ord() {
      throw new IllegalStateException("this method should never be called");
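The following sketch mirrors the verification CheckIndex gains elsewhere in this commit: it sums totalTermFreq() over every term of a field and compares the result with getSumTotalTermFreq(); the reader and field name are placeholders, and imports are assumed:

// Verification sketch in the spirit of the CheckIndex change in this commit:
// recompute the field-level sum from the per-term statistic and compare.
static void checkSumTotalTermFreq(IndexReader reader, String field) throws IOException {
  Terms terms = MultiFields.getTerms(reader, field);
  if (terms == null) {
    return;
  }
  TermsEnum termsEnum = terms.iterator();
  long sum = 0;
  while (termsEnum.next() != null) {
    final long ttf = termsEnum.totalTermFreq();
    if (ttf == -1) {
      return;                       // codec does not support the measure
    }
    sum += ttf;
  }
  final long stored = terms.getSumTotalTermFreq();
  if (stored != -1 && stored != sum) {
    throw new RuntimeException("sumTotalTermFreq " + stored + " != recomputed " + sum);
  }
}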
@ -132,7 +132,6 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
|
||||
private class IndexEnum extends FieldIndexEnum {
|
||||
private final FieldIndexData.CoreFieldIndex fieldIndex;
|
||||
private final BytesRef term = new BytesRef();
|
||||
private final BytesRef nextTerm = new BytesRef();
|
||||
private long ord;
|
||||
|
||||
public IndexEnum(FieldIndexData.CoreFieldIndex fieldIndex) {
|
||||
@ -192,7 +191,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
|
||||
|
||||
final long offset = fieldIndex.termOffsets.get(idx);
|
||||
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
|
||||
termBytesReader.fillSlice(nextTerm, fieldIndex.termBytesStart + offset, length);
|
||||
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
|
||||
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
|
||||
}
|
||||
|
||||
|
@ -128,7 +128,7 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
|
||||
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
|
||||
// First term is first indexed term:
|
||||
if (0 == (numTerms++ % termIndexInterval)) {
|
||||
|
||||
|
@ -55,9 +55,10 @@ public abstract class PostingsConsumer {
|
||||
|
||||
/** Default merge impl: append documents, mapping around
|
||||
* deletes */
|
||||
public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
|
||||
public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
|
||||
|
||||
int df = 0;
|
||||
long totTF = 0;
|
||||
|
||||
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
|
||||
while(true) {
|
||||
@ -68,6 +69,7 @@ public abstract class PostingsConsumer {
|
||||
this.startDoc(doc, postings.freq());
|
||||
this.finishDoc();
|
||||
df++;
|
||||
totTF++;
|
||||
}
|
||||
} else {
|
||||
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
|
||||
@ -78,6 +80,7 @@ public abstract class PostingsConsumer {
|
||||
}
|
||||
final int freq = postingsEnum.freq();
|
||||
this.startDoc(doc, freq);
|
||||
totTF += freq;
|
||||
for(int i=0;i<freq;i++) {
|
||||
final int position = postingsEnum.nextPosition();
|
||||
final BytesRef payload;
|
||||
@ -92,6 +95,6 @@ public abstract class PostingsConsumer {
|
||||
df++;
|
||||
}
|
||||
}
|
||||
return df;
|
||||
return new TermStats(df, totTF);
|
||||
}
|
||||
}
|
||||
|
@ -34,7 +34,7 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo
|
||||
public abstract void startTerm() throws IOException;
|
||||
|
||||
/** Finishes the current term */
|
||||
public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException;
|
||||
public abstract void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException;
|
||||
|
||||
public abstract void setField(FieldInfo fieldInfo);
|
||||
|
||||
|
@ -27,6 +27,7 @@ import org.apache.lucene.index.TermState;
|
||||
public class PrefixCodedTermState extends OrdTermState {
|
||||
public int docFreq; // how many docs have this term
|
||||
public long filePointer; // fp into the terms dict primary file (_X.tis)
|
||||
public long totalTermFreq; // total number of occurrences of this term
|
||||
|
||||
@Override
|
||||
public void copyFrom(TermState _other) {
|
||||
@ -35,11 +36,12 @@ public class PrefixCodedTermState extends OrdTermState {
|
||||
super.copyFrom(_other);
|
||||
filePointer = other.filePointer;
|
||||
docFreq = other.docFreq;
|
||||
totalTermFreq = other.totalTermFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + "]";
|
||||
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + ", docFreq=" + docFreq + ", totalTermFreq=" + totalTermFreq + "]";
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -129,18 +129,17 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||
// Read per-field details
|
||||
seekDir(in, dirOffset);
|
||||
|
||||
final int numFields = in.readInt();
|
||||
final int numFields = in.readVInt();
|
||||
|
||||
for(int i=0;i<numFields;i++) {
|
||||
final int field = in.readInt();
|
||||
final long numTerms = in.readLong();
|
||||
final int field = in.readVInt();
|
||||
final long numTerms = in.readVLong();
|
||||
assert numTerms >= 0;
|
||||
final long termsStartPointer = in.readLong();
|
||||
final long termsStartPointer = in.readVLong();
|
||||
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
|
||||
if (numTerms > 0) {
|
||||
final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
|
||||
assert !fields.containsKey(fieldInfo.name);
|
||||
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer));
|
||||
}
|
||||
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
@ -245,12 +244,14 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||
final long numTerms;
|
||||
final FieldInfo fieldInfo;
|
||||
final long termsStartPointer;
|
||||
final long sumTotalTermFreq;
|
||||
|
||||
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
|
||||
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
|
||||
assert numTerms > 0;
|
||||
this.fieldInfo = fieldInfo;
|
||||
this.numTerms = numTerms;
|
||||
this.termsStartPointer = termsStartPointer;
|
||||
this.sumTotalTermFreq = sumTotalTermFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -273,6 +274,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||
return numTerms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSumTotalTermFreq() {
|
||||
return sumTotalTermFreq;
|
||||
}
|
||||
|
||||
// Iterates through terms in this field, not supporting ord()
|
||||
private final class SegmentTermsEnum extends TermsEnum {
|
||||
private final IndexInput in;
|
||||
@ -295,6 +301,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||
bytesReader = new DeltaBytesReader(in);
|
||||
fieldTerm.field = fieldInfo.name;
|
||||
state = postingsReader.newTermState();
|
||||
state.totalTermFreq = -1;
|
||||
state.ord = -1;
|
||||
}
|
||||
|
||||
@ -494,6 +501,10 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||
state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
|
||||
}
|
||||
|
||||
if (!fieldInfo.omitTermFreqAndPositions) {
|
||||
state.totalTermFreq = state.docFreq + in.readVLong();
|
||||
}
|
||||
|
||||
postingsReader.readTerm(in,
|
||||
fieldInfo, state,
|
||||
isIndexTerm);
|
||||
@ -511,6 +522,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||
return state.docFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long totalTermFreq() {
|
||||
return state.totalTermFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
|
||||
|
@ -60,7 +60,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||
final FieldInfos fieldInfos;
|
||||
FieldInfo currentField;
|
||||
private final TermsIndexWriterBase termsIndexWriter;
|
||||
private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>();
|
||||
private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
|
||||
private final Comparator<BytesRef> termComp;
|
||||
|
||||
public PrefixCodedTermsWriter(
|
||||
@ -96,7 +96,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||
assert currentField == null || currentField.name.compareTo(field.name) < 0;
|
||||
currentField = field;
|
||||
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
|
||||
TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
|
||||
final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
|
||||
fields.add(terms);
|
||||
return terms;
|
||||
}
|
||||
@ -105,16 +105,26 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||
public void close() throws IOException {
|
||||
|
||||
try {
|
||||
final int fieldCount = fields.size();
|
||||
|
||||
int nonZeroCount = 0;
|
||||
for(TermsWriter field : fields) {
|
||||
if (field.numTerms > 0) {
|
||||
nonZeroCount++;
|
||||
}
|
||||
}
|
||||
|
||||
final long dirStart = out.getFilePointer();
|
||||
|
||||
out.writeInt(fieldCount);
|
||||
for(int i=0;i<fieldCount;i++) {
|
||||
TermsWriter field = (TermsWriter) fields.get(i);
|
||||
out.writeInt(field.fieldInfo.number);
|
||||
out.writeLong(field.numTerms);
|
||||
out.writeLong(field.termsStartPointer);
|
||||
out.writeVInt(nonZeroCount);
|
||||
for(TermsWriter field : fields) {
|
||||
if (field.numTerms > 0) {
|
||||
out.writeVInt(field.fieldInfo.number);
|
||||
out.writeVLong(field.numTerms);
|
||||
out.writeVLong(field.termsStartPointer);
|
||||
if (!field.fieldInfo.omitTermFreqAndPositions) {
|
||||
out.writeVLong(field.sumTotalTermFreq);
|
||||
}
|
||||
}
|
||||
}
|
||||
writeTrailer(dirStart);
|
||||
} finally {
|
||||
@ -142,6 +152,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||
private final long termsStartPointer;
|
||||
private long numTerms;
|
||||
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
|
||||
long sumTotalTermFreq;
|
||||
|
||||
TermsWriter(
|
||||
TermsIndexWriterBase.FieldWriter fieldIndexWriter,
|
||||
@ -169,12 +180,12 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finishTerm(BytesRef text, int numDocs) throws IOException {
|
||||
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
|
||||
|
||||
assert numDocs > 0;
|
||||
assert stats.docFreq > 0;
|
||||
//System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());
|
||||
|
||||
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
|
||||
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
|
||||
|
||||
termWriter.write(text);
|
||||
final int highBit = isIndexTerm ? 0x80 : 0;
|
||||
@ -182,23 +193,28 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||
|
||||
// This is a vInt, except, we steal top bit to record
|
||||
// whether this was an indexed term:
|
||||
if ((numDocs & ~0x3F) == 0) {
|
||||
if ((stats.docFreq & ~0x3F) == 0) {
|
||||
// Fast case -- docFreq fits in 6 bits
|
||||
out.writeByte((byte) (highBit | numDocs));
|
||||
out.writeByte((byte) (highBit | stats.docFreq));
|
||||
} else {
|
||||
// Write bottom 6 bits of docFreq, then write the
|
||||
// remainder as vInt:
|
||||
out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F)));
|
||||
out.writeVInt(numDocs >>> 6);
|
||||
out.writeByte((byte) (highBit | 0x40 | (stats.docFreq & 0x3F)));
|
||||
out.writeVInt(stats.docFreq >>> 6);
|
||||
}
|
||||
postingsWriter.finishTerm(numDocs, isIndexTerm);
|
||||
if (!fieldInfo.omitTermFreqAndPositions) {
|
||||
assert stats.totalTermFreq >= stats.docFreq;
|
||||
out.writeVLong(stats.totalTermFreq - stats.docFreq);
|
||||
}
|
||||
postingsWriter.finishTerm(stats, isIndexTerm);
|
||||
numTerms++;
|
||||
}
|
||||
|
||||
// Finishes all terms in this field
|
||||
@Override
|
||||
public void finish() throws IOException {
|
||||
public void finish(long sumTotalTermFreq) throws IOException {
|
||||
// EOF marker:
|
||||
this.sumTotalTermFreq = sumTotalTermFreq;
|
||||
out.writeVInt(DeltaBytesWriter.TERM_EOF);
|
||||
fieldIndexWriter.finish();
|
||||
}
|
||||
|
@@ -0,0 +1,28 @@
package org.apache.lucene.index.codecs;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class TermStats {
  public final int docFreq;
  public final long totalTermFreq;

  public TermStats(int docFreq, long totalTermFreq) {
    this.docFreq = docFreq;
    this.totalTermFreq = totalTermFreq;
  }
}
@ -38,10 +38,10 @@ public abstract class TermsConsumer {
|
||||
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
|
||||
|
||||
/** Finishes the current term; numDocs must be > 0. */
|
||||
public abstract void finishTerm(BytesRef text, int numDocs) throws IOException;
|
||||
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
|
||||
|
||||
/** Called when we are done adding terms to this field */
|
||||
public abstract void finish() throws IOException;
|
||||
public abstract void finish(long sumTotalTermFreq) throws IOException;
|
||||
|
||||
/** Return the BytesRef Comparator used to sort terms
|
||||
* before feeding to this API. */
|
||||
@ -55,6 +55,7 @@ public abstract class TermsConsumer {
|
||||
|
||||
BytesRef term;
|
||||
assert termsEnum != null;
|
||||
long sumTotalTermFreq = 0;
|
||||
|
||||
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
|
||||
if (docsEnum == null) {
|
||||
@ -69,9 +70,9 @@ public abstract class TermsConsumer {
|
||||
if (docsEnumIn != null) {
|
||||
docsEnum.reset(docsEnumIn);
|
||||
final PostingsConsumer postingsConsumer = startTerm(term);
|
||||
final int numDocs = postingsConsumer.merge(mergeState, docsEnum);
|
||||
if (numDocs > 0) {
|
||||
finishTerm(term, numDocs);
|
||||
final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
|
||||
if (stats.docFreq > 0) {
|
||||
finishTerm(term, stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -94,14 +95,15 @@ public abstract class TermsConsumer {
|
||||
}
|
||||
}
|
||||
final PostingsConsumer postingsConsumer = startTerm(term);
|
||||
final int numDocs = postingsConsumer.merge(mergeState, postingsEnum);
|
||||
if (numDocs > 0) {
|
||||
finishTerm(term, numDocs);
|
||||
final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum);
|
||||
if (stats.docFreq > 0) {
|
||||
finishTerm(term, stats);
|
||||
sumTotalTermFreq += stats.totalTermFreq;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
finish();
|
||||
finish(sumTotalTermFreq);
|
||||
}
|
||||
}
|
||||
|
@ -28,7 +28,7 @@ public abstract class TermsIndexWriterBase {
|
||||
public abstract void setTermsOutput(IndexOutput out);
|
||||
|
||||
public abstract class FieldWriter {
|
||||
public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
|
||||
public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
|
||||
public abstract void finish() throws IOException;
|
||||
}
|
||||
|
||||
|
@ -59,7 +59,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||
public static abstract class IndexTermSelector {
|
||||
// Called sequentially on every term being written,
|
||||
// returning true if this term should be indexed
|
||||
public abstract boolean isIndexTerm(BytesRef term, int docFreq);
|
||||
public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
|
||||
}
|
||||
|
||||
/** Same policy as {@link FixedGapTermsIndexWriter} */
|
||||
@ -74,7 +74,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isIndexTerm(BytesRef term, int docFreq) {
|
||||
public boolean isIndexTerm(BytesRef term, TermStats stats) {
|
||||
if (count >= interval) {
|
||||
count = 0;
|
||||
return true;
|
||||
@ -99,8 +99,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isIndexTerm(BytesRef term, int docFreq) {
|
||||
if (docFreq >= docFreqThresh || count >= interval) {
|
||||
public boolean isIndexTerm(BytesRef term, TermStats stats) {
|
||||
if (stats.docFreq >= docFreqThresh || count >= interval) {
|
||||
count = 0;
|
||||
return true;
|
||||
} else {
|
||||
@ -214,8 +214,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
|
||||
if (policy.isIndexTerm(text, docFreq) || first) {
|
||||
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
|
||||
if (policy.isIndexTerm(text, stats) || first) {
|
||||
first = false;
|
||||
//System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
|
||||
final int lengthSave = text.length;
|
||||
|
@ -33,7 +33,6 @@ import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.CompoundFileReader;
|
||||
@ -263,6 +262,11 @@ public class PreFlexFields extends FieldsProducer {
|
||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSumTotalTermFreq() {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private class PreTermsEnum extends TermsEnum {
|
||||
@ -938,6 +942,11 @@ public class PreFlexFields extends FieldsProducer {
|
||||
return termEnum.docFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long totalTermFreq() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
PreDocsEnum docsEnum;
|
||||
|
@ -54,6 +54,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
|
||||
public void init(IndexInput termsIn) throws IOException {
|
||||
CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
|
||||
PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
|
||||
maxPositions = termsIn.readVInt();
|
||||
wrappedPostingsReader.init(termsIn);
|
||||
}
|
||||
|
||||
@ -115,8 +116,10 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
|
||||
|
||||
termState.pendingIndexTerm |= isIndexTerm;
|
||||
|
||||
// TODO: wasteful to use whole byte for this (need just a 1 bit);
|
||||
if (termsIn.readByte() == 1) {
|
||||
// total TF, but in the omitTFAP case its computed based on docFreq.
|
||||
long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq;
|
||||
|
||||
if (count <= maxPositions) {
|
||||
|
||||
// Inlined into terms dict -- just read the byte[] blob in,
|
||||
// but don't decode it now (we only decode when a DocsEnum
|
||||
|
@ -21,15 +21,16 @@ import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.index.codecs.TermStats;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CodecUtil;
|
||||
|
||||
// TODO: we now pulse entirely according to docFreq of the
|
||||
// term; it might be better to eg pulse by "net bytes used"
|
||||
// so that a term that has only 1 doc but zillions of
|
||||
// positions would not be inlined. Though this is
|
||||
// TODO: we pulse based on total TF of the term,
|
||||
// it might be better to eg pulse by "net bytes used"
|
||||
// so that a term that has only 1 posting but a huge
|
||||
// payload would not be inlined. Though this is
|
||||
// presumably rare in practice...
|
||||
|
||||
/** @lucene.experimental */
|
||||
@ -85,6 +86,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
|
||||
public void start(IndexOutput termsOut) throws IOException {
|
||||
this.termsOut = termsOut;
|
||||
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
|
||||
termsOut.writeVInt(pending.length); // encode maxPositions in header
|
||||
wrappedPostingsWriter.start(termsOut);
|
||||
}
|
||||
|
||||
@ -177,7 +179,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
|
||||
|
||||
/** Called when we are done adding docs to this term */
|
||||
@Override
|
||||
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
|
||||
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
|
||||
//System.out.println("PW finishTerm docCount=" + docCount);
|
||||
|
||||
assert pendingCount > 0 || pendingCount == -1;
|
||||
@ -185,8 +187,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
|
||||
pendingIsIndexTerm |= isIndexTerm;
|
||||
|
||||
if (pendingCount == -1) {
|
||||
termsOut.writeByte((byte) 0);
|
||||
wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
|
||||
wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
|
||||
pendingIsIndexTerm = false;
|
||||
} else {
|
||||
|
||||
@ -194,8 +195,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
|
||||
// term, so we fully inline our postings data into
|
||||
// terms dict, now:
|
||||
|
||||
termsOut.writeByte((byte) 1);
|
||||
|
||||
// TODO: it'd be better to share this encoding logic
|
||||
// in some inner codec that knows how to write a
|
||||
// single doc / single position, etc. This way if a
|
||||
|
@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.index.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.index.codecs.TermStats;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CodecUtil;
|
||||
@ -239,11 +240,11 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
|
||||
|
||||
/** Called when we are done adding docs to this term */
|
||||
@Override
|
||||
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
|
||||
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
|
||||
|
||||
// TODO: -- wasteful we are counting this in two places?
|
||||
assert docCount > 0;
|
||||
assert docCount == df;
|
||||
assert stats.docFreq > 0;
|
||||
assert stats.docFreq == df;
|
||||
|
||||
docIndex.write(termsOut, isIndexTerm);
|
||||
|
||||
|
@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.index.codecs.FieldsProducer;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
@ -119,28 +118,31 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||
private final IndexInput in;
|
||||
private final boolean omitTF;
|
||||
private int docFreq;
|
||||
private long totalTermFreq;
|
||||
private long docsStart;
|
||||
private boolean ended;
|
||||
private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;
|
||||
private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum;
|
||||
|
||||
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
|
||||
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException {
|
||||
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
||||
this.omitTF = omitTF;
|
||||
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
|
||||
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst);
|
||||
}
|
||||
|
||||
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
|
||||
|
||||
//System.out.println("seek to text=" + text.utf8ToString());
|
||||
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text);
|
||||
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text);
|
||||
if (result == null) {
|
||||
//System.out.println(" end");
|
||||
return SeekStatus.END;
|
||||
} else {
|
||||
//System.out.println(" got text=" + term.utf8ToString());
|
||||
PairOutputs.Pair<Long,Long> pair = result.output;
|
||||
docsStart = pair.output1;
|
||||
docFreq = pair.output2.intValue();
|
||||
PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
|
||||
PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
|
||||
docsStart = pair1.output1;
|
||||
docFreq = pair2.output1.intValue();
|
||||
totalTermFreq = pair2.output2;
|
||||
|
||||
if (result.input.equals(text)) {
|
||||
//System.out.println(" match docsStart=" + docsStart);
|
||||
@ -155,11 +157,13 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
assert !ended;
|
||||
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
|
||||
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next();
|
||||
if (result != null) {
|
||||
final PairOutputs.Pair<Long,Long> pair = result.output;
|
||||
docsStart = pair.output1;
|
||||
docFreq = pair.output2.intValue();
|
||||
PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
|
||||
PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
|
||||
docsStart = pair1.output1;
|
||||
docFreq = pair2.output1.intValue();
|
||||
totalTermFreq = pair2.output2;
|
||||
return result.input;
|
||||
} else {
|
||||
return null;
|
||||
@ -186,6 +190,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||
return docFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long totalTermFreq() {
|
||||
return totalTermFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
SimpleTextDocsEnum docsEnum;
|
||||
@ -438,8 +447,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||
private class SimpleTextTerms extends Terms {
|
||||
private final long termsStart;
|
||||
private final boolean omitTF;
|
||||
private FST<PairOutputs.Pair<Long,Long>> fst;
|
||||
|
||||
private long sumTotalTermFreq;
|
||||
private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
|
||||
private int termCount;
|
||||
private final BytesRef scratch = new BytesRef(10);
|
||||
|
||||
public SimpleTextTerms(String field, long termsStart) throws IOException {
|
||||
@ -450,24 +460,38 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||
|
||||
private void loadTerms() throws IOException {
|
||||
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
|
||||
Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
|
||||
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
|
||||
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
|
||||
0,
|
||||
0,
|
||||
true,
|
||||
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
|
||||
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
|
||||
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
||||
in.seek(termsStart);
|
||||
final BytesRef lastTerm = new BytesRef(10);
|
||||
long lastDocsStart = -1;
|
||||
int docFreq = 0;
|
||||
long totalTermFreq = 0;
|
||||
while(true) {
|
||||
readLine(in, scratch);
|
||||
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
|
||||
if (lastDocsStart != -1) {
|
||||
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
|
||||
b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
|
||||
new PairOutputs.Pair<Long,Long>((long) docFreq,
|
||||
posIntOutputs.get(totalTermFreq))));
|
||||
sumTotalTermFreq += totalTermFreq;
|
||||
}
|
||||
break;
|
||||
} else if (scratch.startsWith(DOC)) {
|
||||
docFreq++;
|
||||
} else if (scratch.startsWith(POS)) {
|
||||
totalTermFreq++;
|
||||
} else if (scratch.startsWith(TERM)) {
|
||||
if (lastDocsStart != -1) {
|
||||
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
|
||||
b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
|
||||
new PairOutputs.Pair<Long,Long>((long) docFreq,
|
||||
posIntOutputs.get(totalTermFreq))));
|
||||
}
|
||||
lastDocsStart = in.getFilePointer();
|
||||
final int len = scratch.length - TERM.length;
|
||||
@ -477,6 +501,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
|
||||
lastTerm.length = len;
|
||||
docFreq = 0;
|
||||
sumTotalTermFreq += totalTermFreq;
|
||||
totalTermFreq = 0;
|
||||
termCount++;
|
||||
}
|
||||
}
|
||||
fst = b.finish();
|
||||
@ -502,6 +529,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getUniqueTermCount() {
|
||||
return (long) termCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSumTotalTermFreq() {
|
||||
return sumTotalTermFreq;
|
||||
}
|
||||
}
@Override

@ -22,6 +22,7 @@ import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
@ -84,11 +85,11 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
}

@Override
public void finishTerm(BytesRef term, int numDocs) throws IOException {
public void finishTerm(BytesRef term, TermStats stats) throws IOException {
}

@Override
public void finish() throws IOException {
public void finish(long sumTotalTermFreq) throws IOException {
}

@Override

@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;

@ -184,12 +185,12 @@ public final class StandardPostingsWriter extends PostingsWriterBase {

/** Called when we are done adding docs to this term */
@Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
assert docCount > 0;
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
assert stats.docFreq > 0;

// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
assert docCount == df;
assert stats.docFreq == df;

if (isIndexTerm) {
// Write absolute at seek points
@ -20,7 +20,6 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.util.ToStringUtils;

import java.io.IOException;

@ -126,6 +126,11 @@ public abstract class FilteredTermsEnum extends TermsEnum {
return tenum.docFreq();
}

@Override
public long totalTermFreq() {
return tenum.totalTermFreq();
}

/** This enum does not support seeking!
* @throws UnsupportedOperationException
*/

@ -245,6 +245,11 @@ public final class FuzzyTermsEnum extends TermsEnum {
return actualEnum.docFreq();
}

@Override
public long totalTermFreq() {
return actualEnum.totalTermFreq();
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return actualEnum.docs(skipDocs, reuse);

@ -25,7 +25,6 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ToStringUtils;

@ -26,7 +26,6 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;

@ -28,7 +28,6 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.ReaderUtil;

@ -28,7 +28,6 @@ import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache.DocTermsIndex;
import org.apache.lucene.util.ArrayUtil;
@ -321,6 +320,11 @@ public class DocTermsIndexCreator extends EntryCreatorWithOptions<DocTermsIndex>
throw new UnsupportedOperationException();
}

@Override
public long totalTermFreq() {
return -1;
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
throw new UnsupportedOperationException();
@ -31,7 +31,6 @@ import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.util.ToStringUtils;

/**

@ -19,7 +19,6 @@ package org.apache.lucene.search.function;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.*;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.ToStringUtils;

@ -102,6 +102,8 @@ public class TestExternalCodecs extends LuceneTestCase {
static class RAMField extends Terms {
final String field;
final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
long sumTotalTermFreq;

RAMField(String field) {
this.field = field;
}
@ -111,6 +113,11 @@ public class TestExternalCodecs extends LuceneTestCase {
return termToDocs.size();
}

@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}

@Override
public TermsEnum iterator() {
return new RAMTermsEnum(RAMOnlyCodec.RAMField.this);
@ -124,6 +131,7 @@ public class TestExternalCodecs extends LuceneTestCase {

static class RAMTerm {
final String term;
long totalTermFreq;
final List<RAMDoc> docs = new ArrayList<RAMDoc>();
public RAMTerm(String term) {
this.term = term;
@ -189,14 +197,16 @@ public class TestExternalCodecs extends LuceneTestCase {
}

@Override
public void finishTerm(BytesRef text, int numDocs) {
assert numDocs > 0;
assert numDocs == current.docs.size();
public void finishTerm(BytesRef text, TermStats stats) {
assert stats.docFreq > 0;
assert stats.docFreq == current.docs.size();
current.totalTermFreq = stats.totalTermFreq;
field.termToDocs.put(current.term, current);
}

@Override
public void finish() {
public void finish(long sumTotalTermFreq) {
field.sumTotalTermFreq = sumTotalTermFreq;
}
}

@ -331,6 +341,10 @@ public class TestExternalCodecs extends LuceneTestCase {
}

@Override
public long totalTermFreq() {
return ramField.termToDocs.get(current).totalTermFreq;
}

public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);
}

@ -30,6 +30,7 @@ import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.mocksep.MockSepCodec;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.search.DocIdSetIterator;
@ -97,9 +98,11 @@ public class TestCodecs extends LuceneTestCase {
public void write(final FieldsConsumer consumer) throws Throwable {
Arrays.sort(terms);
final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
for (final TermData term : terms)
term.write(termsConsumer);
termsConsumer.finish();
long sumTotalTermCount = 0;
for (final TermData term : terms) {
sumTotalTermCount += term.write(termsConsumer);
}
termsConsumer.finish(sumTotalTermCount);
}
}

@ -131,8 +134,9 @@ public class TestCodecs extends LuceneTestCase {
return text.compareTo(((TermData) o).text);
}

public void write(final TermsConsumer termsConsumer) throws Throwable {
public long write(final TermsConsumer termsConsumer) throws Throwable {
final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text);
long totTF = 0;
for(int i=0;i<docs.length;i++) {
final int termDocFreq;
if (field.omitTF) {
@ -142,6 +146,7 @@ public class TestCodecs extends LuceneTestCase {
}
postingsConsumer.startDoc(docs[i], termDocFreq);
if (!field.omitTF) {
totTF += positions[i].length;
for(int j=0;j<positions[i].length;j++) {
final PositionData pos = positions[i][j];
postingsConsumer.addPosition(pos.pos, pos.payload);
@ -149,7 +154,8 @@ public class TestCodecs extends LuceneTestCase {
postingsConsumer.finishDoc();
}
}
termsConsumer.finishTerm(text, docs.length);
termsConsumer.finishTerm(text, new TermStats(docs.length, totTF));
return totTF;
}
}
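The TestCodecs hunks above show the updated codec write contract: each term now ends with finishTerm(term, new TermStats(docFreq, totalTermFreq)) and the field ends with finish(sumTotalTermFreq). A hedged sketch of that call sequence for a single invented term, using only the consumer calls exercised in this patch (the field and postings data are made up for illustration):

import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef;

class ConsumerProtocolSketch {
  // Writes a single term "foo" appearing in doc 0 (freq 2) and doc 5 (freq 1).
  static void writeOneField(FieldsConsumer consumer, FieldInfo fieldInfo) throws IOException {
    TermsConsumer termsConsumer = consumer.addField(fieldInfo);
    long sumTotalTermFreq = 0;

    PostingsConsumer postings = termsConsumer.startTerm(new BytesRef("foo"));
    postings.startDoc(0, 2);
    postings.addPosition(1, null);   // null = no payload
    postings.addPosition(7, null);
    postings.finishDoc();
    postings.startDoc(5, 1);
    postings.addPosition(3, null);
    postings.finishDoc();

    long totalTermFreq = 3;          // 2 + 1 occurrences of "foo"
    termsConsumer.finishTerm(new BytesRef("foo"), new TermStats(2, totalTermFreq));
    sumTotalTermFreq += totalTermFreq;

    termsConsumer.finish(sumTotalTermFreq);  // the new per-field summary call
  }
}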
@ -29,7 +29,7 @@ import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.SortedSet;

import org.junit.Assume;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -1865,4 +1865,27 @@ public class TestIndexReader extends LuceneTestCase
assertTrue(IndexReader.indexExists(dir));
dir.close();
}

// Make sure totalTermFreq works correctly in the terms
// dict cache
public void testTotalTermFreqCached() throws Exception {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
Document d = new Document();
d.add(newField("f", "a a b", Field.Index.ANALYZED));
writer.addDocument(d);
IndexReader r = writer.getReader();
writer.close();
Terms terms = MultiFields.getTerms(r, "f");
try {
// Make sure codec impls totalTermFreq (eg PreFlex doesn't)
Assume.assumeTrue(terms.totalTermFreq(new BytesRef("b")) != -1);
assertEquals(1, terms.totalTermFreq(new BytesRef("b")));
assertEquals(2, terms.totalTermFreq(new BytesRef("a")));
assertEquals(1, terms.totalTermFreq(new BytesRef("b")));
} finally {
r.close();
dir.close();
}
}
}
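testTotalTermFreqCached() above exercises the new statistics through the Terms convenience method. A short sketch of reading them against an already-open reader, assuming the post-patch trunk API in which both calls return -1 when the codec (e.g. PreFlex) cannot supply the value; the field name "f" here is arbitrary:

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;

class TermStatsUsageSketch {
  static void printStats(IndexReader reader) throws IOException {
    Terms terms = MultiFields.getTerms(reader, "f");
    if (terms == null) {
      return;                                           // field does not exist
    }
    long ttf = terms.totalTermFreq(new BytesRef("a"));  // total occurrences of "a" in "f"
    long sum = terms.getSumTotalTermFreq();             // occurrences of all terms in "f"
    if (ttf == -1 || sum == -1) {
      return;                                           // codec does not record these stats
    }
    System.out.println("totalTermFreq(a)=" + ttf + ", sumTotalTermFreq=" + sum);
  }
}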
@ -0,0 +1,116 @@
package org.apache.lucene.index;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

/**
* Tests the maxTermFrequency statistic in FieldInvertState
*/
public class TestMaxTermFrequency extends LuceneTestCase {
Directory dir;
IndexReader reader;
/* expected maxTermFrequency values for our documents */
ArrayList<Integer> expected = new ArrayList<Integer>();

@Override
public void setUp() throws Exception {
super.setUp();
dir = newDirectory();
IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(MockTokenizer.SIMPLE, true));
config.setSimilarity(new TestSimilarity());
RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
Document doc = new Document();
Field foo = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
doc.add(foo);
for (int i = 0; i < 100; i++) {
foo.setValue(addValue());
writer.addDocument(doc);
}
reader = writer.getReader();
writer.close();
}

@Override
public void tearDown() throws Exception {
reader.close();
dir.close();
super.tearDown();
}

public void test() throws Exception {
byte fooNorms[] = MultiNorms.norms(reader, "foo");
for (int i = 0; i < reader.maxDoc(); i++)
assertEquals(expected.get(i).intValue(), fooNorms[i] & 0xff);
}

/**
* Makes a bunch of single-char tokens (the max freq will at most be 255).
* shuffles them around, and returns the whole list with Arrays.toString().
* This works fine because we use lettertokenizer.
* puts the max-frequency term into expected, to be checked against the norm.
*/
private String addValue() {
List<String> terms = new ArrayList<String>();
int maxCeiling = _TestUtil.nextInt(random, 0, 255);
int max = 0;
for (char ch = 'a'; ch <= 'z'; ch++) {
int num = _TestUtil.nextInt(random, 0, maxCeiling);
for (int i = 0; i < num; i++)
terms.add(Character.toString(ch));
max = Math.max(max, num);
}
expected.add(max);
Collections.shuffle(terms, random);
return Arrays.toString(terms.toArray(new String[terms.size()]));
}

/**
* Simple similarity that encodes maxTermFrequency directly as a byte
*/
class TestSimilarity extends DefaultSimilarity {

@Override
public byte encodeNormValue(float f) {
return (byte) f;
}

@Override
public float decodeNormValue(byte b) {
return (float) b;
}

@Override
public float computeNorm(String field, FieldInvertState state) {
return (float) state.getMaxTermFrequency();
}
}
}
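TestMaxTermFrequency compares norms with fooNorms[i] & 0xff because the encoded maxTermFrequency can be as large as 255 while Java bytes are signed. A tiny standalone illustration of why that mask is needed (the numbers are invented):

public class UnsignedNormByteSketch {
  public static void main(String[] args) {
    int maxTermFrequency = 200;              // pretend the most frequent term occurred 200 times
    byte stored = (byte) maxTermFrequency;   // what an encodeNormValue-style cast would store
    System.out.println(stored);              // -56: the signed interpretation
    System.out.println(stored & 0xff);       // 200: the unsigned value the test expects
  }
}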
@ -39,6 +39,7 @@ import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.VariableGapTermsIndexReader;
import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.mockintblock.MockFixedIntBlockCodec;
import org.apache.lucene.index.codecs.mockintblock.MockVariableIntBlockCodec;
import org.apache.lucene.index.codecs.mocksep.MockSingleIntFactory;
@ -66,7 +67,7 @@ public class MockRandomCodec extends Codec {

public MockRandomCodec(Random random) {
name = "MockRandom";
this.seedRandom = random;
this.seedRandom = new Random(random.nextLong());
}

@Override
@ -148,7 +149,7 @@ public class MockRandomCodec extends Codec {
final Random rand = new Random(seed2);

@Override
public boolean isIndexTerm(BytesRef term, int docFreq) {
public boolean isIndexTerm(BytesRef term, TermStats stats) {
return random.nextInt(gap) == 17;
}
};

@ -21,6 +21,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.index.CorruptIndexException;
@ -184,10 +185,10 @@ class PreFlexFieldsWriter extends FieldsConsumer {
}

@Override
public void finishTerm(BytesRef text, int numDocs) throws IOException {
if (numDocs > 0) {
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
if (stats.docFreq > 0) {
long skipPointer = skipListWriter.writeSkip(freqOut);
termInfo.docFreq = numDocs;
termInfo.docFreq = stats.docFreq;
termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer);
//System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number);
termsOut.add(fieldInfo.number,
@ -197,7 +198,7 @@ class PreFlexFieldsWriter extends FieldsConsumer {
}

@Override
public void finish() throws IOException {
public void finish(long sumTotalTermCount) throws IOException {
}

@Override

@ -21,7 +21,6 @@ import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.util.PriorityQueue;

@ -18,7 +18,7 @@ $Id$
This release of C2 also does not have a specific Lucene dependency. (Stanislaw Osinski, gsingers)

* SOLR-2282: Add distributed search support for search result clustering.
(Brad Giaccio, koji)
(Brad Giaccio, Dawid Weiss, Stanislaw Osinski, rmuir, koji)

================== Release 1.4.0 ==================

@ -124,7 +124,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
// is included in the code base of Solr, so that it's possible to refactor
// the Lucene APIs the factory relies on if needed.
initAttributes.put("PreprocessingPipeline.languageModelFactory",
new LuceneLanguageModelFactory());
LuceneLanguageModelFactory.class);
this.controller.init(initAttributes);

this.idFieldName = core.getSchema().getUniqueKeyField().getName();

@ -200,7 +200,7 @@ public class LuceneLanguageModelFactory extends DefaultLanguageModelFactory {
logger
.warn(
"Could not instantiate Lucene stemmer for Arabic, clustering quality "
+ "of Chinese content may be degraded. For best quality clusters, "
+ "of Arabic content may be degraded. For best quality clusters, "
+ "make sure Lucene's Arabic analyzer JAR is in the classpath",
e);
}
@ -20,9 +20,6 @@ package org.apache.solr.handler.clustering;
import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.common.params.CommonParams;

import org.junit.Ignore;

@Ignore("FIXME: test fails on hudson")
public class DistributedClusteringComponentTest extends
BaseDistributedSearchTestCase {

@ -1000,6 +1000,10 @@ class NumberedTermsEnum extends TermsEnum {
return tenum.docFreq();
}

@Override
public long totalTermFreq() {
return tenum.totalTermFreq();
}

public BytesRef skipTo(BytesRef target) throws IOException {