reduce RAM cost per unique field while writing postings

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1399607 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-10-18 12:19:05 +00:00
parent dd93b85321
commit 33b30097aa
2 changed files with 84 additions and 49 deletions

View File

@ -69,7 +69,27 @@ public class BlockTermsWriter extends FieldsConsumer {
final FieldInfos fieldInfos; final FieldInfos fieldInfos;
FieldInfo currentField; FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter; private final TermsIndexWriterBase termsIndexWriter;
private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
/**
 * Immutable per-field summary captured when a field finishes writing,
 * so the directory can be emitted at close time without retaining the
 * full per-field writer (reduces RAM held per unique field).
 */
private static class FieldMetaData {
  public final FieldInfo fieldInfo;
  public final long numTerms;
  public final long termsStartPointer;
  public final long sumTotalTermFreq;
  public final long sumDocFreq;
  public final int docCount;

  public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) {
    // Fields that produced no terms are never recorded; the caller skips them.
    assert numTerms > 0;
    this.fieldInfo = fieldInfo;
    this.numTerms = numTerms;
    this.termsStartPointer = termsStartPointer;
    this.sumTotalTermFreq = sumTotalTermFreq;
    this.sumDocFreq = sumDocFreq;
    this.docCount = docCount;
  }
}
private final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
// private final String segment; // private final String segment;
@ -108,9 +128,7 @@ public class BlockTermsWriter extends FieldsConsumer {
assert currentField == null || currentField.name.compareTo(field.name) < 0; assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field; currentField = field;
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field, out.getFilePointer()); TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field, out.getFilePointer());
final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter); return new TermsWriter(fieldIndexWriter, field, postingsWriter);
fields.add(terms);
return terms;
} }
@Override @Override
@ -118,27 +136,18 @@ public class BlockTermsWriter extends FieldsConsumer {
try { try {
int nonZeroCount = 0;
for(TermsWriter field : fields) {
if (field.numTerms > 0) {
nonZeroCount++;
}
}
final long dirStart = out.getFilePointer(); final long dirStart = out.getFilePointer();
out.writeVInt(nonZeroCount); out.writeVInt(fields.size());
for(TermsWriter field : fields) { for(FieldMetaData field : fields) {
if (field.numTerms > 0) { out.writeVInt(field.fieldInfo.number);
out.writeVInt(field.fieldInfo.number); out.writeVLong(field.numTerms);
out.writeVLong(field.numTerms); out.writeVLong(field.termsStartPointer);
out.writeVLong(field.termsStartPointer); if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { out.writeVLong(field.sumTotalTermFreq);
out.writeVLong(field.sumTotalTermFreq);
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
} }
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
} }
writeTrailer(dirStart); writeTrailer(dirStart);
} finally { } finally {
@ -249,6 +258,14 @@ public class BlockTermsWriter extends FieldsConsumer {
this.sumDocFreq = sumDocFreq; this.sumDocFreq = sumDocFreq;
this.docCount = docCount; this.docCount = docCount;
fieldIndexWriter.finish(out.getFilePointer()); fieldIndexWriter.finish(out.getFilePointer());
if (numTerms > 0) {
fields.add(new FieldMetaData(fieldInfo,
numTerms,
termsStartPointer,
sumTotalTermFreq,
sumDocFreq,
docCount));
}
} }
private int sharedPrefix(BytesRef term1, BytesRef term2) { private int sharedPrefix(BytesRef term1, BytesRef term2) {

View File

@ -228,7 +228,30 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
final PostingsWriterBase postingsWriter; final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos; final FieldInfos fieldInfos;
FieldInfo currentField; FieldInfo currentField;
private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
/**
 * Immutable per-field stats recorded when a field finishes writing, so
 * the terms directory can be written at close without keeping each
 * TermsWriter alive (lower RAM per unique field).
 */
private static class FieldMetaData {
  public final FieldInfo fieldInfo;
  public final BytesRef rootCode;
  public final long numTerms;
  public final long indexStartFP;
  public final long sumTotalTermFreq;
  public final long sumDocFreq;
  public final int docCount;

  public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount) {
    // Only fields with at least one term are recorded, and a non-empty
    // field must have produced a root block code for its index FST.
    assert numTerms > 0;
    assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
    this.fieldInfo = fieldInfo;
    this.rootCode = rootCode;
    this.numTerms = numTerms;
    this.indexStartFP = indexStartFP;
    this.sumTotalTermFreq = sumTotalTermFreq;
    this.sumDocFreq = sumDocFreq;
    this.docCount = docCount;
  }
}
private final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
// private final String segment; // private final String segment;
/** Create a new writer. The number of items (terms or /** Create a new writer. The number of items (terms or
@ -313,9 +336,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
//if (DEBUG) System.out.println("\nBTTW.addField seg=" + segment + " field=" + field.name); //if (DEBUG) System.out.println("\nBTTW.addField seg=" + segment + " field=" + field.name);
assert currentField == null || currentField.name.compareTo(field.name) < 0; assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field; currentField = field;
final TermsWriter terms = new TermsWriter(field); return new TermsWriter(field);
fields.add(terms);
return terms;
} }
static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) { static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
@ -1007,6 +1028,14 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// System.out.println("SAVED to " + dotFileName); // System.out.println("SAVED to " + dotFileName);
// w.close(); // w.close();
// } // }
fields.add(new FieldMetaData(fieldInfo,
((PendingBlock) pending.get(0)).index.getEmptyOutput(),
numTerms,
indexStartFP,
sumTotalTermFreq,
sumDocFreq,
docCount));
} else { } else {
assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1; assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
assert sumDocFreq == 0; assert sumDocFreq == 0;
@ -1024,34 +1053,23 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
IOException ioe = null; IOException ioe = null;
try { try {
int nonZeroCount = 0;
for(TermsWriter field : fields) {
if (field.numTerms > 0) {
nonZeroCount++;
}
}
final long dirStart = out.getFilePointer(); final long dirStart = out.getFilePointer();
final long indexDirStart = indexOut.getFilePointer(); final long indexDirStart = indexOut.getFilePointer();
out.writeVInt(nonZeroCount); out.writeVInt(fields.size());
for(TermsWriter field : fields) { for(FieldMetaData field : fields) {
if (field.numTerms > 0) { //System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms");
//System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms"); out.writeVInt(field.fieldInfo.number);
out.writeVInt(field.fieldInfo.number); out.writeVLong(field.numTerms);
out.writeVLong(field.numTerms); out.writeVInt(field.rootCode.length);
final BytesRef rootCode = ((PendingBlock) field.pending.get(0)).index.getEmptyOutput(); out.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
assert rootCode != null: "field=" + field.fieldInfo.name + " numTerms=" + field.numTerms; if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
out.writeVInt(rootCode.length); out.writeVLong(field.sumTotalTermFreq);
out.writeBytes(rootCode.bytes, rootCode.offset, rootCode.length);
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
out.writeVLong(field.sumTotalTermFreq);
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
indexOut.writeVLong(field.indexStartFP);
} }
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
indexOut.writeVLong(field.indexStartFP);
} }
writeTrailer(out, dirStart); writeTrailer(out, dirStart);
writeIndexTrailer(indexOut, indexDirStart); writeIndexTrailer(indexOut, indexDirStart);