LUCENE-3069: reader part, update logic in outputs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1500814 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Han Jiang 2013-07-08 16:08:32 +00:00
parent 9f6db24cee
commit fb794540b5
3 changed files with 48 additions and 70 deletions

View File

@ -46,7 +46,6 @@ import org.apache.lucene.codecs.CodecUtil;
/** FST based term dict, all the metadata held /** FST based term dict, all the metadata held
* as output of FST */ * as output of FST */
// nocommit: where is 'TermStats' ???
public class TempFSTTermsWriter extends FieldsConsumer { public class TempFSTTermsWriter extends FieldsConsumer {
static final String TERMS_EXTENSION = "tmp"; static final String TERMS_EXTENSION = "tmp";
static final String TERMS_CODEC_NAME = "FST_TERMS_DICT"; static final String TERMS_CODEC_NAME = "FST_TERMS_DICT";

View File

@ -33,7 +33,7 @@ import org.apache.lucene.util.LongsRef;
public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> { public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
private final static TempMetaData NO_OUTPUT = new TempMetaData(); private final static TempMetaData NO_OUTPUT = new TempMetaData();
private static boolean DEBUG = false; private static boolean DEBUG = false;
private FieldInfo fieldInfo; private boolean hasPos;
private int longsSize; private int longsSize;
public static class TempMetaData { public static class TempMetaData {
@ -104,23 +104,26 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
} }
protected TempTermOutputs(FieldInfo fieldInfo, int longsSize) { protected TempTermOutputs(FieldInfo fieldInfo, int longsSize) {
this.fieldInfo = fieldInfo; this.hasPos = (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY);
this.longsSize = longsSize; this.longsSize = longsSize;
} }
@Override @Override
// //
// Since longs blob is fixed length, when these two are 'comparable' // The return value will be the smaller one, when these two are
// i.e. when every value in long[] fits the same ordering, the smaller one // 'comparable', i.e. every value in long[] fits the same ordering.
// will be the result.
// //
// NOTE: only long[] is 'shared', i.e. if there are two byte[] on the successive // NOTE:
// arcs, only the last byte[] is valid. (this somewhat saves nodes, but might affect // Only long[] is 'shared', byte[] and term stats simply act
// compression, since we'll have to load metadata block for other terms as well, currently, // as 'attachment': when walking on the FST, if we see two byte[] on
// we don't support this) // successive arcs, only the second byte[] is valid.
// //
// nocommit: get the byte[] from smaller one as well, so that // Therefore, during building, we always make sure that, for most nodes,
// byte[] is actually inherited // the first output is 'pushed' one step towards root and reduced to
// be NO_OUTPUT, so that we get rid of the 'all zero' long[] and, on net,
// end up with a smaller total number of outputs.
//
// However, when decoding, a term might have to load a redundant byte[] blob.
// //
public TempMetaData common(TempMetaData t1, TempMetaData t2) { public TempMetaData common(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("common("+t1+", "+t2+") = "); if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
@ -128,14 +131,11 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
if (DEBUG) System.out.println("ret:"+NO_OUTPUT); if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
return NO_OUTPUT; return NO_OUTPUT;
} }
assert t1.longs != null;
assert t2.longs != null;
assert t1.longs.length == t2.longs.length; assert t1.longs.length == t2.longs.length;
long accum = 0;
long[] longs1 = t1.longs, longs2 = t2.longs; long[] longs1 = t1.longs, longs2 = t2.longs;
int pos = 0; int pos = 0;
boolean order = true; boolean smaller = true;
TempMetaData ret; TempMetaData ret;
while (pos < longsSize && longs1[pos] == longs2[pos]) { while (pos < longsSize && longs1[pos] == longs2[pos]) {
@ -143,56 +143,45 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
} }
if (pos < longsSize) { if (pos < longsSize) {
// unequal // unequal
order = (longs1[pos] > longs2[pos]); smaller = (longs1[pos] < longs2[pos]);
if (order) { if (smaller) {
// check whether strictly longs1 >= longs2
while (pos < longsSize && longs1[pos] >= longs2[pos]) {
accum += longs2[pos];
pos++;
}
} else {
// check whether strictly longs1 <= longs2 // check whether strictly longs1 <= longs2
while (pos < longsSize && longs1[pos] <= longs2[pos]) { while (pos < longsSize && longs1[pos] <= longs2[pos]) {
accum += longs1[pos]; pos++;
}
} else {
// check whether strictly longs1 >= longs2
while (pos < longsSize && longs1[pos] >= longs2[pos]) {
pos++; pos++;
} }
} }
if (pos < longsSize || accum == 0) { if (pos < longsSize) { // not fully 'comparable'
ret = NO_OUTPUT; ret = NO_OUTPUT;
} else if (order) { } else if (smaller) {
ret = new TempMetaData(longs2, null, 0, -1);
} else {
ret = new TempMetaData(longs1, null, 0, -1);
}
} else {
// equal
if (t1.bytes!= null && bytesEqual(t1, t2) && statsEqual(t1, t2)) { // all fields are equal
ret = t1; ret = t1;
} else if (accum == 0) { // all zero case
ret = NO_OUTPUT;
} else { } else {
ret = new TempMetaData(longs1, null, 0, -1); ret = t2;
} }
} else {
// equal, we won't check byte[] and docFreq
ret = t1;
} }
if (DEBUG) System.out.println("ret:"+ret); if (DEBUG) System.out.println("ret:"+ret);
return ret; return ret;
} }
@Override @Override
// nocommit:
// this *actually* always assume that t2 <= t1 before calling the method
public TempMetaData subtract(TempMetaData t1, TempMetaData t2) { public TempMetaData subtract(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = "); if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
if (t2 == NO_OUTPUT) { if (t2 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t1); if (DEBUG) System.out.println("ret:"+t1);
return t1; return t1;
} }
assert t1.longs != null; assert t1.longs.length == t2.longs.length;
assert t2.longs != null;
int pos = 0; int pos = 0;
long diff = 0; long diff = 0;
long[] share = new long[longsSize]; // nocommit: reuse long[] share = new long[longsSize];
while (pos < longsSize) { while (pos < longsSize) {
share[pos] = t1.longs[pos] - t2.longs[pos]; share[pos] = t1.longs[pos] - t2.longs[pos];
@ -201,7 +190,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
} }
TempMetaData ret; TempMetaData ret;
if (diff == 0 && bytesEqual(t1, t2) && statsEqual(t1, t2)) { if (diff == 0 && statsEqual(t1, t2) && bytesEqual(t1, t2)) {
ret = NO_OUTPUT; ret = NO_OUTPUT;
} else { } else {
ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq); ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
@ -210,16 +199,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
return ret; return ret;
} }
static boolean statsEqual(final TempMetaData t1, final TempMetaData t2) {
return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
}
static boolean bytesEqual(final TempMetaData t1, final TempMetaData t2) {
return Arrays.equals(t1.bytes, t2.bytes);
}
@Override @Override
// nocommit: need to check all-zero case?
// so we can reuse one long[]
public TempMetaData add(TempMetaData t1, TempMetaData t2) { public TempMetaData add(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("add("+t1+", "+t2+") = "); if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
if (t1 == NO_OUTPUT) { if (t1 == NO_OUTPUT) {
@ -229,22 +209,16 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
if (DEBUG) System.out.println("ret:"+t1); if (DEBUG) System.out.println("ret:"+t1);
return t1; return t1;
} }
assert t1.longs != null; assert t1.longs.length == t2.longs.length;
assert t2.longs != null;
int pos = 0; int pos = 0;
long[] accum = new long[longsSize]; // nocommit: reuse? long[] accum = new long[longsSize];
while (pos < longsSize) { while (pos < longsSize) {
accum[pos] = t1.longs[pos] + t2.longs[pos]; accum[pos] = t1.longs[pos] + t2.longs[pos];
assert(accum[pos] >= 0);
pos++; pos++;
} }
TempMetaData ret; TempMetaData ret;
if (t2.bytes != null || t2.docFreq > 0) {
ret = new TempMetaData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq); ret = new TempMetaData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
} else {
ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret); if (DEBUG) System.out.println("ret:"+ret);
return ret; return ret;
} }
@ -263,7 +237,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
} }
if (data.docFreq > 0) { if (data.docFreq > 0) {
out.writeVInt(data.docFreq); out.writeVInt(data.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { if (hasPos) {
out.writeVLong(data.totalTermFreq - data.docFreq); out.writeVLong(data.totalTermFreq - data.docFreq);
} }
} }
@ -272,26 +246,25 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
@Override @Override
public TempMetaData read(DataInput in) throws IOException { public TempMetaData read(DataInput in) throws IOException {
long[] longs = new long[longsSize]; long[] longs = new long[longsSize];
byte[] bytes = null;
int docFreq = 0;
long totalTermFreq = -1;
for (int pos = 0; pos < longsSize; pos++) { for (int pos = 0; pos < longsSize; pos++) {
longs[pos] = in.readVLong(); longs[pos] = in.readVLong();
} }
int code = in.readVInt(); int code = in.readVInt();
int bytesSize = code >>> 1; int bytesSize = code >>> 1;
int docFreq = 0;
long totalTermFreq = -1;
byte[] bytes = null;
if (bytesSize > 0) { if (bytesSize > 0) {
bytes = new byte[bytesSize]; bytes = new byte[bytesSize];
in.readBytes(bytes, 0, bytes.length); in.readBytes(bytes, 0, bytes.length);
} }
if ((code & 1) == 1) { if ((code & 1) == 1) {
docFreq = in.readVInt(); docFreq = in.readVInt();
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { if (hasPos) {
totalTermFreq = docFreq + in.readVLong(); totalTermFreq = docFreq + in.readVLong();
} }
} }
TempMetaData meta = new TempMetaData(longs, bytes, docFreq, totalTermFreq); return new TempMetaData(longs, bytes, docFreq, totalTermFreq);
return meta;
} }
@Override @Override
@ -303,5 +276,11 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
public String outputToString(TempMetaData data) { public String outputToString(TempMetaData data) {
return data.toString(); return data.toString();
} }
}
static boolean statsEqual(final TempMetaData t1, final TempMetaData t2) {
return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
}
static boolean bytesEqual(final TempMetaData t1, final TempMetaData t2) {
return Arrays.equals(t1.bytes, t2.bytes);
}
}

View File

@ -40,7 +40,7 @@ public abstract class Outputs<T> {
// (new object per byte/char/int) if eg used during // (new object per byte/char/int) if eg used during
// analysis // analysis
/** Eg common("foo", "foobar") -> "foo" */ /** Eg common("foobar", "food") -> "foo" */
public abstract T common(T output1, T output2); public abstract T common(T output1, T output2);
/** Eg subtract("foobar", "foo") -> "bar" */ /** Eg subtract("foobar", "foo") -> "bar" */