mirror of https://github.com/apache/lucene.git
LUCENE-3069: reader part, update logic in outputs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1500814 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9f6db24cee
commit
fb794540b5
|
@ -46,7 +46,6 @@ import org.apache.lucene.codecs.CodecUtil;
|
|||
/** FST based term dict, all the metadata held
|
||||
* as output of FST */
|
||||
|
||||
// nocommit: where is 'TermStats' ???
|
||||
public class TempFSTTermsWriter extends FieldsConsumer {
|
||||
static final String TERMS_EXTENSION = "tmp";
|
||||
static final String TERMS_CODEC_NAME = "FST_TERMS_DICT";
|
||||
|
|
|
@ -33,7 +33,7 @@ import org.apache.lucene.util.LongsRef;
|
|||
public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
|
||||
private final static TempMetaData NO_OUTPUT = new TempMetaData();
|
||||
private static boolean DEBUG = false;
|
||||
private FieldInfo fieldInfo;
|
||||
private boolean hasPos;
|
||||
private int longsSize;
|
||||
|
||||
public static class TempMetaData {
|
||||
|
@ -104,23 +104,26 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
|
|||
}
|
||||
|
||||
protected TempTermOutputs(FieldInfo fieldInfo, int longsSize) {
|
||||
this.fieldInfo = fieldInfo;
|
||||
this.hasPos = (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY);
|
||||
this.longsSize = longsSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
//
|
||||
// Since longs blob is fixed length, when these two are 'comparable'
|
||||
// i.e. when every value in long[] fits the same ordering, the smaller one
|
||||
// will be the result.
|
||||
// The return value will be the smaller one, when these two are
|
||||
// 'comparable', i.e. every value in long[] fits the same ordering.
|
||||
//
|
||||
// NOTE: only long[] is 'shared', i.e. if there are two byte[] on the successive
|
||||
// arcs, only the last byte[] is valid. (this somewhat saves nodes, but might affect
|
||||
// compression, since we'll have to load metadata block for other terms as well, currently,
|
||||
// we don't support this)
|
||||
// NOTE:
|
||||
// Only long[] is 'shared', byte[] and term stats simply act
|
||||
// as 'attachment': when walking on the FST, if we see two byte[] on
|
||||
// successive arcs, only the second byte[] is valid.
|
||||
//
|
||||
// nocommit: get the byte[] from smaller one as well, so that
|
||||
// byte[] is actually inherited
|
||||
// Therefore, during building, we always make sure that, for most nodes,
|
||||
// the first output is 'pushed' one step towards root and reduced to
|
||||
// be NO_OUTPUT, so that we get rid of the 'all zero' long[], and, on net,
|
||||
// get a smaller total number of outputs.
|
||||
//
|
||||
// However, when decoding, terms might have to load a redundant byte[] blob.
|
||||
//
|
||||
public TempMetaData common(TempMetaData t1, TempMetaData t2) {
|
||||
if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
|
||||
|
@ -128,14 +131,11 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
|
|||
if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
|
||||
return NO_OUTPUT;
|
||||
}
|
||||
assert t1.longs != null;
|
||||
assert t2.longs != null;
|
||||
assert t1.longs.length == t2.longs.length;
|
||||
|
||||
long accum = 0;
|
||||
long[] longs1 = t1.longs, longs2 = t2.longs;
|
||||
int pos = 0;
|
||||
boolean order = true;
|
||||
boolean smaller = true;
|
||||
TempMetaData ret;
|
||||
|
||||
while (pos < longsSize && longs1[pos] == longs2[pos]) {
|
||||
|
@ -143,56 +143,45 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
|
|||
}
|
||||
if (pos < longsSize) {
|
||||
// unequal
|
||||
order = (longs1[pos] > longs2[pos]);
|
||||
if (order) {
|
||||
// check whether strictly longs1 >= longs2
|
||||
while (pos < longsSize && longs1[pos] >= longs2[pos]) {
|
||||
accum += longs2[pos];
|
||||
pos++;
|
||||
}
|
||||
} else {
|
||||
smaller = (longs1[pos] < longs2[pos]);
|
||||
if (smaller) {
|
||||
// check whether strictly longs1 <= longs2
|
||||
while (pos < longsSize && longs1[pos] <= longs2[pos]) {
|
||||
accum += longs1[pos];
|
||||
pos++;
|
||||
}
|
||||
} else {
|
||||
// check whether strictly longs1 >= longs2
|
||||
while (pos < longsSize && longs1[pos] >= longs2[pos]) {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
if (pos < longsSize || accum == 0) {
|
||||
if (pos < longsSize) { // not fully 'comparable'
|
||||
ret = NO_OUTPUT;
|
||||
} else if (order) {
|
||||
ret = new TempMetaData(longs2, null, 0, -1);
|
||||
} else if (smaller) {
|
||||
ret = t1;
|
||||
} else {
|
||||
ret = new TempMetaData(longs1, null, 0, -1);
|
||||
ret = t2;
|
||||
}
|
||||
} else {
|
||||
// equal
|
||||
if (t1.bytes!= null && bytesEqual(t1, t2) && statsEqual(t1, t2)) { // all fields are equal
|
||||
ret = t1;
|
||||
} else if (accum == 0) { // all zero case
|
||||
ret = NO_OUTPUT;
|
||||
} else {
|
||||
ret = new TempMetaData(longs1, null, 0, -1);
|
||||
}
|
||||
// equal, we won't check byte[] and docFreq
|
||||
ret = t1;
|
||||
}
|
||||
if (DEBUG) System.out.println("ret:"+ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
// nocommit:
|
||||
// this *actually* always assumes that t2 <= t1 before calling the method
|
||||
public TempMetaData subtract(TempMetaData t1, TempMetaData t2) {
|
||||
if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
|
||||
if (t2 == NO_OUTPUT) {
|
||||
if (DEBUG) System.out.println("ret:"+t1);
|
||||
return t1;
|
||||
}
|
||||
assert t1.longs != null;
|
||||
assert t2.longs != null;
|
||||
assert t1.longs.length == t2.longs.length;
|
||||
|
||||
int pos = 0;
|
||||
long diff = 0;
|
||||
long[] share = new long[longsSize]; // nocommit: reuse
|
||||
long[] share = new long[longsSize];
|
||||
|
||||
while (pos < longsSize) {
|
||||
share[pos] = t1.longs[pos] - t2.longs[pos];
|
||||
|
@ -201,7 +190,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
|
|||
}
|
||||
|
||||
TempMetaData ret;
|
||||
if (diff == 0 && bytesEqual(t1, t2) && statsEqual(t1, t2)) {
|
||||
if (diff == 0 && statsEqual(t1, t2) && bytesEqual(t1, t2)) {
|
||||
ret = NO_OUTPUT;
|
||||
} else {
|
||||
ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
|
||||
|
@ -210,16 +199,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
|
|||
return ret;
|
||||
}
|
||||
|
||||
static boolean statsEqual(final TempMetaData t1, final TempMetaData t2) {
|
||||
return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
|
||||
}
|
||||
static boolean bytesEqual(final TempMetaData t1, final TempMetaData t2) {
|
||||
return Arrays.equals(t1.bytes, t2.bytes);
|
||||
}
|
||||
|
||||
@Override
|
||||
// nocommit: need to check all-zero case?
|
||||
// so we can reuse one long[]
|
||||
public TempMetaData add(TempMetaData t1, TempMetaData t2) {
|
||||
if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
|
||||
if (t1 == NO_OUTPUT) {
|
||||
|
@ -229,22 +209,16 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
|
|||
if (DEBUG) System.out.println("ret:"+t1);
|
||||
return t1;
|
||||
}
|
||||
assert t1.longs != null;
|
||||
assert t2.longs != null;
|
||||
assert t1.longs.length == t2.longs.length;
|
||||
|
||||
int pos = 0;
|
||||
long[] accum = new long[longsSize]; // nocommit: reuse?
|
||||
long[] accum = new long[longsSize];
|
||||
while (pos < longsSize) {
|
||||
accum[pos] = t1.longs[pos] + t2.longs[pos];
|
||||
assert(accum[pos] >= 0);
|
||||
pos++;
|
||||
}
|
||||
TempMetaData ret;
|
||||
if (t2.bytes != null || t2.docFreq > 0) {
|
||||
ret = new TempMetaData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
|
||||
} else {
|
||||
ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
|
||||
}
|
||||
ret = new TempMetaData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
|
||||
if (DEBUG) System.out.println("ret:"+ret);
|
||||
return ret;
|
||||
}
|
||||
|
@ -263,7 +237,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
|
|||
}
|
||||
if (data.docFreq > 0) {
|
||||
out.writeVInt(data.docFreq);
|
||||
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
||||
if (hasPos) {
|
||||
out.writeVLong(data.totalTermFreq - data.docFreq);
|
||||
}
|
||||
}
|
||||
|
@ -272,26 +246,25 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
|
|||
@Override
|
||||
public TempMetaData read(DataInput in) throws IOException {
|
||||
long[] longs = new long[longsSize];
|
||||
byte[] bytes = null;
|
||||
int docFreq = 0;
|
||||
long totalTermFreq = -1;
|
||||
for (int pos = 0; pos < longsSize; pos++) {
|
||||
longs[pos] = in.readVLong();
|
||||
}
|
||||
int code = in.readVInt();
|
||||
int bytesSize = code >>> 1;
|
||||
int docFreq = 0;
|
||||
long totalTermFreq = -1;
|
||||
byte[] bytes = null;
|
||||
if (bytesSize > 0) {
|
||||
bytes = new byte[bytesSize];
|
||||
in.readBytes(bytes, 0, bytes.length);
|
||||
}
|
||||
if ((code & 1) == 1) {
|
||||
docFreq = in.readVInt();
|
||||
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
||||
if (hasPos) {
|
||||
totalTermFreq = docFreq + in.readVLong();
|
||||
}
|
||||
}
|
||||
TempMetaData meta = new TempMetaData(longs, bytes, docFreq, totalTermFreq);
|
||||
return meta;
|
||||
return new TempMetaData(longs, bytes, docFreq, totalTermFreq);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -303,5 +276,11 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
|
|||
public String outputToString(TempMetaData data) {
|
||||
return data.toString();
|
||||
}
|
||||
|
||||
static boolean statsEqual(final TempMetaData t1, final TempMetaData t2) {
|
||||
return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
|
||||
}
|
||||
static boolean bytesEqual(final TempMetaData t1, final TempMetaData t2) {
|
||||
return Arrays.equals(t1.bytes, t2.bytes);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -40,7 +40,7 @@ public abstract class Outputs<T> {
|
|||
// (new object per byte/char/int) if eg used during
|
||||
// analysis
|
||||
|
||||
/** Eg common("foo", "foobar") -> "foo" */
|
||||
/** Eg common("foobar", "food") -> "foo" */
|
||||
public abstract T common(T output1, T output2);
|
||||
|
||||
/** Eg subtract("foobar", "foo") -> "bar" */
|
||||
|
|
Loading…
Reference in New Issue