mirror of https://github.com/apache/lucene.git
LUCENE-3069: introduce intersect() to TempFSTOrd
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1508744 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
004e4b75bc
commit
e388e20c1d
|
@ -66,7 +66,7 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
|
|||
final TempPostingsReaderBase postingsReader;
|
||||
IndexInput indexIn = null;
|
||||
IndexInput blockIn = null;
|
||||
static final boolean TEST = false;
|
||||
//static final boolean TEST = false;
|
||||
|
||||
public TempFSTOrdTermsReader(SegmentReadState state, TempPostingsReaderBase postingsReader) throws IOException {
|
||||
final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempFSTOrdTermsWriter.TERMS_INDEX_EXTENSION);
|
||||
|
@ -251,7 +251,7 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
|
|||
|
||||
@Override
|
||||
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
|
||||
return super.intersect(compiled, startTerm);
|
||||
return new IntersectTermsEnum(compiled, startTerm);
|
||||
}
|
||||
|
||||
// Only wraps common operations for PBF interact
|
||||
|
@ -297,7 +297,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
|
|||
this.totalTermFreq = new long[INTERVAL];
|
||||
this.statsBlockOrd = -1;
|
||||
this.metaBlockOrd = -1;
|
||||
|
||||
}
|
||||
|
||||
/** Decodes stats data into term state */
|
||||
|
@ -393,11 +392,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
|
|||
return state.totalTermFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ord() {
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
|
||||
decodeMetaData();
|
||||
|
@ -419,6 +413,12 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
|
|||
public void seekExact(long ord) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ord() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -495,6 +495,317 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Iterates intersect result with automaton (cannot seek!)
|
||||
private final class IntersectTermsEnum extends BaseTermsEnum {
|
||||
/* True when current term's metadata is decoded */
|
||||
boolean decoded;
|
||||
|
||||
/* True when there is pending term when calling next() */
|
||||
boolean pending;
|
||||
|
||||
/* stack to record how current term is constructed,
|
||||
* used to accumulate metadata or rewind term:
|
||||
* level == term.length + 1,
|
||||
* == 0 when term is null */
|
||||
Frame[] stack;
|
||||
int level;
|
||||
|
||||
/* term dict fst */
|
||||
final FST<Long> fst;
|
||||
final FST.BytesReader fstReader;
|
||||
final Outputs<Long> fstOutputs;
|
||||
|
||||
/* query automaton to intersect with */
|
||||
final ByteRunAutomaton fsa;
|
||||
|
||||
private final class Frame {
|
||||
/* fst stats */
|
||||
FST.Arc<Long> arc;
|
||||
|
||||
/* automaton stats */
|
||||
int state;
|
||||
|
||||
Frame() {
|
||||
this.arc = new FST.Arc<Long>();
|
||||
this.state = -1;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "arc=" + arc + " state=" + state;
|
||||
}
|
||||
}
|
||||
|
||||
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
|
||||
//if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
|
||||
this.fst = index;
|
||||
this.fstReader = fst.getBytesReader();
|
||||
this.fstOutputs = index.outputs;
|
||||
this.fsa = compiled.runAutomaton;
|
||||
/*
|
||||
PrintWriter pw1 = new PrintWriter(new File("../temp/fst.txt"));
|
||||
Util.toDot(dict,pw1, false, false);
|
||||
pw1.close();
|
||||
PrintWriter pw2 = new PrintWriter(new File("../temp/fsa.txt"));
|
||||
pw2.write(compiled.toDot());
|
||||
pw2.close();
|
||||
*/
|
||||
this.level = -1;
|
||||
this.stack = new Frame[16];
|
||||
for (int i = 0 ; i < stack.length; i++) {
|
||||
this.stack[i] = new Frame();
|
||||
}
|
||||
|
||||
Frame frame;
|
||||
frame = loadVirtualFrame(newFrame());
|
||||
this.level++;
|
||||
frame = loadFirstFrame(newFrame());
|
||||
pushFrame(frame);
|
||||
|
||||
this.decoded = false;
|
||||
this.pending = false;
|
||||
|
||||
if (startTerm == null) {
|
||||
pending = isAccept(topFrame());
|
||||
} else {
|
||||
doSeekCeil(startTerm);
|
||||
pending = !startTerm.equals(term) && isValid(topFrame()) && isAccept(topFrame());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
void decodeMetaData() throws IOException {
|
||||
if (!decoded) {
|
||||
super.decodeMetaData();
|
||||
decoded = true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
void decodeStats() throws IOException {
|
||||
final FST.Arc<Long> arc = topFrame().arc;
|
||||
if (arc.isFinal()) {
|
||||
ord = fstOutputs.add(arc.output, arc.nextFinalOutput);
|
||||
} else {
|
||||
ord = arc.output;
|
||||
}
|
||||
super.decodeStats();
|
||||
}
|
||||
|
||||
// nocommit: need testcase for this
|
||||
@Override
|
||||
public SeekStatus seekCeil(BytesRef target, boolean useCache) throws IOException {
|
||||
decoded = false;
|
||||
term = doSeekCeil(target);
|
||||
decodeStats();
|
||||
if (term == null) {
|
||||
return SeekStatus.END;
|
||||
} else {
|
||||
return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
//if (TEST) System.out.println("Enum next()");
|
||||
if (pending) {
|
||||
pending = false;
|
||||
decodeStats();
|
||||
return term;
|
||||
}
|
||||
decoded = false;
|
||||
DFS:
|
||||
while (level > 0) {
|
||||
Frame frame = newFrame();
|
||||
if (loadExpandFrame(topFrame(), frame) != null) { // has valid target
|
||||
pushFrame(frame);
|
||||
if (isAccept(frame)) { // gotcha
|
||||
break;
|
||||
}
|
||||
continue; // check next target
|
||||
}
|
||||
frame = popFrame();
|
||||
while(level > 0) {
|
||||
if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling
|
||||
pushFrame(frame);
|
||||
if (isAccept(frame)) { // gotcha
|
||||
break DFS;
|
||||
}
|
||||
continue DFS; // check next target
|
||||
}
|
||||
frame = popFrame();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
decodeStats();
|
||||
return term;
|
||||
}
|
||||
|
||||
BytesRef doSeekCeil(BytesRef target) throws IOException {
|
||||
//if (TEST) System.out.println("Enum doSeekCeil()");
|
||||
Frame frame= null;
|
||||
int label, upto = 0, limit = target.length;
|
||||
while (upto < limit) { // to target prefix, or ceil label (rewind prefix)
|
||||
frame = newFrame();
|
||||
label = target.bytes[upto] & 0xff;
|
||||
frame = loadCeilFrame(label, topFrame(), frame);
|
||||
if (frame == null || frame.arc.label != label) {
|
||||
break;
|
||||
}
|
||||
assert isValid(frame); // target must be fetched from automaton
|
||||
pushFrame(frame);
|
||||
upto++;
|
||||
}
|
||||
if (upto == limit) { // got target
|
||||
return term;
|
||||
}
|
||||
if (frame != null) { // got larger term('s prefix)
|
||||
pushFrame(frame);
|
||||
return isAccept(frame) ? term : next();
|
||||
}
|
||||
while (level > 0) { // got target's prefix, advance to larger term
|
||||
frame = popFrame();
|
||||
while (level > 0 && !canRewind(frame)) {
|
||||
frame = popFrame();
|
||||
}
|
||||
if (loadNextFrame(topFrame(), frame) != null) {
|
||||
pushFrame(frame);
|
||||
return isAccept(frame) ? term : next();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Virtual frame, never pop */
|
||||
Frame loadVirtualFrame(Frame frame) throws IOException {
|
||||
frame.arc.output = fstOutputs.getNoOutput();
|
||||
frame.arc.nextFinalOutput = fstOutputs.getNoOutput();
|
||||
frame.state = -1;
|
||||
return frame;
|
||||
}
|
||||
|
||||
/** Load frame for start arc(node) on fst */
|
||||
Frame loadFirstFrame(Frame frame) throws IOException {
|
||||
frame.arc = fst.getFirstArc(frame.arc);
|
||||
frame.state = fsa.getInitialState();
|
||||
return frame;
|
||||
}
|
||||
|
||||
// nocommit: expected to use readFirstTargetArc here?
|
||||
|
||||
/** Load frame for target arc(node) on fst */
|
||||
Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
|
||||
if (!canGrow(top)) {
|
||||
return null;
|
||||
}
|
||||
frame.arc = fst.readFirstRealTargetArc(top.arc.target, frame.arc, fstReader);
|
||||
frame.state = fsa.step(top.state, frame.arc.label);
|
||||
//if (TEST) System.out.println(" loadExpand frame="+frame);
|
||||
if (frame.state == -1) {
|
||||
return loadNextFrame(top, frame);
|
||||
}
|
||||
return frame;
|
||||
}
|
||||
|
||||
/** Load frame for sibling arc(node) on fst */
|
||||
Frame loadNextFrame(Frame top, Frame frame) throws IOException {
|
||||
if (!canRewind(frame)) {
|
||||
return null;
|
||||
}
|
||||
while (!frame.arc.isLast()) {
|
||||
frame.arc = fst.readNextRealArc(frame.arc, fstReader);
|
||||
frame.state = fsa.step(top.state, frame.arc.label);
|
||||
if (frame.state != -1) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
//if (TEST) System.out.println(" loadNext frame="+frame);
|
||||
if (frame.state == -1) {
|
||||
return null;
|
||||
}
|
||||
return frame;
|
||||
}
|
||||
|
||||
/** Load frame for target arc(node) on fst, so that
|
||||
* arc.label >= label and !fsa.reject(arc.label) */
|
||||
Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException {
|
||||
FST.Arc<Long> arc = frame.arc;
|
||||
arc = Util.readCeilArc(label, fst, top.arc, arc, fstReader);
|
||||
if (arc == null) {
|
||||
return null;
|
||||
}
|
||||
frame.state = fsa.step(top.state, arc.label);
|
||||
//if (TEST) System.out.println(" loadCeil frame="+frame);
|
||||
if (frame.state == -1) {
|
||||
return loadNextFrame(top, frame);
|
||||
}
|
||||
return frame;
|
||||
}
|
||||
|
||||
boolean isAccept(Frame frame) { // reach a term both fst&fsa accepts
|
||||
return fsa.isAccept(frame.state) && frame.arc.isFinal();
|
||||
}
|
||||
boolean isValid(Frame frame) { // reach a prefix both fst&fsa won't reject
|
||||
return /*frame != null &&*/ frame.state != -1;
|
||||
}
|
||||
boolean canGrow(Frame frame) { // can walk forward on both fst&fsa
|
||||
return frame.state != -1 && FST.targetHasArcs(frame.arc);
|
||||
}
|
||||
boolean canRewind(Frame frame) { // can jump to sibling
|
||||
return !frame.arc.isLast();
|
||||
}
|
||||
|
||||
// nocommit: need to load ord lazily?
|
||||
void pushFrame(Frame frame) {
|
||||
final FST.Arc<Long> arc = frame.arc;
|
||||
arc.output = fstOutputs.add(topFrame().arc.output, arc.output);
|
||||
term = grow(arc.label);
|
||||
level++;
|
||||
}
|
||||
|
||||
Frame popFrame() {
|
||||
term = shrink();
|
||||
level--;
|
||||
return stack[level+1];
|
||||
}
|
||||
|
||||
Frame newFrame() {
|
||||
if (level+1 == stack.length) {
|
||||
final Frame[] temp = new Frame[ArrayUtil.oversize(level+2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
System.arraycopy(stack, 0, temp, 0, stack.length);
|
||||
for (int i = stack.length; i < temp.length; i++) {
|
||||
temp[i] = new Frame();
|
||||
}
|
||||
stack = temp;
|
||||
}
|
||||
return stack[level+1];
|
||||
}
|
||||
|
||||
Frame topFrame() {
|
||||
return stack[level];
|
||||
}
|
||||
|
||||
BytesRef grow(int label) {
|
||||
if (term == null) {
|
||||
term = new BytesRef(new byte[16], 0, 0);
|
||||
} else {
|
||||
if (term.length == term.bytes.length) {
|
||||
term.grow(term.length+1);
|
||||
}
|
||||
term.bytes[term.length++] = (byte)label;
|
||||
}
|
||||
return term;
|
||||
}
|
||||
|
||||
BytesRef shrink() {
|
||||
if (term.length == 0) {
|
||||
term = null;
|
||||
} else {
|
||||
term.length--;
|
||||
}
|
||||
return term;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static<T> void walk(FST<T> fst) throws IOException {
|
||||
|
|
Loading…
Reference in New Issue