From 762272e48ae5b2392bd9f711f42f9ea1bdbc2731 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 9 Feb 2011 16:10:00 +0000 Subject: [PATCH] resolve TODO: run the dfas backwards git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1068957 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/search/FuzzyTermsEnum.java | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index b9694d10aa2..655630954f8 100644 --- a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -325,22 +325,26 @@ public final class FuzzyTermsEnum extends TermsEnum { /** finds the smallest Lev(n) DFA that accepts the term. */ @Override - protected AcceptStatus accept(BytesRef term) { - if (term.equals(termRef)) { // ed = 0 - boostAtt.setBoost(1.0F); - return AcceptStatus.YES_AND_SEEK; - } + protected AcceptStatus accept(BytesRef term) { + int ed = matchers.length - 1; - int codePointCount = -1; - - // TODO: benchmark doing this backwards - for (int i = 1; i < matchers.length; i++) - if (matchers[i].run(term.bytes, term.offset, term.length)) { - // this sucks, we convert just to score based on length. - if (codePointCount == -1) { - codePointCount = UnicodeUtil.codePointCount(term); + if (matches(term, ed)) { // we match the outer dfa + // now compute exact edit distance + while (ed > 0) { + if (matches(term, ed - 1)) { + ed--; + } else { + break; } - final float similarity = 1.0f - ((float) i / (float) + } + + // scale to a boost and return (if similarity > minSimilarity) + if (ed == 0) { // exact match + boostAtt.setBoost(1.0F); + return AcceptStatus.YES_AND_SEEK; + } else { + final int codePointCount = UnicodeUtil.codePointCount(term); + final float similarity = 1.0f - ((float) ed / (float) (Math.min(codePointCount, termLength))); if (similarity > minSimilarity) { boostAtt.setBoost((similarity - minSimilarity) * scale_factor); @@ -349,8 +353,14 @@ public final class FuzzyTermsEnum extends TermsEnum { return AcceptStatus.NO_AND_SEEK; } } - - return AcceptStatus.NO_AND_SEEK; + } else { + return AcceptStatus.NO_AND_SEEK; + } + } + + /** returns true if term is within k edits of the query term */ + final boolean matches(BytesRef term, int k) { + return k == 0 ? term.equals(termRef) : matchers[k].run(term.bytes, term.offset, term.length); } /** defers to superclass, except can start at an arbitrary location */