LUCENE-2995: factor out a shared spellchecking module

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1126642 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-05-23 18:33:53 +00:00
parent 3be9e4b90a
commit f5048293b5
61 changed files with 464 additions and 238 deletions

View File

@ -20,8 +20,6 @@
<classpathentry kind="src" path="lucene/contrib/queryparser/src/test"/>
<classpathentry kind="src" path="lucene/contrib/spatial/src/java"/>
<classpathentry kind="src" path="lucene/contrib/spatial/src/test"/>
<classpathentry kind="src" path="lucene/contrib/spellchecker/src/java"/>
<classpathentry kind="src" path="lucene/contrib/spellchecker/src/test"/>
<classpathentry kind="src" path="lucene/contrib/wordnet/src/java"/>
<classpathentry kind="src" path="lucene/contrib/wordnet/src/test"/>
<classpathentry kind="src" path="lucene/contrib/xml-query-parser/src/java"/>
@ -44,6 +42,8 @@
<classpathentry kind="src" path="modules/benchmark/src/test"/>
<classpathentry kind="src" path="modules/grouping/src/java"/>
<classpathentry kind="src" path="modules/grouping/src/test"/>
<classpathentry kind="src" path="modules/suggest/src/java"/>
<classpathentry kind="src" path="modules/suggest/src/test"/>
<classpathentry kind="src" path="solr/src/java"/>
<classpathentry kind="src" path="solr/src/webapp/src"/>
<classpathentry kind="src" path="solr/src/common"/>

View File

@ -227,7 +227,6 @@
<packageset dir="contrib/misc/src/java"/>
<packageset dir="contrib/queries/src/java"/>
<packageset dir="contrib/spatial/src/java"/>
<packageset dir="contrib/spellchecker/src/java"/>
<packageset dir="contrib/wordnet/src/java"/>
<packageset dir="contrib/xml-query-parser/src/java"/>
<packageset dir="contrib/queryparser/src/java"/>
@ -248,7 +247,6 @@
<group title="contrib: Queries" packages="org.apache.lucene.search.similar*:org.apache.lucene.search.regex*:org.apache.regexp*"/>
<group title="contrib: Query Parser" packages="org.apache.lucene.queryParser.*"/>
<group title="contrib: Spatial" packages="org.apache.lucene.spatial*"/>
<group title="contrib: SpellChecker" packages="org.apache.lucene.search.spell*"/>
<group title="contrib: WordNet" packages="org.apache.lucene.wordnet*"/>
<group title="contrib: XML Query Parser" packages="org.apache.lucene.xmlparser*"/>

View File

@ -6,6 +6,8 @@ Build
* LUCENE-2845: Moved contrib/benchmark to modules.
* LUCENE-2995: Moved contrib/spellchecker into modules/suggest.
New Features
* LUCENE-2604: Added RegexpQuery support to contrib/queryparser.

View File

@ -25,6 +25,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
</target>
@ -35,6 +36,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
</target>
@ -45,6 +47,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
</target>
@ -55,6 +58,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
</target>
@ -66,6 +70,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
</target>
@ -96,6 +101,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
</target>

View File

@ -17,13 +17,17 @@
limitations under the License.
-->
<project name="spellchecker" default="default">
<project name="suggest" default="default">
<description>
Spell Checker
Suggest
</description>
<property name="build.dir" location="build/" />
<property name="dist.dir" location="dist/" />
<property name="maven.dist.dir" location="../dist/maven" />
<import file="../contrib-build.xml"/>
<import file="../../lucene/contrib/contrib-build.xml"/>
<module-uptodate name="analysis/common" jarfile="${common.dir}/../modules/analysis/build/common/lucene-analyzers-common-${version}.jar"
property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>

View File

@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.solr.util;
package org.apache.lucene.search.spell;
import java.io.IOException;
import java.util.Iterator;
@ -49,7 +49,7 @@ public class HighFrequencyDictionary implements Dictionary {
this.thresh = thresh;
}
public final Iterator getWordsIterator() {
public final Iterator<String> getWordsIterator() {
return new HighFrequencyIterator();
}

View File

@ -1,4 +1,4 @@
package org.apache.solr.util;
package org.apache.lucene.search.spell;
import java.util.Iterator;

View File

@ -1,4 +1,4 @@
package org.apache.solr.util;
package org.apache.lucene.search.spell;
import java.util.Iterator;
@ -7,9 +7,9 @@ public interface TermFreqIterator extends Iterator<String> {
public float freq();
public static class TermFreqIteratorWrapper implements TermFreqIterator {
private Iterator wrapped;
private Iterator<String> wrapped;
public TermFreqIteratorWrapper(Iterator wrapped) {
public TermFreqIteratorWrapper(Iterator<String> wrapped) {
this.wrapped = wrapped;
}

View File

@ -1,10 +1,10 @@
package org.apache.solr.spelling.suggest;
package org.apache.lucene.search.suggest;
import java.util.ArrayList;
import java.util.List;
import org.apache.solr.util.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqIterator;
/**
* This wrapper buffers incoming elements.

View File

@ -1,4 +1,4 @@
package org.apache.solr.spelling.suggest;
package org.apache.lucene.search.suggest;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -21,7 +21,7 @@ package org.apache.solr.spelling.suggest;
import java.io.*;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.solr.util.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqIterator;
/**

View File

@ -1,4 +1,4 @@
package org.apache.solr.spelling.suggest;
package org.apache.lucene.search.suggest;
import java.io.File;
import java.io.IOException;
@ -6,10 +6,8 @@ import java.util.Iterator;
import java.util.List;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.PriorityQueue;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.util.TermFreqIterator;
public abstract class Lookup {
/**
@ -56,9 +54,6 @@ public abstract class Lookup {
}
}
/** Initialize the lookup. */
public abstract void init(NamedList config, SolrCore core);
/** Build lookup from a dictionary. Some implementations may require sorted
* or unsorted keys from the dictionary's iterator - use
* {@link SortedTermFreqIteratorWrapper} or
@ -75,7 +70,7 @@ public abstract class Lookup {
build(tfit);
}
protected abstract void build(TermFreqIterator tfit) throws IOException;
public abstract void build(TermFreqIterator tfit) throws IOException;
/**
* Persist the constructed lookup data to a directory. Optional operation.

View File

@ -1,9 +1,9 @@
package org.apache.solr.spelling.suggest;
package org.apache.lucene.search.suggest;
import java.util.Collections;
import org.apache.solr.util.SortedIterator;
import org.apache.solr.util.TermFreqIterator;
import org.apache.lucene.search.spell.SortedIterator;
import org.apache.lucene.search.spell.TermFreqIterator;
/**
* This wrapper buffers incoming elements and makes sure they are sorted in

View File

@ -1,8 +1,8 @@
package org.apache.solr.spelling.suggest;
package org.apache.lucene.search.suggest;
import java.util.Collections;
import org.apache.solr.util.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqIterator;
/**
* This wrapper buffers the incoming elements and makes sure they are in

View File

@ -1,4 +1,4 @@
package org.apache.solr.spelling.suggest.fst;
package org.apache.lucene.search.suggest.fst;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
@ -13,20 +13,17 @@ import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.FST.Arc;
import org.apache.lucene.util.automaton.fst.NoOutputs;
import org.apache.lucene.util.automaton.fst.Outputs;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.Lookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.apache.solr.util.TermFreqIterator;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.search.spell.TermFreqIterator;
/**
* Finite state automata based implementation of {@link Lookup} query
@ -93,6 +90,16 @@ import com.google.common.io.Closeables;
* nothing else.
*/
public class FSTLookup extends Lookup {
/**
 * Creates a lookup with default settings: delegates to
 * {@link #FSTLookup(int, boolean)} with <code>10</code> weight buckets and
 * <code>exactMatchFirst</code> set to <code>true</code>.
 */
public FSTLookup() {
this(10, true);
}
/**
 * Creates a lookup with explicit settings.
 *
 * @param buckets
 *          The number of separate buckets for weights (discretization).
 *          Must be within the [1, 255] range.
 * @param exactMatchFirst
 *          If <code>true</code>, exact suggestions are returned first, even
 *          if they are prefixes of other strings in the automaton (possibly
 *          with larger weights).
 * @throws IllegalArgumentException
 *           if <code>buckets</code> is outside the [1, 255] range.
 */
public FSTLookup(int buckets, boolean exactMatchFirst) {
  // Fail fast on a value outside the documented range instead of letting
  // it surface later in the lifetime of the instance.
  if (buckets < 1 || buckets > 255) {
    throw new IllegalArgumentException(
        "Buckets must be >= 1 and <= 255: " + buckets);
  }
  this.buckets = buckets;
  this.exactMatchFirst = exactMatchFirst;
}
/** A structure for a single entry (for sorting/ preprocessing). */
private static class Entry {
char [] term;
@ -104,6 +111,12 @@ public class FSTLookup extends Lookup {
}
}
/** Serialized automaton file name (storage). */
public static final String FILENAME = "fst.dat";
/** An empty result. */
private static final List<LookupResult> EMPTY_RESULT = Collections.emptyList();
/**
* The number of separate buckets for weights (discretization). The more buckets,
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
@ -113,29 +126,13 @@ public class FSTLookup extends Lookup {
*
* <p>The number of buckets must be within [1, 255] range.
*/
public static final String WEIGHT_BUCKETS = "weightBuckets";
private final int buckets;
/**
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
* of other strings in the automaton (possibly with larger weights).
*/
public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
/** Serialized automaton file name (storage). */
public static final String FILENAME = "fst.dat";
/** An empty result. */
private static final List<LookupResult> EMPTY_RESULT = Lists.newArrayList();
/**
* @see #WEIGHT_BUCKETS
*/
private int buckets = 10;
/**
* #see #EXACT_MATCH_FIRST
*/
private boolean exactMatchFirst = true;
private final boolean exactMatchFirst;
/**
* Finite state automaton encoding all the lookup terms. See class
@ -149,25 +146,12 @@ public class FSTLookup extends Lookup {
*/
private Arc<Object> [] rootArcs;
/* */
@Override
@SuppressWarnings("rawtypes")
public void init(NamedList config, SolrCore core) {
this.buckets = config.get(WEIGHT_BUCKETS) != null
? Integer.parseInt(config.get(WEIGHT_BUCKETS).toString())
: 10;
this.exactMatchFirst = config.get(EXACT_MATCH_FIRST) != null
? Boolean.valueOf(config.get(EXACT_MATCH_FIRST).toString())
: true;
}
/* */
@Override
public void build(TermFreqIterator tfit) throws IOException {
// Buffer the input because we will need it twice: for calculating
// weights distribution and for the actual automata building.
List<Entry> entries = Lists.newArrayList();
List<Entry> entries = new ArrayList<Entry>();
while (tfit.hasNext()) {
String term = tfit.next();
char [] termChars = new char [term.length() + 1]; // add padding for weight.
@ -200,7 +184,7 @@ public class FSTLookup extends Lookup {
@SuppressWarnings("unchecked")
private void cacheRootArcs() throws IOException {
if (automaton != null) {
List<Arc<Object>> rootArcs = Lists.newArrayList();
List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
automaton.readFirstTargetArc(arc, arc);
while (true) {
@ -312,7 +296,7 @@ public class FSTLookup extends Lookup {
// Sort and trim.
Collections.sort(res, new Comparator<LookupResult>() {
@Override
// not till java6 @Override
public int compare(LookupResult o1, LookupResult o2) {
return o1.key.compareTo(o2.key);
}
@ -526,7 +510,7 @@ public class FSTLookup extends Lookup {
this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
cacheRootArcs();
} finally {
Closeables.closeQuietly(is);
IOUtils.closeSafely(is);
}
return true;
}
@ -548,7 +532,7 @@ public class FSTLookup extends Lookup {
try {
this.automaton.save(new OutputStreamDataOutput(os));
} finally {
Closeables.closeQuietly(os);
IOUtils.closeSafely(os);
}
return true;

View File

@ -1,10 +1,9 @@
package org.apache.solr.spelling.suggest.fst;
package org.apache.lucene.search.suggest.fst;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.store.DataInput;
import com.google.common.io.ByteStreams;
/**
* A {@link DataInput} wrapping a plain {@link InputStream}.
@ -26,6 +25,8 @@ public class InputStreamDataInput extends DataInput {
/**
 * Reads exactly <code>len</code> bytes from the wrapped stream into
 * <code>b</code>, starting at <code>offset</code>.
 *
 * @throws EOFException
 *           if the stream ends before <code>len</code> bytes were read.
 * @throws IOException
 *           if the underlying stream fails.
 */
@Override
public void readBytes(byte[] b, int offset, int len) throws IOException {
  // InputStream.read may legally return fewer bytes than requested, so a
  // single call cannot be compared against len: keep reading until the
  // request is satisfied and only treat -1 (end of stream) as EOF.
  while (len > 0) {
    final int cnt = is.read(b, offset, len);
    if (cnt < 0) {
      throw new EOFException();
    }
    offset += cnt;
    len -= cnt;
  }
}
}

View File

@ -1,4 +1,4 @@
package org.apache.solr.spelling.suggest.fst;
package org.apache.lucene.search.suggest.fst;
import java.io.IOException;
import java.io.OutputStream;

View File

@ -1,4 +1,4 @@
package org.apache.solr.spelling.suggest.jaspell;
package org.apache.lucene.search.suggest.jaspell;
import java.io.DataInputStream;
import java.io.DataOutputStream;
@ -9,27 +9,17 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.Lookup;
import org.apache.solr.spelling.suggest.UnsortedTermFreqIteratorWrapper;
import org.apache.solr.spelling.suggest.jaspell.JaspellTernarySearchTrie.TSTNode;
import org.apache.solr.util.SortedIterator;
import org.apache.solr.util.TermFreqIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.lucene.search.spell.SortedIterator;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.UnsortedTermFreqIteratorWrapper;
import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie.TSTNode;
public class JaspellLookup extends Lookup {
private static final Logger LOG = LoggerFactory.getLogger(JaspellLookup.class);
JaspellTernarySearchTrie trie = new JaspellTernarySearchTrie();
private boolean usePrefix = true;
private int editDistance = 2;
@Override
public void init(NamedList config, SolrCore core) {
LOG.info("init: " + config);
}
@Override
public void build(TermFreqIterator tfit) throws IOException {
if (tfit instanceof SortedIterator) {

View File

@ -1,4 +1,4 @@
package org.apache.solr.spelling.suggest.jaspell;
package org.apache.lucene.search.suggest.jaspell;
/**
* Copyright (c) 2005 Bruno Martins

View File

@ -1,4 +1,4 @@
package org.apache.solr.spelling.suggest.tst;
package org.apache.lucene.search.suggest.tst;
import java.util.*;

View File

@ -1,4 +1,4 @@
package org.apache.solr.spelling.suggest.tst;
package org.apache.lucene.search.suggest.tst;
import java.io.DataInputStream;
import java.io.DataOutputStream;
@ -9,21 +9,15 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.Lookup;
import org.apache.solr.spelling.suggest.SortedTermFreqIteratorWrapper;
import org.apache.solr.util.SortedIterator;
import org.apache.solr.util.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
import org.apache.lucene.search.spell.SortedIterator;
import org.apache.lucene.search.spell.TermFreqIterator;
public class TSTLookup extends Lookup {
TernaryTreeNode root = new TernaryTreeNode();
TSTAutocomplete autocomplete = new TSTAutocomplete();
@Override
public void init(NamedList config, SolrCore core) {
}
@Override
public void build(TermFreqIterator tfit) throws IOException {
root = new TernaryTreeNode();

View File

@ -1,4 +1,4 @@
package org.apache.solr.spelling.suggest.tst;
package org.apache.lucene.search.suggest.tst;
/**
* The class creates a TST node.

View File

@ -1,4 +1,22 @@
package org.apache.solr.spelling.suggest;
package org.apache.lucene.search.suggest;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.List;
import java.util.Locale;

View File

@ -1,34 +1,51 @@
package org.apache.solr.spelling.suggest;
package org.apache.lucene.search.suggest;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.concurrent.Callable;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.solr.spelling.suggest.fst.FSTLookup;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.junit.Assert;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
/**
* Benchmark tests for {@link Lookup} implementations.
*/
@Ignore // COMMENT ME TO RUN BENCHMARKS!
public class LookupBenchmarkTest {
@Ignore("COMMENT ME TO RUN BENCHMARKS!")
public class LookupBenchmarkTest extends LuceneTestCase {
@SuppressWarnings("unchecked")
private final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList(
private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
JaspellLookup.class,
TSTLookup.class,
FSTLookup.class);
@ -63,28 +80,32 @@ public class LookupBenchmarkTest {
LookupBenchmarkTest.benchmarkInput = input;
}
static final Charset UTF_8 = Charset.forName("UTF-8");
/**
* Collect the multilingual input for benchmarks/ tests.
*/
public static List<TermFreq> readTop50KWiki() throws Exception {
List<TermFreq> input = Lists.newArrayList();
URL resource = Thread.currentThread().getContextClassLoader().getResource("Top50KWiki.utf8");
List<TermFreq> input = new ArrayList<TermFreq>();
URL resource = LookupBenchmarkTest.class.getResource("Top50KWiki.utf8");
assert resource != null : "Resource missing: Top50KWiki.utf8";
for (String line : Resources.readLines(resource, Charsets.UTF_8)) {
String line = null;
BufferedReader br = new BufferedReader(new InputStreamReader(resource.openStream(), UTF_8));
while ((line = br.readLine()) != null) {
int tab = line.indexOf('|');
Assert.assertTrue("No | separator?: " + line, tab >= 0);
assertTrue("No | separator?: " + line, tab >= 0);
float weight = Float.parseFloat(line.substring(tab + 1));
String key = line.substring(0, tab);
input.add(new TermFreq(key, weight));
}
br.close();
return input;
}
/**
* Test construction time.
*/
@Test
public void testConstructionTime() throws Exception {
System.err.println("-- construction time");
for (final Class<? extends Lookup> cls : benchmarkClasses) {
@ -106,7 +127,6 @@ public class LookupBenchmarkTest {
/**
* Test memory required for the storage.
*/
@Test
public void testStorageNeeds() throws Exception {
System.err.println("-- RAM consumption");
final RamUsageEstimator rue = new RamUsageEstimator();
@ -131,7 +151,6 @@ public class LookupBenchmarkTest {
/**
* Test performance of lookup on full hits.
*/
@Test
public void testPerformanceOnFullHits() throws Exception {
final int minPrefixLen = 100;
final int maxPrefixLen = 200;
@ -141,7 +160,6 @@ public class LookupBenchmarkTest {
/**
* Test performance of lookup on longer term prefixes (6-9 letters or shorter).
*/
@Test
public void testPerformanceOnPrefixes6_9() throws Exception {
final int minPrefixLen = 6;
final int maxPrefixLen = 9;
@ -151,7 +169,6 @@ public class LookupBenchmarkTest {
/**
* Test performance of lookup on short term prefixes (2-4 letters or shorter).
*/
@Test
public void testPerformanceOnPrefixes2_4() throws Exception {
final int minPrefixLen = 2;
final int maxPrefixLen = 4;
@ -170,12 +187,11 @@ public class LookupBenchmarkTest {
for (Class<? extends Lookup> cls : benchmarkClasses) {
final Lookup lookup = buildLookup(cls, dictionaryInput);
final List<String> input = Lists.newArrayList(Iterables.transform(benchmarkInput, new Function<TermFreq, String>() {
public String apply(TermFreq tf) {
return tf.term.substring(0, Math.min(tf.term.length(),
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
}
}));
final List<String> input = new ArrayList<String>(benchmarkInput.size());
for (TermFreq tf : benchmarkInput) {
input.add(tf.term.substring(0, Math.min(tf.term.length(),
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))));
}
BenchmarkResult result = measure(new Callable<Integer>() {
public Integer call() throws Exception {
@ -203,7 +219,7 @@ public class LookupBenchmarkTest {
final double NANOS_PER_MS = 1000000;
try {
List<Double> times = Lists.newArrayList();
List<Double> times = new ArrayList<Double>();
for (int i = 0; i < warmup + rounds; i++) {
final long start = System.nanoTime();
guard = callable.call().intValue();

View File

@ -14,17 +14,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling.suggest;
package org.apache.lucene.search.suggest;
import java.io.File;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.spelling.suggest.fst.FSTLookup;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.junit.Test;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.util.LuceneTestCase;
public class PersistenceTest extends SolrTestCaseJ4 {
public class PersistenceTest extends LuceneTestCase {
public final String[] keys = new String[] {
"one",
"two",
@ -42,17 +42,14 @@ public class PersistenceTest extends SolrTestCaseJ4 {
"fourier",
"fourty"};
@Test
public void testTSTPersistence() throws Exception {
runTest(TSTLookup.class, true);
}
@Test
public void testJaspellPersistence() throws Exception {
runTest(JaspellLookup.class, true);
}
@Test
public void testFSTPersistence() throws Exception {
runTest(FSTLookup.class, false);
}
@ -68,7 +65,7 @@ public class PersistenceTest extends SolrTestCaseJ4 {
lookup.build(new TermFreqArrayIterator(keys));
// Store the suggester.
File storeDir = new File(TEST_HOME());
File storeDir = TEMP_DIR;
lookup.store(storeDir);
// Re-read it from disk.

View File

@ -0,0 +1,28 @@
package org.apache.lucene.search.suggest;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Immutable pairing of a term with a float value, used as input for the
 * suggester tests. {@link TermFreqArrayIterator} reports {@link #v} as the
 * frequency of {@link #term}.
 */
public final class TermFreq {
  /** The term text. */
  public final String term;

  /** The float value associated with {@link #term}. */
  public final float v;

  /**
   * Creates a new pair.
   *
   * @param term the term text
   * @param v the value associated with the term
   */
  public TermFreq(String term, float v) {
    this.v = v;
    this.term = term;
  }
}

View File

@ -0,0 +1,57 @@
package org.apache.lucene.search.suggest;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
import java.util.Iterator;
import org.apache.lucene.search.spell.TermFreqIterator;
/**
 * A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
 *
 * <p>{@link #next()} returns the term of the next element and remembers that
 * element; {@link #freq()} then returns its value. Calling {@link #freq()}
 * before the first {@link #next()} fails because no element has been
 * remembered yet.
 */
public final class TermFreqArrayIterator implements TermFreqIterator {
  /** The underlying sequence of term/value pairs. */
  private final Iterator<TermFreq> source;

  /** The element most recently returned by {@link #next()}. */
  private TermFreq current;

  /** Wraps an existing iterator of term/value pairs. */
  public TermFreqArrayIterator(Iterator<TermFreq> i) {
    this.source = i;
  }

  /** Iterates over the given array, in array order. */
  public TermFreqArrayIterator(TermFreq [] i) {
    this(Arrays.asList(i));
  }

  /** Iterates over the elements produced by the given iterable. */
  public TermFreqArrayIterator(Iterable<TermFreq> i) {
    this(i.iterator());
  }

  /** Returns the value of the element last returned by {@link #next()}. */
  public float freq() {
    return current.v;
  }

  /** Returns <code>true</code> if the underlying sequence has more elements. */
  public boolean hasNext() {
    return source.hasNext();
  }

  /** Advances to the next element and returns its term. */
  public String next() {
    current = source.next();
    return current.term;
  }

  /** Removal is not supported. */
  public void remove() {
    throw new UnsupportedOperationException();
  }
}

View File

@ -1,20 +1,35 @@
package org.apache.solr.spelling.suggest.fst;
package org.apache.lucene.search.suggest.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.LookupBenchmarkTest;
import org.apache.solr.spelling.suggest.TermFreq;
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.google.common.collect.Lists;
import org.apache.lucene.search.suggest.LookupBenchmarkTest;
import org.apache.lucene.search.suggest.TermFreq;
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
/**
* Unit tests for {@link FSTLookup}.
@ -26,8 +41,8 @@ public class FSTLookupTest extends LuceneTestCase {
private FSTLookup lookup;
@Before
public void prepare() throws Exception {
public void setUp() throws Exception {
super.setUp();
final TermFreq[] keys = new TermFreq[] {
tf("one", 0.5f),
tf("oneness", 1),
@ -51,29 +66,24 @@ public class FSTLookupTest extends LuceneTestCase {
lookup.build(new TermFreqArrayIterator(keys));
}
@Test
public void testExactMatchHighPriority() throws Exception {
assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
}
@Test
public void testExactMatchLowPriority() throws Exception {
assertMatchEquals(lookup.lookup("one", true, 2),
"one/0.0",
"oneness/1.0");
}
@Test
public void testMiss() throws Exception {
assertMatchEquals(lookup.lookup("xyz", true, 1));
}
@Test
public void testAlphabeticWithWeights() throws Exception {
assertEquals(0, lookup.lookup("xyz", false, 1).size());
}
@Test
public void testFullMatchList() throws Exception {
assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
"oneness/1.0",
@ -82,7 +92,6 @@ public class FSTLookupTest extends LuceneTestCase {
"one/0.0");
}
@Test
public void testMultilingualInput() throws Exception {
List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();
@ -95,7 +104,6 @@ public class FSTLookupTest extends LuceneTestCase {
}
}
@Test
public void testEmptyInput() throws Exception {
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(new TermFreq[0]));
@ -103,9 +111,8 @@ public class FSTLookupTest extends LuceneTestCase {
assertMatchEquals(lookup.lookup("", true, 10));
}
@Test
public void testRandom() throws Exception {
List<TermFreq> freqs = Lists.newArrayList();
List<TermFreq> freqs = new ArrayList<TermFreq>();
Random rnd = random;
for (int i = 0; i < 5000; i++) {
freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
@ -118,7 +125,7 @@ public class FSTLookupTest extends LuceneTestCase {
for (int i = 1; i < term.length(); i++) {
String prefix = term.substring(0, i);
for (LookupResult lr : lookup.lookup(prefix, true, 10)) {
Assert.assertTrue(lr.key.startsWith(prefix));
assertTrue(lr.key.startsWith(prefix));
}
}
}

View File

@ -246,6 +246,10 @@ Other Changes
variance in asserting score comparisons in unit tests.
(David Smiley, Chris Hostetter)
* LUCENE-2995: Moved some spellchecker and suggest APIs to modules/suggest:
HighFrequencyDictionary, SortedIterator, TermFreqIterator, and the
suggester APIs and implementations. (rmuir)
Documentation
----------------------

View File

@ -188,12 +188,12 @@
<pathelement location="${common-solr.dir}/../lucene/build/classes/java" />
<pathelement location="${common-solr.dir}/../modules/analysis/build/common/classes/java" />
<pathelement location="${common-solr.dir}/../modules/analysis/build/phonetic/classes/java" />
<pathelement location="${common-solr.dir}/../modules/suggest/build/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/highlighter/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/memory/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/misc/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/queries/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/spatial/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/spellchecker/classes/java" />
</path>
<target name="prep-lucene-jars">
@ -204,12 +204,12 @@
<subant target="jar" inheritall="false" failonerror="true">
<fileset dir="../modules/analysis/common" includes="build.xml" />
<fileset dir="../modules/analysis/phonetic" includes="build.xml" />
<fileset dir="../modules/suggest" includes="build.xml" />
<fileset dir="../lucene/contrib/highlighter" includes="build.xml" />
<fileset dir="../lucene/contrib/memory" includes="build.xml" />
<fileset dir="../lucene/contrib/misc" includes="build.xml" />
<fileset dir="../lucene/contrib/queries" includes="build.xml" />
<fileset dir="../lucene/contrib/spatial" includes="build.xml" />
<fileset dir="../lucene/contrib/spellchecker" includes="build.xml" />
</subant>
</sequential>
</target>
@ -226,6 +226,9 @@
<fileset dir="../modules/analysis/build/phonetic">
<include name="lucene-analyzers-phonetic-${version}.jar" />
</fileset>
<fileset dir="../modules/suggest/build">
<include name="lucene-suggest-${version}.jar" />
</fileset>
<fileset dir="../lucene/build/contrib/highlighter">
<include name="lucene-highlighter-${version}.jar" />
</fileset>
@ -241,9 +244,6 @@
<fileset dir="../lucene/build/contrib/spatial">
<include name="lucene-spatial-${version}.jar" />
</fileset>
<fileset dir="../lucene/build/contrib/spellchecker">
<include name="lucene-spellchecker-${version}.jar" />
</fileset>
</copy>
</target>
@ -252,12 +252,12 @@
<subant target="default">
<fileset dir="../modules/analysis/common" includes="build.xml"/>
<fileset dir="../modules/analysis/phonetic" includes="build.xml"/>
<fileset dir="../modules/suggest" includes="build.xml"/>
<fileset dir="../lucene/contrib/highlighter" includes="build.xml"/>
<fileset dir="../lucene/contrib/memory" includes="build.xml"/>
<fileset dir="../lucene/contrib/misc" includes="build.xml"/>
<fileset dir="../lucene/contrib/queries" includes="build.xml"/>
<fileset dir="../lucene/contrib/spatial" includes="build.xml"/>
<fileset dir="../lucene/contrib/spellchecker" includes="build.xml"/>
</subant>
</target>

View File

@ -26,12 +26,12 @@ import org.slf4j.LoggerFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.spell.HighFrequencyDictionary;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.store.RAMDirectory;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.util.HighFrequencyDictionary;
import org.apache.solr.search.SolrIndexSearcher;
/**

View File

@ -18,10 +18,11 @@ package org.apache.solr.spelling;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.search.spell.HighFrequencyDictionary;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.HighFrequencyDictionary;
import java.io.File;
import java.io.IOException;

View File

@ -0,0 +1,29 @@
package org.apache.solr.spelling.suggest;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.suggest.Lookup;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
/**
* Suggester factory for creating {@link Lookup} instances.
*/
public abstract class LookupFactory {
/**
 * Creates a {@link Lookup} instance.
 *
 * @param params configuration handed to the factory; each implementation decides
 *               which entries, if any, it reads
 * @param core   the Solr core supplied by the caller
 * @return the created lookup
 */
public abstract Lookup create(NamedList params, SolrCore core);
}

View File

@ -27,15 +27,20 @@ import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.HighFrequencyDictionary;
import org.apache.lucene.search.suggest.FileDictionary;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.spelling.SolrSpellChecker;
import org.apache.solr.spelling.SpellingOptions;
import org.apache.solr.spelling.SpellingResult;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.util.HighFrequencyDictionary;
import org.apache.solr.spelling.suggest.fst.FSTLookupFactory;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookupFactory;
import org.apache.solr.spelling.suggest.tst.TSTLookupFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -80,11 +85,18 @@ public class Suggester extends SolrSpellChecker {
sourceLocation = (String) config.get(LOCATION);
field = (String)config.get(FIELD);
lookupImpl = (String)config.get(LOOKUP_IMPL);
if (lookupImpl == null) {
lookupImpl = JaspellLookup.class.getName();
// support the old classnames without -Factory for config file backwards compatibility.
if (lookupImpl == null || "org.apache.solr.spelling.suggest.jaspell.JaspellLookup".equals(lookupImpl)) {
lookupImpl = JaspellLookupFactory.class.getName();
} else if ("org.apache.solr.spelling.suggest.tst.TSTLookup".equals(lookupImpl)) {
lookupImpl = TSTLookupFactory.class.getName();
} else if ("org.apache.solr.spelling.suggest.fst.FSTLookup".equals(lookupImpl)) {
lookupImpl = FSTLookupFactory.class.getName();
}
lookup = (Lookup) core.getResourceLoader().newInstance(lookupImpl);
lookup.init(config, core);
LookupFactory factory = (LookupFactory) core.getResourceLoader().newInstance(lookupImpl);
lookup = factory.create(config, core);
String store = (String)config.get(STORE_DIR);
if (store != null) {
storeDir = new File(store);

View File

@ -0,0 +1,60 @@
package org.apache.solr.spelling.suggest.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.LookupFactory;
/**
* Factory for {@link FSTLookup}
*/
public class FSTLookupFactory extends LookupFactory {

  /**
   * Parameter name for the number of weight buckets (discretization). More buckets
   * allow more fine-grained term weights (priorities). Lookup speed does not drop
   * for prefixes with highly-weighted completions (those are filled in first), but
   * drops significantly for low-weighted terms (which should be infrequent).
   *
   * <p>The number of buckets must be within the [1, 255] range.
   */
  public static final String WEIGHT_BUCKETS = "weightBuckets";

  /**
   * Parameter name for the exact-match-first switch. When <code>true</code>, exact
   * suggestions are returned first, even if they are prefixes of other strings in
   * the automaton (possibly with larger weights).
   */
  public static final String EXACT_MATCH_FIRST = "exactMatchFirst";

  /**
   * Builds an {@link FSTLookup} configured from <code>params</code>.
   *
   * <p>{@link #WEIGHT_BUCKETS} is parsed as an int and defaults to 10 when absent;
   * {@link #EXACT_MATCH_FIRST} is parsed as a boolean and defaults to
   * <code>true</code> when absent. The <code>core</code> argument is not used.
   *
   * @throws NumberFormatException if the bucket setting is present but not an integer
   */
  @Override
  public Lookup create(NamedList params, SolrCore core) {
    int buckets = 10;
    final Object bucketSetting = params.get(WEIGHT_BUCKETS);
    if (bucketSetting != null) {
      buckets = Integer.parseInt(bucketSetting.toString());
    }

    boolean exactMatchFirst = true;
    final Object exactFirstSetting = params.get(EXACT_MATCH_FIRST);
    if (exactFirstSetting != null) {
      exactMatchFirst = Boolean.parseBoolean(exactFirstSetting.toString());
    }

    return new FSTLookup(buckets, exactMatchFirst);
  }
}

View File

@ -0,0 +1,39 @@
package org.apache.solr.spelling.suggest.jaspell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.LookupFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Factory for {@link JaspellLookup}
*/
public class JaspellLookupFactory extends LookupFactory {
  // Log under this factory's own category; the logger used to be created for
  // JaspellLookup, a different class, so these messages were attributed to it.
  private static final Logger LOG = LoggerFactory.getLogger(JaspellLookupFactory.class);

  /**
   * Logs the supplied parameters at INFO level and returns a new
   * {@link JaspellLookup}. Neither argument influences the created instance.
   *
   * @param params configuration passed by the caller; only logged
   * @param core   not used
   * @return a newly constructed {@link JaspellLookup}
   */
  @Override
  public Lookup create(NamedList params, SolrCore core) {
    // Placeholder form yields the same text as "init: " + params but skips the
    // toString() work when INFO logging is disabled.
    LOG.info("init: {}", params);
    return new JaspellLookup();
  }
}

View File

@ -0,0 +1,35 @@
package org.apache.solr.spelling.suggest.tst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.LookupFactory;
/**
* Factory for {@link TSTLookup}
*/
public class TSTLookupFactory extends LookupFactory {

  /**
   * Returns a newly constructed {@link TSTLookup}; neither <code>params</code>
   * nor <code>core</code> is consulted.
   */
  @Override
  public Lookup create(NamedList params, SolrCore core) {
    final Lookup tst = new TSTLookup();
    return tst;
  }
}

View File

@ -1,11 +0,0 @@
package org.apache.solr.spelling.suggest;
public final class TermFreq {

  /** The term text, stored exactly as given to the constructor. */
  public final String term;

  /** The float value paired with {@link #term}. */
  public final float v;

  /**
   * Creates an immutable term/value pair. No validation is performed on either
   * argument.
   *
   * @param term the term text
   * @param v    the value associated with the term
   */
  public TermFreq(String term, float v) {
    this.v = v;
    this.term = term;
  }
}

View File

@ -1,40 +0,0 @@
package org.apache.solr.spelling.suggest;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.solr.util.TermFreqIterator;
/**
* A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
*/
public final class TermFreqArrayIterator implements TermFreqIterator {

  /** Underlying source of entries. */
  private final Iterator<TermFreq> source;

  /** Entry most recently returned by {@link #next()}; null until the first call. */
  private TermFreq last;

  /** Iterates over the entries produced by the given iterator. */
  public TermFreqArrayIterator(Iterator<TermFreq> i) {
    this.source = i;
  }

  /** Iterates over the entries of the given array, in array order. */
  public TermFreqArrayIterator(TermFreq [] i) {
    this(Arrays.asList(i));
  }

  /** Iterates over the entries of the given iterable. */
  public TermFreqArrayIterator(Iterable<TermFreq> i) {
    this(i.iterator());
  }

  /** Returns the value of the entry whose term was last returned by {@link #next()}. */
  public float freq() {
    return last.v;
  }

  /** Reports whether the underlying iterator has further entries. */
  public boolean hasNext() {
    return source.hasNext();
  }

  /** Advances to the next entry, remembers it for {@link #freq()}, and returns its term. */
  public String next() {
    last = source.next();
    return last.term;
  }

  /** Removal is not supported. */
  public void remove() {
    throw new UnsupportedOperationException();
  }
}