mirror of https://github.com/apache/lucene.git
LUCENE-2995: factor out a shared spellchecking module
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1126642 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3be9e4b90a
commit
f5048293b5
|
@ -20,8 +20,6 @@
|
|||
<classpathentry kind="src" path="lucene/contrib/queryparser/src/test"/>
|
||||
<classpathentry kind="src" path="lucene/contrib/spatial/src/java"/>
|
||||
<classpathentry kind="src" path="lucene/contrib/spatial/src/test"/>
|
||||
<classpathentry kind="src" path="lucene/contrib/spellchecker/src/java"/>
|
||||
<classpathentry kind="src" path="lucene/contrib/spellchecker/src/test"/>
|
||||
<classpathentry kind="src" path="lucene/contrib/wordnet/src/java"/>
|
||||
<classpathentry kind="src" path="lucene/contrib/wordnet/src/test"/>
|
||||
<classpathentry kind="src" path="lucene/contrib/xml-query-parser/src/java"/>
|
||||
|
@ -44,6 +42,8 @@
|
|||
<classpathentry kind="src" path="modules/benchmark/src/test"/>
|
||||
<classpathentry kind="src" path="modules/grouping/src/java"/>
|
||||
<classpathentry kind="src" path="modules/grouping/src/test"/>
|
||||
<classpathentry kind="src" path="modules/suggest/src/java"/>
|
||||
<classpathentry kind="src" path="modules/suggest/src/test"/>
|
||||
<classpathentry kind="src" path="solr/src/java"/>
|
||||
<classpathentry kind="src" path="solr/src/webapp/src"/>
|
||||
<classpathentry kind="src" path="solr/src/common"/>
|
||||
|
|
|
@ -227,7 +227,6 @@
|
|||
<packageset dir="contrib/misc/src/java"/>
|
||||
<packageset dir="contrib/queries/src/java"/>
|
||||
<packageset dir="contrib/spatial/src/java"/>
|
||||
<packageset dir="contrib/spellchecker/src/java"/>
|
||||
<packageset dir="contrib/wordnet/src/java"/>
|
||||
<packageset dir="contrib/xml-query-parser/src/java"/>
|
||||
<packageset dir="contrib/queryparser/src/java"/>
|
||||
|
@ -248,7 +247,6 @@
|
|||
<group title="contrib: Queries" packages="org.apache.lucene.search.similar*:org.apache.lucene.search.regex*:org.apache.regexp*"/>
|
||||
<group title="contrib: Query Parser" packages="org.apache.lucene.queryParser.*"/>
|
||||
<group title="contrib: Spatial" packages="org.apache.lucene.spatial*"/>
|
||||
<group title="contrib: SpellChecker" packages="org.apache.lucene.search.spell*"/>
|
||||
<group title="contrib: WordNet" packages="org.apache.lucene.wordnet*"/>
|
||||
<group title="contrib: XML Query Parser" packages="org.apache.lucene.xmlparser*"/>
|
||||
|
||||
|
|
|
@ -6,6 +6,8 @@ Build
|
|||
|
||||
* LUCENE-2845: Moved contrib/benchmark to modules.
|
||||
|
||||
* LUCENE-2995: Moved contrib/spellchecker into modules/suggest.
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-2604: Added RegexpQuery support to contrib/queryparser.
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
<fileset dir="analysis" includes="build.xml" />
|
||||
<fileset dir="benchmark" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
</target>
|
||||
|
@ -35,6 +36,7 @@
|
|||
<fileset dir="analysis" includes="build.xml" />
|
||||
<fileset dir="benchmark" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
</target>
|
||||
|
@ -45,6 +47,7 @@
|
|||
<fileset dir="analysis" includes="build.xml" />
|
||||
<fileset dir="benchmark" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
</target>
|
||||
|
@ -55,6 +58,7 @@
|
|||
<fileset dir="analysis" includes="build.xml" />
|
||||
<fileset dir="benchmark" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
</target>
|
||||
|
@ -66,6 +70,7 @@
|
|||
<fileset dir="analysis" includes="build.xml" />
|
||||
<fileset dir="benchmark" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
</target>
|
||||
|
@ -96,6 +101,7 @@
|
|||
<fileset dir="analysis" includes="build.xml" />
|
||||
<fileset dir="benchmark" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
</target>
|
||||
|
|
|
@ -17,13 +17,17 @@
|
|||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="spellchecker" default="default">
|
||||
<project name="suggest" default="default">
|
||||
|
||||
<description>
|
||||
Spell Checker
|
||||
Suggest
|
||||
</description>
|
||||
|
||||
<import file="../contrib-build.xml"/>
|
||||
<property name="build.dir" location="build/" />
|
||||
<property name="dist.dir" location="dist/" />
|
||||
<property name="maven.dist.dir" location="../dist/maven" />
|
||||
|
||||
<import file="../../lucene/contrib/contrib-build.xml"/>
|
||||
|
||||
<module-uptodate name="analysis/common" jarfile="${common.dir}/../modules/analysis/build/common/lucene-analyzers-common-${version}.jar"
|
||||
property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
|
|
@ -15,7 +15,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.util;
|
||||
package org.apache.lucene.search.spell;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
@ -49,7 +49,7 @@ public class HighFrequencyDictionary implements Dictionary {
|
|||
this.thresh = thresh;
|
||||
}
|
||||
|
||||
public final Iterator getWordsIterator() {
|
||||
public final Iterator<String> getWordsIterator() {
|
||||
return new HighFrequencyIterator();
|
||||
}
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.util;
|
||||
package org.apache.lucene.search.spell;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.util;
|
||||
package org.apache.lucene.search.spell;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
|
@ -7,9 +7,9 @@ public interface TermFreqIterator extends Iterator<String> {
|
|||
public float freq();
|
||||
|
||||
public static class TermFreqIteratorWrapper implements TermFreqIterator {
|
||||
private Iterator wrapped;
|
||||
private Iterator<String> wrapped;
|
||||
|
||||
public TermFreqIteratorWrapper(Iterator wrapped) {
|
||||
public TermFreqIteratorWrapper(Iterator<String> wrapped) {
|
||||
this.wrapped = wrapped;
|
||||
}
|
||||
|
|
@ -1,10 +1,10 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
package org.apache.lucene.search.suggest;
|
||||
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
|
||||
/**
|
||||
* This wrapper buffers incoming elements.
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
package org.apache.lucene.search.suggest;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -21,7 +21,7 @@ package org.apache.solr.spelling.suggest;
|
|||
import java.io.*;
|
||||
|
||||
import org.apache.lucene.search.spell.Dictionary;
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
|
||||
|
||||
/**
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
package org.apache.lucene.search.suggest;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
@ -6,10 +6,8 @@ import java.util.Iterator;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.spell.Dictionary;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
|
||||
public abstract class Lookup {
|
||||
/**
|
||||
|
@ -56,9 +54,6 @@ public abstract class Lookup {
|
|||
}
|
||||
}
|
||||
|
||||
/** Initialize the lookup. */
|
||||
public abstract void init(NamedList config, SolrCore core);
|
||||
|
||||
/** Build lookup from a dictionary. Some implementations may require sorted
|
||||
* or unsorted keys from the dictionary's iterator - use
|
||||
* {@link SortedTermFreqIteratorWrapper} or
|
||||
|
@ -75,7 +70,7 @@ public abstract class Lookup {
|
|||
build(tfit);
|
||||
}
|
||||
|
||||
protected abstract void build(TermFreqIterator tfit) throws IOException;
|
||||
public abstract void build(TermFreqIterator tfit) throws IOException;
|
||||
|
||||
/**
|
||||
* Persist the constructed lookup data to a directory. Optional operation.
|
|
@ -1,9 +1,9 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
package org.apache.lucene.search.suggest;
|
||||
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.solr.util.SortedIterator;
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
import org.apache.lucene.search.spell.SortedIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
|
||||
/**
|
||||
* This wrapper buffers incoming elements and makes sure they are sorted in
|
|
@ -1,8 +1,8 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
package org.apache.lucene.search.suggest;
|
||||
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
|
||||
/**
|
||||
* This wrapper buffers the incoming elements and makes sure they are in
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.spelling.suggest.fst;
|
||||
package org.apache.lucene.search.suggest.fst;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedOutputStream;
|
||||
|
@ -13,20 +13,17 @@ import java.util.Collections;
|
|||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.automaton.fst.Builder;
|
||||
import org.apache.lucene.util.automaton.fst.FST;
|
||||
import org.apache.lucene.util.automaton.fst.FST.Arc;
|
||||
import org.apache.lucene.util.automaton.fst.NoOutputs;
|
||||
import org.apache.lucene.util.automaton.fst.Outputs;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.spelling.suggest.Lookup;
|
||||
import org.apache.solr.spelling.suggest.tst.TSTLookup;
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.io.Closeables;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.tst.TSTLookup;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
|
||||
/**
|
||||
* Finite state automata based implementation of {@link Lookup} query
|
||||
|
@ -93,6 +90,16 @@ import com.google.common.io.Closeables;
|
|||
* nothing else.
|
||||
*/
|
||||
public class FSTLookup extends Lookup {
|
||||
|
||||
public FSTLookup() {
|
||||
this(10, true);
|
||||
}
|
||||
|
||||
public FSTLookup(int buckets, boolean exactMatchFirst) {
|
||||
this.buckets = buckets;
|
||||
this.exactMatchFirst = exactMatchFirst;
|
||||
}
|
||||
|
||||
/** A structure for a single entry (for sorting/ preprocessing). */
|
||||
private static class Entry {
|
||||
char [] term;
|
||||
|
@ -104,6 +111,12 @@ public class FSTLookup extends Lookup {
|
|||
}
|
||||
}
|
||||
|
||||
/** Serialized automaton file name (storage). */
|
||||
public static final String FILENAME = "fst.dat";
|
||||
|
||||
/** An empty result. */
|
||||
private static final List<LookupResult> EMPTY_RESULT = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* The number of separate buckets for weights (discretization). The more buckets,
|
||||
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
|
||||
|
@ -113,29 +126,13 @@ public class FSTLookup extends Lookup {
|
|||
*
|
||||
* <p>The number of buckets must be within [1, 255] range.
|
||||
*/
|
||||
public static final String WEIGHT_BUCKETS = "weightBuckets";
|
||||
private final int buckets;
|
||||
|
||||
/**
|
||||
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
|
||||
* of other strings in the automaton (possibly with larger weights).
|
||||
*/
|
||||
public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
|
||||
|
||||
/** Serialized automaton file name (storage). */
|
||||
public static final String FILENAME = "fst.dat";
|
||||
|
||||
/** An empty result. */
|
||||
private static final List<LookupResult> EMPTY_RESULT = Lists.newArrayList();
|
||||
|
||||
/**
|
||||
* @see #WEIGHT_BUCKETS
|
||||
*/
|
||||
private int buckets = 10;
|
||||
|
||||
/**
|
||||
* #see #EXACT_MATCH_FIRST
|
||||
*/
|
||||
private boolean exactMatchFirst = true;
|
||||
private final boolean exactMatchFirst;
|
||||
|
||||
/**
|
||||
* Finite state automaton encoding all the lookup terms. See class
|
||||
|
@ -149,25 +146,12 @@ public class FSTLookup extends Lookup {
|
|||
*/
|
||||
private Arc<Object> [] rootArcs;
|
||||
|
||||
/* */
|
||||
@Override
|
||||
@SuppressWarnings("rawtypes")
|
||||
public void init(NamedList config, SolrCore core) {
|
||||
this.buckets = config.get(WEIGHT_BUCKETS) != null
|
||||
? Integer.parseInt(config.get(WEIGHT_BUCKETS).toString())
|
||||
: 10;
|
||||
|
||||
this.exactMatchFirst = config.get(EXACT_MATCH_FIRST) != null
|
||||
? Boolean.valueOf(config.get(EXACT_MATCH_FIRST).toString())
|
||||
: true;
|
||||
}
|
||||
|
||||
/* */
|
||||
@Override
|
||||
public void build(TermFreqIterator tfit) throws IOException {
|
||||
// Buffer the input because we will need it twice: for calculating
|
||||
// weights distribution and for the actual automata building.
|
||||
List<Entry> entries = Lists.newArrayList();
|
||||
List<Entry> entries = new ArrayList<Entry>();
|
||||
while (tfit.hasNext()) {
|
||||
String term = tfit.next();
|
||||
char [] termChars = new char [term.length() + 1]; // add padding for weight.
|
||||
|
@ -200,7 +184,7 @@ public class FSTLookup extends Lookup {
|
|||
@SuppressWarnings("unchecked")
|
||||
private void cacheRootArcs() throws IOException {
|
||||
if (automaton != null) {
|
||||
List<Arc<Object>> rootArcs = Lists.newArrayList();
|
||||
List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
|
||||
Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
|
||||
automaton.readFirstTargetArc(arc, arc);
|
||||
while (true) {
|
||||
|
@ -312,7 +296,7 @@ public class FSTLookup extends Lookup {
|
|||
|
||||
// Sort and trim.
|
||||
Collections.sort(res, new Comparator<LookupResult>() {
|
||||
@Override
|
||||
// not till java6 @Override
|
||||
public int compare(LookupResult o1, LookupResult o2) {
|
||||
return o1.key.compareTo(o2.key);
|
||||
}
|
||||
|
@ -526,7 +510,7 @@ public class FSTLookup extends Lookup {
|
|||
this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
|
||||
cacheRootArcs();
|
||||
} finally {
|
||||
Closeables.closeQuietly(is);
|
||||
IOUtils.closeSafely(is);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -548,7 +532,7 @@ public class FSTLookup extends Lookup {
|
|||
try {
|
||||
this.automaton.save(new OutputStreamDataOutput(os));
|
||||
} finally {
|
||||
Closeables.closeQuietly(os);
|
||||
IOUtils.closeSafely(os);
|
||||
}
|
||||
|
||||
return true;
|
|
@ -1,10 +1,9 @@
|
|||
package org.apache.solr.spelling.suggest.fst;
|
||||
package org.apache.lucene.search.suggest.fst;
|
||||
|
||||
import java.io.EOFException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import com.google.common.io.ByteStreams;
|
||||
|
||||
/**
|
||||
* A {@link DataInput} wrapping a plain {@link InputStream}.
|
||||
|
@ -26,6 +25,8 @@ public class InputStreamDataInput extends DataInput {
|
|||
|
||||
@Override
|
||||
public void readBytes(byte[] b, int offset, int len) throws IOException {
|
||||
ByteStreams.readFully(is, b, offset, len);
|
||||
if (is.read(b, offset, len) != len) {
|
||||
throw new EOFException();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.spelling.suggest.fst;
|
||||
package org.apache.lucene.search.suggest.fst;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.spelling.suggest.jaspell;
|
||||
package org.apache.lucene.search.suggest.jaspell;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
|
@ -9,27 +9,17 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.spelling.suggest.Lookup;
|
||||
import org.apache.solr.spelling.suggest.UnsortedTermFreqIteratorWrapper;
|
||||
import org.apache.solr.spelling.suggest.jaspell.JaspellTernarySearchTrie.TSTNode;
|
||||
import org.apache.solr.util.SortedIterator;
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.apache.lucene.search.spell.SortedIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.UnsortedTermFreqIteratorWrapper;
|
||||
import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie.TSTNode;
|
||||
|
||||
public class JaspellLookup extends Lookup {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(JaspellLookup.class);
|
||||
JaspellTernarySearchTrie trie = new JaspellTernarySearchTrie();
|
||||
private boolean usePrefix = true;
|
||||
private int editDistance = 2;
|
||||
|
||||
@Override
|
||||
public void init(NamedList config, SolrCore core) {
|
||||
LOG.info("init: " + config);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void build(TermFreqIterator tfit) throws IOException {
|
||||
if (tfit instanceof SortedIterator) {
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.spelling.suggest.jaspell;
|
||||
package org.apache.lucene.search.suggest.jaspell;
|
||||
|
||||
/**
|
||||
* Copyright (c) 2005 Bruno Martins
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.spelling.suggest.tst;
|
||||
package org.apache.lucene.search.suggest.tst;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.spelling.suggest.tst;
|
||||
package org.apache.lucene.search.suggest.tst;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
|
@ -9,21 +9,15 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.spelling.suggest.Lookup;
|
||||
import org.apache.solr.spelling.suggest.SortedTermFreqIteratorWrapper;
|
||||
import org.apache.solr.util.SortedIterator;
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
|
||||
import org.apache.lucene.search.spell.SortedIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
|
||||
public class TSTLookup extends Lookup {
|
||||
TernaryTreeNode root = new TernaryTreeNode();
|
||||
TSTAutocomplete autocomplete = new TSTAutocomplete();
|
||||
|
||||
@Override
|
||||
public void init(NamedList config, SolrCore core) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void build(TermFreqIterator tfit) throws IOException {
|
||||
root = new TernaryTreeNode();
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.spelling.suggest.tst;
|
||||
package org.apache.lucene.search.suggest.tst;
|
||||
|
||||
/**
|
||||
* The class creates a TST node.
|
|
@ -1,4 +1,22 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
package org.apache.lucene.search.suggest;
|
||||
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
|
@ -1,34 +1,51 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
package org.apache.lucene.search.suggest;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.Callable;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.solr.spelling.suggest.fst.FSTLookup;
|
||||
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
|
||||
import org.apache.solr.spelling.suggest.tst.TSTLookup;
|
||||
import org.junit.Assert;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.fst.FSTLookup;
|
||||
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
|
||||
import org.apache.lucene.search.suggest.tst.TSTLookup;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.io.Resources;
|
||||
|
||||
/**
|
||||
* Benchmarks tests for implementations of {@link Lookup} interface.
|
||||
*/
|
||||
@Ignore // COMMENT ME TO RUN BENCHMARKS!
|
||||
public class LookupBenchmarkTest {
|
||||
@Ignore("COMMENT ME TO RUN BENCHMARKS!")
|
||||
public class LookupBenchmarkTest extends LuceneTestCase {
|
||||
@SuppressWarnings("unchecked")
|
||||
private final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList(
|
||||
private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
|
||||
JaspellLookup.class,
|
||||
TSTLookup.class,
|
||||
FSTLookup.class);
|
||||
|
@ -63,28 +80,32 @@ public class LookupBenchmarkTest {
|
|||
LookupBenchmarkTest.benchmarkInput = input;
|
||||
}
|
||||
|
||||
static final Charset UTF_8 = Charset.forName("UTF-8");
|
||||
|
||||
/**
|
||||
* Collect the multilingual input for benchmarks/ tests.
|
||||
*/
|
||||
public static List<TermFreq> readTop50KWiki() throws Exception {
|
||||
List<TermFreq> input = Lists.newArrayList();
|
||||
URL resource = Thread.currentThread().getContextClassLoader().getResource("Top50KWiki.utf8");
|
||||
List<TermFreq> input = new ArrayList<TermFreq>();
|
||||
URL resource = LookupBenchmarkTest.class.getResource("Top50KWiki.utf8");
|
||||
assert resource != null : "Resource missing: Top50KWiki.utf8";
|
||||
|
||||
for (String line : Resources.readLines(resource, Charsets.UTF_8)) {
|
||||
String line = null;
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(resource.openStream(), UTF_8));
|
||||
while ((line = br.readLine()) != null) {
|
||||
int tab = line.indexOf('|');
|
||||
Assert.assertTrue("No | separator?: " + line, tab >= 0);
|
||||
assertTrue("No | separator?: " + line, tab >= 0);
|
||||
float weight = Float.parseFloat(line.substring(tab + 1));
|
||||
String key = line.substring(0, tab);
|
||||
input.add(new TermFreq(key, weight));
|
||||
}
|
||||
br.close();
|
||||
return input;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test construction time.
|
||||
*/
|
||||
@Test
|
||||
public void testConstructionTime() throws Exception {
|
||||
System.err.println("-- construction time");
|
||||
for (final Class<? extends Lookup> cls : benchmarkClasses) {
|
||||
|
@ -106,7 +127,6 @@ public class LookupBenchmarkTest {
|
|||
/**
|
||||
* Test memory required for the storage.
|
||||
*/
|
||||
@Test
|
||||
public void testStorageNeeds() throws Exception {
|
||||
System.err.println("-- RAM consumption");
|
||||
final RamUsageEstimator rue = new RamUsageEstimator();
|
||||
|
@ -131,7 +151,6 @@ public class LookupBenchmarkTest {
|
|||
/**
|
||||
* Test performance of lookup on full hits.
|
||||
*/
|
||||
@Test
|
||||
public void testPerformanceOnFullHits() throws Exception {
|
||||
final int minPrefixLen = 100;
|
||||
final int maxPrefixLen = 200;
|
||||
|
@ -141,7 +160,6 @@ public class LookupBenchmarkTest {
|
|||
/**
|
||||
* Test performance of lookup on longer term prefixes (6-9 letters or shorter).
|
||||
*/
|
||||
@Test
|
||||
public void testPerformanceOnPrefixes6_9() throws Exception {
|
||||
final int minPrefixLen = 6;
|
||||
final int maxPrefixLen = 9;
|
||||
|
@ -151,7 +169,6 @@ public class LookupBenchmarkTest {
|
|||
/**
|
||||
* Test performance of lookup on short term prefixes (2-4 letters or shorter).
|
||||
*/
|
||||
@Test
|
||||
public void testPerformanceOnPrefixes2_4() throws Exception {
|
||||
final int minPrefixLen = 2;
|
||||
final int maxPrefixLen = 4;
|
||||
|
@ -170,12 +187,11 @@ public class LookupBenchmarkTest {
|
|||
for (Class<? extends Lookup> cls : benchmarkClasses) {
|
||||
final Lookup lookup = buildLookup(cls, dictionaryInput);
|
||||
|
||||
final List<String> input = Lists.newArrayList(Iterables.transform(benchmarkInput, new Function<TermFreq, String>() {
|
||||
public String apply(TermFreq tf) {
|
||||
return tf.term.substring(0, Math.min(tf.term.length(),
|
||||
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
|
||||
final List<String> input = new ArrayList<String>(benchmarkInput.size());
|
||||
for (TermFreq tf : benchmarkInput) {
|
||||
input.add(tf.term.substring(0, Math.min(tf.term.length(),
|
||||
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))));
|
||||
}
|
||||
}));
|
||||
|
||||
BenchmarkResult result = measure(new Callable<Integer>() {
|
||||
public Integer call() throws Exception {
|
||||
|
@ -203,7 +219,7 @@ public class LookupBenchmarkTest {
|
|||
final double NANOS_PER_MS = 1000000;
|
||||
|
||||
try {
|
||||
List<Double> times = Lists.newArrayList();
|
||||
List<Double> times = new ArrayList<Double>();
|
||||
for (int i = 0; i < warmup + rounds; i++) {
|
||||
final long start = System.nanoTime();
|
||||
guard = callable.call().intValue();
|
|
@ -14,17 +14,17 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.spelling.suggest;
|
||||
package org.apache.lucene.search.suggest;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.spelling.suggest.fst.FSTLookup;
|
||||
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
|
||||
import org.apache.solr.spelling.suggest.tst.TSTLookup;
|
||||
import org.junit.Test;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.fst.FSTLookup;
|
||||
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
|
||||
import org.apache.lucene.search.suggest.tst.TSTLookup;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class PersistenceTest extends SolrTestCaseJ4 {
|
||||
public class PersistenceTest extends LuceneTestCase {
|
||||
public final String[] keys = new String[] {
|
||||
"one",
|
||||
"two",
|
||||
|
@ -42,17 +42,14 @@ public class PersistenceTest extends SolrTestCaseJ4 {
|
|||
"fourier",
|
||||
"fourty"};
|
||||
|
||||
@Test
|
||||
public void testTSTPersistence() throws Exception {
|
||||
runTest(TSTLookup.class, true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaspellPersistence() throws Exception {
|
||||
runTest(JaspellLookup.class, true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFSTPersistence() throws Exception {
|
||||
runTest(FSTLookup.class, false);
|
||||
}
|
||||
|
@ -68,7 +65,7 @@ public class PersistenceTest extends SolrTestCaseJ4 {
|
|||
lookup.build(new TermFreqArrayIterator(keys));
|
||||
|
||||
// Store the suggester.
|
||||
File storeDir = new File(TEST_HOME());
|
||||
File storeDir = TEMP_DIR;
|
||||
lookup.store(storeDir);
|
||||
|
||||
// Re-read it from disk.
|
|
@ -0,0 +1,28 @@
|
|||
package org.apache.lucene.search.suggest;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
public final class TermFreq {
|
||||
public final String term;
|
||||
public final float v;
|
||||
|
||||
public TermFreq(String term, float v) {
|
||||
this.term = term;
|
||||
this.v = v;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
package org.apache.lucene.search.suggest;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
|
||||
/**
|
||||
* A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
|
||||
*/
|
||||
public final class TermFreqArrayIterator implements TermFreqIterator {
|
||||
private final Iterator<TermFreq> i;
|
||||
private TermFreq current;
|
||||
|
||||
public TermFreqArrayIterator(Iterator<TermFreq> i) {
|
||||
this.i = i;
|
||||
}
|
||||
|
||||
public TermFreqArrayIterator(TermFreq [] i) {
|
||||
this(Arrays.asList(i));
|
||||
}
|
||||
|
||||
public TermFreqArrayIterator(Iterable<TermFreq> i) {
|
||||
this(i.iterator());
|
||||
}
|
||||
|
||||
public float freq() {
|
||||
return current.v;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return i.hasNext();
|
||||
}
|
||||
|
||||
public String next() {
|
||||
return (current = i.next()).term;
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException(); }
|
||||
}
|
|
@ -1,20 +1,35 @@
|
|||
package org.apache.solr.spelling.suggest.fst;
|
||||
package org.apache.lucene.search.suggest.fst;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.search.suggest.Lookup.LookupResult;
|
||||
import org.apache.lucene.search.suggest.fst.FSTLookup;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
|
||||
import org.apache.solr.spelling.suggest.LookupBenchmarkTest;
|
||||
import org.apache.solr.spelling.suggest.TermFreq;
|
||||
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import org.apache.lucene.search.suggest.LookupBenchmarkTest;
|
||||
import org.apache.lucene.search.suggest.TermFreq;
|
||||
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link FSTLookup}.
|
||||
|
@ -26,8 +41,8 @@ public class FSTLookupTest extends LuceneTestCase {
|
|||
|
||||
private FSTLookup lookup;
|
||||
|
||||
@Before
|
||||
public void prepare() throws Exception {
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
final TermFreq[] keys = new TermFreq[] {
|
||||
tf("one", 0.5f),
|
||||
tf("oneness", 1),
|
||||
|
@ -51,29 +66,24 @@ public class FSTLookupTest extends LuceneTestCase {
|
|||
lookup.build(new TermFreqArrayIterator(keys));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExactMatchHighPriority() throws Exception {
|
||||
assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExactMatchLowPriority() throws Exception {
|
||||
assertMatchEquals(lookup.lookup("one", true, 2),
|
||||
"one/0.0",
|
||||
"oneness/1.0");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMiss() throws Exception {
|
||||
assertMatchEquals(lookup.lookup("xyz", true, 1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAlphabeticWithWeights() throws Exception {
|
||||
assertEquals(0, lookup.lookup("xyz", false, 1).size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullMatchList() throws Exception {
|
||||
assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
|
||||
"oneness/1.0",
|
||||
|
@ -82,7 +92,6 @@ public class FSTLookupTest extends LuceneTestCase {
|
|||
"one/0.0");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultilingualInput() throws Exception {
|
||||
List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();
|
||||
|
||||
|
@ -95,7 +104,6 @@ public class FSTLookupTest extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEmptyInput() throws Exception {
|
||||
lookup = new FSTLookup();
|
||||
lookup.build(new TermFreqArrayIterator(new TermFreq[0]));
|
||||
|
@ -103,9 +111,8 @@ public class FSTLookupTest extends LuceneTestCase {
|
|||
assertMatchEquals(lookup.lookup("", true, 10));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRandom() throws Exception {
|
||||
List<TermFreq> freqs = Lists.newArrayList();
|
||||
List<TermFreq> freqs = new ArrayList<TermFreq>();
|
||||
Random rnd = random;
|
||||
for (int i = 0; i < 5000; i++) {
|
||||
freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
|
||||
|
@ -118,7 +125,7 @@ public class FSTLookupTest extends LuceneTestCase {
|
|||
for (int i = 1; i < term.length(); i++) {
|
||||
String prefix = term.substring(0, i);
|
||||
for (LookupResult lr : lookup.lookup(prefix, true, 10)) {
|
||||
Assert.assertTrue(lr.key.startsWith(prefix));
|
||||
assertTrue(lr.key.startsWith(prefix));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -246,6 +246,10 @@ Other Changes
|
|||
variance in asserting score comparisons in unit tests.
|
||||
(David Smiley, Chris Hostetter)
|
||||
|
||||
* LUCENE-2995: Moved some spellchecker and suggest APIs to modules/suggest:
|
||||
HighFrequencyDictionary, SortedIterator, TermFreqIterator, and the
|
||||
suggester APIs and implementations. (rmuir)
|
||||
|
||||
Documentation
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -188,12 +188,12 @@
|
|||
<pathelement location="${common-solr.dir}/../lucene/build/classes/java" />
|
||||
<pathelement location="${common-solr.dir}/../modules/analysis/build/common/classes/java" />
|
||||
<pathelement location="${common-solr.dir}/../modules/analysis/build/phonetic/classes/java" />
|
||||
<pathelement location="${common-solr.dir}/../modules/suggest/build/classes/java" />
|
||||
<pathelement location="${common-solr.dir}/../lucene/build/contrib/highlighter/classes/java" />
|
||||
<pathelement location="${common-solr.dir}/../lucene/build/contrib/memory/classes/java" />
|
||||
<pathelement location="${common-solr.dir}/../lucene/build/contrib/misc/classes/java" />
|
||||
<pathelement location="${common-solr.dir}/../lucene/build/contrib/queries/classes/java" />
|
||||
<pathelement location="${common-solr.dir}/../lucene/build/contrib/spatial/classes/java" />
|
||||
<pathelement location="${common-solr.dir}/../lucene/build/contrib/spellchecker/classes/java" />
|
||||
</path>
|
||||
|
||||
<target name="prep-lucene-jars">
|
||||
|
@ -204,12 +204,12 @@
|
|||
<subant target="jar" inheritall="false" failonerror="true">
|
||||
<fileset dir="../modules/analysis/common" includes="build.xml" />
|
||||
<fileset dir="../modules/analysis/phonetic" includes="build.xml" />
|
||||
<fileset dir="../modules/suggest" includes="build.xml" />
|
||||
<fileset dir="../lucene/contrib/highlighter" includes="build.xml" />
|
||||
<fileset dir="../lucene/contrib/memory" includes="build.xml" />
|
||||
<fileset dir="../lucene/contrib/misc" includes="build.xml" />
|
||||
<fileset dir="../lucene/contrib/queries" includes="build.xml" />
|
||||
<fileset dir="../lucene/contrib/spatial" includes="build.xml" />
|
||||
<fileset dir="../lucene/contrib/spellchecker" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
</target>
|
||||
|
@ -226,6 +226,9 @@
|
|||
<fileset dir="../modules/analysis/build/phonetic">
|
||||
<include name="lucene-analyzers-phonetic-${version}.jar" />
|
||||
</fileset>
|
||||
<fileset dir="../modules/suggest/build">
|
||||
<include name="lucene-suggest-${version}.jar" />
|
||||
</fileset>
|
||||
<fileset dir="../lucene/build/contrib/highlighter">
|
||||
<include name="lucene-highlighter-${version}.jar" />
|
||||
</fileset>
|
||||
|
@ -241,9 +244,6 @@
|
|||
<fileset dir="../lucene/build/contrib/spatial">
|
||||
<include name="lucene-spatial-${version}.jar" />
|
||||
</fileset>
|
||||
<fileset dir="../lucene/build/contrib/spellchecker">
|
||||
<include name="lucene-spellchecker-${version}.jar" />
|
||||
</fileset>
|
||||
</copy>
|
||||
</target>
|
||||
|
||||
|
@ -252,12 +252,12 @@
|
|||
<subant target="default">
|
||||
<fileset dir="../modules/analysis/common" includes="build.xml"/>
|
||||
<fileset dir="../modules/analysis/phonetic" includes="build.xml"/>
|
||||
<fileset dir="../modules/suggest" includes="build.xml"/>
|
||||
<fileset dir="../lucene/contrib/highlighter" includes="build.xml"/>
|
||||
<fileset dir="../lucene/contrib/memory" includes="build.xml"/>
|
||||
<fileset dir="../lucene/contrib/misc" includes="build.xml"/>
|
||||
<fileset dir="../lucene/contrib/queries" includes="build.xml"/>
|
||||
<fileset dir="../lucene/contrib/spatial" includes="build.xml"/>
|
||||
<fileset dir="../lucene/contrib/spellchecker" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
|
||||
|
|
|
@ -26,12 +26,12 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.spell.HighFrequencyDictionary;
|
||||
import org.apache.lucene.search.spell.PlainTextDictionary;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.util.HighFrequencyDictionary;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
|
||||
/**
|
||||
|
|
|
@ -18,10 +18,11 @@ package org.apache.solr.spelling;
|
|||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.search.spell.HighFrequencyDictionary;
|
||||
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
import org.apache.solr.util.HighFrequencyDictionary;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
|
||||
/**
|
||||
* Suggester factory for creating {@link Lookup} instances.
|
||||
*/
|
||||
public abstract class LookupFactory {
|
||||
public abstract Lookup create(NamedList params, SolrCore core);
|
||||
}
|
|
@ -27,15 +27,20 @@ import java.util.List;
|
|||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.spell.Dictionary;
|
||||
import org.apache.lucene.search.spell.HighFrequencyDictionary;
|
||||
import org.apache.lucene.search.suggest.FileDictionary;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.Lookup.LookupResult;
|
||||
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
import org.apache.solr.spelling.SolrSpellChecker;
|
||||
import org.apache.solr.spelling.SpellingOptions;
|
||||
import org.apache.solr.spelling.SpellingResult;
|
||||
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
|
||||
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
|
||||
import org.apache.solr.util.HighFrequencyDictionary;
|
||||
import org.apache.solr.spelling.suggest.fst.FSTLookupFactory;
|
||||
import org.apache.solr.spelling.suggest.jaspell.JaspellLookupFactory;
|
||||
import org.apache.solr.spelling.suggest.tst.TSTLookupFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -80,11 +85,18 @@ public class Suggester extends SolrSpellChecker {
|
|||
sourceLocation = (String) config.get(LOCATION);
|
||||
field = (String)config.get(FIELD);
|
||||
lookupImpl = (String)config.get(LOOKUP_IMPL);
|
||||
if (lookupImpl == null) {
|
||||
lookupImpl = JaspellLookup.class.getName();
|
||||
|
||||
// support the old classnames without -Factory for config file backwards compatibility.
|
||||
if (lookupImpl == null || "org.apache.solr.spelling.suggest.jaspell.JaspellLookup".equals(lookupImpl)) {
|
||||
lookupImpl = JaspellLookupFactory.class.getName();
|
||||
} else if ("org.apache.solr.spelling.suggest.tst.TSTLookup".equals(lookupImpl)) {
|
||||
lookupImpl = TSTLookupFactory.class.getName();
|
||||
} else if ("org.apache.solr.spelling.suggest.fst.FSTLookup".equals(lookupImpl)) {
|
||||
lookupImpl = FSTLookupFactory.class.getName();
|
||||
}
|
||||
lookup = (Lookup) core.getResourceLoader().newInstance(lookupImpl);
|
||||
lookup.init(config, core);
|
||||
|
||||
LookupFactory factory = (LookupFactory) core.getResourceLoader().newInstance(lookupImpl);
|
||||
lookup = factory.create(config, core);
|
||||
String store = (String)config.get(STORE_DIR);
|
||||
if (store != null) {
|
||||
storeDir = new File(store);
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
package org.apache.solr.spelling.suggest.fst;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.fst.FSTLookup;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.spelling.suggest.LookupFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link FSTLookup}
|
||||
*/
|
||||
public class FSTLookupFactory extends LookupFactory {
|
||||
|
||||
/**
|
||||
* The number of separate buckets for weights (discretization). The more buckets,
|
||||
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
|
||||
* will not decrease for prefixes which have highly-weighted completions (because these
|
||||
* are filled-in first), but will decrease significantly for low-weighted terms (but
|
||||
* these should be infrequent, so it is all right).
|
||||
*
|
||||
* <p>The number of buckets must be within [1, 255] range.
|
||||
*/
|
||||
public static final String WEIGHT_BUCKETS = "weightBuckets";
|
||||
|
||||
/**
|
||||
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
|
||||
* of other strings in the automaton (possibly with larger weights).
|
||||
*/
|
||||
public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
|
||||
|
||||
@Override
|
||||
public Lookup create(NamedList params, SolrCore core) {
|
||||
int buckets = params.get(WEIGHT_BUCKETS) != null
|
||||
? Integer.parseInt(params.get(WEIGHT_BUCKETS).toString())
|
||||
: 10;
|
||||
|
||||
boolean exactMatchFirst = params.get(EXACT_MATCH_FIRST) != null
|
||||
? Boolean.valueOf(params.get(EXACT_MATCH_FIRST).toString())
|
||||
: true;
|
||||
|
||||
return new FSTLookup(buckets, exactMatchFirst);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
package org.apache.solr.spelling.suggest.jaspell;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.spelling.suggest.LookupFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link JaspellLookup}
|
||||
*/
|
||||
public class JaspellLookupFactory extends LookupFactory {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(JaspellLookup.class);
|
||||
|
||||
@Override
|
||||
public Lookup create(NamedList params, SolrCore core) {
|
||||
LOG.info("init: " + params);
|
||||
return new JaspellLookup();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
package org.apache.solr.spelling.suggest.tst;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.tst.TSTLookup;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.spelling.suggest.LookupFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link TSTLookup}
|
||||
*/
|
||||
public class TSTLookupFactory extends LookupFactory {
|
||||
|
||||
@Override
|
||||
public Lookup create(NamedList params, SolrCore core) {
|
||||
return new TSTLookup();
|
||||
}
|
||||
}
|
|
@ -1,11 +0,0 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
public final class TermFreq {
|
||||
public final String term;
|
||||
public final float v;
|
||||
|
||||
public TermFreq(String term, float v) {
|
||||
this.term = term;
|
||||
this.v = v;
|
||||
}
|
||||
}
|
|
@ -1,40 +0,0 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
|
||||
/**
|
||||
* A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
|
||||
*/
|
||||
public final class TermFreqArrayIterator implements TermFreqIterator {
|
||||
private final Iterator<TermFreq> i;
|
||||
private TermFreq current;
|
||||
|
||||
public TermFreqArrayIterator(Iterator<TermFreq> i) {
|
||||
this.i = i;
|
||||
}
|
||||
|
||||
public TermFreqArrayIterator(TermFreq [] i) {
|
||||
this(Arrays.asList(i));
|
||||
}
|
||||
|
||||
public TermFreqArrayIterator(Iterable<TermFreq> i) {
|
||||
this(i.iterator());
|
||||
}
|
||||
|
||||
public float freq() {
|
||||
return current.v;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return i.hasNext();
|
||||
}
|
||||
|
||||
public String next() {
|
||||
return (current = i.next()).term;
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException(); }
|
||||
}
|
Loading…
Reference in New Issue