SOLR-2375 Store & Load functionality for Suggester Lookup implementations.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1075804 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andrzej Bialecki 2011-03-01 13:04:01 +00:00
parent 21355a452b
commit e71e682352
8 changed files with 315 additions and 16 deletions

View File

@ -175,6 +175,10 @@ Other Changes
using Generics where applicable in method/object declatations, and
adding @SuppressWarnings("unchecked") when appropriate (hossman)
* SOLR-2375: Suggester Lookup implementations now store trie data
and load it back on init. This means that large tries don't have to be
rebuilt on every commit or core reload. (ab)
Documentation
----------------------

View File

@ -83,6 +83,8 @@ public class Suggester extends SolrSpellChecker {
if (lookupImpl == null) {
lookupImpl = JaspellLookup.class.getName();
}
lookup = (Lookup) core.getResourceLoader().newInstance(lookupImpl);
lookup.init(config, core);
String store = (String)config.get(STORE_DIR);
if (store != null) {
storeDir = new File(store);
@ -91,6 +93,13 @@ public class Suggester extends SolrSpellChecker {
}
if (!storeDir.exists()) {
storeDir.mkdirs();
} else {
// attempt reload of the stored lookup
try {
lookup.load(storeDir);
} catch (IOException e) {
LOG.warn("Loading stored lookup data failed", e);
}
}
}
return name;
@ -107,17 +116,17 @@ public class Suggester extends SolrSpellChecker {
dictionary = new FileDictionary(new InputStreamReader(
core.getResourceLoader().openResource(sourceLocation), "UTF-8"));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
// should not happen
LOG.error("should not happen", e);
}
}
lookup = (Lookup) core.getResourceLoader().newInstance(lookupImpl);
try {
lookup.build(dictionary);
if (storeDir != null) {
lookup.store(storeDir);
}
} catch (Exception e) {
e.printStackTrace();
LOG.error("Error while building or storing Suggester data", e);
}
}

View File

@ -1,6 +1,10 @@
package org.apache.solr.spelling.suggest.jaspell;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@ -9,6 +13,7 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.Lookup;
import org.apache.solr.spelling.suggest.UnsortedTermFreqIteratorWrapper;
import org.apache.solr.spelling.suggest.jaspell.JaspellTernarySearchTrie.TSTNode;
import org.apache.solr.util.SortedIterator;
import org.apache.solr.util.TermFreqIterator;
import org.slf4j.Logger;
@ -16,7 +21,7 @@ import org.slf4j.LoggerFactory;
public class JaspellLookup extends Lookup {
private static final Logger LOG = LoggerFactory.getLogger(JaspellLookup.class);
JaspellTernarySearchTrie trie;
JaspellTernarySearchTrie trie = new JaspellTernarySearchTrie();
private boolean usePrefix = true;
private int editDistance = 2;
@ -89,14 +94,89 @@ public class JaspellLookup extends Lookup {
return res;
}
public static final String FILENAME = "jaspell.dat";
private static final byte LO_KID = 0x01;
private static final byte EQ_KID = 0x02;
private static final byte HI_KID = 0x04;
private static final byte HAS_VALUE = 0x08;
@Override
public boolean load(File storeDir) throws IOException {
return false;
File data = new File(storeDir, FILENAME);
if (!data.exists() || !data.canRead()) {
return false;
}
DataInputStream in = new DataInputStream(new FileInputStream(data));
TSTNode root = trie.new TSTNode('\0', null);
try {
readRecursively(in, root);
trie.setRoot(root);
} finally {
in.close();
}
return true;
}
private void readRecursively(DataInputStream in, TSTNode node) throws IOException {
node.splitchar = in.readChar();
byte mask = in.readByte();
if ((mask & HAS_VALUE) != 0) {
node.data = new Float(in.readFloat());
}
if ((mask & LO_KID) != 0) {
TSTNode kid = trie.new TSTNode('\0', node);
node.relatives[TSTNode.LOKID] = kid;
readRecursively(in, kid);
}
if ((mask & EQ_KID) != 0) {
TSTNode kid = trie.new TSTNode('\0', node);
node.relatives[TSTNode.EQKID] = kid;
readRecursively(in, kid);
}
if ((mask & HI_KID) != 0) {
TSTNode kid = trie.new TSTNode('\0', node);
node.relatives[TSTNode.HIKID] = kid;
readRecursively(in, kid);
}
}
@Override
public boolean store(File storeDir) throws IOException {
return false;
if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
return false;
}
TSTNode root = trie.getRoot();
if (root == null) { // empty tree
return false;
}
File data = new File(storeDir, FILENAME);
DataOutputStream out = new DataOutputStream(new FileOutputStream(data));
try {
writeRecursively(out, root);
out.flush();
} finally {
out.close();
}
return true;
}
private void writeRecursively(DataOutputStream out, TSTNode node) throws IOException {
if (node == null) {
return;
}
out.writeChar(node.splitchar);
byte mask = 0;
if (node.relatives[TSTNode.LOKID] != null) mask |= LO_KID;
if (node.relatives[TSTNode.EQKID] != null) mask |= EQ_KID;
if (node.relatives[TSTNode.HIKID] != null) mask |= HI_KID;
if (node.data != null) mask |= HAS_VALUE;
out.writeByte(mask);
if (node.data != null) {
out.writeFloat((Float)node.data);
}
writeRecursively(out, node.relatives[TSTNode.LOKID]);
writeRecursively(out, node.relatives[TSTNode.EQKID]);
writeRecursively(out, node.relatives[TSTNode.HIKID]);
}
}

View File

@ -127,6 +127,16 @@ public class JaspellTernarySearchTrie {
*/
public JaspellTernarySearchTrie() {
}
// for loading
void setRoot(TSTNode newRoot) {
rootNode = newRoot;
}
// for saving
TSTNode getRoot() {
return rootNode;
}
/**
* Constructs a Ternary Search Trie and loads data from a <code>File</code>

View File

@ -1,6 +1,10 @@
package org.apache.solr.spelling.suggest.tst;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@ -13,8 +17,8 @@ import org.apache.solr.util.SortedIterator;
import org.apache.solr.util.TermFreqIterator;
public class TSTLookup extends Lookup {
TernaryTreeNode root;
TSTAutocomplete autocomplete;
TernaryTreeNode root = new TernaryTreeNode();
TSTAutocomplete autocomplete = new TSTAutocomplete();
@Override
public void init(NamedList config, SolrCore core) {
@ -23,7 +27,6 @@ public class TSTLookup extends Lookup {
@Override
public void build(TermFreqIterator tfit) throws IOException {
root = new TernaryTreeNode();
autocomplete = new TSTAutocomplete();
// buffer first
if (!(tfit instanceof SortedIterator)) {
// make sure it's sorted
@ -48,7 +51,16 @@ public class TSTLookup extends Lookup {
@Override
public Object get(String key) {
throw new UnsupportedOperationException("get() is not supported here");
List<TernaryTreeNode> list = autocomplete.prefixCompletion(root, key, 0);
if (list == null || list.isEmpty()) {
return null;
}
for (TernaryTreeNode n : list) {
if (n.token.equals(key)) {
return n.val;
}
}
return null;
}
@Override
@ -75,15 +87,94 @@ public class TSTLookup extends Lookup {
}
return res;
}
public static final String FILENAME = "tst.dat";
private static final byte LO_KID = 0x01;
private static final byte EQ_KID = 0x02;
private static final byte HI_KID = 0x04;
private static final byte HAS_TOKEN = 0x08;
private static final byte HAS_VALUE = 0x10;
@Override
public boolean load(File storeDir) throws IOException {
return false;
public synchronized boolean load(File storeDir) throws IOException {
File data = new File(storeDir, FILENAME);
if (!data.exists() || !data.canRead()) {
return false;
}
DataInputStream in = new DataInputStream(new FileInputStream(data));
root = new TernaryTreeNode();
try {
readRecursively(in, root);
} finally {
in.close();
}
return true;
}
// pre-order traversal
private void readRecursively(DataInputStream in, TernaryTreeNode node) throws IOException {
node.splitchar = in.readChar();
byte mask = in.readByte();
if ((mask & HAS_TOKEN) != 0) {
node.token = in.readUTF();
}
if ((mask & HAS_VALUE) != 0) {
node.val = new Float(in.readFloat());
}
if ((mask & LO_KID) != 0) {
node.loKid = new TernaryTreeNode();
readRecursively(in, node.loKid);
}
if ((mask & EQ_KID) != 0) {
node.eqKid = new TernaryTreeNode();
readRecursively(in, node.eqKid);
}
if ((mask & HI_KID) != 0) {
node.hiKid = new TernaryTreeNode();
readRecursively(in, node.hiKid);
}
}
@Override
public boolean store(File storeDir) throws IOException {
return false;
public synchronized boolean store(File storeDir) throws IOException {
if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
return false;
}
File data = new File(storeDir, FILENAME);
DataOutputStream out = new DataOutputStream(new FileOutputStream(data));
try {
writeRecursively(out, root);
out.flush();
} finally {
out.close();
}
return true;
}
// pre-order traversal
private void writeRecursively(DataOutputStream out, TernaryTreeNode node) throws IOException {
// write out the current node
out.writeChar(node.splitchar);
// prepare a mask of kids
byte mask = 0;
if (node.eqKid != null) mask |= EQ_KID;
if (node.loKid != null) mask |= LO_KID;
if (node.hiKid != null) mask |= HI_KID;
if (node.token != null) mask |= HAS_TOKEN;
if (node.val != null) mask |= HAS_VALUE;
out.writeByte(mask);
if (node.token != null) out.writeUTF(node.token);
if (node.val != null) out.writeFloat((Float)node.val);
// recurse and write kids
if (node.loKid != null) {
writeRecursively(out, node.loKid);
}
if (node.eqKid != null) {
writeRecursively(out, node.eqKid);
}
if (node.hiKid != null) {
writeRecursively(out, node.hiKid);
}
}
}

View File

@ -37,6 +37,7 @@
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.jaspell.JaspellLookup</str>
<str name="field">suggest</str>
<str name="storeDir">suggest</str>
<str name="buildOnCommit">true</str>
<!-- Suggester properties -->

View File

@ -0,0 +1,80 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling.suggest;
import java.io.File;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.junit.Test;
public class PersistenceTest extends SolrTestCaseJ4 {
public static final String[] keys = new String[] {
"one",
"two",
"three",
"four",
"oneness",
"onerous",
"onesimus",
"twofold",
"twonk",
"thrive",
"through",
"threat",
"foundation",
"fourier",
"fourty"
};
@Test
public void testTSTPersistence() throws Exception {
TSTLookup lookup = new TSTLookup();
for (String k : keys) {
lookup.add(k, new Float(k.length()));
}
File storeDir = new File(TEST_HOME);
lookup.store(storeDir);
lookup = new TSTLookup();
lookup.load(storeDir);
for (String k : keys) {
Float val = (Float)lookup.get(k);
assertNotNull(k, val);
assertEquals(k, k.length(), val.intValue());
}
}
@Test
public void testJaspellPersistence() throws Exception {
JaspellLookup lookup = new JaspellLookup();
for (String k : keys) {
lookup.add(k, new Float(k.length()));
}
File storeDir = new File(TEST_HOME);
lookup.store(storeDir);
lookup = new JaspellLookup();
lookup.load(storeDir);
for (String k : keys) {
Float val = (Float)lookup.get(k);
assertNotNull(k, val);
assertEquals(k, k.length(), val.intValue());
}
}
}

View File

@ -27,6 +27,7 @@ import org.apache.solr.util.TermFreqIterator;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -65,6 +66,29 @@ public class SuggesterTest extends SolrTestCaseJ4 {
@Test
public void testReload() throws Exception {
String leaveData = System.getProperty("solr.test.leavedatadir");
if (leaveData == null) leaveData = "";
System.setProperty("solr.test.leavedatadir", "true");
addDocs();
assertU(commit());
File data = dataDir;
String config = configString;
deleteCore();
dataDir = data;
configString = config;
initCore();
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
);
// restore the property
System.setProperty("solr.test.leavedatadir", leaveData);
}
@Test
public void testRebuild() throws Exception {
addDocs();
assertU(commit());
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),