SOLR-351: external file value source

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@587098 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2007-10-22 13:53:14 +00:00
parent fc96a325ed
commit 2d07595bc6
5 changed files with 626 additions and 6 deletions

View File

@ -139,6 +139,10 @@ New Features
26. SOLR-334L Pluggable query parsers. Allows specification of query
type and arguments as a prefix on a query string. (yonik)
27. SOLR-351L External Value Source. An external file may be used
to specify the values of a field, currently usable as
a ValueSource in a FunctionQuery. (yonik)
Changes in runtime behavior
Optimizations

View File

@ -0,0 +1,98 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.document.Fieldable;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.function.FloatFieldSource;
import org.apache.solr.search.function.FileFloatSource;
import org.apache.solr.search.QParser;
import org.apache.solr.request.XMLWriter;
import org.apache.solr.request.TextResponseWriter;
import org.apache.solr.common.SolrException;
import java.util.Map;
import java.io.IOException;
/** Get values from an external file instead of the index.
*
* <p/><code>keyField</code> will normally be the unique key field, but it doesn't have to be.
* <ul><li> It's OK to have a keyField value that can't be found in the index</li>
* <li>It's OK to have some documents without a keyField in the file (defVal is used as the default)</li>
* <li>It's OK for a keyField value to point to multiple documents (no uniqueness requirement)</li>
* </ul>
* <code>valType</code> is a reference to another fieldType to define the value type of this field (must currently be FloatField (float))
*
* The format of the external file is simply newline separated keyFieldValue=floatValue.
* <br/>Example:
* <br/><code>doc33=1.414</code>
* <br/><code>doc34=3.14159</code>
* <br/><code>doc40=42</code>
*
* <p/>Solr looks for the external file in the index directory under the name of
* external_&lt;fieldname&gt; or external_&lt;fieldname&gt;.*
*
* <p/>If any files of the latter pattern appear, the last (after being sorted by name) will be used and previous versions will be deleted.
* This is to help support systems where one may not be able to overwrite a file (like Windows, if the file is in use).
* <p/>If the external file has already been loaded, and it is changed, those changes will not be visible until a commit has been done.
* <p/>The external file may be sorted or unsorted by the key field, but it will be substantially slower (untested) if it isn't sorted.
* <p/>Fields of this type may currently only be used as a ValueSource in a FunctionQuery.
*
* @version $Id$
*/
public class ExternalFileField extends FieldType {
private FieldType ftype;
private String keyFieldName;
private IndexSchema schema;
private float defVal;
protected void init(IndexSchema schema, Map<String,String> args) {
restrictProps(SORT_MISSING_FIRST | SORT_MISSING_LAST);
String ftypeS = getArg("valType", args);
if (ftypeS!=null) {
ftype = schema.getFieldTypes().get(ftypeS);
if (ftype==null || !(ftype instanceof FloatField)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Only float (FloatField) is currently supported as external field type. got " + ftypeS);
}
}
keyFieldName = args.remove("keyField");
String defValS = args.remove("defVal");
defVal = defValS==null ? 0 : Float.parseFloat(defValS);
this.schema = schema;
}
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
throw new UnsupportedOperationException();
}
public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
throw new UnsupportedOperationException();
}
public SortField getSortField(SchemaField field,boolean reverse) {
throw new UnsupportedOperationException();
}
public ValueSource getValueSource(SchemaField field, QParser parser) {
// default key field to unique key
SchemaField keyField = keyFieldName==null ? schema.getUniqueKeyField() : schema.getField(keyFieldName);
return new FileFloatSource(field, keyField, defVal, parser);
}
}

View File

@ -0,0 +1,417 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.function;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.Term;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.FieldType;
import org.apache.solr.search.QParser;
import java.io.*;
import java.util.*;
/**
* Obtains float field values from an external file.
* @version $Id$
*/
public class FileFloatSource extends ValueSource {
private SchemaField field;
private final SchemaField keyField;
private final float defVal;
private final String indexDir;
public FileFloatSource(SchemaField field, SchemaField keyField, float defVal, QParser parser) {
this.field = field;
this.keyField = keyField;
this.defVal = defVal;
this.indexDir = parser.getReq().getCore().getIndexDir();
}
public String description() {
return "float(" + field + ')';
}
public DocValues getValues(IndexReader reader) throws IOException {
final float[] arr = getCachedFloats(reader);
return new DocValues() {
public float floatVal(int doc) {
return arr[doc];
}
public int intVal(int doc) {
return (int)arr[doc];
}
public long longVal(int doc) {
return (long)arr[doc];
}
public double doubleVal(int doc) {
return (double)arr[doc];
}
public String strVal(int doc) {
return Float.toString(arr[doc]);
}
public String toString(int doc) {
return description() + '=' + floatVal(doc);
}
};
}
public boolean equals(Object o) {
if (o.getClass() != FileFloatSource.class) return false;
FileFloatSource other = (FileFloatSource)o;
return this.field.getName().equals(other.field.getName())
&& this.keyField.getName().equals(other.keyField.getName())
&& this.defVal == other.defVal
&& this.indexDir.equals(other.indexDir);
}
public int hashCode() {
return FileFloatSource.class.hashCode() + field.getName().hashCode();
};
public String toString() {
return "FileFloatSource(field="+field.getName()+",keyField="+keyField.getName()
+ ",defVal="+defVal+",indexDir="+indexDir+")";
}
private final float[] getCachedFloats(IndexReader reader) {
return (float[])floatCache.get(reader, new Entry(this));
}
static Cache floatCache = new Cache() {
protected Object createValue(IndexReader reader, Object key) {
return getFloats(((Entry)key).ffs, reader);
}
};
/** Internal cache. (from lucene FieldCache) */
abstract static class Cache {
private final Map readerCache = new WeakHashMap();
protected abstract Object createValue(IndexReader reader, Object key);
public Object get(IndexReader reader, Object key) {
Map innerCache;
Object value;
synchronized (readerCache) {
innerCache = (Map) readerCache.get(reader);
if (innerCache == null) {
innerCache = new HashMap();
readerCache.put(reader, innerCache);
value = null;
} else {
value = innerCache.get(key);
}
if (value == null) {
value = new CreationPlaceholder();
innerCache.put(key, value);
}
}
if (value instanceof CreationPlaceholder) {
synchronized (value) {
CreationPlaceholder progress = (CreationPlaceholder) value;
if (progress.value == null) {
progress.value = createValue(reader, key);
synchronized (readerCache) {
innerCache.put(key, progress.value);
onlyForTesting = progress.value;
}
}
return progress.value;
}
}
return value;
}
}
static Object onlyForTesting; // set to the last value
static final class CreationPlaceholder {
Object value;
}
/** Expert: Every composite-key in the internal cache is of this type. */
private static class Entry {
final FileFloatSource ffs;
public Entry(FileFloatSource ffs) {
this.ffs = ffs;
}
public boolean equals(Object o) {
if (!(o instanceof Entry)) return false;
Entry other = (Entry)o;
return ffs.equals(other.ffs);
}
public int hashCode() {
return ffs.hashCode();
}
}
private static float[] getFloats(FileFloatSource ffs, IndexReader reader) {
float[] vals = new float[reader.maxDoc()];
if (ffs.defVal != 0) {
Arrays.fill(vals, ffs.defVal);
}
InputStream is;
String fname = "external_" + ffs.field.getName();
try {
is = getLatestFile(ffs.indexDir, fname);
} catch (IOException e) {
// log, use defaults
SolrCore.log.severe("Error opening external value source file: " +e);
return vals;
}
BufferedReader r = new BufferedReader(new InputStreamReader(is));
String idName = ffs.keyField.getName().intern();
FieldType idType = ffs.keyField.getType();
boolean sorted=true; // assume sorted until we discover it's not
// warning: lucene's termEnum.skipTo() is not optimized... it simply does a next()
// because of this, simply ask the reader for a new termEnum rather than
// trying to use skipTo()
List<String> notFound = new ArrayList<String>();
int notFoundCount=0;
int otherErrors=0;
TermDocs termDocs = null;
Term protoTerm = new Term(idName, "");
TermEnum termEnum = null;
// Number of times to try termEnum.next() before resorting to skip
int numTimesNext = 10;
char delimiter='=';
String termVal;
boolean hasNext=true;
String prevKey="";
String lastVal="\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF";
try {
termDocs = reader.termDocs();
termEnum = reader.terms(protoTerm);
Term t = termEnum.term();
if (t != null && t.field() == idName) { // intern'd comparison
termVal = t.text();
} else {
termVal = lastVal;
}
for (String line; (line=r.readLine())!=null;) {
int delimIndex = line.indexOf(delimiter);
if (delimIndex < 0) continue;
int endIndex = line.length();
/* EOLs should already be removed for BufferedReader.readLine()
for(int endIndex = line.length();endIndex>delimIndex+1; endIndex--) {
char ch = line.charAt(endIndex-1);
if (ch!='\n' && ch!='\r') break;
}
*/
String key = line.substring(0, delimIndex);
String val = line.substring(delimIndex+1, endIndex);
String internalKey = idType.toInternal(key);
float fval;
try {
fval=Float.parseFloat(val);
} catch (Exception e) {
if (++otherErrors<=10) {
SolrCore.log.severe( "Error loading external value source + fileName + " + e
+ (otherErrors<10 ? "" : "\tSkipping future errors for this file.")
);
}
continue; // go to next line in file.. leave values as default.
}
if (sorted) {
// make sure this key is greater than the previous key
sorted = internalKey.compareTo(prevKey) >= 0;
prevKey = internalKey;
if (sorted) {
int countNext = 0;
for(;;) {
int cmp = internalKey.compareTo(termVal);
if (cmp == 0) {
termDocs.seek(termEnum);
while (termDocs.next()) {
vals[termDocs.doc()] = fval;
}
break;
} else if (cmp < 0) {
// term enum has already advanced past current key... we didn't find it.
if (notFoundCount<10) { // collect first 10 not found for logging
notFound.add(key);
}
notFoundCount++;
break;
} else {
// termEnum is less than our current key, so skip ahead
// try next() a few times to see if we hit or pass the target.
// Lucene's termEnum.skipTo() is currently unoptimized (it just does next())
// so the best thing is to simply ask the reader for a new termEnum(target)
// if we really need to skip.
if (++countNext > numTimesNext) {
termEnum = reader.terms(protoTerm.createTerm(internalKey));
t = termEnum.term();
} else {
hasNext = termEnum.next();
t = hasNext ? termEnum.term() : null;
}
if (t != null && t.field() == idName) { // intern'd comparison
termVal = t.text();
} else {
termVal = lastVal;
}
}
} // end for(;;)
}
}
if (!sorted) {
termEnum = reader.terms(protoTerm.createTerm(internalKey));
t = termEnum.term();
if (t != null && t.field() == idName // intern'd comparison
&& internalKey.equals(t.text()))
{
termDocs.seek (termEnum);
while (termDocs.next()) {
vals[termDocs.doc()] = fval;
}
} else {
if (notFoundCount<10) { // collect first 10 not found for logging
notFound.add(key);
}
notFoundCount++;
}
}
}
} catch (IOException e) {
// log, use defaults
SolrCore.log.severe("Error loading external value source: " +e);
} finally {
// swallow exceptions on close so we don't override any
// exceptions that happened in the loop
if (termDocs!=null) try{termDocs.close();}catch(Exception e){}
if (termEnum!=null) try{termEnum.close();}catch(Exception e){}
try{r.close();}catch(Exception e){}
}
SolrCore.log.info("Loaded external value source " + fname
+ (notFoundCount==0 ? "" : " :"+notFoundCount+" missing keys "+notFound)
);
return vals;
}
// Future: refactor/pull out into VersionedFile class
/* Open the latest version of a file... fileName if that exists, or
* the last fileName.* after being sorted lexicographically.
* Older versions of the file are deleted (and queued for deletion if
* that fails).
*/
private static InputStream getLatestFile(String dirName, String fileName) throws FileNotFoundException {
Collection<File> oldFiles=null;
final String prefix = fileName+'.';
File f = new File(dirName, fileName);
InputStream is = null;
// there can be a race between checking for a file and opening it...
// the user may have just put a new version in and deleted an old version.
// try multiple times in a row.
for (int retry=0; retry<10; retry++) {
try {
if (!f.exists()) {
File dir = new File(dirName);
String[] names = dir.list(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.startsWith(prefix);
}
});
Arrays.sort(names);
f = new File(dir, names[names.length-1]);
oldFiles = new ArrayList<File>();
for (int i=0; i<names.length-1; i++) {
oldFiles.add(new File(dir, names[i]));
}
}
is = new FileInputStream(f);
} catch (Exception e) {
// swallow exception for now
}
}
// allow exception to be thrown from the final try.
is = new FileInputStream(f);
// delete old files only after we have successfuly opened the newest
if (oldFiles != null) {
delete(oldFiles);
}
return is;
}
private static final Set<File> deleteList = new HashSet<File>();
private static synchronized void delete(Collection<File> files) {
synchronized (deleteList) {
deleteList.addAll(files);
List<File> deleted = new ArrayList<File>();
for (File df : deleteList) {
try {
df.delete();
// deleteList.remove(df);
deleted.add(df);
} catch (SecurityException e) {
if (!df.exists()) {
deleted.add(df);
}
}
}
deleteList.removeAll(deleted);
}
}
}

View File

@ -18,9 +18,16 @@
package org.apache.solr.search.function;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.core.SolrCore;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.Arrays;
import java.io.File;
import java.io.Writer;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
/**
* Tests some basic functionality of Solr while demonstrating good
@ -43,12 +50,27 @@ public class TestFunctionQuery extends AbstractSolrTestCase {
super.tearDown();
}
String base = "external_foo_extf";
void makeExternalFile(String field, String contents, String charset) {
String dir = h.getCore().getIndexDir();
String filename = dir + "/external_" + field + "." + System.currentTimeMillis();
try {
Writer out = new OutputStreamWriter(new FileOutputStream(filename), charset);
out.write(contents);
out.close();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
void createIndex(String field, float... values) {
// lrf.args.put("version","2.0");
for (float val : values) {
String s = Float.toString(val);
assertU(adoc("id", s, field, s));
System.out.println("added doc for " + val);
if (field!=null) assertU(adoc("id", s, field, s));
else assertU(adoc("id", s));
// System.out.println("added doc for " + val);
}
assertU(optimize()); // squeeze out any possible deleted docs
}
@ -77,14 +99,14 @@ public class TestFunctionQuery extends AbstractSolrTestCase {
// "//doc[./float[@name='foo_pf']='10.0' and ./float[@name='score']='10.0']"
for (int i=0; i<results.length; i+=2) {
String xpath = "//doc[./float[@name='" + field + "']='"
String xpath = "//doc[./float[@name='" + "id" + "']='"
+ results[i] + "' and ./float[@name='score']='"
+ results[i+1] + "']";
tests.add(xpath);
}
assertQ(req("q", parseableQuery
,"fl", "*,score"
,"fl", "*,score","indent","on","rows","100"
)
, tests.toArray(new String[tests.size()])
);
@ -137,4 +159,78 @@ public class TestFunctionQuery extends AbstractSolrTestCase {
doTest("foo_pf"); // a plain float field
doTest("foo_f"); // a sortable float field
}
public void testExternalField() {
String field = "foo_extf";
float[] ids = {100,-4,0,10,25,5,77,23,55,-78,-45,-24,63,78,94,22,34,54321,261,-627};
createIndex(null,ids);
// Unsorted field, largest first
makeExternalFile(field, "54321=543210\n0=-999\n25=250","UTF-8");
// test identity (straight field value)
singleTest(field, "\0", 54321, 543210, 0,-999, 25,250, 100, 1);
Object orig = FileFloatSource.onlyForTesting;
singleTest(field, "log(\0)");
// make sure the values were cached
assertTrue(orig == FileFloatSource.onlyForTesting);
singleTest(field, "sqrt(\0)");
assertTrue(orig == FileFloatSource.onlyForTesting);
makeExternalFile(field, "0=1","UTF-8");
assertU(commit());
assertTrue(orig != FileFloatSource.onlyForTesting);
Random r = new Random();
for (int i=0; i<10; i++) { // do more iterations for a thorough test
int len = r.nextInt(ids.length+1);
boolean sorted = r.nextBoolean();
// shuffle ids
for (int j=0; j<ids.length; j++) {
int other=r.nextInt(ids.length);
float v=ids[0];
ids[0] = ids[other];
ids[other] = v;
}
if (sorted) {
// sort only the first elements
Arrays.sort(ids,0,len);
}
// make random values
float[] vals = new float[len];
for (int j=0; j<len; j++) {
vals[j] = r.nextInt(200)-100;
}
// make and write the external file
StringBuilder sb = new StringBuilder();
for (int j=0; j<len; j++) {
sb.append("" + ids[j] + "=" + vals[j]+"\n");
}
makeExternalFile(field, sb.toString(),"UTF-8");
// make it visible
assertU(commit());
// test it
float[] answers = new float[ids.length*2];
for (int j=0; j<len; j++) {
answers[j*2] = ids[j];
answers[j*2+1] = vals[j];
}
for (int j=len; j<ids.length; j++) {
answers[j*2] = ids[j];
answers[j*2+1] = 1; // the default values
}
singleTest(field, "\0", answers);
System.out.println("Done test "+i);
}
}
}

View File

@ -232,7 +232,10 @@
<!-- since fields of this type are by default not stored or indexed, any data added to
them will be ignored outright
-->
<fieldtype name="ignored" stored="false" indexed="false" class="solr.StrField" />
<fieldType name="ignored" stored="false" indexed="false" class="solr.StrField" />
<fieldType name="file" keyField="id" defVal="1" stored="false" indexed="false" class="solr.ExternalFileField" valType="float"/>
</types>
@ -282,6 +285,8 @@
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
<dynamicField name="*_extf" type="file"/>
<dynamicField name="*_random" type="random" />
<!-- uncomment the following to ignore any fields that don't already match an existing