LUCENE-1015: Added support for caching longs and doubles in a new interface, ExtendedFieldCache and *Impl. Added support into the various sorting fields. Put in some testing for it.

Removed the author tag in NumberTools.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@590530 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2007-10-31 01:28:25 +00:00
parent 526d97676d
commit 7012f67027
9 changed files with 425 additions and 38 deletions

View File

@ -154,6 +154,9 @@ New features
obtained or released, throwing an exception if an illegal lock
obtain occurred. (Patrick Kimber vis Mike McCandless)
6. LUCENE-1015: Added FieldCache extension (ExtendedFieldCache) to support doubles and longs.
Added support into SortField for sorting on doubles and longs as well. (Grant Ingersoll)
Optimizations
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the

View File

@ -31,7 +31,7 @@ package org.apache.lucene.document;
* This class handles <b>all</b> long values (unlike
* {@link org.apache.lucene.document.DateField}).
*
* @author Matt Quail (spud at madbean dot com)
*
*/
public class NumberTools {

View File

@ -0,0 +1,87 @@
package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/**
*
*
**/
public interface ExtendedFieldCache extends FieldCache {
public interface LongParser {
/**
* Return an long representation of this field's value.
*/
public long parseLong(String string);
}
public interface DoubleParser {
/**
* Return an long representation of this field's value.
*/
public double parseDouble(String string);
}
public static ExtendedFieldCache EXT_DEFAULT = new ExtendedFieldCacheImpl();
/**
* Checks the internal cache for an appropriate entry, and if none is
* found, reads the terms in <code>field</code> as longs and returns an array
* of size <code>reader.maxDoc()</code> of the value each document
* has in the given field.
*
* @param reader Used to get field values.
* @param field Which field contains the longs.
* @return The values in the given field for each document.
* @throws java.io.IOException If any error occurs.
*/
public long[] getLongs(IndexReader reader, String field)
throws IOException;
/**
* Checks the internal cache for an appropriate entry, and if none is found,
* reads the terms in <code>field</code> as longs and returns an array of
* size <code>reader.maxDoc()</code> of the value each document has in the
* given field.
*
* @param reader Used to get field values.
* @param field Which field contains the longs.
* @param parser Computes integer for string values.
* @return The values in the given field for each document.
* @throws IOException If any error occurs.
*/
public long[] getLongs(IndexReader reader, String field, LongParser parser)
throws IOException;
/**
* Checks the internal cache for an appropriate entry, and if none is
* found, reads the terms in <code>field</code> as integers and returns an array
* of size <code>reader.maxDoc()</code> of the value each document
* has in the given field.
*
* @param reader Used to get field values.
* @param field Which field contains the doubles.
* @return The values in the given field for each document.
* @throws IOException If any error occurs.
*/
public double[] getDoubles(IndexReader reader, String field)
throws IOException;
/**
* Checks the internal cache for an appropriate entry, and if none is found,
* reads the terms in <code>field</code> as doubles and returns an array of
* size <code>reader.maxDoc()</code> of the value each document has in the
* given field.
*
* @param reader Used to get field values.
* @param field Which field contains the doubles.
* @param parser Computes integer for string values.
* @return The values in the given field for each document.
* @throws IOException If any error occurs.
*/
public double[] getDoubles(IndexReader reader, String field, DoubleParser parser)
throws IOException;
}

View File

@ -0,0 +1,117 @@
package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import java.io.IOException;
/**
*
*
**/
class ExtendedFieldCacheImpl extends FieldCacheImpl implements ExtendedFieldCache {
private static final LongParser LONG_PARSER = new LongParser() {
public long parseLong(String value) {
return Long.parseLong(value);
}
};
private static final DoubleParser DOUBLE_PARSER = new DoubleParser() {
public double parseDouble(String value) {
return Double.parseDouble(value);
}
};
private static final ByteParser BYTE_PARSER = new ByteParser() {
public byte parseByte(String string) {
return Byte.parseByte(string);
}
};
private static final ShortParser SHORT_PARSER = new ShortParser() {
public short parseShort(String string) {
return Short.parseShort(string);
}
};
public long[] getLongs(IndexReader reader, String field) throws IOException {
return getLongs(reader, field, LONG_PARSER);
}
// inherit javadocs
public long[] getLongs(IndexReader reader, String field, LongParser parser)
throws IOException {
return (long[]) longsCache.get(reader, new Entry(field, parser));
}
Cache longsCache = new Cache() {
protected Object createValue(IndexReader reader, Object entryKey)
throws IOException {
Entry entry = (Entry) entryKey;
String field = entry.field;
LongParser parser = (LongParser) entry.custom;
final long[] retArray = new long[reader.maxDoc()];
TermDocs termDocs = reader.termDocs();
TermEnum termEnum = reader.terms (new Term(field, ""));
try {
do {
Term term = termEnum.term();
if (term==null || term.field() != field) break;
long termval = parser.parseLong(term.text());
termDocs.seek (termEnum);
while (termDocs.next()) {
retArray[termDocs.doc()] = termval;
}
} while (termEnum.next());
} finally {
termDocs.close();
termEnum.close();
}
return retArray;
}
};
// inherit javadocs
public double[] getDoubles(IndexReader reader, String field)
throws IOException {
return getDoubles(reader, field, DOUBLE_PARSER);
}
// inherit javadocs
public double[] getDoubles(IndexReader reader, String field, DoubleParser parser)
throws IOException {
return (double[]) doublesCache.get(reader, new Entry(field, parser));
}
Cache doublesCache = new Cache() {
protected Object createValue(IndexReader reader, Object entryKey)
throws IOException {
Entry entry = (Entry) entryKey;
String field = entry.field;
DoubleParser parser = (DoubleParser) entry.custom;
final double[] retArray = new double[reader.maxDoc()];
TermDocs termDocs = reader.termDocs();
TermEnum termEnum = reader.terms (new Term (field, ""));
try {
do {
Term term = termEnum.term();
if (term==null || term.field() != field) break;
double termval = parser.parseDouble(term.text());
termDocs.seek (termEnum);
while (termDocs.next()) {
retArray[termDocs.doc()] = termval;
}
} while (termEnum.next());
} finally {
termDocs.close();
termEnum.close();
}
return retArray;
}
};
}

View File

@ -109,20 +109,29 @@ extends PriorityQueue {
for (int i=0; i<n && c==0; ++i) {
final int type = fields[i].getType();
switch (type) {
case SortField.SCORE:
case SortField.SCORE:{
float r1 = ((Float)docA.fields[i]).floatValue();
float r2 = ((Float)docB.fields[i]).floatValue();
if (r1 > r2) c = -1;
if (r1 < r2) c = 1;
break;
case SortField.DOC:
case SortField.INT:
}
case SortField.DOC:
case SortField.INT:{
int i1 = ((Integer)docA.fields[i]).intValue();
int i2 = ((Integer)docB.fields[i]).intValue();
if (i1 < i2) c = -1;
if (i1 > i2) c = 1;
break;
case SortField.STRING:
}
case SortField.LONG:{
long l1 = ((Long)docA.fields[i]).longValue();
long l2 = ((Long)docB.fields[i]).longValue();
if (l1 < l2) c = -1;
if (l1 > l2) c = 1;
break;
}
case SortField.STRING:{
String s1 = (String) docA.fields[i];
String s2 = (String) docB.fields[i];
// null values need to be sorted first, because of how FieldCache.getStringIndex()
@ -136,24 +145,36 @@ extends PriorityQueue {
c = collators[i].compare (s1, s2);
}
break;
case SortField.FLOAT:
}
case SortField.FLOAT:{
float f1 = ((Float)docA.fields[i]).floatValue();
float f2 = ((Float)docB.fields[i]).floatValue();
if (f1 < f2) c = -1;
if (f1 > f2) c = 1;
break;
case SortField.CUSTOM:
}
case SortField.DOUBLE:{
double d1 = ((Double)docA.fields[i]).doubleValue();
double d2 = ((Double)docB.fields[i]).doubleValue();
if (d1 < d2) c = -1;
if (d1 > d2) c = 1;
break;
}
case SortField.CUSTOM:{
c = docA.fields[i].compareTo (docB.fields[i]);
break;
case SortField.AUTO:
}
case SortField.AUTO:{
// we cannot handle this - even if we determine the type of object (Float or
// Integer), we don't necessarily know how to compare them (both SCORE and
// FLOAT contain floats, but are sorted opposite of each other). Before
// we get here, each AUTO should have been replaced with its actual value.
throw new RuntimeException ("FieldDocSortedHitQueue cannot use an AUTO SortField");
default:
}
default:{
throw new RuntimeException ("invalid SortField type: "+type);
}
}
}
if (fields[i].getReverse()) {
c = -c;
}

View File

@ -21,8 +21,8 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.PriorityQueue;
import java.io.IOException;
import java.util.Locale;
import java.text.Collator;
import java.util.Locale;
/**
* Expert: A hit queue for sorting by hits by terms in more than one field.
@ -177,6 +177,12 @@ extends PriorityQueue {
case SortField.FLOAT:
comparator = comparatorFloat (reader, fieldname);
break;
case SortField.LONG:
comparator = comparatorLong(reader, fieldname);
break;
case SortField.DOUBLE:
comparator = comparatorDouble(reader, fieldname);
break;
case SortField.STRING:
if (locale != null) comparator = comparatorStringLocale (reader, fieldname, locale);
else comparator = comparatorString (reader, fieldname);
@ -222,6 +228,38 @@ extends PriorityQueue {
};
}
/**
* Returns a comparator for sorting hits according to a field containing integers.
* @param reader Index to use.
* @param fieldname Fieldable containg integer values.
* @return Comparator for sorting hits.
* @throws IOException If an error occurs reading the index.
*/
static ScoreDocComparator comparatorLong (final IndexReader reader, final String fieldname)
throws IOException {
final String field = fieldname.intern();
final long[] fieldOrder = ExtendedFieldCache.EXT_DEFAULT.getLongs (reader, field);
return new ScoreDocComparator() {
public final int compare (final ScoreDoc i, final ScoreDoc j) {
final long li = fieldOrder[i.doc];
final long lj = fieldOrder[j.doc];
if (li < lj) return -1;
if (li > lj) return 1;
return 0;
}
public Comparable sortValue (final ScoreDoc i) {
return new Long(fieldOrder[i.doc]);
}
public int sortType() {
return SortField.LONG;
}
};
}
/**
* Returns a comparator for sorting hits according to a field containing floats.
* @param reader Index to use.
@ -253,6 +291,37 @@ extends PriorityQueue {
};
}
/**
* Returns a comparator for sorting hits according to a field containing doubles.
* @param reader Index to use.
* @param fieldname Fieldable containg float values.
* @return Comparator for sorting hits.
* @throws IOException If an error occurs reading the index.
*/
static ScoreDocComparator comparatorDouble(final IndexReader reader, final String fieldname)
throws IOException {
final String field = fieldname.intern();
final double[] fieldOrder = ExtendedFieldCache.EXT_DEFAULT.getDoubles (reader, field);
return new ScoreDocComparator () {
public final int compare (final ScoreDoc i, final ScoreDoc j) {
final double di = fieldOrder[i.doc];
final double dj = fieldOrder[j.doc];
if (di < dj) return -1;
if (di > dj) return 1;
return 0;
}
public Comparable sortValue (final ScoreDoc i) {
return new Double (fieldOrder[i.doc]);
}
public int sortType() {
return SortField.DOUBLE;
}
};
}
/**
* Returns a comparator for sorting hits according to a field containing strings.
* @param reader Index to use.

View File

@ -60,6 +60,14 @@ implements Serializable {
* lower values are at the front. */
public static final int FLOAT = 5;
/** Sort using term values as encoded Longs. Sort values are Long and
* lower values are at the front. */
public static final int LONG = 6;
/** Sort using term values as encoded Doubles. Sort values are Double and
* lower values are at the front. */
public static final int DOUBLE = 7;
/** Sort using a custom Comparator. Sort values are any Comparable and
* sorting is done according to natural order. */
public static final int CUSTOM = 9;

View File

@ -0,0 +1,71 @@
package org.apache.lucene.search;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
public class TestExtendedFieldCache extends LuceneTestCase {
protected IndexReader reader;
private static final int NUM_DOCS = 1000;
public TestExtendedFieldCache(String s) {
super(s);
}
protected void setUp() throws Exception {
super.setUp();
RAMDirectory directory = new RAMDirectory();
IndexWriter writer= new IndexWriter(directory, new WhitespaceAnalyzer(), true);
long theLong = Long.MAX_VALUE;
double theDouble = Double.MAX_VALUE;
for (int i = 0; i < NUM_DOCS; i++){
Document doc = new Document();
doc.add(new Field("theLong", String.valueOf(theLong--), Field.Store.NO, Field.Index.UN_TOKENIZED));
doc.add(new Field("theDouble", String.valueOf(theDouble--), Field.Store.NO, Field.Index.UN_TOKENIZED));
doc.add(new Field("text", English.intToEnglish(i), Field.Store.NO, Field.Index.TOKENIZED));
writer.addDocument(doc);
}
writer.close();
reader = IndexReader.open(directory);
}
public void test() throws IOException {
ExtendedFieldCache cache = new ExtendedFieldCacheImpl();
double [] doubles = cache.getDoubles(reader, "theDouble");
assertTrue("doubles Size: " + doubles.length + " is not: " + NUM_DOCS, doubles.length == NUM_DOCS);
for (int i = 0; i < doubles.length; i++) {
assertTrue(doubles[i] + " does not equal: " + (Double.MAX_VALUE - i), doubles[i] == (Double.MAX_VALUE - i));
}
long [] longs = cache.getLongs(reader, "theLong");
assertTrue("longs Size: " + longs.length + " is not: " + NUM_DOCS, longs.length == NUM_DOCS);
for (int i = 0; i < longs.length; i++) {
assertTrue(longs[i] + " does not equal: " + (Long.MAX_VALUE - i), longs[i] == (Long.MAX_VALUE - i));
}
}
}

View File

@ -17,27 +17,28 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.*;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.textui.TestRunner;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
import java.io.Serializable;
import java.rmi.Naming;
import java.rmi.registry.LocateRegistry;
import java.rmi.registry.Registry;
import java.io.IOException;
import java.io.Serializable;
import java.util.regex.Pattern;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.BitSet;
import junit.framework.TestCase;
import junit.framework.Test;
import junit.framework.TestSuite;
import junit.textui.TestRunner;
import java.util.regex.Pattern;
/**
* Unit tests for sorting code.
@ -97,21 +98,21 @@ implements Serializable {
// the string field to sort by string
// the i18n field includes accented characters for testing locale-specific sorting
private String[][] data = new String[][] {
// tracer contents int float string custom i18n
{ "A", "x a", "5", "4f", "c", "A-3", "p\u00EAche"},
{ "B", "y a", "5", "3.4028235E38", "i", "B-10", "HAT"},
{ "C", "x a b c", "2147483647", "1.0", "j", "A-2", "p\u00E9ch\u00E9"},
{ "D", "y a b c", "-1", "0.0f", "a", "C-0", "HUT"},
{ "E", "x a b c d", "5", "2f", "h", "B-8", "peach"},
{ "F", "y a b c d", "2", "3.14159f", "g", "B-1", "H\u00C5T"},
{ "G", "x a b c d", "3", "-1.0", "f", "C-100", "sin"},
{ "H", "y a b c d", "0", "1.4E-45", "e", "C-88", "H\u00D8T"},
{ "I", "x a b c d e f", "-2147483648", "1.0e+0", "d", "A-10", "s\u00EDn"},
{ "J", "y a b c d e f", "4", ".5", "b", "C-7", "HOT"},
{ "W", "g", "1", null, null, null, null},
{ "X", "g", "1", "0.1", null, null, null},
{ "Y", "g", "1", "0.2", null, null, null},
{ "Z", "f g", null, null, null, null, null}
// tracer contents int float string custom i18n long double
{ "A", "x a", "5", "4f", "c", "A-3", "p\u00EAche", "10", "-4.0"},//A
{ "B", "y a", "5", "3.4028235E38", "i", "B-10", "HAT", "1000000000", "40.0"},//B
{ "C", "x a b c", "2147483647", "1.0", "j", "A-2", "p\u00E9ch\u00E9", "99999999", "40.00002343"},//C
{ "D", "y a b c", "-1", "0.0f", "a", "C-0", "HUT", String.valueOf(Long.MAX_VALUE), String.valueOf(Double.MIN_VALUE)},//D
{ "E", "x a b c d", "5", "2f", "h", "B-8", "peach", String.valueOf(Long.MIN_VALUE), String.valueOf(Double.MAX_VALUE)},//E
{ "F", "y a b c d", "2", "3.14159f", "g", "B-1", "H\u00C5T", "-44", "343.034435444"},//F
{ "G", "x a b c d", "3", "-1.0", "f", "C-100", "sin", "323254543543", "4.043544"},//G
{ "H", "y a b c d", "0", "1.4E-45", "e", "C-88", "H\u00D8T", "1023423423005","4.043545"},//H
{ "I", "x a b c d e f", "-2147483648", "1.0e+0", "d", "A-10", "s\u00EDn", "332422459999", "4.043546"},//I
{ "J", "y a b c d e f", "4", ".5", "b", "C-7", "HOT", "34334543543", "4.0000220343"},//J
{ "W", "g", "1", null, null, null, null, null, null},
{ "X", "g", "1", "0.1", null, null, null, null, null},
{ "Y", "g", "1", "0.2", null, null, null, null, null},
{ "Z", "f g", null, null, null, null, null, null, null}
};
// create an index of all the documents, or just the x, or just the y documents
@ -129,6 +130,8 @@ implements Serializable {
if (data[i][4] != null) doc.add (new Field ("string", data[i][4], Field.Store.NO, Field.Index.UN_TOKENIZED));
if (data[i][5] != null) doc.add (new Field ("custom", data[i][5], Field.Store.NO, Field.Index.UN_TOKENIZED));
if (data[i][6] != null) doc.add (new Field ("i18n", data[i][6], Field.Store.NO, Field.Index.UN_TOKENIZED));
if (data[i][7] != null) doc.add (new Field ("long", data[i][7], Field.Store.NO, Field.Index.UN_TOKENIZED));
if (data[i][8] != null) doc.add (new Field ("double", data[i][8], Field.Store.NO, Field.Index.UN_TOKENIZED));
doc.setBoost(2); // produce some scores above 1.0
writer.addDocument (doc);
}
@ -192,7 +195,15 @@ implements Serializable {
assertMatches (full, queryX, sort, "GCIEA");
assertMatches (full, queryY, sort, "DHJFB");
sort.setSort (new SortField[] { new SortField ("string", SortField.STRING), SortField.FIELD_DOC });
sort.setSort (new SortField[] { new SortField ("long", SortField.LONG), SortField.FIELD_DOC });
assertMatches (full, queryX, sort, "EACGI");
assertMatches (full, queryY, sort, "FBJHD");
sort.setSort (new SortField[] { new SortField ("double", SortField.DOUBLE), SortField.FIELD_DOC });
assertMatches (full, queryX, sort, "AGICE");
assertMatches (full, queryY, sort, "DJHBF");
sort.setSort (new SortField[] { new SortField ("string", SortField.STRING), SortField.FIELD_DOC });
assertMatches (full, queryX, sort, "AIGEC");
assertMatches (full, queryY, sort, "DJHFB");
}