diff --git a/CHANGES.txt b/CHANGES.txt index 52ff971c465..7b99a96bce1 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -154,6 +154,9 @@ New features obtained or released, throwing an exception if an illegal lock obtain occurred. (Patrick Kimber vis Mike McCandless) + 6. LUCENE-1015: Added FieldCache extension (ExtendedFieldCache) to support doubles and longs. + Added support into SortField for sorting on doubles and longs as well. (Grant Ingersoll) + Optimizations 1. LUCENE-937: CachingTokenFilter now uses an iterator to access the diff --git a/src/java/org/apache/lucene/document/NumberTools.java b/src/java/org/apache/lucene/document/NumberTools.java index d6669d42e43..1d70216d261 100644 --- a/src/java/org/apache/lucene/document/NumberTools.java +++ b/src/java/org/apache/lucene/document/NumberTools.java @@ -31,7 +31,7 @@ package org.apache.lucene.document; * This class handles all long values (unlike * {@link org.apache.lucene.document.DateField}). * - * @author Matt Quail (spud at madbean dot com) + * */ public class NumberTools { diff --git a/src/java/org/apache/lucene/search/ExtendedFieldCache.java b/src/java/org/apache/lucene/search/ExtendedFieldCache.java new file mode 100644 index 00000000000..265e997ad77 --- /dev/null +++ b/src/java/org/apache/lucene/search/ExtendedFieldCache.java @@ -0,0 +1,87 @@ +package org.apache.lucene.search; + +import org.apache.lucene.index.IndexReader; + +import java.io.IOException; + + +/** + * + * + **/ +public interface ExtendedFieldCache extends FieldCache { + public interface LongParser { + /** + * Return a long representation of this field's value. + */ + public long parseLong(String string); + } + + public interface DoubleParser { + /** + * Return a double representation of this field's value. 
+ */ + public double parseDouble(String string); + } + + public static ExtendedFieldCache EXT_DEFAULT = new ExtendedFieldCacheImpl(); + + /** + * Checks the internal cache for an appropriate entry, and if none is + * found, reads the terms in field as longs and returns an array + * of size reader.maxDoc() of the value each document + * has in the given field. + * + * @param reader Used to get field values. + * @param field Which field contains the longs. + * @return The values in the given field for each document. + * @throws java.io.IOException If any error occurs. + */ + public long[] getLongs(IndexReader reader, String field) + throws IOException; + + /** + * Checks the internal cache for an appropriate entry, and if none is found, + * reads the terms in field as longs and returns an array of + * size reader.maxDoc() of the value each document has in the + * given field. + * + * @param reader Used to get field values. + * @param field Which field contains the longs. + * @param parser Computes long for string values. + * @return The values in the given field for each document. + * @throws IOException If any error occurs. + */ + public long[] getLongs(IndexReader reader, String field, LongParser parser) + throws IOException; + + + /** + * Checks the internal cache for an appropriate entry, and if none is + * found, reads the terms in field as doubles and returns an array + * of size reader.maxDoc() of the value each document + * has in the given field. + * + * @param reader Used to get field values. + * @param field Which field contains the doubles. + * @return The values in the given field for each document. + * @throws IOException If any error occurs. 
+ */ + public double[] getDoubles(IndexReader reader, String field) + throws IOException; + + /** + * Checks the internal cache for an appropriate entry, and if none is found, + * reads the terms in field as doubles and returns an array of + * size reader.maxDoc() of the value each document has in the + * given field. + * + * @param reader Used to get field values. + * @param field Which field contains the doubles. + * @param parser Computes double for string values. + * @return The values in the given field for each document. + * @throws IOException If any error occurs. + */ + public double[] getDoubles(IndexReader reader, String field, DoubleParser parser) + throws IOException; +} diff --git a/src/java/org/apache/lucene/search/ExtendedFieldCacheImpl.java b/src/java/org/apache/lucene/search/ExtendedFieldCacheImpl.java new file mode 100644 index 00000000000..256f113c94b --- /dev/null +++ b/src/java/org/apache/lucene/search/ExtendedFieldCacheImpl.java @@ -0,0 +1,117 @@ +package org.apache.lucene.search; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; + +import java.io.IOException; + + +/** + * + * + **/ +class ExtendedFieldCacheImpl extends FieldCacheImpl implements ExtendedFieldCache { + private static final LongParser LONG_PARSER = new LongParser() { + public long parseLong(String value) { + return Long.parseLong(value); + } + }; + + private static final DoubleParser DOUBLE_PARSER = new DoubleParser() { + public double parseDouble(String value) { + return Double.parseDouble(value); + } + }; + + private static final ByteParser BYTE_PARSER = new ByteParser() { + public byte parseByte(String string) { + return Byte.parseByte(string); + } + }; + + private static final ShortParser SHORT_PARSER = new ShortParser() { + public short parseShort(String string) { + return Short.parseShort(string); + } + }; + + public long[] getLongs(IndexReader reader, 
String field) throws IOException { + return getLongs(reader, field, LONG_PARSER); + } + + // inherit javadocs + public long[] getLongs(IndexReader reader, String field, LongParser parser) + throws IOException { + return (long[]) longsCache.get(reader, new Entry(field, parser)); + } + + Cache longsCache = new Cache() { + + protected Object createValue(IndexReader reader, Object entryKey) + throws IOException { + Entry entry = (Entry) entryKey; + String field = entry.field; + LongParser parser = (LongParser) entry.custom; + final long[] retArray = new long[reader.maxDoc()]; + TermDocs termDocs = reader.termDocs(); + TermEnum termEnum = reader.terms (new Term(field, "")); + try { + do { + Term term = termEnum.term(); + if (term==null || term.field() != field) break; + long termval = parser.parseLong(term.text()); + termDocs.seek (termEnum); + while (termDocs.next()) { + retArray[termDocs.doc()] = termval; + } + } while (termEnum.next()); + } finally { + termDocs.close(); + termEnum.close(); + } + return retArray; + } + }; + + // inherit javadocs + public double[] getDoubles(IndexReader reader, String field) + throws IOException { + return getDoubles(reader, field, DOUBLE_PARSER); + } + + // inherit javadocs + public double[] getDoubles(IndexReader reader, String field, DoubleParser parser) + throws IOException { + return (double[]) doublesCache.get(reader, new Entry(field, parser)); + } + + Cache doublesCache = new Cache() { + + protected Object createValue(IndexReader reader, Object entryKey) + throws IOException { + Entry entry = (Entry) entryKey; + String field = entry.field; + DoubleParser parser = (DoubleParser) entry.custom; + final double[] retArray = new double[reader.maxDoc()]; + TermDocs termDocs = reader.termDocs(); + TermEnum termEnum = reader.terms (new Term (field, "")); + try { + do { + Term term = termEnum.term(); + if (term==null || term.field() != field) break; + double termval = parser.parseDouble(term.text()); + termDocs.seek (termEnum); + while 
(termDocs.next()) { + retArray[termDocs.doc()] = termval; + } + } while (termEnum.next()); + } finally { + termDocs.close(); + termEnum.close(); + } + return retArray; + } + }; +} diff --git a/src/java/org/apache/lucene/search/FieldDocSortedHitQueue.java b/src/java/org/apache/lucene/search/FieldDocSortedHitQueue.java index 7782a73be1f..208ed860d23 100644 --- a/src/java/org/apache/lucene/search/FieldDocSortedHitQueue.java +++ b/src/java/org/apache/lucene/search/FieldDocSortedHitQueue.java @@ -109,20 +109,29 @@ extends PriorityQueue { for (int i=0; i r2) c = -1; if (r1 < r2) c = 1; break; - case SortField.DOC: - case SortField.INT: + } + case SortField.DOC: + case SortField.INT:{ int i1 = ((Integer)docA.fields[i]).intValue(); int i2 = ((Integer)docB.fields[i]).intValue(); if (i1 < i2) c = -1; if (i1 > i2) c = 1; break; - case SortField.STRING: + } + case SortField.LONG:{ + long l1 = ((Long)docA.fields[i]).longValue(); + long l2 = ((Long)docB.fields[i]).longValue(); + if (l1 < l2) c = -1; + if (l1 > l2) c = 1; + break; + } + case SortField.STRING:{ String s1 = (String) docA.fields[i]; String s2 = (String) docB.fields[i]; // null values need to be sorted first, because of how FieldCache.getStringIndex() @@ -136,24 +145,36 @@ extends PriorityQueue { c = collators[i].compare (s1, s2); } break; - case SortField.FLOAT: + } + case SortField.FLOAT:{ float f1 = ((Float)docA.fields[i]).floatValue(); float f2 = ((Float)docB.fields[i]).floatValue(); if (f1 < f2) c = -1; if (f1 > f2) c = 1; break; - case SortField.CUSTOM: + } + case SortField.DOUBLE:{ + double d1 = ((Double)docA.fields[i]).doubleValue(); + double d2 = ((Double)docB.fields[i]).doubleValue(); + if (d1 < d2) c = -1; + if (d1 > d2) c = 1; + break; + } + case SortField.CUSTOM:{ c = docA.fields[i].compareTo (docB.fields[i]); break; - case SortField.AUTO: + } + case SortField.AUTO:{ // we cannot handle this - even if we determine the type of object (Float or // Integer), we don't necessarily know how to compare them 
(both SCORE and // FLOAT contain floats, but are sorted opposite of each other). Before // we get here, each AUTO should have been replaced with its actual value. throw new RuntimeException ("FieldDocSortedHitQueue cannot use an AUTO SortField"); - default: + } + default:{ throw new RuntimeException ("invalid SortField type: "+type); - } + } + } if (fields[i].getReverse()) { c = -c; } diff --git a/src/java/org/apache/lucene/search/FieldSortedHitQueue.java b/src/java/org/apache/lucene/search/FieldSortedHitQueue.java index 40cc143c694..c51f1cc9871 100644 --- a/src/java/org/apache/lucene/search/FieldSortedHitQueue.java +++ b/src/java/org/apache/lucene/search/FieldSortedHitQueue.java @@ -21,8 +21,8 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.PriorityQueue; import java.io.IOException; -import java.util.Locale; import java.text.Collator; +import java.util.Locale; /** * Expert: A hit queue for sorting by hits by terms in more than one field. @@ -177,6 +177,12 @@ extends PriorityQueue { case SortField.FLOAT: comparator = comparatorFloat (reader, fieldname); break; + case SortField.LONG: + comparator = comparatorLong(reader, fieldname); + break; + case SortField.DOUBLE: + comparator = comparatorDouble(reader, fieldname); + break; case SortField.STRING: if (locale != null) comparator = comparatorStringLocale (reader, fieldname, locale); else comparator = comparatorString (reader, fieldname); @@ -222,6 +228,38 @@ extends PriorityQueue { }; } + /** + * Returns a comparator for sorting hits according to a field containing integers. + * @param reader Index to use. + * @param fieldname Fieldable containg integer values. + * @return Comparator for sorting hits. + * @throws IOException If an error occurs reading the index. 
+ */ + static ScoreDocComparator comparatorLong (final IndexReader reader, final String fieldname) + throws IOException { + final String field = fieldname.intern(); + final long[] fieldOrder = ExtendedFieldCache.EXT_DEFAULT.getLongs (reader, field); + return new ScoreDocComparator() { + + public final int compare (final ScoreDoc i, final ScoreDoc j) { + final long li = fieldOrder[i.doc]; + final long lj = fieldOrder[j.doc]; + if (li < lj) return -1; + if (li > lj) return 1; + return 0; + } + + public Comparable sortValue (final ScoreDoc i) { + return new Long(fieldOrder[i.doc]); + } + + public int sortType() { + return SortField.LONG; + } + }; + } + + /** * Returns a comparator for sorting hits according to a field containing floats. * @param reader Index to use. @@ -253,6 +291,37 @@ extends PriorityQueue { }; } + /** + * Returns a comparator for sorting hits according to a field containing doubles. + * @param reader Index to use. + * @param fieldname Fieldable containing double values. + * @return Comparator for sorting hits. + * @throws IOException If an error occurs reading the index. + */ + static ScoreDocComparator comparatorDouble(final IndexReader reader, final String fieldname) + throws IOException { + final String field = fieldname.intern(); + final double[] fieldOrder = ExtendedFieldCache.EXT_DEFAULT.getDoubles (reader, field); + return new ScoreDocComparator () { + + public final int compare (final ScoreDoc i, final ScoreDoc j) { + final double di = fieldOrder[i.doc]; + final double dj = fieldOrder[j.doc]; + if (di < dj) return -1; + if (di > dj) return 1; + return 0; + } + + public Comparable sortValue (final ScoreDoc i) { + return new Double (fieldOrder[i.doc]); + } + + public int sortType() { + return SortField.DOUBLE; + } + }; + } + + /** * Returns a comparator for sorting hits according to a field containing strings. * @param reader Index to use. 
diff --git a/src/java/org/apache/lucene/search/SortField.java b/src/java/org/apache/lucene/search/SortField.java index 9b8f810271c..ca59936c425 100644 --- a/src/java/org/apache/lucene/search/SortField.java +++ b/src/java/org/apache/lucene/search/SortField.java @@ -60,6 +60,14 @@ implements Serializable { * lower values are at the front. */ public static final int FLOAT = 5; + /** Sort using term values as encoded Longs. Sort values are Long and + * lower values are at the front. */ + public static final int LONG = 6; + + /** Sort using term values as encoded Doubles. Sort values are Double and + * lower values are at the front. */ + public static final int DOUBLE = 7; + /** Sort using a custom Comparator. Sort values are any Comparable and * sorting is done according to natural order. */ public static final int CUSTOM = 9; diff --git a/src/test/org/apache/lucene/search/TestExtendedFieldCache.java b/src/test/org/apache/lucene/search/TestExtendedFieldCache.java new file mode 100644 index 00000000000..aa865ffb537 --- /dev/null +++ b/src/test/org/apache/lucene/search/TestExtendedFieldCache.java @@ -0,0 +1,71 @@ +package org.apache.lucene.search; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; + +public class TestExtendedFieldCache extends LuceneTestCase { + protected IndexReader reader; + private static final int NUM_DOCS = 1000; + + public TestExtendedFieldCache(String s) { + super(s); + } + + protected void setUp() throws Exception { + super.setUp(); + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer= new IndexWriter(directory, new WhitespaceAnalyzer(), true); + long theLong = Long.MAX_VALUE; + double theDouble = Double.MAX_VALUE; + for (int i = 0; i < NUM_DOCS; i++){ + Document doc = new Document(); + doc.add(new Field("theLong", String.valueOf(theLong--), Field.Store.NO, Field.Index.UN_TOKENIZED)); + doc.add(new Field("theDouble", String.valueOf(theDouble--), Field.Store.NO, Field.Index.UN_TOKENIZED)); + doc.add(new Field("text", English.intToEnglish(i), Field.Store.NO, Field.Index.TOKENIZED)); + writer.addDocument(doc); + } + writer.close(); + reader = IndexReader.open(directory); + } + + + public void test() throws IOException { + ExtendedFieldCache cache = new ExtendedFieldCacheImpl(); + double [] doubles = cache.getDoubles(reader, "theDouble"); + assertTrue("doubles Size: " + doubles.length + " is not: " + NUM_DOCS, doubles.length == NUM_DOCS); + for (int i = 0; i < doubles.length; i++) { + assertTrue(doubles[i] + " does not equal: " + (Double.MAX_VALUE - i), doubles[i] == (Double.MAX_VALUE - i)); + + } + long [] longs = cache.getLongs(reader, "theLong"); + assertTrue("longs Size: " + longs.length + " is not: " + NUM_DOCS, longs.length == NUM_DOCS); + for (int i = 0; i < longs.length; i++) { + assertTrue(longs[i] + " does not equal: 
" + (Long.MAX_VALUE - i), longs[i] == (Long.MAX_VALUE - i)); + + } + } +} \ No newline at end of file diff --git a/src/test/org/apache/lucene/search/TestSort.java b/src/test/org/apache/lucene/search/TestSort.java index 4d32b919d8e..8758f4db421 100644 --- a/src/test/org/apache/lucene/search/TestSort.java +++ b/src/test/org/apache/lucene/search/TestSort.java @@ -17,27 +17,28 @@ package org.apache.lucene.search; * limitations under the License. */ -import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.index.*; +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; +import java.io.IOException; +import java.io.Serializable; import java.rmi.Naming; import java.rmi.registry.LocateRegistry; import java.rmi.registry.Registry; -import java.io.IOException; -import java.io.Serializable; -import java.util.regex.Pattern; +import java.util.BitSet; import java.util.HashMap; import java.util.Iterator; import java.util.Locale; -import java.util.BitSet; - -import junit.framework.TestCase; -import junit.framework.Test; -import junit.framework.TestSuite; -import junit.textui.TestRunner; +import java.util.regex.Pattern; /** * Unit tests for sorting code. 
@@ -97,21 +98,21 @@ implements Serializable { // the string field to sort by string // the i18n field includes accented characters for testing locale-specific sorting private String[][] data = new String[][] { - // tracer contents int float string custom i18n - { "A", "x a", "5", "4f", "c", "A-3", "p\u00EAche"}, - { "B", "y a", "5", "3.4028235E38", "i", "B-10", "HAT"}, - { "C", "x a b c", "2147483647", "1.0", "j", "A-2", "p\u00E9ch\u00E9"}, - { "D", "y a b c", "-1", "0.0f", "a", "C-0", "HUT"}, - { "E", "x a b c d", "5", "2f", "h", "B-8", "peach"}, - { "F", "y a b c d", "2", "3.14159f", "g", "B-1", "H\u00C5T"}, - { "G", "x a b c d", "3", "-1.0", "f", "C-100", "sin"}, - { "H", "y a b c d", "0", "1.4E-45", "e", "C-88", "H\u00D8T"}, - { "I", "x a b c d e f", "-2147483648", "1.0e+0", "d", "A-10", "s\u00EDn"}, - { "J", "y a b c d e f", "4", ".5", "b", "C-7", "HOT"}, - { "W", "g", "1", null, null, null, null}, - { "X", "g", "1", "0.1", null, null, null}, - { "Y", "g", "1", "0.2", null, null, null}, - { "Z", "f g", null, null, null, null, null} + // tracer contents int float string custom i18n long double + { "A", "x a", "5", "4f", "c", "A-3", "p\u00EAche", "10", "-4.0"},//A + { "B", "y a", "5", "3.4028235E38", "i", "B-10", "HAT", "1000000000", "40.0"},//B + { "C", "x a b c", "2147483647", "1.0", "j", "A-2", "p\u00E9ch\u00E9", "99999999", "40.00002343"},//C + { "D", "y a b c", "-1", "0.0f", "a", "C-0", "HUT", String.valueOf(Long.MAX_VALUE), String.valueOf(Double.MIN_VALUE)},//D + { "E", "x a b c d", "5", "2f", "h", "B-8", "peach", String.valueOf(Long.MIN_VALUE), String.valueOf(Double.MAX_VALUE)},//E + { "F", "y a b c d", "2", "3.14159f", "g", "B-1", "H\u00C5T", "-44", "343.034435444"},//F + { "G", "x a b c d", "3", "-1.0", "f", "C-100", "sin", "323254543543", "4.043544"},//G + { "H", "y a b c d", "0", "1.4E-45", "e", "C-88", "H\u00D8T", "1023423423005","4.043545"},//H + { "I", "x a b c d e f", "-2147483648", "1.0e+0", "d", "A-10", "s\u00EDn", "332422459999", 
"4.043546"},//I + { "J", "y a b c d e f", "4", ".5", "b", "C-7", "HOT", "34334543543", "4.0000220343"},//J + { "W", "g", "1", null, null, null, null, null, null}, + { "X", "g", "1", "0.1", null, null, null, null, null}, + { "Y", "g", "1", "0.2", null, null, null, null, null}, + { "Z", "f g", null, null, null, null, null, null, null} }; // create an index of all the documents, or just the x, or just the y documents @@ -129,6 +130,8 @@ implements Serializable { if (data[i][4] != null) doc.add (new Field ("string", data[i][4], Field.Store.NO, Field.Index.UN_TOKENIZED)); if (data[i][5] != null) doc.add (new Field ("custom", data[i][5], Field.Store.NO, Field.Index.UN_TOKENIZED)); if (data[i][6] != null) doc.add (new Field ("i18n", data[i][6], Field.Store.NO, Field.Index.UN_TOKENIZED)); + if (data[i][7] != null) doc.add (new Field ("long", data[i][7], Field.Store.NO, Field.Index.UN_TOKENIZED)); + if (data[i][8] != null) doc.add (new Field ("double", data[i][8], Field.Store.NO, Field.Index.UN_TOKENIZED)); doc.setBoost(2); // produce some scores above 1.0 writer.addDocument (doc); } @@ -192,7 +195,15 @@ implements Serializable { assertMatches (full, queryX, sort, "GCIEA"); assertMatches (full, queryY, sort, "DHJFB"); - sort.setSort (new SortField[] { new SortField ("string", SortField.STRING), SortField.FIELD_DOC }); + sort.setSort (new SortField[] { new SortField ("long", SortField.LONG), SortField.FIELD_DOC }); + assertMatches (full, queryX, sort, "EACGI"); + assertMatches (full, queryY, sort, "FBJHD"); + + sort.setSort (new SortField[] { new SortField ("double", SortField.DOUBLE), SortField.FIELD_DOC }); + assertMatches (full, queryX, sort, "AGICE"); + assertMatches (full, queryY, sort, "DJHBF"); + + sort.setSort (new SortField[] { new SortField ("string", SortField.STRING), SortField.FIELD_DOC }); assertMatches (full, queryX, sort, "AIGEC"); assertMatches (full, queryY, sort, "DJHFB"); }