From 60f1d192c7fe49cba55432569ed988099f4de5e4 Mon Sep 17 00:00:00 2001 From: Doug Cutting Date: Tue, 24 Feb 2004 19:34:58 +0000 Subject: [PATCH] Latest patches to hit sorting code, including unit tests, two of which currently fail. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150207 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/search/FieldDocSortedHitQueue.java | 31 +- .../lucene/search/FieldSortedHitQueue.java | 11 +- .../lucene/search/FloatSortedHitQueue.java | 12 +- .../lucene/search/IntegerSortedHitQueue.java | 16 +- .../search/MultiFieldSortedHitQueue.java | 11 +- src/java/org/apache/lucene/search/Sort.java | 197 ++++++---- .../org/apache/lucene/search/SortField.java | 8 +- .../lucene/search/StringSortedHitQueue.java | 2 +- .../org/apache/lucene/search/TestSort.java | 341 ++++++++++++++++++ 9 files changed, 527 insertions(+), 102 deletions(-) create mode 100644 src/test/org/apache/lucene/search/TestSort.java diff --git a/src/java/org/apache/lucene/search/FieldDocSortedHitQueue.java b/src/java/org/apache/lucene/search/FieldDocSortedHitQueue.java index 617de522b8f..7c11de14154 100644 --- a/src/java/org/apache/lucene/search/FieldDocSortedHitQueue.java +++ b/src/java/org/apache/lucene/search/FieldDocSortedHitQueue.java @@ -33,8 +33,10 @@ import java.io.IOException; class FieldDocSortedHitQueue extends PriorityQueue { - // this cannot contain AUTO fields - SortField[] fields; + // this cannot contain AUTO fields - any AUTO fields should + // have been resolved by the time this class is used. + volatile SortField[] fields; + /** * Creates a hit queue sorted by the given list of fields. @@ -51,15 +53,14 @@ extends PriorityQueue { /** * Allows redefinition of sort fields if they are null. - * This is to handle the - * case using ParallelMultiSearcher where the original list - * contains AUTO and we don't know - * the actual sort type until the values come back. This - * method is thread safe. + * This is to handle the case using ParallelMultiSearcher where the + * original list contains AUTO and we don't know the actual sort + * type until the values come back. The fields can only be set once. + * This method is thread safe. * @param fields */ synchronized void setFields (SortField[] fields) { - if (fields == null) this.fields = fields; + if (this.fields == null) this.fields = fields; } @@ -92,12 +93,16 @@ extends PriorityQueue { break; case SortField.DOC: case SortField.INT: - case SortField.STRING: int i1 = ((Integer)docA.fields[i]).intValue(); int i2 = ((Integer)docB.fields[i]).intValue(); if (i1 > i2) c = -1; if (i1 < i2) c = 1; break; + case SortField.STRING: + String s1 = (String) docA.fields[i]; + String s2 = (String) docB.fields[i]; + c = s2.compareTo(s1); + break; case SortField.FLOAT: float f1 = ((Float)docA.fields[i]).floatValue(); float f2 = ((Float)docB.fields[i]).floatValue(); @@ -123,12 +128,16 @@ extends PriorityQueue { break; case SortField.DOC: case SortField.INT: - case SortField.STRING: int i1 = ((Integer)docA.fields[i]).intValue(); int i2 = ((Integer)docB.fields[i]).intValue(); if (i1 < i2) c = -1; if (i1 > i2) c = 1; break; + case SortField.STRING: + String s1 = (String) docA.fields[i]; + String s2 = (String) docB.fields[i]; + c = s1.compareTo(s2); + break; case SortField.FLOAT: float f1 = ((Float)docA.fields[i]).floatValue(); float f2 = ((Float)docB.fields[i]).floatValue(); @@ -148,4 +157,4 @@ extends PriorityQueue { } return c > 0; } -} \ No newline at end of file +} diff --git a/src/java/org/apache/lucene/search/FieldSortedHitQueue.java b/src/java/org/apache/lucene/search/FieldSortedHitQueue.java index 9357c0f9b74..e3aa7a54660 100644 --- a/src/java/org/apache/lucene/search/FieldSortedHitQueue.java +++ b/src/java/org/apache/lucene/search/FieldSortedHitQueue.java @@ -39,11 +39,16 @@ import java.util.regex.Pattern; * *

A static cache is maintained. This cache contains an integer * or float array of length IndexReader.maxDoc() for each field - * name for which a sort is performed. In other words, the size of - * the cache in bytes is: + * name for which a sort is performed. In other words, the size of the + * cache in bytes is: * *

4 * IndexReader.maxDoc() * (# of different fields actually used to sort) * + *

For String fields, the cache is larger: in addition to the + * above array, the value of every term in the field is kept in memory. + * If there are many unique terms in the field, this could + * be quite large. + * *

Note that the size of the cache is not affected by how many * fields are in the index and might be used to sort - only by * the ones actually used to sort a result set. @@ -172,7 +177,7 @@ extends PriorityQueue { */ protected static ScoreDocComparator determineComparator (IndexReader reader, String field) throws IOException { - + field = field.intern(); TermEnum enumerator = reader.terms (new Term (field, "")); try { Term term = enumerator.term(); diff --git a/src/java/org/apache/lucene/search/FloatSortedHitQueue.java b/src/java/org/apache/lucene/search/FloatSortedHitQueue.java index e650ed324a5..1a476818cc7 100644 --- a/src/java/org/apache/lucene/search/FloatSortedHitQueue.java +++ b/src/java/org/apache/lucene/search/FloatSortedHitQueue.java @@ -66,12 +66,13 @@ extends FieldSortedHitQueue { /** * Returns a comparator for sorting hits according to a field containing floats. * @param reader Index to use. - * @param field Field containg float values. + * @param fieldname Field containg float values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ - static ScoreDocLookupComparator comparator (final IndexReader reader, final String field) + static ScoreDocLookupComparator comparator (final IndexReader reader, final String fieldname) throws IOException { + final String field = fieldname.intern(); return new ScoreDocLookupComparator () { protected final float[] fieldOrder = generateSortIndex(); @@ -140,12 +141,13 @@ extends FieldSortedHitQueue { * Returns a comparator for sorting hits according to a field containing floats using the given enumerator * to collect term values. * @param reader Index to use. - * @param field Field containg float values. + * @param fieldname Field containg float values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ - static ScoreDocLookupComparator comparator (final IndexReader reader, final TermEnum enumerator, final String field) + static ScoreDocLookupComparator comparator (final IndexReader reader, final TermEnum enumerator, final String fieldname) throws IOException { + final String field = fieldname.intern(); return new ScoreDocLookupComparator () { protected final float[] fieldOrder = generateSortIndex(); @@ -202,4 +204,4 @@ extends FieldSortedHitQueue { } }; } -} \ No newline at end of file +} diff --git a/src/java/org/apache/lucene/search/IntegerSortedHitQueue.java b/src/java/org/apache/lucene/search/IntegerSortedHitQueue.java index 05756838f06..9f8c6f73ad2 100644 --- a/src/java/org/apache/lucene/search/IntegerSortedHitQueue.java +++ b/src/java/org/apache/lucene/search/IntegerSortedHitQueue.java @@ -28,8 +28,8 @@ import java.io.IOException; * Expert: A sorted hit queue for fields that contain strictly integer values. * Hits are sorted into the queue by the values in the field and then by document number. * - *

Created: Jan 30, 2004 3:35:09 PM - * + *

Created: Jan 30, 2004 3:35:09 PM + * * @author Tim Jones (Nacimiento Software) * @since lucene 1.4 * @version $Id$ @@ -67,12 +67,13 @@ extends FieldSortedHitQueue { /** * Returns a comparator for sorting hits according to a field containing integers. * @param reader Index to use. - * @param field Field containg integer values. + * @param fieldname Field containg integer values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ - static ScoreDocLookupComparator comparator (final IndexReader reader, final String field) + static ScoreDocLookupComparator comparator (final IndexReader reader, final String fieldname) throws IOException { + final String field = fieldname.intern(); return new ScoreDocLookupComparator() { /** The sort information being used by this instance */ @@ -142,12 +143,13 @@ extends FieldSortedHitQueue { * Returns a comparator for sorting hits according to a field containing integers using the given enumerator * to collect term values. * @param reader Index to use. - * @param field Field containg integer values. + * @param fieldname Field containg integer values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ - static ScoreDocLookupComparator comparator (final IndexReader reader, final TermEnum enumerator, final String field) + static ScoreDocLookupComparator comparator (final IndexReader reader, final TermEnum enumerator, final String fieldname) throws IOException { + final String field = fieldname.intern(); return new ScoreDocLookupComparator() { protected final int[] fieldOrder = generateSortIndex(); @@ -204,4 +206,4 @@ extends FieldSortedHitQueue { } }; } -} \ No newline at end of file +} diff --git a/src/java/org/apache/lucene/search/MultiFieldSortedHitQueue.java b/src/java/org/apache/lucene/search/MultiFieldSortedHitQueue.java index d00df00ab0e..9a0fb4dc2e2 100644 --- a/src/java/org/apache/lucene/search/MultiFieldSortedHitQueue.java +++ b/src/java/org/apache/lucene/search/MultiFieldSortedHitQueue.java @@ -26,8 +26,8 @@ import java.io.IOException; * The type of content in each field could be determined dynamically by * FieldSortedHitQueue.determineComparator(). * - *

Created: Feb 3, 2004 4:46:55 PM - * + *

Created: Feb 3, 2004 4:46:55 PM + * * @author Tim Jones (Nacimiento Software) * @since lucene 1.4 * @version $Id$ @@ -50,8 +50,9 @@ extends PriorityQueue { comparators = new ScoreDocComparator[n]; this.fields = new SortField[n]; for (int i=0; iThe fields used to determine sort order must be carefully chosen. + * Documents must contain a single term in such a field, + * and the value of the term should indicate the document's relative position in + * a given sort order. The field must be indexed, but should not be tokenized, + * and does not need to be stored (unless you happen to want it back with the + * rest of your document data). In other words: + * + *

document.add (new Field ("byNumber", Integer.toString(x), false, true, false)); + *
+ * + *

Valid Types of Values

+ * + *

There are three possible kinds of term values which may be put into + * sorting fields: Integers, Floats, or Strings. Unless + * {@link SortField SortField} objects are specified, the type of value + * in the field is determined by using a regular expression against the + * first term in the field. + * + *

Integer term values should contain only digits and an optional + * preceeding negative sign. Values must be base 10 and in the range + * Integer.MIN_VALUE and Integer.MAX_VALUE inclusive. + * Documents which should appear first in the sort + * should have low value integers, later documents high values + * (i.e. the documents should be numbered 1..n where + * 1 is the first and n the last). + * + *

Float term values should conform to values accepted by + * {@link Float Float.valueOf(String)} (except that NaN + * and Infinity are not supported). + * Documents which should appear first in the sort + * should have low values, later documents high values. + * + *

String term values can contain any valid String, but should + * not be tokenized. The values are sorted according to their + * {@link Comparable natural order}. Note that using this type + * of term value has higher memory requirements than the other + * two types - see {@link FieldSortedHitQueue FieldSortedHitQueue}. + * + *

Object Reuse

+ * + *

One of these objects can be + * used multiple times and the sort order changed between usages. + * + *

This class is thread safe. + * + *

Memory Usage

+ * + * See {@link FieldSortedHitQueue FieldSortedHitQueue} for + * information on the memory requirements of sorting hits. + * + *

Created: Feb 12, 2004 10:53:57 AM * - *

Created: Feb 12, 2004 10:53:57 AM - * * @author Tim Jones (Nacimiento Software) * @since lucene 1.4 * @version $Id$ @@ -31,80 +81,95 @@ import java.io.Serializable; public class Sort implements Serializable { - /** Represents sorting by computed relevance. Using this sort criteria - * returns the same results with slightly more overhead as calling - * Searcher#search() without a sort criteria. */ - public static final Sort RELEVANCE = - new Sort (new SortField[] { SortField.FIELD_SCORE, SortField.FIELD_DOC }); + /** Represents sorting by computed relevance. Using this sort criteria + * returns the same results with slightly more overhead as calling + * Searcher#search() without a sort criteria. */ + public static final Sort RELEVANCE = new Sort (); - /** Represents sorting by index order. */ - public static final Sort INDEXORDER = new Sort (SortField.FIELD_DOC); + /** Represents sorting by index order. */ + public static final Sort INDEXORDER = new Sort (SortField.FIELD_DOC); + + // internal representation of the sort criteria + SortField[] fields; - // internal representation of the sort criteria - SortField[] fields; + /** Sorts by computed relevance. This is the same sort criteria as + * calling Searcher#search() without a sort criteria, only with + * slightly more overhead. */ + public Sort() { + this (new SortField[]{SortField.FIELD_SCORE, SortField.FIELD_DOC}); + } - /** Sorts by the terms in field then by index order (document - * number). */ - public Sort (String field) { - setSort (field, false); - } + /** Sorts by the terms in field then by index order (document + * number). */ + public Sort (String field) { + setSort (field, false); + } - /** Sorts possibly in reverse by the terms in field then by - * index order (document number). */ - public Sort (String field, boolean reverse) { - setSort (field, reverse); - } - /** Sorts in succession by the terms in each field. */ - public Sort (String[] fields) { - setSort (fields); - } + /** Sorts possibly in reverse by the terms in field then by + * index order (document number). */ + public Sort (String field, boolean reverse) { + setSort (field, reverse); + } - /** Sorts by the criteria in the given SortField. */ - public Sort (SortField field) { - setSort (field); - } - /** Sorts in succession by the criteria in each SortField. */ - public Sort (SortField[] fields) { - setSort (fields); - } + /** Sorts in succession by the terms in each field. */ + public Sort (String[] fields) { + setSort (fields); + } - /** Sets the sort to the terms in field then by index order - * (document number). */ - public final void setSort (String field) { - setSort (field, false); - } - /** Sets the sort to the terms in field possibly in reverse, - * then by index order (document number). */ - public void setSort (String field, boolean reverse) { - SortField[] nfields = new SortField[] { - new SortField (field, SortField.AUTO, reverse), - new SortField (field, SortField.DOC) - }; - fields = nfields; - } + /** Sorts by the criteria in the given SortField. */ + public Sort (SortField field) { + setSort (field); + } - /** Sets the sort to the terms in each field in succession. */ - public void setSort (String[] fieldnames) { - final int n = fieldnames.length; - SortField[] nfields = new SortField[n]; - for (int i=0; ifield then by index order + * (document number). */ + public final void setSort (String field) { + setSort (field, false); + } + + + /** Sets the sort to the terms in field possibly in reverse, + * then by index order (document number). */ + public void setSort (String field, boolean reverse) { + SortField[] nfields = new SortField[]{ + new SortField (field, SortField.AUTO, reverse), + new SortField (field, SortField.DOC) + }; + fields = nfields; + } + + + /** Sets the sort to the terms in each field in succession. */ + public void setSort (String[] fieldnames) { + final int n = fieldnames.length; + SortField[] nfields = new SortField[n]; + for (int i = 0; i < n; ++i) { + nfields[i] = new SortField (fieldnames[i], SortField.AUTO); + } + fields = nfields; + } + + + /** Sets the sort to the given criteria. */ + public void setSort (SortField field) { + this.fields = new SortField[]{field}; + } + + + /** Sets the sort to the given criteria in succession. */ + public void setSort (SortField[] fields) { + this.fields = fields; + } } diff --git a/src/java/org/apache/lucene/search/SortField.java b/src/java/org/apache/lucene/search/SortField.java index d45572ba556..1d712643fe1 100644 --- a/src/java/org/apache/lucene/search/SortField.java +++ b/src/java/org/apache/lucene/search/SortField.java @@ -74,7 +74,7 @@ implements Serializable { * @param field Name of field to sort by, cannot be null. */ public SortField (String field) { - this.field = field; + this.field = field.intern(); } /** Creates a sort, possibly in reverse, by terms in the given field where @@ -83,7 +83,7 @@ implements Serializable { * @param reverse True if natural order should be reversed. */ public SortField (String field, boolean reverse) { - this.field = field; + this.field = field.intern(); this.reverse = reverse; } @@ -94,7 +94,7 @@ implements Serializable { * @param type Type of values in the terms. */ public SortField (String field, int type) { - this.field = field; + this.field = (field != null) ? field.intern() : field; this.type = type; } @@ -106,7 +106,7 @@ implements Serializable { * @param reverse True if natural order should be reversed. */ public SortField (String field, int type, boolean reverse) { - this.field = field; + this.field = (field != null) ? field.intern() : field; this.type = type; this.reverse = reverse; } diff --git a/src/java/org/apache/lucene/search/StringSortedHitQueue.java b/src/java/org/apache/lucene/search/StringSortedHitQueue.java index 0ec6ea027c6..a41eb56d312 100644 --- a/src/java/org/apache/lucene/search/StringSortedHitQueue.java +++ b/src/java/org/apache/lucene/search/StringSortedHitQueue.java @@ -223,4 +223,4 @@ extends FieldSortedHitQueue { } }; } -} \ No newline at end of file +} diff --git a/src/test/org/apache/lucene/search/TestSort.java b/src/test/org/apache/lucene/search/TestSort.java new file mode 100644 index 00000000000..0b07cda4b66 --- /dev/null +++ b/src/test/org/apache/lucene/search/TestSort.java @@ -0,0 +1,341 @@ +package org.apache.lucene.search; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import java.rmi.Naming; +import java.rmi.registry.LocateRegistry; +import java.rmi.registry.Registry; +import java.io.IOException; +import java.util.regex.Pattern; + +import junit.framework.TestCase; +import junit.framework.Test; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +/** + * Unit tests for sorting code. + * + *

Created: Feb 17, 2004 4:55:10 PM + * + * @author Tim Jones (Nacimiento Software) + * @since lucene 1.4 + * @version $Id$ + */ + +public class TestSort +extends TestCase { + + private Searcher full; + private Searcher searchX; + private Searcher searchY; + private Query queryX; + private Query queryY; + private Query queryA; + private Sort sort; + + + public TestSort (String name) { + super (name); + } + + public static void main (String[] argv) { + if (argv == null || argv.length < 1) + TestRunner.run (suite()); + else if ("server".equals (argv[0])) { + TestSort test = new TestSort (null); + try { + test.startServer(); + Thread.sleep (500000); + } catch (Exception e) { + System.out.println (e); + e.printStackTrace(); + } + } + } + + public static Test suite() { + return new TestSuite (TestSort.class); + } + + + // document data: + // the tracer field is used to determine which document was hit + // the contents field is used to search and sort by relevance + // the int field to sort by int + // the float field to sort by float + // the string field to sort by string + private String[][] data = new String[][] { + // tracer contents int float string + { "A", "x a", "5", "4f", "c" }, + { "B", "y a", "5", "3.4028235E38", "i" }, + { "C", "x a b c", "2147483647", "1.0", "j" }, + { "D", "y a b c", "-1", "0.0f", "a" }, + { "E", "x a b c d", "5", "2f", "h" }, + { "F", "y a b c d", "2", "3.14159f", "g" }, + { "G", "x a b c d", "3", "-1.0", "f" }, + { "H", "y a b c d", "0", "1.4E-45", "e" }, + { "I", "x a b c d e f", "-2147483648", "1.0e+0", "d" }, + { "J", "y a b c d e f", "4", ".5", "b" }, + }; + + // create an index of all the documents, or just the x, or just the y documents + private Searcher getIndex (boolean even, boolean odd) + throws IOException { + RAMDirectory indexStore = new RAMDirectory (); + IndexWriter writer = new IndexWriter (indexStore, new SimpleAnalyzer(), true); + for (int i=0; i