From 46466920228108bbd4aa121150a87e50a4fa81be Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Wed, 12 Nov 2008 09:43:39 +0000 Subject: [PATCH] LUCENE-1450: make sure RangeQuery/Filter check all terms in the index when using a Collator git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@713332 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/search/RangeTermEnum.java | 3 +- .../search/TestMultiTermConstantScore.java | 39 +++++++++++++++++ .../apache/lucene/search/TestRangeFilter.java | 42 +++++++++++++++++++ .../apache/lucene/search/TestRangeQuery.java | 20 +++++++++ 4 files changed, 103 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/lucene/search/RangeTermEnum.java b/src/java/org/apache/lucene/search/RangeTermEnum.java index 7eebc3e6df5..28c95979100 100644 --- a/src/java/org/apache/lucene/search/RangeTermEnum.java +++ b/src/java/org/apache/lucene/search/RangeTermEnum.java @@ -70,7 +70,8 @@ public class RangeTermEnum extends FilteredTermEnum { this.includeUpper = true; } - setEnum(reader.terms(new Term(this.field, this.lowerTermText))); + String startTermText = collator == null ? 
this.lowerTermText : ""; + setEnum(reader.terms(new Term(this.field, startTermText))); } public float difference() { diff --git a/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java b/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java index c4ca86352f5..587e8f7bf22 100644 --- a/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java +++ b/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java @@ -564,4 +564,43 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter { assertEquals("The index Term should be included.", 1, result.length); search.close(); } + + public void testDanish() throws Exception { + + /* build an index */ + RAMDirectory danishIndex = new RAMDirectory(); + IndexWriter writer = new IndexWriter(danishIndex, new SimpleAnalyzer(), T, + IndexWriter.MaxFieldLength.LIMITED); + + // Danish collation orders the words below in the given order + // (example taken from TestSort.testInternationalSort() ). + String[] words = { "H\u00D8T", "H\u00C5T", "MAND" }; + for (int docnum = 0 ; docnum < words.length ; ++docnum) { + Document doc = new Document(); + doc.add(new Field("content", words[docnum], + Field.Store.YES, Field.Index.UN_TOKENIZED)); + doc.add(new Field("body", "body", + Field.Store.YES, Field.Index.UN_TOKENIZED)); + writer.addDocument(doc); + } + writer.optimize(); + writer.close(); + + IndexReader reader = IndexReader.open(danishIndex); + IndexSearcher search = new IndexSearcher(reader); + Query q = new TermQuery(new Term("body","body")); + + Collator c = Collator.getInstance(new Locale("da", "dk")); + + // Unicode order would not include "H\u00C5T" in the exclusive range + // ( "H\u00D8T", "MAND" ), but Danish collation does. 
+ ScoreDoc[] result = search.search + (csrq("content", "H\u00D8T", "MAND", F, F, c), null, 1000).scoreDocs; + assertEquals("The index Term should be included.", 1, result.length); + + result = search.search + (csrq("content", "H\u00C5T", "MAND", F, F, c), null, 1000).scoreDocs; + assertEquals("The index Term should not be included.", 0, result.length); + search.close(); + } } diff --git a/src/test/org/apache/lucene/search/TestRangeFilter.java b/src/test/org/apache/lucene/search/TestRangeFilter.java index 56a287735ec..18ce8682d88 100644 --- a/src/test/org/apache/lucene/search/TestRangeFilter.java +++ b/src/test/org/apache/lucene/search/TestRangeFilter.java @@ -376,4 +376,46 @@ public class TestRangeFilter extends BaseTestRangeFilter { assertEquals("The index Term should be included.", 1, result.length()); search.close(); } + + public void testDanish() throws Exception { + + /* build an index */ + RAMDirectory danishIndex = new RAMDirectory(); + IndexWriter writer = new IndexWriter + (danishIndex, new SimpleAnalyzer(), T, + IndexWriter.MaxFieldLength.LIMITED); + // Danish collation orders the words below in the given order + // (example taken from TestSort.testInternationalSort() ). 
+ String[] words = { "H\u00D8T", "H\u00C5T", "MAND" }; + for (int docnum = 0 ; docnum < words.length ; ++docnum) { + Document doc = new Document(); + doc.add(new Field("content", words[docnum], + Field.Store.YES, Field.Index.UN_TOKENIZED)); + doc.add(new Field("body", "body", + Field.Store.YES, Field.Index.UN_TOKENIZED)); + writer.addDocument(doc); + } + writer.optimize(); + writer.close(); + + IndexReader reader = IndexReader.open(danishIndex); + IndexSearcher search = new IndexSearcher(reader); + Query q = new TermQuery(new Term("body","body")); + + Collator collator = Collator.getInstance(new Locale("da", "dk")); + Query query = new RangeQuery + ("content", "H\u00D8T", "MAND", false, false, collator); + + // Unicode order would not include "H\u00C5T" in the exclusive range + // ( "H\u00D8T", "MAND" ), but Danish collation does. + Hits result = search.search + (q, new RangeFilter("content", "H\u00D8T", "MAND", F, F, collator)); + assertEquals("The index Term should be included.", 1, result.length()); + + result = search.search + (q, new RangeFilter("content", "H\u00C5T", "MAND", F, F, collator)); + assertEquals + ("The index Term should not be included.", 0, result.length()); + search.close(); + } } diff --git a/src/test/org/apache/lucene/search/TestRangeQuery.java b/src/test/org/apache/lucene/search/TestRangeQuery.java index e02a5c00eb6..e4194b0ad45 100644 --- a/src/test/org/apache/lucene/search/TestRangeQuery.java +++ b/src/test/org/apache/lucene/search/TestRangeQuery.java @@ -187,6 +187,26 @@ public class TestRangeQuery extends LuceneTestCase { assertEquals("The index Term should be included.", 1, hits.length); searcher.close(); } + + public void testDanish() throws Exception { + Collator collator = Collator.getInstance(new Locale("da", "dk")); + // Danish collation orders the words below in the given order (example taken + // from TestSort.testInternationalSort() ). 
+ String[] words = { "H\u00D8T", "H\u00C5T", "MAND" }; + Query query = new RangeQuery("content", "H\u00D8T", "MAND", false, false, collator); + + // Unicode order would not include "H\u00C5T" in the exclusive range + // ( "H\u00D8T", "MAND" ), but Danish collation does. + initializeIndex(words); + IndexSearcher searcher = new IndexSearcher(dir); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals("The index Term should be included.", 1, hits.length); + + query = new RangeQuery("content", "H\u00C5T", "MAND", false, false, collator); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals("The index Term should not be included.", 0, hits.length); + searcher.close(); + } private void initializeIndex(String[] values) throws IOException { IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);