From 8602dd54fdea8c882fb36b42e7ac4753daf0af2c Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 24 Oct 2011 16:59:59 +0000 Subject: [PATCH] LUCENE-3526: preflex codec returned ghost terms if you used empty field name git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1188232 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/index/CheckIndex.java | 6 ++++++ .../index/codecs/preflex/SegmentTermEnum.java | 9 +++++++++ .../index/codecs/preflex/TermInfosReader.java | 8 +------- .../codecs/preflexrw/TermInfosWriter.java | 5 +++++ .../apache/lucene/index/TestIndexWriter.java | 18 ++++++++++++++++++ .../search/TestFieldCacheRewriteMethod.java | 4 ++-- .../lucene/search/TestRegexpRandom2.java | 12 +++++++----- 7 files changed, 48 insertions(+), 14 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java index dc642ad1d5c..aaf0385f695 100644 --- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java @@ -726,6 +726,9 @@ public class CheckIndex { } final int docFreq = terms.docFreq(); + if (docFreq <= 0) { + throw new RuntimeException("docfreq: " + docFreq + " is out of bounds"); + } status.totFreq += docFreq; sumDocFreq += docFreq; @@ -823,6 +826,9 @@ public class CheckIndex { throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount); } if (hasTotalTermFreq) { + if (totalTermFreq2 <= 0) { + throw new RuntimeException("totalTermFreq: " + totalTermFreq2 + " is out of bounds"); + } sumTotalTermFreq += totalTermFreq; if (totalTermFreq != totalTermFreq2) { throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq); diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java index 6c6681d6a1d..179e94622ed 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java @@ -61,6 +61,7 @@ public final class SegmentTermEnum implements Cloneable { int skipInterval; int newSuffixStart; int maxSkipLevels; + private boolean first = true; SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi) throws CorruptIndexException, IOException { @@ -123,6 +124,7 @@ public final class SegmentTermEnum implements Cloneable { prevBuffer.reset(); //System.out.println(" ste doSeek prev=" + prevBuffer.toTerm() + " this=" + this); termInfo.set(ti); + first = p == -1; } /** Increments the enumeration to the next element. True if one exists.*/ @@ -162,6 +164,13 @@ public final class SegmentTermEnum implements Cloneable { final int scanTo(Term term) throws IOException { scanBuffer.set(term); int count = 0; + if (first) { + // Always force initial next() in case term is + // Term("", "") + next(); + first = false; + count++; + } while (scanBuffer.compareTo(termBuffer) > 0 && next()) { count++; } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java index 3ca8ca617f4..48352000236 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java @@ -320,13 +320,7 @@ public final class TermInfosReader { ti = enumerator.termInfo; if (tiOrd == null) { if (useCache) { - // LUCENE-3183: it's possible, if term is Term("", - // ""), for the STE to be incorrectly un-positioned - // after scan-to; work around this by not caching in - // this case: - if (enumerator.position >= 0) { - termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position)); - } + termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position)); } } else { assert sameTermInfo(ti, tiOrd, enumerator); diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java index 34a79ef99ba..91e07a19340 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java @@ -199,6 +199,11 @@ final class TermInfosWriter implements Closeable { if (ch1 != ch2) return ch1-ch2; } + if (utf16Result1.length == 0 && lastFieldNumber == -1) { + // If there is a field named "" (empty string) with a term text of "" (empty string) then we + // will get 0 on this comparison, yet, it's "OK". + return -1; + } return utf16Result1.length - utf16Result2.length; } diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java index d7e5e8e1a70..952430d8410 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -869,6 +869,24 @@ public class TestIndexWriter extends LuceneTestCase { writer.close(); dir.close(); } + + public void testEmptyFieldNameTerms() throws IOException { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random))); + Document doc = new Document(); + doc.add(newField("", "a b c", TextField.TYPE_UNSTORED)); + writer.addDocument(doc); + writer.close(); + IndexReader reader = IndexReader.open(dir, true); + IndexReader subreader = getOnlySegmentReader(reader); + TermsEnum te = subreader.fields().terms("").iterator(); + assertEquals(new BytesRef("a"), te.next()); + assertEquals(new BytesRef("b"), te.next()); + assertEquals(new BytesRef("c"), te.next()); + assertNull(te.next()); + reader.close(); + dir.close(); + } diff --git a/lucene/src/test/org/apache/lucene/search/TestFieldCacheRewriteMethod.java b/lucene/src/test/org/apache/lucene/search/TestFieldCacheRewriteMethod.java index b261cdea031..aa72457079b 100644 --- a/lucene/src/test/org/apache/lucene/search/TestFieldCacheRewriteMethod.java +++ b/lucene/src/test/org/apache/lucene/search/TestFieldCacheRewriteMethod.java @@ -30,10 +30,10 @@ public class TestFieldCacheRewriteMethod extends TestRegexpRandom2 { /** Test fieldcache rewrite against filter rewrite */ @Override protected void assertSame(String regexp) throws IOException { - RegexpQuery fieldCache = new RegexpQuery(new Term("field", regexp), RegExp.NONE); + RegexpQuery fieldCache = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE); fieldCache.setRewriteMethod(new FieldCacheRewriteMethod()); - RegexpQuery filter = new RegexpQuery(new Term("field", regexp), RegExp.NONE); + RegexpQuery filter = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE); filter.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); TopDocs fieldCacheDocs = searcher1.search(fieldCache, 25); diff --git a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java index 2f74d8da6e3..83db492dc81 100644 --- a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java +++ b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java @@ -55,16 +55,18 @@ public class TestRegexpRandom2 extends LuceneTestCase { protected IndexSearcher searcher2; private IndexReader reader; private Directory dir; - + protected String fieldName; + @Override public void setUp() throws Exception { super.setUp(); dir = newDirectory(); + fieldName = random.nextBoolean() ? "field" : ""; // sometimes use an empty string as field name RandomIndexWriter writer = new RandomIndexWriter(random, dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.KEYWORD, false)) .setMaxBufferedDocs(_TestUtil.nextInt(random, 50, 1000))); Document doc = new Document(); - Field field = newField("field", "", StringField.TYPE_UNSTORED); + Field field = newField(fieldName, "", StringField.TYPE_UNSTORED); doc.add(field); List terms = new ArrayList(); int num = atLeast(200); @@ -141,7 +143,7 @@ public class TestRegexpRandom2 extends LuceneTestCase { public void testRegexps() throws Exception { // we generate aweful regexps: good for testing. // but for preflex codec, the test can be very slow, so use less iterations. - int num = CodecProvider.getDefault().getFieldCodec("field").equals("PreFlex") ? 100 * RANDOM_MULTIPLIER : atLeast(1000); + int num = CodecProvider.getDefault().getFieldCodec(fieldName).equals("PreFlex") ? 100 * RANDOM_MULTIPLIER : atLeast(1000); for (int i = 0; i < num; i++) { String reg = AutomatonTestUtil.randomRegexp(random); if (VERBOSE) { @@ -155,8 +157,8 @@ public class TestRegexpRandom2 extends LuceneTestCase { * simple regexpquery implementation. */ protected void assertSame(String regexp) throws IOException { - RegexpQuery smart = new RegexpQuery(new Term("field", regexp), RegExp.NONE); - DumbRegexpQuery dumb = new DumbRegexpQuery(new Term("field", regexp), RegExp.NONE); + RegexpQuery smart = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE); + DumbRegexpQuery dumb = new DumbRegexpQuery(new Term(fieldName, regexp), RegExp.NONE); TopDocs smartDocs = searcher1.search(smart, 25); TopDocs dumbDocs = searcher2.search(dumb, 25);