LUCENE-510: fix backwards compatibility bug when bulk-merging stored fields from pre-UTF8 segments that contain non-ascii stored fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@654774 13f79535-47bb-0310-9956-ffa450edef68
2008-05-09 12:04:46 +00:00 · 2008-05-09 12:04:46 +00:00 · 39651e029d
parent 5a4f2572c0
commit 39651e029d
6 changed files with 42 additions and 5 deletions
--- a/src/java/org/apache/lucene/index/FieldsReader.java
+++ b/src/java/org/apache/lucene/index/FieldsReader.java
@ -172,6 +172,10 @@ final class FieldsReader {
    indexStream.seek(formatSize + (docID + docStoreOffset) * 8L);
  }

+  boolean canReadRawDocs() {
+    return format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
+  }
+
  final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
    seekIndex(n);
    long position = indexStream.readLong();
--- a/src/java/org/apache/lucene/index/SegmentMerger.java
+++ b/src/java/org/apache/lucene/index/SegmentMerger.java
@ -302,8 +302,14 @@ final class SegmentMerger {
          final FieldsReader matchingFieldsReader;
          final boolean hasMatchingReader;
          if (matchingSegmentReader != null) {
-            hasMatchingReader = true;
-            matchingFieldsReader = matchingSegmentReader.getFieldsReader();
+            final FieldsReader fieldsReader = matchingSegmentReader.getFieldsReader();
+            if (fieldsReader != null && !fieldsReader.canReadRawDocs()) {            
+              matchingFieldsReader = null;
+              hasMatchingReader = false;
+            } else {
+              matchingFieldsReader = fieldsReader;
+              hasMatchingReader = true;
+            }
          } else {
            hasMatchingReader = false;
            matchingFieldsReader = null;
--- a/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
+++ b/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
@ -129,6 +129,22 @@ public class TestBackwardsCompatibility extends LuceneTestCase
                             "23.nocfs",
  };

+  public void testOptimizeOldIndex() throws IOException {
+    for(int i=0;i<oldNames.length;i++) {
+      String dirName = "src/test/org/apache/lucene/index/index." + oldNames[i];
+      unzip(dirName, oldNames[i]);
+      String fullPath = fullDir(oldNames[i]);
+      Directory dir = FSDirectory.getDirectory(fullPath);
+      IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+      w.optimize();
+      w.close();
+
+      _TestUtil.checkIndex(dir);
+      dir.close();
+      rmDir(oldNames[i]);
+    }
+  }
+
  public void testSearchOldIndex() throws IOException {
    for(int i=0;i<oldNames.length;i++) {
      String dirName = "src/test/org/apache/lucene/index/index." + oldNames[i];
@ -190,13 +206,16 @@ public class TestBackwardsCompatibility extends LuceneTestCase
        Document d = reader.document(i);
        List fields = d.getFields();
        if (oldName.startsWith("23.")) {
-          assertEquals(3, fields.size());
+          assertEquals(4, fields.size());
          Field f = (Field) d.getField("id");
          assertEquals(""+i, f.stringValue());

          f = (Field) d.getField("utf8");
          assertEquals("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.stringValue());

+          f = (Field) d.getField("autf8");
+          assertEquals("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.stringValue());
+        
          f = (Field) d.getField("content2");
          assertEquals("here is more content with aaa aaa aaa", f.stringValue());
        }        
@ -214,7 +233,11 @@ public class TestBackwardsCompatibility extends LuceneTestCase

    testHits(hits, 34, searcher.getIndexReader());

-    if (oldName.startsWith("23.")) {
+    if (!oldName.startsWith("19.") &&
+        !oldName.startsWith("20.") &&
+        !oldName.startsWith("21.") &&
+        !oldName.startsWith("22.")) {
+      // Test on indices >= 2.3
      hits = searcher.search(new TermQuery(new Term("utf8", "\u0000")));
      assertEquals(34, hits.length());
      hits = searcher.search(new TermQuery(new Term("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne")));
@ -455,6 +478,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    Document doc = new Document();
    doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
    doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED));
+    doc.add(new Field("autf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.addDocument(doc);
--- a/src/test/org/apache/lucene/index/index.23.cfs.zip
+++ b/src/test/org/apache/lucene/index/index.23.cfs.zip
--- a/src/test/org/apache/lucene/index/index.23.nocfs.zip
+++ b/src/test/org/apache/lucene/index/index.23.nocfs.zip
--- a/src/test/org/apache/lucene/util/_TestUtil.java
+++ b/src/test/org/apache/lucene/util/_TestUtil.java
@ -54,13 +54,16 @@ public class _TestUtil {
      ((ConcurrentMergeScheduler) ms).sync();
  }

+  /** This runs the CheckIndex tool on the index in.  If any
+   *  issues are hit, a RuntimeException is thrown; else,
+   *  true is returned. */
  public static boolean checkIndex(Directory dir) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
    CheckIndex.out = new PrintStream(bos);
    if (!CheckIndex.check(dir, false, null)) {
      System.out.println("CheckIndex failed");
      System.out.println(bos.toString());
-      return false;
+      throw new RuntimeException("CheckIndex failed");
    } else
      return true;
  }