diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java b/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java index 6886055fb6c..84e070a7b80 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java @@ -699,7 +699,32 @@ final class IndexFileDeleter implements Closeable { infoStream.message("IFD", "delete \"" + names + "\""); } + // We make two passes, first deleting any segments_N files, second deleting all the rest. We do this so that if we throw exc or JVM + // crashes during deletions, we don't leave the index in an "apparently corrupt" state: for(String name : names) { + if (name.startsWith(IndexFileNames.SEGMENTS) == false) { + continue; + } + try { + directory.deleteFile(name); + } catch (NoSuchFileException | FileNotFoundException e) { + // IndexWriter should only ask us to delete files it knows it wrote, so if we hit this, something is wrong! + + if (Constants.WINDOWS) { + // TODO: can we remove this OS-specific hacky logic? If windows deleteFile is buggy, we should instead contain this workaround in + // a WindowsFSDirectory ... + // LUCENE-6684: we suppress this assert for Windows, since a file could be in a confusing "pending delete" state, and falsely + // return NSFE/FNFE + } else { + throw e; + } + } + } + + for(String name : names) { + if (name.startsWith(IndexFileNames.SEGMENTS) == true) { + continue; + } try { directory.deleteFile(name); } catch (NoSuchFileException | FileNotFoundException e) { diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaFileDeleter.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaFileDeleter.java index 005f93875c5..b15fc05ff2c 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaFileDeleter.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaFileDeleter.java @@ -122,6 +122,7 @@ class ReplicaFileDeleter { node.message("file " + fileName + ": delete failed: " + missing); throw new IllegalStateException("file " + fileName + ": we attempted delete but the file does not exist?", missing); } catch (IOException ioe) { + // nocommit remove this retry logic! it's Directory's job now... if (Node.VERBOSE_FILES) { node.message("file " + fileName + ": delete failed: " + ioe + "; will retry later"); } diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java index a7adbe2103d..54083b4e682 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java @@ -92,8 +92,6 @@ abstract class ReplicaNode extends Node { // Obtain a write lock on this index since we "act like" an IndexWriter, to prevent any other IndexWriter or ReplicaNode from using it: writeFileLock = dir.obtainLock(IndexWriter.WRITE_LOCK_NAME); - // nocommit must check for no pending deletes here, like IW does - state = "init"; deleter = new ReplicaFileDeleter(this, dir); } catch (Throwable t) { diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/Jobs.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/Jobs.java index 369414fa448..3cb2fbb77a4 100644 --- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/Jobs.java +++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/Jobs.java @@ -84,6 +84,7 @@ class Jobs extends Thread implements Closeable { topJob.onceDone.run(topJob); } catch (Throwable t2) { node.message("ignore exception calling OnceDone: " + t2); + t2.printStackTrace(System.out); } continue; } diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java index 2510c40e9a1..83ce6cb6cc6 100644 --- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java +++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java @@ -135,9 +135,9 @@ class SimpleReplicaNode extends ReplicaNode { MockDirectoryWrapper dir = LuceneTestCase.newMockFSDirectory(path); dir.setAssertNoUnrefencedFilesOnClose(true); - if (doCheckIndexOnClose) { - dir.setCheckIndexOnClose(false); - } + // nocommit + //dir.setCheckIndexOnClose(doCheckIndexOnClose); + dir.setCheckIndexOnClose(true); // Corrupt any index files not referenced by current commit point; this is important (increases test evilness) because we may have done // a hard crash of the previous JVM writing to this directory and so MDW's corrupt-unknown-files-on-close never ran: diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestNRTReplication.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestNRTReplication.java index cd98b48b3de..2c66994c27d 100644 --- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestNRTReplication.java +++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestNRTReplication.java @@ -39,8 +39,6 @@ import org.apache.lucene.util.TestUtil; import com.carrotsearch.randomizedtesting.SeedUtils; -// nocommit make some explicit failure tests - // MockRandom's .sd file has no index header/footer: @SuppressCodecs({"MockRandom", "Memory", "Direct", "SimpleText"}) @SuppressSysoutChecks(bugUrl = "Stuff gets printed, important stuff for debugging a failure") diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestStressNRTReplication.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestStressNRTReplication.java index 04bbdc11e2f..a765f11b436 100644 --- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestStressNRTReplication.java +++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestStressNRTReplication.java @@ -161,7 +161,8 @@ public class TestStressNRTReplication extends LuceneTestCase { static final boolean DO_BIT_FLIPS_DURING_COPY = true; /** Set to a non-null value to force exactly that many nodes; else, it's random. */ - static final Integer NUM_NODES = null; + // nocommit + static final Integer NUM_NODES = 2; final AtomicBoolean failed = new AtomicBoolean(); @@ -980,10 +981,10 @@ public class TestStressNRTReplication extends LuceneTestCase { continue; } - // nocommit not anymore? - // This can be null if we got the new primary after crash and that primary is still catching up (replaying xlog): + // This can be null if primary is flushing, has already refreshed its searcher, but is e.g. still notifying replicas and hasn't + // yet returned the version to us, in which case this searcher thread can see the version before the main thread has added it to + // versionToMarker: Integer expectedAtLeastHitCount = versionToMarker.get(version); - assertNotNull("version=" + version, expectedAtLeastHitCount); if (expectedAtLeastHitCount != null && expectedAtLeastHitCount > 0 && random().nextInt(10) == 7) { try { diff --git a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java index a5fc3979145..9e25889c499 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java @@ -230,10 +230,6 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { throw (AssertionError) fillOpenTrace(new AssertionError("MockDirectoryWrapper: dest file \"" + dest + "\" is still open: cannot rename"), dest, true); } - if (createdFiles.contains(dest)) { - throw new IOException("MockDirectoryWrapper: dest file \"" + dest + "\" already exists: cannot rename"); - } - boolean success = false; try { in.renameFile(source, dest); @@ -275,7 +271,14 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { for(String fileName : listAll()) { if (fileName.startsWith(IndexFileNames.SEGMENTS)) { System.out.println("MDW: read " + fileName + " to gather files it references"); - knownFiles.addAll(SegmentInfos.readCommit(this, fileName).files(true)); + SegmentInfos infos; + try { + infos = SegmentInfos.readCommit(this, fileName); + } catch (IOException ioe) { + System.out.println("MDW: exception reading segment infos " + fileName + "; files: " + Arrays.toString(listAll())); + throw ioe; + } + knownFiles.addAll(infos.files(true)); } } @@ -438,7 +441,6 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { /** Simulates a crash of OS or machine by overwriting * unsynced files. */ public synchronized void crash() throws IOException { - crashed = true; openFiles = new HashMap<>(); openFilesForWrite = new HashSet<>(); openFilesDeleted = new HashSet<>(); @@ -451,6 +453,7 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { } catch (Exception ignored) {} } corruptFiles(unSyncedFiles); + crashed = true; unSyncedFiles = new HashSet<>(); } @@ -569,6 +572,7 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { unSyncedFiles.remove(name); in.deleteFile(name); + createdFiles.remove(name); } // sets the cause of the incoming ioe to be the stack @@ -829,7 +833,7 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { } throw new RuntimeException("MockDirectoryWrapper: cannot close: there are still open locks: " + openLocks, cause); } - + if (getCheckIndexOnClose()) { randomIOExceptionRate = 0.0; randomIOExceptionRateOnOpen = 0.0; @@ -846,6 +850,7 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { TestUtil.checkIndex(this, getCrossCheckTermVectorsOnClose(), true); // TODO: factor this out / share w/ TestIW.assertNoUnreferencedFiles + // nocommit pull this outside of "getCheckIndexOnClose" if (assertNoUnreferencedFilesOnClose) { // now look for unreferenced files: discount ones that we tried to delete but could not