From b6c68ccdedc7bedb73d90c7e21b285b4e71f7ff4 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 11 Jun 2019 17:27:55 +0200 Subject: [PATCH] LUCENE-8835: Respect file extension when listing files form FileSwitchDirectory (#700) FileSwitchDirectory splits file actions between 2 directories based on file extensions. The extensions are respected on write operations like delete or create but ignored when we list the content of the directories. Until now we only deduplicated the contents on Directory#listAll which can cause inconsistencies and hard to debug errors due to double deletions in IndexWriter is a file is pending delete in one of the directories but still shows up in the directory listing form the other directory. This case can happen if both directories point to the same underlying FS directory which is a common use-case to split between mmap and NIOFS. This change filters out files from directories depending on their file extension to make sure files that are deleted in one directory are not returned form another if they point to the same FS directory. --- lucene/CHANGES.txt | 4 +++ .../lucene/store/FileSwitchDirectory.java | 16 +++++++-- .../lucene/store/TestFileSwitchDirectory.java | 33 ++++++++++++++++++- .../apache/lucene/util/LuceneTestCase.java | 8 +++++ 4 files changed, 57 insertions(+), 4 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 148cf4b31d2..0c80901b86a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -74,6 +74,10 @@ Bug Fixes * LUCENE-8843: Don't ignore exceptions that are thrown when trying to open a file in IOUtils#fsync. (Jason Tedor via Adrien Grand) +* LUCENE-8835: FileSwitchDirectory now respects the file extension when listing directory + contents to ensure we don't expose pending deletes if both directory point to the same + underlying filesystem directory. (Simon Willnauer) + Improvements * LUCENE-7840: Non-scoring BooleanQuery now removes SHOULD clauses before building the scorer supplier diff --git a/lucene/core/src/java/org/apache/lucene/store/FileSwitchDirectory.java b/lucene/core/src/java/org/apache/lucene/store/FileSwitchDirectory.java index da52c5d1a96..7484ae00f56 100644 --- a/lucene/core/src/java/org/apache/lucene/store/FileSwitchDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/FileSwitchDirectory.java @@ -86,7 +86,7 @@ public class FileSwitchDirectory extends Directory { @Override public String[] listAll() throws IOException { - Set files = new HashSet<>(); + List files = new ArrayList<>(); // LUCENE-3380: either or both of our dirs could be FSDirs, // but if one underlying delegate is an FSDir and mkdirs() has not // yet been called, because so far everything is written to the other, @@ -94,14 +94,24 @@ public class FileSwitchDirectory extends Directory { NoSuchFileException exc = null; try { for(String f : primaryDir.listAll()) { - files.add(f); + String ext = getExtension(f); + // we should respect the extension here as well to ensure that we don't list a file that is already + // deleted or rather in the one of the directories pending deletions if both directories point + // to the same filesystem path. This is quite common for instance to use NIOFS as a primary + // and MMap as a secondary to only mmap files like docvalues or term dictionaries. + if (primaryExtensions.contains(ext)) { + files.add(f); + } } } catch (NoSuchFileException e) { exc = e; } try { for(String f : secondaryDir.listAll()) { - files.add(f); + String ext = getExtension(f); + if (primaryExtensions.contains(ext) == false) { + files.add(f); + } } } catch (NoSuchFileException e) { // we got NoSuchFileException from both dirs diff --git a/lucene/core/src/test/org/apache/lucene/store/TestFileSwitchDirectory.java b/lucene/core/src/test/org/apache/lucene/store/TestFileSwitchDirectory.java index f324bb35108..031d5eb8872 100644 --- a/lucene/core/src/test/org/apache/lucene/store/TestFileSwitchDirectory.java +++ b/lucene/core/src/test/org/apache/lucene/store/TestFileSwitchDirectory.java @@ -18,10 +18,14 @@ package org.apache.lucene.store; import java.io.IOException; +import java.net.URI; +import java.nio.file.FileSystem; import java.nio.file.Path; +import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Set; +import java.util.function.Function; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter; @@ -31,9 +35,10 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.TestIndexWriterReader; +import org.apache.lucene.mockfile.FilterPath; +import org.apache.lucene.mockfile.WindowsFS; import org.apache.lucene.util.TestUtil; -// See: https://issues.apache.org/jira/browse/SOLR-12028 Tests cannot remove files on Windows machines occasionally public class TestFileSwitchDirectory extends BaseDirectoryTestCase { /** @@ -131,4 +136,30 @@ public class TestFileSwitchDirectory extends BaseDirectoryTestCase { } return newFSSwitchDirectory(extensions); } + + public void testDeleteAndList() throws IOException { + // relies on windows semantics + Path path = createTempDir(); + FileSystem fs = new WindowsFS(path.getFileSystem()).getFileSystem(URI.create("file:///")); + Path indexPath = new FilterPath(path, fs); + try (final FileSwitchDirectory dir = new FileSwitchDirectory(Collections.singleton("tim"), + new SimpleFSDirectory(indexPath), new SimpleFSDirectory(indexPath), true)) { + dir.createOutput("foo.tim", IOContext.DEFAULT).close(); + Function stripExtra = array -> Arrays.asList(array).stream() + .filter(f -> f.startsWith("extra") == false).count(); + try (IndexInput indexInput = dir.openInput("foo.tim", IOContext.DEFAULT)) { + dir.deleteFile("foo.tim"); + assertEquals(1, dir.getPrimaryDir().getPendingDeletions().size()); + assertEquals(1, dir.getPendingDeletions().size()); + assertEquals(0, stripExtra.apply(dir.listAll()).intValue()); + assertEquals(0, stripExtra.apply(dir.getPrimaryDir().listAll()).intValue()); + assertEquals(1, stripExtra.apply(dir.getSecondaryDir().listAll()).intValue()); + } + assertEquals(0, dir.getPrimaryDir().getPendingDeletions().size()); + assertEquals(0, dir.getPendingDeletions().size()); + assertEquals(0, stripExtra.apply(dir.listAll()).intValue()); + assertEquals(0, stripExtra.apply(dir.getPrimaryDir().listAll()).intValue()); + assertEquals(0, stripExtra.apply(dir.getSecondaryDir().listAll()).intValue()); + } + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java index ae73b3ff032..e5159a784e3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java @@ -91,6 +91,7 @@ import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSLockFactory; +import org.apache.lucene.store.FileSwitchDirectory; import org.apache.lucene.store.FlushInfo; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.LockFactory; @@ -1414,6 +1415,13 @@ public abstract class LuceneTestCase extends Assert { } Directory fsdir = newFSDirectoryImpl(clazz, f, lf); + if (rarely()) { + List fileExtensions = + Arrays.asList("fdt", "fdx", "tim", "tip", "si", "fnm", "pos", "dii", "dim", "nvm", "nvd", "dvm", "dvd"); + Collections.shuffle(fileExtensions, random()); + fileExtensions = fileExtensions.subList(0, 1 + random().nextInt(fileExtensions.size())); + fsdir = new FileSwitchDirectory(new HashSet<>(fileExtensions), fsdir, newFSDirectoryImpl(clazz, f, lf), true); + } BaseDirectoryWrapper wrapped = wrapDirectory(random(), fsdir, bare); return wrapped; } catch (Exception e) {