LUCENE-8835: Respect file extension when listing files form FileSwitchDirectory (#700)

FileSwitchDirectory splits file actions between 2 directories based
on file extensions. The extensions are respected on write operations
like delete or create but ignored when we list the content of the
directories. Until now we only deduplicated the contents on
Directory#listAll which can cause inconsistencies and hard to debug
errors due to double deletions in IndexWriter is a file is pending
delete in one of the directories but still shows up in the directory
listing form the other directory. This case can happen if both
directories point to the same underlying FS directory which is a
common use-case to split between mmap and NIOFS.

This change filters out files from directories depending on their
file extension to make sure files that are deleted in one directory
are not returned form another if they point to the same FS directory.
This commit is contained in:
Simon Willnauer 2019-06-11 17:27:55 +02:00 committed by GitHub
parent abeecfacd4
commit b6c68ccded
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 57 additions and 4 deletions

View File

@ -74,6 +74,10 @@ Bug Fixes
* LUCENE-8843: Don't ignore exceptions that are thrown when trying to open a
file in IOUtils#fsync. (Jason Tedor via Adrien Grand)
* LUCENE-8835: FileSwitchDirectory now respects the file extension when listing directory
contents to ensure we don't expose pending deletes if both directory point to the same
underlying filesystem directory. (Simon Willnauer)
Improvements
* LUCENE-7840: Non-scoring BooleanQuery now removes SHOULD clauses before building the scorer supplier

View File

@ -86,7 +86,7 @@ public class FileSwitchDirectory extends Directory {
@Override
public String[] listAll() throws IOException {
Set<String> files = new HashSet<>();
List<String> files = new ArrayList<>();
// LUCENE-3380: either or both of our dirs could be FSDirs,
// but if one underlying delegate is an FSDir and mkdirs() has not
// yet been called, because so far everything is written to the other,
@ -94,14 +94,24 @@ public class FileSwitchDirectory extends Directory {
NoSuchFileException exc = null;
try {
for(String f : primaryDir.listAll()) {
files.add(f);
String ext = getExtension(f);
// we should respect the extension here as well to ensure that we don't list a file that is already
// deleted or rather in the one of the directories pending deletions if both directories point
// to the same filesystem path. This is quite common for instance to use NIOFS as a primary
// and MMap as a secondary to only mmap files like docvalues or term dictionaries.
if (primaryExtensions.contains(ext)) {
files.add(f);
}
}
} catch (NoSuchFileException e) {
exc = e;
}
try {
for(String f : secondaryDir.listAll()) {
files.add(f);
String ext = getExtension(f);
if (primaryExtensions.contains(ext) == false) {
files.add(f);
}
}
} catch (NoSuchFileException e) {
// we got NoSuchFileException from both dirs

View File

@ -18,10 +18,14 @@ package org.apache.lucene.store;
import java.io.IOException;
import java.net.URI;
import java.nio.file.FileSystem;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Function;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter;
@ -31,9 +35,10 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TestIndexWriterReader;
import org.apache.lucene.mockfile.FilterPath;
import org.apache.lucene.mockfile.WindowsFS;
import org.apache.lucene.util.TestUtil;
// See: https://issues.apache.org/jira/browse/SOLR-12028 Tests cannot remove files on Windows machines occasionally
public class TestFileSwitchDirectory extends BaseDirectoryTestCase {
/**
@ -131,4 +136,30 @@ public class TestFileSwitchDirectory extends BaseDirectoryTestCase {
}
return newFSSwitchDirectory(extensions);
}
public void testDeleteAndList() throws IOException {
// relies on windows semantics
Path path = createTempDir();
FileSystem fs = new WindowsFS(path.getFileSystem()).getFileSystem(URI.create("file:///"));
Path indexPath = new FilterPath(path, fs);
try (final FileSwitchDirectory dir = new FileSwitchDirectory(Collections.singleton("tim"),
new SimpleFSDirectory(indexPath), new SimpleFSDirectory(indexPath), true)) {
dir.createOutput("foo.tim", IOContext.DEFAULT).close();
Function<String[], Long> stripExtra = array -> Arrays.asList(array).stream()
.filter(f -> f.startsWith("extra") == false).count();
try (IndexInput indexInput = dir.openInput("foo.tim", IOContext.DEFAULT)) {
dir.deleteFile("foo.tim");
assertEquals(1, dir.getPrimaryDir().getPendingDeletions().size());
assertEquals(1, dir.getPendingDeletions().size());
assertEquals(0, stripExtra.apply(dir.listAll()).intValue());
assertEquals(0, stripExtra.apply(dir.getPrimaryDir().listAll()).intValue());
assertEquals(1, stripExtra.apply(dir.getSecondaryDir().listAll()).intValue());
}
assertEquals(0, dir.getPrimaryDir().getPendingDeletions().size());
assertEquals(0, dir.getPendingDeletions().size());
assertEquals(0, stripExtra.apply(dir.listAll()).intValue());
assertEquals(0, stripExtra.apply(dir.getPrimaryDir().listAll()).intValue());
assertEquals(0, stripExtra.apply(dir.getSecondaryDir().listAll()).intValue());
}
}
}

View File

@ -91,6 +91,7 @@ import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.FSLockFactory;
import org.apache.lucene.store.FileSwitchDirectory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.LockFactory;
@ -1414,6 +1415,13 @@ public abstract class LuceneTestCase extends Assert {
}
Directory fsdir = newFSDirectoryImpl(clazz, f, lf);
if (rarely()) {
List<String> fileExtensions =
Arrays.asList("fdt", "fdx", "tim", "tip", "si", "fnm", "pos", "dii", "dim", "nvm", "nvd", "dvm", "dvd");
Collections.shuffle(fileExtensions, random());
fileExtensions = fileExtensions.subList(0, 1 + random().nextInt(fileExtensions.size()));
fsdir = new FileSwitchDirectory(new HashSet<>(fileExtensions), fsdir, newFSDirectoryImpl(clazz, f, lf), true);
}
BaseDirectoryWrapper wrapped = wrapDirectory(random(), fsdir, bare);
return wrapped;
} catch (Exception e) {