LUCENE-8853: Try parsing original file extension from tmp file (#716)

FileSwitchDirectory fails if the tmp file are not in the same directory
as the file it's renamed to. This is correct behavior but breaks with
tmp files used with index sorting. This change tries best effort to find
the right extension directory if the file ends with `.tmp`
This commit is contained in:
Simon Willnauer 2019-06-18 08:47:59 +02:00 committed by GitHub
parent 4f6314c59b
commit fb6e28d9f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 72 additions and 14 deletions

View File

@ -83,6 +83,9 @@ Bug Fixes
contents to ensure we don't expose pending deletes if both directory point to the same
underlying filesystem directory. (Simon Willnauer)
* LUCENE-8853: FileSwitchDirectory now applies best effort to place tmp files in the same
direcotry as the target files. (Simon Willnauer)
Improvements
* LUCENE-7840: Non-scoring BooleanQuery now removes SHOULD clauses before building the scorer supplier

View File

@ -24,6 +24,7 @@ import java.nio.file.NoSuchFileException;
import java.util.Collection; // for javadocs
import java.util.Set;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.util.IOUtils;
/**
@ -207,4 +208,13 @@ public abstract class Directory implements Closeable {
* @lucene.internal
*/
public abstract Set<String> getPendingDeletions() throws IOException;
/**
* Creates a file name for a temporary file. The name will start with
* {@code prefix}, end with {@code suffix} and have a reserved file extension {@code .tmp}.
* @see #createTempOutput(String, String, IOContext)
*/
protected static String getTempFileName(String prefix, String suffix, long counter) {
return IndexFileNames.segmentFileName(prefix, suffix + "_" + Long.toString(counter, Character.MAX_RADIX), "tmp");
}
}

View File

@ -41,7 +41,6 @@ import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.IOUtils;
@ -261,7 +260,7 @@ public abstract class FSDirectory extends BaseDirectory {
maybeDeletePendingFiles();
while (true) {
try {
String name = IndexFileNames.segmentFileName(prefix, suffix + "_" + Long.toString(nextTempFileCounter.getAndIncrement(), Character.MAX_RADIX), "tmp");
String name = getTempFileName(prefix, suffix, nextTempFileCounter.getAndIncrement());
if (pendingDeletes.contains(name)) {
continue;
}

View File

@ -27,6 +27,8 @@ import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.util.IOUtils;
@ -53,8 +55,12 @@ public class FileSwitchDirectory extends Directory {
private final Directory primaryDir;
private final Set<String> primaryExtensions;
private boolean doClose;
private static final Pattern EXT_PATTERN = Pattern.compile("\\.([a-zA-Z]+)");
public FileSwitchDirectory(Set<String> primaryExtensions, Directory primaryDir, Directory secondaryDir, boolean doClose) {
if (primaryExtensions.contains("tmp")) {
throw new IllegalArgumentException("tmp is a reserved extension");
}
this.primaryExtensions = primaryExtensions;
this.primaryDir = primaryDir;
this.secondaryDir = secondaryDir;
@ -141,7 +147,14 @@ public class FileSwitchDirectory extends Directory {
if (i == -1) {
return "";
}
return name.substring(i+1, name.length());
String ext = name.substring(i + 1);
if (ext.equals("tmp")) {
Matcher matcher = EXT_PATTERN.matcher(name.substring(0, i + 1));
if (matcher.find()) {
return matcher.group(1);
}
}
return ext;
}
private Directory getDirectory(String name) {
@ -174,7 +187,12 @@ public class FileSwitchDirectory extends Directory {
@Override
public IndexOutput createTempOutput(String prefix, String suffix, IOContext context) throws IOException {
return getDirectory("."+suffix).createTempOutput(prefix, suffix, context);
// this is best effort - it's ok to create a tmp file with any prefix and suffix. Yet if this file is then
// in-turn used to rename they must match to the same directory hence we use the full file-name to find
// the right directory. Here we can't make a decision but we need to ensure that all other operations
// map to the right directory.
String tmpFileName = getTempFileName(prefix, suffix, 0);
return getDirectory(tmpFileName).createTempOutput(prefix, suffix, context);
}
@Override
@ -183,10 +201,11 @@ public class FileSwitchDirectory extends Directory {
List<String> secondaryNames = new ArrayList<>();
for (String name : names)
if (primaryExtensions.contains(getExtension(name)))
if (primaryExtensions.contains(getExtension(name))) {
primaryNames.add(name);
else
} else {
secondaryNames.add(name);
}
primaryDir.sync(primaryNames);
secondaryDir.sync(secondaryNames);

View File

@ -19,6 +19,7 @@ package org.apache.lucene.store;
import java.io.IOException;
import java.net.URI;
import java.nio.file.AtomicMoveNotSupportedException;
import java.nio.file.FileSystem;
import java.nio.file.Path;
import java.util.Arrays;
@ -114,6 +115,32 @@ public class TestFileSwitchDirectory extends BaseDirectoryTestCase {
dir.close();
}
public void testRenameTmpFile() throws IOException {
try (Directory directory = getDirectory(createTempDir())) {
final String name;
try (IndexOutput out = directory.createTempOutput("foo.cfs", "", IOContext.DEFAULT)) {
out.writeInt(1);
name = out.getName();
}
assertEquals(1, Arrays.stream(directory.listAll()).filter(f -> f.equals(name)).count());
assertEquals(0, Arrays.stream(directory.listAll()).filter(f -> f.equals("foo.cfs")).count());
directory.rename(name, "foo.cfs");
assertEquals(1, Arrays.stream(directory.listAll()).filter(f -> f.equals("foo.cfs")).count());
assertEquals(0, Arrays.stream(directory.listAll()).filter(f -> f.equals(name)).count());
}
try (Directory directory= newFSSwitchDirectory(Collections.singleton("bar"))) {
String brokenName;
try (IndexOutput out = directory.createTempOutput("foo", "bar", IOContext.DEFAULT)) {
out.writeInt(1);
brokenName = out.getName();
}
AtomicMoveNotSupportedException exception = expectThrows(AtomicMoveNotSupportedException.class,
() -> directory.rename(brokenName, "foo.bar"));
assertEquals("foo_bar_0.tmp -> foo.bar: source and dest are in different directories", exception.getMessage());
}
}
@Override
protected Directory getDirectory(Path path) throws IOException {
Set<String> extensions = new HashSet<String>();

View File

@ -91,6 +91,7 @@ import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.FSLockFactory;
import org.apache.lucene.store.FileSwitchDirectory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.LockFactory;
@ -1414,14 +1415,13 @@ public abstract class LuceneTestCase extends Assert {
}
Directory fsdir = newFSDirectoryImpl(clazz, f, lf);
// LUCENE-8853: FileSwitchDirectory is broken for tmp outputs.
// if (rarely()) {
// List<String> fileExtensions =
// Arrays.asList("fdt", "fdx", "tim", "tip", "si", "fnm", "pos", "dii", "dim", "nvm", "nvd", "dvm", "dvd");
// Collections.shuffle(fileExtensions, random());
// fileExtensions = fileExtensions.subList(0, 1 + random().nextInt(fileExtensions.size()));
// fsdir = new FileSwitchDirectory(new HashSet<>(fileExtensions), fsdir, newFSDirectoryImpl(clazz, f, lf), true);
// }
if (rarely()) {
List<String> fileExtensions =
Arrays.asList("fdt", "fdx", "tim", "tip", "si", "fnm", "pos", "dii", "dim", "nvm", "nvd", "dvm", "dvd");
Collections.shuffle(fileExtensions, random());
fileExtensions = fileExtensions.subList(0, 1 + random().nextInt(fileExtensions.size()));
fsdir = new FileSwitchDirectory(new HashSet<>(fileExtensions), fsdir, newFSDirectoryImpl(clazz, f, lf), true);
}
BaseDirectoryWrapper wrapped = wrapDirectory(random(), fsdir, bare);
return wrapped;
} catch (Exception e) {