fixup issues with 32-bit jvm

If you run tests under a 32-bit jvm, you will get a test failure in IndexStoreTests,
the logic there is wrong in the case of 32-bit (its NIOFSDirectory on linux).

Also if mlockall fails, you'll see huge bogus values (because of use of `long` instead of `NativeLong`)

finally add seccomp support for 32 bit too, and clean up all its `long` usage as well.
This commit is contained in:
Robert Muir 2015-11-08 21:08:08 -05:00
parent bfc235f2ed
commit 0de503b34b
4 changed files with 87 additions and 48 deletions

View File

@ -20,6 +20,7 @@
package org.elasticsearch.bootstrap;
import com.sun.jna.Native;
import com.sun.jna.NativeLong;
import com.sun.jna.Structure;
import org.apache.lucene.util.Constants;
@ -55,8 +56,8 @@ final class JNACLibrary {
/** corresponds to struct rlimit */
public static final class Rlimit extends Structure implements Structure.ByReference {
public long rlim_cur = 0;
public long rlim_max = 0;
public NativeLong rlim_cur = new NativeLong(0);
public NativeLong rlim_max = new NativeLong(0);
@Override
protected List<String> getFieldOrder() {

View File

@ -71,8 +71,8 @@ class JNANatives {
JNACLibrary.Rlimit rlimit = new JNACLibrary.Rlimit();
if (JNACLibrary.getrlimit(JNACLibrary.RLIMIT_MEMLOCK, rlimit) == 0) {
rlimitSuccess = true;
softLimit = rlimit.rlim_cur;
hardLimit = rlimit.rlim_max;
softLimit = rlimit.rlim_cur.longValue();
hardLimit = rlimit.rlim_max.longValue();
} else {
logger.warn("Unable to retrieve resource limits: " + JNACLibrary.strerror(Native.getLastError()));
}

View File

@ -22,6 +22,7 @@ package org.elasticsearch.bootstrap;
import com.sun.jna.Library;
import com.sun.jna.Memory;
import com.sun.jna.Native;
import com.sun.jna.NativeLong;
import com.sun.jna.Pointer;
import com.sun.jna.Structure;
import com.sun.jna.ptr.PointerByReference;
@ -38,7 +39,9 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Installs a limited form of secure computing mode,
@ -46,7 +49,7 @@ import java.util.List;
* <p>
* This is only supported on the Linux, Solaris, and Mac OS X operating systems.
* <p>
* On Linux it currently supports on the amd64 architecture, on Linux kernels 3.5 or above, and requires
* On Linux it currently supports amd64 and i386 architectures, requires Linux kernel 3.5 or above, and requires
* {@code CONFIG_SECCOMP} and {@code CONFIG_SECCOMP_FILTER} compiled into the kernel.
* <p>
* On Linux BPF Filters are installed using either {@code seccomp(2)} (3.17+) or {@code prctl(2)} (3.5+). {@code seccomp(2)}
@ -95,12 +98,12 @@ final class Seccomp {
/**
* maps to prctl(2)
*/
int prctl(int option, long arg2, long arg3, long arg4, long arg5);
int prctl(int option, NativeLong arg2, NativeLong arg3, NativeLong arg4, NativeLong arg5);
/**
* used to call seccomp(2), its too new...
* this is the only way, DONT use it on some other architecture unless you know wtf you are doing
*/
long syscall(long number, Object... args);
NativeLong syscall(NativeLong number, Object... args);
};
// null if unavailable or something goes wrong.
@ -119,7 +122,6 @@ final class Seccomp {
}
/** the preferred method is seccomp(2), since we can apply to all threads of the process */
static final int SECCOMP_SYSCALL_NR = 317; // since Linux 3.17
static final int SECCOMP_SET_MODE_FILTER = 1; // since Linux 3.17
static final int SECCOMP_FILTER_FLAG_TSYNC = 1; // since Linux 3.17
@ -190,7 +192,6 @@ final class Seccomp {
return new SockFilter((short) code, (byte) jt, (byte) jf, k);
}
static final int AUDIT_ARCH_X86_64 = 0xC000003E;
static final int SECCOMP_RET_ERRNO = 0x00050000;
static final int SECCOMP_RET_DATA = 0x0000FFFF;
static final int SECCOMP_RET_ALLOW = 0x7FFF0000;
@ -201,29 +202,63 @@ final class Seccomp {
static final int EINVAL = 0x16;
static final int ENOSYS = 0x26;
// offsets (arch dependent) that our BPF checks
// offsets that our BPF checks
// check with offsetof() when adding a new arch, move to Arch if different.
static final int SECCOMP_DATA_NR_OFFSET = 0x00;
static final int SECCOMP_DATA_ARCH_OFFSET = 0x04;
// currently these ranges are blocked (inclusive):
// execve is really the only one needed but why let someone fork a 30G heap? (not really what happens)
// ...
// 57: fork
// 58: vfork
// 59: execve
// ...
// 322: execveat
// ...
static final int NR_SYSCALL_FORK = 57;
static final int NR_SYSCALL_EXECVE = 59;
static final int NR_SYSCALL_EXECVEAT = 322; // since Linux 3.19
static final int NR_SYSCALL_TUXCALL = 184; // should return ENOSYS
static class Arch {
/** AUDIT_ARCH_XXX constant from linux/audit.h */
final int audit;
/** syscall limit (necessary for blacklisting on amd64, to ban 32-bit syscalls) */
final int limit;
/** __NR_fork */
final int fork;
/** __NR_vfork */
final int vfork;
/** __NR_execve */
final int execve;
/** __NR_execveat */
final int execveat;
/** __NR_seccomp */
final int seccomp;
Arch(int audit, int limit, int fork, int vfork, int execve, int execveat, int seccomp) {
this.audit = audit;
this.limit = limit;
this.fork = fork;
this.vfork = vfork;
this.execve = execve;
this.execveat = execveat;
this.seccomp = seccomp;
}
}
/** supported architectures map keyed by os.arch */
private static final Map<String,Arch> ARCHITECTURES;
static {
Map<String,Arch> m = new HashMap<>();
m.put("amd64", new Arch(0xC000003E, 0x3FFFFFFF, 57, 58, 59, 322, 317));
m.put("i386", new Arch(0x40000003, 0xFFFFFFFF, 2, 190, 11, 358, 354));
ARCHITECTURES = Collections.unmodifiableMap(m);
}
/** invokes prctl() from linux libc library */
private static int linux_prctl(int option, long arg2, long arg3, long arg4, long arg5) {
return linux_libc.prctl(option, new NativeLong(arg2), new NativeLong(arg3), new NativeLong(arg4), new NativeLong(arg5));
}
/** invokes syscall() from linux libc library */
private static long linux_syscall(long number, Object... args) {
return linux_libc.syscall(new NativeLong(number), args).longValue();
}
/** try to install our BPF filters via seccomp() or prctl() to block execution */
private static int linuxImpl() {
// first be defensive: we can give nice errors this way, at the very least.
// also, some of these security features get backported to old versions, checking kernel version here is a big no-no!
boolean supported = Constants.LINUX && "amd64".equals(Constants.OS_ARCH);
final Arch arch = ARCHITECTURES.get(Constants.OS_ARCH);
boolean supported = Constants.LINUX && arch != null;
if (supported == false) {
throw new UnsupportedOperationException("seccomp unavailable: '" + Constants.OS_ARCH + "' architecture unsupported");
}
@ -237,7 +272,7 @@ final class Seccomp {
// check that unimplemented syscalls actually return ENOSYS
// you never know (e.g. https://code.google.com/p/chromium/issues/detail?id=439795)
if (linux_libc.syscall(NR_SYSCALL_TUXCALL) >= 0 || Native.getLastError() != ENOSYS) {
if (linux_syscall(999) >= 0 || Native.getLastError() != ENOSYS) {
throw new UnsupportedOperationException("seccomp unavailable: your kernel is buggy and you should upgrade");
}
@ -246,7 +281,7 @@ final class Seccomp {
final int bogusArg = 0xf7a46a5c;
// test seccomp(BOGUS)
long ret = linux_libc.syscall(SECCOMP_SYSCALL_NR, bogusArg);
long ret = linux_syscall(arch.seccomp, bogusArg);
if (ret != -1) {
throw new UnsupportedOperationException("seccomp unavailable: seccomp(BOGUS_OPERATION) returned " + ret);
} else {
@ -259,7 +294,7 @@ final class Seccomp {
}
// test seccomp(VALID, BOGUS)
ret = linux_libc.syscall(SECCOMP_SYSCALL_NR, SECCOMP_SET_MODE_FILTER, bogusArg);
ret = linux_syscall(arch.seccomp, SECCOMP_SET_MODE_FILTER, bogusArg);
if (ret != -1) {
throw new UnsupportedOperationException("seccomp unavailable: seccomp(SECCOMP_SET_MODE_FILTER, BOGUS_FLAG) returned " + ret);
} else {
@ -272,7 +307,7 @@ final class Seccomp {
}
// test prctl(BOGUS)
ret = linux_libc.prctl(bogusArg, 0, 0, 0, 0);
ret = linux_prctl(bogusArg, 0, 0, 0, 0);
if (ret != -1) {
throw new UnsupportedOperationException("seccomp unavailable: prctl(BOGUS_OPTION) returned " + ret);
} else {
@ -287,7 +322,7 @@ final class Seccomp {
// now just normal defensive checks
// check for GET_NO_NEW_PRIVS
switch (linux_libc.prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0)) {
switch (linux_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0)) {
case 0: break; // not yet set
case 1: break; // already set by caller
default:
@ -299,7 +334,7 @@ final class Seccomp {
}
}
// check for SECCOMP
switch (linux_libc.prctl(PR_GET_SECCOMP, 0, 0, 0, 0)) {
switch (linux_prctl(PR_GET_SECCOMP, 0, 0, 0, 0)) {
case 0: break; // not yet set
case 2: break; // already in filter mode by caller
default:
@ -311,7 +346,7 @@ final class Seccomp {
}
}
// check for SECCOMP_MODE_FILTER
if (linux_libc.prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 0, 0, 0) != 0) {
if (linux_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 0, 0, 0) != 0) {
int errno = Native.getLastError();
switch (errno) {
case EFAULT: break; // available
@ -321,27 +356,28 @@ final class Seccomp {
}
// ok, now set PR_SET_NO_NEW_PRIVS, needed to be able to set a seccomp filter as ordinary user
if (linux_libc.prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
if (linux_prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
throw new UnsupportedOperationException("prctl(PR_SET_NO_NEW_PRIVS): " + JNACLibrary.strerror(Native.getLastError()));
}
// check it worked
if (linux_libc.prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) != 1) {
if (linux_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) != 1) {
throw new UnsupportedOperationException("seccomp filter did not really succeed: prctl(PR_GET_NO_NEW_PRIVS): " + JNACLibrary.strerror(Native.getLastError()));
}
// BPF installed to check arch, then syscall range. See https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt for details.
// BPF installed to check arch, limit, then syscall. See https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt for details.
SockFilter insns[] = {
/* 1 */ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_ARCH_OFFSET), //
/* 2 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, AUDIT_ARCH_X86_64, 0, 4), // if (arch != amd64) goto fail;
/* 3 */ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_NR_OFFSET), //
/* 4 */ BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, NR_SYSCALL_FORK, 0, 3), // if (syscall < SYSCALL_FORK) goto pass;
/* 5 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, NR_SYSCALL_EXECVEAT, 1, 0), // if (syscall == SYSCALL_EXECVEAT) goto fail;
/* 6 */ BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, NR_SYSCALL_EXECVE, 1, 0), // if (syscall > SYSCALL_EXECVE) goto pass;
/* 7 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ERRNO | (EACCES & SECCOMP_RET_DATA)), // fail: return EACCES;
/* 8 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW) // pass: return OK;
/* 1 */ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_ARCH_OFFSET), //
/* 2 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.audit, 0, 7), // if (arch != audit) goto fail;
/* 3 */ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_NR_OFFSET), //
/* 4 */ BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, arch.limit, 5, 0), // if (syscall > LIMIT) goto fail;
/* 5 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.fork, 4, 0), // if (syscall == FORK) goto fail;
/* 6 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.vfork, 3, 0), // if (syscall == VFORK) goto fail;
/* 7 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.execve, 2, 0), // if (syscall == EXECVE) goto fail;
/* 8 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.execveat, 1, 0), // if (syscall == EXECVEAT) goto fail;
/* 9 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), // pass: return OK;
/* 10 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ERRNO | (EACCES & SECCOMP_RET_DATA)), // fail: return EACCES;
};
// seccomp takes a long, so we pass it one explicitly to keep the JNA simple
SockFProg prog = new SockFProg(insns);
prog.write();
@ -350,13 +386,13 @@ final class Seccomp {
int method = 1;
// install filter, if this works, after this there is no going back!
// first try it with seccomp(SECCOMP_SET_MODE_FILTER), falling back to prctl()
if (linux_libc.syscall(SECCOMP_SYSCALL_NR, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, pointer) != 0) {
if (linux_syscall(arch.seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, new NativeLong(pointer)) != 0) {
method = 0;
int errno1 = Native.getLastError();
if (logger.isDebugEnabled()) {
logger.debug("seccomp(SECCOMP_SET_MODE_FILTER): " + JNACLibrary.strerror(errno1) + ", falling back to prctl(PR_SET_SECCOMP)...");
}
if (linux_libc.prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, pointer, 0, 0) != 0) {
if (linux_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, pointer, 0, 0) != 0) {
int errno2 = Native.getLastError();
throw new UnsupportedOperationException("seccomp(SECCOMP_SET_MODE_FILTER): " + JNACLibrary.strerror(errno1) +
", prctl(PR_SET_SECCOMP): " + JNACLibrary.strerror(errno2));
@ -364,7 +400,7 @@ final class Seccomp {
}
// now check that the filter was really installed, we should be in filter mode.
if (linux_libc.prctl(PR_GET_SECCOMP, 0, 0, 0, 0) != 2) {
if (linux_prctl(PR_GET_SECCOMP, 0, 0, 0, 0) != 2) {
throw new UnsupportedOperationException("seccomp filter installation did not really succeed. seccomp(PR_GET_SECCOMP): " + JNACLibrary.strerror(Native.getLastError()));
}

View File

@ -84,8 +84,10 @@ public class IndexStoreTests extends ESTestCase {
try (final Directory directory = service.newFSDirectory(tempDir, NoLockFactory.INSTANCE)) {
if (Constants.WINDOWS) {
assertTrue(directory.toString(), directory instanceof MMapDirectory || directory instanceof SimpleFSDirectory);
} else {
} else if (Constants.JRE_IS_64BIT) {
assertTrue(directory.toString(), directory instanceof FileSwitchDirectory);
} else {
assertTrue(directory.toString(), directory instanceof NIOFSDirectory);
}
}
}