fixup issues with 32-bit jvm
If you run tests under a 32-bit jvm, you will get a test failure in IndexStoreTests, the logic there is wrong in the case of 32-bit (its NIOFSDirectory on linux). Also if mlockall fails, you'll see huge bogus values (because of use of `long` instead of `NativeLong`) finally add seccomp support for 32 bit too, and clean up all its `long` usage as well.
This commit is contained in:
parent
bfc235f2ed
commit
0de503b34b
|
@ -20,6 +20,7 @@
|
|||
package org.elasticsearch.bootstrap;
|
||||
|
||||
import com.sun.jna.Native;
|
||||
import com.sun.jna.NativeLong;
|
||||
import com.sun.jna.Structure;
|
||||
|
||||
import org.apache.lucene.util.Constants;
|
||||
|
@ -55,8 +56,8 @@ final class JNACLibrary {
|
|||
|
||||
/** corresponds to struct rlimit */
|
||||
public static final class Rlimit extends Structure implements Structure.ByReference {
|
||||
public long rlim_cur = 0;
|
||||
public long rlim_max = 0;
|
||||
public NativeLong rlim_cur = new NativeLong(0);
|
||||
public NativeLong rlim_max = new NativeLong(0);
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
|
|
|
@ -71,8 +71,8 @@ class JNANatives {
|
|||
JNACLibrary.Rlimit rlimit = new JNACLibrary.Rlimit();
|
||||
if (JNACLibrary.getrlimit(JNACLibrary.RLIMIT_MEMLOCK, rlimit) == 0) {
|
||||
rlimitSuccess = true;
|
||||
softLimit = rlimit.rlim_cur;
|
||||
hardLimit = rlimit.rlim_max;
|
||||
softLimit = rlimit.rlim_cur.longValue();
|
||||
hardLimit = rlimit.rlim_max.longValue();
|
||||
} else {
|
||||
logger.warn("Unable to retrieve resource limits: " + JNACLibrary.strerror(Native.getLastError()));
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ package org.elasticsearch.bootstrap;
|
|||
import com.sun.jna.Library;
|
||||
import com.sun.jna.Memory;
|
||||
import com.sun.jna.Native;
|
||||
import com.sun.jna.NativeLong;
|
||||
import com.sun.jna.Pointer;
|
||||
import com.sun.jna.Structure;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
@ -38,7 +39,9 @@ import java.nio.file.Files;
|
|||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Installs a limited form of secure computing mode,
|
||||
|
@ -46,7 +49,7 @@ import java.util.List;
|
|||
* <p>
|
||||
* This is only supported on the Linux, Solaris, and Mac OS X operating systems.
|
||||
* <p>
|
||||
* On Linux it currently supports on the amd64 architecture, on Linux kernels 3.5 or above, and requires
|
||||
* On Linux it currently supports amd64 and i386 architectures, requires Linux kernel 3.5 or above, and requires
|
||||
* {@code CONFIG_SECCOMP} and {@code CONFIG_SECCOMP_FILTER} compiled into the kernel.
|
||||
* <p>
|
||||
* On Linux BPF Filters are installed using either {@code seccomp(2)} (3.17+) or {@code prctl(2)} (3.5+). {@code seccomp(2)}
|
||||
|
@ -95,12 +98,12 @@ final class Seccomp {
|
|||
/**
|
||||
* maps to prctl(2)
|
||||
*/
|
||||
int prctl(int option, long arg2, long arg3, long arg4, long arg5);
|
||||
int prctl(int option, NativeLong arg2, NativeLong arg3, NativeLong arg4, NativeLong arg5);
|
||||
/**
|
||||
* used to call seccomp(2), its too new...
|
||||
* this is the only way, DONT use it on some other architecture unless you know wtf you are doing
|
||||
*/
|
||||
long syscall(long number, Object... args);
|
||||
NativeLong syscall(NativeLong number, Object... args);
|
||||
};
|
||||
|
||||
// null if unavailable or something goes wrong.
|
||||
|
@ -119,7 +122,6 @@ final class Seccomp {
|
|||
}
|
||||
|
||||
/** the preferred method is seccomp(2), since we can apply to all threads of the process */
|
||||
static final int SECCOMP_SYSCALL_NR = 317; // since Linux 3.17
|
||||
static final int SECCOMP_SET_MODE_FILTER = 1; // since Linux 3.17
|
||||
static final int SECCOMP_FILTER_FLAG_TSYNC = 1; // since Linux 3.17
|
||||
|
||||
|
@ -190,7 +192,6 @@ final class Seccomp {
|
|||
return new SockFilter((short) code, (byte) jt, (byte) jf, k);
|
||||
}
|
||||
|
||||
static final int AUDIT_ARCH_X86_64 = 0xC000003E;
|
||||
static final int SECCOMP_RET_ERRNO = 0x00050000;
|
||||
static final int SECCOMP_RET_DATA = 0x0000FFFF;
|
||||
static final int SECCOMP_RET_ALLOW = 0x7FFF0000;
|
||||
|
@ -201,29 +202,63 @@ final class Seccomp {
|
|||
static final int EINVAL = 0x16;
|
||||
static final int ENOSYS = 0x26;
|
||||
|
||||
// offsets (arch dependent) that our BPF checks
|
||||
// offsets that our BPF checks
|
||||
// check with offsetof() when adding a new arch, move to Arch if different.
|
||||
static final int SECCOMP_DATA_NR_OFFSET = 0x00;
|
||||
static final int SECCOMP_DATA_ARCH_OFFSET = 0x04;
|
||||
|
||||
// currently these ranges are blocked (inclusive):
|
||||
// execve is really the only one needed but why let someone fork a 30G heap? (not really what happens)
|
||||
// ...
|
||||
// 57: fork
|
||||
// 58: vfork
|
||||
// 59: execve
|
||||
// ...
|
||||
// 322: execveat
|
||||
// ...
|
||||
static final int NR_SYSCALL_FORK = 57;
|
||||
static final int NR_SYSCALL_EXECVE = 59;
|
||||
static final int NR_SYSCALL_EXECVEAT = 322; // since Linux 3.19
|
||||
static final int NR_SYSCALL_TUXCALL = 184; // should return ENOSYS
|
||||
static class Arch {
|
||||
/** AUDIT_ARCH_XXX constant from linux/audit.h */
|
||||
final int audit;
|
||||
/** syscall limit (necessary for blacklisting on amd64, to ban 32-bit syscalls) */
|
||||
final int limit;
|
||||
/** __NR_fork */
|
||||
final int fork;
|
||||
/** __NR_vfork */
|
||||
final int vfork;
|
||||
/** __NR_execve */
|
||||
final int execve;
|
||||
/** __NR_execveat */
|
||||
final int execveat;
|
||||
/** __NR_seccomp */
|
||||
final int seccomp;
|
||||
|
||||
Arch(int audit, int limit, int fork, int vfork, int execve, int execveat, int seccomp) {
|
||||
this.audit = audit;
|
||||
this.limit = limit;
|
||||
this.fork = fork;
|
||||
this.vfork = vfork;
|
||||
this.execve = execve;
|
||||
this.execveat = execveat;
|
||||
this.seccomp = seccomp;
|
||||
}
|
||||
}
|
||||
|
||||
/** supported architectures map keyed by os.arch */
|
||||
private static final Map<String,Arch> ARCHITECTURES;
|
||||
static {
|
||||
Map<String,Arch> m = new HashMap<>();
|
||||
m.put("amd64", new Arch(0xC000003E, 0x3FFFFFFF, 57, 58, 59, 322, 317));
|
||||
m.put("i386", new Arch(0x40000003, 0xFFFFFFFF, 2, 190, 11, 358, 354));
|
||||
ARCHITECTURES = Collections.unmodifiableMap(m);
|
||||
}
|
||||
|
||||
/** invokes prctl() from linux libc library */
|
||||
private static int linux_prctl(int option, long arg2, long arg3, long arg4, long arg5) {
|
||||
return linux_libc.prctl(option, new NativeLong(arg2), new NativeLong(arg3), new NativeLong(arg4), new NativeLong(arg5));
|
||||
}
|
||||
|
||||
/** invokes syscall() from linux libc library */
|
||||
private static long linux_syscall(long number, Object... args) {
|
||||
return linux_libc.syscall(new NativeLong(number), args).longValue();
|
||||
}
|
||||
|
||||
/** try to install our BPF filters via seccomp() or prctl() to block execution */
|
||||
private static int linuxImpl() {
|
||||
// first be defensive: we can give nice errors this way, at the very least.
|
||||
// also, some of these security features get backported to old versions, checking kernel version here is a big no-no!
|
||||
boolean supported = Constants.LINUX && "amd64".equals(Constants.OS_ARCH);
|
||||
final Arch arch = ARCHITECTURES.get(Constants.OS_ARCH);
|
||||
boolean supported = Constants.LINUX && arch != null;
|
||||
if (supported == false) {
|
||||
throw new UnsupportedOperationException("seccomp unavailable: '" + Constants.OS_ARCH + "' architecture unsupported");
|
||||
}
|
||||
|
@ -237,7 +272,7 @@ final class Seccomp {
|
|||
|
||||
// check that unimplemented syscalls actually return ENOSYS
|
||||
// you never know (e.g. https://code.google.com/p/chromium/issues/detail?id=439795)
|
||||
if (linux_libc.syscall(NR_SYSCALL_TUXCALL) >= 0 || Native.getLastError() != ENOSYS) {
|
||||
if (linux_syscall(999) >= 0 || Native.getLastError() != ENOSYS) {
|
||||
throw new UnsupportedOperationException("seccomp unavailable: your kernel is buggy and you should upgrade");
|
||||
}
|
||||
|
||||
|
@ -246,7 +281,7 @@ final class Seccomp {
|
|||
final int bogusArg = 0xf7a46a5c;
|
||||
|
||||
// test seccomp(BOGUS)
|
||||
long ret = linux_libc.syscall(SECCOMP_SYSCALL_NR, bogusArg);
|
||||
long ret = linux_syscall(arch.seccomp, bogusArg);
|
||||
if (ret != -1) {
|
||||
throw new UnsupportedOperationException("seccomp unavailable: seccomp(BOGUS_OPERATION) returned " + ret);
|
||||
} else {
|
||||
|
@ -259,7 +294,7 @@ final class Seccomp {
|
|||
}
|
||||
|
||||
// test seccomp(VALID, BOGUS)
|
||||
ret = linux_libc.syscall(SECCOMP_SYSCALL_NR, SECCOMP_SET_MODE_FILTER, bogusArg);
|
||||
ret = linux_syscall(arch.seccomp, SECCOMP_SET_MODE_FILTER, bogusArg);
|
||||
if (ret != -1) {
|
||||
throw new UnsupportedOperationException("seccomp unavailable: seccomp(SECCOMP_SET_MODE_FILTER, BOGUS_FLAG) returned " + ret);
|
||||
} else {
|
||||
|
@ -272,7 +307,7 @@ final class Seccomp {
|
|||
}
|
||||
|
||||
// test prctl(BOGUS)
|
||||
ret = linux_libc.prctl(bogusArg, 0, 0, 0, 0);
|
||||
ret = linux_prctl(bogusArg, 0, 0, 0, 0);
|
||||
if (ret != -1) {
|
||||
throw new UnsupportedOperationException("seccomp unavailable: prctl(BOGUS_OPTION) returned " + ret);
|
||||
} else {
|
||||
|
@ -287,7 +322,7 @@ final class Seccomp {
|
|||
// now just normal defensive checks
|
||||
|
||||
// check for GET_NO_NEW_PRIVS
|
||||
switch (linux_libc.prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0)) {
|
||||
switch (linux_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0)) {
|
||||
case 0: break; // not yet set
|
||||
case 1: break; // already set by caller
|
||||
default:
|
||||
|
@ -299,7 +334,7 @@ final class Seccomp {
|
|||
}
|
||||
}
|
||||
// check for SECCOMP
|
||||
switch (linux_libc.prctl(PR_GET_SECCOMP, 0, 0, 0, 0)) {
|
||||
switch (linux_prctl(PR_GET_SECCOMP, 0, 0, 0, 0)) {
|
||||
case 0: break; // not yet set
|
||||
case 2: break; // already in filter mode by caller
|
||||
default:
|
||||
|
@ -311,7 +346,7 @@ final class Seccomp {
|
|||
}
|
||||
}
|
||||
// check for SECCOMP_MODE_FILTER
|
||||
if (linux_libc.prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 0, 0, 0) != 0) {
|
||||
if (linux_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 0, 0, 0) != 0) {
|
||||
int errno = Native.getLastError();
|
||||
switch (errno) {
|
||||
case EFAULT: break; // available
|
||||
|
@ -321,27 +356,28 @@ final class Seccomp {
|
|||
}
|
||||
|
||||
// ok, now set PR_SET_NO_NEW_PRIVS, needed to be able to set a seccomp filter as ordinary user
|
||||
if (linux_libc.prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
|
||||
if (linux_prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
|
||||
throw new UnsupportedOperationException("prctl(PR_SET_NO_NEW_PRIVS): " + JNACLibrary.strerror(Native.getLastError()));
|
||||
}
|
||||
|
||||
// check it worked
|
||||
if (linux_libc.prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) != 1) {
|
||||
if (linux_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) != 1) {
|
||||
throw new UnsupportedOperationException("seccomp filter did not really succeed: prctl(PR_GET_NO_NEW_PRIVS): " + JNACLibrary.strerror(Native.getLastError()));
|
||||
}
|
||||
|
||||
// BPF installed to check arch, then syscall range. See https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt for details.
|
||||
// BPF installed to check arch, limit, then syscall. See https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt for details.
|
||||
SockFilter insns[] = {
|
||||
/* 1 */ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_ARCH_OFFSET), //
|
||||
/* 2 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, AUDIT_ARCH_X86_64, 0, 4), // if (arch != amd64) goto fail;
|
||||
/* 2 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.audit, 0, 7), // if (arch != audit) goto fail;
|
||||
/* 3 */ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_NR_OFFSET), //
|
||||
/* 4 */ BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, NR_SYSCALL_FORK, 0, 3), // if (syscall < SYSCALL_FORK) goto pass;
|
||||
/* 5 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, NR_SYSCALL_EXECVEAT, 1, 0), // if (syscall == SYSCALL_EXECVEAT) goto fail;
|
||||
/* 6 */ BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, NR_SYSCALL_EXECVE, 1, 0), // if (syscall > SYSCALL_EXECVE) goto pass;
|
||||
/* 7 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ERRNO | (EACCES & SECCOMP_RET_DATA)), // fail: return EACCES;
|
||||
/* 8 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW) // pass: return OK;
|
||||
/* 4 */ BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, arch.limit, 5, 0), // if (syscall > LIMIT) goto fail;
|
||||
/* 5 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.fork, 4, 0), // if (syscall == FORK) goto fail;
|
||||
/* 6 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.vfork, 3, 0), // if (syscall == VFORK) goto fail;
|
||||
/* 7 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.execve, 2, 0), // if (syscall == EXECVE) goto fail;
|
||||
/* 8 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.execveat, 1, 0), // if (syscall == EXECVEAT) goto fail;
|
||||
/* 9 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), // pass: return OK;
|
||||
/* 10 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ERRNO | (EACCES & SECCOMP_RET_DATA)), // fail: return EACCES;
|
||||
};
|
||||
|
||||
// seccomp takes a long, so we pass it one explicitly to keep the JNA simple
|
||||
SockFProg prog = new SockFProg(insns);
|
||||
prog.write();
|
||||
|
@ -350,13 +386,13 @@ final class Seccomp {
|
|||
int method = 1;
|
||||
// install filter, if this works, after this there is no going back!
|
||||
// first try it with seccomp(SECCOMP_SET_MODE_FILTER), falling back to prctl()
|
||||
if (linux_libc.syscall(SECCOMP_SYSCALL_NR, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, pointer) != 0) {
|
||||
if (linux_syscall(arch.seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, new NativeLong(pointer)) != 0) {
|
||||
method = 0;
|
||||
int errno1 = Native.getLastError();
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("seccomp(SECCOMP_SET_MODE_FILTER): " + JNACLibrary.strerror(errno1) + ", falling back to prctl(PR_SET_SECCOMP)...");
|
||||
}
|
||||
if (linux_libc.prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, pointer, 0, 0) != 0) {
|
||||
if (linux_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, pointer, 0, 0) != 0) {
|
||||
int errno2 = Native.getLastError();
|
||||
throw new UnsupportedOperationException("seccomp(SECCOMP_SET_MODE_FILTER): " + JNACLibrary.strerror(errno1) +
|
||||
", prctl(PR_SET_SECCOMP): " + JNACLibrary.strerror(errno2));
|
||||
|
@ -364,7 +400,7 @@ final class Seccomp {
|
|||
}
|
||||
|
||||
// now check that the filter was really installed, we should be in filter mode.
|
||||
if (linux_libc.prctl(PR_GET_SECCOMP, 0, 0, 0, 0) != 2) {
|
||||
if (linux_prctl(PR_GET_SECCOMP, 0, 0, 0, 0) != 2) {
|
||||
throw new UnsupportedOperationException("seccomp filter installation did not really succeed. seccomp(PR_GET_SECCOMP): " + JNACLibrary.strerror(Native.getLastError()));
|
||||
}
|
||||
|
||||
|
|
|
@ -84,8 +84,10 @@ public class IndexStoreTests extends ESTestCase {
|
|||
try (final Directory directory = service.newFSDirectory(tempDir, NoLockFactory.INSTANCE)) {
|
||||
if (Constants.WINDOWS) {
|
||||
assertTrue(directory.toString(), directory instanceof MMapDirectory || directory instanceof SimpleFSDirectory);
|
||||
} else {
|
||||
} else if (Constants.JRE_IS_64BIT) {
|
||||
assertTrue(directory.toString(), directory instanceof FileSwitchDirectory);
|
||||
} else {
|
||||
assertTrue(directory.toString(), directory instanceof NIOFSDirectory);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue