From 0de503b34bcccf7fa3bd89cc46f6c149fc5217fd Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sun, 8 Nov 2015 21:08:08 -0500 Subject: [PATCH] fixup issues with 32-bit jvm If you run tests under a 32-bit jvm, you will get a test failure in IndexStoreTests, the logic there is wrong in the case of 32-bit (its NIOFSDirectory on linux). Also if mlockall fails, you'll see huge bogus values (because of use of `long` instead of `NativeLong`) finally add seccomp support for 32 bit too, and clean up all its `long` usage as well. --- .../elasticsearch/bootstrap/JNACLibrary.java | 5 +- .../elasticsearch/bootstrap/JNANatives.java | 4 +- .../org/elasticsearch/bootstrap/Seccomp.java | 122 ++++++++++++------ .../index/store/IndexStoreTests.java | 4 +- 4 files changed, 87 insertions(+), 48 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/bootstrap/JNACLibrary.java b/core/src/main/java/org/elasticsearch/bootstrap/JNACLibrary.java index 972d1a11cc7..8a2cc96f9c6 100644 --- a/core/src/main/java/org/elasticsearch/bootstrap/JNACLibrary.java +++ b/core/src/main/java/org/elasticsearch/bootstrap/JNACLibrary.java @@ -20,6 +20,7 @@ package org.elasticsearch.bootstrap; import com.sun.jna.Native; +import com.sun.jna.NativeLong; import com.sun.jna.Structure; import org.apache.lucene.util.Constants; @@ -55,8 +56,8 @@ final class JNACLibrary { /** corresponds to struct rlimit */ public static final class Rlimit extends Structure implements Structure.ByReference { - public long rlim_cur = 0; - public long rlim_max = 0; + public NativeLong rlim_cur = new NativeLong(0); + public NativeLong rlim_max = new NativeLong(0); @Override protected List getFieldOrder() { diff --git a/core/src/main/java/org/elasticsearch/bootstrap/JNANatives.java b/core/src/main/java/org/elasticsearch/bootstrap/JNANatives.java index e8ec03a5f5c..5db88ec254d 100644 --- a/core/src/main/java/org/elasticsearch/bootstrap/JNANatives.java +++ b/core/src/main/java/org/elasticsearch/bootstrap/JNANatives.java @@ -71,8 +71,8 @@ class JNANatives { JNACLibrary.Rlimit rlimit = new JNACLibrary.Rlimit(); if (JNACLibrary.getrlimit(JNACLibrary.RLIMIT_MEMLOCK, rlimit) == 0) { rlimitSuccess = true; - softLimit = rlimit.rlim_cur; - hardLimit = rlimit.rlim_max; + softLimit = rlimit.rlim_cur.longValue(); + hardLimit = rlimit.rlim_max.longValue(); } else { logger.warn("Unable to retrieve resource limits: " + JNACLibrary.strerror(Native.getLastError())); } diff --git a/core/src/main/java/org/elasticsearch/bootstrap/Seccomp.java b/core/src/main/java/org/elasticsearch/bootstrap/Seccomp.java index 640d25e4e58..8e2d96f8729 100644 --- a/core/src/main/java/org/elasticsearch/bootstrap/Seccomp.java +++ b/core/src/main/java/org/elasticsearch/bootstrap/Seccomp.java @@ -22,6 +22,7 @@ package org.elasticsearch.bootstrap; import com.sun.jna.Library; import com.sun.jna.Memory; import com.sun.jna.Native; +import com.sun.jna.NativeLong; import com.sun.jna.Pointer; import com.sun.jna.Structure; import com.sun.jna.ptr.PointerByReference; @@ -38,7 +39,9 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** * Installs a limited form of secure computing mode, @@ -46,7 +49,7 @@ import java.util.List; *

* This is only supported on the Linux, Solaris, and Mac OS X operating systems. *

- * On Linux it currently supports on the amd64 architecture, on Linux kernels 3.5 or above, and requires + * On Linux it currently supports amd64 and i386 architectures, requires Linux kernel 3.5 or above, and requires * {@code CONFIG_SECCOMP} and {@code CONFIG_SECCOMP_FILTER} compiled into the kernel. *

* On Linux BPF Filters are installed using either {@code seccomp(2)} (3.17+) or {@code prctl(2)} (3.5+). {@code seccomp(2)} @@ -95,12 +98,12 @@ final class Seccomp { /** * maps to prctl(2) */ - int prctl(int option, long arg2, long arg3, long arg4, long arg5); + int prctl(int option, NativeLong arg2, NativeLong arg3, NativeLong arg4, NativeLong arg5); /** * used to call seccomp(2), its too new... * this is the only way, DONT use it on some other architecture unless you know wtf you are doing */ - long syscall(long number, Object... args); + NativeLong syscall(NativeLong number, Object... args); }; // null if unavailable or something goes wrong. @@ -119,7 +122,6 @@ final class Seccomp { } /** the preferred method is seccomp(2), since we can apply to all threads of the process */ - static final int SECCOMP_SYSCALL_NR = 317; // since Linux 3.17 static final int SECCOMP_SET_MODE_FILTER = 1; // since Linux 3.17 static final int SECCOMP_FILTER_FLAG_TSYNC = 1; // since Linux 3.17 @@ -190,7 +192,6 @@ final class Seccomp { return new SockFilter((short) code, (byte) jt, (byte) jf, k); } - static final int AUDIT_ARCH_X86_64 = 0xC000003E; static final int SECCOMP_RET_ERRNO = 0x00050000; static final int SECCOMP_RET_DATA = 0x0000FFFF; static final int SECCOMP_RET_ALLOW = 0x7FFF0000; @@ -201,29 +202,63 @@ final class Seccomp { static final int EINVAL = 0x16; static final int ENOSYS = 0x26; - // offsets (arch dependent) that our BPF checks + // offsets that our BPF checks + // check with offsetof() when adding a new arch, move to Arch if different. static final int SECCOMP_DATA_NR_OFFSET = 0x00; static final int SECCOMP_DATA_ARCH_OFFSET = 0x04; - - // currently these ranges are blocked (inclusive): - // execve is really the only one needed but why let someone fork a 30G heap? (not really what happens) - // ... - // 57: fork - // 58: vfork - // 59: execve - // ... - // 322: execveat - // ... - static final int NR_SYSCALL_FORK = 57; - static final int NR_SYSCALL_EXECVE = 59; - static final int NR_SYSCALL_EXECVEAT = 322; // since Linux 3.19 - static final int NR_SYSCALL_TUXCALL = 184; // should return ENOSYS + + static class Arch { + /** AUDIT_ARCH_XXX constant from linux/audit.h */ + final int audit; + /** syscall limit (necessary for blacklisting on amd64, to ban 32-bit syscalls) */ + final int limit; + /** __NR_fork */ + final int fork; + /** __NR_vfork */ + final int vfork; + /** __NR_execve */ + final int execve; + /** __NR_execveat */ + final int execveat; + /** __NR_seccomp */ + final int seccomp; + + Arch(int audit, int limit, int fork, int vfork, int execve, int execveat, int seccomp) { + this.audit = audit; + this.limit = limit; + this.fork = fork; + this.vfork = vfork; + this.execve = execve; + this.execveat = execveat; + this.seccomp = seccomp; + } + } + + /** supported architectures map keyed by os.arch */ + private static final Map ARCHITECTURES; + static { + Map m = new HashMap<>(); + m.put("amd64", new Arch(0xC000003E, 0x3FFFFFFF, 57, 58, 59, 322, 317)); + m.put("i386", new Arch(0x40000003, 0xFFFFFFFF, 2, 190, 11, 358, 354)); + ARCHITECTURES = Collections.unmodifiableMap(m); + } + + /** invokes prctl() from linux libc library */ + private static int linux_prctl(int option, long arg2, long arg3, long arg4, long arg5) { + return linux_libc.prctl(option, new NativeLong(arg2), new NativeLong(arg3), new NativeLong(arg4), new NativeLong(arg5)); + } + + /** invokes syscall() from linux libc library */ + private static long linux_syscall(long number, Object... args) { + return linux_libc.syscall(new NativeLong(number), args).longValue(); + } /** try to install our BPF filters via seccomp() or prctl() to block execution */ private static int linuxImpl() { // first be defensive: we can give nice errors this way, at the very least. // also, some of these security features get backported to old versions, checking kernel version here is a big no-no! - boolean supported = Constants.LINUX && "amd64".equals(Constants.OS_ARCH); + final Arch arch = ARCHITECTURES.get(Constants.OS_ARCH); + boolean supported = Constants.LINUX && arch != null; if (supported == false) { throw new UnsupportedOperationException("seccomp unavailable: '" + Constants.OS_ARCH + "' architecture unsupported"); } @@ -237,7 +272,7 @@ final class Seccomp { // check that unimplemented syscalls actually return ENOSYS // you never know (e.g. https://code.google.com/p/chromium/issues/detail?id=439795) - if (linux_libc.syscall(NR_SYSCALL_TUXCALL) >= 0 || Native.getLastError() != ENOSYS) { + if (linux_syscall(999) >= 0 || Native.getLastError() != ENOSYS) { throw new UnsupportedOperationException("seccomp unavailable: your kernel is buggy and you should upgrade"); } @@ -246,7 +281,7 @@ final class Seccomp { final int bogusArg = 0xf7a46a5c; // test seccomp(BOGUS) - long ret = linux_libc.syscall(SECCOMP_SYSCALL_NR, bogusArg); + long ret = linux_syscall(arch.seccomp, bogusArg); if (ret != -1) { throw new UnsupportedOperationException("seccomp unavailable: seccomp(BOGUS_OPERATION) returned " + ret); } else { @@ -259,7 +294,7 @@ final class Seccomp { } // test seccomp(VALID, BOGUS) - ret = linux_libc.syscall(SECCOMP_SYSCALL_NR, SECCOMP_SET_MODE_FILTER, bogusArg); + ret = linux_syscall(arch.seccomp, SECCOMP_SET_MODE_FILTER, bogusArg); if (ret != -1) { throw new UnsupportedOperationException("seccomp unavailable: seccomp(SECCOMP_SET_MODE_FILTER, BOGUS_FLAG) returned " + ret); } else { @@ -272,7 +307,7 @@ final class Seccomp { } // test prctl(BOGUS) - ret = linux_libc.prctl(bogusArg, 0, 0, 0, 0); + ret = linux_prctl(bogusArg, 0, 0, 0, 0); if (ret != -1) { throw new UnsupportedOperationException("seccomp unavailable: prctl(BOGUS_OPTION) returned " + ret); } else { @@ -287,7 +322,7 @@ final class Seccomp { // now just normal defensive checks // check for GET_NO_NEW_PRIVS - switch (linux_libc.prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0)) { + switch (linux_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0)) { case 0: break; // not yet set case 1: break; // already set by caller default: @@ -299,7 +334,7 @@ final class Seccomp { } } // check for SECCOMP - switch (linux_libc.prctl(PR_GET_SECCOMP, 0, 0, 0, 0)) { + switch (linux_prctl(PR_GET_SECCOMP, 0, 0, 0, 0)) { case 0: break; // not yet set case 2: break; // already in filter mode by caller default: @@ -311,7 +346,7 @@ final class Seccomp { } } // check for SECCOMP_MODE_FILTER - if (linux_libc.prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 0, 0, 0) != 0) { + if (linux_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 0, 0, 0) != 0) { int errno = Native.getLastError(); switch (errno) { case EFAULT: break; // available @@ -321,27 +356,28 @@ final class Seccomp { } // ok, now set PR_SET_NO_NEW_PRIVS, needed to be able to set a seccomp filter as ordinary user - if (linux_libc.prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) { + if (linux_prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) { throw new UnsupportedOperationException("prctl(PR_SET_NO_NEW_PRIVS): " + JNACLibrary.strerror(Native.getLastError())); } // check it worked - if (linux_libc.prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) != 1) { + if (linux_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) != 1) { throw new UnsupportedOperationException("seccomp filter did not really succeed: prctl(PR_GET_NO_NEW_PRIVS): " + JNACLibrary.strerror(Native.getLastError())); } - // BPF installed to check arch, then syscall range. See https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt for details. + // BPF installed to check arch, limit, then syscall. See https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt for details. SockFilter insns[] = { - /* 1 */ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_ARCH_OFFSET), // - /* 2 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, AUDIT_ARCH_X86_64, 0, 4), // if (arch != amd64) goto fail; - /* 3 */ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_NR_OFFSET), // - /* 4 */ BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, NR_SYSCALL_FORK, 0, 3), // if (syscall < SYSCALL_FORK) goto pass; - /* 5 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, NR_SYSCALL_EXECVEAT, 1, 0), // if (syscall == SYSCALL_EXECVEAT) goto fail; - /* 6 */ BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, NR_SYSCALL_EXECVE, 1, 0), // if (syscall > SYSCALL_EXECVE) goto pass; - /* 7 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ERRNO | (EACCES & SECCOMP_RET_DATA)), // fail: return EACCES; - /* 8 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW) // pass: return OK; + /* 1 */ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_ARCH_OFFSET), // + /* 2 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.audit, 0, 7), // if (arch != audit) goto fail; + /* 3 */ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_NR_OFFSET), // + /* 4 */ BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, arch.limit, 5, 0), // if (syscall > LIMIT) goto fail; + /* 5 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.fork, 4, 0), // if (syscall == FORK) goto fail; + /* 6 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.vfork, 3, 0), // if (syscall == VFORK) goto fail; + /* 7 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.execve, 2, 0), // if (syscall == EXECVE) goto fail; + /* 8 */ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch.execveat, 1, 0), // if (syscall == EXECVEAT) goto fail; + /* 9 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), // pass: return OK; + /* 10 */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ERRNO | (EACCES & SECCOMP_RET_DATA)), // fail: return EACCES; }; - // seccomp takes a long, so we pass it one explicitly to keep the JNA simple SockFProg prog = new SockFProg(insns); prog.write(); @@ -350,13 +386,13 @@ final class Seccomp { int method = 1; // install filter, if this works, after this there is no going back! // first try it with seccomp(SECCOMP_SET_MODE_FILTER), falling back to prctl() - if (linux_libc.syscall(SECCOMP_SYSCALL_NR, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, pointer) != 0) { + if (linux_syscall(arch.seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, new NativeLong(pointer)) != 0) { method = 0; int errno1 = Native.getLastError(); if (logger.isDebugEnabled()) { logger.debug("seccomp(SECCOMP_SET_MODE_FILTER): " + JNACLibrary.strerror(errno1) + ", falling back to prctl(PR_SET_SECCOMP)..."); } - if (linux_libc.prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, pointer, 0, 0) != 0) { + if (linux_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, pointer, 0, 0) != 0) { int errno2 = Native.getLastError(); throw new UnsupportedOperationException("seccomp(SECCOMP_SET_MODE_FILTER): " + JNACLibrary.strerror(errno1) + ", prctl(PR_SET_SECCOMP): " + JNACLibrary.strerror(errno2)); @@ -364,7 +400,7 @@ final class Seccomp { } // now check that the filter was really installed, we should be in filter mode. - if (linux_libc.prctl(PR_GET_SECCOMP, 0, 0, 0, 0) != 2) { + if (linux_prctl(PR_GET_SECCOMP, 0, 0, 0, 0) != 2) { throw new UnsupportedOperationException("seccomp filter installation did not really succeed. seccomp(PR_GET_SECCOMP): " + JNACLibrary.strerror(Native.getLastError())); } diff --git a/core/src/test/java/org/elasticsearch/index/store/IndexStoreTests.java b/core/src/test/java/org/elasticsearch/index/store/IndexStoreTests.java index 97faf1ed374..e403715b8e5 100644 --- a/core/src/test/java/org/elasticsearch/index/store/IndexStoreTests.java +++ b/core/src/test/java/org/elasticsearch/index/store/IndexStoreTests.java @@ -84,8 +84,10 @@ public class IndexStoreTests extends ESTestCase { try (final Directory directory = service.newFSDirectory(tempDir, NoLockFactory.INSTANCE)) { if (Constants.WINDOWS) { assertTrue(directory.toString(), directory instanceof MMapDirectory || directory instanceof SimpleFSDirectory); - } else { + } else if (Constants.JRE_IS_64BIT) { assertTrue(directory.toString(), directory instanceof FileSwitchDirectory); + } else { + assertTrue(directory.toString(), directory instanceof NIOFSDirectory); } } }