diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index 95ff032182f..3da0154bb61 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -36,6 +36,9 @@ Release 2.8.0 - UNRELEASED HADOOP-11741. Add LOG.isDebugEnabled() guard for some LOG.debug(). (Walter Su via ozawa) + HADOOP-11660. Add support for hardware crc of HDFS checksums on ARM aarch64 + architecture (Edward Nevill via Colin P. McCabe) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-common-project/hadoop-common/src/CMakeLists.txt b/hadoop-common-project/hadoop-common/src/CMakeLists.txt index 942b19c7526..7d68fd7b0b7 100644 --- a/hadoop-common-project/hadoop-common/src/CMakeLists.txt +++ b/hadoop-common-project/hadoop-common/src/CMakeLists.txt @@ -163,6 +163,14 @@ else (SNAPPY_LIBRARY AND SNAPPY_INCLUDE_DIR) ENDIF(REQUIRE_SNAPPY) endif (SNAPPY_LIBRARY AND SNAPPY_INCLUDE_DIR) +IF (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + set(BULK_CRC_ARCH_SOURCE_FIlE "${D}/util/bulk_crc32_x86.c") +ELSEIF (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + set(BULK_CRC_ARCH_SOURCE_FIlE "${D}/util/bulk_crc32_aarch64.c") +ELSE() + MESSAGE("No HW CRC acceleration for ${CMAKE_SYSTEM_PROCESSOR}, falling back to SW") +ENDIF() + # Find the no-suffix version of libcrypto. # See HADOOP-11216 for details. SET(STORED_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) @@ -228,6 +236,7 @@ CONFIGURE_FILE(${CMAKE_SOURCE_DIR}/config.h.cmake ${CMAKE_BINARY_DIR}/config.h) add_executable(test_bulk_crc32 ${D}/util/bulk_crc32.c + ${BULK_CRC_ARCH_SOURCE_FIlE} ${T}/util/test_bulk_crc32.c ) @@ -256,6 +265,7 @@ add_dual_library(hadoop ${D}/util/NativeCodeLoader.c ${D}/util/NativeCrc32.c ${D}/util/bulk_crc32.c + ${BULK_CRC_ARCH_SOURCE_FIlE} ) if (NEED_LINK_DL) set(LIB_DL dl) diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c index c7efb8db95c..b3bb69959b2 100644 --- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c +++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c @@ -38,22 +38,23 @@ #include "bulk_crc32.h" #include "gcc_optimizations.h" -#if (!defined(__FreeBSD__) && !defined(WINDOWS)) -#define USE_PIPELINED -#endif - #define CRC_INITIAL_VAL 0xffffffff -typedef uint32_t (*crc_update_func_t)(uint32_t, const uint8_t *, size_t); static uint32_t crc_val(uint32_t crc); -static uint32_t crc32_zlib_sb8(uint32_t crc, const uint8_t *buf, size_t length); -static uint32_t crc32c_sb8(uint32_t crc, const uint8_t *buf, size_t length); -#ifdef USE_PIPELINED -static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks); -#endif -static int cached_cpu_supports_crc32; // initialized by constructor below -static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* data, size_t length); +typedef void (*crc_pipelined_func_t)(uint32_t *, uint32_t *, uint32_t *, const uint8_t *, size_t, int); + +// The software versions of pipelined crc +static void pipelined_crc32c_sb8(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, + const uint8_t *p_buf, size_t block_size, int num_blocks); +static void pipelined_crc32_zlib_sb8(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, + const uint8_t 
*p_buf, size_t block_size, int num_blocks); + +// Satically initialise the function pointers to the software versions +// If a HW implementation is available they will subsequently be initialised in the dynamic +// initialisers to point to the HW routines. +crc_pipelined_func_t pipelined_crc32c_func = pipelined_crc32c_sb8; +crc_pipelined_func_t pipelined_crc32_zlib_func = pipelined_crc32_zlib_sb8; static inline int store_or_verify(uint32_t *sums, uint32_t crc, int is_verify) { @@ -72,94 +73,66 @@ int bulk_crc(const uint8_t *data, size_t data_len, int is_verify = error_info != NULL; -#ifdef USE_PIPELINED uint32_t crc1, crc2, crc3; int n_blocks = data_len / bytes_per_checksum; int remainder = data_len % bytes_per_checksum; - int do_pipelined = 0; -#endif uint32_t crc; - crc_update_func_t crc_update_func; + crc_pipelined_func_t crc_pipelined_func; switch (checksum_type) { case CRC32_ZLIB_POLYNOMIAL: - crc_update_func = crc32_zlib_sb8; + crc_pipelined_func = pipelined_crc32_zlib_func; break; case CRC32C_POLYNOMIAL: - if (likely(cached_cpu_supports_crc32)) { - crc_update_func = crc32c_hardware; -#ifdef USE_PIPELINED - do_pipelined = 1; -#endif - } else { - crc_update_func = crc32c_sb8; - } + crc_pipelined_func = pipelined_crc32c_func; break; default: return is_verify ? INVALID_CHECKSUM_TYPE : -EINVAL; } -#ifdef USE_PIPELINED - if (do_pipelined) { - /* Process three blocks at a time */ - while (likely(n_blocks >= 3)) { - crc1 = crc2 = crc3 = CRC_INITIAL_VAL; - pipelined_crc32c(&crc1, &crc2, &crc3, data, bytes_per_checksum, 3); + /* Process three blocks at a time */ + while (likely(n_blocks >= 3)) { + crc1 = crc2 = crc3 = CRC_INITIAL_VAL; + crc_pipelined_func(&crc1, &crc2, &crc3, data, bytes_per_checksum, 3); - if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify))) - goto return_crc_error; - sums++; - data += bytes_per_checksum; + if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify))) + goto return_crc_error; + sums++; + data += bytes_per_checksum; + if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc2))), is_verify))) + goto return_crc_error; + sums++; + data += bytes_per_checksum; + if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc3))), is_verify))) + goto return_crc_error; + sums++; + data += bytes_per_checksum; + n_blocks -= 3; + } + + /* One or two blocks */ + if (n_blocks) { + crc1 = crc2 = crc3 = CRC_INITIAL_VAL; + crc_pipelined_func(&crc1, &crc2, &crc3, data, bytes_per_checksum, n_blocks); + + if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify))) + goto return_crc_error; + data += bytes_per_checksum; + sums++; + if (n_blocks == 2) { if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc2))), is_verify))) goto return_crc_error; sums++; data += bytes_per_checksum; - if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc3))), is_verify))) - goto return_crc_error; - sums++; - data += bytes_per_checksum; - n_blocks -= 3; } - - /* One or two blocks */ - if (n_blocks) { - crc1 = crc2 = crc3 = CRC_INITIAL_VAL; - pipelined_crc32c(&crc1, &crc2, &crc3, data, bytes_per_checksum, n_blocks); - - if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify))) - goto return_crc_error; - data += bytes_per_checksum; - sums++; - if (n_blocks == 2) { - if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc2))), is_verify))) - goto return_crc_error; - sums++; - data += bytes_per_checksum; - } - } - - /* For something smaller than a block */ - if (remainder) { - crc1 = crc2 = crc3 = 
CRC_INITIAL_VAL; - pipelined_crc32c(&crc1, &crc2, &crc3, data, remainder, 1); - - if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify))) - goto return_crc_error; - } - return is_verify ? CHECKSUMS_VALID : 0; } -#endif - while (likely(data_len > 0)) { - int len = likely(data_len >= bytes_per_checksum) ? bytes_per_checksum : data_len; - crc = CRC_INITIAL_VAL; - crc = crc_update_func(crc, data, len); - crc = ntohl(crc_val(crc)); - if (unlikely(!store_or_verify(sums, crc, is_verify))) { + /* For something smaller than a block */ + if (remainder) { + crc1 = crc2 = crc3 = CRC_INITIAL_VAL; + crc_pipelined_func(&crc1, &crc2, &crc3, data, remainder, 1); + + if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify))) goto return_crc_error; - } - data += len; - data_len -= len; - sums++; } return is_verify ? CHECKSUMS_VALID : 0; @@ -175,7 +148,7 @@ return_crc_error: /** * Extract the final result of a CRC */ -uint32_t crc_val(uint32_t crc) { +static uint32_t crc_val(uint32_t crc) { return ~crc; } @@ -214,6 +187,16 @@ static uint32_t crc32c_sb8(uint32_t crc, const uint8_t *buf, size_t length) { return crc; } +static void pipelined_crc32c_sb8(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, + const uint8_t *p_buf, size_t block_size, int num_blocks) { + assert(num_blocks >= 1 && num_blocks <=3 && "invalid num_blocks"); + *crc1 = crc32c_sb8(*crc1, p_buf, block_size); + if (num_blocks >= 2) + *crc2 = crc32c_sb8(*crc2, p_buf+block_size, block_size); + if (num_blocks >= 3) + *crc3 = crc32c_sb8(*crc3, p_buf+2*block_size, block_size); +} + /** * Update a CRC using the "zlib" polynomial -- what Hadoop calls CHECKSUM_CRC32 * using slicing-by-8 @@ -250,416 +233,12 @@ static uint32_t crc32_zlib_sb8( return crc; } -/////////////////////////////////////////////////////////////////////////// -// Begin code for SSE4.2 specific hardware support of CRC32C -/////////////////////////////////////////////////////////////////////////// - -#if (defined(__amd64__) || defined(__i386)) && defined(__GNUC__) && !defined(__FreeBSD__) -# define SSE42_FEATURE_BIT (1 << 20) -# define CPUID_FEATURES 1 -/** - * Call the cpuid instruction to determine CPU feature flags. - */ -static uint32_t cpuid(uint32_t eax_in) { - uint32_t eax, ebx, ecx, edx; -# if defined(__PIC__) && !defined(__LP64__) -// 32-bit PIC code uses the ebx register for the base offset -- -// have to save and restore it on the stack - asm("pushl %%ebx\n\t" - "cpuid\n\t" - "movl %%ebx, %[ebx]\n\t" - "popl %%ebx" : "=a" (eax), [ebx] "=r"(ebx), "=c"(ecx), "=d"(edx) : "a" (eax_in) - : "cc"); -# else - asm("cpuid" : "=a" (eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(eax_in) - : "cc"); -# endif - - return ecx; +static void pipelined_crc32_zlib_sb8(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, + const uint8_t *p_buf, size_t block_size, int num_blocks) { + assert(num_blocks >= 1 && num_blocks <=3 && "invalid num_blocks"); + *crc1 = crc32_zlib_sb8(*crc1, p_buf, block_size); + if (num_blocks >= 2) + *crc2 = crc32_zlib_sb8(*crc2, p_buf+block_size, block_size); + if (num_blocks >= 3) + *crc3 = crc32_zlib_sb8(*crc3, p_buf+2*block_size, block_size); } - -/** - * On library load, initiailize the cached value above for - * whether the cpu supports SSE4.2's crc32 instruction. - */ -void __attribute__ ((constructor)) init_cpu_support_flag(void) { - uint32_t ecx = cpuid(CPUID_FEATURES); - cached_cpu_supports_crc32 = ecx & SSE42_FEATURE_BIT; -} - - -// -// Definitions of the SSE4.2 crc32 operations. 
Using these instead of -// the GCC __builtin_* intrinsics allows this code to compile without -// -msse4.2, since we do dynamic CPU detection at runtime. -// - -# ifdef __LP64__ -inline uint64_t _mm_crc32_u64(uint64_t crc, uint64_t value) { - asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); - return crc; -} -# endif - -inline uint32_t _mm_crc32_u32(uint32_t crc, uint32_t value) { - asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); - return crc; -} - -inline uint32_t _mm_crc32_u16(uint32_t crc, uint16_t value) { - asm("crc32w %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); - return crc; -} - -inline uint32_t _mm_crc32_u8(uint32_t crc, uint8_t value) { - asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); - return crc; -} - - -# ifdef __LP64__ -/** - * Hardware-accelerated CRC32C calculation using the 64-bit instructions. - */ -static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* p_buf, size_t length) { - // start directly at p_buf, even if it's an unaligned address. According - // to the original author of this code, doing a small run of single bytes - // to word-align the 64-bit instructions doesn't seem to help, but - // we haven't reconfirmed those benchmarks ourselves. - uint64_t crc64bit = crc; - size_t i; - for (i = 0; i < length / sizeof(uint64_t); i++) { - crc64bit = _mm_crc32_u64(crc64bit, *(uint64_t*) p_buf); - p_buf += sizeof(uint64_t); - } - - // This ugly switch is slightly faster for short strings than the straightforward loop - uint32_t crc32bit = (uint32_t) crc64bit; - length &= sizeof(uint64_t) - 1; - switch (length) { - case 7: - crc32bit = _mm_crc32_u8(crc32bit, *p_buf++); - case 6: - crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*) p_buf); - p_buf += 2; - // case 5 is below: 4 + 1 - case 4: - crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*) p_buf); - break; - case 3: - crc32bit = _mm_crc32_u8(crc32bit, *p_buf++); - case 2: - crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*) p_buf); - break; - case 5: - crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*) p_buf); - p_buf += 4; - case 1: - crc32bit = _mm_crc32_u8(crc32bit, *p_buf); - break; - case 0: - break; - default: - // This should never happen; enable in debug code - assert(0 && "ended up with 8 or more bytes at tail of calculation"); - } - - return crc32bit; -} - -#ifdef USE_PIPELINED -/** - * Pipelined version of hardware-accelerated CRC32C calculation using - * the 64 bit crc32q instruction. - * One crc32c instruction takes three cycles, but two more with no data - * dependency can be in the pipeline to achieve something close to single - * instruction/cycle. Here we feed three blocks in RR. - * - * crc1, crc2, crc3 : Store initial checksum for each block before - * calling. When it returns, updated checksums are stored. - * p_buf : The base address of the data buffer. The buffer should be - * at least as big as block_size * num_blocks. - * block_size : The size of each block in bytes. - * num_blocks : The number of blocks to work on. Min = 1, Max = 3 - */ -static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks) { - uint64_t c1 = *crc1; - uint64_t c2 = *crc2; - uint64_t c3 = *crc3; - uint64_t *data = (uint64_t*)p_buf; - int counter = block_size / sizeof(uint64_t); - int remainder = block_size % sizeof(uint64_t); - uint8_t *bdata; - - /* We do switch here because the loop has to be tight in order - * to fill the pipeline. 
Any other statement inside the loop - * or inbetween crc32 instruction can slow things down. Calling - * individual crc32 instructions three times from C also causes - * gcc to insert other instructions inbetween. - * - * Do not rearrange the following code unless you have verified - * the generated machine code is as efficient as before. - */ - switch (num_blocks) { - case 3: - /* Do three blocks */ - while (likely(counter)) { - __asm__ __volatile__( - "crc32q (%7), %0;\n\t" - "crc32q (%7,%6,1), %1;\n\t" - "crc32q (%7,%6,2), %2;\n\t" - : "=r"(c1), "=r"(c2), "=r"(c3) - : "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(data) - ); - data++; - counter--; - } - - /* Take care of the remainder. They are only up to seven bytes, - * so performing byte-level crc32 won't take much time. - */ - bdata = (uint8_t*)data; - while (likely(remainder)) { - __asm__ __volatile__( - "crc32b (%7), %0;\n\t" - "crc32b (%7,%6,1), %1;\n\t" - "crc32b (%7,%6,2), %2;\n\t" - : "=r"(c1), "=r"(c2), "=r"(c3) - : "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(bdata) - ); - bdata++; - remainder--; - } - break; - case 2: - /* Do two blocks */ - while (likely(counter)) { - __asm__ __volatile__( - "crc32q (%5), %0;\n\t" - "crc32q (%5,%4,1), %1;\n\t" - : "=r"(c1), "=r"(c2) - : "0"(c1), "1"(c2), "r"(block_size), "r"(data) - ); - data++; - counter--; - } - - bdata = (uint8_t*)data; - while (likely(remainder)) { - __asm__ __volatile__( - "crc32b (%5), %0;\n\t" - "crc32b (%5,%4,1), %1;\n\t" - : "=r"(c1), "=r"(c2) - : "0"(c1), "1"(c2), "r"(block_size), "r"(bdata) - ); - bdata++; - remainder--; - } - break; - case 1: - /* single block */ - while (likely(counter)) { - __asm__ __volatile__( - "crc32q (%2), %0;\n\t" - : "=r"(c1) - : "0"(c1), "r"(data) - ); - data++; - counter--; - } - bdata = (uint8_t*)data; - while (likely(remainder)) { - __asm__ __volatile__( - "crc32b (%2), %0;\n\t" - : "=r"(c1) - : "0"(c1), "r"(bdata) - ); - bdata++; - remainder--; - } - break; - case 0: - return; - default: - assert(0 && "BUG: Invalid number of checksum blocks"); - } - - *crc1 = c1; - *crc2 = c2; - *crc3 = c3; - return; -} -#endif /* USE_PIPELINED */ - -# else // 32-bit - -/** - * Hardware-accelerated CRC32C calculation using the 32-bit instructions. - */ -static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* p_buf, size_t length) { - // start directly at p_buf, even if it's an unaligned address. According - // to the original author of this code, doing a small run of single bytes - // to word-align the 64-bit instructions doesn't seem to help, but - // we haven't reconfirmed those benchmarks ourselves. - size_t i; - for (i = 0; i < length / sizeof(uint32_t); i++) { - crc = _mm_crc32_u32(crc, *(uint32_t*) p_buf); - p_buf += sizeof(uint32_t); - } - - // This ugly switch is slightly faster for short strings than the straightforward loop - length &= sizeof(uint32_t) - 1; - switch (length) { - case 3: - crc = _mm_crc32_u8(crc, *p_buf++); - case 2: - crc = _mm_crc32_u16(crc, *(uint16_t*) p_buf); - break; - case 1: - crc = _mm_crc32_u8(crc, *p_buf); - break; - case 0: - break; - default: - // This should never happen; enable in debug code - assert(0 && "ended up with 4 or more bytes at tail of calculation"); - } - - return crc; -} - -#ifdef USE_PIPELINED -/** - * Pipelined version of hardware-accelerated CRC32C calculation using - * the 32 bit crc32l instruction. - * One crc32c instruction takes three cycles, but two more with no data - * dependency can be in the pipeline to achieve something close to single - * instruction/cycle. 
Here we feed three blocks in RR. - * - * crc1, crc2, crc3 : Store initial checksum for each block before - * calling. When it returns, updated checksums are stored. - * data : The base address of the data buffer. The buffer should be - * at least as big as block_size * num_blocks. - * block_size : The size of each block in bytes. - * num_blocks : The number of blocks to work on. Min = 1, Max = 3 - */ -static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks) { - uint32_t c1 = *crc1; - uint32_t c2 = *crc2; - uint32_t c3 = *crc3; - int counter = block_size / sizeof(uint32_t); - int remainder = block_size % sizeof(uint32_t); - uint32_t *data = (uint32_t*)p_buf; - uint8_t *bdata; - - /* We do switch here because the loop has to be tight in order - * to fill the pipeline. Any other statement inside the loop - * or inbetween crc32 instruction can slow things down. Calling - * individual crc32 instructions three times from C also causes - * gcc to insert other instructions inbetween. - * - * Do not rearrange the following code unless you have verified - * the generated machine code is as efficient as before. - */ - switch (num_blocks) { - case 3: - /* Do three blocks */ - while (likely(counter)) { - __asm__ __volatile__( - "crc32l (%7), %0;\n\t" - "crc32l (%7,%6,1), %1;\n\t" - "crc32l (%7,%6,2), %2;\n\t" - : "=r"(c1), "=r"(c2), "=r"(c3) - : "r"(c1), "r"(c2), "r"(c3), "r"(block_size), "r"(data) - ); - data++; - counter--; - } - /* Take care of the remainder. They are only up to three bytes, - * so performing byte-level crc32 won't take much time. - */ - bdata = (uint8_t*)data; - while (likely(remainder)) { - __asm__ __volatile__( - "crc32b (%7), %0;\n\t" - "crc32b (%7,%6,1), %1;\n\t" - "crc32b (%7,%6,2), %2;\n\t" - : "=r"(c1), "=r"(c2), "=r"(c3) - : "r"(c1), "r"(c2), "r"(c3), "r"(block_size), "r"(bdata) - ); - bdata++; - remainder--; - } - break; - case 2: - /* Do two blocks */ - while (likely(counter)) { - __asm__ __volatile__( - "crc32l (%5), %0;\n\t" - "crc32l (%5,%4,1), %1;\n\t" - : "=r"(c1), "=r"(c2) - : "r"(c1), "r"(c2), "r"(block_size), "r"(data) - ); - data++; - counter--; - } - - bdata = (uint8_t*)data; - while (likely(remainder)) { - __asm__ __volatile__( - "crc32b (%5), %0;\n\t" - "crc32b (%5,%4,1), %1;\n\t" - : "=r"(c1), "=r"(c2) - : "r"(c1), "r"(c2), "r"(block_size), "r"(bdata) - ); - bdata++; - remainder--; - } - break; - case 1: - /* single block */ - while (likely(counter)) { - __asm__ __volatile__( - "crc32l (%2), %0;\n\t" - : "=r"(c1) - : "r"(c1), "r"(data) - ); - data++; - counter--; - } - bdata = (uint8_t*)data; - while (likely(remainder)) { - __asm__ __volatile__( - "crc32b (%2), %0;\n\t" - : "=r"(c1) - : "r"(c1), "r"(bdata) - ); - bdata++; - remainder--; - } - break; - case 0: - return; - default: - assert(0 && "BUG: Invalid number of checksum blocks"); - } - - *crc1 = c1; - *crc2 = c2; - *crc3 = c3; - return; -} - -#endif /* USE_PIPELINED */ - -# endif // 64-bit vs 32-bit - -#else // end x86 architecture - -static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* data, size_t length) { - // never called! 
- assert(0 && "hardware crc called on an unsupported platform"); - return 0; -} - -#endif diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_aarch64.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_aarch64.c new file mode 100644 index 00000000000..ab4690bcdfd --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_aarch64.c @@ -0,0 +1,362 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include // for size_t + +#include "bulk_crc32.h" +#include "gcc_optimizations.h" + +/** + * Hardware-accelerated CRC32 calculation using the 64-bit instructions. + * 2 variants:- + * pipelined_crc32c uses the Castagnoli polynomial 0x1EDC6F41 + * pipelined_crc32_zlib uses the Zlib polynomial 0x04C11DB7 + */ + +// gcc doesn't know how to vectorize a 128 bit load, so use the following to tell it +#define LDP(x,y,p) asm("ldp %x[a], %x[b], [%x[c]], #16" : [a]"=r"(x),[b]"=r"(y),[c]"+r"(p)) + +#define CRC32CX(crc,value) asm("crc32cx %w[c], %w[c], %x[v]" : [c]"+r"(*&crc) : [v]"r"(+value)) +#define CRC32CW(crc,value) asm("crc32cw %w[c], %w[c], %w[v]" : [c]"+r"(*&crc) : [v]"r"(+value)) +#define CRC32CH(crc,value) asm("crc32ch %w[c], %w[c], %w[v]" : [c]"+r"(*&crc) : [v]"r"(+value)) +#define CRC32CB(crc,value) asm("crc32cb %w[c], %w[c], %w[v]" : [c]"+r"(*&crc) : [v]"r"(+value)) + +#define CRC32ZX(crc,value) asm("crc32x %w[c], %w[c], %x[v]" : [c]"+r"(crc) : [v]"r"(value)) +#define CRC32ZW(crc,value) asm("crc32w %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value)) +#define CRC32ZH(crc,value) asm("crc32h %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value)) +#define CRC32ZB(crc,value) asm("crc32b %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value)) + +/** + * Pipelined version of hardware-accelerated CRC32 calculation using + * the 64 bit crc32 instructions. + * One crc32 instruction takes three cycles, but two more with no data + * dependency can be in the pipeline to achieve something close to single + * instruction/cycle. Here we feed three blocks in RR. + * + * 2 variants:- + * pipelined_crc32c uses the Castagnoli polynomial 0x1EDC6F41 + * pipelined_crc32_zlib uses the Zlib polynomial 0x04C11DB7 + * + * crc1, crc2, crc3 : Store initial checksum for each block before + * calling. When it returns, updated checksums are stored. + * p_buf : The base address of the data buffer. The buffer should be + * at least as big as block_size * num_blocks. + * block_size : The size of each block in bytes. + * num_blocks : The number of blocks to work on. 
Min = 1, Max = 3 + */ +static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf1, size_t block_size, int num_blocks) { + uint64_t c1 = *crc1; + uint64_t c2 = *crc2; + uint64_t c3 = *crc3; + const uint8_t *p_buf2 = p_buf1 + block_size; + const uint8_t *p_buf3 = p_buf1 + block_size * 2; + uint64_t x1, y1, x2, y2, x3, y3; + long len = block_size; + + /* We do switch here because the loop has to be tight in order + * to fill the pipeline. Any other statement inside the loop + * or inbetween crc32 instruction can slow things down. + * + * Do verify that this code generates the expected assembler + * by disassembling test_bulk_crc32 + */ + + asm(".cpu generic+crc"); // Allow crc instructions in asm + switch (num_blocks) { + case 3: + /* Do three blocks */ + while ((len -= 2*sizeof(uint64_t)) >= 0) { + LDP(x1,y1,p_buf1); + LDP(x2,y2,p_buf2); + LDP(x3,y3,p_buf3); + CRC32CX(c1, x1); + CRC32CX(c2, x2); + CRC32CX(c3, x3); + CRC32CX(c1, y1); + CRC32CX(c2, y2); + CRC32CX(c3, y3); + } + + if (unlikely(len & sizeof(uint64_t))) { + x1 = *(uint64_t*)p_buf1; p_buf1 += sizeof(uint64_t); + x2 = *(uint64_t*)p_buf2; p_buf2 += sizeof(uint64_t); + x3 = *(uint64_t*)p_buf3; p_buf3 += sizeof(uint64_t); + CRC32CX(c1, x1); + CRC32CX(c2, x2); + CRC32CX(c3, x3); + } + if (unlikely(len & sizeof(uint32_t))) { + x1 = *(uint32_t*)p_buf1; p_buf1 += sizeof(uint32_t); + x2 = *(uint32_t*)p_buf2; p_buf2 += sizeof(uint32_t); + x3 = *(uint32_t*)p_buf3; p_buf3 += sizeof(uint32_t); + CRC32CW(c1, x1); + CRC32CW(c2, x2); + CRC32CW(c3, x3); + } + if (unlikely(len & sizeof(uint16_t))) { + x1 = *(uint16_t*)p_buf1; p_buf1 += sizeof(uint16_t); + x2 = *(uint16_t*)p_buf2; p_buf2 += sizeof(uint16_t); + x3 = *(uint16_t*)p_buf3; p_buf3 += sizeof(uint16_t); + CRC32CH(c1, x1); + CRC32CH(c2, x2); + CRC32CH(c3, x3); + } + if (unlikely(len & sizeof(uint8_t))) { + x1 = *p_buf1; + x2 = *p_buf2; + x3 = *p_buf3; + CRC32CB(c1, x1); + CRC32CB(c2, x2); + CRC32CB(c3, x3); + } + break; + case 2: + /* Do two blocks */ + while ((len -= 2*sizeof(uint64_t)) >= 0) { + LDP(x1,y1,p_buf1); + LDP(x2,y2,p_buf2); + CRC32CX(c1, x1); + CRC32CX(c2, x2); + CRC32CX(c1, y1); + CRC32CX(c2, y2); + } + + if (unlikely(len & sizeof(uint64_t))) { + x1 = *(uint64_t*)p_buf1; p_buf1 += sizeof(uint64_t); + x2 = *(uint64_t*)p_buf2; p_buf2 += sizeof(uint64_t); + CRC32CX(c1, x1); + CRC32CX(c2, x2); + } + if (unlikely(len & sizeof(uint32_t))) { + x1 = *(uint32_t*)p_buf1; p_buf1 += sizeof(uint32_t); + x2 = *(uint32_t*)p_buf2; p_buf2 += sizeof(uint32_t); + CRC32CW(c1, x1); + CRC32CW(c2, x2); + } + if (unlikely(len & sizeof(uint16_t))) { + x1 = *(uint16_t*)p_buf1; p_buf1 += sizeof(uint16_t); + x2 = *(uint16_t*)p_buf2; p_buf2 += sizeof(uint16_t); + CRC32CH(c1, x1); + CRC32CH(c2, x2); + } + if (unlikely(len & sizeof(uint8_t))) { + x1 = *p_buf1; + x2 = *p_buf2; + CRC32CB(c1, x1); + CRC32CB(c2, x2); + } + break; + case 1: + /* single block */ + while ((len -= 2*sizeof(uint64_t)) >= 0) { + LDP(x1,y1,p_buf1); + CRC32CX(c1, x1); + CRC32CX(c1, y1); + } + + if (unlikely(len & sizeof(uint64_t))) { + x1 = *(uint64_t*)p_buf1; p_buf1 += sizeof(uint64_t); + CRC32CX(c1, x1); + } + if (unlikely(len & sizeof(uint32_t))) { + x1 = *(uint32_t*)p_buf1; p_buf1 += sizeof(uint32_t); + CRC32CW(c1, x1); + } + if (unlikely(len & sizeof(uint16_t))) { + x1 = *(uint16_t*)p_buf1; p_buf1 += sizeof(uint16_t); + CRC32CH(c1, x1); + } + if (unlikely(len & sizeof(uint8_t))) { + x1 = *p_buf1; + CRC32CB(c1, x1); + } + break; + case 0: + return; + default: + assert(0 && "BUG: Invalid number 
of checksum blocks"); + } + + *crc1 = c1; + *crc2 = c2; + *crc3 = c3; + return; +} + +static void pipelined_crc32_zlib(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf1, size_t block_size, int num_blocks) { + uint64_t c1 = *crc1; + uint64_t c2 = *crc2; + uint64_t c3 = *crc3; + const uint8_t *p_buf2 = p_buf1 + block_size; + const uint8_t *p_buf3 = p_buf1 + block_size * 2; + uint64_t x1, y1, x2, y2, x3, y3; + long len = block_size; + + /* We do switch here because the loop has to be tight in order + * to fill the pipeline. Any other statement inside the loop + * or inbetween crc32 instruction can slow things down. + * + * Do verify that this code generates the expected assembler + * by disassembling test_bulk_crc32 + */ + + asm(".cpu generic+crc"); // Allow crc instructions in asm + switch (num_blocks) { + case 3: + /* Do three blocks */ + while ((len -= 2*sizeof(uint64_t)) >= 0) { + LDP(x1,y1,p_buf1); + LDP(x2,y2,p_buf2); + LDP(x3,y3,p_buf3); + CRC32ZX(c1, x1); + CRC32ZX(c2, x2); + CRC32ZX(c3, x3); + CRC32ZX(c1, y1); + CRC32ZX(c2, y2); + CRC32ZX(c3, y3); + } + + if (unlikely(len & sizeof(uint64_t))) { + x1 = *(uint64_t*)p_buf1; p_buf1 += sizeof(uint64_t); + x2 = *(uint64_t*)p_buf2; p_buf2 += sizeof(uint64_t); + x3 = *(uint64_t*)p_buf3; p_buf3 += sizeof(uint64_t); + CRC32ZX(c1, x1); + CRC32ZX(c2, x2); + CRC32ZX(c3, x3); + } + if (unlikely(len & sizeof(uint32_t))) { + x1 = *(uint32_t*)p_buf1; p_buf1 += sizeof(uint32_t); + x2 = *(uint32_t*)p_buf2; p_buf2 += sizeof(uint32_t); + x3 = *(uint32_t*)p_buf3; p_buf3 += sizeof(uint32_t); + CRC32ZW(c1, x1); + CRC32ZW(c2, x2); + CRC32ZW(c3, x3); + } + if (unlikely(len & sizeof(uint16_t))) { + x1 = *(uint16_t*)p_buf1; p_buf1 += sizeof(uint16_t); + x2 = *(uint16_t*)p_buf2; p_buf2 += sizeof(uint16_t); + x3 = *(uint16_t*)p_buf3; p_buf3 += sizeof(uint16_t); + CRC32ZH(c1, x1); + CRC32ZH(c2, x2); + CRC32ZH(c3, x3); + } + if (unlikely(len & sizeof(uint8_t))) { + x1 = *p_buf1; + x2 = *p_buf2; + x3 = *p_buf3; + CRC32ZB(c1, x1); + CRC32ZB(c2, x2); + CRC32ZB(c3, x3); + } + break; + case 2: + /* Do two blocks */ + while ((len -= 2*sizeof(uint64_t)) >= 0) { + LDP(x1,y1,p_buf1); + LDP(x2,y2,p_buf2); + CRC32ZX(c1, x1); + CRC32ZX(c2, x2); + CRC32ZX(c1, y1); + CRC32ZX(c2, y2); + } + + if (unlikely(len & sizeof(uint64_t))) { + x1 = *(uint64_t*)p_buf1; p_buf1 += sizeof(uint64_t); + x2 = *(uint64_t*)p_buf2; p_buf2 += sizeof(uint64_t); + CRC32ZX(c1, x1); + CRC32ZX(c2, x2); + } + if (unlikely(len & sizeof(uint32_t))) { + x1 = *(uint32_t*)p_buf1; p_buf1 += sizeof(uint32_t); + x2 = *(uint32_t*)p_buf2; p_buf2 += sizeof(uint32_t); + CRC32ZW(c1, x1); + CRC32ZW(c2, x2); + } + if (unlikely(len & sizeof(uint16_t))) { + x1 = *(uint16_t*)p_buf1; p_buf1 += sizeof(uint16_t); + x2 = *(uint16_t*)p_buf2; p_buf2 += sizeof(uint16_t); + CRC32ZH(c1, x1); + CRC32ZH(c2, x2); + } + if (unlikely(len & sizeof(uint8_t))) { + x1 = *p_buf1; + x2 = *p_buf2; + CRC32ZB(c1, x1); + CRC32ZB(c2, x2); + } + break; + case 1: + /* single block */ + while ((len -= 2*sizeof(uint64_t)) >= 0) { + LDP(x1,y1,p_buf1); + CRC32ZX(c1, x1); + CRC32ZX(c1, y1); + } + + if (unlikely(len & sizeof(uint64_t))) { + x1 = *(uint64_t*)p_buf1; p_buf1 += sizeof(uint64_t); + CRC32ZX(c1, x1); + } + if (unlikely(len & sizeof(uint32_t))) { + x1 = *(uint32_t*)p_buf1; p_buf1 += sizeof(uint32_t); + CRC32ZW(c1, x1); + } + if (unlikely(len & sizeof(uint16_t))) { + x1 = *(uint16_t*)p_buf1; p_buf1 += sizeof(uint16_t); + CRC32ZH(c1, x1); + } + if (unlikely(len & sizeof(uint8_t))) { + x1 = *p_buf1; + CRC32ZB(c1, x1); + } + 
break; + case 0: + return; + default: + assert(0 && "BUG: Invalid number of checksum blocks"); + } + + *crc1 = c1; + *crc2 = c2; + *crc3 = c3; + return; +} + +typedef void (*crc_pipelined_func_t)(uint32_t *, uint32_t *, uint32_t *, const uint8_t *, size_t, int); +extern crc_pipelined_func_t pipelined_crc32c_func; +extern crc_pipelined_func_t pipelined_crc32_zlib_func; + +#include +#include + +#ifndef HWCAP_CRC32 +#define HWCAP_CRC32 (1 << 7) +#endif + +/** + * On library load, determine what sort of crc we are going to do + * and set crc function pointers appropriately. + */ +void __attribute__ ((constructor)) init_cpu_support_flag(void) { + unsigned long auxv = getauxval(AT_HWCAP); + if (auxv & HWCAP_CRC32) { + pipelined_crc32c_func = pipelined_crc32c; + pipelined_crc32_zlib_func = pipelined_crc32_zlib; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_x86.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_x86.c new file mode 100644 index 00000000000..290b8a61995 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_x86.c @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Portions of this file are from http://www.evanjones.ca/crc32c.html under + * the BSD license: + * Copyright 2008,2009,2010 Massachusetts Institute of Technology. + * All rights reserved. Use of this source code is governed by a + * BSD-style license that can be found in the LICENSE file. + */ + +#include +#include // for size_t + +#include "bulk_crc32.h" +#include "gcc_optimizations.h" +#include "gcc_optimizations.h" + +/////////////////////////////////////////////////////////////////////////// +// Begin code for SSE4.2 specific hardware support of CRC32C +/////////////////////////////////////////////////////////////////////////// + +# define SSE42_FEATURE_BIT (1 << 20) +# define CPUID_FEATURES 1 +/** + * Call the cpuid instruction to determine CPU feature flags. + */ +static uint32_t cpuid(uint32_t eax_in) { + uint32_t eax, ebx, ecx, edx; +# if defined(__PIC__) && !defined(__LP64__) +// 32-bit PIC code uses the ebx register for the base offset -- +// have to save and restore it on the stack + asm("pushl %%ebx\n\t" + "cpuid\n\t" + "movl %%ebx, %[ebx]\n\t" + "popl %%ebx" : "=a" (eax), [ebx] "=r"(ebx), "=c"(ecx), "=d"(edx) : "a" (eax_in) + : "cc"); +# else + asm("cpuid" : "=a" (eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(eax_in) + : "cc"); +# endif + + return ecx; +} + +// +// Definitions of the SSE4.2 crc32 operations. Using these instead of +// the GCC __builtin_* intrinsics allows this code to compile without +// -msse4.2, since we do dynamic CPU detection at runtime. 
+// + +# ifdef __LP64__ +inline uint64_t _mm_crc32_u64(uint64_t crc, uint64_t value) { + asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); + return crc; +} +# endif + +inline uint32_t _mm_crc32_u32(uint32_t crc, uint32_t value) { + asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); + return crc; +} + +inline uint32_t _mm_crc32_u16(uint32_t crc, uint16_t value) { + asm("crc32w %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); + return crc; +} + +inline uint32_t _mm_crc32_u8(uint32_t crc, uint8_t value) { + asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); + return crc; +} + +# ifdef __LP64__ +/** + * Pipelined version of hardware-accelerated CRC32C calculation using + * the 64 bit crc32q instruction. + * One crc32c instruction takes three cycles, but two more with no data + * dependency can be in the pipeline to achieve something close to single + * instruction/cycle. Here we feed three blocks in RR. + * + * crc1, crc2, crc3 : Store initial checksum for each block before + * calling. When it returns, updated checksums are stored. + * p_buf : The base address of the data buffer. The buffer should be + * at least as big as block_size * num_blocks. + * block_size : The size of each block in bytes. + * num_blocks : The number of blocks to work on. Min = 1, Max = 3 + */ +static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks) { + uint64_t c1 = *crc1; + uint64_t c2 = *crc2; + uint64_t c3 = *crc3; + uint64_t *data = (uint64_t*)p_buf; + int counter = block_size / sizeof(uint64_t); + int remainder = block_size % sizeof(uint64_t); + uint8_t *bdata; + + /* We do switch here because the loop has to be tight in order + * to fill the pipeline. Any other statement inside the loop + * or inbetween crc32 instruction can slow things down. Calling + * individual crc32 instructions three times from C also causes + * gcc to insert other instructions inbetween. + * + * Do not rearrange the following code unless you have verified + * the generated machine code is as efficient as before. + */ + switch (num_blocks) { + case 3: + /* Do three blocks */ + while (likely(counter)) { + __asm__ __volatile__( + "crc32q (%7), %0;\n\t" + "crc32q (%7,%6,1), %1;\n\t" + "crc32q (%7,%6,2), %2;\n\t" + : "=r"(c1), "=r"(c2), "=r"(c3) + : "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(data) + ); + data++; + counter--; + } + + /* Take care of the remainder. They are only up to seven bytes, + * so performing byte-level crc32 won't take much time. 
+ */ + bdata = (uint8_t*)data; + while (likely(remainder)) { + __asm__ __volatile__( + "crc32b (%7), %0;\n\t" + "crc32b (%7,%6,1), %1;\n\t" + "crc32b (%7,%6,2), %2;\n\t" + : "=r"(c1), "=r"(c2), "=r"(c3) + : "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(bdata) + ); + bdata++; + remainder--; + } + break; + case 2: + /* Do two blocks */ + while (likely(counter)) { + __asm__ __volatile__( + "crc32q (%5), %0;\n\t" + "crc32q (%5,%4,1), %1;\n\t" + : "=r"(c1), "=r"(c2) + : "0"(c1), "1"(c2), "r"(block_size), "r"(data) + ); + data++; + counter--; + } + + bdata = (uint8_t*)data; + while (likely(remainder)) { + __asm__ __volatile__( + "crc32b (%5), %0;\n\t" + "crc32b (%5,%4,1), %1;\n\t" + : "=r"(c1), "=r"(c2) + : "0"(c1), "1"(c2), "r"(block_size), "r"(bdata) + ); + bdata++; + remainder--; + } + break; + case 1: + /* single block */ + while (likely(counter)) { + __asm__ __volatile__( + "crc32q (%2), %0;\n\t" + : "=r"(c1) + : "0"(c1), "r"(data) + ); + data++; + counter--; + } + bdata = (uint8_t*)data; + while (likely(remainder)) { + __asm__ __volatile__( + "crc32b (%2), %0;\n\t" + : "=r"(c1) + : "0"(c1), "r"(bdata) + ); + bdata++; + remainder--; + } + break; + case 0: + return; + default: + assert(0 && "BUG: Invalid number of checksum blocks"); + } + + *crc1 = c1; + *crc2 = c2; + *crc3 = c3; + return; +} + +# else // 32-bit + +/** + * Pipelined version of hardware-accelerated CRC32C calculation using + * the 32 bit crc32l instruction. + * One crc32c instruction takes three cycles, but two more with no data + * dependency can be in the pipeline to achieve something close to single + * instruction/cycle. Here we feed three blocks in RR. + * + * crc1, crc2, crc3 : Store initial checksum for each block before + * calling. When it returns, updated checksums are stored. + * data : The base address of the data buffer. The buffer should be + * at least as big as block_size * num_blocks. + * block_size : The size of each block in bytes. + * num_blocks : The number of blocks to work on. Min = 1, Max = 3 + */ +static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks) { + uint32_t c1 = *crc1; + uint32_t c2 = *crc2; + uint32_t c3 = *crc3; + int counter = block_size / sizeof(uint32_t); + int remainder = block_size % sizeof(uint32_t); + uint32_t *data = (uint32_t*)p_buf; + uint8_t *bdata; + + /* We do switch here because the loop has to be tight in order + * to fill the pipeline. Any other statement inside the loop + * or inbetween crc32 instruction can slow things down. Calling + * individual crc32 instructions three times from C also causes + * gcc to insert other instructions inbetween. + * + * Do not rearrange the following code unless you have verified + * the generated machine code is as efficient as before. + */ + switch (num_blocks) { + case 3: + /* Do three blocks */ + while (likely(counter)) { + __asm__ __volatile__( + "crc32l (%7), %0;\n\t" + "crc32l (%7,%6,1), %1;\n\t" + "crc32l (%7,%6,2), %2;\n\t" + : "=r"(c1), "=r"(c2), "=r"(c3) + : "r"(c1), "r"(c2), "r"(c3), "r"(block_size), "r"(data) + ); + data++; + counter--; + } + /* Take care of the remainder. They are only up to three bytes, + * so performing byte-level crc32 won't take much time. 
+ */ + bdata = (uint8_t*)data; + while (likely(remainder)) { + __asm__ __volatile__( + "crc32b (%7), %0;\n\t" + "crc32b (%7,%6,1), %1;\n\t" + "crc32b (%7,%6,2), %2;\n\t" + : "=r"(c1), "=r"(c2), "=r"(c3) + : "r"(c1), "r"(c2), "r"(c3), "r"(block_size), "r"(bdata) + ); + bdata++; + remainder--; + } + break; + case 2: + /* Do two blocks */ + while (likely(counter)) { + __asm__ __volatile__( + "crc32l (%5), %0;\n\t" + "crc32l (%5,%4,1), %1;\n\t" + : "=r"(c1), "=r"(c2) + : "r"(c1), "r"(c2), "r"(block_size), "r"(data) + ); + data++; + counter--; + } + + bdata = (uint8_t*)data; + while (likely(remainder)) { + __asm__ __volatile__( + "crc32b (%5), %0;\n\t" + "crc32b (%5,%4,1), %1;\n\t" + : "=r"(c1), "=r"(c2) + : "r"(c1), "r"(c2), "r"(block_size), "r"(bdata) + ); + bdata++; + remainder--; + } + break; + case 1: + /* single block */ + while (likely(counter)) { + __asm__ __volatile__( + "crc32l (%2), %0;\n\t" + : "=r"(c1) + : "r"(c1), "r"(data) + ); + data++; + counter--; + } + bdata = (uint8_t*)data; + while (likely(remainder)) { + __asm__ __volatile__( + "crc32b (%2), %0;\n\t" + : "=r"(c1) + : "r"(c1), "r"(bdata) + ); + bdata++; + remainder--; + } + break; + case 0: + return; + default: + assert(0 && "BUG: Invalid number of checksum blocks"); + } + + *crc1 = c1; + *crc2 = c2; + *crc3 = c3; + return; +} + +# endif // 64-bit vs 32-bit + +/** + * On library load, initiailize the cached function pointer + * if cpu supports SSE4.2's crc32 instruction. + */ +typedef void (*crc_pipelined_func_t)(uint32_t *, uint32_t *, uint32_t *, const uint8_t *, size_t, int); +extern crc_pipelined_func_t pipelined_crc32c_func; + +void __attribute__ ((constructor)) init_cpu_support_flag(void) { + uint32_t ecx = cpuid(CPUID_FEATURES); + if (ecx & SSE42_FEATURE_BIT) pipelined_crc32c_func = pipelined_crc32c; +} diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/test/org/apache/hadoop/util/test_bulk_crc32.c b/hadoop-common-project/hadoop-common/src/main/native/src/test/org/apache/hadoop/util/test_bulk_crc32.c index 5a8c9f2d855..ef3dbecceca 100644 --- a/hadoop-common-project/hadoop-common/src/main/native/src/test/org/apache/hadoop/util/test_bulk_crc32.c +++ b/hadoop-common-project/hadoop-common/src/main/native/src/test/org/apache/hadoop/util/test_bulk_crc32.c @@ -23,6 +23,7 @@ #include #include #include +#include #define EXPECT_ZERO(x) \ do { \ @@ -57,6 +58,36 @@ static int testBulkVerifyCrc(int dataLen, int crcType, int bytesPerChecksum) return 0; } +static int timeBulkCrc(int dataLen, int crcType, int bytesPerChecksum, int iterations) +{ + int i; + uint8_t *data; + uint32_t *sums; + crc32_error_t errorData; + clock_t start, fini; + + data = malloc(dataLen); + for (i = 0; i < dataLen; i++) { + data[i] = (i % 16) + 1; + } + sums = calloc(sizeof(uint32_t), + (dataLen + bytesPerChecksum - 1) / bytesPerChecksum); + + start = clock(); + for (i = 0; i < iterations; i++) { + EXPECT_ZERO(bulk_crc(data, dataLen, sums, crcType, + bytesPerChecksum, NULL)); + EXPECT_ZERO(bulk_crc(data, dataLen, sums, crcType, + bytesPerChecksum, &errorData)); + } + fini = clock(); + printf("CRC %d bytes @ %d bytes per checksum X %d iterations = %g\n", + dataLen, bytesPerChecksum, iterations, (double)(fini-start)/CLOCKS_PER_SEC); + free(data); + free(sums); + return 0; +} + int main(int argc, char **argv) { /* Test running bulk_calculate_crc with some different algorithms and @@ -74,6 +105,9 @@ int main(int argc, char **argv) EXPECT_ZERO(testBulkVerifyCrc(17, CRC32C_POLYNOMIAL, 4)); EXPECT_ZERO(testBulkVerifyCrc(17, 
CRC32_ZLIB_POLYNOMIAL, 4)); + EXPECT_ZERO(timeBulkCrc(16 * 1024, CRC32C_POLYNOMIAL, 512, 1000000)); + EXPECT_ZERO(timeBulkCrc(16 * 1024, CRC32_ZLIB_POLYNOMIAL, 512, 1000000)); + fprintf(stderr, "%s: SUCCESS.\n", argv[0]); return EXIT_SUCCESS; } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNativeCrc32.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNativeCrc32.java index aecdc8f58e6..ecc6c906ab9 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNativeCrc32.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNativeCrc32.java @@ -96,6 +96,22 @@ public class TestNativeCrc32 { checksums, data, fileName, BASE_POSITION); } + @Test + public void testVerifyChunkedSumsSuccessOddSize() throws ChecksumException { + // Test checksum with an odd number of bytes. This is a corner case that + // is often broken in checksum calculation, because there is a loop which + // handles an even multiple of 4 or 8 bytes and then some additional code + // to finish the few odd bytes at the end. This code can often be broken + // but is never tested because we are always calling it with an even value + // such as 512. + bytesPerChecksum--; + allocateDirectByteBuffers(); + fillDataAndValidChecksums(); + NativeCrc32.verifyChunkedSums(bytesPerChecksum, checksumType.id, + checksums, data, fileName, BASE_POSITION); + bytesPerChecksum++; + } + @Test public void testVerifyChunkedSumsByteArraySuccess() throws ChecksumException { allocateArrayByteBuffers();
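
Note on the restructuring above: bulk_crc32.c no longer chooses between software and hardware CRC at compile time (USE_PIPELINED, cached_cpu_supports_crc32). Instead, two function pointers (pipelined_crc32c_func and pipelined_crc32_zlib_func) are statically bound to the portable slicing-by-8 routines, and each architecture-specific file installs its hardware routines from a GCC constructor that runs at library load, after probing the CPU. A minimal, self-contained sketch of that pattern follows; the names are hypothetical and the arithmetic is a placeholder, not the patch's CRC code.

/* Sketch only: a function pointer starts out bound to a portable software
 * routine; a GCC constructor probes the CPU at library load and rebinds it
 * to a "hardware" routine when the feature is present. */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

typedef uint32_t (*sum_func_t)(uint32_t, const uint8_t *, size_t);

static uint32_t sum_sw(uint32_t acc, const uint8_t *buf, size_t len) {
  size_t i;
  for (i = 0; i < len; i++)      /* portable fallback */
    acc += buf[i];
  return acc;
}

static uint32_t sum_hw(uint32_t acc, const uint8_t *buf, size_t len) {
  return sum_sw(acc, buf, len);  /* stand-in for an accelerated version */
}

/* Statically bound to the software version, like pipelined_crc32c_func above. */
sum_func_t sum_func = sum_sw;

static int cpu_has_feature(void) {
  return 1;                      /* stand-in for a cpuid()/getauxval() probe */
}

void __attribute__ ((constructor)) init_sum_impl(void) {
  if (cpu_has_feature())
    sum_func = sum_hw;           /* rebind before main() runs */
}

int main(void) {
  const uint8_t data[4] = { 1, 2, 3, 4 };
  printf("%u\n", sum_func(0, data, sizeof(data)));
  return 0;
}

bulk_crc32_x86.c applies the same idea after a cpuid() probe for SSE4.2, and bulk_crc32_aarch64.c rebinds both pointers when HWCAP_CRC32 is present.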
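
The aarch64 constructor detects the optional CRC32 instructions through the ELF auxiliary vector rather than cpuid. A stand-alone probe along the same lines is sketched below; it assumes a Linux system where getauxval() is available and is illustrative only.

/* Mirrors init_cpu_support_flag() in bulk_crc32_aarch64.c above. */
#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)   /* same fallback definition as the patch */
#endif

int main(void) {
  unsigned long hwcap = getauxval(AT_HWCAP);
  if (hwcap & HWCAP_CRC32)
    printf("CRC32 instructions available; hardware CRC can be used\n");
  else
    printf("no CRC32 instructions; software slicing-by-8 will be used\n");
  return 0;
}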
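
The aarch64 inner loops subtract 16 from len on each iteration and then finish the tail by testing len & 8, len & 4, len & 2 and len & 1, even though len has gone negative by the time the loop exits. That works because len ends up equal to the remainder minus 16, and subtracting 16 leaves the low four bits unchanged in two's complement, so the bit tests still describe the leftover bytes exactly. The short self-check below (illustrative, not from the patch) exercises that property over a range of block sizes.

/* Self-check that the main loop plus the 8/4/2/1-byte tail steps consume
 * exactly block_size bytes for any block size. */
#include <stdio.h>

int main(void) {
  long block_size;
  for (block_size = 0; block_size < 1024; block_size++) {
    long len = block_size;
    long consumed = 0;
    while ((len -= 16) >= 0)      /* main loop: 16 bytes per iteration */
      consumed += 16;
    if (len & 8) consumed += 8;   /* at most one 8-byte step */
    if (len & 4) consumed += 4;   /* at most one 4-byte step */
    if (len & 2) consumed += 2;   /* at most one 2-byte step */
    if (len & 1) consumed += 1;   /* final odd byte */
    if (consumed != block_size) {
      printf("mismatch at block_size=%ld\n", block_size);
      return 1;
    }
  }
  printf("tail handling accounts for every byte, block sizes 0..1023\n");
  return 0;
}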
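
For reference, the two checksum types keep their usual definitions: CRC32C uses the Castagnoli polynomial (0x1EDC6F41, reflected form 0x82F63B78) and CHECKSUM_CRC32 uses the zlib polynomial (0x04C11DB7, reflected form 0xEDB88320), both with an initial value of 0xffffffff and a final inversion (crc_val). The bit-at-a-time reference below is not part of the patch; it should reproduce the standard check values for "123456789" and can be used to sanity-check either the table-driven or the hardware paths, bearing in mind that bulk_crc additionally passes the result through ntohl before storing it.

/* Bit-at-a-time reference CRC (reflected form); slow, for verification only. */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static uint32_t crc_bitwise(uint32_t poly, const uint8_t *buf, size_t len) {
  uint32_t crc = 0xffffffff;      /* same as CRC_INITIAL_VAL */
  size_t i;
  int b;
  for (i = 0; i < len; i++) {
    crc ^= buf[i];
    for (b = 0; b < 8; b++)
      crc = (crc >> 1) ^ ((crc & 1) ? poly : 0);
  }
  return ~crc;                    /* final inversion, as in crc_val() */
}

int main(void) {
  const uint8_t msg[] = "123456789";
  printf("crc32c: %08x (expected e3069283)\n", crc_bitwise(0x82F63B78, msg, 9));
  printf("crc32 : %08x (expected cbf43926)\n", crc_bitwise(0xEDB88320, msg, 9));
  return 0;
}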