//+build !noasm !appengine // Copyright 2015, Klaus Post, see LICENSE for details. // func crc32sse(a []byte) hash TEXT ·crc32sse(SB), 7, $0 MOVQ a+0(FP), R10 XORQ BX, BX // CRC32 dword (R10), EBX BYTE $0xF2; BYTE $0x41; BYTE $0x0f BYTE $0x38; BYTE $0xf1; BYTE $0x1a MOVL BX, ret+24(FP) RET // func crc32sseAll(a []byte, dst []hash) TEXT ·crc32sseAll(SB), 7, $0 MOVQ a+0(FP), R8 // R8: src MOVQ a_len+8(FP), R10 // input length MOVQ dst+24(FP), R9 // R9: dst SUBQ $4, R10 JS end JZ one_crc MOVQ R10, R13 SHRQ $2, R10 // len/4 ANDQ $3, R13 // len&3 XORQ BX, BX ADDQ $1, R13 TESTQ R10, R10 JZ rem_loop crc_loop: MOVQ (R8), R11 XORQ BX, BX XORQ DX, DX XORQ DI, DI MOVQ R11, R12 SHRQ $8, R11 MOVQ R12, AX MOVQ R11, CX SHRQ $16, R12 SHRQ $16, R11 MOVQ R12, SI // CRC32 EAX, EBX BYTE $0xF2; BYTE $0x0f BYTE $0x38; BYTE $0xf1; BYTE $0xd8 // CRC32 ECX, EDX BYTE $0xF2; BYTE $0x0f BYTE $0x38; BYTE $0xf1; BYTE $0xd1 // CRC32 ESI, EDI BYTE $0xF2; BYTE $0x0f BYTE $0x38; BYTE $0xf1; BYTE $0xfe MOVL BX, (R9) MOVL DX, 4(R9) MOVL DI, 8(R9) XORQ BX, BX MOVL R11, AX // CRC32 EAX, EBX BYTE $0xF2; BYTE $0x0f BYTE $0x38; BYTE $0xf1; BYTE $0xd8 MOVL BX, 12(R9) ADDQ $16, R9 ADDQ $4, R8 XORQ BX, BX SUBQ $1, R10 JNZ crc_loop rem_loop: MOVL (R8), AX // CRC32 EAX, EBX BYTE $0xF2; BYTE $0x0f BYTE $0x38; BYTE $0xf1; BYTE $0xd8 MOVL BX, (R9) ADDQ $4, R9 ADDQ $1, R8 XORQ BX, BX SUBQ $1, R13 JNZ rem_loop end: RET one_crc: MOVQ $1, R13 XORQ BX, BX JMP rem_loop // func matchLenSSE4(a, b []byte, max int) int TEXT ·matchLenSSE4(SB), 7, $0 MOVQ a+0(FP), R8 // R8: &a MOVQ b+24(FP), R9 // R9: &b MOVQ max+48(FP), R10 // R10: max XORQ R11, R11 // match length MOVQ R10, R12 SHRQ $4, R10 // max/16 ANDQ $15, R12 // max & 15 CMPQ R10, $0 JEQ matchlen_verysmall loopback_matchlen: MOVOU (R8), X0 // a[x] MOVOU (R9), X1 // b[x] // PCMPESTRI $0x18, X1, X0 BYTE $0x66; BYTE $0x0f; BYTE $0x3a BYTE $0x61; BYTE $0xc1; BYTE $0x18 JC match_ended ADDQ $16, R8 ADDQ $16, R9 ADDQ $16, R11 SUBQ $1, R10 JNZ loopback_matchlen matchlen_verysmall: CMPQ R12, $0 JEQ done_matchlen loopback_matchlen_single: // Naiive, but small use MOVB (R8), R13 MOVB (R9), R14 CMPB R13, R14 JNE done_matchlen ADDQ $1, R8 ADDQ $1, R9 ADDQ $1, R11 SUBQ $1, R12 JNZ loopback_matchlen_single MOVQ R11, ret+56(FP) RET match_ended: ADDQ CX, R11 done_matchlen: MOVQ R11, ret+56(FP) RET // func histogram(b []byte, h []int32) TEXT ·histogram(SB), 7, $0 MOVQ b+0(FP), SI // SI: &b MOVQ b_len+8(FP), R9 // R9: len(b) MOVQ h+24(FP), DI // DI: Histogram MOVQ R9, R8 SHRQ $3, R8 JZ hist1 XORQ R11, R11 loop_hist8: MOVQ (SI), R10 MOVB R10, R11 INCL (DI)(R11*4) SHRQ $8, R10 MOVB R10, R11 INCL (DI)(R11*4) SHRQ $8, R10 MOVB R10, R11 INCL (DI)(R11*4) SHRQ $8, R10 MOVB R10, R11 INCL (DI)(R11*4) SHRQ $8, R10 MOVB R10, R11 INCL (DI)(R11*4) SHRQ $8, R10 MOVB R10, R11 INCL (DI)(R11*4) SHRQ $8, R10 MOVB R10, R11 INCL (DI)(R11*4) SHRQ $8, R10 INCL (DI)(R10*4) ADDQ $8, SI DECQ R8 JNZ loop_hist8 hist1: ANDQ $7, R9 JZ end_hist XORQ R10, R10 loop_hist1: MOVB (SI), R10 INCL (DI)(R10*4) INCQ SI DECQ R9 JNZ loop_hist1 end_hist: RET