//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

// NOTE(review): in a +build line, space-separated terms are OR'ed; confirm
// that "!noasm !appengine" is intended rather than "!noasm,!appengine".
//
// Overview of the complete routines in this listing
// -------------------------------------------------
// Every routine is named _transpose_<source type>_<destination type>_sse4 and
// has the same shape:
//
//   * Arguments are read from the Go frame: src+0(FP) -> DI, dest+8(FP) -> SI,
//     length+16(FP) -> DX, transposeMap+24(FP) -> CX.
//   * Each source element is turned into an index: zero-extended for unsigned
//     8/16/32-bit sources, sign-extended for signed 8/16/32-bit sources, and
//     loaded whole for 64-bit sources.
//   * The value at address transposeMap + 4*index is loaded (as a byte for
//     8-bit destinations, as a word for 16-bit destinations) and stored to the
//     next destination slot. No range check is applied to the index.
//   * The element count is treated as a signed 32-bit value (edx/eax). While
//     at least four elements remain, an unrolled loop handles four per pass;
//     a second loop then handles the remainder one element per pass. A count
//     that is zero or negative stores nothing.
//   * The remainder loop biases its counter by one ("add edx, 1") and repeats
//     while the counter is greater than 1, which yields exactly one pass per
//     remaining element.
//
// Instructions are emitted as raw WORD/LONG/BYTE encodings; the trailing
// comment on each such line is the Intel-syntax disassembly. Only scalar
// general-purpose instructions appear in the routines shown here.

// _transpose_uint8_uint8_sse4: byte source elements, zero-extended to form the
// index; one byte is read from the table and stored per element.
TEXT ·_transpose_uint8_uint8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB0_1 // fewer than four elements: skip the unrolled loop
LBB0_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x0157b60f // movzx edx, byte [rdi + 1]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x0257b60f // movzx edx, byte [rdi + 2]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x0357b60f // movzx edx, byte [rdi + 3]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x04c78348 // add rdi, 4
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB0_5
LBB0_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB0_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB0_3: // one element per pass
	LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB0_3
LBB0_4:
	RET

// _transpose_int8_uint8_sse4: byte source elements, sign-extended to form the
// index; one byte is read from the table and stored per element.
TEXT ·_transpose_int8_uint8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB1_1 // fewer than four elements: skip the unrolled loop
LBB1_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	LONG $0x17be0f48 // movsx rdx, byte [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x04c78348 // add rdi, 4
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB1_5
LBB1_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB1_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB1_3: // one element per pass
	LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB1_3
LBB1_4:
	RET

// _transpose_uint16_uint8_sse4: 16-bit source elements, zero-extended to form
// the index; one byte is read from the table and stored per element.
TEXT ·_transpose_uint16_uint8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB2_1 // fewer than four elements: skip the unrolled loop
LBB2_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x0257b70f // movzx edx, word [rdi + 2]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x0457b70f // movzx edx, word [rdi + 4]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x0657b70f // movzx edx, word [rdi + 6]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x08c78348 // add rdi, 8
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB2_5
LBB2_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB2_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB2_3: // one element per pass
	LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB2_3
LBB2_4:
	RET

// _transpose_int16_uint8_sse4: 16-bit source elements, sign-extended to form
// the index; one byte is read from the table and stored per element.
TEXT ·_transpose_int16_uint8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB3_1 // fewer than four elements: skip the unrolled loop
LBB3_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	LONG $0x17bf0f48 // movsx rdx, word [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x08c78348 // add rdi, 8
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB3_5
LBB3_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB3_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB3_3: // one element per pass
	LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB3_3
LBB3_4:
	RET

// _transpose_uint32_uint8_sse4: 32-bit source elements, zero-extended to form
// the index; one byte is read from the table and stored per element.
TEXT ·_transpose_uint32_uint8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB4_1 // fewer than four elements: skip the unrolled loop
LBB4_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x178b // mov edx, dword [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB4_5
LBB4_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB4_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB4_3: // one element per pass
	LONG $0x87048b42 // mov eax, dword [rdi + 4*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB4_3
LBB4_4:
	RET

// _transpose_int32_uint8_sse4: 32-bit source elements, sign-extended to form
// the index; one byte is read from the table and stored per element.
TEXT ·_transpose_int32_uint8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB5_1 // fewer than four elements: skip the unrolled loop
LBB5_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB5_5
LBB5_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB5_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB5_3: // one element per pass
	LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB5_3
LBB5_4:
	RET

// _transpose_uint64_uint8_sse4: 64-bit source elements used whole as the index;
// one byte is read from the table and stored per element.
TEXT ·_transpose_uint64_uint8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB6_1 // fewer than four elements: skip the unrolled loop
LBB6_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB6_5
LBB6_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB6_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB6_3: // one element per pass
	LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB6_3
LBB6_4:
	RET

// _transpose_int64_uint8_sse4: 64-bit source elements used whole as the index;
// one byte is read from the table and stored per element.
TEXT ·_transpose_int64_uint8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB7_1 // fewer than four elements: skip the unrolled loop
LBB7_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB7_5
LBB7_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB7_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB7_3: // one element per pass
	LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB7_3
LBB7_4:
	RET

// _transpose_uint8_int8_sse4: byte source elements, zero-extended to form the
// index; one byte is read from the table and stored per element.
TEXT ·_transpose_uint8_int8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB8_1 // fewer than four elements: skip the unrolled loop
LBB8_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x0157b60f // movzx edx, byte [rdi + 1]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x0257b60f // movzx edx, byte [rdi + 2]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x0357b60f // movzx edx, byte [rdi + 3]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x04c78348 // add rdi, 4
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB8_5
LBB8_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB8_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB8_3: // one element per pass
	LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB8_3
LBB8_4:
	RET

// _transpose_int8_int8_sse4: byte source elements, sign-extended to form the
// index; one byte is read from the table and stored per element.
TEXT ·_transpose_int8_int8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB9_1 // fewer than four elements: skip the unrolled loop
LBB9_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	LONG $0x17be0f48 // movsx rdx, byte [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x04c78348 // add rdi, 4
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB9_5
LBB9_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB9_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB9_3: // one element per pass
	LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB9_3
LBB9_4:
	RET

// _transpose_uint16_int8_sse4: 16-bit source elements, zero-extended to form
// the index; one byte is read from the table and stored per element.
TEXT ·_transpose_uint16_int8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB10_1 // fewer than four elements: skip the unrolled loop
LBB10_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x0257b70f // movzx edx, word [rdi + 2]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x0457b70f // movzx edx, word [rdi + 4]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x0657b70f // movzx edx, word [rdi + 6]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x08c78348 // add rdi, 8
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB10_5
LBB10_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB10_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB10_3: // one element per pass
	LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB10_3
LBB10_4:
	RET

// _transpose_int16_int8_sse4: 16-bit source elements, sign-extended to form
// the index; one byte is read from the table and stored per element.
TEXT ·_transpose_int16_int8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB11_1 // fewer than four elements: skip the unrolled loop
LBB11_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	LONG $0x17bf0f48 // movsx rdx, word [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x08c78348 // add rdi, 8
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB11_5
LBB11_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB11_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB11_3: // one element per pass
	LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB11_3
LBB11_4:
	RET

// _transpose_uint32_int8_sse4: 32-bit source elements, zero-extended to form
// the index; one byte is read from the table and stored per element.
TEXT ·_transpose_uint32_int8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB12_1 // fewer than four elements: skip the unrolled loop
LBB12_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x178b // mov edx, dword [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB12_5
LBB12_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB12_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB12_3: // one element per pass
	LONG $0x87048b42 // mov eax, dword [rdi + 4*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB12_3
LBB12_4:
	RET

// _transpose_int32_int8_sse4: 32-bit source elements, sign-extended to form
// the index; one byte is read from the table and stored per element.
TEXT ·_transpose_int32_int8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB13_1 // fewer than four elements: skip the unrolled loop
LBB13_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB13_5
LBB13_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB13_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB13_3: // one element per pass
	LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB13_3
LBB13_4:
	RET

// _transpose_uint64_int8_sse4: 64-bit source elements used whole as the index;
// one byte is read from the table and stored per element.
TEXT ·_transpose_uint64_int8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB14_1 // fewer than four elements: skip the unrolled loop
LBB14_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB14_5
LBB14_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB14_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB14_3: // one element per pass
	LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB14_3
LBB14_4:
	RET

// _transpose_int64_int8_sse4: 64-bit source elements used whole as the index;
// one byte is read from the table and stored per element.
TEXT ·_transpose_int64_int8_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB15_1 // fewer than four elements: skip the unrolled loop
LBB15_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x1688 // mov byte [rsi], dl
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x04c68348 // add rsi, 4
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB15_5
LBB15_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB15_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB15_3: // one element per pass
	LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
	LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
	LONG $0x06048842 // mov byte [rsi + r8], al
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB15_3
LBB15_4:
	RET

// _transpose_uint8_uint16_sse4: byte source elements, zero-extended to form the
// index; one 16-bit word is read from the table and stored per element.
TEXT ·_transpose_uint8_uint16_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB16_1 // fewer than four elements: skip the unrolled loop
LBB16_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
	LONG $0x0157b60f // movzx edx, byte [rdi + 1]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x02568966 // mov word [rsi + 2], dx
	LONG $0x0257b60f // movzx edx, byte [rdi + 2]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x04568966 // mov word [rsi + 4], dx
	LONG $0x0357b60f // movzx edx, byte [rdi + 3]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x06568966 // mov word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x04c78348 // add rdi, 4
	LONG $0x08c68348 // add rsi, 8
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB16_5
LBB16_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB16_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB16_3: // one element per pass; r8 is an element index here
	LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
	LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB16_3
LBB16_4:
	RET

// _transpose_int8_uint16_sse4: byte source elements, sign-extended to form the
// index; one 16-bit word is read from the table and stored per element.
TEXT ·_transpose_int8_uint16_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB17_1 // fewer than four elements: skip the unrolled loop
LBB17_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	LONG $0x17be0f48 // movsx rdx, byte [rdi]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
	LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x02568966 // mov word [rsi + 2], dx
	LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x04568966 // mov word [rsi + 4], dx
	LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x06568966 // mov word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x04c78348 // add rdi, 4
	LONG $0x08c68348 // add rsi, 8
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB17_5
LBB17_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB17_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB17_3: // one element per pass; r8 is an element index here
	LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
	LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB17_3
LBB17_4:
	RET

// _transpose_uint16_uint16_sse4: 16-bit source elements, zero-extended to form
// the index; one 16-bit word is read from the table and stored per element.
TEXT ·_transpose_uint16_uint16_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB18_1 // fewer than four elements: skip the unrolled loop
LBB18_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
	LONG $0x0257b70f // movzx edx, word [rdi + 2]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x02568966 // mov word [rsi + 2], dx
	LONG $0x0457b70f // movzx edx, word [rdi + 4]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x04568966 // mov word [rsi + 4], dx
	LONG $0x0657b70f // movzx edx, word [rdi + 6]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x06568966 // mov word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x08c78348 // add rdi, 8
	LONG $0x08c68348 // add rsi, 8
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB18_5
LBB18_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB18_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB18_3: // one element per pass; r8 is a byte offset advancing by 2
	LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
	LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
	LONG $0x02c08349 // add r8, 2
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB18_3
LBB18_4:
	RET

// _transpose_int16_uint16_sse4: 16-bit source elements, sign-extended to form
// the index; one 16-bit word is read from the table and stored per element.
TEXT ·_transpose_int16_uint16_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB19_1 // fewer than four elements: skip the unrolled loop
LBB19_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	LONG $0x17bf0f48 // movsx rdx, word [rdi]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
	LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x02568966 // mov word [rsi + 2], dx
	LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x04568966 // mov word [rsi + 4], dx
	LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x06568966 // mov word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x08c78348 // add rdi, 8
	LONG $0x08c68348 // add rsi, 8
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB19_5
LBB19_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB19_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB19_3: // one element per pass; r8 is a byte offset advancing by 2
	LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
	LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
	LONG $0x02c08349 // add r8, 2
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB19_3
LBB19_4:
	RET

// _transpose_uint32_uint16_sse4: 32-bit source elements, zero-extended to form
// the index; one 16-bit word is read from the table and stored per element.
TEXT ·_transpose_uint32_uint16_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB20_1 // fewer than four elements: skip the unrolled loop
LBB20_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x178b // mov edx, dword [rdi]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
	WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x02568966 // mov word [rsi + 2], dx
	WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x04568966 // mov word [rsi + 4], dx
	WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x06568966 // mov word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x08c68348 // add rsi, 8
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB20_5
LBB20_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB20_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB20_3: // one element per pass; r8 is the dest byte offset, advancing by 2
	LONG $0x47048b42 // mov eax, dword [rdi + 2*r8]
	LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
	LONG $0x02c08349 // add r8, 2
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB20_3
LBB20_4:
	RET

// _transpose_int32_uint16_sse4: 32-bit source elements, sign-extended to form
// the index; one 16-bit word is read from the table and stored per element.
TEXT ·_transpose_int32_uint16_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB21_1 // fewer than four elements: skip the unrolled loop
LBB21_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
	LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x02568966 // mov word [rsi + 2], dx
	LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x04568966 // mov word [rsi + 4], dx
	LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x06568966 // mov word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x08c68348 // add rsi, 8
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB21_5
LBB21_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB21_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB21_3: // one element per pass; r8 is the dest byte offset, advancing by 2
	LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8]
	LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
	LONG $0x02c08349 // add r8, 2
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB21_3
LBB21_4:
	RET

// _transpose_uint64_uint16_sse4: 64-bit source elements used whole as the
// index; one 16-bit word is read from the table and stored per element.
TEXT ·_transpose_uint64_uint16_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB22_1 // fewer than four elements: skip the unrolled loop
LBB22_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x02568966 // mov word [rsi + 2], dx
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x04568966 // mov word [rsi + 4], dx
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x06568966 // mov word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x08c68348 // add rsi, 8
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB22_5
LBB22_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB22_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB22_3: // one element per pass; r8 is the dest byte offset, advancing by 2
	LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
	LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
	LONG $0x02c08349 // add r8, 2
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB22_3
LBB22_4:
	RET

// _transpose_int64_uint16_sse4: 64-bit source elements used whole as the
// index; one 16-bit word is read from the table and stored per element.
TEXT ·_transpose_int64_uint16_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB23_1 // fewer than four elements: skip the unrolled loop
LBB23_5: // unrolled loop, four elements per pass
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x02568966 // mov word [rsi + 2], dx
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x04568966 // mov word [rsi + 4], dx
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x06568966 // mov word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x08c68348 // add rsi, 8
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB23_5
LBB23_1: // remainder: edx holds the number of elements left
	WORD $0xd285 // test edx, edx
	JLE LBB23_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB23_3: // one element per pass; r8 is the dest byte offset, advancing by 2
	LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
	LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
	LONG $0x02c08349 // add r8, 2
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB23_3
LBB23_4:
	RET

TEXT ·_transpose_uint8_int16_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB24_1
LBB24_5:
	WORD $0xd089 // mov eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
	LONG $0x0157b60f // movzx edx, byte [rdi + 1]
	LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
	LONG $0x02568966 // mov word [rsi + 2], dx
	LONG $0x0257b60f // movzx edx, byte [rdi + 2]
	LONG $0x9114b70f // movzx edx,
word [rcx + 4*rdx] LONG $0x04568966 // mov word [rsi + 4], dx LONG $0x0357b60f // movzx edx, byte [rdi + 3] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x06568966 // mov word [rsi + 6], dx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x04c78348 // add rdi, 4 LONG $0x08c68348 // add rsi, 8 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB24_5 LBB24_1: WORD $0xd285 // test edx, edx JLE LBB24_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB24_3: LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax LONG $0x01c08349 // add r8, 1 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB24_3 LBB24_4: RET TEXT ·_transpose_int8_int16_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB25_1 LBB25_5: WORD $0xd089 // mov eax, edx LONG $0x17be0f48 // movsx rdx, byte [rdi] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] WORD $0x8966; BYTE $0x16 // mov word [rsi], dx LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x02568966 // mov word [rsi + 2], dx LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x04568966 // mov word [rsi + 4], dx LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x06568966 // mov word [rsi + 6], dx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x04c78348 // add rdi, 4 LONG $0x08c68348 // add rsi, 8 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB25_5 LBB25_1: WORD $0xd285 // test edx, edx JLE LBB25_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB25_3: LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] LONG $0x8104b70f // movzx 
eax, word [rcx + 4*rax] LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax LONG $0x01c08349 // add r8, 1 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB25_3 LBB25_4: RET TEXT ·_transpose_uint16_int16_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB26_1 LBB26_5: WORD $0xd089 // mov eax, edx WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] WORD $0x8966; BYTE $0x16 // mov word [rsi], dx LONG $0x0257b70f // movzx edx, word [rdi + 2] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x02568966 // mov word [rsi + 2], dx LONG $0x0457b70f // movzx edx, word [rdi + 4] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x04568966 // mov word [rsi + 4], dx LONG $0x0657b70f // movzx edx, word [rdi + 6] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x06568966 // mov word [rsi + 6], dx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x08c78348 // add rdi, 8 LONG $0x08c68348 // add rsi, 8 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB26_5 LBB26_1: WORD $0xd285 // test edx, edx JLE LBB26_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB26_3: LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax LONG $0x02c08349 // add r8, 2 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB26_3 LBB26_4: RET TEXT ·_transpose_int16_int16_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB27_1 LBB27_5: WORD $0xd089 // mov eax, edx LONG $0x17bf0f48 // movsx rdx, word [rdi] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] WORD $0x8966; BYTE $0x16 // mov word [rsi], dx LONG $0x57bf0f48; BYTE $0x02 // 
movsx rdx, word [rdi + 2] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x02568966 // mov word [rsi + 2], dx LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x04568966 // mov word [rsi + 4], dx LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x06568966 // mov word [rsi + 6], dx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x08c78348 // add rdi, 8 LONG $0x08c68348 // add rsi, 8 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB27_5 LBB27_1: WORD $0xd285 // test edx, edx JLE LBB27_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB27_3: LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax LONG $0x02c08349 // add r8, 2 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB27_3 LBB27_4: RET TEXT ·_transpose_uint32_int16_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB28_1 LBB28_5: WORD $0xd089 // mov eax, edx WORD $0x178b // mov edx, dword [rdi] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] WORD $0x8966; BYTE $0x16 // mov word [rsi], dx WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x02568966 // mov word [rsi + 2], dx WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x04568966 // mov word [rsi + 4], dx WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x06568966 // mov word [rsi + 6], dx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x10c78348 // add rdi, 16 LONG $0x08c68348 // add rsi, 8 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB28_5 LBB28_1: WORD $0xd285 
// test edx, edx JLE LBB28_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB28_3: LONG $0x47048b42 // mov eax, dword [rdi + 2*r8] LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax LONG $0x02c08349 // add r8, 2 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB28_3 LBB28_4: RET TEXT ·_transpose_int32_int16_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB29_1 LBB29_5: WORD $0xd089 // mov eax, edx WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] WORD $0x8966; BYTE $0x16 // mov word [rsi], dx LONG $0x04576348 // movsxd rdx, dword [rdi + 4] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x02568966 // mov word [rsi + 2], dx LONG $0x08576348 // movsxd rdx, dword [rdi + 8] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x04568966 // mov word [rsi + 4], dx LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x06568966 // mov word [rsi + 6], dx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x10c78348 // add rdi, 16 LONG $0x08c68348 // add rsi, 8 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB29_5 LBB29_1: WORD $0xd285 // test edx, edx JLE LBB29_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB29_3: LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8] LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax LONG $0x02c08349 // add r8, 2 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB29_3 LBB29_4: RET TEXT ·_transpose_uint64_int16_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB30_1 LBB30_5: WORD 
$0xd089 // mov eax, edx WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] WORD $0x8966; BYTE $0x16 // mov word [rsi], dx LONG $0x08578b48 // mov rdx, qword [rdi + 8] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x02568966 // mov word [rsi + 2], dx LONG $0x10578b48 // mov rdx, qword [rdi + 16] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x04568966 // mov word [rsi + 4], dx LONG $0x18578b48 // mov rdx, qword [rdi + 24] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x06568966 // mov word [rsi + 6], dx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x20c78348 // add rdi, 32 LONG $0x08c68348 // add rsi, 8 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB30_5 LBB30_1: WORD $0xd285 // test edx, edx JLE LBB30_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB30_3: LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax LONG $0x02c08349 // add r8, 2 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB30_3 LBB30_4: RET TEXT ·_transpose_int64_int16_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB31_1 LBB31_5: WORD $0xd089 // mov eax, edx WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] WORD $0x8966; BYTE $0x16 // mov word [rsi], dx LONG $0x08578b48 // mov rdx, qword [rdi + 8] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x02568966 // mov word [rsi + 2], dx LONG $0x10578b48 // mov rdx, qword [rdi + 16] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x04568966 // mov word [rsi + 4], dx LONG $0x18578b48 // mov rdx, qword [rdi + 24] LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] LONG $0x06568966 // mov word [rsi + 6], dx WORD $0x508d; BYTE $0xfc // lea edx, 
[rax - 4] LONG $0x20c78348 // add rdi, 32 LONG $0x08c68348 // add rsi, 8 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB31_5 LBB31_1: WORD $0xd285 // test edx, edx JLE LBB31_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB31_3: LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax LONG $0x02c08349 // add r8, 2 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB31_3 LBB31_4: RET TEXT ·_transpose_uint8_uint32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB32_1 LBB32_5: WORD $0xd089 // mov eax, edx WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x0157b60f // movzx edx, byte [rdi + 1] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx LONG $0x0257b60f // movzx edx, byte [rdi + 2] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x0357b60f // movzx edx, byte [rdi + 3] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x04c78348 // add rdi, 4 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB32_5 LBB32_1: WORD $0xd285 // test edx, edx JLE LBB32_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB32_3: LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x86048942 // mov dword [rsi + 4*r8], eax LONG $0x01c08349 // add r8, 1 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB32_3 LBB32_4: RET TEXT 
·_transpose_int8_uint32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB33_1 LBB33_5: WORD $0xd089 // mov eax, edx LONG $0x17be0f48 // movsx rdx, byte [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x04c78348 // add rdi, 4 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB33_5 LBB33_1: WORD $0xd285 // test edx, edx JLE LBB33_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB33_3: LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x86048942 // mov dword [rsi + 4*r8], eax LONG $0x01c08349 // add r8, 1 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB33_3 LBB33_4: RET TEXT ·_transpose_uint16_uint32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB34_1 LBB34_5: WORD $0xd089 // mov eax, edx WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x0257b70f // movzx edx, word [rdi + 2] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx LONG $0x0457b70f 
// movzx edx, word [rdi + 4] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x0657b70f // movzx edx, word [rdi + 6] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x08c78348 // add rdi, 8 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB34_5 LBB34_1: WORD $0xd285 // test edx, edx JLE LBB34_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB34_3: LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x46048942 // mov dword [rsi + 2*r8], eax LONG $0x02c08349 // add r8, 2 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB34_3 LBB34_4: RET TEXT ·_transpose_int16_uint32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB35_1 LBB35_5: WORD $0xd089 // mov eax, edx LONG $0x17bf0f48 // movsx rdx, word [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x08c78348 // add rdi, 8 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB35_5 LBB35_1: WORD $0xd285 // test edx, edx JLE LBB35_4 WORD $0xc283; BYTE $0x01 // 
add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB35_3: LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x46048942 // mov dword [rsi + 2*r8], eax LONG $0x02c08349 // add r8, 2 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB35_3 LBB35_4: RET TEXT ·_transpose_uint32_uint32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB36_1 LBB36_5: WORD $0xd089 // mov eax, edx WORD $0x178b // mov edx, dword [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x10c78348 // add rdi, 16 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB36_5 LBB36_1: WORD $0xd285 // test edx, edx JLE LBB36_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB36_3: LONG $0x07048b42 // mov eax, dword [rdi + r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x06048942 // mov dword [rsi + r8], eax LONG $0x04c08349 // add r8, 4 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB36_3 LBB36_4: RET TEXT ·_transpose_int32_uint32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB37_1 LBB37_5: WORD $0xd089 
// mov eax, edx WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x04576348 // movsxd rdx, dword [rdi + 4] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx LONG $0x08576348 // movsxd rdx, dword [rdi + 8] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x10c78348 // add rdi, 16 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB37_5 LBB37_1: WORD $0xd285 // test edx, edx JLE LBB37_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB37_3: LONG $0x0704634a // movsxd rax, dword [rdi + r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x06048942 // mov dword [rsi + r8], eax LONG $0x04c08349 // add r8, 4 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB37_3 LBB37_4: RET TEXT ·_transpose_uint64_uint32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB38_1 LBB38_5: WORD $0xd089 // mov eax, edx WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x08578b48 // mov rdx, qword [rdi + 8] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx LONG $0x10578b48 // mov rdx, qword [rdi + 16] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x18578b48 // mov rdx, qword [rdi + 24] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 
4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x20c78348 // add rdi, 32 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB38_5 LBB38_1: WORD $0xd285 // test edx, edx JLE LBB38_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB38_3: LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x06048942 // mov dword [rsi + r8], eax LONG $0x04c08349 // add r8, 4 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB38_3 LBB38_4: RET TEXT ·_transpose_int64_uint32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB39_1 LBB39_5: WORD $0xd089 // mov eax, edx WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x08578b48 // mov rdx, qword [rdi + 8] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx LONG $0x10578b48 // mov rdx, qword [rdi + 16] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x18578b48 // mov rdx, qword [rdi + 24] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x20c78348 // add rdi, 32 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB39_5 LBB39_1: WORD $0xd285 // test edx, edx JLE LBB39_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB39_3: LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x06048942 // mov dword [rsi + r8], eax LONG $0x04c08349 // add r8, 4 WORD $0xc283; BYTE $0xff // add 
edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB39_3 LBB39_4: RET TEXT ·_transpose_uint8_int32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB40_1 LBB40_5: WORD $0xd089 // mov eax, edx WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x0157b60f // movzx edx, byte [rdi + 1] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx LONG $0x0257b60f // movzx edx, byte [rdi + 2] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x0357b60f // movzx edx, byte [rdi + 3] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x04c78348 // add rdi, 4 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB40_5 LBB40_1: WORD $0xd285 // test edx, edx JLE LBB40_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB40_3: LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x86048942 // mov dword [rsi + 4*r8], eax LONG $0x01c08349 // add r8, 1 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB40_3 LBB40_4: RET TEXT ·_transpose_int8_int32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB41_1 LBB41_5: WORD $0xd089 // mov eax, edx LONG $0x17be0f48 // movsx rdx, byte [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE 
$0x04 // mov dword [rsi + 4], edx LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x04c78348 // add rdi, 4 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB41_5 LBB41_1: WORD $0xd285 // test edx, edx JLE LBB41_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB41_3: LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x86048942 // mov dword [rsi + 4*r8], eax LONG $0x01c08349 // add r8, 1 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB41_3 LBB41_4: RET TEXT ·_transpose_uint16_int32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB42_1 LBB42_5: WORD $0xd089 // mov eax, edx WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x0257b70f // movzx edx, word [rdi + 2] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx LONG $0x0457b70f // movzx edx, word [rdi + 4] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x0657b70f // movzx edx, word [rdi + 6] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x08c78348 // add rdi, 8 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB42_5 LBB42_1: WORD $0xd285 // test 
edx, edx JLE LBB42_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB42_3: LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x46048942 // mov dword [rsi + 2*r8], eax LONG $0x02c08349 // add r8, 2 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB42_3 LBB42_4: RET TEXT ·_transpose_int16_int32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ transposeMap+24(FP), CX WORD $0xfa83; BYTE $0x04 // cmp edx, 4 JL LBB43_1 LBB43_5: WORD $0xd089 // mov eax, edx LONG $0x17bf0f48 // movsx rdx, word [rdi] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x1689 // mov dword [rsi], edx LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] LONG $0x08c78348 // add rdi, 8 LONG $0x10c68348 // add rsi, 16 WORD $0xf883; BYTE $0x07 // cmp eax, 7 JG LBB43_5 LBB43_1: WORD $0xd285 // test edx, edx JLE LBB43_4 WORD $0xc283; BYTE $0x01 // add edx, 1 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d LBB43_3: LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] LONG $0x46048942 // mov dword [rsi + 2*r8], eax LONG $0x02c08349 // add r8, 2 WORD $0xc283; BYTE $0xff // add edx, -1 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 JG LBB43_3 LBB43_4: RET TEXT ·_transpose_uint32_int32_sse4(SB), $0-32 MOVQ src+0(FP), DI MOVQ dest+8(FP), SI MOVQ length+16(FP), DX MOVQ 
transposeMap+24(FP), CX
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB44_1
LBB44_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x178b // mov edx, dword [rdi]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x1689 // mov dword [rsi], edx
	WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
	WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
	WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x10c68348 // add rsi, 16
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB44_5
LBB44_1:
	WORD $0xd285 // test edx, edx
	JLE LBB44_4
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB44_3:
	LONG $0x07048b42 // mov eax, dword [rdi + r8]
	WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
	LONG $0x06048942 // mov dword [rsi + r8], eax
	LONG $0x04c08349 // add r8, 4
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB44_3
LBB44_4:
	RET

// _transpose_int32_int32_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   dwords, sign-extended to form the table index
//   table: dwords (index scaled by 4), copied to dest unchanged
//   dest:  dwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked; a negative src value indexes
// before transposeMap.
TEXT ·_transpose_int32_int32_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB45_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because EDX is reused as the index/value scratch.
LBB45_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x1689 // mov dword [rsi], edx
	LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
	LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
	LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x10c68348 // add rsi, 16
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB45_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB45_1:
	WORD $0xd285 // test edx, edx
	JLE LBB45_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the byte offset used for both src and dest (4 bytes per element).
LBB45_3:
	LONG $0x0704634a // movsxd rax, dword [rdi + r8]
	WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
	LONG $0x06048942 // mov dword [rsi + r8], eax
	LONG $0x04c08349 // add r8, 4
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB45_3
LBB45_4:
	RET

// _transpose_uint64_int32_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   qwords, used unmodified as the table index
//   table: dwords (index scaled by 4), copied to dest unchanged
//   dest:  dwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked.
TEXT ·_transpose_uint64_int32_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB46_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB46_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x1689 // mov dword [rsi], edx
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x10c68348 // add rsi, 16
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB46_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB46_1:
	WORD $0xd285 // test edx, edx
	JLE LBB46_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the dest byte offset (4 per element); src is addressed at 2*R8
	// (8 per element).
LBB46_3:
	LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
	WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
	LONG $0x06048942 // mov dword [rsi + r8], eax
	LONG $0x04c08349 // add r8, 4
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB46_3
LBB46_4:
	RET

// _transpose_int64_int32_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   qwords, used unmodified as the table index
//   table: dwords (index scaled by 4), copied to dest unchanged
//   dest:  dwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked; a negative src value indexes
// before transposeMap.
TEXT ·_transpose_int64_int32_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB47_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB47_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x1689 // mov dword [rsi], edx
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x10c68348 // add rsi, 16
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB47_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB47_1:
	WORD $0xd285 // test edx, edx
	JLE LBB47_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the dest byte offset (4 per element); src is addressed at 2*R8
	// (8 per element).
LBB47_3:
	LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
	WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
	LONG $0x06048942 // mov dword [rsi + r8], eax
	LONG $0x04c08349 // add r8, 4
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB47_3
LBB47_4:
	RET

// _transpose_uint8_uint64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   bytes, zero-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked.
TEXT ·_transpose_uint8_uint64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB48_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB48_5:
	WORD $0xd089 // mov eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x0157b60f // movzx edx, byte [rdi + 1]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x0257b60f // movzx edx, byte [rdi + 2]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x0357b60f // movzx edx, byte [rdi + 3]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x04c78348 // add rdi, 4
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB48_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB48_1:
	WORD $0xd285 // test edx, edx
	JLE LBB48_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the element index: src at R8 (1 byte each), dest at 8*R8.
LBB48_3:
	LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0xc604894a // mov qword [rsi + 8*r8], rax
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB48_3
LBB48_4:
	RET

// _transpose_int8_uint64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   bytes, sign-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked; a negative src value indexes
// before transposeMap.
TEXT ·_transpose_int8_uint64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB49_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB49_5:
	WORD $0xd089 // mov eax, edx
	LONG $0x17be0f48 // movsx rdx, byte [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x04c78348 // add rdi, 4
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB49_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB49_1:
	WORD $0xd285 // test edx, edx
	JLE LBB49_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the element index: src at R8 (1 byte each), dest at 8*R8.
LBB49_3:
	LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0xc604894a // mov qword [rsi + 8*r8], rax
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB49_3
LBB49_4:
	RET

// _transpose_uint16_uint64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   words, zero-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked.
TEXT ·_transpose_uint16_uint64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB50_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB50_5:
	WORD $0xd089 // mov eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x0257b70f // movzx edx, word [rdi + 2]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x0457b70f // movzx edx, word [rdi + 4]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x0657b70f // movzx edx, word [rdi + 6]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x08c78348 // add rdi, 8
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB50_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB50_1:
	WORD $0xd285 // test edx, edx
	JLE LBB50_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the src byte offset (2 per element); dest is addressed at 4*R8
	// (8 per element).
LBB50_3:
	LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x8604894a // mov qword [rsi + 4*r8], rax
	LONG $0x02c08349 // add r8, 2
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB50_3
LBB50_4:
	RET

// _transpose_int16_uint64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   words, sign-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked; a negative src value indexes
// before transposeMap.
TEXT ·_transpose_int16_uint64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB51_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB51_5:
	WORD $0xd089 // mov eax, edx
	LONG $0x17bf0f48 // movsx rdx, word [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x08c78348 // add rdi, 8
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB51_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB51_1:
	WORD $0xd285 // test edx, edx
	JLE LBB51_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the src byte offset (2 per element); dest is addressed at 4*R8
	// (8 per element).
LBB51_3:
	LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x8604894a // mov qword [rsi + 4*r8], rax
	LONG $0x02c08349 // add r8, 2
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB51_3
LBB51_4:
	RET

// _transpose_uint32_uint64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   dwords, zero-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked.
TEXT ·_transpose_uint32_uint64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB52_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB52_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x178b // mov edx, dword [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB52_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB52_1:
	WORD $0xd285 // test edx, edx
	JLE LBB52_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the src byte offset (4 per element); dest is addressed at 2*R8
	// (8 per element).
LBB52_3:
	LONG $0x07048b42 // mov eax, dword [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x4604894a // mov qword [rsi + 2*r8], rax
	LONG $0x04c08349 // add r8, 4
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB52_3
LBB52_4:
	RET

// _transpose_int32_uint64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   dwords, sign-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked; a negative src value indexes
// before transposeMap.
TEXT ·_transpose_int32_uint64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB53_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB53_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB53_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB53_1:
	WORD $0xd285 // test edx, edx
	JLE LBB53_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the src byte offset (4 per element); dest is addressed at 2*R8
	// (8 per element).
LBB53_3:
	LONG $0x0704634a // movsxd rax, dword [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x4604894a // mov qword [rsi + 2*r8], rax
	LONG $0x04c08349 // add r8, 4
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB53_3
LBB53_4:
	RET

// _transpose_uint64_uint64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   qwords, used unmodified as the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked.
TEXT ·_transpose_uint64_uint64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB54_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB54_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB54_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB54_1:
	WORD $0xd285 // test edx, edx
	JLE LBB54_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the byte offset used for both src and dest (8 bytes per element).
LBB54_3:
	LONG $0x07048b4a // mov rax, qword [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x0604894a // mov qword [rsi + r8], rax
	LONG $0x08c08349 // add r8, 8
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB54_3
LBB54_4:
	RET

// _transpose_int64_uint64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   qwords, used unmodified as the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked; a negative src value indexes
// before transposeMap.
TEXT ·_transpose_int64_uint64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB55_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB55_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB55_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB55_1:
	WORD $0xd285 // test edx, edx
	JLE LBB55_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the byte offset used for both src and dest (8 bytes per element).
LBB55_3:
	LONG $0x07048b4a // mov rax, qword [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x0604894a // mov qword [rsi + r8], rax
	LONG $0x08c08349 // add r8, 8
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB55_3
LBB55_4:
	RET

// _transpose_uint8_int64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   bytes, zero-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked.
TEXT ·_transpose_uint8_int64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB56_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB56_5:
	WORD $0xd089 // mov eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x0157b60f // movzx edx, byte [rdi + 1]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x0257b60f // movzx edx, byte [rdi + 2]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x0357b60f // movzx edx, byte [rdi + 3]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x04c78348 // add rdi, 4
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB56_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB56_1:
	WORD $0xd285 // test edx, edx
	JLE LBB56_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the element index: src at R8 (1 byte each), dest at 8*R8.
LBB56_3:
	LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0xc604894a // mov qword [rsi + 8*r8], rax
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB56_3
LBB56_4:
	RET

// _transpose_int8_int64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   bytes, sign-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked; a negative src value indexes
// before transposeMap.
TEXT ·_transpose_int8_int64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB57_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB57_5:
	WORD $0xd089 // mov eax, edx
	LONG $0x17be0f48 // movsx rdx, byte [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x04c78348 // add rdi, 4
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB57_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB57_1:
	WORD $0xd285 // test edx, edx
	JLE LBB57_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the element index: src at R8 (1 byte each), dest at 8*R8.
LBB57_3:
	LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0xc604894a // mov qword [rsi + 8*r8], rax
	LONG $0x01c08349 // add r8, 1
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB57_3
LBB57_4:
	RET

// _transpose_uint16_int64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   words, zero-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked.
TEXT ·_transpose_uint16_int64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB58_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB58_5:
	WORD $0xd089 // mov eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x0257b70f // movzx edx, word [rdi + 2]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x0457b70f // movzx edx, word [rdi + 4]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x0657b70f // movzx edx, word [rdi + 6]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x08c78348 // add rdi, 8
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB58_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB58_1:
	WORD $0xd285 // test edx, edx
	JLE LBB58_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the src byte offset (2 per element); dest is addressed at 4*R8
	// (8 per element).
LBB58_3:
	LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x8604894a // mov qword [rsi + 4*r8], rax
	LONG $0x02c08349 // add r8, 2
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB58_3
LBB58_4:
	RET

// _transpose_int16_int64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   words, sign-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked; a negative src value indexes
// before transposeMap.
TEXT ·_transpose_int16_int64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB59_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB59_5:
	WORD $0xd089 // mov eax, edx
	LONG $0x17bf0f48 // movsx rdx, word [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x08c78348 // add rdi, 8
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB59_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB59_1:
	WORD $0xd285 // test edx, edx
	JLE LBB59_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the src byte offset (2 per element); dest is addressed at 4*R8
	// (8 per element).
LBB59_3:
	LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x8604894a // mov qword [rsi + 4*r8], rax
	LONG $0x02c08349 // add r8, 2
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB59_3
LBB59_4:
	RET

// _transpose_uint32_int64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   dwords, zero-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked.
TEXT ·_transpose_uint32_int64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB60_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB60_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x178b // mov edx, dword [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB60_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB60_1:
	WORD $0xd285 // test edx, edx
	JLE LBB60_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the src byte offset (4 per element); dest is addressed at 2*R8
	// (8 per element).
LBB60_3:
	LONG $0x07048b42 // mov eax, dword [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x4604894a // mov qword [rsi + 2*r8], rax
	LONG $0x04c08349 // add r8, 4
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB60_3
LBB60_4:
	RET

// _transpose_int32_int64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   dwords, sign-extended to form the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked; a negative src value indexes
// before transposeMap.
TEXT ·_transpose_int32_int64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB61_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB61_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x10c78348 // add rdi, 16
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB61_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB61_1:
	WORD $0xd285 // test edx, edx
	JLE LBB61_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the src byte offset (4 per element); dest is addressed at 2*R8
	// (8 per element).
LBB61_3:
	LONG $0x0704634a // movsxd rax, dword [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x4604894a // mov qword [rsi + 2*r8], rax
	LONG $0x04c08349 // add r8, 4
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB61_3
LBB61_4:
	RET

// _transpose_uint64_int64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   qwords, used unmodified as the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked.
TEXT ·_transpose_uint64_int64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB62_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB62_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB62_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB62_1:
	WORD $0xd285 // test edx, edx
	JLE LBB62_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the byte offset used for both src and dest (8 bytes per element).
LBB62_3:
	LONG $0x07048b4a // mov rax, qword [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x0604894a // mov qword [rsi + r8], rax
	LONG $0x08c08349 // add r8, 8
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB62_3
LBB62_4:
	RET

// _transpose_int64_int64_sse4 stores dest[i] = transposeMap[src[i]] for i in [0, length).
//   src:   qwords, used unmodified as the table index
//   table: dwords (index scaled by 4), sign-extended (movsxd) to qwords
//   dest:  qwords
// DI/SI walk src/dest, CX holds the table base, DX the remaining count.
// Only the low 32 bits of length are tested (signed), so length <= 0 stores
// nothing. The index is not bounds-checked; a negative src value indexes
// before transposeMap.
TEXT ·_transpose_int64_int64_sse4(SB), $0-32
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX
	// Fewer than four elements: go straight to the tail loop.
	WORD $0xfa83; BYTE $0x04 // cmp edx, 4
	JL LBB63_1
	// Unrolled main loop: four elements per pass. EAX keeps the count seen at
	// the top of the pass because RDX is reused as the index/value scratch.
LBB63_5:
	WORD $0xd089 // mov eax, edx
	WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
	LONG $0x08578b48 // mov rdx, qword [rdi + 8]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x08568948 // mov qword [rsi + 8], rdx
	LONG $0x10578b48 // mov rdx, qword [rdi + 16]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x10568948 // mov qword [rsi + 16], rdx
	LONG $0x18578b48 // mov rdx, qword [rdi + 24]
	LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
	LONG $0x18568948 // mov qword [rsi + 24], rdx
	// EDX = count - 4; loop again while the old count was > 7 (>= 4 left).
	WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
	LONG $0x20c78348 // add rdi, 32
	LONG $0x20c68348 // add rsi, 32
	WORD $0xf883; BYTE $0x07 // cmp eax, 7
	JG LBB63_5
	// Tail: EDX <= 3 here; nothing to do when it is <= 0.
LBB63_1:
	WORD $0xd285 // test edx, edx
	JLE LBB63_4
	// Bias the count by one to match the "cmp edx, 1; jg" exit test below.
	WORD $0xc283; BYTE $0x01 // add edx, 1
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
	// R8 is the byte offset used for both src and dest (8 bytes per element).
LBB63_3:
	LONG $0x07048b4a // mov rax, qword [rdi + r8]
	LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
	LONG $0x0604894a // mov qword [rsi + r8], rax
	LONG $0x08c08349 // add r8, 8
	WORD $0xc283; BYTE $0xff // add edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp edx, 1
	JG LBB63_3
LBB63_4:
	RET