diff --git a/benchmarks/add-r32_imd-TP.S b/benchmarks/add-r32_imd-TP.S new file mode 100644 index 0000000..26cfee8 --- /dev/null +++ b/benchmarks/add-r32_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 
+ INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/add-r32_imd.S b/benchmarks/add-r32_imd.S new file mode 100644 index 0000000..3e57ea7 --- /dev/null +++ b/benchmarks/add-r32_imd.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + 
INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/add-r32_mem-TP.S b/benchmarks/add-r32_mem-TP.S new file mode 100644 index 0000000..64fc02f --- /dev/null +++ b/benchmarks/add-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + 
INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/add-r32_mem.S b/benchmarks/add-r32_mem.S new file mode 100644 index 0000000..7c94bcc --- /dev/null +++ b/benchmarks/add-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 
0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + 
INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/add-r64_imd-TP.S b/benchmarks/add-r64_imd-TP.S new file mode 100644 index 0000000..b4e6897 --- /dev/null +++ b/benchmarks/add-r64_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR 
r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/add-r64_imd.S b/benchmarks/add-r64_imd.S new file mode 100644 index 0000000..79aabb2 --- /dev/null +++ b/benchmarks/add-r64_imd.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 
+loop: + inc i + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/add-r64_r64-TP.S b/benchmarks/add-r64_r64-TP.S new file mode 100644 index 0000000..d475743 --- /dev/null +++ b/benchmarks/add-r64_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq 
xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero (bit pattern of DP 1.0) + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the bit pattern of DP 1.0 into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add: rbx = 2 * rax (twice the 1.0 bit pattern, which is not the encoding of DP 2.0) + add rbx, rax + # unsigned divide of rdx:rax by rax with rdx zeroed above: rax = 1, rdx = 0 (integer 1, not DP 0.5); rcx receives that 1 and rax is reloaded from xmm0 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/add-r64_r64.S b/benchmarks/add-r64_r64.S new file mode 100644 index 0000000..a64dc7c --- /dev/null +++ b/benchmarks/add-r64_r64.S @@ -0,0 +1,143
@@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create the bit pattern of DP 1.0 (0x3FF0000000000000) in each qword of xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64-10) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the bit pattern of DP 1.0 into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add: rbx = 2 * rax (twice the 1.0 bit pattern, which is not the encoding of DP 2.0) + add rbx, rax + # unsigned divide of rdx:rax by rax with rdx zeroed above: rax = 1, rdx = 0 (integer 1, not DP 0.5); rcx receives that 1 and rax is reloaded from xmm0 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax
+ INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/addl-mem_imd-TP.S b/benchmarks/addl-mem_imd-TP.S new file mode 100644 index 0000000..3987eb3 --- /dev/null +++ b/benchmarks/addl-mem_imd-TP.S @@ -0,0 +1,101 @@ +#define INSTR addl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR 
[rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/addl-mem_imd.S b/benchmarks/addl-mem_imd.S new file mode 100644 index 0000000..4693ece --- /dev/null +++ b/benchmarks/addl-mem_imd.S @@ -0,0 +1,101 @@ +#define INSTR addl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR 
[rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/and-r32_imd-TP.S b/benchmarks/and-r32_imd-TP.S new file mode 100644 index 0000000..0d48a75 --- /dev/null +++ b/benchmarks/and-r32_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR and +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push 
r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/and-r32_imd.S b/benchmarks/and-r32_imd.S new file mode 100644 index 0000000..99deb6a --- /dev/null +++ b/benchmarks/and-r32_imd.S @@ -0,0 +1,134 @@ +#define INSTR and +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/cmp-r32_imd-TP.S b/benchmarks/cmp-r32_imd-TP.S new file mode 100644 index 0000000..34d0509 --- /dev/null +++ 
b/benchmarks/cmp-r32_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + 
INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/cmp-r32_imd.S b/benchmarks/cmp-r32_imd.S new file mode 100644 index 0000000..5f412ad --- /dev/null +++ b/benchmarks/cmp-r32_imd.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 
+ INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/cmp-r32_mem-TP.S b/benchmarks/cmp-r32_mem-TP.S new file mode 100644 index 0000000..88baf8d --- /dev/null +++ b/benchmarks/cmp-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + 
INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/cmp-r32_mem.S b/benchmarks/cmp-r32_mem.S new file mode 100644 index 0000000..12b88d1 --- /dev/null +++ b/benchmarks/cmp-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # benchmark entry: runs the unrolled block below edi times + push rbp + mov rbp, rsp + xor i, i # loop counter r8d = 0 + test N, N + jle done # skip the benchmark when the requested count is not positive + # create the bit pattern of double-precision 1.0 (0x3FF0000000000000) in xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits on top of each quadword, 1111111111 0..0 (54 = 64-10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: # each pass executes 64 unrolled copies of the instruction under test + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] +
INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/cmp-r32_r32-TP.S b/benchmarks/cmp-r32_r32-TP.S new file mode 100644 index 0000000..c359fe8 --- /dev/null +++ b/benchmarks/cmp-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # benchmark entry: runs the unrolled block below edi times + push rbp + mov rbp, rsp + xor i, i # loop counter r8d = 0 + test N, N + jle done # skip the benchmark when the requested count is not positive + # create the bit pattern of double-precision 1.0 (0x3FF0000000000000) in xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits on top of each quadword, 1111111111 0..0 (54 = 64-10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the DP 1.0 bit pattern into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add of the two bit patterns: rbx = 0x7FE0000000000000, which does not encode DP 2.0 + add rbx, rax + # unsigned divide of rdx:rax by rax with rdx zeroed above: rax = 1, not the encoding of DP 0.5 + div rax + movq rcx, rax # rcx = 1, the integer quotient + vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax +loop: # each pass executes 64 unrolled copies of the instruction under test + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax +
INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/cmp-r32_r32.S b/benchmarks/cmp-r32_r32.S new file mode 100644 index 0000000..99b4b20 --- /dev/null +++ b/benchmarks/cmp-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # benchmark entry: runs the unrolled block below edi times + push rbp + mov rbp, rsp + xor i, i # loop counter r8d = 0 + test N, N + jle done # skip the benchmark when the requested count is not positive + # create the bit pattern of double-precision 1.0 (0x3FF0000000000000) in xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits on top of each quadword, 1111111111 0..0 (54 = 64-10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push
r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the DP 1.0 bit pattern into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add of the two bit patterns: rbx = 0x7FE0000000000000, which does not encode DP 2.0 + add rbx, rax + # unsigned divide of rdx:rax by rax with rdx zeroed above: rax = 1, not the encoding of DP 0.5 + div rax + movq rcx, rax # rcx = 1, the integer quotient + vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax +loop: # each pass executes 64 unrolled copies of the instruction under test + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/cmp-r64_imd-TP.S b/benchmarks/cmp-r64_imd-TP.S new file mode 100644 index 0000000..9c76e1b --- /dev/null +++ b/benchmarks/cmp-r64_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # benchmark entry: runs the unrolled block below edi times + push rbp + mov rbp, rsp + xor i, i # loop counter r8d = 0 + test N, N + jle done # skip the benchmark when the requested count is not positive + # create the bit pattern of double-precision 1.0 (0x3FF0000000000000) in xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits on top of each quadword, 1111111111 0..0 (54 = 64-10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: # each pass executes 64 unrolled copies of the instruction under test + inc i + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency,
.-latency \ No newline at end of file diff --git a/benchmarks/cmp-r64_imd.S b/benchmarks/cmp-r64_imd.S new file mode 100644 index 0000000..54e5a3c --- /dev/null +++ b/benchmarks/cmp-r64_imd.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR 
rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/cmp-r64_r64-TP.S b/benchmarks/cmp-r64_r64-TP.S new file mode 100644 index 0000000..45eaa89 --- /dev/null +++ b/benchmarks/cmp-r64_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # benchmark entry: runs the unrolled block below edi times + push rbp + mov rbp, rsp + xor i, i # loop counter r8d = 0 + test N, N + jle done # skip the benchmark when the requested count is not positive + # create the bit pattern of double-precision 1.0 (0x3FF0000000000000) in xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits on top of each quadword, 1111111111 0..0 (54 = 64-10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the DP 1.0 bit pattern into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add of the two bit patterns: rbx = 0x7FE0000000000000, which does not encode DP 2.0 + add rbx, rax + # unsigned divide of rdx:rax by rax with rdx zeroed above: rax = 1, not the encoding of DP 0.5 + div rax + movq rcx, rax # rcx = 1, the integer quotient + vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax +loop: # each pass executes 64 unrolled copies of the instruction under test + inc i + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR
rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/cmp-r64_r64.S b/benchmarks/cmp-r64_r64.S new file mode 100644 index 0000000..1e80d5d --- /dev/null +++ b/benchmarks/cmp-r64_r64.S @@ -0,0 +1,143 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # benchmark entry: runs the unrolled block below edi times + push rbp + mov rbp, rsp + xor i, i # loop counter r8d = 0 + test N, N + jle done # skip the benchmark when the requested count is not positive + # create the bit pattern of double-precision 1.0 (0x3FF0000000000000) in xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits on top of each quadword, 1111111111 0..0 (54 = 64-10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 +
push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the DP 1.0 bit pattern into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add of the two bit patterns: rbx = 0x7FE0000000000000, which does not encode DP 2.0 + add rbx, rax + # unsigned divide of rdx:rax by rax with rdx zeroed above: rax = 1, not the encoding of DP 0.5 + div rax + movq rcx, rax # rcx = 1, the integer quotient + vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax +loop: # each pass executes 64 unrolled copies of the instruction under test + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/dec-r32-TP.S b/benchmarks/dec-r32-TP.S new file mode 100644 index 0000000..f886ad1 --- /dev/null +++ b/benchmarks/dec-r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR dec +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e,
0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # benchmark entry: runs the unrolled block below edi times + push rbp + mov rbp, rsp + xor i, i # loop counter r8d = 0 + test N, N + jle done # skip the benchmark when the requested count is not positive + # create the bit pattern of double-precision 1.0 (0x3FF0000000000000) in xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits on top of each quadword, 1111111111 0..0 (54 = 64-10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the DP 1.0 bit pattern into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add of the two bit patterns: rbx = 0x7FE0000000000000, which does not encode DP 2.0 + add rbx, rax + # unsigned divide of rdx:rax by rax with rdx zeroed above: rax = 1, not the encoding of DP 0.5 + div rax + movq rcx, rax # rcx = 1, the integer quotient + vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax +loop: # each pass executes 64 unrolled copies of the instruction under test + inc i + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline
at end of file diff --git a/benchmarks/dec-r32.S b/benchmarks/dec-r32.S new file mode 100644 index 0000000..7c18fd9 --- /dev/null +++ b/benchmarks/dec-r32.S @@ -0,0 +1,143 @@ +#define INSTR dec +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + 
INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/imul-r64_r64_imd-TP.S b/benchmarks/imul-r64_r64_imd-TP.S new file mode 100644 index 0000000..ba6292c --- /dev/null +++ b/benchmarks/imul-r64_r64_imd-TP.S @@ -0,0 +1,143 @@ +#define INSTR imul +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # benchmark entry: runs the unrolled block below edi times + push rbp + mov rbp, rsp + xor i, i # loop counter r8d = 0 + test N, N + jle done # skip the benchmark when the requested count is not positive + # create the bit pattern of double-precision 1.0 (0x3FF0000000000000) in xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits on top of each quadword, 1111111111 0..0 (54 = 64-10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the DP 1.0 bit pattern into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add of the two bit patterns: rbx = 0x7FE0000000000000, which does not encode DP 2.0 + add rbx, rax + # unsigned divide of rdx:rax by rax with rdx zeroed above: rax = 1, not the encoding of DP 0.5 + div rax + movq rcx, rax # rcx = 1, the integer quotient + vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax +loop: # each pass executes 64 unrolled copies of the instruction under test + inc i + INSTR rdx, rax, 1 + INSTR r9, rbx, 2 + INSTR r10, rcx, 13 + INSTR r11, rax, 1 + INSTR r12, rbx, 2 + INSTR r13, rcx, 13 + INSTR r14, rax, 1 + INSTR r15, rbx, 2 + INSTR rdx, rcx, 13 + INSTR r9, rax, 1 + INSTR r10, rbx, 2 + INSTR r11, rcx, 13 + INSTR r12, rax, 1 + INSTR r13, rbx, 2 + INSTR r14, rcx, 13 + INSTR r15, rax, 1 + INSTR rdx,
rbx, 2 + INSTR r9, rcx, 13 + INSTR r10, rax, 1 + INSTR r11, rbx, 2 + INSTR r12, rcx, 13 + INSTR r13, rax, 1 + INSTR r14, rbx, 2 + INSTR r15, rcx, 13 + INSTR rdx, rax, 1 + INSTR r9, rbx, 2 + INSTR r10, rcx, 13 + INSTR r11, rax, 1 + INSTR r12, rbx, 2 + INSTR r13, rcx, 13 + INSTR r14, rax, 1 + INSTR r15, rbx, 2 + INSTR rdx, rcx, 13 + INSTR r9, rax, 1 + INSTR r10, rbx, 2 + INSTR r11, rcx, 13 + INSTR r12, rax, 1 + INSTR r13, rbx, 2 + INSTR r14, rcx, 13 + INSTR r15, rax, 1 + INSTR rdx, rbx, 2 + INSTR r9, rcx, 13 + INSTR r10, rax, 1 + INSTR r11, rbx, 2 + INSTR r12, rcx, 13 + INSTR r13, rax, 1 + INSTR r14, rbx, 2 + INSTR r15, rcx, 13 + INSTR rdx, rax, 1 + INSTR r9, rbx, 2 + INSTR r10, rcx, 13 + INSTR r11, rax, 1 + INSTR r12, rbx, 2 + INSTR r13, rcx, 13 + INSTR r14, rax, 1 + INSTR r15, rbx, 2 + INSTR rdx, rcx, 13 + INSTR r9, rax, 1 + INSTR r10, rbx, 2 + INSTR r11, rcx, 13 + INSTR r12, rax, 1 + INSTR r13, rbx, 2 + INSTR r14, rcx, 13 + INSTR r15, rax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/imul-r64_r64_imd.S b/benchmarks/imul-r64_r64_imd.S new file mode 100644 index 0000000..f0fac52 --- /dev/null +++ b/benchmarks/imul-r64_r64_imd.S @@ -0,0 +1,143 @@ +#define INSTR imul +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # benchmark entry: runs the unrolled block below edi times + push rbp + mov rbp, rsp + xor i, i # loop counter r8d = 0 + test N, N + jle done # skip the benchmark when the requested count is not positive + # create the bit pattern of double-precision 1.0 (0x3FF0000000000000) in xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits on top of each quadword, 1111111111 0..0 (54 = 64-10) + vpsrlq xmm0, xmm0,
2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the DP 1.0 bit pattern into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add of the two bit patterns: rbx = 0x7FE0000000000000, which does not encode DP 2.0 + add rbx, rax + # unsigned divide of rdx:rax by rax with rdx zeroed above: rax = 1, not the encoding of DP 0.5 + div rax + movq rcx, rax # rcx = 1, the integer quotient + vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax +loop: # each pass executes 64 unrolled copies of the instruction under test + inc i + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No
newline at end of file diff --git a/benchmarks/inc-r32-TP.S b/benchmarks/inc-r32-TP.S new file mode 100644 index 0000000..34f98ff --- /dev/null +++ b/benchmarks/inc-r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR inc +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # benchmark entry: runs the unrolled block below edi times + push rbp + mov rbp, rsp + xor i, i # loop counter r8d = 0 + test N, N + jle done # skip the benchmark when the requested count is not positive + # create the bit pattern of double-precision 1.0 (0x3FF0000000000000) in xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits on top of each quadword, 1111111111 0..0 (54 = 64-10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the DP 1.0 bit pattern into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add of the two bit patterns: rbx = 0x7FE0000000000000, which does not encode DP 2.0 + add rbx, rax + # unsigned divide of rdx:rax by rax with rdx zeroed above: rax = 1, not the encoding of DP 0.5 + div rax + movq rcx, rax # rcx = 1, the integer quotient + vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax +loop: # each pass executes 64 unrolled copies of the instruction under test + inc i + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR
r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/inc-r32.S b/benchmarks/inc-r32.S new file mode 100644 index 0000000..84f2a8c --- /dev/null +++ b/benchmarks/inc-r32.S @@ -0,0 +1,143 @@ +#define INSTR inc +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax 
+ INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/inc-r64-TP.S b/benchmarks/inc-r64-TP.S new file mode 100644 index 0000000..a9273f0 --- /dev/null +++ b/benchmarks/inc-r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR inc +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + 
INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/inc-r64.S b/benchmarks/inc-r64.S new file mode 100644 index 0000000..1c15147 --- /dev/null +++ b/benchmarks/inc-r64.S @@ -0,0 +1,143 @@ +#define INSTR inc +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor 
rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/lea-r32_mem-TP.S b/benchmarks/lea-r32_mem-TP.S new file mode 100644 index 0000000..9ab76b8 --- /dev/null +++ b/benchmarks/lea-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR lea +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + 
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + 
pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/lea-r32_mem.S b/benchmarks/lea-r32_mem.S new file mode 100644 index 0000000..0516e8d --- /dev/null +++ b/benchmarks/lea-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR lea +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + 
INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/lea-r64_mem-TP.S b/benchmarks/lea-r64_mem-TP.S new file mode 100644 index 0000000..e31ca30 --- /dev/null +++ b/benchmarks/lea-r64_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR lea +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor 
rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/lea-r64_mem.S b/benchmarks/lea-r64_mem.S new file mode 100644 index 0000000..aad963e --- /dev/null +++ b/benchmarks/lea-r64_mem.S @@ -0,0 +1,134 @@ +#define INSTR lea +#define NINST 64 +#define N edi +#define i r8d + + 
+.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, 
[rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-mem_r32-TP.S b/benchmarks/mov-mem_r32-TP.S new file mode 100644 index 0000000..18142e2 --- /dev/null +++ b/benchmarks/mov-mem_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx 
+ INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-mem_r32.S b/benchmarks/mov-mem_r32.S new file mode 100644 index 0000000..427caf4 --- /dev/null +++ b/benchmarks/mov-mem_r32.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR 
[rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-mem_r64-TP.S b/benchmarks/mov-mem_r64-TP.S new file mode 100644 index 0000000..b4a7f6a --- /dev/null +++ b/benchmarks/mov-mem_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], 
rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-mem_r64.S b/benchmarks/mov-mem_r64.S new file mode 100644 index 0000000..c1c6012 --- /dev/null +++ b/benchmarks/mov-mem_r64.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, 
@function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR 
[rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-r32_imd-TP.S b/benchmarks/mov-r32_imd-TP.S new file mode 100644 index 0000000..d81bbac --- /dev/null +++ b/benchmarks/mov-r32_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR 
r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-r32_imd.S b/benchmarks/mov-r32_imd.S new file mode 100644 index 0000000..b1f4bda --- /dev/null +++ b/benchmarks/mov-r32_imd.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i 
+ INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-r32_mem-TP.S b/benchmarks/mov-r32_mem-TP.S new file mode 100644 index 0000000..69c76ec --- /dev/null +++ b/benchmarks/mov-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # 
logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size 
latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-r32_mem.S b/benchmarks/mov-r32_mem.S new file mode 100644 index 0000000..e4e7313 --- /dev/null +++ b/benchmarks/mov-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR 
eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-r32_r32-TP.S b/benchmarks/mov-r32_r32-TP.S new file mode 100644 index 0000000..5bdcdf6 --- /dev/null +++ b/benchmarks/mov-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, 
r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-r32_r32.S b/benchmarks/mov-r32_r32.S new file mode 100644 index 0000000..d896ce7 --- /dev/null +++ b/benchmarks/mov-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + 
pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-r64_mem-TP.S b/benchmarks/mov-r64_mem-TP.S new file mode 100644 index 0000000..97984a3 --- /dev/null +++ b/benchmarks/mov-r64_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + 
INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/mov-r64_mem.S b/benchmarks/mov-r64_mem.S new file mode 100644 index 0000000..7095f31 --- /dev/null +++ b/benchmarks/mov-r64_mem.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + 
xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/movl-mem_imd-TP.S b/benchmarks/movl-mem_imd-TP.S new file mode 100644 index 0000000..2be91ea --- /dev/null +++ b/benchmarks/movl-mem_imd-TP.S @@ -0,0 +1,101 @@ +#define INSTR movl 
+#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 
+ cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/movl-mem_imd.S b/benchmarks/movl-mem_imd.S new file mode 100644 index 0000000..acf1961 --- /dev/null +++ b/benchmarks/movl-mem_imd.S @@ -0,0 +1,101 @@ +#define INSTR movl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR 
[rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/movsbl-r32_mem-TP.S b/benchmarks/movsbl-r32_mem-TP.S new file mode 100644 index 0000000..68c4967 --- /dev/null +++ b/benchmarks/movsbl-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR movsbl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, 
[rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/movsbl-r32_mem.S b/benchmarks/movsbl-r32_mem.S new file mode 100644 index 0000000..d24ac60 --- /dev/null +++ b/benchmarks/movsbl-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR movsbl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all 
ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + 
pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/movslq-r64_mem-TP.S b/benchmarks/movslq-r64_mem-TP.S new file mode 100644 index 0000000..e4ba19f --- /dev/null +++ b/benchmarks/movslq-r64_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + 
INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/movslq-r64_mem.S b/benchmarks/movslq-r64_mem.S new file mode 100644 index 0000000..50c48ed --- /dev/null +++ b/benchmarks/movslq-r64_mem.S @@ -0,0 +1,134 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 
+ xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/movslq-r64_r32-TP.S b/benchmarks/movslq-r64_r32-TP.S new file mode 100644 index 0000000..9b12cc4 --- /dev/null +++ b/benchmarks/movslq-r64_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + 
+.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR r11, eax + INSTR r12, ebx + INSTR r13, ecx + INSTR r14, eax + INSTR r15, ebx + INSTR rdx, ecx + INSTR r9, eax + INSTR r10, ebx + INSTR r11, ecx + INSTR r12, eax + INSTR r13, ebx + INSTR r14, ecx + INSTR r15, eax + INSTR rdx, ebx + INSTR r9, ecx + INSTR r10, eax + INSTR r11, ebx + INSTR r12, ecx + INSTR r13, eax + INSTR r14, ebx + INSTR r15, ecx + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR r11, eax + INSTR r12, ebx + INSTR r13, ecx + INSTR r14, eax + INSTR r15, ebx + INSTR rdx, ecx + INSTR r9, eax + INSTR r10, ebx + INSTR r11, ecx + INSTR r12, eax + INSTR r13, ebx + INSTR r14, ecx + INSTR r15, eax + INSTR rdx, ebx + INSTR r9, ecx + INSTR r10, eax + INSTR r11, ebx + INSTR r12, ecx + INSTR r13, eax + INSTR r14, ebx + INSTR r15, ecx + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR r11, eax + INSTR r12, ebx + INSTR r13, ecx + INSTR r14, 
eax + INSTR r15, ebx + INSTR rdx, ecx + INSTR r9, eax + INSTR r10, ebx + INSTR r11, ecx + INSTR r12, eax + INSTR r13, ebx + INSTR r14, ecx + INSTR r15, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/movslq-r64_r32.S b/benchmarks/movslq-r64_r32.S new file mode 100644 index 0000000..bf6f2bd --- /dev/null +++ b/benchmarks/movslq-r64_r32.S @@ -0,0 +1,143 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + 
INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/movzbl-r32_r8-TP.S b/benchmarks/movzbl-r32_r8-TP.S new file mode 100644 index 0000000..95020b8 --- /dev/null +++ b/benchmarks/movzbl-r32_r8-TP.S @@ -0,0 +1,143 @@ +#define INSTR movzbl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + 
push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + INSTR edx, bl + INSTR r9d, cl + INSTR r10d, al + INSTR r11d, bl + INSTR r12d, cl + INSTR r13d, al + INSTR r14d, bl + INSTR r15d, cl + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + INSTR edx, bl + INSTR r9d, cl + INSTR r10d, al + INSTR r11d, bl + INSTR r12d, cl + INSTR r13d, al + INSTR r14d, bl + INSTR r15d, cl + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/movzbl-r32_r8.S b/benchmarks/movzbl-r32_r8.S new file mode 100644 index 0000000..d67a693 --- /dev/null +++ b/benchmarks/movzbl-r32_r8.S @@ -0,0 +1,143 @@ +#define INSTR movzbl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + 
INSTR eax, al + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/neg-r32-TP.S b/benchmarks/neg-r32-TP.S new file mode 100644 index 0000000..e60f4a2 --- /dev/null +++ b/benchmarks/neg-r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR neg +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + 
INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/neg-r32.S b/benchmarks/neg-r32.S new file mode 100644 index 0000000..c25e69c --- /dev/null +++ b/benchmarks/neg-r32.S @@ -0,0 +1,143 @@ +#define INSTR neg +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + 
INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/nopl-mem-TP.S b/benchmarks/nopl-mem-TP.S new file mode 100644 index 0000000..9519631 --- /dev/null +++ b/benchmarks/nopl-mem-TP.S @@ -0,0 +1,101 @@ +#define INSTR nopl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR 
[rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/nopl-mem.S b/benchmarks/nopl-mem.S new file mode 100644 index 0000000..9519631 --- /dev/null +++ b/benchmarks/nopl-mem.S @@ -0,0 +1,101 @@ +#define INSTR nopl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR 
[rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/nopw-mem-TP.S b/benchmarks/nopw-mem-TP.S new file mode 100644 index 0000000..36786a2 --- /dev/null +++ b/benchmarks/nopw-mem-TP.S @@ -0,0 +1,101 @@ +#define INSTR nopw +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI] + INSTR [rip+PI] + 
INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/nopw-mem.S b/benchmarks/nopw-mem.S new file mode 100644 index 0000000..36786a2 --- /dev/null +++ b/benchmarks/nopw-mem.S @@ -0,0 +1,101 @@ +#define INSTR nopw +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for 
sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/pop-r64-TP.S b/benchmarks/pop-r64-TP.S new file mode 100644 index 0000000..29c5434 --- /dev/null +++ b/benchmarks/pop-r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR pop +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left 
shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # save every register the benchmark overwrites; restored by the pops after the loop + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + sub rsp, 8*NINST # reserve the NINST qwords popped below, so rsp ends each iteration back at the saved registers instead of climbing 8*NINST bytes per pass + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + cmp i, N + jl loop + # rsp is back at the register save area here, so these pops restore the values pushed before the loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/pop-r64.S b/benchmarks/pop-r64.S new file mode 100644 index 0000000..c1f6ccf --- /dev/null +++ b/benchmarks/pop-r64.S @@ -0,0 +1,143 @@ +#define INSTR pop +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9,
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/pushq-imd-TP.S b/benchmarks/pushq-imd-TP.S new file mode 100644 index 0000000..ec56071 --- /dev/null +++ 
b/benchmarks/pushq-imd-TP.S
@@ -0,0 +1,101 @@
+#define INSTR pushq
+#define NINST 64
+#define N edi
+#define i r8d
+
+
+.intel_syntax noprefix
+.globl ninst
+.data
+ninst:
+.long NINST
+.align 32
+PI:
+.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
+.text
+.globl latency
+.type latency, @function
+.align 32
+latency: # edi holds the iteration count; r8d is the loop counter
+    push rbp
+    mov rbp, rsp # frame base; nothing else is pushed before the loop
+    xor i, i
+    test N, N
+    jle done # non-positive count: skip straight to the epilogue
+    vpcmpeqw xmm0, xmm0, xmm0 # create DP 1.0: start from all ones
+    vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
+    vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
+loop:
+    mov rsp, rbp # rewind to the frame base so stack use stays at 8*NINST bytes for any count
+    inc i
+    INSTR 22
+    INSTR 8
+    INSTR 78
+    INSTR 159
+    INSTR 222
+    INSTR 3
+    INSTR 9
+    INSTR 5
+    INSTR 55
+    INSTR 173
+    INSTR 317
+    INSTR 254
+    INSTR 255
+    INSTR 22
+    INSTR 8
+    INSTR 78
+    INSTR 159
+    INSTR 222
+    INSTR 3
+    INSTR 9
+    INSTR 5
+    INSTR 55
+    INSTR 173
+    INSTR 317
+    INSTR 254
+    INSTR 255
+    INSTR 22
+    INSTR 8
+    INSTR 78
+    INSTR 159
+    INSTR 222
+    INSTR 3
+    INSTR 9
+    INSTR 5
+    INSTR 55
+    INSTR 173
+    INSTR 317
+    INSTR 254
+    INSTR 255
+    INSTR 22
+    INSTR 8
+    INSTR 78
+    INSTR 159
+    INSTR 222
+    INSTR 3
+    INSTR 9
+    INSTR 5
+    INSTR 55
+    INSTR 173
+    INSTR 317
+    INSTR 254
+    INSTR 255
+    INSTR 22
+    INSTR 8
+    INSTR 78
+    INSTR 159
+    INSTR 222
+    INSTR 3
+    INSTR 9
+    INSTR 5
+    INSTR 55
+    INSTR 173
+    INSTR 317
+    INSTR 254
+    cmp i, N
+    jl loop
+done:
+    mov rsp, rbp # drops whatever the last pass pushed
+    pop rbp
+    ret
+.size latency, .-latency
\ No newline at end of file
diff --git a/benchmarks/pushq-imd.S b/benchmarks/pushq-imd.S
new file mode 100644
index 0000000..433b6d2
--- /dev/null
+++ b/benchmarks/pushq-imd.S
@@ -0,0 +1,101 @@
+#define INSTR pushq
+#define NINST 64
+#define N edi
+#define i r8d
+
+
+.intel_syntax noprefix
+.globl ninst
+.data
+ninst:
+.long NINST
+.align 32
+PI:
+.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc icmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/shr-r32_imd-TP.S b/benchmarks/shr-r32_imd-TP.S new file mode 100644 index 0000000..547f8af --- /dev/null +++ b/benchmarks/shr-r32_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR shr +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 
+ INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/shr-r32_imd.S b/benchmarks/shr-r32_imd.S new file mode 100644 index 0000000..0d62a94 --- /dev/null +++ b/benchmarks/shr-r32_imd.S @@ -0,0 +1,134 @@ +#define INSTR shr +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 
+ push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/sub-r32_r32-TP.S b/benchmarks/sub-r32_r32-TP.S new file mode 100644 index 0000000..2f45769 --- /dev/null +++ b/benchmarks/sub-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR sub +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + 
mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, 
rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/sub-r32_r32.S b/benchmarks/sub-r32_r32.S new file mode 100644 index 0000000..91a7610 --- /dev/null +++ b/benchmarks/sub-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR sub +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR 
ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/sub-r64_r64-TP.S b/benchmarks/sub-r64_r64-TP.S new file mode 100644 index 0000000..6cdf17b --- /dev/null +++ b/benchmarks/sub-r64_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR sub +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, 
rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/sub-r64_r64.S b/benchmarks/sub-r64_r64.S new file mode 100644 index 0000000..cee91a9 --- /dev/null +++ b/benchmarks/sub-r64_r64.S @@ -0,0 +1,143 @@ +#define INSTR sub +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # 
create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git 
a/benchmarks/test-r32_r32-TP.S b/benchmarks/test-r32_r32-TP.S new file mode 100644 index 0000000..5403390 --- /dev/null +++ b/benchmarks/test-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR test +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + 
INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/test-r32_r32.S b/benchmarks/test-r32_r32.S new file mode 100644 index 0000000..8c7e48d --- /dev/null +++ b/benchmarks/test-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR test +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + 
inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/test-r8_imd-TP.S b/benchmarks/test-r8_imd-TP.S new file mode 100644 index 0000000..b2650fa --- /dev/null +++ b/benchmarks/test-r8_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR test +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + 
vpcmpeqw xmm0, xmm0, xmm0 # all ones
+    vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
+    vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
+    push rax # save every register the setup below overwrites (11 qwords)
+    push rbx
+    push rcx
+    push rdx
+    push r9
+    push r10
+    push r11
+    push r12
+    push r13
+    push r14
+    push r15
+    xor rax, rax
+    xor rbx, rbx
+    xor rcx, rcx
+    xor rdx, rdx
+    xor r9, r9
+    xor r10, r10
+    xor r11, r11
+    xor r12, r12
+    xor r13, r13
+    xor r14, r14
+    xor r15, r15
+loop:
+    inc i
+    INSTR dl, 1 # eight byte registers rotate against immediates 1, 2, 13
+    INSTR r9b, 2 # GAS names the low bytes of r8-r15 with a b suffix (r9b), not r9l
+    INSTR r10b, 13
+    INSTR r11b, 1
+    INSTR r12b, 2
+    INSTR r13b, 13
+    INSTR r14b, 1
+    INSTR r15b, 2
+    INSTR dl, 13
+    INSTR r9b, 1
+    INSTR r10b, 2
+    INSTR r11b, 13
+    INSTR r12b, 1
+    INSTR r13b, 2
+    INSTR r14b, 13
+    INSTR r15b, 1
+    INSTR dl, 2
+    INSTR r9b, 13
+    INSTR r10b, 1
+    INSTR r11b, 2
+    INSTR r12b, 13
+    INSTR r13b, 1
+    INSTR r14b, 2
+    INSTR r15b, 13
+    INSTR dl, 1
+    INSTR r9b, 2
+    INSTR r10b, 13
+    INSTR r11b, 1
+    INSTR r12b, 2
+    INSTR r13b, 13
+    INSTR r14b, 1
+    INSTR r15b, 2
+    INSTR dl, 13
+    INSTR r9b, 1
+    INSTR r10b, 2
+    INSTR r11b, 13
+    INSTR r12b, 1
+    INSTR r13b, 2
+    INSTR r14b, 13
+    INSTR r15b, 1
+    INSTR dl, 2
+    INSTR r9b, 13
+    INSTR r10b, 1
+    INSTR r11b, 2
+    INSTR r12b, 13
+    INSTR r13b, 1
+    INSTR r14b, 2
+    INSTR r15b, 13
+    INSTR dl, 1
+    INSTR r9b, 2
+    INSTR r10b, 13
+    INSTR r11b, 1
+    INSTR r12b, 2
+    INSTR r13b, 13
+    INSTR r14b, 1
+    INSTR r15b, 2
+    INSTR dl, 13
+    INSTR r9b, 1
+    INSTR r10b, 2
+    INSTR r11b, 13
+    INSTR r12b, 1
+    INSTR r13b, 2
+    INSTR r14b, 13
+    INSTR r15b, 1
+    cmp i, N
+    jl loop
+    pop r15 # restore in reverse push order
+    pop r14
+    pop r13
+    pop r12
+    pop r11
+    pop r10
+    pop r9
+    pop rdx
+    pop rcx
+    pop rbx
+    pop rax
+done:
+    mov rsp, rbp
+    pop rbp
+    ret
+.size latency, .-latency
\ No newline at end of file
diff --git a/benchmarks/test-r8_imd.S b/benchmarks/test-r8_imd.S
new file mode 100644
index 0000000..1abce37
--- /dev/null
+++ b/benchmarks/test-r8_imd.S
@@ -0,0 +1,134 @@
+#define INSTR test
+#define NINST 64
+#define N edi
+#define i r8d
+
+
+.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size 
latency, .-latency \ No newline at end of file diff --git a/benchmarks/vaddpd-xmm_xmm_xmm-TP.S b/benchmarks/vaddpd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..7bf13a5 --- /dev/null +++ b/benchmarks/vaddpd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, 
xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vaddpd-xmm_xmm_xmm.S b/benchmarks/vaddpd-xmm_xmm_xmm.S new file mode 100644 index 0000000..a4bf29b --- /dev/null +++ b/benchmarks/vaddpd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR 
xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vaddpd-ymm_ymm_ymm-TP.S b/benchmarks/vaddpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..268aafe --- /dev/null +++ b/benchmarks/vaddpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR 
ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vaddpd-ymm_ymm_ymm.S b/benchmarks/vaddpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..0edbbbe --- /dev/null +++ b/benchmarks/vaddpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, 
ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vaddsd-xmm_xmm_mem-TP.S b/benchmarks/vaddsd-xmm_xmm_mem-TP.S new file mode 100644 index 0000000..902cf3a --- /dev/null +++ b/benchmarks/vaddsd-xmm_xmm_mem-TP.S @@ -0,0 +1,108 @@ +#define INSTR vaddsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, 
rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + INSTR xmm15, xmm1, [rip+PI] + INSTR xmm3, xmm2, [rip+PI] + INSTR xmm4, xmm0, [rip+PI] + INSTR xmm5, xmm1, [rip+PI] + INSTR xmm6, xmm2, [rip+PI] + INSTR xmm7, xmm0, [rip+PI] + INSTR xmm8, xmm1, [rip+PI] + INSTR xmm9, xmm2, [rip+PI] + INSTR xmm10, xmm0, [rip+PI] + INSTR xmm11, xmm1, [rip+PI] + INSTR xmm12, xmm2, [rip+PI] + INSTR xmm13, xmm0, [rip+PI] + INSTR xmm14, xmm1, [rip+PI] + INSTR xmm15, xmm2, [rip+PI] + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, 
xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vaddsd-xmm_xmm_mem.S b/benchmarks/vaddsd-xmm_xmm_mem.S new file mode 100644 index 0000000..8a4bc84 --- /dev/null +++ b/benchmarks/vaddsd-xmm_xmm_mem.S @@ -0,0 +1,108 @@ +#define INSTR vaddsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, 
[rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vaddsd-xmm_xmm_xmm-TP.S b/benchmarks/vaddsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..274e201 --- /dev/null +++ b/benchmarks/vaddsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vaddsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, 
xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vaddsd-xmm_xmm_xmm.S b/benchmarks/vaddsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..d071892 --- /dev/null +++ b/benchmarks/vaddsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vaddsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR 
xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vaddss-xmm_xmm_xmm-TP.S b/benchmarks/vaddss-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..af113a5 --- /dev/null +++ b/benchmarks/vaddss-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vaddss +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 
+ vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vaddss-xmm_xmm_xmm.S b/benchmarks/vaddss-xmm_xmm_xmm.S new file mode 100644 index 0000000..2ac1630 --- 
/dev/null +++ b/benchmarks/vaddss-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vaddss +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, 
xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vcvtsi2ss-xmm_xmm_r32-TP.S b/benchmarks/vcvtsi2ss-xmm_xmm_r32-TP.S new file mode 100644 index 0000000..981f0de --- /dev/null +++ b/benchmarks/vcvtsi2ss-xmm_xmm_r32-TP.S @@ -0,0 +1,141 @@ +#define INSTR vcvtsi2ss +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 
+loop: + inc i + INSTR xmm3, xmm0, eax + INSTR xmm4, xmm1, ebx + INSTR xmm5, xmm2, ecx + INSTR xmm6, xmm0, eax + INSTR xmm7, xmm1, ebx + INSTR xmm8, xmm2, ecx + INSTR xmm9, xmm0, eax + INSTR xmm10, xmm1, ebx + INSTR xmm11, xmm2, ecx + INSTR xmm12, xmm0, eax + INSTR xmm13, xmm1, ebx + INSTR xmm14, xmm2, ecx + INSTR xmm15, xmm0, eax + INSTR xmm3, xmm1, ebx + INSTR xmm4, xmm2, ecx + INSTR xmm5, xmm0, eax + INSTR xmm6, xmm1, ebx + INSTR xmm7, xmm2, ecx + INSTR xmm8, xmm0, eax + INSTR xmm9, xmm1, ebx + INSTR xmm10, xmm2, ecx + INSTR xmm11, xmm0, eax + INSTR xmm12, xmm1, ebx + INSTR xmm13, xmm2, ecx + INSTR xmm14, xmm0, eax + INSTR xmm15, xmm1, ebx + INSTR xmm3, xmm2, ecx + INSTR xmm4, xmm0, eax + INSTR xmm5, xmm1, ebx + INSTR xmm6, xmm2, ecx + INSTR xmm7, xmm0, eax + INSTR xmm8, xmm1, ebx + INSTR xmm9, xmm2, ecx + INSTR xmm10, xmm0, eax + INSTR xmm11, xmm1, ebx + INSTR xmm12, xmm2, ecx + INSTR xmm13, xmm0, eax + INSTR xmm14, xmm1, ebx + INSTR xmm15, xmm2, ecx + INSTR xmm3, xmm0, eax + INSTR xmm4, xmm1, ebx + INSTR xmm5, xmm2, ecx + INSTR xmm6, xmm0, eax + INSTR xmm7, xmm1, ebx + INSTR xmm8, xmm2, ecx + INSTR xmm9, xmm0, eax + INSTR xmm10, xmm1, ebx + INSTR xmm11, xmm2, ecx + INSTR xmm12, xmm0, eax + INSTR xmm13, xmm1, ebx + INSTR xmm14, xmm2, ecx + INSTR xmm15, xmm0, eax + INSTR xmm3, xmm1, ebx + INSTR xmm4, xmm2, ecx + INSTR xmm5, xmm0, eax + INSTR xmm6, xmm1, ebx + INSTR xmm7, xmm2, ecx + INSTR xmm8, xmm0, eax + INSTR xmm9, xmm1, ebx + INSTR xmm10, xmm2, ecx + INSTR xmm11, xmm0, eax + INSTR xmm12, xmm1, ebx + INSTR xmm13, xmm2, ecx + INSTR xmm14, xmm0, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vcvtsi2ss-xmm_xmm_r32.S b/benchmarks/vcvtsi2ss-xmm_xmm_r32.S new file mode 100644 index 0000000..e2bdd56 --- /dev/null +++ b/benchmarks/vcvtsi2ss-xmm_xmm_r32.S @@ -0,0 
+1,141 @@ +#define INSTR vcvtsi2ss +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, 
eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vextractf128-xmm_ymm_imd-TP.S b/benchmarks/vextractf128-xmm_ymm_imd-TP.S new file mode 100644 index 0000000..6586196 --- /dev/null +++ b/benchmarks/vextractf128-xmm_ymm_imd-TP.S @@ -0,0 +1,110 @@ +#define INSTR vextractf128 +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + 
vaddpd ymm1, ymm1, ymm1
+    # Create DP 0.5
+    vdivpd ymm2, ymm0, ymm1
+loop:
+    inc i
+    INSTR xmm3, ymm0, 1
+    INSTR xmm4, ymm1, 2
+    INSTR xmm5, ymm2, 13
+    INSTR xmm6, ymm0, 1
+    INSTR xmm7, ymm1, 2
+    INSTR xmm8, ymm2, 13
+    INSTR xmm9, ymm0, 1
+    INSTR xmm10, ymm1, 2
+    INSTR xmm11, ymm2, 13
+    INSTR xmm12, ymm0, 1
+    INSTR xmm13, ymm1, 2
+    INSTR xmm14, ymm2, 13
+    INSTR xmm15, ymm0, 1
+    INSTR xmm3, ymm1, 2
+    INSTR xmm4, ymm2, 13
+    INSTR xmm5, ymm0, 1
+    INSTR xmm6, ymm1, 2
+    INSTR xmm7, ymm2, 13
+    INSTR xmm8, ymm0, 1
+    INSTR xmm9, ymm1, 2
+    INSTR xmm10, ymm2, 13
+    INSTR xmm11, ymm0, 1
+    INSTR xmm12, ymm1, 2
+    INSTR xmm13, ymm2, 13
+    INSTR xmm14, ymm0, 1
+    INSTR xmm15, ymm1, 2
+    INSTR xmm3, ymm2, 13
+    INSTR xmm4, ymm0, 1
+    INSTR xmm5, ymm1, 2
+    INSTR xmm6, ymm2, 13
+    INSTR xmm7, ymm0, 1
+    INSTR xmm8, ymm1, 2
+    INSTR xmm9, ymm2, 13
+    INSTR xmm10, ymm0, 1
+    INSTR xmm11, ymm1, 2
+    INSTR xmm12, ymm2, 13
+    INSTR xmm13, ymm0, 1
+    INSTR xmm14, ymm1, 2
+    INSTR xmm15, ymm2, 13
+    INSTR xmm3, ymm0, 1
+    INSTR xmm4, ymm1, 2
+    INSTR xmm5, ymm2, 13
+    INSTR xmm6, ymm0, 1
+    INSTR xmm7, ymm1, 2
+    INSTR xmm8, ymm2, 13
+    INSTR xmm9, ymm0, 1
+    INSTR xmm10, ymm1, 2
+    INSTR xmm11, ymm2, 13
+    INSTR xmm12, ymm0, 1
+    INSTR xmm13, ymm1, 2
+    INSTR xmm14, ymm2, 13
+    INSTR xmm15, ymm0, 1
+    INSTR xmm3, ymm1, 2
+    INSTR xmm4, ymm2, 13
+    INSTR xmm5, ymm0, 1
+    INSTR xmm6, ymm1, 2
+    INSTR xmm7, ymm2, 13
+    INSTR xmm8, ymm0, 1
+    INSTR xmm9, ymm1, 2
+    INSTR xmm10, ymm2, 13
+    INSTR xmm11, ymm0, 1
+    INSTR xmm12, ymm1, 2
+    INSTR xmm13, ymm2, 13
+    INSTR xmm14, ymm0, 1
+    cmp i, N
+    jl loop
+done:
+    mov rsp, rbp
+    pop rbp
+    ret
+.size latency, .-latency
\ No newline at end of file
diff --git a/benchmarks/vextractf128-xmm_ymm_imd.S b/benchmarks/vextractf128-xmm_ymm_imd.S
new file mode 100644
index 0000000..1eb9b58
--- /dev/null
+++ b/benchmarks/vextractf128-xmm_ymm_imd.S
@@ -0,0 +1,54 @@
+#define INSTR vextractf128
+#define NINST 64
+#define N edi
+#define i r8d
+
+
+.intel_syntax noprefix
+.globl ninst
+.data
+ninst:
+.long NINST
+.align 32
+PI:
+.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
+.text
+.globl latency
+.type latency, @function
+.align 32
+# latency: time a serial chain of vextractf128 instructions.
+# Reads the trip count from edi and returns at once when it is not positive.
+# Clobbers r8d, ymm0-ymm2 and the flags; each trip executes NINST instances.
+latency:
+    push rbp
+    mov rbp, rsp
+    xor i, i                          # trip counter starts at zero
+    test N, N
+    jle done                          # nothing to run for a non-positive count
+    # create DP 1.0
+    vpcmpeqw xmm0, xmm0, xmm0         # all bits set
+    vpsllq xmm0, xmm0, 54             # keep the top ten bits of each qword (54 = 64 - 10)
+    vpsrlq xmm0, xmm0, 2              # clear sign and top exponent bit: 0x3FF0000000000000 = 1.0
+    # expand from SSE to AVX
+    vinsertf128 ymm0, ymm0, xmm0, 0x1 # copy the low lane into the high lane
+    # copy DP 1.0
+    vmovaps ymm0, ymm0
+    vmovaps ymm1, ymm0
+    # Create DP 2.0
+    vaddpd ymm1, ymm1, ymm1
+    # Create DP 0.5
+    vdivpd ymm2, ymm0, ymm1
+loop:
+    inc i
+    # serial chain: every INSTR writes xmm0, the low half of ymm0, and the
+    # following INSTR reads ymm0, so consecutive instances cannot overlap
+.rept NINST
+    INSTR xmm0, ymm0, 1
+.endr
+    cmp i, N
+    jl loop
+done:
+    mov rsp, rbp
+    pop rbp
+    ret
+.size latency, .-latency
\ No newline at end of file
diff --git a/benchmarks/vinsertf128-ymm_ymm_-TP.S b/benchmarks/vinsertf128-ymm_ymm_-TP.S
new file mode 100644
index 0000000..f0b4652
--- /dev/null
+++ b/benchmarks/vinsertf128-ymm_ymm_-TP.S
@@ -0,0 +1,110 @@
+#define INSTR vinsertf128
+#define NINST 64
+#define N edi
+#define i r8d
+
+
+.intel_syntax noprefix
+.globl ninst
+.data
+ninst:
+.long NINST
+.align 32
+PI:
+.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
+.text
+.globl latency
+.type latency, @function
+.align 32
+latency:
+    push rbp
+    mov rbp, rsp
+    xor i, i
+    test N, N
+    jle done
+    # create DP 1.0
+    vpcmpeqw xmm0, xmm0, xmm0         # all bits set
+    vpsllq xmm0, xmm0, 54             # keep the top ten bits of each qword (54 = 64 - 10)
+    vpsrlq xmm0, xmm0, 2              # clear sign and top exponent bit: 0x3FF0000000000000 = 1.0
+    # expand from SSE to AVX
+    vinsertf128 ymm0, ymm0, xmm0, 0x1
+    # copy DP 1.0
+    vmovaps ymm0, ymm0
+    vmovaps ymm1, ymm0
+    # Create DP 2.0
+    vaddpd ymm1, ymm1, ymm1
+
# Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3 + INSTR ymm4 + INSTR ymm5 + INSTR ymm6 + INSTR ymm7 + INSTR ymm8 + INSTR ymm9 + INSTR ymm10 + INSTR ymm11 + INSTR ymm12 + INSTR ymm13 + INSTR ymm14 + INSTR ymm15 + INSTR ymm3 + INSTR ymm4 + INSTR ymm5 + INSTR ymm6 + INSTR ymm7 + INSTR ymm8 + INSTR ymm9 + INSTR ymm10 + INSTR ymm11 + INSTR ymm12 + INSTR ymm13 + INSTR ymm14 + INSTR ymm15 + INSTR ymm3 + INSTR ymm4 + INSTR ymm5 + INSTR ymm6 + INSTR ymm7 + INSTR ymm8 + INSTR ymm9 + INSTR ymm10 + INSTR ymm11 + INSTR ymm12 + INSTR ymm13 + INSTR ymm14 + INSTR ymm15 + INSTR ymm3 + INSTR ymm4 + INSTR ymm5 + INSTR ymm6 + INSTR ymm7 + INSTR ymm8 + INSTR ymm9 + INSTR ymm10 + INSTR ymm11 + INSTR ymm12 + INSTR ymm13 + INSTR ymm14 + INSTR ymm15 + INSTR ymm3 + INSTR ymm4 + INSTR ymm5 + INSTR ymm6 + INSTR ymm7 + INSTR ymm8 + INSTR ymm9 + INSTR ymm10 + INSTR ymm11 + INSTR ymm12 + INSTR ymm13 + INSTR ymm14 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vinsertf128-ymm_ymm_.S b/benchmarks/vinsertf128-ymm_ymm_.S new file mode 100644 index 0000000..7527b42 --- /dev/null +++ b/benchmarks/vinsertf128-ymm_ymm_.S @@ -0,0 +1,46 @@ +#define INSTR vinsertf128 +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 
0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vinsertf128-ymm_ymm_imd-TP.S b/benchmarks/vinsertf128-ymm_ymm_imd-TP.S new file mode 100644 index 0000000..ba164fa --- /dev/null +++ b/benchmarks/vinsertf128-ymm_ymm_imd-TP.S @@ -0,0 +1,110 @@ +#define INSTR vinsertf128 +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, 1 + INSTR ymm4, ymm1, 2 + INSTR ymm5, ymm2, 13 + INSTR ymm6, ymm0, 1 + INSTR ymm7, ymm1, 2 + INSTR ymm8, ymm2, 13 + INSTR ymm9, ymm0, 1 + INSTR ymm10, ymm1, 2 + INSTR ymm11, ymm2, 13 + INSTR ymm12, ymm0, 1 + INSTR ymm13, ymm1, 2 + INSTR ymm14, ymm2, 13 + INSTR ymm15, ymm0, 1 + INSTR ymm3, ymm1, 2 + INSTR ymm4, ymm2, 13 + INSTR ymm5, ymm0, 1 + INSTR ymm6, ymm1, 2 + INSTR ymm7, ymm2, 13 + INSTR ymm8, ymm0, 1 + INSTR ymm9, ymm1, 2 + INSTR ymm10, ymm2, 13 + INSTR ymm11, ymm0, 1 + INSTR ymm12, ymm1, 2 + INSTR ymm13, ymm2, 13 + INSTR ymm14, ymm0, 1 + 
INSTR ymm15, ymm1, 2 + INSTR ymm3, ymm2, 13 + INSTR ymm4, ymm0, 1 + INSTR ymm5, ymm1, 2 + INSTR ymm6, ymm2, 13 + INSTR ymm7, ymm0, 1 + INSTR ymm8, ymm1, 2 + INSTR ymm9, ymm2, 13 + INSTR ymm10, ymm0, 1 + INSTR ymm11, ymm1, 2 + INSTR ymm12, ymm2, 13 + INSTR ymm13, ymm0, 1 + INSTR ymm14, ymm1, 2 + INSTR ymm15, ymm2, 13 + INSTR ymm3, ymm0, 1 + INSTR ymm4, ymm1, 2 + INSTR ymm5, ymm2, 13 + INSTR ymm6, ymm0, 1 + INSTR ymm7, ymm1, 2 + INSTR ymm8, ymm2, 13 + INSTR ymm9, ymm0, 1 + INSTR ymm10, ymm1, 2 + INSTR ymm11, ymm2, 13 + INSTR ymm12, ymm0, 1 + INSTR ymm13, ymm1, 2 + INSTR ymm14, ymm2, 13 + INSTR ymm15, ymm0, 1 + INSTR ymm3, ymm1, 2 + INSTR ymm4, ymm2, 13 + INSTR ymm5, ymm0, 1 + INSTR ymm6, ymm1, 2 + INSTR ymm7, ymm2, 13 + INSTR ymm8, ymm0, 1 + INSTR ymm9, ymm1, 2 + INSTR ymm10, ymm2, 13 + INSTR ymm11, ymm0, 1 + INSTR ymm12, ymm1, 2 + INSTR ymm13, ymm2, 13 + INSTR ymm14, ymm0, 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vinsertf128-ymm_ymm_imd.S b/benchmarks/vinsertf128-ymm_ymm_imd.S new file mode 100644 index 0000000..3372e40 --- /dev/null +++ b/benchmarks/vinsertf128-ymm_ymm_imd.S @@ -0,0 +1,110 @@ +#define INSTR vinsertf128 +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + 
vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovapd-xmm_xmm-TP.S b/benchmarks/vmovapd-xmm_xmm-TP.S new file mode 100644 index 0000000..f39e016 --- /dev/null +++ b/benchmarks/vmovapd-xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovapd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST 
+.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + INSTR xmm15, xmm1 + INSTR xmm3, xmm2 + INSTR xmm4, xmm0 + INSTR xmm5, xmm1 + INSTR xmm6, xmm2 + INSTR xmm7, xmm0 + INSTR xmm8, xmm1 + INSTR xmm9, xmm2 + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + INSTR xmm13, xmm0 + INSTR xmm14, xmm1 + INSTR xmm15, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + cmp i, N + jl loop +done: 
+ mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovapd-xmm_xmm.S b/benchmarks/vmovapd-xmm_xmm.S new file mode 100644 index 0000000..6b4e7af --- /dev/null +++ b/benchmarks/vmovapd-xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmovapd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR 
xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovapd-ymm_ymm-TP.S b/benchmarks/vmovapd-ymm_ymm-TP.S new file mode 100644 index 0000000..754eedb --- /dev/null +++ b/benchmarks/vmovapd-ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vmovapd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0 + INSTR ymm4, ymm1 + INSTR ymm5, ymm2 + INSTR ymm6, ymm0 + INSTR ymm7, ymm1 + INSTR ymm8, ymm2 + INSTR ymm9, ymm0 + INSTR ymm10, ymm1 + INSTR ymm11, ymm2 + INSTR ymm12, ymm0 + INSTR ymm13, ymm1 + INSTR ymm14, ymm2 + INSTR ymm15, ymm0 + INSTR ymm3, ymm1 + INSTR ymm4, ymm2 + INSTR ymm5, ymm0 + INSTR ymm6, ymm1 + INSTR ymm7, ymm2 + INSTR ymm8, ymm0 + INSTR ymm9, ymm1 + INSTR ymm10, ymm2 + 
INSTR ymm11, ymm0 + INSTR ymm12, ymm1 + INSTR ymm13, ymm2 + INSTR ymm14, ymm0 + INSTR ymm15, ymm1 + INSTR ymm3, ymm2 + INSTR ymm4, ymm0 + INSTR ymm5, ymm1 + INSTR ymm6, ymm2 + INSTR ymm7, ymm0 + INSTR ymm8, ymm1 + INSTR ymm9, ymm2 + INSTR ymm10, ymm0 + INSTR ymm11, ymm1 + INSTR ymm12, ymm2 + INSTR ymm13, ymm0 + INSTR ymm14, ymm1 + INSTR ymm15, ymm2 + INSTR ymm3, ymm0 + INSTR ymm4, ymm1 + INSTR ymm5, ymm2 + INSTR ymm6, ymm0 + INSTR ymm7, ymm1 + INSTR ymm8, ymm2 + INSTR ymm9, ymm0 + INSTR ymm10, ymm1 + INSTR ymm11, ymm2 + INSTR ymm12, ymm0 + INSTR ymm13, ymm1 + INSTR ymm14, ymm2 + INSTR ymm15, ymm0 + INSTR ymm3, ymm1 + INSTR ymm4, ymm2 + INSTR ymm5, ymm0 + INSTR ymm6, ymm1 + INSTR ymm7, ymm2 + INSTR ymm8, ymm0 + INSTR ymm9, ymm1 + INSTR ymm10, ymm2 + INSTR ymm11, ymm0 + INSTR ymm12, ymm1 + INSTR ymm13, ymm2 + INSTR ymm14, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovapd-ymm_ymm.S b/benchmarks/vmovapd-ymm_ymm.S new file mode 100644 index 0000000..ac3137c --- /dev/null +++ b/benchmarks/vmovapd-ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vmovapd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, 
ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovaps-xmm_xmm-TP.S b/benchmarks/vmovaps-xmm_xmm-TP.S new file mode 100644 index 0000000..53df367 --- /dev/null +++ b/benchmarks/vmovaps-xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovaps +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 
32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + INSTR xmm15, xmm1 + INSTR xmm3, xmm2 + INSTR xmm4, xmm0 + INSTR xmm5, xmm1 + INSTR xmm6, xmm2 + INSTR xmm7, xmm0 + INSTR xmm8, xmm1 + INSTR xmm9, xmm2 + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + INSTR xmm13, xmm0 + INSTR xmm14, xmm1 + INSTR xmm15, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovaps-xmm_xmm.S b/benchmarks/vmovaps-xmm_xmm.S new file mode 100644 index 0000000..3e1baac --- /dev/null +++ b/benchmarks/vmovaps-xmm_xmm.S @@ -0,0 +1,108 @@ 
+#define INSTR vmovaps +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR 
xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovhpd-xmm_xmm_mem-TP.S b/benchmarks/vmovhpd-xmm_xmm_mem-TP.S new file mode 100644 index 0000000..11cbaf0 --- /dev/null +++ b/benchmarks/vmovhpd-xmm_xmm_mem-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, 
[rip+PI] + INSTR xmm15, xmm1, [rip+PI] + INSTR xmm3, xmm2, [rip+PI] + INSTR xmm4, xmm0, [rip+PI] + INSTR xmm5, xmm1, [rip+PI] + INSTR xmm6, xmm2, [rip+PI] + INSTR xmm7, xmm0, [rip+PI] + INSTR xmm8, xmm1, [rip+PI] + INSTR xmm9, xmm2, [rip+PI] + INSTR xmm10, xmm0, [rip+PI] + INSTR xmm11, xmm1, [rip+PI] + INSTR xmm12, xmm2, [rip+PI] + INSTR xmm13, xmm0, [rip+PI] + INSTR xmm14, xmm1, [rip+PI] + INSTR xmm15, xmm2, [rip+PI] + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovhpd-xmm_xmm_mem.S b/benchmarks/vmovhpd-xmm_xmm_mem.S new file mode 100644 index 0000000..b423e4a --- /dev/null +++ b/benchmarks/vmovhpd-xmm_xmm_mem.S @@ -0,0 +1,108 @@ +#define INSTR vmovhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # 
all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR 
xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovq-r64_xmm-TP.S b/benchmarks/vmovq-r64_xmm-TP.S new file mode 100644 index 0000000..b80c773 --- /dev/null +++ b/benchmarks/vmovq-r64_xmm-TP.S @@ -0,0 +1,141 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR r11, xmm0 + INSTR r12, xmm1 + INSTR r13, xmm2 + INSTR r14, xmm0 + INSTR r15, xmm1 + INSTR rdx, xmm2 + INSTR r9, xmm0 + INSTR r10, xmm1 + INSTR r11, xmm2 + INSTR r12, xmm0 + INSTR r13, xmm1 + INSTR r14, xmm2 + INSTR r15, xmm0 + INSTR rdx, xmm1 + INSTR r9, xmm2 + INSTR r10, xmm0 + INSTR r11, xmm1 
+ INSTR r12, xmm2 + INSTR r13, xmm0 + INSTR r14, xmm1 + INSTR r15, xmm2 + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR r11, xmm0 + INSTR r12, xmm1 + INSTR r13, xmm2 + INSTR r14, xmm0 + INSTR r15, xmm1 + INSTR rdx, xmm2 + INSTR r9, xmm0 + INSTR r10, xmm1 + INSTR r11, xmm2 + INSTR r12, xmm0 + INSTR r13, xmm1 + INSTR r14, xmm2 + INSTR r15, xmm0 + INSTR rdx, xmm1 + INSTR r9, xmm2 + INSTR r10, xmm0 + INSTR r11, xmm1 + INSTR r12, xmm2 + INSTR r13, xmm0 + INSTR r14, xmm1 + INSTR r15, xmm2 + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR r11, xmm0 + INSTR r12, xmm1 + INSTR r13, xmm2 + INSTR r14, xmm0 + INSTR r15, xmm1 + INSTR rdx, xmm2 + INSTR r9, xmm0 + INSTR r10, xmm1 + INSTR r11, xmm2 + INSTR r12, xmm0 + INSTR r13, xmm1 + INSTR r14, xmm2 + INSTR r15, xmm0 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovq-r64_xmm.S b/benchmarks/vmovq-r64_xmm.S new file mode 100644 index 0000000..029ebc3 --- /dev/null +++ b/benchmarks/vmovq-r64_xmm.S @@ -0,0 +1,141 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 
+ push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovq-xmm_r64-TP.S b/benchmarks/vmovq-xmm_r64-TP.S new file mode 100644 index 0000000..fc7da5a --- /dev/null +++ b/benchmarks/vmovq-xmm_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix 
+.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps the top 10 one-bits, 1111111111 0..0 (54=64-(11-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero -> 0x3FF0000000000000 + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + # Create DP 2.0 with an FP add (integer add of the bit patterns gives 0x7FE0000000000000, not 2.0) + vaddsd xmm1, xmm0, xmm0 + vmovq rbx, xmm1 + # Create DP 0.5 with an FP divide (integer div rax leaves 1, not 0.5) + vdivsd xmm2, xmm0, xmm1 + vmovq rcx, xmm2 + # rax/rbx/rcx = bit patterns of DP 1.0/2.0/0.5 +loop: + inc i + INSTR xmm3, rax + INSTR xmm4, rbx + INSTR xmm5, rcx + INSTR xmm6, rax + INSTR xmm7, rbx + INSTR xmm8, rcx + INSTR xmm9, rax + INSTR xmm10, rbx + INSTR xmm11, rcx + INSTR xmm12, rax + INSTR xmm13, rbx + INSTR xmm14, rcx + INSTR xmm15, rax + INSTR xmm3, rbx + INSTR xmm4, rcx + INSTR xmm5, rax + INSTR xmm6, rbx + INSTR xmm7, rcx + INSTR xmm8, rax + INSTR xmm9, rbx + INSTR xmm10, rcx + INSTR xmm11, rax + INSTR xmm12, rbx + INSTR xmm13, rcx + INSTR xmm14, rax + INSTR xmm15, rbx + INSTR xmm3, rcx + INSTR xmm4, rax + INSTR xmm5, rbx + INSTR xmm6, rcx + INSTR xmm7, rax + INSTR xmm8, rbx + INSTR xmm9, rcx + INSTR xmm10, rax + INSTR xmm11, rbx + INSTR xmm12, rcx + INSTR xmm13, rax + INSTR xmm14, rbx + INSTR xmm15, rcx + INSTR xmm3, rax + INSTR xmm4, rbx + INSTR xmm5, rcx + INSTR xmm6, rax + INSTR xmm7, rbx + INSTR xmm8, rcx + INSTR xmm9, rax + INSTR xmm10, rbx + INSTR xmm11, rcx + INSTR xmm12, rax + INSTR xmm13, rbx + INSTR xmm14, rcx + 
INSTR xmm15, rax + INSTR xmm3, rbx + INSTR xmm4, rcx + INSTR xmm5, rax + INSTR xmm6, rbx + INSTR xmm7, rcx + INSTR xmm8, rax + INSTR xmm9, rbx + INSTR xmm10, rcx + INSTR xmm11, rax + INSTR xmm12, rbx + INSTR xmm13, rcx + INSTR xmm14, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovq-xmm_r64.S b/benchmarks/vmovq-xmm_r64.S new file mode 100644 index 0000000..6a89af7 --- /dev/null +++ b/benchmarks/vmovq-xmm_r64.S @@ -0,0 +1,143 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, 
rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovsd-mem_xmm-TP.S b/benchmarks/vmovsd-mem_xmm-TP.S new file mode 100644 index 0000000..14a1cb6 --- /dev/null +++ b/benchmarks/vmovsd-mem_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical 
right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovsd-mem_xmm.S b/benchmarks/vmovsd-mem_xmm.S new file mode 100644 index 0000000..4f1bfbb --- /dev/null +++ b/benchmarks/vmovsd-mem_xmm.S @@ -0,0 +1,108 @@ +#define INSTR 
vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], 
xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovsd-xmm_mem-TP.S b/benchmarks/vmovsd-xmm_mem-TP.S new file mode 100644 index 0000000..74f7da2 --- /dev/null +++ b/benchmarks/vmovsd-xmm_mem-TP.S @@ -0,0 +1,101 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, 
[rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovsd-xmm_mem.S b/benchmarks/vmovsd-xmm_mem.S new file mode 100644 index 0000000..6447ff8 --- /dev/null +++ b/benchmarks/vmovsd-xmm_mem.S @@ -0,0 +1,101 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, 
[rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovsd-xmm_xmm_xmm-TP.S b/benchmarks/vmovsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..1c847dd --- /dev/null +++ b/benchmarks/vmovsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, 
xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovsd-xmm_xmm_xmm.S b/benchmarks/vmovsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..d31c45a --- /dev/null +++ b/benchmarks/vmovsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR 
xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovss-mem_xmm-TP.S b/benchmarks/vmovss-mem_xmm-TP.S new file mode 100644 index 0000000..226cbb0 --- /dev/null +++ b/benchmarks/vmovss-mem_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovss +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps 
xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovss-mem_xmm.S b/benchmarks/vmovss-mem_xmm.S new file mode 100644 index 0000000..d4c3ee5 --- /dev/null +++ b/benchmarks/vmovss-mem_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmovss +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data 
+ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR 
[rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovupd-xmm_mem-TP.S b/benchmarks/vmovupd-xmm_mem-TP.S new file mode 100644 index 0000000..9c5d7a0 --- /dev/null +++ b/benchmarks/vmovupd-xmm_mem-TP.S @@ -0,0 +1,101 @@ +#define INSTR vmovupd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + 
INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmovupd-xmm_mem.S b/benchmarks/vmovupd-xmm_mem.S new file mode 100644 index 0000000..b5cc153 --- /dev/null +++ b/benchmarks/vmovupd-xmm_mem.S @@ -0,0 +1,101 @@ +#define INSTR vmovupd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] 
+ INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmulpd-ymm_ymm_mem-TP.S b/benchmarks/vmulpd-ymm_ymm_mem-TP.S new file mode 100644 index 0000000..bdbd111 --- /dev/null +++ b/benchmarks/vmulpd-ymm_ymm_mem-TP.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov 
rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, [rip+PI] + INSTR ymm4, ymm1, [rip+PI] + INSTR ymm5, ymm2, [rip+PI] + INSTR ymm6, ymm0, [rip+PI] + INSTR ymm7, ymm1, [rip+PI] + INSTR ymm8, ymm2, [rip+PI] + INSTR ymm9, ymm0, [rip+PI] + INSTR ymm10, ymm1, [rip+PI] + INSTR ymm11, ymm2, [rip+PI] + INSTR ymm12, ymm0, [rip+PI] + INSTR ymm13, ymm1, [rip+PI] + INSTR ymm14, ymm2, [rip+PI] + INSTR ymm15, ymm0, [rip+PI] + INSTR ymm3, ymm1, [rip+PI] + INSTR ymm4, ymm2, [rip+PI] + INSTR ymm5, ymm0, [rip+PI] + INSTR ymm6, ymm1, [rip+PI] + INSTR ymm7, ymm2, [rip+PI] + INSTR ymm8, ymm0, [rip+PI] + INSTR ymm9, ymm1, [rip+PI] + INSTR ymm10, ymm2, [rip+PI] + INSTR ymm11, ymm0, [rip+PI] + INSTR ymm12, ymm1, [rip+PI] + INSTR ymm13, ymm2, [rip+PI] + INSTR ymm14, ymm0, [rip+PI] + INSTR ymm15, ymm1, [rip+PI] + INSTR ymm3, ymm2, [rip+PI] + INSTR ymm4, ymm0, [rip+PI] + INSTR ymm5, ymm1, [rip+PI] + INSTR ymm6, ymm2, [rip+PI] + INSTR ymm7, ymm0, [rip+PI] + INSTR ymm8, ymm1, [rip+PI] + INSTR ymm9, ymm2, [rip+PI] + INSTR ymm10, ymm0, [rip+PI] + INSTR ymm11, ymm1, [rip+PI] + INSTR ymm12, ymm2, [rip+PI] + INSTR ymm13, ymm0, [rip+PI] + INSTR ymm14, ymm1, [rip+PI] + INSTR ymm15, ymm2, [rip+PI] + INSTR ymm3, ymm0, [rip+PI] + INSTR ymm4, ymm1, [rip+PI] + INSTR ymm5, ymm2, [rip+PI] + INSTR ymm6, ymm0, [rip+PI] + INSTR ymm7, ymm1, [rip+PI] + INSTR ymm8, ymm2, [rip+PI] + INSTR ymm9, ymm0, [rip+PI] + INSTR ymm10, ymm1, [rip+PI] + INSTR ymm11, ymm2, [rip+PI] + INSTR ymm12, ymm0, [rip+PI] + INSTR ymm13, ymm1, [rip+PI] + INSTR ymm14, ymm2, [rip+PI] + INSTR 
ymm15, ymm0, [rip+PI] + INSTR ymm3, ymm1, [rip+PI] + INSTR ymm4, ymm2, [rip+PI] + INSTR ymm5, ymm0, [rip+PI] + INSTR ymm6, ymm1, [rip+PI] + INSTR ymm7, ymm2, [rip+PI] + INSTR ymm8, ymm0, [rip+PI] + INSTR ymm9, ymm1, [rip+PI] + INSTR ymm10, ymm2, [rip+PI] + INSTR ymm11, ymm0, [rip+PI] + INSTR ymm12, ymm1, [rip+PI] + INSTR ymm13, ymm2, [rip+PI] + INSTR ymm14, ymm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmulpd-ymm_ymm_mem.S b/benchmarks/vmulpd-ymm_ymm_mem.S new file mode 100644 index 0000000..3193575 --- /dev/null +++ b/benchmarks/vmulpd-ymm_ymm_mem.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR 
ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmulpd-ymm_ymm_ymm-TP.S b/benchmarks/vmulpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..029acd9 --- /dev/null +++ b/benchmarks/vmulpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the measured loop; the iteration count is read from edi + push rbp + mov rbp, rsp + xor i, i # iteration counter starts at zero + test N, N + jle done # skip the loop when the count register is zero or negative + # create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps the top 10 bits of each 64-bit lane set and clears the low 54 (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa + # expand from 128 to 256 bits: copy the low half of ymm0 into its upper half + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 # self-move, value unchanged + vmovaps ymm1, ymm0 # ymm1 = 1.0 in every lane + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 # 1.0 plus 1.0 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 # 1.0 divided by 2.0 +loop: # throughput body: destinations rotate over ymm3..ymm15 while the sources ymm0..ymm2 are never written, so no instruction depends on another + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR
ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmulpd-ymm_ymm_ymm.S b/benchmarks/vmulpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..830c26d --- /dev/null +++ b/benchmarks/vmulpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, 
ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmulsd-xmm_xmm_mem-TP.S b/benchmarks/vmulsd-xmm_xmm_mem-TP.S new file mode 100644 index 0000000..5a0359f --- /dev/null +++ b/benchmarks/vmulsd-xmm_xmm_mem-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 
1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps the top 10 bits of each 64-bit lane set and clears the low 54 (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa, i.e. 1.0 + # copy DP 1.0 + vmovaps xmm0, xmm0 # self-move, value unchanged + vmovaps xmm1, xmm0 # xmm1 = 1.0 in both lanes + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 # 1.0 plus 1.0 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 # 1.0 divided by 2.0 +loop: # throughput body: destinations rotate over xmm3..xmm15 while the register sources xmm0..xmm2 and the PI memory operand are never written, so no instruction depends on another + inc i + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + INSTR xmm15, xmm1, [rip+PI] + INSTR xmm3, xmm2, [rip+PI] + INSTR xmm4, xmm0, [rip+PI] + INSTR xmm5, xmm1, [rip+PI] + INSTR xmm6, xmm2, [rip+PI] + INSTR xmm7, xmm0, [rip+PI] + INSTR xmm8, xmm1, [rip+PI] + INSTR xmm9, xmm2, [rip+PI] + INSTR xmm10, xmm0, [rip+PI] + INSTR xmm11, xmm1, [rip+PI] + INSTR xmm12, xmm2, [rip+PI] + INSTR xmm13, xmm0, [rip+PI] + INSTR xmm14, xmm1, [rip+PI] + INSTR xmm15, xmm2, [rip+PI] + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR
xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmulsd-xmm_xmm_mem.S b/benchmarks/vmulsd-xmm_xmm_mem.S new file mode 100644 index 0000000..4b70252 --- /dev/null +++ b/benchmarks/vmulsd-xmm_xmm_mem.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, 
xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmulsd-xmm_xmm_xmm-TP.S b/benchmarks/vmulsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..c2dc870 --- /dev/null +++ b/benchmarks/vmulsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, 
@function +.align 32 +latency: # runs the measured loop; the iteration count is read from edi + push rbp + mov rbp, rsp + xor i, i # iteration counter starts at zero + test N, N + jle done # skip the loop when the count register is zero or negative + # create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: keeps the top 10 bits of each 64-bit lane set and clears the low 54 (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa + # copy DP 1.0 + vmovaps xmm0, xmm0 # self-move, value unchanged + vmovaps xmm1, xmm0 # xmm1 = 1.0 in both lanes + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 # 1.0 plus 1.0 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 # 1.0 divided by 2.0 +loop: # throughput body: destinations rotate over xmm3..xmm15 while the sources xmm0..xmm2 are never written, so no instruction depends on another + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 +
INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmulsd-xmm_xmm_xmm.S b/benchmarks/vmulsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..97d4bac --- /dev/null +++ b/benchmarks/vmulsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 
+ INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmulss-xmm_xmm_xmm-TP.S b/benchmarks/vmulss-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..a8b7b5b --- /dev/null +++ b/benchmarks/vmulss-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmulss +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + 
# Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 # 1.0 divided by 2.0 (double precision) +loop: # throughput body: destinations rotate over xmm3..xmm15 while the sources xmm0..xmm2 are never written, so no instruction depends on another. NOTE(review): this file benchmarks vmulss (scalar single) but xmm0..xmm2 hold double-precision bit patterns whose low 32 bits are zero, so every single-precision operand reads as 0.0 - confirm this is intended + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop # signed compare: repeat while the counter is below the requested count +done: + mov rsp, rbp # undo the frame set up at entry + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vmulss-xmm_xmm_xmm.S b/benchmarks/vmulss-xmm_xmm_xmm.S new file mode 100644 index 0000000..4a8d582 --- /dev/null +++ b/benchmarks/vmulss-xmm_xmm_xmm.S @@ -0,0 +1,108 @@
+#define INSTR vmulss +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR 
xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vsubpd-ymm_ymm_ymm-TP.S b/benchmarks/vsubpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..2eca166 --- /dev/null +++ b/benchmarks/vsubpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vsubpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR 
ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vsubpd-ymm_ymm_ymm.S b/benchmarks/vsubpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..96d3fe9 --- /dev/null +++ b/benchmarks/vsubpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vsubpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 
0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, 
ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vsubsd-xmm_xmm_xmm-TP.S b/benchmarks/vsubsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..ceb9507 --- /dev/null +++ b/benchmarks/vsubsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vsubsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, 
xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vsubsd-xmm_xmm_xmm.S b/benchmarks/vsubsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..b7429a4 --- /dev/null +++ b/benchmarks/vsubsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vsubsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa 
bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vunpckhpd-xmm_xmm_xmm-TP.S b/benchmarks/vunpckhpd-xmm_xmm_xmm-TP.S new file mode 100644 
index 0000000..1d99838 --- /dev/null +++ b/benchmarks/vunpckhpd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vunpckhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, 
xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vunpckhpd-xmm_xmm_xmm.S b/benchmarks/vunpckhpd-xmm_xmm_xmm.S new file mode 100644 index 0000000..8807655 --- /dev/null +++ b/benchmarks/vunpckhpd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vunpckhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, 
xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vxorpd-xmm_xmm_xmm-TP.S b/benchmarks/vxorpd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..2188e73 --- /dev/null +++ b/benchmarks/vxorpd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vxorpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 
0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + 
INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vxorpd-xmm_xmm_xmm.S b/benchmarks/vxorpd-xmm_xmm_xmm.S new file mode 100644 index 0000000..eb1d6c9 --- /dev/null +++ b/benchmarks/vxorpd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vxorpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 
+ INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vxorpd-ymm_ymm_ymm-TP.S b/benchmarks/vxorpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..3a7e7fe --- /dev/null +++ b/benchmarks/vxorpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vxorpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # 
expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git 
a/benchmarks/vxorpd-ymm_ymm_ymm.S b/benchmarks/vxorpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..8ab0f92 --- /dev/null +++ b/benchmarks/vxorpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vxorpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, 
ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vxorps-xmm_xmm_xmm-TP.S b/benchmarks/vxorps-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..77475af --- /dev/null +++ b/benchmarks/vxorps-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vxorps +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR 
xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/vxorps-xmm_xmm_xmm.S b/benchmarks/vxorps-xmm_xmm_xmm.S new file mode 100644 index 0000000..f1a1a8c --- /dev/null +++ b/benchmarks/vxorps-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vxorps +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, 
xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/xor-r32_r32-TP.S b/benchmarks/xor-r32_r32-TP.S new file mode 100644 index 0000000..bf5757b --- /dev/null +++ b/benchmarks/xor-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR xor +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + 
INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/benchmarks/xor-r32_r32.S b/benchmarks/xor-r32_r32.S new file mode 100644 index 0000000..652a935 --- /dev/null +++ b/benchmarks/xor-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR xor +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push 
rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file