add sqrt to AVX benchmarks

This commit is contained in:
Johannes Hofmann
2017-06-06 10:19:25 +02:00
parent 7c5b855eaa
commit 5c4db847bd
12 changed files with 724 additions and 0 deletions

59
src/AVX/sqrtsd-TP.S Normal file
View File

@@ -0,0 +1,59 @@
# Throughput kernel: six independent legacy-SSE sqrtsd instructions per
# loop iteration.
#
# ninst   - exported 32-bit value, set to the per-iteration instruction count
# latency - exported entry point; edi holds the iteration count (presumably
#           the first integer argument under the System V AMD64 ABI - confirm
#           against the caller)
#
# Writes r8d, xmm0-xmm5, xmm10-xmm15 and the flags, and zeroes the upper
# halves of all YMM registers; no return value is set.
#define INSTR sqrtsd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i
        test    N, N
        jle     done                    # non-positive count: skip setup and loop
        # Legacy-SSE instructions follow the VEX-encoded setup.  Clear the
        # upper YMM halves first so that state left dirty by the caller cannot
        # add AVX/SSE transition penalties to the measured loop.
        vzeroupper
        # create DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq  xmm0, xmm0, 54          # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        vaddsd  xmm1, xmm0, xmm0        # create 2.0
        vaddsd  xmm2, xmm0, xmm1        # create 3.0
        vaddsd  xmm4, xmm1, xmm1        # create 4.0
        vaddsd  xmm4, xmm4, xmm4        # create 8.0
        vaddsd  xmm4, xmm4, xmm4        # create 16.0
        vaddsd  xmm4, xmm4, xmm4        # create 32.0
        vaddsd  xmm4, xmm4, xmm4        # create 64.0
        vaddsd  xmm4, xmm4, xmm4        # create 128.0
        vaddsd  xmm4, xmm4, xmm4        # create 256.0
        vaddsd  xmm4, xmm4, xmm4        # create 512.0
        vaddsd  xmm4, xmm4, xmm4        # create 1024.0
        vdivsd  xmm1, xmm4, xmm2        # create 341.3333
        vdivsd  xmm2, xmm0, xmm1        # create 1/341.3333 (not read again below)
        vaddsd  xmm0, xmm1, xmm1        # create 2*341.3333
        # replicate the operand (low lane only) into xmm1-xmm5
        movsd   xmm1, xmm0
        movsd   xmm2, xmm0
        movsd   xmm3, xmm0
        movsd   xmm4, xmm0
        movsd   xmm5, xmm0
loop:
        # six square roots with distinct sources and distinct destinations
        inc     i
        INSTR   xmm10, xmm0
        INSTR   xmm11, xmm1
        INSTR   xmm12, xmm2
        cmp     i, N                    # sqrt does not modify the flags, so jl below still sees this compare
        INSTR   xmm13, xmm3
        INSTR   xmm14, xmm4
        INSTR   xmm15, xmm5
        jl      loop
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

61
src/AVX/sqrtsd.S Normal file
View File

@@ -0,0 +1,61 @@
# Latency kernel for the legacy-SSE scalar sqrtsd: the loop is one dependency
# chain of six sqrtsd instructions, each followed by an addsd of a constant.
#
# ninst   - exported 32-bit value, set to the number of sqrtsd per iteration
# latency - exported entry point; edi holds the iteration count (presumably
#           the first integer argument under the System V AMD64 ABI - confirm
#           against the caller)
#
# Writes r8d, xmm0-xmm5, xmm10 and the flags, and zeroes the upper halves of
# all YMM registers; no return value is set.
#define INSTR sqrtsd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i
        test    N, N
        jle     done                    # non-positive count: skip setup and loop
        # Legacy-SSE instructions follow the VEX-encoded setup.  Clear the
        # upper YMM halves first so that state left dirty by the caller cannot
        # add AVX/SSE transition penalties to the measured chain.
        vzeroupper
        # create DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq  xmm0, xmm0, 54          # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        vaddpd  xmm1, xmm0, xmm0        # create 2.0
        vaddpd  xmm2, xmm0, xmm1        # create 3.0
        vaddpd  xmm4, xmm1, xmm1        # create 4.0
        vaddpd  xmm4, xmm4, xmm4        # create 8.0
        vaddpd  xmm4, xmm4, xmm4        # create 16.0
        vaddpd  xmm4, xmm4, xmm4        # create 32.0
        vaddpd  xmm4, xmm4, xmm4        # create 64.0
        vaddpd  xmm4, xmm4, xmm4        # create 128.0
        vaddpd  xmm4, xmm4, xmm4        # create 256.0
        vaddpd  xmm4, xmm4, xmm4        # create 512.0
        vaddpd  xmm4, xmm4, xmm4        # create 1024.0
        vdivpd  xmm1, xmm4, xmm2        # create 341.3333
        vdivpd  xmm2, xmm0, xmm1        # create 1/341.3333 (overwritten in the loop before being read)
        vaddpd  xmm0, xmm1, xmm1        # create 2*341.3333
        vmovapd xmm10, xmm0             # save value
loop:
        # Each sqrt reads the result of the preceding add; the final add
        # writes xmm0, which feeds the first sqrt of the next iteration.
        # NOTE(review): the add presumably keeps the chain value from
        # converging toward 1.0; its latency is part of the measured chain -
        # confirm that the harness accounts for it.
        inc     i
        INSTR   xmm1, xmm0
        addsd   xmm1, xmm10
        INSTR   xmm2, xmm1
        addsd   xmm2, xmm10
        INSTR   xmm3, xmm2
        addsd   xmm3, xmm10
        cmp     i, N                    # the SSE instructions below leave the flags untouched for jl
        INSTR   xmm4, xmm3
        addsd   xmm4, xmm10
        INSTR   xmm5, xmm4
        addsd   xmm5, xmm10
        INSTR   xmm0, xmm5
        addsd   xmm0, xmm10
        jl      loop
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

59
src/AVX/sqrtss-TP.S Normal file
View File

@@ -0,0 +1,59 @@
# Throughput kernel: six independent legacy-SSE sqrtss instructions per
# loop iteration.
#
# ninst   - exported 32-bit value, set to the per-iteration instruction count
# latency - exported entry point; edi holds the iteration count (presumably
#           the first integer argument under the System V AMD64 ABI - confirm
#           against the caller)
#
# Writes r8d, xmm0-xmm5, xmm10-xmm15 and the flags, and zeroes the upper
# halves of all YMM registers; no return value is set.
#define INSTR sqrtss
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i
        test    N, N
        jle     done                    # non-positive count: skip setup and loop
        # Legacy-SSE instructions follow the VEX-encoded setup.  Clear the
        # upper YMM halves first so that state left dirty by the caller cannot
        # add AVX/SSE transition penalties to the measured loop.
        vzeroupper
        # create SP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpslld  xmm0, xmm0, 25          # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        vaddss  xmm1, xmm0, xmm0        # create 2.0
        vaddss  xmm2, xmm0, xmm1        # create 3.0
        vaddss  xmm4, xmm1, xmm1        # create 4.0
        vaddss  xmm4, xmm4, xmm4        # create 8.0
        vaddss  xmm4, xmm4, xmm4        # create 16.0
        vaddss  xmm4, xmm4, xmm4        # create 32.0
        vaddss  xmm4, xmm4, xmm4        # create 64.0
        vaddss  xmm4, xmm4, xmm4        # create 128.0
        vaddss  xmm4, xmm4, xmm4        # create 256.0
        vaddss  xmm4, xmm4, xmm4        # create 512.0
        vaddss  xmm4, xmm4, xmm4        # create 1024.0
        vdivss  xmm1, xmm4, xmm2        # create 341.3333
        vdivss  xmm2, xmm0, xmm1        # create 1/341.3333 (not read again below)
        vaddss  xmm0, xmm1, xmm1        # create 2*341.3333
        # replicate the operand (low lane only) into xmm1-xmm5
        movss   xmm1, xmm0
        movss   xmm2, xmm0
        movss   xmm3, xmm0
        movss   xmm4, xmm0
        movss   xmm5, xmm0
loop:
        # six square roots with distinct sources and distinct destinations
        inc     i
        INSTR   xmm10, xmm0
        INSTR   xmm11, xmm1
        INSTR   xmm12, xmm2
        cmp     i, N                    # sqrt does not modify the flags, so jl below still sees this compare
        INSTR   xmm13, xmm3
        INSTR   xmm14, xmm4
        INSTR   xmm15, xmm5
        jl      loop
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

61
src/AVX/sqrtss.S Normal file
View File

@@ -0,0 +1,61 @@
# Latency kernel for the legacy-SSE scalar sqrtss: the loop is one dependency
# chain of six sqrtss instructions, each followed by an addss of a constant.
#
# ninst   - exported 32-bit value, set to the number of sqrtss per iteration
# latency - exported entry point; edi holds the iteration count (presumably
#           the first integer argument under the System V AMD64 ABI - confirm
#           against the caller)
#
# Writes r8d, xmm0-xmm5, xmm10 and the flags, and zeroes the upper halves of
# all YMM registers; no return value is set.
#define INSTR sqrtss
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i
        test    N, N
        jle     done                    # non-positive count: skip setup and loop
        # Legacy-SSE instructions follow the VEX-encoded setup.  Clear the
        # upper YMM halves first so that state left dirty by the caller cannot
        # add AVX/SSE transition penalties to the measured chain.
        vzeroupper
        # create SP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpslld  xmm0, xmm0, 25          # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        vaddps  xmm1, xmm0, xmm0        # create 2.0
        vaddps  xmm2, xmm0, xmm1        # create 3.0
        vaddps  xmm4, xmm1, xmm1        # create 4.0
        vaddps  xmm4, xmm4, xmm4        # create 8.0
        vaddps  xmm4, xmm4, xmm4        # create 16.0
        vaddps  xmm4, xmm4, xmm4        # create 32.0
        vaddps  xmm4, xmm4, xmm4        # create 64.0
        vaddps  xmm4, xmm4, xmm4        # create 128.0
        vaddps  xmm4, xmm4, xmm4        # create 256.0
        vaddps  xmm4, xmm4, xmm4        # create 512.0
        vaddps  xmm4, xmm4, xmm4        # create 1024.0
        vdivps  xmm1, xmm4, xmm2        # create 341.3333
        vdivps  xmm2, xmm0, xmm1        # create 1/341.3333 (overwritten in the loop before being read)
        vaddps  xmm0, xmm1, xmm1        # create 2*341.3333
        vmovaps xmm10, xmm0             # save value
loop:
        # Each sqrt reads the result of the preceding add; the final add
        # writes xmm0, which feeds the first sqrt of the next iteration.
        # NOTE(review): the add presumably keeps the chain value from
        # converging toward 1.0; its latency is part of the measured chain -
        # confirm that the harness accounts for it.
        inc     i
        INSTR   xmm1, xmm0
        addss   xmm1, xmm10
        INSTR   xmm2, xmm1
        addss   xmm2, xmm10
        INSTR   xmm3, xmm2
        addss   xmm3, xmm10
        cmp     i, N                    # the SSE instructions below leave the flags untouched for jl
        INSTR   xmm4, xmm3
        addss   xmm4, xmm10
        INSTR   xmm5, xmm4
        addss   xmm5, xmm10
        INSTR   xmm0, xmm5
        addss   xmm0, xmm10
        jl      loop
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

60
src/AVX/vsqrtpd-avx-TP.S Normal file
View File

@@ -0,0 +1,60 @@
# Throughput kernel: six independent 256-bit vsqrtpd instructions per loop
# iteration.
#
# ninst   - exported 32-bit value, set to the per-iteration instruction count
# latency - exported entry point; edi holds the iteration count (presumably
#           the first integer argument under the System V AMD64 ABI - confirm
#           against the caller)
#
# Writes r8d, ymm0-ymm5, ymm10-ymm15 and the flags; the upper YMM halves are
# zeroed again before returning.  No return value is set.
#define INSTR vsqrtpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i
        test    N, N
        jle     done                    # non-positive count: skip setup and loop
        # create DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq  xmm0, xmm0, 54          # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1 # copy the low 128 bits into the upper half
        vaddpd  ymm1, ymm0, ymm0        # create 2.0
        vaddpd  ymm2, ymm0, ymm1        # create 3.0
        vaddpd  ymm4, ymm1, ymm1        # create 4.0
        vaddpd  ymm4, ymm4, ymm4        # create 8.0
        vaddpd  ymm4, ymm4, ymm4        # create 16.0
        vaddpd  ymm4, ymm4, ymm4        # create 32.0
        vaddpd  ymm4, ymm4, ymm4        # create 64.0
        vaddpd  ymm4, ymm4, ymm4        # create 128.0
        vaddpd  ymm4, ymm4, ymm4        # create 256.0
        vaddpd  ymm4, ymm4, ymm4        # create 512.0
        vaddpd  ymm4, ymm4, ymm4        # create 1024.0
        vdivpd  ymm1, ymm4, ymm2        # create 341.3333
        vdivpd  ymm2, ymm0, ymm1        # create 1/341.3333 (overwritten below before being read)
        vaddpd  ymm0, ymm1, ymm1        # create 2*341.3333
        # replicate the operand into ymm1-ymm5
        vmovapd ymm1, ymm0
        vmovapd ymm2, ymm0
        vmovapd ymm3, ymm0
        vmovapd ymm4, ymm0
        vmovapd ymm5, ymm0
loop:
        # six square roots with distinct sources and distinct destinations
        inc     i
        INSTR   ymm10, ymm0
        INSTR   ymm11, ymm1
        INSTR   ymm12, ymm2
        cmp     i, N                    # sqrt does not modify the flags, so jl below still sees this compare
        INSTR   ymm13, ymm3
        INSTR   ymm14, ymm4
        INSTR   ymm15, ymm5
        jl      loop
done:
        # The loop dirtied the upper YMM halves; clear them so that SSE code
        # executed after the return does not pay AVX/SSE transition penalties.
        vzeroupper
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

62
src/AVX/vsqrtpd-avx.S Normal file
View File

@@ -0,0 +1,62 @@
# Latency kernel for the 256-bit vsqrtpd: the loop is one dependency chain of
# six vsqrtpd instructions, each followed by a vaddpd of a constant.
#
# ninst   - exported 32-bit value, set to the number of vsqrtpd per iteration
# latency - exported entry point; edi holds the iteration count (presumably
#           the first integer argument under the System V AMD64 ABI - confirm
#           against the caller)
#
# Writes r8d, ymm0-ymm5, ymm10 and the flags; the upper YMM halves are zeroed
# again before returning.  No return value is set.
#define INSTR vsqrtpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i
        test    N, N
        jle     done                    # non-positive count: skip setup and loop
        # create DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq  xmm0, xmm0, 54          # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1 # copy the low 128 bits into the upper half
        vaddpd  ymm1, ymm0, ymm0        # create 2.0
        vaddpd  ymm2, ymm0, ymm1        # create 3.0
        vaddpd  ymm4, ymm1, ymm1        # create 4.0
        vaddpd  ymm4, ymm4, ymm4        # create 8.0
        vaddpd  ymm4, ymm4, ymm4        # create 16.0
        vaddpd  ymm4, ymm4, ymm4        # create 32.0
        vaddpd  ymm4, ymm4, ymm4        # create 64.0
        vaddpd  ymm4, ymm4, ymm4        # create 128.0
        vaddpd  ymm4, ymm4, ymm4        # create 256.0
        vaddpd  ymm4, ymm4, ymm4        # create 512.0
        vaddpd  ymm4, ymm4, ymm4        # create 1024.0
        vdivpd  ymm1, ymm4, ymm2        # create 341.3333
        vdivpd  ymm2, ymm0, ymm1        # create 1/341.3333 (overwritten in the loop before being read)
        vaddpd  ymm0, ymm1, ymm1        # create 2*341.3333
        vmovapd ymm10, ymm0             # save value
loop:
        # Each sqrt reads the result of the preceding add; the final add
        # writes ymm0, which feeds the first sqrt of the next iteration.
        # NOTE(review): the add presumably keeps the chain value from
        # converging toward 1.0; its latency is part of the measured chain -
        # confirm that the harness accounts for it.
        inc     i
        INSTR   ymm1, ymm0
        vaddpd  ymm1, ymm1, ymm10
        INSTR   ymm2, ymm1
        vaddpd  ymm2, ymm2, ymm10
        INSTR   ymm3, ymm2
        vaddpd  ymm3, ymm3, ymm10
        cmp     i, N                    # the AVX instructions below leave the flags untouched for jl
        INSTR   ymm4, ymm3
        vaddpd  ymm4, ymm4, ymm10
        INSTR   ymm5, ymm4
        vaddpd  ymm5, ymm5, ymm10
        INSTR   ymm0, ymm5
        vaddpd  ymm0, ymm0, ymm10
        jl      loop
done:
        # The loop dirtied the upper YMM halves; clear them so that SSE code
        # executed after the return does not pay AVX/SSE transition penalties.
        vzeroupper
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

59
src/AVX/vsqrtpd-sse-TP.S Normal file
View File

@@ -0,0 +1,59 @@
# Throughput kernel for the 128-bit form of vsqrtpd.  Every loop iteration
# issues six square roots whose sources (xmm0-xmm5) and destinations
# (xmm10-xmm15) are all distinct.
#
# Exported symbols:
#   ninst   - 32-bit word holding the per-iteration instruction count
#   latency - entry point; the iteration count is read from edi
#
# Registers written: r8d, xmm0-xmm5, xmm10-xmm15, flags.  No return value.
#define INSTR vsqrtpd
#define NINST 6
#define N edi
#define i r8d
        .intel_syntax noprefix

        .data
        .global ninst
ninst:
        .int    NINST

        .text
        .global latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # iteration counter starts at zero
        test    N, N
        jle     .Lexit                  # count <= 0: nothing to run

        # Build DP 1.0 from an all-ones pattern: the left shift keeps the top
        # ten bits, the right shift moves them into the exponent field.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        # Operand value: 2 * (1024 / 3)
        vaddpd  xmm1, xmm0, xmm0        # 2.0
        vaddpd  xmm4, xmm1, xmm1        # 4.0
        vaddpd  xmm2, xmm0, xmm1        # 3.0
        vaddpd  xmm4, xmm4, xmm4        # 8.0
        vaddpd  xmm4, xmm4, xmm4        # 16.0
        vaddpd  xmm4, xmm4, xmm4        # 32.0
        vaddpd  xmm4, xmm4, xmm4        # 64.0
        vaddpd  xmm4, xmm4, xmm4        # 128.0
        vaddpd  xmm4, xmm4, xmm4        # 256.0
        vaddpd  xmm4, xmm4, xmm4        # 512.0
        vaddpd  xmm4, xmm4, xmm4        # 1024.0
        vdivpd  xmm1, xmm4, xmm2        # 1024 / 3
        vdivpd  xmm2, xmm0, xmm1        # reciprocal; replaced by the copy below
        vaddpd  xmm0, xmm1, xmm1        # 2 * (1024 / 3)

        # Same operand in all six source registers
        vmovapd xmm5, xmm0
        vmovapd xmm4, xmm0
        vmovapd xmm3, xmm0
        vmovapd xmm2, xmm0
        vmovapd xmm1, xmm0

.Lagain:
        inc     i
        INSTR   xmm10, xmm0
        INSTR   xmm11, xmm1
        INSTR   xmm12, xmm2
        cmp     i, N                    # flags survive the remaining sqrt ops
        INSTR   xmm13, xmm3
        INSTR   xmm14, xmm4
        INSTR   xmm15, xmm5
        jl      .Lagain

.Lexit:
        leave                           # restore rsp and rbp
        ret
        .size   latency, .-latency

61
src/AVX/vsqrtpd-sse.S Normal file
View File

@@ -0,0 +1,61 @@
# Latency kernel for the 128-bit form of vsqrtpd.  The loop body is a single
# dependency chain: six square roots, each followed by a vaddpd with a saved
# constant, with the last result feeding the next iteration.
#
# Exported symbols:
#   ninst   - 32-bit word holding the number of square roots per iteration
#   latency - entry point; the iteration count is read from edi
#
# Registers written: r8d, xmm0-xmm5, xmm10, flags.  No return value.
#define INSTR vsqrtpd
#define NINST 6
#define N edi
#define i r8d
        .intel_syntax noprefix

        .data
        .global ninst
ninst:
        .int    NINST

        .text
        .global latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # iteration counter starts at zero
        test    N, N
        jle     .Lexit                  # count <= 0: nothing to run

        # Build DP 1.0 from an all-ones pattern: the left shift keeps the top
        # ten bits, the right shift moves them into the exponent field.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        # Start value and addend: 2 * (1024 / 3)
        vaddpd  xmm1, xmm0, xmm0        # 2.0
        vaddpd  xmm4, xmm1, xmm1        # 4.0
        vaddpd  xmm2, xmm0, xmm1        # 3.0
        vaddpd  xmm4, xmm4, xmm4        # 8.0
        vaddpd  xmm4, xmm4, xmm4        # 16.0
        vaddpd  xmm4, xmm4, xmm4        # 32.0
        vaddpd  xmm4, xmm4, xmm4        # 64.0
        vaddpd  xmm4, xmm4, xmm4        # 128.0
        vaddpd  xmm4, xmm4, xmm4        # 256.0
        vaddpd  xmm4, xmm4, xmm4        # 512.0
        vaddpd  xmm4, xmm4, xmm4        # 1024.0
        vdivpd  xmm1, xmm4, xmm2        # 1024 / 3
        vdivpd  xmm2, xmm0, xmm1        # reciprocal; replaced inside the loop
        vaddpd  xmm0, xmm1, xmm1        # 2 * (1024 / 3)
        vmovapd xmm10, xmm0             # keep a copy as the loop addend

.Lagain:
        inc     i
        INSTR   xmm1, xmm0              # chain link 1
        vaddpd  xmm1, xmm1, xmm10
        INSTR   xmm2, xmm1              # chain link 2
        vaddpd  xmm2, xmm2, xmm10
        INSTR   xmm3, xmm2              # chain link 3
        vaddpd  xmm3, xmm3, xmm10
        cmp     i, N                    # flags survive the remaining vector ops
        INSTR   xmm4, xmm3              # chain link 4
        vaddpd  xmm4, xmm4, xmm10
        INSTR   xmm5, xmm4              # chain link 5
        vaddpd  xmm5, xmm5, xmm10
        INSTR   xmm0, xmm5              # chain link 6, closes the chain
        vaddpd  xmm0, xmm0, xmm10
        jl      .Lagain

.Lexit:
        leave                           # restore rsp and rbp
        ret
        .size   latency, .-latency

60
src/AVX/vsqrtps-avx-TP.S Normal file
View File

@@ -0,0 +1,60 @@
# Throughput kernel: six independent 256-bit vsqrtps instructions per loop
# iteration.
#
# ninst   - exported 32-bit value, set to the per-iteration instruction count
# latency - exported entry point; edi holds the iteration count (presumably
#           the first integer argument under the System V AMD64 ABI - confirm
#           against the caller)
#
# Writes r8d, ymm0-ymm5, ymm10-ymm15 and the flags; the upper YMM halves are
# zeroed again before returning.  No return value is set.
#define INSTR vsqrtps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i
        test    N, N
        jle     done                    # non-positive count: skip setup and loop
        # create SP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpslld  xmm0, xmm0, 25          # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1 # copy the low 128 bits into the upper half
        vaddps  ymm1, ymm0, ymm0        # create 2.0
        vaddps  ymm2, ymm0, ymm1        # create 3.0
        vaddps  ymm4, ymm1, ymm1        # create 4.0
        vaddps  ymm4, ymm4, ymm4        # create 8.0
        vaddps  ymm4, ymm4, ymm4        # create 16.0
        vaddps  ymm4, ymm4, ymm4        # create 32.0
        vaddps  ymm4, ymm4, ymm4        # create 64.0
        vaddps  ymm4, ymm4, ymm4        # create 128.0
        vaddps  ymm4, ymm4, ymm4        # create 256.0
        vaddps  ymm4, ymm4, ymm4        # create 512.0
        vaddps  ymm4, ymm4, ymm4        # create 1024.0
        vdivps  ymm1, ymm4, ymm2        # create 341.3333
        vdivps  ymm2, ymm0, ymm1        # create 1/341.3333 (overwritten below before being read)
        vaddps  ymm0, ymm1, ymm1        # create 2*341.3333
        # replicate the operand into ymm1-ymm5
        vmovaps ymm1, ymm0
        vmovaps ymm2, ymm0
        vmovaps ymm3, ymm0
        vmovaps ymm4, ymm0
        vmovaps ymm5, ymm0
loop:
        # six square roots with distinct sources and distinct destinations
        inc     i
        INSTR   ymm10, ymm0
        INSTR   ymm11, ymm1
        INSTR   ymm12, ymm2
        cmp     i, N                    # sqrt does not modify the flags, so jl below still sees this compare
        INSTR   ymm13, ymm3
        INSTR   ymm14, ymm4
        INSTR   ymm15, ymm5
        jl      loop
done:
        # The loop dirtied the upper YMM halves; clear them so that SSE code
        # executed after the return does not pay AVX/SSE transition penalties.
        vzeroupper
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

62
src/AVX/vsqrtps-avx.S Normal file
View File

@@ -0,0 +1,62 @@
# Latency kernel for the 256-bit vsqrtps: the loop is one dependency chain of
# six vsqrtps instructions, each followed by a vaddps of a constant.
#
# ninst   - exported 32-bit value, set to the number of vsqrtps per iteration
# latency - exported entry point; edi holds the iteration count (presumably
#           the first integer argument under the System V AMD64 ABI - confirm
#           against the caller)
#
# Writes r8d, ymm0-ymm5, ymm10 and the flags; the upper YMM halves are zeroed
# again before returning.  No return value is set.
#define INSTR vsqrtps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i
        test    N, N
        jle     done                    # non-positive count: skip setup and loop
        # create SP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpslld  xmm0, xmm0, 25          # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1 # copy the low 128 bits into the upper half
        vaddps  ymm1, ymm0, ymm0        # create 2.0
        vaddps  ymm2, ymm0, ymm1        # create 3.0
        vaddps  ymm4, ymm1, ymm1        # create 4.0
        vaddps  ymm4, ymm4, ymm4        # create 8.0
        vaddps  ymm4, ymm4, ymm4        # create 16.0
        vaddps  ymm4, ymm4, ymm4        # create 32.0
        vaddps  ymm4, ymm4, ymm4        # create 64.0
        vaddps  ymm4, ymm4, ymm4        # create 128.0
        vaddps  ymm4, ymm4, ymm4        # create 256.0
        vaddps  ymm4, ymm4, ymm4        # create 512.0
        vaddps  ymm4, ymm4, ymm4        # create 1024.0
        vdivps  ymm1, ymm4, ymm2        # create 341.3333
        vdivps  ymm2, ymm0, ymm1        # create 1/341.3333 (overwritten in the loop before being read)
        vaddps  ymm0, ymm1, ymm1        # create 2*341.3333
        vmovaps ymm10, ymm0             # save value
loop:
        # Each sqrt reads the result of the preceding add; the final add
        # writes ymm0, which feeds the first sqrt of the next iteration.
        # NOTE(review): the add presumably keeps the chain value from
        # converging toward 1.0; its latency is part of the measured chain -
        # confirm that the harness accounts for it.
        inc     i
        INSTR   ymm1, ymm0
        vaddps  ymm1, ymm1, ymm10
        INSTR   ymm2, ymm1
        vaddps  ymm2, ymm2, ymm10
        INSTR   ymm3, ymm2
        vaddps  ymm3, ymm3, ymm10
        cmp     i, N                    # the AVX instructions below leave the flags untouched for jl
        INSTR   ymm4, ymm3
        vaddps  ymm4, ymm4, ymm10
        INSTR   ymm5, ymm4
        vaddps  ymm5, ymm5, ymm10
        INSTR   ymm0, ymm5
        vaddps  ymm0, ymm0, ymm10
        jl      loop
done:
        # The loop dirtied the upper YMM halves; clear them so that SSE code
        # executed after the return does not pay AVX/SSE transition penalties.
        vzeroupper
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

59
src/AVX/vsqrtps-sse-TP.S Normal file
View File

@@ -0,0 +1,59 @@
# Throughput kernel for the 128-bit form of vsqrtps.  Every loop iteration
# issues six square roots whose sources (xmm0-xmm5) and destinations
# (xmm10-xmm15) are all distinct.
#
# Exported symbols:
#   ninst   - 32-bit word holding the per-iteration instruction count
#   latency - entry point; the iteration count is read from edi
#
# Registers written: r8d, xmm0-xmm5, xmm10-xmm15, flags.  No return value.
#define INSTR vsqrtps
#define NINST 6
#define N edi
#define i r8d
        .intel_syntax noprefix

        .data
        .global ninst
ninst:
        .int    NINST

        .text
        .global latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # iteration counter starts at zero
        test    N, N
        jle     .Lexit                  # count <= 0: nothing to run

        # Build SP 1.0 from an all-ones pattern: the left shift keeps the top
        # seven bits, the right shift moves them into the exponent field.
        vpcmpeqw xmm0, xmm0, xmm0
        vpslld  xmm0, xmm0, 25
        vpsrld  xmm0, xmm0, 2

        # Operand value: 2 * (1024 / 3)
        vaddps  xmm1, xmm0, xmm0        # 2.0
        vaddps  xmm4, xmm1, xmm1        # 4.0
        vaddps  xmm2, xmm0, xmm1        # 3.0
        vaddps  xmm4, xmm4, xmm4        # 8.0
        vaddps  xmm4, xmm4, xmm4        # 16.0
        vaddps  xmm4, xmm4, xmm4        # 32.0
        vaddps  xmm4, xmm4, xmm4        # 64.0
        vaddps  xmm4, xmm4, xmm4        # 128.0
        vaddps  xmm4, xmm4, xmm4        # 256.0
        vaddps  xmm4, xmm4, xmm4        # 512.0
        vaddps  xmm4, xmm4, xmm4        # 1024.0
        vdivps  xmm1, xmm4, xmm2        # 1024 / 3
        vdivps  xmm2, xmm0, xmm1        # reciprocal; replaced by the copy below
        vaddps  xmm0, xmm1, xmm1        # 2 * (1024 / 3)

        # Same operand in all six source registers
        vmovaps xmm5, xmm0
        vmovaps xmm4, xmm0
        vmovaps xmm3, xmm0
        vmovaps xmm2, xmm0
        vmovaps xmm1, xmm0

.Lagain:
        inc     i
        INSTR   xmm10, xmm0
        INSTR   xmm11, xmm1
        INSTR   xmm12, xmm2
        cmp     i, N                    # flags survive the remaining sqrt ops
        INSTR   xmm13, xmm3
        INSTR   xmm14, xmm4
        INSTR   xmm15, xmm5
        jl      .Lagain

.Lexit:
        leave                           # restore rsp and rbp
        ret
        .size   latency, .-latency

61
src/AVX/vsqrtps-sse.S Normal file
View File

@@ -0,0 +1,61 @@
# Latency kernel for the 128-bit form of vsqrtps.  The loop body is a single
# dependency chain: six square roots, each followed by a vaddps with a saved
# constant, with the last result feeding the next iteration.
#
# Exported symbols:
#   ninst   - 32-bit word holding the number of square roots per iteration
#   latency - entry point; the iteration count is read from edi
#
# Registers written: r8d, xmm0-xmm5, xmm10, flags.  No return value.
#define INSTR vsqrtps
#define NINST 6
#define N edi
#define i r8d
        .intel_syntax noprefix

        .data
        .global ninst
ninst:
        .int    NINST

        .text
        .global latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # iteration counter starts at zero
        test    N, N
        jle     .Lexit                  # count <= 0: nothing to run

        # Build SP 1.0 from an all-ones pattern: the left shift keeps the top
        # seven bits, the right shift moves them into the exponent field.
        vpcmpeqw xmm0, xmm0, xmm0
        vpslld  xmm0, xmm0, 25
        vpsrld  xmm0, xmm0, 2

        # Start value and addend: 2 * (1024 / 3)
        vaddps  xmm1, xmm0, xmm0        # 2.0
        vaddps  xmm4, xmm1, xmm1        # 4.0
        vaddps  xmm2, xmm0, xmm1        # 3.0
        vaddps  xmm4, xmm4, xmm4        # 8.0
        vaddps  xmm4, xmm4, xmm4        # 16.0
        vaddps  xmm4, xmm4, xmm4        # 32.0
        vaddps  xmm4, xmm4, xmm4        # 64.0
        vaddps  xmm4, xmm4, xmm4        # 128.0
        vaddps  xmm4, xmm4, xmm4        # 256.0
        vaddps  xmm4, xmm4, xmm4        # 512.0
        vaddps  xmm4, xmm4, xmm4        # 1024.0
        vdivps  xmm1, xmm4, xmm2        # 1024 / 3
        vdivps  xmm2, xmm0, xmm1        # reciprocal; replaced inside the loop
        vaddps  xmm0, xmm1, xmm1        # 2 * (1024 / 3)
        vmovaps xmm10, xmm0             # keep a copy as the loop addend

.Lagain:
        inc     i
        INSTR   xmm1, xmm0              # chain link 1
        vaddps  xmm1, xmm1, xmm10
        INSTR   xmm2, xmm1              # chain link 2
        vaddps  xmm2, xmm2, xmm10
        INSTR   xmm3, xmm2              # chain link 3
        vaddps  xmm3, xmm3, xmm10
        cmp     i, N                    # flags survive the remaining vector ops
        INSTR   xmm4, xmm3              # chain link 4
        vaddps  xmm4, xmm4, xmm10
        INSTR   xmm5, xmm4              # chain link 5
        vaddps  xmm5, xmm5, xmm10
        INSTR   xmm0, xmm5              # chain link 6, closes the chain
        vaddps  xmm0, xmm0, xmm10
        jl      .Lagain

.Lexit:
        leave                           # restore rsp and rbp
        ret
        .size   latency, .-latency