mirror of
https://github.com/RRZE-HPC/ibench.git
synced 2025-07-21 04:41:09 +02:00
add sqrt to AVX benchmarks
This commit is contained in:
59
src/AVX/sqrtsd-TP.S
Normal file
59
src/AVX/sqrtsd-TP.S
Normal file
@@ -0,0 +1,59 @@
|
||||
# Throughput kernel: legacy-SSE scalar double-precision square root.
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point; the throughput variant keeps this name.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, xmm0-xmm5, xmm10-xmm15 and EFLAGS.

#define INSTR sqrtsd
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the double 1.0 (0x3FF0000000000000) without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # ten leading ones per qword (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0

        # Derive the sqrt operand from 1.0 by plain arithmetic.
        vaddsd  xmm1, xmm0, xmm0                # 2.0
        vaddsd  xmm2, xmm0, xmm1                # 3.0
        vaddsd  xmm4, xmm1, xmm1                # 4.0
        vaddsd  xmm4, xmm4, xmm4                # 8.0
        vaddsd  xmm4, xmm4, xmm4                # 16.0
        vaddsd  xmm4, xmm4, xmm4                # 32.0
        vaddsd  xmm4, xmm4, xmm4                # 64.0
        vaddsd  xmm4, xmm4, xmm4                # 128.0
        vaddsd  xmm4, xmm4, xmm4                # 256.0
        vaddsd  xmm4, xmm4, xmm4                # 512.0
        vaddsd  xmm4, xmm4, xmm4                # 1024.0
        vdivsd  xmm1, xmm4, xmm2                # 1024/3 = 341.3333
        vdivsd  xmm2, xmm0, xmm1                # 1/341.3333; value is not used by the loop
        vaddsd  xmm0, xmm1, xmm1                # 2*341.3333, the sqrt operand

        # Give every sqrt in the loop its own copy of the operand.
        movsd   xmm1, xmm0
        movsd   xmm2, xmm0
        movsd   xmm3, xmm0
        movsd   xmm4, xmm0
        movsd   xmm5, xmm0

        # Six sqrt instructions per trip; no result feeds another sqrt.
.Lloop:
        inc     i
        INSTR   xmm10, xmm0
        INSTR   xmm11, xmm1
        INSTR   xmm12, xmm2
        cmp     i, N                            # compare early; the sqrt ops below do not write EFLAGS
        INSTR   xmm13, xmm3
        INSTR   xmm14, xmm4
        INSTR   xmm15, xmm5
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
61
src/AVX/sqrtsd.S
Normal file
61
src/AVX/sqrtsd.S
Normal file
@@ -0,0 +1,61 @@
|
||||
# Latency kernel: legacy-SSE scalar double-precision square root.
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, xmm0-xmm5, xmm10 and EFLAGS.

#define INSTR sqrtsd
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the double 1.0 (0x3FF0000000000000) in both lanes without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # ten leading ones per qword (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0

        # Derive the working constant from 1.0 by plain arithmetic.
        vaddpd  xmm1, xmm0, xmm0                # 2.0
        vaddpd  xmm2, xmm0, xmm1                # 3.0
        vaddpd  xmm4, xmm1, xmm1                # 4.0
        vaddpd  xmm4, xmm4, xmm4                # 8.0
        vaddpd  xmm4, xmm4, xmm4                # 16.0
        vaddpd  xmm4, xmm4, xmm4                # 32.0
        vaddpd  xmm4, xmm4, xmm4                # 64.0
        vaddpd  xmm4, xmm4, xmm4                # 128.0
        vaddpd  xmm4, xmm4, xmm4                # 256.0
        vaddpd  xmm4, xmm4, xmm4                # 512.0
        vaddpd  xmm4, xmm4, xmm4                # 1024.0
        vdivpd  xmm1, xmm4, xmm2                # 1024/3 = 341.3333
        vdivpd  xmm2, xmm0, xmm1                # 1/341.3333; value is not used by the loop
        vaddpd  xmm0, xmm1, xmm1                # 2*341.3333, first sqrt operand
        vmovapd xmm10, xmm0                     # keep the constant for the adds in the loop

        # One dependency chain: each sqrt reads the register produced by the
        # preceding sqrt+add pair, and the last pair feeds xmm0 for the next trip.
        # NOTE(review): the add after each sqrt presumably keeps the operand from
        # converging toward 1.0; every link therefore costs one sqrt plus one
        # add - confirm how the driver accounts for that.
.Lloop:
        inc     i
        INSTR   xmm1, xmm0
        addsd   xmm1, xmm10
        INSTR   xmm2, xmm1
        addsd   xmm2, xmm10
        INSTR   xmm3, xmm2
        addsd   xmm3, xmm10
        cmp     i, N                            # compare early; the vector ops below do not write EFLAGS
        INSTR   xmm4, xmm3
        addsd   xmm4, xmm10
        INSTR   xmm5, xmm4
        addsd   xmm5, xmm10
        INSTR   xmm0, xmm5
        addsd   xmm0, xmm10
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
59
src/AVX/sqrtss-TP.S
Normal file
59
src/AVX/sqrtss-TP.S
Normal file
@@ -0,0 +1,59 @@
|
||||
# Throughput kernel: legacy-SSE scalar single-precision square root.
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point; the throughput variant keeps this name.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, xmm0-xmm5, xmm10-xmm15 and EFLAGS.

#define INSTR sqrtss
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the float 1.0 (0x3F800000) without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # seven leading ones per dword (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0

        # Derive the sqrt operand from 1.0 by plain arithmetic.
        vaddss  xmm1, xmm0, xmm0                # 2.0
        vaddss  xmm2, xmm0, xmm1                # 3.0
        vaddss  xmm4, xmm1, xmm1                # 4.0
        vaddss  xmm4, xmm4, xmm4                # 8.0
        vaddss  xmm4, xmm4, xmm4                # 16.0
        vaddss  xmm4, xmm4, xmm4                # 32.0
        vaddss  xmm4, xmm4, xmm4                # 64.0
        vaddss  xmm4, xmm4, xmm4                # 128.0
        vaddss  xmm4, xmm4, xmm4                # 256.0
        vaddss  xmm4, xmm4, xmm4                # 512.0
        vaddss  xmm4, xmm4, xmm4                # 1024.0
        vdivss  xmm1, xmm4, xmm2                # 1024/3 = 341.3333
        vdivss  xmm2, xmm0, xmm1                # 1/341.3333; value is not used by the loop
        vaddss  xmm0, xmm1, xmm1                # 2*341.3333, the sqrt operand

        # Give every sqrt in the loop its own copy of the operand.
        movss   xmm1, xmm0
        movss   xmm2, xmm0
        movss   xmm3, xmm0
        movss   xmm4, xmm0
        movss   xmm5, xmm0

        # Six sqrt instructions per trip; no result feeds another sqrt.
.Lloop:
        inc     i
        INSTR   xmm10, xmm0
        INSTR   xmm11, xmm1
        INSTR   xmm12, xmm2
        cmp     i, N                            # compare early; the sqrt ops below do not write EFLAGS
        INSTR   xmm13, xmm3
        INSTR   xmm14, xmm4
        INSTR   xmm15, xmm5
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
61
src/AVX/sqrtss.S
Normal file
61
src/AVX/sqrtss.S
Normal file
@@ -0,0 +1,61 @@
|
||||
# Latency kernel: legacy-SSE scalar single-precision square root.
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, xmm0-xmm5, xmm10 and EFLAGS.

#define INSTR sqrtss
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the float 1.0 (0x3F800000) in all lanes without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # seven leading ones per dword (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0

        # Derive the working constant from 1.0 by plain arithmetic.
        vaddps  xmm1, xmm0, xmm0                # 2.0
        vaddps  xmm2, xmm0, xmm1                # 3.0
        vaddps  xmm4, xmm1, xmm1                # 4.0
        vaddps  xmm4, xmm4, xmm4                # 8.0
        vaddps  xmm4, xmm4, xmm4                # 16.0
        vaddps  xmm4, xmm4, xmm4                # 32.0
        vaddps  xmm4, xmm4, xmm4                # 64.0
        vaddps  xmm4, xmm4, xmm4                # 128.0
        vaddps  xmm4, xmm4, xmm4                # 256.0
        vaddps  xmm4, xmm4, xmm4                # 512.0
        vaddps  xmm4, xmm4, xmm4                # 1024.0
        vdivps  xmm1, xmm4, xmm2                # 1024/3 = 341.3333
        vdivps  xmm2, xmm0, xmm1                # 1/341.3333; value is not used by the loop
        vaddps  xmm0, xmm1, xmm1                # 2*341.3333, first sqrt operand
        vmovaps xmm10, xmm0                     # keep the constant for the adds in the loop

        # One dependency chain: each sqrt reads the register produced by the
        # preceding sqrt+add pair, and the last pair feeds xmm0 for the next trip.
        # NOTE(review): the add after each sqrt presumably keeps the operand from
        # converging toward 1.0; every link therefore costs one sqrt plus one
        # add - confirm how the driver accounts for that.
.Lloop:
        inc     i
        INSTR   xmm1, xmm0
        addss   xmm1, xmm10
        INSTR   xmm2, xmm1
        addss   xmm2, xmm10
        INSTR   xmm3, xmm2
        addss   xmm3, xmm10
        cmp     i, N                            # compare early; the vector ops below do not write EFLAGS
        INSTR   xmm4, xmm3
        addss   xmm4, xmm10
        INSTR   xmm5, xmm4
        addss   xmm5, xmm10
        INSTR   xmm0, xmm5
        addss   xmm0, xmm10
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
60
src/AVX/vsqrtpd-avx-TP.S
Normal file
60
src/AVX/vsqrtpd-avx-TP.S
Normal file
@@ -0,0 +1,60 @@
|
||||
# Throughput kernel: 256-bit AVX packed double-precision square root
# (vsqrtpd on ymm registers).
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point; the throughput variant keeps this name.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, ymm0-ymm5, ymm10-ymm15 and EFLAGS; the upper
#            halves of all ymm registers are zeroed on exit.

#define INSTR vsqrtpd
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the double 1.0 (0x3FF0000000000000) in all four lanes without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # ten leading ones per qword (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy the low 128 bits into the high 128 bits

        # Derive the sqrt operand from 1.0 by plain arithmetic.
        vaddpd  ymm1, ymm0, ymm0                # 2.0
        vaddpd  ymm2, ymm0, ymm1                # 3.0
        vaddpd  ymm4, ymm1, ymm1                # 4.0
        vaddpd  ymm4, ymm4, ymm4                # 8.0
        vaddpd  ymm4, ymm4, ymm4                # 16.0
        vaddpd  ymm4, ymm4, ymm4                # 32.0
        vaddpd  ymm4, ymm4, ymm4                # 64.0
        vaddpd  ymm4, ymm4, ymm4                # 128.0
        vaddpd  ymm4, ymm4, ymm4                # 256.0
        vaddpd  ymm4, ymm4, ymm4                # 512.0
        vaddpd  ymm4, ymm4, ymm4                # 1024.0
        vdivpd  ymm1, ymm4, ymm2                # 1024/3 = 341.3333
        vdivpd  ymm2, ymm0, ymm1                # 1/341.3333; value is not used by the loop
        vaddpd  ymm0, ymm1, ymm1                # 2*341.3333, the sqrt operand

        # Give every sqrt in the loop its own copy of the operand.
        vmovapd ymm1, ymm0
        vmovapd ymm2, ymm0
        vmovapd ymm3, ymm0
        vmovapd ymm4, ymm0
        vmovapd ymm5, ymm0

        # Six sqrt instructions per trip; no result feeds another sqrt.
.Lloop:
        inc     i
        INSTR   ymm10, ymm0
        INSTR   ymm11, ymm1
        INSTR   ymm12, ymm2
        cmp     i, N                            # compare early; the sqrt ops below do not write EFLAGS
        INSTR   ymm13, ymm3
        INSTR   ymm14, ymm4
        INSTR   ymm15, ymm5
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        # The kernel wrote 256-bit registers. Clear the upper halves before
        # returning so that legacy-SSE code executed afterwards does not pay
        # AVX/SSE transition penalties or pick up false dependencies. All
        # vector registers are caller-saved under the SysV AMD64 ABI, so
        # nothing the caller relies on is lost.
        vzeroupper
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
62
src/AVX/vsqrtpd-avx.S
Normal file
62
src/AVX/vsqrtpd-avx.S
Normal file
@@ -0,0 +1,62 @@
|
||||
# Latency kernel: 256-bit AVX packed double-precision square root
# (vsqrtpd on ymm registers).
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, ymm0-ymm5, ymm10 and EFLAGS; the upper halves
#            of all ymm registers are zeroed on exit.

#define INSTR vsqrtpd
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the double 1.0 (0x3FF0000000000000) in all four lanes without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # ten leading ones per qword (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy the low 128 bits into the high 128 bits

        # Derive the working constant from 1.0 by plain arithmetic.
        vaddpd  ymm1, ymm0, ymm0                # 2.0
        vaddpd  ymm2, ymm0, ymm1                # 3.0
        vaddpd  ymm4, ymm1, ymm1                # 4.0
        vaddpd  ymm4, ymm4, ymm4                # 8.0
        vaddpd  ymm4, ymm4, ymm4                # 16.0
        vaddpd  ymm4, ymm4, ymm4                # 32.0
        vaddpd  ymm4, ymm4, ymm4                # 64.0
        vaddpd  ymm4, ymm4, ymm4                # 128.0
        vaddpd  ymm4, ymm4, ymm4                # 256.0
        vaddpd  ymm4, ymm4, ymm4                # 512.0
        vaddpd  ymm4, ymm4, ymm4                # 1024.0
        vdivpd  ymm1, ymm4, ymm2                # 1024/3 = 341.3333
        vdivpd  ymm2, ymm0, ymm1                # 1/341.3333; value is not used by the loop
        vaddpd  ymm0, ymm1, ymm1                # 2*341.3333, first sqrt operand
        vmovapd ymm10, ymm0                     # keep the constant for the adds in the loop

        # One dependency chain: each sqrt reads the register produced by the
        # preceding sqrt+add pair, and the last pair feeds ymm0 for the next trip.
        # NOTE(review): the add after each sqrt presumably keeps the operand from
        # converging toward 1.0; every link therefore costs one sqrt plus one
        # add - confirm how the driver accounts for that.
.Lloop:
        inc     i
        INSTR   ymm1, ymm0
        vaddpd  ymm1, ymm1, ymm10
        INSTR   ymm2, ymm1
        vaddpd  ymm2, ymm2, ymm10
        INSTR   ymm3, ymm2
        vaddpd  ymm3, ymm3, ymm10
        cmp     i, N                            # compare early; the vector ops below do not write EFLAGS
        INSTR   ymm4, ymm3
        vaddpd  ymm4, ymm4, ymm10
        INSTR   ymm5, ymm4
        vaddpd  ymm5, ymm5, ymm10
        INSTR   ymm0, ymm5
        vaddpd  ymm0, ymm0, ymm10
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        # The kernel wrote 256-bit registers. Clear the upper halves before
        # returning so that legacy-SSE code executed afterwards does not pay
        # AVX/SSE transition penalties or pick up false dependencies. All
        # vector registers are caller-saved under the SysV AMD64 ABI, so
        # nothing the caller relies on is lost.
        vzeroupper
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
59
src/AVX/vsqrtpd-sse-TP.S
Normal file
59
src/AVX/vsqrtpd-sse-TP.S
Normal file
@@ -0,0 +1,59 @@
|
||||
# Throughput kernel: 128-bit VEX-encoded packed double-precision square
# root (vsqrtpd on xmm registers).
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point; the throughput variant keeps this name.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, xmm0-xmm5, xmm10-xmm15 and EFLAGS.

#define INSTR vsqrtpd
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the double 1.0 (0x3FF0000000000000) in both lanes without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # ten leading ones per qword (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0

        # Derive the sqrt operand from 1.0 by plain arithmetic.
        vaddpd  xmm1, xmm0, xmm0                # 2.0
        vaddpd  xmm2, xmm0, xmm1                # 3.0
        vaddpd  xmm4, xmm1, xmm1                # 4.0
        vaddpd  xmm4, xmm4, xmm4                # 8.0
        vaddpd  xmm4, xmm4, xmm4                # 16.0
        vaddpd  xmm4, xmm4, xmm4                # 32.0
        vaddpd  xmm4, xmm4, xmm4                # 64.0
        vaddpd  xmm4, xmm4, xmm4                # 128.0
        vaddpd  xmm4, xmm4, xmm4                # 256.0
        vaddpd  xmm4, xmm4, xmm4                # 512.0
        vaddpd  xmm4, xmm4, xmm4                # 1024.0
        vdivpd  xmm1, xmm4, xmm2                # 1024/3 = 341.3333
        vdivpd  xmm2, xmm0, xmm1                # 1/341.3333; value is not used by the loop
        vaddpd  xmm0, xmm1, xmm1                # 2*341.3333, the sqrt operand

        # Give every sqrt in the loop its own copy of the operand.
        vmovapd xmm1, xmm0
        vmovapd xmm2, xmm0
        vmovapd xmm3, xmm0
        vmovapd xmm4, xmm0
        vmovapd xmm5, xmm0

        # Six sqrt instructions per trip; no result feeds another sqrt.
.Lloop:
        inc     i
        INSTR   xmm10, xmm0
        INSTR   xmm11, xmm1
        INSTR   xmm12, xmm2
        cmp     i, N                            # compare early; the sqrt ops below do not write EFLAGS
        INSTR   xmm13, xmm3
        INSTR   xmm14, xmm4
        INSTR   xmm15, xmm5
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
61
src/AVX/vsqrtpd-sse.S
Normal file
61
src/AVX/vsqrtpd-sse.S
Normal file
@@ -0,0 +1,61 @@
|
||||
# Latency kernel: 128-bit VEX-encoded packed double-precision square root
# (vsqrtpd on xmm registers).
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, xmm0-xmm5, xmm10 and EFLAGS.

#define INSTR vsqrtpd
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the double 1.0 (0x3FF0000000000000) in both lanes without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # ten leading ones per qword (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0

        # Derive the working constant from 1.0 by plain arithmetic.
        vaddpd  xmm1, xmm0, xmm0                # 2.0
        vaddpd  xmm2, xmm0, xmm1                # 3.0
        vaddpd  xmm4, xmm1, xmm1                # 4.0
        vaddpd  xmm4, xmm4, xmm4                # 8.0
        vaddpd  xmm4, xmm4, xmm4                # 16.0
        vaddpd  xmm4, xmm4, xmm4                # 32.0
        vaddpd  xmm4, xmm4, xmm4                # 64.0
        vaddpd  xmm4, xmm4, xmm4                # 128.0
        vaddpd  xmm4, xmm4, xmm4                # 256.0
        vaddpd  xmm4, xmm4, xmm4                # 512.0
        vaddpd  xmm4, xmm4, xmm4                # 1024.0
        vdivpd  xmm1, xmm4, xmm2                # 1024/3 = 341.3333
        vdivpd  xmm2, xmm0, xmm1                # 1/341.3333; value is not used by the loop
        vaddpd  xmm0, xmm1, xmm1                # 2*341.3333, first sqrt operand
        vmovapd xmm10, xmm0                     # keep the constant for the adds in the loop

        # One dependency chain: each sqrt reads the register produced by the
        # preceding sqrt+add pair, and the last pair feeds xmm0 for the next trip.
        # NOTE(review): the add after each sqrt presumably keeps the operand from
        # converging toward 1.0; every link therefore costs one sqrt plus one
        # add - confirm how the driver accounts for that.
.Lloop:
        inc     i
        INSTR   xmm1, xmm0
        vaddpd  xmm1, xmm1, xmm10
        INSTR   xmm2, xmm1
        vaddpd  xmm2, xmm2, xmm10
        INSTR   xmm3, xmm2
        vaddpd  xmm3, xmm3, xmm10
        cmp     i, N                            # compare early; the vector ops below do not write EFLAGS
        INSTR   xmm4, xmm3
        vaddpd  xmm4, xmm4, xmm10
        INSTR   xmm5, xmm4
        vaddpd  xmm5, xmm5, xmm10
        INSTR   xmm0, xmm5
        vaddpd  xmm0, xmm0, xmm10
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
60
src/AVX/vsqrtps-avx-TP.S
Normal file
60
src/AVX/vsqrtps-avx-TP.S
Normal file
@@ -0,0 +1,60 @@
|
||||
# Throughput kernel: 256-bit AVX packed single-precision square root
# (vsqrtps on ymm registers).
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point; the throughput variant keeps this name.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, ymm0-ymm5, ymm10-ymm15 and EFLAGS; the upper
#            halves of all ymm registers are zeroed on exit.

#define INSTR vsqrtps
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the float 1.0 (0x3F800000) in all eight lanes without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # seven leading ones per dword (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy the low 128 bits into the high 128 bits

        # Derive the sqrt operand from 1.0 by plain arithmetic.
        vaddps  ymm1, ymm0, ymm0                # 2.0
        vaddps  ymm2, ymm0, ymm1                # 3.0
        vaddps  ymm4, ymm1, ymm1                # 4.0
        vaddps  ymm4, ymm4, ymm4                # 8.0
        vaddps  ymm4, ymm4, ymm4                # 16.0
        vaddps  ymm4, ymm4, ymm4                # 32.0
        vaddps  ymm4, ymm4, ymm4                # 64.0
        vaddps  ymm4, ymm4, ymm4                # 128.0
        vaddps  ymm4, ymm4, ymm4                # 256.0
        vaddps  ymm4, ymm4, ymm4                # 512.0
        vaddps  ymm4, ymm4, ymm4                # 1024.0
        vdivps  ymm1, ymm4, ymm2                # 1024/3 = 341.3333
        vdivps  ymm2, ymm0, ymm1                # 1/341.3333; value is not used by the loop
        vaddps  ymm0, ymm1, ymm1                # 2*341.3333, the sqrt operand

        # Give every sqrt in the loop its own copy of the operand.
        vmovaps ymm1, ymm0
        vmovaps ymm2, ymm0
        vmovaps ymm3, ymm0
        vmovaps ymm4, ymm0
        vmovaps ymm5, ymm0

        # Six sqrt instructions per trip; no result feeds another sqrt.
.Lloop:
        inc     i
        INSTR   ymm10, ymm0
        INSTR   ymm11, ymm1
        INSTR   ymm12, ymm2
        cmp     i, N                            # compare early; the sqrt ops below do not write EFLAGS
        INSTR   ymm13, ymm3
        INSTR   ymm14, ymm4
        INSTR   ymm15, ymm5
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        # The kernel wrote 256-bit registers. Clear the upper halves before
        # returning so that legacy-SSE code executed afterwards does not pay
        # AVX/SSE transition penalties or pick up false dependencies. All
        # vector registers are caller-saved under the SysV AMD64 ABI, so
        # nothing the caller relies on is lost.
        vzeroupper
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
62
src/AVX/vsqrtps-avx.S
Normal file
62
src/AVX/vsqrtps-avx.S
Normal file
@@ -0,0 +1,62 @@
|
||||
# Latency kernel: 256-bit AVX packed single-precision square root
# (vsqrtps on ymm registers).
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, ymm0-ymm5, ymm10 and EFLAGS; the upper halves
#            of all ymm registers are zeroed on exit.

#define INSTR vsqrtps
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the float 1.0 (0x3F800000) in all eight lanes without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # seven leading ones per dword (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy the low 128 bits into the high 128 bits

        # Derive the working constant from 1.0 by plain arithmetic.
        vaddps  ymm1, ymm0, ymm0                # 2.0
        vaddps  ymm2, ymm0, ymm1                # 3.0
        vaddps  ymm4, ymm1, ymm1                # 4.0
        vaddps  ymm4, ymm4, ymm4                # 8.0
        vaddps  ymm4, ymm4, ymm4                # 16.0
        vaddps  ymm4, ymm4, ymm4                # 32.0
        vaddps  ymm4, ymm4, ymm4                # 64.0
        vaddps  ymm4, ymm4, ymm4                # 128.0
        vaddps  ymm4, ymm4, ymm4                # 256.0
        vaddps  ymm4, ymm4, ymm4                # 512.0
        vaddps  ymm4, ymm4, ymm4                # 1024.0
        vdivps  ymm1, ymm4, ymm2                # 1024/3 = 341.3333
        vdivps  ymm2, ymm0, ymm1                # 1/341.3333; value is not used by the loop
        vaddps  ymm0, ymm1, ymm1                # 2*341.3333, first sqrt operand
        vmovaps ymm10, ymm0                     # keep the constant for the adds in the loop

        # One dependency chain: each sqrt reads the register produced by the
        # preceding sqrt+add pair, and the last pair feeds ymm0 for the next trip.
        # NOTE(review): the add after each sqrt presumably keeps the operand from
        # converging toward 1.0; every link therefore costs one sqrt plus one
        # add - confirm how the driver accounts for that.
.Lloop:
        inc     i
        INSTR   ymm1, ymm0
        vaddps  ymm1, ymm1, ymm10
        INSTR   ymm2, ymm1
        vaddps  ymm2, ymm2, ymm10
        INSTR   ymm3, ymm2
        vaddps  ymm3, ymm3, ymm10
        cmp     i, N                            # compare early; the vector ops below do not write EFLAGS
        INSTR   ymm4, ymm3
        vaddps  ymm4, ymm4, ymm10
        INSTR   ymm5, ymm4
        vaddps  ymm5, ymm5, ymm10
        INSTR   ymm0, ymm5
        vaddps  ymm0, ymm0, ymm10
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        # The kernel wrote 256-bit registers. Clear the upper halves before
        # returning so that legacy-SSE code executed afterwards does not pay
        # AVX/SSE transition penalties or pick up false dependencies. All
        # vector registers are caller-saved under the SysV AMD64 ABI, so
        # nothing the caller relies on is lost.
        vzeroupper
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
59
src/AVX/vsqrtps-sse-TP.S
Normal file
59
src/AVX/vsqrtps-sse-TP.S
Normal file
@@ -0,0 +1,59 @@
|
||||
# Throughput kernel: 128-bit VEX-encoded packed single-precision square
# root (vsqrtps on xmm registers).
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point; the throughput variant keeps this name.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, xmm0-xmm5, xmm10-xmm15 and EFLAGS.

#define INSTR vsqrtps
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the float 1.0 (0x3F800000) in all four lanes without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # seven leading ones per dword (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0

        # Derive the sqrt operand from 1.0 by plain arithmetic.
        vaddps  xmm1, xmm0, xmm0                # 2.0
        vaddps  xmm2, xmm0, xmm1                # 3.0
        vaddps  xmm4, xmm1, xmm1                # 4.0
        vaddps  xmm4, xmm4, xmm4                # 8.0
        vaddps  xmm4, xmm4, xmm4                # 16.0
        vaddps  xmm4, xmm4, xmm4                # 32.0
        vaddps  xmm4, xmm4, xmm4                # 64.0
        vaddps  xmm4, xmm4, xmm4                # 128.0
        vaddps  xmm4, xmm4, xmm4                # 256.0
        vaddps  xmm4, xmm4, xmm4                # 512.0
        vaddps  xmm4, xmm4, xmm4                # 1024.0
        vdivps  xmm1, xmm4, xmm2                # 1024/3 = 341.3333
        vdivps  xmm2, xmm0, xmm1                # 1/341.3333; value is not used by the loop
        vaddps  xmm0, xmm1, xmm1                # 2*341.3333, the sqrt operand

        # Give every sqrt in the loop its own copy of the operand.
        vmovaps xmm1, xmm0
        vmovaps xmm2, xmm0
        vmovaps xmm3, xmm0
        vmovaps xmm4, xmm0
        vmovaps xmm5, xmm0

        # Six sqrt instructions per trip; no result feeds another sqrt.
.Lloop:
        inc     i
        INSTR   xmm10, xmm0
        INSTR   xmm11, xmm1
        INSTR   xmm12, xmm2
        cmp     i, N                            # compare early; the sqrt ops below do not write EFLAGS
        INSTR   xmm13, xmm3
        INSTR   xmm14, xmm4
        INSTR   xmm15, xmm5
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
61
src/AVX/vsqrtps-sse.S
Normal file
61
src/AVX/vsqrtps-sse.S
Normal file
@@ -0,0 +1,61 @@
|
||||
# Latency kernel: 128-bit VEX-encoded packed single-precision square root
# (vsqrtps on xmm registers).
#
# Exports
#   ninst    .long holding NINST, the number of INSTR copies per loop trip
#            (presumably read by the benchmark driver - confirm).
#   latency  kernel entry point.
#            In:  edi = trip count (signed 32-bit). A count <= 0 skips
#                 both the operand setup and the loop.
#            Out: no register carries a return value.
#            Touches r8d, xmm0-xmm5, xmm10 and EFLAGS.

#define INSTR vsqrtps
#define NINST 6
#define N edi
#define i r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .type   latency, @function
        .globl  latency
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # trip counter starts at zero
        test    N, N
        jle     .Ldone                          # no work for a count <= 0

        # Build the float 1.0 (0x3F800000) in all four lanes without touching memory.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # seven leading ones per dword (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign bit and top exponent bit cleared -> 1.0

        # Derive the working constant from 1.0 by plain arithmetic.
        vaddps  xmm1, xmm0, xmm0                # 2.0
        vaddps  xmm2, xmm0, xmm1                # 3.0
        vaddps  xmm4, xmm1, xmm1                # 4.0
        vaddps  xmm4, xmm4, xmm4                # 8.0
        vaddps  xmm4, xmm4, xmm4                # 16.0
        vaddps  xmm4, xmm4, xmm4                # 32.0
        vaddps  xmm4, xmm4, xmm4                # 64.0
        vaddps  xmm4, xmm4, xmm4                # 128.0
        vaddps  xmm4, xmm4, xmm4                # 256.0
        vaddps  xmm4, xmm4, xmm4                # 512.0
        vaddps  xmm4, xmm4, xmm4                # 1024.0
        vdivps  xmm1, xmm4, xmm2                # 1024/3 = 341.3333
        vdivps  xmm2, xmm0, xmm1                # 1/341.3333; value is not used by the loop
        vaddps  xmm0, xmm1, xmm1                # 2*341.3333, first sqrt operand
        vmovaps xmm10, xmm0                     # keep the constant for the adds in the loop

        # One dependency chain: each sqrt reads the register produced by the
        # preceding sqrt+add pair, and the last pair feeds xmm0 for the next trip.
        # NOTE(review): the add after each sqrt presumably keeps the operand from
        # converging toward 1.0; every link therefore costs one sqrt plus one
        # add - confirm how the driver accounts for that.
.Lloop:
        inc     i
        INSTR   xmm1, xmm0
        vaddps  xmm1, xmm1, xmm10
        INSTR   xmm2, xmm1
        vaddps  xmm2, xmm2, xmm10
        INSTR   xmm3, xmm2
        vaddps  xmm3, xmm3, xmm10
        cmp     i, N                            # compare early; the vector ops below do not write EFLAGS
        INSTR   xmm4, xmm3
        vaddps  xmm4, xmm4, xmm10
        INSTR   xmm5, xmm4
        vaddps  xmm5, xmm5, xmm10
        INSTR   xmm0, xmm5
        vaddps  xmm0, xmm0, xmm10
        jl      .Lloop                          # signed: repeat while counter < trip count

.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
Reference in New Issue
Block a user