mirror of
https://github.com/RRZE-HPC/ibench.git
synced 2025-07-21 21:01:10 +02:00
mark registers before using them
This commit is contained in:
40
src/AVX-512/LAT-FMA-FMA-MUL-alt.S
Normal file
40
src/AVX-512/LAT-FMA-FMA-MUL-alt.S
Normal file
@@ -0,0 +1,40 @@
|
||||
# Latency chain benchmark: FMA -> FMA -> MUL on scalar doubles, xmm only.
#
# Exports:
#   ninst   - 32-bit constant holding NINST
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), xmm0-xmm2, flags. rax is never written.
#
# NOTE(review): INSTR is defined but not referenced below, and NINST is 1
# although the loop body holds three dependent instructions - presumably the
# harness reports the latency of the whole chain; confirm.
#define INSTR vfmadd213sd
#define NINST 1
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep only the top 10 bits (64 - 54)
        vpsrlq  xmm0, xmm0, 2                   # sign = 0, exponent = 0x3FF, mantissa = 0
        # Replicate the constant into the two source operands.
        vmovapd xmm1, xmm0
        vmovapd xmm2, xmm0

        # Per iteration: xmm0 = ((xmm0 + 1*1) + 1*1) * 1, every step depending
        # on the previous result.
.Lloop:
        inc     i
        vfmadd231sd xmm0, xmm1, xmm2            # xmm0 += xmm1 * xmm2
        vfmadd231sd xmm0, xmm2, xmm1            # xmm0 += xmm2 * xmm1
        vmulsd  xmm0, xmm0, xmm1                # xmm0 *= xmm1
        cmp     i, N
        jl      .Lloop
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
44
src/AVX-512/LAT-FMA-FMA-MUL.S
Normal file
44
src/AVX-512/LAT-FMA-FMA-MUL.S
Normal file
@@ -0,0 +1,44 @@
|
||||
# Latency chain benchmark: FMA -> FMA -> MUL on scalar doubles, with the
# operand registers first written at full zmm width.
#
# Exports:
#   ninst   - 32-bit constant holding NINST
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), zmm0-zmm2, flags. rax is never written.
#
# NOTE(review): INSTR is defined but not referenced below, and NINST is 1
# although the loop body holds three dependent instructions - presumably the
# harness reports the latency of the whole chain; confirm.
#define INSTR vfmadd213sd
#define NINST 1
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep only the top 10 bits (64 - 54)
        vpsrlq  xmm0, xmm0, 2                   # sign = 0, exponent = 0x3FF, mantissa = 0
        # Widen the constant: xmm -> ymm -> zmm.
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # Replicate the constant into the two source operands.
        vmovapd zmm1, zmm0
        vmovapd zmm2, zmm0

        # Per iteration: xmm0 = ((xmm0 + 1*1) + 1*1) * 1, every step depending
        # on the previous result.
.Lloop:
        inc     i
        vfmadd231sd xmm0, xmm1, xmm2            # xmm0 += xmm1 * xmm2
        vfmadd231sd xmm0, xmm2, xmm1            # xmm0 += xmm2 * xmm1
        vmulsd  xmm0, xmm0, xmm1                # xmm0 *= xmm1
        cmp     i, N
        jl      .Lloop

        # Clear the upper ymm/zmm state dirtied above so that legacy SSE code
        # executed after the return does not pay AVX-SSE transition costs.
        # Placed after the loop, so loop placement and encoding are unchanged.
        vzeroupper
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
41
src/AVX-512/LAT-FMA-MUL-alt.S
Normal file
41
src/AVX-512/LAT-FMA-MUL-alt.S
Normal file
@@ -0,0 +1,41 @@
|
||||
# Latency chain benchmark: FMA -> MUL on scalar doubles, xmm only, with the
# operands touched by legacy-SSE scalar moves before the loop.
#
# Exports:
#   ninst   - 32-bit constant holding NINST
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), xmm0-xmm2, flags. rax is never written.
#
# NOTE(review): INSTR is defined but not referenced below, and NINST is 1
# although the loop body holds two dependent instructions - presumably the
# harness reports the latency of the whole chain; confirm.
#define INSTR vfmadd213sd
#define NINST 1
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep only the top 10 bits (64 - 54)
        vpsrlq  xmm0, xmm0, 2                   # sign = 0, exponent = 0x3FF, mantissa = 0
        # Legacy-SSE scalar moves: only the low qword of the destination is
        # written; the upper qwords of xmm1 and xmm2 keep whatever they held.
        movsd   xmm0, xmm0                      # self-move, value unchanged
        movsd   xmm1, xmm0                      # low qword of xmm1 = 1.0
        movsd   xmm2, xmm0                      # low qword of xmm2 = 1.0

        # Per iteration: xmm0 = (xmm0 + 1*1) * 1, each step depending on the
        # previous result.
.Lloop:
        inc     i
        vfmadd231sd xmm0, xmm1, xmm2            # xmm0 += xmm1 * xmm2
        vmulsd  xmm0, xmm0, xmm1                # xmm0 *= xmm1
        cmp     i, N
        jl      .Lloop
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
43
src/AVX-512/LAT-FMA-MUL.S
Normal file
43
src/AVX-512/LAT-FMA-MUL.S
Normal file
@@ -0,0 +1,43 @@
|
||||
# Latency chain benchmark: FMA -> MUL on scalar doubles, with the operand
# registers first written at full zmm width.
#
# Exports:
#   ninst   - 32-bit constant holding NINST
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), zmm0-zmm2, flags. rax is never written.
#
# NOTE(review): INSTR is defined but not referenced below, and NINST is 1
# although the loop body holds two dependent instructions - presumably the
# harness reports the latency of the whole chain; confirm.
#define INSTR vfmadd213sd
#define NINST 1
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep only the top 10 bits (64 - 54)
        vpsrlq  xmm0, xmm0, 2                   # sign = 0, exponent = 0x3FF, mantissa = 0
        # Widen the constant: xmm -> ymm -> zmm.
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # Replicate the constant into the two source operands.
        vmovapd zmm1, zmm0
        vmovapd zmm2, zmm0

        # Per iteration: xmm0 = (xmm0 + 1*1) * 1, each step depending on the
        # previous result.
.Lloop:
        inc     i
        vfmadd231sd xmm0, xmm1, xmm2            # xmm0 += xmm1 * xmm2
        vmulsd  xmm0, xmm0, xmm1                # xmm0 *= xmm1
        cmp     i, N
        jl      .Lloop

        # Clear the upper ymm/zmm state dirtied above so that legacy SSE code
        # executed after the return does not pay AVX-SSE transition costs.
        # Placed after the loop, so loop placement and encoding are unchanged.
        vzeroupper
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
@@ -29,6 +29,11 @@ latency:
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovapd zmm1, zmm0
|
||||
|
||||
# Mark registers as AVX-512
|
||||
vmovapd zmm0, zmm0
|
||||
vmovapd zmm1, zmm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm0, zmm1
|
@@ -29,6 +29,11 @@ latency:
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
# copy SP 1.0
|
||||
vmovaps zmm1, zmm0
|
||||
|
||||
# Mark registers as AVX-512
|
||||
vmovapd zmm0, zmm0
|
||||
vmovapd zmm1, zmm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm0, zmm1
|
48
src/AVX-512/vfmadd213sd-avx512-alt1.S
Normal file
48
src/AVX-512/vfmadd213sd-avx512-alt1.S
Normal file
@@ -0,0 +1,48 @@
|
||||
# Latency benchmark for vfmadd213sd (alternative variant 1): six chained
# scalar FMAs per iteration after zmm0 was written at full 512-bit width.
#
# Exports:
#   ninst   - 32-bit constant holding NINST (FMAs per loop iteration)
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), zmm0, zmm1, flags. rax is never written.
#
# NOTE(review): xmm1 ends up holding an all-ones bit pattern (a NaN when read
# as a double), not DP 1.0 as the original comment stated, so every FMA result
# is NaN. Presumably deliberate for this variant - confirm.
#define INSTR vfmadd213sd
#define NINST 6
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep only the top 10 bits (64 - 54)
        vpsrlq  xmm0, xmm0, 2                   # sign = 0, exponent = 0x3FF, mantissa = 0
        # Widen the constant: xmm -> ymm -> zmm.
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # NOTE(review): zmm2 is never written in this function, and the copy
        # below is fully replaced by the following instruction.
        vmovapd zmm1, zmm2
        vpcmpeqw xmm1, xmm1, xmm1               # low 128 bits all ones, upper bits zeroed

        # Each FMA computes xmm0 = xmm1 * xmm0 + xmm1 and feeds the next one.
.Lloop:
        inc     i
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        cmp     i, N                            # flags are untouched by the FMAs below
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        jl      .Lloop

        # Clear the upper ymm/zmm state dirtied above so that legacy SSE code
        # executed after the return does not pay AVX-SSE transition costs.
        # Placed after the loop, so loop placement and encoding are unchanged.
        vzeroupper
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
42
src/AVX-512/vfmadd213sd-avx512.S
Normal file
42
src/AVX-512/vfmadd213sd-avx512.S
Normal file
@@ -0,0 +1,42 @@
|
||||
# Latency benchmark for vfmadd213sd: six chained scalar FMAs per iteration,
# xmm registers only.
#
# Exports:
#   ninst   - 32-bit constant holding NINST (FMAs per loop iteration)
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), xmm0, xmm1, flags. rax is never written.
#define INSTR vfmadd213sd
#define NINST 6
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep only the top 10 bits (64 - 54)
        vpsrlq  xmm0, xmm0, 2                   # sign = 0, exponent = 0x3FF, mantissa = 0
        # Second operand: a bitwise copy of the DP 1.0 constant.
        vmovaps xmm1, xmm0

        # Each FMA computes xmm0 = xmm1 * xmm0 + xmm1 (that is xmm0 + 1.0) and
        # feeds the next one.
.Lloop:
        inc     i
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        cmp     i, N                            # flags are untouched by the FMAs below
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        jl      .Lloop
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
@@ -31,6 +31,12 @@ latency:
|
||||
vaddpd zmm1, zmm0, zmm0
|
||||
# create AVX-512 DP 0.5
|
||||
vdivpd zmm2, zmm0, zmm1
|
||||
|
||||
# Mark registers as AVX-512
|
||||
vmovapd zmm0, zmm0
|
||||
vmovapd zmm1, zmm1
|
||||
vmovapd zmm2, zmm2
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm0, zmm1
|
@@ -31,6 +31,12 @@ latency:
|
||||
vaddps zmm1, zmm0, zmm0
|
||||
# create AVX-512 DP 0.5
|
||||
vdivps zmm2, zmm0, zmm1
|
||||
|
||||
# Mark registers as AVX-512
|
||||
vmovapd zmm0, zmm0
|
||||
vmovapd zmm1, zmm1
|
||||
vmovapd zmm2, zmm2
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm0, zmm1
|
48
src/AVX-512/vmulsd-avx512.S
Normal file
48
src/AVX-512/vmulsd-avx512.S
Normal file
@@ -0,0 +1,48 @@
|
||||
# Latency benchmark for vmulsd: six chained scalar multiplies per iteration,
# with the operand registers first written at full zmm width.
#
# Exports:
#   ninst   - 32-bit constant holding NINST (multiplies per loop iteration)
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), zmm0-zmm2, flags. rax is never written.
#define INSTR vmulsd
#define NINST 6
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep only the top 10 bits (64 - 54)
        vpsrlq  xmm0, xmm0, 2                   # sign = 0, exponent = 0x3FF, mantissa = 0
        # Widen the constant: xmm -> ymm -> zmm.
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        vaddpd  zmm1, zmm0, zmm0                # zmm1 = 2.0 in every lane
        vdivpd  zmm2, zmm0, zmm1                # zmm2 = 0.5 in every lane

        # Alternate *2.0 and *0.5 so the running value stays bounded; each
        # multiply consumes the previous result.
.Lloop:
        inc     i
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm2
        INSTR   xmm0, xmm0, xmm1
        cmp     i, N                            # flags are untouched by the multiplies below
        INSTR   xmm0, xmm0, xmm2
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm2
        jl      .Lloop

        # Clear the upper ymm/zmm state dirtied above so that legacy SSE code
        # executed after the return does not pay AVX-SSE transition costs.
        # Placed after the loop, so loop placement and encoding are unchanged.
        vzeroupper
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
@@ -26,7 +26,19 @@ latency:
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy SP 1.0
|
||||
vmovaps ymm1, ymm0
|
||||
vmovapd ymm1, ymm0
|
||||
vmovapd ymm2, ymm0
|
||||
|
||||
# Mark registers as AVX
|
||||
vmovapd ymm0, ymm0
|
||||
vmovapd ymm1, ymm1
|
||||
vmovapd ymm2, ymm2
|
||||
vmovapd ymm3, ymm3
|
||||
vmovapd ymm4, ymm4
|
||||
vmovapd ymm5, ymm5
|
||||
vmovapd ymm6, ymm6
|
||||
vmovapd ymm7, ymm7
|
||||
vmovapd ymm8, ymm8
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0, ymm1
|
||||
|
@@ -26,7 +26,12 @@ latency:
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy SP 1.0
|
||||
vmovaps ymm1, ymm0
|
||||
vmovapd ymm1, ymm0
|
||||
|
||||
# Mark registers as AVX
|
||||
vmovapd ymm0, ymm0
|
||||
vmovapd ymm1, ymm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
|
@@ -27,6 +27,11 @@ latency:
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy SP 1.0
|
||||
vmovaps ymm1, ymm0
|
||||
|
||||
# Mark registers as AVX
|
||||
vmovaps ymm0, ymm0
|
||||
vmovaps ymm1, ymm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
|
@@ -29,6 +29,18 @@ latency:
|
||||
vaddpd ymm1, ymm0, ymm0
|
||||
# create SP 0.5
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
|
||||
# Mark registers as AVX
|
||||
vmovapd ymm0, ymm0
|
||||
vmovapd ymm1, ymm1
|
||||
vmovapd ymm2, ymm2
|
||||
vmovapd ymm3, ymm3
|
||||
vmovapd ymm4, ymm4
|
||||
vmovapd ymm5, ymm5
|
||||
vmovapd ymm6, ymm6
|
||||
vmovapd ymm7, ymm7
|
||||
vmovapd ymm8, ymm8
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0, ymm1
|
||||
|
@@ -29,6 +29,12 @@ latency:
|
||||
vaddpd ymm1, ymm0, ymm0
|
||||
# create SP 0.5
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
|
||||
# Mark registers as AVX
|
||||
vmovapd ymm0, ymm0
|
||||
vmovapd ymm1, ymm1
|
||||
vmovapd ymm2, ymm2
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
|
@@ -29,6 +29,12 @@ latency:
|
||||
vaddps ymm1, ymm0, ymm0
|
||||
# create SP 0.5
|
||||
vdivps ymm2, ymm0, ymm1
|
||||
|
||||
# Mark registers as AVX
|
||||
vmovaps ymm0, ymm0
|
||||
vmovaps ymm1, ymm1
|
||||
vmovaps ymm2, ymm2
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
|
@@ -27,6 +27,11 @@ latency:
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy SP 1.0
|
||||
vmovaps ymm1, ymm0
|
||||
|
||||
# Mark registers AVX
|
||||
vmovapd ymm0, ymm0
|
||||
vmovapd ymm1, ymm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm1, ymm1
|
51
src/FMA/vfmadd213pd-avx512.S
Normal file
51
src/FMA/vfmadd213pd-avx512.S
Normal file
@@ -0,0 +1,51 @@
|
||||
# Latency benchmark for vfmadd213pd on zmm registers: six chained packed
# double-precision FMAs per iteration.
#
# Exports:
#   ninst   - 32-bit constant holding NINST (FMAs per loop iteration)
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), zmm0, zmm1, flags. rax is never written.
#define INSTR vfmadd213pd
#define NINST 6
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep only the top 10 bits (64 - 54)
        vpsrlq  xmm0, xmm0, 2                   # sign = 0, exponent = 0x3FF, mantissa = 0
        # Widen the constant: xmm -> ymm -> zmm.
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # Second operand: copy of the DP 1.0 vector.
        vmovapd zmm1, zmm0

        # Mark both registers as AVX-512 by rewriting them with a 512-bit
        # packed-double move before the timed loop.
        vmovapd zmm0, zmm0
        vmovapd zmm1, zmm1

        # Each FMA computes zmm0 = zmm1 * zmm0 + zmm1 (that is zmm0 + 1.0 per
        # lane) and feeds the next one.
.Lloop:
        inc     i
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        cmp     i, N                            # flags are untouched by the FMAs below
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        jl      .Lloop

        # Clear the upper ymm/zmm state dirtied above so that legacy SSE code
        # executed after the return does not pay AVX-SSE transition costs.
        # Placed after the loop, so loop placement and encoding are unchanged.
        vzeroupper
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
@@ -25,6 +25,11 @@ latency:
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
|
||||
# Mark registers SSE
|
||||
movapd xmm0, xmm0
|
||||
movapd xmm1, xmm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm1
|
@@ -27,6 +27,11 @@ latency:
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy SP 1.0
|
||||
vmovaps ymm1, ymm0
|
||||
|
||||
# Mark registers AVX
|
||||
vmovaps ymm0, ymm0
|
||||
vmovaps ymm1, ymm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm1, ymm1
|
51
src/FMA/vfmadd213ps-avx512.S
Normal file
51
src/FMA/vfmadd213ps-avx512.S
Normal file
@@ -0,0 +1,51 @@
|
||||
# Latency benchmark for vfmadd213ps on zmm registers: six chained packed
# single-precision FMAs per iteration.
#
# Exports:
#   ninst   - 32-bit constant holding NINST (FMAs per loop iteration)
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), zmm0, zmm1, flags. rax is never written.
#define INSTR vfmadd213ps
#define NINST 6
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build SP 1.0 (0x3F800000) in all four dword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpslld  xmm0, xmm0, 25                  # keep only the top 7 bits (32 - 25)
        vpsrld  xmm0, xmm0, 2                   # sign = 0, exponent = 0x7F, mantissa = 0
        # Widen the constant: xmm -> ymm -> zmm.
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # Second operand: copy of the SP 1.0 vector.
        vmovaps zmm1, zmm0

        # Mark both registers as AVX-512 by rewriting them with a 512-bit
        # packed-single move before the timed loop.
        vmovaps zmm0, zmm0
        vmovaps zmm1, zmm1

        # Each FMA computes zmm0 = zmm1 * zmm0 + zmm1 (that is zmm0 + 1.0 per
        # lane) and feeds the next one.
.Lloop:
        inc     i
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        cmp     i, N                            # flags are untouched by the FMAs below
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        jl      .Lloop

        # Clear the upper ymm/zmm state dirtied above so that legacy SSE code
        # executed after the return does not pay AVX-SSE transition costs.
        # Placed after the loop, so loop placement and encoding are unchanged.
        vzeroupper
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
@@ -25,6 +25,11 @@ latency:
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
|
||||
# Mark registers SSE
|
||||
movaps xmm0, xmm0
|
||||
movaps xmm1, xmm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm1
|
@@ -25,6 +25,11 @@ latency:
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
|
||||
# Mark registers scalar
|
||||
movsd xmm0, xmm0
|
||||
movsd xmm1, xmm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm1
|
@@ -25,6 +25,11 @@ latency:
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
|
||||
# Mark registers scalar
|
||||
movss xmm0, xmm0
|
||||
movss xmm1, xmm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm1
|
@@ -24,7 +24,20 @@ latency:
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (10 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
vmovapd xmm1, xmm0
|
||||
vmovapd xmm2, xmm0
|
||||
|
||||
# Mark registers as SSE
|
||||
movapd xmm0, xmm0
|
||||
movapd xmm1, xmm1
|
||||
movapd xmm2, xmm2
|
||||
movapd xmm3, xmm3
|
||||
movapd xmm4, xmm4
|
||||
movapd xmm5, xmm5
|
||||
movapd xmm6, xmm6
|
||||
movapd xmm7, xmm7
|
||||
movapd xmm8, xmm8
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
|
@@ -24,7 +24,12 @@ latency:
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (10 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
vmovapd xmm1, xmm0
|
||||
|
||||
# Mark registers as SSE
|
||||
movapd xmm0, xmm0
|
||||
movapd xmm1, xmm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
|
@@ -27,6 +27,18 @@ latency:
|
||||
vaddpd xmm1, xmm0, xmm0
|
||||
# create SP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
|
||||
# Mark registers as SSE
|
||||
movapd xmm0, xmm0
|
||||
movapd xmm1, xmm1
|
||||
movapd xmm2, xmm2
|
||||
movapd xmm3, xmm3
|
||||
movapd xmm4, xmm4
|
||||
movapd xmm5, xmm5
|
||||
movapd xmm6, xmm6
|
||||
movapd xmm7, xmm7
|
||||
movapd xmm8, xmm8
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
|
@@ -27,6 +27,12 @@ latency:
|
||||
vaddpd xmm1, xmm0, xmm0
|
||||
# create SP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
|
||||
# Mark registers as SSE
|
||||
movapd xmm0, xmm0
|
||||
movapd xmm1, xmm1
|
||||
movapd xmm2, xmm2
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
|
@@ -27,6 +27,12 @@ latency:
|
||||
vaddps xmm1, xmm0, xmm0
|
||||
# create SP 0.5
|
||||
vdivps xmm2, xmm0, xmm1
|
||||
|
||||
# Mark registers as SSE
|
||||
movaps xmm0, xmm0
|
||||
movaps xmm1, xmm1
|
||||
movaps xmm2, xmm2
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
|
47
src/scalar/addsd.S
Normal file
47
src/scalar/addsd.S
Normal file
@@ -0,0 +1,47 @@
|
||||
# Latency benchmark for legacy-SSE addsd: six chained scalar double adds per
# iteration.
#
# Exports:
#   ninst   - 32-bit constant holding NINST (adds per loop iteration)
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), xmm0, xmm1, flags. rax is never written.
#define INSTR addsd
#define NINST 6
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep only the top 10 bits (64 - 54)
        vpsrlq  xmm0, xmm0, 2                   # sign = 0, exponent = 0x3FF, mantissa = 0
        # Second operand: a bitwise copy of the DP 1.0 constant.
        vmovaps xmm1, xmm0

        # Mark both registers as scalar double by rewriting them with a
        # legacy-SSE scalar self-move before the timed loop.
        movsd   xmm0, xmm0
        movsd   xmm1, xmm1

        # Each add computes xmm0 = xmm0 + xmm1 and feeds the next one.
.Lloop:
        inc     i
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm1
        cmp     i, N                            # flags are untouched by the adds below
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm1
        jl      .Lloop
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
47
src/scalar/addss.S
Normal file
47
src/scalar/addss.S
Normal file
@@ -0,0 +1,47 @@
|
||||
# Latency benchmark for legacy-SSE addss: six chained scalar single adds per
# iteration.
#
# Exports:
#   ninst   - 32-bit constant holding NINST (adds per loop iteration)
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), xmm0, xmm1, flags. rax is never written.
#define INSTR addss
#define NINST 6
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build SP 1.0 (0x3F800000) in all four dword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpslld  xmm0, xmm0, 25                  # keep only the top 7 bits (32 - 25)
        vpsrld  xmm0, xmm0, 2                   # sign = 0, exponent = 0x7F, mantissa = 0
        # Second operand: copy of the SP 1.0 constant.
        vmovaps xmm1, xmm0

        # Mark both registers as scalar single by rewriting them with a
        # legacy-SSE scalar self-move before the timed loop.
        movss   xmm0, xmm0
        movss   xmm1, xmm1

        # Each add computes xmm0 = xmm0 + xmm1 and feeds the next one.
.Lloop:
        inc     i
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm1
        cmp     i, N                            # flags are untouched by the adds below
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm1
        jl      .Lloop
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
50
src/scalar/mulsd.S
Normal file
50
src/scalar/mulsd.S
Normal file
@@ -0,0 +1,50 @@
|
||||
# Latency benchmark for legacy-SSE mulsd: six chained scalar double
# multiplies per iteration.
#
# Exports:
#   ninst   - 32-bit constant holding NINST (multiplies per loop iteration)
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), xmm0-xmm2, flags. rax is never written.
#define INSTR mulsd
#define NINST 6
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep only the top 10 bits (64 - 54)
        vpsrlq  xmm0, xmm0, 2                   # sign = 0, exponent = 0x3FF, mantissa = 0
        vaddpd  xmm1, xmm0, xmm0                # xmm1 = 2.0 in both lanes
        vdivpd  xmm2, xmm0, xmm1                # xmm2 = 0.5 in both lanes

        # Mark the registers as scalar double by rewriting them with a
        # legacy-SSE scalar self-move before the timed loop.
        movsd   xmm0, xmm0
        movsd   xmm1, xmm1
        movsd   xmm2, xmm2

        # Alternate *2.0 and *0.5 so the running value stays bounded; each
        # multiply consumes the previous result.
.Lloop:
        inc     i
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm2
        INSTR   xmm0, xmm1
        cmp     i, N                            # flags are untouched by the multiplies below
        INSTR   xmm0, xmm2
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm2
        jl      .Lloop
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
50
src/scalar/mulss.S
Normal file
50
src/scalar/mulss.S
Normal file
@@ -0,0 +1,50 @@
|
||||
# Latency benchmark for legacy-SSE mulss: six chained scalar single
# multiplies per iteration.
#
# Exports:
#   ninst   - 32-bit constant holding NINST (multiplies per loop iteration)
#   latency - runs the dependent chain; the iteration count is read from edi
#             (first integer argument under the System V AMD64 ABI)
# Scratch: r8d (loop counter), xmm0-xmm2, flags. rax is never written.
#define INSTR mulss
#define NINST 6
#define N edi
#define i r8d

.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 5
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # count <= 0: skip everything

        # Build SP 1.0 (0x3F800000) in all four dword lanes of xmm0.
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpslld  xmm0, xmm0, 25                  # keep only the top 7 bits (32 - 25)
        vpsrld  xmm0, xmm0, 2                   # sign = 0, exponent = 0x7F, mantissa = 0
        vaddps  xmm1, xmm0, xmm0                # xmm1 = 2.0 in every lane
        vdivps  xmm2, xmm0, xmm1                # xmm2 = 0.5 in every lane

        # Mark the registers as scalar single by rewriting them with a
        # legacy-SSE scalar self-move before the timed loop.
        movss   xmm0, xmm0
        movss   xmm1, xmm1
        movss   xmm2, xmm2

        # Alternate *2.0 and *0.5 so the running value stays bounded; each
        # multiply consumes the previous result.
.Lloop:
        inc     i
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm2
        INSTR   xmm0, xmm1
        cmp     i, N                            # flags are untouched by the multiplies below
        INSTR   xmm0, xmm2
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm2
        jl      .Lloop
.Ldone:
        leave                                   # same as mov rsp, rbp / pop rbp
        ret
        .size   latency, .-latency
|
@@ -37,11 +37,15 @@ latency:
|
||||
vdivss xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
|
||||
# Mark registers as scalar
|
||||
movss xmm0, xmm0
|
||||
movss xmm1, xmm0
|
||||
movss xmm2, xmm0
|
||||
movss xmm3, xmm0
|
||||
movss xmm4, xmm0
|
||||
movss xmm5, xmm0
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm10, xmm0
|
@@ -1,4 +1,4 @@
|
||||
#define INSTR vrcpps
|
||||
#define INSTR rcpss
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
@@ -37,6 +37,15 @@ latency:
|
||||
vdivps xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
|
||||
# Mark registers as scalar
|
||||
movss xmm0, xmm0
|
||||
movss xmm1, xmm1
|
||||
movss xmm2, xmm2
|
||||
movss xmm3, xmm3
|
||||
movss xmm4, xmm4
|
||||
movss xmm5, xmm5
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm1, xmm0
|
@@ -37,11 +37,15 @@ latency:
|
||||
vdivsd xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivsd xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddsd xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
|
||||
# Mark registers as scalar
|
||||
movsd xmm0, xmm0
|
||||
movsd xmm1, xmm0
|
||||
movsd xmm2, xmm0
|
||||
movsd xmm3, xmm0
|
||||
movsd xmm4, xmm0
|
||||
movsd xmm5, xmm0
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm10, xmm0
|
@@ -38,6 +38,16 @@ latency:
|
||||
vdivpd xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddpd xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
vmovapd xmm10, xmm0 # save value
|
||||
|
||||
# Mark registers as scalar
|
||||
movsd xmm0, xmm0
|
||||
movsd xmm1, xmm1
|
||||
movsd xmm2, xmm2
|
||||
movsd xmm3, xmm3
|
||||
movsd xmm4, xmm4
|
||||
movsd xmm5, xmm5
|
||||
movsd xmm10, xmm10
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm1, xmm0
|
@@ -37,11 +37,15 @@ latency:
|
||||
vdivss xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
|
||||
# Mark registers as scalar
|
||||
movss xmm0, xmm0
|
||||
movss xmm1, xmm0
|
||||
movss xmm2, xmm0
|
||||
movss xmm3, xmm0
|
||||
movss xmm4, xmm0
|
||||
movss xmm5, xmm0
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm10, xmm0
|
@@ -38,6 +38,16 @@ latency:
|
||||
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
vmovaps xmm10, xmm0 # save value
|
||||
|
||||
# Mark registers as scalar
|
||||
movss xmm0, xmm0
|
||||
movss xmm1, xmm1
|
||||
movss xmm2, xmm2
|
||||
movss xmm3, xmm3
|
||||
movss xmm4, xmm4
|
||||
movss xmm5, xmm5
|
||||
movss xmm10, xmm10
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm1, xmm0
|
@@ -25,6 +25,18 @@ latency:
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
|
||||
# Mark registers as scalar
|
||||
movsd xmm0, xmm0
|
||||
movsd xmm1, xmm0
|
||||
movsd xmm2, xmm0
|
||||
movsd xmm3, xmm0
|
||||
movsd xmm4, xmm0
|
||||
movsd xmm5, xmm0
|
||||
movsd xmm6, xmm0
|
||||
movsd xmm7, xmm0
|
||||
movsd xmm8, xmm0
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
@@ -25,6 +25,11 @@ latency:
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
|
||||
# Mark registers as scalar
|
||||
movsd xmm0, xmm0
|
||||
movsd xmm1, xmm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
@@ -25,6 +25,18 @@ latency:
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
|
||||
# Mark registers as scalar
|
||||
movss xmm0, xmm0
|
||||
movss xmm1, xmm0
|
||||
movss xmm2, xmm0
|
||||
movss xmm3, xmm0
|
||||
movss xmm4, xmm0
|
||||
movss xmm5, xmm0
|
||||
movss xmm6, xmm0
|
||||
movss xmm7, xmm0
|
||||
movss xmm8, xmm0
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
@@ -25,6 +25,11 @@ latency:
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
|
||||
# Mark registers as scalar
|
||||
movss xmm0, xmm0
|
||||
movss xmm1, xmm1
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
@@ -37,6 +37,18 @@ latency:
|
||||
vdivsd xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivsd xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddsd xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
|
||||
# Mark registers as scalar
|
||||
movsd xmm0, xmm0
|
||||
movsd xmm1, xmm0
|
||||
movsd xmm2, xmm0
|
||||
movsd xmm3, xmm0
|
||||
movsd xmm4, xmm0
|
||||
movsd xmm5, xmm0
|
||||
movsd xmm6, xmm0
|
||||
movsd xmm7, xmm0
|
||||
movsd xmm8, xmm0
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
@@ -37,6 +37,12 @@ latency:
|
||||
vdivsd xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivsd xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddsd xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
|
||||
# Mark registers as scalar
|
||||
movsd xmm0, xmm0
|
||||
movsd xmm1, xmm1
|
||||
movsd xmm2, xmm2
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
@@ -37,6 +37,18 @@ latency:
|
||||
vdivss xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
|
||||
# Mark registers as scalar
|
||||
movss xmm0, xmm0
|
||||
movss xmm1, xmm0
|
||||
movss xmm2, xmm0
|
||||
movss xmm3, xmm0
|
||||
movss xmm4, xmm0
|
||||
movss xmm5, xmm0
|
||||
movss xmm6, xmm0
|
||||
movss xmm7, xmm0
|
||||
movss xmm8, xmm0
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
@@ -37,6 +37,12 @@ latency:
|
||||
vdivss xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
|
||||
# Mark registers as scalar
|
||||
movss xmm0, xmm0
|
||||
movss xmm1, xmm1
|
||||
movss xmm2, xmm2
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
@@ -27,6 +27,18 @@ latency:
|
||||
vaddps xmm1, xmm0, xmm0
|
||||
# create SP 0.5
|
||||
vdivps xmm2, xmm0, xmm1
|
||||
|
||||
# Mark registers as scalar
|
||||
movsd xmm0, xmm0
|
||||
movsd xmm1, xmm1
|
||||
movsd xmm2, xmm2
|
||||
movsd xmm3, xmm3
|
||||
movsd xmm4, xmm4
|
||||
movsd xmm5, xmm5
|
||||
movsd xmm6, xmm6
|
||||
movsd xmm7, xmm7
|
||||
movsd xmm8, xmm8
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
@@ -27,6 +27,12 @@ latency:
|
||||
vaddpd xmm1, xmm0, xmm0
|
||||
# create SP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
|
||||
# Mark registers as scalar
|
||||
movsd xmm0, xmm0
|
||||
movsd xmm1, xmm1
|
||||
movsd xmm2, xmm2
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
@@ -27,6 +27,18 @@ latency:
|
||||
vaddps xmm1, xmm0, xmm0
|
||||
# create SP 0.5
|
||||
vdivps xmm2, xmm0, xmm1
|
||||
|
||||
# Mark registers as scalar
|
||||
movsd xmm0, xmm0
|
||||
movsd xmm1, xmm1
|
||||
movsd xmm2, xmm2
|
||||
movsd xmm3, xmm3
|
||||
movsd xmm4, xmm4
|
||||
movsd xmm5, xmm5
|
||||
movsd xmm6, xmm6
|
||||
movsd xmm7, xmm7
|
||||
movsd xmm8, xmm8
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
@@ -27,6 +27,12 @@ latency:
|
||||
vaddps xmm1, xmm0, xmm0
|
||||
# create SP 0.5
|
||||
vdivps xmm2, xmm0, xmm1
|
||||
|
||||
# Mark registers as scalar
|
||||
movss xmm0, xmm0
|
||||
movss xmm1, xmm1
|
||||
movss xmm2, xmm2
|
||||
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
Reference in New Issue
Block a user