mark registers before using them

This commit is contained in:
Johannes Hofmann
2019-03-22 12:43:44 +01:00
parent 6e2d119109
commit f8f004d575
82 changed files with 894 additions and 5 deletions

View File

@@ -0,0 +1,40 @@
#define INSTR vfmadd213sd
#define NINST 1
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: dependent chain of two scalar DP FMAs followed by one scalar DP
# multiply, all accumulating into the low lane of xmm0.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# Note for review: INSTR is defined but the loop spells out vfmadd231sd
# directly, and NINST is 1 although three FP instructions are issued per
# iteration; presumably the whole chain is counted as one unit. Confirm
# against the harness that reads ninst.

        .data
        .globl  ninst
ninst:
        .long   NINST                   # exported copy of the NINST macro

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     .Lexit                  # skip everything unless N > 0 (signed)
        # build DP 1.0 (0x3FF0000000000000) in each qword of xmm0
        vpcmpeqw xmm0, xmm0, xmm0       # every bit set
        vpsllq  xmm0, xmm0, 54          # keep only the top 10 bits of each qword
        vpsrlq  xmm0, xmm0, 2           # shift them to bits 61..52: sign 0, exponent 0x3FF
        # replicate 1.0 into both multiplicand registers
        vmovapd xmm1, xmm0
        vmovapd xmm2, xmm0
.Lbody:
        inc     i
        vfmadd231sd xmm0, xmm1, xmm2    # low lane: xmm0 = xmm1 * xmm2 + xmm0
        vfmadd231sd xmm0, xmm2, xmm1    # low lane: xmm0 = xmm2 * xmm1 + xmm0
        vmulsd  xmm0, xmm0, xmm1        # low lane: xmm0 = xmm0 * xmm1
        cmp     i, N
        jl      .Lbody                  # repeat while counter < N (signed)
.Lexit:
        leave                           # same effect as mov rsp, rbp then pop rbp
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,44 @@
#define INSTR vfmadd213sd
#define NINST 1
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: dependent chain of two scalar DP FMAs followed by one scalar DP
# multiply on xmm0, with the constants first broadcast through ymm and zmm
# so the registers have been written at 512-bit width before the loop.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# Note for review: INSTR is unused by the loop and NINST is 1 for a chain of
# three FP instructions; presumably intentional. Confirm against the harness.

.globl ninst
.data
ninst:
        .long   NINST                   # exported copy of the NINST macro
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     done                    # skip everything unless N > 0 (signed)
        # create SSE DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq  xmm0, xmm0, 54          # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # expand from SSE to AVX
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        # expand from AVX to AVX-512
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # copy DP 1.0
        vmovapd zmm1, zmm0
        vmovapd zmm2, zmm0
loop:
        inc     i
        vfmadd231sd xmm0, xmm1, xmm2    # low lane: xmm0 = xmm1 * xmm2 + xmm0
        vfmadd231sd xmm0, xmm2, xmm1    # low lane: xmm0 = xmm2 * xmm1 + xmm0
        vmulsd  xmm0, xmm0, xmm1        # low lane: xmm0 = xmm0 * xmm1
        cmp     i, N
        jl      loop                    # repeat while counter < N (signed)
done:
        # Clear bits 128 and above of the vector registers before returning,
        # so code that runs afterwards does not inherit dirty upper state
        # left behind by the zmm writes above.
        vzeroupper
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -0,0 +1,41 @@
#define INSTR vfmadd213sd
#define NINST 1
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: dependent chain of one scalar DP FMA followed by one scalar DP
# multiply, both accumulating into the low lane of xmm0.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# Note for review: INSTR is unused by the loop and NINST is 1 for a chain of
# two FP instructions; presumably intentional. Confirm against the harness.

        .data
        .globl  ninst
ninst:
        .long   NINST                   # exported copy of the NINST macro

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     .Lexit                  # skip everything unless N > 0 (signed)
        # build DP 1.0 (0x3FF0000000000000) in each qword of xmm0
        vpcmpeqw xmm0, xmm0, xmm0       # every bit set
        vpsllq  xmm0, xmm0, 54          # keep only the top 10 bits of each qword
        vpsrlq  xmm0, xmm0, 2           # shift them to bits 61..52: sign 0, exponent 0x3FF
        # scalar copies with the legacy encoding: only the low qword of the
        # destination is written, its upper qword is left as it was
        movsd   xmm0, xmm0
        movsd   xmm1, xmm0
        movsd   xmm2, xmm0
.Lbody:
        inc     i
        vfmadd231sd xmm0, xmm1, xmm2    # low lane: xmm0 = xmm1 * xmm2 + xmm0
        vmulsd  xmm0, xmm0, xmm1        # low lane: xmm0 = xmm0 * xmm1
        cmp     i, N
        jl      .Lbody                  # repeat while counter < N (signed)
.Lexit:
        leave                           # same effect as mov rsp, rbp then pop rbp
        ret
        .size   latency, .-latency

43
src/AVX-512/LAT-FMA-MUL.S Normal file
View File

@@ -0,0 +1,43 @@
#define INSTR vfmadd213sd
#define NINST 1
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: dependent chain of one scalar DP FMA followed by one scalar DP
# multiply on xmm0, with the constants first broadcast through ymm and zmm
# so the registers have been written at 512-bit width before the loop.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# Note for review: INSTR is unused by the loop and NINST is 1 for a chain of
# two FP instructions; presumably intentional. Confirm against the harness.

.globl ninst
.data
ninst:
        .long   NINST                   # exported copy of the NINST macro
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     done                    # skip everything unless N > 0 (signed)
        # create SSE DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq  xmm0, xmm0, 54          # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # expand from SSE to AVX
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        # expand from AVX to AVX-512
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # copy DP 1.0
        vmovapd zmm1, zmm0
        vmovapd zmm2, zmm0
loop:
        inc     i
        vfmadd231sd xmm0, xmm1, xmm2    # low lane: xmm0 = xmm1 * xmm2 + xmm0
        vmulsd  xmm0, xmm0, xmm1        # low lane: xmm0 = xmm0 * xmm1
        cmp     i, N
        jl      loop                    # repeat while counter < N (signed)
done:
        # Clear bits 128 and above of the vector registers before returning,
        # so code that runs afterwards does not inherit dirty upper state
        # left behind by the zmm writes above.
        vzeroupper
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -29,6 +29,11 @@ latency:
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# copy DP 1.0
vmovapd zmm1, zmm0
# Mark registers as AVX-512
vmovapd zmm0, zmm0
vmovapd zmm1, zmm1
loop:
inc i
INSTR zmm0, zmm0, zmm1

View File

@@ -29,6 +29,11 @@ latency:
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# copy SP 1.0
vmovaps zmm1, zmm0
# Mark registers as AVX-512
vmovapd zmm0, zmm0
vmovapd zmm1, zmm1
loop:
inc i
INSTR zmm0, zmm0, zmm1

View File

@@ -0,0 +1,48 @@
#define INSTR vfmadd213sd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: six back-to-back scalar DP FMAs (INSTR) per iteration, each one
# depending on the previous result in xmm0.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# ninst exports NINST, which matches the six INSTR lines in the loop.

.globl ninst
.data
ninst:
        .long   NINST                   # exported copy of the NINST macro
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     done                    # skip everything unless N > 0 (signed)
        # create SSE DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq  xmm0, xmm0, 54          # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # expand from SSE to AVX
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        # expand from AVX to AVX-512
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # copy DP 1.0
        # The source must be zmm0: zmm2 is never written in this routine, and
        # filling xmm1 with all-ones afterwards would turn the operand into a
        # NaN bit pattern instead of 1.0.
        vmovapd zmm1, zmm0
loop:
        inc     i
        INSTR   xmm0, xmm1, xmm1        # low lane: xmm0 = xmm1 * xmm0 + xmm1
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        cmp     i, N                    # flags stay valid: the FMAs below do not write them
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        INSTR   xmm0, xmm1, xmm1
        jl      loop                    # repeat while counter < N (signed)
done:
        # Clear bits 128 and above of the vector registers before returning,
        # so code that runs afterwards does not inherit dirty upper state
        # left behind by the zmm writes above.
        vzeroupper
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -0,0 +1,42 @@
#define INSTR vfmadd213sd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: six back-to-back scalar DP FMAs (INSTR) per iteration, each one
# depending on the previous result in xmm0.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# ninst exports NINST, which matches the six INSTR instances in the loop.

        .data
        .globl  ninst
ninst:
        .long   NINST                   # exported copy of the NINST macro

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     .Lexit                  # skip everything unless N > 0 (signed)
        # build DP 1.0 (0x3FF0000000000000) in each qword of xmm0
        vpcmpeqw xmm0, xmm0, xmm0       # every bit set
        vpsllq  xmm0, xmm0, 54          # keep only the top 10 bits of each qword
        vpsrlq  xmm0, xmm0, 2           # shift them to bits 61..52: sign 0, exponent 0x3FF
        # second register holding the same DP 1.0
        vmovaps xmm1, xmm0
.Lbody:
        inc     i
        .rept   3
        INSTR   xmm0, xmm1, xmm1        # low lane: xmm0 = xmm1 * xmm0 + xmm1
        .endr
        cmp     i, N                    # flags stay valid: the FMAs below do not write them
        .rept   3
        INSTR   xmm0, xmm1, xmm1
        .endr
        jl      .Lbody                  # repeat while counter < N (signed)
.Lexit:
        leave                           # same effect as mov rsp, rbp then pop rbp
        ret
        .size   latency, .-latency

View File

@@ -31,6 +31,12 @@ latency:
vaddpd zmm1, zmm0, zmm0
# create AVX-512 DP 0.5
vdivpd zmm2, zmm0, zmm1
# Mark registers as AVX-512
vmovapd zmm0, zmm0
vmovapd zmm1, zmm1
vmovapd zmm2, zmm2
loop:
inc i
INSTR zmm0, zmm0, zmm1

View File

@@ -31,6 +31,12 @@ latency:
vaddps zmm1, zmm0, zmm0
# create AVX-512 DP 0.5
vdivps zmm2, zmm0, zmm1
# Mark registers as AVX-512
vmovapd zmm0, zmm0
vmovapd zmm1, zmm1
vmovapd zmm2, zmm2
loop:
inc i
INSTR zmm0, zmm0, zmm1

View File

@@ -0,0 +1,48 @@
#define INSTR vmulsd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: six back-to-back scalar DP multiplies (INSTR) per iteration on
# xmm0, alternating the factors 2.0 and 0.5 so the value returns to its
# starting point after every pair.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# ninst exports NINST, which matches the six INSTR lines in the loop.

.globl ninst
.data
ninst:
        .long   NINST                   # exported copy of the NINST macro
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     done                    # skip everything unless N > 0 (signed)
        # create SSE DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq  xmm0, xmm0, 54          # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # expand from SSE to AVX
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        # expand from AVX to AVX-512
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # create AVX-512 DP 2.0
        vaddpd  zmm1, zmm0, zmm0
        # create AVX-512 DP 0.5
        vdivpd  zmm2, zmm0, zmm1
loop:
        inc     i
        INSTR   xmm0, xmm0, xmm1        # times 2.0
        INSTR   xmm0, xmm0, xmm2        # times 0.5
        INSTR   xmm0, xmm0, xmm1
        cmp     i, N                    # flags stay valid: the multiplies below do not write them
        INSTR   xmm0, xmm0, xmm2
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm2
        jl      loop                    # repeat while counter < N (signed)
done:
        # Clear bits 128 and above of the vector registers before returning,
        # so code that runs afterwards does not inherit dirty upper state
        # left behind by the zmm writes above.
        vzeroupper
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -26,7 +26,19 @@ latency:
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy SP 1.0
vmovaps ymm1, ymm0
vmovapd ymm1, ymm0
vmovapd ymm2, ymm0
# Mark registers as AVX
vmovapd ymm0, ymm0
vmovapd ymm1, ymm1
vmovapd ymm2, ymm2
vmovapd ymm3, ymm3
vmovapd ymm4, ymm4
vmovapd ymm5, ymm5
vmovapd ymm6, ymm6
vmovapd ymm7, ymm7
vmovapd ymm8, ymm8
loop:
inc i
INSTR ymm3, ymm0, ymm1

View File

@@ -26,7 +26,12 @@ latency:
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy SP 1.0
vmovaps ymm1, ymm0
vmovapd ymm1, ymm0
# Mark registers as AVX
vmovapd ymm0, ymm0
vmovapd ymm1, ymm1
loop:
inc i
INSTR ymm0, ymm0, ymm1

View File

@@ -27,6 +27,11 @@ latency:
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy SP 1.0
vmovaps ymm1, ymm0
# Mark registers as AVX
vmovaps ymm0, ymm0
vmovaps ymm1, ymm1
loop:
inc i
INSTR ymm0, ymm0, ymm1

View File

@@ -29,6 +29,18 @@ latency:
vaddpd ymm1, ymm0, ymm0
# create SP 0.5
vdivpd ymm2, ymm0, ymm1
# Mark registers as AVX
vmovapd ymm0, ymm0
vmovapd ymm1, ymm1
vmovapd ymm2, ymm2
vmovapd ymm3, ymm3
vmovapd ymm4, ymm4
vmovapd ymm5, ymm5
vmovapd ymm6, ymm6
vmovapd ymm7, ymm7
vmovapd ymm8, ymm8
loop:
inc i
INSTR ymm3, ymm0, ymm1

View File

@@ -29,6 +29,12 @@ latency:
vaddpd ymm1, ymm0, ymm0
# create SP 0.5
vdivpd ymm2, ymm0, ymm1
# Mark registers as AVX
vmovapd ymm0, ymm0
vmovapd ymm1, ymm1
vmovapd ymm2, ymm2
loop:
inc i
INSTR ymm0, ymm0, ymm1

View File

@@ -29,6 +29,12 @@ latency:
vaddps ymm1, ymm0, ymm0
# create SP 0.5
vdivps ymm2, ymm0, ymm1
# Mark registers as AVX
vmovaps ymm0, ymm0
vmovaps ymm1, ymm1
vmovaps ymm2, ymm2
loop:
inc i
INSTR ymm0, ymm0, ymm1

View File

@@ -27,6 +27,11 @@ latency:
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy SP 1.0
vmovaps ymm1, ymm0
# Mark registers AVX
vmovapd ymm0, ymm0
vmovapd ymm1, ymm1
loop:
inc i
INSTR ymm0, ymm1, ymm1

View File

@@ -0,0 +1,51 @@
#define INSTR vfmadd213pd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: six back-to-back packed DP FMAs (INSTR) on 512-bit registers per
# iteration, each one depending on the previous result in zmm0.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# ninst exports NINST, which matches the six INSTR lines in the loop.

.globl ninst
.data
ninst:
        .long   NINST                   # exported copy of the NINST macro
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     done                    # skip everything unless N > 0 (signed)
        # create SSE DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq  xmm0, xmm0, 54          # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # expand from SSE to AVX
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        # expand from AVX to AVX-512
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # copy DP 1.0
        vmovapd zmm1, zmm0
        # Mark registers AVX-512 (self-moves at the width the loop uses)
        vmovapd zmm0, zmm0
        vmovapd zmm1, zmm1
loop:
        inc     i
        INSTR   zmm0, zmm1, zmm1        # zmm0 = zmm1 * zmm0 + zmm1
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        cmp     i, N                    # flags stay valid: the FMAs below do not write them
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        jl      loop                    # repeat while counter < N (signed)
done:
        # Clear bits 128 and above of the vector registers before returning,
        # so code that runs afterwards does not inherit dirty upper state
        # left behind by the zmm writes above.
        vzeroupper
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -25,6 +25,11 @@ latency:
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# copy SP 1.0
vmovaps xmm1, xmm0
# Mark registers SSE
movapd xmm0, xmm0
movapd xmm1, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm1

View File

@@ -27,6 +27,11 @@ latency:
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy SP 1.0
vmovaps ymm1, ymm0
# Mark registers AVX
vmovaps ymm0, ymm0
vmovaps ymm1, ymm1
loop:
inc i
INSTR ymm0, ymm1, ymm1

View File

@@ -0,0 +1,51 @@
#define INSTR vfmadd213ps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: six back-to-back packed SP FMAs (INSTR) on 512-bit registers per
# iteration, each one depending on the previous result in zmm0.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# ninst exports NINST, which matches the six INSTR lines in the loop.

.globl ninst
.data
ninst:
        .long   NINST                   # exported copy of the NINST macro
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     done                    # skip everything unless N > 0 (signed)
        # create SSE SP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpslld  xmm0, xmm0, 25          # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2           # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # expand from SSE to AVX
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        # expand from AVX to AVX-512
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # copy SP 1.0
        vmovaps zmm1, zmm0
        # Mark registers AVX-512 (self-moves at the width the loop uses)
        vmovaps zmm0, zmm0
        vmovaps zmm1, zmm1
loop:
        inc     i
        INSTR   zmm0, zmm1, zmm1        # zmm0 = zmm1 * zmm0 + zmm1
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        cmp     i, N                    # flags stay valid: the FMAs below do not write them
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        INSTR   zmm0, zmm1, zmm1
        jl      loop                    # repeat while counter < N (signed)
done:
        # Clear bits 128 and above of the vector registers before returning,
        # so code that runs afterwards does not inherit dirty upper state
        # left behind by the zmm writes above.
        vzeroupper
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -25,6 +25,11 @@ latency:
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# copy SP 1.0
vmovaps xmm1, xmm0
# Mark registers SSE
movaps xmm0, xmm0
movaps xmm1, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm1

View File

@@ -25,6 +25,11 @@ latency:
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# copy SP 1.0
vmovaps xmm1, xmm0
# Mark registers scalar
movsd xmm0, xmm0
movsd xmm1, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm1

View File

@@ -25,6 +25,11 @@ latency:
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# copy SP 1.0
vmovaps xmm1, xmm0
# Mark registers scalar
movss xmm0, xmm0
movss xmm1, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm1

View File

@@ -24,7 +24,20 @@ latency:
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (10 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# copy SP 1.0
vmovaps xmm1, xmm0
vmovapd xmm1, xmm0
vmovapd xmm2, xmm0
# Mark registers as SSE
movapd xmm0, xmm0
movapd xmm1, xmm1
movapd xmm2, xmm2
movapd xmm3, xmm3
movapd xmm4, xmm4
movapd xmm5, xmm5
movapd xmm6, xmm6
movapd xmm7, xmm7
movapd xmm8, xmm8
loop:
inc i
INSTR xmm3, xmm0, xmm1

View File

@@ -24,7 +24,12 @@ latency:
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (10 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# copy SP 1.0
vmovaps xmm1, xmm0
vmovapd xmm1, xmm0
# Mark registers as SSE
movapd xmm0, xmm0
movapd xmm1, xmm1
loop:
inc i
INSTR xmm0, xmm0, xmm1

View File

@@ -27,6 +27,18 @@ latency:
vaddpd xmm1, xmm0, xmm0
# create SP 0.5
vdivpd xmm2, xmm0, xmm1
# Mark registers as SSE
movapd xmm0, xmm0
movapd xmm1, xmm1
movapd xmm2, xmm2
movapd xmm3, xmm3
movapd xmm4, xmm4
movapd xmm5, xmm5
movapd xmm6, xmm6
movapd xmm7, xmm7
movapd xmm8, xmm8
loop:
inc i
INSTR xmm3, xmm0, xmm1

View File

@@ -27,6 +27,12 @@ latency:
vaddpd xmm1, xmm0, xmm0
# create SP 0.5
vdivpd xmm2, xmm0, xmm1
# Mark registers as SSE
movapd xmm0, xmm0
movapd xmm1, xmm1
movapd xmm2, xmm2
loop:
inc i
INSTR xmm0, xmm0, xmm1

View File

@@ -27,6 +27,12 @@ latency:
vaddps xmm1, xmm0, xmm0
# create SP 0.5
vdivps xmm2, xmm0, xmm1
# Mark registers as SSE
movaps xmm0, xmm0
movaps xmm1, xmm1
movaps xmm2, xmm2
loop:
inc i
INSTR xmm0, xmm0, xmm1

47
src/scalar/addsd.S Normal file
View File

@@ -0,0 +1,47 @@
#define INSTR addsd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: six back-to-back scalar DP additions (INSTR, legacy SSE encoding)
# per iteration, each one depending on the previous result in xmm0.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# ninst exports NINST, which matches the six INSTR instances in the loop.

        .data
        .globl  ninst
ninst:
        .long   NINST                   # exported copy of the NINST macro

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     .Lexit                  # skip everything unless N > 0 (signed)
        # build DP 1.0 (0x3FF0000000000000) in each qword of xmm0
        vpcmpeqw xmm0, xmm0, xmm0       # every bit set
        vpsllq  xmm0, xmm0, 54          # keep only the top 10 bits of each qword
        vpsrlq  xmm0, xmm0, 2           # shift them to bits 61..52: sign 0, exponent 0x3FF
        # second register holding the same DP 1.0
        vmovaps xmm1, xmm0
        # Mark registers as scalar (self-moves with the scalar DP move)
        movsd   xmm0, xmm0
        movsd   xmm1, xmm1
.Lbody:
        inc     i
        .rept   3
        INSTR   xmm0, xmm1              # low lane: xmm0 = xmm0 + xmm1
        .endr
        cmp     i, N                    # flags stay valid: the additions below do not write them
        .rept   3
        INSTR   xmm0, xmm1
        .endr
        jl      .Lbody                  # repeat while counter < N (signed)
.Lexit:
        leave                           # same effect as mov rsp, rbp then pop rbp
        ret
        .size   latency, .-latency

47
src/scalar/addss.S Normal file
View File

@@ -0,0 +1,47 @@
#define INSTR addss
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: six back-to-back scalar SP additions (INSTR, legacy SSE encoding)
# per iteration, each one depending on the previous result in xmm0.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# ninst exports NINST, which matches the six INSTR instances in the loop.

        .data
        .globl  ninst
ninst:
        .long   NINST                   # exported copy of the NINST macro

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     .Lexit                  # skip everything unless N > 0 (signed)
        # build SP 1.0 (0x3F800000) in each dword of xmm0
        vpcmpeqw xmm0, xmm0, xmm0       # every bit set
        vpslld  xmm0, xmm0, 25          # keep only the top 7 bits of each dword
        vpsrld  xmm0, xmm0, 2           # shift them to bits 29..23: sign 0, exponent 0x7F
        # second register holding the same SP 1.0
        vmovaps xmm1, xmm0
        # Mark registers as scalar (self-moves with the scalar SP move)
        movss   xmm0, xmm0
        movss   xmm1, xmm1
.Lbody:
        inc     i
        .rept   3
        INSTR   xmm0, xmm1              # low lane: xmm0 = xmm0 + xmm1
        .endr
        cmp     i, N                    # flags stay valid: the additions below do not write them
        .rept   3
        INSTR   xmm0, xmm1
        .endr
        jl      .Lbody                  # repeat while counter < N (signed)
.Lexit:
        leave                           # same effect as mov rsp, rbp then pop rbp
        ret
        .size   latency, .-latency

50
src/scalar/mulsd.S Normal file
View File

@@ -0,0 +1,50 @@
#define INSTR mulsd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: six back-to-back scalar DP multiplies (INSTR, legacy SSE encoding)
# per iteration on xmm0, alternating the factors 2.0 and 0.5 so the value
# returns to its starting point after every pair.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# ninst exports NINST, which matches the six INSTR lines in the loop.

        .data
        .globl  ninst
ninst:
        .long   NINST                   # exported copy of the NINST macro

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     .Lexit                  # skip everything unless N > 0 (signed)
        # build DP 1.0 (0x3FF0000000000000) in each qword of xmm0
        vpcmpeqw xmm0, xmm0, xmm0       # every bit set
        vpsllq  xmm0, xmm0, 54          # keep only the top 10 bits of each qword
        vpsrlq  xmm0, xmm0, 2           # shift them to bits 61..52: sign 0, exponent 0x3FF
        # DP 2.0 = 1.0 + 1.0
        vaddpd  xmm1, xmm0, xmm0
        # DP 0.5 = 1.0 / 2.0
        vdivpd  xmm2, xmm0, xmm1
        # Mark registers as scalar (self-moves with the scalar DP move)
        movsd   xmm0, xmm0
        movsd   xmm1, xmm1
        movsd   xmm2, xmm2
.Lbody:
        inc     i
        INSTR   xmm0, xmm1              # times 2.0
        INSTR   xmm0, xmm2              # times 0.5
        INSTR   xmm0, xmm1
        cmp     i, N                    # flags stay valid: the multiplies below do not write them
        INSTR   xmm0, xmm2
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm2
        jl      .Lbody                  # repeat while counter < N (signed)
.Lexit:
        leave                           # same effect as mov rsp, rbp then pop rbp
        ret
        .size   latency, .-latency

50
src/scalar/mulss.S Normal file
View File

@@ -0,0 +1,50 @@
#define INSTR mulss
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix

# Kernel: six back-to-back scalar SP multiplies (INSTR, legacy SSE encoding)
# per iteration on xmm0, alternating the factors 2.0 and 0.5 so the value
# returns to its starting point after every pair.
# Entry point latency takes the iteration count in edi (macro N) and keeps
# its loop counter in r8d (macro i). No return value is produced.
# ninst exports NINST, which matches the six INSTR lines in the loop.

        .data
        .globl  ninst
ninst:
        .long   NINST                   # exported copy of the NINST macro

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # counter starts at zero
        test    N, N
        jle     .Lexit                  # skip everything unless N > 0 (signed)
        # build SP 1.0 (0x3F800000) in each dword of xmm0
        vpcmpeqw xmm0, xmm0, xmm0       # every bit set
        vpslld  xmm0, xmm0, 25          # keep only the top 7 bits of each dword
        vpsrld  xmm0, xmm0, 2           # shift them to bits 29..23: sign 0, exponent 0x7F
        # SP 2.0 = 1.0 + 1.0
        vaddps  xmm1, xmm0, xmm0
        # SP 0.5 = 1.0 / 2.0
        vdivps  xmm2, xmm0, xmm1
        # Mark registers as scalar (self-moves with the scalar SP move)
        movss   xmm0, xmm0
        movss   xmm1, xmm1
        movss   xmm2, xmm2
.Lbody:
        inc     i
        INSTR   xmm0, xmm1              # times 2.0
        INSTR   xmm0, xmm2              # times 0.5
        INSTR   xmm0, xmm1
        cmp     i, N                    # flags stay valid: the multiplies below do not write them
        INSTR   xmm0, xmm2
        INSTR   xmm0, xmm1
        INSTR   xmm0, xmm2
        jl      .Lbody                  # repeat while counter < N (signed)
.Lexit:
        leave                           # same effect as mov rsp, rbp then pop rbp
        ret
        .size   latency, .-latency

View File

@@ -37,11 +37,15 @@ latency:
vdivss xmm1, xmm4, xmm2 # create 341.3333
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
# Mark registers as scalar
movss xmm0, xmm0
movss xmm1, xmm0
movss xmm2, xmm0
movss xmm3, xmm0
movss xmm4, xmm0
movss xmm5, xmm0
loop:
inc i
INSTR xmm10, xmm0

View File

@@ -1,4 +1,4 @@
#define INSTR vrcpps
#define INSTR rcpss
#define NINST 6
#define N edi
#define i r8d
@@ -37,6 +37,15 @@ latency:
vdivps xmm1, xmm4, xmm2 # create 341.3333
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
# Mark registers as scalar
movss xmm0, xmm0
movss xmm1, xmm1
movss xmm2, xmm2
movss xmm3, xmm3
movss xmm4, xmm4
movss xmm5, xmm5
loop:
inc i
INSTR xmm1, xmm0

View File

@@ -37,11 +37,15 @@ latency:
vdivsd xmm1, xmm4, xmm2 # create 341.3333
vdivsd xmm2, xmm0, xmm1 # create 1/341.3333
vaddsd xmm0, xmm1, xmm1 # create 2*341.3333
# Mark registers as scalar
movsd xmm0, xmm0
movsd xmm1, xmm0
movsd xmm2, xmm0
movsd xmm3, xmm0
movsd xmm4, xmm0
movsd xmm5, xmm0
loop:
inc i
INSTR xmm10, xmm0

View File

@@ -38,6 +38,16 @@ latency:
vdivpd xmm2, xmm0, xmm1 # create 1/341.3333
vaddpd xmm0, xmm1, xmm1 # create 2*341.3333
vmovapd xmm10, xmm0 # save value
# Mark registers as scalar
movsd xmm0, xmm0
movsd xmm1, xmm1
movsd xmm2, xmm2
movsd xmm3, xmm3
movsd xmm4, xmm4
movsd xmm5, xmm5
movsd xmm10, xmm10
loop:
inc i
INSTR xmm1, xmm0

View File

@@ -37,11 +37,15 @@ latency:
vdivss xmm1, xmm4, xmm2 # create 341.3333
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
# Mark registers as scalar
movss xmm0, xmm0
movss xmm1, xmm0
movss xmm2, xmm0
movss xmm3, xmm0
movss xmm4, xmm0
movss xmm5, xmm0
loop:
inc i
INSTR xmm10, xmm0

View File

@@ -38,6 +38,16 @@ latency:
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
vmovaps xmm10, xmm0 # save value
# Mark registers as scalar
movss xmm0, xmm0
movss xmm1, xmm1
movss xmm2, xmm2
movss xmm3, xmm3
movss xmm4, xmm4
movss xmm5, xmm5
movss xmm10, xmm10
loop:
inc i
INSTR xmm1, xmm0

View File

@@ -25,6 +25,18 @@ latency:
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# copy SP 1.0
vmovaps xmm1, xmm0
# Mark registers as scalar
movsd xmm0, xmm0
movsd xmm1, xmm0
movsd xmm2, xmm0
movsd xmm3, xmm0
movsd xmm4, xmm0
movsd xmm5, xmm0
movsd xmm6, xmm0
movsd xmm7, xmm0
movsd xmm8, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1

View File

@@ -25,6 +25,11 @@ latency:
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# copy SP 1.0
vmovaps xmm1, xmm0
# Mark registers as scalar
movsd xmm0, xmm0
movsd xmm1, xmm1
loop:
inc i
INSTR xmm0, xmm0, xmm1

View File

@@ -25,6 +25,18 @@ latency:
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# copy SP 1.0
vmovaps xmm1, xmm0
# Mark registers as scalar
movss xmm0, xmm0
movss xmm1, xmm0
movss xmm2, xmm0
movss xmm3, xmm0
movss xmm4, xmm0
movss xmm5, xmm0
movss xmm6, xmm0
movss xmm7, xmm0
movss xmm8, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1

View File

@@ -25,6 +25,11 @@ latency:
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# copy SP 1.0
vmovaps xmm1, xmm0
# Mark registers as scalar
movss xmm0, xmm0
movss xmm1, xmm1
loop:
inc i
INSTR xmm0, xmm0, xmm1

View File

@@ -37,6 +37,18 @@ latency:
vdivsd xmm1, xmm4, xmm2 # create 341.3333
vdivsd xmm2, xmm0, xmm1 # create 1/341.3333
vaddsd xmm0, xmm1, xmm1 # create 2*341.3333
# Mark registers as scalar
movsd xmm0, xmm0
movsd xmm1, xmm0
movsd xmm2, xmm0
movsd xmm3, xmm0
movsd xmm4, xmm0
movsd xmm5, xmm0
movsd xmm6, xmm0
movsd xmm7, xmm0
movsd xmm8, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1

View File

@@ -37,6 +37,12 @@ latency:
vdivsd xmm1, xmm4, xmm2 # create 341.3333
vdivsd xmm2, xmm0, xmm1 # create 1/341.3333
vaddsd xmm0, xmm1, xmm1 # create 2*341.3333
# Mark registers as scalar
movsd xmm0, xmm0
movsd xmm1, xmm1
movsd xmm2, xmm2
loop:
inc i
INSTR xmm0, xmm0, xmm1

View File

@@ -37,6 +37,18 @@ latency:
vdivss xmm1, xmm4, xmm2 # create 341.3333
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
# Mark registers as scalar
movss xmm0, xmm0
movss xmm1, xmm0
movss xmm2, xmm0
movss xmm3, xmm0
movss xmm4, xmm0
movss xmm5, xmm0
movss xmm6, xmm0
movss xmm7, xmm0
movss xmm8, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1

View File

@@ -37,6 +37,12 @@ latency:
vdivss xmm1, xmm4, xmm2 # create 341.3333
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
# Mark registers as scalar
movss xmm0, xmm0
movss xmm1, xmm1
movss xmm2, xmm2
loop:
inc i
INSTR xmm0, xmm0, xmm1

View File

@@ -27,6 +27,18 @@ latency:
vaddps xmm1, xmm0, xmm0
# create SP 0.5
vdivps xmm2, xmm0, xmm1
# Mark registers as scalar
movsd xmm0, xmm0
movsd xmm1, xmm1
movsd xmm2, xmm2
movsd xmm3, xmm3
movsd xmm4, xmm4
movsd xmm5, xmm5
movsd xmm6, xmm6
movsd xmm7, xmm7
movsd xmm8, xmm8
loop:
inc i
INSTR xmm3, xmm0, xmm1

View File

@@ -27,6 +27,12 @@ latency:
vaddpd xmm1, xmm0, xmm0
# create SP 0.5
vdivpd xmm2, xmm0, xmm1
# Mark registers as scalar
movsd xmm0, xmm0
movsd xmm1, xmm1
movsd xmm2, xmm2
loop:
inc i
INSTR xmm0, xmm0, xmm1

View File

@@ -27,6 +27,18 @@ latency:
vaddps xmm1, xmm0, xmm0
# create SP 0.5
vdivps xmm2, xmm0, xmm1
# Mark registers as scalar
movsd xmm0, xmm0
movsd xmm1, xmm1
movsd xmm2, xmm2
movsd xmm3, xmm3
movsd xmm4, xmm4
movsd xmm5, xmm5
movsd xmm6, xmm6
movsd xmm7, xmm7
movsd xmm8, xmm8
loop:
inc i
INSTR xmm3, xmm0, xmm1

View File

@@ -27,6 +27,12 @@ latency:
vaddps xmm1, xmm0, xmm0
# create SP 0.5
vdivps xmm2, xmm0, xmm1
# Mark registers as scalar
movss xmm0, xmm0
movss xmm1, xmm1
movss xmm2, xmm2
loop:
inc i
INSTR xmm0, xmm0, xmm1