mirror of
https://github.com/RRZE-HPC/OSACA.git
synced 2025-09-04 17:00:26 +02:00
143 lines
2.8 KiB
ArmAsm
143 lines
2.8 KiB
ArmAsm
/*
 * Benchmark configuration, expanded by the C preprocessor before assembly.
 */
#define INSTR vmovq     /* instruction placed in the unrolled loop body */
#define NINST 64        /* value stored at the exported ninst symbol */
#define N edi           /* iteration count; edi is the first integer argument
                           under the System V AMD64 ABI (assumed ABI, confirm
                           against the calling harness) */
#define i r8d           /* loop counter */

.intel_syntax noprefix          # Intel operand order, bare register names

/*
 * ninst: one 32-bit word holding NINST, exported with .globl so that code
 * outside this file can read it.
 */
.globl ninst
.data
ninst:
        .long NINST

/*
 * PI: 16 32-bit words (64 bytes), aligned to 32 bytes. Each pair of words is
 * the low half followed by the high half of the 64-bit pattern
 * 0x400921f9f01b866e (the IEEE-754 double nearest to 3.14159), so the table
 * holds that value eight times. Nothing in the code visible in this file
 * references PI.
 */
.align 32
PI:
        .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9

/*
 * latency -- execute a block of 64 INSTR instructions N times.
 *
 * In:       N (edi)  signed 32-bit iteration count. When N <= 0 the function
 *                    returns right after the frame setup.
 * Out:      none. rax is restored to its entry value before returning.
 * Saved:    rax, rbx, rcx, rdx, r9-r15 are pushed before the loop and popped
 *           after it; rbp is saved by the frame prologue.
 * Clobbers: r8 (loop counter i), flags, and for N > 0 also xmm0, xmm3-xmm15.
 *
 * Every INSTR in the loop writes one of xmm3..xmm15 from rax, rbx or rcx and
 * none of them reads an xmm register.
 * NOTE(review): despite the symbol name, the instructions therefore do not
 * form a dependency chain -- confirm the intended measurement with the harness.
 */
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i
        test    N, N
        jle     done                    # count <= 0: skip the benchmark

        /* Build 0x3FF0000000000000 (bit pattern of DP 1.0) in each qword of xmm0. */
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq  xmm0, xmm0, 54          # keep the top 10 bits set (54 = 64 - 10)
        vpsrlq  xmm0, xmm0, 2           # clear bit 63 (sign) and bit 62 (top exponent bit)

        /* Preserve the general-purpose registers that are overwritten below. */
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15

        /* Zero them. rdx must be 0 because div below divides rdx:rax. */
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15

        /* rax = rbx = bit pattern of DP 1.0 */
        vmovq   rax, xmm0
        vmovq   rbx, xmm0

        /*
         * rbx = rbx + rax as integers = 0x7FE0000000000000.
         * This is a source payload only; it is not the encoding of DP 2.0.
         */
        add     rbx, rax

        /*
         * Unsigned divide of rdx:rax (rdx = 0) by rax: quotient 1 in rax,
         * remainder 0 in rdx. The result is the integer 1, not DP 0.5.
         */
        div     rax
        movq    rcx, rax                # rcx = 1
        vmovq   rax, xmm0               # reload the DP 1.0 bit pattern into rax

        /*
         * Measured loop: 64 INSTR per iteration. Destinations rotate through
         * xmm3..xmm15 and sources through rax, rbx, rcx.
         */
loop:
        inc     i
        INSTR   xmm3, rax
        INSTR   xmm4, rbx
        INSTR   xmm5, rcx
        INSTR   xmm6, rax
        INSTR   xmm7, rbx
        INSTR   xmm8, rcx
        INSTR   xmm9, rax
        INSTR   xmm10, rbx
        INSTR   xmm11, rcx
        INSTR   xmm12, rax
        INSTR   xmm13, rbx
        INSTR   xmm14, rcx
        INSTR   xmm15, rax
        INSTR   xmm3, rbx
        INSTR   xmm4, rcx
        INSTR   xmm5, rax
        INSTR   xmm6, rbx
        INSTR   xmm7, rcx
        INSTR   xmm8, rax
        INSTR   xmm9, rbx
        INSTR   xmm10, rcx
        INSTR   xmm11, rax
        INSTR   xmm12, rbx
        INSTR   xmm13, rcx
        INSTR   xmm14, rax
        INSTR   xmm15, rbx
        INSTR   xmm3, rcx
        INSTR   xmm4, rax
        INSTR   xmm5, rbx
        INSTR   xmm6, rcx
        INSTR   xmm7, rax
        INSTR   xmm8, rbx
        INSTR   xmm9, rcx
        INSTR   xmm10, rax
        INSTR   xmm11, rbx
        INSTR   xmm12, rcx
        INSTR   xmm13, rax
        INSTR   xmm14, rbx
        INSTR   xmm15, rcx
        INSTR   xmm3, rax
        INSTR   xmm4, rbx
        INSTR   xmm5, rcx
        INSTR   xmm6, rax
        INSTR   xmm7, rbx
        INSTR   xmm8, rcx
        INSTR   xmm9, rax
        INSTR   xmm10, rbx
        INSTR   xmm11, rcx
        INSTR   xmm12, rax
        INSTR   xmm13, rbx
        INSTR   xmm14, rcx
        INSTR   xmm15, rax
        INSTR   xmm3, rbx
        INSTR   xmm4, rcx
        INSTR   xmm5, rax
        INSTR   xmm6, rbx
        INSTR   xmm7, rcx
        INSTR   xmm8, rax
        INSTR   xmm9, rbx
        INSTR   xmm10, rcx
        INSTR   xmm11, rax
        INSTR   xmm12, rbx
        INSTR   xmm13, rcx
        INSTR   xmm14, rax
        cmp     i, N
        jl      loop                    # signed compare: repeat while counter < count

        /* Restore the saved registers in reverse push order. */
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency