more instrs

This commit is contained in:
JanLJL
2020-06-11 15:21:07 +02:00
parent 863b0b4c41
commit b808cdc09f
3 changed files with 278 additions and 0 deletions

108
src/NEON/str-q_mb-TP.S Normal file
View File

@@ -0,0 +1,108 @@
#define INSTR str
#define NINST 64
#define N x0

// Size in bytes of the private stack slot that the benchmarked stores hit
// (one 128-bit q register).
.equ SCRATCH_BYTES, 16

        .globl  ninst
        .data
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
// ---------------------------------------------------------------------------
// latency -- AArch64 (GNU as + cpp) store kernel.
//
// Each loop iteration issues NINST "INSTR qX, [sp]" instructions, i.e. 128-bit
// STR with a plain base register, all to the same 16-byte stack slot.
//
// In:    x0 (N) = number of loop iterations; 0 returns without entering the loop
// Out:   nothing
// Clobb: x4, v0-v2, condition flags
// Stack: 128 bytes for v8-v15 plus SCRATCH_BYTES for the store target;
//        sp stays 16-byte aligned throughout.
//
// The store target is a dedicated slot below the register save area. Storing
// to the save area itself would overwrite the saved copy of v12, and the
// epilogue would then hand a wrong v12 back to the caller.
// ---------------------------------------------------------------------------
latency:
        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // reserve the slot that the benchmarked stores write to
        sub     sp, sp, #SCRATCH_BYTES

        mov     x4, N                           // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        cbz     x4, done                        // N == 0: subs/bne would wrap around
loop:
        subs    x4, x4, #1
        // 8 repetitions x 8 stores = NINST stores; keep in sync with NINST.
        .rept   8
        INSTR   q2, [sp]
        INSTR   q6, [sp]
        INSTR   q8, [sp]
        INSTR   q10, [sp]
        INSTR   q12, [sp]
        INSTR   q14, [sp]
        INSTR   q16, [sp]
        INSTR   q18, [sp]
        .endr
        bne     loop                            // flags still from subs: STR leaves them alone
done:
        // release the store slot
        add     sp, sp, #SCRATCH_BYTES
        // pop callee-save registers from stack
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,81 @@
#define INSTR str
#define NINST 6
#define N x0

// Bytes of private stack that the six benchmarked stores are spread over
// (six slots, 64 bytes apart).
.equ SCRATCH_BYTES, 384

        .globl  ninst
        .data
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
// ---------------------------------------------------------------------------
// latency -- AArch64 (GNU as + cpp) store kernel, register-offset addressing.
//
// Each loop iteration issues NINST "INSTR qX, [sp, xOff]" stores to six slots
// spaced 64 bytes apart. Every store is followed by one MUL and one ADD, each
// on its own register.
//
// In:    x0 (N) = number of loop iterations; 0 returns without entering the loop
// Out:   nothing (x0 is overwritten with a working value)
// Clobb: x0-x18, v0-v7, condition flags
// Stack: 128 bytes for v8-v15 plus SCRATCH_BYTES for the store targets;
//        sp stays 16-byte aligned throughout.
//
// The store slots are reserved by lowering sp first, so all offsets are
// non-negative and no store touches memory below the stack pointer.
// The slots sit at the same distance below the register save area as the
// former negative offsets -64 ... -384 did.
//
// NOTE(review): x18 is the platform register under AAPCS64; confirm that every
// target OS for this benchmark permits using it as a scratch register.
// ---------------------------------------------------------------------------
latency:
        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // reserve the region that the benchmarked stores write to
        sub     sp, sp, #SCRATCH_BYTES

        mov     x4, N                           // x4 = remaining iterations (x0 is reused below)

        // data registers for the stores
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        fmov    v3.2d, #1.00000000
        fmov    v4.2d, #1.00000000
        fmov    v5.2d, #1.00000000
        fmov    v6.2d, #1.00000000
        fmov    v7.2d, #1.00000000

        // byte offsets of the six store slots inside the reserved region
        mov     x9, #320
        mov     x10, #256
        mov     x11, #192
        mov     x12, #128
        mov     x13, #64
        mov     x14, #0

        // operands for the integer instructions between the stores
        mov     x0, #1
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
        mov     x5, #1
        mov     x6, #1
        mov     x7, #1
        mov     x8, #1
        mov     x15, #1
        mov     x16, #1
        mov     x17, #1
        mov     x18, #1
        cbz     x4, done                        // N == 0: subs/bne would wrap around
loop:
        subs    x4, x4, #1
        INSTR   q0, [sp, x9]
        mul     x0, x0, x0
        add     x1, x1, x1
        INSTR   q1, [sp, x10]
        mul     x2, x2, x2
        add     x3, x3, x3
        INSTR   q2, [sp, x11]
        mul     x5, x5, x5
        add     x6, x6, x6
        INSTR   q3, [sp, x12]
        mul     x7, x7, x7
        add     x8, x8, x8
        INSTR   q4, [sp, x13]
        mul     x15, x15, x15
        add     x16, x16, x16
        INSTR   q5, [sp, x14]
        mul     x17, x17, x17
        add     x18, x18, x18
        bne     loop                            // flags still from subs: no flag-setting op in between
done:
        // release the store region
        add     sp, sp, #SCRATCH_BYTES
        // pop callee-save registers from stack
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,89 @@
#define INSTR str
#define NINST 8
#define N x0

// Private stack region for the benchmarked stores. The four store pointers
// keep the spacing of a layout based at offset 320 into this region:
//   x28 = base - 320, x27 = base - 256, x16 = base + 128, x25 = base + 192
// Each pointer is used at its start address and 64 bytes above it, and each
// store is 16 bytes wide, so the touched span is 320 + 192 + 64 + 16 = 592.
.equ SCRATCH_BYTES, 592
.equ SAVE_BYTES, 32

        .globl  ninst
        .data
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
// ---------------------------------------------------------------------------
// latency -- AArch64 (GNU as + cpp) store kernel, post-index addressing.
//
// Each loop iteration issues NINST "INSTR qX, [xPtr], #imm" stores: four with
// a +64 write-back followed by four with a -64 write-back, so every pointer is
// back at its start value when the iteration ends. Every store is followed by
// one MUL and one ADD, each on its own register.
//
// In:    x0 (N) = number of loop iterations; 0 returns without entering the loop
// Out:   nothing
// Clobb: x1-x18 (x0 is left untouched), v0-v2, condition flags
// Saved: x25, x27, x28 (callee-saved under AAPCS64, used as store pointers)
// Stack: SAVE_BYTES + SCRATCH_BYTES; sp stays 16-byte aligned throughout.
//
// v8-v15 need no save/restore here: STR only reads its data register, and no
// other instruction in this routine writes v8-v15.
//
// All store pointers point into the region reserved below, so no store lands
// in the frame of the caller or below the stack pointer.
//
// NOTE(review): x18 is the platform register under AAPCS64; confirm that every
// target OS for this benchmark permits using it as a scratch register.
// ---------------------------------------------------------------------------
latency:
        // save the callee-saved registers that serve as store pointers
        stp     x25, x27, [sp, #-SAVE_BYTES]!
        str     x28, [sp, #16]
        // reserve the region that the benchmarked stores write to
        sub     sp, sp, #SCRATCH_BYTES

        mov     x4, N                           // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

        // store pointers (see layout note at SCRATCH_BYTES)
        mov     x28, sp                         // base - 320
        add     x27, sp, #64                    // base - 256
        add     x16, sp, #448                   // base + 128
        add     x25, sp, #512                   // base + 192

        // operands for the integer instructions between the stores
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
        mov     x5, #1
        mov     x6, #1
        mov     x7, #1
        mov     x8, #1
        mov     x9, #1
        mov     x10, #1
        mov     x11, #1
        mov     x12, #1
        mov     x13, #1
        mov     x14, #1
        mov     x15, #1
        mov     x17, #1
        mov     x18, #1
        cbz     x4, done                        // N == 0: subs/bne would wrap around
loop:
        subs    x4, x4, #1
        // first half: store, then advance each pointer by 64
        INSTR   q1, [x25], #64
        mul     x1, x1, x1
        add     x2, x2, x2
        INSTR   q2, [x27], #64
        mul     x3, x3, x3
        add     x5, x5, x5
        INSTR   q3, [x28], #64
        mul     x6, x6, x6
        add     x7, x7, x7
        INSTR   q4, [x16], #64
        mul     x8, x8, x8
        add     x9, x9, x9
        // second half: store, then rewind each pointer by 64
        INSTR   q6, [x25], #-64
        mul     x10, x10, x10
        add     x11, x11, x11
        INSTR   q7, [x27], #-64
        mul     x12, x12, x12                   // own chain, like every other mul here
        add     x13, x13, x13
        INSTR   q8, [x28], #-64
        mul     x14, x14, x14
        add     x15, x15, x15
        INSTR   q9, [x16], #-64
        mul     x17, x17, x17
        add     x18, x18, x18
        bne     loop                            // flags still from subs: no flag-setting op in between
done:
        // release the store region and restore the saved pointer registers
        add     sp, sp, #SCRATCH_BYTES
        ldr     x28, [sp, #16]
        ldp     x25, x27, [sp], #SAVE_BYTES
        ret
        .size   latency, .-latency