mirror of
https://github.com/RRZE-HPC/ibench.git
synced 2025-07-21 04:41:09 +02:00
benchmarks for A64FX
This commit is contained in:
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -80,6 +87,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -36,6 +43,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -80,6 +87,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -80,6 +87,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -227,6 +234,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -227,6 +234,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -37,6 +44,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -57,6 +64,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -57,6 +64,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
203
src/NEON/fadd-vd_vd_vd-il_1_2-add-x_x_x-TP.S
Normal file
203
src/NEON/fadd-vd_vd_vd-il_1_2-add-x_x_x-TP.S
Normal file
@@ -0,0 +1,203 @@
|
||||
// Benchmark kernel for AArch64 / NEON (GNU as, preprocessed by cpp).
//
// Each loop iteration issues NINST vector INSTR operations on .2d lanes
// (two double-precision elements), every one followed by two scalar
// integer adds.  All sources (v0-v2, x1-x3) are loop-invariant and all
// destinations (v3-v14, x5-x16) are never read back, so the instructions
// inside the loop do not depend on each other.
//
// latency(N)
//   In:     x0 (N) = number of loop iterations.  Must be non-zero: the
//           counter is decremented before it is tested, so N == 0 wraps.
//   Out:    x0 is not written by this routine.
//   Clobb:  x1-x16, v0-v7, condition flags.
//   Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30 and x19-x28).
//
// ninst is an exported 32-bit word holding NINST.
// NOTE(review): presumably read by the calling harness to normalise the
// measured time per instruction - confirm against the caller.

#define INSTR fadd
#define NINST 48
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

        // push callee-save registers onto stack
        // v8-v15 are stored with all 128 bits; the loop overwrites v8-v14.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // 96-byte frame: x29/x30 at offset 0, then x19-x28 in ascending
        // pairs.  x23 is included so that the whole x19-x28 range is saved.
        stp     x29, x30, [sp, -96]!
        stp     x19, x20, [sp, 16]
        stp     x21, x22, [sp, 32]
        stp     x23, x24, [sp, 48]
        stp     x25, x26, [sp, 64]
        stp     x27, x28, [sp, 80]

        mov     x4, N                   // x4 = remaining iterations

        // loop-invariant source operands
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
loop:
        // The flags set here are consumed by the bne at the bottom; none of
        // the instructions in between is a flag-setting form.
        subs    x4, x4, #1

        // 4 repetitions x 12 INSTR = NINST (48) INSTR per iteration.
        // .rept expands to the same fully unrolled instruction sequence.
        .rept   4
        INSTR   v3.2d, v0.2d, v0.2d
        add     x5, x1, x1
        add     x6, x2, x2
        INSTR   v4.2d, v0.2d, v1.2d
        add     x7, x3, x3
        add     x8, x1, x1
        INSTR   v5.2d, v0.2d, v2.2d
        add     x9, x2, x2
        add     x10, x3, x3
        INSTR   v6.2d, v1.2d, v1.2d
        add     x11, x1, x1
        add     x12, x2, x2
        INSTR   v7.2d, v1.2d, v2.2d
        add     x13, x3, x3
        add     x14, x1, x1
        INSTR   v8.2d, v2.2d, v2.2d
        add     x15, x2, x2
        add     x16, x3, x3
        INSTR   v9.2d, v0.2d, v2.2d
        add     x5, x2, x2
        add     x6, x3, x3
        INSTR   v10.2d, v1.2d, v1.2d
        add     x7, x1, x1
        add     x8, x2, x2
        INSTR   v11.2d, v1.2d, v2.2d
        add     x9, x3, x3
        add     x10, x1, x1
        INSTR   v12.2d, v2.2d, v2.2d
        add     x11, x2, x2
        add     x12, x3, x3
        INSTR   v13.2d, v1.2d, v2.2d
        add     x13, x3, x3
        add     x14, x1, x1
        INSTR   v14.2d, v2.2d, v2.2d
        add     x15, x2, x2
        add     x16, x3, x3
        .endr

        bne     loop
done:

        // pop callee-save registers from stack (mirror of the prologue)
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x23, x24, [sp, 48]
        ldp     x25, x26, [sp, 64]
        ldp     x27, x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

        .size   latency, .-latency
|
73
src/NEON/fadd-vd_vd_vd-il_1_2-sub-x_x_i-TP.S
Normal file
73
src/NEON/fadd-vd_vd_vd-il_1_2-sub-x_x_i-TP.S
Normal file
@@ -0,0 +1,73 @@
|
||||
// Benchmark kernel for AArch64 / NEON (GNU as, preprocessed by cpp).
//
// Each loop iteration issues NINST vector INSTR operations on .2d lanes
// (two double-precision elements), every one followed by two scalar
// integer subtract-immediate instructions.  All sources (v0-v2, x1-x3)
// are loop-invariant and all destinations (v3-v8, x5-x16) are never read
// back, so the instructions inside the loop do not depend on each other.
//
// latency(N)
//   In:     x0 (N) = number of loop iterations.  Must be non-zero: the
//           counter is decremented before it is tested, so N == 0 wraps.
//   Out:    x0 is not written by this routine.
//   Clobb:  x1-x16, v0-v7, condition flags.
//   Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30 and x19-x28).
//
// ninst is an exported 32-bit word holding NINST.
// NOTE(review): presumably read by the calling harness to normalise the
// measured time per instruction - confirm against the caller.

#define INSTR fadd
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

        // push callee-save registers onto stack
        // v8-v15 are stored with all 128 bits; the loop overwrites v8.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // 96-byte frame: x29/x30 at offset 0, then x19-x28 in ascending
        // pairs.  x23 is included so that the whole x19-x28 range is saved.
        stp     x29, x30, [sp, -96]!
        stp     x19, x20, [sp, 16]
        stp     x21, x22, [sp, 32]
        stp     x23, x24, [sp, 48]
        stp     x25, x26, [sp, 64]
        stp     x27, x28, [sp, 80]

        mov     x4, N                   // x4 = remaining iterations

        // loop-invariant source operands
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
loop:
        // The flags set here are consumed by the bne at the bottom; the
        // plain sub (not subs) and INSTR lines in between leave them alone.
        subs    x4, x4, #1

        INSTR   v3.2d, v0.2d, v0.2d
        sub     x5, x1, #1
        sub     x6, x2, #1
        INSTR   v4.2d, v0.2d, v1.2d
        sub     x7, x3, #1
        sub     x8, x1, #1
        INSTR   v5.2d, v0.2d, v2.2d
        sub     x9, x2, #1
        sub     x10, x3, #1
        INSTR   v6.2d, v1.2d, v1.2d
        sub     x11, x1, #1
        sub     x12, x2, #1
        INSTR   v7.2d, v1.2d, v2.2d
        sub     x13, x3, #1
        sub     x14, x1, #1
        INSTR   v8.2d, v2.2d, v2.2d
        sub     x15, x2, #1
        sub     x16, x3, #1

        bne     loop
done:

        // pop callee-save registers from stack (mirror of the prologue)
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x23, x24, [sp, 48]
        ldp     x25, x26, [sp, 64]
        ldp     x27, x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

        .size   latency, .-latency
|
79
src/NEON/fadd-vd_vd_vd-il_1_3-add-x_x_x-TP.S
Normal file
79
src/NEON/fadd-vd_vd_vd-il_1_3-add-x_x_x-TP.S
Normal file
@@ -0,0 +1,79 @@
|
||||
// Benchmark kernel for AArch64 / NEON (GNU as, preprocessed by cpp).
//
// Each loop iteration issues NINST vector INSTR operations on .2d lanes
// (two double-precision elements), every one followed by three scalar
// integer adds.  All sources (v0-v2, x1-x3) are loop-invariant and all
// destinations (v3-v8, x5-x16) are never read back, so the instructions
// inside the loop do not depend on each other.
//
// latency(N)
//   In:     x0 (N) = number of loop iterations.  Must be non-zero: the
//           counter is decremented before it is tested, so N == 0 wraps.
//   Out:    x0 is not written by this routine.
//   Clobb:  x1-x16, v0-v7, condition flags.
//   Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30 and x19-x28).
//
// ninst is an exported 32-bit word holding NINST.
// NOTE(review): presumably read by the calling harness to normalise the
// measured time per instruction - confirm against the caller.

#define INSTR fadd
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

        // push callee-save registers onto stack
        // v8-v15 are stored with all 128 bits; the loop overwrites v8.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // 96-byte frame: x29/x30 at offset 0, then x19-x28 in ascending
        // pairs.  x23 is included so that the whole x19-x28 range is saved.
        stp     x29, x30, [sp, -96]!
        stp     x19, x20, [sp, 16]
        stp     x21, x22, [sp, 32]
        stp     x23, x24, [sp, 48]
        stp     x25, x26, [sp, 64]
        stp     x27, x28, [sp, 80]

        mov     x4, N                   // x4 = remaining iterations

        // loop-invariant source operands
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
loop:
        // The flags set here are consumed by the bne at the bottom; the
        // plain add (not adds) and INSTR lines in between leave them alone.
        subs    x4, x4, #1

        INSTR   v3.2d, v0.2d, v0.2d
        add     x5, x1, x1
        add     x6, x2, x2
        add     x7, x3, x3
        INSTR   v4.2d, v0.2d, v1.2d
        add     x8, x1, x1
        add     x9, x2, x2
        add     x10, x3, x3
        INSTR   v5.2d, v0.2d, v2.2d
        add     x11, x1, x1
        add     x12, x2, x2
        add     x13, x3, x3
        INSTR   v6.2d, v1.2d, v1.2d
        add     x14, x1, x1
        add     x15, x2, x2
        add     x16, x3, x3
        INSTR   v7.2d, v1.2d, v2.2d
        add     x8, x1, x1
        add     x9, x2, x2
        add     x10, x3, x3
        INSTR   v8.2d, v2.2d, v2.2d
        add     x11, x1, x1
        add     x12, x2, x2
        add     x13, x3, x3

        bne     loop
done:

        // pop callee-save registers from stack (mirror of the prologue)
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x23, x24, [sp, 48]
        ldp     x25, x26, [sp, 64]
        ldp     x27, x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

        .size   latency, .-latency
|
73
src/NEON/fadd-vd_vd_vd-il_2_1-mul-x_x_x-TP.S
Normal file
73
src/NEON/fadd-vd_vd_vd-il_2_1-mul-x_x_x-TP.S
Normal file
@@ -0,0 +1,73 @@
|
||||
// Benchmark kernel for AArch64 / NEON (GNU as, preprocessed by cpp).
//
// Each loop iteration issues six groups of two vector INSTR operations on
// .2d lanes (two double-precision elements) followed by one scalar integer
// mul.  All sources (v0-v2, x1-x3) are loop-invariant and all destinations
// (v3-v14, x5-x10) are never read back, so the instructions inside the
// loop do not depend on each other.
//
// latency(N)
//   In:     x0 (N) = number of loop iterations.  Must be non-zero: the
//           counter is decremented before it is tested, so N == 0 wraps.
//   Out:    x0 is not written by this routine.
//   Clobb:  x1-x10, v0-v7, condition flags.
//   Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30 and x19-x28).
//
// ninst is an exported 32-bit word holding NINST.
// NOTE(review): presumably read by the calling harness to normalise the
// measured time per instruction - confirm against the caller.
// NOTE(review): the loop body contains 12 INSTR (and 6 mul) while NINST
// is 6.  Confirm whether NINST is meant to count INSTR instances (then it
// should be 12) or interleave groups (then 6 is right).

#define INSTR fadd
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

        // push callee-save registers onto stack
        // v8-v15 are stored with all 128 bits; the loop overwrites v8-v14.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // 96-byte frame: x29/x30 at offset 0, then x19-x28 in ascending
        // pairs.  x23 is included so that the whole x19-x28 range is saved.
        stp     x29, x30, [sp, -96]!
        stp     x19, x20, [sp, 16]
        stp     x21, x22, [sp, 32]
        stp     x23, x24, [sp, 48]
        stp     x25, x26, [sp, 64]
        stp     x27, x28, [sp, 80]

        mov     x4, N                   // x4 = remaining iterations

        // loop-invariant source operands
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
loop:
        // The flags set here are consumed by the bne at the bottom; mul and
        // INSTR in between do not modify them.
        subs    x4, x4, #1

        INSTR   v3.2d, v0.2d, v0.2d
        INSTR   v4.2d, v0.2d, v1.2d
        mul     x5, x1, x1
        INSTR   v5.2d, v0.2d, v2.2d
        INSTR   v6.2d, v1.2d, v1.2d
        mul     x6, x2, x2
        INSTR   v7.2d, v1.2d, v2.2d
        INSTR   v8.2d, v2.2d, v2.2d
        mul     x7, x3, x3
        INSTR   v9.2d, v2.2d, v2.2d
        INSTR   v10.2d, v2.2d, v2.2d
        mul     x8, x1, x1
        INSTR   v11.2d, v2.2d, v2.2d
        INSTR   v12.2d, v2.2d, v2.2d
        mul     x9, x2, x2
        INSTR   v13.2d, v2.2d, v2.2d
        INSTR   v14.2d, v2.2d, v2.2d
        mul     x10, x3, x3

        bne     loop
done:

        // pop callee-save registers from stack (mirror of the prologue)
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x23, x24, [sp, 48]
        ldp     x25, x26, [sp, 64]
        ldp     x27, x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

        .size   latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
73
src/NEON/fadd-vs_vs_vs-il_1_2-add-x_x_x-TP.S
Normal file
73
src/NEON/fadd-vs_vs_vs-il_1_2-add-x_x_x-TP.S
Normal file
@@ -0,0 +1,73 @@
|
||||
// Benchmark kernel for AArch64 / NEON (GNU as, preprocessed by cpp).
//
// Each loop iteration issues NINST vector INSTR operations on .4s lanes
// (four single-precision elements), every one followed by two scalar
// integer adds.  All sources (v0-v2, x1-x3) are loop-invariant and all
// destinations (v3-v8, x5-x16) are never read back, so the instructions
// inside the loop do not depend on each other.
//
// latency(N)
//   In:     x0 (N) = number of loop iterations.  Must be non-zero: the
//           counter is decremented before it is tested, so N == 0 wraps.
//   Out:    x0 is not written by this routine.
//   Clobb:  x1-x16, v0-v7, condition flags.
//   Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30 and x19-x28).
//
// ninst is an exported 32-bit word holding NINST.
// NOTE(review): presumably read by the calling harness to normalise the
// measured time per instruction - confirm against the caller.

#define INSTR fadd
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

        // push callee-save registers onto stack
        // v8-v15 are stored with all 128 bits (the .2d arrangement only
        // names the lane layout of the store); the loop overwrites v8.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // 96-byte frame: x29/x30 at offset 0, then x19-x28 in ascending
        // pairs.  x23 is included so that the whole x19-x28 range is saved.
        stp     x29, x30, [sp, -96]!
        stp     x19, x20, [sp, 16]
        stp     x21, x22, [sp, 32]
        stp     x23, x24, [sp, 48]
        stp     x25, x26, [sp, 64]
        stp     x27, x28, [sp, 80]

        mov     x4, N                   // x4 = remaining iterations

        // loop-invariant source operands
        fmov    v0.4s, #1.00000000
        fmov    v1.4s, #1.00000000
        fmov    v2.4s, #1.00000000
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
loop:
        // The flags set here are consumed by the bne at the bottom; the
        // plain add (not adds) and INSTR lines in between leave them alone.
        subs    x4, x4, #1

        INSTR   v3.4s, v0.4s, v0.4s
        add     x5, x1, x1
        add     x6, x2, x2
        INSTR   v4.4s, v0.4s, v1.4s
        add     x7, x3, x3
        add     x8, x1, x1
        INSTR   v5.4s, v0.4s, v2.4s
        add     x9, x2, x2
        add     x10, x3, x3
        INSTR   v6.4s, v1.4s, v1.4s
        add     x11, x1, x1
        add     x12, x2, x2
        INSTR   v7.4s, v1.4s, v2.4s
        add     x13, x3, x3
        add     x14, x1, x1
        INSTR   v8.4s, v2.4s, v2.4s
        add     x15, x2, x2
        add     x16, x3, x3

        bne     loop
done:

        // pop callee-save registers from stack (mirror of the prologue)
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x23, x24, [sp, 48]
        ldp     x25, x26, [sp, 64]
        ldp     x27, x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

        .size   latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,6 +68,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,6 +68,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -67,6 +74,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -71,6 +78,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,6 +68,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,6 +68,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -88,6 +95,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -116,6 +123,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -58,6 +65,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -58,6 +65,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -44,6 +51,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,6 +68,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -67,6 +74,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,6 +68,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -67,6 +74,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -68,6 +75,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -66,6 +73,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -68,6 +75,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -66,6 +73,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -41,6 +48,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
58
src/NEON/ldp-d_d_mbo-TP.Se
Normal file
58
src/NEON/ldp-d_d_mbo-TP.Se
Normal file
@@ -0,0 +1,58 @@
|
||||
# Microbenchmark kernel for ldp with two D-register destinations and a
# base-plus-offset address (the -TP suffix of the file name suggests this is
# the throughput variant; confirm against the harness).
#
# Input:  x0 holds the loop trip count. It has to be non-zero: the counter is
#         decremented before the branch tests it, so a count of 0 wraps.
# Loop:   NINST loads per trip, every one reading [sp, #-256] into a distinct
#         register pair. The loaded values are never consumed.
# Stack:  128 bytes for v8-v15 plus 96 bytes for x19-x22 and x24-x30; both
#         areas are released again before ret.
#define INSTR ldp
#define NINST 12
#define N x0

        .globl  ninst
        .data
# Count of INSTR occurrences in one loop trip, exported as a 32-bit word.
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

# push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, -96]!
        stp     x19, x20, [sp, 16]
        stp     x21, x22, [sp, 32]
        stp     x24, x25, [sp, 48]
        stp     x26, x27, [sp, 64]
        str     x28, [sp, 80]


        mov     x4, N

# Seed v0-v2 with 1.0; nothing inside the loop reads them.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
loop:
        subs    x4, x4, #1
        INSTR   d24, d1, [sp, #-256]
        INSTR   d2, d3, [sp, #-256]
        INSTR   d25, d5, [sp, #-256]
        INSTR   d6, d7, [sp, #-256]
        INSTR   d8, d9, [sp, #-256]
        INSTR   d10, d11, [sp, #-256]
        INSTR   d12, d13, [sp, #-256]
        INSTR   d14, d15, [sp, #-256]
        INSTR   d16, d17, [sp, #-256]
        INSTR   d18, d19, [sp, #-256]
        INSTR   d20, d21, [sp, #-256]
        INSTR   d22, d23, [sp, #-256]
        bne     loop
done:

# pop callee-save registers from stack
# The general-purpose block was pushed last, so it is popped first. The
# post-indexed ldp releases its 96 bytes, which leaves sp pointing at the
# v12-v15 slot for the vector reloads that follow.
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x24, x25, [sp, 48]
        ldp     x26, x27, [sp, 64]
        ldr     x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

        .size   latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -53,6 +60,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
70
src/NEON/ldp-d_d_mbo-il_1_1-ldr-x_mb-TP.Se
Normal file
70
src/NEON/ldp-d_d_mbo-il_1_1-ldr-x_mb-TP.Se
Normal file
@@ -0,0 +1,70 @@
|
||||
# Microbenchmark kernel that interleaves ldp (two D-register destinations,
# base-plus-offset address) one-to-one with ldr of an X register (the -TP
# suffix of the file name suggests this is the throughput variant; confirm
# against the harness).
#
# Input:  x0 holds the loop trip count. It has to be non-zero: the counter is
#         decremented before the branch tests it, so a count of 0 wraps.
#         x0 is overwritten inside the loop, after the count was copied to x4.
# Loop:   NINST ldp from [sp, #-256], each followed by one ldr from [sp].
#         The ldr instructions are not included in NINST. The loaded values
#         are never consumed.
# Stack:  128 bytes for v8-v15 plus 96 bytes for x19-x22 and x24-x30; both
#         areas are released again before ret.
#define INSTR ldp
#define NINST 12
#define N x0

        .globl  ninst
        .data
# Count of INSTR occurrences in one loop trip, exported as a 32-bit word.
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

# push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, -96]!
        stp     x19, x20, [sp, 16]
        stp     x21, x22, [sp, 32]
        stp     x24, x25, [sp, 48]
        stp     x26, x27, [sp, 64]
        str     x28, [sp, 80]


        mov     x4, N

# Seed v0-v2 with 1.0; nothing inside the loop reads them.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
# The ldr destinations run x0-x3 and x5-x12; x4 is left out because it is the
# loop counter.
loop:
        subs    x4, x4, #1
        INSTR   d24, d1, [sp, #-256]
        ldr     x0, [sp]
        INSTR   d2, d3, [sp, #-256]
        ldr     x1, [sp]
        INSTR   d25, d5, [sp, #-256]
        ldr     x2, [sp]
        INSTR   d6, d7, [sp, #-256]
        ldr     x3, [sp]
        INSTR   d8, d9, [sp, #-256]
        ldr     x5, [sp]
        INSTR   d10, d11, [sp, #-256]
        ldr     x6, [sp]
        INSTR   d12, d13, [sp, #-256]
        ldr     x7, [sp]
        INSTR   d14, d15, [sp, #-256]
        ldr     x8, [sp]
        INSTR   d16, d17, [sp, #-256]
        ldr     x9, [sp]
        INSTR   d18, d19, [sp, #-256]
        ldr     x10, [sp]
        INSTR   d20, d21, [sp, #-256]
        ldr     x11, [sp]
        INSTR   d22, d23, [sp, #-256]
        ldr     x12, [sp]
        bne     loop
done:

# pop callee-save registers from stack
# The general-purpose block was pushed last, so it is popped first. The
# post-indexed ldp releases its 96 bytes, which leaves sp pointing at the
# v12-v15 slot for the vector reloads that follow.
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x24, x25, [sp, 48]
        ldp     x26, x27, [sp, 64]
        ldr     x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

        .size   latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -41,6 +48,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
58
src/NEON/ldp-q_q_mbo-TP.Se
Normal file
58
src/NEON/ldp-q_q_mbo-TP.Se
Normal file
@@ -0,0 +1,58 @@
|
||||
# Microbenchmark kernel for ldp with two Q-register destinations and a
# base-plus-offset address (the -TP suffix of the file name suggests this is
# the throughput variant; confirm against the harness).
#
# Input:  x0 holds the loop trip count. It has to be non-zero: the counter is
#         decremented before the branch tests it, so a count of 0 wraps.
# Loop:   NINST loads per trip, every one reading [sp, #-256] into a distinct
#         register pair. The loaded values are never consumed.
# Stack:  128 bytes for v8-v15 plus 96 bytes for x19-x22 and x24-x30; both
#         areas are released again before ret.
#define INSTR ldp
#define NINST 12
#define N x0

        .globl  ninst
        .data
# Count of INSTR occurrences in one loop trip, exported as a 32-bit word.
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

# push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, -96]!
        stp     x19, x20, [sp, 16]
        stp     x21, x22, [sp, 32]
        stp     x24, x25, [sp, 48]
        stp     x26, x27, [sp, 64]
        str     x28, [sp, 80]


        mov     x4, N

# Seed v0-v2 with 1.0; the loads in the loop overwrite all three.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
loop:
        subs    x4, x4, #1
        INSTR   q0, q1, [sp, #-256]
        INSTR   q2, q3, [sp, #-256]
        INSTR   q4, q5, [sp, #-256]
        INSTR   q6, q7, [sp, #-256]
        INSTR   q8, q9, [sp, #-256]
        INSTR   q10, q11, [sp, #-256]
        INSTR   q12, q13, [sp, #-256]
        INSTR   q14, q15, [sp, #-256]
        INSTR   q16, q17, [sp, #-256]
        INSTR   q18, q19, [sp, #-256]
        INSTR   q20, q21, [sp, #-256]
        INSTR   q22, q23, [sp, #-256]
        bne     loop
done:

# pop callee-save registers from stack
# The general-purpose block was pushed last, so it is popped first. The
# post-indexed ldp releases its 96 bytes, which leaves sp pointing at the
# v12-v15 slot for the vector reloads that follow.
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x24, x25, [sp, 48]
        ldp     x26, x27, [sp, 64]
        ldr     x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

        .size   latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -45,6 +52,12 @@ loop:
|
||||
done:
|
||||
mov sp, x24
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
62
src/NEON/ldp-q_q_mbp-LAT.Se
Normal file
62
src/NEON/ldp-q_q_mbp-LAT.Se
Normal file
@@ -0,0 +1,62 @@
|
||||
# Microbenchmark kernel for ldp with two Q-register destinations and a
# post-indexed address (the -LAT suffix of the file name suggests this is the
# latency variant; confirm against the harness).
#
# Input:  x0 holds the loop trip count. It has to be non-zero: the counter is
#         decremented before the branch tests it, so a count of 0 wraps.
# Loop:   NINST loads per trip, all using sp as the post-indexed base, so the
#         address of each load depends on the write-back of the one before.
#         Seven loads step sp up by 64 and seven step it back down, which
#         makes the net change per trip zero.
# Stack:  128 bytes for v8-v15 plus 96 bytes for x19-x22 and x24-x30; both
#         areas are released again before ret.
#define INSTR ldp
#define NINST 14
#define N x0

        .globl  ninst
        .data
# Count of INSTR occurrences in one loop trip, exported as a 32-bit word.
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

# push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, -96]!
        stp     x19, x20, [sp, 16]
        stp     x21, x22, [sp, 32]
        stp     x24, x25, [sp, 48]
        stp     x26, x27, [sp, 64]
        str     x28, [sp, 80]


        mov     x4, N

# Seed v0-v2 with 1.0; the loads in the loop overwrite all three.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
# Snapshot of sp taken after the pushes; it is written back at done.
        mov     x24, sp
# NOTE(review): while sp is stepped upwards, the register save area lies below
# sp, where an asynchronous event such as signal delivery could overwrite it.
# Confirm that this is acceptable for the harness.
loop:
        subs    x4, x4, #1
        INSTR   q0, q1, [sp], #64
        INSTR   q2, q3, [sp], #64
        INSTR   q4, q5, [sp], #64
        INSTR   q6, q7, [sp], #64
        INSTR   q8, q9, [sp], #64
        INSTR   q10, q11, [sp], #64
        INSTR   q12, q13, [sp], #64
        INSTR   q14, q15, [sp], #-64
        INSTR   q16, q17, [sp], #-64
        INSTR   q18, q19, [sp], #-64
        INSTR   q20, q21, [sp], #-64
        INSTR   q22, q23, [sp], #-64
        INSTR   q25, q26, [sp], #-64
        INSTR   q27, q28, [sp], #-64

        bne     loop
done:
        mov     sp, x24
# pop callee-save registers from stack
# The general-purpose block was pushed last, so it is popped first; this also
# gives x24, used above for the sp snapshot, its entry value back. The
# post-indexed ldp releases the 96 bytes, which leaves sp pointing at the
# v12-v15 slot for the vector reloads that follow.
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x24, x25, [sp, 48]
        ldp     x26, x27, [sp, 64]
        ldr     x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

        .size   latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -50,6 +57,12 @@ loop:
|
||||
done:
|
||||
mov sp, x24
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
67
src/NEON/ldp-q_q_mbp-TP.Se
Normal file
67
src/NEON/ldp-q_q_mbp-TP.Se
Normal file
@@ -0,0 +1,67 @@
|
||||
# Microbenchmark kernel for ldp with two Q-register destinations and a
# post-indexed address, spread over five base registers (the -TP suffix of the
# file name suggests this is the throughput variant; confirm against the
# harness).
#
# Input:  x0 holds the loop trip count. It has to be non-zero: the counter is
#         decremented before the branch tests it, so a count of 0 wraps.
# Loop:   NINST loads per trip. Each of the bases sp, x25, x27, x28 and x16
#         is stepped up by 64 once and back down by 64 once, so every base
#         has the same value at the start of each trip.
# Stack:  128 bytes for v8-v15 plus 96 bytes for x19-x22 and x24-x30; both
#         areas are released again before ret.
#define INSTR ldp
#define NINST 10
#define N x0

        .globl  ninst
        .data
# Count of INSTR occurrences in one loop trip, exported as a 32-bit word.
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

# push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, -96]!
        stp     x19, x20, [sp, 16]
        stp     x21, x22, [sp, 32]
        stp     x24, x25, [sp, 48]
        stp     x26, x27, [sp, 64]
        str     x28, [sp, 80]


        mov     x4, N

# Seed v0-v2 with 1.0; the loads in the loop overwrite all three.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
# Snapshot of sp taken after the pushes; it is written back at done.
        mov     x24, sp

# Extra base pointers at sp+32, sp+64, sp-32 and sp-64.
        mov     x16, sp
        add     x16, x16, #32
        mov     x25, sp
        add     x25, x25, #64
        mov     x27, sp
        sub     x27, x27, #32
        mov     x28, sp
        sub     x28, x28, #64
loop:
        subs    x4, x4, #1
        INSTR   q0, q1, [sp], #64
        INSTR   q2, q3, [x25], #64
        INSTR   q4, q5, [x27], #64
        INSTR   q6, q7, [x28], #64
        INSTR   q18, q19, [x16], #64
        INSTR   q8, q9, [sp], #-64
        INSTR   q10, q11, [x25], #-64
        INSTR   q12, q13, [x27], #-64
        INSTR   q14, q15, [x28], #-64
        INSTR   q20, q21, [x16], #-64

        bne     loop
done:
        mov     sp, x24
# pop callee-save registers from stack
# The general-purpose block was pushed last, so it is popped first; this also
# gives x24, x25, x27 and x28, used above as pointers, their entry values
# back. The post-indexed ldp releases the 96 bytes, which leaves sp pointing
# at the v12-v15 slot for the vector reloads that follow.
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x24, x25, [sp, 48]
        ldp     x26, x27, [sp, 64]
        ldr     x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

        .size   latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -39,6 +46,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
56
src/NEON/ldp-x_x_mbo-TP.Se
Normal file
56
src/NEON/ldp-x_x_mbo-TP.Se
Normal file
@@ -0,0 +1,56 @@
|
||||
#define INSTR ldp
|
||||
#define NINST 10
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: ten independent INSTR (ldp) pair loads per
// loop iteration, every one reading the same slot at [sp, #-256].
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
// Clobb:  x1-x17, v0-v2, flags.
//         x24, x25, x28 and x29 are overwritten by the loop and are put
//         back from the save area before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
//         x23 is neither saved nor touched.
latency:

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations

        // v0-v2 are set to 1.0 but not read again in this function
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
loop:
        subs    x4, x4, #1              // flags from here feed the bne below
        INSTR   x24, x1, [sp, #-256]
        INSTR   x2, x3, [sp, #-256]
        INSTR   x25, x5, [sp, #-256]
        INSTR   x6, x7, [sp, #-256]
        INSTR   x8, x9, [sp, #-256]
        INSTR   x10, x11, [sp, #-256]
        INSTR   x12, x13, [sp, #-256]
        INSTR   x14, x15, [sp, #-256]
        INSTR   x16, x17, [sp, #-256]
        INSTR   x28, x29, [sp, #-256]
        bne     loop
done:

        // pop callee-save registers from stack
        // The general-purpose frame has to be released first: the ld1
        // restores below expect sp to point at the v12-v15 save area, and
        // the caller expects x24/x25/x28/x29 and sp to be unchanged.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -50,6 +57,12 @@ loop:
|
||||
done:
|
||||
mov sp, x24
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
67
src/NEON/ldr-q_mb-TP.Se
Normal file
67
src/NEON/ldr-q_mb-TP.Se
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 10
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: ten INSTR (ldr) 128-bit loads per loop
// iteration through five fixed base addresses (sp, sp+64, sp-32, sp-64,
// sp+32), each base used twice.
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
// Clobb:  x4, x16, v0-v7, flags.
//         x24, x25, x27, x28 and v8, v9 are overwritten here and are put
//         back from the save area before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
//         x23 is neither saved nor touched.
latency:

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations

        // v0-v2 are set to 1.0; the loop overwrites them with loaded data
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x24, sp                 // x24 = sp snapshot, copied back at done

        // four extra base pointers spread around sp
        add     x16, sp, #32
        add     x25, sp, #64
        sub     x27, sp, #32
        sub     x28, sp, #64
loop:
        subs    x4, x4, #1              // flags from here feed the bne below
        INSTR   q0, [sp]
        INSTR   q1, [x25]
        INSTR   q2, [x27]
        INSTR   q3, [x28]
        INSTR   q4, [x16]
        INSTR   q5, [sp]
        INSTR   q6, [x25]
        INSTR   q7, [x27]
        INSTR   q8, [x28]
        INSTR   q9, [x16]

        bne     loop
done:
        mov     sp, x24
        // pop callee-save registers from stack
        // The general-purpose frame has to be released first: the ld1
        // restores below expect sp to point at the v12-v15 save area, and
        // the caller expects x24/x25/x27/x28 and sp to be unchanged.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
71
src/NEON/ldr-q_mb-il_1_1-str-q_mb-TP.S
Normal file
71
src/NEON/ldr-q_mb-il_1_1-str-q_mb-TP.S
Normal file
@@ -0,0 +1,71 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 8
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: eight INSTR (ldr) 128-bit loads from [sp],
// interleaved one-to-one with eight 128-bit stores to [sp - 192].
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
// Clobb:  x4, v0-v3, v6, v16, flags.
//         x24 and v8, v10, v12, v14 are overwritten here and are put back
//         from the save area before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
latency:

        // --- save callee-saved vector and general-purpose registers ---
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations
        sub     x24, sp, #192           // x24 = store target below the frame

        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        subs    x4, x4, #1              // flags from here feed the bne below
        INSTR   q1, [sp]
        str     q2, [x24]
        INSTR   q3, [sp]
        str     q5, [x24]
        INSTR   q6, [sp]
        str     q7, [x24]
        INSTR   q8, [sp]
        str     q9, [x24]
        INSTR   q10, [sp]
        str     q11, [x24]
        INSTR   q12, [sp]
        str     q13, [x24]
        INSTR   q14, [sp]
        str     q15, [x24]
        INSTR   q16, [sp]
        str     q17, [x24]
        bne     loop
done:

        // --- restore in reverse order: GPR frame first, then v12-v15, v8-v11 ---
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
65
src/NEON/ldr-q_mb-il_1_1-str-q_mb-TP.Se
Normal file
65
src/NEON/ldr-q_mb-il_1_1-str-q_mb-TP.Se
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 8
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: eight INSTR (ldr) 128-bit loads from [sp],
// interleaved one-to-one with eight 128-bit stores to [sp - 192].
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
// Clobb:  x4, v0-v3, v6, v16, flags.
//         x24 and v8, v10, v12, v14 are overwritten here and are put back
//         from the save area before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
//         x23 is neither saved nor touched.
latency:

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations
        sub     x24, sp, #192           // x24 = store target below the frame

        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        subs    x4, x4, #1              // flags from here feed the bne below
        INSTR   q1, [sp]
        str     q2, [x24]
        INSTR   q3, [sp]
        str     q5, [x24]
        INSTR   q6, [sp]
        str     q7, [x24]
        INSTR   q8, [sp]
        str     q9, [x24]
        INSTR   q10, [sp]
        str     q11, [x24]
        INSTR   q12, [sp]
        str     q13, [x24]
        INSTR   q14, [sp]
        str     q15, [x24]
        INSTR   q16, [sp]
        str     q17, [x24]
        bne     loop
done:

        // pop callee-save registers from stack
        // The general-purpose frame has to be released first: the ld1
        // restores below expect sp to point at the v12-v15 save area, and
        // the caller expects x24 and sp to be unchanged.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
79
src/NEON/ldr-q_mb-il_1_1_1-str-q_mb-add-x_x_x-TP.S
Normal file
79
src/NEON/ldr-q_mb-il_1_1_1-str-q_mb-add-x_x_x-TP.S
Normal file
@@ -0,0 +1,79 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 8
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: eight triples of
//   INSTR (ldr) q <- [sp]  /  str q -> [sp - 192]  /  add xN, xN, xN
// per loop iteration.
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
//         x0 itself is doubled by the loop after being copied to x4.
// Clobb:  x0-x8, v0-v3, v6, v16, flags.
//         x24 and v8, v10, v12, v14 are overwritten here and are put back
//         from the save area before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
latency:

        // --- save callee-saved vector and general-purpose registers ---
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations
        sub     x24, sp, #192           // x24 = store target below the frame

        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        subs    x4, x4, #1              // add (not adds) below keeps these flags
        INSTR   q1, [sp]
        str     q2, [x24]
        add     x0, x0, x0
        INSTR   q3, [sp]
        str     q5, [x24]
        add     x1, x1, x1
        INSTR   q6, [sp]
        str     q7, [x24]
        add     x2, x2, x2
        INSTR   q8, [sp]
        str     q9, [x24]
        add     x3, x3, x3
        INSTR   q10, [sp]
        str     q11, [x24]
        add     x5, x5, x5              // x4 is skipped: it is the loop counter
        INSTR   q12, [sp]
        str     q13, [x24]
        add     x6, x6, x6
        INSTR   q14, [sp]
        str     q15, [x24]
        add     x7, x7, x7
        INSTR   q16, [sp]
        str     q17, [x24]
        add     x8, x8, x8
        bne     loop
done:

        // --- restore in reverse order: GPR frame first, then v12-v15, v8-v11 ---
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
73
src/NEON/ldr-q_mb-il_1_1_1-str-q_mb-add-x_x_x-TP.Se
Normal file
73
src/NEON/ldr-q_mb-il_1_1_1-str-q_mb-add-x_x_x-TP.Se
Normal file
@@ -0,0 +1,73 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 8
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: eight triples of
//   INSTR (ldr) q <- [sp]  /  str q -> [sp - 192]  /  add xN, xN, xN
// per loop iteration.
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
//         x0 itself is doubled by the loop after being copied to x4.
// Clobb:  x0-x8, v0-v3, v6, v16, flags.
//         x24 and v8, v10, v12, v14 are overwritten here and are put back
//         from the save area before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
//         x23 is neither saved nor touched.
latency:

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations
        sub     x24, sp, #192           // x24 = store target below the frame

        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        subs    x4, x4, #1              // add (not adds) below keeps these flags
        INSTR   q1, [sp]
        str     q2, [x24]
        add     x0, x0, x0
        INSTR   q3, [sp]
        str     q5, [x24]
        add     x1, x1, x1
        INSTR   q6, [sp]
        str     q7, [x24]
        add     x2, x2, x2
        INSTR   q8, [sp]
        str     q9, [x24]
        add     x3, x3, x3
        INSTR   q10, [sp]
        str     q11, [x24]
        add     x5, x5, x5              // x4 is skipped: it is the loop counter
        INSTR   q12, [sp]
        str     q13, [x24]
        add     x6, x6, x6
        INSTR   q14, [sp]
        str     q15, [x24]
        add     x7, x7, x7
        INSTR   q16, [sp]
        str     q17, [x24]
        add     x8, x8, x8
        bne     loop
done:

        // pop callee-save registers from stack
        // The general-purpose frame has to be released first: the ld1
        // restores below expect sp to point at the v12-v15 save area, and
        // the caller expects x24 and sp to be unchanged.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
72
src/NEON/ldr-q_mb-il_2_1-str-q_mb-TP.S
Normal file
72
src/NEON/ldr-q_mb-il_2_1-str-q_mb-TP.S
Normal file
@@ -0,0 +1,72 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 6
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: six groups of two INSTR (ldr) 128-bit loads
// from [sp] followed by one 128-bit store to [sp - 192].
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
// Clobb:  x4, v0-v2, v5, v6, v17, v18, flags.
//         x24 and v8, v9, v11, v12, v14, v15 are overwritten here and are
//         put back from the save area before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
latency:

        // --- save callee-saved vector and general-purpose registers ---
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations
        sub     x24, sp, #192           // x24 = store target below the frame

        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        subs    x4, x4, #1              // flags from here feed the bne below
        INSTR   q1, [sp]
        INSTR   q2, [sp]
        str     q3, [x24]
        INSTR   q5, [sp]
        INSTR   q6, [sp]
        str     q7, [x24]
        INSTR   q8, [sp]
        INSTR   q9, [sp]
        str     q10, [x24]
        INSTR   q11, [sp]
        INSTR   q12, [sp]
        str     q13, [x24]
        INSTR   q14, [sp]
        INSTR   q15, [sp]
        str     q16, [x24]
        INSTR   q17, [sp]
        INSTR   q18, [sp]
        str     q28, [x24]
        bne     loop
done:

        // --- restore in reverse order: GPR frame first, then v12-v15, v8-v11 ---
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -98,6 +105,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
115
src/NEON/ldr-q_mbi-TP.Se
Normal file
115
src/NEON/ldr-q_mbi-TP.Se
Normal file
@@ -0,0 +1,115 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 64
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: 64 INSTR (ldr) 128-bit loads per loop
// iteration using register-offset addressing [sp, xM], with the four
// offsets -64, -128, -192 and -256 held in x24, x25, x28 and x27.
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
// Clobb:  x4, v0-v2, v6, v16, v18, flags.
//         x24, x25, x27, x28 and v8, v10, v12, v14 are overwritten here and
//         are put back from the save area before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
//         x23 is neither saved nor touched.
latency:

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations

        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

        // byte offsets relative to sp used as index registers
        mov     x24, #-64
        mov     x25, #-128
        mov     x28, #-192
        mov     x27, #-256

loop:
        subs    x4, x4, #1              // flags from here feed the bne below
        // 8 identical groups of 8 loads = 64 instructions per iteration
        .rept   8
        INSTR   q2, [sp, x24]
        INSTR   q6, [sp, x25]
        INSTR   q8, [sp, x28]
        INSTR   q10, [sp, x27]
        INSTR   q12, [sp, x24]
        INSTR   q14, [sp, x25]
        INSTR   q16, [sp, x28]
        INSTR   q18, [sp, x27]
        .endr
        bne     loop
done:

        // pop callee-save registers from stack
        // The general-purpose frame has to be released first: the ld1
        // restores below expect sp to point at the v12-v15 save area, and
        // the caller expects x24/x25/x27/x28 and sp to be unchanged.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -43,6 +50,12 @@ loop:
|
||||
done:
|
||||
mov sp, x24
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
60
src/NEON/ldr-q_mbp-LAT.Se
Normal file
60
src/NEON/ldr-q_mbp-LAT.Se
Normal file
@@ -0,0 +1,60 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 12
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: twelve INSTR (ldr) 128-bit post-indexed
// loads per loop iteration, all using sp as the base. Every load writes sp
// back, so each address depends on the previous instruction. Six steps of
// +64 are followed by six steps of -64, leaving sp where it started.
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
// Clobb:  x4, v0-v7, flags.
//         x24 and v8-v11 are overwritten here and are put back from the
//         save area before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
//         x23 is neither saved nor touched.
latency:

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations

        // v0-v2 are set to 1.0; the loop overwrites them with loaded data
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x24, sp                 // x24 = sp snapshot, copied back at done
loop:
        subs    x4, x4, #1              // flags from here feed the bne below
        INSTR   q0, [sp], #64
        INSTR   q1, [sp], #64
        INSTR   q2, [sp], #64
        INSTR   q3, [sp], #64
        INSTR   q4, [sp], #64
        INSTR   q5, [sp], #64
        INSTR   q6, [sp], #-64
        INSTR   q7, [sp], #-64
        INSTR   q8, [sp], #-64
        INSTR   q9, [sp], #-64
        INSTR   q10, [sp], #-64
        INSTR   q11, [sp], #-64

        bne     loop
done:
        mov     sp, x24
        // pop callee-save registers from stack
        // The general-purpose frame has to be released first: the ld1
        // restores below expect sp to point at the v12-v15 save area, and
        // the caller expects x24 and sp to be unchanged.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
@@ -1,5 +1,5 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 10
|
||||
#define NINST 18
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
@@ -17,14 +17,30 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
fmov v0.2d, #1.00000000
|
||||
fmov v1.2d, #1.00000000
|
||||
fmov v2.2d, #1.00000000
|
||||
mov x24, sp
|
||||
|
||||
sub sp, sp, #-128
|
||||
mov x24, sp
|
||||
mov x10, sp
|
||||
mov x11, sp
|
||||
mov x12, sp
|
||||
mov x13, sp
|
||||
add x10, x10, #16
|
||||
add x11, x11, #48
|
||||
sub x12, x12, #16
|
||||
sub x13, x13, #48
|
||||
mov x16, sp
|
||||
add x16, x16, #32
|
||||
mov x25, sp
|
||||
@@ -40,16 +56,30 @@ loop:
|
||||
INSTR q2, [x27], #64
|
||||
INSTR q3, [x28], #64
|
||||
INSTR q4, [x16], #64
|
||||
INSTR q5, [sp], #-64
|
||||
INSTR q6, [x25], #-64
|
||||
INSTR q7, [x27], #-64
|
||||
INSTR q8, [x28], #-64
|
||||
INSTR q9, [x16], #-64
|
||||
|
||||
INSTR q5, [x10], #64
|
||||
INSTR q6, [x11], #64
|
||||
INSTR q6, [x12], #64
|
||||
INSTR q6, [x13], #64
|
||||
INSTR q7, [sp], #-64
|
||||
INSTR q8, [x25], #-64
|
||||
INSTR q9, [x27], #-64
|
||||
INSTR q10, [x28], #-64
|
||||
INSTR q11, [x16], #-64
|
||||
INSTR q12, [x10], #-64
|
||||
INSTR q13, [x11], #-64
|
||||
INSTR q6, [x12], #-64
|
||||
INSTR q6, [x13], #-64
|
||||
bne loop
|
||||
done:
|
||||
mov sp, x24
|
||||
add sp, sp, #-128
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
67
src/NEON/ldr-q_mbp-TP.Se
Normal file
67
src/NEON/ldr-q_mbp-TP.Se
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 10
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: ten INSTR (ldr) 128-bit post-indexed loads
// per loop iteration through five base registers (sp, x25, x27, x28, x16).
// Each base is stepped by +64 and then by -64 within one iteration, so all
// five are back at their starting value when the loop branches.
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
// Clobb:  x4, x16, v0-v7, flags.
//         x24, x25, x27, x28 and v8, v9 are overwritten here and are put
//         back from the save area before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
//         x23 is neither saved nor touched.
latency:

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations

        // v0-v2 are set to 1.0; the loop overwrites them with loaded data
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x24, sp                 // x24 = sp snapshot, copied back at done

        // four extra base pointers spread around sp
        add     x16, sp, #32
        add     x25, sp, #64
        sub     x27, sp, #32
        sub     x28, sp, #64
loop:
        subs    x4, x4, #1              // flags from here feed the bne below
        INSTR   q0, [sp], #64
        INSTR   q1, [x25], #64
        INSTR   q2, [x27], #64
        INSTR   q3, [x28], #64
        INSTR   q4, [x16], #64
        INSTR   q5, [sp], #-64
        INSTR   q6, [x25], #-64
        INSTR   q7, [x27], #-64
        INSTR   q8, [x28], #-64
        INSTR   q9, [x16], #-64

        bne     loop
done:
        mov     sp, x24
        // pop callee-save registers from stack
        // The general-purpose frame has to be released first: the ld1
        // restores below expect sp to point at the v12-v15 save area, and
        // the caller expects x24/x25/x27/x28 and sp to be unchanged.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -38,6 +45,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
55
src/NEON/ldr-x_mb-TP.Se
Normal file
55
src/NEON/ldr-x_mb-TP.Se
Normal file
@@ -0,0 +1,55 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 8
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: eight independent INSTR (ldr) 64-bit loads
// from [sp] per loop iteration.
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
// Clobb:  x2, x4, x6, x8, x10, x12, x14, x16, x18, v0-v2, flags.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
//         x23 is neither saved nor touched.
latency:

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations

        // v0-v2 are set to 1.0 but not read again in this function
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        subs    x4, x4, #1              // flags from here feed the bne below
        INSTR   x2, [sp]
        INSTR   x6, [sp]
        INSTR   x8, [sp]
        INSTR   x10, [sp]
        INSTR   x12, [sp]
        INSTR   x14, [sp]
        INSTR   x16, [sp]
        // NOTE(review): x18 is reserved as a platform register on some
        // AArch64 ABIs -- confirm overwriting it is acceptable on the target.
        INSTR   x18, [sp]
        bne     loop
done:

        // pop callee-save registers from stack
        // The general-purpose frame has to be released first: the ld1
        // restores below expect sp to point at the v12-v15 save area, and
        // the caller expects sp to be unchanged.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
mov x24, sp
|
||||
@@ -48,6 +55,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
64
src/NEON/ldr-x_mb-il_1_1-str-x_mb-TP.Se
Normal file
64
src/NEON/ldr-x_mb-il_1_1-str-x_mb-TP.Se
Normal file
@@ -0,0 +1,64 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 8
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
// latency -- benchmark kernel: eight INSTR (ldr) 64-bit loads from [sp],
// interleaved one-to-one with eight 64-bit stores to [sp - 192].
//
// In:     N (x0) = loop trip count. Must be non-zero: the counter is
//         decremented before it is tested, so 0 would wrap around.
// Clobb:  x1, x3, x4, x6, x8, x10, x12, x14, x16, v0-v2, flags.
//         x24 is overwritten here and is put back from the save area
//         before returning.
// Stack:  224 bytes (2 x 64 for v8-v15, 96 for x29/x30, x19-x22, x24-x28).
//         x23 is neither saved nor touched.
latency:

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // The epilogue reloads x19-x22, x24-x30 from a 96-byte frame and
        // pops it, so that frame has to be pushed here. Without it the
        // reloads (including the return address in x30) would read the
        // vector save area and sp would end up 96 bytes too high.
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        mov     x4, N                   // x4 = remaining iterations
        sub     x24, sp, #192           // x24 = store target below the frame

        // v0-v2 are set to 1.0 but not read again in this function
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        subs    x4, x4, #1              // flags from here feed the bne below
        INSTR   x1, [sp]
        str     x2, [x24]
        INSTR   x3, [sp]
        str     x5, [x24]
        INSTR   x6, [sp]
        str     x7, [x24]
        INSTR   x8, [sp]
        str     x9, [x24]
        INSTR   x10, [sp]
        str     x11, [x24]
        INSTR   x12, [sp]
        str     x13, [x24]
        INSTR   x14, [sp]
        str     x15, [x24]
        INSTR   x16, [sp]
        str     x17, [x24]
        bne     loop
done:

        // pop callee-save registers from stack
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64

        ret

.size latency, .-latency
|
@@ -17,6 +17,12 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
mov x24, sp
|
||||
@@ -50,6 +56,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,6 +41,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,6 +42,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -36,6 +43,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -80,6 +87,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -41,6 +48,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
58
src/NEON/stp-d_d_mbo-TP.Se
Normal file
58
src/NEON/stp-d_d_mbo-TP.Se
Normal file
@@ -0,0 +1,58 @@
|
||||
#define INSTR stp
#define NINST 12
#define N x0

# Throughput kernel: twelve independent stp stores of d-register pairs,
# each using base register sp plus a negative immediate offset.
#   x0    - loop trip count on entry (copied to x4, the loop counter)
#   ninst - exported word holding the number of stores per iteration
# NOTE(review): a trip count of 0 is not guarded; subs would wrap around.
# NOTE(review): the store targets lie below sp, outside the frame that is
# allocated here - confirm this is acceptable on the target platform.

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

    # push callee-save registers onto stack
    # (two 64-byte SIMD areas, then one 96-byte general-purpose frame)
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]

    # x4 counts the remaining iterations
    mov     x4, N

    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
loop:
    # the flags set here are consumed by the bne at the bottom; none of
    # the stores in between modify them
    subs    x4, x4, #1
    INSTR   d0, d1, [sp, #-64]
    INSTR   d2, d3, [sp, #-128]
    INSTR   d4, d5, [sp, #-192]
    INSTR   d6, d7, [sp, #-256]
    INSTR   d8, d9, [sp, #-320]
    INSTR   d10, d11, [sp, #-384]
    INSTR   d12, d13, [sp, #-448]
    INSTR   d14, d15, [sp, #-32]
    INSTR   d16, d17, [sp, #-96]
    INSTR   d18, d19, [sp, #-160]
    INSTR   d20, d21, [sp, #-224]
    INSTR   d22, d23, [sp, #-288]
    bne     loop
done:

    # pop callee-save registers from stack
    # The 96-byte general-purpose frame must be released first; without
    # this the ld1 below would read the wrong area and the function would
    # return with sp 96 bytes too low.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64

    ret

.size latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -53,6 +60,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
70
src/NEON/stp-d_d_mbo-il_1_1-ldr-d_mb-TP.Se
Normal file
70
src/NEON/stp-d_d_mbo-il_1_1-ldr-d_mb-TP.Se
Normal file
@@ -0,0 +1,70 @@
|
||||
#define INSTR stp
#define NINST 12
#define N x0

# Throughput kernel: twelve stp stores of d-register pairs (base sp plus
# negative immediate offset), each followed by one 64-bit ldr from [sp].
#   x0    - loop trip count on entry (copied to x4, the loop counter)
#   ninst - exported word holding the number of stp per iteration
# The interleaved loads target x0-x3 and x5-x12; x4 is skipped because it
# holds the loop counter.
# NOTE(review): a trip count of 0 is not guarded; subs would wrap around.
# NOTE(review): the store targets lie below sp, outside the frame that is
# allocated here - confirm this is acceptable on the target platform.

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

    # push callee-save registers onto stack
    # (two 64-byte SIMD areas, then one 96-byte general-purpose frame)
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]

    # x4 counts the remaining iterations; x0 is free to be overwritten
    # by the loads in the loop once it has been copied
    mov     x4, N

    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
loop:
    # the flags set here are consumed by the bne at the bottom; neither
    # the stores nor the loads in between modify them
    subs    x4, x4, #1
    INSTR   d0, d1, [sp, #-64]
    ldr     x0, [sp]
    INSTR   d2, d3, [sp, #-128]
    ldr     x1, [sp]
    INSTR   d4, d5, [sp, #-192]
    ldr     x2, [sp]
    INSTR   d6, d7, [sp, #-256]
    ldr     x3, [sp]
    INSTR   d8, d9, [sp, #-320]
    ldr     x5, [sp]
    INSTR   d10, d11, [sp, #-384]
    ldr     x6, [sp]
    INSTR   d12, d13, [sp, #-448]
    ldr     x7, [sp]
    INSTR   d14, d15, [sp, #-32]
    ldr     x8, [sp]
    INSTR   d16, d17, [sp, #-96]
    ldr     x9, [sp]
    INSTR   d18, d19, [sp, #-160]
    ldr     x10, [sp]
    INSTR   d20, d21, [sp, #-224]
    ldr     x11, [sp]
    INSTR   d22, d23, [sp, #-288]
    ldr     x12, [sp]
    bne     loop
done:

    # pop callee-save registers from stack
    # The 96-byte general-purpose frame must be released first; without
    # this the ld1 below would read the wrong area and the function would
    # return with sp 96 bytes too low.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64

    ret

.size latency, .-latency
|
@@ -17,12 +17,21 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
fmov v0.2d, #1.00000000
|
||||
fmov v1.2d, #1.00000000
|
||||
fmov v2.2d, #1.00000000
|
||||
|
||||
sub sp, sp, #64
|
||||
loop:
|
||||
subs x4, x4, #1
|
||||
INSTR q0, q1, [sp]
|
||||
@@ -39,8 +48,14 @@ loop:
|
||||
INSTR q22, q23, [sp]
|
||||
bne loop
|
||||
done:
|
||||
|
||||
add sp, sp, #64
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
58
src/NEON/stp-q_q_mb-TP.Se
Normal file
58
src/NEON/stp-q_q_mb-TP.Se
Normal file
@@ -0,0 +1,58 @@
|
||||
#define INSTR stp
#define NINST 12
#define N x0

# Throughput kernel: twelve stp stores of q-register pairs, all to the
# same address [sp] (32 bytes written per store).
#   x0    - loop trip count on entry (copied to x4, the loop counter)
#   ninst - exported word holding the number of stores per iteration
# NOTE(review): a trip count of 0 is not guarded; subs would wrap around.

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

    # push callee-save registers onto stack
    # (two 64-byte SIMD areas, then one 96-byte general-purpose frame)
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]

    # x4 counts the remaining iterations
    mov     x4, N

    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000

    # Reserve a 64-byte scratch area for the stores. Without it every
    # store to [sp] would overwrite the saved x29, x30, x19 and x20 slots
    # at the bottom of the frame pushed above.
    sub     sp, sp, #64
loop:
    # the flags set here are consumed by the bne at the bottom; none of
    # the stores in between modify them
    subs    x4, x4, #1
    INSTR   q0, q1, [sp]
    INSTR   q2, q3, [sp]
    INSTR   q4, q5, [sp]
    INSTR   q6, q7, [sp]
    INSTR   q8, q9, [sp]
    INSTR   q10, q11, [sp]
    INSTR   q12, q13, [sp]
    INSTR   q14, q15, [sp]
    INSTR   q16, q17, [sp]
    INSTR   q18, q19, [sp]
    INSTR   q20, q21, [sp]
    INSTR   q22, q23, [sp]
    bne     loop
done:

    # release the scratch area used by the stores
    add     sp, sp, #64

    # pop callee-save registers from stack
    # The 96-byte general-purpose frame must be released first; without
    # this the ld1 below would read the wrong area and the function would
    # return with sp 96 bytes too low.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64

    ret

.size latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -41,6 +48,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
58
src/NEON/stp-q_q_mbo-TP.Se
Normal file
58
src/NEON/stp-q_q_mbo-TP.Se
Normal file
@@ -0,0 +1,58 @@
|
||||
#define INSTR stp
#define NINST 12
#define N x0

# Throughput kernel: twelve independent stp stores of q-register pairs,
# each using base register sp plus a negative immediate offset
# (64-byte spacing, from -64 down to -768).
#   x0    - loop trip count on entry (copied to x4, the loop counter)
#   ninst - exported word holding the number of stores per iteration
# NOTE(review): a trip count of 0 is not guarded; subs would wrap around.
# NOTE(review): the store targets lie below sp, outside the frame that is
# allocated here - confirm this is acceptable on the target platform.

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

    # push callee-save registers onto stack
    # (two 64-byte SIMD areas, then one 96-byte general-purpose frame)
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]

    # x4 counts the remaining iterations
    mov     x4, N

    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
loop:
    # the flags set here are consumed by the bne at the bottom; none of
    # the stores in between modify them
    subs    x4, x4, #1
    INSTR   q0, q1, [sp, #-64]
    INSTR   q2, q3, [sp, #-128]
    INSTR   q4, q5, [sp, #-192]
    INSTR   q6, q7, [sp, #-256]
    INSTR   q8, q9, [sp, #-320]
    INSTR   q10, q11, [sp, #-384]
    INSTR   q12, q13, [sp, #-448]
    INSTR   q14, q15, [sp, #-512]
    INSTR   q16, q17, [sp, #-576]
    INSTR   q18, q19, [sp, #-640]
    INSTR   q20, q21, [sp, #-704]
    INSTR   q22, q23, [sp, #-768]
    bne     loop
done:

    # pop callee-save registers from stack
    # The 96-byte general-purpose frame must be released first; without
    # this the ld1 below would read the wrong area and the function would
    # return with sp 96 bytes too low.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64

    ret

.size latency, .-latency
|
@@ -17,6 +17,13 @@ latency:
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
stp x29, x30, [sp, -96]!
|
||||
stp x19, x20, [sp, 16]
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x24, x25, [sp, 48]
|
||||
stp x26, x27, [sp, 64]
|
||||
str x28, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -41,6 +48,12 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ldp x19, x20, [sp, 16]
|
||||
ldp x21, x22, [sp, 32]
|
||||
ldp x24, x25, [sp, 48]
|
||||
ldp x26, x27, [sp, 64]
|
||||
ldr x28, [sp, 80]
|
||||
ldp x29, x30, [sp], 96
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
|
58
src/NEON/stp-x_x_mbo-TP.Se
Normal file
58
src/NEON/stp-x_x_mbo-TP.Se
Normal file
@@ -0,0 +1,58 @@
|
||||
#define INSTR stp
#define NINST 12
#define N x0

# Throughput kernel: twelve stp stores of general-purpose register pairs,
# all to the same address sp - 256.
#   x0    - loop trip count on entry (copied to x4, the loop counter)
#   ninst - exported word holding the number of stores per iteration
# The general-purpose registers are only read by the stores; x4 is the
# only one written inside the loop.
# NOTE(review): a trip count of 0 is not guarded; subs would wrap around.
# NOTE(review): the store target lies below sp, outside the frame that is
# allocated here - confirm this is acceptable on the target platform.

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

    # push callee-save registers onto stack
    # (two 64-byte SIMD areas, then one 96-byte general-purpose frame)
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]

    # x4 counts the remaining iterations
    mov     x4, N

    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
loop:
    # the flags set here are consumed by the bne at the bottom; none of
    # the stores in between modify them
    subs    x4, x4, #1
    INSTR   x24, x1, [sp, #-256]
    INSTR   x2, x3, [sp, #-256]
    INSTR   x25, x5, [sp, #-256]
    INSTR   x6, x7, [sp, #-256]
    INSTR   x8, x9, [sp, #-256]
    INSTR   x10, x11, [sp, #-256]
    INSTR   x12, x13, [sp, #-256]
    INSTR   x14, x15, [sp, #-256]
    INSTR   x16, x17, [sp, #-256]
    INSTR   x18, x19, [sp, #-256]
    INSTR   x20, x21, [sp, #-256]
    INSTR   x22, x23, [sp, #-256]
    bne     loop
done:

    # pop callee-save registers from stack
    # The 96-byte general-purpose frame must be released first; without
    # this the ld1 below would read the wrong area and the function would
    # return with sp 96 bytes too low.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64

    ret

.size latency, .-latency
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user