benchmarks for A64FX

This commit is contained in:
JanLJL
2020-08-12 19:29:01 +02:00
parent de3bda1e3c
commit ce05692884
156 changed files with 5659 additions and 18 deletions

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -80,6 +87,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -36,6 +43,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -80,6 +87,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -80,6 +87,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -227,6 +234,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -227,6 +234,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -37,6 +44,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -57,6 +64,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -57,6 +64,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,203 @@
/*
 * Instruction micro-benchmark kernel for AArch64 (GNU as syntax; the file
 * must be run through the C preprocessor because of the #define lines).
 *
 * Exports:
 *   ninst   - 32-bit data word holding NINST (48, which equals the number
 *             of INSTR slots executed per loop iteration).
 *   latency - benchmark routine; x0 carries the loop trip count and is
 *             never written, so it is unchanged on return.
 *
 * Loop shape: each INSTR is followed by two integer adds. Every INSTR
 * reads only v0-v2 and every add reads only x1-x3; none of those source
 * registers is written inside the loop, so no operation consumes the
 * result of another one.
 * NOTE(review): that looks like a throughput / port-pressure measurement
 * even though the entry point is named latency - confirm with the harness.
 *
 * The trip count must be non-zero: the counter is decremented before it is
 * tested, so a count of 0 wraps around instead of skipping the loop.
 *
 * Registers written: x1-x16, v0-v14 (v8-v14 are restored on exit), NZCV.
 */
#define INSTR fadd
#define NINST 48
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Spill the full 128-bit v8-v15 in two 64-byte chunks; the loop
        // overwrites v8-v14.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // 96-byte integer save area. None of these registers is written
        // by this routine, and x23 is not part of the saved set.
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        // x4 = remaining iterations; v0-v2 and x1-x3 are the constant
        // (1.0 / 1) operands read by every instruction in the loop.
        mov     x4, N
        fmov    v0.2d, #1.0
        fmov    v1.2d, #1.0
        fmov    v2.2d, #1.0
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1

.Lloop:
        // Only this subs sets flags; the bne at the bottom consumes them.
        subs    x4, x4, #1
        // The unrolled body is the same 36-instruction group (12 INSTR,
        // 24 add) four times over, emitted here via .rept.
        .rept   4
        INSTR   v3.2d, v0.2d, v0.2d
        add     x5, x1, x1
        add     x6, x2, x2
        INSTR   v4.2d, v0.2d, v1.2d
        add     x7, x3, x3
        add     x8, x1, x1
        INSTR   v5.2d, v0.2d, v2.2d
        add     x9, x2, x2
        add     x10, x3, x3
        INSTR   v6.2d, v1.2d, v1.2d
        add     x11, x1, x1
        add     x12, x2, x2
        INSTR   v7.2d, v1.2d, v2.2d
        add     x13, x3, x3
        add     x14, x1, x1
        INSTR   v8.2d, v2.2d, v2.2d
        add     x15, x2, x2
        add     x16, x3, x3
        INSTR   v9.2d, v0.2d, v2.2d
        add     x5, x2, x2
        add     x6, x3, x3
        INSTR   v10.2d, v1.2d, v1.2d
        add     x7, x1, x1
        add     x8, x2, x2
        INSTR   v11.2d, v1.2d, v2.2d
        add     x9, x3, x3
        add     x10, x1, x1
        INSTR   v12.2d, v2.2d, v2.2d
        add     x11, x2, x2
        add     x12, x3, x3
        INSTR   v13.2d, v1.2d, v2.2d
        add     x13, x3, x3
        add     x14, x1, x1
        INSTR   v14.2d, v2.2d, v2.2d
        add     x15, x2, x2
        add     x16, x3, x3
        .endr
        bne     .Lloop

.Ldone:
        // Unwind in reverse order: integer save area first, then the two
        // vector chunks.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,73 @@
/*
 * Instruction micro-benchmark kernel for AArch64 (GNU as syntax; the file
 * must be run through the C preprocessor because of the #define lines).
 *
 * Exports:
 *   ninst   - 32-bit data word holding NINST (6, which equals the number
 *             of INSTR slots executed per loop iteration).
 *   latency - benchmark routine; x0 carries the loop trip count and is
 *             never written, so it is unchanged on return.
 *
 * Loop shape: each INSTR is followed by two integer subtract-immediates.
 * Every INSTR reads only v0-v2 and every sub reads only x1-x3; none of
 * those source registers is written inside the loop, so no operation
 * consumes the result of another one.
 * NOTE(review): that looks like a throughput / port-pressure measurement
 * even though the entry point is named latency - confirm with the harness.
 *
 * The trip count must be non-zero: the counter is decremented before it is
 * tested, so a count of 0 wraps around instead of skipping the loop.
 *
 * Registers written: x1-x16, v0-v8 (v8 is restored on exit), NZCV.
 */
#define INSTR fadd
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Spill the full 128-bit v8-v15 in two 64-byte chunks; the loop
        // overwrites v8.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // 96-byte integer save area. None of these registers is written
        // by this routine, and x23 is not part of the saved set.
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        // x4 = remaining iterations; v0-v2 and x1-x3 are the constant
        // (1.0 / 1) operands read by every instruction in the loop.
        mov     x4, N
        fmov    v0.2d, #1.0
        fmov    v1.2d, #1.0
        fmov    v2.2d, #1.0
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1

.Lloop:
        // Only this subs sets flags (the plain sub below does not), so
        // the bne at the bottom still sees the counter test.
        subs    x4, x4, #1

        INSTR   v3.2d, v0.2d, v0.2d
        sub     x5, x1, #1
        sub     x6, x2, #1

        INSTR   v4.2d, v0.2d, v1.2d
        sub     x7, x3, #1
        sub     x8, x1, #1

        INSTR   v5.2d, v0.2d, v2.2d
        sub     x9, x2, #1
        sub     x10, x3, #1

        INSTR   v6.2d, v1.2d, v1.2d
        sub     x11, x1, #1
        sub     x12, x2, #1

        INSTR   v7.2d, v1.2d, v2.2d
        sub     x13, x3, #1
        sub     x14, x1, #1

        INSTR   v8.2d, v2.2d, v2.2d
        sub     x15, x2, #1
        sub     x16, x3, #1

        bne     .Lloop

.Ldone:
        // Unwind in reverse order: integer save area first, then the two
        // vector chunks.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,79 @@
/*
 * Instruction micro-benchmark kernel for AArch64 (GNU as syntax; the file
 * must be run through the C preprocessor because of the #define lines).
 *
 * Exports:
 *   ninst   - 32-bit data word holding NINST (6, which equals the number
 *             of INSTR slots executed per loop iteration).
 *   latency - benchmark routine; x0 carries the loop trip count and is
 *             never written, so it is unchanged on return.
 *
 * Loop shape: each INSTR is followed by three integer adds. Every INSTR
 * reads only v0-v2 and every add reads only x1-x3; none of those source
 * registers is written inside the loop, so no operation consumes the
 * result of another one.
 * NOTE(review): that looks like a throughput / port-pressure measurement
 * even though the entry point is named latency - confirm with the harness.
 *
 * The trip count must be non-zero: the counter is decremented before it is
 * tested, so a count of 0 wraps around instead of skipping the loop.
 *
 * Registers written: x1-x16, v0-v8 (v8 is restored on exit), NZCV.
 */
#define INSTR fadd
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Spill the full 128-bit v8-v15 in two 64-byte chunks; the loop
        // overwrites v8.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // 96-byte integer save area. None of these registers is written
        // by this routine, and x23 is not part of the saved set.
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        // x4 = remaining iterations; v0-v2 and x1-x3 are the constant
        // (1.0 / 1) operands read by every instruction in the loop.
        mov     x4, N
        fmov    v0.2d, #1.0
        fmov    v1.2d, #1.0
        fmov    v2.2d, #1.0
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1

.Lloop:
        // Only this subs sets flags; the bne at the bottom consumes them.
        subs    x4, x4, #1

        INSTR   v3.2d, v0.2d, v0.2d
        add     x5, x1, x1
        add     x6, x2, x2
        add     x7, x3, x3

        INSTR   v4.2d, v0.2d, v1.2d
        add     x8, x1, x1
        add     x9, x2, x2
        add     x10, x3, x3

        INSTR   v5.2d, v0.2d, v2.2d
        add     x11, x1, x1
        add     x12, x2, x2
        add     x13, x3, x3

        INSTR   v6.2d, v1.2d, v1.2d
        add     x14, x1, x1
        add     x15, x2, x2
        add     x16, x3, x3

        INSTR   v7.2d, v1.2d, v2.2d
        add     x8, x1, x1
        add     x9, x2, x2
        add     x10, x3, x3

        INSTR   v8.2d, v2.2d, v2.2d
        add     x11, x1, x1
        add     x12, x2, x2
        add     x13, x3, x3

        bne     .Lloop

.Ldone:
        // Unwind in reverse order: integer save area first, then the two
        // vector chunks.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,73 @@
/*
 * Instruction micro-benchmark kernel for AArch64 (GNU as syntax; the file
 * must be run through the C preprocessor because of the #define lines).
 *
 * Exports:
 *   ninst   - 32-bit data word holding NINST (6).
 *   latency - benchmark routine; x0 carries the loop trip count and is
 *             never written, so it is unchanged on return.
 *
 * Loop shape: six groups of two INSTR followed by one integer mul, i.e.
 * 12 INSTR and 6 mul per iteration.
 * NOTE(review): NINST is 6 although the loop holds 12 INSTR slots -
 * presumably it counts groups (or mul instructions); confirm against how
 * the harness scales its result.
 *
 * Every INSTR reads only v0-v2 and every mul reads only x1-x3; none of
 * those source registers is written inside the loop, so no operation
 * consumes the result of another one.
 * NOTE(review): that looks like a throughput / port-pressure measurement
 * even though the entry point is named latency - confirm with the harness.
 *
 * The trip count must be non-zero: the counter is decremented before it is
 * tested, so a count of 0 wraps around instead of skipping the loop.
 *
 * Registers written: x1-x10, v0-v14 (v8-v14 are restored on exit), NZCV.
 */
#define INSTR fadd
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Spill the full 128-bit v8-v15 in two 64-byte chunks; the loop
        // overwrites v8-v14.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // 96-byte integer save area. None of these registers is written
        // by this routine, and x23 is not part of the saved set.
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        // x4 = remaining iterations; v0-v2 and x1-x3 are the constant
        // (1.0 / 1) operands read by every instruction in the loop.
        mov     x4, N
        fmov    v0.2d, #1.0
        fmov    v1.2d, #1.0
        fmov    v2.2d, #1.0
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1

.Lloop:
        // Only this subs sets flags; the bne at the bottom consumes them.
        subs    x4, x4, #1

        INSTR   v3.2d, v0.2d, v0.2d
        INSTR   v4.2d, v0.2d, v1.2d
        mul     x5, x1, x1

        INSTR   v5.2d, v0.2d, v2.2d
        INSTR   v6.2d, v1.2d, v1.2d
        mul     x6, x2, x2

        INSTR   v7.2d, v1.2d, v2.2d
        INSTR   v8.2d, v2.2d, v2.2d
        mul     x7, x3, x3

        INSTR   v9.2d, v2.2d, v2.2d
        INSTR   v10.2d, v2.2d, v2.2d
        mul     x8, x1, x1

        INSTR   v11.2d, v2.2d, v2.2d
        INSTR   v12.2d, v2.2d, v2.2d
        mul     x9, x2, x2

        INSTR   v13.2d, v2.2d, v2.2d
        INSTR   v14.2d, v2.2d, v2.2d
        mul     x10, x3, x3

        bne     .Lloop

.Ldone:
        // Unwind in reverse order: integer save area first, then the two
        // vector chunks.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -0,0 +1,73 @@
/*
 * Instruction micro-benchmark kernel for AArch64 (GNU as syntax; the file
 * must be run through the C preprocessor because of the #define lines).
 *
 * Exports:
 *   ninst   - 32-bit data word holding NINST (6, which equals the number
 *             of INSTR slots executed per loop iteration).
 *   latency - benchmark routine; x0 carries the loop trip count and is
 *             never written, so it is unchanged on return.
 *
 * Loop shape: each INSTR (on the .4s arrangement) is followed by two
 * integer adds. Every INSTR reads only v0-v2 and every add reads only
 * x1-x3; none of those source registers is written inside the loop, so no
 * operation consumes the result of another one.
 * NOTE(review): that looks like a throughput / port-pressure measurement
 * even though the entry point is named latency - confirm with the harness.
 *
 * The trip count must be non-zero: the counter is decremented before it is
 * tested, so a count of 0 wraps around instead of skipping the loop.
 *
 * Registers written: x1-x16, v0-v8 (v8 is restored on exit), NZCV.
 */
#define INSTR fadd
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Spill the full 128-bit v8-v15 in two 64-byte chunks; the loop
        // overwrites v8. The spill uses the .2d arrangement while the
        // loop computes on .4s; all 128 bits are stored either way.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // 96-byte integer save area. None of these registers is written
        // by this routine, and x23 is not part of the saved set.
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x24, x25, [sp, #48]
        stp     x26, x27, [sp, #64]
        str     x28, [sp, #80]

        // x4 = remaining iterations; v0-v2 and x1-x3 are the constant
        // (1.0 / 1) operands read by every instruction in the loop.
        mov     x4, N
        fmov    v0.4s, #1.0
        fmov    v1.4s, #1.0
        fmov    v2.4s, #1.0
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1

.Lloop:
        // Only this subs sets flags; the bne at the bottom consumes them.
        subs    x4, x4, #1

        INSTR   v3.4s, v0.4s, v0.4s
        add     x5, x1, x1
        add     x6, x2, x2

        INSTR   v4.4s, v0.4s, v1.4s
        add     x7, x3, x3
        add     x8, x1, x1

        INSTR   v5.4s, v0.4s, v2.4s
        add     x9, x2, x2
        add     x10, x3, x3

        INSTR   v6.4s, v1.4s, v1.4s
        add     x11, x1, x1
        add     x12, x2, x2

        INSTR   v7.4s, v1.4s, v2.4s
        add     x13, x3, x3
        add     x14, x1, x1

        INSTR   v8.4s, v2.4s, v2.4s
        add     x15, x2, x2
        add     x16, x3, x3

        bne     .Lloop

.Ldone:
        // Unwind in reverse order: integer save area first, then the two
        // vector chunks.
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
        ldp     x24, x25, [sp, #48]
        ldp     x26, x27, [sp, #64]
        ldr     x28, [sp, #80]
        ldp     x29, x30, [sp], #96
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -61,6 +68,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -61,6 +68,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -67,6 +74,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -71,6 +78,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -61,6 +68,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -61,6 +68,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -88,6 +95,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -116,6 +123,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -58,6 +65,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -58,6 +65,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -44,6 +51,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -61,6 +68,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -67,6 +74,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -61,6 +68,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -67,6 +74,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -68,6 +75,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -66,6 +73,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -68,6 +75,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -66,6 +73,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -41,6 +48,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,58 @@
#define INSTR ldp
#define NINST 12
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel for INSTR (ldp) on D-register pairs.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. The epilogue has to
// release that frame before it reloads v8-v15.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
// Measured loop: 12 loads that all read the same 16 bytes at sp-256 into
// distinct destination pairs. subs sets the flags consumed by bne; the
// loads in between do not touch them.
loop:
subs x4, x4, #1
INSTR d24, d1, [sp, #-256]
INSTR d2, d3, [sp, #-256]
INSTR d25, d5, [sp, #-256]
INSTR d6, d7, [sp, #-256]
INSTR d8, d9, [sp, #-256]
INSTR d10, d11, [sp, #-256]
INSTR d12, d13, [sp, #-256]
INSTR d14, d15, [sp, #-256]
INSTR d16, d17, [sp, #-256]
INSTR d18, d19, [sp, #-256]
INSTR d20, d21, [sp, #-256]
INSTR d22, d23, [sp, #-256]
bne loop
done:
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. Without this the ld1 restores
// below read v8-v15 from the GPR slots and sp is returned 96 bytes low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -53,6 +60,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,70 @@
#define INSTR ldp
#define NINST 12
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel interleaving INSTR (ldp on D-register pairs)
// with scalar ldr loads.
// In: x0 = N, the loop trip count (copied into x4 before x0 is overwritten
// by the first ldr in the loop).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. The epilogue has to
// release that frame before it reloads v8-v15.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
// Measured loop: 12 (ldp, ldr) pairs. Each ldp reads at sp-256 and each
// ldr reads at sp; the ldr destinations skip x4, the loop counter.
loop:
subs x4, x4, #1
INSTR d24, d1, [sp, #-256]
ldr x0, [sp]
INSTR d2, d3, [sp, #-256]
ldr x1, [sp]
INSTR d25, d5, [sp, #-256]
ldr x2, [sp]
INSTR d6, d7, [sp, #-256]
ldr x3, [sp]
INSTR d8, d9, [sp, #-256]
ldr x5, [sp]
INSTR d10, d11, [sp, #-256]
ldr x6, [sp]
INSTR d12, d13, [sp, #-256]
ldr x7, [sp]
INSTR d14, d15, [sp, #-256]
ldr x8, [sp]
INSTR d16, d17, [sp, #-256]
ldr x9, [sp]
INSTR d18, d19, [sp, #-256]
ldr x10, [sp]
INSTR d20, d21, [sp, #-256]
ldr x11, [sp]
INSTR d22, d23, [sp, #-256]
ldr x12, [sp]
bne loop
done:
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. Without this the ld1 restores
// below read v8-v15 from the GPR slots and sp is returned 96 bytes low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -41,6 +48,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,58 @@
#define INSTR ldp
#define NINST 12
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel for INSTR (ldp) on Q-register pairs.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. The epilogue has to
// release that frame before it reloads v8-v15.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
// Measured loop: 12 pair loads that all read the same 32 bytes at sp-256
// into q0-q23. q8-q15 are overwritten here, which is why the full 128-bit
// v8-v15 are saved above and reloaded below.
loop:
subs x4, x4, #1
INSTR q0, q1, [sp, #-256]
INSTR q2, q3, [sp, #-256]
INSTR q4, q5, [sp, #-256]
INSTR q6, q7, [sp, #-256]
INSTR q8, q9, [sp, #-256]
INSTR q10, q11, [sp, #-256]
INSTR q12, q13, [sp, #-256]
INSTR q14, q15, [sp, #-256]
INSTR q16, q17, [sp, #-256]
INSTR q18, q19, [sp, #-256]
INSTR q20, q21, [sp, #-256]
INSTR q22, q23, [sp, #-256]
bne loop
done:
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. Without this the ld1 restores
// below read v8-v15 from the GPR slots and sp is returned 96 bytes low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -45,6 +52,12 @@ loop:
done:
mov sp, x24
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,62 @@
#define INSTR ldp
#define NINST 14
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel for post-indexed INSTR (ldp) on Q-register
// pairs, using sp itself as the written-back base register.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. x24 keeps a copy of
// sp so that sp can be reset after the loop.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x24, sp
// Measured loop: seven loads step sp up by 64 each, then seven step it back
// down by 64 each, so sp is unchanged at the end of every iteration. Each
// load depends on the sp write-back of the previous one.
loop:
subs x4, x4, #1
INSTR q0, q1, [sp], #64
INSTR q2, q3, [sp], #64
INSTR q4, q5, [sp], #64
INSTR q6, q7, [sp], #64
INSTR q8, q9, [sp], #64
INSTR q10, q11, [sp], #64
INSTR q12, q13, [sp], #64
INSTR q14, q15, [sp], #-64
INSTR q16, q17, [sp], #-64
INSTR q18, q19, [sp], #-64
INSTR q20, q21, [sp], #-64
INSTR q22, q23, [sp], #-64
INSTR q25, q26, [sp], #-64
INSTR q27, q28, [sp], #-64
bne loop
done:
mov sp, x24
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. This gives the caller back its x24
// (used above as the sp backup) and moves sp up to the vector save area;
// without it the ld1 restores read the GPR slots and sp returns 96 low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -50,6 +57,12 @@ loop:
done:
mov sp, x24
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,67 @@
#define INSTR ldp
#define NINST 10
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel for post-indexed INSTR (ldp) on Q-register
// pairs, spread over five independent base registers.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28.
// Base registers: sp, x16 = sp+32, x25 = sp+64, x27 = sp-32, x28 = sp-64.
// x24 keeps a copy of sp so that sp can be reset after the loop.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x24, sp
mov x16, sp
add x16, x16, #32
mov x25, sp
add x25, x25, #64
mov x27, sp
sub x27, x27, #32
mov x28, sp
sub x28, x28, #64
// Measured loop: each base is advanced by 64 once and moved back by 64
// once, so all five bases are unchanged at the end of every iteration.
loop:
subs x4, x4, #1
INSTR q0, q1, [sp], #64
INSTR q2, q3, [x25], #64
INSTR q4, q5, [x27], #64
INSTR q6, q7, [x28], #64
INSTR q18, q19, [x16], #64
INSTR q8, q9, [sp], #-64
INSTR q10, q11, [x25], #-64
INSTR q12, q13, [x27], #-64
INSTR q14, q15, [x28], #-64
INSTR q20, q21, [x16], #-64
bne loop
done:
mov sp, x24
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. This gives the caller back
// x24/x25/x27/x28 (all overwritten above) and moves sp up to the vector
// save area; without it the ld1 restores read the GPR slots and sp is
// returned 96 bytes low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -39,6 +46,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,56 @@
#define INSTR ldp
#define NINST 10
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel for INSTR (ldp) on X-register pairs.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. The loop loads into
// x24, x25, x28 and x29, so those saved copies must be reloaded on exit.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
// Measured loop: 10 pair loads that all read the same 16 bytes at sp-256.
// The destinations avoid x4 (loop counter) and x30 (return address).
loop:
subs x4, x4, #1
INSTR x24, x1, [sp, #-256]
INSTR x2, x3, [sp, #-256]
INSTR x25, x5, [sp, #-256]
INSTR x6, x7, [sp, #-256]
INSTR x8, x9, [sp, #-256]
INSTR x10, x11, [sp, #-256]
INSTR x12, x13, [sp, #-256]
INSTR x14, x15, [sp, #-256]
INSTR x16, x17, [sp, #-256]
INSTR x28, x29, [sp, #-256]
bne loop
done:
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. This gives the caller back
// x24/x25/x28/x29 (overwritten by the loads above) and moves sp up to the
// vector save area; without it the ld1 restores read the GPR slots and sp
// is returned 96 bytes low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -50,6 +57,12 @@ loop:
done:
mov sp, x24
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

67
src/NEON/ldr-q_mb-TP.Se Normal file
View File

@@ -0,0 +1,67 @@
#define INSTR ldr
#define NINST 10
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel for INSTR (ldr) on Q registers, with plain
// base-register addressing spread over five base registers.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28.
// Base registers: sp, x16 = sp+32, x25 = sp+64, x27 = sp-32, x28 = sp-64.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x24, sp
mov x16, sp
add x16, x16, #32
mov x25, sp
add x25, x25, #64
mov x27, sp
sub x27, x27, #32
mov x28, sp
sub x28, x28, #64
// Measured loop: 10 loads with no write-back, two from each base address.
loop:
subs x4, x4, #1
INSTR q0, [sp]
INSTR q1, [x25]
INSTR q2, [x27]
INSTR q3, [x28]
INSTR q4, [x16]
INSTR q5, [sp]
INSTR q6, [x25]
INSTR q7, [x27]
INSTR q8, [x28]
INSTR q9, [x16]
bne loop
done:
mov sp, x24
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. This gives the caller back
// x24/x25/x27/x28 (all overwritten above) and moves sp up to the vector
// save area; without it the ld1 restores read the GPR slots and sp is
// returned 96 bytes low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,71 @@
#define INSTR ldr
#define NINST 8
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel alternating INSTR (ldr) Q-register loads
// with Q-register stores.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. The epilogue unwinds
// both in reverse order.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
// x24 = sp-192: store target, a different address from the load address.
mov x24, sp
sub x24, x24, #192
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
// Measured loop: 8 (load, store) pairs. Every load reads at sp and every
// store writes at x24; no store sources a register loaded in this loop
// except through the previous iteration.
loop:
subs x4, x4, #1
INSTR q1, [sp]
str q2, [x24]
INSTR q3, [sp]
str q5, [x24]
INSTR q6, [sp]
str q7, [x24]
INSTR q8, [sp]
str q9, [x24]
INSTR q10, [sp]
str q11, [x24]
INSTR q12, [sp]
str q13, [x24]
INSTR q14, [sp]
str q15, [x24]
INSTR q16, [sp]
str q17, [x24]
bne loop
done:
# pop callee-save registers from stack
// The GPR frame sits below the vector save area, so it is popped first;
// this also gives the caller back x24.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
#define INSTR ldr
#define NINST 8
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel alternating INSTR (ldr) Q-register loads
// with Q-register stores.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. The epilogue has to
// release that frame before it reloads v8-v15.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
// x24 = sp-192: store target, a different address from the load address.
mov x24, sp
sub x24, x24, #192
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
// Measured loop: 8 (load, store) pairs. Every load reads at sp and every
// store writes at x24.
loop:
subs x4, x4, #1
INSTR q1, [sp]
str q2, [x24]
INSTR q3, [sp]
str q5, [x24]
INSTR q6, [sp]
str q7, [x24]
INSTR q8, [sp]
str q9, [x24]
INSTR q10, [sp]
str q11, [x24]
INSTR q12, [sp]
str q13, [x24]
INSTR q14, [sp]
str q15, [x24]
INSTR q16, [sp]
str q17, [x24]
bne loop
done:
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. This gives the caller back x24
// (used above as the store pointer) and moves sp up to the vector save
// area; without it the ld1 restores read the GPR slots and sp is returned
// 96 bytes low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,79 @@
#define INSTR ldr
#define NINST 8
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel repeating a (INSTR load, Q store, integer
// add) triple.
// In: x0 = N, the loop trip count (copied into x4 before x0 is modified by
// the first add in the loop).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. The epilogue unwinds
// both in reverse order.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
// x24 = sp-192: store target, a different address from the load address.
mov x24, sp
sub x24, x24, #192
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
// Measured loop: 8 (load, store, add) triples. The adds use the
// non-flag-setting form, so bne still sees the flags written by subs, and
// they skip x4, the loop counter.
loop:
subs x4, x4, #1
INSTR q1, [sp]
str q2, [x24]
add x0, x0, x0
INSTR q3, [sp]
str q5, [x24]
add x1, x1, x1
INSTR q6, [sp]
str q7, [x24]
add x2, x2, x2
INSTR q8, [sp]
str q9, [x24]
add x3, x3, x3
INSTR q10, [sp]
str q11, [x24]
add x5, x5, x5
INSTR q12, [sp]
str q13, [x24]
add x6, x6, x6
INSTR q14, [sp]
str q15, [x24]
add x7, x7, x7
INSTR q16, [sp]
str q17, [x24]
add x8, x8, x8
bne loop
done:
# pop callee-save registers from stack
// The GPR frame sits below the vector save area, so it is popped first;
// this also gives the caller back x24.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,73 @@
#define INSTR ldr
#define NINST 8
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel repeating a (INSTR load, Q store, integer
// add) triple.
// In: x0 = N, the loop trip count (copied into x4 before x0 is modified by
// the first add in the loop).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. The epilogue has to
// release that frame before it reloads v8-v15.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
// x24 = sp-192: store target, a different address from the load address.
mov x24, sp
sub x24, x24, #192
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
// Measured loop: 8 (load, store, add) triples. The adds use the
// non-flag-setting form, so bne still sees the flags written by subs, and
// they skip x4, the loop counter.
loop:
subs x4, x4, #1
INSTR q1, [sp]
str q2, [x24]
add x0, x0, x0
INSTR q3, [sp]
str q5, [x24]
add x1, x1, x1
INSTR q6, [sp]
str q7, [x24]
add x2, x2, x2
INSTR q8, [sp]
str q9, [x24]
add x3, x3, x3
INSTR q10, [sp]
str q11, [x24]
add x5, x5, x5
INSTR q12, [sp]
str q13, [x24]
add x6, x6, x6
INSTR q14, [sp]
str q15, [x24]
add x7, x7, x7
INSTR q16, [sp]
str q17, [x24]
add x8, x8, x8
bne loop
done:
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. This gives the caller back x24
// (used above as the store pointer) and moves sp up to the vector save
// area; without it the ld1 restores read the GPR slots and sp is returned
// 96 bytes low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,72 @@
#define INSTR ldr
#define NINST 6
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel repeating two INSTR (ldr) Q-register loads
// followed by one Q-register store.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. The epilogue unwinds
// both in reverse order.
// NOTE(review): NINST is defined as 6 in this file while the loop holds 12
// loads and 6 stores - presumably the harness counts load/load/store
// groups; confirm.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
// x24 = sp-192: store target, a different address from the load address.
mov x24, sp
sub x24, x24, #192
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
// Measured loop: 6 (load, load, store) groups. Every load reads at sp and
// every store writes at x24.
loop:
subs x4, x4, #1
INSTR q1, [sp]
INSTR q2, [sp]
str q3, [x24]
INSTR q5, [sp]
INSTR q6, [sp]
str q7, [x24]
INSTR q8, [sp]
INSTR q9, [sp]
str q10, [x24]
INSTR q11, [sp]
INSTR q12, [sp]
str q13, [x24]
INSTR q14, [sp]
INSTR q15, [sp]
str q16, [x24]
INSTR q17, [sp]
INSTR q18, [sp]
str q28, [x24]
bne loop
done:
# pop callee-save registers from stack
// The GPR frame sits below the vector save area, so it is popped first;
// this also gives the caller back x24.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -98,6 +105,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

115
src/NEON/ldr-q_mbi-TP.Se Normal file
View File

@@ -0,0 +1,115 @@
#define INSTR ldr
#define NINST 64
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel for INSTR (ldr) on Q registers using
// register-offset addressing ([sp, Xm]).
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28.
// Offset registers: x24 = -64, x25 = -128, x28 = -192, x27 = -256.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x24, #-64
mov x25, #-128
mov x28, #-192
mov x27, #-256
// Measured loop: 64 loads, the same group of 8 repeated 8 times. Each group
// cycles through the four offsets twice with 8 distinct destinations.
loop:
subs x4, x4, #1
INSTR q2, [sp, x24]
INSTR q6, [sp, x25]
INSTR q8, [sp, x28]
INSTR q10, [sp, x27]
INSTR q12, [sp, x24]
INSTR q14, [sp, x25]
INSTR q16, [sp, x28]
INSTR q18, [sp, x27]
INSTR q2, [sp, x24]
INSTR q6, [sp, x25]
INSTR q8, [sp, x28]
INSTR q10, [sp, x27]
INSTR q12, [sp, x24]
INSTR q14, [sp, x25]
INSTR q16, [sp, x28]
INSTR q18, [sp, x27]
INSTR q2, [sp, x24]
INSTR q6, [sp, x25]
INSTR q8, [sp, x28]
INSTR q10, [sp, x27]
INSTR q12, [sp, x24]
INSTR q14, [sp, x25]
INSTR q16, [sp, x28]
INSTR q18, [sp, x27]
INSTR q2, [sp, x24]
INSTR q6, [sp, x25]
INSTR q8, [sp, x28]
INSTR q10, [sp, x27]
INSTR q12, [sp, x24]
INSTR q14, [sp, x25]
INSTR q16, [sp, x28]
INSTR q18, [sp, x27]
INSTR q2, [sp, x24]
INSTR q6, [sp, x25]
INSTR q8, [sp, x28]
INSTR q10, [sp, x27]
INSTR q12, [sp, x24]
INSTR q14, [sp, x25]
INSTR q16, [sp, x28]
INSTR q18, [sp, x27]
INSTR q2, [sp, x24]
INSTR q6, [sp, x25]
INSTR q8, [sp, x28]
INSTR q10, [sp, x27]
INSTR q12, [sp, x24]
INSTR q14, [sp, x25]
INSTR q16, [sp, x28]
INSTR q18, [sp, x27]
INSTR q2, [sp, x24]
INSTR q6, [sp, x25]
INSTR q8, [sp, x28]
INSTR q10, [sp, x27]
INSTR q12, [sp, x24]
INSTR q14, [sp, x25]
INSTR q16, [sp, x28]
INSTR q18, [sp, x27]
INSTR q2, [sp, x24]
INSTR q6, [sp, x25]
INSTR q8, [sp, x28]
INSTR q10, [sp, x27]
INSTR q12, [sp, x24]
INSTR q14, [sp, x25]
INSTR q16, [sp, x28]
INSTR q18, [sp, x27]
bne loop
done:
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. This gives the caller back
// x24/x25/x27/x28 (used above as offset registers) and moves sp up to the
// vector save area; without it the ld1 restores read the GPR slots and sp
// is returned 96 bytes low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -43,6 +50,12 @@ loop:
done:
mov sp, x24
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

60
src/NEON/ldr-q_mbp-LAT.Se Normal file
View File

@@ -0,0 +1,60 @@
#define INSTR ldr
#define NINST 12
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel for post-indexed INSTR (ldr) on Q registers,
// using sp itself as the written-back base register.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28. x24 keeps a copy of
// sp so that sp can be reset after the loop.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x24, sp
// Measured loop: six loads step sp up by 64 each, then six step it back
// down by 64 each, so sp is unchanged at the end of every iteration. Each
// load depends on the sp write-back of the previous one.
loop:
subs x4, x4, #1
INSTR q0, [sp], #64
INSTR q1, [sp], #64
INSTR q2, [sp], #64
INSTR q3, [sp], #64
INSTR q4, [sp], #64
INSTR q5, [sp], #64
INSTR q6, [sp], #-64
INSTR q7, [sp], #-64
INSTR q8, [sp], #-64
INSTR q9, [sp], #-64
INSTR q10, [sp], #-64
INSTR q11, [sp], #-64
bne loop
done:
mov sp, x24
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. This gives the caller back its x24
// (used above as the sp backup) and moves sp up to the vector save area;
// without it the ld1 restores read the GPR slots and sp returns 96 low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -1,5 +1,5 @@
#define INSTR ldr
#define NINST 10
#define NINST 18
#define N x0
.globl ninst
@@ -17,14 +17,30 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x24, sp
sub sp, sp, #-128
mov x24, sp
mov x10, sp
mov x11, sp
mov x12, sp
mov x13, sp
add x10, x10, #16
add x11, x11, #48
sub x12, x12, #16
sub x13, x13, #48
mov x16, sp
add x16, x16, #32
mov x25, sp
@@ -40,16 +56,30 @@ loop:
INSTR q2, [x27], #64
INSTR q3, [x28], #64
INSTR q4, [x16], #64
INSTR q5, [sp], #-64
INSTR q6, [x25], #-64
INSTR q7, [x27], #-64
INSTR q8, [x28], #-64
INSTR q9, [x16], #-64
INSTR q5, [x10], #64
INSTR q6, [x11], #64
INSTR q6, [x12], #64
INSTR q6, [x13], #64
INSTR q7, [sp], #-64
INSTR q8, [x25], #-64
INSTR q9, [x27], #-64
INSTR q10, [x28], #-64
INSTR q11, [x16], #-64
INSTR q12, [x10], #-64
INSTR q13, [x11], #-64
INSTR q6, [x12], #-64
INSTR q6, [x13], #-64
bne loop
done:
mov sp, x24
add sp, sp, #-128
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

67
src/NEON/ldr-q_mbp-TP.Se Normal file
View File

@@ -0,0 +1,67 @@
#define INSTR ldr
#define NINST 10
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
// latency(N): benchmark kernel for post-indexed INSTR (ldr) on Q registers,
// spread over five independent base registers.
// In: x0 = N, the loop trip count (copied into x4).
// The prologue stores v8-v15 in two 64-byte st1 blocks and then builds a
// 96-byte frame holding x29/x30, x19-x22 and x24-x28.
// Base registers: sp, x16 = sp+32, x25 = sp+64, x27 = sp-32, x28 = sp-64.
// x24 keeps a copy of sp so that sp can be reset after the loop.
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x24, sp
mov x16, sp
add x16, x16, #32
mov x25, sp
add x25, x25, #64
mov x27, sp
sub x27, x27, #32
mov x28, sp
sub x28, x28, #64
// Measured loop: each base is advanced by 64 once and moved back by 64
// once, so all five bases are unchanged at the end of every iteration.
loop:
subs x4, x4, #1
INSTR q0, [sp], #64
INSTR q1, [x25], #64
INSTR q2, [x27], #64
INSTR q3, [x28], #64
INSTR q4, [x16], #64
INSTR q5, [sp], #-64
INSTR q6, [x25], #-64
INSTR q7, [x27], #-64
INSTR q8, [x28], #-64
INSTR q9, [x16], #-64
bne loop
done:
mov sp, x24
# pop callee-save registers from stack
// Fix: pop the 96-byte GPR frame first. This gives the caller back
// x24/x25/x27/x28 (all overwritten above) and moves sp up to the vector
// save area; without it the ld1 restores read the GPR slots and sp is
// returned 96 bytes low.
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -38,6 +45,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

55
src/NEON/ldr-x_mb-TP.Se Normal file
View File

@@ -0,0 +1,55 @@
# Benchmark parameters (expanded by the C preprocessor):
#   INSTR  mnemonic used for every measured line of the loop below
#   NINST  how many INSTR lines one loop iteration contains (8 here)
#   N      register that holds the iteration count on entry
#define INSTR ldr
#define NINST 8
#define N x0
# Publish NINST as the 32-bit data word ninst (presumably read by the
# measurement harness; the consumer is not visible in this file).
.globl ninst
.data
ninst:
.long NINST
.text
# Declare the global function symbol latency and align its entry point.
.globl latency
.type latency, @function
.align 2
latency:
    # Timed loop of INSTR loads from [sp] into eight x registers.
    # In: N (x0) holds the iteration count; it is copied to x4 and counted
    # down to zero by the loop.
    #
    # push callee-save registers onto stack
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]
    mov     x4, N
    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
loop:
    subs    x4, x4, #1
    # All loads read the same address and write distinct destinations.
    # NOTE(review): x18 is written here; AAPCS64 names it the platform
    # register, so confirm the target platform leaves it free.
    INSTR   x2, [sp]
    INSTR   x6, [sp]
    INSTR   x8, [sp]
    INSTR   x10, [sp]
    INSTR   x12, [sp]
    INSTR   x14, [sp]
    INSTR   x16, [sp]
    INSTR   x18, [sp]
    bne     loop
done:
    # pop callee-save registers from stack
    # The x-register area is released first, mirroring the prologue.
    # Skipping this step makes the ld1 pair read the x-register slots
    # instead of v8-v15 and returns with sp below its entry value.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64
    ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
mov x24, sp
@@ -48,6 +55,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,64 @@
# Benchmark parameters (expanded by the C preprocessor):
#   INSTR  mnemonic used for every measured line of the loop below
#   NINST  how many INSTR lines one loop iteration contains (8 here;
#          the interleaved str lines are not counted)
#   N      register that holds the iteration count on entry
#define INSTR ldr
#define NINST 8
#define N x0
# Publish NINST as the 32-bit data word ninst (presumably read by the
# measurement harness; the consumer is not visible in this file).
.globl ninst
.data
ninst:
.long NINST
.text
# Declare the global function symbol latency and align its entry point.
.globl latency
.type latency, @function
.align 2
latency:
    # Timed loop that alternates INSTR loads from [sp] with str stores to
    # a scratch slot 192 bytes below sp.
    # In: N (x0) holds the iteration count; it is copied to x4 and counted
    # down to zero by the loop.
    #
    # push callee-save registers onto stack
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    # The epilogue reloads x19-x30 from a 96-byte area at [sp] and then
    # releases it, so that area has to be created and filled here. This
    # also preserves x24, which is used as the store base below.
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]
    mov     x4, N
    # x24 = store target, 192 bytes below the current sp
    mov     x24, sp
    sub     x24, x24, #192
    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
loop:
    subs    x4, x4, #1
    # Loads write x1/x3/x6/.../x16, stores read x2/x5/x7/.../x17; x4 is
    # left out of both sets because it is the loop counter.
    INSTR   x1, [sp]
    str     x2, [x24]
    INSTR   x3, [sp]
    str     x5, [x24]
    INSTR   x6, [sp]
    str     x7, [x24]
    INSTR   x8, [sp]
    str     x9, [x24]
    INSTR   x10, [sp]
    str     x11, [x24]
    INSTR   x12, [sp]
    str     x13, [x24]
    INSTR   x14, [sp]
    str     x15, [x24]
    INSTR   x16, [sp]
    str     x17, [x24]
    bne     loop
done:
    # pop callee-save registers from stack
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64
    ret
.size latency, .-latency

View File

@@ -17,6 +17,12 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
mov x24, sp
@@ -50,6 +56,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -34,6 +41,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -35,6 +42,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -36,6 +43,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -80,6 +87,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -41,6 +48,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,58 @@
# Benchmark parameters (expanded by the C preprocessor):
#   INSTR  mnemonic used for every measured line of the loop below
#   NINST  how many INSTR lines one loop iteration contains (12 here)
#   N      register that holds the iteration count on entry
#define INSTR stp
#define NINST 12
#define N x0
# Publish NINST as the 32-bit data word ninst (presumably read by the
# measurement harness; the consumer is not visible in this file).
.globl ninst
.data
ninst:
.long NINST
.text
# Declare the global function symbol latency and align its entry point.
.globl latency
.type latency, @function
.align 2
latency:
    # Timed loop of INSTR pair stores of d registers to twelve different
    # offsets below sp.
    # In: N (x0) holds the iteration count; it is copied to x4 and counted
    # down to zero by the loop.
    #
    # push callee-save registers onto stack
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]
    mov     x4, N
    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
loop:
    subs    x4, x4, #1
    # Every store targets its own 16-byte slot between sp-448 and sp-32,
    # all below the saved-register frame.
    INSTR   d0, d1, [sp, #-64]
    INSTR   d2, d3, [sp, #-128]
    INSTR   d4, d5, [sp, #-192]
    INSTR   d6, d7, [sp, #-256]
    INSTR   d8, d9, [sp, #-320]
    INSTR   d10, d11, [sp, #-384]
    INSTR   d12, d13, [sp, #-448]
    INSTR   d14, d15, [sp, #-32]
    INSTR   d16, d17, [sp, #-96]
    INSTR   d18, d19, [sp, #-160]
    INSTR   d20, d21, [sp, #-224]
    INSTR   d22, d23, [sp, #-288]
    bne     loop
done:
    # pop callee-save registers from stack
    # The x-register area is released first, mirroring the prologue.
    # Skipping this step makes the ld1 pair read the x-register slots
    # instead of v8-v15 and returns with sp below its entry value.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64
    ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -53,6 +60,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,70 @@
# Benchmark parameters (expanded by the C preprocessor):
#   INSTR  mnemonic used for every measured line of the loop below
#   NINST  how many INSTR lines one loop iteration contains (12 here;
#          the interleaved ldr lines are not counted)
#   N      register that holds the iteration count on entry
#define INSTR stp
#define NINST 12
#define N x0
# Publish NINST as the 32-bit data word ninst (presumably read by the
# measurement harness; the consumer is not visible in this file).
.globl ninst
.data
ninst:
.long NINST
.text
# Declare the global function symbol latency and align its entry point.
.globl latency
.type latency, @function
.align 2
latency:
    # Timed loop that alternates INSTR pair stores of d registers (to
    # slots below sp) with ldr loads from [sp].
    # In: N (x0) holds the iteration count; it is copied to x4 before the
    # loop overwrites x0, and x4 is counted down to zero.
    #
    # push callee-save registers onto stack
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]
    mov     x4, N
    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
loop:
    subs    x4, x4, #1
    # Load destinations run x0-x12 with x4 left out, because x4 is the
    # loop counter.
    INSTR   d0, d1, [sp, #-64]
    ldr     x0, [sp]
    INSTR   d2, d3, [sp, #-128]
    ldr     x1, [sp]
    INSTR   d4, d5, [sp, #-192]
    ldr     x2, [sp]
    INSTR   d6, d7, [sp, #-256]
    ldr     x3, [sp]
    INSTR   d8, d9, [sp, #-320]
    ldr     x5, [sp]
    INSTR   d10, d11, [sp, #-384]
    ldr     x6, [sp]
    INSTR   d12, d13, [sp, #-448]
    ldr     x7, [sp]
    INSTR   d14, d15, [sp, #-32]
    ldr     x8, [sp]
    INSTR   d16, d17, [sp, #-96]
    ldr     x9, [sp]
    INSTR   d18, d19, [sp, #-160]
    ldr     x10, [sp]
    INSTR   d20, d21, [sp, #-224]
    ldr     x11, [sp]
    INSTR   d22, d23, [sp, #-288]
    ldr     x12, [sp]
    bne     loop
done:
    # pop callee-save registers from stack
    # The x-register area is released first, mirroring the prologue.
    # Skipping this step makes the ld1 pair read the x-register slots
    # instead of v8-v15 and returns with sp below its entry value.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64
    ret
.size latency, .-latency

View File

@@ -17,12 +17,21 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
sub sp, sp, #64
loop:
subs x4, x4, #1
INSTR q0, q1, [sp]
@@ -39,8 +48,14 @@ loop:
INSTR q22, q23, [sp]
bne loop
done:
add sp, sp, #64
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

58
src/NEON/stp-q_q_mb-TP.Se Normal file
View File

@@ -0,0 +1,58 @@
# Benchmark parameters (expanded by the C preprocessor):
#   INSTR  mnemonic used for every measured line of the loop below
#   NINST  how many INSTR lines one loop iteration contains (12 here)
#   N      register that holds the iteration count on entry
#define INSTR stp
#define NINST 12
#define N x0
# Publish NINST as the 32-bit data word ninst (presumably read by the
# measurement harness; the consumer is not visible in this file).
.globl ninst
.data
ninst:
.long NINST
.text
# Declare the global function symbol latency and align its entry point.
.globl latency
.type latency, @function
.align 2
latency:
    # Timed loop of INSTR pair stores of q registers, all to [sp].
    # In: N (x0) holds the iteration count; it is copied to x4 and counted
    # down to zero by the loop.
    #
    # push callee-save registers onto stack
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]
    mov     x4, N
    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
    # Scratch area for the stores: each INSTR writes 32 bytes at [sp],
    # which would otherwise land on the saved x29/x30/x19/x20 slots.
    sub     sp, sp, #64
loop:
    subs    x4, x4, #1
    INSTR   q0, q1, [sp]
    INSTR   q2, q3, [sp]
    INSTR   q4, q5, [sp]
    INSTR   q6, q7, [sp]
    INSTR   q8, q9, [sp]
    INSTR   q10, q11, [sp]
    INSTR   q12, q13, [sp]
    INSTR   q14, q15, [sp]
    INSTR   q16, q17, [sp]
    INSTR   q18, q19, [sp]
    INSTR   q20, q21, [sp]
    INSTR   q22, q23, [sp]
    bne     loop
done:
    # release the scratch area again
    add     sp, sp, #64
    # pop callee-save registers from stack
    # The x-register area is released first, mirroring the prologue.
    # Skipping this step makes the ld1 pair read the x-register slots
    # instead of v8-v15 and returns with sp below its entry value.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64
    ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -41,6 +48,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,58 @@
# Benchmark parameters (expanded by the C preprocessor):
#   INSTR  mnemonic used for every measured line of the loop below
#   NINST  how many INSTR lines one loop iteration contains (12 here)
#   N      register that holds the iteration count on entry
#define INSTR stp
#define NINST 12
#define N x0
# Publish NINST as the 32-bit data word ninst (presumably read by the
# measurement harness; the consumer is not visible in this file).
.globl ninst
.data
ninst:
.long NINST
.text
# Declare the global function symbol latency and align its entry point.
.globl latency
.type latency, @function
.align 2
latency:
    # Timed loop of INSTR pair stores of q registers to twelve different
    # offsets below sp.
    # In: N (x0) holds the iteration count; it is copied to x4 and counted
    # down to zero by the loop.
    #
    # push callee-save registers onto stack
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]
    mov     x4, N
    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
loop:
    subs    x4, x4, #1
    # Store targets are spaced 64 bytes apart, from sp-64 down to sp-768,
    # all below the saved-register frame.
    INSTR   q0, q1, [sp, #-64]
    INSTR   q2, q3, [sp, #-128]
    INSTR   q4, q5, [sp, #-192]
    INSTR   q6, q7, [sp, #-256]
    INSTR   q8, q9, [sp, #-320]
    INSTR   q10, q11, [sp, #-384]
    INSTR   q12, q13, [sp, #-448]
    INSTR   q14, q15, [sp, #-512]
    INSTR   q16, q17, [sp, #-576]
    INSTR   q18, q19, [sp, #-640]
    INSTR   q20, q21, [sp, #-704]
    INSTR   q22, q23, [sp, #-768]
    bne     loop
done:
    # pop callee-save registers from stack
    # The x-register area is released first, mirroring the prologue.
    # Skipping this step makes the ld1 pair read the x-register slots
    # instead of v8-v15 and returns with sp below its entry value.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64
    ret
.size latency, .-latency

View File

@@ -17,6 +17,13 @@ latency:
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
stp x29, x30, [sp, -96]!
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
stp x24, x25, [sp, 48]
stp x26, x27, [sp, 64]
str x28, [sp, 80]
mov x4, N
@@ -41,6 +48,12 @@ loop:
done:
# pop callee-save registers from stack
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x24, x25, [sp, 48]
ldp x26, x27, [sp, 64]
ldr x28, [sp, 80]
ldp x29, x30, [sp], 96
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]

View File

@@ -0,0 +1,58 @@
# Benchmark parameters (expanded by the C preprocessor):
#   INSTR  mnemonic used for every measured line of the loop below
#   NINST  how many INSTR lines one loop iteration contains (12 here)
#   N      register that holds the iteration count on entry
#define INSTR stp
#define NINST 12
#define N x0
# Publish NINST as the 32-bit data word ninst (presumably read by the
# measurement harness; the consumer is not visible in this file).
.globl ninst
.data
ninst:
.long NINST
.text
# Declare the global function symbol latency and align its entry point.
.globl latency
.type latency, @function
.align 2
latency:
    # Timed loop of INSTR pair stores of x registers, all to the same
    # slot 256 bytes below sp.
    # In: N (x0) holds the iteration count; it is copied to x4 and counted
    # down to zero by the loop.
    #
    # push callee-save registers onto stack
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    stp     x29, x30, [sp, -96]!
    stp     x19, x20, [sp, 16]
    stp     x21, x22, [sp, 32]
    stp     x24, x25, [sp, 48]
    stp     x26, x27, [sp, 64]
    str     x28, [sp, 80]
    mov     x4, N
    fmov    v0.2d, #1.00000000
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
loop:
    subs    x4, x4, #1
    # The x registers are only read as store data. x24 and x25 stand in
    # where x0 and x4 would come next in the sequence; x4 is the loop
    # counter.
    INSTR   x24, x1, [sp, #-256]
    INSTR   x2, x3, [sp, #-256]
    INSTR   x25, x5, [sp, #-256]
    INSTR   x6, x7, [sp, #-256]
    INSTR   x8, x9, [sp, #-256]
    INSTR   x10, x11, [sp, #-256]
    INSTR   x12, x13, [sp, #-256]
    INSTR   x14, x15, [sp, #-256]
    INSTR   x16, x17, [sp, #-256]
    INSTR   x18, x19, [sp, #-256]
    INSTR   x20, x21, [sp, #-256]
    INSTR   x22, x23, [sp, #-256]
    bne     loop
done:
    # pop callee-save registers from stack
    # The x-register area is released first, mirroring the prologue.
    # Skipping this step makes the ld1 pair read the x-register slots
    # instead of v8-v15 and returns with sp below its entry value.
    ldp     x19, x20, [sp, 16]
    ldp     x21, x22, [sp, 32]
    ldp     x24, x25, [sp, 48]
    ldp     x26, x27, [sp, 64]
    ldr     x28, [sp, 80]
    ldp     x29, x30, [sp], 96
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    add     sp, sp, #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    add     sp, sp, #64
    ret
.size latency, .-latency

Some files were not shown because too many files have changed in this diff Show More