Merge pull request #1 from RRZE-HPC/NEON

Neon
This commit is contained in:
Jan
2022-01-31 18:16:21 +01:00
committed by GitHub
211 changed files with 12756 additions and 1178 deletions

View File

@@ -1,3 +1,4 @@
# Possible targets: GCC, ICC, MIC, POWER8, ARMGCC
COMPILER=ICC
TARGET = ibench

View File

@@ -1,6 +1,8 @@
# Toolchain settings: gcc is used both as the C compiler and as the
# driver for assembling the kernel sources.
CC = gcc
AS = gcc
CFLAGS = -O3
# SVE code generation flags, currently not part of CFLAGS:
# -msve-vector-bits=512 -march=armv8.2-a+sve
LFLAGS = -shared
# Add one .so per assembly kernel found under $(SRC_DIR)/NEON.
KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/NEON/*.S))
# The SVE kernels are disabled; uncomment the next line to build them as well.
#KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/SVE/*.S))

View File

@@ -1,6 +0,0 @@
CC = gcc
AS = gcc
CFLAGS = -O3 -msve-vector-bits=512 -march=armv8.2-a+sve
LFLAGS = -shared
KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/SVE/*.S))

69
src/NEON/adc-x_x_x-LAT.S Normal file
View File

@@ -0,0 +1,69 @@
// Latency kernel for the scalar add-with-carry instruction (adc Xd, Xn, Xm).
//
// Exported symbols:
//   ninst   - 32-bit count of measured instructions per loop trip
//   latency - runs the measurement loop; x0 holds the trip count on entry.
//             The counter is decremented before it is tested, so the
//             trip count has to be at least 1.
#define INSTR adc
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .word   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Save v8-v31 as six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Save x19-x30 in a 96-byte frame below the vector save area.
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                   // x4 = remaining trips; x0 is reused below
        fmov    v0.4s, #1.00000000      // not read by the loop
        fmov    v1.4s, #1.00000000      // not read by the loop
        mov     x0, #1                  // running value of the dependency chain
        mov     x1, #1                  // constant second operand
loop:
        subs    x4, x4, #1              // writes the flags used by adc and by bne
        // NINST copies back to back; each one reads the x0 written by the
        // previous copy, so the copies cannot overlap.
        .rept   NINST
        INSTR   x0, x0, x1
        .endr
        bne     loop                    // adc leaves the flags from subs untouched
done:
        // Restore in reverse order: general-purpose frame first, then vectors.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

113
src/NEON/adc-x_x_x-TP.S Normal file
View File

@@ -0,0 +1,113 @@
// Throughput kernel for the scalar add-with-carry instruction (adc Xd, Xn, Xm).
//
// Exported symbols:
//   ninst   - 32-bit count of measured instructions per loop trip
//   latency - runs the measurement loop; x0 holds the trip count on entry.
//             The counter is decremented before it is tested, so the
//             trip count has to be at least 1.
#define INSTR adc
#define NINST 48
#define N x0

        .data
        .globl  ninst
ninst:
        .word   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Save v8-v31 as six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Save x19-x30 in a 96-byte frame below the vector save area.
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                   // x4 = remaining trips
        fmov    v0.2d, #1.00000000      // not read by the loop
        fmov    v1.2d, #1.00000000      // not read by the loop
        fmov    v2.2d, #1.00000000      // not read by the loop
        mov     x1, #1                  // x1-x3: the only source registers
        mov     x2, #1
        mov     x3, #1
loop:
        subs    x4, x4, #1              // writes the flags used by adc and by bne
        // 4 groups of 12 instructions = NINST. Destinations x5-x16 are never
        // used as sources, so no instruction waits on another one.
        .rept   4
        INSTR   x5, x1, x1
        INSTR   x6, x2, x2
        INSTR   x7, x3, x3
        INSTR   x8, x1, x1
        INSTR   x9, x2, x2
        INSTR   x10, x3, x3
        INSTR   x11, x1, x1
        INSTR   x12, x2, x2
        INSTR   x13, x3, x3
        INSTR   x14, x1, x1
        INSTR   x15, x2, x2
        INSTR   x16, x3, x3
        .endr
        bne     loop                    // adc leaves the flags from subs untouched
done:
        // Restore in reverse order: general-purpose frame first, then vectors.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +50,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -80,10 +95,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -36,10 +51,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -80,10 +95,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -0,0 +1,67 @@
// Latency kernel for the vector pairwise add (addp Vd.2d, Vn.2d, Vm.2d).
//
// Exported symbols:
//   ninst   - 32-bit count of measured instructions per loop trip
//   latency - runs the measurement loop; x0 holds the trip count on entry.
//             The counter is decremented before it is tested, so the
//             trip count has to be at least 1.
#define INSTR addp
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .word   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Save v8-v31 as six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Save x19-x30 in a 96-byte frame below the vector save area.
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                   // x4 = remaining trips
        fmov    v0.2d, #1.00000000      // running value of the dependency chain
        fmov    v1.2d, #1.00000000      // constant second operand
loop:
        subs    x4, x4, #1
        // NINST copies back to back; each one reads the v0 written by the
        // previous copy, so the copies cannot overlap.
        .rept   NINST
        INSTR   v0.2d, v0.2d, v1.2d
        .endr
        bne     loop                    // flags still come from the subs above
done:
        // Restore in reverse order: general-purpose frame first, then vectors.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,68 @@
// Throughput kernel for the vector pairwise add (addp Vd.2d, Vn.2d, Vm.2d).
//
// Exported symbols:
//   ninst   - 32-bit count of measured instructions per loop trip
//   latency - runs the measurement loop; x0 holds the trip count on entry.
//             The counter is decremented before it is tested, so the
//             trip count has to be at least 1.
#define INSTR addp
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .word   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Save v8-v31 as six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Save x19-x30 in a 96-byte frame below the vector save area.
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                   // x4 = remaining trips
        fmov    v0.2d, #1.00000000      // v0-v2: the only source registers
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
loop:
        subs    x4, x4, #1
        // Six adds over the pairs drawn from {v0, v1, v2}. Destinations v3-v8
        // are never used as sources, so no add waits on another one.
        INSTR   v3.2d, v0.2d, v0.2d
        INSTR   v4.2d, v0.2d, v1.2d
        INSTR   v5.2d, v0.2d, v2.2d
        INSTR   v6.2d, v1.2d, v1.2d
        INSTR   v7.2d, v1.2d, v2.2d
        INSTR   v8.2d, v2.2d, v2.2d     // v8 was saved above and is restored below
        bne     loop                    // flags still come from the subs above
done:
        // Restore in reverse order: general-purpose frame first, then vectors.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +50,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -80,10 +95,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -227,10 +242,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -227,10 +242,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -37,10 +52,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -57,10 +72,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -57,10 +72,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

70
src/NEON/dup-vd_x-TP.S Normal file
View File

@@ -0,0 +1,70 @@
// Throughput kernel for the general-register broadcast (dup Vd.2d, Xn).
//
// Exported symbols:
//   ninst   - 32-bit count of measured instructions per loop trip
//   latency - runs the measurement loop; x0 holds the trip count on entry.
//             The counter is decremented before it is tested, so the
//             trip count has to be at least 1.
//
// NINST has to equal the number of INSTR lines inside the loop: it is the
// value exported through ninst. The loop holds eight dup instructions, so
// NINST is 8.
#define INSTR dup
#define NINST 8
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Save v8-v31 as six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Save x19-x30 in a 96-byte frame below the vector save area.
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                   // x4 = remaining trips
        fmov    v0.2d, #1.00000000      // v0-v2 are not read by the loop
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
loop:
        subs    x4, x4, #1
        // Eight broadcasts into v8-v15 (saved above, restored below).
        // The source registers x0-x7 are not set up here apart from x4;
        // dup only copies their bits.
        INSTR   v8.2d, x0
        INSTR   v9.2d, x1
        INSTR   v10.2d, x2
        INSTR   v11.2d, x3
        INSTR   v12.2d, x4              // x4 is the loop counter written by subs
        INSTR   v13.2d, x5
        INSTR   v14.2d, x6
        INSTR   v15.2d, x7
        bne     loop                    // flags still come from the subs above
done:
        // Restore in reverse order: general-purpose frame first, then vectors.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,71 @@
// Dependency-chain kernel for the vector floating-point add
// (fadd Vd.2d, Vn.2d, Vm.2d). Every add takes the result of the add
// directly before it as its FIRST source operand and the result of the
// add before that as its second source operand.
//
// Exported symbols:
//   ninst   - 32-bit count of measured instructions per loop trip
//   latency - runs the measurement loop; x0 holds the trip count on entry.
//             The counter is decremented before it is tested, so the
//             trip count has to be at least 1.
#define INSTR fadd
#define NINST 8
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Save v8-v31 as six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Save x19-x30 in a 96-byte frame below the vector save area.
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                   // x4 = remaining trips
        fmov    v0.2d, #1.00000000      // not read by the loop
        // The first add of the first trip reads v1 and v8, so both need a
        // defined value. v8 still holds the caller's data at this point.
        // v2 is written by that first add before anything reads it and
        // therefore needs no seed.
        fmov    v1.2d, #1.00000000
        fmov    v8.2d, #1.00000000
loop:
        subs    x4, x4, #1
        INSTR   v2.2d, v1.2d, v8.2d     // v1 and v8 come from the previous trip
        INSTR   v3.2d, v2.2d, v1.2d
        INSTR   v4.2d, v3.2d, v2.2d
        INSTR   v5.2d, v4.2d, v3.2d
        INSTR   v6.2d, v5.2d, v4.2d
        INSTR   v7.2d, v6.2d, v5.2d
        INSTR   v8.2d, v7.2d, v6.2d
        INSTR   v1.2d, v8.2d, v7.2d     // closes the chain for the next trip
        bne     loop                    // flags still come from the subs above
done:
        // Restore in reverse order: general-purpose frame first, then vectors.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,71 @@
// Dependency-chain kernel for the vector floating-point add
// (fadd Vd.2d, Vn.2d, Vm.2d). Every add takes the result of the add
// directly before it as its SECOND source operand and the result of the
// add before that as its first source operand.
//
// Exported symbols:
//   ninst   - 32-bit count of measured instructions per loop trip
//   latency - runs the measurement loop; x0 holds the trip count on entry.
//             The counter is decremented before it is tested, so the
//             trip count has to be at least 1.
#define INSTR fadd
#define NINST 8
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Save v8-v31 as six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Save x19-x30 in a 96-byte frame below the vector save area.
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                   // x4 = remaining trips
        fmov    v0.2d, #1.00000000      // not read by the loop
        // The first add of the first trip reads v8 and v1, so both need a
        // defined value. v8 still holds the caller's data at this point.
        // v2 is written by that first add before anything reads it and
        // therefore needs no seed.
        fmov    v1.2d, #1.00000000
        fmov    v8.2d, #1.00000000
loop:
        subs    x4, x4, #1
        INSTR   v2.2d, v8.2d, v1.2d     // v8 and v1 come from the previous trip
        INSTR   v3.2d, v1.2d, v2.2d
        INSTR   v4.2d, v2.2d, v3.2d
        INSTR   v5.2d, v3.2d, v4.2d
        INSTR   v6.2d, v4.2d, v5.2d
        INSTR   v7.2d, v5.2d, v6.2d
        INSTR   v8.2d, v6.2d, v7.2d
        INSTR   v1.2d, v7.2d, v8.2d     // closes the chain for the next trip
        bne     loop                    // flags still come from the subs above
done:
        // Restore in reverse order: general-purpose frame first, then vectors.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +50,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +50,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -0,0 +1,214 @@
# Benchmark kernel: vector fadd on .2d lanes, interleaved with scalar integer
# add instructions (two add lines after every INSTR line).
# The loop body holds 48 INSTR lines, which is the value exported through the
# ninst symbol. The iteration count is read from x0 (macro N) on entry.
# All INSTR destinations (v3-v14) and all add destinations (x5-x16) are never
# used as sources, and the sources (v0-v2, x1-x3) are never written inside the
# loop, so no instruction in the loop depends on another one.
#define INSTR fadd
#define NINST 48
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:
# push callee-save registers onto stack
# v8-v31 are stored with all 128 bits, 64 bytes (four registers) per step
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
# x19-x30 go into one 96 byte frame below the vector save area
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = loop counter, copied from the argument register
mov x4, N
# source operands: v0-v2 hold 1.0 in both double lanes, x1-x3 hold 1
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x1, #1
mov x2, #1
mov x3, #1
loop:
# subs sets the flags consumed by the bne at the bottom; nothing in between
# writes the flags
subs x4, x4, #1
# unrolled group 1 of 4 (12 INSTR lines each)
INSTR v3.2d, v0.2d, v0.2d
add x5, x1, x1
add x6, x2, x2
INSTR v4.2d, v0.2d, v1.2d
add x7, x3, x3
add x8, x1, x1
INSTR v5.2d, v0.2d, v2.2d
add x9, x2, x2
add x10, x3, x3
INSTR v6.2d, v1.2d, v1.2d
add x11, x1, x1
add x12, x2, x2
INSTR v7.2d, v1.2d, v2.2d
add x13, x3, x3
add x14, x1, x1
INSTR v8.2d, v2.2d, v2.2d
add x15, x2, x2
add x16, x3, x3
INSTR v9.2d, v0.2d, v2.2d
add x5, x2, x2
add x6, x3, x3
INSTR v10.2d, v1.2d, v1.2d
add x7, x1, x1
add x8, x2, x2
INSTR v11.2d, v1.2d, v2.2d
add x9, x3, x3
add x10, x1, x1
INSTR v12.2d, v2.2d, v2.2d
add x11, x2, x2
add x12, x3, x3
INSTR v13.2d, v1.2d, v2.2d
add x13, x3, x3
add x14, x1, x1
INSTR v14.2d, v2.2d, v2.2d
add x15, x2, x2
add x16, x3, x3
# unrolled group 2 of 4
INSTR v3.2d, v0.2d, v0.2d
add x5, x1, x1
add x6, x2, x2
INSTR v4.2d, v0.2d, v1.2d
add x7, x3, x3
add x8, x1, x1
INSTR v5.2d, v0.2d, v2.2d
add x9, x2, x2
add x10, x3, x3
INSTR v6.2d, v1.2d, v1.2d
add x11, x1, x1
add x12, x2, x2
INSTR v7.2d, v1.2d, v2.2d
add x13, x3, x3
add x14, x1, x1
INSTR v8.2d, v2.2d, v2.2d
add x15, x2, x2
add x16, x3, x3
INSTR v9.2d, v0.2d, v2.2d
add x5, x2, x2
add x6, x3, x3
INSTR v10.2d, v1.2d, v1.2d
add x7, x1, x1
add x8, x2, x2
INSTR v11.2d, v1.2d, v2.2d
add x9, x3, x3
add x10, x1, x1
INSTR v12.2d, v2.2d, v2.2d
add x11, x2, x2
add x12, x3, x3
INSTR v13.2d, v1.2d, v2.2d
add x13, x3, x3
add x14, x1, x1
INSTR v14.2d, v2.2d, v2.2d
add x15, x2, x2
add x16, x3, x3
# unrolled group 3 of 4
INSTR v3.2d, v0.2d, v0.2d
add x5, x1, x1
add x6, x2, x2
INSTR v4.2d, v0.2d, v1.2d
add x7, x3, x3
add x8, x1, x1
INSTR v5.2d, v0.2d, v2.2d
add x9, x2, x2
add x10, x3, x3
INSTR v6.2d, v1.2d, v1.2d
add x11, x1, x1
add x12, x2, x2
INSTR v7.2d, v1.2d, v2.2d
add x13, x3, x3
add x14, x1, x1
INSTR v8.2d, v2.2d, v2.2d
add x15, x2, x2
add x16, x3, x3
INSTR v9.2d, v0.2d, v2.2d
add x5, x2, x2
add x6, x3, x3
INSTR v10.2d, v1.2d, v1.2d
add x7, x1, x1
add x8, x2, x2
INSTR v11.2d, v1.2d, v2.2d
add x9, x3, x3
add x10, x1, x1
INSTR v12.2d, v2.2d, v2.2d
add x11, x2, x2
add x12, x3, x3
INSTR v13.2d, v1.2d, v2.2d
add x13, x3, x3
add x14, x1, x1
INSTR v14.2d, v2.2d, v2.2d
add x15, x2, x2
add x16, x3, x3
# unrolled group 4 of 4
INSTR v3.2d, v0.2d, v0.2d
add x5, x1, x1
add x6, x2, x2
INSTR v4.2d, v0.2d, v1.2d
add x7, x3, x3
add x8, x1, x1
INSTR v5.2d, v0.2d, v2.2d
add x9, x2, x2
add x10, x3, x3
INSTR v6.2d, v1.2d, v1.2d
add x11, x1, x1
add x12, x2, x2
INSTR v7.2d, v1.2d, v2.2d
add x13, x3, x3
add x14, x1, x1
INSTR v8.2d, v2.2d, v2.2d
add x15, x2, x2
add x16, x3, x3
INSTR v9.2d, v0.2d, v2.2d
add x5, x2, x2
add x6, x3, x3
INSTR v10.2d, v1.2d, v1.2d
add x7, x1, x1
add x8, x2, x2
INSTR v11.2d, v1.2d, v2.2d
add x9, x3, x3
add x10, x1, x1
INSTR v12.2d, v2.2d, v2.2d
add x11, x2, x2
add x12, x3, x3
INSTR v13.2d, v1.2d, v2.2d
add x13, x3, x3
add x14, x1, x1
INSTR v14.2d, v2.2d, v2.2d
add x15, x2, x2
add x16, x3, x3
bne loop
done:
# pop callee-save registers from stack
# restores happen in the reverse order of the stores above
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,84 @@
# Benchmark kernel: six vector fadd (.2d) per loop iteration, each followed by
# two scalar sub-immediate instructions.
# ninst exports the INSTR count (6); the iteration count is read from x0.
# Destinations (v3-v8, x5-x16) are never read and sources (v0-v2, x1-x3) are
# never written inside the loop, so the loop carries no data dependency apart
# from the counter in x4.
#define INSTR fadd
#define NINST 6
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:
# push callee-save registers onto stack
# v8-v31 are stored with all 128 bits, 64 bytes (four registers) per step
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
# x19-x30 go into one 96 byte frame below the vector save area
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = loop counter, copied from the argument register
mov x4, N
# source operands: v0-v2 hold 1.0 in both double lanes, x1-x3 hold 1
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x1, #1
mov x2, #1
mov x3, #1
loop:
# subs sets the flags consumed by the bne at the bottom; the plain sub lines
# below do not write the flags
subs x4, x4, #1
INSTR v3.2d, v0.2d, v0.2d
sub x5, x1, #1
sub x6, x2, #1
INSTR v4.2d, v0.2d, v1.2d
sub x7, x3, #1
sub x8, x1, #1
INSTR v5.2d, v0.2d, v2.2d
sub x9, x2, #1
sub x10, x3, #1
INSTR v6.2d, v1.2d, v1.2d
sub x11, x1, #1
sub x12, x2, #1
INSTR v7.2d, v1.2d, v2.2d
sub x13, x3, #1
sub x14, x1, #1
INSTR v8.2d, v2.2d, v2.2d
sub x15, x2, #1
sub x16, x3, #1
bne loop
done:
# pop callee-save registers from stack
# restores happen in the reverse order of the stores above
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,90 @@
# Benchmark kernel: six vector fadd (.2d) per loop iteration, each followed by
# three scalar integer add instructions.
# ninst exports the INSTR count (6); the iteration count is read from x0.
# Destinations (v3-v8, x5-x16) are never read and sources (v0-v2, x1-x3) are
# never written inside the loop, so the loop carries no data dependency apart
# from the counter in x4.
#define INSTR fadd
#define NINST 6
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:
# push callee-save registers onto stack
# v8-v31 are stored with all 128 bits, 64 bytes (four registers) per step
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
# x19-x30 go into one 96 byte frame below the vector save area
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = loop counter, copied from the argument register
mov x4, N
# source operands: v0-v2 hold 1.0 in both double lanes, x1-x3 hold 1
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x1, #1
mov x2, #1
mov x3, #1
loop:
# subs sets the flags consumed by the bne at the bottom; nothing in between
# writes the flags
subs x4, x4, #1
INSTR v3.2d, v0.2d, v0.2d
add x5, x1, x1
add x6, x2, x2
add x7, x3, x3
INSTR v4.2d, v0.2d, v1.2d
add x8, x1, x1
add x9, x2, x2
add x10, x3, x3
INSTR v5.2d, v0.2d, v2.2d
add x11, x1, x1
add x12, x2, x2
add x13, x3, x3
INSTR v6.2d, v1.2d, v1.2d
add x14, x1, x1
add x15, x2, x2
add x16, x3, x3
# the add destinations wrap around to x8-x13 for the last two groups
INSTR v7.2d, v1.2d, v2.2d
add x8, x1, x1
add x9, x2, x2
add x10, x3, x3
INSTR v8.2d, v2.2d, v2.2d
add x11, x1, x1
add x12, x2, x2
add x13, x3, x3
bne loop
done:
# pop callee-save registers from stack
# restores happen in the reverse order of the stores above
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,84 @@
# Benchmark kernel: vector fadd (.2d) interleaved with scalar integer mul,
# one mul after every two INSTR lines.
# The iteration count is read from x0; ninst exports NINST as a .long.
# NOTE(review): the loop below contains 12 INSTR lines and 6 mul lines while
# NINST is 6. Presumably the harness divides the measured time by ninst, so
# the reported figure would be per fadd/fadd/mul group and not per fadd -
# confirm that this is intended, otherwise NINST should be 12.
#define INSTR fadd
#define NINST 6
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:
# push callee-save registers onto stack
# v8-v31 are stored with all 128 bits, 64 bytes (four registers) per step
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
# x19-x30 go into one 96 byte frame below the vector save area
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = loop counter, copied from the argument register
mov x4, N
# source operands: v0-v2 hold 1.0 in both double lanes, x1-x3 hold 1
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x1, #1
mov x2, #1
mov x3, #1
loop:
# subs sets the flags consumed by the bne at the bottom; nothing in between
# writes the flags
subs x4, x4, #1
# destinations v3-v14 and x5-x10 are never read, sources are never written,
# so the instructions below do not depend on each other
INSTR v3.2d, v0.2d, v0.2d
INSTR v4.2d, v0.2d, v1.2d
mul x5, x1, x1
INSTR v5.2d, v0.2d, v2.2d
INSTR v6.2d, v1.2d, v1.2d
mul x6, x2, x2
INSTR v7.2d, v1.2d, v2.2d
INSTR v8.2d, v2.2d, v2.2d
mul x7, x3, x3
INSTR v9.2d, v2.2d, v2.2d
INSTR v10.2d, v2.2d, v2.2d
mul x8, x1, x1
INSTR v11.2d, v2.2d, v2.2d
INSTR v12.2d, v2.2d, v2.2d
mul x9, x2, x2
INSTR v13.2d, v2.2d, v2.2d
INSTR v14.2d, v2.2d, v2.2d
mul x10, x3, x3
bne loop
done:
# pop callee-save registers from stack
# restores happen in the reverse order of the stores above
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +50,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -0,0 +1,84 @@
# Benchmark kernel: six vector fadd on .4s (single precision) lanes per loop
# iteration, each followed by two scalar integer add instructions.
# ninst exports the INSTR count (6); the iteration count is read from x0.
# Destinations (v3-v8, x5-x16) are never read and sources (v0-v2, x1-x3) are
# never written inside the loop, so the loop carries no data dependency apart
# from the counter in x4.
#define INSTR fadd
#define NINST 6
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:
# push callee-save registers onto stack
# v8-v31 are stored with all 128 bits, 64 bytes (four registers) per step
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
# x19-x30 go into one 96 byte frame below the vector save area
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = loop counter, copied from the argument register
mov x4, N
# source operands: v0-v2 hold 1.0 in all four single lanes, x1-x3 hold 1
fmov v0.4s, #1.00000000
fmov v1.4s, #1.00000000
fmov v2.4s, #1.00000000
mov x1, #1
mov x2, #1
mov x3, #1
loop:
# subs sets the flags consumed by the bne at the bottom; nothing in between
# writes the flags
subs x4, x4, #1
INSTR v3.4s, v0.4s, v0.4s
add x5, x1, x1
add x6, x2, x2
INSTR v4.4s, v0.4s, v1.4s
add x7, x3, x3
add x8, x1, x1
INSTR v5.4s, v0.4s, v2.4s
add x9, x2, x2
add x10, x3, x3
INSTR v6.4s, v1.4s, v1.4s
add x11, x1, x1
add x12, x2, x2
INSTR v7.4s, v1.4s, v2.4s
add x13, x3, x3
add x14, x1, x1
INSTR v8.4s, v2.4s, v2.4s
add x15, x2, x2
add x16, x3, x3
bne loop
done:
# pop callee-save registers from stack
# restores happen in the reverse order of the stores above
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

94
src/NEON/fdiv-d_d_d-LAT.S Normal file
View File

@@ -0,0 +1,94 @@
# Latency kernel for scalar double precision fdiv.
# Every INSTR line reads the d0 written by the previous one, so the six
# divisions per iteration form one serial dependency chain.
# ninst exports the INSTR count (6); the iteration count is read from x0.
#define INSTR fdiv
#define NINST 6
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:
# push callee-save registers onto stack
# v8-v31 are stored with all 128 bits, 64 bytes (four registers) per step
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
# x19-x30 go into one 96 byte frame below the vector save area
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = loop counter, copied from the argument register
mov x4, N
# Operand setup. After this sequence:
#   v1 = 1024/3, v2 = 1/(1024/3), v0 = 2*(1024/3)
# v4 is only a scratch register for building 1024.0 by repeated doubling.
fmov v0.2d, #1.00000
# create 2.0
fadd v1.2d, v0.2d, v0.2d
# create 3.0
fadd v2.2d, v0.2d, v1.2d
# create 4.0
fadd v4.2d, v1.2d, v1.2d
# create 8.0
fadd v4.2d, v4.2d, v4.2d
# create 16.0
fadd v4.2d, v4.2d, v4.2d
# create 32.0
fadd v4.2d, v4.2d, v4.2d
# create 64.0
fadd v4.2d, v4.2d, v4.2d
# create 128.0
fadd v4.2d, v4.2d, v4.2d
# create 256.0
fadd v4.2d, v4.2d, v4.2d
# create 512.0
fadd v4.2d, v4.2d, v4.2d
# create 1024.0
fadd v4.2d, v4.2d, v4.2d
# create 341.3333 = (1024.0/3.0)
fdiv v1.2d, v4.2d, v2.2d
# create 1/341.3333
fdiv v2.2d, v0.2d, v1.2d
# create 2*341.3333
fadd v0.2d, v1.2d, v1.2d
loop:
# subs sets the flags consumed by the bne at the bottom; nothing in between
# writes the flags
subs x4, x4, #1
# d0 is divided alternately by d1 and by its reciprocal d2, so it swings
# between about 2*341.3333 and about 2.0 and the chain stays in a bounded range
INSTR d0, d0, d1
INSTR d0, d0, d2
INSTR d0, d0, d1
INSTR d0, d0, d2
INSTR d0, d0, d1
INSTR d0, d0, d2
bne loop
done:
# pop callee-save registers from stack
# restores happen in the reverse order of the stores above
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

94
src/NEON/fdiv-d_d_d-TP.S Normal file
View File

@@ -0,0 +1,94 @@
# Throughput kernel for scalar double precision fdiv.
# The six INSTR lines write d3-d8 and read only d0-d2, which are never written
# inside the loop, so the divisions do not depend on each other.
# The exported entry point is named latency in this file as well.
# ninst exports the INSTR count (6); the iteration count is read from x0.
#define INSTR fdiv
#define NINST 6
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:
# push callee-save registers onto stack
# v8-v31 are stored with all 128 bits, 64 bytes (four registers) per step
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
# x19-x30 go into one 96 byte frame below the vector save area
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = loop counter, copied from the argument register
mov x4, N
# Operand setup. After this sequence:
#   v1 = 1024/3, v2 = 1/(1024/3), v0 = 2*(1024/3)
# v4 is only a scratch register for building 1024.0 by repeated doubling
# (d4 is overwritten again inside the loop).
fmov v0.2d, #1.00000
# create 2.0
fadd v1.2d, v0.2d, v0.2d
# create 3.0
fadd v2.2d, v0.2d, v1.2d
# create 4.0
fadd v4.2d, v1.2d, v1.2d
# create 8.0
fadd v4.2d, v4.2d, v4.2d
# create 16.0
fadd v4.2d, v4.2d, v4.2d
# create 32.0
fadd v4.2d, v4.2d, v4.2d
# create 64.0
fadd v4.2d, v4.2d, v4.2d
# create 128.0
fadd v4.2d, v4.2d, v4.2d
# create 256.0
fadd v4.2d, v4.2d, v4.2d
# create 512.0
fadd v4.2d, v4.2d, v4.2d
# create 1024.0
fadd v4.2d, v4.2d, v4.2d
# create 341.3333 = (1024.0/3.0)
fdiv v1.2d, v4.2d, v2.2d
# create 1/341.3333
fdiv v2.2d, v0.2d, v1.2d
# create 2*341.3333
fadd v0.2d, v1.2d, v1.2d
loop:
# subs sets the flags consumed by the bne at the bottom; nothing in between
# writes the flags
subs x4, x4, #1
# all six ordered pairs of distinct registers taken from d0, d1, d2
INSTR d3, d0, d1
INSTR d4, d1, d0
INSTR d5, d0, d2
INSTR d6, d2, d0
INSTR d7, d1, d2
INSTR d8, d2, d1
bne loop
done:
# pop callee-save registers from stack
# restores happen in the reverse order of the stores above
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -61,10 +76,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -61,10 +76,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -67,10 +82,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -71,10 +86,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -61,10 +76,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -61,10 +76,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -88,10 +103,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -116,10 +131,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -0,0 +1,68 @@
# Latency kernel for scalar double precision fmadd.
# The INSTR lines alternate between writing d1 from d0 and writing d0 from d1,
# so each one consumes the result of the previous one in all three source
# operands and the six form one serial dependency chain.
# ninst exports the INSTR count (6); the iteration count is read from x0.
#define INSTR fmadd
#define NINST 6
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:
# push callee-save registers onto stack
# v8-v31 are stored with all 128 bits, 64 bytes (four registers) per step
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
# x19-x30 go into one 96 byte frame below the vector save area
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = loop counter, copied from the argument register
mov x4, N
# start values of 1.0; d2 is set here but not read by the loop below
fmov d0, #1.00000000
fmov d1, #1.00000000
fmov d2, #1.00000000
loop:
# subs sets the flags consumed by the bne at the bottom; nothing in between
# writes the flags
subs x4, x4, #1
INSTR d1, d0, d0, d0
INSTR d0, d1, d1, d1
INSTR d1, d0, d0, d0
INSTR d0, d1, d1, d1
INSTR d1, d0, d0, d0
INSTR d0, d1, d1, d1
bne loop
done:
# pop callee-save registers from stack
# restores happen in the reverse order of the stores above
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,68 @@
# Throughput kernel for scalar double precision fmadd.
# The six INSTR lines write d3-d8 and read only d0-d2, which are never written
# inside the loop, so the operations do not depend on each other.
# The exported entry point is named latency in this file as well.
# ninst exports the INSTR count (6); the iteration count is read from x0.
#define INSTR fmadd
#define NINST 6
#define N x0
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:
# push callee-save registers onto stack
# v8-v31 are stored with all 128 bits, 64 bytes (four registers) per step
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
# x19-x30 go into one 96 byte frame below the vector save area
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = loop counter, copied from the argument register
mov x4, N
# source operands d0-d2, all 1.0
fmov d0, #1.00000000
fmov d1, #1.00000000
fmov d2, #1.00000000
loop:
# subs sets the flags consumed by the bne at the bottom; nothing in between
# writes the flags
subs x4, x4, #1
INSTR d3, d0, d0, d0
INSTR d4, d0, d1, d1
INSTR d5, d0, d2, d2
INSTR d6, d1, d1, d1
INSTR d7, d1, d2, d2
INSTR d8, d2, d2, d2
bne loop
done:
# pop callee-save registers from stack
# restores happen in the reverse order of the stores above
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,68 @@
# Micro-benchmark kernel for the scalar single-precision fmadd instruction.
# Exports two symbols: the data word ninst and the function latency.
# The six fmadd instructions in the loop alternate between s0 and s1 so that
# every instruction consumes the result of the one before it.
#define INSTR fmadd
#define NINST 6
#define N x0
    .globl  ninst
    .data
# Count of benchmarked instructions per loop iteration, stored as a 32-bit
# value (presumably read by the benchmark driver to normalise its timings).
ninst:
    .long   NINST
    .text
    .globl  latency
    .type   latency, @function
    .align  2
# Entry point. The value in x0 is the iteration count; it is copied to x4
# and x0 itself is left untouched.
# NOTE(review): the loop body runs before the counter is first tested, so a
# count of 0 would wrap around -- callers presumably pass a positive count.
latency:
# push callee-save registers onto stack
# Layout: six 64-byte slots holding the full 128 bits of v8-v31, then a
# 96-byte frame for x19-x30 below them (480 bytes in total).
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    sub     sp, sp, #64
    st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
    sub     sp, sp, #64
    st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
    sub     sp, sp, #64
    st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
    sub     sp, sp, #64
    st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
    stp     x19, x20, [sp, -96]!
    stp     x21, x22, [sp, 16]
    stp     x23, x24, [sp, 32]
    stp     x25, x26, [sp, 48]
    stp     x27, x28, [sp, 64]
    stp     x29, x30, [sp, 80]
    mov     x4, N                       // x4 = remaining iterations
# Seed the single-precision views directly. Writing 1.0 through d0/d1 would
# leave 0.0 in s0/s1, because the low 32 bits of a double 1.0 are all zero.
    fmov    s0, #1.00000000
    fmov    s1, #1.00000000
# Measured loop: subs sets the flags that the closing bne tests. Each fmadd
# reads the register written by the previous one, forming a serial chain
# that also carries over from one iteration to the next.
loop:
    subs    x4, x4, #1
    INSTR   s1, s0, s0, s0
    INSTR   s0, s1, s1, s1
    INSTR   s1, s0, s0, s0
    INSTR   s0, s1, s1, s1
    INSTR   s1, s0, s0, s0
    INSTR   s0, s1, s1, s1
    bne     loop
done:
# pop callee-save registers from stack
# Mirror image of the prologue: general registers first, then the vector
# registers in reverse order, each ld1 advancing sp by 64.
    ldp     x19, x20, [sp]
    ldp     x21, x22, [sp, 16]
    ldp     x23, x24, [sp, 32]
    ldp     x25, x26, [sp, 48]
    ldp     x27, x28, [sp, 64]
    ldp     x29, x30, [sp, 80]
    add     sp, sp, #96
    ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
    ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
    ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
    ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
    ret
    .size   latency, .-latency

View File

@@ -0,0 +1,68 @@
# Micro-benchmark kernel for the scalar single-precision fmadd instruction.
# Exports two symbols: the data word ninst and the function latency.
# The six fmadd instructions in the loop write six different destinations
# and read only registers the loop never writes (presumably the throughput
# variant of this benchmark -- confirm against the file name).
#define INSTR fmadd
#define NINST 6
#define N x0
    .globl  ninst
    .data
# Count of benchmarked instructions per loop iteration, stored as a 32-bit
# value (presumably read by the benchmark driver to normalise its timings).
ninst:
    .long   NINST
    .text
    .globl  latency
    .type   latency, @function
    .align  2
# Entry point. The value in x0 is the iteration count; it is copied to x4
# and x0 itself is left untouched.
# NOTE(review): the loop body runs before the counter is first tested, so a
# count of 0 would wrap around -- callers presumably pass a positive count.
latency:
# push callee-save registers onto stack
# Layout: six 64-byte slots holding the full 128 bits of v8-v31, then a
# 96-byte frame for x19-x30 below them (480 bytes in total).
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    sub     sp, sp, #64
    st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
    sub     sp, sp, #64
    st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
    sub     sp, sp, #64
    st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
    sub     sp, sp, #64
    st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
    stp     x19, x20, [sp, -96]!
    stp     x21, x22, [sp, 16]
    stp     x23, x24, [sp, 32]
    stp     x25, x26, [sp, 48]
    stp     x27, x28, [sp, 64]
    stp     x29, x30, [sp, 80]
    mov     x4, N                       // x4 = remaining iterations
# Seed the single-precision views directly. Writing 1.0 through d0-d2 would
# leave 0.0 in s0-s2, because the low 32 bits of a double 1.0 are all zero.
    fmov    s0, #1.00000000
    fmov    s1, #1.00000000
    fmov    s2, #1.00000000
# Measured loop: subs sets the flags that the closing bne tests. Each fmadd
# writes its own destination (s3-s8) and reads only s0-s2, so no instruction
# depends on the result of another. Register 8 is among those saved above
# and restored below.
loop:
    subs    x4, x4, #1
    INSTR   s3, s0, s0, s0
    INSTR   s4, s0, s1, s1
    INSTR   s5, s0, s2, s2
    INSTR   s6, s1, s1, s1
    INSTR   s7, s1, s2, s2
    INSTR   s8, s2, s2, s2
    bne     loop
done:
# pop callee-save registers from stack
# Mirror image of the prologue: general registers first, then the vector
# registers in reverse order, each ld1 advancing sp by 64.
    ldp     x19, x20, [sp]
    ldp     x21, x22, [sp, 16]
    ldp     x23, x24, [sp, 32]
    ldp     x25, x26, [sp, 48]
    ldp     x27, x28, [sp, 64]
    ldp     x29, x30, [sp, 80]
    add     sp, sp, #96
    ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
    ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
    ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
    ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
    ret
    .size   latency, .-latency

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -58,10 +73,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -58,10 +73,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

77
src/NEON/fmov-d_x-TP.S Normal file
View File

@@ -0,0 +1,77 @@
# Micro-benchmark kernel for fmov from a general-purpose register to a
# scalar double register. Exports two symbols: the data word ninst and the
# function latency. The twelve fmov instructions in the loop write twelve
# different destinations and read only x1-x3, which the loop never writes.
#define INSTR fmov
#define NINST 12
#define N x0
.globl ninst
.data
# Count of benchmarked instructions per loop iteration, stored as a 32-bit
# value (presumably read by the benchmark driver to normalise its timings).
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
# Entry point. The value in x0 is the iteration count; it is copied to x4
# and x0 itself is left untouched.
# NOTE(review): the loop body runs before the counter is first tested, so a
# count of 0 would wrap around -- callers presumably pass a positive count.
latency:
# push callee-save registers onto stack
# Layout: six 64-byte slots holding the full 128 bits of v8-v31, then a
# 96-byte frame for x19-x30 below them (480 bytes in total).
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = remaining iterations; x1-x3 = 1 are the fmov sources.
# NOTE(review): v0-v2 are seeded here but nothing in this function reads
# them -- looks like a leftover from a sibling kernel; confirm.
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x1, #1
mov x2, #1
mov x3, #1
# Measured loop: subs sets the flags that the closing bne tests. The
# destinations d5-d16 are all distinct and the sources cycle through x1-x3,
# so no instruction depends on the result of another. d8-d16 belong to the
# registers saved above and restored below.
loop:
subs x4, x4, #1
INSTR d5, x1
INSTR d6, x2
INSTR d7, x3
INSTR d8, x1
INSTR d9, x2
INSTR d10, x3
INSTR d11, x1
INSTR d12, x2
INSTR d13, x3
INSTR d14, x1
INSTR d15, x2
INSTR d16, x3
bne loop
done:
# pop callee-save registers from stack
# Mirror image of the prologue: general registers first, then the vector
# registers in reverse order, each ld1 advancing sp by 64.
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -44,10 +59,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

77
src/NEON/fmov-x_d-TP.S Normal file
View File

@@ -0,0 +1,77 @@
# Micro-benchmark kernel for fmov from a scalar double register to a
# general-purpose register. Exports two symbols: the data word ninst and the
# function latency. The twelve fmov instructions in the loop write twelve
# different destinations and read only d1-d3, which the loop never writes.
#define INSTR fmov
#define NINST 12
#define N x0
    .globl  ninst
    .data
# Count of benchmarked instructions per loop iteration, stored as a 32-bit
# value (presumably read by the benchmark driver to normalise its timings).
ninst:
    .long   NINST
    .text
    .globl  latency
    .type   latency, @function
    .align  2
# Entry point. The value in x0 is the iteration count; it is copied to x4
# and x0 itself is left untouched.
# NOTE(review): the loop body runs before the counter is first tested, so a
# count of 0 would wrap around -- callers presumably pass a positive count.
latency:
# push callee-save registers onto stack
# Layout: six 64-byte slots holding the full 128 bits of v8-v31, then a
# 96-byte frame for x19-x30 below them (480 bytes in total).
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    sub     sp, sp, #64
    st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
    sub     sp, sp, #64
    st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
    sub     sp, sp, #64
    st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
    sub     sp, sp, #64
    st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
    stp     x19, x20, [sp, -96]!
    stp     x21, x22, [sp, 16]
    stp     x23, x24, [sp, 32]
    stp     x25, x26, [sp, 48]
    stp     x27, x28, [sp, 64]
    stp     x29, x30, [sp, 80]
    mov     x4, N                       // x4 = remaining iterations
# Seed exactly the registers the loop reads (d1, d2, d3), so that no source
# holds whatever value the caller happened to leave behind. The general
# registers need no seed: the loop writes them before anything reads them.
    fmov    v1.2d, #1.00000000
    fmov    v2.2d, #1.00000000
    fmov    v3.2d, #1.00000000
# Measured loop: subs sets the flags that the closing bne tests. The
# destinations are all distinct and are registers that this function does
# not save (x1-x3, x5-x13); the sources cycle through d1-d3, so no
# instruction depends on the result of another.
loop:
    subs    x4, x4, #1
    INSTR   x1, d1
    INSTR   x2, d2
    INSTR   x3, d3
    INSTR   x5, d1
    INSTR   x6, d2
    INSTR   x7, d3
    INSTR   x8, d1
    INSTR   x9, d2
    INSTR   x10, d3
    INSTR   x11, d1
    INSTR   x12, d2
    INSTR   x13, d3
    bne     loop
done:
# pop callee-save registers from stack
# Mirror image of the prologue: general registers first, then the vector
# registers in reverse order, each ld1 advancing sp by 64.
    ldp     x19, x20, [sp]
    ldp     x21, x22, [sp, 16]
    ldp     x23, x24, [sp, 32]
    ldp     x25, x26, [sp, 48]
    ldp     x27, x28, [sp, 64]
    ldp     x29, x30, [sp, 80]
    add     sp, sp, #96
    ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
    ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
    ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
    ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
    ret
    .size   latency, .-latency

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +50,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +50,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +50,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -0,0 +1,67 @@
# Micro-benchmark kernel for the vector fneg instruction on two 64-bit
# lanes. Exports two symbols: the data word ninst and the function latency.
# The six fneg instructions in the loop alternate between v0 and v1 so that
# every instruction consumes the result of the one before it.
#define INSTR fneg
#define NINST 6
#define N x0
.globl ninst
.data
# Count of benchmarked instructions per loop iteration, stored as a 32-bit
# value (presumably read by the benchmark driver to normalise its timings).
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
# Entry point. The value in x0 is the iteration count; it is copied to x4
# and x0 itself is left untouched.
# NOTE(review): the loop body runs before the counter is first tested, so a
# count of 0 would wrap around -- callers presumably pass a positive count.
latency:
# push callee-save registers onto stack
# Layout: six 64-byte slots holding the full 128 bits of v8-v31, then a
# 96-byte frame for x19-x30 below them (480 bytes in total).
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = remaining iterations; v0 and v1 start at 1.0 in both lanes. The seed
# in v0 is overwritten by the first fneg, so only the v1 seed is consumed.
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
# Measured loop: subs sets the flags that the closing bne tests. Each fneg
# reads the register written by the previous one, forming a serial chain
# that also carries over from one iteration to the next.
loop:
subs x4, x4, #1
INSTR v0.2d, v1.2d
INSTR v1.2d, v0.2d
INSTR v0.2d, v1.2d
INSTR v1.2d, v0.2d
INSTR v0.2d, v1.2d
INSTR v1.2d, v0.2d
bne loop
done:
# pop callee-save registers from stack
# Mirror image of the prologue: general registers first, then the vector
# registers in reverse order, each ld1 advancing sp by 64.
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,68 @@
# Micro-benchmark kernel for the vector fneg instruction on two 64-bit
# lanes. Exports two symbols: the data word ninst and the function latency.
# The six fneg instructions in the loop write six different destinations
# and read only registers the loop never writes (presumably the throughput
# variant of this benchmark -- confirm against the file name).
#define INSTR fneg
#define NINST 6
#define N x0
.globl ninst
.data
# Count of benchmarked instructions per loop iteration, stored as a 32-bit
# value (presumably read by the benchmark driver to normalise its timings).
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
# Entry point. The value in x0 is the iteration count; it is copied to x4
# and x0 itself is left untouched.
# NOTE(review): the loop body runs before the counter is first tested, so a
# count of 0 would wrap around -- callers presumably pass a positive count.
latency:
# push callee-save registers onto stack
# Layout: six 64-byte slots holding the full 128 bits of v8-v31, then a
# 96-byte frame for x19-x30 below them (480 bytes in total).
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
# x4 = remaining iterations; v0-v2 = 1.0 in both lanes and stay constant
# inside the loop.
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
# Measured loop: subs sets the flags that the closing bne tests. Each fneg
# writes its own destination (v3-v8) and reads only v0-v2, so no instruction
# depends on the result of another. v8 is one of the registers saved above
# and restored below.
loop:
subs x4, x4, #1
INSTR v3.2d, v0.2d
INSTR v4.2d, v1.2d
INSTR v5.2d, v2.2d
INSTR v6.2d, v0.2d
INSTR v7.2d, v1.2d
INSTR v8.2d, v2.2d
bne loop
done:
# pop callee-save registers from stack
# Mirror image of the prologue: general registers first, then the vector
# registers in reverse order, each ld1 advancing sp by 64.
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Micro-benchmark kernel for the vector fneg instruction on four 32-bit
# lanes. Exports two symbols: the data word ninst and the function latency.
# The six fneg instructions in the loop alternate between v0 and v1 so that
# every instruction consumes the result of the one before it.
#define INSTR fneg
#define NINST 6
#define N x0
    .globl  ninst
    .data
# Count of benchmarked instructions per loop iteration, stored as a 32-bit
# value (presumably read by the benchmark driver to normalise its timings).
ninst:
    .long   NINST
    .text
    .globl  latency
    .type   latency, @function
    .align  2
# Entry point. The value in x0 is the iteration count; it is copied to x4
# and x0 itself is left untouched.
# NOTE(review): the loop body runs before the counter is first tested, so a
# count of 0 would wrap around -- callers presumably pass a positive count.
latency:
# push callee-save registers onto stack
# Layout: six 64-byte slots holding the full 128 bits of v8-v31, then a
# 96-byte frame for x19-x30 below them (480 bytes in total).
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    sub     sp, sp, #64
    st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
    sub     sp, sp, #64
    st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
    sub     sp, sp, #64
    st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
    sub     sp, sp, #64
    st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
    stp     x19, x20, [sp, -96]!
    stp     x21, x22, [sp, 16]
    stp     x23, x24, [sp, 32]
    stp     x25, x26, [sp, 48]
    stp     x27, x28, [sp, 64]
    stp     x29, x30, [sp, 80]
    mov     x4, N                       // x4 = remaining iterations
# Seed with the same lane arrangement the loop operates on. A .2d seed of
# 1.0 does not read back as 1.0 when the register is viewed as four
# single-precision lanes.
    fmov    v0.4s, #1.00000000
    fmov    v1.4s, #1.00000000
# Measured loop: subs sets the flags that the closing bne tests. Each fneg
# reads the register written by the previous one, forming a serial chain
# that also carries over from one iteration to the next.
loop:
    subs    x4, x4, #1
    INSTR   v0.4s, v1.4s
    INSTR   v1.4s, v0.4s
    INSTR   v0.4s, v1.4s
    INSTR   v1.4s, v0.4s
    INSTR   v0.4s, v1.4s
    INSTR   v1.4s, v0.4s
    bne     loop
done:
# pop callee-save registers from stack
# Mirror image of the prologue: general registers first, then the vector
# registers in reverse order, each ld1 advancing sp by 64.
    ldp     x19, x20, [sp]
    ldp     x21, x22, [sp, 16]
    ldp     x23, x24, [sp, 32]
    ldp     x25, x26, [sp, 48]
    ldp     x27, x28, [sp, 64]
    ldp     x29, x30, [sp, 80]
    add     sp, sp, #96
    ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
    ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
    ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
    ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
    ret
    .size   latency, .-latency

View File

@@ -0,0 +1,68 @@
# Micro-benchmark kernel for the vector fneg instruction on four 32-bit
# lanes. Exports two symbols: the data word ninst and the function latency.
# The six fneg instructions in the loop write six different destinations
# and read only registers the loop never writes (presumably the throughput
# variant of this benchmark -- confirm against the file name).
#define INSTR fneg
#define NINST 6
#define N x0
    .globl  ninst
    .data
# Count of benchmarked instructions per loop iteration, stored as a 32-bit
# value (presumably read by the benchmark driver to normalise its timings).
ninst:
    .long   NINST
    .text
    .globl  latency
    .type   latency, @function
    .align  2
# Entry point. The value in x0 is the iteration count; it is copied to x4
# and x0 itself is left untouched.
# NOTE(review): the loop body runs before the counter is first tested, so a
# count of 0 would wrap around -- callers presumably pass a positive count.
latency:
# push callee-save registers onto stack
# Layout: six 64-byte slots holding the full 128 bits of v8-v31, then a
# 96-byte frame for x19-x30 below them (480 bytes in total).
    sub     sp, sp, #64
    st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
    sub     sp, sp, #64
    st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
    sub     sp, sp, #64
    st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
    sub     sp, sp, #64
    st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
    sub     sp, sp, #64
    st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
    sub     sp, sp, #64
    st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
    stp     x19, x20, [sp, -96]!
    stp     x21, x22, [sp, 16]
    stp     x23, x24, [sp, 32]
    stp     x25, x26, [sp, 48]
    stp     x27, x28, [sp, 64]
    stp     x29, x30, [sp, 80]
    mov     x4, N                       // x4 = remaining iterations
# Seed with the same lane arrangement the loop operates on. A .2d seed of
# 1.0 does not read back as 1.0 when the register is viewed as four
# single-precision lanes.
    fmov    v0.4s, #1.00000000
    fmov    v1.4s, #1.00000000
    fmov    v2.4s, #1.00000000
# Measured loop: subs sets the flags that the closing bne tests. Each fneg
# writes its own destination (v3-v8) and reads only v0-v2, so no instruction
# depends on the result of another. v8 is one of the registers saved above
# and restored below.
loop:
    subs    x4, x4, #1
    INSTR   v3.4s, v0.4s
    INSTR   v4.4s, v1.4s
    INSTR   v5.4s, v2.4s
    INSTR   v6.4s, v0.4s
    INSTR   v7.4s, v1.4s
    INSTR   v8.4s, v2.4s
    bne     loop
done:
# pop callee-save registers from stack
# Mirror image of the prologue: general registers first, then the vector
# registers in reverse order, each ld1 advancing sp by 64.
    ldp     x19, x20, [sp]
    ldp     x21, x22, [sp, 16]
    ldp     x23, x24, [sp, 32]
    ldp     x25, x26, [sp, 48]
    ldp     x27, x28, [sp, 64]
    ldp     x29, x30, [sp, 80]
    add     sp, sp, #96
    ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
    ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
    ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
    ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
    ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
    ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
    ret
    .size   latency, .-latency

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -61,10 +76,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -67,10 +82,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -61,10 +76,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -67,10 +82,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -68,10 +83,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -66,10 +81,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -68,10 +83,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -66,10 +81,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +50,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,25 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
sub sp, sp, #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +50,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
add sp, sp, #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -41,10 +55,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -53,10 +67,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -41,10 +55,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -45,10 +59,19 @@ loop:
done:
mov sp, x24
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -50,10 +64,19 @@ loop:
done:
mov sp, x24
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -39,10 +53,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -50,10 +64,19 @@ loop:
done:
mov sp, x24
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -0,0 +1,82 @@
/*
 * Interleaved 128-bit load / store kernel for AArch64 (GNU as, run through cpp).
 *
 * latency(N)
 *   In:    x0 = N, the number of loop iterations
 *   Out:   nothing; x0 is never written
 *   Clobb: x4, x24 (restored), NZCV, v0-v3, v6 (v8-v31 are written but restored)
 *   Stack: 6*64 bytes of vector saves + 96 bytes of GPR saves
 *          + SCRATCH_BYTES of store scratch; sp stays 16-byte aligned.
 *
 * Each iteration issues NINST pairs of
 *     INSTR qA, [load address]     (load from the saved-GPR frame)
 *     str   qB, [x24]              (store into the scratch area)
 * The store target sits SCRATCH_BYTES below the load address.
 *
 * ninst exports NINST as a 32-bit value; presumably the benchmark driver
 * reads it to normalise its timings - confirm against the caller.
 */
#define INSTR ldr
#define NINST 8
#define N x0
#define SCRATCH_BYTES 192

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Save v8-v31 (four registers per 64-byte slot), then x19-x30.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                           // x4 = remaining iterations

        // Reserve the store scratch area instead of writing below sp:
        // memory under the stack pointer is not owned by this function.
        // The absolute load and store addresses are the same as before.
        sub     sp, sp, #SCRATCH_BYTES
        mov     x24, sp                         // x24 = store target

        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

        // N == 0 would wrap the counter in the first subs and spin for
        // 2^64 iterations, so leave early in that case.
        cbz     x4, done

loop:
        subs    x4, x4, #1                      // flags stay live until bne
        INSTR   q1, [sp, #SCRATCH_BYTES]
        str     q2, [x24]
        INSTR   q3, [sp, #SCRATCH_BYTES]
        str     q5, [x24]
        INSTR   q6, [sp, #SCRATCH_BYTES]
        str     q7, [x24]
        INSTR   q8, [sp, #SCRATCH_BYTES]
        str     q9, [x24]
        INSTR   q10, [sp, #SCRATCH_BYTES]
        str     q11, [x24]
        INSTR   q12, [sp, #SCRATCH_BYTES]
        str     q13, [x24]
        INSTR   q14, [sp, #SCRATCH_BYTES]
        str     q15, [x24]
        INSTR   q16, [sp, #SCRATCH_BYTES]
        str     q17, [x24]
        bne     loop

done:
        // Drop the scratch area, then restore in reverse order of the saves.
        add     sp, sp, #SCRATCH_BYTES
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,90 @@
/*
 * Interleaved 128-bit load / store / integer-add kernel for AArch64
 * (GNU as, run through cpp).
 *
 * latency(N)
 *   In:    x0 = N, the number of loop iterations
 *   Out:   nothing meaningful; x0 is doubled on every iteration
 *   Clobb: x0-x8, x24 (restored), NZCV, v0-v3, v6
 *          (v8-v31 are written but restored)
 *   Stack: 6*64 bytes of vector saves + 96 bytes of GPR saves
 *          + SCRATCH_BYTES of store scratch; sp stays 16-byte aligned.
 *
 * Each iteration issues NINST triples of
 *     INSTR qA, [load address]     (load from the saved-GPR frame)
 *     str   qB, [x24]              (store into the scratch area)
 *     add   xC, xC, xC             (independent integer add)
 * The store target sits SCRATCH_BYTES below the load address.
 *
 * ninst exports NINST as a 32-bit value; presumably the benchmark driver
 * reads it to normalise its timings - confirm against the caller.
 */
#define INSTR ldr
#define NINST 8
#define N x0
#define SCRATCH_BYTES 192

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Save v8-v31 (four registers per 64-byte slot), then x19-x30.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                           // x4 = remaining iterations

        // Reserve the store scratch area instead of writing below sp:
        // memory under the stack pointer is not owned by this function.
        // The absolute load and store addresses are the same as before.
        sub     sp, sp, #SCRATCH_BYTES
        mov     x24, sp                         // x24 = store target

        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

        // N == 0 would wrap the counter in the first subs and spin for
        // 2^64 iterations, so leave early in that case.
        cbz     x4, done

loop:
        subs    x4, x4, #1                      // add (non-flag-setting) keeps NZCV for bne
        INSTR   q1, [sp, #SCRATCH_BYTES]
        str     q2, [x24]
        add     x0, x0, x0
        INSTR   q3, [sp, #SCRATCH_BYTES]
        str     q5, [x24]
        add     x1, x1, x1
        INSTR   q6, [sp, #SCRATCH_BYTES]
        str     q7, [x24]
        add     x2, x2, x2
        INSTR   q8, [sp, #SCRATCH_BYTES]
        str     q9, [x24]
        add     x3, x3, x3
        INSTR   q10, [sp, #SCRATCH_BYTES]
        str     q11, [x24]
        add     x5, x5, x5
        INSTR   q12, [sp, #SCRATCH_BYTES]
        str     q13, [x24]
        add     x6, x6, x6
        INSTR   q14, [sp, #SCRATCH_BYTES]
        str     q15, [x24]
        add     x7, x7, x7
        INSTR   q16, [sp, #SCRATCH_BYTES]
        str     q17, [x24]
        add     x8, x8, x8
        bne     loop

done:
        // Drop the scratch area, then restore in reverse order of the saves.
        add     sp, sp, #SCRATCH_BYTES
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,83 @@
/*
 * Two-loads-per-store 128-bit kernel for AArch64 (GNU as, run through cpp).
 *
 * latency(N)
 *   In:    x0 = N, the number of loop iterations
 *   Out:   nothing; x0 is never written
 *   Clobb: x4, x24 (restored), NZCV, v0-v2, v5, v6
 *          (v8-v31 are written but restored)
 *   Stack: 6*64 bytes of vector saves + 96 bytes of GPR saves
 *          + SCRATCH_BYTES of store scratch; sp stays 16-byte aligned.
 *
 * Each iteration issues NINST groups of
 *     INSTR qA, [load address]     (load from the saved-GPR frame)
 *     INSTR qB, [load address]
 *     str   qC, [x24]              (store into the scratch area)
 * The store target sits SCRATCH_BYTES below the load address.
 *
 * NOTE(review): the loop contains 12 INSTR loads while NINST is 6, i.e.
 * NINST counts load/load/store groups here - confirm the driver expects
 * a per-group count. ninst exports NINST as a 32-bit value.
 */
#define INSTR ldr
#define NINST 6
#define N x0
#define SCRATCH_BYTES 192

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Save v8-v31 (four registers per 64-byte slot), then x19-x30.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                           // x4 = remaining iterations

        // Reserve the store scratch area instead of writing below sp:
        // memory under the stack pointer is not owned by this function.
        // The absolute load and store addresses are the same as before.
        sub     sp, sp, #SCRATCH_BYTES
        mov     x24, sp                         // x24 = store target

        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

        // N == 0 would wrap the counter in the first subs and spin for
        // 2^64 iterations, so leave early in that case.
        cbz     x4, done

loop:
        subs    x4, x4, #1                      // flags stay live until bne
        INSTR   q1, [sp, #SCRATCH_BYTES]
        INSTR   q2, [sp, #SCRATCH_BYTES]
        str     q3, [x24]
        INSTR   q5, [sp, #SCRATCH_BYTES]
        INSTR   q6, [sp, #SCRATCH_BYTES]
        str     q7, [x24]
        INSTR   q8, [sp, #SCRATCH_BYTES]
        INSTR   q9, [sp, #SCRATCH_BYTES]
        str     q10, [x24]
        INSTR   q11, [sp, #SCRATCH_BYTES]
        INSTR   q12, [sp, #SCRATCH_BYTES]
        str     q13, [x24]
        INSTR   q14, [sp, #SCRATCH_BYTES]
        INSTR   q15, [sp, #SCRATCH_BYTES]
        str     q16, [x24]
        INSTR   q17, [sp, #SCRATCH_BYTES]
        INSTR   q18, [sp, #SCRATCH_BYTES]
        str     q28, [x24]
        bne     loop

done:
        // Drop the scratch area, then restore in reverse order of the saves.
        add     sp, sp, #SCRATCH_BYTES
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -98,10 +112,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -43,10 +57,19 @@ loop:
done:
mov sp, x24
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -1,5 +1,5 @@
#define INSTR ldr
#define NINST 10
#define NINST 18
#define N x0
.globl ninst
@@ -13,47 +13,76 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
fmov v0.2d, #1.00000000
fmov v1.2d, #1.00000000
fmov v2.2d, #1.00000000
mov x24, sp
mov x16, sp
add x16, x16, #32
mov x25, sp
add x25, x25, #64
mov x27, sp
sub x27, x27, #32
mov x28, sp
sub x28, x28, #64
add x11, sp, #32
add x12, sp, #64
add x13, sp, #96
add x14, sp, #128
add x15, sp, #160
add x16, sp, #192
add x17, sp, #224
add x18, sp, #256
loop:
subs x4, x4, #1
INSTR q0, [sp], #64
INSTR q1, [x25], #64
INSTR q2, [x27], #64
INSTR q3, [x28], #64
INSTR q4, [x16], #64
INSTR q5, [sp], #-64
INSTR q6, [x25], #-64
INSTR q7, [x27], #-64
INSTR q8, [x28], #-64
INSTR q9, [x16], #-64
INSTR q0, [sp], #16
INSTR q1, [x11], #16
INSTR q2, [x12], #16
INSTR q3, [x13], #16
INSTR q4, [x14], #16
INSTR q5, [x15], #16
INSTR q6, [x16], #16
INSTR q7, [x17], #16
INSTR q8, [x18], #16
INSTR q9, [sp], #-16
INSTR q10, [x11], #-16
INSTR q11, [x12], #-16
INSTR q12, [x13], #-16
INSTR q13, [x14], #-16
INSTR q14, [x15], #-16
INSTR q15, [x16], #-16
INSTR q16, [x17], #-16
INSTR q17, [x18], #-16
bne loop
done:
mov sp, x24
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -38,10 +52,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
mov x24, sp
@@ -48,10 +62,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
mov x24, sp
@@ -50,10 +64,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -0,0 +1,67 @@
/*
 * Micro-benchmark kernel: serial chain of INSTR (mla) on 2 x 32-bit lanes.
 *
 * Exports
 *   ninst    32-bit word holding NINST, the number of INSTR per loop pass.
 *   latency  function; x0 = N = number of loop passes to execute.
 *
 * Every INSTR reads and writes v0, so the NINST instructions of one pass
 * (and of consecutive passes) form a single dependency chain.
 *
 * Registers written: x4 (pass counter), v0, v1, NZCV flags.
 * x0 is only read. v8-v31 and x19-x30 are spilled on entry and reloaded
 * before returning (384 + 96 bytes of stack, fully released at ret).
 *
 * N == 0 returns without executing the loop.
 */
#define INSTR mla
#define NINST 6
#define N x0
        .globl  ninst
        .data
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Spill v8-v31 in six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Spill x19-x30 into a 96-byte area below the vector registers.
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N
        // Zero passes requested: the subs/bne pair below would wrap x4 and
        // spin for 2^64 passes, so go straight to the restore sequence.
        cbz     x4, done
        // Give both operands defined contents (bit pattern of double 1.0
        // in each 64-bit lane); mla works on them as integer lanes.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
loop:
        subs    x4, x4, #1              // sets Z when the last pass starts
        INSTR   v0.2s, v0.2s, v1.2s     // v0 += v0 * v1, chained through v0
        INSTR   v0.2s, v0.2s, v1.2s
        INSTR   v0.2s, v0.2s, v1.2s
        INSTR   v0.2s, v0.2s, v1.2s
        INSTR   v0.2s, v0.2s, v1.2s
        INSTR   v0.2s, v0.2s, v1.2s
        bne     loop                    // flags still come from the subs above
done:
        // Reload x19-x30, then v28-v8 in the reverse order of the spills.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,68 @@
/*
 * Micro-benchmark kernel: six INSTR (mla) on 2 x 32-bit lanes, each with
 * its own destination register.
 *
 * Exports
 *   ninst    32-bit word holding NINST, the number of INSTR per loop pass.
 *   latency  function; x0 = N = number of loop passes to execute.
 *
 * Sources are v0-v2; destinations are v3-v8, one per INSTR, so no INSTR
 * consumes the result of another INSTR in the same pass. Because mla adds
 * into its destination, each of v3-v8 is still carried from one pass to
 * the next. v3-v8 are not initialised before the loop.
 * NOTE(review): this looks like the throughput variant of the kernel even
 * though the entry symbol is named latency - presumably the name the
 * harness looks up; confirm.
 *
 * Registers written: x4 (pass counter), v0-v8, NZCV flags.
 * x0 is only read. v8-v31 and x19-x30 are spilled on entry and reloaded
 * before returning (384 + 96 bytes of stack, fully released at ret).
 *
 * N == 0 returns without executing the loop.
 */
#define INSTR mla
#define NINST 6
#define N x0
        .globl  ninst
        .data
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Spill v8-v31 in six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Spill x19-x30 into a 96-byte area below the vector registers.
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N
        // Zero passes requested: the subs/bne pair below would wrap x4 and
        // spin for 2^64 passes, so go straight to the restore sequence.
        cbz     x4, done
        // Give the three source registers defined contents (bit pattern of
        // double 1.0 in each 64-bit lane); mla works on them as integer lanes.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
loop:
        subs    x4, x4, #1              // sets Z when the last pass starts
        INSTR   v3.2s, v0.2s, v0.2s     // v3 += v0 * v0
        INSTR   v4.2s, v0.2s, v1.2s     // v4 += v0 * v1
        INSTR   v5.2s, v0.2s, v2.2s     // v5 += v0 * v2
        INSTR   v6.2s, v1.2s, v1.2s     // v6 += v1 * v1
        INSTR   v7.2s, v1.2s, v2.2s     // v7 += v1 * v2
        INSTR   v8.2s, v2.2s, v2.2s     // v8 += v2 * v2 (v8 is reloaded below)
        bne     loop                    // flags still come from the subs above
done:
        // Reload x19-x30, then v28-v8 in the reverse order of the spills.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,67 @@
/*
 * Micro-benchmark kernel: serial chain of INSTR (mla) on 4 x 32-bit lanes.
 *
 * Exports
 *   ninst    32-bit word holding NINST, the number of INSTR per loop pass.
 *   latency  function; x0 = N = number of loop passes to execute.
 *
 * Every INSTR reads and writes v0, so the NINST instructions of one pass
 * (and of consecutive passes) form a single dependency chain.
 *
 * Registers written: x4 (pass counter), v0, v1, NZCV flags.
 * x0 is only read. v8-v31 and x19-x30 are spilled on entry and reloaded
 * before returning (384 + 96 bytes of stack, fully released at ret).
 *
 * N == 0 returns without executing the loop.
 */
#define INSTR mla
#define NINST 6
#define N x0
        .globl  ninst
        .data
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Spill v8-v31 in six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Spill x19-x30 into a 96-byte area below the vector registers.
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N
        // Zero passes requested: the subs/bne pair below would wrap x4 and
        // spin for 2^64 passes, so go straight to the restore sequence.
        cbz     x4, done
        // Give both operands defined contents (bit pattern of double 1.0
        // in each 64-bit lane); mla works on them as integer lanes.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
loop:
        subs    x4, x4, #1              // sets Z when the last pass starts
        INSTR   v0.4s, v0.4s, v1.4s     // v0 += v0 * v1, chained through v0
        INSTR   v0.4s, v0.4s, v1.4s
        INSTR   v0.4s, v0.4s, v1.4s
        INSTR   v0.4s, v0.4s, v1.4s
        INSTR   v0.4s, v0.4s, v1.4s
        INSTR   v0.4s, v0.4s, v1.4s
        bne     loop                    // flags still come from the subs above
done:
        // Reload x19-x30, then v28-v8 in the reverse order of the spills.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,68 @@
/*
 * Micro-benchmark kernel: six INSTR (mla) on 4 x 32-bit lanes, each with
 * its own destination register.
 *
 * Exports
 *   ninst    32-bit word holding NINST, the number of INSTR per loop pass.
 *   latency  function; x0 = N = number of loop passes to execute.
 *
 * Sources are v0-v2; destinations are v3-v8, one per INSTR, so no INSTR
 * consumes the result of another INSTR in the same pass. Because mla adds
 * into its destination, each of v3-v8 is still carried from one pass to
 * the next. v3-v8 are not initialised before the loop.
 * NOTE(review): this looks like the throughput variant of the kernel even
 * though the entry symbol is named latency - presumably the name the
 * harness looks up; confirm.
 *
 * Registers written: x4 (pass counter), v0-v8, NZCV flags.
 * x0 is only read. v8-v31 and x19-x30 are spilled on entry and reloaded
 * before returning (384 + 96 bytes of stack, fully released at ret).
 *
 * N == 0 returns without executing the loop.
 */
#define INSTR mla
#define NINST 6
#define N x0
        .globl  ninst
        .data
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // Spill v8-v31 in six 64-byte groups, each pushed below the previous one.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        // Spill x19-x30 into a 96-byte area below the vector registers.
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N
        // Zero passes requested: the subs/bne pair below would wrap x4 and
        // spin for 2^64 passes, so go straight to the restore sequence.
        cbz     x4, done
        // Give the three source registers defined contents (bit pattern of
        // double 1.0 in each 64-bit lane); mla works on them as integer lanes.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
loop:
        subs    x4, x4, #1              // sets Z when the last pass starts
        INSTR   v3.4s, v0.4s, v0.4s     // v3 += v0 * v0
        INSTR   v4.4s, v0.4s, v1.4s     // v4 += v0 * v1
        INSTR   v5.4s, v0.4s, v2.4s     // v5 += v0 * v2
        INSTR   v6.4s, v1.4s, v1.4s     // v6 += v1 * v1
        INSTR   v7.4s, v1.4s, v2.4s     // v7 += v1 * v2
        INSTR   v8.4s, v2.4s, v2.4s     // v8 += v2 * v2 (v8 is reloaded below)
        bne     loop                    // flags still come from the subs above
done:
        // Reload x19-x30, then v28-v8 in the reverse order of the spills.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        ret
        .size   latency, .-latency

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +48,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -35,10 +49,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -34,10 +48,19 @@ loop:
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

View File

@@ -1,5 +1,5 @@
#define INSTR mov
#define NINST 6
#define NINST 16
#define N x0
.globl ninst
@@ -13,10 +13,24 @@ ninst:
latency:
# push callee-save registers onto stack
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
sub sp, sp, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
sub sp, sp, #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
sub sp, sp, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
sub sp, sp, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
sub sp, sp, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
stp x19, x20, [sp, -96]!
stp x21, x22, [sp, 16]
stp x23, x24, [sp, 32]
stp x25, x26, [sp, 48]
stp x27, x28, [sp, 64]
stp x29, x30, [sp, 80]
mov x4, N
@@ -26,19 +40,38 @@ latency:
loop:
subs x4, x4, #1
INSTR x3, x0
INSTR x9, x1
INSTR x5, x2
INSTR x6, x1
INSTR x7, x2
INSTR x8, x2
INSTR x5, x1
INSTR x6, x2
INSTR x7, x0
INSTR x8, x1
INSTR x9, x2
INSTR x10, x0
INSTR x11, x1
INSTR x12, x2
INSTR x13, x0
INSTR x14, x1
INSTR x15, x2
INSTR x16, x0
INSTR x17, x1
INSTR x18, x2
INSTR x19, x2
bne loop
done:
# pop callee-save registers from stack
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
add sp, sp, #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
add sp, sp, #64
ldp x19, x20, [sp]
ldp x21, x22, [sp, 16]
ldp x23, x24, [sp, 32]
ldp x25, x26, [sp, 48]
ldp x27, x28, [sp, 64]
ldp x29, x30, [sp, 80]
add sp, sp, #96
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
ret

Some files were not shown because too many files have changed in this diff Show More