mirror of
https://github.com/RRZE-HPC/ibench.git
synced 2025-07-21 12:51:10 +02:00
1
Makefile
1
Makefile
@@ -1,3 +1,4 @@
|
||||
# Possible targets: GCC, ICC, MIC, POWER8, ARMGCC
|
||||
COMPILER=ICC
|
||||
|
||||
TARGET = ibench
|
||||
|
@@ -1,6 +1,8 @@
|
||||
CC = gcc
|
||||
AS = gcc
|
||||
CFLAGS = -O3
|
||||
# -msve-vector-bits=512 -march=armv8.2-a+sve
|
||||
LFLAGS = -shared
|
||||
|
||||
KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/NEON/*.S))
|
||||
#KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/SVE/*.S))
|
@@ -1,6 +0,0 @@
|
||||
CC = gcc
|
||||
AS = gcc
|
||||
CFLAGS = -O3 -msve-vector-bits=512 -march=armv8.2-a+sve
|
||||
LFLAGS = -shared
|
||||
|
||||
KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/SVE/*.S))
|
69
src/NEON/adc-x_x_x-LAT.S
Normal file
69
src/NEON/adc-x_x_x-LAT.S
Normal file
@@ -0,0 +1,69 @@
|
||||
// Latency kernel for the AArch64 ADC instruction (64-bit register form).
//
// Exported symbols:
//   ninst    32-bit word holding the number of INSTR instances per loop pass
//   latency  entry point; the iteration count arrives in x0
//
// Every INSTR reads and writes x0, so the instances form one serial
// dependency chain. The loop counter lives in x4 and is decremented with
// subs until it reaches zero.
#define INSTR adc
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 2
latency:

        // Spill v8-v31 (six 64-byte groups), then x19-x30 (one 96-byte frame).
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                           // x4 = iterations left

        fmov    v0.4s, #1.00000000
        fmov    v1.4s, #1.00000000
        mov     x0, #1                          // chain accumulator
        mov     x1, #1                          // constant addend
.Lloop:
        subs    x4, x4, #1                      // sets the flags tested by bne
        .rept   NINST                           // NINST back-to-back dependent ops
        INSTR   x0, x0, x1
        .endr
        bne     .Lloop
.Ldone:

        // Reload in the reverse order of the spills above.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
113
src/NEON/adc-x_x_x-TP.S
Normal file
113
src/NEON/adc-x_x_x-TP.S
Normal file
@@ -0,0 +1,113 @@
|
||||
// Throughput kernel for the AArch64 ADC instruction (64-bit register form).
//
// Exported symbols:
//   ninst    32-bit word holding the number of INSTR instances per loop pass
//   latency  entry point; the iteration count arrives in x0
//
// Each pass issues 48 INSTRs: a group of twelve, writing x5..x16 from the
// sources x1, x2, x3 in rotation, repeated four times. No INSTR reads a
// register that another INSTR writes.
#define INSTR adc
#define NINST 48
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 2
latency:

        // Spill v8-v31 (six 64-byte groups), then x19-x30 (one 96-byte frame).
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                           // x4 = iterations left

        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x1, #1                          // source operands
        mov     x2, #1
        mov     x3, #1
.Lloop:
        subs    x4, x4, #1                      // sets the flags tested by bne
        .rept   4                               // 4 x 12 = NINST instructions
        INSTR   x5, x1, x1
        INSTR   x6, x2, x2
        INSTR   x7, x3, x3
        INSTR   x8, x1, x1
        INSTR   x9, x2, x2
        INSTR   x10, x3, x3
        INSTR   x11, x1, x1
        INSTR   x12, x2, x2
        INSTR   x13, x3, x3
        INSTR   x14, x1, x1
        INSTR   x15, x2, x2
        INSTR   x16, x3, x3
        .endr
        bne     .Lloop
.Ldone:

        // Reload in the reverse order of the spills above.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +50,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -80,10 +95,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -36,10 +51,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -80,10 +95,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
67
src/NEON/addp-vd_vd_vd-LAT.S
Normal file
67
src/NEON/addp-vd_vd_vd-LAT.S
Normal file
@@ -0,0 +1,67 @@
|
||||
// Latency kernel for the AArch64 NEON ADDP instruction (vector, .2d form).
//
// Exported symbols:
//   ninst    32-bit word holding the number of INSTR instances per loop pass
//   latency  entry point; the iteration count arrives in x0
//
// Every INSTR reads and writes v0, so the instances form one serial
// dependency chain; v1 is a constant second operand.
#define INSTR addp
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 2
latency:

        // Spill v8-v31 (six 64-byte groups), then x19-x30 (one 96-byte frame).
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                           // x4 = iterations left

        fmov    v0.2d, #1.00000000              // chain accumulator
        fmov    v1.2d, #1.00000000              // constant second operand
.Lloop:
        subs    x4, x4, #1                      // sets the flags tested by bne
        .rept   NINST                           // NINST back-to-back dependent ops
        INSTR   v0.2d, v0.2d, v1.2d
        .endr
        bne     .Lloop
.Ldone:

        // Reload in the reverse order of the spills above.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
68
src/NEON/addp-vd_vd_vd-TP.S
Normal file
68
src/NEON/addp-vd_vd_vd-TP.S
Normal file
@@ -0,0 +1,68 @@
|
||||
// Throughput kernel for the AArch64 NEON ADDP instruction (vector, .2d form).
//
// Exported symbols:
//   ninst    32-bit word holding the number of INSTR instances per loop pass
//   latency  entry point; the iteration count arrives in x0
//
// The six INSTRs per pass write v3..v8 and read only v0, v1, v2, so no
// INSTR consumes the result of another.
#define INSTR addp
#define NINST 6
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .p2align 2
latency:

        // Spill v8-v31 (six 64-byte groups), then x19-x30 (one 96-byte frame).
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                           // x4 = iterations left

        fmov    v0.2d, #1.00000000              // source operands
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
.Lloop:
        subs    x4, x4, #1                      // sets the flags tested by bne
        INSTR   v3.2d, v0.2d, v0.2d
        INSTR   v4.2d, v0.2d, v1.2d
        INSTR   v5.2d, v0.2d, v2.2d
        INSTR   v6.2d, v1.2d, v1.2d
        INSTR   v7.2d, v1.2d, v2.2d
        INSTR   v8.2d, v2.2d, v2.2d             // v8 is reloaded in the epilogue
        bne     .Lloop
.Ldone:

        // Reload in the reverse order of the spills above.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +50,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -80,10 +95,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -227,10 +242,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -227,10 +242,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -37,10 +52,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -57,10 +72,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -57,10 +72,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
70
src/NEON/dup-vd_x-TP.S
Normal file
70
src/NEON/dup-vd_x-TP.S
Normal file
@@ -0,0 +1,70 @@
|
||||
// Throughput kernel for the AArch64 NEON DUP instruction
// (general-purpose register to vector, .2d form).
//
// Exported symbols:
//   ninst    32-bit word holding the number of INSTR instances per loop pass
//   latency  entry point; the iteration count arrives in x0
//
// The loop body issues eight INSTRs (v8..v15 from x0..x7), so NINST must be
// 8. It was previously declared as 6, which made the exported count disagree
// with the work actually done per pass.
//
// Notes on operands:
//   - x0 still holds the iteration count and x4 is the live loop counter, so
//     the DUP from x4 reads the result of the preceding subs.
//   - x1-x3 and x5-x7 are not written by this function; DUP copies whatever
//     values they hold on entry.
//   - v8-v15 are overwritten in the loop and reloaded in the epilogue.
#define INSTR dup
#define NINST 8
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

        // Spill v8-v31 (six 64-byte groups), then x19-x30 (one 96-byte frame).
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                           // x4 = iterations left

loop:
        subs    x4, x4, #1                      // sets the flags tested by bne
        INSTR   v8.2d, x0
        INSTR   v9.2d, x1
        INSTR   v10.2d, x2
        INSTR   v11.2d, x3
        INSTR   v12.2d, x4
        INSTR   v13.2d, x5
        INSTR   v14.2d, x6
        INSTR   v15.2d, x7
        bne     loop
done:

        // Reload in the reverse order of the spills above.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
71
src/NEON/fadd-LAT_order1.S
Normal file
71
src/NEON/fadd-LAT_order1.S
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
 * FADD (.2d) latency kernel, operand order 1.
 *
 * The eight FADDs form a single serial dependency chain that wraps around
 * the loop: every instruction uses the result of the instruction directly
 * before it as its FIRST source operand and the result of the one before
 * that as its second.
 *
 * Entry:  x0 = loop trip count (aliased as N below).  It must be >= 1,
 *         because the counter is decremented before it is tested.
 * Export: ninst = number of FADDs per loop iteration.
 *         NOTE(review): presumably consumed by the C harness - confirm.
 */
#define INSTR fadd
#define NINST 8
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

        // push callee-save registers onto stack (6 * 64 + 96 = 480 bytes)
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                   // x4 = remaining iterations

        fmov    v0.2d, #1.00000000      // not read by the loop
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000      // overwritten before it is read
        // v8 is read by the very first FADD; give it a defined value so the
        // chain does not start from whatever the caller left in the register.
        fmov    v8.2d, #1.00000000
loop:
        subs    x4, x4, #1              // sets the flags consumed by bne
        INSTR   v2.2d, v1.2d, v8.2d
        INSTR   v3.2d, v2.2d, v1.2d
        INSTR   v4.2d, v3.2d, v2.2d
        INSTR   v5.2d, v4.2d, v3.2d
        INSTR   v6.2d, v5.2d, v4.2d
        INSTR   v7.2d, v6.2d, v5.2d
        INSTR   v8.2d, v7.2d, v6.2d
        INSTR   v1.2d, v8.2d, v7.2d     // feeds the first FADD of the next pass
        bne     loop
done:

        // pop callee-save registers from stack (reverse order of the pushes)
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
71
src/NEON/fadd-LAT_order2.S
Normal file
71
src/NEON/fadd-LAT_order2.S
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
 * FADD (.2d) latency kernel, operand order 2.
 *
 * The eight FADDs form a single serial dependency chain that wraps around
 * the loop: every instruction uses the result of the instruction directly
 * before it as its SECOND source operand and the result of the one before
 * that as its first.
 *
 * Entry:  x0 = loop trip count (aliased as N below).  It must be >= 1,
 *         because the counter is decremented before it is tested.
 * Export: ninst = number of FADDs per loop iteration.
 *         NOTE(review): presumably consumed by the C harness - confirm.
 */
#define INSTR fadd
#define NINST 8
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

        // push callee-save registers onto stack (6 * 64 + 96 = 480 bytes)
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                   // x4 = remaining iterations

        fmov    v0.2d, #1.00000000      // not read by the loop
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000      // overwritten before it is read
        // v8 is read by the very first FADD; give it a defined value so the
        // chain does not start from whatever the caller left in the register.
        fmov    v8.2d, #1.00000000
loop:
        subs    x4, x4, #1              // sets the flags consumed by bne
        INSTR   v2.2d, v8.2d, v1.2d
        INSTR   v3.2d, v1.2d, v2.2d
        INSTR   v4.2d, v2.2d, v3.2d
        INSTR   v5.2d, v3.2d, v4.2d
        INSTR   v6.2d, v4.2d, v5.2d
        INSTR   v7.2d, v5.2d, v6.2d
        INSTR   v8.2d, v6.2d, v7.2d
        INSTR   v1.2d, v7.2d, v8.2d     // feeds the first FADD of the next pass
        bne     loop
done:

        // pop callee-save registers from stack (reverse order of the pushes)
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +50,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +50,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
214
src/NEON/fadd-vd_vd_vd-il_1_2-add-x_x_x-TP.S
Normal file
214
src/NEON/fadd-vd_vd_vd-il_1_2-add-x_x_x-TP.S
Normal file
@@ -0,0 +1,214 @@
|
||||
/*
 * FADD (.2d) throughput kernel, interleaved 1:2 with integer ADD.
 *
 * One loop pass issues 48 FADDs; every FADD is followed by two independent
 * integer ADDs.  The FADDs read only v0-v2 and write v3-v14, the ADDs read
 * only x1-x3 and write x5-x16, so no instruction in the body depends on
 * another one.
 *
 * Entry:  x0 = loop trip count (aliased as N below).  It must be >= 1,
 *         because the counter is decremented before it is tested.
 * Export: ninst = number of FADDs per loop pass.
 *         NOTE(review): presumably consumed by the C harness - confirm.
 */
#define INSTR fadd
#define NINST 48
#define N x0

        // Spill v8-v31 and x19-x30 (6 * 64 + 96 = 480 bytes of stack).
        .macro  save_callee_regs
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]
        .endm

        // Exact mirror of save_callee_regs.
        .macro  restore_callee_regs
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        .endm

        .global ninst
        .data
ninst:
        .long   NINST
        .text
        .global latency
        .type   latency, @function
        .p2align 2
latency:
        save_callee_regs

        mov     x4, N                   // x4 = remaining iterations

        // Constant inputs for the measured instructions.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
loop:
        subs    x4, x4, #1              // flags are consumed by b.ne below

        // 4 copies of a 12-FADD / 24-ADD group = 48 FADDs per pass.
        .rept   4
        INSTR   v3.2d, v0.2d, v0.2d
        add     x5, x1, x1
        add     x6, x2, x2
        INSTR   v4.2d, v0.2d, v1.2d
        add     x7, x3, x3
        add     x8, x1, x1
        INSTR   v5.2d, v0.2d, v2.2d
        add     x9, x2, x2
        add     x10, x3, x3
        INSTR   v6.2d, v1.2d, v1.2d
        add     x11, x1, x1
        add     x12, x2, x2
        INSTR   v7.2d, v1.2d, v2.2d
        add     x13, x3, x3
        add     x14, x1, x1
        INSTR   v8.2d, v2.2d, v2.2d
        add     x15, x2, x2
        add     x16, x3, x3
        INSTR   v9.2d, v0.2d, v2.2d
        add     x5, x2, x2
        add     x6, x3, x3
        INSTR   v10.2d, v1.2d, v1.2d
        add     x7, x1, x1
        add     x8, x2, x2
        INSTR   v11.2d, v1.2d, v2.2d
        add     x9, x3, x3
        add     x10, x1, x1
        INSTR   v12.2d, v2.2d, v2.2d
        add     x11, x2, x2
        add     x12, x3, x3
        INSTR   v13.2d, v1.2d, v2.2d
        add     x13, x3, x3
        add     x14, x1, x1
        INSTR   v14.2d, v2.2d, v2.2d
        add     x15, x2, x2
        add     x16, x3, x3
        .endr

        b.ne    loop
done:
        restore_callee_regs

        ret

        .size   latency, .-latency
|
84
src/NEON/fadd-vd_vd_vd-il_1_2-sub-x_x_i-TP.S
Normal file
84
src/NEON/fadd-vd_vd_vd-il_1_2-sub-x_x_i-TP.S
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
 * FADD (.2d) throughput kernel, interleaved 1:2 with integer SUB-immediate.
 *
 * One loop pass issues 6 FADDs; every FADD is followed by two independent
 * "sub xd, xn, #1" instructions.  The FADDs read only v0-v2 and write v3-v8,
 * the SUBs read only x1-x3 and write x5-x16, so no instruction in the body
 * depends on another one.
 *
 * Entry:  x0 = loop trip count (aliased as N below).  It must be >= 1,
 *         because the counter is decremented before it is tested.
 * Export: ninst = number of FADDs per loop pass.
 *         NOTE(review): presumably consumed by the C harness - confirm.
 */
#define INSTR fadd
#define NINST 6
#define N x0

        // Spill v8-v31 and x19-x30 (6 * 64 + 96 = 480 bytes of stack).
        .macro  save_callee_regs
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]
        .endm

        // Exact mirror of save_callee_regs.
        .macro  restore_callee_regs
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        .endm

        .global ninst
        .data
ninst:
        .long   NINST
        .text
        .global latency
        .type   latency, @function
        .p2align 2
latency:
        save_callee_regs

        mov     x4, N                   // x4 = remaining iterations

        // Constant inputs for the measured instructions.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
loop:
        subs    x4, x4, #1              // flags are consumed by b.ne below
        INSTR   v3.2d, v0.2d, v0.2d
        sub     x5, x1, #1
        sub     x6, x2, #1
        INSTR   v4.2d, v0.2d, v1.2d
        sub     x7, x3, #1
        sub     x8, x1, #1
        INSTR   v5.2d, v0.2d, v2.2d
        sub     x9, x2, #1
        sub     x10, x3, #1
        INSTR   v6.2d, v1.2d, v1.2d
        sub     x11, x1, #1
        sub     x12, x2, #1
        INSTR   v7.2d, v1.2d, v2.2d
        sub     x13, x3, #1
        sub     x14, x1, #1
        INSTR   v8.2d, v2.2d, v2.2d
        sub     x15, x2, #1
        sub     x16, x3, #1
        b.ne    loop
done:
        restore_callee_regs

        ret

        .size   latency, .-latency
|
90
src/NEON/fadd-vd_vd_vd-il_1_3-add-x_x_x-TP.S
Normal file
90
src/NEON/fadd-vd_vd_vd-il_1_3-add-x_x_x-TP.S
Normal file
@@ -0,0 +1,90 @@
|
||||
/*
 * FADD (.2d) throughput kernel, interleaved 1:3 with integer ADD.
 *
 * One loop pass issues 6 FADDs; every FADD is followed by three independent
 * integer ADDs.  The FADDs read only v0-v2 and write v3-v8, the ADDs read
 * only x1-x3 and write x5-x16, so no instruction in the body depends on
 * another one.
 *
 * Entry:  x0 = loop trip count (aliased as N below).  It must be >= 1,
 *         because the counter is decremented before it is tested.
 * Export: ninst = number of FADDs per loop pass.
 *         NOTE(review): presumably consumed by the C harness - confirm.
 */
#define INSTR fadd
#define NINST 6
#define N x0

        // Spill v8-v31 and x19-x30 (6 * 64 + 96 = 480 bytes of stack).
        .macro  save_callee_regs
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]
        .endm

        // Exact mirror of save_callee_regs.
        .macro  restore_callee_regs
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        .endm

        .global ninst
        .data
ninst:
        .long   NINST
        .text
        .global latency
        .type   latency, @function
        .p2align 2
latency:
        save_callee_regs

        mov     x4, N                   // x4 = remaining iterations

        // Constant inputs for the measured instructions.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
loop:
        subs    x4, x4, #1              // flags are consumed by b.ne below
        INSTR   v3.2d, v0.2d, v0.2d
        add     x5, x1, x1
        add     x6, x2, x2
        add     x7, x3, x3
        INSTR   v4.2d, v0.2d, v1.2d
        add     x8, x1, x1
        add     x9, x2, x2
        add     x10, x3, x3
        INSTR   v5.2d, v0.2d, v2.2d
        add     x11, x1, x1
        add     x12, x2, x2
        add     x13, x3, x3
        INSTR   v6.2d, v1.2d, v1.2d
        add     x14, x1, x1
        add     x15, x2, x2
        add     x16, x3, x3
        INSTR   v7.2d, v1.2d, v2.2d
        add     x8, x1, x1
        add     x9, x2, x2
        add     x10, x3, x3
        INSTR   v8.2d, v2.2d, v2.2d
        add     x11, x1, x1
        add     x12, x2, x2
        add     x13, x3, x3
        b.ne    loop
done:
        restore_callee_regs

        ret

        .size   latency, .-latency
|
84
src/NEON/fadd-vd_vd_vd-il_2_1-mul-x_x_x-TP.S
Normal file
84
src/NEON/fadd-vd_vd_vd-il_2_1-mul-x_x_x-TP.S
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
 * FADD (.2d) throughput kernel, interleaved 2:1 with integer MUL.
 *
 * One loop pass issues 12 FADDs; every pair of FADDs is followed by one
 * independent integer MUL.  The FADDs read only v0-v2 and write v3-v14, the
 * MULs read only x1-x3 and write x5-x10, so no instruction in the body
 * depends on another one.
 *
 * Entry:  x0 = loop trip count (aliased as N below).  It must be >= 1,
 *         because the counter is decremented before it is tested.
 * Export: ninst = number of INSTR (FADD) instructions per loop pass.
 *         NOTE(review): presumably consumed by the C harness - confirm.
 */
#define INSTR fadd
#define NINST 12        /* must equal the number of INSTR lines in the loop */
#define N x0

        .globl  ninst
        .data
ninst:
        .long   NINST
        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:

        // push callee-save registers onto stack (6 * 64 + 96 = 480 bytes)
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                   // x4 = remaining iterations

        // Constant inputs for the measured instructions.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
loop:
        subs    x4, x4, #1              // sets the flags consumed by bne
        INSTR   v3.2d, v0.2d, v0.2d
        INSTR   v4.2d, v0.2d, v1.2d
        mul     x5, x1, x1
        INSTR   v5.2d, v0.2d, v2.2d
        INSTR   v6.2d, v1.2d, v1.2d
        mul     x6, x2, x2
        INSTR   v7.2d, v1.2d, v2.2d
        INSTR   v8.2d, v2.2d, v2.2d
        mul     x7, x3, x3
        INSTR   v9.2d, v2.2d, v2.2d
        INSTR   v10.2d, v2.2d, v2.2d
        mul     x8, x1, x1
        INSTR   v11.2d, v2.2d, v2.2d
        INSTR   v12.2d, v2.2d, v2.2d
        mul     x9, x2, x2
        INSTR   v13.2d, v2.2d, v2.2d
        INSTR   v14.2d, v2.2d, v2.2d
        mul     x10, x3, x3
        bne     loop
done:

        // pop callee-save registers from stack (reverse order of the pushes)
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +50,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
84
src/NEON/fadd-vs_vs_vs-il_1_2-add-x_x_x-TP.S
Normal file
84
src/NEON/fadd-vs_vs_vs-il_1_2-add-x_x_x-TP.S
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
 * FADD (.4s, single precision) throughput kernel, interleaved 1:2 with
 * integer ADD.
 *
 * One loop pass issues 6 FADDs; every FADD is followed by two independent
 * integer ADDs.  The FADDs read only v0-v2 and write v3-v8, the ADDs read
 * only x1-x3 and write x5-x16, so no instruction in the body depends on
 * another one.
 *
 * Entry:  x0 = loop trip count (aliased as N below).  It must be >= 1,
 *         because the counter is decremented before it is tested.
 * Export: ninst = number of FADDs per loop pass.
 *         NOTE(review): presumably consumed by the C harness - confirm.
 */
#define INSTR fadd
#define NINST 6
#define N x0

        // Spill v8-v31 and x19-x30 (6 * 64 + 96 = 480 bytes of stack).
        .macro  save_callee_regs
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]
        .endm

        // Exact mirror of save_callee_regs.
        .macro  restore_callee_regs
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        .endm

        .global ninst
        .data
ninst:
        .long   NINST
        .text
        .global latency
        .type   latency, @function
        .p2align 2
latency:
        save_callee_regs

        mov     x4, N                   // x4 = remaining iterations

        // Constant inputs for the measured instructions.
        fmov    v0.4s, #1.00000000
        fmov    v1.4s, #1.00000000
        fmov    v2.4s, #1.00000000
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1
loop:
        subs    x4, x4, #1              // flags are consumed by b.ne below
        INSTR   v3.4s, v0.4s, v0.4s
        add     x5, x1, x1
        add     x6, x2, x2
        INSTR   v4.4s, v0.4s, v1.4s
        add     x7, x3, x3
        add     x8, x1, x1
        INSTR   v5.4s, v0.4s, v2.4s
        add     x9, x2, x2
        add     x10, x3, x3
        INSTR   v6.4s, v1.4s, v1.4s
        add     x11, x1, x1
        add     x12, x2, x2
        INSTR   v7.4s, v1.4s, v2.4s
        add     x13, x3, x3
        add     x14, x1, x1
        INSTR   v8.4s, v2.4s, v2.4s
        add     x15, x2, x2
        add     x16, x3, x3
        b.ne    loop
done:
        restore_callee_regs

        ret

        .size   latency, .-latency
|
94
src/NEON/fdiv-d_d_d-LAT.S
Normal file
94
src/NEON/fdiv-d_d_d-LAT.S
Normal file
@@ -0,0 +1,94 @@
|
||||
/*
 * FDIV (scalar double, d registers) latency kernel.
 *
 * The six FDIVs form one serial chain through d0: each divides the previous
 * result alternately by d1 (about 341.33) and by d2 (about 1/341.33), so
 * the value in d0 keeps swinging between roughly 2.0 and roughly 682.67
 * instead of drifting towards zero or infinity.
 *
 * Entry:  x0 = loop trip count (aliased as N below).  It must be >= 1,
 *         because the counter is decremented before it is tested.
 * Export: ninst = number of FDIVs per loop pass.
 *         NOTE(review): presumably consumed by the C harness - confirm.
 */
#define INSTR fdiv
#define NINST 6
#define N x0

        // Spill v8-v31 and x19-x30 (6 * 64 + 96 = 480 bytes of stack).
        .macro  save_callee_regs
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]
        .endm

        // Exact mirror of save_callee_regs.
        .macro  restore_callee_regs
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
        .endm

        .global ninst
        .data
ninst:
        .long   NINST
        .text
        .global latency
        .type   latency, @function
        .p2align 2
latency:
        save_callee_regs

        mov     x4, N                   // x4 = remaining iterations

        // Build the operands arithmetically, starting from 1.0.
        fmov    v0.2d, #1.00000         // v0 = 1.0
        fadd    v1.2d, v0.2d, v0.2d     // v1 = 2.0
        fadd    v2.2d, v0.2d, v1.2d     // v2 = 3.0
        fadd    v4.2d, v1.2d, v1.2d     // v4 = 4.0
        // Eight doublings: 4.0 -> 8.0 -> ... -> 1024.0
        .rept   8
        fadd    v4.2d, v4.2d, v4.2d
        .endr
        fdiv    v1.2d, v4.2d, v2.2d     // v1 = 1024.0 / 3.0 = 341.3333
        fdiv    v2.2d, v0.2d, v1.2d     // v2 = 1 / 341.3333
        fadd    v0.2d, v1.2d, v1.2d     // v0 = 2 * 341.3333 (chain start value)
loop:
        subs    x4, x4, #1              // flags are consumed by b.ne below
        INSTR   d0, d0, d1
        INSTR   d0, d0, d2
        INSTR   d0, d0, d1
        INSTR   d0, d0, d2
        INSTR   d0, d0, d1
        INSTR   d0, d0, d2
        b.ne    loop
done:
        restore_callee_regs

        ret

        .size   latency, .-latency
|
94
src/NEON/fdiv-d_d_d-TP.S
Normal file
94
src/NEON/fdiv-d_d_d-TP.S
Normal file
@@ -0,0 +1,94 @@
|
||||
#define INSTR fdiv
|
||||
#define NINST 6
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
latency:
        // Throughput kernel for scalar double-precision INSTR (fdiv).
        // In:    N (x0) = number of loop iterations; the loop counts down
        //        to zero, so N is expected to be >= 1.
        // Work:  six independent INSTR per iteration (matches NINST).
        // Clobb: x4, flags, v0-v8 (v8 is restored before returning).

        // ---- prologue: spill v8-v31, four registers per 64-byte slot ----
        stp     q8,  q9,  [sp, #-64]!
        stp     q10, q11, [sp, #32]
        stp     q12, q13, [sp, #-64]!
        stp     q14, q15, [sp, #32]
        stp     q16, q17, [sp, #-64]!
        stp     q18, q19, [sp, #32]
        stp     q20, q21, [sp, #-64]!
        stp     q22, q23, [sp, #32]
        stp     q24, q25, [sp, #-64]!
        stp     q26, q27, [sp, #32]
        stp     q28, q29, [sp, #-64]!
        stp     q30, q31, [sp, #32]
        // ---- x19-x30 in one 96-byte frame below the vector area ----
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                           // x4 = iterations left

        // Build the divide operands arithmetically from the immediate 1.0.
        fmov    v0.2d, #1.00000                 // v0 = 1.0
        fadd    v1.2d, v0.2d, v0.2d             // v1 = 2.0
        fadd    v2.2d, v0.2d, v1.2d             // v2 = 3.0
        fadd    v4.2d, v1.2d, v1.2d             // v4 = 4.0
        .rept   8                               // 4.0 -> 8.0 -> ... -> 1024.0
        fadd    v4.2d, v4.2d, v4.2d
        .endr
        fdiv    v1.2d, v4.2d, v2.2d             // v1 = 1024.0 / 3.0 = 341.3333
        fdiv    v2.2d, v0.2d, v1.2d             // v2 = 1 / 341.3333
        fadd    v0.2d, v1.2d, v1.2d             // v0 = 2 * 341.3333

.Lfdiv_tp_loop:
        subs    x4, x4, #1
        // Sources d0-d2 are never written inside the loop and every
        // destination is distinct, so the six operations are independent.
        INSTR   d3, d0, d1
        INSTR   d4, d1, d0
        INSTR   d5, d0, d2
        INSTR   d6, d2, d0
        INSTR   d7, d1, d2
        INSTR   d8, d2, d1
        bne     .Lfdiv_tp_loop                  // flags still from subs

        // ---- epilogue: unwind in reverse order of the prologue ----
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        ldp     x19, x20, [sp], #96             // also releases the GPR frame
        ldp     q30, q31, [sp, #32]
        ldp     q28, q29, [sp], #64
        ldp     q26, q27, [sp, #32]
        ldp     q24, q25, [sp], #64
        ldp     q22, q23, [sp, #32]
        ldp     q20, q21, [sp], #64
        ldp     q18, q19, [sp, #32]
        ldp     q16, q17, [sp], #64
        ldp     q14, q15, [sp, #32]
        ldp     q12, q13, [sp], #64
        ldp     q10, q11, [sp, #32]
        ldp     q8,  q9,  [sp], #64

        ret

        .size latency, .-latency
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,10 +76,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,10 +76,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -67,10 +82,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -71,10 +86,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,10 +76,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,10 +76,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -88,10 +103,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -116,10 +131,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
68
src/NEON/fmadd-d_d_d_d-LAT.S
Normal file
68
src/NEON/fmadd-d_d_d_d-LAT.S
Normal file
@@ -0,0 +1,68 @@
|
||||
#define INSTR fmadd
|
||||
#define NINST 6
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
latency:
        // Latency kernel for scalar double-precision INSTR (fmadd).
        // In:    N (x0) = number of loop iterations; the loop counts down
        //        to zero, so N is expected to be >= 1.
        // Work:  six INSTR per iteration (matches NINST), each consuming
        //        the result of the one before it.
        // Clobb: x4, flags, v0-v2.

        // ---- prologue: spill v8-v31, four registers per 64-byte slot ----
        stp     q8,  q9,  [sp, #-64]!
        stp     q10, q11, [sp, #32]
        stp     q12, q13, [sp, #-64]!
        stp     q14, q15, [sp, #32]
        stp     q16, q17, [sp, #-64]!
        stp     q18, q19, [sp, #32]
        stp     q20, q21, [sp, #-64]!
        stp     q22, q23, [sp, #32]
        stp     q24, q25, [sp, #-64]!
        stp     q26, q27, [sp, #32]
        stp     q28, q29, [sp, #-64]!
        stp     q30, q31, [sp, #32]
        // ---- x19-x30 in one 96-byte frame below the vector area ----
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                           // x4 = iterations left

        fmov    d0, #1.00000000
        fmov    d1, #1.00000000
        fmov    d2, #1.00000000                 // not read by the loop below

.Lfmadd_d_lat_loop:
        subs    x4, x4, #1
        // d0 -> d1 -> d0 -> ... : one serial dependency chain of length six.
        .rept   3
        INSTR   d1, d0, d0, d0
        INSTR   d0, d1, d1, d1
        .endr
        bne     .Lfmadd_d_lat_loop              // flags still from subs

        // ---- epilogue: unwind in reverse order of the prologue ----
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        ldp     x19, x20, [sp], #96             // also releases the GPR frame
        ldp     q30, q31, [sp, #32]
        ldp     q28, q29, [sp], #64
        ldp     q26, q27, [sp, #32]
        ldp     q24, q25, [sp], #64
        ldp     q22, q23, [sp, #32]
        ldp     q20, q21, [sp], #64
        ldp     q18, q19, [sp, #32]
        ldp     q16, q17, [sp], #64
        ldp     q14, q15, [sp, #32]
        ldp     q12, q13, [sp], #64
        ldp     q10, q11, [sp, #32]
        ldp     q8,  q9,  [sp], #64

        ret

        .size latency, .-latency
|
68
src/NEON/fmadd-d_d_d_d-TP.S
Normal file
68
src/NEON/fmadd-d_d_d_d-TP.S
Normal file
@@ -0,0 +1,68 @@
|
||||
#define INSTR fmadd
|
||||
#define NINST 6
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
latency:
        // Throughput kernel for scalar double-precision INSTR (fmadd).
        // In:    N (x0) = number of loop iterations; the loop counts down
        //        to zero, so N is expected to be >= 1.
        // Work:  six independent INSTR per iteration (matches NINST).
        // Clobb: x4, flags, v0-v8 (v8 is restored before returning).

        // ---- prologue: spill v8-v31, four registers per 64-byte slot ----
        stp     q8,  q9,  [sp, #-64]!
        stp     q10, q11, [sp, #32]
        stp     q12, q13, [sp, #-64]!
        stp     q14, q15, [sp, #32]
        stp     q16, q17, [sp, #-64]!
        stp     q18, q19, [sp, #32]
        stp     q20, q21, [sp, #-64]!
        stp     q22, q23, [sp, #32]
        stp     q24, q25, [sp, #-64]!
        stp     q26, q27, [sp, #32]
        stp     q28, q29, [sp, #-64]!
        stp     q30, q31, [sp, #32]
        // ---- x19-x30 in one 96-byte frame below the vector area ----
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        mov     x4, N                           // x4 = iterations left

        // Loop-invariant source operands.
        fmov    d0, #1.00000000
        fmov    d1, #1.00000000
        fmov    d2, #1.00000000

.Lfmadd_d_tp_loop:
        subs    x4, x4, #1
        // Sources d0-d2 are never written inside the loop and every
        // destination is distinct, so the six operations are independent.
        INSTR   d3, d0, d0, d0
        INSTR   d4, d0, d1, d1
        INSTR   d5, d0, d2, d2
        INSTR   d6, d1, d1, d1
        INSTR   d7, d1, d2, d2
        INSTR   d8, d2, d2, d2
        bne     .Lfmadd_d_tp_loop               // flags still from subs

        // ---- epilogue: unwind in reverse order of the prologue ----
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        ldp     x19, x20, [sp], #96             // also releases the GPR frame
        ldp     q30, q31, [sp, #32]
        ldp     q28, q29, [sp], #64
        ldp     q26, q27, [sp, #32]
        ldp     q24, q25, [sp], #64
        ldp     q22, q23, [sp, #32]
        ldp     q20, q21, [sp], #64
        ldp     q18, q19, [sp, #32]
        ldp     q16, q17, [sp], #64
        ldp     q14, q15, [sp, #32]
        ldp     q12, q13, [sp], #64
        ldp     q10, q11, [sp, #32]
        ldp     q8,  q9,  [sp], #64

        ret

        .size latency, .-latency
|
68
src/NEON/fmadd-s_s_s_s-LAT.S
Normal file
68
src/NEON/fmadd-s_s_s_s-LAT.S
Normal file
@@ -0,0 +1,68 @@
|
||||
#define INSTR fmadd
|
||||
#define NINST 6
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
latency:
        // Latency kernel for scalar single-precision INSTR (fmadd).
        // In:    N (x0) = number of loop iterations; the loop counts down
        //        to zero, so N is expected to be >= 1.
        // Work:  six INSTR per iteration (matches NINST), each consuming
        //        the result of the one before it.
        // Clobb: x4, flags, v0-v2.

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                           // x4 = iterations left

        // Initialise through the S view, which is what the loop reads.
        // A double-precision 1.0 has all-zero low 32 bits, so loading it
        // via d0 would make s0 read back as 0.0 instead of 1.0.
        fmov    s0, #1.00000000
        fmov    s1, #1.00000000
        fmov    s2, #1.00000000                 // not read by the loop below
loop:
        subs    x4, x4, #1
        // s0 -> s1 -> s0 -> ... : one serial dependency chain of length six.
        INSTR   s1, s0, s0, s0
        INSTR   s0, s1, s1, s1
        INSTR   s1, s0, s0, s0
        INSTR   s0, s1, s1, s1
        INSTR   s1, s0, s0, s0
        INSTR   s0, s1, s1, s1
        bne     loop                            // flags still from subs
done:

        // pop callee-save registers from stack
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

        .size latency, .-latency
|
68
src/NEON/fmadd-s_s_s_s-TP.S
Normal file
68
src/NEON/fmadd-s_s_s_s-TP.S
Normal file
@@ -0,0 +1,68 @@
|
||||
#define INSTR fmadd
|
||||
#define NINST 6
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 2
|
||||
latency:
        // Throughput kernel for scalar single-precision INSTR (fmadd).
        // In:    N (x0) = number of loop iterations; the loop counts down
        //        to zero, so N is expected to be >= 1.
        // Work:  six independent INSTR per iteration (matches NINST).
        // Clobb: x4, flags, v0-v8 (v8 is restored before returning).

        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N                           // x4 = iterations left

        // Initialise through the S view, which is what the loop reads.
        // A double-precision 1.0 has all-zero low 32 bits, so loading it
        // via d0-d2 would make s0-s2 read back as 0.0 instead of 1.0.
        fmov    s0, #1.00000000
        fmov    s1, #1.00000000
        fmov    s2, #1.00000000
loop:
        subs    x4, x4, #1
        // Sources s0-s2 are never written inside the loop and every
        // destination is distinct, so the six operations are independent.
        INSTR   s3, s0, s0, s0
        INSTR   s4, s0, s1, s1
        INSTR   s5, s0, s2, s2
        INSTR   s6, s1, s1, s1
        INSTR   s7, s1, s2, s2
        INSTR   s8, s2, s2, s2
        bne     loop                            // flags still from subs
done:

        // pop callee-save registers from stack
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

        .size latency, .-latency
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -58,10 +73,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -58,10 +73,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
77
src/NEON/fmov-d_x-TP.S
Normal file
77
src/NEON/fmov-d_x-TP.S
Normal file
@@ -0,0 +1,77 @@
|
||||
# Kernel for fmov from a general-purpose register to a scalar double register.
# Every loop iteration issues twelve fmov instructions that write distinct
# destinations and only read x1-x3, so they do not depend on each other.
# Symbols provided: ninst (32-bit count of measured instructions per
# iteration) and latency (entry point; x0 holds the iteration count and must
# be nonzero, since the counter is decremented before it is tested).
#define INSTR fmov
#define NINST 12
#define N x0

        .data
        .globl  ninst
ninst:
        .word   NINST

        .text
        .p2align 2
        .globl  latency
        .type   latency, %function
latency:
        # Open a single 480-byte frame: x19-x30 occupy the low 96 bytes and
        # q8-q31 the remaining 384 bytes.
        sub     sp, sp, #480
        stp     x19, x20, [sp]
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]
        stp     q8,  q9,  [sp, #96]
        stp     q10, q11, [sp, #128]
        stp     q12, q13, [sp, #160]
        stp     q14, q15, [sp, #192]
        stp     q16, q17, [sp, #224]
        stp     q18, q19, [sp, #256]
        stp     q20, q21, [sp, #288]
        stp     q22, q23, [sp, #320]
        stp     q24, q25, [sp, #352]
        stp     q26, q27, [sp, #384]
        stp     q28, q29, [sp, #416]
        stp     q30, q31, [sp, #448]

        # x4 counts the remaining iterations.
        mov     x4, N

        # Constant seeds; x1-x3 are the three source registers of the loop.
        fmov    v0.2d, #1.0
        fmov    v1.2d, #1.0
        fmov    v2.2d, #1.0
        mov     x1, #1
        mov     x2, #1
        mov     x3, #1

        # Measured loop: the flags set by subs are consumed by the closing
        # branch, the fmov instructions in between leave them untouched.
1:
        subs    x4, x4, #1
        INSTR   d5,  x1
        INSTR   d6,  x2
        INSTR   d7,  x3
        INSTR   d8,  x1
        INSTR   d9,  x2
        INSTR   d10, x3
        INSTR   d11, x1
        INSTR   d12, x2
        INSTR   d13, x3
        INSTR   d14, x1
        INSTR   d15, x2
        INSTR   d16, x3
        b.ne    1b

        # Reload everything spilled on entry and release the frame.
        ldp     q30, q31, [sp, #448]
        ldp     q28, q29, [sp, #416]
        ldp     q26, q27, [sp, #384]
        ldp     q24, q25, [sp, #352]
        ldp     q22, q23, [sp, #320]
        ldp     q20, q21, [sp, #288]
        ldp     q18, q19, [sp, #256]
        ldp     q16, q17, [sp, #224]
        ldp     q14, q15, [sp, #192]
        ldp     q12, q13, [sp, #160]
        ldp     q10, q11, [sp, #128]
        ldp     q8,  q9,  [sp, #96]
        ldp     x29, x30, [sp, #80]
        ldp     x27, x28, [sp, #64]
        ldp     x25, x26, [sp, #48]
        ldp     x23, x24, [sp, #32]
        ldp     x21, x22, [sp, #16]
        ldp     x19, x20, [sp]
        add     sp, sp, #480

        ret

        .size   latency, .-latency
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -44,10 +59,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
77
src/NEON/fmov-x_d-TP.S
Normal file
77
src/NEON/fmov-x_d-TP.S
Normal file
@@ -0,0 +1,77 @@
|
||||
# Kernel for fmov from a scalar double register to a general-purpose register.
# Every loop iteration issues twelve fmov instructions that read d1-d3 and
# write twelve distinct general-purpose registers, so they do not depend on
# each other.
# Symbols provided: ninst (32-bit count of measured instructions per
# iteration) and latency (entry point; x0 holds the iteration count and must
# be nonzero, since the counter is decremented before it is tested).
#define INSTR fmov
#define NINST 12
#define N x0

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

        # push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        # x4 counts the remaining iterations; it is the one low register the
        # loop does not use as a destination.
        mov     x4, N

        # Seed exactly the three source registers the loop reads (d1-d3), so
        # no fmov consumes a value left behind by the caller. The destination
        # registers need no seed: each is written before anything reads it.
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        fmov    v3.2d, #1.00000000
loop:
        subs    x4, x4, #1
        INSTR   x1, d1
        INSTR   x2, d2
        INSTR   x3, d3
        INSTR   x5, d1
        INSTR   x6, d2
        INSTR   x7, d3
        INSTR   x8, d1
        INSTR   x9, d2
        INSTR   x10, d3
        INSTR   x11, d1
        INSTR   x12, d2
        INSTR   x13, d3
        bne     loop
done:

        # pop callee-save registers from stack, in reverse order of the pushes
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

.size latency, .-latency
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +50,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +50,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +50,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
67
src/NEON/fneg-vd_vd_vd-LAT.S
Normal file
67
src/NEON/fneg-vd_vd_vd-LAT.S
Normal file
@@ -0,0 +1,67 @@
|
||||
# Kernel for vector fneg on two double-precision lanes.
# Every loop iteration issues six fneg instructions that alternate between
# v0 and v1, each one reading the register written by the one before it, so
# the six form a single dependency chain that continues across iterations.
# Symbols provided: ninst (32-bit count of measured instructions per
# iteration) and latency (entry point; x0 holds the iteration count and must
# be nonzero, since the counter is decremented before it is tested).
#define INSTR fneg
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .word   NINST

        .text
        .p2align 2
        .globl  latency
        .type   latency, %function
latency:
        # Open a single 480-byte frame: x19-x30 occupy the low 96 bytes and
        # q8-q31 the remaining 384 bytes.
        sub     sp, sp, #480
        stp     x19, x20, [sp]
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]
        stp     q8,  q9,  [sp, #96]
        stp     q10, q11, [sp, #128]
        stp     q12, q13, [sp, #160]
        stp     q14, q15, [sp, #192]
        stp     q16, q17, [sp, #224]
        stp     q18, q19, [sp, #256]
        stp     q20, q21, [sp, #288]
        stp     q22, q23, [sp, #320]
        stp     q24, q25, [sp, #352]
        stp     q26, q27, [sp, #384]
        stp     q28, q29, [sp, #416]
        stp     q30, q31, [sp, #448]

        # x4 counts the remaining iterations.
        mov     x4, N

        # Both registers of the chain start out as 1.0 in each lane.
        fmov    v0.2d, #1.0
        fmov    v1.2d, #1.0

        # Measured loop: the flags set by subs are consumed by the closing
        # branch, the fneg instructions in between leave them untouched.
1:
        subs    x4, x4, #1
        INSTR   v0.2d, v1.2d
        INSTR   v1.2d, v0.2d
        INSTR   v0.2d, v1.2d
        INSTR   v1.2d, v0.2d
        INSTR   v0.2d, v1.2d
        INSTR   v1.2d, v0.2d
        b.ne    1b

        # Reload everything spilled on entry and release the frame.
        ldp     q30, q31, [sp, #448]
        ldp     q28, q29, [sp, #416]
        ldp     q26, q27, [sp, #384]
        ldp     q24, q25, [sp, #352]
        ldp     q22, q23, [sp, #320]
        ldp     q20, q21, [sp, #288]
        ldp     q18, q19, [sp, #256]
        ldp     q16, q17, [sp, #224]
        ldp     q14, q15, [sp, #192]
        ldp     q12, q13, [sp, #160]
        ldp     q10, q11, [sp, #128]
        ldp     q8,  q9,  [sp, #96]
        ldp     x29, x30, [sp, #80]
        ldp     x27, x28, [sp, #64]
        ldp     x25, x26, [sp, #48]
        ldp     x23, x24, [sp, #32]
        ldp     x21, x22, [sp, #16]
        ldp     x19, x20, [sp]
        add     sp, sp, #480

        ret

        .size   latency, .-latency
|
68
src/NEON/fneg-vd_vd_vd-TP.S
Normal file
68
src/NEON/fneg-vd_vd_vd-TP.S
Normal file
@@ -0,0 +1,68 @@
|
||||
# Kernel for vector fneg on two double-precision lanes.
# Every loop iteration issues six fneg instructions that read v0-v2 and write
# six distinct destinations (v3-v8), so they do not depend on each other.
# Symbols provided: ninst (32-bit count of measured instructions per
# iteration) and latency (entry point; x0 holds the iteration count and must
# be nonzero, since the counter is decremented before it is tested).
#define INSTR fneg
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .word   NINST

        .text
        .p2align 2
        .globl  latency
        .type   latency, %function
latency:
        # Open a single 480-byte frame: x19-x30 occupy the low 96 bytes and
        # q8-q31 the remaining 384 bytes.
        sub     sp, sp, #480
        stp     x19, x20, [sp]
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]
        stp     q8,  q9,  [sp, #96]
        stp     q10, q11, [sp, #128]
        stp     q12, q13, [sp, #160]
        stp     q14, q15, [sp, #192]
        stp     q16, q17, [sp, #224]
        stp     q18, q19, [sp, #256]
        stp     q20, q21, [sp, #288]
        stp     q22, q23, [sp, #320]
        stp     q24, q25, [sp, #352]
        stp     q26, q27, [sp, #384]
        stp     q28, q29, [sp, #416]
        stp     q30, q31, [sp, #448]

        # x4 counts the remaining iterations.
        mov     x4, N

        # The three source registers hold 1.0 in each lane.
        fmov    v0.2d, #1.0
        fmov    v1.2d, #1.0
        fmov    v2.2d, #1.0

        # Measured loop: the flags set by subs are consumed by the closing
        # branch, the fneg instructions in between leave them untouched.
1:
        subs    x4, x4, #1
        INSTR   v3.2d, v0.2d
        INSTR   v4.2d, v1.2d
        INSTR   v5.2d, v2.2d
        INSTR   v6.2d, v0.2d
        INSTR   v7.2d, v1.2d
        INSTR   v8.2d, v2.2d
        b.ne    1b

        # Reload everything spilled on entry and release the frame.
        ldp     q30, q31, [sp, #448]
        ldp     q28, q29, [sp, #416]
        ldp     q26, q27, [sp, #384]
        ldp     q24, q25, [sp, #352]
        ldp     q22, q23, [sp, #320]
        ldp     q20, q21, [sp, #288]
        ldp     q18, q19, [sp, #256]
        ldp     q16, q17, [sp, #224]
        ldp     q14, q15, [sp, #192]
        ldp     q12, q13, [sp, #160]
        ldp     q10, q11, [sp, #128]
        ldp     q8,  q9,  [sp, #96]
        ldp     x29, x30, [sp, #80]
        ldp     x27, x28, [sp, #64]
        ldp     x25, x26, [sp, #48]
        ldp     x23, x24, [sp, #32]
        ldp     x21, x22, [sp, #16]
        ldp     x19, x20, [sp]
        add     sp, sp, #480

        ret

        .size   latency, .-latency
|
67
src/NEON/fneg-vs_vs_vs-LAT.S
Normal file
67
src/NEON/fneg-vs_vs_vs-LAT.S
Normal file
@@ -0,0 +1,67 @@
|
||||
# Kernel for vector fneg on four single-precision lanes.
# Every loop iteration issues six fneg instructions that alternate between
# v0 and v1, each one reading the register written by the one before it, so
# the six form a single dependency chain that continues across iterations.
# Symbols provided: ninst (32-bit count of measured instructions per
# iteration) and latency (entry point; x0 holds the iteration count and must
# be nonzero, since the counter is decremented before it is tested).
#define INSTR fneg
#define NINST 6
#define N x0

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

        # push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        # x4 counts the remaining iterations.
        mov     x4, N

        # Seed both chain registers with 1.0 in every single-precision lane,
        # using the same .4s arrangement the loop operates on. A .2d seed
        # would leave the bit pattern of a double 1.0 in each pair of lanes.
        fmov    v0.4s, #1.00000000
        fmov    v1.4s, #1.00000000
loop:
        subs    x4, x4, #1
        INSTR   v0.4s, v1.4s
        INSTR   v1.4s, v0.4s
        INSTR   v0.4s, v1.4s
        INSTR   v1.4s, v0.4s
        INSTR   v0.4s, v1.4s
        INSTR   v1.4s, v0.4s
        bne     loop
done:

        # pop callee-save registers from stack, in reverse order of the pushes
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

.size latency, .-latency
|
68
src/NEON/fneg-vs_vs_vs-TP.S
Normal file
68
src/NEON/fneg-vs_vs_vs-TP.S
Normal file
@@ -0,0 +1,68 @@
|
||||
# Kernel for vector fneg on four single-precision lanes.
# Every loop iteration issues six fneg instructions that read v0-v2 and write
# six distinct destinations (v3-v8), so they do not depend on each other.
# Symbols provided: ninst (32-bit count of measured instructions per
# iteration) and latency (entry point; x0 holds the iteration count and must
# be nonzero, since the counter is decremented before it is tested).
#define INSTR fneg
#define NINST 6
#define N x0

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

        # push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        # x4 counts the remaining iterations.
        mov     x4, N

        # Seed the three source registers with 1.0 in every single-precision
        # lane, using the same .4s arrangement the loop operates on. A .2d
        # seed would leave the bit pattern of a double 1.0 in each pair of
        # lanes.
        fmov    v0.4s, #1.00000000
        fmov    v1.4s, #1.00000000
        fmov    v2.4s, #1.00000000
loop:
        subs    x4, x4, #1
        INSTR   v3.4s, v0.4s
        INSTR   v4.4s, v1.4s
        INSTR   v5.4s, v2.4s
        INSTR   v6.4s, v0.4s
        INSTR   v7.4s, v1.4s
        INSTR   v8.4s, v2.4s
        bne     loop
done:

        # pop callee-save registers from stack, in reverse order of the pushes
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

.size latency, .-latency
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,10 +76,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -67,10 +82,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -61,10 +76,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -67,10 +82,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -68,10 +83,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -66,10 +81,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -68,10 +83,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -66,10 +81,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +50,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,25 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +50,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -41,10 +55,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -53,10 +67,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -41,10 +55,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -45,10 +59,19 @@ loop:
|
||||
done:
|
||||
mov sp, x24
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -50,10 +64,19 @@ loop:
|
||||
done:
|
||||
mov sp, x24
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -39,10 +53,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -50,10 +64,19 @@ loop:
|
||||
done:
|
||||
mov sp, x24
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
82
src/NEON/ldr-q_mb-il_1_1-str-q_mb-TP.S
Normal file
82
src/NEON/ldr-q_mb-il_1_1-str-q_mb-TP.S
Normal file
@@ -0,0 +1,82 @@
|
||||
/*
 * Throughput kernel: 128-bit loads (INSTR = ldr) interleaved 1:1 with
 * 128-bit stores (str).
 *
 * Exports
 *   ninst    32-bit word holding NINST (8): one per ldr/str pair in the
 *            loop body. Presumably read by the benchmark driver to
 *            normalise its timings -- TODO confirm against the driver.
 *   latency  entry point; x0 (N) = number of loop iterations.
 *
 * Register and memory use inside the loop
 *   x4               remaining iterations
 *   [sp, STORE_GAP]  load address: the 16-byte slot holding the saved
 *                    x19/x20 pair at the base of the register save area
 *   [x24]            store address: 16-byte slot at the bottom of a scratch
 *                    area reserved STORE_GAP bytes below the save area
 *
 * The scratch area is allocated by moving sp down, so the stores never
 * touch memory below the stack pointer. Load and store addresses are
 * STORE_GAP bytes apart. Stored data is never read back, so most store
 * source registers are deliberately left uninitialised.
 *
 * Notes
 *   - N must be non-zero: with N = 0 the first subs wraps around and the
 *     bne keeps looping.
 *   - x0 is read as a 64-bit count. NOTE(review): confirm the caller passes
 *     a value whose upper 32 bits are defined.
 *   - x0 is never written, so it still holds N on return.
 */
#define INSTR ldr
#define NINST 8
#define N x0
#define STORE_GAP 192

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

        // push callee-save registers onto stack
        // (v8-v15 and x19-x30 are callee-saved; v16-v31 are saved as well)
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N
        // reserve the store scratch area on the stack; x24 is the store target
        sub     sp, sp, STORE_GAP
        mov     x24, sp

        // v2 is the first store source; v0 is unused and v1 is overwritten
        // by the first load
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        subs    x4, x4, #1
        INSTR   q1, [sp, STORE_GAP]
        str     q2, [x24]
        INSTR   q3, [sp, STORE_GAP]
        str     q5, [x24]
        INSTR   q6, [sp, STORE_GAP]
        str     q7, [x24]
        INSTR   q8, [sp, STORE_GAP]
        str     q9, [x24]
        INSTR   q10, [sp, STORE_GAP]
        str     q11, [x24]
        INSTR   q12, [sp, STORE_GAP]
        str     q13, [x24]
        INSTR   q14, [sp, STORE_GAP]
        str     q15, [x24]
        INSTR   q16, [sp, STORE_GAP]
        str     q17, [x24]
        bne     loop
done:
        // release the scratch area; sp points at the saved x19/x20 pair again
        add     sp, sp, STORE_GAP

        // pop callee-save registers from stack
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

.size latency, .-latency
|
90
src/NEON/ldr-q_mb-il_1_1_1-str-q_mb-add-x_x_x-TP.S
Normal file
90
src/NEON/ldr-q_mb-il_1_1_1-str-q_mb-add-x_x_x-TP.S
Normal file
@@ -0,0 +1,90 @@
|
||||
/*
 * Throughput kernel: 128-bit loads (INSTR = ldr), 128-bit stores (str) and
 * 64-bit integer adds interleaved 1:1:1.
 *
 * Exports
 *   ninst    32-bit word holding NINST (8): one per ldr/str/add group in
 *            the loop body. Presumably read by the benchmark driver to
 *            normalise its timings -- TODO confirm against the driver.
 *   latency  entry point; x0 (N) = number of loop iterations.
 *
 * Register and memory use inside the loop
 *   x4               remaining iterations
 *   [sp, STORE_GAP]  load address: the 16-byte slot holding the saved
 *                    x19/x20 pair at the base of the register save area
 *   [x24]            store address: 16-byte slot at the bottom of a scratch
 *                    area reserved STORE_GAP bytes below the save area
 *   x0-x3, x5-x8     each doubled once per iteration by its own add, so
 *                    the eight adds do not depend on each other
 *
 * The scratch area is allocated by moving sp down, so the stores never
 * touch memory below the stack pointer. Load and store addresses are
 * STORE_GAP bytes apart. Stored data is never read back, so most store
 * source registers are deliberately left uninitialised.
 *
 * Notes
 *   - N must be non-zero: with N = 0 the first subs wraps around and the
 *     bne keeps looping.
 *   - x0 is read as a 64-bit count. NOTE(review): confirm the caller passes
 *     a value whose upper 32 bits are defined.
 *   - The count is copied to x4 before the loop; x0 itself is doubled on
 *     every iteration and is left in that state on return.
 */
#define INSTR ldr
#define NINST 8
#define N x0
#define STORE_GAP 192

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

        // push callee-save registers onto stack
        // (v8-v15 and x19-x30 are callee-saved; v16-v31 are saved as well)
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N
        // reserve the store scratch area on the stack; x24 is the store target
        sub     sp, sp, STORE_GAP
        mov     x24, sp

        // v2 is the first store source; v0 is unused and v1 is overwritten
        // by the first load
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        subs    x4, x4, #1
        INSTR   q1, [sp, STORE_GAP]
        str     q2, [x24]
        add     x0, x0, x0
        INSTR   q3, [sp, STORE_GAP]
        str     q5, [x24]
        add     x1, x1, x1
        INSTR   q6, [sp, STORE_GAP]
        str     q7, [x24]
        add     x2, x2, x2
        INSTR   q8, [sp, STORE_GAP]
        str     q9, [x24]
        add     x3, x3, x3
        INSTR   q10, [sp, STORE_GAP]
        str     q11, [x24]
        add     x5, x5, x5
        INSTR   q12, [sp, STORE_GAP]
        str     q13, [x24]
        add     x6, x6, x6
        INSTR   q14, [sp, STORE_GAP]
        str     q15, [x24]
        add     x7, x7, x7
        INSTR   q16, [sp, STORE_GAP]
        str     q17, [x24]
        add     x8, x8, x8
        bne     loop
done:
        // release the scratch area; sp points at the saved x19/x20 pair again
        add     sp, sp, STORE_GAP

        // pop callee-save registers from stack
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

.size latency, .-latency
|
83
src/NEON/ldr-q_mb-il_2_1-str-q_mb-TP.S
Normal file
83
src/NEON/ldr-q_mb-il_2_1-str-q_mb-TP.S
Normal file
@@ -0,0 +1,83 @@
|
||||
/*
 * Throughput kernel: 128-bit loads (INSTR = ldr) interleaved 2:1 with
 * 128-bit stores (str).
 *
 * Exports
 *   ninst    32-bit word holding NINST (6): one per ldr/ldr/str group in
 *            the loop body. Presumably read by the benchmark driver to
 *            normalise its timings -- TODO confirm against the driver.
 *   latency  entry point; x0 (N) = number of loop iterations.
 *
 * Register and memory use inside the loop
 *   x4               remaining iterations
 *   [sp, STORE_GAP]  load address: the 16-byte slot holding the saved
 *                    x19/x20 pair at the base of the register save area
 *   [x24]            store address: 16-byte slot at the bottom of a scratch
 *                    area reserved STORE_GAP bytes below the save area
 *
 * The scratch area is allocated by moving sp down, so the stores never
 * touch memory below the stack pointer. Load and store addresses are
 * STORE_GAP bytes apart. Stored data is never read back, so the store
 * source registers are deliberately left uninitialised.
 *
 * Notes
 *   - N must be non-zero: with N = 0 the first subs wraps around and the
 *     bne keeps looping.
 *   - x0 is read as a 64-bit count. NOTE(review): confirm the caller passes
 *     a value whose upper 32 bits are defined.
 *   - x0 is never written, so it still holds N on return.
 */
#define INSTR ldr
#define NINST 6
#define N x0
#define STORE_GAP 192

.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 2
latency:

        // push callee-save registers onto stack
        // (v8-v15 and x19-x30 are callee-saved; v16-v31 are saved as well)
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
        stp     x19, x20, [sp, -96]!
        stp     x21, x22, [sp, 16]
        stp     x23, x24, [sp, 32]
        stp     x25, x26, [sp, 48]
        stp     x27, x28, [sp, 64]
        stp     x29, x30, [sp, 80]

        mov     x4, N
        // reserve the store scratch area on the stack; x24 is the store target
        sub     sp, sp, STORE_GAP
        mov     x24, sp

        // v0 is unused; v1 and v2 are overwritten by the first two loads
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        subs    x4, x4, #1
        INSTR   q1, [sp, STORE_GAP]
        INSTR   q2, [sp, STORE_GAP]
        str     q3, [x24]
        INSTR   q5, [sp, STORE_GAP]
        INSTR   q6, [sp, STORE_GAP]
        str     q7, [x24]
        INSTR   q8, [sp, STORE_GAP]
        INSTR   q9, [sp, STORE_GAP]
        str     q10, [x24]
        INSTR   q11, [sp, STORE_GAP]
        INSTR   q12, [sp, STORE_GAP]
        str     q13, [x24]
        INSTR   q14, [sp, STORE_GAP]
        INSTR   q15, [sp, STORE_GAP]
        str     q16, [x24]
        INSTR   q17, [sp, STORE_GAP]
        INSTR   q18, [sp, STORE_GAP]
        str     q28, [x24]
        bne     loop
done:
        // release the scratch area; sp points at the saved x19/x20 pair again
        add     sp, sp, STORE_GAP

        // pop callee-save registers from stack
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, 16]
        ldp     x23, x24, [sp, 32]
        ldp     x25, x26, [sp, 48]
        ldp     x27, x28, [sp, 64]
        ldp     x29, x30, [sp, 80]
        add     sp, sp, #96
        ld1     {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
        ld1     {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
        ld1     {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
        ld1     {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64

        ret

.size latency, .-latency
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -98,10 +112,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -43,10 +57,19 @@ loop:
|
||||
done:
|
||||
mov sp, x24
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#define INSTR ldr
|
||||
#define NINST 10
|
||||
#define NINST 18
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
@@ -13,47 +13,76 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
fmov v0.2d, #1.00000000
|
||||
fmov v1.2d, #1.00000000
|
||||
fmov v2.2d, #1.00000000
|
||||
mov x24, sp
|
||||
|
||||
mov x16, sp
|
||||
add x16, x16, #32
|
||||
mov x25, sp
|
||||
add x25, x25, #64
|
||||
mov x27, sp
|
||||
sub x27, x27, #32
|
||||
mov x28, sp
|
||||
sub x28, x28, #64
|
||||
add x11, sp, #32
|
||||
add x12, sp, #64
|
||||
add x13, sp, #96
|
||||
add x14, sp, #128
|
||||
add x15, sp, #160
|
||||
add x16, sp, #192
|
||||
add x17, sp, #224
|
||||
add x18, sp, #256
|
||||
|
||||
loop:
|
||||
subs x4, x4, #1
|
||||
INSTR q0, [sp], #64
|
||||
INSTR q1, [x25], #64
|
||||
INSTR q2, [x27], #64
|
||||
INSTR q3, [x28], #64
|
||||
INSTR q4, [x16], #64
|
||||
INSTR q5, [sp], #-64
|
||||
INSTR q6, [x25], #-64
|
||||
INSTR q7, [x27], #-64
|
||||
INSTR q8, [x28], #-64
|
||||
INSTR q9, [x16], #-64
|
||||
|
||||
INSTR q0, [sp], #16
|
||||
INSTR q1, [x11], #16
|
||||
INSTR q2, [x12], #16
|
||||
INSTR q3, [x13], #16
|
||||
INSTR q4, [x14], #16
|
||||
INSTR q5, [x15], #16
|
||||
INSTR q6, [x16], #16
|
||||
INSTR q7, [x17], #16
|
||||
INSTR q8, [x18], #16
|
||||
INSTR q9, [sp], #-16
|
||||
INSTR q10, [x11], #-16
|
||||
INSTR q11, [x12], #-16
|
||||
INSTR q12, [x13], #-16
|
||||
INSTR q13, [x14], #-16
|
||||
INSTR q14, [x15], #-16
|
||||
INSTR q15, [x16], #-16
|
||||
INSTR q16, [x17], #-16
|
||||
INSTR q17, [x18], #-16
|
||||
bne loop
|
||||
done:
|
||||
mov sp, x24
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -38,10 +52,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
mov x24, sp
|
||||
@@ -48,10 +62,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
mov x24, sp
|
||||
@@ -50,10 +64,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
67
src/NEON/mla-vd_vd_vd-LAT.S
Normal file
67
src/NEON/mla-vd_vd_vd-LAT.S
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
 * Latency kernel for the AArch64 Advanced SIMD integer multiply-accumulate
 * instruction MLA, using the .2s arrangement (two 32-bit lanes).
 *
 * Exported symbols:
 *   ninst    32-bit word in .data holding NINST, the number of INSTR copies
 *            executed per loop trip.
 *   latency  routine; the trip count is passed in x0 (macro N).
 *
 * Every INSTR copy reads and writes v0, so the copies form a single
 * dependency chain.
 *
 * Registers written by the routine: x4 (trip counter), v0, v1 and the flags.
 * v8-v31 and x19-x30 are spilled on entry and reloaded before returning.
 * A trip count of zero returns without executing any INSTR.
 */
#define INSTR mla
#define NINST 6
#define N     x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        # Spill v8-v31 in six 64-byte groups, then x19-x30 in one 96-byte
        # area; sp stays a multiple of 16 after every adjustment.
        sub     sp, sp, #64
        st1     {v8.2d-v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d-v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d-v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d-v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d-v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d-v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        # Trip counter lives in x4 so that x0 is left untouched.
        mov     x4, N
        # Guard: with a count of zero the subs below would wrap the counter
        # and the loop would run for 2^64 trips instead of none.
        cbz     x4, done

        # Give both operands a defined bit pattern. MLA treats the lanes as
        # integers, so the floating-point meaning of the constant is unused.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000

loop:
        # subs sets the flags tested by bne; INSTR (mla) does not alter them.
        subs    x4, x4, #1
        # Unroll count is taken from NINST so the exported ninst value and
        # the loop body cannot drift apart.
        .rept   NINST
        INSTR   v0.2s, v0.2s, v1.2s
        .endr
        bne     loop

done:
        # Reload in the reverse order of the spills above.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d-v31.2d}, [sp], #64
        ld1     {v24.2d-v27.2d}, [sp], #64
        ld1     {v20.2d-v23.2d}, [sp], #64
        ld1     {v16.2d-v19.2d}, [sp], #64
        ld1     {v12.2d-v15.2d}, [sp], #64
        ld1     {v8.2d-v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
68
src/NEON/mla-vd_vd_vd-TP.S
Normal file
68
src/NEON/mla-vd_vd_vd-TP.S
Normal file
@@ -0,0 +1,68 @@
|
||||
/*
 * Throughput kernel for the AArch64 Advanced SIMD integer multiply-accumulate
 * instruction MLA, using the .2s arrangement (two 32-bit lanes).
 *
 * Exported symbols:
 *   ninst    32-bit word in .data holding NINST, the number of INSTR copies
 *            executed per loop trip.
 *   latency  routine; the trip count is passed in x0 (macro N). The entry
 *            point carries the same name as in the latency kernels.
 *
 * Registers written by the routine: x4 (trip counter), v0-v8 and the flags.
 * v8-v31 and x19-x30 are spilled on entry and reloaded before returning, so
 * the write to v8 in the loop is undone by the epilogue.
 * A trip count of zero returns without executing any INSTR.
 */
#define INSTR mla
#define NINST 6
#define N     x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        # Spill v8-v31 in six 64-byte groups, then x19-x30 in one 96-byte
        # area; sp stays a multiple of 16 after every adjustment.
        sub     sp, sp, #64
        st1     {v8.2d-v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d-v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d-v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d-v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d-v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d-v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        # Trip counter lives in x4 so that x0 is left untouched.
        mov     x4, N
        # Guard: with a count of zero the subs below would wrap the counter
        # and the loop would run for 2^64 trips instead of none.
        cbz     x4, done

        # Give the three source operands a defined bit pattern. MLA treats
        # the lanes as integers, so the floating-point meaning is unused.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        # subs sets the flags tested by bne; INSTR (mla) does not alter them.
        subs    x4, x4, #1
        # Sources are only v0-v2, which the loop never writes; each copy
        # targets its own destination register (v3-v8). The destinations are
        # not initialised before the loop.
        # NOTE(review): MLA also reads its destination as the accumulator, so
        # each of the six destinations still carries a dependency from one
        # trip to the next. Confirm that six accumulators are enough to hide
        # the instruction latency on the cores being measured.
        INSTR   v3.2s, v0.2s, v0.2s
        INSTR   v4.2s, v0.2s, v1.2s
        INSTR   v5.2s, v0.2s, v2.2s
        INSTR   v6.2s, v1.2s, v1.2s
        INSTR   v7.2s, v1.2s, v2.2s
        INSTR   v8.2s, v2.2s, v2.2s
        bne     loop

done:
        # Reload in the reverse order of the spills above.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d-v31.2d}, [sp], #64
        ld1     {v24.2d-v27.2d}, [sp], #64
        ld1     {v20.2d-v23.2d}, [sp], #64
        ld1     {v16.2d-v19.2d}, [sp], #64
        ld1     {v12.2d-v15.2d}, [sp], #64
        ld1     {v8.2d-v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
67
src/NEON/mla-vs_vs_vs-LAT.S
Normal file
67
src/NEON/mla-vs_vs_vs-LAT.S
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
 * Latency kernel for the AArch64 Advanced SIMD integer multiply-accumulate
 * instruction MLA, using the .4s arrangement (four 32-bit lanes).
 *
 * Exported symbols:
 *   ninst    32-bit word in .data holding NINST, the number of INSTR copies
 *            executed per loop trip.
 *   latency  routine; the trip count is passed in x0 (macro N).
 *
 * Every INSTR copy reads and writes v0, so the copies form a single
 * dependency chain.
 *
 * Registers written by the routine: x4 (trip counter), v0, v1 and the flags.
 * v8-v31 and x19-x30 are spilled on entry and reloaded before returning.
 * A trip count of zero returns without executing any INSTR.
 */
#define INSTR mla
#define NINST 6
#define N     x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        # Spill v8-v31 in six 64-byte groups, then x19-x30 in one 96-byte
        # area; sp stays a multiple of 16 after every adjustment.
        sub     sp, sp, #64
        st1     {v8.2d-v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d-v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d-v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d-v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d-v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d-v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        # Trip counter lives in x4 so that x0 is left untouched.
        mov     x4, N
        # Guard: with a count of zero the subs below would wrap the counter
        # and the loop would run for 2^64 trips instead of none.
        cbz     x4, done

        # Give both operands a defined bit pattern. MLA treats the lanes as
        # integers, so the floating-point meaning of the constant is unused.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000

loop:
        # subs sets the flags tested by bne; INSTR (mla) does not alter them.
        subs    x4, x4, #1
        # Unroll count is taken from NINST so the exported ninst value and
        # the loop body cannot drift apart.
        .rept   NINST
        INSTR   v0.4s, v0.4s, v1.4s
        .endr
        bne     loop

done:
        # Reload in the reverse order of the spills above.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d-v31.2d}, [sp], #64
        ld1     {v24.2d-v27.2d}, [sp], #64
        ld1     {v20.2d-v23.2d}, [sp], #64
        ld1     {v16.2d-v19.2d}, [sp], #64
        ld1     {v12.2d-v15.2d}, [sp], #64
        ld1     {v8.2d-v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
68
src/NEON/mla-vs_vs_vs-TP.S
Normal file
68
src/NEON/mla-vs_vs_vs-TP.S
Normal file
@@ -0,0 +1,68 @@
|
||||
/*
 * Throughput kernel for the AArch64 Advanced SIMD integer multiply-accumulate
 * instruction MLA, using the .4s arrangement (four 32-bit lanes).
 *
 * Exported symbols:
 *   ninst    32-bit word in .data holding NINST, the number of INSTR copies
 *            executed per loop trip.
 *   latency  routine; the trip count is passed in x0 (macro N). The entry
 *            point carries the same name as in the latency kernels.
 *
 * Registers written by the routine: x4 (trip counter), v0-v8 and the flags.
 * v8-v31 and x19-x30 are spilled on entry and reloaded before returning, so
 * the write to v8 in the loop is undone by the epilogue.
 * A trip count of zero returns without executing any INSTR.
 */
#define INSTR mla
#define NINST 6
#define N     x0

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        # Spill v8-v31 in six 64-byte groups, then x19-x30 in one 96-byte
        # area; sp stays a multiple of 16 after every adjustment.
        sub     sp, sp, #64
        st1     {v8.2d-v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d-v15.2d}, [sp]
        sub     sp, sp, #64
        st1     {v16.2d-v19.2d}, [sp]
        sub     sp, sp, #64
        st1     {v20.2d-v23.2d}, [sp]
        sub     sp, sp, #64
        st1     {v24.2d-v27.2d}, [sp]
        sub     sp, sp, #64
        st1     {v28.2d-v31.2d}, [sp]
        stp     x19, x20, [sp, #-96]!
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     x25, x26, [sp, #48]
        stp     x27, x28, [sp, #64]
        stp     x29, x30, [sp, #80]

        # Trip counter lives in x4 so that x0 is left untouched.
        mov     x4, N
        # Guard: with a count of zero the subs below would wrap the counter
        # and the loop would run for 2^64 trips instead of none.
        cbz     x4, done

        # Give the three source operands a defined bit pattern. MLA treats
        # the lanes as integers, so the floating-point meaning is unused.
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

loop:
        # subs sets the flags tested by bne; INSTR (mla) does not alter them.
        subs    x4, x4, #1
        # Sources are only v0-v2, which the loop never writes; each copy
        # targets its own destination register (v3-v8). The destinations are
        # not initialised before the loop.
        # NOTE(review): MLA also reads its destination as the accumulator, so
        # each of the six destinations still carries a dependency from one
        # trip to the next. Confirm that six accumulators are enough to hide
        # the instruction latency on the cores being measured.
        INSTR   v3.4s, v0.4s, v0.4s
        INSTR   v4.4s, v0.4s, v1.4s
        INSTR   v5.4s, v0.4s, v2.4s
        INSTR   v6.4s, v1.4s, v1.4s
        INSTR   v7.4s, v1.4s, v2.4s
        INSTR   v8.4s, v2.4s, v2.4s
        bne     loop

done:
        # Reload in the reverse order of the spills above.
        ldp     x19, x20, [sp]
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     x25, x26, [sp, #48]
        ldp     x27, x28, [sp, #64]
        ldp     x29, x30, [sp, #80]
        add     sp, sp, #96
        ld1     {v28.2d-v31.2d}, [sp], #64
        ld1     {v24.2d-v27.2d}, [sp], #64
        ld1     {v20.2d-v23.2d}, [sp], #64
        ld1     {v16.2d-v19.2d}, [sp], #64
        ld1     {v12.2d-v15.2d}, [sp], #64
        ld1     {v8.2d-v11.2d}, [sp], #64

        ret

        .size   latency, .-latency
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +48,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -35,10 +49,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -34,10 +48,19 @@ loop:
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#define INSTR mov
|
||||
#define NINST 6
|
||||
#define NINST 16
|
||||
#define N x0
|
||||
|
||||
.globl ninst
|
||||
@@ -13,10 +13,24 @@ ninst:
|
||||
latency:
|
||||
|
||||
# push callee-save registers onto stack
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp]
|
||||
sub sp, sp, #64
|
||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp]
|
||||
stp x19, x20, [sp, -96]!
|
||||
stp x21, x22, [sp, 16]
|
||||
stp x23, x24, [sp, 32]
|
||||
stp x25, x26, [sp, 48]
|
||||
stp x27, x28, [sp, 64]
|
||||
stp x29, x30, [sp, 80]
|
||||
|
||||
mov x4, N
|
||||
|
||||
@@ -26,19 +40,38 @@ latency:
|
||||
loop:
|
||||
subs x4, x4, #1
|
||||
INSTR x3, x0
|
||||
INSTR x9, x1
|
||||
INSTR x5, x2
|
||||
INSTR x6, x1
|
||||
INSTR x7, x2
|
||||
INSTR x8, x2
|
||||
INSTR x5, x1
|
||||
INSTR x6, x2
|
||||
INSTR x7, x0
|
||||
INSTR x8, x1
|
||||
INSTR x9, x2
|
||||
INSTR x10, x0
|
||||
INSTR x11, x1
|
||||
INSTR x12, x2
|
||||
INSTR x13, x0
|
||||
INSTR x14, x1
|
||||
INSTR x15, x2
|
||||
INSTR x16, x0
|
||||
INSTR x17, x1
|
||||
INSTR x18, x2
|
||||
INSTR x19, x2
|
||||
bne loop
|
||||
done:
|
||||
|
||||
# pop callee-save registers from stack
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
|
||||
add sp, sp, #64
|
||||
ldp x19, x20, [sp]
|
||||
ldp x21, x22, [sp, 16]
|
||||
ldp x23, x24, [sp, 32]
|
||||
ldp x25, x26, [sp, 48]
|
||||
ldp x27, x28, [sp, 64]
|
||||
ldp x29, x30, [sp, 80]
|
||||
add sp, sp, #96
|
||||
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64
|
||||
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64
|
||||
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64
|
||||
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
|
||||
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
|
||||
|
||||
ret
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user