more benchmarks

This commit is contained in:
JanLJL
2020-07-11 12:57:35 +00:00
parent d9e607d25c
commit 686f44be7a
13 changed files with 910 additions and 0 deletions

View File

@@ -0,0 +1,63 @@
#define INSTR ldp
#define NINST 12
#define N x0

// Benchmark kernel: every loop iteration issues NINST pair loads (INSTR = ldp)
// of two D registers from [sp, #-256], each followed by an X-register ldr
// from [sp].
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x0-x3, x5-x12, x4 (loop counter), v0-v3, v5-v7, v16-v25, flags.
//        v8-v15 are overwritten inside the loop, which is why they are
//        spilled on entry and reloaded on exit.

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .p2align 2

// One benchmark group: pair load into fa/fb, then a plain load into xr.
.macro  ldp_then_ldr fa, fb, xr
        INSTR   \fa, \fb, [sp, #-256]
        ldr     \xr, [sp]
.endm

latency:
        // Spill v8-v15 in two 64-byte pushes.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]

        mov     x4, N                   // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

.Lrepeat:
        subs    x4, x4, #1              // flags consumed by the bne below
        ldp_then_ldr d24, d1,  x0
        ldp_then_ldr d2,  d3,  x1
        ldp_then_ldr d25, d5,  x2
        ldp_then_ldr d6,  d7,  x3
        ldp_then_ldr d8,  d9,  x5
        ldp_then_ldr d10, d11, x6
        ldp_then_ldr d12, d13, x7
        ldp_then_ldr d14, d15, x8
        ldp_then_ldr d16, d17, x9
        ldp_then_ldr d18, d19, x10
        ldp_then_ldr d20, d21, x11
        ldp_then_ldr d22, d23, x12
        bne     .Lrepeat

        // Reload v8-v15 in reverse push order.
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

60
src/NEON/ldr-q_mb-TP.S Normal file
View File

@@ -0,0 +1,60 @@
#define INSTR ldr
#define NINST 10
#define N x0

// Benchmark kernel: every loop iteration issues NINST independent 128-bit
// loads (INSTR = ldr q) from five addresses spaced around sp
// (sp, sp+64, sp-32, sp-64, sp+32), each address used twice.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x4 (loop counter), x9-x12 (load addresses), v0-v7, flags.
//        v8 and v9 are overwritten in the loop; v8-v15 are spilled on entry
//        and reloaded on exit.
//
// The address registers are caller-saved temporaries. The previous version
// used x24, x25, x27 and x28, which AAPCS64 makes callee-saved, without
// preserving them.

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]

        mov     x4, N                   // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

        // Load addresses; sp itself is the fifth one.
        add     x9,  sp, #64
        sub     x10, sp, #32
        sub     x11, sp, #64
        add     x12, sp, #32

.Lloop:
        subs    x4, x4, #1              // flags consumed by the bne below
        INSTR   q0, [sp]
        INSTR   q1, [x9]
        INSTR   q2, [x10]
        INSTR   q3, [x11]
        INSTR   q4, [x12]
        INSTR   q5, [sp]
        INSTR   q6, [x9]
        INSTR   q7, [x10]
        INSTR   q8, [x11]
        INSTR   q9, [x12]
        bne     .Lloop

        // sp is never modified between prologue and here, so it can be used
        // directly to pop the callee-save registers.
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,58 @@
#define INSTR ldr
#define NINST 8
#define N x0

// Benchmark kernel: every loop iteration issues NINST 64-bit loads
// (INSTR = ldr x) from [sp], each followed by a 64-bit store to a second
// address 192 bytes below sp.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x0 (store address), x4 (loop counter), load destinations
//        x1, x3, x6, x8, x10, x12, x14, x16, v0-v2, flags,
//        and 8 bytes of memory at sp-192 (relative to the sp in the loop).
//        The stored registers are only read.
//
// The store address lives in x0, which is free once N has been copied to
// x4. The previous version kept it in x24, which AAPCS64 makes callee-saved,
// without preserving it.

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]

        mov     x4, N                   // x4 = remaining iterations
        sub     x0, sp, #192            // x0 = store target (N no longer needed)
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

.Lloop:
        subs    x4, x4, #1              // flags consumed by the bne below
        INSTR   x1, [sp]
        str     x2, [x0]
        INSTR   x3, [sp]
        str     x5, [x0]
        INSTR   x6, [sp]
        str     x7, [x0]
        INSTR   x8, [sp]
        str     x9, [x0]
        INSTR   x10, [sp]
        str     x11, [x0]
        INSTR   x12, [sp]
        str     x13, [x0]
        INSTR   x14, [sp]
        str     x15, [x0]
        INSTR   x16, [sp]
        str     x17, [x0]
        bne     .Lloop

        // pop callee-save registers from stack
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,60 @@
#define INSTR ldr
#define NINST 6
#define N x0

// Benchmark kernel: every loop iteration runs six groups of two 64-bit loads
// (INSTR = ldr x) from [sp] followed by one 64-bit store to a second address
// 192 bytes below sp.
//
// NOTE(review): the loop holds 12 INSTR loads while NINST is 6; presumably
// NINST counts the load-load-store groups. Confirm against the harness that
// reads ninst.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x0 (store address), x4 (loop counter), load destinations
//        x1, x2, x5, x6, x8, x9, x11, x12, x14, x15, x17, x18, v0-v2, flags,
//        and 8 bytes of memory at sp-192 (relative to the sp in the loop).
//        The stored registers (x3, x7, x10, x13, x16, x28) are only read.
//
// NOTE(review): x18 is written here. It is the platform register and is
// reserved on some operating systems; no other caller-saved register is
// free in this kernel, so it is left as is.
//
// The store address lives in x0, which is free once N has been copied to
// x4. The previous version kept it in x24, which AAPCS64 makes callee-saved,
// without preserving it.

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]

        mov     x4, N                   // x4 = remaining iterations
        sub     x0, sp, #192            // x0 = store target (N no longer needed)
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

.Lloop:
        subs    x4, x4, #1              // flags consumed by the bne below
        INSTR   x1, [sp]
        INSTR   x2, [sp]
        str     x3, [x0]
        INSTR   x5, [sp]
        INSTR   x6, [sp]
        str     x7, [x0]
        INSTR   x8, [sp]
        INSTR   x9, [sp]
        str     x10, [x0]
        INSTR   x11, [sp]
        INSTR   x12, [sp]
        str     x13, [x0]
        INSTR   x14, [sp]
        INSTR   x15, [sp]
        str     x16, [x0]
        INSTR   x17, [sp]
        INSTR   x18, [sp]
        str     x28, [x0]
        bne     .Lloop

        // pop callee-save registers from stack
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,63 @@
#define INSTR stp
#define NINST 12
#define N x0

// Benchmark kernel: every loop iteration issues NINST pair stores
// (INSTR = stp) of two D registers to distinct negative offsets from sp,
// each followed by an X-register ldr from [sp].
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x0-x3, x5-x12, x4 (loop counter), v0-v2, flags, and memory between
//        sp-448 and sp-17. The D registers are only read by the stores.

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .p2align 2

// One benchmark group: pair store of fa/fb at sp+off, then a load into xr.
.macro  stp_then_ldr fa, fb, off, xr
        INSTR   \fa, \fb, [sp, #\off]
        ldr     \xr, [sp]
.endm

latency:
        // Spill v8-v15 in two 64-byte pushes.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]

        mov     x4, N                   // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

.Lrepeat:
        subs    x4, x4, #1              // flags consumed by the bne below
        stp_then_ldr d0,  d1,  -64,  x0
        stp_then_ldr d2,  d3,  -128, x1
        stp_then_ldr d4,  d5,  -192, x2
        stp_then_ldr d6,  d7,  -256, x3
        stp_then_ldr d8,  d9,  -320, x5
        stp_then_ldr d10, d11, -384, x6
        stp_then_ldr d12, d13, -448, x7
        stp_then_ldr d14, d15, -32,  x8
        stp_then_ldr d16, d17, -96,  x9
        stp_then_ldr d18, d19, -160, x10
        stp_then_ldr d20, d21, -224, x11
        stp_then_ldr d22, d23, -288, x12
        bne     .Lrepeat

        // Reload v8-v15 in reverse push order.
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

108
src/NEON/str-d_mb-TP.S Normal file
View File

@@ -0,0 +1,108 @@
#define INSTR str
#define NINST 64
#define N x0

// Benchmark kernel: every loop iteration issues NINST 64-bit FP stores
// (INSTR = str d) to one address, cycling through the source registers
// d2, d6, d8, d10, d12, d14, d16, d18.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x4 (loop counter), v0-v2, flags. The stored D registers are only
//        read.
//
// The stores target a dedicated 64-byte scratch slot. The previous version
// stored to [sp] while sp pointed at the spilled copy of v12, so the epilogue
// reloaded overwritten data into the callee-saved low half of v12. It also
// loaded constants into x24, x25, x27 and x28 (callee-saved under AAPCS64)
// that were never used; those moves are gone.

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // Scratch slot for the benchmark stores; 64 bytes keeps sp at the
        // same offset within a 64-byte block as before.
        sub     sp, sp, #64

        mov     x4, N                   // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

.Lloop:
        subs    x4, x4, #1              // flags consumed by the bne below
        // 8 repetitions x 8 stores = NINST stores
        .rept   8
        INSTR   d2, [sp]
        INSTR   d6, [sp]
        INSTR   d8, [sp]
        INSTR   d10, [sp]
        INSTR   d12, [sp]
        INSTR   d14, [sp]
        INSTR   d16, [sp]
        INSTR   d18, [sp]
        .endr
        bne     .Lloop

        add     sp, sp, #64             // drop the scratch slot
        // pop callee-save registers from stack
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

59
src/NEON/str-d_mbp-TP.S Normal file
View File

@@ -0,0 +1,59 @@
#define INSTR str
#define NINST 8
#define N x0

// Benchmark kernel: every loop iteration issues NINST 64-bit FP stores
// (INSTR = str d) with post-index writeback through four address registers.
// Each register is stepped +64 by its first store and back -64 by its second,
// so all four return to their start value at the end of an iteration.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x4 (loop counter), x11-x14 (store addresses), v0-v2, flags.
//        d8 and d9 are only read, so v8-v15 need no spill here.
//
// All stores land inside a private FRAME-byte stack frame. The previous
// version stored at sp+128 .. sp+263, which is memory owned by the caller,
// and kept its addresses in x24, x25, x27 and x28 (callee-saved under
// AAPCS64) without preserving them. The distances between the four start
// addresses are unchanged: with B = sp + 320 they are B+192, B-256, B-320
// and B+128, exactly the old offsets from the old sp. FRAME and 320 are both
// multiples of 64, so each address keeps its offset within a 64-byte block.

#define FRAME 640

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        sub     sp, sp, #FRAME          // private scratch area for the stores

        mov     x4, N                   // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

        add     x11, sp, #512           // B+192 (was x25)
        add     x12, sp, #64            // B-256 (was x27)
        mov     x13, sp                 // B-320 (was x28)
        add     x14, sp, #448           // B+128 (was x16)

.Lloop:
        subs    x4, x4, #1              // flags consumed by the bne below
        INSTR   d1, [x11], #64
        INSTR   d2, [x12], #64
        INSTR   d3, [x13], #64
        INSTR   d4, [x14], #64
        INSTR   d6, [x11], #-64
        INSTR   d7, [x12], #-64
        INSTR   d8, [x13], #-64
        INSTR   d9, [x14], #-64
        bne     .Lloop

        add     sp, sp, #FRAME
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,60 @@
#define INSTR str
#define NINST 6
#define N x0

// Benchmark kernel: every loop iteration runs six groups of two 64-bit
// stores (INSTR = str x) to [sp] followed by one 64-bit load from a second
// address 192 bytes below sp.
//
// NOTE(review): the loop holds 12 INSTR stores while NINST is 6; presumably
// NINST counts the store-store-load groups. Confirm against the harness that
// reads ninst.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x4 (loop counter), load destinations x3, x7, x10, x13, x16,
//        v0-v2, flags. x24 and x28 are used but saved and restored.
//        The stored registers are only read.
//
// Two defects of the previous version are fixed here:
//  * it stored to [sp] while sp pointed at the spilled copy of v12, so the
//    epilogue reloaded overwritten data into the callee-saved low half of
//    v12; the stores now hit a dedicated scratch slot;
//  * it overwrote x24 and x28, which AAPCS64 makes callee-saved, without
//    preserving them; both are now kept in the scratch slot at sp+16.

        .globl  ninst
        .data
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // Scratch slot: bytes 0-7 take the benchmark stores, bytes 16-31
        // hold the saved x24/x28. 64 bytes keeps sp at the same offset
        // within a 64-byte block as before.
        sub     sp, sp, #64
        stp     x24, x28, [sp, #16]

        mov     x4, N                   // x4 = remaining iterations
        sub     x24, sp, #192           // x24 = load address
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

.Lloop:
        subs    x4, x4, #1              // flags consumed by the bne below
        INSTR   x1, [sp]
        INSTR   x2, [sp]
        ldr     x3, [x24]
        INSTR   x5, [sp]
        INSTR   x6, [sp]
        ldr     x7, [x24]
        INSTR   x8, [sp]
        INSTR   x9, [sp]
        ldr     x10, [x24]
        INSTR   x11, [sp]
        INSTR   x12, [sp]
        ldr     x13, [x24]
        INSTR   x14, [sp]
        INSTR   x15, [sp]
        ldr     x16, [x24]
        INSTR   x17, [sp]
        INSTR   x18, [sp]
        ldr     x28, [x24]
        bne     .Lloop

        ldp     x24, x28, [sp, #16]     // restore callee-saved x24/x28
        add     sp, sp, #64             // drop the scratch slot
        // pop callee-save registers from stack
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

108
src/NEON/stur-d_mb-TP.S Normal file
View File

@@ -0,0 +1,108 @@
#define INSTR stur
#define NINST 64
#define N x0

// Benchmark kernel: every loop iteration issues NINST 64-bit FP stores
// (INSTR = stur d) to one address, cycling through the source registers
// d2, d6, d8, d10, d12, d14, d16, d18.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x4 (loop counter), v0-v2, flags. The stored D registers are only
//        read.
//
// The stores target a dedicated 64-byte scratch slot. The previous version
// stored to [sp] while sp pointed at the spilled copy of v12, so the epilogue
// reloaded overwritten data into the callee-saved low half of v12. It also
// loaded constants into x24, x25, x27 and x28 (callee-saved under AAPCS64)
// that were never used; those moves are gone.

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // Scratch slot for the benchmark stores; 64 bytes keeps sp at the
        // same offset within a 64-byte block as before.
        sub     sp, sp, #64

        mov     x4, N                   // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

.Lloop:
        subs    x4, x4, #1              // flags consumed by the bne below
        // 8 repetitions x 8 stores = NINST stores
        .rept   8
        INSTR   d2, [sp]
        INSTR   d6, [sp]
        INSTR   d8, [sp]
        INSTR   d10, [sp]
        INSTR   d12, [sp]
        INSTR   d14, [sp]
        INSTR   d16, [sp]
        INSTR   d18, [sp]
        .endr
        bne     .Lloop

        add     sp, sp, #64             // drop the scratch slot
        // pop callee-save registers from stack
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

60
src/NEON/stur-q_mb-TP.S Normal file
View File

@@ -0,0 +1,60 @@
#define INSTR stur
#define NINST 16
#define N x0

// Benchmark kernel: every loop iteration issues NINST 128-bit stores
// (INSTR = stur q) of sixteen different Q registers to one address.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x4 (loop counter), v0-v2, flags. The stored Q registers are only
//        read.
//
// The stores target a dedicated 64-byte scratch slot. The previous version
// stored 16 bytes to [sp] while sp pointed at the spilled copy of v12, so
// the epilogue reloaded overwritten data into callee-saved v12. It also
// loaded constants into x24, x25, x27 and x28 (callee-saved under AAPCS64)
// that were never used; those moves are gone.

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .align  2
latency:
        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // Scratch slot for the benchmark stores; 64 bytes keeps sp at the
        // same offset within a 64-byte block as before.
        sub     sp, sp, #64

        mov     x4, N                   // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

.Lloop:
        subs    x4, x4, #1              // flags consumed by the bne below
        INSTR   q1, [sp]
        INSTR   q2, [sp]
        INSTR   q3, [sp]
        INSTR   q4, [sp]
        INSTR   q5, [sp]
        INSTR   q6, [sp]
        INSTR   q7, [sp]
        INSTR   q8, [sp]
        INSTR   q9, [sp]
        INSTR   q10, [sp]
        INSTR   q11, [sp]
        INSTR   q12, [sp]
        INSTR   q13, [sp]
        INSTR   q14, [sp]
        INSTR   q16, [sp]
        INSTR   q18, [sp]
        bne     .Lloop

        add     sp, sp, #64             // drop the scratch slot
        // pop callee-save registers from stack
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

View File

@@ -0,0 +1,76 @@
#define INSTR stur
#define NINST 16
#define N x0

// Benchmark kernel: every loop iteration issues NINST 128-bit stores
// (INSTR = stur q) to one address, each followed by a 64-bit ldr from the
// same address into a different X register.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x0-x3, x5-x16 (load destinations), x4 (loop counter), v0-v2, flags.
//        The stored Q registers are only read.
//
// Stores and loads target a dedicated 64-byte scratch slot. The previous
// version stored 16 bytes to [sp] while sp pointed at the spilled copy of
// v12, so the epilogue reloaded overwritten data into callee-saved v12. It
// also loaded constants into x24, x25, x27 and x28 (callee-saved under
// AAPCS64) that were never used; those moves are gone.

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .align  2

// One benchmark group: 128-bit store of qr, then a 64-bit load into xr.
.macro  stur_then_ldr qr, xr
        INSTR   \qr, [sp]
        ldr     \xr, [sp]
.endm

latency:
        // push callee-save registers onto stack
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        // Scratch slot for the benchmark accesses; 64 bytes keeps sp at the
        // same offset within a 64-byte block as before.
        sub     sp, sp, #64

        mov     x4, N                   // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000

.Lloop:
        subs    x4, x4, #1              // flags consumed by the bne below
        stur_then_ldr q1,  x0
        stur_then_ldr q2,  x1
        stur_then_ldr q3,  x2
        stur_then_ldr q4,  x3
        stur_then_ldr q5,  x5
        stur_then_ldr q6,  x6
        stur_then_ldr q7,  x7
        stur_then_ldr q8,  x8
        stur_then_ldr q9,  x9
        stur_then_ldr q10, x10
        stur_then_ldr q11, x11
        stur_then_ldr q12, x12
        stur_then_ldr q13, x13
        stur_then_ldr q14, x14
        stur_then_ldr q16, x15
        stur_then_ldr q18, x16
        bne     .Lloop

        add     sp, sp, #64             // drop the scratch slot
        // pop callee-save registers from stack
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

45
src/NEON/sub-w_w_i-LAT.S Normal file
View File

@@ -0,0 +1,45 @@
#define INSTR sub
#define NINST 6
#define N x0

// Benchmark kernel: every loop iteration runs a chain of NINST dependent
// 32-bit immediate subtractions (INSTR = sub) on w0; each one consumes the
// result of the previous one.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x0 (chain register, seeded with 1 after N is copied), x4 (loop
//        counter), v0, v1, flags.

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .p2align 2
latency:
        // Spill v8-v15 in two 64-byte pushes.
        sub     sp, sp, #64
        st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
        sub     sp, sp, #64
        st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]

        mov     x4, N                   // x4 = remaining iterations
        fmov    v0.4s, #1.00000000
        fmov    v1.4s, #1.00000000
        mov     x0, #1                  // start value of the dependency chain

.Lrepeat:
        subs    x4, x4, #1              // flags consumed by the bne below
        .rept   6                       // six chained subtractions
        INSTR   w0, w0, #2
        .endr
        bne     .Lrepeat

        // Reload v8-v15 in reverse push order.
        ld1     {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
        add     sp, sp, #64
        ld1     {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

90
src/NEON/sub-w_w_i-TP.S Normal file
View File

@@ -0,0 +1,90 @@
#define INSTR sub
#define NINST 48
#define N x0

// Benchmark kernel: every loop iteration issues NINST independent 32-bit
// immediate subtractions (INSTR = sub). Sources rotate over w1, w2, w3 and
// destinations over w5-w16, so no instruction reads a result produced
// inside the loop.
//
// In:    N (x0) = iteration count. The count is decremented before it is
//        tested, so the body always runs at least once.
// Clobb: x1-x3 (set to 1), x4 (loop counter), x5-x16 (destinations),
//        v0-v2, flags.

        .globl  ninst
        .data
ninst:
        .long   NINST                   // INSTR count per loop iteration

        .text
        .globl  latency
        .type   latency, @function
        .p2align 2
latency:
        // Spill v8-v15 in two 64-byte pushes.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]

        mov     x4, N                   // x4 = remaining iterations
        fmov    v0.2d, #1.00000000
        fmov    v1.2d, #1.00000000
        fmov    v2.2d, #1.00000000
        mov     x1, #1                  // the three source operands
        mov     x2, #1
        mov     x3, #1

.Lrepeat:
        subs    x4, x4, #1              // flags consumed by the bne below
        // 4 repetitions x 12 subtractions = NINST instructions
        .rept   4
        INSTR   w5,  w1, #64
        INSTR   w6,  w2, #128
        INSTR   w7,  w3, #192
        INSTR   w8,  w1, #256
        INSTR   w9,  w2, #320
        INSTR   w10, w3, #384
        INSTR   w11, w1, #448
        INSTR   w12, w2, #512
        INSTR   w13, w3, #576
        INSTR   w14, w1, #640
        INSTR   w15, w2, #704
        INSTR   w16, w3, #764
        .endr
        bne     .Lrepeat

        // Reload v8-v15 in reverse push order.
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency