add div and reciprocal

This commit is contained in:
Johannes Hofmann
2019-03-06 14:26:30 +01:00
parent 1d15a4bc76
commit 903de0bb73
8 changed files with 580 additions and 0 deletions

71
src/NEON/fdiv-DP-LAT.S Normal file
View File

@@ -0,0 +1,71 @@
// ---------------------------------------------------------------------------
// fdiv latency kernel, double precision (two f64 lanes per vector)
//
// Exported symbols
//   ninst    32-bit word: number of INSTR issues per loop iteration (NINST)
//   latency  function; x0 carries the loop trip count and is left unmodified
//
// All six divides in the loop read and write v0, so they form one serial
// dependency chain.  Dividing alternately by v1 and by its reciprocal v2
// moves v0 back and forth between about 2.0 and about 2*341.33.
//
// The counter is decremented before it is tested: a trip count of 0 wraps
// around instead of skipping the loop, so callers must pass a non-zero count.
// ---------------------------------------------------------------------------
#define INSTR fdiv
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Spill v8-v15 (callee-saved) into two 64-byte stack slots.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]

        mov     x4, N                       // x4 = remaining iterations

        // Build the operands from the immediate 1.0 using adds and divides.
        fmov    v0.2d, #1.0
        fadd    v1.2d, v0.2d, v0.2d         // v1 = 2.0
        fadd    v2.2d, v0.2d, v1.2d         // v2 = 3.0
        fadd    v4.2d, v1.2d, v1.2d         // v4 = 4.0
        .rept   8                           // eight doublings: 4.0 -> 1024.0
        fadd    v4.2d, v4.2d, v4.2d
        .endr
        fdiv    v1.2d, v4.2d, v2.2d         // v1 = 1024.0/3.0 = 341.3333
        fdiv    v2.2d, v0.2d, v1.2d         // v2 = 1/341.3333
        fadd    v0.2d, v1.2d, v1.2d         // v0 = 2*341.3333

.Lloop:
        subs    x4, x4, #1                  // sets Z for the bne below
        .rept   3                           // 3 pairs = NINST (6) divides
        INSTR   v0.2d, v0.2d, v1.2d
        INSTR   v0.2d, v0.2d, v2.2d
        .endr
        bne     .Lloop

.Ldone:
        // Reload v8-v15 in reverse order of the spills.
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

71
src/NEON/fdiv-DP-TP.S Normal file
View File

@@ -0,0 +1,71 @@
// ---------------------------------------------------------------------------
// fdiv throughput kernel, double precision (two f64 lanes per vector)
//
// Exported symbols
//   ninst    32-bit word: number of INSTR issues per loop iteration (NINST)
//   latency  function; x0 carries the loop trip count and is left unmodified
//
// The six divides in the loop write v3-v8 and read only v0, v1 and v2, which
// the loop never modifies, so no divide depends on another.  v8 is
// callee-saved; it is covered by the spill of v8-v15 in the prologue.
//
// The counter is decremented before it is tested: a trip count of 0 wraps
// around instead of skipping the loop, so callers must pass a non-zero count.
// ---------------------------------------------------------------------------
#define INSTR fdiv
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Spill v8-v15 (callee-saved) into two 64-byte stack slots.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]

        mov     x4, N                       // x4 = remaining iterations

        // Build the operands from the immediate 1.0 using adds and divides.
        fmov    v0.2d, #1.0
        fadd    v1.2d, v0.2d, v0.2d         // v1 = 2.0
        fadd    v2.2d, v0.2d, v1.2d         // v2 = 3.0
        fadd    v4.2d, v1.2d, v1.2d         // v4 = 4.0 (scratch during setup)
        .rept   8                           // eight doublings: 4.0 -> 1024.0
        fadd    v4.2d, v4.2d, v4.2d
        .endr
        fdiv    v1.2d, v4.2d, v2.2d         // v1 = 1024.0/3.0 = 341.3333
        fdiv    v2.2d, v0.2d, v1.2d         // v2 = 1/341.3333
        fadd    v0.2d, v1.2d, v1.2d         // v0 = 2*341.3333

.Lloop:
        subs    x4, x4, #1                  // sets Z for the bne below
        INSTR   v3.2d, v0.2d, v1.2d
        INSTR   v4.2d, v1.2d, v0.2d
        INSTR   v5.2d, v0.2d, v2.2d
        INSTR   v6.2d, v2.2d, v0.2d
        INSTR   v7.2d, v1.2d, v2.2d
        INSTR   v8.2d, v2.2d, v1.2d
        bne     .Lloop

.Ldone:
        // Reload v8-v15 in reverse order of the spills.
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

71
src/NEON/fdiv-SP-LAT.S Normal file
View File

@@ -0,0 +1,71 @@
// ---------------------------------------------------------------------------
// fdiv latency kernel, single precision (four f32 lanes per vector)
//
// Exported symbols
//   ninst    32-bit word: number of INSTR issues per loop iteration (NINST)
//   latency  function; x0 carries the loop trip count and is left unmodified
//
// All six divides in the loop read and write v0, so they form one serial
// dependency chain.  Dividing alternately by v1 and by its reciprocal v2
// moves v0 back and forth between about 2.0 and about 2*341.33.
//
// The counter is decremented before it is tested: a trip count of 0 wraps
// around instead of skipping the loop, so callers must pass a non-zero count.
// ---------------------------------------------------------------------------
#define INSTR fdiv
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Spill v8-v15 (callee-saved) into two 64-byte stack slots.
        sub     sp, sp, #64
        st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
        sub     sp, sp, #64
        st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]

        mov     x4, N                       // x4 = remaining iterations

        // Build the operands from the immediate 1.0 using adds and divides.
        fmov    v0.4s, #1.0
        fadd    v1.4s, v0.4s, v0.4s         // v1 = 2.0
        fadd    v2.4s, v0.4s, v1.4s         // v2 = 3.0
        fadd    v4.4s, v1.4s, v1.4s         // v4 = 4.0
        .rept   8                           // eight doublings: 4.0 -> 1024.0
        fadd    v4.4s, v4.4s, v4.4s
        .endr
        fdiv    v1.4s, v4.4s, v2.4s         // v1 = 1024.0/3.0 = 341.3333
        fdiv    v2.4s, v0.4s, v1.4s         // v2 = 1/341.3333
        fadd    v0.4s, v1.4s, v1.4s         // v0 = 2*341.3333

.Lloop:
        subs    x4, x4, #1                  // sets Z for the bne below
        INSTR   v0.4s, v0.4s, v1.4s
        INSTR   v0.4s, v0.4s, v2.4s
        INSTR   v0.4s, v0.4s, v1.4s
        INSTR   v0.4s, v0.4s, v2.4s
        INSTR   v0.4s, v0.4s, v1.4s
        INSTR   v0.4s, v0.4s, v2.4s
        bne     .Lloop

.Ldone:
        // Reload v8-v15 in reverse order of the spills.
        ld1     {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
        add     sp, sp, #64
        ld1     {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

71
src/NEON/fdiv-SP-TP.S Normal file
View File

@@ -0,0 +1,71 @@
// ---------------------------------------------------------------------------
// fdiv throughput kernel, single precision (four f32 lanes per vector)
//
// Exported symbols
//   ninst    32-bit word: number of INSTR issues per loop iteration (NINST)
//   latency  function; x0 carries the loop trip count and is left unmodified
//
// The six divides in the loop write v3-v8 and read only v0, v1 and v2, which
// the loop never modifies, so no divide depends on another.  v8 is
// callee-saved; it is covered by the spill of v8-v15 in the prologue.
//
// The counter is decremented before it is tested: a trip count of 0 wraps
// around instead of skipping the loop, so callers must pass a non-zero count.
// ---------------------------------------------------------------------------
#define INSTR fdiv
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Spill v8-v15 (callee-saved) into two 64-byte stack slots.
        sub     sp, sp, #64
        st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
        sub     sp, sp, #64
        st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]

        mov     x4, N                       // x4 = remaining iterations

        // Build the operands from the immediate 1.0 using adds and divides.
        fmov    v0.4s, #1.0
        fadd    v1.4s, v0.4s, v0.4s         // v1 = 2.0
        fadd    v2.4s, v0.4s, v1.4s         // v2 = 3.0
        fadd    v4.4s, v1.4s, v1.4s         // v4 = 4.0 (scratch during setup)
        .rept   8                           // eight doublings: 4.0 -> 1024.0
        fadd    v4.4s, v4.4s, v4.4s
        .endr
        fdiv    v1.4s, v4.4s, v2.4s         // v1 = 1024.0/3.0 = 341.3333
        fdiv    v2.4s, v0.4s, v1.4s         // v2 = 1/341.3333
        fadd    v0.4s, v1.4s, v1.4s         // v0 = 2*341.3333

.Lloop:
        subs    x4, x4, #1                  // sets Z for the bne below
        INSTR   v3.4s, v0.4s, v1.4s
        INSTR   v4.4s, v1.4s, v0.4s
        INSTR   v5.4s, v0.4s, v2.4s
        INSTR   v6.4s, v2.4s, v0.4s
        INSTR   v7.4s, v1.4s, v2.4s
        INSTR   v8.4s, v2.4s, v1.4s
        bne     .Lloop

.Ldone:
        // Reload v8-v15 in reverse order of the spills.
        ld1     {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
        add     sp, sp, #64
        ld1     {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

71
src/NEON/frecpe-DP-LAT.S Normal file
View File

@@ -0,0 +1,71 @@
// ---------------------------------------------------------------------------
// frecpe latency kernel, double precision (two f64 lanes per vector)
//
// Exported symbols
//   ninst    32-bit word: number of INSTR issues per loop iteration (NINST)
//   latency  function; x0 carries the loop trip count and is left unmodified
//
// The loop is a ring v0 -> v1 -> v2 -> v3 -> v4 -> v5 -> v0: each estimate
// consumes the result of the one before it, and the last one feeds the next
// iteration.  Only v0 is live on loop entry; the v1, v2 and v4 values left
// over from setup are overwritten during the first pass through the ring.
//
// The counter is decremented before it is tested: a trip count of 0 wraps
// around instead of skipping the loop, so callers must pass a non-zero count.
// ---------------------------------------------------------------------------
#define INSTR frecpe
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Spill v8-v15 (callee-saved) into two 64-byte stack slots.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]

        mov     x4, N                       // x4 = remaining iterations

        // Build the starting value from the immediate 1.0.
        fmov    v0.2d, #1.0
        fadd    v1.2d, v0.2d, v0.2d         // v1 = 2.0
        fadd    v2.2d, v0.2d, v1.2d         // v2 = 3.0
        fadd    v4.2d, v1.2d, v1.2d         // v4 = 4.0
        .rept   8                           // eight doublings: 4.0 -> 1024.0
        fadd    v4.2d, v4.2d, v4.2d
        .endr
        fdiv    v1.2d, v4.2d, v2.2d         // v1 = 1024.0/3.0 = 341.3333
        fdiv    v2.2d, v0.2d, v1.2d         // v2 = 1/341.3333
        fadd    v0.2d, v1.2d, v1.2d         // v0 = 2*341.3333

.Lloop:
        subs    x4, x4, #1                  // sets Z for the bne below
        INSTR   v1.2d, v0.2d
        INSTR   v2.2d, v1.2d
        INSTR   v3.2d, v2.2d
        INSTR   v4.2d, v3.2d
        INSTR   v5.2d, v4.2d
        INSTR   v0.2d, v5.2d                // closes the ring back into v0
        bne     .Lloop

.Ldone:
        // Reload v8-v15 in reverse order of the spills.
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

77
src/NEON/frecpe-DP-TP.S Normal file
View File

@@ -0,0 +1,77 @@
// ---------------------------------------------------------------------------
// frecpe throughput kernel, double precision (two f64 lanes per vector)
//
// Exported symbols
//   ninst    32-bit word: number of INSTR issues per loop iteration (NINST)
//   latency  function; x0 carries the loop trip count and is left unmodified
//
// The six estimates in the loop read v0-v5 and write v10-v15.  The loop never
// writes v0-v5, so no estimate depends on another.  v10-v15 are callee-saved;
// they are covered by the spill of v8-v15 in the prologue.
//
// The counter is decremented before it is tested: a trip count of 0 wraps
// around instead of skipping the loop, so callers must pass a non-zero count.
// ---------------------------------------------------------------------------
#define INSTR frecpe
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Spill v8-v15 (callee-saved) into two 64-byte stack slots.
        sub     sp, sp, #64
        st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        sub     sp, sp, #64
        st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]

        mov     x4, N                       // x4 = remaining iterations

        // Build the six source operands from the immediate 1.0.
        fmov    v0.2d, #1.0
        fadd    v1.2d, v0.2d, v0.2d         // v1 = 2.0
        fadd    v2.2d, v0.2d, v1.2d         // v2 = 3.0
        fadd    v4.2d, v1.2d, v1.2d         // v4 = 4.0
        .rept   8                           // eight doublings: 4.0 -> 1024.0
        fadd    v4.2d, v4.2d, v4.2d
        .endr
        fdiv    v1.2d, v4.2d, v2.2d         // v1 = 1024.0/3.0 = 341.3333
        fdiv    v2.2d, v0.2d, v1.2d         // v2 = 1/341.3333
        fadd    v0.2d, v1.2d, v1.2d         // v0 = 2*341.3333
        fadd    v1.2d, v1.2d, v1.2d         // v1 = 2*341.3333 (doubled in place)
        fadd    v2.2d, v1.2d, v1.2d         // v2..v5 = twice the new v1
        fadd    v3.2d, v1.2d, v1.2d
        fadd    v4.2d, v1.2d, v1.2d
        fadd    v5.2d, v1.2d, v1.2d

.Lloop:
        subs    x4, x4, #1                  // sets Z for the bne below
        INSTR   v10.2d, v0.2d
        INSTR   v11.2d, v1.2d
        INSTR   v12.2d, v2.2d
        INSTR   v13.2d, v3.2d
        INSTR   v14.2d, v4.2d
        INSTR   v15.2d, v5.2d
        bne     .Lloop

.Ldone:
        // Reload v8-v15 in reverse order of the spills.
        ld1     {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
        add     sp, sp, #64
        ld1     {v8.2d, v9.2d, v10.2d, v11.2d}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

71
src/NEON/frecpe-SP-LAT.S Normal file
View File

@@ -0,0 +1,71 @@
// ---------------------------------------------------------------------------
// frecpe latency kernel, single precision (four f32 lanes per vector)
//
// Exported symbols
//   ninst    32-bit word: number of INSTR issues per loop iteration (NINST)
//   latency  function; x0 carries the loop trip count and is left unmodified
//
// The loop is a ring v0 -> v1 -> v2 -> v3 -> v4 -> v5 -> v0: each estimate
// consumes the result of the one before it, and the last one feeds the next
// iteration.  Only v0 is live on loop entry; the v1, v2 and v4 values left
// over from setup are overwritten during the first pass through the ring.
//
// The counter is decremented before it is tested: a trip count of 0 wraps
// around instead of skipping the loop, so callers must pass a non-zero count.
// ---------------------------------------------------------------------------
#define INSTR frecpe
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Spill v8-v15 (callee-saved) into two 64-byte stack slots.
        sub     sp, sp, #64
        st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
        sub     sp, sp, #64
        st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]

        mov     x4, N                       // x4 = remaining iterations

        // Build the starting value from the immediate 1.0.
        fmov    v0.4s, #1.0
        fadd    v1.4s, v0.4s, v0.4s         // v1 = 2.0
        fadd    v2.4s, v0.4s, v1.4s         // v2 = 3.0
        fadd    v4.4s, v1.4s, v1.4s         // v4 = 4.0
        .rept   8                           // eight doublings: 4.0 -> 1024.0
        fadd    v4.4s, v4.4s, v4.4s
        .endr
        fdiv    v1.4s, v4.4s, v2.4s         // v1 = 1024.0/3.0 = 341.3333
        fdiv    v2.4s, v0.4s, v1.4s         // v2 = 1/341.3333
        fadd    v0.4s, v1.4s, v1.4s         // v0 = 2*341.3333

.Lloop:
        subs    x4, x4, #1                  // sets Z for the bne below
        INSTR   v1.4s, v0.4s
        INSTR   v2.4s, v1.4s
        INSTR   v3.4s, v2.4s
        INSTR   v4.4s, v3.4s
        INSTR   v5.4s, v4.4s
        INSTR   v0.4s, v5.4s                // closes the ring back into v0
        bne     .Lloop

.Ldone:
        // Reload v8-v15 in reverse order of the spills.
        ld1     {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
        add     sp, sp, #64
        ld1     {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency

77
src/NEON/frecpe-SP-TP.S Normal file
View File

@@ -0,0 +1,77 @@
// ---------------------------------------------------------------------------
// frecpe throughput kernel, single precision (four f32 lanes per vector)
//
// Exported symbols
//   ninst    32-bit word: number of INSTR issues per loop iteration (NINST)
//   latency  function; x0 carries the loop trip count and is left unmodified
//
// The six estimates in the loop read v0-v5 and write v10-v15.  The loop never
// writes v0-v5, so no estimate depends on another.  v10-v15 are callee-saved;
// they are covered by the spill of v8-v15 in the prologue.
//
// The counter is decremented before it is tested: a trip count of 0 wraps
// around instead of skipping the loop, so callers must pass a non-zero count.
// ---------------------------------------------------------------------------
#define INSTR frecpe
#define NINST 6
#define N x0

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, %function
        .p2align 2
latency:
        // Spill v8-v15 (callee-saved) into two 64-byte stack slots.
        sub     sp, sp, #64
        st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
        sub     sp, sp, #64
        st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]

        mov     x4, N                       // x4 = remaining iterations

        // Build the six source operands from the immediate 1.0.
        fmov    v0.4s, #1.0
        fadd    v1.4s, v0.4s, v0.4s         // v1 = 2.0
        fadd    v2.4s, v0.4s, v1.4s         // v2 = 3.0
        fadd    v4.4s, v1.4s, v1.4s         // v4 = 4.0
        .rept   8                           // eight doublings: 4.0 -> 1024.0
        fadd    v4.4s, v4.4s, v4.4s
        .endr
        fdiv    v1.4s, v4.4s, v2.4s         // v1 = 1024.0/3.0 = 341.3333
        fdiv    v2.4s, v0.4s, v1.4s         // v2 = 1/341.3333
        fadd    v0.4s, v1.4s, v1.4s         // v0 = 2*341.3333
        fadd    v1.4s, v1.4s, v1.4s         // v1 = 2*341.3333 (doubled in place)
        fadd    v2.4s, v1.4s, v1.4s         // v2..v5 = twice the new v1
        fadd    v3.4s, v1.4s, v1.4s
        fadd    v4.4s, v1.4s, v1.4s
        fadd    v5.4s, v1.4s, v1.4s

.Lloop:
        subs    x4, x4, #1                  // sets Z for the bne below
        INSTR   v10.4s, v0.4s
        INSTR   v11.4s, v1.4s
        INSTR   v12.4s, v2.4s
        INSTR   v13.4s, v3.4s
        INSTR   v14.4s, v4.4s
        INSTR   v15.4s, v5.4s
        bne     .Lloop

.Ldone:
        // Reload v8-v15 in reverse order of the spills.
        ld1     {v12.4s, v13.4s, v14.4s, v15.4s}, [sp]
        add     sp, sp, #64
        ld1     {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
        add     sp, sp, #64
        ret
        .size   latency, .-latency