Files
asmbench/random_pf10.txt
2020-02-14 16:57:29 +01:00

1139 lines
24 KiB
Plaintext

## Selected Instructions
VPERMILPSri, MULPSrr, ANDPDrr, VPSIGNBrr, PSIGNBrr, PMOVZXWDrr, PMINUWrr, PADDSWrr, VPSHUFHWri, MOVUPDrr
## Generated Assembly (8x parallel)
## _test: benchmark kernel that repeats ten xmm instructions (vpermilps, mulps,
## andpd, vpsignb, psignb, pmovzxwd, pminuw, paddsw, vpshufhw, movupd) on each
## of eight registers, %xmm0-%xmm7, once per loop iteration.
##   In:  %rdi = iteration count (compared as a signed value)
##   Out: %rax = %rdi - 1 after the loop, or 0 when %rdi <= 0
##   Writes: %rax, %rcx, %xmm0-%xmm7, flags
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.section __TEXT,__literal4,4byte_literals
.p2align 2
LCPI0_0:
## 0x3F802000: IEEE-754 single-precision bit pattern of 1.0009765625 (1 + 2^-10).
.long 1065361408
.section __TEXT,__text,regular,pure_instructions
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
## Skip the loop entirely when the requested count is zero or negative.
testq %rdi, %rdi
jle LBB0_1
## Load the constant through its absolute address and copy it into every
## 32-bit lane of %xmm0.
movabsq $LCPI0_0, %rax
vbroadcastss (%rax), %xmm0
## Loop counter starts at -1 (see the counter update after the loop body).
movq $-1, %rcx
## Give the other seven registers the same starting value.
vmovaps %xmm0, %xmm1
vmovaps %xmm0, %xmm2
vmovaps %xmm0, %xmm4
vmovaps %xmm0, %xmm3
vmovaps %xmm0, %xmm5
vmovaps %xmm0, %xmm6
vmovaps %xmm0, %xmm7
.p2align 4, 0x90
LBB0_3:
## Loop body: each register is both source and destination of its ten
## instructions, so every register carries its own serial dependency chain.
## InlineAsm Start
## Chain on %xmm0.
vpermilps $1, %xmm0, %xmm0
mulps %xmm0, %xmm0
andpd %xmm0, %xmm0
vpsignb %xmm0, %xmm0, %xmm0
psignb %xmm0, %xmm0
pmovzxwd %xmm0, %xmm0
pminuw %xmm0, %xmm0
paddsw %xmm0, %xmm0
vpshufhw $1, %xmm0, %xmm0
## The first instruction of the %xmm1 chain is emitted before the last
## instruction of the %xmm0 chain.
vpermilps $1, %xmm1, %xmm1
movupd %xmm0, %xmm0
## Chain on %xmm1 (continued).
mulps %xmm1, %xmm1
andpd %xmm1, %xmm1
vpsignb %xmm1, %xmm1, %xmm1
psignb %xmm1, %xmm1
pmovzxwd %xmm1, %xmm1
pminuw %xmm1, %xmm1
paddsw %xmm1, %xmm1
vpshufhw $1, %xmm1, %xmm1
movupd %xmm1, %xmm1
## Chain on %xmm2.
vpermilps $1, %xmm2, %xmm2
mulps %xmm2, %xmm2
andpd %xmm2, %xmm2
vpsignb %xmm2, %xmm2, %xmm2
psignb %xmm2, %xmm2
pmovzxwd %xmm2, %xmm2
pminuw %xmm2, %xmm2
paddsw %xmm2, %xmm2
vpshufhw $1, %xmm2, %xmm2
movupd %xmm2, %xmm2
## Chain on %xmm4.
vpermilps $1, %xmm4, %xmm4
mulps %xmm4, %xmm4
andpd %xmm4, %xmm4
vpsignb %xmm4, %xmm4, %xmm4
psignb %xmm4, %xmm4
pmovzxwd %xmm4, %xmm4
pminuw %xmm4, %xmm4
paddsw %xmm4, %xmm4
vpshufhw $1, %xmm4, %xmm4
## Again the next chain (%xmm3) starts before the current one (%xmm4) ends.
vpermilps $1, %xmm3, %xmm3
movupd %xmm4, %xmm4
## Chain on %xmm3 (continued).
mulps %xmm3, %xmm3
andpd %xmm3, %xmm3
vpsignb %xmm3, %xmm3, %xmm3
psignb %xmm3, %xmm3
pmovzxwd %xmm3, %xmm3
pminuw %xmm3, %xmm3
paddsw %xmm3, %xmm3
vpshufhw $1, %xmm3, %xmm3
movupd %xmm3, %xmm3
## Chain on %xmm5.
vpermilps $1, %xmm5, %xmm5
mulps %xmm5, %xmm5
andpd %xmm5, %xmm5
vpsignb %xmm5, %xmm5, %xmm5
psignb %xmm5, %xmm5
pmovzxwd %xmm5, %xmm5
pminuw %xmm5, %xmm5
paddsw %xmm5, %xmm5
vpshufhw $1, %xmm5, %xmm5
movupd %xmm5, %xmm5
## Chain on %xmm6.
vpermilps $1, %xmm6, %xmm6
mulps %xmm6, %xmm6
andpd %xmm6, %xmm6
vpsignb %xmm6, %xmm6, %xmm6
psignb %xmm6, %xmm6
pmovzxwd %xmm6, %xmm6
pminuw %xmm6, %xmm6
paddsw %xmm6, %xmm6
vpshufhw $1, %xmm6, %xmm6
## The %xmm7 chain starts before the %xmm6 chain ends.
vpermilps $1, %xmm7, %xmm7
movupd %xmm6, %xmm6
## Chain on %xmm7 (continued); its final movupd sits after the counter update.
mulps %xmm7, %xmm7
andpd %xmm7, %xmm7
vpsignb %xmm7, %xmm7, %xmm7
psignb %xmm7, %xmm7
pmovzxwd %xmm7, %xmm7
pminuw %xmm7, %xmm7
paddsw %xmm7, %xmm7
vpshufhw $1, %xmm7, %xmm7
## InlineAsm End
## Counter update: %rax = counter + 1 becomes the new counter (and the return
## value on exit); counter + 2 is formed only for the comparison with %rdi.
leaq 1(%rcx), %rax
addq $2, %rcx
cmpq %rdi, %rcx
movq %rax, %rcx
## Last instruction of the %xmm7 chain, placed between cmpq and jl.
## InlineAsm Start
movupd %xmm7, %xmm7
## InlineAsm End
## Repeat while (old counter + 2) < %rdi, signed.
jl LBB0_3
retq
LBB0_1:
## Non-positive count: return 0 without running the kernel.
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (10000000,),
'frequency': 2600000000.0,
'iterations': 10000000,
'parallel_factor': 8,
'returned': [9999999, 9999999, 9999999, 9999999],
'runtimes': [0.12005576398223639,
0.12028825294692069,
0.1209630100056529,
0.11989319801796228]}
minimal throughput: 3.90 cy
## Selected Instructions
VFMADD132PDYr, VPADDWYrr, VFMADD132PSYr, VPADDDYrr, VSUBPDYrr, VPACKUSDWYrr, VPMULHUWYrr, VMINPDYrr, VPUNPCKLWDYrr, VBLENDVPSYrr
## Generated Assembly (8x parallel)
## _test: benchmark kernel that issues ten ymm instructions (vfmadd132pd, vpaddw,
## vfmadd132ps, vpaddd, vsubpd, vpackusdw, vpmulhuw, vminpd, vpunpcklwd,
## vblendvps) eight times per loop iteration.
##   In:  %rdi = iteration count (compared as a signed value)
##   Out: %rax = %rdi - 1 after the loop, or 0 when %rdi <= 0
##   Writes: %rax, %rcx, %ymm0-%ymm9, flags
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.section __TEXT,__literal4,4byte_literals
.p2align 2
LCPI0_0:
## 0x3F802000: IEEE-754 single-precision bit pattern of 1.0009765625 (1 + 2^-10).
.long 1065361408
.section __TEXT,__text,regular,pure_instructions
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
## Skip the loop entirely when the requested count is zero or negative.
testq %rdi, %rdi
jle LBB0_1
## Load the constant through its absolute address and copy it into every
## 32-bit lane of %ymm0.
movabsq $LCPI0_0, %rax
vbroadcastss (%rax), %ymm0
## Loop counter starts at -1 (see the counter update after the loop body).
movq $-1, %rcx
## Give seven more registers the same starting value.
vmovaps %ymm0, %ymm1
vmovaps %ymm0, %ymm2
vmovaps %ymm0, %ymm3
vmovaps %ymm0, %ymm6
vmovaps %ymm0, %ymm5
vmovaps %ymm0, %ymm7
vmovaps %ymm0, %ymm8
.p2align 4, 0x90
LBB0_3:
## Register copy emitted outside the InlineAsm region: the first FMA
## overwrites %ymm4, while %ymm2 is still read by later FMAs.
vmovaps %ymm2, %ymm4
## InlineAsm Start
vfmadd132pd %ymm3, %ymm6, %ymm4
## InlineAsm End
## Same pattern: work on a copy of %ymm1 in %ymm9 so %ymm1 stays readable.
vmovaps %ymm1, %ymm9
## The remaining seven FMAs. Unlike the other nine instructions, each FMA
## names three different registers, so the FMAs read one another's registers.
## InlineAsm Start
vfmadd132pd %ymm2, %ymm3, %ymm9
vfmadd132pd %ymm6, %ymm5, %ymm3
vfmadd132pd %ymm5, %ymm7, %ymm6
vfmadd132pd %ymm7, %ymm8, %ymm5
vfmadd132pd %ymm8, %ymm0, %ymm7
vfmadd132pd %ymm0, %ymm1, %ymm8
vfmadd132pd %ymm1, %ymm2, %ymm0
## Nine-instruction chain continuing from the FMA result in %ymm0.
vpaddw %ymm0, %ymm0, %ymm0
vfmadd132ps %ymm0, %ymm0, %ymm0
vpaddd %ymm0, %ymm0, %ymm0
vsubpd %ymm0, %ymm0, %ymm0
vpackusdw %ymm0, %ymm0, %ymm0
vpmulhuw %ymm0, %ymm0, %ymm0
vminpd %ymm0, %ymm0, %ymm0
vpunpcklwd %ymm0, %ymm0, %ymm0
vblendvps %ymm0, %ymm0, %ymm0, %ymm0
## Chain reading the FMA result in %ymm3 and continuing in %ymm1.
vpaddw %ymm3, %ymm3, %ymm1
vfmadd132ps %ymm1, %ymm1, %ymm1
vpaddd %ymm1, %ymm1, %ymm1
vsubpd %ymm1, %ymm1, %ymm1
vpackusdw %ymm1, %ymm1, %ymm1
vpmulhuw %ymm1, %ymm1, %ymm1
vminpd %ymm1, %ymm1, %ymm1
vpunpcklwd %ymm1, %ymm1, %ymm1
vblendvps %ymm1, %ymm1, %ymm1, %ymm1
## Chain reading the FMA result in %ymm7 and continuing in %ymm2.
vpaddw %ymm7, %ymm7, %ymm2
vfmadd132ps %ymm2, %ymm2, %ymm2
vpaddd %ymm2, %ymm2, %ymm2
vsubpd %ymm2, %ymm2, %ymm2
vpackusdw %ymm2, %ymm2, %ymm2
vpmulhuw %ymm2, %ymm2, %ymm2
vminpd %ymm2, %ymm2, %ymm2
vpunpcklwd %ymm2, %ymm2, %ymm2
vblendvps %ymm2, %ymm2, %ymm2, %ymm2
## Chain reading the FMA result in %ymm9 and continuing in %ymm3.
vpaddw %ymm9, %ymm9, %ymm3
vfmadd132ps %ymm3, %ymm3, %ymm3
vpaddd %ymm3, %ymm3, %ymm3
vsubpd %ymm3, %ymm3, %ymm3
vpackusdw %ymm3, %ymm3, %ymm3
vpmulhuw %ymm3, %ymm3, %ymm3
vminpd %ymm3, %ymm3, %ymm3
vpunpcklwd %ymm3, %ymm3, %ymm3
vblendvps %ymm3, %ymm3, %ymm3, %ymm3
## Chain on the FMA result in %ymm6, staying in %ymm6.
vpaddw %ymm6, %ymm6, %ymm6
vfmadd132ps %ymm6, %ymm6, %ymm6
vpaddd %ymm6, %ymm6, %ymm6
vsubpd %ymm6, %ymm6, %ymm6
vpackusdw %ymm6, %ymm6, %ymm6
vpmulhuw %ymm6, %ymm6, %ymm6
vminpd %ymm6, %ymm6, %ymm6
vpunpcklwd %ymm6, %ymm6, %ymm6
vblendvps %ymm6, %ymm6, %ymm6, %ymm6
## Chain reading the FMA result in %ymm8 and continuing in %ymm7; its final
## vblendvps (two lines below the vpunpcklwd) writes %ymm5.
vpaddw %ymm8, %ymm8, %ymm7
vfmadd132ps %ymm7, %ymm7, %ymm7
vpaddd %ymm7, %ymm7, %ymm7
vsubpd %ymm7, %ymm7, %ymm7
vpackusdw %ymm7, %ymm7, %ymm7
vpmulhuw %ymm7, %ymm7, %ymm7
vminpd %ymm7, %ymm7, %ymm7
vpunpcklwd %ymm7, %ymm7, %ymm7
## First instruction of the last chain: it reads the FMA result in %ymm5
## before the vblendvps on the next line overwrites %ymm5.
vpaddw %ymm5, %ymm5, %ymm8
vblendvps %ymm7, %ymm7, %ymm7, %ymm5
## Chain on the FMA result in %ymm4; its final vblendvps writes %ymm7.
vpaddw %ymm4, %ymm4, %ymm4
vfmadd132ps %ymm4, %ymm4, %ymm4
vpaddd %ymm4, %ymm4, %ymm4
vsubpd %ymm4, %ymm4, %ymm4
vpackusdw %ymm4, %ymm4, %ymm4
vpmulhuw %ymm4, %ymm4, %ymm4
vminpd %ymm4, %ymm4, %ymm4
vpunpcklwd %ymm4, %ymm4, %ymm4
vblendvps %ymm4, %ymm4, %ymm4, %ymm7
## Remainder of the chain begun by the earlier vpaddw into %ymm8: it moves
## to %ymm4 at the vpaddd and the final vblendvps writes %ymm8.
vfmadd132ps %ymm8, %ymm8, %ymm8
vpaddd %ymm8, %ymm8, %ymm4
vsubpd %ymm4, %ymm4, %ymm4
vpackusdw %ymm4, %ymm4, %ymm4
vpmulhuw %ymm4, %ymm4, %ymm4
vminpd %ymm4, %ymm4, %ymm4
vpunpcklwd %ymm4, %ymm4, %ymm4
vblendvps %ymm4, %ymm4, %ymm4, %ymm8
## InlineAsm End
## Counter update: %rax = counter + 1 becomes the new counter (and the return
## value on exit); counter + 2 is formed only for the comparison with %rdi.
leaq 1(%rcx), %rax
addq $2, %rcx
cmpq %rdi, %rcx
movq %rax, %rcx
## Repeat while (old counter + 2) < %rdi, signed.
jl LBB0_3
## Clear the upper halves of the ymm registers before returning to the caller.
vzeroupper
retq
LBB0_1:
## Non-positive count: return 0; no ymm register was written on this path.
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (10000000,),
'frequency': 2600000000.0,
'iterations': 10000000,
'parallel_factor': 8,
'returned': [9999999, 9999999, 9999999, 9999999],
'runtimes': [0.17754955508280545,
0.17602652800269425,
0.17718603508546948,
0.17694135499186814]}
minimal throughput: 5.72 cy
## Selected Instructions
VCVTSI642SDrr, VFMADD213SDr, DIVSDrr, VCVTSI642SDrr, MAXSDrr, VFNMADD213SDr, VFMADD132SDr, VMAXSDrr, VFNMADD132SDr, SQRTSDr
## Generated Assembly (8x parallel)
## _test: benchmark kernel that repeats ten scalar double-precision
## instructions (vcvtsi2sdq, vfmadd213sd, divsd, vcvtsi2sdq, maxsd,
## vfnmadd213sd, vfmadd132sd, vmaxsd, vfnmadd132sd, sqrtsd) on each of eight
## registers, %xmm0-%xmm7, once per loop iteration.
##   In:  %rdi = iteration count (compared as a signed value)
##   Out: %rax = %rdi - 1 after the loop, or 0 when %rdi <= 0
##   Writes: %rax, %rcx, %rdx, %xmm0-%xmm7, flags
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.section __TEXT,__literal8,8byte_literals
.p2align 3
LCPI0_0:
## 0x3FF0040000000000: IEEE-754 double-precision bit pattern of
## 1.0009765625 (1 + 2^-10).
.quad 4607186816846528512
.section __TEXT,__text,regular,pure_instructions
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
## Skip the loop entirely when the requested count is zero or negative.
testq %rdi, %rdi
jle LBB0_1
## Loop counter starts at -1 (see the counter update after the loop body).
movq $-1, %rcx
## Load the double constant into %xmm0 through its absolute address.
movabsq $LCPI0_0, %rax
vmovsd (%rax), %xmm0
## %rdx = 3 is the integer source operand of every vcvtsi2sdq in the loop.
movl $3, %edx
## Give the other seven registers the same starting value.
vmovaps %xmm0, %xmm1
vmovaps %xmm0, %xmm2
vmovaps %xmm0, %xmm3
vmovaps %xmm0, %xmm4
vmovaps %xmm0, %xmm5
vmovaps %xmm0, %xmm6
vmovaps %xmm0, %xmm7
.p2align 4, 0x90
LBB0_3:
## Loop body: the same ten instructions are issued once per register, each
## using that register for every xmm operand.
## InlineAsm Start
## Chain on %xmm0.
vcvtsi2sdq %rdx, %xmm0, %xmm0
vfmadd213sd %xmm0, %xmm0, %xmm0
divsd %xmm0, %xmm0
vcvtsi2sdq %rdx, %xmm0, %xmm0
maxsd %xmm0, %xmm0
vfnmadd213sd %xmm0, %xmm0, %xmm0
vfmadd132sd %xmm0, %xmm0, %xmm0
vmaxsd %xmm0, %xmm0, %xmm0
vfnmadd132sd %xmm0, %xmm0, %xmm0
sqrtsd %xmm0, %xmm0
## Chain on %xmm1.
vcvtsi2sdq %rdx, %xmm1, %xmm1
vfmadd213sd %xmm1, %xmm1, %xmm1
divsd %xmm1, %xmm1
vcvtsi2sdq %rdx, %xmm1, %xmm1
maxsd %xmm1, %xmm1
vfnmadd213sd %xmm1, %xmm1, %xmm1
vfmadd132sd %xmm1, %xmm1, %xmm1
vmaxsd %xmm1, %xmm1, %xmm1
vfnmadd132sd %xmm1, %xmm1, %xmm1
sqrtsd %xmm1, %xmm1
## Chain on %xmm2.
vcvtsi2sdq %rdx, %xmm2, %xmm2
vfmadd213sd %xmm2, %xmm2, %xmm2
divsd %xmm2, %xmm2
vcvtsi2sdq %rdx, %xmm2, %xmm2
maxsd %xmm2, %xmm2
vfnmadd213sd %xmm2, %xmm2, %xmm2
vfmadd132sd %xmm2, %xmm2, %xmm2
vmaxsd %xmm2, %xmm2, %xmm2
vfnmadd132sd %xmm2, %xmm2, %xmm2
sqrtsd %xmm2, %xmm2
## Chain on %xmm3.
vcvtsi2sdq %rdx, %xmm3, %xmm3
vfmadd213sd %xmm3, %xmm3, %xmm3
divsd %xmm3, %xmm3
vcvtsi2sdq %rdx, %xmm3, %xmm3
maxsd %xmm3, %xmm3
vfnmadd213sd %xmm3, %xmm3, %xmm3
vfmadd132sd %xmm3, %xmm3, %xmm3
vmaxsd %xmm3, %xmm3, %xmm3
vfnmadd132sd %xmm3, %xmm3, %xmm3
sqrtsd %xmm3, %xmm3
## Chain on %xmm4.
vcvtsi2sdq %rdx, %xmm4, %xmm4
vfmadd213sd %xmm4, %xmm4, %xmm4
divsd %xmm4, %xmm4
vcvtsi2sdq %rdx, %xmm4, %xmm4
maxsd %xmm4, %xmm4
vfnmadd213sd %xmm4, %xmm4, %xmm4
vfmadd132sd %xmm4, %xmm4, %xmm4
vmaxsd %xmm4, %xmm4, %xmm4
vfnmadd132sd %xmm4, %xmm4, %xmm4
sqrtsd %xmm4, %xmm4
## Chain on %xmm5.
vcvtsi2sdq %rdx, %xmm5, %xmm5
vfmadd213sd %xmm5, %xmm5, %xmm5
divsd %xmm5, %xmm5
vcvtsi2sdq %rdx, %xmm5, %xmm5
maxsd %xmm5, %xmm5
vfnmadd213sd %xmm5, %xmm5, %xmm5
vfmadd132sd %xmm5, %xmm5, %xmm5
vmaxsd %xmm5, %xmm5, %xmm5
vfnmadd132sd %xmm5, %xmm5, %xmm5
sqrtsd %xmm5, %xmm5
## Chain on %xmm6.
vcvtsi2sdq %rdx, %xmm6, %xmm6
vfmadd213sd %xmm6, %xmm6, %xmm6
divsd %xmm6, %xmm6
vcvtsi2sdq %rdx, %xmm6, %xmm6
maxsd %xmm6, %xmm6
vfnmadd213sd %xmm6, %xmm6, %xmm6
vfmadd132sd %xmm6, %xmm6, %xmm6
vmaxsd %xmm6, %xmm6, %xmm6
vfnmadd132sd %xmm6, %xmm6, %xmm6
sqrtsd %xmm6, %xmm6
## Chain on %xmm7.
vcvtsi2sdq %rdx, %xmm7, %xmm7
vfmadd213sd %xmm7, %xmm7, %xmm7
divsd %xmm7, %xmm7
vcvtsi2sdq %rdx, %xmm7, %xmm7
maxsd %xmm7, %xmm7
vfnmadd213sd %xmm7, %xmm7, %xmm7
vfmadd132sd %xmm7, %xmm7, %xmm7
vmaxsd %xmm7, %xmm7, %xmm7
vfnmadd132sd %xmm7, %xmm7, %xmm7
sqrtsd %xmm7, %xmm7
## InlineAsm End
## Counter update: %rax = counter + 1 becomes the new counter (and the return
## value on exit); counter + 2 is formed only for the comparison with %rdi.
leaq 1(%rcx), %rax
addq $2, %rcx
cmpq %rdi, %rcx
movq %rax, %rcx
## Repeat while (old counter + 2) < %rdi, signed.
jl LBB0_3
retq
LBB0_1:
## Non-positive count: return 0 without running the kernel.
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (4751663,),
'frequency': 2600000000.0,
'iterations': 4751663,
'parallel_factor': 8,
'returned': [4751662, 4751662, 4751662, 4751662],
'runtimes': [0.1328908569412306,
0.1338977849809453,
0.1339660519734025,
0.13359365100041032]}
minimal throughput: 9.09 cy
## Selected Instructions
RCPSSr, VCVTSI2SSrr, MULSSrr, VCVTSD2SSrr, VROUNDSSr, VRCPSSr, VCVTSI2SSrr, VSQRTSSr, VFNMADD231SSr, VSQRTSSr
## Generated Assembly (8x parallel)
## _test: benchmark kernel that issues ten scalar single-precision
## instructions (rcpss, vcvtsi2ssl, mulss, vcvtsd2ss, vroundss, vrcpss,
## vcvtsi2ssl, vsqrtss, vfnmadd231ss, vsqrtss) eight times per loop iteration.
##   In:  %rdi = iteration count (compared as a signed value)
##   Out: %rax = %rdi - 1 after the loop, or 0 when %rdi <= 0
##   Writes: %rax, %rcx, %rdx, %xmm0-%xmm9, flags
## NOTE(review): six of the eight chains use %xmm3 as their working register
## (only the %xmm2 and %xmm1 chains stay in their own register). Scalar
## instructions keep the destination's upper lanes, so those six chains may be
## serialized through %xmm3 rather than independent - confirm before relying
## on the measured throughput.
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.section __TEXT,__literal4,4byte_literals
.p2align 2
LCPI0_0:
## 0x3F802000: IEEE-754 single-precision bit pattern of 1.0009765625 (1 + 2^-10).
.long 1065361408
.section __TEXT,__literal8,8byte_literals
.p2align 3
LCPI0_1:
## 0x3FF0040000000000: the same value, 1.0009765625, as a double.
.quad 4607186816846528512
.section __TEXT,__text,regular,pure_instructions
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
## Skip the loop entirely when the requested count is zero or negative.
testq %rdi, %rdi
jle LBB0_1
## Loop counter starts at -1 (see the counter update after the loop body).
movq $-1, %rcx
## Load the float constant into %xmm2 through its absolute address.
movabsq $LCPI0_0, %rax
vmovss (%rax), %xmm2
## %edx = 3 is the integer source operand of every vcvtsi2ssl in the loop.
movl $3, %edx
## Load the double constant into %xmm0; it is the source of every vcvtsd2ss
## and is never written inside the loop.
movabsq $LCPI0_1, %rax
vmovsd (%rax), %xmm0
## Give seven more registers the float starting value.
vmovaps %xmm2, %xmm8
vmovaps %xmm2, %xmm9
vmovaps %xmm2, %xmm4
vmovaps %xmm2, %xmm5
vmovaps %xmm2, %xmm6
vmovaps %xmm2, %xmm7
vmovaps %xmm2, %xmm1
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
## Chain on %xmm2 (stays in %xmm2 throughout).
rcpss %xmm2, %xmm2
vcvtsi2ssl %edx, %xmm2, %xmm2
mulss %xmm2, %xmm2
vcvtsd2ss %xmm0, %xmm2, %xmm2
vroundss $1, %xmm2, %xmm2, %xmm2
vrcpss %xmm2, %xmm2, %xmm2
vcvtsi2ssl %edx, %xmm2, %xmm2
vsqrtss %xmm2, %xmm2, %xmm2
vfnmadd231ss %xmm2, %xmm2, %xmm2
vsqrtss %xmm2, %xmm2, %xmm2
## Chain for %xmm8: read from %xmm8, work in %xmm3, result back to %xmm8.
rcpss %xmm8, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
mulss %xmm3, %xmm3
vcvtsd2ss %xmm0, %xmm3, %xmm3
vroundss $1, %xmm3, %xmm3, %xmm3
vrcpss %xmm3, %xmm3, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm3
vfnmadd231ss %xmm3, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm8
## Chain for %xmm9: read from %xmm9, work in %xmm3, result back to %xmm9.
rcpss %xmm9, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
mulss %xmm3, %xmm3
vcvtsd2ss %xmm0, %xmm3, %xmm3
vroundss $1, %xmm3, %xmm3, %xmm3
vrcpss %xmm3, %xmm3, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm3
vfnmadd231ss %xmm3, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm9
## Chain for %xmm4: read from %xmm4, work in %xmm3, result back to %xmm4.
rcpss %xmm4, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
mulss %xmm3, %xmm3
vcvtsd2ss %xmm0, %xmm3, %xmm3
vroundss $1, %xmm3, %xmm3, %xmm3
vrcpss %xmm3, %xmm3, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm3
vfnmadd231ss %xmm3, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm4
## Chain for %xmm5: read from %xmm5, work in %xmm3, result back to %xmm5.
rcpss %xmm5, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
mulss %xmm3, %xmm3
vcvtsd2ss %xmm0, %xmm3, %xmm3
vroundss $1, %xmm3, %xmm3, %xmm3
vrcpss %xmm3, %xmm3, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm3
vfnmadd231ss %xmm3, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm5
## Chain for %xmm6: read from %xmm6, work in %xmm3, result back to %xmm6.
rcpss %xmm6, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
mulss %xmm3, %xmm3
vcvtsd2ss %xmm0, %xmm3, %xmm3
vroundss $1, %xmm3, %xmm3, %xmm3
vrcpss %xmm3, %xmm3, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm3
vfnmadd231ss %xmm3, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm6
## Chain for %xmm7: read from %xmm7, work in %xmm3, result back to %xmm7.
rcpss %xmm7, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
mulss %xmm3, %xmm3
vcvtsd2ss %xmm0, %xmm3, %xmm3
vroundss $1, %xmm3, %xmm3, %xmm3
vrcpss %xmm3, %xmm3, %xmm3
vcvtsi2ssl %edx, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm3
vfnmadd231ss %xmm3, %xmm3, %xmm3
vsqrtss %xmm3, %xmm3, %xmm7
## Chain on %xmm1 (stays in %xmm1 throughout).
rcpss %xmm1, %xmm1
vcvtsi2ssl %edx, %xmm1, %xmm1
mulss %xmm1, %xmm1
vcvtsd2ss %xmm0, %xmm1, %xmm1
vroundss $1, %xmm1, %xmm1, %xmm1
vrcpss %xmm1, %xmm1, %xmm1
vcvtsi2ssl %edx, %xmm1, %xmm1
vsqrtss %xmm1, %xmm1, %xmm1
vfnmadd231ss %xmm1, %xmm1, %xmm1
vsqrtss %xmm1, %xmm1, %xmm1
## InlineAsm End
## Counter update: %rax = counter + 1 becomes the new counter (and the return
## value on exit); counter + 2 is formed only for the comparison with %rdi.
leaq 1(%rcx), %rax
addq $2, %rcx
cmpq %rdi, %rcx
movq %rax, %rcx
## Repeat while (old counter + 2) < %rdi, signed.
jl LBB0_3
retq
LBB0_1:
## Non-positive count: return 0 without running the kernel.
xorl %eax, %eax
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (1245590,),
'frequency': 2600000000.0,
'iterations': 1245590,
'parallel_factor': 8,
'returned': [1245589, 1245589, 1245589, 1245589],
'runtimes': [0.1349610739853233,
0.1329138990258798,
0.13283832801971585,
0.1326144520426169]}
minimal throughput: 34.60 cy
## Selected Instructions
ROR16ri, CMOVS16rr, SBB16ri, ADC16ri8, XOR16ri8, BTR16rr, XOR16ri8, SAR16r1, DEC16r, SUB16ri
## Generated Assembly (8x parallel)
## _test: benchmark kernel that repeats ten 16-bit integer instructions (rorw,
## cmovsw, sbbw, adcw, xorw, btrw, xorw, sarw, decw, subw) on each of eight
## registers (%r9w, %r10w, %r11w, %cx, %si, %dx, %bx, %bp) once per iteration.
##   In:  %rdi = iteration count (compared as a signed value)
##   Out: %rax = %rdi - 1 after the loop, or 0 when %rdi <= 0
##   Saves/restores: %rbp, %rbx.  Writes: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
## Preserve %rbp and %rbx: their low 16 bits are used as chain registers.
pushq %rbp
.cfi_def_cfa_offset 16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset %rbx, -24
.cfi_offset %rbp, -16
## Skip the loop entirely when the requested count is zero or negative.
testq %rdi, %rdi
jle LBB0_1
## Every chain register starts at 3; the loop counter %r8 starts at -1
## (see the counter update after the loop body).
movw $3, %r9w
movq $-1, %r8
movw $3, %r10w
movw $3, %r11w
movw $3, %cx
movw $3, %si
movw $3, %dx
movw $3, %bx
movw $3, %bp
.p2align 4, 0x90
LBB0_3:
## Loop body: the same ten instructions are issued once per register, with
## that register as the only register operand.
## InlineAsm Start
## Chain on %r9w.
rorw %r9w
cmovsw %r9w, %r9w
sbbw $1, %r9w
adcw $1, %r9w
xorw $1, %r9w
btrw %r9w, %r9w
xorw $1, %r9w
sarw %r9w
decw %r9w
subw $1, %r9w
## Chain on %r10w.
rorw %r10w
cmovsw %r10w, %r10w
sbbw $1, %r10w
adcw $1, %r10w
xorw $1, %r10w
btrw %r10w, %r10w
xorw $1, %r10w
sarw %r10w
decw %r10w
subw $1, %r10w
## Chain on %r11w.
rorw %r11w
cmovsw %r11w, %r11w
sbbw $1, %r11w
adcw $1, %r11w
xorw $1, %r11w
btrw %r11w, %r11w
xorw $1, %r11w
sarw %r11w
decw %r11w
subw $1, %r11w
## Chain on %cx.
rorw %cx
cmovsw %cx, %cx
sbbw $1, %cx
adcw $1, %cx
xorw $1, %cx
btrw %cx, %cx
xorw $1, %cx
sarw %cx
decw %cx
subw $1, %cx
## Chain on %si.
rorw %si
cmovsw %si, %si
sbbw $1, %si
adcw $1, %si
xorw $1, %si
btrw %si, %si
xorw $1, %si
sarw %si
decw %si
subw $1, %si
## Chain on %dx.
rorw %dx
cmovsw %dx, %dx
sbbw $1, %dx
adcw $1, %dx
xorw $1, %dx
btrw %dx, %dx
xorw $1, %dx
sarw %dx
decw %dx
subw $1, %dx
## Chain on %bx.
rorw %bx
cmovsw %bx, %bx
sbbw $1, %bx
adcw $1, %bx
xorw $1, %bx
btrw %bx, %bx
xorw $1, %bx
sarw %bx
decw %bx
subw $1, %bx
## Chain on %bp.
rorw %bp
cmovsw %bp, %bp
sbbw $1, %bp
adcw $1, %bp
xorw $1, %bp
btrw %bp, %bp
xorw $1, %bp
sarw %bp
decw %bp
subw $1, %bp
## InlineAsm End
## Counter update: %rax = counter + 1 becomes the new counter (and the return
## value on exit); counter + 2 is formed only for the comparison with %rdi.
leaq 1(%r8), %rax
addq $2, %r8
cmpq %rdi, %r8
movq %rax, %r8
## Repeat while (old counter + 2) < %rdi, signed.
jl LBB0_3
## Restore the saved registers in reverse push order.
popq %rbx
popq %rbp
retq
LBB0_1:
## Non-positive count: return 0, restoring the saved registers first.
xorl %eax, %eax
popq %rbx
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (10000000,),
'frequency': 2600000000.0,
'iterations': 10000000,
'parallel_factor': 8,
'returned': [9999999, 9999999, 9999999, 9999999],
'runtimes': [0.18081542605068535,
0.17877629201393574,
0.17950556799769402,
0.1797733639832586]}
minimal throughput: 5.81 cy
## Selected Instructions
SHLX32rr, CMOVO32rr, MOV32rr, CMOVS32rr, CRC32r32r8, SHR32r1, ADD32rr, CRC32r32r8, RCR32ri, SHR32r1
## Generated Assembly (8x parallel)
## _test: benchmark kernel that issues ten 32-bit integer instructions (shlxl,
## cmovol, movl, cmovsl, crc32b, shrl, addl, crc32b, rcrl, shrl) eight times
## per loop iteration.
##   In:  %rdi = iteration count (compared as a signed value)
##   Out: %rax = %rdi - 1 after the loop, or 0 when %rdi <= 0
##   Saves/restores: %rbp, %r15, %r14, %r13, %r12, %rbx
##   Writes: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
## Save six registers on entry; the shared epilogue at LBB0_4 pops them in
## reverse order.
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %r13
.cfi_def_cfa_offset 40
pushq %r12
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
## Skip the loop entirely when the requested count is zero or negative.
testq %rdi, %rdi
jle LBB0_1
## Eight 32-bit registers start at 3. %r9 is the loop counter (starts at -1)
## and %r8b = 3 is the byte source operand of every crc32b.
movl $3, %r10d
movq $-1, %r9
movb $3, %r8b
movl $3, %r12d
movl $3, %r13d
movl $3, %esi
movl $3, %ebx
movl $3, %ebp
movl $3, %ecx
movl $3, %edx
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
## All eight shlxl instances are issued first. The first four write
## temporaries (%r11d, %r14d, %r15d, %eax) from the same four input pairs that
## the next four then read before overwriting %ecx, %edx, %esi and %ebp.
shlxl %edx, %ecx, %r11d
shlxl %ebp, %ebx, %r14d
shlxl %esi, %r13d, %r15d
shlxl %r12d, %r10d, %eax
shlxl %edx, %ecx, %ecx
shlxl %ebp, %ebx, %edx
shlxl %esi, %r13d, %esi
shlxl %r12d, %r10d, %ebp
## Chain from the shlxl result in %ebp, continuing in %r10d after the movl.
cmovol %ebp, %ebp
movl %ebp, %r10d
cmovsl %r10d, %r10d
crc32b %r8b, %r10d
shrl %r10d
addl %r10d, %r10d
crc32b %r8b, %r10d
rcrl %r10d
shrl %r10d
## Chain from the shlxl result in %esi, continuing in %r12d.
cmovol %esi, %esi
movl %esi, %r12d
cmovsl %r12d, %r12d
crc32b %r8b, %r12d
shrl %r12d
addl %r12d, %r12d
crc32b %r8b, %r12d
rcrl %r12d
shrl %r12d
## Chain from the shlxl result in %edx, continuing in %r13d.
cmovol %edx, %edx
movl %edx, %r13d
cmovsl %r13d, %r13d
crc32b %r8b, %r13d
shrl %r13d
addl %r13d, %r13d
crc32b %r8b, %r13d
rcrl %r13d
shrl %r13d
## Chain from the shlxl result in %ecx, continuing in %esi.
cmovol %ecx, %ecx
movl %ecx, %esi
cmovsl %esi, %esi
crc32b %r8b, %esi
shrl %esi
addl %esi, %esi
crc32b %r8b, %esi
rcrl %esi
shrl %esi
## Chain from the shlxl result in %eax, continuing in %ebx.
cmovol %eax, %eax
movl %eax, %ebx
cmovsl %ebx, %ebx
crc32b %r8b, %ebx
shrl %ebx
addl %ebx, %ebx
crc32b %r8b, %ebx
rcrl %ebx
shrl %ebx
## Chain from the shlxl result in %r15d, continuing in %ebp.
cmovol %r15d, %r15d
movl %r15d, %ebp
cmovsl %ebp, %ebp
crc32b %r8b, %ebp
shrl %ebp
addl %ebp, %ebp
crc32b %r8b, %ebp
rcrl %ebp
shrl %ebp
## Chain from the shlxl result in %r14d, continuing in %ecx.
cmovol %r14d, %r14d
movl %r14d, %ecx
cmovsl %ecx, %ecx
crc32b %r8b, %ecx
shrl %ecx
addl %ecx, %ecx
crc32b %r8b, %ecx
rcrl %ecx
shrl %ecx
## Chain from the shlxl result in %r11d, continuing in %edx.
cmovol %r11d, %r11d
movl %r11d, %edx
cmovsl %edx, %edx
crc32b %r8b, %edx
shrl %edx
addl %edx, %edx
crc32b %r8b, %edx
rcrl %edx
shrl %edx
## InlineAsm End
## Counter update: %rax = counter + 1 becomes the new counter (and the return
## value on exit); counter + 2 is formed only for the comparison with %rdi.
leaq 1(%r9), %rax
addq $2, %r9
cmpq %rdi, %r9
movq %rax, %r9
## Repeat while (old counter + 2) < %rdi, signed; otherwise go to the epilogue.
jl LBB0_3
jmp LBB0_4
LBB0_1:
## Non-positive count: return 0 and fall through into the shared epilogue.
xorl %eax, %eax
LBB0_4:
## Shared epilogue: restore the saved registers in reverse push order.
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (5002974,),
'frequency': 2600000000.0,
'iterations': 5002974,
'parallel_factor': 8,
'returned': [5002973, 5002973, 5002973, 5002973],
'runtimes': [0.1367008569650352,
0.13341521099209785,
0.13342649908736348,
0.133624273003079]}
minimal throughput: 8.67 cy
## Selected Instructions
SHRX64rr, SBB64ri32, AND64ri8, MOV64rc, INC64r, SUB64ri32, POPCNT64rr, OR64ri8, BTS64rr, ROL64ri
## Generated Assembly (8x parallel)
## _test: benchmark kernel that issues ten 64-bit integer instructions (shrxq,
## sbbq, andq, movq, incq, subq, popcntq, orq, btsq, rolq) eight times per
## loop iteration.
##   In:  %rdi = iteration count (compared as a signed value)
##   Out: %rax = %rdi - 1 after the loop, or 0 when %rdi <= 0
##   Saves/restores: %r15, %r14, %r13, %r12, %rbx
##   Writes: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
## Save five registers on entry; the shared epilogue at LBB0_4 pops them in
## reverse order.
pushq %r15
.cfi_def_cfa_offset 16
pushq %r14
.cfi_def_cfa_offset 24
pushq %r13
.cfi_def_cfa_offset 32
pushq %r12
.cfi_def_cfa_offset 40
pushq %rbx
.cfi_def_cfa_offset 48
.cfi_offset %rbx, -48
.cfi_offset %r12, -40
.cfi_offset %r13, -32
.cfi_offset %r14, -24
.cfi_offset %r15, -16
## Skip the loop entirely when the requested count is zero or negative.
testq %rdi, %rdi
jle LBB0_1
## %r8 is the loop counter (starts at -1); eight registers start at 3
## (32-bit moves also clear the upper halves of the 64-bit registers).
movq $-1, %r8
movl $3, %r9d
movl $3, %r11d
movl $3, %r12d
movl $3, %r13d
movl $3, %esi
movl $3, %ebx
movl $3, %ecx
movl $3, %edx
.p2align 4, 0x90
LBB0_3:
## InlineAsm Start
## All eight shrxq instances are issued first. The first four write
## temporaries (%r10, %r14, %r15, %rax) from the same four input pairs that
## the next four then read before overwriting %rcx, %rdx, %rsi and %rbx.
shrxq %rdx, %rcx, %r10
shrxq %rbx, %rsi, %r14
shrxq %r13, %r12, %r15
shrxq %r11, %r9, %rax
shrxq %rdx, %rcx, %rcx
shrxq %rbx, %rsi, %rdx
shrxq %r13, %r12, %rsi
shrxq %r11, %r9, %rbx
## Chain from the shrxq result in %rbx; popcntq moves it to %r9.
sbbq $1, %rbx
andq $1, %rbx
movq %rbx, %rbx
incq %rbx
subq $1, %rbx
popcntq %rbx, %r9
orq $1, %r9
btsq %r9, %r9
rolq %r9
## Chain from the shrxq result in %rsi; popcntq moves it to %r11.
sbbq $1, %rsi
andq $1, %rsi
movq %rsi, %rsi
incq %rsi
subq $1, %rsi
popcntq %rsi, %r11
orq $1, %r11
btsq %r11, %r11
rolq %r11
## Chain from the shrxq result in %rdx; popcntq moves it to %r12.
sbbq $1, %rdx
andq $1, %rdx
movq %rdx, %rdx
incq %rdx
subq $1, %rdx
popcntq %rdx, %r12
orq $1, %r12
btsq %r12, %r12
rolq %r12
## Chain from the shrxq result in %rcx; popcntq moves it to %r13.
sbbq $1, %rcx
andq $1, %rcx
movq %rcx, %rcx
incq %rcx
subq $1, %rcx
popcntq %rcx, %r13
orq $1, %r13
btsq %r13, %r13
rolq %r13
## Chain from the shrxq result in %rax; popcntq moves it to %rsi.
sbbq $1, %rax
andq $1, %rax
movq %rax, %rax
incq %rax
subq $1, %rax
popcntq %rax, %rsi
orq $1, %rsi
btsq %rsi, %rsi
rolq %rsi
## Chain from the shrxq result in %r15; movq moves it to %rax and popcntq
## to %rbx.
sbbq $1, %r15
andq $1, %r15
movq %r15, %rax
incq %rax
subq $1, %rax
popcntq %rax, %rbx
orq $1, %rbx
btsq %rbx, %rbx
rolq %rbx
## Chain from the shrxq result in %r14; movq moves it to %rax and popcntq
## to %rcx.
sbbq $1, %r14
andq $1, %r14
movq %r14, %rax
incq %rax
subq $1, %rax
popcntq %rax, %rcx
orq $1, %rcx
btsq %rcx, %rcx
rolq %rcx
## Chain from the shrxq result in %r10; movq moves it to %rax and popcntq
## to %rdx.
sbbq $1, %r10
andq $1, %r10
movq %r10, %rax
incq %rax
subq $1, %rax
popcntq %rax, %rdx
orq $1, %rdx
btsq %rdx, %rdx
rolq %rdx
## InlineAsm End
## Counter update: %rax = counter + 1 becomes the new counter (and the return
## value on exit); counter + 2 is formed only for the comparison with %rdi.
leaq 1(%r8), %rax
addq $2, %r8
cmpq %rdi, %r8
movq %rax, %r8
## Repeat while (old counter + 2) < %rdi, signed; otherwise go to the epilogue.
jl LBB0_3
jmp LBB0_4
LBB0_1:
## Non-positive count: return 0 and fall through into the shared epilogue.
xorl %eax, %eax
LBB0_4:
## Shared epilogue: restore the saved registers in reverse push order.
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (6115790,),
'frequency': 2600000000.0,
'iterations': 6115790,
'parallel_factor': 8,
'returned': [6115789, 6115789, 6115789, 6115789],
'runtimes': [0.13210842409171164,
0.13202582008671016,
0.13190629601012915,
0.1326259489869699]}
minimal throughput: 7.01 cy
## Selected Instructions
SAR8r1, SHR8ri, INC8r, AND8rr, RCR8ri, ROL8ri, SUB8ri, SBB8rr, NEG8r, NOT8r
## Generated Assembly (8x parallel)
## _test: benchmark kernel that repeats ten 8-bit integer instructions (sarb,
## shrb, incb, andb, rcrb, rolb, subb, sbbb, negb, notb) on each of eight
## registers (%r8b, %r9b, %r10b, %r11b, %bpl, %dl, %bl, %cl) once per iteration.
##   In:  %rdi = iteration count (compared as a signed value)
##   Out: %rax = %rdi - 1 after the loop, or 0 when %rdi <= 0
##   Saves/restores: %rbp, %rbx.  Writes: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 13
.globl _test
.p2align 4, 0x90
_test:
.cfi_startproc
## Preserve %rbp and %rbx: their low bytes are used as chain registers.
pushq %rbp
.cfi_def_cfa_offset 16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset %rbx, -24
.cfi_offset %rbp, -16
## Skip the loop entirely when the requested count is zero or negative.
testq %rdi, %rdi
jle LBB0_1
## Every chain register starts at 3; the loop counter %rsi starts at -1
## (see the counter update after the loop body).
movb $3, %r8b
movq $-1, %rsi
movb $3, %r9b
movb $3, %r10b
movb $3, %r11b
movb $3, %bpl
movb $3, %dl
movb $3, %bl
movb $3, %cl
.p2align 4, 0x90
LBB0_3:
## Loop body: the same ten instructions are issued once per register, with
## that register as the only register operand.
## InlineAsm Start
## Chain on %r8b.
sarb %r8b
shrb %r8b
incb %r8b
andb %r8b, %r8b
rcrb %r8b
rolb %r8b
subb $1, %r8b
sbbb %r8b, %r8b
negb %r8b
notb %r8b
## Chain on %r9b.
sarb %r9b
shrb %r9b
incb %r9b
andb %r9b, %r9b
rcrb %r9b
rolb %r9b
subb $1, %r9b
sbbb %r9b, %r9b
negb %r9b
notb %r9b
## Chain on %r10b.
sarb %r10b
shrb %r10b
incb %r10b
andb %r10b, %r10b
rcrb %r10b
rolb %r10b
subb $1, %r10b
sbbb %r10b, %r10b
negb %r10b
notb %r10b
## Chain on %r11b.
sarb %r11b
shrb %r11b
incb %r11b
andb %r11b, %r11b
rcrb %r11b
rolb %r11b
subb $1, %r11b
sbbb %r11b, %r11b
negb %r11b
notb %r11b
## Chain on %bpl.
sarb %bpl
shrb %bpl
incb %bpl
andb %bpl, %bpl
rcrb %bpl
rolb %bpl
subb $1, %bpl
sbbb %bpl, %bpl
negb %bpl
notb %bpl
## Chain on %dl.
sarb %dl
shrb %dl
incb %dl
andb %dl, %dl
rcrb %dl
rolb %dl
subb $1, %dl
sbbb %dl, %dl
negb %dl
notb %dl
## Chain on %bl.
sarb %bl
shrb %bl
incb %bl
andb %bl, %bl
rcrb %bl
rolb %bl
subb $1, %bl
sbbb %bl, %bl
negb %bl
notb %bl
## Chain on %cl.
sarb %cl
shrb %cl
incb %cl
andb %cl, %cl
rcrb %cl
rolb %cl
subb $1, %cl
sbbb %cl, %cl
negb %cl
notb %cl
## InlineAsm End
## Counter update: %rax = counter + 1 becomes the new counter (and the return
## value on exit); counter + 2 is formed only for the comparison with %rdi.
leaq 1(%rsi), %rax
addq $2, %rsi
cmpq %rdi, %rsi
movq %rax, %rsi
## Repeat while (old counter + 2) < %rdi, signed.
jl LBB0_3
## Restore the saved registers in reverse push order.
popq %rbx
popq %rbp
retq
LBB0_1:
## Non-positive count: return 0, restoring the saved registers first.
xorl %eax, %eax
popq %rbx
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
## Detailed Results
{'arguments': (10000000,),
'frequency': 2600000000.0,
'iterations': 10000000,
'parallel_factor': 8,
'returned': [9999999, 9999999, 9999999, 9999999],
'runtimes': [0.12684946099761873,
0.1267317159799859,
0.1264580050483346,
0.1265630420530215]}
minimal throughput: 4.11 cy