## Selected Instructions VPERMILPSri, MULPSrr, ANDPDrr, VPSIGNBrr, PSIGNBrr, PMOVZXWDrr, PMINUWrr, PADDSWrr, VPSHUFHWri, MOVUPDrr ## Generated Assembly (1x parallel) .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 13 .section __TEXT,__literal4,4byte_literals .p2align 2 LCPI0_0: .long 1065361408 .section __TEXT,__text,regular,pure_instructions .globl _test .p2align 4, 0x90 _test: .cfi_startproc testq %rdi, %rdi jle LBB0_1 movabsq $LCPI0_0, %rax vbroadcastss (%rax), %xmm0 movq $-1, %rcx .p2align 4, 0x90 LBB0_3: ## InlineAsm Start vpermilps $1, %xmm0, %xmm0 mulps %xmm0, %xmm0 andpd %xmm0, %xmm0 vpsignb %xmm0, %xmm0, %xmm0 psignb %xmm0, %xmm0 pmovzxwd %xmm0, %xmm0 pminuw %xmm0, %xmm0 paddsw %xmm0, %xmm0 vpshufhw $1, %xmm0, %xmm0 movupd %xmm0, %xmm0 ## InlineAsm End leaq 1(%rcx), %rax addq $2, %rcx cmpq %rdi, %rcx movq %rax, %rcx jl LBB0_3 retq LBB0_1: xorl %eax, %eax retq .cfi_endproc .subsections_via_symbols ## Detailed Results {'arguments': (24655919,), 'frequency': 2600000000.0, 'iterations': 24655919, 'parallel_factor': 1, 'returned': [24655918, 24655918, 24655918, 24655918], 'runtimes': [0.13202582497615367, 0.13208268792368472, 0.13151856907643378, 0.13161470007617027]} minimal throughput: 13.87 cy ## Selected Instructions VFMADD132PDYr, VPADDWYrr, VFMADD132PSYr, VPADDDYrr, VSUBPDYrr, VPACKUSDWYrr, VPMULHUWYrr, VMINPDYrr, VPUNPCKLWDYrr, VBLENDVPSYrr ## Generated Assembly (1x parallel) .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 13 .section __TEXT,__literal4,4byte_literals .p2align 2 LCPI0_0: .long 1065361408 .section __TEXT,__text,regular,pure_instructions .globl _test .p2align 4, 0x90 _test: .cfi_startproc testq %rdi, %rdi jle LBB0_1 movabsq $LCPI0_0, %rax vbroadcastss (%rax), %ymm0 movq $-1, %rcx .p2align 4, 0x90 LBB0_3: ## InlineAsm Start vfmadd132pd %ymm0, %ymm0, %ymm0 vpaddw %ymm0, %ymm0, %ymm0 vfmadd132ps %ymm0, %ymm0, %ymm0 vpaddd %ymm0, %ymm0, %ymm0 vsubpd %ymm0, %ymm0, %ymm0 vpackusdw %ymm0, %ymm0, %ymm0 vpmulhuw %ymm0, %ymm0, %ymm0 vminpd %ymm0, %ymm0, %ymm0 vpunpcklwd %ymm0, %ymm0, %ymm0 vblendvps %ymm0, %ymm0, %ymm0, %ymm0 ## InlineAsm End leaq 1(%rcx), %rax addq $2, %rcx cmpq %rdi, %rcx movq %rax, %rcx jl LBB0_3 vzeroupper retq LBB0_1: xorl %eax, %eax retq .cfi_endproc .subsections_via_symbols ## Detailed Results {'arguments': (10000000,), 'frequency': 2600000000.0, 'iterations': 10000000, 'parallel_factor': 1, 'returned': [9999999, 9999999, 9999999, 9999999], 'runtimes': [0.11892832000739872, 0.11891822703182697, 0.11902078497223556, 0.12094117503147572]} minimal throughput: 30.92 cy ## Selected Instructions VCVTSI642SDrr, VFMADD213SDr, DIVSDrr, VCVTSI642SDrr, MAXSDrr, VFNMADD213SDr, VFMADD132SDr, VMAXSDrr, VFNMADD132SDr, SQRTSDr ## Generated Assembly (1x parallel) .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 13 .section __TEXT,__literal8,8byte_literals .p2align 3 LCPI0_0: .quad 4607186816846528512 .section __TEXT,__text,regular,pure_instructions .globl _test .p2align 4, 0x90 _test: .cfi_startproc testq %rdi, %rdi jle LBB0_1 movq $-1, %rcx movabsq $LCPI0_0, %rax vmovsd (%rax), %xmm0 movl $3, %edx .p2align 4, 0x90 LBB0_3: ## InlineAsm Start vcvtsi2sdq %rdx, %xmm0, %xmm0 vfmadd213sd %xmm0, %xmm0, %xmm0 divsd %xmm0, %xmm0 vcvtsi2sdq %rdx, %xmm0, %xmm0 maxsd %xmm0, %xmm0 vfnmadd213sd %xmm0, %xmm0, %xmm0 vfmadd132sd %xmm0, %xmm0, %xmm0 vmaxsd %xmm0, %xmm0, %xmm0 vfnmadd132sd %xmm0, %xmm0, %xmm0 sqrtsd %xmm0, %xmm0 ## InlineAsm End leaq 1(%rcx), %rax addq $2, %rcx cmpq %rdi, %rcx movq %rax, %rcx jl LBB0_3 retq LBB0_1: xorl %eax, %eax retq .cfi_endproc .subsections_via_symbols ## Detailed Results {'arguments': (5841530,), 'frequency': 2600000000.0, 'iterations': 5841530, 'parallel_factor': 1, 'returned': [5841529, 5841529, 5841529, 5841529], 'runtimes': [0.13433505699504167, 0.13318849296774715, 0.13303690601605922, 0.13309408095665276]} minimal throughput: 59.21 cy ## Selected Instructions RCPSSr, VCVTSI2SSrr, MULSSrr, VCVTSD2SSrr, VROUNDSSr, VRCPSSr, VCVTSI2SSrr, VSQRTSSr, VFNMADD231SSr, VSQRTSSr ## Generated Assembly (1x parallel) .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 13 .section __TEXT,__literal4,4byte_literals .p2align 2 LCPI0_0: .long 1065361408 .section __TEXT,__literal8,8byte_literals .p2align 3 LCPI0_1: .quad 4607186816846528512 .section __TEXT,__text,regular,pure_instructions .globl _test .p2align 4, 0x90 _test: .cfi_startproc testq %rdi, %rdi jle LBB0_1 movq $-1, %rcx movabsq $LCPI0_0, %rax vmovss (%rax), %xmm1 movl $3, %edx movabsq $LCPI0_1, %rax vmovsd (%rax), %xmm0 .p2align 4, 0x90 LBB0_3: ## InlineAsm Start rcpss %xmm1, %xmm1 vcvtsi2ssl %edx, %xmm1, %xmm1 mulss %xmm1, %xmm1 vcvtsd2ss %xmm0, %xmm1, %xmm1 vroundss $1, %xmm1, %xmm1, %xmm1 vrcpss %xmm1, %xmm1, %xmm1 vcvtsi2ssl %edx, %xmm1, %xmm1 vsqrtss %xmm1, %xmm1, %xmm1 vfnmadd231ss %xmm1, %xmm1, %xmm1 vsqrtss %xmm1, %xmm1, %xmm1 ## InlineAsm End leaq 1(%rcx), %rax addq $2, %rcx cmpq %rdi, %rcx movq %rax, %rcx jl LBB0_3 retq LBB0_1: xorl %eax, %eax retq .cfi_endproc .subsections_via_symbols ## Detailed Results {'arguments': (6011291,), 'frequency': 2600000000.0, 'iterations': 6011291, 'parallel_factor': 1, 'returned': [6011290, 6011290, 6011290, 6011290], 'runtimes': [0.13239118899218738, 0.13244657206814736, 0.1326694720191881, 0.13262002903502434]} minimal throughput: 57.26 cy ## Selected Instructions ROR16ri, CMOVS16rr, SBB16ri, ADC16ri8, XOR16ri8, BTR16rr, XOR16ri8, SAR16r1, DEC16r, SUB16ri ## Generated Assembly (1x parallel) .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 13 .globl _test .p2align 4, 0x90 _test: .cfi_startproc testq %rdi, %rdi jle LBB0_1 movw $3, %cx movq $-1, %rdx .p2align 4, 0x90 LBB0_3: ## InlineAsm Start rorw %cx cmovsw %cx, %cx sbbw $1, %cx adcw $1, %cx xorw $1, %cx btrw %cx, %cx xorw $1, %cx sarw %cx decw %cx subw $1, %cx ## InlineAsm End leaq 1(%rdx), %rax addq $2, %rdx cmpq %rdi, %rdx movq %rax, %rdx jl LBB0_3 retq LBB0_1: xorl %eax, %eax retq .cfi_endproc .subsections_via_symbols ## Detailed Results {'arguments': (31283731,), 'frequency': 2600000000.0, 'iterations': 31283731, 'parallel_factor': 1, 'returned': [31283730, 31283730, 31283730, 31283730], 'runtimes': [0.13291946100071073, 0.13294463406782597, 0.1332225619116798, 0.13287500606384128]} minimal throughput: 11.04 cy ## Selected Instructions SHLX32rr, CMOVO32rr, MOV32rr, CMOVS32rr, CRC32r32r8, SHR32r1, ADD32rr, CRC32r32r8, RCR32ri, SHR32r1 ## Generated Assembly (1x parallel) .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 13 .globl _test .p2align 4, 0x90 _test: .cfi_startproc testq %rdi, %rdi jle LBB0_1 movl $3, %esi movq $-1, %rdx movb $3, %cl .p2align 4, 0x90 LBB0_3: ## InlineAsm Start shlxl %esi, %esi, %eax cmovol %eax, %eax movl %eax, %esi cmovsl %esi, %esi crc32b %cl, %esi shrl %esi addl %esi, %esi crc32b %cl, %esi rcrl %esi shrl %esi ## InlineAsm End leaq 1(%rdx), %rax addq $2, %rdx cmpq %rdi, %rdx movq %rax, %rdx jl LBB0_3 retq LBB0_1: xorl %eax, %eax retq .cfi_endproc .subsections_via_symbols ## Detailed Results {'arguments': (24008543,), 'frequency': 2600000000.0, 'iterations': 24008543, 'parallel_factor': 1, 'returned': [24008542, 24008542, 24008542, 24008542], 'runtimes': [0.13333229208365083, 0.13314284407533705, 0.13381975598167628, 0.13447994901798666]} minimal throughput: 14.42 cy ## Selected Instructions SHRX64rr, SBB64ri32, AND64ri8, MOV64rc, INC64r, SUB64ri32, POPCNT64rr, OR64ri8, BTS64rr, ROL64ri ## Generated Assembly (1x parallel) .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 13 .globl _test .p2align 4, 0x90 _test: .cfi_startproc testq %rdi, %rdi jle LBB0_1 movq $-1, %rcx movl $3, %edx .p2align 4, 0x90 LBB0_3: ## InlineAsm Start shrxq %rdx, %rdx, %rax sbbq $1, %rax andq $1, %rax movq %rax, %rax incq %rax subq $1, %rax popcntq %rax, %rdx orq $1, %rdx btsq %rdx, %rdx rolq %rdx ## InlineAsm End leaq 1(%rcx), %rax addq $2, %rcx cmpq %rdi, %rcx movq %rax, %rcx jl LBB0_3 retq LBB0_1: xorl %eax, %eax retq .cfi_endproc .subsections_via_symbols ## Detailed Results {'arguments': (27539225,), 'frequency': 2600000000.0, 'iterations': 27539225, 'parallel_factor': 1, 'returned': [27539224, 27539224, 27539224, 27539224], 'runtimes': [0.1335972750093788, 0.13322542910464108, 0.13357082300353795, 0.13376462296582758]} minimal throughput: 12.58 cy ## Selected Instructions SAR8r1, SHR8ri, INC8r, AND8rr, RCR8ri, ROL8ri, SUB8ri, SBB8rr, NEG8r, NOT8r ## Generated Assembly (1x parallel) .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 13 .globl _test .p2align 4, 0x90 _test: .cfi_startproc testq %rdi, %rdi jle LBB0_1 movb $3, %cl movq $-1, %rdx .p2align 4, 0x90 LBB0_3: ## InlineAsm Start sarb %cl shrb %cl incb %cl andb %cl, %cl rcrb %cl rolb %cl subb $1, %cl sbbb %cl, %cl negb %cl notb %cl ## InlineAsm End leaq 1(%rdx), %rax addq $2, %rdx cmpq %rdi, %rdx movq %rax, %rdx jl LBB0_3 retq LBB0_1: xorl %eax, %eax retq .cfi_endproc .subsections_via_symbols ## Detailed Results {'arguments': (30431254,), 'frequency': 2600000000.0, 'iterations': 30431254, 'parallel_factor': 1, 'returned': [30431253, 30431253, 30431253, 30431253], 'runtimes': [0.13894746906589717, 0.1348069809610024, 0.13318019802682102, 0.13318415405228734]} minimal throughput: 11.38 cy