mirror of
https://github.com/RRZE-HPC/ibench.git
synced 2025-07-21 04:41:09 +02:00
initial import
This commit is contained in:
26
Makefile
Normal file
26
Makefile
Normal file
@@ -0,0 +1,26 @@
|
||||
# Toolchain selection: include_$(COMPILER).mk is read below.
# Override on the command line, e.g. "make COMPILER=GCC".
COMPILER = ICC

# Name of the benchmark driver executable.
TARGET = ibench

# Root directory of the kernel sources.
SRC_DIR = src

# Output directories, one per sub-directory of $(SRC_DIR).
# The trailing "/." makes the glob match directories only. A regular file that
# happens to live in $(SRC_DIR) must never end up in this list, because the
# list is fed to mkdir and to "rm -rf" in the top-level directory.
KDIRS += $(patsubst $(SRC_DIR)/%/.,%,$(wildcard $(SRC_DIR)/*/.))

# Recipe prefix: "@" hides the commands; run "make Q=" to see them.
Q = @

# Provides CC, AS, CFLAGS, LFLAGS and KERNELS for the chosen toolchain.
include include_$(COMPILER).mk
|
||||
|
||||
# Build the driver from ibench.c after all kernels have been assembled.
# The output directories are order-only prerequisites: they have to exist,
# but their timestamps (which change whenever a kernel is written into them)
# must not cause the driver to be rebuilt.
$(TARGET): ibench.c $(KERNELS) | $(KDIRS)
	$(Q)echo "===> COMPILING $@"
	$(Q)$(CC) $(CFLAGS) $< -o $@ -ldl

# Every kernel is written into one of $(KDIRS), so the directories must exist
# before any kernel is assembled. Stating this explicitly keeps the build
# correct under "make -j" instead of relying on left-to-right evaluation.
# This rule must stay below $(TARGET) so that the default goal is unchanged.
$(KERNELS): | $(KDIRS)
|
||||
|
||||
# Create one kernel output directory. Only the directory that is the current
# target is created, and -p makes the command succeed when it already exists,
# so a partially populated build tree no longer breaks the build.
$(KDIRS):
	$(Q)mkdir -p $@
|
||||
|
||||
# Recipe shared by both kernel rules below: turn $(SRC_DIR)/<dir>/<name>.S
# into the shared object <dir>/<name>.so, creating <dir> if necessary.
define assemble-kernel
$(Q)echo "===> ASSEMBLING $@"
$(Q)mkdir -p $(@D)
$(Q)$(AS) $(LFLAGS) $(SRC_DIR)/$(@:.so=.S) -o $@
endef

# Known kernels depend on their source file, so editing a .S file triggers
# reassembly. A static pattern rule is used because it matches the complete
# target name including its directory part.
$(KERNELS): %.so: $(SRC_DIR)/%.S
	$(assemble-kernel)

# Fallback for shared objects requested by hand that are not in $(KERNELS).
%.so:
	$(assemble-kernel)
|
||||
|
||||
# "clean" is a command, not a file: run it even if a file of that name exists.
.PHONY: clean

# Delete everything the build produced: the kernel output directories with
# their contents, and the driver executable.
clean:
	$(Q)echo "===> CLEAN"
	$(Q)rm -rf $(KDIRS)
	$(Q)rm -f $(TARGET)
|
105
ibench.c
Normal file
105
ibench.c
Normal file
@@ -0,0 +1,105 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <dirent.h>
|
||||
#include <dlfcn.h>
|
||||
#include <immintrin.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
// Entry point of the kernel that is currently loaded. main() points it at the
// "latency" symbol of a shared object before benchmark() calls through it.
double (*latency)(int);

// Address of the "ninst" symbol of the currently loaded kernel. benchmark()
// divides the measured time by the value stored there.
int *ninst;
|
||||
|
||||
void benchmark(const int N, float freq, char *sofile) {
|
||||
struct timeval start, end;
|
||||
double benchtime;
|
||||
char *instr = strtok(sofile, ".");
|
||||
|
||||
double result;
|
||||
|
||||
// run benchmark
|
||||
gettimeofday(&start, NULL);
|
||||
result = (*latency)(N);
|
||||
gettimeofday(&end, NULL);
|
||||
|
||||
|
||||
benchtime = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec);
|
||||
// divide by 1e6 (usec -> s), ninst (number of instr per loop),
|
||||
// N/1e9 (loop count vs. GHz); multiply by frequency
|
||||
benchtime = benchtime / (1e6 * *ninst / freq * (N / 1e9));
|
||||
printf("%s:%s\t%.3f (clock cycles)\t[DEBUG - result: %f]\n", instr, strlen(instr) + 1 < 8 ? "\t" : "", benchtime, result);
|
||||
}
|
||||
|
||||
int main(int argc, const char *argv[]) {
|
||||
// one million runs
|
||||
const int N = 1000000;
|
||||
float freq = 0.0f;
|
||||
|
||||
// need a target directory containing benchmarks
|
||||
if (argc < 2) {
|
||||
printf("please specify a directory containing the shared objects with benchmarks to run\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// did the command line specify a frequency?
|
||||
if (argc < 3) {
|
||||
printf("Please specify the CPU frequency in GHz. For best results make "
|
||||
"sure the frequency is fixed, otherwise SpeedStep/Turbo Boost "
|
||||
"might distort the results.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
freq = atof(argv[2]);
|
||||
printf("Using frequency %.2fGHz.\n", freq);
|
||||
|
||||
// perform benchmark for all shared objects in target directory
|
||||
DIR *dirp;
|
||||
struct dirent *dp;
|
||||
struct stat st;
|
||||
if ((dirp = opendir(argv[1])) == NULL) {
|
||||
perror("opendir");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
while ((dp = readdir(dirp)) != NULL) {
|
||||
// only try .so files
|
||||
char *suffix = ".so";
|
||||
int lensuffix = strlen(suffix);
|
||||
if (strncmp(dp->d_name + strlen(dp->d_name) - lensuffix, ".so", 3))
|
||||
continue;
|
||||
|
||||
// load .so
|
||||
void *handle;
|
||||
size_t len1 = strlen(argv[1]);
|
||||
size_t len2 = strlen(dp->d_name);
|
||||
// directory might be missing a trailing '/'
|
||||
char *relpath;
|
||||
if ((relpath = malloc(len1 + len2 + 2)) == NULL) {
|
||||
perror("malloc");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
snprintf(relpath, len1 + len2 + 2, "%s/%s", argv[1], dp->d_name);
|
||||
if ((handle = dlopen(relpath, RTLD_LAZY)) == NULL) {
|
||||
fprintf(stderr, "dlopen: failed to open %s: %s\n", relpath,
|
||||
dlerror());
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if ((latency = (double (*)(int))dlsym(handle, "latency")) == NULL) {
|
||||
fprintf(stderr, "dlsym: couldn't find function latency in %s: %s\n",
|
||||
relpath, dlerror());
|
||||
return (EXIT_FAILURE);
|
||||
}
|
||||
if ((ninst = (int *)dlsym(handle, "ninst")) == NULL) {
|
||||
fprintf(stderr, "dlsym: couldn't find symbol ninst in %s: %s\n",
|
||||
relpath, dlerror());
|
||||
return (EXIT_FAILURE);
|
||||
}
|
||||
free(relpath);
|
||||
|
||||
// do actual benchmark
|
||||
benchmark(N, freq, dp->d_name);
|
||||
|
||||
dlclose(handle);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
9
include_GCC.mk
Normal file
9
include_GCC.mk
Normal file
@@ -0,0 +1,9 @@
|
||||
# Toolchain settings for GCC, read by the Makefile when COMPILER=GCC.
CC = gcc
AS = gcc

# CFLAGS is applied to the C driver (ibench.c), so it must not force a source
# language: with "-x assembler-with-cpp" here gcc would treat ibench.c as
# assembly.
CFLAGS = -O3

# LFLAGS is applied when a kernel .S file is turned into a shared object.
# The language override belongs here, in front of the assembly source.
LFLAGS = -shared -x assembler-with-cpp

# Kernels to build: one shared object per .S file in the listed groups.
# NOTE(review): include_ICC.mk spells its AVX groups "AVX" and "AVX-512";
# confirm that src/avx and src/avx2 exist under these lower-case names.
KERNELS  = $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/gp/*.S))
KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/sse/*.S))
KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/avx/*.S))
KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/avx2/*.S))
|
9
include_ICC.mk
Normal file
9
include_ICC.mk
Normal file
@@ -0,0 +1,9 @@
|
||||
# Toolchain settings for the Intel compiler, read by the Makefile when
# COMPILER=ICC.
CC = icc
AS = icc
CFLAGS = -O3
LFLAGS = -shared

# Kernels to build: one shared object per .S file found in the listed
# sub-directories of $(SRC_DIR), in this order.
KERNELS = $(foreach d,gp sse AVX AVX-512,$(patsubst $(SRC_DIR)/%.S,%.so,$(wildcard $(SRC_DIR)/$(d)/*.S)))
|
7
include_MIC.mk
Normal file
7
include_MIC.mk
Normal file
@@ -0,0 +1,7 @@
|
||||
# Toolchain settings read by the Makefile when COMPILER=MIC: the Intel
# compiler with -mmic for both the driver and the kernels.
CC = icc
AS = icc
CFLAGS = -O3 -mmic
LFLAGS = -shared -mmic

# Kernels to build: one shared object per .S file found in the listed
# sub-directories of $(SRC_DIR), in this order.
KERNELS = $(foreach d,gp imci,$(patsubst $(SRC_DIR)/%.S,%.so,$(wildcard $(SRC_DIR)/$(d)/*.S)))
|
6
include_POWER8.mk
Normal file
6
include_POWER8.mk
Normal file
@@ -0,0 +1,6 @@
|
||||
# Toolchain settings read by the Makefile when COMPILER=POWER8: xlc is used
# for both the driver and the kernels.
CC = xlc
AS = xlc
CFLAGS = -O3
LFLAGS = -shared

# Kernels to build: one shared object per .S file in $(SRC_DIR)/vsx.
KERNELS = $(foreach d,vsx,$(patsubst $(SRC_DIR)/%.S,%.so,$(wildcard $(SRC_DIR)/$(d)/*.S)))
|
46
src/AVX-512/vaddpd-avx512-TP.S
Normal file
46
src/AVX-512/vaddpd-avx512-TP.S
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
 * Throughput kernel for the AVX-512 packed double-precision add (vaddpd).
 *
 * Exported symbols:
 *   ninst   - 32-bit count of measured instructions per loop iteration
 *   latency - entry point; the trip count is taken from edi and the loop
 *             body is skipped entirely for counts <= 0
 *
 * The six adds of the loop body write six different destination registers
 * and read only zmm0-zmm2, which the loop never modifies, so no add depends
 * on the result of another one.
 */
#define INSTR   vaddpd
#define NINST   6
#define N       edi
#define i       r8d


.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push      rbp
        mov       rbp, rsp
        xor       i, i
        test      N, N
        jle       done
        # create SSE DP 1.0
        vpcmpeqw  xmm0, xmm0, xmm0      # all ones
        vpsllq    xmm0, xmm0, 54        # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq    xmm0, xmm0, 2         # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # expand from SSE to AVX
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        # expand from AVX to AVX-512
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # copy DP 1.0
        vmovapd   zmm1, zmm0
        # zmm2 is a source operand of the loop as well; give it a defined
        # value so the measurement does not depend on what the caller left there
        vmovapd   zmm2, zmm0
loop:
        inc       i
        INSTR     zmm3, zmm0, zmm1
        INSTR     zmm4, zmm1, zmm0
        INSTR     zmm5, zmm0, zmm2
        cmp       i, N
        INSTR     zmm6, zmm2, zmm0
        INSTR     zmm7, zmm1, zmm2
        INSTR     zmm8, zmm2, zmm1
        jl        loop
done:
        mov       rsp, rbp
        pop       rbp
        ret
.size latency, .-latency
|
46
src/AVX-512/vaddpd-avx512.S
Normal file
46
src/AVX-512/vaddpd-avx512.S
Normal file
@@ -0,0 +1,46 @@
|
||||
#define INSTR vaddpd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovapd zmm1, zmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
cmp i, N
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
46
src/AVX-512/vaddps-avx512-TP.S
Normal file
46
src/AVX-512/vaddps-avx512-TP.S
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
 * Throughput kernel for the AVX-512 packed single-precision add (vaddps).
 *
 * Exported symbols:
 *   ninst   - 32-bit count of measured instructions per loop iteration
 *   latency - entry point; the trip count is taken from edi and the loop
 *             body is skipped entirely for counts <= 0
 *
 * The six adds of the loop body write six different destination registers
 * and read only zmm0-zmm2, which the loop never modifies, so no add depends
 * on the result of another one.
 */
#define INSTR   vaddps
#define NINST   6
#define N       edi
#define i       r8d


.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push      rbp
        mov       rbp, rsp
        xor       i, i
        test      N, N
        jle       done
        # create SSE SP 1.0
        vpcmpeqw  xmm0, xmm0, xmm0      # all ones
        vpslld    xmm0, xmm0, 25        # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
        vpsrld    xmm0, xmm0, 2         # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # expand from SSE to AVX
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        # expand from AVX to AVX-512
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # copy SP 1.0
        vmovaps   zmm1, zmm0
        # zmm2 is a source operand of the loop as well; give it a defined
        # value so the measurement does not depend on what the caller left there
        vmovaps   zmm2, zmm0
loop:
        inc       i
        INSTR     zmm3, zmm0, zmm1
        INSTR     zmm4, zmm1, zmm0
        INSTR     zmm5, zmm0, zmm2
        cmp       i, N
        INSTR     zmm6, zmm2, zmm0
        INSTR     zmm7, zmm1, zmm2
        INSTR     zmm8, zmm2, zmm1
        jl        loop
done:
        mov       rsp, rbp
        pop       rbp
        ret
.size latency, .-latency
|
46
src/AVX-512/vaddps-avx512.S
Normal file
46
src/AVX-512/vaddps-avx512.S
Normal file
@@ -0,0 +1,46 @@
|
||||
#define INSTR vaddps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE SP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
# copy SP 1.0
|
||||
vmovaps zmm1, zmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
cmp i, N
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
59
src/AVX-512/vdivpd-avx512-TP.S
Normal file
59
src/AVX-512/vdivpd-avx512-TP.S
Normal file
@@ -0,0 +1,59 @@
|
||||
#define INSTR vdivpd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
|
||||
vaddpd zmm1, zmm0, zmm0 # create 2.0
|
||||
vaddpd zmm2, zmm0, zmm1 # create 3.0
|
||||
vaddpd zmm4, zmm1, zmm1 # create 4.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 8.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 16.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 32.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 64.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 128.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 256.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 512.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 1024.0
|
||||
vdivpd zmm1, zmm4, zmm2 # create 341.3333
|
||||
vdivpd zmm2, zmm0, zmm1 # create 1/341.3333
|
||||
vaddpd zmm0, zmm1, zmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm3, zmm0, zmm1
|
||||
INSTR zmm4, zmm1, zmm0
|
||||
INSTR zmm5, zmm0, zmm2
|
||||
cmp i, N
|
||||
INSTR zmm6, zmm2, zmm0
|
||||
INSTR zmm7, zmm1, zmm2
|
||||
INSTR zmm8, zmm2, zmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
59
src/AVX-512/vdivpd-avx512.S
Normal file
59
src/AVX-512/vdivpd-avx512.S
Normal file
@@ -0,0 +1,59 @@
|
||||
#define INSTR vdivpd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
|
||||
vaddpd zmm1, zmm0, zmm0 # create 2.0
|
||||
vaddpd zmm2, zmm0, zmm1 # create 3.0
|
||||
vaddpd zmm4, zmm1, zmm1 # create 4.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 8.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 16.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 32.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 64.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 128.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 256.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 512.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 1024.0
|
||||
vdivpd zmm1, zmm4, zmm2 # create 341.3333
|
||||
vdivpd zmm2, zmm0, zmm1 # create 1/341.3333
|
||||
vaddpd zmm0, zmm1, zmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
cmp i, N
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
59
src/AVX-512/vdivps-avx512-TP.S
Normal file
59
src/AVX-512/vdivps-avx512-TP.S
Normal file
@@ -0,0 +1,59 @@
|
||||
#define INSTR vdivps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE SP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
|
||||
vaddps zmm1, zmm0, zmm0 # create 2.0
|
||||
vaddps zmm2, zmm0, zmm1 # create 3.0
|
||||
vaddps zmm4, zmm1, zmm1 # create 4.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 8.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 16.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 32.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 64.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 128.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 256.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 512.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 1024.0
|
||||
vdivps zmm1, zmm4, zmm2 # create 341.3333
|
||||
vdivps zmm2, zmm0, zmm1 # create 1/341.3333
|
||||
vaddps zmm0, zmm1, zmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm3, zmm0, zmm1
|
||||
INSTR zmm4, zmm1, zmm2
|
||||
INSTR zmm5, zmm0, zmm2
|
||||
cmp i, N
|
||||
INSTR zmm6, zmm2, zmm0
|
||||
INSTR zmm7, zmm1, zmm2
|
||||
INSTR zmm8, zmm2, zmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
59
src/AVX-512/vdivps-avx512.S
Normal file
59
src/AVX-512/vdivps-avx512.S
Normal file
@@ -0,0 +1,59 @@
|
||||
#define INSTR vdivps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE SP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
|
||||
vaddps zmm1, zmm0, zmm0 # create 2.0
|
||||
vaddps zmm2, zmm0, zmm1 # create 3.0
|
||||
vaddps zmm4, zmm1, zmm1 # create 4.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 8.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 16.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 32.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 64.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 128.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 256.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 512.0
|
||||
vaddps zmm4, zmm4, zmm4 # create 1024.0
|
||||
vdivps zmm1, zmm4, zmm2 # create 341.3333
|
||||
vdivps zmm2, zmm0, zmm1 # create 1/341.3333
|
||||
vaddps zmm0, zmm1, zmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
cmp i, N
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
53
src/AVX-512/vfmadd213pd-avx512-TP.S
Normal file
53
src/AVX-512/vfmadd213pd-avx512-TP.S
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
 * Throughput kernel for the AVX-512 packed double-precision fused
 * multiply-add (vfmadd213pd).
 *
 * Exported symbols:
 *   ninst   - 32-bit count of measured instructions per loop iteration
 *   latency - entry point; the trip count is taken from edi and the loop
 *             body is skipped entirely for counts <= 0
 *
 * The loop body issues thirteen FMAs with thirteen different destination
 * registers (zmm3-zmm15). The second and third operands are always taken
 * from zmm0-zmm2, which the loop never modifies.
 */
#define INSTR   vfmadd213pd
#define NINST   13
#define N       edi
#define i       r8d


.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push      rbp
        mov       rbp, rsp
        xor       i, i
        test      N, N
        jle       done
        # create SSE DP 1.0
        vpcmpeqw  xmm0, xmm0, xmm0      # all ones
        vpsllq    xmm0, xmm0, 54        # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
        vpsrlq    xmm0, xmm0, 2         # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # expand from SSE to AVX
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        # expand from AVX to AVX-512
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # copy DP 1.0
        vmovapd   zmm1, zmm0
        # zmm2 is a source operand of the loop as well; give it a defined
        # value so the measurement does not depend on what the caller left there
        vmovapd   zmm2, zmm0
        # NOTE(review): the destination of an FMA is also one of its inputs,
        # and zmm3-zmm15 are not set up here - confirm that their start
        # values do not matter for the measurement
loop:
        inc       i
        INSTR     zmm3, zmm0, zmm1
        INSTR     zmm4, zmm1, zmm0
        INSTR     zmm5, zmm0, zmm2
        INSTR     zmm6, zmm2, zmm0
        INSTR     zmm7, zmm1, zmm2
        INSTR     zmm8, zmm2, zmm1
        INSTR     zmm9, zmm2, zmm1
        cmp       i, N
        INSTR     zmm10, zmm2, zmm1
        INSTR     zmm11, zmm2, zmm1
        INSTR     zmm12, zmm2, zmm1
        INSTR     zmm13, zmm2, zmm1
        INSTR     zmm14, zmm2, zmm1
        INSTR     zmm15, zmm2, zmm1
        jl        loop
done:
        mov       rsp, rbp
        pop       rbp
        ret
.size latency, .-latency
|
46
src/AVX-512/vfmadd213pd-avx512.S
Normal file
46
src/AVX-512/vfmadd213pd-avx512.S
Normal file
@@ -0,0 +1,46 @@
|
||||
#define INSTR vfmadd213pd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovapd zmm1, zmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
cmp i, N
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
53
src/AVX-512/vfmadd213ps-avx512-TP.S
Normal file
53
src/AVX-512/vfmadd213ps-avx512-TP.S
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
 * Throughput kernel for the AVX-512 packed single-precision fused
 * multiply-add (vfmadd213ps).
 *
 * Exported symbols:
 *   ninst   - 32-bit count of measured instructions per loop iteration
 *   latency - entry point; the trip count is taken from edi and the loop
 *             body is skipped entirely for counts <= 0
 *
 * The loop body issues thirteen FMAs with thirteen different destination
 * registers (zmm3-zmm15). The second and third operands are always taken
 * from zmm0-zmm2, which the loop never modifies.
 */
#define INSTR   vfmadd213ps
#define NINST   13
#define N       edi
#define i       r8d


.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push      rbp
        mov       rbp, rsp
        xor       i, i
        test      N, N
        jle       done
        # create SSE SP 1.0
        vpcmpeqw  xmm0, xmm0, xmm0      # all ones
        vpslld    xmm0, xmm0, 25        # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
        vpsrld    xmm0, xmm0, 2         # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # expand from SSE to AVX
        vinsertf128 ymm0, ymm0, xmm0, 0x1
        # expand from AVX to AVX-512
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1
        # copy SP 1.0
        vmovaps   zmm1, zmm0
        # zmm2 is a source operand of the loop as well; give it a defined
        # value so the measurement does not depend on what the caller left there
        vmovaps   zmm2, zmm0
        # NOTE(review): the destination of an FMA is also one of its inputs,
        # and zmm3-zmm15 are not set up here - confirm that their start
        # values do not matter for the measurement
loop:
        inc       i
        INSTR     zmm3, zmm0, zmm1
        INSTR     zmm4, zmm1, zmm0
        INSTR     zmm5, zmm0, zmm2
        INSTR     zmm6, zmm2, zmm0
        INSTR     zmm7, zmm1, zmm2
        INSTR     zmm8, zmm2, zmm1
        INSTR     zmm9, zmm2, zmm1
        cmp       i, N
        INSTR     zmm10, zmm2, zmm1
        INSTR     zmm11, zmm2, zmm1
        INSTR     zmm12, zmm2, zmm1
        INSTR     zmm13, zmm2, zmm1
        INSTR     zmm14, zmm2, zmm1
        INSTR     zmm15, zmm2, zmm1
        jl        loop
done:
        mov       rsp, rbp
        pop       rbp
        ret
.size latency, .-latency
|
46
src/AVX-512/vfmadd213ps-avx512.S
Normal file
46
src/AVX-512/vfmadd213ps-avx512.S
Normal file
@@ -0,0 +1,46 @@
|
||||
#define INSTR vfmadd213ps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE SP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
# copy SP 1.0
|
||||
vmovaps zmm1, zmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
cmp i, N
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
INSTR zmm0, zmm1, zmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
48
src/AVX-512/vmulpd-avx512-TP.S
Normal file
48
src/AVX-512/vmulpd-avx512-TP.S
Normal file
@@ -0,0 +1,48 @@
|
||||
#define INSTR vmulpd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
# create AVX-512 DP 2.0
|
||||
vaddpd zmm1, zmm0, zmm0
|
||||
# create AVX-512 DP 0.5
|
||||
vdivpd zmm2, zmm0, zmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm3, zmm0, zmm1
|
||||
INSTR zmm4, zmm1, zmm0
|
||||
INSTR zmm5, zmm0, zmm2
|
||||
cmp i, N
|
||||
INSTR zmm6, zmm2, zmm0
|
||||
INSTR zmm7, zmm1, zmm2
|
||||
INSTR zmm8, zmm2, zmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
48
src/AVX-512/vmulpd-avx512.S
Normal file
48
src/AVX-512/vmulpd-avx512.S
Normal file
@@ -0,0 +1,48 @@
|
||||
#define INSTR vmulpd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
# create AVX-512 DP 2.0
|
||||
vaddpd zmm1, zmm0, zmm0
|
||||
# create AVX-512 DP 0.5
|
||||
vdivpd zmm2, zmm0, zmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
cmp i, N
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
48
src/AVX-512/vmulps-avx512-TP.S
Normal file
48
src/AVX-512/vmulps-avx512-TP.S
Normal file
@@ -0,0 +1,48 @@
|
||||
#define INSTR vmulps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
# create AVX-512 SP 2.0
|
||||
vaddps zmm1, zmm0, zmm0
|
||||
# create AVX-512 SP 0.5
|
||||
vdivps zmm2, zmm0, zmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm3, zmm0, zmm1
|
||||
INSTR zmm4, zmm1, zmm0
|
||||
INSTR zmm5, zmm0, zmm2
|
||||
cmp i, N
|
||||
INSTR zmm6, zmm2, zmm0
|
||||
INSTR zmm7, zmm1, zmm2
|
||||
INSTR zmm8, zmm2, zmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
48
src/AVX-512/vmulps-avx512.S
Normal file
48
src/AVX-512/vmulps-avx512.S
Normal file
@@ -0,0 +1,48 @@
|
||||
#define INSTR vmulps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
# create AVX-512 SP 2.0
|
||||
vaddps zmm1, zmm0, zmm0
|
||||
# create AVX-512 SP 0.5
|
||||
vdivps zmm2, zmm0, zmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
cmp i, N
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
INSTR zmm0, zmm0, zmm1
|
||||
INSTR zmm0, zmm0, zmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
63
src/AVX-512/vrcp14pd-avx512-TP.S
Normal file
63
src/AVX-512/vrcp14pd-avx512-TP.S
Normal file
@@ -0,0 +1,63 @@
|
||||
#define INSTR vrcp14pd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SSE DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# expand from AVX to AVX-512
|
||||
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
|
||||
|
||||
vaddpd zmm1, zmm0, zmm0 # create 2.0
|
||||
vaddpd zmm2, zmm0, zmm1 # create 3.0
|
||||
vaddpd zmm4, zmm1, zmm1 # create 4.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 8.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 16.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 32.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 64.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 128.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 256.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 512.0
|
||||
vaddpd zmm4, zmm4, zmm4 # create 1024.0
|
||||
vdivpd zmm1, zmm4, zmm2 # create 341.3333
|
||||
vdivpd zmm2, zmm0, zmm1 # create 1/341.3333
|
||||
vaddpd zmm0, zmm1, zmm1 # create 2*341.3333
|
||||
vmovapd zmm1, zmm0
|
||||
vmovapd zmm2, zmm0
|
||||
vmovapd zmm3, zmm0
|
||||
vmovapd zmm4, zmm0
|
||||
vmovapd zmm5, zmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR zmm10, zmm0
|
||||
INSTR zmm11, zmm1
|
||||
INSTR zmm12, zmm2
|
||||
cmp i, N
|
||||
INSTR zmm13, zmm3
|
||||
INSTR zmm14, zmm4
|
||||
INSTR zmm15, zmm5
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
58
src/AVX-512/vrcp14pd-avx512.S
Normal file
58
src/AVX-512/vrcp14pd-avx512.S
Normal file
@@ -0,0 +1,58 @@
|
||||
# Dependency-chain kernel for vrcp14pd on 512-bit zmm registers.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR vrcp14pd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy into the upper 128 bits of ymm0
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1      # copy into the upper 256 bits of zmm0

        vaddpd  zmm1, zmm0, zmm0                # 2.0
        vaddpd  zmm2, zmm0, zmm1                # 3.0
        vaddpd  zmm4, zmm1, zmm1                # 4.0
        .rept   8                               # doubled eight times: 8.0 up to 1024.0
        vaddpd  zmm4, zmm4, zmm4
        .endr
        vdivpd  zmm1, zmm4, zmm2                # 341.3333
        vdivpd  zmm2, zmm0, zmm1                # 1/341.3333
        vaddpd  zmm0, zmm1, zmm1                # 2*341.3333, first input of the chain
.Lloop:
        inc     ITER
        INSTR   zmm1, zmm0                      # each result feeds the next instruction
        INSTR   zmm2, zmm1
        INSTR   zmm3, zmm2
        cmp     ITER, COUNT
        INSTR   zmm4, zmm3
        INSTR   zmm5, zmm4
        INSTR   zmm0, zmm5                      # closes the chain back into zmm0
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
63
src/AVX-512/vrcp14ps-avx512-TP.S
Normal file
63
src/AVX-512/vrcp14ps-avx512-TP.S
Normal file
@@ -0,0 +1,63 @@
|
||||
# Independent-instruction kernel for vrcp14ps on 512-bit zmm registers:
# six INSTR per trip, each with its own source and destination register.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR vrcp14ps
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy into the upper 128 bits of ymm0
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1      # copy into the upper 256 bits of zmm0

        vaddps  zmm1, zmm0, zmm0                # 2.0
        vaddps  zmm2, zmm0, zmm1                # 3.0
        vaddps  zmm4, zmm1, zmm1                # 4.0
        .rept   8                               # doubled eight times: 8.0 up to 1024.0
        vaddps  zmm4, zmm4, zmm4
        .endr
        vdivps  zmm1, zmm4, zmm2                # 341.3333
        vdivps  zmm2, zmm0, zmm1                # 1/341.3333
        vaddps  zmm0, zmm1, zmm1                # 2*341.3333

        # All six source registers carry the same operand value.
        vmovaps zmm1, zmm0
        vmovaps zmm2, zmm0
        vmovaps zmm3, zmm0
        vmovaps zmm4, zmm0
        vmovaps zmm5, zmm0
.Lloop:
        inc     ITER
        INSTR   zmm10, zmm0
        INSTR   zmm11, zmm1
        INSTR   zmm12, zmm2
        cmp     ITER, COUNT
        INSTR   zmm13, zmm3
        INSTR   zmm14, zmm4
        INSTR   zmm15, zmm5
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
58
src/AVX-512/vrcp14ps-avx512.S
Normal file
58
src/AVX-512/vrcp14ps-avx512.S
Normal file
@@ -0,0 +1,58 @@
|
||||
# Dependency-chain kernel for vrcp14ps on 512-bit zmm registers.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR vrcp14ps
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy into the upper 128 bits of ymm0
        vinsertf64x4 zmm0, zmm0, ymm0, 0x1      # copy into the upper 256 bits of zmm0

        vaddps  zmm1, zmm0, zmm0                # 2.0
        vaddps  zmm2, zmm0, zmm1                # 3.0
        vaddps  zmm4, zmm1, zmm1                # 4.0
        .rept   8                               # doubled eight times: 8.0 up to 1024.0
        vaddps  zmm4, zmm4, zmm4
        .endr
        vdivps  zmm1, zmm4, zmm2                # 341.3333
        vdivps  zmm2, zmm0, zmm1                # 1/341.3333
        vaddps  zmm0, zmm1, zmm1                # 2*341.3333, first input of the chain
.Lloop:
        inc     ITER
        INSTR   zmm1, zmm0                      # each result feeds the next instruction
        INSTR   zmm2, zmm1
        INSTR   zmm3, zmm2
        cmp     ITER, COUNT
        INSTR   zmm4, zmm3
        INSTR   zmm5, zmm4
        INSTR   zmm0, zmm5                      # closes the chain back into zmm0
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
59
src/AVX/rcpss-TP.S
Normal file
59
src/AVX/rcpss-TP.S
Normal file
@@ -0,0 +1,59 @@
|
||||
# Independent-instruction kernel for scalar rcpss: six INSTR per trip, each
# with its own source and destination register.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR rcpss
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero

        vaddss  xmm1, xmm0, xmm0                # 2.0
        vaddss  xmm2, xmm0, xmm1                # 3.0
        vaddss  xmm4, xmm1, xmm1                # 4.0
        .rept   8                               # doubled eight times: 8.0 up to 1024.0
        vaddss  xmm4, xmm4, xmm4
        .endr
        vdivss  xmm1, xmm4, xmm2                # 341.3333
        vdivss  xmm2, xmm0, xmm1                # 1/341.3333
        vaddss  xmm0, xmm1, xmm1                # 2*341.3333

        # Copy the low element into the five remaining source registers.
        movss   xmm1, xmm0
        movss   xmm2, xmm0
        movss   xmm3, xmm0
        movss   xmm4, xmm0
        movss   xmm5, xmm0
.Lloop:
        inc     ITER
        INSTR   xmm10, xmm0
        INSTR   xmm11, xmm1
        INSTR   xmm12, xmm2
        cmp     ITER, COUNT
        INSTR   xmm13, xmm3
        INSTR   xmm14, xmm4
        INSTR   xmm15, xmm5
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
54
src/AVX/rcpss.S
Normal file
54
src/AVX/rcpss.S
Normal file
@@ -0,0 +1,54 @@
|
||||
# Dependency-chain kernel for scalar rcpss.
# The kernel name reported by the driver comes from the file name, so INSTR
# must be the scalar rcpss; it was previously the packed vrcpps, which made
# the driver report packed-reciprocal timings under the rcpss label.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR rcpss
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero

        # Constants are computed in all four lanes; rcpss only consumes lane 0.
        vaddps  xmm1, xmm0, xmm0                # 2.0
        vaddps  xmm2, xmm0, xmm1                # 3.0
        vaddps  xmm4, xmm1, xmm1                # 4.0
        .rept   8                               # doubled eight times: 8.0 up to 1024.0
        vaddps  xmm4, xmm4, xmm4
        .endr
        vdivps  xmm1, xmm4, xmm2                # 341.3333
        vdivps  xmm2, xmm0, xmm1                # 1/341.3333
        vaddps  xmm0, xmm1, xmm1                # 2*341.3333, first input of the chain
.Lloop:
        inc     ITER
        INSTR   xmm1, xmm0                      # each result feeds the next instruction
        INSTR   xmm2, xmm1
        INSTR   xmm3, xmm2
        cmp     ITER, COUNT
        INSTR   xmm4, xmm3
        INSTR   xmm5, xmm4
        INSTR   xmm0, xmm5                      # closes the chain back into xmm0
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
44
src/AVX/vaddpd-avx-TP.S
Normal file
44
src/AVX/vaddpd-avx-TP.S
Normal file
@@ -0,0 +1,44 @@
|
||||
# Independent-instruction kernel for 256-bit vaddpd: six INSTR per trip, all
# writing distinct destinations and reading only loop-invariant sources.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR vaddpd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy into the upper 128 bits of ymm0

        # Give every source register a defined value. ymm2 is read in the loop
        # and used to be left with whatever the caller had in it, so the
        # measurement could depend on stale (possibly denormal or NaN) data.
        vmovaps ymm1, ymm0
        vmovaps ymm2, ymm0
.Lloop:
        inc     ITER
        INSTR   ymm3, ymm0, ymm1
        INSTR   ymm4, ymm1, ymm0
        INSTR   ymm5, ymm0, ymm2
        cmp     ITER, COUNT
        INSTR   ymm6, ymm2, ymm0
        INSTR   ymm7, ymm1, ymm2
        INSTR   ymm8, ymm2, ymm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
44
src/AVX/vaddpd-avx.S
Normal file
44
src/AVX/vaddpd-avx.S
Normal file
@@ -0,0 +1,44 @@
|
||||
# Dependency-chain kernel for 256-bit vaddpd: every INSTR accumulates into
# ymm0, so each one waits for the previous result.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR vaddpd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy into the upper 128 bits of ymm0
        vmovaps ymm1, ymm0                      # ymm1 holds the constant addend 1.0
.Lloop:
        inc     ITER
        INSTR   ymm0, ymm0, ymm1
        INSTR   ymm0, ymm0, ymm1
        INSTR   ymm0, ymm0, ymm1
        cmp     ITER, COUNT
        INSTR   ymm0, ymm0, ymm1
        INSTR   ymm0, ymm0, ymm1
        INSTR   ymm0, ymm0, ymm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
42
src/AVX/vaddpd-sse-TP.S
Normal file
42
src/AVX/vaddpd-sse-TP.S
Normal file
@@ -0,0 +1,42 @@
|
||||
# Independent-instruction kernel for 128-bit vaddpd: six INSTR per trip, all
# writing distinct destinations and reading only loop-invariant sources.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR vaddpd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero

        # Give every source register a defined value. xmm2 is read in the loop
        # and used to be left with whatever the caller had in it, so the
        # measurement could depend on stale (possibly denormal or NaN) data.
        vmovaps xmm1, xmm0
        vmovaps xmm2, xmm0
.Lloop:
        inc     ITER
        INSTR   xmm3, xmm0, xmm1
        INSTR   xmm4, xmm1, xmm0
        INSTR   xmm5, xmm0, xmm2
        cmp     ITER, COUNT
        INSTR   xmm6, xmm2, xmm0
        INSTR   xmm7, xmm1, xmm2
        INSTR   xmm8, xmm2, xmm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
42
src/AVX/vaddpd-sse.S
Normal file
42
src/AVX/vaddpd-sse.S
Normal file
@@ -0,0 +1,42 @@
|
||||
# Dependency-chain kernel for 128-bit vaddpd: every INSTR accumulates into
# xmm0, so each one waits for the previous result.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR vaddpd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vmovaps xmm1, xmm0                      # xmm1 holds the constant addend 1.0
.Lloop:
        inc     ITER
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        cmp     ITER, COUNT
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
44
src/AVX/vaddps-avx-TP.S
Normal file
44
src/AVX/vaddps-avx-TP.S
Normal file
@@ -0,0 +1,44 @@
|
||||
# Independent-instruction kernel for 256-bit vaddps: six INSTR per trip, all
# writing distinct destinations and reading only loop-invariant sources.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR vaddps
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy into the upper 128 bits of ymm0

        # Give every source register a defined value. ymm2 is read in the loop
        # and used to be left with whatever the caller had in it, so the
        # measurement could depend on stale (possibly denormal or NaN) data.
        vmovaps ymm1, ymm0
        vmovaps ymm2, ymm0
.Lloop:
        inc     ITER
        INSTR   ymm3, ymm0, ymm1
        INSTR   ymm4, ymm1, ymm0
        INSTR   ymm5, ymm0, ymm2
        cmp     ITER, COUNT
        INSTR   ymm6, ymm2, ymm0
        INSTR   ymm7, ymm1, ymm2
        INSTR   ymm8, ymm2, ymm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
44
src/AVX/vaddps-avx.S
Normal file
44
src/AVX/vaddps-avx.S
Normal file
@@ -0,0 +1,44 @@
|
||||
# Dependency-chain kernel for 256-bit vaddps: every INSTR accumulates into
# ymm0, so each one waits for the previous result.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR vaddps
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy into the upper 128 bits of ymm0
        vmovaps ymm1, ymm0                      # ymm1 holds the constant addend 1.0
.Lloop:
        inc     ITER
        INSTR   ymm0, ymm0, ymm1
        INSTR   ymm0, ymm0, ymm1
        INSTR   ymm0, ymm0, ymm1
        cmp     ITER, COUNT
        INSTR   ymm0, ymm0, ymm1
        INSTR   ymm0, ymm0, ymm1
        INSTR   ymm0, ymm0, ymm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
42
src/AVX/vaddps-sse-TP.S
Normal file
42
src/AVX/vaddps-sse-TP.S
Normal file
@@ -0,0 +1,42 @@
|
||||
# Independent-instruction kernel for 128-bit vaddps: six INSTR per trip, all
# writing distinct destinations and reading only loop-invariant sources.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR vaddps
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero

        # Give every source register a defined value. xmm2 is read in the loop
        # and used to be left with whatever the caller had in it, so the
        # measurement could depend on stale (possibly denormal or NaN) data.
        vmovaps xmm1, xmm0
        vmovaps xmm2, xmm0
.Lloop:
        inc     ITER
        INSTR   xmm3, xmm0, xmm1
        INSTR   xmm4, xmm1, xmm0
        INSTR   xmm5, xmm0, xmm2
        cmp     ITER, COUNT
        INSTR   xmm6, xmm2, xmm0
        INSTR   xmm7, xmm1, xmm2
        INSTR   xmm8, xmm2, xmm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
42
src/AVX/vaddps-sse.S
Normal file
42
src/AVX/vaddps-sse.S
Normal file
@@ -0,0 +1,42 @@
|
||||
# Dependency-chain kernel for 128-bit vaddps: every INSTR accumulates into
# xmm0, so each one waits for the previous result.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR vaddps
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vmovaps xmm1, xmm0                      # xmm1 holds the constant addend 1.0
.Lloop:
        inc     ITER
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        cmp     ITER, COUNT
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
42
src/AVX/vaddsd-TP.S
Normal file
42
src/AVX/vaddsd-TP.S
Normal file
@@ -0,0 +1,42 @@
|
||||
# Independent-instruction kernel for scalar vaddsd: six INSTR per trip, all
# writing distinct destinations and reading only loop-invariant sources.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR vaddsd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero

        # Give every source register a defined value. xmm2 is read in the loop
        # and used to be left with whatever the caller had in it, so the
        # measurement could depend on stale (possibly denormal or NaN) data.
        vmovaps xmm1, xmm0
        vmovaps xmm2, xmm0
.Lloop:
        inc     ITER
        INSTR   xmm3, xmm0, xmm1
        INSTR   xmm4, xmm1, xmm0
        INSTR   xmm5, xmm0, xmm2
        cmp     ITER, COUNT
        INSTR   xmm6, xmm2, xmm0
        INSTR   xmm7, xmm1, xmm2
        INSTR   xmm8, xmm2, xmm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
42
src/AVX/vaddsd.S
Normal file
42
src/AVX/vaddsd.S
Normal file
@@ -0,0 +1,42 @@
|
||||
# Dependency-chain kernel for scalar vaddsd: every INSTR accumulates into
# xmm0, so each one waits for the previous result.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR vaddsd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vmovaps xmm1, xmm0                      # xmm1 holds the constant addend 1.0
.Lloop:
        inc     ITER
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        cmp     ITER, COUNT
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
42
src/AVX/vaddss-TP.S
Normal file
42
src/AVX/vaddss-TP.S
Normal file
@@ -0,0 +1,42 @@
|
||||
# Independent-instruction kernel for scalar vaddss: six INSTR per trip, all
# writing distinct destinations and reading only loop-invariant sources.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR vaddss
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero

        # Give every source register a defined value. xmm2 is read in the loop
        # and used to be left with whatever the caller had in it, so the
        # measurement could depend on stale (possibly denormal or NaN) data.
        vmovaps xmm1, xmm0
        vmovaps xmm2, xmm0
.Lloop:
        inc     ITER
        INSTR   xmm3, xmm0, xmm1
        INSTR   xmm4, xmm1, xmm0
        INSTR   xmm5, xmm0, xmm2
        cmp     ITER, COUNT
        INSTR   xmm6, xmm2, xmm0
        INSTR   xmm7, xmm1, xmm2
        INSTR   xmm8, xmm2, xmm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
42
src/AVX/vaddss.S
Normal file
42
src/AVX/vaddss.S
Normal file
@@ -0,0 +1,42 @@
|
||||
# Dependency-chain kernel for scalar vaddss: every INSTR accumulates into
# xmm0, so each one waits for the previous result.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR vaddss
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vmovaps xmm1, xmm0                      # xmm1 holds the constant addend 1.0
.Lloop:
        inc     ITER
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        cmp     ITER, COUNT
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
55
src/AVX/vdivpd-avx-TP.S
Normal file
55
src/AVX/vdivpd-avx-TP.S
Normal file
@@ -0,0 +1,55 @@
|
||||
# Independent-instruction kernel for 256-bit vdivpd: six INSTR per trip, all
# writing distinct destinations and reading only loop-invariant sources.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR vdivpd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy into the upper 128 bits of ymm0

        vaddpd  ymm1, ymm0, ymm0                # 2.0
        vaddpd  ymm2, ymm0, ymm1                # 3.0
        vaddpd  ymm4, ymm1, ymm1                # 4.0
        .rept   8                               # doubled eight times: 8.0 up to 1024.0
        vaddpd  ymm4, ymm4, ymm4
        .endr
        vdivpd  ymm1, ymm4, ymm2                # 341.3333
        vdivpd  ymm2, ymm0, ymm1                # 1/341.3333
        vaddpd  ymm0, ymm1, ymm1                # 2*341.3333
.Lloop:
        inc     ITER
        INSTR   ymm3, ymm0, ymm1
        INSTR   ymm4, ymm1, ymm0
        INSTR   ymm5, ymm0, ymm2
        cmp     ITER, COUNT
        INSTR   ymm6, ymm2, ymm0
        INSTR   ymm7, ymm1, ymm2
        INSTR   ymm8, ymm2, ymm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
55
src/AVX/vdivpd-avx.S
Normal file
55
src/AVX/vdivpd-avx.S
Normal file
@@ -0,0 +1,55 @@
|
||||
# Dependency-chain kernel for 256-bit vdivpd: every INSTR divides ymm0 in
# place, so each one waits for the previous result.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR vdivpd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy into the upper 128 bits of ymm0

        vaddpd  ymm1, ymm0, ymm0                # 2.0
        vaddpd  ymm2, ymm0, ymm1                # 3.0
        vaddpd  ymm4, ymm1, ymm1                # 4.0
        .rept   8                               # doubled eight times: 8.0 up to 1024.0
        vaddpd  ymm4, ymm4, ymm4
        .endr
        vdivpd  ymm1, ymm4, ymm2                # 341.3333
        vdivpd  ymm2, ymm0, ymm1                # 1/341.3333
        vaddpd  ymm0, ymm1, ymm1                # 2*341.3333, first dividend of the chain
.Lloop:
        inc     ITER
        # Divisors alternate between 341.3333 and its reciprocal, so the
        # running value in ymm0 keeps returning to its starting magnitude.
        INSTR   ymm0, ymm0, ymm1
        INSTR   ymm0, ymm0, ymm2
        INSTR   ymm0, ymm0, ymm1
        cmp     ITER, COUNT
        INSTR   ymm0, ymm0, ymm2
        INSTR   ymm0, ymm0, ymm1
        INSTR   ymm0, ymm0, ymm2
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
54
src/AVX/vdivpd-sse-TP.S
Normal file
54
src/AVX/vdivpd-sse-TP.S
Normal file
@@ -0,0 +1,54 @@
|
||||
# Independent-instruction kernel for 128-bit vdivpd: six INSTR per trip, all
# writing distinct destinations and reading only loop-invariant sources.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR vdivpd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero

        vaddpd  xmm1, xmm0, xmm0                # 2.0
        vaddpd  xmm2, xmm0, xmm1                # 3.0
        vaddpd  xmm4, xmm1, xmm1                # 4.0
        .rept   8                               # doubled eight times: 8.0 up to 1024.0
        vaddpd  xmm4, xmm4, xmm4
        .endr
        vdivpd  xmm1, xmm4, xmm2                # 341.3333
        vdivpd  xmm2, xmm0, xmm1                # 1/341.3333
        vaddpd  xmm0, xmm1, xmm1                # 2*341.3333
.Lloop:
        inc     ITER
        INSTR   xmm3, xmm0, xmm1
        INSTR   xmm4, xmm1, xmm0
        INSTR   xmm5, xmm0, xmm2
        cmp     ITER, COUNT
        INSTR   xmm6, xmm2, xmm0
        INSTR   xmm7, xmm1, xmm2
        INSTR   xmm8, xmm2, xmm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
54
src/AVX/vdivpd-sse.S
Normal file
54
src/AVX/vdivpd-sse.S
Normal file
@@ -0,0 +1,54 @@
|
||||
# Dependency-chain kernel for 128-bit vdivpd: every INSTR divides xmm0 in
# place, so each one waits for the previous result.
# Exports ninst (instructions per loop trip, read by the ibench.c driver to
# normalise the run time) and latency(int), which runs the loop COUNT times.
#define INSTR vdivpd
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build DP 1.0 in each 64-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpsllq  xmm0, xmm0, 54                  # top 10 bits remain (54 = 64 - (11 - 1))
        vpsrlq  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero

        vaddpd  xmm1, xmm0, xmm0                # 2.0
        vaddpd  xmm2, xmm0, xmm1                # 3.0
        vaddpd  xmm4, xmm1, xmm1                # 4.0
        .rept   8                               # doubled eight times: 8.0 up to 1024.0
        vaddpd  xmm4, xmm4, xmm4
        .endr
        vdivpd  xmm1, xmm4, xmm2                # 341.3333
        vdivpd  xmm2, xmm0, xmm1                # 1/341.3333
        vaddpd  xmm0, xmm1, xmm1                # 2*341.3333, first dividend of the chain
.Lloop:
        inc     ITER
        # Divisors alternate between 341.3333 and its reciprocal, so the
        # running value in xmm0 keeps returning to its starting magnitude.
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm2
        INSTR   xmm0, xmm0, xmm1
        cmp     ITER, COUNT
        INSTR   xmm0, xmm0, xmm2
        INSTR   xmm0, xmm0, xmm1
        INSTR   xmm0, xmm0, xmm2
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
55
src/AVX/vdivps-avx-TP.S
Normal file
55
src/AVX/vdivps-avx-TP.S
Normal file
@@ -0,0 +1,55 @@
|
||||
# Independent-instruction kernel for 256-bit vdivps: six INSTR per trip, all
# writing distinct destinations and reading only loop-invariant sources.
# Exports ninst (instructions per loop trip, read by the ibench.c driver)
# and latency(int), which runs the loop COUNT times.
#define INSTR vdivps
#define NINST 6
#define COUNT edi
#define ITER  r8d

        .intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     ITER, ITER
        test    COUNT, COUNT
        jle     .Ldone                          # skip everything for a non-positive trip count

        # Build SP 1.0 in each 32-bit lane of xmm0 from an all-ones pattern.
        vpcmpeqw xmm0, xmm0, xmm0               # every bit set
        vpslld  xmm0, xmm0, 25                  # top 7 bits remain (25 = 32 - (8 - 1))
        vpsrld  xmm0, xmm0, 2                   # sign and top exponent bit cleared, mantissa zero
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy into the upper 128 bits of ymm0

        vaddps  ymm1, ymm0, ymm0                # 2.0
        vaddps  ymm2, ymm0, ymm1                # 3.0
        vaddps  ymm4, ymm1, ymm1                # 4.0
        .rept   8                               # doubled eight times: 8.0 up to 1024.0
        vaddps  ymm4, ymm4, ymm4
        .endr
        vdivps  ymm1, ymm4, ymm2                # 341.3333
        vdivps  ymm2, ymm0, ymm1                # 1/341.3333
        vaddps  ymm0, ymm1, ymm1                # 2*341.3333
.Lloop:
        inc     ITER
        INSTR   ymm3, ymm0, ymm1
        INSTR   ymm4, ymm1, ymm2
        INSTR   ymm5, ymm0, ymm2
        cmp     ITER, COUNT
        INSTR   ymm6, ymm2, ymm0
        INSTR   ymm7, ymm1, ymm2
        INSTR   ymm8, ymm2, ymm1
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency
|
55
src/AVX/vdivps-avx.S
Normal file
55
src/AVX/vdivps-avx.S
Normal file
@@ -0,0 +1,55 @@
|
||||
#define INSTR vdivps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
|
||||
vaddps ymm1, ymm0, ymm0 # create 2.0
|
||||
vaddps ymm2, ymm0, ymm1 # create 3.0
|
||||
vaddps ymm4, ymm1, ymm1 # create 4.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 8.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 16.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 32.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 64.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 128.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 256.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 512.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 1024.0
|
||||
vdivps ymm1, ymm4, ymm2 # create 341.3333
|
||||
vdivps ymm2, ymm0, ymm1 # create 1/341.3333
|
||||
vaddps ymm0, ymm1, ymm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
INSTR ymm0, ymm0, ymm2
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
cmp i, N
|
||||
INSTR ymm0, ymm0, ymm2
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
INSTR ymm0, ymm0, ymm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
54
src/AVX/vdivps-sse-TP.S
Normal file
54
src/AVX/vdivps-sse-TP.S
Normal file
@@ -0,0 +1,54 @@
|
||||
#define INSTR vdivps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
|
||||
vaddps xmm1, xmm0, xmm0 # create 2.0
|
||||
vaddps xmm2, xmm0, xmm1 # create 3.0
|
||||
vaddps xmm4, xmm1, xmm1 # create 4.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 8.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 16.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 32.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 64.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 128.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 256.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 512.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 1024.0
|
||||
vdivps xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
cmp i, N
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
54
src/AVX/vdivps-sse.S
Normal file
54
src/AVX/vdivps-sse.S
Normal file
@@ -0,0 +1,54 @@
|
||||
#define INSTR vdivps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
|
||||
vaddps xmm1, xmm0, xmm0 # create 2.0
|
||||
vaddps xmm2, xmm0, xmm1 # create 3.0
|
||||
vaddps xmm4, xmm1, xmm1 # create 4.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 8.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 16.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 32.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 64.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 128.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 256.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 512.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 1024.0
|
||||
vdivps xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
54
src/AVX/vdivsd-TP.S
Normal file
54
src/AVX/vdivsd-TP.S
Normal file
@@ -0,0 +1,54 @@
|
||||
#define INSTR vdivsd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
|
||||
vaddsd xmm1, xmm0, xmm0 # create 2.0
|
||||
vaddsd xmm2, xmm0, xmm1 # create 3.0
|
||||
vaddsd xmm4, xmm1, xmm1 # create 4.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 8.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 16.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 32.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 64.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 128.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 256.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 512.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 1024.0
|
||||
vdivsd xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivsd xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddsd xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
cmp i, N
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
54
src/AVX/vdivsd.S
Normal file
54
src/AVX/vdivsd.S
Normal file
@@ -0,0 +1,54 @@
|
||||
#define INSTR vdivsd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
|
||||
vaddsd xmm1, xmm0, xmm0 # create 2.0
|
||||
vaddsd xmm2, xmm0, xmm1 # create 3.0
|
||||
vaddsd xmm4, xmm1, xmm1 # create 4.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 8.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 16.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 32.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 64.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 128.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 256.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 512.0
|
||||
vaddsd xmm4, xmm4, xmm4 # create 1024.0
|
||||
vdivsd xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivsd xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddsd xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
54
src/AVX/vdivss-TP.S
Normal file
54
src/AVX/vdivss-TP.S
Normal file
@@ -0,0 +1,54 @@
|
||||
#define INSTR vdivss
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
|
||||
vaddss xmm1, xmm0, xmm0 # create 2.0
|
||||
vaddss xmm2, xmm0, xmm1 # create 3.0
|
||||
vaddss xmm4, xmm1, xmm1 # create 4.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 8.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 16.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 32.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 64.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 128.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 256.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 512.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 1024.0
|
||||
vdivss xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
cmp i, N
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
54
src/AVX/vdivss.S
Normal file
54
src/AVX/vdivss.S
Normal file
@@ -0,0 +1,54 @@
|
||||
#define INSTR vdivss
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
|
||||
vaddss xmm1, xmm0, xmm0 # create 2.0
|
||||
vaddss xmm2, xmm0, xmm1 # create 3.0
|
||||
vaddss xmm4, xmm1, xmm1 # create 4.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 8.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 16.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 32.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 64.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 128.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 256.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 512.0
|
||||
vaddss xmm4, xmm4, xmm4 # create 1024.0
|
||||
vdivss xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
51
src/AVX/vfmadd213pd-avx-TP.S
Normal file
51
src/AVX/vfmadd213pd-avx-TP.S
Normal file
@@ -0,0 +1,51 @@
|
||||
#define INSTR vfmadd213pd
|
||||
#define NINST 13
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm1, ymm0
|
||||
# ymm2 is a source operand of the FMAs in the loop; give it a defined value (DP 1.0)
|
||||
# rather than an arbitrary leftover bit pattern (possibly NaN or denormal)
|
||||
vmovaps ymm2, ymm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0, ymm1
|
||||
INSTR ymm4, ymm1, ymm0
|
||||
INSTR ymm5, ymm0, ymm2
|
||||
INSTR ymm6, ymm2, ymm0
|
||||
INSTR ymm7, ymm1, ymm2
|
||||
INSTR ymm8, ymm2, ymm1
|
||||
INSTR ymm9, ymm2, ymm1
|
||||
cmp i, N
|
||||
INSTR ymm10, ymm2, ymm1
|
||||
INSTR ymm11, ymm2, ymm1
|
||||
INSTR ymm12, ymm2, ymm1
|
||||
INSTR ymm13, ymm2, ymm1
|
||||
INSTR ymm14, ymm2, ymm1
|
||||
INSTR ymm15, ymm2, ymm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
44
src/AVX/vfmadd213pd-avx.S
Normal file
44
src/AVX/vfmadd213pd-avx.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR vfmadd213pd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm1, ymm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
cmp i, N
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
49
src/AVX/vfmadd213pd-sse-TP.S
Normal file
49
src/AVX/vfmadd213pd-sse-TP.S
Normal file
@@ -0,0 +1,49 @@
|
||||
#define INSTR vfmadd213pd
|
||||
#define NINST 13
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# copy DP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
# xmm2 is a source operand of the FMAs in the loop; give it a defined value (DP 1.0)
|
||||
# rather than an arbitrary leftover bit pattern (possibly NaN or denormal)
|
||||
vmovaps xmm2, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
INSTR xmm9, xmm2, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm10, xmm2, xmm1
|
||||
INSTR xmm11, xmm2, xmm1
|
||||
INSTR xmm12, xmm2, xmm1
|
||||
INSTR xmm13, xmm2, xmm1
|
||||
INSTR xmm14, xmm2, xmm1
|
||||
INSTR xmm15, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
42
src/AVX/vfmadd213pd-sse.S
Normal file
42
src/AVX/vfmadd213pd-sse.S
Normal file
@@ -0,0 +1,42 @@
|
||||
#define INSTR vfmadd213pd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# copy DP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
51
src/AVX/vfmadd213ps-avx-TP.S
Normal file
51
src/AVX/vfmadd213ps-avx-TP.S
Normal file
@@ -0,0 +1,51 @@
|
||||
#define INSTR vfmadd213ps
|
||||
#define NINST 13
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy SP 1.0
|
||||
vmovaps ymm1, ymm0
|
||||
# ymm2 is a source operand of the FMAs in the loop; give it a defined value (SP 1.0)
|
||||
# rather than an arbitrary leftover bit pattern (possibly NaN or denormal)
|
||||
vmovaps ymm2, ymm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0, ymm1
|
||||
INSTR ymm4, ymm1, ymm0
|
||||
INSTR ymm5, ymm0, ymm2
|
||||
INSTR ymm6, ymm2, ymm0
|
||||
INSTR ymm7, ymm1, ymm2
|
||||
INSTR ymm8, ymm2, ymm1
|
||||
INSTR ymm9, ymm2, ymm1
|
||||
cmp i, N
|
||||
INSTR ymm10, ymm2, ymm1
|
||||
INSTR ymm11, ymm2, ymm1
|
||||
INSTR ymm12, ymm2, ymm1
|
||||
INSTR ymm13, ymm2, ymm1
|
||||
INSTR ymm14, ymm2, ymm1
|
||||
INSTR ymm15, ymm2, ymm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
44
src/AVX/vfmadd213ps-avx.S
Normal file
44
src/AVX/vfmadd213ps-avx.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR vfmadd213ps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy SP 1.0
|
||||
vmovaps ymm1, ymm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
cmp i, N
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
INSTR ymm0, ymm1, ymm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
49
src/AVX/vfmadd213ps-sse-TP.S
Normal file
49
src/AVX/vfmadd213ps-sse-TP.S
Normal file
@@ -0,0 +1,49 @@
|
||||
#define INSTR vfmadd213ps
|
||||
#define NINST 13
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
# xmm2 is a source operand of the FMAs in the loop; give it a defined value (SP 1.0)
|
||||
# rather than an arbitrary leftover bit pattern (possibly NaN or denormal)
|
||||
vmovaps xmm2, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
INSTR xmm9, xmm2, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm10, xmm2, xmm1
|
||||
INSTR xmm11, xmm2, xmm1
|
||||
INSTR xmm12, xmm2, xmm1
|
||||
INSTR xmm13, xmm2, xmm1
|
||||
INSTR xmm14, xmm2, xmm1
|
||||
INSTR xmm15, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
42
src/AVX/vfmadd213ps-sse.S
Normal file
42
src/AVX/vfmadd213ps-sse.S
Normal file
@@ -0,0 +1,42 @@
|
||||
#define INSTR vfmadd213ps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
49
src/AVX/vfmadd213sd-TP.S
Normal file
49
src/AVX/vfmadd213sd-TP.S
Normal file
@@ -0,0 +1,49 @@
|
||||
#define INSTR vfmadd213sd
|
||||
#define NINST 13
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# copy DP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
# xmm2 is a source operand of the FMAs in the loop; give it a defined value (DP 1.0)
|
||||
# rather than an arbitrary leftover bit pattern (possibly NaN or denormal)
|
||||
vmovaps xmm2, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
INSTR xmm9, xmm2, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm10, xmm2, xmm1
|
||||
INSTR xmm11, xmm2, xmm1
|
||||
INSTR xmm12, xmm2, xmm1
|
||||
INSTR xmm13, xmm2, xmm1
|
||||
INSTR xmm14, xmm2, xmm1
|
||||
INSTR xmm15, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
42
src/AVX/vfmadd213sd.S
Normal file
42
src/AVX/vfmadd213sd.S
Normal file
@@ -0,0 +1,42 @@
|
||||
#define INSTR vfmadd213sd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# copy DP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
49
src/AVX/vfmadd213ss-TP.S
Normal file
49
src/AVX/vfmadd213ss-TP.S
Normal file
@@ -0,0 +1,49 @@
|
||||
#define INSTR vfmadd213ss
|
||||
#define NINST 13
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
# xmm2 is a source operand of the FMAs in the loop; give it a defined value (SP 1.0)
|
||||
# rather than an arbitrary leftover bit pattern (possibly NaN or denormal)
|
||||
vmovaps xmm2, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
INSTR xmm9, xmm2, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm10, xmm2, xmm1
|
||||
INSTR xmm11, xmm2, xmm1
|
||||
INSTR xmm12, xmm2, xmm1
|
||||
INSTR xmm13, xmm2, xmm1
|
||||
INSTR xmm14, xmm2, xmm1
|
||||
INSTR xmm15, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
42
src/AVX/vfmadd213ss.S
Normal file
42
src/AVX/vfmadd213ss.S
Normal file
@@ -0,0 +1,42 @@
|
||||
#define INSTR vfmadd213ss
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (bit pattern 0x3F800000 in each 32-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# copy SP 1.0
|
||||
vmovaps xmm1, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
INSTR xmm0, xmm1, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
46
src/AVX/vmulpd-avx-TP.S
Normal file
46
src/AVX/vmulpd-avx-TP.S
Normal file
@@ -0,0 +1,46 @@
|
||||
#define INSTR vmulpd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd ymm1, ymm0, ymm0
|
||||
# create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0, ymm1
|
||||
INSTR ymm4, ymm1, ymm0
|
||||
INSTR ymm5, ymm0, ymm2
|
||||
cmp i, N
|
||||
INSTR ymm6, ymm2, ymm0
|
||||
INSTR ymm7, ymm1, ymm2
|
||||
INSTR ymm8, ymm2, ymm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
46
src/AVX/vmulpd-avx.S
Normal file
46
src/AVX/vmulpd-avx.S
Normal file
@@ -0,0 +1,46 @@
|
||||
#define INSTR vmulpd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd ymm1, ymm0, ymm0
|
||||
# create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
INSTR ymm0, ymm0, ymm2
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
cmp i, N
|
||||
INSTR ymm0, ymm0, ymm2
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
INSTR ymm0, ymm0, ymm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
44
src/AVX/vmulpd-sse-TP.S
Normal file
44
src/AVX/vmulpd-sse-TP.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR vmulpd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm0, xmm0
|
||||
# create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
cmp i, N
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
44
src/AVX/vmulpd-sse.S
Normal file
44
src/AVX/vmulpd-sse.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR vmulpd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000 in each 64-bit lane)
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero
|
||||
# create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm0, xmm0
|
||||
# create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
46
src/AVX/vmulps-avx-TP.S
Normal file
46
src/AVX/vmulps-avx-TP.S
Normal file
@@ -0,0 +1,46 @@
|
||||
#define INSTR vmulps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (0x3F800000) in all four 32-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
# expand from SSE to AVX: copy the low 128 bits into the high 128 bits of ymm0
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# create SP 2.0
|
||||
vaddps ymm1, ymm0, ymm0
|
||||
# create SP 0.5
|
||||
vdivps ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0, ymm1
|
||||
INSTR ymm4, ymm1, ymm0
|
||||
INSTR ymm5, ymm0, ymm2
|
||||
cmp i, N
|
||||
INSTR ymm6, ymm2, ymm0
|
||||
INSTR ymm7, ymm1, ymm2
|
||||
INSTR ymm8, ymm2, ymm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
46
src/AVX/vmulps-avx.S
Normal file
46
src/AVX/vmulps-avx.S
Normal file
@@ -0,0 +1,46 @@
|
||||
#define INSTR vmulps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (0x3F800000) in all four 32-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
# expand from SSE to AVX: copy the low 128 bits into the high 128 bits of ymm0
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# create SP 2.0 (multiplier that doubles the running value in the loop)
|
||||
vaddps ymm1, ymm0, ymm0
|
||||
# create SP 0.5 (multiplier that halves it again, so the chained value stays bounded)
|
||||
vdivps ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
INSTR ymm0, ymm0, ymm2
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
cmp i, N
|
||||
INSTR ymm0, ymm0, ymm2
|
||||
INSTR ymm0, ymm0, ymm1
|
||||
INSTR ymm0, ymm0, ymm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
44
src/AVX/vmulps-sse-TP.S
Normal file
44
src/AVX/vmulps-sse-TP.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR vmulps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (0x3F800000) in all four 32-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
# create SP 2.0
|
||||
vaddps xmm1, xmm0, xmm0
|
||||
# create SP 0.5
|
||||
vdivps xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
cmp i, N
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
44
src/AVX/vmulps-sse.S
Normal file
44
src/AVX/vmulps-sse.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR vmulps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (0x3F800000) in all four 32-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
# create SP 2.0 (multiplier that doubles the running value in the loop)
|
||||
vaddps xmm1, xmm0, xmm0
|
||||
# create SP 0.5 (multiplier that halves it again, so the chained value stays bounded)
|
||||
vdivps xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
44
src/AVX/vmulsd-TP.S
Normal file
44
src/AVX/vmulsd-TP.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR vmulsd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without a memory load; INSTR is vmulsd, so the operands must be double-precision bit patterns
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
# create DP 2.0
|
||||
vaddpd xmm1, xmm0, xmm0
|
||||
# create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
cmp i, N
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
44
src/AVX/vmulsd.S
Normal file
44
src/AVX/vmulsd.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR vmulsd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
# create DP 2.0 (multiplier that doubles the running value in the loop)
|
||||
vaddpd xmm1, xmm0, xmm0
|
||||
# create DP 0.5 (multiplier that halves it again, so the chained value stays bounded)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
44
src/AVX/vmulss-TP.S
Normal file
44
src/AVX/vmulss-TP.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR vmulss
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (0x3F800000) in all four 32-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
# create SP 2.0
|
||||
vaddps xmm1, xmm0, xmm0
|
||||
# create SP 0.5
|
||||
vdivps xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm1
|
||||
INSTR xmm4, xmm1, xmm0
|
||||
INSTR xmm5, xmm0, xmm2
|
||||
cmp i, N
|
||||
INSTR xmm6, xmm2, xmm0
|
||||
INSTR xmm7, xmm1, xmm2
|
||||
INSTR xmm8, xmm2, xmm1
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
44
src/AVX/vmulss.S
Normal file
44
src/AVX/vmulss.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR vmulss
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (0x3F800000) in all four 32-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
# create SP 2.0 (multiplier that doubles the running value in the loop)
|
||||
vaddps xmm1, xmm0, xmm0
|
||||
# create SP 0.5 (multiplier that halves it again, so the chained value stays bounded)
|
||||
vdivps xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
cmp i, N
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
INSTR xmm0, xmm0, xmm1
|
||||
INSTR xmm0, xmm0, xmm2
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
60
src/AVX/vrcpps-avx-TP.S
Normal file
60
src/AVX/vrcpps-avx-TP.S
Normal file
@@ -0,0 +1,60 @@
|
||||
#define INSTR vrcpps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (0x3F800000) in all four 32-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1 # expand from SSE to AVX: copy the low 128 bits into the high 128 bits of ymm0
|
||||
|
||||
vaddps ymm1, ymm0, ymm0 # create 2.0
|
||||
vaddps ymm2, ymm0, ymm1 # create 3.0
|
||||
vaddps ymm4, ymm1, ymm1 # create 4.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 8.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 16.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 32.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 64.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 128.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 256.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 512.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 1024.0
|
||||
vdivps ymm1, ymm4, ymm2 # create 341.3333
|
||||
vdivps ymm2, ymm0, ymm1 # create 1/341.3333
|
||||
vaddps ymm0, ymm1, ymm1 # create 2*341.3333
|
||||
vmovaps ymm1, ymm0
|
||||
vmovaps ymm2, ymm0
|
||||
vmovaps ymm3, ymm0
|
||||
vmovaps ymm4, ymm0
|
||||
vmovaps ymm5, ymm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm10, ymm0
|
||||
INSTR ymm11, ymm1
|
||||
INSTR ymm12, ymm2
|
||||
cmp i, N
|
||||
INSTR ymm13, ymm3
|
||||
INSTR ymm14, ymm4
|
||||
INSTR ymm15, ymm5
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
55
src/AVX/vrcpps-avx.S
Normal file
55
src/AVX/vrcpps-avx.S
Normal file
@@ -0,0 +1,55 @@
|
||||
#define INSTR vrcpps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (0x3F800000) in all four 32-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1 # expand from SSE to AVX: copy the low 128 bits into the high 128 bits of ymm0
|
||||
|
||||
vaddps ymm1, ymm0, ymm0 # create 2.0
|
||||
vaddps ymm2, ymm0, ymm1 # create 3.0
|
||||
vaddps ymm4, ymm1, ymm1 # create 4.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 8.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 16.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 32.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 64.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 128.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 256.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 512.0
|
||||
vaddps ymm4, ymm4, ymm4 # create 1024.0
|
||||
vdivps ymm1, ymm4, ymm2 # create 341.3333
|
||||
vdivps ymm2, ymm0, ymm1 # create 1/341.3333
|
||||
vaddps ymm0, ymm1, ymm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm2, ymm1
|
||||
INSTR ymm3, ymm2
|
||||
cmp i, N
|
||||
INSTR ymm4, ymm3
|
||||
INSTR ymm5, ymm4
|
||||
INSTR ymm0, ymm5
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
59
src/AVX/vrcpps-sse-TP.S
Normal file
59
src/AVX/vrcpps-sse-TP.S
Normal file
@@ -0,0 +1,59 @@
|
||||
#define INSTR vrcpps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (0x3F800000) in all four 32-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
|
||||
vaddps xmm1, xmm0, xmm0 # create 2.0
|
||||
vaddps xmm2, xmm0, xmm1 # create 3.0
|
||||
vaddps xmm4, xmm1, xmm1 # create 4.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 8.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 16.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 32.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 64.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 128.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 256.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 512.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 1024.0
|
||||
vdivps xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
vmovaps xmm1, xmm0
|
||||
vmovaps xmm2, xmm0
|
||||
vmovaps xmm3, xmm0
|
||||
vmovaps xmm4, xmm0
|
||||
vmovaps xmm5, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm10, xmm0
|
||||
INSTR xmm11, xmm1
|
||||
INSTR xmm12, xmm2
|
||||
cmp i, N
|
||||
INSTR xmm13, xmm3
|
||||
INSTR xmm14, xmm4
|
||||
INSTR xmm15, xmm5
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
54
src/AVX/vrcpps-sse.S
Normal file
54
src/AVX/vrcpps-sse.S
Normal file
@@ -0,0 +1,54 @@
|
||||
#define INSTR vrcpps
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create SP 1.0 (0x3F800000) in all four 32-bit lanes of xmm0 without a memory load
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
|
||||
vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign, 1 bit for the leading exponent bit (zero); mantissa stays all zero
|
||||
|
||||
vaddps xmm1, xmm0, xmm0 # create 2.0
|
||||
vaddps xmm2, xmm0, xmm1 # create 3.0
|
||||
vaddps xmm4, xmm1, xmm1 # create 4.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 8.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 16.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 32.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 64.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 128.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 256.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 512.0
|
||||
vaddps xmm4, xmm4, xmm4 # create 1024.0
|
||||
vdivps xmm1, xmm4, xmm2 # create 341.3333
|
||||
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
|
||||
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm2, xmm1
|
||||
INSTR xmm3, xmm2
|
||||
cmp i, N
|
||||
INSTR xmm4, xmm3
|
||||
INSTR xmm5, xmm4
|
||||
INSTR xmm0, xmm5
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
47
src/vsx/xvadddp.S
Normal file
47
src/vsx/xvadddp.S
Normal file
@@ -0,0 +1,47 @@
|
||||
#define INSTR xvadddp
|
||||
#define NINST 6
|
||||
#define N 3
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.balign 16 # 16-byte alignment for the vector constants; on PowerPC gas, .align 16 would request 2^16 bytes
|
||||
zero:
|
||||
.double 0.0, 0.0 # 16-byte DP vector, initial value of the accumulator register
|
||||
one:
|
||||
.double 1.0, 1.0 # 16-byte DP vector, addend used by every loop instruction
|
||||
.text
|
||||
.abiversion 2
|
||||
.section ".toc","aw"
|
||||
.section ".text"
|
||||
.align 2
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
latency :
|
||||
0: addis 2,12,.TOC.-0b@ha
|
||||
addi 2,2,.TOC.-0b@l
|
||||
.localentry latency, .-latency
|
||||
|
||||
mtctr N # move to count register
|
||||
# load DP FP zero into VSR0 and DP FP one into VSR1
|
||||
li 10, 0 # r10 = 0; not referenced again in this routine (the loads below use a literal 0 as RA)
|
||||
|
||||
addis 9,2,zero@toc@ha # r9 = r2 (TOC pointer) + high-adjusted 16 bits of the TOC-relative offset of zero
|
||||
addi 9,9,zero@toc@l # add the low 16 bits of the offset: r9 = address of zero
|
||||
lxvd2x 0, 0, 9 # VSR0 = 16 bytes at zero (accumulator of the add chain)
|
||||
|
||||
addis 9,2,one@toc@ha # r9 = r2 + high-adjusted 16 bits of the TOC-relative offset of one
|
||||
addi 9,9,one@toc@l # add the low 16 bits of the offset: r9 = address of one
|
||||
lxvd2x 1, 0, 9 # VSR1 = 16 bytes at one (addend)
|
||||
loop:
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
bdnz loop
|
||||
xvmovdp 1, 0
|
||||
blr
|
||||
.size latency, .-latency
|
47
src/vsx/xvaddsp.S
Normal file
47
src/vsx/xvaddsp.S
Normal file
@@ -0,0 +1,47 @@
|
||||
#define INSTR xvaddsp
|
||||
#define NINST 6
|
||||
#define N 3
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.balign 16 # 16-byte alignment for the vector constants; on PowerPC gas, .align 16 would request 2^16 bytes
|
||||
zero:
|
||||
.single 0.0, 0.0, 0.0, 0.0 # four SP elements = 16 bytes, the amount lxvd2x reads from this label
|
||||
one:
|
||||
.single 1.0, 1.0, 1.0, 1.0 # four SP elements = 16 bytes, the amount lxvd2x reads from this label
|
||||
.text
|
||||
.abiversion 2
|
||||
.section ".toc","aw"
|
||||
.section ".text"
|
||||
.align 2
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
latency :
|
||||
0: addis 2,12,.TOC.-0b@ha
|
||||
addi 2,2,.TOC.-0b@l
|
||||
.localentry latency, .-latency
|
||||
|
||||
mtctr N # move to count register
|
||||
# load the SP FP vector at zero into VSR0 and the one at one into VSR1
|
||||
li 10, 0 # r10 = 0; not referenced again in this routine (the loads below use a literal 0 as RA)
|
||||
|
||||
addis 9,2,zero@toc@ha # r9 = r2 (TOC pointer) + high-adjusted 16 bits of the TOC-relative offset of zero
|
||||
addi 9,9,zero@toc@l # add the low 16 bits of the offset: r9 = address of zero
|
||||
lxvd2x 0, 0, 9 # VSR0 = 16 bytes (two doublewords) starting at zero
|
||||
|
||||
addis 9,2,one@toc@ha # r9 = r2 + high-adjusted 16 bits of the TOC-relative offset of one
|
||||
addi 9,9,one@toc@l # add the low 16 bits of the offset: r9 = address of one
|
||||
lxvd2x 1, 0, 9 # VSR1 = 16 bytes (two doublewords) starting at one
|
||||
loop:
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
bdnz loop
|
||||
xvmovdp 1, 0
|
||||
blr
|
||||
.size latency, .-latency
|
49
src/vsx/xvdivdp.S
Normal file
49
src/vsx/xvdivdp.S
Normal file
@@ -0,0 +1,49 @@
|
||||
#define INSTR xvdivdp
|
||||
#define NINST 6
|
||||
#define N 3
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.balign 16 # 16-byte alignment for the vector constants; on PowerPC gas, .align 16 would request 2^16 bytes
|
||||
half:
|
||||
.double 0.5, 0.5 # 16-byte DP vector, divisor that doubles the running value
|
||||
one:
|
||||
.double 1.0, 1.0 # 16-byte DP vector, initial dividend
|
||||
two:
|
||||
.double 2.0, 2.0 # 16-byte DP vector, divisor that halves the running value again
|
||||
.text
|
||||
.abiversion 2
|
||||
.section ".toc","aw"
|
||||
.section ".text"
|
||||
.align 2
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
latency :
|
||||
0: addis 2,12,.TOC.-0b@ha
|
||||
addi 2,2,.TOC.-0b@l
|
||||
.localentry latency, .-latency
|
||||
|
||||
mtctr N # move to count register
|
||||
li 10, 0 # r10 = 0; not referenced again in this routine (the loads below use a literal 0 as RA)
|
||||
addis 9,2,one@toc@ha # r9 = r2 (TOC pointer) + high-adjusted 16 bits of the TOC-relative offset of one
|
||||
addi 9,9,one@toc@l # add the low 16 bits of the offset: r9 = address of one
|
||||
lxvd2x 0, 0, 9 # VSR0 = 16 bytes at one (initial dividend and running result)
|
||||
addis 9,2,half@toc@ha # r9 = r2 + high-adjusted 16 bits of the TOC-relative offset of half
|
||||
addi 9,9,half@toc@l # add the low 16 bits of the offset: r9 = address of half
|
||||
lxvd2x 1, 0, 9 # VSR1 = 16 bytes at half (first divisor)
|
||||
addis 9,2,two@toc@ha # r9 = r2 + high-adjusted 16 bits of the TOC-relative offset of two
|
||||
addi 9,9,two@toc@l # add the low 16 bits of the offset: r9 = address of two
|
||||
lxvd2x 2, 0, 9 # VSR2 = 16 bytes at two (second divisor)
|
||||
loop:
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 2
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 2
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 2
|
||||
bdnz loop
|
||||
xvmovdp 1, 0
|
||||
blr
|
||||
.size latency, .-latency
|
49
src/vsx/xvdivsp.S
Normal file
49
src/vsx/xvdivsp.S
Normal file
@@ -0,0 +1,49 @@
|
||||
#define INSTR xvdivsp
|
||||
#define NINST 6
|
||||
#define N 3
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.balign 16 # 16-byte alignment for the vector constants; on PowerPC gas, .align 16 would request 2^16 bytes
|
||||
half:
|
||||
.single 0.5, 0.5, 0.5, 0.5 # four SP elements = 16 bytes, the amount lxvd2x reads from this label
|
||||
one:
|
||||
.single 1.0, 1.0, 1.0, 1.0 # four SP elements = 16 bytes, the amount lxvd2x reads from this label
|
||||
two:
|
||||
.single 2.0, 2.0, 2.0, 2.0 # four SP elements so that no lane of this divisor comes from bytes past the constant
|
||||
.text
|
||||
.abiversion 2
|
||||
.section ".toc","aw"
|
||||
.section ".text"
|
||||
.align 2
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
latency :
|
||||
0: addis 2,12,.TOC.-0b@ha
|
||||
addi 2,2,.TOC.-0b@l
|
||||
.localentry latency, .-latency
|
||||
|
||||
mtctr N # move to count register
|
||||
li 10, 0 # r10 = 0; not referenced again in this routine (the loads below use a literal 0 as RA)
|
||||
addis 9,2,one@toc@ha # r9 = r2 (TOC pointer) + high-adjusted 16 bits of the TOC-relative offset of one
|
||||
addi 9,9,one@toc@l # add the low 16 bits of the offset: r9 = address of one
|
||||
lxvd2x 0, 0, 9 # VSR0 = 16 bytes (two doublewords) starting at one (initial dividend and running result)
|
||||
addis 9,2,half@toc@ha # r9 = r2 + high-adjusted 16 bits of the TOC-relative offset of half
|
||||
addi 9,9,half@toc@l # add the low 16 bits of the offset: r9 = address of half
|
||||
lxvd2x 1, 0, 9 # VSR1 = 16 bytes (two doublewords) starting at half (first divisor)
|
||||
addis 9,2,two@toc@ha # r9 = r2 + high-adjusted 16 bits of the TOC-relative offset of two
|
||||
addi 9,9,two@toc@l # add the low 16 bits of the offset: r9 = address of two
|
||||
lxvd2x 2, 0, 9 # VSR2 = 16 bytes (two doublewords) starting at two (second divisor)
|
||||
loop:
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 2
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 2
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 2
|
||||
bdnz loop
|
||||
xvmovdp 1, 0
|
||||
blr
|
||||
.size latency, .-latency
|
53
src/vsx/xvmaddadp.S
Normal file
53
src/vsx/xvmaddadp.S
Normal file
@@ -0,0 +1,53 @@
|
||||
#define INSTR xvmaddadp
|
||||
#define NINST 6
|
||||
#define N 3
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.balign 16 # 16-byte alignment for the vector constants; on PowerPC gas, .align 16 would request 2^16 bytes
|
||||
zero:
|
||||
.double 0.0, 0.0 # 16-byte DP vector, initial value of the accumulator register
|
||||
two:
|
||||
.double 2.0, 2.0 # 16-byte DP vector, first multiplicand
|
||||
three:
|
||||
.double 3.0, 3.0 # 16-byte DP vector, second multiplicand
|
||||
.text
|
||||
.abiversion 2
|
||||
.section ".toc","aw"
|
||||
.section ".text"
|
||||
.align 2
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
latency :
|
||||
0: addis 2,12,.TOC.-0b@ha
|
||||
addi 2,2,.TOC.-0b@l
|
||||
.localentry latency, .-latency
|
||||
|
||||
mtctr N # move to count register
|
||||
# load DP FP zero into VSR0, two into VSR1 and three into VSR2
|
||||
li 10, 0 # r10 = 0; not referenced again in this routine (the loads below use a literal 0 as RA)
|
||||
|
||||
addis 9,2,zero@toc@ha # r9 = r2 (TOC pointer) + high-adjusted 16 bits of the TOC-relative offset of zero
|
||||
addi 9,9,zero@toc@l # add the low 16 bits of the offset: r9 = address of zero
|
||||
lxvd2x 0, 0, 9 # VSR0 = 16 bytes at zero (target register of every loop instruction)
|
||||
|
||||
addis 9,2,two@toc@ha # r9 = r2 + high-adjusted 16 bits of the TOC-relative offset of two
|
||||
addi 9,9,two@toc@l # add the low 16 bits of the offset: r9 = address of two
|
||||
lxvd2x 1, 0, 9 # VSR1 = 16 bytes at two (first source operand)
|
||||
|
||||
addis 9,2,three@toc@ha # r9 = r2 + high-adjusted 16 bits of the TOC-relative offset of three
|
||||
addi 9,9,three@toc@l # add the low 16 bits of the offset: r9 = address of three
|
||||
lxvd2x 2, 0, 9 # VSR2 = 16 bytes at three (second source operand)
|
||||
loop:
|
||||
INSTR 0, 1, 2
|
||||
INSTR 0, 1, 2
|
||||
INSTR 0, 1, 2
|
||||
INSTR 0, 1, 2
|
||||
INSTR 0, 1, 2
|
||||
INSTR 0, 1, 2
|
||||
bdnz loop
|
||||
xvmovdp 1, 0
|
||||
blr
|
||||
.size latency, .-latency
|
44
src/vsx/xvmuldp.S
Normal file
44
src/vsx/xvmuldp.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR xvmuldp
|
||||
#define NINST 6
|
||||
#define N 3
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.balign 16 # 16-byte alignment for the vector constants; on PowerPC gas, .align 16 would request 2^16 bytes
|
||||
zero:
|
||||
.double 0.0, 0.0 # 16-byte DP vector; not loaded by the code in this file
|
||||
one:
|
||||
.double 1.0, 1.0 # 16-byte DP vector, used for both multiplication operands
|
||||
.text
|
||||
.abiversion 2
|
||||
.section ".toc","aw"
|
||||
.section ".text"
|
||||
.align 2
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
latency :
|
||||
0: addis 2,12,.TOC.-0b@ha
|
||||
addi 2,2,.TOC.-0b@l
|
||||
.localentry latency, .-latency
|
||||
|
||||
mtctr N # move to count register
|
||||
li 10, 0 # r10 = 0; not referenced again in this routine (the loads below use a literal 0 as RA)
|
||||
addis 9,2,one@toc@ha # r9 = r2 (TOC pointer) + high-adjusted 16 bits of the TOC-relative offset of one
|
||||
addi 9,9,one@toc@l # add the low 16 bits of the offset: r9 = address of one
|
||||
lxvd2x 0, 0, 9 # VSR0 = 16 bytes at one (running product)
|
||||
addis 9,2,one@toc@ha # same address computation again; both operands are loaded from one
|
||||
addi 9,9,one@toc@l # add the low 16 bits of the offset: r9 = address of one
|
||||
lxvd2x 1, 0, 9 # VSR1 = 16 bytes at one (multiplier)
|
||||
loop:
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
bdnz loop
|
||||
xvmovdp 1, 0
|
||||
blr
|
||||
.size latency, .-latency
|
44
src/vsx/xvmulsp.S
Normal file
44
src/vsx/xvmulsp.S
Normal file
@@ -0,0 +1,44 @@
|
||||
#define INSTR xvmulsp
|
||||
#define NINST 6
|
||||
#define N 3
|
||||
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.balign 16 # 16-byte alignment for the vector constants; on PowerPC gas, .align 16 would request 2^16 bytes
|
||||
zero:
|
||||
.single 0.0, 0.0, 0.0, 0.0 # four SP elements = 16 bytes; not loaded by the code in this file
|
||||
one:
|
||||
.single 1.0, 1.0, 1.0, 1.0 # four SP elements = 16 bytes, the amount lxvd2x reads from this label
|
||||
.text
|
||||
.abiversion 2
|
||||
.section ".toc","aw"
|
||||
.section ".text"
|
||||
.align 2
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
latency :
|
||||
0: addis 2,12,.TOC.-0b@ha
|
||||
addi 2,2,.TOC.-0b@l
|
||||
.localentry latency, .-latency
|
||||
|
||||
mtctr N # move to count register
|
||||
li 10, 0 # r10 = 0; not referenced again in this routine (the loads below use a literal 0 as RA)
|
||||
addis 9,2,one@toc@ha # r9 = r2 (TOC pointer) + high-adjusted 16 bits of the TOC-relative offset of one
|
||||
addi 9,9,one@toc@l # add the low 16 bits of the offset: r9 = address of one
|
||||
lxvd2x 0, 0, 9 # VSR0 = 16 bytes (two doublewords) starting at one (running product)
|
||||
addis 9,2,one@toc@ha # same address computation again; both operands are loaded from one
|
||||
addi 9,9,one@toc@l # add the low 16 bits of the offset: r9 = address of one
|
||||
lxvd2x 1, 0, 9 # VSR1 = 16 bytes (two doublewords) starting at one (multiplier)
|
||||
loop:
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
INSTR 0, 0, 1
|
||||
bdnz loop
|
||||
xvmovdp 1, 0
|
||||
blr
|
||||
.size latency, .-latency
|
Reference in New Issue
Block a user