initial import

This commit is contained in:
Johannes Hofmann
2017-05-19 12:18:17 +02:00
parent ccf4f3333d
commit b99cacd528
87 changed files with 4142 additions and 0 deletions

26
Makefile Normal file
View File

@@ -0,0 +1,26 @@
# Build driver for ibench: compiles the benchmark runner and assembles every
# kernel listed in $(KERNELS) (set by the per-compiler include file) into its
# own shared object below a directory named after its source subdirectory.
COMPILER=ICC
TARGET = ibench
SRC_DIR = src
# One output directory per entry found directly under $(SRC_DIR).
KDIRS += $(patsubst $(SRC_DIR)/%, %, $(wildcard $(SRC_DIR)/*))
# Set Q to empty (make Q=) to see the full command lines.
Q = @
include include_$(COMPILER).mk

# The output directories are order-only prerequisites: they must exist, but
# their changing timestamps must not trigger a rebuild of the runner.
$(TARGET): ibench.c $(KERNELS) | $(KDIRS)
	$(Q)echo "===> COMPILING $@"
	$(Q)$(CC) $(CFLAGS) $< -o $@ -ldl

# Create only the directory this rule was invoked for; -p keeps repeated
# invocations from failing when the directory already exists.
$(KDIRS):
	$(Q)mkdir -p $@

# Each kernel depends on its own source so that editing a .S file rebuilds
# exactly that shared object.
%.so: $(SRC_DIR)/%.S | $(KDIRS)
	$(Q)echo "===> ASSEMBLING $@"
	$(Q)$(AS) $(LFLAGS) $< -o $@

.PHONY: clean
clean:
	$(Q)echo "===> CLEAN"
	$(Q)rm -rf $(KDIRS)
	$(Q)rm -f $(TARGET)

105
ibench.c Normal file
View File

@@ -0,0 +1,105 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <dlfcn.h>
#include <immintrin.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <sys/types.h>
double (*latency)(int);
int *ninst;
void benchmark(const int N, float freq, char *sofile) {
struct timeval start, end;
double benchtime;
char *instr = strtok(sofile, ".");
double result;
// run benchmark
gettimeofday(&start, NULL);
result = (*latency)(N);
gettimeofday(&end, NULL);
benchtime = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec);
// divide by 1e6 (usec -> s), ninst (number of instr per loop),
// N/1e9 (loop count vs. GHz); multiply by frequency
benchtime = benchtime / (1e6 * *ninst / freq * (N / 1e9));
printf("%s:%s\t%.3f (clock cycles)\t[DEBUG - result: %f]\n", instr, strlen(instr) + 1 < 8 ? "\t" : "", benchtime, result);
}
/*
 * Entry point: ibench <directory> <frequency in GHz>
 *
 * Loads every file ending in ".so" from <directory>, resolves the symbols
 * `latency` and `ninst` in it and hands it to benchmark().
 *
 * Returns 0 when all shared objects were processed; terminates with
 * EXIT_FAILURE on a usage error, an invalid frequency, or any failure to
 * open the directory, allocate memory, or load/resolve a shared object.
 */
int main(int argc, const char *argv[]) {
    // one million runs
    const int N = 1000000;
    float freq = 0.0f;

    // need a target directory containing benchmarks
    if (argc < 2) {
        printf("please specify a directory containing the shared objects with benchmarks to run\n");
        exit(EXIT_FAILURE);
    }

    // did the command line specify a frequency?
    if (argc < 3) {
        printf("Please specify the CPU frequency in GHz. For best results make "
               "sure the frequency is fixed, otherwise SpeedStep/Turbo Boost "
               "might distort the results.\n");
        exit(EXIT_FAILURE);
    }
    freq = atof(argv[2]);
    // the frequency is a divisor in the cycle computation, so reject values
    // that are zero, negative or not a number
    if (!(freq > 0.0f)) {
        fprintf(stderr, "invalid CPU frequency \"%s\": expected a positive value in GHz\n", argv[2]);
        exit(EXIT_FAILURE);
    }
    printf("Using frequency %.2fGHz.\n", freq);

    // perform benchmark for all shared objects in target directory
    DIR *dirp;
    struct dirent *dp;
    if ((dirp = opendir(argv[1])) == NULL) {
        perror("opendir");
        exit(EXIT_FAILURE);
    }

    const char *suffix = ".so";
    const size_t lensuffix = strlen(suffix);
    const size_t len1 = strlen(argv[1]);

    while ((dp = readdir(dirp)) != NULL) {
        const size_t len2 = strlen(dp->d_name);

        // only try .so files; names shorter than the suffix (such as "." and
        // "..") are skipped before computing the tail pointer, which would
        // otherwise point in front of the name
        if (len2 < lensuffix || strcmp(dp->d_name + len2 - lensuffix, suffix) != 0)
            continue;

        // build "<dir>/<name>"; the directory might be missing a trailing '/'
        char *relpath;
        if ((relpath = malloc(len1 + len2 + 2)) == NULL) {
            perror("malloc");
            closedir(dirp);
            exit(EXIT_FAILURE);
        }
        snprintf(relpath, len1 + len2 + 2, "%s/%s", argv[1], dp->d_name);

        // load .so
        void *handle;
        if ((handle = dlopen(relpath, RTLD_LAZY)) == NULL) {
            fprintf(stderr, "dlopen: failed to open %s: %s\n", relpath,
                    dlerror());
            free(relpath);
            closedir(dirp);
            exit(EXIT_FAILURE);
        }
        if ((latency = (double (*)(int))dlsym(handle, "latency")) == NULL) {
            fprintf(stderr, "dlsym: couldn't find function latency in %s: %s\n",
                    relpath, dlerror());
            free(relpath);
            dlclose(handle);
            closedir(dirp);
            return (EXIT_FAILURE);
        }
        if ((ninst = (int *)dlsym(handle, "ninst")) == NULL) {
            fprintf(stderr, "dlsym: couldn't find symbol ninst in %s: %s\n",
                    relpath, dlerror());
            free(relpath);
            dlclose(handle);
            closedir(dirp);
            return (EXIT_FAILURE);
        }
        free(relpath);

        // do actual benchmark
        benchmark(N, freq, dp->d_name);
        dlclose(handle);
    }

    closedir(dirp);
    return 0;
}

9
include_GCC.mk Normal file
View File

@@ -0,0 +1,9 @@
# Toolchain settings used when the build is run with COMPILER=GCC.
CC = gcc
AS = gcc
# CFLAGS is applied when ibench.c is compiled, so it must not force the input
# language; forcing assembler input here makes gcc treat the C file as assembly.
CFLAGS = -O3
# LFLAGS is applied when the .S kernels are turned into shared objects; the
# kernels use C preprocessor macros, so preprocessed assembly is requested here.
LFLAGS = -shared -x assembler-with-cpp
# Kernels to build, expressed as output paths relative to the build directory.
# NOTE(review): these directory names are lower case (avx, avx2) while the
# sources visible in this commit live under src/AVX - confirm the names match
# on a case-sensitive file system.
KERNELS = $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/gp/*.S))
KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/sse/*.S))
KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/avx/*.S))
KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/avx2/*.S))

9
include_ICC.mk Normal file
View File

@@ -0,0 +1,9 @@
# Toolchain settings used when the build is run with COMPILER=ICC.
CC = icc
AS = icc
CFLAGS = -O3
LFLAGS = -shared
# Source subdirectories whose kernels are built for this toolchain.
ICC_KERNEL_DIRS = gp sse AVX AVX-512
# Map every src/<dir>/<name>.S to the output path <dir>/<name>.so.
KERNELS = $(foreach d,$(ICC_KERNEL_DIRS),$(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/$(d)/*.S)))

7
include_MIC.mk Normal file
View File

@@ -0,0 +1,7 @@
# Toolchain settings used when the build is run with COMPILER=MIC.
CC = icc
AS = icc
CFLAGS = -O3 -mmic
LFLAGS = -shared -mmic
# Source subdirectories whose kernels are built for this toolchain.
MIC_KERNEL_DIRS = gp imci
# Map every src/<dir>/<name>.S to the output path <dir>/<name>.so.
KERNELS = $(foreach d,$(MIC_KERNEL_DIRS),$(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/$(d)/*.S)))

6
include_POWER8.mk Normal file
View File

@@ -0,0 +1,6 @@
# Toolchain settings used when the build is run with COMPILER=POWER8.
CC = xlc
AS = xlc
CFLAGS = -O3
LFLAGS = -shared
# Source subdirectories whose kernels are built for this toolchain.
POWER8_KERNEL_DIRS = vsx
# Map every src/<dir>/<name>.S to the output path <dir>/<name>.so.
KERNELS = $(foreach d,$(POWER8_KERNEL_DIRS),$(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/$(d)/*.S)))

View File

@@ -0,0 +1,46 @@
# Throughput kernel for vaddpd on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions write six different destination registers and
# only read zmm0, zmm1 and zmm2, so they do not depend on one another.
#define INSTR vaddpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# copy DP 1.0
vmovapd zmm1, zmm0
# zmm2 is read inside the loop as well, so it must hold a defined value;
# leftover register contents could otherwise distort the measurement
vmovapd zmm2, zmm0
loop:
inc i
INSTR zmm3, zmm0, zmm1
INSTR zmm4, zmm1, zmm0
INSTR zmm5, zmm0, zmm2
cmp i, N
INSTR zmm6, zmm2, zmm0
INSTR zmm7, zmm1, zmm2
INSTR zmm8, zmm2, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,46 @@
# Latency kernel for vaddpd on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# Every measured instruction reads and writes zmm0, so each one consumes the
# result of the one before it.
#define INSTR vaddpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# copy DP 1.0
vmovapd zmm1, zmm0
# dependency chain: zmm0 = zmm0 + zmm1, six times per iteration
loop:
inc i
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm1
cmp i, N
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,46 @@
# Throughput kernel for vaddps on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions write six different destination registers and
# only read zmm0, zmm1 and zmm2, so they do not depend on one another.
#define INSTR vaddps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# copy SP 1.0
vmovaps zmm1, zmm0
# zmm2 is read inside the loop as well, so it must hold a defined value;
# leftover register contents could otherwise distort the measurement
vmovaps zmm2, zmm0
loop:
inc i
INSTR zmm3, zmm0, zmm1
INSTR zmm4, zmm1, zmm0
INSTR zmm5, zmm0, zmm2
cmp i, N
INSTR zmm6, zmm2, zmm0
INSTR zmm7, zmm1, zmm2
INSTR zmm8, zmm2, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,46 @@
# Latency kernel for vaddps on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# Every measured instruction reads and writes zmm0, so each one consumes the
# result of the one before it.
#define INSTR vaddps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# copy SP 1.0
vmovaps zmm1, zmm0
# dependency chain: zmm0 = zmm0 + zmm1, six times per iteration
loop:
inc i
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm1
cmp i, N
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,59 @@
# Throughput kernel for vdivpd on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions write six different destination registers and
# only read zmm0, zmm1 and zmm2, so they do not depend on one another.
#define INSTR vdivpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# derive the operands from 1.0; the values chosen are not powers of two
vaddpd zmm1, zmm0, zmm0 # create 2.0
vaddpd zmm2, zmm0, zmm1 # create 3.0
vaddpd zmm4, zmm1, zmm1 # create 4.0
vaddpd zmm4, zmm4, zmm4 # create 8.0
vaddpd zmm4, zmm4, zmm4 # create 16.0
vaddpd zmm4, zmm4, zmm4 # create 32.0
vaddpd zmm4, zmm4, zmm4 # create 64.0
vaddpd zmm4, zmm4, zmm4 # create 128.0
vaddpd zmm4, zmm4, zmm4 # create 256.0
vaddpd zmm4, zmm4, zmm4 # create 512.0
vaddpd zmm4, zmm4, zmm4 # create 1024.0
vdivpd zmm1, zmm4, zmm2 # create 341.3333
vdivpd zmm2, zmm0, zmm1 # create 1/341.3333
vaddpd zmm0, zmm1, zmm1 # create 2*341.3333
# loop inputs: zmm0 = 2*341.3333, zmm1 = 341.3333, zmm2 = 1/341.3333
loop:
inc i
INSTR zmm3, zmm0, zmm1
INSTR zmm4, zmm1, zmm0
INSTR zmm5, zmm0, zmm2
cmp i, N
INSTR zmm6, zmm2, zmm0
INSTR zmm7, zmm1, zmm2
INSTR zmm8, zmm2, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,59 @@
# Latency kernel for vdivpd on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# Every measured instruction reads and writes zmm0, so each one consumes the
# result of the one before it.
#define INSTR vdivpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# derive the operands from 1.0; the values chosen are not powers of two
vaddpd zmm1, zmm0, zmm0 # create 2.0
vaddpd zmm2, zmm0, zmm1 # create 3.0
vaddpd zmm4, zmm1, zmm1 # create 4.0
vaddpd zmm4, zmm4, zmm4 # create 8.0
vaddpd zmm4, zmm4, zmm4 # create 16.0
vaddpd zmm4, zmm4, zmm4 # create 32.0
vaddpd zmm4, zmm4, zmm4 # create 64.0
vaddpd zmm4, zmm4, zmm4 # create 128.0
vaddpd zmm4, zmm4, zmm4 # create 256.0
vaddpd zmm4, zmm4, zmm4 # create 512.0
vaddpd zmm4, zmm4, zmm4 # create 1024.0
vdivpd zmm1, zmm4, zmm2 # create 341.3333
vdivpd zmm2, zmm0, zmm1 # create 1/341.3333
vaddpd zmm0, zmm1, zmm1 # create 2*341.3333
# dependency chain on zmm0: the divisor alternates between zmm1 and its
# reciprocal zmm2, so the running value stays in a bounded range
loop:
inc i
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm2
INSTR zmm0, zmm0, zmm1
cmp i, N
INSTR zmm0, zmm0, zmm2
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm2
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,59 @@
# Throughput kernel for vdivps on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions write six different destination registers and
# only read zmm0, zmm1 and zmm2, so they do not depend on one another.
#define INSTR vdivps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# derive the operands from 1.0; the values chosen are not powers of two
vaddps zmm1, zmm0, zmm0 # create 2.0
vaddps zmm2, zmm0, zmm1 # create 3.0
vaddps zmm4, zmm1, zmm1 # create 4.0
vaddps zmm4, zmm4, zmm4 # create 8.0
vaddps zmm4, zmm4, zmm4 # create 16.0
vaddps zmm4, zmm4, zmm4 # create 32.0
vaddps zmm4, zmm4, zmm4 # create 64.0
vaddps zmm4, zmm4, zmm4 # create 128.0
vaddps zmm4, zmm4, zmm4 # create 256.0
vaddps zmm4, zmm4, zmm4 # create 512.0
vaddps zmm4, zmm4, zmm4 # create 1024.0
vdivps zmm1, zmm4, zmm2 # create 341.3333
vdivps zmm2, zmm0, zmm1 # create 1/341.3333
vaddps zmm0, zmm1, zmm1 # create 2*341.3333
# loop inputs: zmm0 = 2*341.3333, zmm1 = 341.3333, zmm2 = 1/341.3333
loop:
inc i
INSTR zmm3, zmm0, zmm1
INSTR zmm4, zmm1, zmm2
INSTR zmm5, zmm0, zmm2
cmp i, N
INSTR zmm6, zmm2, zmm0
INSTR zmm7, zmm1, zmm2
INSTR zmm8, zmm2, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,59 @@
# Latency kernel for vdivps on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# Every measured instruction reads and writes zmm0, so each one consumes the
# result of the one before it.
#define INSTR vdivps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# derive the operands from 1.0; the values chosen are not powers of two
vaddps zmm1, zmm0, zmm0 # create 2.0
vaddps zmm2, zmm0, zmm1 # create 3.0
vaddps zmm4, zmm1, zmm1 # create 4.0
vaddps zmm4, zmm4, zmm4 # create 8.0
vaddps zmm4, zmm4, zmm4 # create 16.0
vaddps zmm4, zmm4, zmm4 # create 32.0
vaddps zmm4, zmm4, zmm4 # create 64.0
vaddps zmm4, zmm4, zmm4 # create 128.0
vaddps zmm4, zmm4, zmm4 # create 256.0
vaddps zmm4, zmm4, zmm4 # create 512.0
vaddps zmm4, zmm4, zmm4 # create 1024.0
vdivps zmm1, zmm4, zmm2 # create 341.3333
vdivps zmm2, zmm0, zmm1 # create 1/341.3333
vaddps zmm0, zmm1, zmm1 # create 2*341.3333
# dependency chain on zmm0: the divisor alternates between zmm1 and its
# reciprocal zmm2, so the running value stays in a bounded range
loop:
inc i
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm2
INSTR zmm0, zmm0, zmm1
cmp i, N
INSTR zmm0, zmm0, zmm2
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm2
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,53 @@
# Throughput kernel for vfmadd213pd on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The thirteen measured instructions use thirteen different destination
# registers (zmm3 to zmm15). In the 213 form the destination is also an input
# (dest = src2 * dest + src3), so each destination carries its own chain from
# one loop iteration to the next.
#define INSTR vfmadd213pd
#define NINST 13
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# copy DP 1.0
vmovapd zmm1, zmm0
# zmm2 and the accumulators zmm3 to zmm15 are all read inside the loop, so
# they must hold defined values; leftover register contents could otherwise
# distort the measurement
vmovapd zmm2, zmm0
vmovapd zmm3, zmm0
vmovapd zmm4, zmm0
vmovapd zmm5, zmm0
vmovapd zmm6, zmm0
vmovapd zmm7, zmm0
vmovapd zmm8, zmm0
vmovapd zmm9, zmm0
vmovapd zmm10, zmm0
vmovapd zmm11, zmm0
vmovapd zmm12, zmm0
vmovapd zmm13, zmm0
vmovapd zmm14, zmm0
vmovapd zmm15, zmm0
loop:
inc i
INSTR zmm3, zmm0, zmm1
INSTR zmm4, zmm1, zmm0
INSTR zmm5, zmm0, zmm2
INSTR zmm6, zmm2, zmm0
INSTR zmm7, zmm1, zmm2
INSTR zmm8, zmm2, zmm1
INSTR zmm9, zmm2, zmm1
cmp i, N
INSTR zmm10, zmm2, zmm1
INSTR zmm11, zmm2, zmm1
INSTR zmm12, zmm2, zmm1
INSTR zmm13, zmm2, zmm1
INSTR zmm14, zmm2, zmm1
INSTR zmm15, zmm2, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,46 @@
# Latency kernel for vfmadd213pd on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# In the 213 form the destination is also an input (dest = src2 * dest + src3),
# and every measured instruction uses zmm0 as destination, so each one consumes
# the result of the one before it.
#define INSTR vfmadd213pd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# copy DP 1.0
vmovapd zmm1, zmm0
# dependency chain: zmm0 = zmm1 * zmm0 + zmm1, six times per iteration
loop:
inc i
INSTR zmm0, zmm1, zmm1
INSTR zmm0, zmm1, zmm1
INSTR zmm0, zmm1, zmm1
cmp i, N
INSTR zmm0, zmm1, zmm1
INSTR zmm0, zmm1, zmm1
INSTR zmm0, zmm1, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,53 @@
# Throughput kernel for vfmadd213ps on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The thirteen measured instructions use thirteen different destination
# registers (zmm3 to zmm15). In the 213 form the destination is also an input
# (dest = src2 * dest + src3), so each destination carries its own chain from
# one loop iteration to the next.
#define INSTR vfmadd213ps
#define NINST 13
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# copy SP 1.0
vmovaps zmm1, zmm0
# zmm2 and the accumulators zmm3 to zmm15 are all read inside the loop, so
# they must hold defined values; leftover register contents could otherwise
# distort the measurement
vmovaps zmm2, zmm0
vmovaps zmm3, zmm0
vmovaps zmm4, zmm0
vmovaps zmm5, zmm0
vmovaps zmm6, zmm0
vmovaps zmm7, zmm0
vmovaps zmm8, zmm0
vmovaps zmm9, zmm0
vmovaps zmm10, zmm0
vmovaps zmm11, zmm0
vmovaps zmm12, zmm0
vmovaps zmm13, zmm0
vmovaps zmm14, zmm0
vmovaps zmm15, zmm0
loop:
inc i
INSTR zmm3, zmm0, zmm1
INSTR zmm4, zmm1, zmm0
INSTR zmm5, zmm0, zmm2
INSTR zmm6, zmm2, zmm0
INSTR zmm7, zmm1, zmm2
INSTR zmm8, zmm2, zmm1
INSTR zmm9, zmm2, zmm1
cmp i, N
INSTR zmm10, zmm2, zmm1
INSTR zmm11, zmm2, zmm1
INSTR zmm12, zmm2, zmm1
INSTR zmm13, zmm2, zmm1
INSTR zmm14, zmm2, zmm1
INSTR zmm15, zmm2, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,46 @@
# Latency kernel for vfmadd213ps on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# In the 213 form the destination is also an input (dest = src2 * dest + src3),
# and every measured instruction uses zmm0 as destination, so each one consumes
# the result of the one before it.
#define INSTR vfmadd213ps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# copy SP 1.0
vmovaps zmm1, zmm0
# dependency chain: zmm0 = zmm1 * zmm0 + zmm1, six times per iteration
loop:
inc i
INSTR zmm0, zmm1, zmm1
INSTR zmm0, zmm1, zmm1
INSTR zmm0, zmm1, zmm1
cmp i, N
INSTR zmm0, zmm1, zmm1
INSTR zmm0, zmm1, zmm1
INSTR zmm0, zmm1, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,48 @@
# Throughput kernel for vmulpd on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions write six different destination registers and
# only read zmm0, zmm1 and zmm2, so they do not depend on one another.
#define INSTR vmulpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# create AVX-512 DP 2.0
vaddpd zmm1, zmm0, zmm0
# create AVX-512 DP 0.5
vdivpd zmm2, zmm0, zmm1
# loop inputs: zmm0 = 1.0, zmm1 = 2.0, zmm2 = 0.5
loop:
inc i
INSTR zmm3, zmm0, zmm1
INSTR zmm4, zmm1, zmm0
INSTR zmm5, zmm0, zmm2
cmp i, N
INSTR zmm6, zmm2, zmm0
INSTR zmm7, zmm1, zmm2
INSTR zmm8, zmm2, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,48 @@
# Latency kernel for vmulpd on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# Every measured instruction reads and writes zmm0, so each one consumes the
# result of the one before it.
#define INSTR vmulpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# create AVX-512 DP 2.0
vaddpd zmm1, zmm0, zmm0
# create AVX-512 DP 0.5
vdivpd zmm2, zmm0, zmm1
# dependency chain on zmm0: the factor alternates between 2.0 (zmm1) and
# 0.5 (zmm2), so the running value stays in a bounded range
loop:
inc i
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm2
INSTR zmm0, zmm0, zmm1
cmp i, N
INSTR zmm0, zmm0, zmm2
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm2
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,48 @@
# Throughput kernel for vmulps on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions write six different destination registers and
# only read zmm0, zmm1 and zmm2, so they do not depend on one another.
#define INSTR vmulps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# create AVX-512 SP 2.0
vaddps zmm1, zmm0, zmm0
# create AVX-512 SP 0.5
vdivps zmm2, zmm0, zmm1
# loop inputs: zmm0 = 1.0, zmm1 = 2.0, zmm2 = 0.5
loop:
inc i
INSTR zmm3, zmm0, zmm1
INSTR zmm4, zmm1, zmm0
INSTR zmm5, zmm0, zmm2
cmp i, N
INSTR zmm6, zmm2, zmm0
INSTR zmm7, zmm1, zmm2
INSTR zmm8, zmm2, zmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,48 @@
# Latency kernel for vmulps on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# Every measured instruction reads and writes zmm0, so each one consumes the
# result of the one before it.
#define INSTR vmulps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# create AVX-512 SP 2.0
vaddps zmm1, zmm0, zmm0
# create AVX-512 SP 0.5
vdivps zmm2, zmm0, zmm1
# dependency chain on zmm0: the factor alternates between 2.0 (zmm1) and
# 0.5 (zmm2), so the running value stays in a bounded range
loop:
inc i
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm2
INSTR zmm0, zmm0, zmm1
cmp i, N
INSTR zmm0, zmm0, zmm2
INSTR zmm0, zmm0, zmm1
INSTR zmm0, zmm0, zmm2
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,63 @@
# Throughput kernel for vrcp14pd on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions read zmm0 to zmm5 and write zmm10 to zmm15,
# so they do not depend on one another.
#define INSTR vrcp14pd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# derive the operand from 1.0; the value chosen is not a power of two
vaddpd zmm1, zmm0, zmm0 # create 2.0
vaddpd zmm2, zmm0, zmm1 # create 3.0
vaddpd zmm4, zmm1, zmm1 # create 4.0
vaddpd zmm4, zmm4, zmm4 # create 8.0
vaddpd zmm4, zmm4, zmm4 # create 16.0
vaddpd zmm4, zmm4, zmm4 # create 32.0
vaddpd zmm4, zmm4, zmm4 # create 64.0
vaddpd zmm4, zmm4, zmm4 # create 128.0
vaddpd zmm4, zmm4, zmm4 # create 256.0
vaddpd zmm4, zmm4, zmm4 # create 512.0
vaddpd zmm4, zmm4, zmm4 # create 1024.0
vdivpd zmm1, zmm4, zmm2 # create 341.3333
vdivpd zmm2, zmm0, zmm1 # create 1/341.3333
vaddpd zmm0, zmm1, zmm1 # create 2*341.3333
# copy 2*341.3333 into the other five source registers used by the loop
vmovapd zmm1, zmm0
vmovapd zmm2, zmm0
vmovapd zmm3, zmm0
vmovapd zmm4, zmm0
vmovapd zmm5, zmm0
loop:
inc i
INSTR zmm10, zmm0
INSTR zmm11, zmm1
INSTR zmm12, zmm2
cmp i, N
INSTR zmm13, zmm3
INSTR zmm14, zmm4
INSTR zmm15, zmm5
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,58 @@
# Latency kernel for vrcp14pd on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# Each measured instruction reads the register written by the previous one
# (zmm0 -> zmm1 -> zmm2 -> zmm3 -> zmm4 -> zmm5 -> zmm0), forming one chain.
#define INSTR vrcp14pd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# derive the operand from 1.0; the value chosen is not a power of two
vaddpd zmm1, zmm0, zmm0 # create 2.0
vaddpd zmm2, zmm0, zmm1 # create 3.0
vaddpd zmm4, zmm1, zmm1 # create 4.0
vaddpd zmm4, zmm4, zmm4 # create 8.0
vaddpd zmm4, zmm4, zmm4 # create 16.0
vaddpd zmm4, zmm4, zmm4 # create 32.0
vaddpd zmm4, zmm4, zmm4 # create 64.0
vaddpd zmm4, zmm4, zmm4 # create 128.0
vaddpd zmm4, zmm4, zmm4 # create 256.0
vaddpd zmm4, zmm4, zmm4 # create 512.0
vaddpd zmm4, zmm4, zmm4 # create 1024.0
vdivpd zmm1, zmm4, zmm2 # create 341.3333
vdivpd zmm2, zmm0, zmm1 # create 1/341.3333
vaddpd zmm0, zmm1, zmm1 # create 2*341.3333
# only zmm0 feeds the chain; zmm1 to zmm5 are overwritten before being read
loop:
inc i
INSTR zmm1, zmm0
INSTR zmm2, zmm1
INSTR zmm3, zmm2
cmp i, N
INSTR zmm4, zmm3
INSTR zmm5, zmm4
INSTR zmm0, zmm5
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,63 @@
# Throughput kernel for vrcp14ps on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions read zmm0 to zmm5 and write zmm10 to zmm15,
# so they do not depend on one another.
#define INSTR vrcp14ps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# derive the operand from 1.0; the value chosen is not a power of two
vaddps zmm1, zmm0, zmm0 # create 2.0
vaddps zmm2, zmm0, zmm1 # create 3.0
vaddps zmm4, zmm1, zmm1 # create 4.0
vaddps zmm4, zmm4, zmm4 # create 8.0
vaddps zmm4, zmm4, zmm4 # create 16.0
vaddps zmm4, zmm4, zmm4 # create 32.0
vaddps zmm4, zmm4, zmm4 # create 64.0
vaddps zmm4, zmm4, zmm4 # create 128.0
vaddps zmm4, zmm4, zmm4 # create 256.0
vaddps zmm4, zmm4, zmm4 # create 512.0
vaddps zmm4, zmm4, zmm4 # create 1024.0
vdivps zmm1, zmm4, zmm2 # create 341.3333
vdivps zmm2, zmm0, zmm1 # create 1/341.3333
vaddps zmm0, zmm1, zmm1 # create 2*341.3333
# copy 2*341.3333 into the other five source registers used by the loop
vmovaps zmm1, zmm0
vmovaps zmm2, zmm0
vmovaps zmm3, zmm0
vmovaps zmm4, zmm0
vmovaps zmm5, zmm0
loop:
inc i
INSTR zmm10, zmm0
INSTR zmm11, zmm1
INSTR zmm12, zmm2
cmp i, N
INSTR zmm13, zmm3
INSTR zmm14, zmm4
INSTR zmm15, zmm5
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,58 @@
# Latency kernel for vrcp14ps on 512-bit (zmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# Each measured instruction reads the register written by the previous one
# (zmm0 -> zmm1 -> zmm2 -> zmm3 -> zmm4 -> zmm5 -> zmm0), forming one chain.
#define INSTR vrcp14ps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SSE SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# expand from AVX to AVX-512
vinsertf64x4 zmm0, zmm0, ymm0, 0x1
# derive the operand from 1.0; the value chosen is not a power of two
vaddps zmm1, zmm0, zmm0 # create 2.0
vaddps zmm2, zmm0, zmm1 # create 3.0
vaddps zmm4, zmm1, zmm1 # create 4.0
vaddps zmm4, zmm4, zmm4 # create 8.0
vaddps zmm4, zmm4, zmm4 # create 16.0
vaddps zmm4, zmm4, zmm4 # create 32.0
vaddps zmm4, zmm4, zmm4 # create 64.0
vaddps zmm4, zmm4, zmm4 # create 128.0
vaddps zmm4, zmm4, zmm4 # create 256.0
vaddps zmm4, zmm4, zmm4 # create 512.0
vaddps zmm4, zmm4, zmm4 # create 1024.0
vdivps zmm1, zmm4, zmm2 # create 341.3333
vdivps zmm2, zmm0, zmm1 # create 1/341.3333
vaddps zmm0, zmm1, zmm1 # create 2*341.3333
# only zmm0 feeds the chain; zmm1 to zmm5 are overwritten before being read
loop:
inc i
INSTR zmm1, zmm0
INSTR zmm2, zmm1
INSTR zmm3, zmm2
cmp i, N
INSTR zmm4, zmm3
INSTR zmm5, zmm4
INSTR zmm0, zmm5
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

59
src/AVX/rcpss-TP.S Normal file
View File

@@ -0,0 +1,59 @@
# Throughput kernel for rcpss on xmm registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions read xmm0 to xmm5 and write xmm10 to xmm15.
# NOTE(review): the setup uses v-prefixed instructions while movss and the
# measured rcpss are the non-v forms - confirm that mixing the two encodings
# does not influence the measurement on the targeted CPUs.
#define INSTR rcpss
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# derive the operand from 1.0; the value chosen is not a power of two
vaddss xmm1, xmm0, xmm0 # create 2.0
vaddss xmm2, xmm0, xmm1 # create 3.0
vaddss xmm4, xmm1, xmm1 # create 4.0
vaddss xmm4, xmm4, xmm4 # create 8.0
vaddss xmm4, xmm4, xmm4 # create 16.0
vaddss xmm4, xmm4, xmm4 # create 32.0
vaddss xmm4, xmm4, xmm4 # create 64.0
vaddss xmm4, xmm4, xmm4 # create 128.0
vaddss xmm4, xmm4, xmm4 # create 256.0
vaddss xmm4, xmm4, xmm4 # create 512.0
vaddss xmm4, xmm4, xmm4 # create 1024.0
vdivss xmm1, xmm4, xmm2 # create 341.3333
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
# copy the scalar 2*341.3333 into the other five source registers
movss xmm1, xmm0
movss xmm2, xmm0
movss xmm3, xmm0
movss xmm4, xmm0
movss xmm5, xmm0
loop:
inc i
INSTR xmm10, xmm0
INSTR xmm11, xmm1
INSTR xmm12, xmm2
cmp i, N
INSTR xmm13, xmm3
INSTR xmm14, xmm4
INSTR xmm15, xmm5
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

54
src/AVX/rcpss.S Normal file
View File

@@ -0,0 +1,54 @@
# Latency kernel; the measured instruction is the one named by INSTR below.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# Each measured instruction reads the register written by the previous one
# (xmm0 -> xmm1 -> xmm2 -> xmm3 -> xmm4 -> xmm5 -> xmm0), forming one chain.
# NOTE(review): this file is named rcpss.S and its throughput sibling
# rcpss-TP.S measures rcpss, but INSTR here is vrcpps - confirm which
# instruction is meant to be measured.
#define INSTR vrcpps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create SP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: keeps seven one bits at the top (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving SP 1.0
# derive the operand from 1.0; the value chosen is not a power of two
vaddps xmm1, xmm0, xmm0 # create 2.0
vaddps xmm2, xmm0, xmm1 # create 3.0
vaddps xmm4, xmm1, xmm1 # create 4.0
vaddps xmm4, xmm4, xmm4 # create 8.0
vaddps xmm4, xmm4, xmm4 # create 16.0
vaddps xmm4, xmm4, xmm4 # create 32.0
vaddps xmm4, xmm4, xmm4 # create 64.0
vaddps xmm4, xmm4, xmm4 # create 128.0
vaddps xmm4, xmm4, xmm4 # create 256.0
vaddps xmm4, xmm4, xmm4 # create 512.0
vaddps xmm4, xmm4, xmm4 # create 1024.0
vdivps xmm1, xmm4, xmm2 # create 341.3333
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
# only xmm0 feeds the chain; xmm1 to xmm5 are overwritten before being read
loop:
inc i
INSTR xmm1, xmm0
INSTR xmm2, xmm1
INSTR xmm3, xmm2
cmp i, N
INSTR xmm4, xmm3
INSTR xmm5, xmm4
INSTR xmm0, xmm5
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

44
src/AVX/vaddpd-avx-TP.S Normal file
View File

@@ -0,0 +1,44 @@
# Throughput kernel for vaddpd on 256-bit (ymm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions write six different destination registers and
# only read ymm0, ymm1 and ymm2, so they do not depend on one another.
#define INSTR vaddpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy DP 1.0
vmovapd ymm1, ymm0
# ymm2 is read inside the loop as well, so it must hold a defined value;
# leftover register contents could otherwise distort the measurement
vmovapd ymm2, ymm0
loop:
inc i
INSTR ymm3, ymm0, ymm1
INSTR ymm4, ymm1, ymm0
INSTR ymm5, ymm0, ymm2
cmp i, N
INSTR ymm6, ymm2, ymm0
INSTR ymm7, ymm1, ymm2
INSTR ymm8, ymm2, ymm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

44
src/AVX/vaddpd-avx.S Normal file
View File

@@ -0,0 +1,44 @@
# Latency kernel for vaddpd on 256-bit (ymm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# Every measured instruction reads and writes ymm0, so each one consumes the
# result of the one before it.
#define INSTR vaddpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# expand from SSE to AVX
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy DP 1.0
vmovaps ymm1, ymm0
# dependency chain: ymm0 = ymm0 + ymm1, six times per iteration
loop:
inc i
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm1
cmp i, N
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vaddpd-sse-TP.S Normal file
View File

@@ -0,0 +1,42 @@
# Throughput kernel for vaddpd on 128-bit (xmm) registers.
# Exports two symbols: ninst, the number of measured instructions per loop
# iteration, and latency, a function that receives the iteration count in edi
# and runs the loop body that many times (the loop is skipped for counts <= 0).
# The six measured instructions write six different destination registers and
# only read xmm0, xmm1 and xmm2, so they do not depend on one another.
#define INSTR vaddpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: keeps ten one bits at the top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero, leaving DP 1.0
# copy DP 1.0
vmovapd xmm1, xmm0
# xmm2 is read inside the loop as well, so it must hold a defined value;
# leftover register contents could otherwise distort the measurement
vmovapd xmm2, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
cmp i, N
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vaddpd-sse.S Normal file
View File

@@ -0,0 +1,42 @@
# Latency kernel for vaddpd on 128-bit (xmm) operands: every INSTR reads the
# xmm0 written by the previous one, so the NINST additions per iteration
# form one serial dependency chain.
#define INSTR vaddpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy DP 1.0
vmovaps xmm1, xmm0
loop:
inc i
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

44
src/AVX/vaddps-avx-TP.S Normal file
View File

@@ -0,0 +1,44 @@
# Throughput kernel for vaddps on 256-bit (ymm) operands: the NINST
# additions per iteration write six different registers (ymm3-ymm8) that are
# never read back, so they do not depend on each other.
#define INSTR vaddps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# expand from SSE to AVX: copy the low 128 bits into the high lane
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy SP 1.0
vmovaps ymm1, ymm0
# ymm2 is a source operand in the loop, so give it a defined value too
# rather than measuring with whatever the caller left in the register
vmovaps ymm2, ymm0
loop:
inc i
INSTR ymm3, ymm0, ymm1
INSTR ymm4, ymm1, ymm0
INSTR ymm5, ymm0, ymm2
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR ymm6, ymm2, ymm0
INSTR ymm7, ymm1, ymm2
INSTR ymm8, ymm2, ymm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

44
src/AVX/vaddps-avx.S Normal file
View File

@@ -0,0 +1,44 @@
# Latency kernel for vaddps on 256-bit (ymm) operands: every INSTR reads the
# ymm0 written by the previous one, so the NINST additions per iteration
# form one serial dependency chain.
#define INSTR vaddps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# expand from SSE to AVX: copy the low 128 bits into the high lane
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy SP 1.0
vmovaps ymm1, ymm0
loop:
inc i
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vaddps-sse-TP.S Normal file
View File

@@ -0,0 +1,42 @@
# Throughput kernel for vaddps on 128-bit (xmm) operands: the NINST
# additions per iteration write six different registers (xmm3-xmm8) that are
# never read back, so they do not depend on each other.
#define INSTR vaddps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy SP 1.0
vmovaps xmm1, xmm0
# xmm2 is a source operand in the loop, so give it a defined value too
# rather than measuring with whatever the caller left in the register
vmovaps xmm2, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vaddps-sse.S Normal file
View File

@@ -0,0 +1,42 @@
# Latency kernel for vaddps on 128-bit (xmm) operands: every INSTR reads the
# xmm0 written by the previous one, so the NINST additions per iteration
# form one serial dependency chain.
#define INSTR vaddps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy SP 1.0
vmovaps xmm1, xmm0
loop:
inc i
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vaddsd-TP.S Normal file
View File

@@ -0,0 +1,42 @@
# Throughput kernel for scalar vaddsd: the NINST additions per iteration
# write six different registers (xmm3-xmm8) that are never read back, so
# they do not depend on each other.
#define INSTR vaddsd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy DP 1.0
vmovaps xmm1, xmm0
# xmm2 is a source operand in the loop, so give it a defined value too
# rather than measuring with whatever the caller left in the register
vmovaps xmm2, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vaddsd.S Normal file
View File

@@ -0,0 +1,42 @@
# Latency kernel for scalar vaddsd: every INSTR reads the xmm0 written by
# the previous one, so the NINST additions per iteration form one serial
# dependency chain.
#define INSTR vaddsd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy DP 1.0
vmovaps xmm1, xmm0
loop:
inc i
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vaddss-TP.S Normal file
View File

@@ -0,0 +1,42 @@
# Throughput kernel for scalar vaddss: the NINST additions per iteration
# write six different registers (xmm3-xmm8) that are never read back, so
# they do not depend on each other.
#define INSTR vaddss
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy SP 1.0
vmovaps xmm1, xmm0
# xmm2 is a source operand in the loop, so give it a defined value too
# rather than measuring with whatever the caller left in the register
vmovaps xmm2, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vaddss.S Normal file
View File

@@ -0,0 +1,42 @@
# Latency kernel for scalar vaddss: every INSTR reads the xmm0 written by
# the previous one, so the NINST additions per iteration form one serial
# dependency chain.
#define INSTR vaddss
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy SP 1.0
vmovaps xmm1, xmm0
loop:
inc i
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

55
src/AVX/vdivpd-avx-TP.S Normal file
View File

@@ -0,0 +1,55 @@
# Throughput kernel for vdivpd on 256-bit (ymm) operands: the NINST
# divisions per iteration write six different registers (ymm3-ymm8) that are
# never read back, so they do not depend on each other. The operands ymm0,
# ymm1 and ymm2 are set up once before the loop.
#define INSTR vdivpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# expand from SSE to AVX: copy the low 128 bits into the high lane
vinsertf128 ymm0, ymm0, xmm0, 0x1
vaddpd ymm1, ymm0, ymm0 # create 2.0
vaddpd ymm2, ymm0, ymm1 # create 3.0
vaddpd ymm4, ymm1, ymm1 # create 4.0
vaddpd ymm4, ymm4, ymm4 # create 8.0
vaddpd ymm4, ymm4, ymm4 # create 16.0
vaddpd ymm4, ymm4, ymm4 # create 32.0
vaddpd ymm4, ymm4, ymm4 # create 64.0
vaddpd ymm4, ymm4, ymm4 # create 128.0
vaddpd ymm4, ymm4, ymm4 # create 256.0
vaddpd ymm4, ymm4, ymm4 # create 512.0
vaddpd ymm4, ymm4, ymm4 # create 1024.0
vdivpd ymm1, ymm4, ymm2 # create 341.3333
vdivpd ymm2, ymm0, ymm1 # create 1/341.3333
vaddpd ymm0, ymm1, ymm1 # create 2*341.3333
loop:
inc i
INSTR ymm3, ymm0, ymm1
INSTR ymm4, ymm1, ymm0
INSTR ymm5, ymm0, ymm2
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR ymm6, ymm2, ymm0
INSTR ymm7, ymm1, ymm2
INSTR ymm8, ymm2, ymm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

55
src/AVX/vdivpd-avx.S Normal file
View File

@@ -0,0 +1,55 @@
# Latency kernel for vdivpd on 256-bit (ymm) operands: every INSTR reads the
# ymm0 written by the previous one, so the NINST divisions per iteration
# form one serial dependency chain.
#define INSTR vdivpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# expand from SSE to AVX: copy the low 128 bits into the high lane
vinsertf128 ymm0, ymm0, xmm0, 0x1
vaddpd ymm1, ymm0, ymm0 # create 2.0
vaddpd ymm2, ymm0, ymm1 # create 3.0
vaddpd ymm4, ymm1, ymm1 # create 4.0
vaddpd ymm4, ymm4, ymm4 # create 8.0
vaddpd ymm4, ymm4, ymm4 # create 16.0
vaddpd ymm4, ymm4, ymm4 # create 32.0
vaddpd ymm4, ymm4, ymm4 # create 64.0
vaddpd ymm4, ymm4, ymm4 # create 128.0
vaddpd ymm4, ymm4, ymm4 # create 256.0
vaddpd ymm4, ymm4, ymm4 # create 512.0
vaddpd ymm4, ymm4, ymm4 # create 1024.0
vdivpd ymm1, ymm4, ymm2 # create 341.3333
vdivpd ymm2, ymm0, ymm1 # create 1/341.3333
vaddpd ymm0, ymm1, ymm1 # create 2*341.3333
# the chain alternates between dividing by ymm1 and by its reciprocal ymm2,
# so ymm0 swings between about 2 and about 682.67 and stays bounded
loop:
inc i
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm2
INSTR ymm0, ymm0, ymm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR ymm0, ymm0, ymm2
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm2
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

54
src/AVX/vdivpd-sse-TP.S Normal file
View File

@@ -0,0 +1,54 @@
# Throughput kernel for vdivpd on 128-bit (xmm) operands: the NINST
# divisions per iteration write six different registers (xmm3-xmm8) that are
# never read back, so they do not depend on each other. The operands xmm0,
# xmm1 and xmm2 are set up once before the loop.
#define INSTR vdivpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
vaddpd xmm1, xmm0, xmm0 # create 2.0
vaddpd xmm2, xmm0, xmm1 # create 3.0
vaddpd xmm4, xmm1, xmm1 # create 4.0
vaddpd xmm4, xmm4, xmm4 # create 8.0
vaddpd xmm4, xmm4, xmm4 # create 16.0
vaddpd xmm4, xmm4, xmm4 # create 32.0
vaddpd xmm4, xmm4, xmm4 # create 64.0
vaddpd xmm4, xmm4, xmm4 # create 128.0
vaddpd xmm4, xmm4, xmm4 # create 256.0
vaddpd xmm4, xmm4, xmm4 # create 512.0
vaddpd xmm4, xmm4, xmm4 # create 1024.0
vdivpd xmm1, xmm4, xmm2 # create 341.3333
vdivpd xmm2, xmm0, xmm1 # create 1/341.3333
vaddpd xmm0, xmm1, xmm1 # create 2*341.3333
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

54
src/AVX/vdivpd-sse.S Normal file
View File

@@ -0,0 +1,54 @@
# Latency kernel for vdivpd on 128-bit (xmm) operands: every INSTR reads the
# xmm0 written by the previous one, so the NINST divisions per iteration
# form one serial dependency chain.
#define INSTR vdivpd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
vaddpd xmm1, xmm0, xmm0 # create 2.0
vaddpd xmm2, xmm0, xmm1 # create 3.0
vaddpd xmm4, xmm1, xmm1 # create 4.0
vaddpd xmm4, xmm4, xmm4 # create 8.0
vaddpd xmm4, xmm4, xmm4 # create 16.0
vaddpd xmm4, xmm4, xmm4 # create 32.0
vaddpd xmm4, xmm4, xmm4 # create 64.0
vaddpd xmm4, xmm4, xmm4 # create 128.0
vaddpd xmm4, xmm4, xmm4 # create 256.0
vaddpd xmm4, xmm4, xmm4 # create 512.0
vaddpd xmm4, xmm4, xmm4 # create 1024.0
vdivpd xmm1, xmm4, xmm2 # create 341.3333
vdivpd xmm2, xmm0, xmm1 # create 1/341.3333
vaddpd xmm0, xmm1, xmm1 # create 2*341.3333
# the chain alternates between dividing by xmm1 and by its reciprocal xmm2,
# so xmm0 swings between about 2 and about 682.67 and stays bounded
loop:
inc i
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm2
INSTR xmm0, xmm0, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm0, xmm2
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm2
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

55
src/AVX/vdivps-avx-TP.S Normal file
View File

@@ -0,0 +1,55 @@
# Throughput kernel for vdivps on 256-bit (ymm) operands: the NINST
# divisions per iteration write six different registers (ymm3-ymm8) that are
# never read back, so they do not depend on each other. The six INSTR use
# the six ordered pairs of the operands ymm0, ymm1 and ymm2, which are set
# up once before the loop.
#define INSTR vdivps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# expand from SSE to AVX: copy the low 128 bits into the high lane
vinsertf128 ymm0, ymm0, xmm0, 0x1
vaddps ymm1, ymm0, ymm0 # create 2.0
vaddps ymm2, ymm0, ymm1 # create 3.0
vaddps ymm4, ymm1, ymm1 # create 4.0
vaddps ymm4, ymm4, ymm4 # create 8.0
vaddps ymm4, ymm4, ymm4 # create 16.0
vaddps ymm4, ymm4, ymm4 # create 32.0
vaddps ymm4, ymm4, ymm4 # create 64.0
vaddps ymm4, ymm4, ymm4 # create 128.0
vaddps ymm4, ymm4, ymm4 # create 256.0
vaddps ymm4, ymm4, ymm4 # create 512.0
vaddps ymm4, ymm4, ymm4 # create 1024.0
vdivps ymm1, ymm4, ymm2 # create 341.3333
vdivps ymm2, ymm0, ymm1 # create 1/341.3333
vaddps ymm0, ymm1, ymm1 # create 2*341.3333
loop:
inc i
INSTR ymm3, ymm0, ymm1
INSTR ymm4, ymm1, ymm0 # same operand pair as the other TP kernels (was a duplicate of ymm1, ymm2)
INSTR ymm5, ymm0, ymm2
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR ymm6, ymm2, ymm0
INSTR ymm7, ymm1, ymm2
INSTR ymm8, ymm2, ymm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

55
src/AVX/vdivps-avx.S Normal file
View File

@@ -0,0 +1,55 @@
# Latency kernel for vdivps on 256-bit (ymm) operands: every INSTR reads the
# ymm0 written by the previous one, so the NINST divisions per iteration
# form one serial dependency chain.
#define INSTR vdivps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# expand from SSE to AVX: copy the low 128 bits into the high lane
vinsertf128 ymm0, ymm0, xmm0, 0x1
vaddps ymm1, ymm0, ymm0 # create 2.0
vaddps ymm2, ymm0, ymm1 # create 3.0
vaddps ymm4, ymm1, ymm1 # create 4.0
vaddps ymm4, ymm4, ymm4 # create 8.0
vaddps ymm4, ymm4, ymm4 # create 16.0
vaddps ymm4, ymm4, ymm4 # create 32.0
vaddps ymm4, ymm4, ymm4 # create 64.0
vaddps ymm4, ymm4, ymm4 # create 128.0
vaddps ymm4, ymm4, ymm4 # create 256.0
vaddps ymm4, ymm4, ymm4 # create 512.0
vaddps ymm4, ymm4, ymm4 # create 1024.0
vdivps ymm1, ymm4, ymm2 # create 341.3333
vdivps ymm2, ymm0, ymm1 # create 1/341.3333
vaddps ymm0, ymm1, ymm1 # create 2*341.3333
# the chain alternates between dividing by ymm1 and by its reciprocal ymm2,
# so ymm0 swings between about 2 and about 682.67 and stays bounded
loop:
inc i
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm2
INSTR ymm0, ymm0, ymm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR ymm0, ymm0, ymm2
INSTR ymm0, ymm0, ymm1
INSTR ymm0, ymm0, ymm2
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

54
src/AVX/vdivps-sse-TP.S Normal file
View File

@@ -0,0 +1,54 @@
# Throughput kernel for vdivps on 128-bit (xmm) operands: the NINST
# divisions per iteration write six different registers (xmm3-xmm8) that are
# never read back, so they do not depend on each other. The operands xmm0,
# xmm1 and xmm2 are set up once before the loop.
#define INSTR vdivps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
vaddps xmm1, xmm0, xmm0 # create 2.0
vaddps xmm2, xmm0, xmm1 # create 3.0
vaddps xmm4, xmm1, xmm1 # create 4.0
vaddps xmm4, xmm4, xmm4 # create 8.0
vaddps xmm4, xmm4, xmm4 # create 16.0
vaddps xmm4, xmm4, xmm4 # create 32.0
vaddps xmm4, xmm4, xmm4 # create 64.0
vaddps xmm4, xmm4, xmm4 # create 128.0
vaddps xmm4, xmm4, xmm4 # create 256.0
vaddps xmm4, xmm4, xmm4 # create 512.0
vaddps xmm4, xmm4, xmm4 # create 1024.0
vdivps xmm1, xmm4, xmm2 # create 341.3333
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

54
src/AVX/vdivps-sse.S Normal file
View File

@@ -0,0 +1,54 @@
# Latency kernel for vdivps on 128-bit (xmm) operands: every INSTR reads the
# xmm0 written by the previous one, so the NINST divisions per iteration
# form one serial dependency chain.
#define INSTR vdivps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
vaddps xmm1, xmm0, xmm0 # create 2.0
vaddps xmm2, xmm0, xmm1 # create 3.0
vaddps xmm4, xmm1, xmm1 # create 4.0
vaddps xmm4, xmm4, xmm4 # create 8.0
vaddps xmm4, xmm4, xmm4 # create 16.0
vaddps xmm4, xmm4, xmm4 # create 32.0
vaddps xmm4, xmm4, xmm4 # create 64.0
vaddps xmm4, xmm4, xmm4 # create 128.0
vaddps xmm4, xmm4, xmm4 # create 256.0
vaddps xmm4, xmm4, xmm4 # create 512.0
vaddps xmm4, xmm4, xmm4 # create 1024.0
vdivps xmm1, xmm4, xmm2 # create 341.3333
vdivps xmm2, xmm0, xmm1 # create 1/341.3333
vaddps xmm0, xmm1, xmm1 # create 2*341.3333
# the chain alternates between dividing by xmm1 and by its reciprocal xmm2,
# so xmm0 swings between about 2 and about 682.67 and stays bounded
loop:
inc i
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm2
INSTR xmm0, xmm0, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm0, xmm2
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm2
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

54
src/AVX/vdivsd-TP.S Normal file
View File

@@ -0,0 +1,54 @@
# Throughput kernel for scalar vdivsd: the NINST divisions per iteration
# write six different registers (xmm3-xmm8) that are never read back, so
# they do not depend on each other. The operands xmm0, xmm1 and xmm2 are set
# up once before the loop.
#define INSTR vdivsd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
vaddsd xmm1, xmm0, xmm0 # create 2.0
vaddsd xmm2, xmm0, xmm1 # create 3.0
vaddsd xmm4, xmm1, xmm1 # create 4.0
vaddsd xmm4, xmm4, xmm4 # create 8.0
vaddsd xmm4, xmm4, xmm4 # create 16.0
vaddsd xmm4, xmm4, xmm4 # create 32.0
vaddsd xmm4, xmm4, xmm4 # create 64.0
vaddsd xmm4, xmm4, xmm4 # create 128.0
vaddsd xmm4, xmm4, xmm4 # create 256.0
vaddsd xmm4, xmm4, xmm4 # create 512.0
vaddsd xmm4, xmm4, xmm4 # create 1024.0
vdivsd xmm1, xmm4, xmm2 # create 341.3333
vdivsd xmm2, xmm0, xmm1 # create 1/341.3333
vaddsd xmm0, xmm1, xmm1 # create 2*341.3333
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

54
src/AVX/vdivsd.S Normal file
View File

@@ -0,0 +1,54 @@
# Latency kernel for scalar vdivsd: every INSTR reads the xmm0 written by
# the previous one, so the NINST divisions per iteration form one serial
# dependency chain.
#define INSTR vdivsd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
vaddsd xmm1, xmm0, xmm0 # create 2.0
vaddsd xmm2, xmm0, xmm1 # create 3.0
vaddsd xmm4, xmm1, xmm1 # create 4.0
vaddsd xmm4, xmm4, xmm4 # create 8.0
vaddsd xmm4, xmm4, xmm4 # create 16.0
vaddsd xmm4, xmm4, xmm4 # create 32.0
vaddsd xmm4, xmm4, xmm4 # create 64.0
vaddsd xmm4, xmm4, xmm4 # create 128.0
vaddsd xmm4, xmm4, xmm4 # create 256.0
vaddsd xmm4, xmm4, xmm4 # create 512.0
vaddsd xmm4, xmm4, xmm4 # create 1024.0
vdivsd xmm1, xmm4, xmm2 # create 341.3333
vdivsd xmm2, xmm0, xmm1 # create 1/341.3333
vaddsd xmm0, xmm1, xmm1 # create 2*341.3333
# the chain alternates between dividing by xmm1 and by its reciprocal xmm2,
# so the low element of xmm0 swings between about 2 and about 682.67
loop:
inc i
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm2
INSTR xmm0, xmm0, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm0, xmm2
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm2
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

54
src/AVX/vdivss-TP.S Normal file
View File

@@ -0,0 +1,54 @@
# Throughput kernel for scalar vdivss: the NINST divisions per iteration
# write six different registers (xmm3-xmm8) that are never read back, so
# they do not depend on each other. The operands xmm0, xmm1 and xmm2 are set
# up once before the loop.
#define INSTR vdivss
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
vaddss xmm1, xmm0, xmm0 # create 2.0
vaddss xmm2, xmm0, xmm1 # create 3.0
vaddss xmm4, xmm1, xmm1 # create 4.0
vaddss xmm4, xmm4, xmm4 # create 8.0
vaddss xmm4, xmm4, xmm4 # create 16.0
vaddss xmm4, xmm4, xmm4 # create 32.0
vaddss xmm4, xmm4, xmm4 # create 64.0
vaddss xmm4, xmm4, xmm4 # create 128.0
vaddss xmm4, xmm4, xmm4 # create 256.0
vaddss xmm4, xmm4, xmm4 # create 512.0
vaddss xmm4, xmm4, xmm4 # create 1024.0
vdivss xmm1, xmm4, xmm2 # create 341.3333
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

54
src/AVX/vdivss.S Normal file
View File

@@ -0,0 +1,54 @@
# Latency kernel for scalar vdivss: every INSTR reads the xmm0 written by
# the previous one, so the NINST divisions per iteration form one serial
# dependency chain.
#define INSTR vdivss
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
vaddss xmm1, xmm0, xmm0 # create 2.0
vaddss xmm2, xmm0, xmm1 # create 3.0
vaddss xmm4, xmm1, xmm1 # create 4.0
vaddss xmm4, xmm4, xmm4 # create 8.0
vaddss xmm4, xmm4, xmm4 # create 16.0
vaddss xmm4, xmm4, xmm4 # create 32.0
vaddss xmm4, xmm4, xmm4 # create 64.0
vaddss xmm4, xmm4, xmm4 # create 128.0
vaddss xmm4, xmm4, xmm4 # create 256.0
vaddss xmm4, xmm4, xmm4 # create 512.0
vaddss xmm4, xmm4, xmm4 # create 1024.0
vdivss xmm1, xmm4, xmm2 # create 341.3333
vdivss xmm2, xmm0, xmm1 # create 1/341.3333
vaddss xmm0, xmm1, xmm1 # create 2*341.3333
# the chain alternates between dividing by xmm1 and by its reciprocal xmm2,
# so the low element of xmm0 swings between about 2 and about 682.67
loop:
inc i
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm2
INSTR xmm0, xmm0, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm0, xmm2
INSTR xmm0, xmm0, xmm1
INSTR xmm0, xmm0, xmm2
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,51 @@
# Throughput kernel for vfmadd213pd on 256-bit (ymm) operands.
# vfmadd213 computes dst = src2 * dst + src3, so the destination is also an
# input: each of the NINST accumulators (ymm3-ymm15) carries its own
# dependency chain from one iteration to the next, and the thirteen chains
# are independent of each other.
#define INSTR vfmadd213pd
#define NINST 13
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# expand from SSE to AVX: copy the low 128 bits into the high lane
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy DP 1.0
vmovaps ymm1, ymm0
# ymm2 and the accumulators ymm3-ymm15 are all read by the loop, so seed
# them with 1.0 rather than measuring with whatever the caller left behind;
# with every multiplier equal to 1.0 each accumulator grows by 1.0 per pass
vmovaps ymm2, ymm0
vmovaps ymm3, ymm0
vmovaps ymm4, ymm0
vmovaps ymm5, ymm0
vmovaps ymm6, ymm0
vmovaps ymm7, ymm0
vmovaps ymm8, ymm0
vmovaps ymm9, ymm0
vmovaps ymm10, ymm0
vmovaps ymm11, ymm0
vmovaps ymm12, ymm0
vmovaps ymm13, ymm0
vmovaps ymm14, ymm0
vmovaps ymm15, ymm0
loop:
inc i
INSTR ymm3, ymm0, ymm1
INSTR ymm4, ymm1, ymm0
INSTR ymm5, ymm0, ymm2
INSTR ymm6, ymm2, ymm0
INSTR ymm7, ymm1, ymm2
INSTR ymm8, ymm2, ymm1
INSTR ymm9, ymm2, ymm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR ymm10, ymm2, ymm1
INSTR ymm11, ymm2, ymm1
INSTR ymm12, ymm2, ymm1
INSTR ymm13, ymm2, ymm1
INSTR ymm14, ymm2, ymm1
INSTR ymm15, ymm2, ymm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

44
src/AVX/vfmadd213pd-avx.S Normal file
View File

@@ -0,0 +1,44 @@
# Latency kernel for vfmadd213pd on 256-bit (ymm) operands.
# vfmadd213 computes dst = src2 * dst + src3, so every INSTR reads the ymm0
# written by the previous one and the NINST operations per iteration form
# one serial dependency chain.
#define INSTR vfmadd213pd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# expand from SSE to AVX: copy the low 128 bits into the high lane
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy DP 1.0
vmovaps ymm1, ymm0
# with ymm1 = 1.0 each INSTR below computes ymm0 = 1.0 * ymm0 + 1.0
loop:
inc i
INSTR ymm0, ymm1, ymm1
INSTR ymm0, ymm1, ymm1
INSTR ymm0, ymm1, ymm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR ymm0, ymm1, ymm1
INSTR ymm0, ymm1, ymm1
INSTR ymm0, ymm1, ymm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,49 @@
# Throughput kernel for vfmadd213pd on 128-bit (xmm) operands.
# vfmadd213 computes dst = src2 * dst + src3, so the destination is also an
# input: each of the NINST accumulators (xmm3-xmm15) carries its own
# dependency chain from one iteration to the next, and the thirteen chains
# are independent of each other.
#define INSTR vfmadd213pd
#define NINST 13
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy DP 1.0
vmovaps xmm1, xmm0
# xmm2 and the accumulators xmm3-xmm15 are all read by the loop, so seed
# them with 1.0 rather than measuring with whatever the caller left behind;
# with every multiplier equal to 1.0 each accumulator grows by 1.0 per pass
vmovaps xmm2, xmm0
vmovaps xmm3, xmm0
vmovaps xmm4, xmm0
vmovaps xmm5, xmm0
vmovaps xmm6, xmm0
vmovaps xmm7, xmm0
vmovaps xmm8, xmm0
vmovaps xmm9, xmm0
vmovaps xmm10, xmm0
vmovaps xmm11, xmm0
vmovaps xmm12, xmm0
vmovaps xmm13, xmm0
vmovaps xmm14, xmm0
vmovaps xmm15, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
INSTR xmm9, xmm2, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm10, xmm2, xmm1
INSTR xmm11, xmm2, xmm1
INSTR xmm12, xmm2, xmm1
INSTR xmm13, xmm2, xmm1
INSTR xmm14, xmm2, xmm1
INSTR xmm15, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vfmadd213pd-sse.S Normal file
View File

@@ -0,0 +1,42 @@
# Latency kernel for vfmadd213pd on 128-bit (xmm) operands.
# vfmadd213 computes dst = src2 * dst + src3, so every INSTR reads the xmm0
# written by the previous one and the NINST operations per iteration form
# one serial dependency chain.
#define INSTR vfmadd213pd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy DP 1.0
vmovaps xmm1, xmm0
# with xmm1 = 1.0 each INSTR below computes xmm0 = 1.0 * xmm0 + 1.0
loop:
inc i
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,51 @@
# Throughput kernel for vfmadd213ps on 256-bit (ymm) operands.
# vfmadd213 computes dst = src2 * dst + src3, so the destination is also an
# input: each of the NINST accumulators (ymm3-ymm15) carries its own
# dependency chain from one iteration to the next, and the thirteen chains
# are independent of each other.
#define INSTR vfmadd213ps
#define NINST 13
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# expand from SSE to AVX: copy the low 128 bits into the high lane
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy SP 1.0
vmovaps ymm1, ymm0
# ymm2 and the accumulators ymm3-ymm15 are all read by the loop, so seed
# them with 1.0 rather than measuring with whatever the caller left behind;
# with every multiplier equal to 1.0 the accumulators stay finite
vmovaps ymm2, ymm0
vmovaps ymm3, ymm0
vmovaps ymm4, ymm0
vmovaps ymm5, ymm0
vmovaps ymm6, ymm0
vmovaps ymm7, ymm0
vmovaps ymm8, ymm0
vmovaps ymm9, ymm0
vmovaps ymm10, ymm0
vmovaps ymm11, ymm0
vmovaps ymm12, ymm0
vmovaps ymm13, ymm0
vmovaps ymm14, ymm0
vmovaps ymm15, ymm0
loop:
inc i
INSTR ymm3, ymm0, ymm1
INSTR ymm4, ymm1, ymm0
INSTR ymm5, ymm0, ymm2
INSTR ymm6, ymm2, ymm0
INSTR ymm7, ymm1, ymm2
INSTR ymm8, ymm2, ymm1
INSTR ymm9, ymm2, ymm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR ymm10, ymm2, ymm1
INSTR ymm11, ymm2, ymm1
INSTR ymm12, ymm2, ymm1
INSTR ymm13, ymm2, ymm1
INSTR ymm14, ymm2, ymm1
INSTR ymm15, ymm2, ymm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

44
src/AVX/vfmadd213ps-avx.S Normal file
View File

@@ -0,0 +1,44 @@
# Latency kernel for vfmadd213ps on 256-bit (ymm) operands.
# vfmadd213 computes dst = src2 * dst + src3, so every INSTR reads the ymm0
# written by the previous one and the NINST operations per iteration form
# one serial dependency chain.
#define INSTR vfmadd213ps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# expand from SSE to AVX: copy the low 128 bits into the high lane
vinsertf128 ymm0, ymm0, xmm0, 0x1
# copy SP 1.0
vmovaps ymm1, ymm0
# with ymm1 = 1.0 each INSTR below computes ymm0 = 1.0 * ymm0 + 1.0
loop:
inc i
INSTR ymm0, ymm1, ymm1
INSTR ymm0, ymm1, ymm1
INSTR ymm0, ymm1, ymm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR ymm0, ymm1, ymm1
INSTR ymm0, ymm1, ymm1
INSTR ymm0, ymm1, ymm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,49 @@
# Throughput kernel for vfmadd213ps on 128-bit (xmm) operands.
# vfmadd213 computes dst = src2 * dst + src3, so the destination is also an
# input: each of the NINST accumulators (xmm3-xmm15) carries its own
# dependency chain from one iteration to the next, and the thirteen chains
# are independent of each other.
#define INSTR vfmadd213ps
#define NINST 13
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy SP 1.0
vmovaps xmm1, xmm0
# xmm2 and the accumulators xmm3-xmm15 are all read by the loop, so seed
# them with 1.0 rather than measuring with whatever the caller left behind;
# with every multiplier equal to 1.0 the accumulators stay finite
vmovaps xmm2, xmm0
vmovaps xmm3, xmm0
vmovaps xmm4, xmm0
vmovaps xmm5, xmm0
vmovaps xmm6, xmm0
vmovaps xmm7, xmm0
vmovaps xmm8, xmm0
vmovaps xmm9, xmm0
vmovaps xmm10, xmm0
vmovaps xmm11, xmm0
vmovaps xmm12, xmm0
vmovaps xmm13, xmm0
vmovaps xmm14, xmm0
vmovaps xmm15, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
INSTR xmm9, xmm2, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm10, xmm2, xmm1
INSTR xmm11, xmm2, xmm1
INSTR xmm12, xmm2, xmm1
INSTR xmm13, xmm2, xmm1
INSTR xmm14, xmm2, xmm1
INSTR xmm15, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vfmadd213ps-sse.S Normal file
View File

@@ -0,0 +1,42 @@
# Latency kernel for vfmadd213ps on 128-bit (xmm) operands.
# vfmadd213 computes dst = src2 * dst + src3, so every INSTR reads the xmm0
# written by the previous one and the NINST operations per iteration form
# one serial dependency chain.
#define INSTR vfmadd213ps
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy SP 1.0
vmovaps xmm1, xmm0
# with xmm1 = 1.0 each INSTR below computes xmm0 = 1.0 * xmm0 + 1.0
loop:
inc i
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

49
src/AVX/vfmadd213sd-TP.S Normal file
View File

@@ -0,0 +1,49 @@
# Throughput kernel for scalar vfmadd213sd.
# vfmadd213 computes dst = src2 * dst + src3, so the destination is also an
# input: each of the NINST accumulators (xmm3-xmm15) carries its own
# dependency chain from one iteration to the next, and the thirteen chains
# are independent of each other.
#define INSTR vfmadd213sd
#define NINST 13
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy DP 1.0
vmovaps xmm1, xmm0
# xmm2 and the accumulators xmm3-xmm15 are all read by the loop, so seed
# them with 1.0 rather than measuring with whatever the caller left behind;
# with every multiplier equal to 1.0 each accumulator grows by 1.0 per pass
vmovaps xmm2, xmm0
vmovaps xmm3, xmm0
vmovaps xmm4, xmm0
vmovaps xmm5, xmm0
vmovaps xmm6, xmm0
vmovaps xmm7, xmm0
vmovaps xmm8, xmm0
vmovaps xmm9, xmm0
vmovaps xmm10, xmm0
vmovaps xmm11, xmm0
vmovaps xmm12, xmm0
vmovaps xmm13, xmm0
vmovaps xmm14, xmm0
vmovaps xmm15, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
INSTR xmm9, xmm2, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm10, xmm2, xmm1
INSTR xmm11, xmm2, xmm1
INSTR xmm12, xmm2, xmm1
INSTR xmm13, xmm2, xmm1
INSTR xmm14, xmm2, xmm1
INSTR xmm15, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vfmadd213sd.S Normal file
View File

@@ -0,0 +1,42 @@
# Latency kernel for scalar vfmadd213sd.
# vfmadd213 computes dst = src2 * dst + src3, so every INSTR reads the xmm0
# written by the previous one and the NINST operations per iteration form
# one serial dependency chain.
#define INSTR vfmadd213sd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create DP 1.0 (0x3FF0000000000000) in both qwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: ten one bits remain on top (54 = 64 - (11 - 1))
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy DP 1.0
vmovaps xmm1, xmm0
# with xmm1 = 1.0 each INSTR below computes xmm0 = 1.0 * xmm0 + 1.0
loop:
inc i
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
INSTR xmm0, xmm1, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

49
src/AVX/vfmadd213ss-TP.S Normal file
View File

@@ -0,0 +1,49 @@
# Throughput kernel for scalar vfmadd213ss.
# vfmadd213 computes dst = src2 * dst + src3, so the destination is also an
# input: each of the NINST accumulators (xmm3-xmm15) carries its own
# dependency chain from one iteration to the next, and the thirteen chains
# are independent of each other.
#define INSTR vfmadd213ss
#define NINST 13
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# number of INSTR per loop iteration, exported for the driver
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
# skip the loop when the requested iteration count is not positive
test N, N
jle done
# create SP 1.0 (0x3F800000) in every dword of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
vpsrld xmm0, xmm0, 2 # logical right shift: sign bit and leading exponent bit become zero
# copy SP 1.0
vmovaps xmm1, xmm0
# xmm2 and the accumulators xmm3-xmm15 are all read by the loop, so seed
# them with 1.0 rather than measuring with whatever the caller left behind;
# with every multiplier equal to 1.0 the accumulators stay finite
vmovaps xmm2, xmm0
vmovaps xmm3, xmm0
vmovaps xmm4, xmm0
vmovaps xmm5, xmm0
vmovaps xmm6, xmm0
vmovaps xmm7, xmm0
vmovaps xmm8, xmm0
vmovaps xmm9, xmm0
vmovaps xmm10, xmm0
vmovaps xmm11, xmm0
vmovaps xmm12, xmm0
vmovaps xmm13, xmm0
vmovaps xmm14, xmm0
vmovaps xmm15, xmm0
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm0
INSTR xmm5, xmm0, xmm2
INSTR xmm6, xmm2, xmm0
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm1
INSTR xmm9, xmm2, xmm1
cmp i, N # sets the flags for jl below; INSTR leaves the flags alone
INSTR xmm10, xmm2, xmm1
INSTR xmm11, xmm2, xmm1
INSTR xmm12, xmm2, xmm1
INSTR xmm13, xmm2, xmm1
INSTR xmm14, xmm2, xmm1
INSTR xmm15, xmm2, xmm1
jl loop
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

42
src/AVX/vfmadd213ss.S Normal file
View File

@@ -0,0 +1,42 @@
/*
 * vfmadd213ss latency kernel (scalar single-precision fused multiply-add).
 *
 * Exported symbols:
 *   ninst   - 32-bit integer, number of INSTR executed per loop pass
 *   latency - double latency(int), loop count passed in edi
 *
 * All six INSTR use xmm0 as both input and output, so each one has to wait
 * for the result of the one before it.
 */
#define INSTR   vfmadd213ss
#define NINST   6
#define COUNT   edi
#define ITER    r8d
    .intel_syntax noprefix
    .global ninst
    .data
ninst:
    .int    NINST
    .text
    .global latency
    .type   latency, @function
    .p2align 5
latency:
    push    rbp
    mov     rbp, rsp
    xor     ITER, ITER              # pass counter starts at zero
    test    COUNT, COUNT
    jle     kernel_end              # nothing to do for a non-positive count
    /* xmm0 = SP 1.0 in all four 32-bit lanes, built without touching memory */
    vpcmpeqw xmm0, xmm0, xmm0       # every bit set
    vpslld  xmm0, xmm0, 25          # keep the top 7 bits of each dword
    vpsrld  xmm0, xmm0, 2           # 0x3F800000 = 1.0f
    vmovaps xmm1, xmm0              # xmm1 = 1.0f as well
kernel_loop:
    inc     ITER
    INSTR   xmm0, xmm1, xmm1
    INSTR   xmm0, xmm1, xmm1
    INSTR   xmm0, xmm1, xmm1
    cmp     ITER, COUNT             # flags stay untouched until the jl below
    INSTR   xmm0, xmm1, xmm1
    INSTR   xmm0, xmm1, xmm1
    INSTR   xmm0, xmm1, xmm1
    jl      kernel_loop
kernel_end:
    mov     rsp, rbp
    pop     rbp
    ret
    .size   latency, .-latency

46
src/AVX/vmulpd-avx-TP.S Normal file
View File

@@ -0,0 +1,46 @@
/*
 * vmulpd (256-bit) throughput kernel, packed double-precision multiply.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count arrives in edi.
 *
 * Each INSTR writes its own destination (ymm3..ymm8) and only reads
 * ymm0..ymm2, which the loop never modifies, so no INSTR depends on another.
 */
#define INSTR vmulpd
#define NINST 6
#define N edi
#define i r8d
    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done                    # non-positive count: skip setup and loop
    /* create DP 1.0 */
    vpcmpeqw xmm0, xmm0, xmm0   # all ones
    vpsllq xmm0, xmm0, 54       # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
    vpsrlq xmm0, xmm0, 2        # logical right shift: 1 bit for sign; leading mantissa bit is zero
    /* expand from SSE to AVX: copy the low 128 bits into the high 128 bits */
    vinsertf128 ymm0, ymm0, xmm0, 0x1
    /* create DP 2.0 */
    vaddpd ymm1, ymm0, ymm0
    /* create DP 0.5 */
    vdivpd ymm2, ymm0, ymm1
loop:
    inc i
    INSTR ymm3, ymm0, ymm1
    INSTR ymm4, ymm1, ymm0
    INSTR ymm5, ymm0, ymm2
    cmp i, N                    # the vector instructions below leave the flags alone
    INSTR ymm6, ymm2, ymm0
    INSTR ymm7, ymm1, ymm2
    INSTR ymm8, ymm2, ymm1
    jl loop
done:
    /*
     * Clear the upper halves of all ymm registers before handing control
     * back, so legacy-SSE code in the caller does not run with dirty upper
     * state.  The low 128 bits (and thus the value in xmm0) are preserved.
     */
    vzeroupper
    mov rsp, rbp
    pop rbp
    ret
    .size latency, .-latency

46
src/AVX/vmulpd-avx.S Normal file
View File

@@ -0,0 +1,46 @@
/*
 * vmulpd (256-bit) latency kernel, packed double-precision multiply.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count arrives in edi.
 *
 * Every INSTR reads and writes ymm0, forming one dependency chain.  The
 * multiplier alternates between 2.0 and 0.5, so ymm0 is back at 1.0 after
 * every pair of multiplies.
 */
#define INSTR vmulpd
#define NINST 6
#define N edi
#define i r8d
    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done                    # non-positive count: skip setup and loop
    /* create DP 1.0 */
    vpcmpeqw xmm0, xmm0, xmm0   # all ones
    vpsllq xmm0, xmm0, 54       # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
    vpsrlq xmm0, xmm0, 2        # logical right shift: 1 bit for sign; leading mantissa bit is zero
    /* expand from SSE to AVX: copy the low 128 bits into the high 128 bits */
    vinsertf128 ymm0, ymm0, xmm0, 0x1
    /* create DP 2.0 */
    vaddpd ymm1, ymm0, ymm0
    /* create DP 0.5 */
    vdivpd ymm2, ymm0, ymm1
loop:
    inc i
    INSTR ymm0, ymm0, ymm1
    INSTR ymm0, ymm0, ymm2
    INSTR ymm0, ymm0, ymm1
    cmp i, N                    # the vector instructions below leave the flags alone
    INSTR ymm0, ymm0, ymm2
    INSTR ymm0, ymm0, ymm1
    INSTR ymm0, ymm0, ymm2
    jl loop
done:
    /*
     * Clear the upper halves of all ymm registers before handing control
     * back, so legacy-SSE code in the caller does not run with dirty upper
     * state.  The low 128 bits (and thus the value in xmm0) are preserved.
     */
    vzeroupper
    mov rsp, rbp
    pop rbp
    ret
    .size latency, .-latency

44
src/AVX/vmulpd-sse-TP.S Normal file
View File

@@ -0,0 +1,44 @@
/*
 * vmulpd (128-bit) throughput kernel, packed double-precision multiply.
 *
 * Exported symbols:
 *   ninst   - 32-bit integer, number of INSTR executed per loop pass
 *   latency - double latency(int), loop count passed in edi
 *
 * The six INSTR write xmm3..xmm8 and read only xmm0..xmm2, which stay
 * constant inside the loop, so the multiplies are independent of each other.
 */
#define INSTR   vmulpd
#define NINST   6
#define COUNT   edi
#define ITER    r8d
    .intel_syntax noprefix
    .global ninst
    .data
ninst:
    .int    NINST
    .text
    .global latency
    .type   latency, @function
    .p2align 5
latency:
    push    rbp
    mov     rbp, rsp
    xor     ITER, ITER              # pass counter starts at zero
    test    COUNT, COUNT
    jle     kernel_end              # nothing to do for a non-positive count
    /* xmm0 = DP 1.0 in both 64-bit lanes, built without touching memory */
    vpcmpeqw xmm0, xmm0, xmm0       # every bit set
    vpsllq  xmm0, xmm0, 54          # keep the top 10 bits of each qword
    vpsrlq  xmm0, xmm0, 2           # 0x3FF0000000000000 = 1.0
    vaddpd  xmm1, xmm0, xmm0        # xmm1 = 2.0
    vdivpd  xmm2, xmm0, xmm1        # xmm2 = 0.5
kernel_loop:
    inc     ITER
    INSTR   xmm3, xmm0, xmm1
    INSTR   xmm4, xmm1, xmm0
    INSTR   xmm5, xmm0, xmm2
    cmp     ITER, COUNT             # flags stay untouched until the jl below
    INSTR   xmm6, xmm2, xmm0
    INSTR   xmm7, xmm1, xmm2
    INSTR   xmm8, xmm2, xmm1
    jl      kernel_loop
kernel_end:
    mov     rsp, rbp
    pop     rbp
    ret
    .size   latency, .-latency

44
src/AVX/vmulpd-sse.S Normal file
View File

@@ -0,0 +1,44 @@
/*
 * vmulpd (128-bit) latency kernel, packed double-precision multiply.
 *
 * Exported symbols:
 *   ninst   - 32-bit integer, number of INSTR executed per loop pass
 *   latency - double latency(int), loop count passed in edi
 *
 * xmm0 is source and destination of every INSTR, which serialises them.
 * Multiplying alternately by 2.0 and 0.5 brings xmm0 back to 1.0 after
 * each pair.
 */
#define INSTR   vmulpd
#define NINST   6
#define COUNT   edi
#define ITER    r8d
    .intel_syntax noprefix
    .global ninst
    .data
ninst:
    .int    NINST
    .text
    .global latency
    .type   latency, @function
    .p2align 5
latency:
    push    rbp
    mov     rbp, rsp
    xor     ITER, ITER              # pass counter starts at zero
    test    COUNT, COUNT
    jle     kernel_end              # nothing to do for a non-positive count
    /* xmm0 = DP 1.0 in both 64-bit lanes, built without touching memory */
    vpcmpeqw xmm0, xmm0, xmm0       # every bit set
    vpsllq  xmm0, xmm0, 54          # keep the top 10 bits of each qword
    vpsrlq  xmm0, xmm0, 2           # 0x3FF0000000000000 = 1.0
    vaddpd  xmm1, xmm0, xmm0        # xmm1 = 2.0
    vdivpd  xmm2, xmm0, xmm1        # xmm2 = 0.5
kernel_loop:
    inc     ITER
    INSTR   xmm0, xmm0, xmm1
    INSTR   xmm0, xmm0, xmm2
    INSTR   xmm0, xmm0, xmm1
    cmp     ITER, COUNT             # flags stay untouched until the jl below
    INSTR   xmm0, xmm0, xmm2
    INSTR   xmm0, xmm0, xmm1
    INSTR   xmm0, xmm0, xmm2
    jl      kernel_loop
kernel_end:
    mov     rsp, rbp
    pop     rbp
    ret
    .size   latency, .-latency

46
src/AVX/vmulps-avx-TP.S Normal file
View File

@@ -0,0 +1,46 @@
/*
 * vmulps (256-bit) throughput kernel, packed single-precision multiply.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count arrives in edi.
 *
 * Each INSTR writes its own destination (ymm3..ymm8) and only reads
 * ymm0..ymm2, which the loop never modifies, so no INSTR depends on another.
 */
#define INSTR vmulps
#define NINST 6
#define N edi
#define i r8d
    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done                    # non-positive count: skip setup and loop
    /* create SP 1.0 */
    vpcmpeqw xmm0, xmm0, xmm0   # all ones
    vpslld xmm0, xmm0, 25       # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
    vpsrld xmm0, xmm0, 2        # logical right shift: 1 bit for sign; leading mantissa bit is zero
    /* expand from SSE to AVX: copy the low 128 bits into the high 128 bits */
    vinsertf128 ymm0, ymm0, xmm0, 0x1
    /* create SP 2.0 */
    vaddps ymm1, ymm0, ymm0
    /* create SP 0.5 */
    vdivps ymm2, ymm0, ymm1
loop:
    inc i
    INSTR ymm3, ymm0, ymm1
    INSTR ymm4, ymm1, ymm0
    INSTR ymm5, ymm0, ymm2
    cmp i, N                    # the vector instructions below leave the flags alone
    INSTR ymm6, ymm2, ymm0
    INSTR ymm7, ymm1, ymm2
    INSTR ymm8, ymm2, ymm1
    jl loop
done:
    /*
     * Clear the upper halves of all ymm registers before handing control
     * back, so legacy-SSE code in the caller does not run with dirty upper
     * state.  The low 128 bits (and thus the value in xmm0) are preserved.
     */
    vzeroupper
    mov rsp, rbp
    pop rbp
    ret
    .size latency, .-latency

46
src/AVX/vmulps-avx.S Normal file
View File

@@ -0,0 +1,46 @@
/*
 * vmulps (256-bit) latency kernel, packed single-precision multiply.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count arrives in edi.
 *
 * Every INSTR reads and writes ymm0, forming one dependency chain.  The
 * multiplier alternates between 2.0 and 0.5, so ymm0 is back at 1.0 after
 * every pair of multiplies.
 */
#define INSTR vmulps
#define NINST 6
#define N edi
#define i r8d
    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done                    # non-positive count: skip setup and loop
    /* create SP 1.0 */
    vpcmpeqw xmm0, xmm0, xmm0   # all ones
    vpslld xmm0, xmm0, 25       # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
    vpsrld xmm0, xmm0, 2        # logical right shift: 1 bit for sign; leading mantissa bit is zero
    /* expand from SSE to AVX: copy the low 128 bits into the high 128 bits */
    vinsertf128 ymm0, ymm0, xmm0, 0x1
    /* create SP 2.0 */
    vaddps ymm1, ymm0, ymm0
    /* create SP 0.5 */
    vdivps ymm2, ymm0, ymm1
loop:
    inc i
    INSTR ymm0, ymm0, ymm1
    INSTR ymm0, ymm0, ymm2
    INSTR ymm0, ymm0, ymm1
    cmp i, N                    # the vector instructions below leave the flags alone
    INSTR ymm0, ymm0, ymm2
    INSTR ymm0, ymm0, ymm1
    INSTR ymm0, ymm0, ymm2
    jl loop
done:
    /*
     * Clear the upper halves of all ymm registers before handing control
     * back, so legacy-SSE code in the caller does not run with dirty upper
     * state.  The low 128 bits (and thus the value in xmm0) are preserved.
     */
    vzeroupper
    mov rsp, rbp
    pop rbp
    ret
    .size latency, .-latency

44
src/AVX/vmulps-sse-TP.S Normal file
View File

@@ -0,0 +1,44 @@
/*
 * vmulps (128-bit) throughput kernel, packed single-precision multiply.
 *
 * Exported symbols:
 *   ninst   - 32-bit integer, number of INSTR executed per loop pass
 *   latency - double latency(int), loop count passed in edi
 *
 * The six INSTR write xmm3..xmm8 and read only xmm0..xmm2, which stay
 * constant inside the loop, so the multiplies are independent of each other.
 */
#define INSTR   vmulps
#define NINST   6
#define COUNT   edi
#define ITER    r8d
    .intel_syntax noprefix
    .global ninst
    .data
ninst:
    .int    NINST
    .text
    .global latency
    .type   latency, @function
    .p2align 5
latency:
    push    rbp
    mov     rbp, rsp
    xor     ITER, ITER              # pass counter starts at zero
    test    COUNT, COUNT
    jle     kernel_end              # nothing to do for a non-positive count
    /* xmm0 = SP 1.0 in all four 32-bit lanes, built without touching memory */
    vpcmpeqw xmm0, xmm0, xmm0       # every bit set
    vpslld  xmm0, xmm0, 25          # keep the top 7 bits of each dword
    vpsrld  xmm0, xmm0, 2           # 0x3F800000 = 1.0f
    vaddps  xmm1, xmm0, xmm0        # xmm1 = 2.0f
    vdivps  xmm2, xmm0, xmm1        # xmm2 = 0.5f
kernel_loop:
    inc     ITER
    INSTR   xmm3, xmm0, xmm1
    INSTR   xmm4, xmm1, xmm0
    INSTR   xmm5, xmm0, xmm2
    cmp     ITER, COUNT             # flags stay untouched until the jl below
    INSTR   xmm6, xmm2, xmm0
    INSTR   xmm7, xmm1, xmm2
    INSTR   xmm8, xmm2, xmm1
    jl      kernel_loop
kernel_end:
    mov     rsp, rbp
    pop     rbp
    ret
    .size   latency, .-latency

44
src/AVX/vmulps-sse.S Normal file
View File

@@ -0,0 +1,44 @@
/*
 * vmulps (128-bit) latency kernel, packed single-precision multiply.
 *
 * Exported symbols:
 *   ninst   - 32-bit integer, number of INSTR executed per loop pass
 *   latency - double latency(int), loop count passed in edi
 *
 * xmm0 is source and destination of every INSTR, which serialises them.
 * Multiplying alternately by 2.0 and 0.5 brings xmm0 back to 1.0 after
 * each pair.
 */
#define INSTR   vmulps
#define NINST   6
#define COUNT   edi
#define ITER    r8d
    .intel_syntax noprefix
    .global ninst
    .data
ninst:
    .int    NINST
    .text
    .global latency
    .type   latency, @function
    .p2align 5
latency:
    push    rbp
    mov     rbp, rsp
    xor     ITER, ITER              # pass counter starts at zero
    test    COUNT, COUNT
    jle     kernel_end              # nothing to do for a non-positive count
    /* xmm0 = SP 1.0 in all four 32-bit lanes, built without touching memory */
    vpcmpeqw xmm0, xmm0, xmm0       # every bit set
    vpslld  xmm0, xmm0, 25          # keep the top 7 bits of each dword
    vpsrld  xmm0, xmm0, 2           # 0x3F800000 = 1.0f
    vaddps  xmm1, xmm0, xmm0        # xmm1 = 2.0f
    vdivps  xmm2, xmm0, xmm1        # xmm2 = 0.5f
kernel_loop:
    inc     ITER
    INSTR   xmm0, xmm0, xmm1
    INSTR   xmm0, xmm0, xmm2
    INSTR   xmm0, xmm0, xmm1
    cmp     ITER, COUNT             # flags stay untouched until the jl below
    INSTR   xmm0, xmm0, xmm2
    INSTR   xmm0, xmm0, xmm1
    INSTR   xmm0, xmm0, xmm2
    jl      kernel_loop
kernel_end:
    mov     rsp, rbp
    pop     rbp
    ret
    .size   latency, .-latency

44
src/AVX/vmulsd-TP.S Normal file
View File

@@ -0,0 +1,44 @@
/*
 * vmulsd throughput kernel (scalar double-precision multiply).
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count arrives in edi.
 *
 * Each INSTR writes its own destination (xmm3..xmm8) and only reads
 * xmm0..xmm2, which the loop never modifies, so no INSTR depends on another.
 *
 * The operands are built with the 64-bit shifts and the packed-double
 * add/divide, matching the double-precision instruction under test (the
 * same sequence the vmulsd latency kernel uses).
 */
#define INSTR vmulsd
#define NINST 6
#define N edi
#define i r8d
    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done                    # non-positive count: skip setup and loop
    /* create DP 1.0 */
    vpcmpeqw xmm0, xmm0, xmm0   # all ones
    vpsllq xmm0, xmm0, 54       # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
    vpsrlq xmm0, xmm0, 2        # logical right shift: 1 bit for sign; leading mantissa bit is zero
    /* create DP 2.0 */
    vaddpd xmm1, xmm0, xmm0
    /* create DP 0.5 */
    vdivpd xmm2, xmm0, xmm1
loop:
    inc i
    INSTR xmm3, xmm0, xmm1
    INSTR xmm4, xmm1, xmm0
    INSTR xmm5, xmm0, xmm2
    cmp i, N                    # the vector instructions below leave the flags alone
    INSTR xmm6, xmm2, xmm0
    INSTR xmm7, xmm1, xmm2
    INSTR xmm8, xmm2, xmm1
    jl loop
done:
    mov rsp, rbp
    pop rbp
    ret                         # xmm0 is never written by the loop
    .size latency, .-latency

44
src/AVX/vmulsd.S Normal file
View File

@@ -0,0 +1,44 @@
/*
 * vmulsd latency kernel (scalar double-precision multiply).
 *
 * Exported symbols:
 *   ninst   - 32-bit integer, number of INSTR executed per loop pass
 *   latency - double latency(int), loop count passed in edi
 *
 * xmm0 is source and destination of every INSTR, which serialises them.
 * Multiplying alternately by 2.0 and 0.5 brings the low lane of xmm0 back
 * to 1.0 after each pair.
 */
#define INSTR   vmulsd
#define NINST   6
#define COUNT   edi
#define ITER    r8d
    .intel_syntax noprefix
    .global ninst
    .data
ninst:
    .int    NINST
    .text
    .global latency
    .type   latency, @function
    .p2align 5
latency:
    push    rbp
    mov     rbp, rsp
    xor     ITER, ITER              # pass counter starts at zero
    test    COUNT, COUNT
    jle     kernel_end              # nothing to do for a non-positive count
    /* xmm0 = DP 1.0 in both 64-bit lanes, built without touching memory */
    vpcmpeqw xmm0, xmm0, xmm0       # every bit set
    vpsllq  xmm0, xmm0, 54          # keep the top 10 bits of each qword
    vpsrlq  xmm0, xmm0, 2           # 0x3FF0000000000000 = 1.0
    vaddpd  xmm1, xmm0, xmm0        # xmm1 = 2.0
    vdivpd  xmm2, xmm0, xmm1        # xmm2 = 0.5
kernel_loop:
    inc     ITER
    INSTR   xmm0, xmm0, xmm1
    INSTR   xmm0, xmm0, xmm2
    INSTR   xmm0, xmm0, xmm1
    cmp     ITER, COUNT             # flags stay untouched until the jl below
    INSTR   xmm0, xmm0, xmm2
    INSTR   xmm0, xmm0, xmm1
    INSTR   xmm0, xmm0, xmm2
    jl      kernel_loop
kernel_end:
    mov     rsp, rbp
    pop     rbp
    ret
    .size   latency, .-latency

44
src/AVX/vmulss-TP.S Normal file
View File

@@ -0,0 +1,44 @@
/*
 * vmulss throughput kernel (scalar single-precision multiply).
 *
 * Exported symbols:
 *   ninst   - 32-bit integer, number of INSTR executed per loop pass
 *   latency - double latency(int), loop count passed in edi
 *
 * The six INSTR write xmm3..xmm8 and read only xmm0..xmm2, which stay
 * constant inside the loop, so the multiplies are independent of each other.
 */
#define INSTR   vmulss
#define NINST   6
#define COUNT   edi
#define ITER    r8d
    .intel_syntax noprefix
    .global ninst
    .data
ninst:
    .int    NINST
    .text
    .global latency
    .type   latency, @function
    .p2align 5
latency:
    push    rbp
    mov     rbp, rsp
    xor     ITER, ITER              # pass counter starts at zero
    test    COUNT, COUNT
    jle     kernel_end              # nothing to do for a non-positive count
    /* xmm0 = SP 1.0 in all four 32-bit lanes, built without touching memory */
    vpcmpeqw xmm0, xmm0, xmm0       # every bit set
    vpslld  xmm0, xmm0, 25          # keep the top 7 bits of each dword
    vpsrld  xmm0, xmm0, 2           # 0x3F800000 = 1.0f
    vaddps  xmm1, xmm0, xmm0        # xmm1 = 2.0f
    vdivps  xmm2, xmm0, xmm1        # xmm2 = 0.5f
kernel_loop:
    inc     ITER
    INSTR   xmm3, xmm0, xmm1
    INSTR   xmm4, xmm1, xmm0
    INSTR   xmm5, xmm0, xmm2
    cmp     ITER, COUNT             # flags stay untouched until the jl below
    INSTR   xmm6, xmm2, xmm0
    INSTR   xmm7, xmm1, xmm2
    INSTR   xmm8, xmm2, xmm1
    jl      kernel_loop
kernel_end:
    mov     rsp, rbp
    pop     rbp
    ret
    .size   latency, .-latency

44
src/AVX/vmulss.S Normal file
View File

@@ -0,0 +1,44 @@
/*
 * vmulss latency kernel (scalar single-precision multiply).
 *
 * Exported symbols:
 *   ninst   - 32-bit integer, number of INSTR executed per loop pass
 *   latency - double latency(int), loop count passed in edi
 *
 * xmm0 is source and destination of every INSTR, which serialises them.
 * Multiplying alternately by 2.0 and 0.5 brings the low lane of xmm0 back
 * to 1.0 after each pair.
 */
#define INSTR   vmulss
#define NINST   6
#define COUNT   edi
#define ITER    r8d
    .intel_syntax noprefix
    .global ninst
    .data
ninst:
    .int    NINST
    .text
    .global latency
    .type   latency, @function
    .p2align 5
latency:
    push    rbp
    mov     rbp, rsp
    xor     ITER, ITER              # pass counter starts at zero
    test    COUNT, COUNT
    jle     kernel_end              # nothing to do for a non-positive count
    /* xmm0 = SP 1.0 in all four 32-bit lanes, built without touching memory */
    vpcmpeqw xmm0, xmm0, xmm0       # every bit set
    vpslld  xmm0, xmm0, 25          # keep the top 7 bits of each dword
    vpsrld  xmm0, xmm0, 2           # 0x3F800000 = 1.0f
    vaddps  xmm1, xmm0, xmm0        # xmm1 = 2.0f
    vdivps  xmm2, xmm0, xmm1        # xmm2 = 0.5f
kernel_loop:
    inc     ITER
    INSTR   xmm0, xmm0, xmm1
    INSTR   xmm0, xmm0, xmm2
    INSTR   xmm0, xmm0, xmm1
    cmp     ITER, COUNT             # flags stay untouched until the jl below
    INSTR   xmm0, xmm0, xmm2
    INSTR   xmm0, xmm0, xmm1
    INSTR   xmm0, xmm0, xmm2
    jl      kernel_loop
kernel_end:
    mov     rsp, rbp
    pop     rbp
    ret
    .size   latency, .-latency

60
src/AVX/vrcpps-avx-TP.S Normal file
View File

@@ -0,0 +1,60 @@
/*
 * vrcpps (256-bit) throughput kernel, packed single-precision approximate
 * reciprocal.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count arrives in edi.
 *
 * Each INSTR reads one of ymm0..ymm5 (all holding the same constant) and
 * writes its own destination ymm10..ymm15, so no INSTR depends on another.
 */
#define INSTR vrcpps
#define NINST 6
#define N edi
#define i r8d
    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done                    # non-positive count: skip setup and loop
    /* create SP 1.0 */
    vpcmpeqw xmm0, xmm0, xmm0   # all ones
    vpslld xmm0, xmm0, 25       # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
    vpsrld xmm0, xmm0, 2        # logical right shift: 1 bit for sign; leading mantissa bit is zero
    vinsertf128 ymm0, ymm0, xmm0, 0x1   # copy the low 128 bits into the high 128 bits
    vaddps ymm1, ymm0, ymm0     # create 2.0
    vaddps ymm2, ymm0, ymm1     # create 3.0
    vaddps ymm4, ymm1, ymm1     # create 4.0
    vaddps ymm4, ymm4, ymm4     # create 8.0
    vaddps ymm4, ymm4, ymm4     # create 16.0
    vaddps ymm4, ymm4, ymm4     # create 32.0
    vaddps ymm4, ymm4, ymm4     # create 64.0
    vaddps ymm4, ymm4, ymm4     # create 128.0
    vaddps ymm4, ymm4, ymm4     # create 256.0
    vaddps ymm4, ymm4, ymm4     # create 512.0
    vaddps ymm4, ymm4, ymm4     # create 1024.0
    vdivps ymm1, ymm4, ymm2     # create 341.3333
    vdivps ymm2, ymm0, ymm1     # create 1/341.3333
    vaddps ymm0, ymm1, ymm1     # create 2*341.3333
    /* give every INSTR source register the same operand (2*341.3333) */
    vmovaps ymm1, ymm0
    vmovaps ymm2, ymm0
    vmovaps ymm3, ymm0
    vmovaps ymm4, ymm0
    vmovaps ymm5, ymm0
loop:
    inc i
    INSTR ymm10, ymm0
    INSTR ymm11, ymm1
    INSTR ymm12, ymm2
    cmp i, N                    # the vector instructions below leave the flags alone
    INSTR ymm13, ymm3
    INSTR ymm14, ymm4
    INSTR ymm15, ymm5
    jl loop
done:
    /*
     * Clear the upper halves of all ymm registers before handing control
     * back, so legacy-SSE code in the caller does not run with dirty upper
     * state.  The low 128 bits (and thus the value in xmm0) are preserved.
     */
    vzeroupper
    mov rsp, rbp
    pop rbp
    ret
    .size latency, .-latency

55
src/AVX/vrcpps-avx.S Normal file
View File

@@ -0,0 +1,55 @@
/*
 * vrcpps (256-bit) latency kernel, packed single-precision approximate
 * reciprocal.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count arrives in edi.
 *
 * The six INSTR form a ring ymm0 -> ymm1 -> ymm2 -> ymm3 -> ymm4 -> ymm5
 * -> ymm0: each one consumes the register written by the previous one, so
 * they execute strictly one after another.
 */
#define INSTR vrcpps
#define NINST 6
#define N edi
#define i r8d
    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done                    # non-positive count: skip setup and loop
    /* create SP 1.0 */
    vpcmpeqw xmm0, xmm0, xmm0   # all ones
    vpslld xmm0, xmm0, 25       # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
    vpsrld xmm0, xmm0, 2        # logical right shift: 1 bit for sign; leading mantissa bit is zero
    vinsertf128 ymm0, ymm0, xmm0, 0x1   # copy the low 128 bits into the high 128 bits
    vaddps ymm1, ymm0, ymm0     # create 2.0
    vaddps ymm2, ymm0, ymm1     # create 3.0
    vaddps ymm4, ymm1, ymm1     # create 4.0
    vaddps ymm4, ymm4, ymm4     # create 8.0
    vaddps ymm4, ymm4, ymm4     # create 16.0
    vaddps ymm4, ymm4, ymm4     # create 32.0
    vaddps ymm4, ymm4, ymm4     # create 64.0
    vaddps ymm4, ymm4, ymm4     # create 128.0
    vaddps ymm4, ymm4, ymm4     # create 256.0
    vaddps ymm4, ymm4, ymm4     # create 512.0
    vaddps ymm4, ymm4, ymm4     # create 1024.0
    vdivps ymm1, ymm4, ymm2     # create 341.3333
    vdivps ymm2, ymm0, ymm1     # create 1/341.3333
    vaddps ymm0, ymm1, ymm1     # create 2*341.3333 (start value of the chain)
loop:
    inc i
    INSTR ymm1, ymm0
    INSTR ymm2, ymm1
    INSTR ymm3, ymm2
    cmp i, N                    # the vector instructions below leave the flags alone
    INSTR ymm4, ymm3
    INSTR ymm5, ymm4
    INSTR ymm0, ymm5
    jl loop
done:
    /*
     * Clear the upper halves of all ymm registers before handing control
     * back, so legacy-SSE code in the caller does not run with dirty upper
     * state.  The low 128 bits (and thus the value in xmm0) are preserved.
     */
    vzeroupper
    mov rsp, rbp
    pop rbp
    ret
    .size latency, .-latency

59
src/AVX/vrcpps-sse-TP.S Normal file
View File

@@ -0,0 +1,59 @@
/*
 * vrcpps (128-bit) throughput kernel, packed single-precision approximate
 * reciprocal.
 *
 * Exported symbols:
 *   ninst   - 32-bit integer, number of INSTR executed per loop pass
 *   latency - double latency(int), loop count passed in edi
 *
 * Each INSTR reads one of xmm0..xmm5 (all loaded with the same constant)
 * and writes a destination of its own (xmm10..xmm15), so the six
 * reciprocals are independent of each other.
 */
#define INSTR   vrcpps
#define NINST   6
#define COUNT   edi
#define ITER    r8d
    .intel_syntax noprefix
    .global ninst
    .data
ninst:
    .int    NINST
    .text
    .global latency
    .type   latency, @function
    .p2align 5
latency:
    push    rbp
    mov     rbp, rsp
    xor     ITER, ITER              # pass counter starts at zero
    test    COUNT, COUNT
    jle     kernel_end              # nothing to do for a non-positive count
    /* xmm0 = SP 1.0 in all four 32-bit lanes, built without touching memory */
    vpcmpeqw xmm0, xmm0, xmm0       # every bit set
    vpslld  xmm0, xmm0, 25          # keep the top 7 bits of each dword
    vpsrld  xmm0, xmm0, 2           # 0x3F800000 = 1.0f
    vaddps  xmm1, xmm0, xmm0        # 2.0
    vaddps  xmm2, xmm0, xmm1        # 3.0
    vaddps  xmm4, xmm1, xmm1        # 4.0
    vaddps  xmm4, xmm4, xmm4        # 8.0
    vaddps  xmm4, xmm4, xmm4        # 16.0
    vaddps  xmm4, xmm4, xmm4        # 32.0
    vaddps  xmm4, xmm4, xmm4        # 64.0
    vaddps  xmm4, xmm4, xmm4        # 128.0
    vaddps  xmm4, xmm4, xmm4        # 256.0
    vaddps  xmm4, xmm4, xmm4        # 512.0
    vaddps  xmm4, xmm4, xmm4        # 1024.0
    vdivps  xmm1, xmm4, xmm2        # 1024/3 = 341.3333
    vdivps  xmm2, xmm0, xmm1        # 1/341.3333
    vaddps  xmm0, xmm1, xmm1        # 2*341.3333
    /* replicate the operand into every register the loop reads */
    vmovaps xmm1, xmm0
    vmovaps xmm2, xmm0
    vmovaps xmm3, xmm0
    vmovaps xmm4, xmm0
    vmovaps xmm5, xmm0
kernel_loop:
    inc     ITER
    INSTR   xmm10, xmm0
    INSTR   xmm11, xmm1
    INSTR   xmm12, xmm2
    cmp     ITER, COUNT             # flags stay untouched until the jl below
    INSTR   xmm13, xmm3
    INSTR   xmm14, xmm4
    INSTR   xmm15, xmm5
    jl      kernel_loop
kernel_end:
    mov     rsp, rbp
    pop     rbp
    ret
    .size   latency, .-latency

54
src/AVX/vrcpps-sse.S Normal file
View File

@@ -0,0 +1,54 @@
/*
 * vrcpps (128-bit) latency kernel, packed single-precision approximate
 * reciprocal.
 *
 * Exported symbols:
 *   ninst   - 32-bit integer, number of INSTR executed per loop pass
 *   latency - double latency(int), loop count passed in edi
 *
 * The six INSTR form a ring xmm0 -> xmm1 -> xmm2 -> xmm3 -> xmm4 -> xmm5
 * -> xmm0, each consuming what the previous one produced, so they run
 * strictly one after another.
 */
#define INSTR   vrcpps
#define NINST   6
#define COUNT   edi
#define ITER    r8d
    .intel_syntax noprefix
    .global ninst
    .data
ninst:
    .int    NINST
    .text
    .global latency
    .type   latency, @function
    .p2align 5
latency:
    push    rbp
    mov     rbp, rsp
    xor     ITER, ITER              # pass counter starts at zero
    test    COUNT, COUNT
    jle     kernel_end              # nothing to do for a non-positive count
    /* xmm0 = SP 1.0 in all four 32-bit lanes, built without touching memory */
    vpcmpeqw xmm0, xmm0, xmm0       # every bit set
    vpslld  xmm0, xmm0, 25          # keep the top 7 bits of each dword
    vpsrld  xmm0, xmm0, 2           # 0x3F800000 = 1.0f
    vaddps  xmm1, xmm0, xmm0        # 2.0
    vaddps  xmm2, xmm0, xmm1        # 3.0
    vaddps  xmm4, xmm1, xmm1        # 4.0
    vaddps  xmm4, xmm4, xmm4        # 8.0
    vaddps  xmm4, xmm4, xmm4        # 16.0
    vaddps  xmm4, xmm4, xmm4        # 32.0
    vaddps  xmm4, xmm4, xmm4        # 64.0
    vaddps  xmm4, xmm4, xmm4        # 128.0
    vaddps  xmm4, xmm4, xmm4        # 256.0
    vaddps  xmm4, xmm4, xmm4        # 512.0
    vaddps  xmm4, xmm4, xmm4        # 1024.0
    vdivps  xmm1, xmm4, xmm2        # 1024/3 = 341.3333
    vdivps  xmm2, xmm0, xmm1        # 1/341.3333
    vaddps  xmm0, xmm1, xmm1        # 2*341.3333, start value of the chain
kernel_loop:
    inc     ITER
    INSTR   xmm1, xmm0
    INSTR   xmm2, xmm1
    INSTR   xmm3, xmm2
    cmp     ITER, COUNT             # flags stay untouched until the jl below
    INSTR   xmm4, xmm3
    INSTR   xmm5, xmm4
    INSTR   xmm0, xmm5
    jl      kernel_loop
kernel_end:
    mov     rsp, rbp
    pop     rbp
    ret
    .size   latency, .-latency

47
src/vsx/xvadddp.S Normal file
View File

@@ -0,0 +1,47 @@
/*
 * xvadddp latency kernel (VSX vector double-precision add), ELFv2 ABI.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count is taken from r3 and moved
 *           into the count register.
 *
 * vs0 starts at 0.0 and has 1.0 (vs1) added six times per pass; every INSTR
 * reads the vs0 written by the previous one.
 */
#define INSTR xvadddp
#define NINST 6
#define N 3
    .globl ninst
    .data
ninst:
    .long NINST
    /*
     * 16-byte boundary for the vector constants.  On PowerPC the .align
     * operand is a power-of-two exponent, so ".align 16" would request
     * 64 KiB; .p2align 4 states the intended 16 bytes unambiguously.
     */
    .p2align 4
zero:
    .double 0.0, 0.0
one:
    .double 1.0, 1.0
    .text
    .abiversion 2
    .section ".toc","aw"
    .section ".text"
    .align 2
    .globl latency
    .type latency, @function
latency :
0:  addis 2,12,.TOC.-0b@ha      # r2 = r12 + (.TOC. - 0b): set up the TOC pointer
    addi 2,2,.TOC.-0b@l
    .localentry latency, .-latency
    mtctr N                     # move to count register
    /* load DP FP zero */
    li 10, 0                    # r10 is not read again below
    addis 9,2,zero@toc@ha       # r9 = TOC-relative address of zero
    addi 9,9,zero@toc@l
    lxvd2x 0, 0, 9              # vs0 = {0.0, 0.0}
    addis 9,2,one@toc@ha        # r9 = TOC-relative address of one
    addi 9,9,one@toc@l
    lxvd2x 1, 0, 9              # vs1 = {1.0, 1.0}
loop:
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    bdnz loop                   # decrement CTR, repeat while it is non-zero
    xvmovdp 1, 0                # copy the result into vs1 (presumably the f1 return register)
    blr
    .size latency, .-latency

47
src/vsx/xvaddsp.S Normal file
View File

@@ -0,0 +1,47 @@
/*
 * xvaddsp latency kernel (VSX vector single-precision add), ELFv2 ABI.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count is taken from r3 and moved
 *           into the count register.
 *
 * vs0 starts at 0.0 and has 1.0 (vs1) added six times per pass; every INSTR
 * reads the vs0 written by the previous one.
 */
#define INSTR xvaddsp
#define NINST 6
#define N 3
    .globl ninst
    .data
ninst:
    .long NINST
    /*
     * 16-byte boundary for the vector constants.  On PowerPC the .align
     * operand is a power-of-two exponent, so ".align 16" would request
     * 64 KiB; .p2align 4 states the intended 16 bytes unambiguously.
     */
    .p2align 4
    /*
     * lxvd2x always loads 16 bytes, i.e. four single-precision lanes.  Each
     * constant therefore has to be a full four-element vector; with only two
     * elements the upper lanes would be filled from the neighbouring
     * constant or from whatever follows the data.
     */
zero:
    .single 0.0, 0.0, 0.0, 0.0
one:
    .single 1.0, 1.0, 1.0, 1.0
    .text
    .abiversion 2
    .section ".toc","aw"
    .section ".text"
    .align 2
    .globl latency
    .type latency, @function
latency :
0:  addis 2,12,.TOC.-0b@ha      # r2 = r12 + (.TOC. - 0b): set up the TOC pointer
    addi 2,2,.TOC.-0b@l
    .localentry latency, .-latency
    mtctr N                     # move to count register
    /* load SP FP zero */
    li 10, 0                    # r10 is not read again below
    addis 9,2,zero@toc@ha       # r9 = TOC-relative address of zero
    addi 9,9,zero@toc@l
    lxvd2x 0, 0, 9              # vs0 = four lanes of 0.0
    addis 9,2,one@toc@ha        # r9 = TOC-relative address of one
    addi 9,9,one@toc@l
    lxvd2x 1, 0, 9              # vs1 = four lanes of 1.0
loop:
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    bdnz loop                   # decrement CTR, repeat while it is non-zero
    xvmovdp 1, 0                # copy the result into vs1 (presumably the f1 return register)
    blr
    .size latency, .-latency

49
src/vsx/xvdivdp.S Normal file
View File

@@ -0,0 +1,49 @@
/*
 * xvdivdp latency kernel (VSX vector double-precision divide), ELFv2 ABI.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count is taken from r3 and moved
 *           into the count register.
 *
 * vs0 starts at 1.0 and is divided alternately by 0.5 (vs1) and 2.0 (vs2),
 * so it returns to 1.0 after every pair; every INSTR reads the vs0 written
 * by the previous one.
 */
#define INSTR xvdivdp
#define NINST 6
#define N 3
    .globl ninst
    .data
ninst:
    .long NINST
    /*
     * 16-byte boundary for the vector constants.  On PowerPC the .align
     * operand is a power-of-two exponent, so ".align 16" would request
     * 64 KiB; .p2align 4 states the intended 16 bytes unambiguously.
     */
    .p2align 4
half:
    .double 0.5, 0.5
one:
    .double 1.0, 1.0
two:
    .double 2.0, 2.0
    .text
    .abiversion 2
    .section ".toc","aw"
    .section ".text"
    .align 2
    .globl latency
    .type latency, @function
latency :
0:  addis 2,12,.TOC.-0b@ha      # r2 = r12 + (.TOC. - 0b): set up the TOC pointer
    addi 2,2,.TOC.-0b@l
    .localentry latency, .-latency
    mtctr N                     # move to count register
    li 10, 0                    # r10 is not read again below
    addis 9,2,one@toc@ha        # high-adjusted 16 bits of the TOC offset
    addi 9,9,one@toc@l          # low 16 bits of the TOC offset
    lxvd2x 0, 0, 9              # vs0 = {1.0, 1.0}
    addis 9,2,half@toc@ha       # high-adjusted 16 bits of the TOC offset
    addi 9,9,half@toc@l         # low 16 bits of the TOC offset
    lxvd2x 1, 0, 9              # vs1 = {0.5, 0.5}
    addis 9,2,two@toc@ha        # high-adjusted 16 bits of the TOC offset
    addi 9,9,two@toc@l          # low 16 bits of the TOC offset
    lxvd2x 2, 0, 9              # vs2 = {2.0, 2.0}
loop:
    INSTR 0, 0, 1
    INSTR 0, 0, 2
    INSTR 0, 0, 1
    INSTR 0, 0, 2
    INSTR 0, 0, 1
    INSTR 0, 0, 2
    bdnz loop                   # decrement CTR, repeat while it is non-zero
    xvmovdp 1, 0                # copy the result into vs1 (presumably the f1 return register)
    blr
    .size latency, .-latency

49
src/vsx/xvdivsp.S Normal file
View File

@@ -0,0 +1,49 @@
/*
 * xvdivsp latency kernel (VSX vector single-precision divide), ELFv2 ABI.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count is taken from r3 and moved
 *           into the count register.
 *
 * vs0 starts at 1.0 and is divided alternately by 0.5 (vs1) and 2.0 (vs2),
 * so it returns to 1.0 after every pair; every INSTR reads the vs0 written
 * by the previous one.
 */
#define INSTR xvdivsp
#define NINST 6
#define N 3
    .globl ninst
    .data
ninst:
    .long NINST
    /*
     * 16-byte boundary for the vector constants.  On PowerPC the .align
     * operand is a power-of-two exponent, so ".align 16" would request
     * 64 KiB; .p2align 4 states the intended 16 bytes unambiguously.
     */
    .p2align 4
    /*
     * lxvd2x always loads 16 bytes, i.e. four single-precision lanes.  Each
     * constant therefore has to be a full four-element vector; with only two
     * elements the upper lanes would be filled from the neighbouring
     * constant or from whatever follows the data, feeding unintended
     * dividends and divisors into the measured divides.
     */
half:
    .single 0.5, 0.5, 0.5, 0.5
one:
    .single 1.0, 1.0, 1.0, 1.0
two:
    .single 2.0, 2.0, 2.0, 2.0
    .text
    .abiversion 2
    .section ".toc","aw"
    .section ".text"
    .align 2
    .globl latency
    .type latency, @function
latency :
0:  addis 2,12,.TOC.-0b@ha      # r2 = r12 + (.TOC. - 0b): set up the TOC pointer
    addi 2,2,.TOC.-0b@l
    .localentry latency, .-latency
    mtctr N                     # move to count register
    li 10, 0                    # r10 is not read again below
    addis 9,2,one@toc@ha        # high-adjusted 16 bits of the TOC offset
    addi 9,9,one@toc@l          # low 16 bits of the TOC offset
    lxvd2x 0, 0, 9              # vs0 = four lanes of 1.0
    addis 9,2,half@toc@ha       # high-adjusted 16 bits of the TOC offset
    addi 9,9,half@toc@l         # low 16 bits of the TOC offset
    lxvd2x 1, 0, 9              # vs1 = four lanes of 0.5
    addis 9,2,two@toc@ha        # high-adjusted 16 bits of the TOC offset
    addi 9,9,two@toc@l          # low 16 bits of the TOC offset
    lxvd2x 2, 0, 9              # vs2 = four lanes of 2.0
loop:
    INSTR 0, 0, 1
    INSTR 0, 0, 2
    INSTR 0, 0, 1
    INSTR 0, 0, 2
    INSTR 0, 0, 1
    INSTR 0, 0, 2
    bdnz loop                   # decrement CTR, repeat while it is non-zero
    xvmovdp 1, 0                # copy the result into vs1 (presumably the f1 return register)
    blr
    .size latency, .-latency

53
src/vsx/xvmaddadp.S Normal file
View File

@@ -0,0 +1,53 @@
/*
 * xvmaddadp latency kernel (VSX vector double-precision multiply-add,
 * type A: target = A * B + target), ELFv2 ABI.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count is taken from r3 and moved
 *           into the count register.
 *
 * vs0 starts at 0.0 and accumulates vs1 * vs2 (2.0 * 3.0) six times per
 * pass; every INSTR reads the vs0 written by the previous one.
 */
#define INSTR xvmaddadp
#define NINST 6
#define N 3
    .globl ninst
    .data
ninst:
    .long NINST
    /*
     * 16-byte boundary for the vector constants.  On PowerPC the .align
     * operand is a power-of-two exponent, so ".align 16" would request
     * 64 KiB; .p2align 4 states the intended 16 bytes unambiguously.
     */
    .p2align 4
zero:
    .double 0.0, 0.0
two:
    .double 2.0, 2.0
three:
    .double 3.0, 3.0
    .text
    .abiversion 2
    .section ".toc","aw"
    .section ".text"
    .align 2
    .globl latency
    .type latency, @function
latency :
0:  addis 2,12,.TOC.-0b@ha      # r2 = r12 + (.TOC. - 0b): set up the TOC pointer
    addi 2,2,.TOC.-0b@l
    .localentry latency, .-latency
    mtctr N                     # move to count register
    /* load DP FP zero */
    li 10, 0                    # r10 is not read again below
    addis 9,2,zero@toc@ha       # r9 = TOC-relative address of zero
    addi 9,9,zero@toc@l
    lxvd2x 0, 0, 9              # vs0 = {0.0, 0.0}
    addis 9,2,two@toc@ha        # r9 = TOC-relative address of two
    addi 9,9,two@toc@l
    lxvd2x 1, 0, 9              # vs1 = {2.0, 2.0}
    addis 9,2,three@toc@ha      # r9 = TOC-relative address of three
    addi 9,9,three@toc@l
    lxvd2x 2, 0, 9              # vs2 = {3.0, 3.0}
loop:
    INSTR 0, 1, 2
    INSTR 0, 1, 2
    INSTR 0, 1, 2
    INSTR 0, 1, 2
    INSTR 0, 1, 2
    INSTR 0, 1, 2
    bdnz loop                   # decrement CTR, repeat while it is non-zero
    xvmovdp 1, 0                # copy the result into vs1 (presumably the f1 return register)
    blr
    .size latency, .-latency

44
src/vsx/xvmuldp.S Normal file
View File

@@ -0,0 +1,44 @@
/*
 * xvmuldp latency kernel (VSX vector double-precision multiply), ELFv2 ABI.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count is taken from r3 and moved
 *           into the count register.
 *
 * vs0 and vs1 both hold 1.0; vs0 is multiplied by vs1 six times per pass and
 * every INSTR reads the vs0 written by the previous one.
 */
#define INSTR xvmuldp
#define NINST 6
#define N 3
    .globl ninst
    .data
ninst:
    .long NINST
    /*
     * 16-byte boundary for the vector constants.  On PowerPC the .align
     * operand is a power-of-two exponent, so ".align 16" would request
     * 64 KiB; .p2align 4 states the intended 16 bytes unambiguously.
     */
    .p2align 4
zero:                           /* not referenced by the code below */
    .double 0.0, 0.0
one:
    .double 1.0, 1.0
    .text
    .abiversion 2
    .section ".toc","aw"
    .section ".text"
    .align 2
    .globl latency
    .type latency, @function
latency :
0:  addis 2,12,.TOC.-0b@ha      # r2 = r12 + (.TOC. - 0b): set up the TOC pointer
    addi 2,2,.TOC.-0b@l
    .localentry latency, .-latency
    mtctr N                     # move to count register
    li 10, 0                    # r10 is not read again below
    addis 9,2,one@toc@ha        # high-adjusted 16 bits of the TOC offset
    addi 9,9,one@toc@l          # low 16 bits of the TOC offset
    lxvd2x 0, 0, 9              # vs0 = {1.0, 1.0}
    addis 9,2,one@toc@ha        # same address again, for the second operand
    addi 9,9,one@toc@l
    lxvd2x 1, 0, 9              # vs1 = {1.0, 1.0}
loop:
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    bdnz loop                   # decrement CTR, repeat while it is non-zero
    xvmovdp 1, 0                # copy the result into vs1 (presumably the f1 return register)
    blr
    .size latency, .-latency

44
src/vsx/xvmulsp.S Normal file
View File

@@ -0,0 +1,44 @@
/*
 * xvmulsp latency kernel (VSX vector single-precision multiply), ELFv2 ABI.
 *
 * ninst   : 32-bit integer holding the number of INSTR per loop pass.
 * latency : double latency(int); the loop count is taken from r3 and moved
 *           into the count register.
 *
 * vs0 and vs1 both hold 1.0; vs0 is multiplied by vs1 six times per pass and
 * every INSTR reads the vs0 written by the previous one.
 */
#define INSTR xvmulsp
#define NINST 6
#define N 3
    .globl ninst
    .data
ninst:
    .long NINST
    /*
     * 16-byte boundary for the vector constants.  On PowerPC the .align
     * operand is a power-of-two exponent, so ".align 16" would request
     * 64 KiB; .p2align 4 states the intended 16 bytes unambiguously.
     */
    .p2align 4
    /*
     * lxvd2x always loads 16 bytes, i.e. four single-precision lanes.  Each
     * constant therefore has to be a full four-element vector; with only two
     * elements the load of "one" would run past the end of the data and
     * fill the upper lanes with whatever follows it.
     */
zero:                           /* not referenced by the code below */
    .single 0.0, 0.0, 0.0, 0.0
one:
    .single 1.0, 1.0, 1.0, 1.0
    .text
    .abiversion 2
    .section ".toc","aw"
    .section ".text"
    .align 2
    .globl latency
    .type latency, @function
latency :
0:  addis 2,12,.TOC.-0b@ha      # r2 = r12 + (.TOC. - 0b): set up the TOC pointer
    addi 2,2,.TOC.-0b@l
    .localentry latency, .-latency
    mtctr N                     # move to count register
    li 10, 0                    # r10 is not read again below
    addis 9,2,one@toc@ha        # high-adjusted 16 bits of the TOC offset
    addi 9,9,one@toc@l          # low 16 bits of the TOC offset
    lxvd2x 0, 0, 9              # vs0 = four lanes of 1.0
    addis 9,2,one@toc@ha        # same address again, for the second operand
    addi 9,9,one@toc@l
    lxvd2x 1, 0, 9              # vs1 = four lanes of 1.0
loop:
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    INSTR 0, 0, 1
    bdnz loop                   # decrement CTR, repeat while it is non-zero
    xvmovdp 1, 0                # copy the result into vs1 (presumably the f1 return register)
    blr
    .size latency, .-latency