Commit 5be33214 authored by Gaëtan Cassiers
Browse files

prepare for supercop submission

parent d56d4a8a
......@@ -3,3 +3,5 @@
/prim_bench_iaca/
/prim_bench_real/
/spook_bench/
/submission/
*.o
MIT License
Copyright (c) 2019 Gaëtan Cassiers
Copyright (c) 2019, 2020 Gaëtan Cassiers and UCLouvain
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
......@@ -3,8 +3,8 @@
# Compiler and flags shared by every benchmark build in this script.
CC='gcc'
CFLAGS='-std=gnu99 -g -Ofast -mtune=skylake-avx512'
# Output directories for the IACA and the real benchmark runs. Each variable is
# assigned twice; the second assignment (under ../bench_results) is the one in effect.
BENCH_DIR_IACA='../prim_bench_iaca'
BENCH_DIR_REAL='../prim_bench_real'
BENCH_DIR_IACA='../bench_results/prim_bench_iaca'
BENCH_DIR_REAL='../bench_results/prim_bench_real'
# Processor frequency (GHz), exported so that child processes can read it.
PROC_FREQ='2.0'
export PROC_FREQ
......@@ -20,11 +20,13 @@ grep -v ^# < $BENCH_RUNS | while read -r line
do
# Split the current run description into positional parameters. $line is left
# unquoted on purpose so that it is word-split; the literal "x" occupies $1, so
# the fields of the line start at $2.
set -- x $line
TYPE=$2
# Reassigned two lines below; the "<type>_<width>bit" form is the one in effect.
PRIM=${TYPE}_$3
NB=$3
PRIM=${TYPE}_${NB}bit
ARCH=$4
# Preprocessor symbol of the form <TYPE in upper case>_TYPE_<width>_BIT.
# The tr sets are quoted: unquoted [a-z] / [A-Z] are shell glob patterns and
# would be replaced by any matching single-letter file name in the working directory.
DEF_TYPE=$(echo $TYPE | tr 'a-z' 'A-Z')_TYPE_${NB}_BIT
echo bench $TYPE $PRIM $ARCH ...;
FULLNAME=./$BENCH_DIR_IACA/$PRIM-$ARCH-iaca
# $CC and $CFLAGS stay unquoted: CFLAGS holds several flags that must be
# word-split into separate arguments.
# NOTE(review): the first compile omits the type define and its object file is
# overwritten by the second compile; it looks like the pre-change form of the
# same command - confirm whether it should remain.
$CC $CFLAGS -march="$ARCH" -D BENCH_IACA -c "../src/$PRIM.c" -o "$FULLNAME.o"
$CC $CFLAGS -march="$ARCH" -D BENCH_IACA -D "$DEF_TYPE" -c "../src/$PRIM.c" -o "$FULLNAME.o"
# Run the analyzer on the object file and keep its report next to it.
iaca -arch SKX "$FULLNAME.o" > "$FULLNAME.txt"
done
......@@ -32,7 +34,10 @@ for f in `ls $BENCH_DIR_IACA/*.txt`
do
# Last path component of the report path in $f, cut at its first '.'.
INSTANCE=`echo $f | awk -F/ '{print $NF}' | cut -d '.' -f 1`
# Third space-separated field of the report's "Block Throughput" line.
CYCLES=`grep 'Block Throughput' $f | cut -d ' ' -f 3`
# NOTE(review): this unconditional append and the guarded append below write the
# same record, so a report with a throughput value is listed twice and a report
# without one still produces a line. This looks like the before/after pair of a
# change - confirm which of the two is meant to stay.
echo $INSTANCE $CYCLES >> $BENCH_DIR_IACA/results.txt
# Append the record only when a throughput value was extracted.
if [[ "$CYCLES" != "" ]]
then
echo $INSTANCE $CYCLES >> $BENCH_DIR_IACA/results.txt
fi
done
# Real benchmark
......@@ -50,12 +55,14 @@ grep -v ^# < $BENCH_RUNS | while read -r line
do
# Split the current run description into positional parameters. $line is left
# unquoted on purpose so that it is word-split; the literal "x" occupies $1, so
# the fields of the line start at $2.
set -- x $line
TYPE=$2
# Reassigned two lines below; the "<type>_<width>bit" form is the one in effect.
PRIM=${TYPE}_$3
NB=$3
PRIM=${TYPE}_${NB}bit
ARCH=$4
# Preprocessor symbol of the form <TYPE in upper case>_TYPE_<width>_BIT.
# The tr sets are quoted: unquoted [a-z] / [A-Z] are shell glob patterns and
# would be replaced by any matching single-letter file name in the working directory.
DEF_TYPE=$(echo $TYPE | tr 'a-z' 'A-Z')_TYPE_${NB}_BIT
echo bench $TYPE $PRIM $ARCH ...;
FULLNAME=./$BENCH_DIR_REAL/$PRIM-$ARCH-bench
BENCH_HARNESS=bench_$TYPE
# $CC and $CFLAGS stay unquoted: CFLAGS holds several flags that must be
# word-split into separate arguments.
# NOTE(review): the first compile omits the type define and its object file is
# overwritten by the second compile; it looks like the pre-change form of the
# same command - confirm whether it should remain.
$CC $CFLAGS -march="$ARCH" -c "../src/$PRIM.c" -o "$FULLNAME.o"
$CC $CFLAGS -march="$ARCH" -D "$DEF_TYPE" -c "../src/$PRIM.c" -o "$FULLNAME.o"
# Build the per-type harness with the iteration count, then link it with the primitive.
$CC $CFLAGS -march="$ARCH" -D N_ITER="$N_ITER" -I ../src -c "./src/$BENCH_HARNESS.c" -o "$BENCH_DIR_REAL/$BENCH_HARNESS.o"
$CC $CFLAGS -march="$ARCH" "$FULLNAME.o" "$BENCH_DIR_REAL/$BENCH_HARNESS.o" -o "$FULLNAME"
# Run the benchmark binary and store its output beside it.
"$FULLNAME" > "$FULLNAME.txt"
......
clyde 32bit x86-64
clyde 32bit haswell
clyde 32bit skylake-avx512
clyde 64bit haswell
clyde 64bit skylake-avx512
shadow 32bit x86-64
shadow 32bit haswell
shadow 32bit skylake-avx512
shadow 128bit x86-64
shadow 128bit haswell
shadow 128bit skylake-avx512
shadow 256bit haswell
shadow 256bit skylake-avx512
shadow 512bit skylake-avx512
clyde 32 x86-64
clyde 32 haswell
clyde 32 skylake-avx512
clyde 64 haswell
clyde 64 skylake-avx512
shadow 32 x86-64
shadow 32 haswell
shadow 32 skylake-avx512
shadow 128 x86-64
shadow 128 haswell
shadow 128 skylake-avx512
#shadow 256 haswell
#shadow 256 skylake-avx512
#shadow 512 skylake-avx512
clyde 32bit x86-64
clyde 32bit haswell
clyde 32bit skylake-avx512
clyde 64bit haswell
clyde 64bit skylake-avx512
shadow 32bit x86-64
shadow 32bit haswell
shadow 32bit skylake-avx512
shadow 128bit x86-64
shadow 128bit haswell
shadow 128bit skylake-avx512
#shadow 256bit haswell
#shadow 256bit skylake-avx512
#shadow 512bit skylake-avx512
clyde 32 x86-64
clyde 32 haswell
clyde 32 skylake-avx512
clyde 64 haswell
clyde 64 skylake-avx512
shadow 32 x86-64
shadow 32 haswell
shadow 32 skylake-avx512
shadow 128 x86-64
shadow 128 haswell
shadow 128 skylake-avx512
#shadow 256 haswell
#shadow 256 skylake-avx512
#shadow 512 skylake-avx512
......@@ -28,6 +28,7 @@ def parse_real_line(s):
return (parse_prim_id(prim_id), val)
def parse_iaca_line(s):
    """Parse one space-separated "<primitive id> <cycles>" record.

    The raw line is first echoed to stdout. Only the first two fields are
    used; the cycle count is converted to float and multiplied by PRIM_NS.

    Returns a 2-tuple: (parse_prim_id(<id>), fmt_cycles(<scaled cycles>)).
    """
    print(s)
    prim_id, raw_cycles = s.split(' ')[:2]
    scaled = PRIM_NS * float(raw_cycles)
    return parse_prim_id(prim_id), fmt_cycles(scaled)
......
......@@ -16,8 +16,11 @@ mkdir -p $BENCH_DIR
grep -v ^# < $BENCH_RUNS | while read -r line
do
# Split the current run description into positional parameters. $line is left
# unquoted on purpose so that it is word-split; the literal "x" occupies $1, so
# the fields of the line start at $2.
set -- x $line
# Source-file stems for the two primitives. Both are reassigned immediately
# below; the "<name>_<width>bit" form is the one in effect.
CLYDE=clyde_$3
SHADOW=shadow_$5
CLYDE=clyde_${3}bit
SHADOW=shadow_${5}bit
# Preprocessor symbols selecting the implementation width of each primitive.
CLYDE_DEF=CLYDE_TYPE_${3}_BIT
SHADOW_DEF=SHADOW_TYPE_${5}_BIT
# Compiler define flags. The value must be quoted and must expand the two
# variables: written as `DEFS=-D CLYDE_DEF -D SHADOW_DEF`, the shell treats
# `DEFS=-D` as a temporary assignment and tries to run a command named
# CLYDE_DEF, so DEFS is never set. Expand it unquoted where it is used so that
# it splits into separate arguments.
DEFS="-D $CLYDE_DEF -D $SHADOW_DEF"
ARCH=$6
TYPE=$2
echo bench $CLYDE $SHADOW $ARCH ...;
......@@ -26,11 +29,11 @@ do
# Build every object needed for the combined benchmark, link them, and run the
# result. FULLNAME is not assigned in the lines visible here; presumably it is
# set just above - TODO confirm.
BENCH_HARNESS=bench_spook
# Per-architecture object files for the two primitives.
CLYDE_O=$BENCH_DIR/$CLYDE-$ARCH.o
SHADOW_O=$BENCH_DIR/$SHADOW-$ARCH.o
# NOTE(review): the next five compiles are repeated right after them with $DEFS
# added, writing to the same object files, so only the second set's output
# survives. This looks like the before/after pair of a change - confirm whether
# the first set should remain.
$CC $CFLAGS -march=$ARCH -c ../src/$CLYDE.c -o $CLYDE_O
$CC $CFLAGS -march=$ARCH -c ../src/$SHADOW.c -o $SHADOW_O
$CC $CFLAGS -march=$ARCH -c ../src/s1p.c -o $BENCH_DIR/s1p-$ARCH.o
$CC $CFLAGS -march=$ARCH -c ../src/encrypt.c -o $BENCH_DIR/encrypt-$ARCH.o
$CC $CFLAGS -march=$ARCH -I ../src -D N_ITER=$N_ITER -c src/$BENCH_HARNESS.c -o $BENCH_DIR/$BENCH_HARNESS-$ARCH.o
# Same five compiles with $DEFS; it is expanded unquoted, so whatever it holds
# is word-split into separate compiler arguments.
$CC $CFLAGS -march=$ARCH $DEFS -c ../src/$CLYDE.c -o $CLYDE_O
$CC $CFLAGS -march=$ARCH $DEFS -c ../src/$SHADOW.c -o $SHADOW_O
$CC $CFLAGS -march=$ARCH $DEFS -c ../src/s1p.c -o $BENCH_DIR/s1p-$ARCH.o
$CC $CFLAGS -march=$ARCH $DEFS -c ../src/encrypt.c -o $BENCH_DIR/encrypt-$ARCH.o
$CC $CFLAGS -march=$ARCH $DEFS -I ../src -D N_ITER=$N_ITER -c src/$BENCH_HARNESS.c -o $BENCH_DIR/$BENCH_HARNESS-$ARCH.o
# Link all five objects into the benchmark executable (-flto is passed only at this step).
$CC $CFLAGS -march=$ARCH -flto $CLYDE_O $SHADOW_O $BENCH_DIR/s1p-$ARCH.o $BENCH_DIR/encrypt-$ARCH.o $BENCH_DIR/$BENCH_HARNESS-$ARCH.o -o $FULLNAME
# Run the executable and store its output beside it.
$FULLNAME > $FULLNAME.txt
done
......
clyde 32bit shadow 32bit x86-64
clyde 32bit shadow 32bit haswell
clyde 32bit shadow 32bit skylake-avx512
clyde 64bit shadow 32bit haswell
clyde 64bit shadow 32bit skylake-avx512
clyde 32bit shadow 128bit x86-64
clyde 32bit shadow 128bit haswell
clyde 32bit shadow 128bit skylake-avx512
clyde 64bit shadow 128bit haswell
clyde 64bit shadow 128bit skylake-avx512
#clyde 32bit shadow 256bit haswell
#clyde 32bit shadow 256bit skylake-avx512
#clyde 64bit shadow 256bit haswell
#clyde 64bit shadow 256bit skylake-avx512
#clyde 32bit shadow 512bit skylake-avx512
#clyde 64bit shadow 512bit skylake-avx512
clyde 32 shadow 32 x86-64
clyde 32 shadow 32 haswell
clyde 32 shadow 32 skylake-avx512
clyde 64 shadow 32 haswell
clyde 64 shadow 32 skylake-avx512
clyde 32 shadow 128 x86-64
clyde 32 shadow 128 haswell
clyde 32 shadow 128 skylake-avx512
clyde 64 shadow 128 haswell
clyde 64 shadow 128 skylake-avx512
#clyde 32 shadow 256 haswell
#clyde 32 shadow 256 skylake-avx512
#clyde 64 shadow 256 haswell
#clyde 64 shadow 256 skylake-avx512
#clyde 32 shadow 512 skylake-avx512
#clyde 64 shadow 512 skylake-avx512
clyde_32bit-haswell-iaca 42.05
clyde_32bit-skylake-avx512-iaca 42.05
clyde_32bit-x86-64-iaca 62.11
clyde_64bit-haswell-iaca 35.00
clyde_64bit-skylake-avx512-iaca 35.00
shadow_128bit-haswell-iaca 55.63
shadow_128bit-skylake-avx512-iaca 37.63
shadow_128bit-x86-64-iaca 54.32
shadow_256bit-haswell-iaca 49.26
shadow_256bit-skylake-avx512-iaca 40.37
shadow_512bit-skylake-avx512-iaca 32.00
clyde_32bit-x86-64-iaca 62.00
clyde_64bit-haswell-iaca 36.00
clyde_64bit-skylake-avx512-iaca 36.00
shadow_128bit-haswell-iaca 68.00
shadow_128bit-skylake-avx512-iaca 66.00
shadow_128bit-x86-64-iaca 69.00
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - ./../prim_bench_iaca/shadow_256bit-haswell-iaca.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 49.26 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 28.6 0.0 | 28.7 | 2.0 2.0 | 2.0 2.0 | 0.0 | 28.7 | 1.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1* | | | | | | | | | vmovdqa xmm3, xmm0
| 1* | | | | | | | | | vmovdqa xmm4, xmm2
| 1 | | | | | | 1.0 | | | vextracti128 xmm1, ymm2, 0x1
| 1 | | | | | | 1.0 | | | vextracti128 xmm0, ymm0, 0x1
| 1 | 0.6 | 0.4 | | | | | | | vpand xmm2, xmm3, xmm2
| 1 | 0.4 | 0.6 | | | | | | | vpxor xmm2, xmm2, xmm1
| 1 | 0.6 | 0.4 | | | | | | | vpand xmm1, xmm0, xmm4
| 1 | 0.4 | 0.6 | | | | | | | vpxor xmm1, xmm1, xmm3
| 1* | | | | | | | | | vmovdqa xmm3, xmm1
| 1 | 0.6 | 0.4 | | | | | | | vpand xmm1, xmm2, xmm1
| 1 | 0.4 | 0.6 | | | | | | | vpxor xmm1, xmm1, xmm0
| 1 | 0.6 | 0.4 | | | | | | | vpand xmm0, xmm0, xmm2
| 1 | | | | | | 1.0 | | | vinserti128 ymm1, ymm3, xmm1, 0x1
| 1 | 0.4 | 0.6 | | | | | | | vpxor xmm0, xmm0, xmm4
| 1* | | | | | | | | | vmovdqa xmm3, xmm2
| 1 | | | | | | 1.0 | | | vinserti128 ymm0, ymm3, xmm0, 0x1
| 1 | 0.6 | 0.4 | | | | | | | vpslld ymm2, ymm1, 0x14
| 1 | 0.4 | 0.6 | | | | | | | vpsrld ymm3, ymm1, 0xc
| 1 | 0.6 | 0.4 | | | | | | | vpslld ymm4, ymm0, 0x14
| 1 | 0.4 | 0.6 | | | | | | | vpor ymm3, ymm3, ymm2
| 1 | 0.6 | 0.4 | | | | | | | vpsrld ymm2, ymm0, 0xc
| 1 | 0.4 | 0.6 | | | | | | | vpxor ymm3, ymm3, ymm1
| 1 | 0.6 | 0.4 | | | | | | | vpor ymm2, ymm2, ymm4
| 1 | 0.4 | 0.6 | | | | | | | vpxor ymm2, ymm2, ymm0
| 1 | 0.6 | 0.4 | | | | | | | vpsrld ymm4, ymm3, 0x3
| 1 | 0.4 | 0.6 | | | | | | | vpslld ymm5, ymm3, 0x1d
| 1 | | | | | | 1.0 | | | vpor ymm5, ymm5, ymm4
| 1 | 0.6 | 0.4 | | | | | | | vpslld ymm6, ymm2, 0x1d
| 1 | 0.4 | 0.6 | | | | | | | vpsrld ymm4, ymm2, 0x3
| 1 | | | | | | 1.0 | | | vpor ymm6, ymm6, ymm4
| 1 | 0.6 | 0.4 | | | | | | | vpslld ymm4, ymm1, 0xf
| 1 | 0.4 | 0.6 | | | | | | | vpsrld ymm1, ymm1, 0x11
| 1 | | | | | | 1.0 | | | vpor ymm1, ymm4, ymm1
| 1 | 0.6 | 0.4 | | | | | | | vpslld ymm4, ymm0, 0xf
| 1 | 0.4 | 0.6 | | | | | | | vpsrld ymm0, ymm0, 0x11
| 1 | | | | | | 1.0 | | | vpxor ymm3, ymm1, ymm3
| 1 | 0.3 | 0.4 | | | | 0.3 | | | vpor ymm0, ymm4, ymm0
| 1 | 0.3 | 0.3 | | | | 0.4 | | | vpxor ymm5, ymm5, ymm3
| 1 | 0.4 | 0.3 | | | | 0.3 | | | vpxor ymm2, ymm0, ymm2
| 1 | 0.3 | 0.4 | | | | 0.3 | | | vpxor ymm2, ymm6, ymm2
| 1 | 0.7 | 0.3 | | | | | | | vpsrld ymm1, ymm5, 0x1f
| 1 | 0.3 | 0.7 | | | | | | | vpslld ymm4, ymm5, 0x1
| 1 | | | | | | 1.0 | | | vpor ymm4, ymm4, ymm1
| 1 | 0.7 | 0.3 | | | | | | | vpslld ymm3, ymm2, 0x1
| 1 | 0.3 | 0.7 | | | | | | | vpsrld ymm1, ymm2, 0x1f
| 1 | | | | | | 1.0 | | | vpor ymm3, ymm3, ymm1
| 1 | 0.3 | 0.3 | | | | 0.4 | | | vpxor ymm3, ymm3, ymm2
| 1 | 0.4 | 0.3 | | | | 0.3 | | | vpxor ymm4, ymm4, ymm5
| 1 | 0.6 | 0.4 | | | | | | | vpsrld ymm1, ymm3, 0x1a
| 1 | 0.4 | 0.6 | | | | | | | vpslld ymm0, ymm3, 0x6
| 1 | | | | | | 1.0 | | | vpor ymm0, ymm0, ymm1
| 1 | 0.6 | 0.4 | | | | | | | vpsrld ymm6, ymm4, 0xf
| 1 | 0.4 | 0.6 | | | | | | | vpslld ymm1, ymm4, 0x11
| 1 | | | | | | 1.0 | | | vpor ymm6, ymm6, ymm1
| 1 | 0.6 | 0.4 | | | | | | | vpslld ymm1, ymm4, 0x7
| 1 | 0.4 | 0.6 | | | | | | | vpsrld ymm4, ymm4, 0x19
| 1 | | | | | | 1.0 | | | vpor ymm1, ymm1, ymm4
| 1 | 0.6 | 0.4 | | | | | | | vpsrld ymm4, ymm3, 0xf
| 1 | 0.4 | 0.6 | | | | | | | vpslld ymm3, ymm3, 0x11
| 2^ | | | 1.0 1.0 | | | 1.0 | | | vpxor ymm5, ymm5, ymmword ptr [rax]
| 1 | 0.3 | 0.4 | | | | 0.3 | | | vpor ymm3, ymm4, ymm3
| 2^ | 0.3 | 0.3 | | 1.0 1.0 | | 0.4 | | | vpxor ymm2, ymm2, ymmword ptr [rax+0x20]
| 1 | 0.4 | 0.3 | | | | 0.3 | | | vpxor ymm1, ymm1, ymm3
| 1 | 0.3 | 0.4 | | | | 0.3 | | | vpxor ymm0, ymm0, ymm6
| 1 | 0.3 | 0.3 | | | | 0.4 | | | vpxor ymm1, ymm1, ymm2
| 1 | 0.4 | 0.3 | | | | 0.3 | | | vpxor ymm0, ymm0, ymm5
| 1* | | | | | | | | | vmovdqa xmm5, xmm0
| 1* | | | | | | | | | vmovdqa xmm4, xmm1
| 1 | | | | | | 1.0 | | | vextracti128 xmm1, ymm1, 0x1
| 1 | 0.6 | 0.4 | | | | | | | vpand xmm2, xmm4, xmm5
| 1 | 0.4 | 0.6 | | | | | | | vpand xmm3, xmm1, xmm5
| 1 | | | | | | 1.0 | | | vextracti128 xmm0, ymm0, 0x1
| 1 | 0.6 | 0.4 | | | | | | | vpxor xmm0, xmm2, xmm0
| 1 | 0.4 | 0.6 | | | | | | | vpxor xmm3, xmm3, xmm4
| 1* | | | | | | | | | vmovdqa xmm4, xmm3
| 1 | 0.3 | 0.4 | | | | 0.3 | | | vpand xmm3, xmm0, xmm3
| 1 | 0.3 | 0.3 | | | | 0.4 | | | vpxor xmm3, xmm3, xmm1
| 1 | | | | | | 1.0 | | | vinserti128 ymm4, ymm4, xmm3, 0x1
| 1 | 0.7 | 0.3 | | | | | | | vpand xmm1, xmm1, xmm0
| 1* | | | | | | | | | vmovdqa xmm3, xmm0
| 1 | | | | | | 1.0 | | | vpshufd ymm2, ymm4, 0xbf
| 1 | 0.3 | 0.7 | | | | | | | vpxor xmm1, xmm1, xmm5
| 1 | | | | | | 1.0 | | | vinserti128 ymm1, ymm3, xmm1, 0x1
| 2^ | 0.7 | 0.3 | 1.0 1.0 | | | | | | vpxor ymm0, ymm2, ymmword ptr [rax+0x40]
| 1 | | | | | | 1.0 | | | vpshufd ymm3, ymm4, 0x5a
| 1 | | | | | | 1.0 | | | vpshufd ymm2, ymm4, 0x1
| 1 | 0.3 | 0.7 | | | | | | | vpxor ymm3, ymm2, ymm3
| 1 | 0.7 | 0.3 | | | | | | | vpxor ymm2, ymm0, ymm3
| 1 | | | | | | 1.0 | | | vpshufd ymm0, ymm1, 0x1
| 1 | | | | | | 1.0 | | | vpshufd ymm3, ymm1, 0x5a
| 1 | | | | | | 1.0 | | | vpshufd ymm1, ymm1, 0xbf
| 2^ | 0.3 | 0.7 | | 1.0 1.0 | | | | | vpxor ymm1, ymm1, ymmword ptr [rax+0x60]
| 1 | 0.7 | 0.3 | | | | | | | vpxor ymm0, ymm0, ymm3
| 1 | | | | | | | 1.0 | | sub rax, 0xffffffffffffff80
| 1 | 0.3 | 0.7 | | | | | | | vpxor ymm0, ymm0, ymm1
| 1* | | | | | | | | | cmp rdx, rax
| 0*F | | | | | | | | | jnz 0xfffffffffffffe48
Total Num Of Uops: 100
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - ./../prim_bench_iaca/shadow_256bit-skylake-avx512-iaca.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 40.37 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 23.4 0.0 | 23.3 | 2.0 2.0 | 2.0 2.0 | 0.0 | 23.3 | 1.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | 0.7 | 0.3 | | | | | | | vmovdqa64 xmm5, xmm2
| 1 | 0.3 | 0.7 | | | | | | | vmovdqa64 xmm3, xmm0
| 1 | | | | | | 1.0 | | | vextracti64x2 xmm0, ymm0, 0x1
| 1 | 0.7 | 0.3 | | | | | | | vpand xmm4, xmm3, xmm5
| 1 | 0.3 | 0.7 | | | | | | | vpand xmm1, xmm0, xmm5
| 1 | | | | | | 1.0 | | | vextracti64x2 xmm2, ymm2, 0x1
| 1 | 0.7 | 0.3 | | | | | | | vpxor xmm2, xmm4, xmm2
| 1 | 0.3 | 0.7 | | | | | | | vpxor xmm1, xmm1, xmm3
| 1 | 0.7 | 0.3 | | | | | | | vmovdqa64 xmm3, xmm1
| 1 | 0.3 | 0.7 | | | | | | | vpand xmm1, xmm2, xmm1
| 1 | 0.7 | 0.3 | | | | | | | vpxor xmm1, xmm1, xmm0
| 1 | 0.3 | 0.7 | | | | | | | vpand xmm0, xmm0, xmm2
| 1 | | | | | | 1.0 | | | vinserti64x2 ymm3, ymm3, xmm1, 0x1
| 1 | 0.7 | 0.3 | | | | | | | vpxor xmm0, xmm0, xmm5
| 1 | 0.3 | 0.7 | | | | | | | vmovdqa64 xmm1, xmm2
| 1 | | | | | | 1.0 | | | vinserti64x2 ymm0, ymm1, xmm0, 0x1
| 1 | 0.7 | 0.3 | | | | | | | vprord ymm5, ymm3, 0xc
| 1 | 0.3 | 0.7 | | | | | | | vprord ymm2, ymm0, 0xc
| 1 | | | | | | | 1.0 | | sub rax, 0xffffffffffffff80
| 1 | 0.7 | 0.3 | | | | | | | vpxor ymm5, ymm5, ymm3
| 1 | 0.3 | 0.7 | | | | | | | vpxor ymm2, ymm2, ymm0
| 1 | 0.7 | 0.3 | | | | | | | vprold ymm3, ymm3, 0xf
| 1 | 0.3 | 0.7 | | | | | | | vprold ymm0, ymm0, 0xf
| 1 | 0.7 | 0.3 | | | | | | | vprord ymm1, ymm5, 0x3
| 1 | 0.3 | 0.7 | | | | | | | vprord ymm4, ymm2, 0x3
| 1 | | | | | | 1.0 | | | vpxor ymm3, ymm3, ymm5
| 1 | | | | | | 1.0 | | | vpxor ymm0, ymm0, ymm2
| 1 | 0.4 | 0.3 | | | | 0.3 | | | vpxor ymm3, ymm1, ymm3
| 1 | 0.3 | 0.4 | | | | 0.3 | | | vpxor ymm4, ymm4, ymm0
| 1 | 0.7 | 0.3 | | | | | | | vprold ymm1, ymm3, 0x1
| 1 | 0.3 | 0.7 | | | | | | | vprold ymm5, ymm4, 0x1
| 1 | | | | | | 1.0 | | | vpxor ymm1, ymm1, ymm3
| 1 | 0.3 | 0.3 | | | | 0.4 | | | vpxor ymm5, ymm5, ymm4
| 1 | 0.7 | 0.3 | | | | | | | vprord ymm0, ymm1, 0xf
| 1 | 0.3 | 0.7 | | | | | | | vprold ymm2, ymm5, 0x6
| 1 | 0.7 | 0.3 | | | | | | | vprord ymm5, ymm5, 0xf
| 2^ | | | 1.0 1.0 | | | 1.0 | | | vpxor ymm4, ymm4, ymmword ptr [rax-0x60]
| 2^ | | 0.4 | | 1.0 1.0 | | 0.6 | | | vpxor ymm3, ymm3, ymmword ptr [rax-0x80]
| 1 | 0.3 | 0.3 | | | | 0.4 | | | vpxor ymm2, ymm2, ymm0
| 1 | 0.7 | 0.3 | | | | | | | vprold ymm0, ymm1, 0x7
| 1 | | 0.4 | | | | 0.6 | | | vpxor ymm3, ymm2, ymm3
| 1 | 0.3 | 0.3 | | | | 0.4 | | | vpxor ymm0, ymm0, ymm5
| 1 | 0.4 | 0.3 | | | | 0.3 | | | vpxor ymm0, ymm0, ymm4
| 1 | 0.3 | 0.4 | | | | 0.3 | | | vmovdqa64 xmm5, xmm3
| 1 | 0.3 | 0.3 | | | | 0.4 | | | vmovdqa64 xmm4, xmm0
| 1 | | | | | | 1.0 | | | vextracti64x2 xmm0, ymm0, 0x1
| 1 | 0.7 | 0.3 | | | | | | | vpand xmm2, xmm4, xmm5
| 1 | 0.3 | 0.7 | | | | | | | vpand xmm1, xmm0, xmm5
| 1 | | | | | | 1.0 | | | vextracti64x2 xmm3, ymm3, 0x1
| 1 | 0.7 | 0.3 | | | | | | | vpxor xmm3, xmm2, xmm3
| 1 | 0.3 | 0.7 | | | | | | | vpxor xmm1, xmm1, xmm4
| 1 | 0.4 | 0.3 | | | | 0.3 | | | vmovdqa64 xmm4, xmm1
| 1 | 0.3 | 0.4 | | | | 0.3 | | | vpand xmm1, xmm3, xmm1
| 1 | 0.3 | 0.3 | | | | 0.4 | | | vpxor xmm1, xmm1, xmm0
| 1 | 0.4 | 0.3 | | | | 0.3 | | | vpand xmm0, xmm0, xmm3
| 1 | | | | | | 1.0 | | | vinserti64x2 ymm4, ymm4, xmm1, 0x1
| 1 | 0.6 | 0.4 | | | | | | | vpxor xmm0, xmm0, xmm5
| 1 | 0.4 | 0.6 | | | | | | | vmovdqa64 xmm1, xmm3
| 1 | | | | | | 1.0 | | | vinserti64x2 ymm1, ymm1, xmm0, 0x1
| 1 | | | | | | 1.0 | | | vpshufd ymm2, ymm4, 0xbf
| 1 | | | | | | 1.0 | | | vpshufd ymm0, ymm4, 0x1
| 1 | | | | | | 1.0 | | | vpshufd ymm4, ymm4, 0x5a
| 1 | 0.6 | 0.4 | | | | | | | vpxor ymm4, ymm0, ymm4
| 1 | | | | | | 1.0 | | | vpshufd ymm3, ymm1, 0x5a
| 1 | | | | | | 1.0 | | | vpshufd ymm0, ymm1, 0x1
| 1 | | | | | | 1.0 | | | vpshufd ymm1, ymm1, 0xbf
| 2^ | 0.4 | 0.6 | 1.0 1.0 | | | | | | vpxor ymm2, ymm2, ymmword ptr [rax-0x40]
| 2^ | 0.6 | 0.4 | | 1.0 1.0 | | | | | vpxor ymm1, ymm1, ymmword ptr [rax-0x20]
| 1 | 0.4 | 0.6 | | | | | | | vpxor ymm0, ymm0, ymm3
| 1 | 0.6 | 0.4 | | | | | | | vpxor ymm2, ymm2, ymm4
| 1 | 0.4 | 0.6 | | | | | | | vpxor ymm0, ymm0, ymm1
| 1* | | | | | | | | | cmp rdx, rax
| 0*F | | | | | | | | | jnz 0xfffffffffffffe83
Total Num Of Uops: 76
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment