Commit d56d4a8a authored by Gaëtan Cassiers's avatar Gaëtan Cassiers
Browse files

V2 perf

parent cbb42186
......@@ -10,7 +10,9 @@ The code was compiled with gcc 8.2 for the following targets:
* haswell
* skylake-avx512
## Intel IACA
## Spook V1
### Intel IACA
Below are the estimates of cycle counts for various primitive implementations given by the [IACA 3.0](https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) tool.
......@@ -31,7 +33,7 @@ Shadow512:
|shadow_32bit| | | |
|shadow_512bit| | |192.00|
## Benchmark
### Benchmark
The benchmark was run on an Intel(R) Xeon(R) Gold 6128 CPU @ 3.40GHz.
......@@ -81,3 +83,40 @@ Spook128su512v1 (Cycles per byte), throughput for m=2048 bytes:
|clyde_64bit-shadow_32bit| |15.23|11.79|
|clyde_64bit-shadow_512bit| | |15.36|
## Spook V2
Currently, only the 128-bit and 32-bit implementations of Shadow have been written (while
Clyde is unchanged), leading to the following results:
Shadow512v2 (Cycles), IACA
| |x86-64|haswell|skylake-avx512|
|-|-|-|-|
|shadow_128bit|414.00|396.00|312.00|
Shadow512v2 (Cycles), Benchmark
| |x86-64|haswell|skylake-avx512|
|-|-|-|-|
|shadow_32bit|958.80|831.20|830.80|
|shadow_128bit|473.60|444.20|362.40|
Spook128su512v1 (Cycles per byte), max throughput:
| |x86-64|haswell|skylake-avx512|
|-|-|-|-|
|clyde_32bit-shadow_128bit|14.77|14.58|11.86|
|clyde_32bit-shadow_32bit|30.38|26.74|26.76|
|clyde_64bit-shadow_128bit| |14.58|11.88|
|clyde_64bit-shadow_32bit| |26.72|26.90|
Spook128su512v1 (Cycles per byte), throughput for m=2048 bytes:
| |x86-64|haswell|skylake-avx512|
|-|-|-|-|
|clyde_32bit-shadow_128bit|15.28|15.09|12.33|
|clyde_32bit-shadow_32bit|31.24|27.36|27.40|
|clyde_64bit-shadow_128bit| |15.08|12.37|
|clyde_64bit-shadow_32bit| |27.42|27.60|
......@@ -14,7 +14,10 @@ def parse_spook_id(s):
return ((clyde, shadow), '-'.join((clyde_f, shadow_f)), arch)
def fmt_cycles(cycles):
return '{:.2f}'.format(cycles)
try:
return '{:.2f}'.format(cycles)
except ValueError:
return cycles
def parse_line(s):
try:
......@@ -25,7 +28,7 @@ def parse_line(s):
val = None
else:
n_bytes, _, _, ns_iter, _, ns_byte = res.strip().split(' ')
throughput = fmt_cycles(PROC_FREQ*float(ns_byte))
throughput = PROC_FREQ*float(ns_byte)
val = (int(n_bytes), throughput)
_, implem, arch = parse_spook_id(spook_id.strip())
return ((implem, arch), val)
......@@ -40,17 +43,18 @@ for (implem, arch), val in map(parse_line, open(RES_FILE).read().splitlines()):
if val is not None:
results.setdefault((implem, arch), dict())[val[0]] = val[1]
print('results', results)
implems = list(sorted(set(implem for implem, _ in results.keys())))
archs = ['x86-64', 'haswell', 'skylake-avx512']
max_throughput_table = [
[str(min(results.get((implem, arch), dict()).values(), default=' ')) for arch in archs]
[fmt_cycles(min(results.get((implem, arch), dict()).values(), default=' ')) for arch in archs]
for implem in implems]
def throughput_bytes(n):
return [[str(results.get((implem, arch), dict()).get(n, ' ')) for arch in archs] for implem in implems]
return [[fmt_cycles(results.get((implem, arch), dict()).get(n, ' ')) for arch in archs] for implem in implems]
print(
'max throughput (cycles/byte):\n\n',
......
......@@ -44,6 +44,7 @@
(SHADOW_NBYTES / (LS_ROWS* LS_ROW_BYTES)) // Bundles in the mLS design
typedef uint32_t row_set __attribute__ ((vector_size (16)));
typedef int32_t srow_set __attribute__ ((vector_size (16)));
typedef struct __attribute__((aligned(64))) shadow_simd {
row_set rows[4];
......@@ -118,8 +119,8 @@ static void transpose_state(shadow_simd *simd) {
*/
}
static row_set xtime(row_set x) {
row_set b = x >> 31;
return (x << 1) ^ b ^ (b << 8);
row_set b = (row_set) (((srow_set) x) >> 31);
return (x << 1) ^ (b & 0x101);
}
static void dbox_mls_layer_simd(shadow_simd *simd) {
#if SMALL_PERM==0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment