Benchmark visualization
Import Python libraries
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
Functions for importing computation time
def import_computation_times(
    filename: str,
) -> tuple[np.ndarray, dict[str, np.ndarray]]:
    """Read a YAML file of computation times and regroup it per category.

    The file is expected to contain a mapping whose keys are the x values and
    whose values are themselves mappings from a category name to the recorded
    value(s) for that x value.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        A tuple ``(x_values, y_values)``. ``x_values`` is the sorted array of
        top-level keys. ``y_values`` maps each category name to an array with
        one entry per x value in which that category occurs, in the same
        sorted order.
    """
    with open(filename) as stream:
        raw = yaml.safe_load(stream)
    sorted_keys = np.array(sorted(raw))
    # Collect the entries category by category while walking the x values in
    # sorted order, so that every resulting array follows the x axis.
    collected = defaultdict(list)
    for key in sorted_keys:
        for category, recorded in raw[key].items():
            collected[category].append(recorded)
    per_category = {name: np.array(rows) for name, rows in collected.items()}
    return sorted_keys, per_category
# Load the timings from the 1-CPU and the 8-CPU result files.
x_1cpu, Y_1CPU = import_computation_times("computation-times-1cpu.yaml")
x_8cpu, Y_8CPU = import_computation_times("computation-times-8cpu.yaml")
# Both files must share the same x values, so that a single array X can be
# used as the horizontal axis for every plot below.
np.testing.assert_allclose(x_1cpu, x_8cpu)
X = x_1cpu
# Remove the temporaries; only X, Y_1CPU, and Y_8CPU are used from here on.
del x_1cpu, x_8cpu
Show code cell source
# Summary of the mean "run 2" timings, shown both as a line plot and as a
# formatted table. Only every second x value is kept ([::2]).
parametrized = "parametrized"
substituted = "substituted"
columns = {("sample size", ""): X[::2]}
# Insertion order defines the column order: 1 CPU before 8 CPUs, and
# parametrized before substituted within each.
for cpu_label, cpu_timings in (("1 CPU", Y_1CPU), ("8 CPUs", Y_8CPU)):
    for method in (parametrized, substituted):
        mean_per_x = cpu_timings[f"{method}, run 2"].mean(axis=1)
        columns[(cpu_label, method)] = mean_per_x[::2]
df = pd.DataFrame(columns)
df = df.set_index("sample size")

fig, ax = plt.subplots(figsize=(10, 7))
ax.set_xlabel("Sample size")
ax.set_ylabel("Time (s)")
df.plot(ax=ax)
# Last expression of the cell: the styled table (3 significant digits for the
# values, thousands separators for the index).
df.style.format(formatter="{:.3g}").format_index(formatter="{:,}")
sample size | 1 CPU, parametrized | 1 CPU, substituted | 8 CPUs, parametrized | 8 CPUs, substituted |
---|---|---|---|---|
1 | 0.000192 | 3.01e-05 | 0.000162 | 2.78e-05 |
10 | 0.000172 | 0.000135 | 0.000157 | 0.000113 |
100 | 0.000278 | 0.000175 | 0.000329 | 0.000231 |
1,000 | 0.00153 | 0.00113 | 0.000607 | 0.00062 |
10,000 | 0.0137 | 0.0105 | 0.00208 | 0.00279 |
100,000 | 0.143 | 0.106 | 0.0168 | 0.0191 |
1,000,000 | 1.38 | 1.08 | 0.166 | 0.177 |
10,000,000 | 13.8 | 11 | 2.13 | 1.89 |
Show code cell source
def create_overall_plot(
    y_values: dict[str, np.ndarray],
    title: str,
    filename: str | None = None,
    max_x: float | None = None,
):
    """Plot mean computation times of the first and second run side by side.

    The left axes show the ``"..., run 1"`` categories on a linear y scale,
    the right axes show the ``"..., run 2"`` categories on a logarithmic y
    scale. Each axes contains one series for ``"parametrized"`` and one for
    ``"substituted"``, drawn as the mean over axis 1 with the standard
    deviation over axis 1 as error bar. The module-level array ``X`` is used
    as horizontal axis.

    Args:
        y_values: Mapping with the keys ``"parametrized, run 1"``,
            ``"substituted, run 1"``, ``"parametrized, run 2"``, and
            ``"substituted, run 2"``. Each array needs at least two
            dimensions, with the first one matching ``X``.
        title: Title shown above both axes.
        filename: Where to save the figure. The figure is only written to
            disk if a filename is given.
        max_x: If given, only points with ``X <= max_x`` are drawn.
    """
    plt.rc("font", size=12)
    fig, axes = plt.subplots(figsize=(8, 4), ncols=2, tight_layout=True)
    ax1, ax2 = axes
    fig.suptitle(title)
    ax1.set_title("First run")
    ax2.set_title("Second run (XLA cache)")
    ax1.set_ylabel("Computation time (s)")
    ax2.set_yscale("log")
    for ax in axes:
        ax.set_xlabel("Number of events")
        ax.set_xscale("log")
        ax.grid(axis="y")

    # Boolean mask over X: everything by default, or only the points up to
    # and including max_x.
    if max_x is None:
        selector = np.full(X.shape, True)
    else:
        selector = X <= max_x

    # Draw order (parametrized first, then substituted) determines which
    # color of the axes' color cycle each series gets.
    for ax, run in ((ax1, "run 1"), (ax2, "run 2")):
        for method in ("parametrized", "substituted"):
            timings = y_values[f"{method}, {run}"]
            ax.errorbar(
                X[selector],
                timings.mean(axis=1)[selector],
                yerr=timings.std(axis=1)[selector],
                label=method,
                fmt=".",
            )

    ax1.legend(loc="upper left")
    # Pin the lower bound of the linear axis at zero, keep the upper bound.
    ax1.set_ylim(0, ax1.get_ylim()[1])
    # The filename is optional: skip saving instead of passing None on to
    # savefig, which requires a real target.
    if filename is not None:
        plt.savefig(filename, transparent=True)
    plt.show()
# Overall plots for both CPU configurations over the full range of X ...
create_overall_plot(Y_1CPU, "1 CPU", filename="overall-1cpu.svg")
create_overall_plot(Y_8CPU, "8 CPUs", filename="overall-8cpu.svg")
# ... and once more restricted to the points with X <= 1e6.
create_overall_plot(Y_1CPU, "1 CPU", filename="overall-1cpu-max1e6.svg", max_x=1e6)
create_overall_plot(Y_8CPU, "8 CPUs", filename="overall-8cpu-max1e6.svg", max_x=1e6)
Show code cell source
def get_ratio(category: str, substract_base_time: bool) -> np.ndarray:
    """Return the element-wise ratio of the 1-CPU over the 8-CPU timings.

    Args:
        category: Key into ``Y_1CPU`` and ``Y_8CPU``. Its last seven
            characters are replaced by ``", warm-up"`` to find the matching
            warm-up entry (for example ``"parametrized, run 1"`` becomes
            ``"parametrized, warm-up"``).
        substract_base_time: If true, the mean of the warm-up entry is
            subtracted from the numerator and the denominator before
            dividing.

    Returns:
        The array ``Y_1CPU[category] / Y_8CPU[category]``, optionally with
        the respective mean warm-up time subtracted from both operands.
    """
    # All four entries are looked up up front, also when no base time is
    # subtracted, so a missing key surfaces regardless of the flag.
    suffix_length = 7  # number of trailing characters swapped for ", warm-up"
    warmup_key = f"{category[:-suffix_length]}, warm-up"
    single_cpu = Y_1CPU[category]
    single_cpu_warmup = Y_1CPU[warmup_key]
    multi_cpu = Y_8CPU[category]
    multi_cpu_warmup = Y_8CPU[warmup_key]
    if not substract_base_time:
        return single_cpu / multi_cpu
    numerator = single_cpu - single_cpu_warmup.mean()
    denominator = multi_cpu - multi_cpu_warmup.mean()
    return numerator / denominator
def create_multithreading_ratio_plot(substract_base_time: bool, savefig: bool = False):
    """Plot the 1-CPU over 8-CPU timing ratio for the first and second run.

    The left axes show the ``"run 1"`` categories, the right axes the
    ``"run 2"`` categories. Each contains a ``"parametrized"`` and a
    ``"substituted"`` series, computed with ``get_ratio`` and drawn as mean
    and standard deviation over axis 1 against the module-level ``X``. A
    dashed gray line marks a ratio of 8 on both axes.

    Args:
        substract_base_time: Forwarded to ``get_ratio``; also selects the
            figure title.
        savefig: If true, the figure is written to
            ``computation-times-ratio.svg``.
    """
    plt.rc("font", size=12)
    fig, (ax1, ax2) = plt.subplots(figsize=(8, 4), ncols=2, tight_layout=True)
    fig.suptitle(
        "1 CPU vs 8 CPUs (substracting base time)"
        if substract_base_time
        else "1 CPU vs 8 CPUs"
    )
    ax1.set_title("First run")
    ax2.set_title("Second run (XLA cache)")
    ax1.set_ylabel("1 CPU vs 8 CPUs")
    for ax in (ax1, ax2):
        ax.set_xlabel("Number of events")
        ax.set_xscale("log")
        ax.set_ylim(0, 10)
        ax.axhline(8, color="gray", linestyle="--")

    # Same draw order on both axes: parametrized first, then substituted.
    for ax, run in ((ax1, "run 1"), (ax2, "run 2")):
        for method in ("parametrized", "substituted"):
            ratio = get_ratio(f"{method}, {run}", substract_base_time)
            ax.errorbar(
                X,
                ratio.mean(axis=1),
                yerr=ratio.std(axis=1),
                label=method,
                fmt=".",
            )

    ax1.legend(loc="upper left")
    # Keep the lower bound at zero and retain the current upper bound.
    ax1.set_ylim(0, ax1.get_ylim()[1])
    if savefig:
        plt.savefig("computation-times-ratio.svg", transparent=True)
    plt.show()
# Ratio without base-time subtraction; this is the variant that is saved to disk.
create_multithreading_ratio_plot(substract_base_time=False, savefig=True)
# Ratio with the mean warm-up time subtracted; shown only, not saved.
create_multithreading_ratio_plot(substract_base_time=True)
Show code cell source
def create_substitution_ratio_plot(savefig: bool = False):
    """Plot the substituted-to-parametrized timing ratio in percent.

    The plotted quantity is ``100 * substituted / parametrized``, drawn as
    mean and standard deviation over axis 1 against the module-level ``X``.
    The left axes show the ``"run 1"`` categories and the right axes the
    ``"run 2"`` categories; each contains one series for ``Y_1CPU`` and one
    for ``Y_8CPU``. The axes titles therefore name the run, while the legend
    names the CPU configuration.

    Args:
        savefig: If true, the figure is written to
            ``computation-times-substitution-ratio.svg``.
    """
    plt.rc("font", size=12)
    _, axes = plt.subplots(figsize=(8, 4), ncols=2, tight_layout=True)
    ax1, ax2 = axes
    # The axes are split by run, not by CPU count: title them accordingly so
    # that they do not contradict the "1 CPU" / "8 CPUs" legend entries.
    ax1.set_title("First run")
    ax2.set_title("Second run (XLA cache)")
    ax1.set_ylabel("parametrized vs substituted (%)")
    for ax in axes:
        ax.set_xlabel("Number of events")
        ax.set_xscale("log")
        ax.grid(axis="y")

    # Same draw order on both axes: 1 CPU first, then 8 CPUs.
    for ax, run in ((ax1, "run 1"), (ax2, "run 2")):
        for cpu_label, timings in (("1 CPU", Y_1CPU), ("8 CPUs", Y_8CPU)):
            percentage = (
                100 * timings[f"substituted, {run}"] / timings[f"parametrized, {run}"]
            )
            ax.errorbar(
                X,
                percentage.mean(axis=1),
                yerr=percentage.std(axis=1),
                label=cpu_label,
                fmt=".",
            )

    ax1.legend(loc="upper left")
    # Pin the lower bound of the left axes at zero, keep the upper bound.
    ax1.set_ylim(0, ax1.get_ylim()[1])
    if savefig:
        plt.savefig("computation-times-substitution-ratio.svg", transparent=True)
    plt.show()
# Show the substitution ratio plot and save it to disk.
create_substitution_ratio_plot(savefig=True)
Show code cell source
def create_linear_check_plot():
    """Plot the first-run timings on linear axes for both CPU configurations.

    The left axes show ``"parametrized, run 1"`` and the right axes
    ``"substituted, run 1"``. Each contains one series for ``Y_1CPU`` and one
    for ``Y_8CPU``, drawn as mean and standard deviation over axis 1 against
    the module-level ``X``, with a shared y axis. The figure is always
    written to ``computation-times-linear.svg``.
    """
    plt.rc("font", size=12)
    _, (ax1, ax2) = plt.subplots(
        figsize=(8, 4),
        ncols=2,
        sharey=True,
        tight_layout=True,
    )
    ax1.set_title("Parametrized")
    ax2.set_title("Substituted analytically")
    # NOTE(review): the label mentions a base time, but the values drawn
    # below are taken as-is -- confirm which of the two is intended.
    ax1.set_ylabel("Computation times - base time (s)")
    for ax in (ax1, ax2):
        ax.set_xlabel("Number of events")
        ax.grid(axis="y")

    panels = (
        (ax1, "parametrized, run 1"),
        (ax2, "substituted, run 1"),
    )
    datasets = (
        ("1 CPU", Y_1CPU),
        ("8 CPUs", Y_8CPU),
    )
    # Same draw order on both axes: 1 CPU first, then 8 CPUs.
    for ax, category in panels:
        for cpu_label, timings in datasets:
            values = timings[category]
            ax.errorbar(
                X,
                values.mean(axis=1),
                yerr=values.std(axis=1),
                label=cpu_label,
                fmt="o--",
            )

    ax1.legend(loc="upper left")
    plt.savefig("computation-times-linear.svg", transparent=True)
    plt.show()
# Show the linear-axes plot; the function itself saves it to disk.
create_linear_check_plot()