Benchmark visualization
Import Python libraries
Hide code cell source
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
Functions for importing computation time
Hide code cell source
def import_computation_times(
    filename: str,
) -> tuple[np.ndarray, dict[str, np.ndarray]]:
    """Load benchmark timings from a YAML file.

    The top level of the YAML document is read as a mapping whose keys are
    sorted to form the x-values. Each value is itself a mapping from a
    category name to a timing entry; the entries of each category are
    collected in the order of the sorted x-values.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        A tuple ``(x_values, y_values)`` where ``x_values`` is the sorted
        array of top-level keys and ``y_values`` maps each category name to
        an array with one entry per x-value in which that category occurs.
    """
    # Explicit encoding so that parsing does not depend on the platform locale.
    with open(filename, encoding="utf-8") as f:
        imported_data = yaml.safe_load(f)
    sorted_keys = sorted(imported_data)
    collected = defaultdict(list)
    # Index with the native keys rather than with numpy scalars.
    for key in sorted_keys:
        for category, timings in imported_data[key].items():
            collected[category].append(timings)
    # NOTE(review): a category missing for some x-value yields a shorter
    # array that no longer lines up with x_values; the input is presumably
    # complete, but this is not checked here.
    y_values = {category: np.array(rows) for category, rows in collected.items()}
    return np.array(sorted_keys), y_values
# Load both benchmark data sets. They must share the same sample sizes so
# that a single module-level X can serve as the x-axis for every plot.
x_single, Y_1CPU = import_computation_times("computation-times-1cpu.yaml")
x_multi, Y_8CPU = import_computation_times("computation-times-8cpu.yaml")
np.testing.assert_allclose(x_single, x_multi)
X = x_single
# Drop the temporaries; only X is meant to be used from here on.
del x_single, x_multi
Show code cell source
Hide code cell source
parametrized = "parametrized"
substituted = "substituted"

# Summary table of the mean second-run timings, keeping every other sample
# size. Tuple keys give the frame two-level column labels.
df = pd.DataFrame(
    {("sample size", ""): X[::2]}
    | {
        (cpu_label, variant): timings[f"{variant}, run 2"].mean(axis=1)[::2]
        for cpu_label, timings in (("1 CPU", Y_1CPU), ("8 CPUs", Y_8CPU))
        for variant in (parametrized, substituted)
    },
).set_index("sample size")

fig, ax = plt.subplots(figsize=(10, 7))
# Axis labels are set before plotting, exactly as before.
ax.set(xlabel="Sample size", ylabel="Time (s)")
df.plot(ax=ax)
# The last expression of the cell is what the notebook renders as a table.
df.style.format(formatter="{:.3g}").format_index(formatter="{:,}")
| sample size | 1 CPU, parametrized | 1 CPU, substituted | 8 CPUs, parametrized | 8 CPUs, substituted |
|---|---|---|---|---|
| 1 | 0.000192 | 3.01e-05 | 0.000162 | 2.78e-05 |
| 10 | 0.000172 | 0.000135 | 0.000157 | 0.000113 |
| 100 | 0.000278 | 0.000175 | 0.000329 | 0.000231 |
| 1,000 | 0.00153 | 0.00113 | 0.000607 | 0.00062 |
| 10,000 | 0.0137 | 0.0105 | 0.00208 | 0.00279 |
| 100,000 | 0.143 | 0.106 | 0.0168 | 0.0191 |
| 1,000,000 | 1.38 | 1.08 | 0.166 | 0.177 |
| 10,000,000 | 13.8 | 11 | 2.13 | 1.89 |
Show code cell source
Hide code cell source
def create_overall_plot(
    y_values: dict[str, np.ndarray],
    title: str,
    filename: str | None = None,
    max_x: int | None = None,
):
    """Plot mean computation times of both runs against the sample size.

    The left panel shows the "run 1" categories on a linear y-axis, the
    right panel the "run 2" categories on a logarithmic y-axis. Each point
    is the mean along axis 1 of the category's array, with the standard
    deviation along the same axis as error bar. The x-values come from the
    module-level ``X``.

    Args:
        y_values: Mapping with the keys ``"parametrized, run 1"``,
            ``"substituted, run 1"``, ``"parametrized, run 2"`` and
            ``"substituted, run 2"``. Axis 0 of each array has to line up
            with ``X``.
        title: Figure title.
        filename: Where to save the figure. Nothing is written when this is
            ``None``.
        max_x: If given, only points with ``X <= max_x`` are drawn.
    """
    plt.rc("font", size=12)
    fig, axes = plt.subplots(figsize=(8, 4), ncols=2, tight_layout=True)
    ax1, ax2 = axes
    fig.suptitle(title)
    ax1.set_title("First run")
    ax2.set_title("Second run (XLA cache)")
    ax1.set_ylabel("Computation time (s)")
    ax2.set_yscale("log")
    for ax in axes:
        ax.set_xlabel("Number of events")
        ax.set_xscale("log")
        ax.grid(axis="y")

    # Boolean mask over X; selects everything unless an upper bound is given.
    if max_x is None:
        selector = np.full(X.shape, True)
    else:
        selector = X <= max_x

    # Same drawing order as the hand-written calls (parametrized first), so
    # the colour cycle assigns the same colours in both panels.
    for ax, run in ((ax1, 1), (ax2, 2)):
        for variant in ("parametrized", "substituted"):
            timings = y_values[f"{variant}, run {run}"]
            ax.errorbar(
                X[selector],
                timings.mean(axis=1)[selector],
                yerr=timings.std(axis=1)[selector],
                label=variant,
                fmt=".",
            )

    ax1.legend(loc="upper left")
    # Anchor the linear axis at zero while keeping the automatic upper limit.
    ax1.set_ylim(0, ax1.get_ylim()[1])
    # filename is optional: savefig cannot write to None, so skip it then.
    if filename is not None:
        plt.savefig(filename, transparent=True)
    plt.show()
# Full range of sample sizes.
create_overall_plot(y_values=Y_1CPU, title="1 CPU", filename="overall-1cpu.svg")
create_overall_plot(y_values=Y_8CPU, title="8 CPUs", filename="overall-8cpu.svg")
# Same figures restricted to sample sizes of at most 1e6.
create_overall_plot(
    y_values=Y_1CPU,
    title="1 CPU",
    filename="overall-1cpu-max1e6.svg",
    max_x=1e6,
)
create_overall_plot(
    y_values=Y_8CPU,
    title="8 CPUs",
    filename="overall-8cpu-max1e6.svg",
    max_x=1e6,
)
Show code cell source
Hide code cell source
def get_ratio(category: str, substract_base_time: bool) -> np.ndarray:
    """Return the element-wise ratio of 1-CPU over 8-CPU timings.

    Reads the module-level ``Y_1CPU`` and ``Y_8CPU`` mappings.

    Args:
        category: Key of the form ``"<name>, run <n>"``, for example
            ``"parametrized, run 1"``.
        substract_base_time: If true, the mean of the ``"<name>, warm-up"``
            entry of each data set is subtracted from that data set's
            timings before dividing.

    Returns:
        Array with the same shape as the timing arrays of ``category``.
    """
    y_1cpu = Y_1CPU[category]
    y_8cpu = Y_8CPU[category]
    if not substract_base_time:
        return y_1cpu / y_8cpu
    # Strip the trailing ", run <n>" by splitting on the last separator
    # instead of slicing off a fixed number of characters, so multi-digit
    # run numbers work as well.
    name = category.rsplit(", ", maxsplit=1)[0]
    warmup_category = f"{name}, warm-up"
    # The warm-up entries are only looked up when they are actually needed.
    base_1cpu = Y_1CPU[warmup_category].mean()
    base_8cpu = Y_8CPU[warmup_category].mean()
    return (y_1cpu - base_1cpu) / (y_8cpu - base_8cpu)
def create_multithreading_ratio_plot(substract_base_time: bool, savefig: bool = False):
    """Plot the 1-CPU over 8-CPU timing ratio against the sample size.

    The left panel shows the "run 1" categories, the right panel the
    "run 2" categories, each with a "parametrized" and a "substituted"
    series. Points are the mean along axis 1 of the array returned by
    ``get_ratio``, with the standard deviation along that axis as error
    bar. The x-values come from the module-level ``X``.

    Args:
        substract_base_time: Forwarded to ``get_ratio``; also selects the
            figure title.
        savefig: If true, write the figure to
            ``computation-times-ratio.svg``.
    """
    plt.rc("font", size=12)
    fig, axes = plt.subplots(figsize=(8, 4), ncols=2, tight_layout=True)
    ax1, ax2 = axes
    if substract_base_time:
        fig.suptitle("1 CPU vs 8 CPUs (subtracting base time)")
    else:
        fig.suptitle("1 CPU vs 8 CPUs")
    ax1.set_title("First run")
    ax2.set_title("Second run (XLA cache)")
    ax1.set_ylabel("1 CPU vs 8 CPUs")
    for ax in axes:
        ax.set_xlabel("Number of events")
        ax.set_xscale("log")
        ax.set_ylim(0, 10)
        # Reference line at a ratio of 8 (presumably the ideal speed-up on
        # 8 CPUs).
        ax.axhline(8, color="gray", linestyle="--")

    # Same drawing order as the hand-written calls (parametrized first), so
    # the colour cycle assigns the same colours in both panels.
    for ax, run in ((ax1, 1), (ax2, 2)):
        for variant in ("parametrized", "substituted"):
            ratio = get_ratio(f"{variant}, run {run}", substract_base_time)
            ax.errorbar(
                X,
                ratio.mean(axis=1),
                yerr=ratio.std(axis=1),
                label=variant,
                fmt=".",
            )

    ax1.legend(loc="upper left")
    ax1.set_ylim(0, ax1.get_ylim()[1])
    if savefig:
        # NOTE(review): the file name does not depend on substract_base_time,
        # so saving both variants would overwrite the same file.
        plt.savefig("computation-times-ratio.svg", transparent=True)
    plt.show()
# Plain ratio: this is the variant that is written to disk.
create_multithreading_ratio_plot(savefig=True, substract_base_time=False)
# Ratio after subtracting the warm-up time: displayed only, not saved.
create_multithreading_ratio_plot(savefig=False, substract_base_time=True)
Show code cell source
Hide code cell source
def create_substitution_ratio_plot(savefig: bool = False):
    """Plot substituted timings as a percentage of parametrized timings.

    The left panel uses the "run 1" categories and the right panel the
    "run 2" categories; each panel has one series for ``Y_1CPU`` and one
    for ``Y_8CPU``. Points are the mean along axis 1 of
    ``100 * substituted / parametrized``, with the standard deviation along
    that axis as error bar. The x-values come from the module-level ``X``.

    Args:
        savefig: If true, write the figure to
            ``computation-times-substitution-ratio.svg``.
    """
    plt.rc("font", size=12)
    _, axes = plt.subplots(figsize=(8, 4), ncols=2, tight_layout=True)
    ax1, ax2 = axes
    # The panels are split by run and the CPU count is the legend entry, so
    # the titles name the run, using the same wording as the other figures.
    ax1.set_title("First run")
    ax2.set_title("Second run (XLA cache)")
    ax1.set_ylabel("parametrized vs substituted (%)")
    for ax in axes:
        ax.set_xlabel("Number of events")
        ax.set_xscale("log")
        ax.grid(axis="y")

    # Same drawing order as the hand-written calls (1 CPU first), so the
    # colour cycle assigns the same colours in both panels.
    for ax, run in ((ax1, 1), (ax2, 2)):
        for label, timings in (("1 CPU", Y_1CPU), ("8 CPUs", Y_8CPU)):
            percentage = (
                100 * timings[f"substituted, run {run}"] / timings[f"parametrized, run {run}"]
            )
            ax.errorbar(
                X,
                percentage.mean(axis=1),
                yerr=percentage.std(axis=1),
                label=label,
                fmt=".",
            )

    ax1.legend(loc="upper left")
    # Anchor the left axis at zero while keeping the automatic upper limit.
    ax1.set_ylim(0, ax1.get_ylim()[1])
    if savefig:
        plt.savefig("computation-times-substitution-ratio.svg", transparent=True)
    plt.show()
# Render the substitution-ratio figure and also write it to disk.
create_substitution_ratio_plot(savefig=True)
Show code cell source
Hide code cell source
def create_linear_check_plot():
    """Plot first-run timings against the sample size on linear axes.

    The left panel shows ``"parametrized, run 1"`` and the right panel
    ``"substituted, run 1"``, each with one series for ``Y_1CPU`` and one
    for ``Y_8CPU``. Points are the mean along axis 1 with the standard
    deviation along that axis as error bar; the x-values come from the
    module-level ``X``. The figure is always written to
    ``computation-times-linear.svg``.
    """
    plt.rc("font", size=12)
    _, (left, right) = plt.subplots(
        figsize=(8, 4),
        ncols=2,
        sharey=True,
        tight_layout=True,
    )
    left.set_title("Parametrized")
    right.set_title("Substituted analytically")
    # NOTE(review): the label mentions a base time, but the timings below are
    # plotted as-is, without subtracting anything -- confirm which is intended.
    left.set_ylabel("Computation times - base time (s)")
    for panel in (left, right):
        panel.set_xlabel("Number of events")
        panel.grid(axis="y")

    panels = (
        (left, "parametrized, run 1"),
        (right, "substituted, run 1"),
    )
    datasets = (
        ("1 CPU", Y_1CPU),
        ("8 CPUs", Y_8CPU),
    )
    # 1 CPU is drawn before 8 CPUs in each panel, keeping the colour order.
    for panel, category in panels:
        for cpu_label, timings in datasets:
            samples = timings[category]
            panel.errorbar(
                X,
                samples.mean(axis=1),
                yerr=samples.std(axis=1),
                label=cpu_label,
                fmt="o--",
            )

    left.legend(loc="upper left")
    plt.savefig("computation-times-linear.svg", transparent=True)
    plt.show()
# Render the linear-axes figure; the function itself saves the SVG.
create_linear_check_plot()