7.5. Benchmarking#

Tip

This notebook benchmarks JAX on a single CPU core. Compare with Julia results as reported in ComPWA/polarimetry#27. See also the Extended benchmark #68 discussion.

Note

This notebook uses only one run and one loop for %timeit, because JAX seems to cache its return values.

Hide code cell source

import logging
from collections import defaultdict

import attrs
import numpy as np
import pandas as pd
import sympy as sp
from ampform_dpd.io import cached
from IPython.display import Markdown
from psutil import cpu_count

from polarimetry import formulate_polarimetry
from polarimetry.data import (
    create_data_transformer,
    generate_meshgrid_sample,
    generate_phasespace_sample,
)
from polarimetry.io import mute_jax_warnings
from polarimetry.lhcb import load_model_builder, load_model_parameters
from polarimetry.lhcb.particle import load_particles

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.ERROR)
mute_jax_warnings()

model_choice = 0
model_file = "../../data/model-definitions.yaml"
particles = load_particles("../../data/particle-definitions.yaml")
amplitude_builder = load_model_builder(model_file, particles, model_choice)
imported_parameter_values = load_model_parameters(
    model_file, amplitude_builder.decay, model_choice, particles
)
reference_subsystem = 1
model = amplitude_builder.formulate(reference_subsystem)
model = attrs.evolve(
    model,
    parameter_defaults={**model.parameter_defaults, **imported_parameter_values},
)

timing_parametrized = defaultdict(dict)
timing_substituted = defaultdict(dict)

print("Physical cores:", cpu_count(logical=False))
print("Total cores:", cpu_count(logical=True))
Physical cores: 2
Total cores: 4
%%time
polarimetry_exprs = formulate_polarimetry(amplitude_builder, reference_subsystem)
unfolded_polarimetry_exprs = [
    cached.unfold(expr, model.amplitudes) for expr in polarimetry_exprs
]
unfolded_intensity_expr = cached.unfold(model)
CPU times: user 5.05 s, sys: 15.9 ms, total: 5.07 s
Wall time: 5.06 s

7.5.1. DataTransformer performance#

n_events = 100_000
phsp_sample = generate_phasespace_sample(model.decay, n_events, seed=0)
transformer = create_data_transformer(model)
%timeit -n1 -r1 transformer(phsp_sample)  # first run, so no cache and JIT-compilation
%timeit -n1 -r1 transformer(phsp_sample)  # second run with cache
%timeit -n1 -r1 transformer(phsp_sample)  # third run with cache
phsp_sample = transformer(phsp_sample)
random_point = {k: v[0] if len(v.shape) > 0 else v for k, v in phsp_sample.items()}
322 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
969 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1.16 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
res = 54
grid_sample = generate_meshgrid_sample(model.decay, res)
%timeit -n1 -r1 transformer(grid_sample)  # first run, without cache, but already compiled
%timeit -n1 -r1 transformer(grid_sample)  # second run with cache
%timeit -n1 -r1 transformer(grid_sample)  # third run with cache
grid_sample = transformer(grid_sample)
346 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
452 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
191 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

7.5.2. Parametrized function#

Total number of mathematical operations:

  • \(\alpha_x\): 86,766

  • \(\alpha_y\): 86,770

  • \(\alpha_z\): 86,766

  • \(I_\mathrm{tot}\): 28,430

%%time
parametrized_polarimetry_funcs = [
    cached.lambdify(expr, model.parameter_defaults)
    for expr in unfolded_polarimetry_exprs
]
parametrized_intensity_func = cached.lambdify(
    unfolded_intensity_expr, model.parameter_defaults
)
CPU times: user 5.48 s, sys: 46.9 ms, total: 5.52 s
Wall time: 5.52 s
rng = np.random.default_rng(seed=0)
original_parameters = dict(parametrized_intensity_func.parameters)
modified_parameters = {
    k: rng.uniform(0.9, 1.1) * v
    for k, v in parametrized_intensity_func.parameters.items()
}

7.5.2.1. One data point#

7.5.2.1.1. JIT-compilation#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(random_point)
<TimeitResult : 956 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](random_point)
array = parametrized_polarimetry_funcs[1](random_point)
array = parametrized_polarimetry_funcs[2](random_point)
<TimeitResult : 4.26 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.1.2. Compiled performance#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(random_point)
<TimeitResult : 729 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](random_point)
array = parametrized_polarimetry_funcs[1](random_point)
array = parametrized_polarimetry_funcs[2](random_point)
<TimeitResult : 1.19 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.2. 54x54 grid sample#

7.5.2.2.1. Compiled but uncached#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(grid_sample)
<TimeitResult : 8.48 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](grid_sample)
array = parametrized_polarimetry_funcs[1](grid_sample)
array = parametrized_polarimetry_funcs[2](grid_sample)
<TimeitResult : 59.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.2.2. Second run with cache#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(grid_sample)
<TimeitResult : 840 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](grid_sample)
array = parametrized_polarimetry_funcs[1](grid_sample)
array = parametrized_polarimetry_funcs[2](grid_sample)
<TimeitResult : 1.42 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.3. 100.000 event phase space sample#

7.5.2.3.1. Compiled but uncached#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(phsp_sample)
<TimeitResult : 2.71 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](phsp_sample)
array = parametrized_polarimetry_funcs[1](phsp_sample)
array = parametrized_polarimetry_funcs[2](phsp_sample)
<TimeitResult : 17.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.3.2. Second run with cache#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(phsp_sample)
<TimeitResult : 920 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](phsp_sample)
array = parametrized_polarimetry_funcs[1](phsp_sample)
array = parametrized_polarimetry_funcs[2](phsp_sample)
<TimeitResult : 2.11 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.4. Recompilation after parameter modification#

parametrized_intensity_func.update_parameters(modified_parameters)
for func in parametrized_polarimetry_funcs:
    func.update_parameters(modified_parameters)

7.5.2.4.1. Compiled but uncached#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(phsp_sample)
<TimeitResult : 2.83 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](phsp_sample)
array = parametrized_polarimetry_funcs[1](phsp_sample)
array = parametrized_polarimetry_funcs[2](phsp_sample)
<TimeitResult : 16.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.4.2. Second run with cache#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(phsp_sample)
<TimeitResult : 960 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](phsp_sample)
array = parametrized_polarimetry_funcs[1](phsp_sample)
array = parametrized_polarimetry_funcs[2](phsp_sample)
<TimeitResult : 2.36 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
parametrized_intensity_func.update_parameters(original_parameters)
for func in parametrized_polarimetry_funcs:
    func.update_parameters(original_parameters)

7.5.3. All parameters substituted#

subs_polarimetry_exprs = [
    cached.xreplace(expr, model.parameter_defaults)
    for expr in unfolded_polarimetry_exprs
]
subs_intensity_expr = cached.xreplace(unfolded_intensity_expr, model.parameter_defaults)

Number of mathematical operations after substituting all parameters:

  • \(\alpha_x\): 31,488

  • \(\alpha_y\): 31,492

  • \(\alpha_z\): 31,488

  • \(I_\mathrm{tot}\): 10,360

%%time
polarimetry_funcs = [cached.lambdify(expr) for expr in subs_polarimetry_exprs]
intensity_func = cached.lambdify(subs_intensity_expr)
CPU times: user 2.82 s, sys: 16 ms, total: 2.83 s
Wall time: 2.83 s

7.5.3.1. One data point#

7.5.3.1.1. JIT-compilation#

%%timeit -n1 -r1 -q -o
array = intensity_func(random_point)
<TimeitResult : 554 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](random_point)
array = polarimetry_funcs[1](random_point)
array = polarimetry_funcs[2](random_point)
<TimeitResult : 2.55 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.3.1.2. Compiled performance#

%%timeit -n1 -r1 -q -o
array = intensity_func(random_point)
<TimeitResult : 156 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](random_point)
array = polarimetry_funcs[1](random_point)
array = polarimetry_funcs[2](random_point)
<TimeitResult : 378 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.3.2. 54x54 grid sample#

7.5.3.2.1. Compiled but uncached#

%%timeit -n1 -r1 -q -o
array = intensity_func(grid_sample)
<TimeitResult : 6.28 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](grid_sample)
array = polarimetry_funcs[1](grid_sample)
array = polarimetry_funcs[2](grid_sample)
<TimeitResult : 45 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.3.2.2. Second run with cache#

%%timeit -n1 -r1 -q -o
array = intensity_func(grid_sample)
<TimeitResult : 188 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](grid_sample)
array = polarimetry_funcs[1](grid_sample)
array = polarimetry_funcs[2](grid_sample)
<TimeitResult : 333 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.3.3. 100.000 event phase space sample#

7.5.3.3.1. Compiled but uncached#

%%timeit -n1 -r1 -q -o
array = intensity_func(phsp_sample)
<TimeitResult : 1.87 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](phsp_sample)
array = polarimetry_funcs[1](phsp_sample)
array = polarimetry_funcs[2](phsp_sample)
<TimeitResult : 12.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.3.3.2. Second run with cache#

%%timeit -n1 -r1 -q -o
array = intensity_func(phsp_sample)
<TimeitResult : 156 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](phsp_sample)
array = polarimetry_funcs[1](phsp_sample)
array = polarimetry_funcs[2](phsp_sample)
<TimeitResult : 414 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.4. Summary#

Hide code cell source

def collect_sorted_row_title() -> list[str]:
    row_titles = {}
    row_titles.update(timing_parametrized["intensity"])
    row_titles.update(timing_parametrized["polarimetry"])
    row_titles.update(timing_substituted["intensity"])
    row_titles.update(timing_substituted["polarimetry"])
    return list(row_titles)


def remove_loop_info(timing) -> str:
    if timing is None:
        return ""
    pattern = " ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)"
    return str(timing).replace(pattern, "")


row_titles = collect_sorted_row_title()
values = [
    (
        remove_loop_info(timing_parametrized["intensity"].get(row)),
        remove_loop_info(timing_parametrized["polarimetry"].get(row)),
        remove_loop_info(timing_substituted["intensity"].get(row)),
        remove_loop_info(timing_substituted["polarimetry"].get(row)),
    )
    for row in row_titles
]
columns = pd.MultiIndex.from_tuples(
    [
        ("parametrized", "I"),
        ("parametrized", "ɑ"),
        ("substituted", "I"),
        ("substituted", "ɑ"),
    ],
)
df = pd.DataFrame(values, index=row_titles, columns=columns)
df.style.set_table_styles([
    dict(selector="th", props=[("text-align", "left")]),
    dict(selector="td", props=[("text-align", "left")]),
])
  parametrized substituted
  I ɑ I ɑ
random point (compilation) 956 ms 4.26 s 554 ms 2.55 s
random point (cached) 729 μs 1.19 ms 156 μs 378 μs
54x54 grid 8.48 s 59.4 s 6.28 s 45 s
54x54 grid (cached) 840 μs 1.42 ms 188 μs 333 μs
100,000 phsp 2.71 s 17.1 s 1.87 s 12.4 s
100,000 phsp (cached) 920 μs 2.11 ms 156 μs 414 μs
modified 100,000 phsp 2.83 s 16.9 s
modified 100,000 phsp (cached) 960 μs 2.36 ms