7.5. Benchmarking#

Tip

This notebook benchmarks JAX on a single CPU core. Compare with Julia results as reported in ComPWA/polarimetry#27. See also the Extended benchmark #68 discussion.

Note

This notebook uses only one run and one loop for %timeit, because JAX seems to cache its return values.

Hide code cell source

import logging
from collections import defaultdict

import attrs
import numpy as np
import pandas as pd
import sympy as sp
from ampform_dpd.io import cached
from IPython.display import Markdown
from psutil import cpu_count

from polarimetry import formulate_polarimetry
from polarimetry.data import (
    create_data_transformer,
    generate_meshgrid_sample,
    generate_phasespace_sample,
)
from polarimetry.io import mute_jax_warnings
from polarimetry.lhcb import load_model_builder, load_model_parameters
from polarimetry.lhcb.particle import load_particles

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.ERROR)
mute_jax_warnings()

model_choice = 0
model_file = "../../data/model-definitions.yaml"
particles = load_particles("../../data/particle-definitions.yaml")
amplitude_builder = load_model_builder(model_file, particles, model_choice)
imported_parameter_values = load_model_parameters(
    model_file, amplitude_builder.decay, model_choice, particles
)
reference_subsystem = 1
model = amplitude_builder.formulate(reference_subsystem)
model = attrs.evolve(
    model,
    parameter_defaults={**model.parameter_defaults, **imported_parameter_values},
)

timing_parametrized = defaultdict(dict)
timing_substituted = defaultdict(dict)

print("Physical cores:", cpu_count(logical=False))
print("Total cores:", cpu_count(logical=True))
Physical cores: 2
Total cores: 4
%%time
polarimetry_exprs = formulate_polarimetry(amplitude_builder, reference_subsystem)
unfolded_polarimetry_exprs = [
    cached.unfold(expr, model.amplitudes) for expr in polarimetry_exprs
]
unfolded_intensity_expr = cached.unfold(model)
CPU times: user 4.85 s, sys: 12.7 ms, total: 4.86 s
Wall time: 4.86 s

7.5.1. DataTransformer performance#

n_events = 100_000
phsp_sample = generate_phasespace_sample(model.decay, n_events, seed=0)
transformer = create_data_transformer(model)
%timeit -n1 -r1 transformer(phsp_sample)  # first run, so no cache and JIT-compilation
%timeit -n1 -r1 transformer(phsp_sample)  # second run with cache
%timeit -n1 -r1 transformer(phsp_sample)  # third run with cache
phsp_sample = transformer(phsp_sample)
random_point = {k: v[0] if len(v.shape) > 0 else v for k, v in phsp_sample.items()}
326 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1.14 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1.69 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
res = 54
grid_sample = generate_meshgrid_sample(model.decay, res)
%timeit -n1 -r1 transformer(grid_sample)  # first run, without cache, but already compiled
%timeit -n1 -r1 transformer(grid_sample)  # second run with cache
%timeit -n1 -r1 transformer(grid_sample)  # third run with cache
grid_sample = transformer(grid_sample)
337 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
451 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
151 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

7.5.2. Parametrized function#

Total number of mathematical operations:

  • \(\alpha_x\): 86,766

  • \(\alpha_y\): 86,770

  • \(\alpha_z\): 86,766

  • \(I_\mathrm{tot}\): 28,430

%%time
parametrized_polarimetry_funcs = [
    cached.lambdify(expr, model.parameter_defaults)
    for expr in unfolded_polarimetry_exprs
]
parametrized_intensity_func = cached.lambdify(
    unfolded_intensity_expr, model.parameter_defaults
)
CPU times: user 5.13 s, sys: 39.9 ms, total: 5.17 s
Wall time: 5.18 s
rng = np.random.default_rng(seed=0)
original_parameters = dict(parametrized_intensity_func.parameters)
modified_parameters = {
    k: rng.uniform(0.9, 1.1) * v
    for k, v in parametrized_intensity_func.parameters.items()
}

7.5.2.1. One data point#

7.5.2.1.1. JIT-compilation#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(random_point)
<TimeitResult : 1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](random_point)
array = parametrized_polarimetry_funcs[1](random_point)
array = parametrized_polarimetry_funcs[2](random_point)
<TimeitResult : 4.32 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.1.2. Compiled performance#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(random_point)
<TimeitResult : 642 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](random_point)
array = parametrized_polarimetry_funcs[1](random_point)
array = parametrized_polarimetry_funcs[2](random_point)
<TimeitResult : 1.22 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.2. 54x54 grid sample#

7.5.2.2.1. Compiled but uncached#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(grid_sample)
<TimeitResult : 8.87 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](grid_sample)
array = parametrized_polarimetry_funcs[1](grid_sample)
array = parametrized_polarimetry_funcs[2](grid_sample)
<TimeitResult : 1min 1s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.2.2. Second run with cache#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(grid_sample)
<TimeitResult : 770 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](grid_sample)
array = parametrized_polarimetry_funcs[1](grid_sample)
array = parametrized_polarimetry_funcs[2](grid_sample)
<TimeitResult : 1.59 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.3. 100.000 event phase space sample#

7.5.2.3.1. Compiled but uncached#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(phsp_sample)
<TimeitResult : 2.77 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](phsp_sample)
array = parametrized_polarimetry_funcs[1](phsp_sample)
array = parametrized_polarimetry_funcs[2](phsp_sample)
<TimeitResult : 17.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.3.2. Second run with cache#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(phsp_sample)
<TimeitResult : 845 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](phsp_sample)
array = parametrized_polarimetry_funcs[1](phsp_sample)
array = parametrized_polarimetry_funcs[2](phsp_sample)
<TimeitResult : 2.41 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.4. Recompilation after parameter modification#

parametrized_intensity_func.update_parameters(modified_parameters)
for func in parametrized_polarimetry_funcs:
    func.update_parameters(modified_parameters)

7.5.2.4.1. Compiled but uncached#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(phsp_sample)
<TimeitResult : 2.78 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](phsp_sample)
array = parametrized_polarimetry_funcs[1](phsp_sample)
array = parametrized_polarimetry_funcs[2](phsp_sample)
<TimeitResult : 17.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.2.4.2. Second run with cache#

%%timeit -n1 -r1 -q -o
array = parametrized_intensity_func(phsp_sample)
<TimeitResult : 1.61 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = parametrized_polarimetry_funcs[0](phsp_sample)
array = parametrized_polarimetry_funcs[1](phsp_sample)
array = parametrized_polarimetry_funcs[2](phsp_sample)
<TimeitResult : 1.95 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
parametrized_intensity_func.update_parameters(original_parameters)
for func in parametrized_polarimetry_funcs:
    func.update_parameters(original_parameters)

7.5.3. All parameters substituted#

subs_polarimetry_exprs = [
    cached.xreplace(expr, model.parameter_defaults)
    for expr in unfolded_polarimetry_exprs
]
subs_intensity_expr = cached.xreplace(unfolded_intensity_expr, model.parameter_defaults)

Number of mathematical operations after substituting all parameters:

  • \(\alpha_x\): 31,488

  • \(\alpha_y\): 31,492

  • \(\alpha_z\): 31,488

  • \(I_\mathrm{tot}\): 10,360

%%time
polarimetry_funcs = [cached.lambdify(expr) for expr in subs_polarimetry_exprs]
intensity_func = cached.lambdify(subs_intensity_expr)
CPU times: user 2.64 s, sys: 12 ms, total: 2.65 s
Wall time: 2.65 s

7.5.3.1. One data point#

7.5.3.1.1. JIT-compilation#

%%timeit -n1 -r1 -q -o
array = intensity_func(random_point)
<TimeitResult : 540 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](random_point)
array = polarimetry_funcs[1](random_point)
array = polarimetry_funcs[2](random_point)
<TimeitResult : 2.54 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.3.1.2. Compiled performance#

%%timeit -n1 -r1 -q -o
array = intensity_func(random_point)
<TimeitResult : 113 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](random_point)
array = polarimetry_funcs[1](random_point)
array = polarimetry_funcs[2](random_point)
<TimeitResult : 441 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.3.2. 54x54 grid sample#

7.5.3.2.1. Compiled but uncached#

%%timeit -n1 -r1 -q -o
array = intensity_func(grid_sample)
<TimeitResult : 6.61 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](grid_sample)
array = polarimetry_funcs[1](grid_sample)
array = polarimetry_funcs[2](grid_sample)
<TimeitResult : 47.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.3.2.2. Second run with cache#

%%timeit -n1 -r1 -q -o
array = intensity_func(grid_sample)
<TimeitResult : 131 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](grid_sample)
array = polarimetry_funcs[1](grid_sample)
array = polarimetry_funcs[2](grid_sample)
<TimeitResult : 1.04 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.3.3. 100.000 event phase space sample#

7.5.3.3.1. Compiled but uncached#

%%timeit -n1 -r1 -q -o
array = intensity_func(phsp_sample)
<TimeitResult : 1.92 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](phsp_sample)
array = polarimetry_funcs[1](phsp_sample)
array = polarimetry_funcs[2](phsp_sample)
<TimeitResult : 13.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.3.3.2. Second run with cache#

%%timeit -n1 -r1 -q -o
array = intensity_func(phsp_sample)
<TimeitResult : 844 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>
%%timeit -n1 -r1 -q -o
array = polarimetry_funcs[0](phsp_sample)
array = polarimetry_funcs[1](phsp_sample)
array = polarimetry_funcs[2](phsp_sample)
<TimeitResult : 189 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

7.5.4. Summary#

Hide code cell source

def collect_sorted_row_title() -> list[str]:
    row_titles = {}
    row_titles.update(timing_parametrized["intensity"])
    row_titles.update(timing_parametrized["polarimetry"])
    row_titles.update(timing_substituted["intensity"])
    row_titles.update(timing_substituted["polarimetry"])
    return list(row_titles)


def remove_loop_info(timing) -> str:
    if timing is None:
        return ""
    pattern = " ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)"
    return str(timing).replace(pattern, "")


row_titles = collect_sorted_row_title()
values = [
    (
        remove_loop_info(timing_parametrized["intensity"].get(row)),
        remove_loop_info(timing_parametrized["polarimetry"].get(row)),
        remove_loop_info(timing_substituted["intensity"].get(row)),
        remove_loop_info(timing_substituted["polarimetry"].get(row)),
    )
    for row in row_titles
]
columns = pd.MultiIndex.from_tuples(
    [
        ("parametrized", "I"),
        ("parametrized", "ɑ"),
        ("substituted", "I"),
        ("substituted", "ɑ"),
    ],
)
df = pd.DataFrame(values, index=row_titles, columns=columns)
df.style.set_table_styles([
    dict(selector="th", props=[("text-align", "left")]),
    dict(selector="td", props=[("text-align", "left")]),
])
  parametrized substituted
  I ɑ I ɑ
random point (compilation) 1 s 4.32 s 540 ms 2.54 s
random point (cached) 642 μs 1.22 ms 113 μs 441 μs
54x54 grid 8.87 s 1min 1s 6.61 s 47.1 s
54x54 grid (cached) 770 μs 1.59 ms 131 μs 1.04 ms
100,000 phsp 2.77 s 17.7 s 1.92 s 13.1 s
100,000 phsp (cached) 845 μs 2.41 ms 844 μs 189 μs
modified 100,000 phsp 2.78 s 17.6 s
modified 100,000 phsp (cached) 1.61 ms 1.95 ms