The function `foo` that I want to decorate with `shard_map` has untraceable control-flow, which I want to avoid refactoring. Since `foo` uses control-flow, I can't `jax.jit` a `shard_map`-decorated version of it. However, since the inner function `jit_batch_inference` is already jitted, I expect an unjitted `shard_map`-decorated `foo` to be fast, and yet it's 100x slower. How can I speed up `run_shard_map_inference` in the code below while still allowing the control-flow inside `shard_inference`?
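
The asker's code isn't reproduced in this excerpt, so here is a minimal sketch of the setup being described. The names `foo`, `jit_batch_inference`, `shard_inference`, and `run_shard_map_inference` come from the question; the mesh axis name `"data"`, the array shapes, the `CHUNK` constant, and the particular Python loop standing in for the untraceable control flow are assumptions for illustration only.

```python
import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, PartitionSpec as P
from jax.experimental.shard_map import shard_map

CHUNK = 4  # hypothetical chunk size, not from the original question

@jax.jit
def jit_batch_inference(chunk):
    # Stand-in for the asker's jitted inner inference call.
    return jnp.tanh(chunk @ chunk.T).sum(axis=-1)

def foo(local_batch):
    # Stand-in for the asker's untraceable control flow: Python-level
    # looping and host-side bookkeeping around the jitted inner function.
    outs = []
    for i in range(local_batch.shape[0] // CHUNK):
        outs.append(jit_batch_inference(local_batch[i * CHUNK:(i + 1) * CHUNK]))
    return jnp.concatenate(outs)

# One mesh axis named "data"; the leading batch axis is split across devices.
mesh = Mesh(np.array(jax.devices()), axis_names=("data",))

# The shard_map-decorated foo. It cannot be wrapped in jax.jit because of
# foo's Python control flow, so it runs eagerly (unjitted).
shard_inference = shard_map(foo, mesh, in_specs=P("data"), out_specs=P("data"))

def run_shard_map_inference(batch):
    return shard_inference(batch)

if __name__ == "__main__":
    batch = jnp.ones((8 * jax.device_count(), 16))
    print(run_shard_map_inference(batch).shape)
```

Under this reading, the question is why the eager (unjitted) `shard_map` wrapper adds so much overhead relative to the already-compiled `jit_batch_inference`, and whether that overhead can be avoided without making `foo`'s control flow traceable.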