Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Duplicating an iname results in an unschedulable kernel #752

Open
isuruf opened this issue Feb 12, 2023 · 2 comments
Open

Duplicating an iname results in an unschedulable kernel #752

isuruf opened this issue Feb 12, 2023 · 2 comments

Comments

@isuruf
Copy link
Collaborator

isuruf commented Feb 12, 2023

I'm not sure why the kernel below becomes unschedulable after the iname is duplicated.

import loopy as lp
import numpy as np
from pymbolic.primitives import *
import immutables

# Reproducer kernel for this issue: a loopy kernel named "e2p_from_single_box"
# built from three positional pieces:
#   1. a list of iname-domain strings,
#   2. an instruction block in which every statement carries
#      {id=..., dep=..., inames=...} annotations,
#   3. a list of argument and temporary-variable declarations,
# followed by a handful of keyword options.
# NOTE(review): this looks like a machine-generated dump of an already
# transformed kernel (see iname_slab_increments / applied_iname_rewrites
# below) -- TODO confirm with the reporter.
e2p_from_single_box_knl = lp.make_kernel(
    # Iname domains, one string per domain.
    [
    "[ntgt_boxes] -> { [itgt_box] : 0 <= itgt_box < ntgt_boxes }",
    "{ [idim, idim_0] : 0 <= idim <= 2 and 0 <= idim_0 <= 2 }",
    "{ [itgt_offset_outer, itgt_offset_inner] : itgt_offset_inner >= 0 and -32itgt_offset_outer <= itgt_offset_inner <= 46 - 32itgt_offset_outer and itgt_offset_inner <= 31 }",
    "{ [icoeff_outer, icoeff_inner] : icoeff_inner >= 0 and -32icoeff_outer <= icoeff_inner <= 120 - 32icoeff_outer and icoeff_inner <= 31 }",
    "{ [iknl, iknl_0] : iknl = 0 and iknl_0 = 0 }",
    "{ [dummy] : 0 <= dummy <= 31 }",
    "[ntargets] -> { [] : ntargets > 0 }",
    "{ [e2p_idim] : 0 <= e2p_idim <= 2 }",
    "{ [e2p_iorder0] : 0 < e2p_iorder0 <= 10 }",
    # The constraint "1 = 0" can never hold, so this domain is empty.
    "{ [e2p_zero_idx] : 1 = 0 }",
    "{ [e2p_icoeff_outer, e2p_icoeff_inner] : e2p_icoeff_inner >= 0 and -32e2p_icoeff_outer <= e2p_icoeff_inner <= 120 - 32e2p_icoeff_outer and e2p_icoeff_inner <= 31 }",
    "{ [e2p_x0] : 0 <= e2p_x0 <= 10 }",
    # The remaining domains are parametrized by other inames (e2p_x0,
    # e2p_iorder1, e2p_x2, e2p_iorder2, e2p_y2).
    "[e2p_x0] -> { [e2p_iorder1] : e2p_x0 <= e2p_iorder1 <= 10 }",
    "[e2p_iorder1, e2p_x0] -> { [e2p_x2] : 0 <= e2p_x2 <= e2p_iorder1 - e2p_x0 }",
    "[e2p_iorder1, e2p_x0, e2p_x2] -> { [e2p_x1] : e2p_x1 = e2p_iorder1 - e2p_x0 - e2p_x2 }",
    "[e2p_x0] -> { [e2p_iorder2] : e2p_x0 <= e2p_iorder2 <= 10 }",
    "[e2p_iorder2, e2p_x0] -> { [e2p_y2] : 0 <= e2p_y2 <= e2p_iorder2 - e2p_x0 }",
    "[e2p_iorder2, e2p_x0, e2p_y2] -> { [e2p_y1] : e2p_y1 = e2p_iorder2 - e2p_x0 - e2p_y2 }",
    ],
    # Instruction block. Both e2p_update_coeffs and e2p_write_0 use the iname
    # e2p_x0, which is the iname the issue later splits and duplicates.
    '''

    kernel_scaling = (1 / 4)*3.141592653589793**(-1) {id=kernel_scaling, inames=+dummy:itgt_box}
    tgt_ibox = target_boxes[itgt_box] {id=fetch_init0, inames=dummy:itgt_box}
    itgt_start = box_target_starts[tgt_ibox] {id=fetch_init1, dep=fetch_init0, inames=dummy:itgt_box}
    itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] {id=fetch_init2, dep=fetch_init0:fetch_init1, inames=dummy:itgt_box}
    center[idim] = centers[idim, tgt_ibox] {id=fetch_center, dep=fetch_init0, inames=dummy:itgt_box:idim}
    coeffs[icoeff_inner + icoeff_outer*32] = src_expansions[tgt_ibox + (-1)*src_base_ibox, icoeff_inner + icoeff_outer*32] {id=fetch_coeffs, dep=fetch_init0, inames=icoeff_outer:itgt_box:icoeff_inner}
    itgt = itgt_start + itgt_offset_inner + itgt_offset_outer*32 {id=insn, dep=fetch_init1, inames=itgt_offset_outer:itgt_offset_inner:itgt_box}
    run_itgt = itgt < itgt_end {id=insn_0, dep=fetch_init2:insn, inames=itgt_offset_outer:itgt_offset_inner:itgt_box}
    tgt[idim_0] = targets[idim_0, itgt] {id=fetch_tgt, dep=insn:insn_0, inames=itgt_offset_outer:itgt_offset_inner:itgt_box:idim_0}
    result_temp[iknl_0] = 0 {id=init_result, dep=insn_0, inames=itgt_offset_outer:itgt_offset_inner:itgt_box:iknl_0}
    ... nop {id=e2p__start, dep=fetch_coeffs:fetch_tgt:insn_0:init_result:fetch_center:insn, inames=+itgt_offset_outer:itgt_offset_inner:itgt_box}
    e2p_b[e2p_idim] = (tgt[e2p_idim] + (-1)*center[e2p_idim])*(1 / rscale) {id=e2p_set_b, dep=e2p__start, inames=itgt_offset_outer:itgt_offset_inner:e2p_idim:itgt_box}
    e2p_power_b[e2p_idim, e2p_zero_idx] = 0 {id=e2p_zero_monomials, dep=e2p__start, inames=+itgt_offset_inner:e2p_idim:itgt_offset_outer:e2p_zero_idx:itgt_box}
    e2p_power_b[e2p_idim, 0] = 1 {id=e2p_init_monomials, dep=e2p__start:e2p_zero_monomials, inames=+itgt_offset_outer:itgt_offset_inner:e2p_idim:itgt_box}
    e2p_power_b[e2p_idim, e2p_iorder0] = e2p_power_b[e2p_idim, e2p_iorder0 + -1]*e2p_b[e2p_idim]*(1 / e2p_iorder0) {id=e2p_update_monomials, dep=e2p_set_b:e2p_init_monomials:e2p__start, inames=+itgt_offset_inner:e2p_idim:itgt_offset_outer:e2p_iorder0:itgt_box}
    e2p_coeffs_copy[e2p_icoeff_inner + e2p_icoeff_outer*32] = coeffs[e2p_icoeff_inner + e2p_icoeff_outer*32] {id=e2p_copy_coeffs, dep=e2p__start, inames=+e2p_icoeff_outer:e2p_icoeff_inner:itgt_box:itgt_offset_outer}
    e2p_coeffs_copy[((e2p_x0 % 2 + e2p_x1 + e2p_x2)*(e2p_x0 % 2 + e2p_x1 + e2p_x2 + 1)*(e2p_x0 % 2 + e2p_x1 + e2p_x2 + 2)) // 6 + (e2p_x0 % 2 + e2p_x1 + e2p_x2 + 2)*(e2p_x0 % 2) + (-1)*(((e2p_x0 % 2)*(e2p_x0 % 2 + 1)) // 2) + e2p_x1 if e2p_x0 % 2 + e2p_x1 + e2p_x2 < 1 else (2*(e2p_x0 % 2 + e2p_x1 + e2p_x2)*(2 + e2p_x0 % 2 + e2p_x1 + e2p_x2 + -2) + (e2p_x0 % 2)*(3 + (-1)*(e2p_x0 % 2) + 2*(e2p_x0 % 2 + e2p_x1 + e2p_x2))) // 2 + e2p_x1] = e2p_coeffs_copy[(((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2)*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 1)*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 2)) // 6 + ((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 2)*((e2p_x0 + -2) % 2) + (-1)*((((e2p_x0 + -2) % 2)*((e2p_x0 + -2) % 2 + 1)) // 2) + e2p_x1 + 2 if (e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 < 1 else (2*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2)*(2 + (e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + -2) + ((e2p_x0 + -2) % 2)*(3 + (-1)*((e2p_x0 + -2) % 2) + 2*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2))) // 2 + e2p_x1 + 2]*(-1.0) + e2p_coeffs_copy[(((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2)*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 1)*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 2)) // 6 + ((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 2)*((e2p_x0 + -2) % 2) + (-1)*((((e2p_x0 + -2) % 2)*((e2p_x0 + -2) % 2 + 1)) // 2) + e2p_x1 if (e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 < 1 else (2*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2)*(2 + (e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + -2) + ((e2p_x0 + -2) % 2)*(3 + (-1)*((e2p_x0 + -2) % 2) + 2*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2))) // 2 + e2p_x1]*(-1.0) {id=e2p_update_coeffs, dep=e2p__start:e2p_copy_coeffs, inames=+e2p_x2:e2p_iorder1:itgt_offset_outer:e2p_x0:e2p_x1:itgt_box}
    result_temp[0] = result_temp[0] + e2p_coeffs_copy[((e2p_x0 % 2 + e2p_y1 + e2p_y2)*(e2p_x0 % 2 + e2p_y1 + e2p_y2 + 1)*(e2p_x0 % 2 + e2p_y1 + e2p_y2 + 2)) // 6 + (e2p_x0 % 2 + e2p_y1 + e2p_y2 + 2)*(e2p_x0 % 2) + (-1)*(((e2p_x0 % 2)*(e2p_x0 % 2 + 1)) // 2) + e2p_y1 if e2p_x0 % 2 + e2p_y1 + e2p_y2 < 1 else (2*(e2p_x0 % 2 + e2p_y1 + e2p_y2)*(2 + e2p_x0 % 2 + e2p_y1 + e2p_y2 + -2) + (e2p_x0 % 2)*(3 + (-1)*(e2p_x0 % 2) + 2*(e2p_x0 % 2 + e2p_y1 + e2p_y2))) // 2 + e2p_y1]*e2p_power_b[0, e2p_x0]*e2p_power_b[1, e2p_y1]*e2p_power_b[2, e2p_y2] {id=e2p_write_0, dep=e2p_update_monomials:e2p_update_coeffs:e2p__start, inames=+itgt_offset_inner:itgt_offset_outer:e2p_iorder2:e2p_x0:e2p_y1:e2p_y2:itgt_box}
    ... nop {id=update_result, dep=e2p_write_0:e2p_update_monomials:e2p_zero_monomials:e2p_update_coeffs:e2p_set_b:e2p_init_monomials:e2p_copy_coeffs, inames=+itgt_offset_outer:itgt_offset_inner:itgt_box}
    result[iknl, itgt] = result_temp[iknl]*kernel_scaling {id=write_result, dep=update_result:insn:insn_0:kernel_scaling, inames=iknl:itgt_offset_inner:itgt_box:itgt_offset_outer}
    ''', [
        # Kernel arguments (global arrays and scalar values) come first,
        # followed by the temporary-variable declarations.
        lp.GlobalArg(
            name="targets", dtype=None,
            shape=(3, Variable('ntargets')), for_atomic=False),
        lp.GlobalArg(
            name="box_target_starts", dtype=None,
            shape=None, for_atomic=False),
        lp.GlobalArg(
            name="box_target_counts_nonchild", dtype=None,
            shape=None, for_atomic=False),
        lp.GlobalArg(
            name="centers", dtype=None,
            shape=(3, Variable('naligned_boxes')), for_atomic=False),
        lp.ValueArg(
            name="rscale",
            dtype=None),
        lp.GlobalArg(
            name="result", dtype=None,
            shape=(1, Variable('ntargets')), for_atomic=False),
        lp.GlobalArg(
            name="src_expansions", dtype=None,
            shape=(Variable('nsrc_level_boxes'), 121), for_atomic=False),
        lp.ValueArg(
            name="nsrc_level_boxes",
            dtype=np.int32),
        lp.ValueArg(
            name="naligned_boxes",
            dtype=np.int32),
        lp.ValueArg(
            name="src_base_ibox",
            dtype=np.int32),
        lp.ValueArg(
            name="ntargets",
            dtype=np.int32),
        lp.ValueArg(
            name="ntgt_boxes",
            dtype=None),
        lp.GlobalArg(
            name="target_boxes", dtype=None,
            shape=(Variable('ntgt_boxes'),), for_atomic=False),
        # Temporaries. Only "coeffs" and "e2p_coeffs_copy" request
        # lp.AddressSpace.LOCAL explicitly; all others use lp.auto.
        lp.TemporaryVariable(
            name="kernel_scaling",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="tgt_ibox",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="itgt_start",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="itgt_end",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="center",
            shape=(3,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="coeffs",
            shape=(121,), for_atomic=False,
            address_space=lp.AddressSpace.LOCAL,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="itgt",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="run_itgt",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="tgt",
            shape=(3,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="result_temp",
            shape=(1,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="e2p_b",
            shape=(3,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="e2p_power_b",
            shape=(3, 11), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="e2p_coeffs_copy",
            shape=(121,), for_atomic=False,
            address_space=lp.AddressSpace.LOCAL,
            read_only=False,
            ),
        ],
        # Remaining keyword options. applied_iname_rewrites records each
        # original iname as inner + outer*32 for the three split inames.
        lang_version=(2018, 2),
        iname_slab_increments=immutables.Map({'itgt_offset_outer': (0, 0), 'e2p_icoeff_outer': (0, 0), 'icoeff_outer': (0, 0)}),
        applied_iname_rewrites=({Variable('itgt_offset'): Sum((Variable('itgt_offset_inner'), Product((Variable('itgt_offset_outer'), 32))))}, {Variable('icoeff'): Sum((Variable('icoeff_inner'), Product((Variable('icoeff_outer'), 32))))}, {Variable('e2p_icoeff'): Sum((Variable('e2p_icoeff_inner'), Product((Variable('e2p_icoeff_outer'), 32))))}),
        name="e2p_from_single_box",
        )

# Apply the iname tags one at a time, in the same order as the original
# sequence of calls, rebinding the kernel after every call.
for _iname_tag in (
        "idim_0:unr",
        "itgt_offset_inner:l.0",
        "idim:unr",
        "e2p_idim:unr",
        "iknl_0:unr",
        "e2p_iorder1:l.0",
        "e2p_icoeff_inner:l.0",
        "e2p_iorder0:unr",
        "icoeff_inner:l.0",
        "e2p_iorder2:unr",
        "iknl:unr",
        "itgt_box:g.0",
        "dummy:l.0",
        ):
    e2p_from_single_box_knl = lp.tag_inames(
        e2p_from_single_box_knl, _iname_tag)

# Merge the single tagged kernel; the result is bound to `knl`.
knl = lp.merge([e2p_from_single_box_knl])
            
# Explicit dtypes handed to add_and_infer_dtypes for the listed arguments.
_arg_dtypes = {
    "targets": np.float64,
    "box_target_starts": np.int32,
    "box_target_counts_nonchild": np.int32,
    "target_boxes": np.int32,
    "centers": np.float64,
    "rscale": np.float64,
    "result": np.float64,
    "src_expansions": np.float64,
}
knl = lp.add_and_infer_dtypes(knl, _arg_dtypes)

# First code generation: runs on the kernel before any further transformation.
_code_before = lp.generate_code_v2(knl).device_code()
print(_code_before)

# Split e2p_x0 with an inner length of 2, then duplicate the resulting
# e2p_x0_inner iname only within the instruction with id e2p_update_coeffs.
knl = lp.split_iname(knl, "e2p_x0", 2)
knl = lp.duplicate_inames(
    knl, "e2p_x0_inner", within="id:e2p_update_coeffs")

# Second code generation: per the issue report, this is where the kernel is
# found to be unschedulable.
_code_after = lp.generate_code_v2(knl).device_code()
print(_code_after)
@kaushikcfd
Copy link
Collaborator

kaushikcfd commented Feb 12, 2023

I did not look closely at the provided kernel, but that can happen in the following case:

import loopy as lp

# Minimal example: two statements share the loops over i and j and both read
# the temporary `tmp`, which is assigned inside the i loop.
_domain = "{[i, j]: 0<=i,j<10}"
_instructions = """
    for i
        <> tmp = 10
        for j
            out1[i, j] = i*j*tmp
            out2[i, j] = (i+j)*tmp
        end
    end
    """
knl = lp.make_kernel(_domain, _instructions)

# Code generation works on the untouched kernel.
lp.generate_code_v2(knl)

# Duplicate iname i only for the statement that writes out1 ...
knl = lp.duplicate_inames(knl, "i", within="writes:out1")

# ... after which code generation FAILS due to unschedulable loop nesting.
lp.generate_code_v2(knl)

I would not consider the above behavior a loopy bug as it simply did what the user demanded.

@inducer
Copy link
Owner

inducer commented Feb 17, 2023

Agree with @kaushikcfd. Good to close?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants