Skip to content

Commit

Permalink
test vectorizability of blas-callables with vector inputs
Browse files Browse the repository at this point in the history
  • Loading branch information
kaushikcfd committed Jul 11, 2022
1 parent 0feea8a commit 47eb2d5
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 44 deletions.
127 changes: 107 additions & 20 deletions examples/python/call-external.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from loopy.diagnostic import LoopyError
from loopy.target.c import CTarget
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
from loopy.target.c.c_execution import CCompiler
from codepy.toolchain import GCCToolchain


# {{{ blas callable
Expand All @@ -22,7 +24,7 @@ def with_types(self, arg_id_to_dtype, callables_table):

if vec_dtype.numpy_dtype == np.float32:
name_in_target = "cblas_sgemv"
elif vec_dtype. numpy_dtype == np.float64:
elif vec_dtype.numpy_dtype == np.float64:
name_in_target = "cblas_dgemv"
else:
raise LoopyError("GEMV is only supported for float32 and float64 "
Expand All @@ -47,30 +49,37 @@ def with_descrs(self, arg_id_to_descr, callables_table):
assert mat_descr.shape[0] == res_descr.shape[0]
assert len(vec_descr.shape) == len(res_descr.shape) == 1
# handling only the easy case when stride == 1
assert vec_descr.dim_tags[0].stride == 1
assert mat_descr.dim_tags[1].stride == 1
assert res_descr.dim_tags[0].stride == 1

return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table

def emit_call_insn(self, insn, target, expression_to_code_mapper):
from pymbolic import var
from loopy.codegen import UnvectorizableError
mat_descr = self.arg_id_to_descr[0]
vec_descr = self.arg_id_to_descr[1]
res_descr = self.arg_id_to_descr[-1]
m, n = mat_descr.shape
ecm = expression_to_code_mapper

if ecm.codegen_state.vectorization_info is not None:
raise UnvectorizableError("cannot vectorize BLAS-gemv.")

mat, vec = insn.expression.parameters
result, = insn.assignees

c_parameters = [var("CblasRowMajor"),
var("CblasNoTrans"),
m, n,
1,
1, # alpha
ecm(mat).expr,
1,
mat_descr.dim_tags[0].stride, # LDA
ecm(vec).expr,
1,
vec_descr.dim_tags[0].stride, # INCX
0, # beta
ecm(result).expr,
1]
res_descr.dim_tags[0].stride # INCY
]
return (var(self.name_in_target)(*c_parameters),
False # cblas_gemv does not return anything
)
Expand All @@ -83,17 +92,95 @@ def generate_preambles(self, target):
# }}}


n = 10

knl = lp.make_kernel(
"{:}",
def transform_1(knl):
return knl


def transform_2(knl):
# A similar transformation is applied to kernels containing
# SLATE <https://www.firedrakeproject.org/firedrake.slate.html>
# callables.
knl = lp.split_iname(knl, "e", 4, inner_iname="e_inner", slabs=(0, 1))
knl = lp.privatize_temporaries_with_inames(knl, "e_inner")
knl = lp.tag_inames(knl, {"e_inner": "vec"})
if 0:
# Easy codegen exercise, but misses vectorizing certain instructions.
knl = lp.tag_array_axes(knl, "tmp3", "c,vec")
else:
knl = lp.tag_array_axes(knl, "tmp3,tmp2", "c,vec")
return knl


def main():

compiler = CCompiler(toolchain=GCCToolchain(
cc="gcc",
cflags="-std=c99 -O3 -fPIC".split(),
ldflags="-shared".split(),
libraries=["blas"],
library_dirs=[],
defines=[],
undefines=[],
source_suffix="c",
so_ext=".so",
o_ext=".o",
include_dirs=[]))

knl = lp.make_kernel(
"{[e,i1,i2]: 0<=e<n and 0<=i1,i2<4}",
"""
y[:] = gemv(A[:, :], x[:])
""", [
lp.GlobalArg("A", dtype=np.float64, shape=(n, n)),
lp.GlobalArg("x", dtype=np.float64, shape=(n, )),
lp.GlobalArg("y", shape=(n, )), ...],
target=CTarget())

knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv"))
print(lp.generate_code_v2(knl).device_code())
for e
for i1
tmp1[i1] = 3*x[e, i1]
end
tmp2[:] = matvec(A[:, :], tmp1[:])
for i2
<> tmp3[i2] = 2 * tmp2[i2]
out[e, i2] = tmp3[i2]
end
end
""",
kernel_data=[
lp.TemporaryVariable("tmp1",
shape=(4, ),
dtype=None),
lp.TemporaryVariable("tmp2",
shape=(4, ),
dtype=None),
lp.GlobalArg("A",
shape=(4, 4),
dtype="float64"),
lp.GlobalArg("x",
shape=lp.auto,
dtype="float64"),
...],
target=lp.ExecutableCVectorExtensionsTarget(compiler=compiler),
lang_version=(2018, 2))

knl = lp.register_callable(knl, "matvec", CBLASGEMV("matvec"))

for transform_func in [transform_1, transform_2]:
knl = transform_func(knl)
print("Generated code from '{transform_func.__name__} -----'")
print(lp.generate_code_v2(knl).device_code())
print(75 * "-")

# {{ verify the result is correct.

from numpy.random import default_rng

rng = default_rng(seed=0)
a = rng.random((4, 4))
x = rng.random((100, 4))

_, (out,) = knl(A=a, x=x)

np.testing.assert_allclose(6*np.einsum("ij,ej->ei",
a, x),
out)

# }}}


if __name__ == "__main__":
main()
34 changes: 10 additions & 24 deletions test/test_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,23 +737,6 @@ def test_c_vector_extensions():
print(lp.generate_code_v2(knl).device_code())


def test_omp_simd_tag():
knl = lp.make_kernel(
"{[i]: 0<=i<16}",
"""
y[i] = 2 * x[i]
""")

knl = lp.add_dtypes(knl, {"x": "float64"})
knl = lp.split_iname(knl, "i", 4)
knl = lp.tag_inames(knl, {"i_inner": lp.OpenMPSIMDTag()})

code_str = lp.generate_code_v2(knl).device_code()

assert any(line.strip() == "#pragma omp simd"
for line in code_str.split("\n"))


def test_vec_tag_with_omp_simd_fallback():
knl = lp.make_kernel(
"{[i, j1, j2, j3]: 0<=i<10 and 0<=j1,j2,j3<4}",
Expand All @@ -764,11 +747,13 @@ def test_vec_tag_with_omp_simd_fallback():
""",
[lp.GlobalArg("x, y", shape=lp.auto, dtype=float)],
seq_dependencies=True,
target=lp.ExecutableCVectorExtensionsTarget())
target=lp.ExecutableCVectorExtensionsTarget(
lp.VectorizationFallback.OMP_SIMD)
)

knl = lp.tag_inames(knl, {"j1": lp.VectorizeTag(lp.OpenMPSIMDTag()),
"j2": lp.VectorizeTag(lp.OpenMPSIMDTag()),
"j3": lp.VectorizeTag(lp.OpenMPSIMDTag())})
knl = lp.tag_inames(knl, {"j1": "vec",
"j2": "vec",
"j3": "vec"})
knl = lp.tag_array_axes(knl, "temp1,temp2", "vec")

code_str = lp.generate_code_v2(knl).device_code()
Expand All @@ -794,15 +779,16 @@ def test_vec_extensions_with_multiple_loopy_body_insns():
end
""",
seq_dependencies=True,
target=lp.ExecutableCVectorExtensionsTarget())
target=lp.ExecutableCVectorExtensionsTarget(
lp.VectorizationFallback.OMP_SIMD)
)

knl = lp.add_dtypes(knl, {"dat0": "float64"})
knl = lp.split_iname(knl, "n", 4, slabs=(1, 1),
inner_iname="n_batch")
knl = lp.privatize_temporaries_with_inames(knl, "n_batch")
knl = lp.tag_array_axes(knl, "tmp", "vec")
knl = lp.tag_inames(knl, {
"n_batch": lp.VectorizeTag(lp.OpenMPSIMDTag())})
knl = lp.tag_inames(knl, {"n_batch": "vec"})

_, (out,) = knl(N=100)
np.testing.assert_allclose(out, 2*np.ones((100, 1)))
Expand Down

0 comments on commit 47eb2d5

Please sign in to comment.