Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Explicitly stage strided loads #7230

Merged
merged 29 commits into main from abadams/stage_strided_loads
Dec 16, 2022
Merged
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1fd8bb0
Add a pass to do explicit densification of strided loads
abadams Dec 8, 2022
009018b
densify more types of strided load
abadams Dec 8, 2022
86961c9
Reorder downsample in local laplacian for slightly better performance
abadams Dec 9, 2022
b880336
Move allocation padding into the IR. Still WIP.
abadams Dec 9, 2022
331e43f
Simplify concat_bits handling
abadams Dec 9, 2022
ffe6f0a
Use evidence from parent scopes to densify
abadams Dec 9, 2022
964c940
Disallow padding allocations with custom new expressions
abadams Dec 9, 2022
523dc69
Add test for parent scopes
abadams Dec 9, 2022
0260fb9
Remove debugging prints. Avoid nested ramps.
abadams Dec 9, 2022
d856df3
Avoid parent scope loops
abadams Dec 9, 2022
aa92026
Update cmakefiles
abadams Dec 9, 2022
69b486d
Fix for large_buffers
abadams Dec 9, 2022
37b1cc5
Pad stack allocations too
abadams Dec 9, 2022
05733e7
Restore vld2/3/4 generation on non-Apple ARM chips
abadams Dec 10, 2022
b9e7417
Appease clang-format and clang-tidy
abadams Dec 12, 2022
ae0d0d8
Silence clang-tidy
abadams Dec 12, 2022
0d977e7
Better comments
abadams Dec 12, 2022
96ac6c0
Comment improvements
abadams Dec 12, 2022
4f733aa
Nuke code that reads out of bounds
abadams Dec 13, 2022
ff9a1b6
Fix stage_strided_loads test
abadams Dec 13, 2022
8f22adb
Change strategy for loads from external buffers
abadams Dec 13, 2022
b1dd3a2
Add explanatory comment to ARM backend
abadams Dec 13, 2022
068412f
Fix cpp backend shuffling
abadams Dec 13, 2022
df3bf08
Fix missing msan annotations
abadams Dec 13, 2022
a1f2a12
Magnify heap cost effect in stack_vs_heap performance test
abadams Dec 13, 2022
f3a9e11
Merge branch 'abadams/stage_strided_loads' of https://github.com/halide/Halide
abadams Dec 13, 2022
ee6cea3
Address review comments
abadams Dec 15, 2022
2f7bf16
clang-tidy
abadams Dec 15, 2022
16b90bb
Fix for when same load node occurs in two different allocate nodes
abadams Dec 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Simplify concat_bits handling
abadams committed Dec 9, 2022

Verified

This commit was signed with the committer’s verified signature.
jalaziz Jameel Al-Aziz
commit 331e43fde4dafb7bac99aae451a09f68ead2e3ad
41 changes: 9 additions & 32 deletions src/FlattenNestedRamps.cpp
Original file line number Diff line number Diff line change
@@ -118,40 +118,17 @@ class FlattenRamps : public IRMutator {
}
};

/** Lower bit concatenation into vector interleaving followed by a vector
 * reinterpret. Must be done to both vectors and scalars after flattening
 * nested ramps, because it can expand a flat ramp into a wider one. */
class LowerConcatBits : public IRMutator {
    using IRMutator::visit;

    Expr visit(const Call *op) override {
        if (op->is_intrinsic(Call::concat_bits)) {
            // Rewrite concat_bits into a shuffle followed by a vector
            // reinterpret. The interleave places the source lanes adjacently,
            // and the reinterpret fuses each group of narrow lanes into one
            // wide lane of the result type. Simplify first so later passes
            // (e.g. dense-load recognition) see a canonical shuffle.
            Expr shuf = simplify(Shuffle::make_interleave(op->args));
            Expr e = Reinterpret::make(op->type, shuf);
            // Mutate the result in case the rewrite exposed nested
            // concat_bits calls.
            return mutate(e);
        }

        return IRMutator::visit(op);
    }
};
@@ -161,11 +138,11 @@ class SimplifyConcatBits : public IRMutator {
} // namespace

Stmt flatten_nested_ramps(const Stmt &s) {
    // Flatten nested ramps first; LowerConcatBits runs second because
    // flattening can create the flat ramps it pattern-matches on.
    return LowerConcatBits().mutate(FlattenRamps().mutate(s));
}

Expr flatten_nested_ramps(const Expr &e) {
    // Expr overload; same pipeline order as the Stmt overload above.
    return LowerConcatBits().mutate(FlattenRamps().mutate(e));
}

} // namespace Internal