diff --git a/impeller/compiler/shader_lib/impeller/gaussian.glsl b/impeller/compiler/shader_lib/impeller/gaussian.glsl index 62874bec96d9c..9dd104d6e4ce2 100644 --- a/impeller/compiler/shader_lib/impeller/gaussian.glsl +++ b/impeller/compiler/shader_lib/impeller/gaussian.glsl @@ -6,51 +6,53 @@ #define GAUSSIAN_GLSL_ #include +#include /// Gaussian distribution function. -float IPGaussian(float x, float sigma) { - float variance = sigma * sigma; - return exp(-0.5 * x * x / variance) / (kSqrtTwoPi * sigma); +float16_t IPGaussian(float16_t x, float16_t sigma) { + float16_t variance = sigma * sigma; + return exp(-0.5hf * x * x / variance) / (float16_t(kSqrtTwoPi) * sigma); } /// Abramowitz and Stegun erf approximation. -float IPErf(float x) { - float a = abs(x); +float16_t IPErf(float16_t x) { + float16_t a = abs(x); // 0.278393*x + 0.230389*x^2 + 0.078108*x^4 + 1 - float b = (0.278393 + (0.230389 + 0.078108 * a * a) * a) * a + 1.0; - return sign(x) * (1 - 1 / (b * b * b * b)); + float16_t b = + (0.278393hf + (0.230389hf + 0.078108hf * a * a) * a) * a + 1.0hf; + return sign(x) * (1.0hf - 1.0hf / (b * b * b * b)); } /// Vec2 variation for the Abramowitz and Stegun erf approximation. -vec2 IPVec2Erf(vec2 x) { - vec2 a = abs(x); +f16vec2 IPVec2Erf(f16vec2 x) { + f16vec2 a = abs(x); // 0.278393*x + 0.230389*x^2 + 0.078108*x^4 + 1 - vec2 b = (0.278393 + (0.230389 + 0.078108 * a * a) * a) * a + 1.0; - return sign(x) * (1 - 1 / (b * b * b * b)); + f16vec2 b = (0.278393hf + (0.230389hf + 0.078108hf * a * a) * a) * a + 1.0hf; + return sign(x) * (1.0hf - 1.0hf / (b * b * b * b)); } /// The indefinite integral of the Gaussian function. /// Uses a very close approximation of Erf. -float IPGaussianIntegral(float x, float sigma) { +float16_t IPGaussianIntegral(float16_t x, float16_t sigma) { // ( 1 + erf( x * (sqrt(2) / (2 * sigma) ) ) / 2 - return (1 + IPErf(x * (kHalfSqrtTwo / sigma))) * 0.5; + return (1.0hf + IPErf(x * (float16_t(kHalfSqrtTwo) / sigma))) * 0.5hf; } /// Vec2 variation for the indefinite integral of the Gaussian function. /// Uses a very close approximation of Erf. -vec2 IPVec2GaussianIntegral(vec2 x, float sigma) { +f16vec2 IPVec2GaussianIntegral(f16vec2 x, float16_t sigma) { // ( 1 + erf( x * (sqrt(2) / (2 * sigma) ) ) / 2 - return (1 + IPVec2Erf(x * (kHalfSqrtTwo / sigma))) * 0.5; + return (1.0hf + IPVec2Erf(x * (float16_t(kHalfSqrtTwo) / sigma))) * 0.5hf; } /// Simpler (but less accurate) approximation of the Gaussian integral. -vec2 IPVec2FastGaussianIntegral(vec2 x, float sigma) { - return 1 / (1 + exp(-kSqrtThree / sigma * x)); +f16vec2 IPVec2FastGaussianIntegral(f16vec2 x, float16_t sigma) { + return 1.0hf / (1.0hf + exp(float16_t(-kSqrtThree) / sigma * x)); } /// Simple logistic sigmoid with a domain of [-1, 1] and range of [0, 1]. -float IPSigmoid(float x) { - return 1.03731472073 / (1 + exp(-4 * x)) - 0.0186573603638; +float16_t IPSigmoid(float16_t x) { + return 1.03731472073hf / (1.0hf + exp(-4.0hf * x)) - 0.0186573603638hf; } #endif diff --git a/impeller/compiler/shader_lib/impeller/texture.glsl b/impeller/compiler/shader_lib/impeller/texture.glsl index 3bbb580fb275f..d8b244294a756 100644 --- a/impeller/compiler/shader_lib/impeller/texture.glsl +++ b/impeller/compiler/shader_lib/impeller/texture.glsl @@ -143,6 +143,15 @@ vec4 IPSampleDecal(sampler2D texture_sampler, vec2 coords) { return texture(texture_sampler, coords); } +/// Sample a texture with decal tile mode. +f16vec4 IPHalfSampleDecal(f16sampler2D texture_sampler, f16vec2 coords) { + if (any(lessThan(coords, f16vec2(0.0hf))) || + any(greaterThanEqual(coords, f16vec2(1.0)))) { + return f16vec4(0.0); + } + return texture(texture_sampler, coords); +} + /// Sample a texture, emulating a specific tile mode. /// /// This is useful for Impeller graphics backend that don't have native support diff --git a/impeller/entity/shaders/border_mask_blur.frag b/impeller/entity/shaders/border_mask_blur.frag index b28dfc8210380..e0e89b2e8edc5 100644 --- a/impeller/entity/shaders/border_mask_blur.frag +++ b/impeller/entity/shaders/border_mask_blur.frag @@ -15,42 +15,42 @@ // integral (using an erf approximation) to the 4 edges of the UV rectangle and // multiplying them. -uniform sampler2D texture_sampler; +uniform f16sampler2D texture_sampler; uniform FragInfo { - float src_factor; - float inner_blur_factor; - float outer_blur_factor; + float16_t src_factor; + float16_t inner_blur_factor; + float16_t outer_blur_factor; - vec2 sigma_uv; + f16vec2 sigma_uv; } frag_info; -in vec2 v_texture_coords; +in f16vec2 v_texture_coords; -out vec4 frag_color; +out f16vec4 frag_color; -float BoxBlurMask(vec2 uv) { +float16_t BoxBlurMask(f16vec2 uv) { // LTRB - return IPGaussianIntegral(uv.x, frag_info.sigma_uv.x) * // - IPGaussianIntegral(uv.y, frag_info.sigma_uv.y) * // - IPGaussianIntegral(1 - uv.x, frag_info.sigma_uv.x) * // - IPGaussianIntegral(1 - uv.y, frag_info.sigma_uv.y); + return IPGaussianIntegral(uv.x, frag_info.sigma_uv.x) * // + IPGaussianIntegral(uv.y, frag_info.sigma_uv.y) * // + IPGaussianIntegral(1.0hf - uv.x, frag_info.sigma_uv.x) * // + IPGaussianIntegral(1.0hf - uv.y, frag_info.sigma_uv.y); } void main() { - vec4 image_color = texture(texture_sampler, v_texture_coords); - float blur_factor = BoxBlurMask(v_texture_coords); + f16vec4 image_color = texture(texture_sampler, v_texture_coords); + float16_t blur_factor = BoxBlurMask(v_texture_coords); - float within_bounds = - float(v_texture_coords.x >= 0 && v_texture_coords.y >= 0 && - v_texture_coords.x < 1 && v_texture_coords.y < 1); - float inner_factor = + float16_t within_bounds = + float16_t(v_texture_coords.x >= 0.0hf && v_texture_coords.y >= 0.0hf && + v_texture_coords.x < 1.0hf && v_texture_coords.y < 1.0hf); + float16_t inner_factor = (frag_info.inner_blur_factor * blur_factor + frag_info.src_factor) * within_bounds; - float outer_factor = - frag_info.outer_blur_factor * blur_factor * (1 - within_bounds); + float16_t outer_factor = + frag_info.outer_blur_factor * blur_factor * (1.0hf - within_bounds); - float mask_factor = inner_factor + outer_factor; + float16_t mask_factor = inner_factor + outer_factor; frag_color = image_color * mask_factor; } diff --git a/impeller/entity/shaders/border_mask_blur.vert b/impeller/entity/shaders/border_mask_blur.vert index bff59a4747e65..74b9ae422483a 100644 --- a/impeller/entity/shaders/border_mask_blur.vert +++ b/impeller/entity/shaders/border_mask_blur.vert @@ -15,10 +15,10 @@ frame_info; in vec2 vertices; in vec2 texture_coords; -out vec2 v_texture_coords; +out f16vec2 v_texture_coords; void main() { gl_Position = frame_info.mvp * vec4(vertices, 0.0, 1.0); - v_texture_coords = - IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale); + v_texture_coords = f16vec2( + IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale)); } diff --git a/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl b/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl index 4a218303efc27..c99adfe449e24 100644 --- a/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl +++ b/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl @@ -18,52 +18,52 @@ #include #include -uniform sampler2D texture_sampler; +uniform f16sampler2D texture_sampler; uniform BlurInfo { - vec2 texture_size; - vec2 blur_direction; + f16vec2 texture_size; + f16vec2 blur_direction; // The blur sigma and radius have a linear relationship which is defined // host-side, but both are useful controls here. Sigma (pixels per standard // deviation) is used to define the gaussian function itself, whereas the // radius is used to limit how much of the function is integrated. - float blur_sigma; - float blur_radius; + float16_t blur_sigma; + float16_t blur_radius; } blur_info; #if ENABLE_ALPHA_MASK -uniform sampler2D alpha_mask_sampler; +uniform f16sampler2D alpha_mask_sampler; uniform MaskInfo { - float src_factor; - float inner_blur_factor; - float outer_blur_factor; + float16_t src_factor; + float16_t inner_blur_factor; + float16_t outer_blur_factor; } mask_info; #endif -vec4 Sample(sampler2D tex, vec2 coords) { +f16vec4 Sample(f16sampler2D tex, f16vec2 coords) { #if ENABLE_DECAL_SPECIALIZATION - return IPSampleDecal(tex, coords); + return IPHalfSampleDecal(tex, coords); #else return texture(tex, coords); #endif } -in vec2 v_texture_coords; -in vec2 v_src_texture_coords; +in f16vec2 v_texture_coords; +in f16vec2 v_src_texture_coords; -out vec4 frag_color; +out f16vec4 frag_color; void main() { - vec4 total_color = vec4(0); - float gaussian_integral = 0; - vec2 blur_uv_offset = blur_info.blur_direction / blur_info.texture_size; + f16vec4 total_color = f16vec4(0.0hf); + float16_t gaussian_integral = 0.0hf; + f16vec2 blur_uv_offset = blur_info.blur_direction / blur_info.texture_size; - for (float i = -blur_info.blur_radius; i <= blur_info.blur_radius; i++) { - float gaussian = IPGaussian(i, blur_info.blur_sigma); + for (float16_t i = -blur_info.blur_radius; i <= blur_info.blur_radius; i++) { + float16_t gaussian = IPGaussian(i, blur_info.blur_sigma); gaussian_integral += gaussian; total_color += gaussian * @@ -75,11 +75,12 @@ void main() { frag_color = total_color / gaussian_integral; #if ENABLE_ALPHA_MASK - vec4 src_color = Sample(alpha_mask_sampler, // sampler - v_src_texture_coords // texture coordinates + f16vec4 src_color = Sample(alpha_mask_sampler, // sampler + v_src_texture_coords // texture coordinates ); - float blur_factor = mask_info.inner_blur_factor * float(src_color.a > 0) + - mask_info.outer_blur_factor * float(src_color.a == 0); + float16_t blur_factor = + mask_info.inner_blur_factor * float16_t(src_color.a > 0.0hf) + + mask_info.outer_blur_factor * float16_t(src_color.a == 0.0hf); frag_color = frag_color * blur_factor + src_color * mask_info.src_factor; #endif diff --git a/impeller/entity/shaders/gaussian_blur/gaussian_blur.vert b/impeller/entity/shaders/gaussian_blur/gaussian_blur.vert index f402003b13bf1..96b2ccf5c66c8 100644 --- a/impeller/entity/shaders/gaussian_blur/gaussian_blur.vert +++ b/impeller/entity/shaders/gaussian_blur/gaussian_blur.vert @@ -16,13 +16,13 @@ in vec2 vertices; in vec2 texture_coords; in vec2 src_texture_coords; -out vec2 v_texture_coords; -out vec2 v_src_texture_coords; +out f16vec2 v_texture_coords; +out f16vec2 v_src_texture_coords; void main() { gl_Position = frame_info.mvp * vec4(vertices, 0.0, 1.0); - v_texture_coords = - IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale); - v_src_texture_coords = IPRemapCoords( - src_texture_coords, frame_info.alpha_mask_sampler_y_coord_scale); + v_texture_coords = f16vec2( + IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale)); + v_src_texture_coords = f16vec2(IPRemapCoords( + src_texture_coords, frame_info.alpha_mask_sampler_y_coord_scale)); } diff --git a/impeller/entity/shaders/rrect_blur.frag b/impeller/entity/shaders/rrect_blur.frag index 5b0ddff80976c..9b342481187b2 100644 --- a/impeller/entity/shaders/rrect_blur.frag +++ b/impeller/entity/shaders/rrect_blur.frag @@ -6,58 +6,61 @@ #include uniform FragInfo { - vec4 color; - float blur_sigma; - vec2 rect_size; - float corner_radius; + f16vec4 color; + f16vec2 rect_size; + float16_t blur_sigma; + float16_t corner_radius; } frag_info; -in vec2 v_position; +in f16vec2 v_position; -out vec4 frag_color; +out f16vec4 frag_color; const int kSampleCount = 4; -float RRectDistance(vec2 sample_position, vec2 half_size) { - vec2 space = abs(sample_position) - half_size + frag_info.corner_radius; - return length(max(space, 0.0)) + min(max(space.x, space.y), 0.0) - - frag_info.corner_radius; +float16_t RRectDistance(f16vec2 sample_position, f16vec2 half_size) { + f16vec2 space = abs(sample_position) - half_size + frag_info.corner_radius; + return length(max(space, float16_t(0.0hf))) + + min(max(space.x, space.y), float16_t(0.0hf)) - frag_info.corner_radius; } /// Closed form unidirectional rounded rect blur mask solution using the /// analytical Gaussian integral (with approximated erf). -float RRectShadowX(vec2 sample_position, vec2 half_size) { +float16_t RRectShadowX(f16vec2 sample_position, f16vec2 half_size) { // Compute the X direction distance field (not incorporating the Y distance) // for the rounded rect. - float space = - min(0, half_size.y - frag_info.corner_radius - abs(sample_position.y)); - float rrect_distance = + float16_t space = + min(float16_t(0.0hf), + half_size.y - frag_info.corner_radius - abs(sample_position.y)); + float16_t rrect_distance = half_size.x - frag_info.corner_radius + - sqrt(max(0, frag_info.corner_radius * frag_info.corner_radius - - space * space)); + sqrt(max( + float16_t(0.0hf), + frag_info.corner_radius * frag_info.corner_radius - space * space)); // Map the linear distance field to the approximate Gaussian integral. - vec2 integral = IPVec2FastGaussianIntegral( - sample_position.x + vec2(-rrect_distance, rrect_distance), + f16vec2 integral = IPVec2FastGaussianIntegral( + sample_position.x + f16vec2(-rrect_distance, rrect_distance), frag_info.blur_sigma); return integral.y - integral.x; } -float RRectShadow(vec2 sample_position, vec2 half_size) { +float16_t RRectShadow(f16vec2 sample_position, f16vec2 half_size) { // Limit the sampling range to 3 standard deviations in the Y direction from // the kernel center to incorporate 99.7% of the color contribution. - float half_sampling_range = frag_info.blur_sigma * 3; + float16_t half_sampling_range = frag_info.blur_sigma * 3.0hf; - float begin_y = max(-half_sampling_range, sample_position.y - half_size.y); - float end_y = min(half_sampling_range, sample_position.y + half_size.y); - float interval = (end_y - begin_y) / kSampleCount; + float16_t begin_y = + max(-half_sampling_range, sample_position.y - half_size.y); + float16_t end_y = min(half_sampling_range, sample_position.y + half_size.y); + float16_t interval = (end_y - begin_y) / float16_t(kSampleCount); // Sample the X blur kSampleCount times, weighted by the Gaussian function. - float result = 0; + float16_t result = 0.0hf; for (int sample_i = 0; sample_i < kSampleCount; sample_i++) { - float y = begin_y + interval * (sample_i + 0.5); - result += RRectShadowX(vec2(sample_position.x, sample_position.y - y), + float16_t y = begin_y + interval * (float16_t(sample_i) + 0.5hf); + result += RRectShadowX(f16vec2(sample_position.x, sample_position.y - y), half_size) * IPGaussian(y, frag_info.blur_sigma) * interval; } @@ -68,10 +71,10 @@ float RRectShadow(vec2 sample_position, vec2 half_size) { void main() { frag_color = frag_info.color; - vec2 half_size = frag_info.rect_size * 0.5; - vec2 sample_position = v_position - half_size; + f16vec2 half_size = frag_info.rect_size * 0.5hf; + f16vec2 sample_position = v_position - half_size; - if (frag_info.blur_sigma > 0) { + if (frag_info.blur_sigma > 0.0hf) { frag_color *= RRectShadow(sample_position, half_size); } else { frag_color *= -RRectDistance(sample_position, half_size); diff --git a/impeller/entity/shaders/rrect_blur.vert b/impeller/entity/shaders/rrect_blur.vert index 87382f6b4dcbe..6ca9e06bba4b8 100644 --- a/impeller/entity/shaders/rrect_blur.vert +++ b/impeller/entity/shaders/rrect_blur.vert @@ -11,10 +11,10 @@ frame_info; in vec2 position; -out vec2 v_position; +out f16vec2 v_position; void main() { gl_Position = frame_info.mvp * vec4(position, 0.0, 1.0); // The fragment stage uses local coordinates to compute the blur. - v_position = position; + v_position = f16vec2(position); } diff --git a/impeller/tools/malioc.json b/impeller/tools/malioc.json index 9f6418deddff2..2b2b1231a5cf4 100644 --- a/impeller/tools/malioc.json +++ b/impeller/tools/malioc.json @@ -1440,7 +1440,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 5, + "fp16_arithmetic": 44, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -1448,8 +1448,8 @@ "arith_fma" ], "longest_path_cycles": [ - 0.8125, - 0.8125, + 0.875, + 0.875, 0.203125, 0.25, 0.0, @@ -1470,8 +1470,8 @@ "arith_fma" ], "shortest_path_cycles": [ - 0.8125, - 0.8125, + 0.875, + 0.875, 0.203125, 0.25, 0.0, @@ -1483,8 +1483,8 @@ "arith_fma" ], "total_cycles": [ - 0.8125, - 0.8125, + 0.875, + 0.875, 0.203125, 0.25, 0.0, @@ -1495,7 +1495,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 12, - "work_registers_used": 22 + "work_registers_used": 18 } } } @@ -5806,7 +5806,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 10, + "fp16_arithmetic": 86, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -5814,9 +5814,9 @@ "arith_fma" ], "longest_path_cycles": [ - 0.8125, - 0.8125, - 0.234375, + 0.90625, + 0.90625, + 0.265625, 0.25, 0.0, 0.25, @@ -5836,9 +5836,9 @@ "arith_fma" ], "shortest_path_cycles": [ - 0.8125, - 0.8125, - 0.203125, + 0.90625, + 0.90625, + 0.234375, 0.25, 0.0, 0.25, @@ -5849,9 +5849,9 @@ "arith_fma" ], "total_cycles": [ - 0.8125, - 0.8125, - 0.234375, + 0.90625, + 0.90625, + 0.265625, 0.25, 0.0, 0.25, @@ -5860,8 +5860,8 @@ }, "stack_spill_bytes": 0, "thread_occupancy": 100, - "uniform_registers_used": 10, - "work_registers_used": 32 + "uniform_registers_used": 12, + "work_registers_used": 29 } } }, @@ -5906,7 +5906,7 @@ }, "thread_occupancy": 100, "uniform_registers_used": 1, - "work_registers_used": 3 + "work_registers_used": 2 } } } @@ -6636,7 +6636,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 63, + "fp16_arithmetic": 68, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -6662,14 +6662,13 @@ ], "shortest_path_bound_pipelines": [ "arith_total", - "arith_cvt", "arith_sfu", "varying" ], "shortest_path_cycles": [ 0.25, - 0.171875, - 0.25, + 0.15625, + 0.1875, 0.25, 0.0, 0.25, @@ -6684,7 +6683,7 @@ "total_cycles": [ 0.5, 0.359375, - 0.484375, + 0.421875, 0.5, 0.0, 0.5, @@ -6693,7 +6692,7 @@ }, "stack_spill_bytes": 0, "thread_occupancy": 100, - "uniform_registers_used": 10, + "uniform_registers_used": 12, "work_registers_used": 21 } } @@ -6724,7 +6723,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 4.619999885559082, + 3.9600000381469727, 2.0, 0.0 ], @@ -6732,7 +6731,7 @@ "arithmetic" ], "total_cycles": [ - 8.666666984558105, + 8.0, 2.0, 2.0 ] @@ -6757,7 +6756,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 58, + "fp16_arithmetic": 64, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -6786,9 +6785,9 @@ "texture" ], "shortest_path_cycles": [ - 0.171875, - 0.171875, - 0.109375, + 0.15625, + 0.15625, + 0.09375, 0.0625, 0.0, 0.25, @@ -6801,7 +6800,7 @@ "total_cycles": [ 0.359375, 0.359375, - 0.234375, + 0.21875, 0.125, 0.0, 0.5, @@ -6811,7 +6810,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 12, - "work_registers_used": 20 + "work_registers_used": 19 } } }, @@ -6841,7 +6840,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 3.299999952316284, + 2.9700000286102295, 2.0, 1.0 ], @@ -6849,14 +6848,14 @@ "arithmetic" ], "total_cycles": [ - 5.333333492279053, + 5.0, 2.0, 2.0 ] }, "thread_occupancy": 100, "uniform_registers_used": 2, - "work_registers_used": 4 + "work_registers_used": 3 } } } @@ -6874,7 +6873,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 61, + "fp16_arithmetic": 70, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -6900,12 +6899,13 @@ ], "shortest_path_bound_pipelines": [ "arith_total", - "arith_cvt" + "arith_cvt", + "arith_sfu" ], "shortest_path_cycles": [ - 0.078125, - 0.046875, - 0.078125, + 0.0625, + 0.03125, + 0.0625, 0.0625, 0.0, 0.0, @@ -6918,7 +6918,7 @@ "total_cycles": [ 0.3125, 0.234375, - 0.296875, + 0.28125, 0.3125, 0.0, 0.25, @@ -6958,7 +6958,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 2.9700000286102295, + 2.309999942779541, 1.0, 0.0 ], @@ -6966,7 +6966,7 @@ "arithmetic" ], "total_cycles": [ - 6.666666507720947, + 6.0, 1.0, 1.0 ] @@ -6991,7 +6991,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 57, + "fp16_arithmetic": 66, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -7017,12 +7017,13 @@ ], "shortest_path_bound_pipelines": [ "arith_total", - "arith_cvt" + "arith_cvt", + "arith_sfu" ], "shortest_path_cycles": [ - 0.078125, - 0.046875, - 0.078125, + 0.0625, + 0.03125, + 0.0625, 0.0625, 0.0, 0.0, @@ -7035,7 +7036,7 @@ "total_cycles": [ 0.234375, 0.234375, - 0.203125, + 0.1875, 0.125, 0.0, 0.25, @@ -7045,7 +7046,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 8, - "work_registers_used": 20 + "work_registers_used": 19 } } }, @@ -7075,7 +7076,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 2.309999942779541, + 1.9800000190734863, 1.0, 0.0 ], @@ -7083,14 +7084,14 @@ "arithmetic" ], "total_cycles": [ - 4.333333492279053, + 4.0, 1.0, 1.0 ] }, "thread_occupancy": 100, "uniform_registers_used": 1, - "work_registers_used": 4 + "work_registers_used": 3 } } } @@ -8920,17 +8921,17 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 33, + "fp16_arithmetic": 68, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ "arith_total", - "arith_fma" + "arith_sfu" ], "longest_path_cycles": [ - 1.5125000476837158, - 1.5125000476837158, - 0.546875, + 1.5, + 1.3875000476837158, + 0.737500011920929, 1.5, 0.0, 0.125, @@ -8960,12 +8961,12 @@ ], "total_bound_pipelines": [ "arith_total", - "arith_fma" + "arith_sfu" ], "total_cycles": [ - 1.6375000476837158, - 1.6375000476837158, - 0.578125, + 1.5625, + 1.5125000476837158, + 0.762499988079071, 1.5625, 0.0, 0.125, @@ -8974,7 +8975,7 @@ }, "stack_spill_bytes": 0, "thread_occupancy": 100, - "uniform_registers_used": 20, + "uniform_registers_used": 16, "work_registers_used": 32 } } @@ -8989,12 +8990,12 @@ "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ - null + "arithmetic" ], "longest_path_cycles": [ - null, - null, - null + 22.110000610351562, + 1.0, + 0.0 ], "pipelines": [ "arithmetic", @@ -9013,14 +9014,14 @@ "arithmetic" ], "total_cycles": [ - 10.666666984558105, + 10.0, 1.0, 0.0 ] }, "thread_occupancy": 100, - "uniform_registers_used": 1, - "work_registers_used": 4 + "uniform_registers_used": 2, + "work_registers_used": 3 } } } @@ -12273,17 +12274,17 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 37, + "fp16_arithmetic": 65, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ "arith_total", - "arith_fma" + "arith_sfu" ], "longest_path_cycles": [ - 1.5499999523162842, - 1.5499999523162842, - 0.515625, + 1.5, + 1.4249999523162842, + 0.699999988079071, 1.5, 0.0, 0.125, @@ -12313,12 +12314,12 @@ ], "total_bound_pipelines": [ "arith_total", - "arith_fma" + "arith_sfu" ], "total_cycles": [ - 1.6749999523162842, - 1.6749999523162842, - 0.5625, + 1.5625, + 1.5499999523162842, + 0.75, 1.5625, 0.0, 0.125, @@ -12328,7 +12329,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 18, - "work_registers_used": 32 + "work_registers_used": 31 } } }