diff --git a/impeller/compiler/shader_lib/impeller/gaussian.glsl b/impeller/compiler/shader_lib/impeller/gaussian.glsl index 9dd104d6e4ce2..62874bec96d9c 100644 --- a/impeller/compiler/shader_lib/impeller/gaussian.glsl +++ b/impeller/compiler/shader_lib/impeller/gaussian.glsl @@ -6,53 +6,51 @@ #define GAUSSIAN_GLSL_ #include -#include /// Gaussian distribution function. -float16_t IPGaussian(float16_t x, float16_t sigma) { - float16_t variance = sigma * sigma; - return exp(-0.5hf * x * x / variance) / (float16_t(kSqrtTwoPi) * sigma); +float IPGaussian(float x, float sigma) { + float variance = sigma * sigma; + return exp(-0.5 * x * x / variance) / (kSqrtTwoPi * sigma); } /// Abramowitz and Stegun erf approximation. -float16_t IPErf(float16_t x) { - float16_t a = abs(x); +float IPErf(float x) { + float a = abs(x); // 0.278393*x + 0.230389*x^2 + 0.078108*x^4 + 1 - float16_t b = - (0.278393hf + (0.230389hf + 0.078108hf * a * a) * a) * a + 1.0hf; - return sign(x) * (1.0hf - 1.0hf / (b * b * b * b)); + float b = (0.278393 + (0.230389 + 0.078108 * a * a) * a) * a + 1.0; + return sign(x) * (1 - 1 / (b * b * b * b)); } /// Vec2 variation for the Abramowitz and Stegun erf approximation. -f16vec2 IPVec2Erf(f16vec2 x) { - f16vec2 a = abs(x); +vec2 IPVec2Erf(vec2 x) { + vec2 a = abs(x); // 0.278393*x + 0.230389*x^2 + 0.078108*x^4 + 1 - f16vec2 b = (0.278393hf + (0.230389hf + 0.078108hf * a * a) * a) * a + 1.0hf; - return sign(x) * (1.0hf - 1.0hf / (b * b * b * b)); + vec2 b = (0.278393 + (0.230389 + 0.078108 * a * a) * a) * a + 1.0; + return sign(x) * (1 - 1 / (b * b * b * b)); } /// The indefinite integral of the Gaussian function. /// Uses a very close approximation of Erf. -float16_t IPGaussianIntegral(float16_t x, float16_t sigma) { +float IPGaussianIntegral(float x, float sigma) { // ( 1 + erf( x * (sqrt(2) / (2 * sigma) ) ) / 2 - return (1.0hf + IPErf(x * (float16_t(kHalfSqrtTwo) / sigma))) * 0.5hf; + return (1 + IPErf(x * (kHalfSqrtTwo / sigma))) * 0.5; } /// Vec2 variation for the indefinite integral of the Gaussian function. /// Uses a very close approximation of Erf. -f16vec2 IPVec2GaussianIntegral(f16vec2 x, float16_t sigma) { +vec2 IPVec2GaussianIntegral(vec2 x, float sigma) { // ( 1 + erf( x * (sqrt(2) / (2 * sigma) ) ) / 2 - return (1.0hf + IPVec2Erf(x * (float16_t(kHalfSqrtTwo) / sigma))) * 0.5hf; + return (1 + IPVec2Erf(x * (kHalfSqrtTwo / sigma))) * 0.5; } /// Simpler (but less accurate) approximation of the Gaussian integral. -f16vec2 IPVec2FastGaussianIntegral(f16vec2 x, float16_t sigma) { - return 1.0hf / (1.0hf + exp(float16_t(-kSqrtThree) / sigma * x)); +vec2 IPVec2FastGaussianIntegral(vec2 x, float sigma) { + return 1 / (1 + exp(-kSqrtThree / sigma * x)); } /// Simple logistic sigmoid with a domain of [-1, 1] and range of [0, 1]. -float16_t IPSigmoid(float16_t x) { - return 1.03731472073hf / (1.0hf + exp(-4.0hf * x)) - 0.0186573603638hf; +float IPSigmoid(float x) { + return 1.03731472073 / (1 + exp(-4 * x)) - 0.0186573603638; } #endif diff --git a/impeller/compiler/shader_lib/impeller/texture.glsl b/impeller/compiler/shader_lib/impeller/texture.glsl index d8b244294a756..3bbb580fb275f 100644 --- a/impeller/compiler/shader_lib/impeller/texture.glsl +++ b/impeller/compiler/shader_lib/impeller/texture.glsl @@ -143,15 +143,6 @@ vec4 IPSampleDecal(sampler2D texture_sampler, vec2 coords) { return texture(texture_sampler, coords); } -/// Sample a texture with decal tile mode. -f16vec4 IPHalfSampleDecal(f16sampler2D texture_sampler, f16vec2 coords) { - if (any(lessThan(coords, f16vec2(0.0hf))) || - any(greaterThanEqual(coords, f16vec2(1.0)))) { - return f16vec4(0.0); - } - return texture(texture_sampler, coords); -} - /// Sample a texture, emulating a specific tile mode. /// /// This is useful for Impeller graphics backend that don't have native support diff --git a/impeller/entity/shaders/border_mask_blur.frag b/impeller/entity/shaders/border_mask_blur.frag index e0e89b2e8edc5..b28dfc8210380 100644 --- a/impeller/entity/shaders/border_mask_blur.frag +++ b/impeller/entity/shaders/border_mask_blur.frag @@ -15,42 +15,42 @@ // integral (using an erf approximation) to the 4 edges of the UV rectangle and // multiplying them. -uniform f16sampler2D texture_sampler; +uniform sampler2D texture_sampler; uniform FragInfo { - float16_t src_factor; - float16_t inner_blur_factor; - float16_t outer_blur_factor; + float src_factor; + float inner_blur_factor; + float outer_blur_factor; - f16vec2 sigma_uv; + vec2 sigma_uv; } frag_info; -in f16vec2 v_texture_coords; +in vec2 v_texture_coords; -out f16vec4 frag_color; +out vec4 frag_color; -float16_t BoxBlurMask(f16vec2 uv) { +float BoxBlurMask(vec2 uv) { // LTRB - return IPGaussianIntegral(uv.x, frag_info.sigma_uv.x) * // - IPGaussianIntegral(uv.y, frag_info.sigma_uv.y) * // - IPGaussianIntegral(1.0hf - uv.x, frag_info.sigma_uv.x) * // - IPGaussianIntegral(1.0hf - uv.y, frag_info.sigma_uv.y); + return IPGaussianIntegral(uv.x, frag_info.sigma_uv.x) * // + IPGaussianIntegral(uv.y, frag_info.sigma_uv.y) * // + IPGaussianIntegral(1 - uv.x, frag_info.sigma_uv.x) * // + IPGaussianIntegral(1 - uv.y, frag_info.sigma_uv.y); } void main() { - f16vec4 image_color = texture(texture_sampler, v_texture_coords); - float16_t blur_factor = BoxBlurMask(v_texture_coords); + vec4 image_color = texture(texture_sampler, v_texture_coords); + float blur_factor = BoxBlurMask(v_texture_coords); - float16_t within_bounds = - float16_t(v_texture_coords.x >= 0.0hf && v_texture_coords.y >= 0.0hf && - v_texture_coords.x < 1.0hf && v_texture_coords.y < 1.0hf); - float16_t inner_factor = + float within_bounds = + float(v_texture_coords.x >= 0 && v_texture_coords.y >= 0 && + v_texture_coords.x < 1 && v_texture_coords.y < 1); + float inner_factor = (frag_info.inner_blur_factor * blur_factor + frag_info.src_factor) * within_bounds; - float16_t outer_factor = - frag_info.outer_blur_factor * blur_factor * (1.0hf - within_bounds); + float outer_factor = + frag_info.outer_blur_factor * blur_factor * (1 - within_bounds); - float16_t mask_factor = inner_factor + outer_factor; + float mask_factor = inner_factor + outer_factor; frag_color = image_color * mask_factor; } diff --git a/impeller/entity/shaders/border_mask_blur.vert b/impeller/entity/shaders/border_mask_blur.vert index 74b9ae422483a..bff59a4747e65 100644 --- a/impeller/entity/shaders/border_mask_blur.vert +++ b/impeller/entity/shaders/border_mask_blur.vert @@ -15,10 +15,10 @@ frame_info; in vec2 vertices; in vec2 texture_coords; -out f16vec2 v_texture_coords; +out vec2 v_texture_coords; void main() { gl_Position = frame_info.mvp * vec4(vertices, 0.0, 1.0); - v_texture_coords = f16vec2( - IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale)); + v_texture_coords = + IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale); } diff --git a/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl b/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl index c99adfe449e24..4a218303efc27 100644 --- a/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl +++ b/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl @@ -18,52 +18,52 @@ #include #include -uniform f16sampler2D texture_sampler; +uniform sampler2D texture_sampler; uniform BlurInfo { - f16vec2 texture_size; - f16vec2 blur_direction; + vec2 texture_size; + vec2 blur_direction; // The blur sigma and radius have a linear relationship which is defined // host-side, but both are useful controls here. Sigma (pixels per standard // deviation) is used to define the gaussian function itself, whereas the // radius is used to limit how much of the function is integrated. - float16_t blur_sigma; - float16_t blur_radius; + float blur_sigma; + float blur_radius; } blur_info; #if ENABLE_ALPHA_MASK -uniform f16sampler2D alpha_mask_sampler; +uniform sampler2D alpha_mask_sampler; uniform MaskInfo { - float16_t src_factor; - float16_t inner_blur_factor; - float16_t outer_blur_factor; + float src_factor; + float inner_blur_factor; + float outer_blur_factor; } mask_info; #endif -f16vec4 Sample(f16sampler2D tex, f16vec2 coords) { +vec4 Sample(sampler2D tex, vec2 coords) { #if ENABLE_DECAL_SPECIALIZATION - return IPHalfSampleDecal(tex, coords); + return IPSampleDecal(tex, coords); #else return texture(tex, coords); #endif } -in f16vec2 v_texture_coords; -in f16vec2 v_src_texture_coords; +in vec2 v_texture_coords; +in vec2 v_src_texture_coords; -out f16vec4 frag_color; +out vec4 frag_color; void main() { - f16vec4 total_color = f16vec4(0.0hf); - float16_t gaussian_integral = 0.0hf; - f16vec2 blur_uv_offset = blur_info.blur_direction / blur_info.texture_size; + vec4 total_color = vec4(0); + float gaussian_integral = 0; + vec2 blur_uv_offset = blur_info.blur_direction / blur_info.texture_size; - for (float16_t i = -blur_info.blur_radius; i <= blur_info.blur_radius; i++) { - float16_t gaussian = IPGaussian(i, blur_info.blur_sigma); + for (float i = -blur_info.blur_radius; i <= blur_info.blur_radius; i++) { + float gaussian = IPGaussian(i, blur_info.blur_sigma); gaussian_integral += gaussian; total_color += gaussian * @@ -75,12 +75,11 @@ void main() { frag_color = total_color / gaussian_integral; #if ENABLE_ALPHA_MASK - f16vec4 src_color = Sample(alpha_mask_sampler, // sampler - v_src_texture_coords // texture coordinates + vec4 src_color = Sample(alpha_mask_sampler, // sampler + v_src_texture_coords // texture coordinates ); - float16_t blur_factor = - mask_info.inner_blur_factor * float16_t(src_color.a > 0.0hf) + - mask_info.outer_blur_factor * float16_t(src_color.a == 0.0hf); + float blur_factor = mask_info.inner_blur_factor * float(src_color.a > 0) + + mask_info.outer_blur_factor * float(src_color.a == 0); frag_color = frag_color * blur_factor + src_color * mask_info.src_factor; #endif diff --git a/impeller/entity/shaders/gaussian_blur/gaussian_blur.vert b/impeller/entity/shaders/gaussian_blur/gaussian_blur.vert index 96b2ccf5c66c8..f402003b13bf1 100644 --- a/impeller/entity/shaders/gaussian_blur/gaussian_blur.vert +++ b/impeller/entity/shaders/gaussian_blur/gaussian_blur.vert @@ -16,13 +16,13 @@ in vec2 vertices; in vec2 texture_coords; in vec2 src_texture_coords; -out f16vec2 v_texture_coords; -out f16vec2 v_src_texture_coords; +out vec2 v_texture_coords; +out vec2 v_src_texture_coords; void main() { gl_Position = frame_info.mvp * vec4(vertices, 0.0, 1.0); - v_texture_coords = f16vec2( - IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale)); - v_src_texture_coords = f16vec2(IPRemapCoords( - src_texture_coords, frame_info.alpha_mask_sampler_y_coord_scale)); + v_texture_coords = + IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale); + v_src_texture_coords = IPRemapCoords( + src_texture_coords, frame_info.alpha_mask_sampler_y_coord_scale); } diff --git a/impeller/entity/shaders/rrect_blur.frag b/impeller/entity/shaders/rrect_blur.frag index 9b342481187b2..5b0ddff80976c 100644 --- a/impeller/entity/shaders/rrect_blur.frag +++ b/impeller/entity/shaders/rrect_blur.frag @@ -6,61 +6,58 @@ #include uniform FragInfo { - f16vec4 color; - f16vec2 rect_size; - float16_t blur_sigma; - float16_t corner_radius; + vec4 color; + float blur_sigma; + vec2 rect_size; + float corner_radius; } frag_info; -in f16vec2 v_position; +in vec2 v_position; -out f16vec4 frag_color; +out vec4 frag_color; const int kSampleCount = 4; -float16_t RRectDistance(f16vec2 sample_position, f16vec2 half_size) { - f16vec2 space = abs(sample_position) - half_size + frag_info.corner_radius; - return length(max(space, float16_t(0.0hf))) + - min(max(space.x, space.y), float16_t(0.0hf)) - frag_info.corner_radius; +float RRectDistance(vec2 sample_position, vec2 half_size) { + vec2 space = abs(sample_position) - half_size + frag_info.corner_radius; + return length(max(space, 0.0)) + min(max(space.x, space.y), 0.0) - + frag_info.corner_radius; } /// Closed form unidirectional rounded rect blur mask solution using the /// analytical Gaussian integral (with approximated erf). -float16_t RRectShadowX(f16vec2 sample_position, f16vec2 half_size) { +float RRectShadowX(vec2 sample_position, vec2 half_size) { // Compute the X direction distance field (not incorporating the Y distance) // for the rounded rect. - float16_t space = - min(float16_t(0.0hf), - half_size.y - frag_info.corner_radius - abs(sample_position.y)); - float16_t rrect_distance = + float space = + min(0, half_size.y - frag_info.corner_radius - abs(sample_position.y)); + float rrect_distance = half_size.x - frag_info.corner_radius + - sqrt(max( - float16_t(0.0hf), - frag_info.corner_radius * frag_info.corner_radius - space * space)); + sqrt(max(0, frag_info.corner_radius * frag_info.corner_radius - + space * space)); // Map the linear distance field to the approximate Gaussian integral. - f16vec2 integral = IPVec2FastGaussianIntegral( - sample_position.x + f16vec2(-rrect_distance, rrect_distance), + vec2 integral = IPVec2FastGaussianIntegral( + sample_position.x + vec2(-rrect_distance, rrect_distance), frag_info.blur_sigma); return integral.y - integral.x; } -float16_t RRectShadow(f16vec2 sample_position, f16vec2 half_size) { +float RRectShadow(vec2 sample_position, vec2 half_size) { // Limit the sampling range to 3 standard deviations in the Y direction from // the kernel center to incorporate 99.7% of the color contribution. - float16_t half_sampling_range = frag_info.blur_sigma * 3.0hf; + float half_sampling_range = frag_info.blur_sigma * 3; - float16_t begin_y = - max(-half_sampling_range, sample_position.y - half_size.y); - float16_t end_y = min(half_sampling_range, sample_position.y + half_size.y); - float16_t interval = (end_y - begin_y) / float16_t(kSampleCount); + float begin_y = max(-half_sampling_range, sample_position.y - half_size.y); + float end_y = min(half_sampling_range, sample_position.y + half_size.y); + float interval = (end_y - begin_y) / kSampleCount; // Sample the X blur kSampleCount times, weighted by the Gaussian function. - float16_t result = 0.0hf; + float result = 0; for (int sample_i = 0; sample_i < kSampleCount; sample_i++) { - float16_t y = begin_y + interval * (float16_t(sample_i) + 0.5hf); - result += RRectShadowX(f16vec2(sample_position.x, sample_position.y - y), + float y = begin_y + interval * (sample_i + 0.5); + result += RRectShadowX(vec2(sample_position.x, sample_position.y - y), half_size) * IPGaussian(y, frag_info.blur_sigma) * interval; } @@ -71,10 +68,10 @@ float16_t RRectShadow(f16vec2 sample_position, f16vec2 half_size) { void main() { frag_color = frag_info.color; - f16vec2 half_size = frag_info.rect_size * 0.5hf; - f16vec2 sample_position = v_position - half_size; + vec2 half_size = frag_info.rect_size * 0.5; + vec2 sample_position = v_position - half_size; - if (frag_info.blur_sigma > 0.0hf) { + if (frag_info.blur_sigma > 0) { frag_color *= RRectShadow(sample_position, half_size); } else { frag_color *= -RRectDistance(sample_position, half_size); diff --git a/impeller/entity/shaders/rrect_blur.vert b/impeller/entity/shaders/rrect_blur.vert index 6ca9e06bba4b8..87382f6b4dcbe 100644 --- a/impeller/entity/shaders/rrect_blur.vert +++ b/impeller/entity/shaders/rrect_blur.vert @@ -11,10 +11,10 @@ frame_info; in vec2 position; -out f16vec2 v_position; +out vec2 v_position; void main() { gl_Position = frame_info.mvp * vec4(position, 0.0, 1.0); // The fragment stage uses local coordinates to compute the blur. - v_position = f16vec2(position); + v_position = position; } diff --git a/impeller/tools/malioc.json b/impeller/tools/malioc.json index 2b2b1231a5cf4..9f6418deddff2 100644 --- a/impeller/tools/malioc.json +++ b/impeller/tools/malioc.json @@ -1440,7 +1440,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 44, + "fp16_arithmetic": 5, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -1448,8 +1448,8 @@ "arith_fma" ], "longest_path_cycles": [ - 0.875, - 0.875, + 0.8125, + 0.8125, 0.203125, 0.25, 0.0, @@ -1470,8 +1470,8 @@ "arith_fma" ], "shortest_path_cycles": [ - 0.875, - 0.875, + 0.8125, + 0.8125, 0.203125, 0.25, 0.0, @@ -1483,8 +1483,8 @@ "arith_fma" ], "total_cycles": [ - 0.875, - 0.875, + 0.8125, + 0.8125, 0.203125, 0.25, 0.0, @@ -1495,7 +1495,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 12, - "work_registers_used": 18 + "work_registers_used": 22 } } } @@ -5806,7 +5806,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 86, + "fp16_arithmetic": 10, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -5814,9 +5814,9 @@ "arith_fma" ], "longest_path_cycles": [ - 0.90625, - 0.90625, - 0.265625, + 0.8125, + 0.8125, + 0.234375, 0.25, 0.0, 0.25, @@ -5836,9 +5836,9 @@ "arith_fma" ], "shortest_path_cycles": [ - 0.90625, - 0.90625, - 0.234375, + 0.8125, + 0.8125, + 0.203125, 0.25, 0.0, 0.25, @@ -5849,9 +5849,9 @@ "arith_fma" ], "total_cycles": [ - 0.90625, - 0.90625, - 0.265625, + 0.8125, + 0.8125, + 0.234375, 0.25, 0.0, 0.25, @@ -5860,8 +5860,8 @@ }, "stack_spill_bytes": 0, "thread_occupancy": 100, - "uniform_registers_used": 12, - "work_registers_used": 29 + "uniform_registers_used": 10, + "work_registers_used": 32 } } }, @@ -5906,7 +5906,7 @@ }, "thread_occupancy": 100, "uniform_registers_used": 1, - "work_registers_used": 2 + "work_registers_used": 3 } } } @@ -6636,7 +6636,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 68, + "fp16_arithmetic": 63, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -6662,13 +6662,14 @@ ], "shortest_path_bound_pipelines": [ "arith_total", + "arith_cvt", "arith_sfu", "varying" ], "shortest_path_cycles": [ 0.25, - 0.15625, - 0.1875, + 0.171875, + 0.25, 0.25, 0.0, 0.25, @@ -6683,7 +6684,7 @@ "total_cycles": [ 0.5, 0.359375, - 0.421875, + 0.484375, 0.5, 0.0, 0.5, @@ -6692,7 +6693,7 @@ }, "stack_spill_bytes": 0, "thread_occupancy": 100, - "uniform_registers_used": 12, + "uniform_registers_used": 10, "work_registers_used": 21 } } @@ -6723,7 +6724,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 3.9600000381469727, + 4.619999885559082, 2.0, 0.0 ], @@ -6731,7 +6732,7 @@ "arithmetic" ], "total_cycles": [ - 8.0, + 8.666666984558105, 2.0, 2.0 ] @@ -6756,7 +6757,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 64, + "fp16_arithmetic": 58, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -6785,9 +6786,9 @@ "texture" ], "shortest_path_cycles": [ - 0.15625, - 0.15625, - 0.09375, + 0.171875, + 0.171875, + 0.109375, 0.0625, 0.0, 0.25, @@ -6800,7 +6801,7 @@ "total_cycles": [ 0.359375, 0.359375, - 0.21875, + 0.234375, 0.125, 0.0, 0.5, @@ -6810,7 +6811,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 12, - "work_registers_used": 19 + "work_registers_used": 20 } } }, @@ -6840,7 +6841,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 2.9700000286102295, + 3.299999952316284, 2.0, 1.0 ], @@ -6848,14 +6849,14 @@ "arithmetic" ], "total_cycles": [ - 5.0, + 5.333333492279053, 2.0, 2.0 ] }, "thread_occupancy": 100, "uniform_registers_used": 2, - "work_registers_used": 3 + "work_registers_used": 4 } } } @@ -6873,7 +6874,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 70, + "fp16_arithmetic": 61, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -6899,13 +6900,12 @@ ], "shortest_path_bound_pipelines": [ "arith_total", - "arith_cvt", - "arith_sfu" + "arith_cvt" ], "shortest_path_cycles": [ - 0.0625, - 0.03125, - 0.0625, + 0.078125, + 0.046875, + 0.078125, 0.0625, 0.0, 0.0, @@ -6918,7 +6918,7 @@ "total_cycles": [ 0.3125, 0.234375, - 0.28125, + 0.296875, 0.3125, 0.0, 0.25, @@ -6958,7 +6958,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 2.309999942779541, + 2.9700000286102295, 1.0, 0.0 ], @@ -6966,7 +6966,7 @@ "arithmetic" ], "total_cycles": [ - 6.0, + 6.666666507720947, 1.0, 1.0 ] @@ -6991,7 +6991,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 66, + "fp16_arithmetic": 57, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -7017,13 +7017,12 @@ ], "shortest_path_bound_pipelines": [ "arith_total", - "arith_cvt", - "arith_sfu" + "arith_cvt" ], "shortest_path_cycles": [ - 0.0625, - 0.03125, - 0.0625, + 0.078125, + 0.046875, + 0.078125, 0.0625, 0.0, 0.0, @@ -7036,7 +7035,7 @@ "total_cycles": [ 0.234375, 0.234375, - 0.1875, + 0.203125, 0.125, 0.0, 0.25, @@ -7046,7 +7045,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 8, - "work_registers_used": 19 + "work_registers_used": 20 } } }, @@ -7076,7 +7075,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 1.9800000190734863, + 2.309999942779541, 1.0, 0.0 ], @@ -7084,14 +7083,14 @@ "arithmetic" ], "total_cycles": [ - 4.0, + 4.333333492279053, 1.0, 1.0 ] }, "thread_occupancy": 100, "uniform_registers_used": 1, - "work_registers_used": 3 + "work_registers_used": 4 } } } @@ -8921,17 +8920,17 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 68, + "fp16_arithmetic": 33, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ "arith_total", - "arith_sfu" + "arith_fma" ], "longest_path_cycles": [ - 1.5, - 1.3875000476837158, - 0.737500011920929, + 1.5125000476837158, + 1.5125000476837158, + 0.546875, 1.5, 0.0, 0.125, @@ -8961,12 +8960,12 @@ ], "total_bound_pipelines": [ "arith_total", - "arith_sfu" + "arith_fma" ], "total_cycles": [ - 1.5625, - 1.5125000476837158, - 0.762499988079071, + 1.6375000476837158, + 1.6375000476837158, + 0.578125, 1.5625, 0.0, 0.125, @@ -8975,7 +8974,7 @@ }, "stack_spill_bytes": 0, "thread_occupancy": 100, - "uniform_registers_used": 16, + "uniform_registers_used": 20, "work_registers_used": 32 } } @@ -8990,12 +8989,12 @@ "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ - "arithmetic" + null ], "longest_path_cycles": [ - 22.110000610351562, - 1.0, - 0.0 + null, + null, + null ], "pipelines": [ "arithmetic", @@ -9014,14 +9013,14 @@ "arithmetic" ], "total_cycles": [ - 10.0, + 10.666666984558105, 1.0, 0.0 ] }, "thread_occupancy": 100, - "uniform_registers_used": 2, - "work_registers_used": 3 + "uniform_registers_used": 1, + "work_registers_used": 4 } } } @@ -12274,17 +12273,17 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 65, + "fp16_arithmetic": 37, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ "arith_total", - "arith_sfu" + "arith_fma" ], "longest_path_cycles": [ - 1.5, - 1.4249999523162842, - 0.699999988079071, + 1.5499999523162842, + 1.5499999523162842, + 0.515625, 1.5, 0.0, 0.125, @@ -12314,12 +12313,12 @@ ], "total_bound_pipelines": [ "arith_total", - "arith_sfu" + "arith_fma" ], "total_cycles": [ - 1.5625, - 1.5499999523162842, - 0.75, + 1.6749999523162842, + 1.6749999523162842, + 0.5625, 1.5625, 0.0, 0.125, @@ -12329,7 +12328,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 18, - "work_registers_used": 31 + "work_registers_used": 32 } } }