fmtlib · vitaut · Aug 24, 2022 · Aug 18, 2022 · vitaut · Aug 20, 2022
diff --git a/include/fmt/format.h b/include/fmt/format.h
@@ -602,6 +602,7 @@ FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
  */
 FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
     -> const char* {
+  constexpr const int prefix_masks[] = {0x00, 0x80, 0xe0, 0xf0, 0xf8};
   constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
   constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
   constexpr const int shiftc[] = {0, 18, 12, 6, 0};
@@ -628,6 +629,8 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
   *e |= uchar(s[3]) >> 6;
   *e ^= 0x2a;  // top two bits of each tail byte correct?
   *e >>= shifte[len];
+  *e |= ((uchar(s[0]) & prefix_masks[len]) !=
+         uchar((prefix_masks[len] << 1) & 0xFF));  // first byte correct?
 EXPECT_EQ(fmt::format("{}", vec{"\xf4\x8f\xbf\xc0"}), 
           "[\"\\xf4\\x8f\\xbf\\xc0\"]"); 
 EXPECT_EQ(fmt::format("{}", vec{"\xf4\x8f\xbf\xc0"}), 
           "[\"\\xf4\\x8f\\xbf\\xc0\"]"); 
 
   return next;
 }
@@ -643,8 +646,8 @@ FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
     auto error = 0;
     auto end = utf8_decode(buf_ptr, &cp, &error);
     bool result = f(error ? invalid_code_point : cp,
-                    string_view(ptr, to_unsigned(end - buf_ptr)));
-    return result ? end : nullptr;
+                    string_view(ptr, error ? 1 : to_unsigned(end - buf_ptr)));
+    return result ? (error ? buf_ptr + 1 : end) : nullptr;
   };
   auto p = s.data();
   const size_t block_size = 4;  // utf8_decode always reads blocks of 4 chars.

diff --git a/test/ranges-test.cc b/test/ranges-test.cc
@@ -380,8 +380,15 @@ TEST(ranges_test, escape_string) {
     EXPECT_EQ(fmt::format("{}", vec{"\xcd\xb8"}), "[\"\\u0378\"]");
     // Unassigned Unicode code points.
     EXPECT_EQ(fmt::format("{}", vec{"\xf0\xaa\x9b\x9e"}), "[\"\\U0002a6de\"]");
+    // Broken utf-8.
     EXPECT_EQ(fmt::format("{}", vec{"\xf4\x8f\xbf\xc0"}),
               "[\"\\xf4\\x8f\\xbf\\xc0\"]");
+    EXPECT_EQ(fmt::format("{}", vec{"\xf0\x28"}), "[\"\\xf0(\"]");
+    EXPECT_EQ(fmt::format("{}", vec{"\xe1\x28"}), "[\"\\xe1(\"]");
+    EXPECT_EQ(fmt::format("{}", vec{std::string("\xf0\x28\0\0anything", 12)}),
+              "[\"\\xf0(\\x00\\x00anything\"]");
+
+    // Correct utf-8.
     EXPECT_EQ(fmt::format("{}", vec{"понедельник"}), "[\"понедельник\"]");
   }
 }