From 8c5c428622ee62dc9dc4243adee7df6c2ed385ca Mon Sep 17 00:00:00 2001
From: Roman Koshelev <roman.koshelev@bk.ru>
Date: Sun, 12 Sep 2021 18:07:16 +0300
Subject: [PATCH 1/2] Make copy2() constexpr

---
 include/fmt/format.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/include/fmt/format.h b/include/fmt/format.h
index 5402c2808fb1..db0986a287a1 100644
--- a/include/fmt/format.h
+++ b/include/fmt/format.h
@@ -1049,11 +1049,19 @@ inline auto equal2(const char* lhs, const char* rhs) -> bool {
 }
 
 // Copies two characters from src to dst.
-template <typename Char> void copy2(Char* dst, const char* src) {
-  *dst++ = static_cast<Char>(*src++);
-  *dst = static_cast<Char>(*src);
+template <typename Char>
+FMT_CONSTEXPR20 FMT_INLINE void copy2(Char* dst, const char* src) {
+  if (!is_constant_evaluated() && std::is_same<Char, char>::value) {
+    memcpy(dst, src, 2);  // Runtime-only fast path for plain char.
+  } else {
+    // Read both source chars before writing either, so the compiler can use a
+    // single load/store pair even when dst may alias src (e.g. Char is char).
+    char dc0 = *src++;
+    char dc1 = *src;
+    *dst++ = static_cast<Char>(dc0);
+    *dst = static_cast<Char>(dc1);
+  }
+}
-FMT_INLINE void copy2(char* dst, const char* src) { memcpy(dst, src, 2); }
 
 template <typename Iterator> struct format_decimal_result {
   Iterator begin;

From 9052207625336cd8e09d8a4da5e68988a951d708 Mon Sep 17 00:00:00 2001
From: Roman Koshelev <roman.koshelev@bk.ru>
Date: Sun, 12 Sep 2021 18:09:05 +0300
Subject: [PATCH 2/2] Removed redundant format_decimal implementation for
 constexpr context

---
 include/fmt/format.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/include/fmt/format.h b/include/fmt/format.h
index db0986a287a1..2621cf848003 100644
--- a/include/fmt/format.h
+++ b/include/fmt/format.h
@@ -1077,14 +1077,6 @@ FMT_CONSTEXPR20 auto format_decimal(Char* out, UInt value, int size)
   FMT_ASSERT(size >= count_digits(value), "invalid digit count");
   out += size;
   Char* end = out;
-  if (is_constant_evaluated()) {
-    while (value >= 10) {
-      *--out = static_cast<Char>('0' + value % 10);
-      value /= 10;
-    }
-    *--out = static_cast<Char>('0' + value);
-    return {out, end};
-  }
   while (value >= 100) {
     // Integer division is slow so do it for a group of two digits instead
     // of for every digit. The idea comes from the talk by Alexandrescu