Skip to content

Commit

Permalink
ENG-1433: Attempt to set HECs based on cgroup CPU limits
Browse files Browse the repository at this point in the history
PR-URL: hasura/graphql-engine-mono#11114
GitOrigin-RevId: 2ed158f44606af0245a0b334756e02c8d42866a7
  • Loading branch information
jberryman authored and hasura-bot committed Feb 5, 2025
1 parent de14150 commit ae67fae
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 0 deletions.
1 change: 1 addition & 0 deletions server/graphql-engine.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,7 @@ library
, Hasura.Server.Telemetry.Counters
, Hasura.Server.Auth.JWT
, Hasura.GC
, Hasura.CpuDetect

, Hasura.LogicalModelResolver.Codec
, Hasura.LogicalModelResolver.Lenses
Expand Down
3 changes: 3 additions & 0 deletions server/src-exec/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import Hasura.App.State
)
import Hasura.Backends.Postgres.Connection.MonadTx
import Hasura.Backends.Postgres.Connection.Settings
import Hasura.CpuDetect qualified as CpuDetect
import Hasura.GC qualified as GC
import Hasura.Logging (Hasura, LogLevel (..), defaultEnabledEngineLogTypes)
import Hasura.Prelude
Expand Down Expand Up @@ -112,6 +113,8 @@ runApp env (HGEOptions rci metadataDbUrl hgeCmd) = do

let Loggers _ logger _ = appEnvLoggers appEnv

liftIO $ CpuDetect.tryAutoSetNumCapabilities logger

_idleGCThread <-
C.forkImmortal "ourIdleGC" logger
$ GC.ourIdleGC logger (seconds 0.3) (seconds 10) (seconds 60)
Expand Down
150 changes: 150 additions & 0 deletions server/src-lib/Hasura/CpuDetect.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
-- | The haskell runtime does not consider cgroup limits (such as are used by
-- kubernetes and docker to size containers) when deciding the number of
-- capabilities to run with `+RTS -N`. This module is about trying to set a
-- good default for `-N` automatically at runtime by considering cgroup limits.
--
-- If we don't do this we risk poor performance as the OS deschedules our
-- capability threads, GC is performed less promptly, and we have extra memory
-- usage for the nursery on each unnecessary capability (and users will observe
-- this as increased memory usage if they bump up the size of the instance
-- hosting their pods, say, or even as slowly-growing, memory-leak-like
-- behavior).
module Hasura.CpuDetect (tryAutoSetNumCapabilities) where

import Control.Exception
import Data.SerializableBlob qualified as SB
import Data.String
import GHC.Conc
import Hasura.Logging
import Hasura.Prelude
import System.Directory (doesDirectoryExist, doesFileExist)

-- | Try to intelligently 'setNumCapabilities' taking into account cgroups CPU
-- limits. This should be called just once, near the top of each server Main.
--
-- Behaviour, in order:
--
-- * If the current number of capabilities differs from the number of
--   processors we assume `+RTS -N` was passed explicitly and change nothing.
-- * Otherwise we look up the cgroup (v1 or v2) CPU limit. When a limit is
--   set, capabilities are set to that limit rounded up, clamped to the
--   number of processors.
-- * A limit that would yield fewer than one capability is ignored with a
--   warning rather than handed to 'setNumCapabilities'.
-- * Any exception raised during cgroup detection is caught and logged as a
--   warning, and the capabilities are left as they were.
tryAutoSetNumCapabilities ::
  Logger Hasura ->
  IO ()
tryAutoSetNumCapabilities (Logger logger) = do
  -- the current `+RTS -N` value:
  capabilitiesBefore <- getNumCapabilities
  processors <- getNumProcessors
  -- Abort if the user set capabilities explicitly. Note we can't detect
  -- where they intentionally set this to the number of processors.
  if capabilitiesBefore /= processors
    then
      logg LevelInfo
        $ "It looks like `+RTS -N` was passed explicitly. Leaving haskell capabilities at "
        <> showBlob capabilitiesBefore
    else handle (handler capabilitiesBefore) (setBasedOnCgroups capabilitiesBefore)
  where
    -- Emit an unstructured log line at the given level.
    logg level = logger . UnstructuredLog level . SB.fromText

    -- 'show' a value directly into whatever string-like type is needed.
    showBlob :: (Show a, IsString b) => a -> b
    showBlob = fromString . show

    -- Detect the cgroup version, read the CPU limit and act on it. May throw;
    -- the caller wraps this in 'handle'.
    setBasedOnCgroups capabilitiesBefore = do
      detectCgroupVersion >>= \case
        Nothing ->
          logg LevelInfo
            $ "Could not detect cgroups. "
            <> leavingThingsAloneMsg capabilitiesBefore
        Just v -> do
          getCpuLimit v >>= \case
            InCgroupNoLimitSet ->
              logg LevelInfo
                $ "Running within a cgroup but no CPU limit detected. Leaving haskell capabilities at "
                <> showBlob capabilitiesBefore
            NotInCgroup ->
              logg LevelInfo
                $ "We don't appear to be running within a cgroup. Leaving haskell capabilities at "
                <> showBlob capabilitiesBefore
            InCgroupLimited cpuAllocation -> do
              processors <- getNumProcessors
              -- `floor` is also an option here, but rounding up just seems
              -- like the better/conservative choice here.
              -- Docker prevents setting cpuAllocation > processors, but I
              -- don't think that is disallowed in cgroups generally, so clamp
              -- here.
              let caps = min processors (ceiling cpuAllocation)
              if caps < 1
                then
                  -- A non-positive (or non-finite) limit cannot be a real CPU
                  -- allocation. 'setNumCapabilities' rejects counts below 1,
                  -- so leave things alone instead of failing, and do not
                  -- guess a value such as 1 from bogus input.
                  logg LevelWarn
                    $ "Ignoring implausible CPU cgroup limit of "
                    <> showBlob cpuAllocation
                    <> ". "
                    <> leavingThingsAloneMsg capabilitiesBefore
                else do
                  setNumCapabilities caps
                  logg LevelInfo
                    $ "Detected "
                    <> showBlob cpuAllocation
                    <> " CPU cgroup limit. "
                    <> "Setting number of haskell capabilities to: "
                    <> showBlob caps

    -- Pick the limit reader matching the detected cgroup version.
    getCpuLimit V1 = getCpuLimitV1
    getCpuLimit V2 = getCpuLimitV2

    -- Shared tail for messages where we give up and advise manual tuning.
    leavingThingsAloneMsg capabilitiesBefore =
      "Leaving haskell capabilities at "
        <> showBlob capabilitiesBefore
        <> ". If running in docker or kubernetes with CPU limits, for best performance you should start the engine "
        <> "with `+RTS -N<cpu_limit>` on the command line, or `GHCRTS=-N<cpu_limit>` in the environment."

    -- Catch-all for failures during detection: log and carry on with the
    -- capabilities we started with.
    handler :: Int -> SomeException -> IO ()
    handler capabilitiesBefore e =
      logg LevelWarn
        $ "Failed to automatically detect capabilities, with error: "
        <> showBlob e
        <> "\n"
        <> leavingThingsAloneMsg capabilitiesBefore
        <> "\n Please report this as a bug"

-- | Which generation of the cgroup interface was detected on this host.
data CgroupVersion
  = V1
  | V2
  deriving (Eq, Show)

-- | Try to detect which version of cgroups the OS is using, if any.
--
-- The presence of the file @/sys/fs/cgroup/cgroup.controllers@ is taken to
-- mean v2; failing that, an existing @/sys/fs/cgroup@ directory is taken to
-- mean v1; otherwise no cgroup support is reported.
detectCgroupVersion :: IO (Maybe CgroupVersion)
detectCgroupVersion = do
  controllersFilePresent <- doesFileExist "/sys/fs/cgroup/cgroup.controllers"
  cgroupDirPresent <- doesDirectoryExist "/sys/fs/cgroup"
  pure (classify controllersFilePresent cgroupDirPresent)
  where
    -- The controllers file wins over the bare directory check.
    classify :: Bool -> Bool -> Maybe CgroupVersion
    classify True _ = Just V2
    classify False True = Just V1
    classify False False = Nothing

-- | Outcome of looking up the cgroup CPU limit for this process.
data CPULimits
  = -- | We appear to be inside a cgroup, but no CPU limit is configured
    InCgroupNoLimitSet
  | -- | The process does not appear to be running inside a cgroup
    NotInCgroup
  | -- | We are limited to this many CPUs, possibly a fraction (likely
    -- running in docker or k8s)
    InCgroupLimited Double
  deriving (Eq, Show)

-- | Determine the CPU limit under cgroups v1 from the files
-- @cpu.cfs_quota_us@ and @cpu.cfs_period_us@ in @/sys/fs/cgroup/cpu@.
--
-- Returns 'NotInCgroup' when either file is missing, 'InCgroupNoLimitSet'
-- when the quota is @-1@, and otherwise 'InCgroupLimited' with
-- @quota / period@.
--
-- Lots of possible exceptions here: IO errors while reading, a 'userError'
-- when a file does not contain exactly one whitespace-delimited token, and a
-- parse failure from 'read' when a token is not a number.
getCpuLimitV1 :: IO CPULimits
getCpuLimitV1 = do
  quotaExists <- doesFileExist quotaPath
  periodExists <- doesFileExist periodPath
  if quotaExists && periodExists
    then do
      quota <- readSingleToken quotaPath
      period <- readSingleToken periodPath
      if quota == "-1"
        then return InCgroupNoLimitSet
        else do
          -- Force the division here so a parse failure surfaces inside this
          -- action rather than later at the use site.
          let !limit = read quota / read period
          return $ InCgroupLimited limit
    else return NotInCgroup
  where
    quotaPath, periodPath :: FilePath
    quotaPath = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
    periodPath = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"

    -- These files normally end with a newline, so the raw contents ("-1\n")
    -- would never compare equal to "-1". Split on whitespace first, which
    -- also reads the file to the end so the lazy handle is closed.
    readSingleToken :: FilePath -> IO String
    readSingleToken path = do
      contents <- readFile path
      case words contents of
        [token] -> return token
        _ -> throwIO $ userError $ "Unexpected format for " <> path <> ": " <> show contents

-- | Determine the CPU limit under cgroups v2 from @/sys/fs/cgroup/cpu.max@,
-- which is expected to hold two whitespace-separated fields: quota and period.
--
-- Returns 'NotInCgroup' when the file is missing, 'InCgroupNoLimitSet' when
-- the quota field is @max@, and otherwise 'InCgroupLimited' with
-- @quota / period@.
--
-- Lots of possible exceptions here: IO errors while reading, a 'userError'
-- for an unexpected number of fields, and a parse failure from 'read'.
getCpuLimitV2 :: IO CPULimits
getCpuLimitV2 =
  doesFileExist cpuMaxPath >>= \case
    False -> pure NotInCgroup
    True -> do
      raw <- readFile cpuMaxPath
      case words raw of
        ["max", _] -> pure InCgroupNoLimitSet
        [quotaField, periodField] -> do
          -- Evaluate eagerly so parse failures are raised from this action.
          let !cpus = read quotaField / read periodField
          pure (InCgroupLimited cpus)
        _ -> throwIO $ userError $ "Unexpected format for /sys/fs/cgroup/cpu.max: " <> show raw
  where
    cpuMaxPath :: FilePath
    cpuMaxPath = "/sys/fs/cgroup/cpu.max"

0 comments on commit ae67fae

Please sign in to comment.