-
Notifications
You must be signed in to change notification settings - Fork 76
Toggle primary read-only when disk capacity hits threshold #59
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
64b034e
6b2060a
00b96b2
9a4f04e
32a7daf
ffe51d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,9 +10,11 @@ import ( | |
"github.com/superfly/fly-checks/check" | ||
) | ||
|
||
// diskCapacityPercentageThreshold is the used-space percentage of /data/
// above which the primary is made read-only. The check compares with a
// strict greater-than, so usage of exactly this value does not trigger it.
const diskCapacityPercentageThreshold = 90.0 | ||
|
||
// CheckPostgreSQL health, replication, etc | ||
func CheckPostgreSQL(ctx context.Context, checks *check.CheckSuite) (*check.CheckSuite, error) { | ||
|
||
node, err := flypg.NewNode() | ||
if err != nil { | ||
return checks, errors.Wrap(err, "failed to initialize node") | ||
|
@@ -23,18 +25,65 @@ func CheckPostgreSQL(ctx context.Context, checks *check.CheckSuite) (*check.Chec | |
return checks, errors.Wrap(err, "failed to connect with local node") | ||
} | ||
|
||
repConn, err := node.RepMgr.NewLocalConnection(ctx) | ||
if err != nil { | ||
return checks, fmt.Errorf("failed to connect to repmgr node: %s", err) | ||
} | ||
|
||
member, err := node.RepMgr.Member(ctx, repConn) | ||
if err != nil { | ||
return checks, fmt.Errorf("failed to resolve local member role: %s", err) | ||
} | ||
|
||
// Cleanup connections | ||
checks.OnCompletion = func() { | ||
localConn.Close(ctx) | ||
repConn.Close(ctx) | ||
} | ||
|
||
checks.AddCheck("connections", func() (string, error) { | ||
return connectionCount(ctx, localConn) | ||
}) | ||
|
||
if member.Role == flypg.PrimaryRoleName && member.Active { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we want to expose this health check on replicas without setting them to read-only? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I feel like it might be useful info. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There's a VM check that communicates general capacity that should cover that. It makes me think, though, that maybe we need a new name for the check. |
||
// Check that provides additional insight into disk capacity and | ||
// how close we are to hitting the readonly threshold. | ||
checks.AddCheck("disk-capacity", func() (string, error) { | ||
return diskCapacityCheck(ctx, localConn, node) | ||
}) | ||
} | ||
|
||
return checks, nil | ||
} | ||
|
||
func diskCapacityCheck(ctx context.Context, localConn *pgx.Conn, node *flypg.Node) (string, error) { | ||
// Calculate current disk usage | ||
size, available, err := diskUsage("/data/") | ||
if err != nil { | ||
return "", fmt.Errorf("failed to calculate disk usage: %s", err) | ||
} | ||
|
||
usedPercentage := float64(size-available) / float64(size) * 100 | ||
|
||
// Turn primary read-only | ||
if usedPercentage > diskCapacityPercentageThreshold { | ||
if err := flypg.SetReadOnly(ctx, node, localConn); err != nil { | ||
return "", fmt.Errorf("failed to turn primary readonly: %s", err) | ||
} | ||
|
||
return "", fmt.Errorf("%0.1f%% - readonly mode enabled, extend your volume to re-enable writes", usedPercentage) | ||
} | ||
|
||
// Don't attempt to turn read/write if zombie lock exists. | ||
if !flypg.ZombieLockExists() { | ||
if err := flypg.UnsetReadOnly(ctx, node, localConn); err != nil { | ||
return "", fmt.Errorf("failed to turn primary read/write: %s", err) | ||
} | ||
} | ||
|
||
return fmt.Sprintf("%0.1f%% - readonly mode will be enabled at %0.1f%%", usedPercentage, diskCapacityPercentageThreshold), nil | ||
} | ||
|
||
func connectionCount(ctx context.Context, local *pgx.Conn) (string, error) { | ||
sql := `select used, res_for_super as reserved, max_conn as max from | ||
(select count(*) used from pg_stat_activity) q1, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
package flypg | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"os" | ||
"time" | ||
|
||
"github.com/fly-apps/postgres-flex/internal/flypg/admin" | ||
"github.com/jackc/pgx/v5" | ||
) | ||
|
||
// Constants used by the readonly toggle.
const ( | ||
// readOnlyLockFile is the path of the lock file written by
// writeReadOnlyLock, removed by removeReadOnlyLock and probed by
// ReadOnlyLockExists.
readOnlyLockFile = "/data/readonly.lock" | ||
// readOnlyEnabled and readOnlyDisabled are the values the result of
// `SHOW default_transaction_read_only` is compared against.
readOnlyEnabled = "on" | ||
readOnlyDisabled = "off" | ||
) | ||
|
||
func SetReadOnly(ctx context.Context, n *Node, conn *pgx.Conn) error { | ||
if err := writeReadOnlyLock(); err != nil { | ||
return fmt.Errorf("failed to set readonly lock: %s", err) | ||
} | ||
|
||
databases, err := admin.ListDatabases(ctx, conn) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
for _, db := range databases { | ||
// exclude administrative dbs | ||
if db.Name == "repmgr" || db.Name == "postgres" { | ||
continue | ||
} | ||
|
||
// Route configuration change through PGBouncer | ||
dbConn, err := n.NewPrimaryConnection(ctx, db.Name) | ||
if err != err { | ||
return fmt.Errorf("failed to establish connection to db %s: %s", db.Name, err) | ||
} | ||
defer dbConn.Close(ctx) | ||
|
||
// Set readonly | ||
if _, err = dbConn.Exec(ctx, "SET default_transaction_read_only=true;"); err != nil { | ||
return fmt.Errorf("failed to set readonly on db %s: %s", db.Name, err) | ||
} | ||
|
||
// Query configuration value and confirm the value change. | ||
var status string | ||
dbConn.QueryRow(ctx, "SHOW default_transaction_read_only;").Scan(&status) | ||
if err != nil { | ||
return fmt.Errorf("failed to verify readonly was unset: %s", err) | ||
} | ||
|
||
if status == readOnlyDisabled { | ||
return fmt.Errorf("failed to turn database '%s' readonly", db.Name) | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func UnsetReadOnly(ctx context.Context, n *Node, conn *pgx.Conn) error { | ||
// Skip if there's no readonly lock present | ||
if !ReadOnlyLockExists() { | ||
return nil | ||
} | ||
|
||
databases, err := admin.ListDatabases(ctx, conn) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
for _, db := range databases { | ||
// exclude administrative dbs | ||
if db.Name == "repmgr" || db.Name == "postgres" { | ||
continue | ||
} | ||
|
||
// Route configuration change through PGBouncer | ||
dbConn, err := n.NewPrimaryConnection(ctx, db.Name) | ||
if err != err { | ||
return fmt.Errorf("failed to establish connection to db %s: %s", db.Name, err) | ||
} | ||
defer dbConn.Close(ctx) | ||
|
||
// Disable readonly | ||
_, err = dbConn.Exec(ctx, "SET default_transaction_read_only=false;") | ||
if err != nil { | ||
return fmt.Errorf("failed to unset readonly on db %s: %s", db.Name, err) | ||
} | ||
|
||
// Query configuration value and confirm the value change. | ||
var status string | ||
dbConn.QueryRow(ctx, "SHOW default_transaction_read_only;").Scan(&status) | ||
if err != nil { | ||
return fmt.Errorf("failed to verify readonly was unset: %s", err) | ||
} | ||
|
||
if status == readOnlyEnabled { | ||
return fmt.Errorf("failed to turn database '%s' read/write : %s", db.Name, err) | ||
} | ||
} | ||
|
||
if err := removeReadOnlyLock(); err != nil { | ||
return fmt.Errorf("failed to remove readonly lock: %s", err) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func ReadOnlyLockExists() bool { | ||
_, err := os.Stat(readOnlyLockFile) | ||
if os.IsNotExist(err) { | ||
return false | ||
} | ||
|
||
return true | ||
} | ||
|
||
func writeReadOnlyLock() error { | ||
if ReadOnlyLockExists() { | ||
return nil | ||
} | ||
|
||
if err := os.WriteFile(readOnlyLockFile, []byte(time.Now().String()), 0644); err != nil { | ||
return err | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func removeReadOnlyLock() error { | ||
if !ReadOnlyLockExists() { | ||
return nil | ||
} | ||
|
||
if err := os.Remove(readOnlyLockFile); err != nil { | ||
return err | ||
} | ||
|
||
return nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've been meaning to make health-check stuff configurable, but that can be out of scope for this PR.