mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage

If swap is backed by network storage such as NBD, there is a risk that a
large number of reclaimers can hang the system by consuming all
PF_MEMALLOC reserves.  To avoid these hangs, the administrator must tune
min_free_kbytes in advance, which is a bit fragile.

This patch throttles direct reclaimers once half the PF_MEMALLOC reserves
are in use.  If the system routinely gets throttled, the administrator can
increase min_free_kbytes so that degradation is smoother, but either way
the system keeps running.
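
In code terms, the check this changelog describes boils down to summing the
min watermarks and free page counts of the zones that PF_MEMALLOC
allocations can use, and throttling once less than half of that reserve
remains free.  Below is a minimal userspace sketch of the condition; the
zone_stats struct and the function name are illustrative stand-ins, not the
kernel's types:

#include <stdbool.h>

/*
 * Simplified model of the check this patch adds as
 * pfmemalloc_watermark_ok(); the real code walks
 * pgdat->node_zones[0..ZONE_NORMAL] and uses min_wmark_pages() and
 * zone_page_state().  The struct below is illustrative only.
 */
struct zone_stats {
	unsigned long min_wmark_pages;	/* zone's WMARK_MIN, in pages */
	unsigned long nr_free_pages;	/* zone's NR_FREE_PAGES counter */
};

static bool pfmemalloc_watermark_ok_model(const strustruct zone_stats *zones,
					  int nr_lower_zones)
{
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;

	/* Only the zones usable for PF_MEMALLOC allocations count */
	for (i = 0; i < nr_lower_zones; i++) {
		pfmemalloc_reserve += zones[i].min_wmark_pages;
		free_pages += zones[i].nr_free_pages;
	}

	/* Throttle direct reclaim once half the reserve is in use */
	return free_pages > pfmemalloc_reserve / 2;
}

In the patch itself this logic is pfmemalloc_watermark_ok() in mm/vmscan.c,
shown in the diff below.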

Signed-off-by: Mel Gorman <[email protected]>
Cc: David Miller <[email protected]>
Cc: Neil Brown <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Mike Christie <[email protected]>
Cc: Eric B Munson <[email protected]>
Cc: Eric Dumazet <[email protected]>
Cc: Sebastian Andrzej Siewior <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Christoph Lameter <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
Mel Gorman authored and torvalds committed Aug 1, 2012
1 parent 7f338fe commit 5515061
Showing 3 changed files with 122 additions and 8 deletions.
1 change: 1 addition & 0 deletions include/linux/mmzone.h
@@ -705,6 +705,7 @@ typedef struct pglist_data {
 					     range, including holes */
 	int node_id;
 	wait_queue_head_t kswapd_wait;
+	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
1 change: 1 addition & 0 deletions mm/page_alloc.c
@@ -4389,6 +4389,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat->kswapd_max_order = 0;
 	pgdat_page_cgroup_init(pgdat);
 
128 changes: 120 additions & 8 deletions mm/vmscan.c
@@ -2112,6 +2112,80 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+	bool wmark_ok;
+
+	for (i = 0; i <= ZONE_NORMAL; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+	/* kswapd must be awake if processes are being throttled */
+	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+		pgdat->classzone_idx = min(pgdat->classzone_idx,
+						(enum zone_type)ZONE_NORMAL);
+		wake_up_interruptible(&pgdat->kswapd_wait);
+	}
+
+	return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if the backing storage is network-based and
+ * the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached.
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	pg_data_t *pgdat;
+
+	/*
+	 * Kernel threads should not be throttled as they may be indirectly
+	 * responsible for cleaning pages necessary for reclaim to make forward
+	 * progress. kjournald for example may enter direct reclaim while
+	 * committing a transaction where throttling it could force other
+	 * processes to block on log_wait_commit().
+	 */
+	if (current->flags & PF_KTHREAD)
+		return;
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	pgdat = zone->zone_pgdat;
+	if (pfmemalloc_watermark_ok(pgdat))
+		return;
+
+	/*
+	 * If the caller cannot enter the filesystem, it's possible that it
+	 * is due to the caller holding an FS lock or performing a journal
+	 * transaction in the case of a filesystem like ext[3|4]. In this case,
+	 * it is not safe to block on pfmemalloc_wait as kswapd could be
+	 * blocked waiting on the same lock. Instead, throttle for up to a
+	 * second before continuing.
+	 */
+	if (!(gfp_mask & __GFP_FS)) {
+		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+			pfmemalloc_watermark_ok(pgdat), HZ);
+		return;
+	}
+
+	/* Throttle until kswapd wakes the process */
+	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(pgdat));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		gfp_t gfp_mask, nodemask_t *nodemask)
 {
@@ -2131,6 +2205,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.gfp_mask = sc.gfp_mask,
 	};
 
+	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+	/*
+	 * Do not enter reclaim if a fatal signal is pending. 1 is returned
+	 * so that the page allocator does not consider triggering OOM.
+	 */
+	if (fatal_signal_pending(current))
+		return 1;
+
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
@@ -2275,8 +2358,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 	return balanced_pages >= (present_pages >> 2);
 }
 
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep.
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 					int classzone_idx)
 {
 	int i;
@@ -2285,7 +2373,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
-		return true;
+		return false;
+
+	/*
+	 * There is a potential race between when kswapd checks its watermarks
+	 * and a process gets throttled. There is also a potential race if
+	 * processes get throttled, kswapd wakes, and a large process exits,
+	 * thereby balancing the zones, which causes kswapd to miss a wakeup.
+	 * If kswapd is going to sleep, no process should be sleeping on
+	 * pfmemalloc_wait, so wake them now if necessary. If necessary,
+	 * processes will wake kswapd and get throttled again.
+	 */
+	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+		wake_up(&pgdat->pfmemalloc_wait);
+		return false;
+	}
 
 	/* Check the watermark levels */
 	for (i = 0; i <= classzone_idx; i++) {
@@ -2318,9 +2420,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 	 * must be balanced
 	 */
 	if (order)
-		return !pgdat_balanced(pgdat, balanced, classzone_idx);
+		return pgdat_balanced(pgdat, balanced, classzone_idx);
 	else
-		return !all_zones_ok;
+		return all_zones_ok;
 }
 
 /*
@@ -2546,6 +2648,16 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 		}
 
 	}
+
+	/*
+	 * If the low watermark is met there is no need for processes
+	 * to be throttled on pfmemalloc_wait as they should now be
+	 * able to safely make forward progress. Wake them.
+	 */
+	if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+	    pfmemalloc_watermark_ok(pgdat))
+		wake_up(&pgdat->pfmemalloc_wait);
+
 	if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 		break;		/* kswapd: all done */
 	/*
@@ -2647,7 +2759,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	}
 
 	/*
-	 * Return the order we were reclaiming at so sleeping_prematurely()
+	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,
 	 * if another caller entered the allocator slow path while kswapd
 	 * was awake, order will remain at the higher level
@@ -2667,7 +2779,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2677,7 +2789,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
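To get a feel for when the new throttle engages, consider a rough worked
example (assuming 4 KiB pages, and simplifying by treating min_free_kbytes
as landing entirely in the zones at or below ZONE_NORMAL; the kernel
actually distributes the min watermark across zones in proportion to their
size, so this is an approximation):

#include <stdio.h>

/*
 * Rough estimate of the throttle threshold implied by min_free_kbytes.
 * Assumes 4 KiB pages and ignores the per-zone distribution of the
 * watermark, so treat the result as an approximation, not what the
 * kernel computes exactly.
 */
int main(void)
{
	unsigned long min_free_kbytes = 65536;	/* hypothetical sysctl value */
	unsigned long page_kb = 4;		/* assumed page size in KiB */
	unsigned long reserve_pages = min_free_kbytes / page_kb;

	printf("reserve ~%lu pages; throttle below ~%lu pages (~%lu MiB free)\n",
	       reserve_pages, reserve_pages / 2,
	       reserve_pages / 2 * page_kb / 1024);
	return 0;
}

With min_free_kbytes = 65536 this prints a reserve of about 16384 pages and
a throttle threshold of about 8192 pages (32 MiB of free memory), which is
why raising min_free_kbytes is the knob the changelog suggests when
throttling happens routinely.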
