Skip to content

Commit 5515061

Browse files
Mel Gormantorvalds
Mel Gorman
authored andcommitted
mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
If swap is backed by network storage such as NBD, there is a risk that a large number of reclaimers can hang the system by consuming all PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune min_free_kbytes in advance which is a bit fragile. This patch throttles direct reclaimers if half the PF_MEMALLOC reserves are in use. If the system is routinely getting throttled the system administrator can increase min_free_kbytes so degradation is smoother but the system will keep running. Signed-off-by: Mel Gorman <[email protected]> Cc: David Miller <[email protected]> Cc: Neil Brown <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Mike Christie <[email protected]> Cc: Eric B Munson <[email protected]> Cc: Eric Dumazet <[email protected]> Cc: Sebastian Andrzej Siewior <[email protected]> Cc: Mel Gorman <[email protected]> Cc: Christoph Lameter <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 7f338fe commit 5515061

File tree

3 files changed

+122
-8
lines changed

3 files changed

+122
-8
lines changed

include/linux/mmzone.h

+1
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,7 @@ typedef struct pglist_data {
705705
range, including holes */
706706
int node_id;
707707
wait_queue_head_t kswapd_wait;
708+
wait_queue_head_t pfmemalloc_wait;
708709
struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
709710
int kswapd_max_order;
710711
enum zone_type classzone_idx;

mm/page_alloc.c

+1
Original file line numberDiff line numberDiff line change
@@ -4389,6 +4389,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
43894389
pgdat_resize_init(pgdat);
43904390
pgdat->nr_zones = 0;
43914391
init_waitqueue_head(&pgdat->kswapd_wait);
4392+
init_waitqueue_head(&pgdat->pfmemalloc_wait);
43924393
pgdat->kswapd_max_order = 0;
43934394
pgdat_page_cgroup_init(pgdat);
43944395

mm/vmscan.c

+120-8
Original file line numberDiff line numberDiff line change
@@ -2112,6 +2112,80 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
21122112
return 0;
21132113
}
21142114

2115+
/*
* Report whether the node's free pages are still above half of its summed
* min watermarks, i.e. whether direct reclaimers need not be throttled.
*
* The min watermark and the NR_FREE_PAGES count are accumulated over node
* zones 0 through ZONE_NORMAL. Returns true when the free page total is
* strictly greater than half of the watermark total.
*
* Side effect: when the check fails and kswapd_wait is active,
* pgdat->classzone_idx is lowered to at most ZONE_NORMAL and kswapd_wait
* is woken.
*
* NOTE(review): if both totals are zero the comparison (0 > 0) is false and
* the reserves are reported as depleted - confirm whether a node with no
* populated zones up to ZONE_NORMAL can reach this function.
*/
static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2116+
{
2117+
struct zone *zone;
2118+
unsigned long pfmemalloc_reserve = 0;
2119+
unsigned long free_pages = 0;
2120+
int i;
2121+
bool wmark_ok;
2122+
2123+
/* Sum min watermarks and free pages for zones 0..ZONE_NORMAL */
for (i = 0; i <= ZONE_NORMAL; i++) {
2124+
zone = &pgdat->node_zones[i];
2125+
pfmemalloc_reserve += min_wmark_pages(zone);
2126+
free_pages += zone_page_state(zone, NR_FREE_PAGES);
2127+
}
2128+
2129+
/* OK only while free pages exceed half of the summed min watermarks */
wmark_ok = free_pages > pfmemalloc_reserve / 2;
2130+
2131+
/* kswapd must be awake if processes are being throttled */
2132+
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2133+
pgdat->classzone_idx = min(pgdat->classzone_idx,
2134+
(enum zone_type)ZONE_NORMAL);
2135+
wake_up_interruptible(&pgdat->kswapd_wait);
2136+
}
2137+
2138+
return wmark_ok;
2139+
}
2140+
2141+
/*
2142+
* Throttle direct reclaimers if backing storage is backed by the network
2143+
* and the PFMEMALLOC reserve for the preferred node is getting dangerously
2144+
* depleted. kswapd will continue to make progress and wake the processes
2145+
* when the low watermark is reached
2146+
*
* @gfp_mask: selects the highest zone index considered (via gfp_zone()) and,
*            through __GFP_FS, whether the caller waits with a timeout
* @zonelist: the first zone found in this list identifies the node whose
*            reserves are checked
* @nodemask: NOTE(review): not referenced in the body; NULL is passed to
*            first_zones_zonelist() instead - confirm this is intended
*
* Returns nothing; the results of both waits are ignored here.
*
* NOTE(review): the zone returned by first_zones_zonelist() is dereferenced
* without a NULL check - confirm it cannot be NULL for any caller.
*/
2147+
static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2148+
nodemask_t *nodemask)
2149+
{
2150+
struct zone *zone;
2151+
int high_zoneidx = gfp_zone(gfp_mask);
2152+
pg_data_t *pgdat;
2153+
2154+
/*
2155+
* Kernel threads should not be throttled as they may be indirectly
2156+
* responsible for cleaning pages necessary for reclaim to make forward
2157+
* progress. kjournald for example may enter direct reclaim while
2158+
* committing a transaction where throttling it could force other
2159+
* processes to block on log_wait_commit().
2160+
*/
2161+
if (current->flags & PF_KTHREAD)
2162+
return;
2163+
2164+
/* Check if the pfmemalloc reserves are ok */
2165+
first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2166+
pgdat = zone->zone_pgdat;
2167+
if (pfmemalloc_watermark_ok(pgdat))
2168+
return;
2169+
2170+
/*
2171+
* If the caller cannot enter the filesystem, it's possible that it
2172+
* is due to the caller holding an FS lock or performing a journal
2173+
* transaction in the case of a filesystem like ext[3|4]. In this case,
2174+
* it is not safe to block on pfmemalloc_wait as kswapd could be
2175+
* blocked waiting on the same lock. Instead, throttle for up to a
2176+
* second before continuing.
2177+
*/
2178+
if (!(gfp_mask & __GFP_FS)) {
2179+
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2180+
pfmemalloc_watermark_ok(pgdat), HZ);
2181+
return;
2182+
}
2183+
2184+
/* Throttle until kswapd wakes the process */
2185+
/* zone->zone_pgdat is the same pointer that was stored in pgdat above */
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2186+
pfmemalloc_watermark_ok(pgdat));
2187+
}
2188+
21152189
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
21162190
gfp_t gfp_mask, nodemask_t *nodemask)
21172191
{
@@ -2131,6 +2205,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
21312205
.gfp_mask = sc.gfp_mask,
21322206
};
21332207

2208+
throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2209+
2210+
/*
2211+
* Do not enter reclaim if fatal signal is pending. 1 is returned so
2212+
* that the page allocator does not consider triggering OOM
2213+
*/
2214+
if (fatal_signal_pending(current))
2215+
return 1;
2216+
21342217
trace_mm_vmscan_direct_reclaim_begin(order,
21352218
sc.may_writepage,
21362219
gfp_mask);
@@ -2275,8 +2358,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
22752358
return balanced_pages >= (present_pages >> 2);
22762359
}
22772360

2278-
/* is kswapd sleeping prematurely? */
2279-
static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2361+
/*
2362+
* Prepare kswapd for sleeping. This verifies that there are no processes
2363+
* waiting in throttle_direct_reclaim() and that watermarks have been met.
2364+
*
2365+
* Returns true if kswapd is ready to sleep
2366+
*/
2367+
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
22802368
int classzone_idx)
22812369
{
22822370
int i;
@@ -2285,7 +2373,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
22852373

22862374
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
22872375
if (remaining)
2288-
return true;
2376+
return false;
2377+
2378+
/*
2379+
* There is a potential race between when kswapd checks its watermarks
2380+
* and a process gets throttled. There is also a potential race if
2381+
* processes get throttled, kswapd wakes, a large process exits thereby
2382+
* balancing the zones that causes kswapd to miss a wakeup. If kswapd
2383+
* is going to sleep, no process should be sleeping on pfmemalloc_wait
2384+
* so wake them now if necessary. If necessary, processes will wake
2385+
* kswapd and get throttled again
2386+
*/
2387+
if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2388+
wake_up(&pgdat->pfmemalloc_wait);
2389+
return false;
2390+
}
22892391

22902392
/* Check the watermark levels */
22912393
for (i = 0; i <= classzone_idx; i++) {
@@ -2318,9 +2420,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
23182420
* must be balanced
23192421
*/
23202422
if (order)
2321-
return !pgdat_balanced(pgdat, balanced, classzone_idx);
2423+
return pgdat_balanced(pgdat, balanced, classzone_idx);
23222424
else
2323-
return !all_zones_ok;
2425+
return all_zones_ok;
23242426
}
23252427

23262428
/*
@@ -2546,6 +2648,16 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
25462648
}
25472649

25482650
}
2651+
2652+
/*
2653+
* If the low watermark is met there is no need for processes
2654+
* to be throttled on pfmemalloc_wait as they should now be
2655+
* able to safely make forward progress. Wake them
2656+
*/
2657+
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
2658+
pfmemalloc_watermark_ok(pgdat))
2659+
wake_up(&pgdat->pfmemalloc_wait);
2660+
25492661
if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
25502662
break; /* kswapd: all done */
25512663
/*
@@ -2647,7 +2759,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
26472759
}
26482760

26492761
/*
2650-
* Return the order we were reclaiming at so sleeping_prematurely()
2762+
* Return the order we were reclaiming at so prepare_kswapd_sleep()
26512763
* makes a decision on the order we were last reclaiming at. However,
26522764
* if another caller entered the allocator slow path while kswapd
26532765
* was awake, order will remain at the higher level
@@ -2667,7 +2779,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
26672779
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
26682780

26692781
/* Try to sleep for a short interval */
2670-
if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2782+
if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
26712783
remaining = schedule_timeout(HZ/10);
26722784
finish_wait(&pgdat->kswapd_wait, &wait);
26732785
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2677,7 +2789,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
26772789
* After a short sleep, check if it was a premature sleep. If not, then
26782790
* go fully to sleep until explicitly woken up.
26792791
*/
2680-
if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2792+
if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
26812793
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
26822794

26832795
/*

0 commit comments

Comments
 (0)