@@ -2112,6 +2112,80 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+	bool wmark_ok;
+
+	for (i = 0; i <= ZONE_NORMAL; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+	/* kswapd must be awake if processes are being throttled */
+	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+		pgdat->classzone_idx = min(pgdat->classzone_idx,
+						(enum zone_type)ZONE_NORMAL);
+		wake_up_interruptible(&pgdat->kswapd_wait);
+	}
+
+	return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached.
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	pg_data_t *pgdat;
+
+	/*
+	 * Kernel threads should not be throttled as they may be indirectly
+	 * responsible for cleaning pages necessary for reclaim to make forward
+	 * progress. kjournald for example may enter direct reclaim while
+	 * committing a transaction where throttling it could force other
+	 * processes to block on log_wait_commit().
+	 */
+	if (current->flags & PF_KTHREAD)
+		return;
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	pgdat = zone->zone_pgdat;
+	if (pfmemalloc_watermark_ok(pgdat))
+		return;
+
+	/*
+	 * If the caller cannot enter the filesystem, it's possible that it
+	 * is due to the caller holding an FS lock or performing a journal
+	 * transaction in the case of a filesystem like ext[3|4]. In this case,
+	 * it is not safe to block on pfmemalloc_wait as kswapd could be
+	 * blocked waiting on the same lock. Instead, throttle for up to a
+	 * second before continuing.
+	 */
+	if (!(gfp_mask & __GFP_FS)) {
+		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+			pfmemalloc_watermark_ok(pgdat), HZ);
+		return;
+	}
+
+	/* Throttle until kswapd wakes the process */
+	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(pgdat));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
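The reserve check in the hunk above reduces to a single inequality: a node stays "ok" while the free pages of its lowmem zones (ZONE_DMA through ZONE_NORMAL) exceed half the sum of their min watermarks. Below is a minimal userspace model of that rule; it compiles as plain C, and all names and numbers are hypothetical stand-ins rather than kernel API.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for pfmemalloc_watermark_ok(): each zone adds its
 * min watermark to the reserve and its free count to the total; the node
 * is "ok" while free > reserve / 2. */
static bool model_watermark_ok(const unsigned long *min_wmark,
			       const unsigned long *nr_free, int nr_zones)
{
	unsigned long reserve = 0, free_pages = 0;
	int i;

	for (i = 0; i < nr_zones; i++) {
		reserve += min_wmark[i];
		free_pages += nr_free[i];
	}
	return free_pages > reserve / 2;
}

int main(void)
{
	unsigned long wmark[] = { 32, 1024 };	/* reserve = 1056, half = 528 */
	unsigned long high[]  = { 16, 600 };	/* 616 free: no throttling */
	unsigned long low[]   = { 16, 400 };	/* 416 free: reclaimers throttle */

	printf("high: %d\n", model_watermark_ok(wmark, high, 2));
	printf("low:  %d\n", model_watermark_ok(wmark, low, 2));
	return 0;
}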
@@ -2131,6 +2205,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.gfp_mask = sc.gfp_mask,
 	};
 
+	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+	/*
+	 * Do not enter reclaim if a fatal signal is pending. 1 is returned so
+	 * that the page allocator does not consider triggering OOM.
+	 */
+	if (fatal_signal_pending(current))
+		return 1;
+
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
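One subtlety in the hunk above: returning 1 rather than 0 matters because, in the allocator's slow path, zero progress from direct reclaim is what allows the OOM killer to be considered. A task with a fatal signal pending is about to release its memory anyway, so it reports token progress and retries. A toy model of that caller-side decision, with an assumed control flow rather than the actual page allocator:

#include <stdbool.h>
#include <stdio.h>

/* Toy reclaim: a fatally-signalled task skips the work and claims
 * token progress of one page. */
static unsigned long toy_try_to_free_pages(bool fatal_signal)
{
	if (fatal_signal)
		return 1;
	/* ... real scanning and reclaim would happen here ... */
	return 0;
}

int main(void)
{
	unsigned long progress = toy_try_to_free_pages(true);

	if (progress)
		printf("progress=%lu: retry the allocation, no OOM\n", progress);
	else
		printf("no progress: OOM killer may be considered\n");
	return 0;
}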
@@ -2275,8 +2358,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 	return balanced_pages >= (present_pages >> 2);
 }
 
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 					int classzone_idx)
 {
 	int i;
@@ -2285,7 +2373,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
-		return true;
+		return false;
+
+	/*
+	 * There is a potential race between when kswapd checks its watermarks
+	 * and a process gets throttled. There is also a potential race if
+	 * processes get throttled, kswapd wakes, a large process exits thereby
+	 * balancing the zones, which causes kswapd to miss a wakeup. If kswapd
+	 * is going to sleep, no process should be sleeping on pfmemalloc_wait,
+	 * so wake them now if necessary. If necessary, processes will wake
+	 * kswapd and get throttled again.
+	 */
+	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+		wake_up(&pgdat->pfmemalloc_wait);
+		return false;
+	}
 
 	/* Check the watermark levels */
 	for (i = 0; i <= classzone_idx; i++) {
@@ -2318,9 +2420,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 	 * must be balanced
 	 */
 	if (order)
-		return !pgdat_balanced(pgdat, balanced, classzone_idx);
+		return pgdat_balanced(pgdat, balanced, classzone_idx);
 	else
-		return !all_zones_ok;
+		return all_zones_ok;
 }
 
 /*
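Note that the rename in the two hunks above also inverts the predicate's sense: sleeping_prematurely() answered "must kswapd stay awake?" while prepare_kswapd_sleep() answers "may kswapd sleep?", which is why the return values flip and the call sites further down lose their negations. A toy model of the relationship (illustrative only; the real function additionally wakes pfmemalloc_wait sleepers as a side effect):

#include <stdbool.h>
#include <stdio.h>

/* Toy inputs mirroring the checks in the patch */
struct toy_state {
	bool woken_recently;	/* "remaining" from the HZ/10 nap */
	bool throttled_waiters;	/* waitqueue_active(&pfmemalloc_wait) */
	bool balanced;		/* watermark checks passed */
};

/* Old sense: true means the sleep would be premature */
static bool toy_sleeping_prematurely(struct toy_state s)
{
	return s.woken_recently || s.throttled_waiters || !s.balanced;
}

/* New sense: true means kswapd is ready to sleep */
static bool toy_prepare_kswapd_sleep(struct toy_state s)
{
	return !toy_sleeping_prematurely(s);
}

int main(void)
{
	struct toy_state s = { false, true, true };

	/* A throttled waiter keeps kswapd awake under either spelling */
	printf("premature=%d ready=%d\n",
	       toy_sleeping_prematurely(s), toy_prepare_kswapd_sleep(s));
	return 0;
}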
@@ -2546,6 +2648,16 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			}
 
 		}
+
+		/*
+		 * If the low watermark is met there is no need for processes
+		 * to be throttled on pfmemalloc_wait as they should be able
+		 * to safely make forward progress. Wake them.
+		 */
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat))
+			wake_up(&pgdat->pfmemalloc_wait);
+
 		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 			break;		/* kswapd: all done */
 		/*
@@ -2647,7 +2759,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	}
 
 	/*
-	 * Return the order we were reclaiming at so sleeping_prematurely()
+	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,
 	 * if another caller entered the allocator slow path while kswapd
 	 * was awake, order will remain at the higher level
@@ -2667,7 +2779,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2677,7 +2789,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
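Taken together, the two call sites above implement a two-phase sleep: kswapd naps for HZ/10, re-runs the readiness check, and only commits to an indefinite sleep if it still passes. A sketch of that shape with a stubbed check (hypothetical and purely illustrative):

#include <stdbool.h>
#include <stdio.h>

/* Stubbed readiness check; it answers differently per phase to
 * demonstrate the bail-out after the short nap. */
static bool stub_prepare_kswapd_sleep(int phase)
{
	return phase == 0;
}

int main(void)
{
	if (stub_prepare_kswapd_sleep(0)) {
		printf("short nap (HZ/10)\n");
		if (stub_prepare_kswapd_sleep(1))
			printf("full sleep until explicitly woken\n");
		else
			printf("state changed during nap: stay awake\n");
	} else {
		printf("not ready: skip sleeping\n");
	}
	return 0;
}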