
Commit 1d3d443

Glauber Costa authored and Al Viro committed
vmscan: per-node deferred work
The list_lru infrastructure already keeps per-node LRU lists in its node-specific list_lru_node arrays and provides us with a per-node API, and the shrinkers are properly equipped with node information. This means we can now focus our shrinking effort on a single node, but the work that is deferred from one run to another is still kept global in nr_in_batch. Work can be deferred, for instance, during direct reclaim under a GFP_NOFS allocation, in which case all the filesystem shrinkers are prevented from running and accumulate in nr_in_batch the amount of work they should have done but could not.

This creates an impedance problem: under node pressure, deferred work accumulates and ends up being flushed on other nodes. The problem is particularly harmful on big machines, where many nodes can accumulate work at the same time, all adding to the global counter nr_in_batch. As more and more work accumulates, we ask the caches to flush ever bigger numbers of objects. The result is that the caches are depleted and never stabilize.

To achieve stable steady-state behavior, we need to tackle this differently. This patch keeps the deferred count per node, in the new array nr_deferred[] (the name is also a bit more descriptive), and never lets it accumulate onto other nodes.

Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
1 parent 0ce3d74 commit 1d3d443
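For illustration, a minimal userspace sketch of the per-node deferral idea described above (not kernel code; MAX_NODES, defer_work() and take_deferred() are invented names for this sketch): each node owns its own backlog counter, a pass that cannot do its work adds only to its own node's counter, and a later pass on that node atomically claims only that backlog.

/* Illustrative userspace sketch of per-node deferred work (not from the patch). */
#include <stdatomic.h>
#include <stdio.h>

#define MAX_NODES 4                           /* invented for this sketch */

static atomic_long nr_deferred[MAX_NODES];    /* one backlog per node */

/* A pass that cannot run its shrinkers (e.g. GFP_NOFS) defers to its own node. */
static void defer_work(int nid, long nr)
{
        atomic_fetch_add(&nr_deferred[nid], nr);
}

/* A later pass on the same node claims that node's backlog, and only that. */
static long take_deferred(int nid)
{
        return atomic_exchange(&nr_deferred[nid], 0);
}

int main(void)
{
        defer_work(0, 128);     /* node 0 deferred a lot of work */
        defer_work(1, 32);      /* node 1 deferred a little */

        /* Reclaim on node 1 never sees, or flushes, node 0's backlog. */
        printf("node 1 backlog: %ld\n", take_deferred(1));   /* 32 */
        printf("node 0 backlog: %ld\n", take_deferred(0));   /* 128 */
        return 0;
}

With a single global counter, node 0's 128 deferred objects would instead be handed to whichever node shrinks next, which is exactly the cross-node flushing this patch removes.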

2 files changed: 152 additions, 103 deletions

include/linux/shrinker.h

Lines changed: 12 additions & 2 deletions
@@ -19,6 +19,8 @@ struct shrink_control {
 
         /* shrink from these nodes */
         nodemask_t nodes_to_scan;
+        /* current node being shrunk (for NUMA aware shrinkers) */
+        int nid;
 };
 
 #define SHRINK_STOP (~0UL)
@@ -44,6 +46,8 @@ struct shrink_control {
  * due to potential deadlocks. If SHRINK_STOP is returned, then no further
  * attempts to call the @scan_objects will be made from the current reclaim
  * context.
+ *
+ * @flags determine the shrinker abilities, like numa awareness
  */
 struct shrinker {
         int (*shrink)(struct shrinker *, struct shrink_control *sc);
@@ -54,12 +58,18 @@ struct shrinker {
 
         int seeks;      /* seeks to recreate an obj */
         long batch;     /* reclaim batch size, 0 = default */
+        unsigned long flags;
 
         /* These are for internal use */
         struct list_head list;
-        atomic_long_t nr_in_batch; /* objs pending delete */
+        /* objs pending delete, per node */
+        atomic_long_t *nr_deferred;
 };
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
-extern void register_shrinker(struct shrinker *);
+
+/* Flags */
+#define SHRINKER_NUMA_AWARE     (1 << 0)
+
+extern int register_shrinker(struct shrinker *);
 extern void unregister_shrinker(struct shrinker *);
 #endif
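For context, a hypothetical cache adopting the extended interface might look like the sketch below. It is not part of this commit: the my_cache_* names are invented, and count_objects/scan_objects are the callbacks consulted by the reworked shrink_slab_node() path rather than something introduced here. Setting SHRINKER_NUMA_AWARE requests a per-node nr_deferred[] slot, and register_shrinker() must now be checked for failure.

/* Hypothetical NUMA-aware shrinker (illustration only, not from this commit). */
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/shrinker.h>

static atomic_long_t my_cache_objects[MAX_NUMNODES]; /* stand-in per-node cache */

static unsigned long my_cache_count(struct shrinker *s, struct shrink_control *sc)
{
        /* Report only what lives on the node currently being shrunk. */
        return atomic_long_read(&my_cache_objects[sc->nid]);
}

static unsigned long my_cache_scan(struct shrinker *s, struct shrink_control *sc)
{
        /* Pretend to free up to sc->nr_to_scan objects from node sc->nid. */
        long nr = min_t(long, sc->nr_to_scan,
                        atomic_long_read(&my_cache_objects[sc->nid]));

        atomic_long_sub(nr, &my_cache_objects[sc->nid]);
        return nr;
}

static struct shrinker my_cache_shrinker = {
        .count_objects  = my_cache_count,
        .scan_objects   = my_cache_scan,
        .seeks          = DEFAULT_SEEKS,
        .flags          = SHRINKER_NUMA_AWARE, /* deferred work stays per node */
};

static int __init my_cache_init(void)
{
        /* register_shrinker() can now fail: it kzalloc()s nr_deferred[]. */
        return register_shrinker(&my_cache_shrinker);
}
module_init(my_cache_init);
MODULE_LICENSE("GPL");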

mm/vmscan.c

Lines changed: 140 additions & 101 deletions
@@ -155,14 +155,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-        atomic_long_set(&shrinker->nr_in_batch, 0);
+        size_t size = sizeof(*shrinker->nr_deferred);
+
+        /*
+         * If we only have one possible node in the system anyway, save
+         * ourselves the trouble and disable NUMA aware behavior. This way we
+         * will save memory and some small loop time later.
+         */
+        if (nr_node_ids == 1)
+                shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+        if (shrinker->flags & SHRINKER_NUMA_AWARE)
+                size *= nr_node_ids;
+
+        shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+        if (!shrinker->nr_deferred)
+                return -ENOMEM;
+
         down_write(&shrinker_rwsem);
         list_add_tail(&shrinker->list, &shrinker_list);
         up_write(&shrinker_rwsem);
+        return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
 
@@ -186,6 +203,118 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
 }
 
 #define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+                 unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+        unsigned long freed = 0;
+        unsigned long long delta;
+        long total_scan;
+        long max_pass;
+        long nr;
+        long new_nr;
+        int nid = shrinkctl->nid;
+        long batch_size = shrinker->batch ? shrinker->batch
+                                          : SHRINK_BATCH;
+
+        if (shrinker->count_objects)
+                max_pass = shrinker->count_objects(shrinker, shrinkctl);
+        else
+                max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+        if (max_pass == 0)
+                return 0;
+
+        /*
+         * copy the current shrinker scan count into a local variable
+         * and zero it so that other concurrent shrinker invocations
+         * don't also do this scanning work.
+         */
+        nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+        total_scan = nr;
+        delta = (4 * nr_pages_scanned) / shrinker->seeks;
+        delta *= max_pass;
+        do_div(delta, lru_pages + 1);
+        total_scan += delta;
+        if (total_scan < 0) {
+                printk(KERN_ERR
+                "shrink_slab: %pF negative objects to delete nr=%ld\n",
+                       shrinker->shrink, total_scan);
+                total_scan = max_pass;
+        }
+
+        /*
+         * We need to avoid excessive windup on filesystem shrinkers
+         * due to large numbers of GFP_NOFS allocations causing the
+         * shrinkers to return -1 all the time. This results in a large
+         * nr being built up so when a shrink that can do some work
+         * comes along it empties the entire cache due to nr >>>
+         * max_pass. This is bad for sustaining a working set in
+         * memory.
+         *
+         * Hence only allow the shrinker to scan the entire cache when
+         * a large delta change is calculated directly.
+         */
+        if (delta < max_pass / 4)
+                total_scan = min(total_scan, max_pass / 2);
+
+        /*
+         * Avoid risking looping forever due to too large nr value:
+         * never try to free more than twice the estimate number of
+         * freeable entries.
+         */
+        if (total_scan > max_pass * 2)
+                total_scan = max_pass * 2;
+
+        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+                                nr_pages_scanned, lru_pages,
+                                max_pass, delta, total_scan);
+
+        while (total_scan >= batch_size) {
+
+                if (shrinker->scan_objects) {
+                        unsigned long ret;
+                        shrinkctl->nr_to_scan = batch_size;
+                        ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+                        if (ret == SHRINK_STOP)
+                                break;
+                        freed += ret;
+                } else {
+                        int nr_before;
+                        long ret;
+
+                        nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+                        ret = do_shrinker_shrink(shrinker, shrinkctl,
+                                                        batch_size);
+                        if (ret == -1)
+                                break;
+                        if (ret < nr_before)
+                                freed += nr_before - ret;
+                }
+
+                count_vm_events(SLABS_SCANNED, batch_size);
+                total_scan -= batch_size;
+
+                cond_resched();
+        }
+
+        /*
+         * move the unused scan count back into the shrinker in a
+         * manner that handles concurrent updates. If we exhausted the
+         * scan, there is no need to do an update.
+         */
+        if (total_scan > 0)
+                new_nr = atomic_long_add_return(total_scan,
+                                                &shrinker->nr_deferred[nid]);
+        else
+                new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+        trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+        return freed;
+}
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -227,108 +356,18 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
         }
 
         list_for_each_entry(shrinker, &shrinker_list, list) {
-                unsigned long long delta;
-                long total_scan;
-                long max_pass;
-                long nr;
-                long new_nr;
-                long batch_size = shrinker->batch ? shrinker->batch
-                                                  : SHRINK_BATCH;
-
-                if (shrinker->count_objects)
-                        max_pass = shrinker->count_objects(shrinker, shrinkctl);
-                else
-                        max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
-                if (max_pass == 0)
-                        continue;
-
-                /*
-                 * copy the current shrinker scan count into a local variable
-                 * and zero it so that other concurrent shrinker invocations
-                 * don't also do this scanning work.
-                 */
-                nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-                total_scan = nr;
-                delta = (4 * nr_pages_scanned) / shrinker->seeks;
-                delta *= max_pass;
-                do_div(delta, lru_pages + 1);
-                total_scan += delta;
-                if (total_scan < 0) {
-                        printk(KERN_ERR
-                        "shrink_slab: %pF negative objects to delete nr=%ld\n",
-                               shrinker->shrink, total_scan);
-                        total_scan = max_pass;
-                }
-
-                /*
-                 * We need to avoid excessive windup on filesystem shrinkers
-                 * due to large numbers of GFP_NOFS allocations causing the
-                 * shrinkers to return -1 all the time. This results in a large
-                 * nr being built up so when a shrink that can do some work
-                 * comes along it empties the entire cache due to nr >>>
-                 * max_pass. This is bad for sustaining a working set in
-                 * memory.
-                 *
-                 * Hence only allow the shrinker to scan the entire cache when
-                 * a large delta change is calculated directly.
-                 */
-                if (delta < max_pass / 4)
-                        total_scan = min(total_scan, max_pass / 2);
-
-                /*
-                 * Avoid risking looping forever due to too large nr value:
-                 * never try to free more than twice the estimate number of
-                 * freeable entries.
-                 */
-                if (total_scan > max_pass * 2)
-                        total_scan = max_pass * 2;
-
-                trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-                                        nr_pages_scanned, lru_pages,
-                                        max_pass, delta, total_scan);
-
-                while (total_scan >= batch_size) {
-
-                        if (shrinker->scan_objects) {
-                                unsigned long ret;
-                                shrinkctl->nr_to_scan = batch_size;
-                                ret = shrinker->scan_objects(shrinker, shrinkctl);
+                for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+                        if (!node_online(shrinkctl->nid))
+                                continue;
 
-                                if (ret == SHRINK_STOP)
-                                        break;
-                                freed += ret;
-                        } else {
-                                int nr_before;
-                                long ret;
-
-                                nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
-                                ret = do_shrinker_shrink(shrinker, shrinkctl,
-                                                                batch_size);
-                                if (ret == -1)
-                                        break;
-                                if (ret < nr_before)
-                                        freed += nr_before - ret;
-                        }
+                        if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+                            (shrinkctl->nid != 0))
+                                break;
 
-                        count_vm_events(SLABS_SCANNED, batch_size);
-                        total_scan -= batch_size;
+                        freed += shrink_slab_node(shrinkctl, shrinker,
+                                 nr_pages_scanned, lru_pages);
 
-                        cond_resched();
                 }
-
-                /*
-                 * move the unused scan count back into the shrinker in a
-                 * manner that handles concurrent updates. If we exhausted the
-                 * scan, there is no need to do an update.
-                 */
-                if (total_scan > 0)
-                        new_nr = atomic_long_add_return(total_scan,
-                                        &shrinker->nr_in_batch);
-                else
-                        new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
-                trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
         }
         up_read(&shrinker_rwsem);
 out:
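As a reading aid, the per-node scan target that shrink_slab_node() computes above can be condensed roughly as follows. This is a standalone sketch, not kernel code: compute_total_scan() is an invented name, and do_div(), the negative-overflow check and the tracepoints are elided.

/* Rough condensation of shrink_slab_node()'s scan-target arithmetic (sketch). */
long compute_total_scan(long deferred, unsigned long nr_pages_scanned,
                        int seeks, long max_pass, unsigned long lru_pages)
{
        /* Work proportional to page reclaim pressure and to cache size. */
        unsigned long long delta = (4ULL * nr_pages_scanned) / seeks;
        long total_scan;

        delta *= max_pass;
        delta /= lru_pages + 1;

        /* Add this node's own backlog -- never another node's. */
        total_scan = deferred + (long)delta;

        /* Anti-windup: without a large delta, flush at most half the cache. */
        if (delta < max_pass / 4 && total_scan > max_pass / 2)
                total_scan = max_pass / 2;

        /* Never try to free more than twice the freeable estimate. */
        if (total_scan > max_pass * 2)
                total_scan = max_pass * 2;

        return total_scan;
}

Whatever shrink_slab_node() fails to scan out of this target is added back to nr_deferred[nid] for the same node, so pressure on one node can no longer inflate the target applied to another.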
