Skip to content

Commit 68f23b8

Browse files
tytsotorvalds
authored andcommitted
memcg: fix a crash in wb_workfn when a device disappears
Without memcg, there is a one-to-one mapping between the bdi and bdi_writeback structures. In this world, things are fairly straightforward; the first thing bdi_unregister() does is to shutdown the bdi_writeback structure (or wb), and part of that writeback ensures that no other work queued against the wb, and that the wb is fully drained. With memcg, however, there is a one-to-many relationship between the bdi and bdi_writeback structures; that is, there are multiple wb objects which can all point to a single bdi. There is a refcount which prevents the bdi object from being released (and hence, unregistered). So in theory, the bdi_unregister() *should* only get called once its refcount goes to zero (bdi_put will drop the refcount, and when it is zero, release_bdi gets called, which calls bdi_unregister). Unfortunately, del_gendisk() in block/gen_hd.c never got the memo about the Brave New memcg World, and calls bdi_unregister directly. It does this without informing the file system, or the memcg code, or anything else. This causes the root wb associated with the bdi to be unregistered, but none of the memcg-specific wb's are shutdown. So when one of these wb's are woken up to do delayed work, they try to dereference their wb->bdi->dev to fetch the device name, but unfortunately bdi->dev is now NULL, thanks to the bdi_unregister() called by del_gendisk(). As a result, *boom*. Fortunately, it looks like the rest of the writeback path is perfectly happy with bdi->dev and bdi->owner being NULL, so the simplest fix is to create a bdi_dev_name() function which can handle bdi->dev being NULL. This also allows us to bulletproof the writeback tracepoints to prevent them from dereferencing a NULL pointer and crashing the kernel if one is tracing with memcg's enabled, and an iSCSI device dies or a USB storage stick is pulled. The most common way of triggering this will be hotremoval of a device while writeback with memcg enabled is going on. It was triggering several times a day in a heavily loaded production environment. Google Bug Id: 145475544 Link: https://lore.kernel.org/r/20191227194829.150110-1-tytso@mit.edu Link: http://lkml.kernel.org/r/20191228005211.163952-1-tytso@mit.edu Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: Chris Mason <clm@fb.com> Cc: Tejun Heo <tj@kernel.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 69334ca commit 68f23b8

File tree

4 files changed

+29
-21
lines changed

4 files changed

+29
-21
lines changed

fs/fs-writeback.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2063,7 +2063,7 @@ void wb_workfn(struct work_struct *work)
20632063
struct bdi_writeback, dwork);
20642064
long pages_written;
20652065

2066-
set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
2066+
set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
20672067
current->flags |= PF_SWAPWRITE;
20682068

20692069
if (likely(!current_is_workqueue_rescuer() ||

include/linux/backing-dev.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <linux/fs.h>
1414
#include <linux/sched.h>
1515
#include <linux/blkdev.h>
16+
#include <linux/device.h>
1617
#include <linux/writeback.h>
1718
#include <linux/blk-cgroup.h>
1819
#include <linux/backing-dev-defs.h>
@@ -504,4 +505,13 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
504505
(1 << WB_async_congested));
505506
}
506507

508+
extern const char *bdi_unknown_name;
509+
510+
static inline const char *bdi_dev_name(struct backing_dev_info *bdi)
511+
{
512+
if (!bdi || !bdi->dev)
513+
return bdi_unknown_name;
514+
return dev_name(bdi->dev);
515+
}
516+
507517
#endif /* _LINUX_BACKING_DEV_H */

include/trace/events/writeback.h

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ DECLARE_EVENT_CLASS(writeback_page_template,
6767

6868
TP_fast_assign(
6969
strscpy_pad(__entry->name,
70-
mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)",
71-
32);
70+
bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
71+
NULL), 32);
7272
__entry->ino = mapping ? mapping->host->i_ino : 0;
7373
__entry->index = page->index;
7474
),
@@ -111,8 +111,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
111111
struct backing_dev_info *bdi = inode_to_bdi(inode);
112112

113113
/* may be called for files on pseudo FSes w/ unregistered bdi */
114-
strscpy_pad(__entry->name,
115-
bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
114+
strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
116115
__entry->ino = inode->i_ino;
117116
__entry->state = inode->i_state;
118117
__entry->flags = flags;
@@ -193,7 +192,7 @@ TRACE_EVENT(inode_foreign_history,
193192
),
194193

195194
TP_fast_assign(
196-
strncpy(__entry->name, dev_name(inode_to_bdi(inode)->dev), 32);
195+
strncpy(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32);
197196
__entry->ino = inode->i_ino;
198197
__entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc);
199198
__entry->history = history;
@@ -222,7 +221,7 @@ TRACE_EVENT(inode_switch_wbs,
222221
),
223222

224223
TP_fast_assign(
225-
strncpy(__entry->name, dev_name(old_wb->bdi->dev), 32);
224+
strncpy(__entry->name, bdi_dev_name(old_wb->bdi), 32);
226225
__entry->ino = inode->i_ino;
227226
__entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb);
228227
__entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb);
@@ -255,7 +254,7 @@ TRACE_EVENT(track_foreign_dirty,
255254
struct address_space *mapping = page_mapping(page);
256255
struct inode *inode = mapping ? mapping->host : NULL;
257256

258-
strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
257+
strncpy(__entry->name, bdi_dev_name(wb->bdi), 32);
259258
__entry->bdi_id = wb->bdi->id;
260259
__entry->ino = inode ? inode->i_ino : 0;
261260
__entry->memcg_id = wb->memcg_css->id;
@@ -288,7 +287,7 @@ TRACE_EVENT(flush_foreign,
288287
),
289288

290289
TP_fast_assign(
291-
strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
290+
strncpy(__entry->name, bdi_dev_name(wb->bdi), 32);
292291
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
293292
__entry->frn_bdi_id = frn_bdi_id;
294293
__entry->frn_memcg_id = frn_memcg_id;
@@ -318,7 +317,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
318317

319318
TP_fast_assign(
320319
strscpy_pad(__entry->name,
321-
dev_name(inode_to_bdi(inode)->dev), 32);
320+
bdi_dev_name(inode_to_bdi(inode)), 32);
322321
__entry->ino = inode->i_ino;
323322
__entry->sync_mode = wbc->sync_mode;
324323
__entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc);
@@ -361,9 +360,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
361360
__field(ino_t, cgroup_ino)
362361
),
363362
TP_fast_assign(
364-
strscpy_pad(__entry->name,
365-
wb->bdi->dev ? dev_name(wb->bdi->dev) :
366-
"(unknown)", 32);
363+
strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
367364
__entry->nr_pages = work->nr_pages;
368365
__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
369366
__entry->sync_mode = work->sync_mode;
@@ -416,7 +413,7 @@ DECLARE_EVENT_CLASS(writeback_class,
416413
__field(ino_t, cgroup_ino)
417414
),
418415
TP_fast_assign(
419-
strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32);
416+
strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
420417
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
421418
),
422419
TP_printk("bdi %s: cgroup_ino=%lu",
@@ -438,7 +435,7 @@ TRACE_EVENT(writeback_bdi_register,
438435
__array(char, name, 32)
439436
),
440437
TP_fast_assign(
441-
strscpy_pad(__entry->name, dev_name(bdi->dev), 32);
438+
strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
442439
),
443440
TP_printk("bdi %s",
444441
__entry->name
@@ -463,7 +460,7 @@ DECLARE_EVENT_CLASS(wbc_class,
463460
),
464461

465462
TP_fast_assign(
466-
strscpy_pad(__entry->name, dev_name(bdi->dev), 32);
463+
strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
467464
__entry->nr_to_write = wbc->nr_to_write;
468465
__entry->pages_skipped = wbc->pages_skipped;
469466
__entry->sync_mode = wbc->sync_mode;
@@ -514,7 +511,7 @@ TRACE_EVENT(writeback_queue_io,
514511
),
515512
TP_fast_assign(
516513
unsigned long *older_than_this = work->older_than_this;
517-
strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32);
514+
strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
518515
__entry->older = older_than_this ? *older_than_this : 0;
519516
__entry->age = older_than_this ?
520517
(jiffies - *older_than_this) * 1000 / HZ : -1;
@@ -600,7 +597,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
600597
),
601598

602599
TP_fast_assign(
603-
strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32);
600+
strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
604601
__entry->write_bw = KBps(wb->write_bandwidth);
605602
__entry->avg_write_bw = KBps(wb->avg_write_bandwidth);
606603
__entry->dirty_rate = KBps(dirty_rate);
@@ -665,7 +662,7 @@ TRACE_EVENT(balance_dirty_pages,
665662

666663
TP_fast_assign(
667664
unsigned long freerun = (thresh + bg_thresh) / 2;
668-
strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32);
665+
strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
669666

670667
__entry->limit = global_wb_domain.dirty_limit;
671668
__entry->setpoint = (global_wb_domain.dirty_limit +
@@ -726,7 +723,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
726723

727724
TP_fast_assign(
728725
strscpy_pad(__entry->name,
729-
dev_name(inode_to_bdi(inode)->dev), 32);
726+
bdi_dev_name(inode_to_bdi(inode)), 32);
730727
__entry->ino = inode->i_ino;
731728
__entry->state = inode->i_state;
732729
__entry->dirtied_when = inode->dirtied_when;
@@ -800,7 +797,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
800797

801798
TP_fast_assign(
802799
strscpy_pad(__entry->name,
803-
dev_name(inode_to_bdi(inode)->dev), 32);
800+
bdi_dev_name(inode_to_bdi(inode)), 32);
804801
__entry->ino = inode->i_ino;
805802
__entry->state = inode->i_state;
806803
__entry->dirtied_when = inode->dirtied_when;

mm/backing-dev.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ struct backing_dev_info noop_backing_dev_info = {
2121
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
2222

2323
static struct class *bdi_class;
24+
const char *bdi_unknown_name = "(unknown)";
2425

2526
/*
2627
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU

0 commit comments

Comments
 (0)