Skip to content

Commit 6b37b15

Browse files
committed
runtime: don't take allglock in tracebackothers
tracebackothers is called from fatal throw/panic. A fatal throw may be taken with allglock held (notably in the allocator when allglock is held), which would cause a deadlock in tracebackothers when we try to take allglock again. Locking allglock here is also often a lock order violation w.r.t. the locks held when throw was called. Avoid the deadlock and ordering issues by skipping locking altogether. It is OK to miss concurrently created Gs (which are generally avoided by freezetheworld(), and which were possible previously anyways if created after the loop). Fatal throw/panic freezetheworld(), which should freeze other threads that may be racing to modify allgs. However, freezetheworld() does _not_ guarantee that it stops all other threads, so we can't simply drop the lock. Fixes #42669 Updates #43175 Change-Id: I657aec46ed35fd5d1b3f1ba25b500128ab26b088 Reviewed-on: https://go-review.googlesource.com/c/go/+/270861 Reviewed-by: Michael Knyszek <mknyszek@google.com> Trust: Michael Pratt <mpratt@google.com>
1 parent 9eef49c commit 6b37b15

File tree

4 files changed

+56
-16
lines changed

4 files changed

+56
-16
lines changed

src/runtime/mgcmark.go

-1
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,6 @@ fail:
132132
println("gp", gp, "goid", gp.goid,
133133
"status", readgstatus(gp),
134134
"gcscandone", gp.gcscandone)
135-
unlock(&allglock) // Avoid self-deadlock with traceback.
136135
throw("scan missed a g")
137136
}
138137

src/runtime/proc.go

+39-4
Original file line numberDiff line numberDiff line change
@@ -490,8 +490,29 @@ func lockedOSThread() bool {
490490
}
491491

492492
var (
493-
allgs []*g
493+
// allgs contains all Gs ever created (including dead Gs), and thus
494+
// never shrinks.
495+
//
496+
// Access via the slice is protected by allglock or stop-the-world.
497+
// Readers that cannot take the lock may (carefully!) use the atomic
498+
// variables below.
494499
allglock mutex
500+
allgs []*g
501+
502+
// allglen and allgptr are atomic variables that contain len(allg) and
503+
// &allg[0] respectively. Proper ordering depends on totally-ordered
504+
// loads and stores. Writes are protected by allglock.
505+
//
506+
// allgptr is updated before allglen. Readers should read allglen
507+
// before allgptr to ensure that allglen is always <= len(allgptr). New
508+
// Gs appended during the race can be missed. For a consistent view of
509+
// all Gs, allglock must be held.
510+
//
511+
// allgptr copies should always be stored as a concrete type or
512+
// unsafe.Pointer, not uintptr, to ensure that GC can still reach it
513+
// even if it points to a stale array.
514+
allglen uintptr
515+
allgptr **g
495516
)
496517

497518
func allgadd(gp *g) {
@@ -501,10 +522,25 @@ func allgadd(gp *g) {
501522

502523
lock(&allglock)
503524
allgs = append(allgs, gp)
504-
allglen = uintptr(len(allgs))
525+
if &allgs[0] != allgptr {
526+
atomicstorep(unsafe.Pointer(&allgptr), unsafe.Pointer(&allgs[0]))
527+
}
528+
atomic.Storeuintptr(&allglen, uintptr(len(allgs)))
505529
unlock(&allglock)
506530
}
507531

532+
// atomicAllG returns &allgs[0] and len(allgs) for use with atomicAllGIndex.
533+
func atomicAllG() (**g, uintptr) {
534+
length := atomic.Loaduintptr(&allglen)
535+
ptr := (**g)(atomic.Loadp(unsafe.Pointer(&allgptr)))
536+
return ptr, length
537+
}
538+
539+
// atomicAllGIndex returns ptr[i] with the allgptr returned from atomicAllG.
540+
func atomicAllGIndex(ptr **g, i uintptr) *g {
541+
return *(**g)(add(unsafe.Pointer(ptr), i*sys.PtrSize))
542+
}
543+
508544
const (
509545
// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
510546
// 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
@@ -4266,7 +4302,7 @@ func badunlockosthread() {
42664302
}
42674303

42684304
func gcount() int32 {
4269-
n := int32(allglen) - sched.gFree.n - int32(atomic.Load(&sched.ngsys))
4305+
n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.n - int32(atomic.Load(&sched.ngsys))
42704306
for _, _p_ := range allp {
42714307
n -= _p_.gFree.n
42724308
}
@@ -4970,7 +5006,6 @@ func checkdead() {
49705006
case _Grunnable,
49715007
_Grunning,
49725008
_Gsyscall:
4973-
unlock(&allglock)
49745009
print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
49755010
throw("checkdead: runnable g")
49765011
}

src/runtime/runtime2.go

-1
Original file line numberDiff line numberDiff line change
@@ -1052,7 +1052,6 @@ func (w waitReason) String() string {
10521052
}
10531053

10541054
var (
1055-
allglen uintptr
10561055
allm *m
10571056
gomaxprocs int32
10581057
ncpu int32

src/runtime/traceback.go

+17-10
Original file line numberDiff line numberDiff line change
@@ -917,17 +917,25 @@ func tracebackothers(me *g) {
917917
level, _, _ := gotraceback()
918918

919919
// Show the current goroutine first, if we haven't already.
920-
g := getg()
921-
gp := g.m.curg
922-
if gp != nil && gp != me {
920+
curgp := getg().m.curg
921+
if curgp != nil && curgp != me {
923922
print("\n")
924-
goroutineheader(gp)
925-
traceback(^uintptr(0), ^uintptr(0), 0, gp)
923+
goroutineheader(curgp)
924+
traceback(^uintptr(0), ^uintptr(0), 0, curgp)
926925
}
927926

928-
lock(&allglock)
929-
for _, gp := range allgs {
930-
if gp == me || gp == g.m.curg || readgstatus(gp) == _Gdead || isSystemGoroutine(gp, false) && level < 2 {
927+
// We can't take allglock here because this may be during fatal
928+
// throw/panic, where locking allglock could be out-of-order or a
929+
// direct deadlock.
930+
//
931+
// Instead, use atomic access to allgs which requires no locking. We
932+
// don't lock against concurrent creation of new Gs, but even with
933+
// allglock we may miss Gs created after this loop.
934+
ptr, length := atomicAllG()
935+
for i := uintptr(0); i < length; i++ {
936+
gp := atomicAllGIndex(ptr, i)
937+
938+
if gp == me || gp == curgp || readgstatus(gp) == _Gdead || isSystemGoroutine(gp, false) && level < 2 {
931939
continue
932940
}
933941
print("\n")
@@ -936,14 +944,13 @@ func tracebackothers(me *g) {
936944
// called from a signal handler initiated during a
937945
// systemstack call. The original G is still in the
938946
// running state, and we want to print its stack.
939-
if gp.m != g.m && readgstatus(gp)&^_Gscan == _Grunning {
947+
if gp.m != getg().m && readgstatus(gp)&^_Gscan == _Grunning {
940948
print("\tgoroutine running on other thread; stack unavailable\n")
941949
printcreatedby(gp)
942950
} else {
943951
traceback(^uintptr(0), ^uintptr(0), 0, gp)
944952
}
945953
}
946-
unlock(&allglock)
947954
}
948955

949956
// tracebackHexdump hexdumps part of stk around frame.sp and frame.fp

0 commit comments

Comments
 (0)