From e5916e6f7b9a733461098b5dad22c3fcddb4d25a Mon Sep 17 00:00:00 2001 From: zzkcode Date: Sun, 22 Oct 2023 16:21:47 +0800 Subject: [PATCH 1/6] runtime: let the fault thread to crash the process --- src/runtime/signal_unix.go | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go index cd9fd5d796571b..169a11f7e3c943 100644 --- a/src/runtime/signal_unix.go +++ b/src/runtime/signal_unix.go @@ -752,20 +752,34 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { } if docrash { + isCrashThread := false + if crashing == 0 { + isCrashThread = true + } crashing++ if crashing < mcount()-int32(extraMLength.Load()) { // There are other m's that need to dump their stacks. // Relay SIGQUIT to the next m by sending it to the current process. // All m's that have already received SIGQUIT have signal masks blocking // receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet. - // When the last m receives the SIGQUIT, it will fall through to the call to - // crash below. Just in case the relaying gets botched, each m involved in + // The first m will wait until all ms received the SIGQUIT, then crash/exit. + // Just in case the relaying gets botched, each m involved in // the relay sleeps for 5 seconds and then does the crash/exit itself. - // In expected operation, the last m has received the SIGQUIT and run - // crash/exit and the process is gone, all long before any of the - // 5-second sleeps have finished. + // In expected operation, the first m will wait until the last m has received the SIGQUIT, + // and then run crash/exit and the process is gone. + // However, if it spends more than 5 seconds to send SIGQUIT to all ms, + // any of ms may crash/exit the process after waiting for 5 seconds. print("\n-----\n\n") raiseproc(_SIGQUIT) + } + // fix #63277 + if isCrashThread { + i := 0 + for (crashing < mcount()-int32(extraMLength.Load()) && i < 10) { + i++ + usleep(500 * 1000) + } + } else { usleep(5 * 1000 * 1000) } printDebugLog() From 40765fe9b7772c88b04bbedf6d7695ac5d209304 Mon Sep 17 00:00:00 2001 From: zzkcode Date: Wed, 8 Nov 2023 21:39:35 +0800 Subject: [PATCH 2/6] runtime: update comments and make crashing atomic --- src/runtime/signal_unix.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go index 169a11f7e3c943..d7b0964d46bc1b 100644 --- a/src/runtime/signal_unix.go +++ b/src/runtime/signal_unix.go @@ -597,7 +597,7 @@ func adjustSignalStack(sig uint32, mp *m, gsigStack *gsignalStack) bool { // crashing is the number of m's we have waited for when implementing // GOTRACEBACK=crash when a signal is received. -var crashing int32 +var crashing atomic.Int32 // testSigtrap and testSigusr1 are used by the runtime tests. If // non-nil, it is called on SIGTRAP/SIGUSR1. If it returns true, the @@ -730,7 +730,7 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { mp.throwing = throwTypeRuntime mp.caughtsig.set(gp) - if crashing == 0 { + if crashing.Load() == 0 { startpanic_m() } @@ -740,11 +740,11 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { if level > 0 { goroutineheader(gp) tracebacktrap(c.sigpc(), c.sigsp(), c.siglr(), gp) - if crashing > 0 && gp != mp.curg && mp.curg != nil && readgstatus(mp.curg)&^_Gscan == _Grunning { + if crashing.Load() > 0 && gp != mp.curg && mp.curg != nil && readgstatus(mp.curg)&^_Gscan == _Grunning { // tracebackothers on original m skipped this one; trace it now. goroutineheader(mp.curg) traceback(^uintptr(0), ^uintptr(0), 0, mp.curg) - } else if crashing == 0 { + } else if crashing.Load() == 0 { tracebackothers(gp) print("\n") } @@ -753,11 +753,11 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { if docrash { isCrashThread := false - if crashing == 0 { + if crashing.Load() == 0 { isCrashThread = true } - crashing++ - if crashing < mcount()-int32(extraMLength.Load()) { + crashing.Add(1) + if crashing.Load() < mcount()-int32(extraMLength.Load()) { // There are other m's that need to dump their stacks. // Relay SIGQUIT to the next m by sending it to the current process. // All m's that have already received SIGQUIT have signal masks blocking @@ -765,17 +765,17 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { // The first m will wait until all ms received the SIGQUIT, then crash/exit. // Just in case the relaying gets botched, each m involved in // the relay sleeps for 5 seconds and then does the crash/exit itself. - // In expected operation, the first m will wait until the last m has received the SIGQUIT, + // The faulting m is crashing first so it is the faulting thread in the core dump (see issue #63277): + // in expected operation, the first m will wait until the last m has received the SIGQUIT, // and then run crash/exit and the process is gone. // However, if it spends more than 5 seconds to send SIGQUIT to all ms, // any of ms may crash/exit the process after waiting for 5 seconds. print("\n-----\n\n") raiseproc(_SIGQUIT) } - // fix #63277 if isCrashThread { i := 0 - for (crashing < mcount()-int32(extraMLength.Load()) && i < 10) { + for (crashing.Load() < mcount()-int32(extraMLength.Load())) && i < 10 { i++ usleep(500 * 1000) } From 398a0fe7955d47d3f2367cd40b6ff2f3797e2986 Mon Sep 17 00:00:00 2001 From: zzkcode Date: Thu, 9 Nov 2023 21:30:19 +0800 Subject: [PATCH 3/6] runtime: add test for fault thread crash process --- src/runtime/runtime-gdb_unix_test.go | 173 +++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/src/runtime/runtime-gdb_unix_test.go b/src/runtime/runtime-gdb_unix_test.go index 5413306f772da3..4a329c086af013 100644 --- a/src/runtime/runtime-gdb_unix_test.go +++ b/src/runtime/runtime-gdb_unix_test.go @@ -230,3 +230,176 @@ func TestGdbCoreSignalBacktrace(t *testing.T) { t.Fatalf("could not find runtime symbol in backtrace after signal handler:\n%s", rest) } } + +const coreCrashThreadSource = ` +package main + +/* +#cgo CFLAGS: -g -O0 -Wall +#include +#include +void trigger_crash() +{ + int* ptr = NULL; + *ptr = 1024; +} +*/ +import "C" +import ( + "flag" + "fmt" + "os" + "runtime/debug" + "syscall" +) + +func enableCore() { + debug.SetTraceback("crash") + + var lim syscall.Rlimit + err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim) + if err != nil { + panic(fmt.Sprintf("error getting rlimit: %v", err)) + } + lim.Cur = lim.Max + fmt.Fprintf(os.Stderr, "Setting RLIMIT_CORE = %+#v\n", lim) + err = syscall.Setrlimit(syscall.RLIMIT_CORE, &lim) + if err != nil { + panic(fmt.Sprintf("error setting rlimit: %v", err)) + } +} + +func main() { + flag.Parse() + + enableCore() + + C.trigger_crash() +} +` + +// TestGdbCoreCrashThreadBacktrace tests that runtime could let the fault thread to crash process +// and make fault thread as number one thread while gdb in a core file +func TestGdbCoreCrashThreadBacktrace(t *testing.T) { + if runtime.GOOS != "linux" { + // N.B. This test isn't fundamentally Linux-only, but it needs + // to know how to enable/find core files on each OS. + t.Skip("Test only supported on Linux") + } + if runtime.GOARCH != "386" && runtime.GOARCH != "amd64" { + // TODO(go.dev/issue/25218): Other architectures use sigreturn + // via VDSO, which we somehow don't handle correctly. + t.Skip("Backtrace through signal handler only works on 386 and amd64") + } + + checkGdbEnvironment(t) + t.Parallel() + checkGdbVersion(t) + + // Ensure there is enough RLIMIT_CORE available to generate a full core. + var lim syscall.Rlimit + err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim) + if err != nil { + t.Fatalf("error getting rlimit: %v", err) + } + // Minimum RLIMIT_CORE max to allow. This is a conservative estimate. + // Most systems allow infinity. + const minRlimitCore = 100 << 20 // 100 MB + if lim.Max < minRlimitCore { + t.Skipf("RLIMIT_CORE max too low: %#+v", lim) + } + + // Make sure core pattern will send core to the current directory. + b, err := os.ReadFile("/proc/sys/kernel/core_pattern") + if err != nil { + t.Fatalf("error reading core_pattern: %v", err) + } + if string(b) != "core\n" { + t.Skipf("Unexpected core pattern %q", string(b)) + } + + coreUsesPID := false + b, err = os.ReadFile("/proc/sys/kernel/core_uses_pid") + if err == nil { + switch string(bytes.TrimSpace(b)) { + case "0": + case "1": + coreUsesPID = true + default: + t.Skipf("unexpected core_uses_pid value %q", string(b)) + } + } + + dir := t.TempDir() + + // Build the source code. + src := filepath.Join(dir, "main.go") + err = os.WriteFile(src, []byte(coreCrashThreadSource), 0644) + if err != nil { + t.Fatalf("failed to create file: %v", err) + } + cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe", "main.go") + cmd.Dir = dir + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() + if err != nil { + t.Fatalf("building source %v\n%s", err, out) + } + + // Start the test binary. + cmd = testenv.Command(t, "./a.exe") + cmd.Dir = dir + var output bytes.Buffer + cmd.Stdout = &output // for test logging + cmd.Stderr = &output + + if err := cmd.Start(); err != nil { + t.Fatalf("error starting test binary: %v", err) + } + + pid := cmd.Process.Pid + + err = cmd.Wait() + t.Logf("child output:\n%s", output.String()) + if err == nil { + t.Fatalf("Wait succeeded, want SIGSEGV") + } + ee, ok := err.(*exec.ExitError) + if !ok { + t.Fatalf("Wait err got %T %v, want exec.ExitError", ee, ee) + } + ws, ok := ee.Sys().(syscall.WaitStatus) + if !ok { + t.Fatalf("Sys got %T %v, want syscall.WaitStatus", ee.Sys(), ee.Sys()) + } + if ws.Signal() != syscall.SIGABRT { + t.Fatalf("Signal got %d want SIGABRT", ws.Signal()) + } + if !ws.CoreDump() { + t.Fatalf("CoreDump got %v want true", ws.CoreDump()) + } + + coreFile := "core" + if coreUsesPID { + coreFile += fmt.Sprintf(".%d", pid) + } + + // Execute gdb commands. + args := []string{"-nx", "-batch", + "-iex", "add-auto-load-safe-path " + filepath.Join(testenv.GOROOT(t), "src", "runtime"), + "-ex", "backtrace", + filepath.Join(dir, "a.exe"), + filepath.Join(dir, coreFile), + } + cmd = testenv.Command(t, "gdb", args...) + + got, err := cmd.CombinedOutput() + t.Logf("gdb output:\n%s", got) + if err != nil { + t.Fatalf("gdb exited with error: %v", err) + } + + re := regexp.MustCompile(`#.* trigger_crash`) + if found := re.Find(got) != nil; !found { + t.Fatalf("could not find trigger_crash in backtrace") + } +} From 9e171360ce32bd60522ce01c6a1e077228aeb917 Mon Sep 17 00:00:00 2001 From: zzkcode Date: Tue, 21 Nov 2023 22:48:56 +0800 Subject: [PATCH 4/6] runtime: use CompareAndSwap to avoid racy issue while two threads crash concurrently & fix typo --- src/runtime/runtime-gdb_unix_test.go | 2 +- src/runtime/signal_unix.go | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/runtime/runtime-gdb_unix_test.go b/src/runtime/runtime-gdb_unix_test.go index 4a329c086af013..910a93d628858b 100644 --- a/src/runtime/runtime-gdb_unix_test.go +++ b/src/runtime/runtime-gdb_unix_test.go @@ -361,7 +361,7 @@ func TestGdbCoreCrashThreadBacktrace(t *testing.T) { err = cmd.Wait() t.Logf("child output:\n%s", output.String()) if err == nil { - t.Fatalf("Wait succeeded, want SIGSEGV") + t.Fatalf("Wait succeeded, want SIGABRT") } ee, ok := err.(*exec.ExitError) if !ok { diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go index d7b0964d46bc1b..84391d58ed7f22 100644 --- a/src/runtime/signal_unix.go +++ b/src/runtime/signal_unix.go @@ -753,10 +753,11 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { if docrash { isCrashThread := false - if crashing.Load() == 0 { + if crashing.CompareAndSwap(0, 1) { isCrashThread = true + } else { + crashing.Add(1) } - crashing.Add(1) if crashing.Load() < mcount()-int32(extraMLength.Load()) { // There are other m's that need to dump their stacks. // Relay SIGQUIT to the next m by sending it to the current process. From 2a9f9e22d315dc049ce35dc67d20de2f67b6d4a0 Mon Sep 17 00:00:00 2001 From: zzkcode Date: Thu, 30 Nov 2023 00:03:43 +0800 Subject: [PATCH 5/6] runtime: extract canGenerateCore --- src/runtime/runtime-gdb_unix_test.go | 115 ++++++++++----------------- 1 file changed, 43 insertions(+), 72 deletions(-) diff --git a/src/runtime/runtime-gdb_unix_test.go b/src/runtime/runtime-gdb_unix_test.go index 910a93d628858b..2dc1e31e13fe4a 100644 --- a/src/runtime/runtime-gdb_unix_test.go +++ b/src/runtime/runtime-gdb_unix_test.go @@ -20,6 +20,43 @@ import ( "testing" ) +func canGenerateCore(t *testing.T) bool { + // Ensure there is enough RLIMIT_CORE available to generate a full core. + var lim syscall.Rlimit + err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim) + if err != nil { + t.Fatalf("error getting rlimit: %v", err) + } + // Minimum RLIMIT_CORE max to allow. This is a conservative estimate. + // Most systems allow infinity. + const minRlimitCore = 100 << 20 // 100 MB + if lim.Max < minRlimitCore { + t.Skipf("RLIMIT_CORE max too low: %#+v", lim) + } + + // Make sure core pattern will send core to the current directory. + b, err := os.ReadFile("/proc/sys/kernel/core_pattern") + if err != nil { + t.Fatalf("error reading core_pattern: %v", err) + } + if string(b) != "core\n" { + t.Skipf("Unexpected core pattern %q", string(b)) + } + + coreUsesPID := false + b, err = os.ReadFile("/proc/sys/kernel/core_uses_pid") + if err == nil { + switch string(bytes.TrimSpace(b)) { + case "0": + case "1": + coreUsesPID = true + default: + t.Skipf("unexpected core_uses_pid value %q", string(b)) + } + } + return coreUsesPID +} + const coreSignalSource = ` package main @@ -81,45 +118,12 @@ func TestGdbCoreSignalBacktrace(t *testing.T) { t.Parallel() checkGdbVersion(t) - // Ensure there is enough RLIMIT_CORE available to generate a full core. - var lim syscall.Rlimit - err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim) - if err != nil { - t.Fatalf("error getting rlimit: %v", err) - } - // Minimum RLIMIT_CORE max to allow. This is a conservative estimate. - // Most systems allow infinity. - const minRlimitCore = 100 << 20 // 100 MB - if lim.Max < minRlimitCore { - t.Skipf("RLIMIT_CORE max too low: %#+v", lim) - } - - // Make sure core pattern will send core to the current directory. - b, err := os.ReadFile("/proc/sys/kernel/core_pattern") - if err != nil { - t.Fatalf("error reading core_pattern: %v", err) - } - if string(b) != "core\n" { - t.Skipf("Unexpected core pattern %q", string(b)) - } - - coreUsesPID := false - b, err = os.ReadFile("/proc/sys/kernel/core_uses_pid") - if err == nil { - switch string(bytes.TrimSpace(b)) { - case "0": - case "1": - coreUsesPID = true - default: - t.Skipf("unexpected core_uses_pid value %q", string(b)) - } - } - - dir := t.TempDir() + coreUsesPID := canGenerateCore(t) // Build the source code. + dir := t.TempDir() src := filepath.Join(dir, "main.go") - err = os.WriteFile(src, []byte(coreSignalSource), 0644) + err := os.WriteFile(src, []byte(coreSignalSource), 0644) if err != nil { t.Fatalf("failed to create file: %v", err) } @@ -296,45 +300,12 @@ func TestGdbCoreCrashThreadBacktrace(t *testing.T) { t.Parallel() checkGdbVersion(t) - // Ensure there is enough RLIMIT_CORE available to generate a full core. - var lim syscall.Rlimit - err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim) - if err != nil { - t.Fatalf("error getting rlimit: %v", err) - } - // Minimum RLIMIT_CORE max to allow. This is a conservative estimate. - // Most systems allow infinity. - const minRlimitCore = 100 << 20 // 100 MB - if lim.Max < minRlimitCore { - t.Skipf("RLIMIT_CORE max too low: %#+v", lim) - } - - // Make sure core pattern will send core to the current directory. - b, err := os.ReadFile("/proc/sys/kernel/core_pattern") - if err != nil { - t.Fatalf("error reading core_pattern: %v", err) - } - if string(b) != "core\n" { - t.Skipf("Unexpected core pattern %q", string(b)) - } - - coreUsesPID := false - b, err = os.ReadFile("/proc/sys/kernel/core_uses_pid") - if err == nil { - switch string(bytes.TrimSpace(b)) { - case "0": - case "1": - coreUsesPID = true - default: - t.Skipf("unexpected core_uses_pid value %q", string(b)) - } - } - - dir := t.TempDir() + coreUsesPID := canGenerateCore(t) // Build the source code. + dir := t.TempDir() src := filepath.Join(dir, "main.go") - err = os.WriteFile(src, []byte(coreCrashThreadSource), 0644) + err := os.WriteFile(src, []byte(coreCrashThreadSource), 0644) if err != nil { t.Fatalf("failed to create file: %v", err) } From f4615c23f663a2f0794ca9e5c86fc2f0cc8552d7 Mon Sep 17 00:00:00 2001 From: zzkcode Date: Thu, 30 Nov 2023 19:38:07 +0800 Subject: [PATCH 6/6] runtime: remove CGO CFLAGS -Wall --- src/runtime/runtime-gdb_unix_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/runtime-gdb_unix_test.go b/src/runtime/runtime-gdb_unix_test.go index 2dc1e31e13fe4a..23eb3e2362f184 100644 --- a/src/runtime/runtime-gdb_unix_test.go +++ b/src/runtime/runtime-gdb_unix_test.go @@ -239,7 +239,7 @@ const coreCrashThreadSource = ` package main /* -#cgo CFLAGS: -g -O0 -Wall +#cgo CFLAGS: -g -O0 #include #include void trigger_crash()