Skip to content

cockroach starter could better report early termination #1145

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 10, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 114 additions & 11 deletions test-utils/src/dev/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use std::ffi::{OsStr, OsString};
use std::fmt;
use std::ops::Deref;
use std::os::unix::ffi::OsStringExt;
use std::os::unix::process::ExitStatusExt;
use std::path::Path;
use std::path::PathBuf;
use std::process::Stdio;
Expand Down Expand Up @@ -308,10 +309,8 @@ impl CockroachStarter {
let exited = process_exited(&mut child_process);
let listen_url_file = self.listen_url_file.clone();
async move {
if exited {
return Err(poll::CondCheckError::Failed(
CockroachStartError::Exited,
));
if let Some(exit_error) = exited {
return Err(poll::CondCheckError::Failed(exit_error));
}

// When ready, CockroachDB will write the URL on which it's
Expand Down Expand Up @@ -379,11 +378,23 @@ pub enum CockroachStartError {
source: std::io::Error,
},

#[error("wrong version of CockroachDB installed. expected '{expected:}', found: '{found:?}")]
#[error(
"wrong version of CockroachDB installed. expected '{expected:}', \
found: '{found:?}"
)]
BadVersion { expected: String, found: Result<String, anyhow::Error> },

#[error("cockroach failed to start (see error output above)")]
Exited,
#[error(
"cockroach exited unexpectedly with status {exit_code} \
(see error output above)"
)]
Exited { exit_code: i32 },

#[error(
"cockroach unexpectedly terminated by signal {signal} \
(see error output above)"
)]
Signaled { signal: i32 },

#[error("error parsing listen URL {listen_url:?}")]
BadListenUrl {
Expand All @@ -398,6 +409,12 @@ pub enum CockroachStartError {
may still exist)"
)]
TimedOut { pid: u32, time_waited: Duration },

#[error("unknown error waiting for cockroach to start")]
Unknown {
#[source]
source: anyhow::Error,
},
}

/// Manages a CockroachDB process running as a single-node cluster
Expand Down Expand Up @@ -590,10 +607,34 @@ pub async fn check_db_version() -> Result<(), CockroachStartError> {
///
/// It seems we can infer that "the exit status is not available at this time"
/// means that the process has not exited. After all, if it _had_ exited, we'd
/// fall into the first case. It's not clear under what conditions this function
/// could ever fail. It's not clear from the source that it's even possible.
fn process_exited(child_process: &mut tokio::process::Child) -> bool {
child_process.try_wait().unwrap().is_some()
/// fall into the first case. It's not clear under what conditions this
/// function could ever fail. It's not clear from the source that it's even
/// possible.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like tokio ultimately calls try_wait from std, which calls waitpid with WNOHANG. Per the manpage of waitpid (at least on illumos), it doesn't seem possible for any of the error cases to trigger (absent some weird bug where the process doesn't have the right pid or something like that), at least assuming we can't get EINTR if we passed WNOHANG (which I'm not sure about?).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I don't remember the details (this comment didn't actually change here, it's pretty old) but I came to a similar conclusion and left this vague note.

fn process_exited(
child_process: &mut tokio::process::Child,
) -> Option<CockroachStartError> {
child_process.try_wait().unwrap().map(interpret_exit)
}

fn interpret_exit(
exit_status: std::process::ExitStatus,
) -> CockroachStartError {
if let Some(exit_code) = exit_status.code() {
CockroachStartError::Exited { exit_code }
} else {
if let Some(signal) = exit_status.signal() {
CockroachStartError::Signaled { signal }
} else {
// This case should not be possible.
CockroachStartError::Unknown {
source: anyhow!(
"process has an exit status, but no exit \
code nor signal: {:?}",
exit_status
),
}
}
}
}

/// Populate a database with the Omicron schema and any initial objects
Expand Down Expand Up @@ -844,11 +885,13 @@ mod test {
use super::CockroachStartError;
use super::CockroachStarter;
use super::CockroachStarterBuilder;
use crate::dev::db::process_exited;
use crate::dev::poll;
use crate::dev::process_running;
use std::env;
use std::path::Path;
use std::path::PathBuf;
use std::process::Stdio;
use std::time::Duration;
use tempfile::tempdir;
use tokio::fs;
Expand Down Expand Up @@ -1237,4 +1280,64 @@ mod test {
(did not expect any of these fields: application_name)"
));
}

// Tests the way `process_exited()` checks and interprets the exit status
// for normal process termination.
#[tokio::test]
async fn test_process_exit_normal() {
// Launch a process that exits with a known status code.
let mut child_process = tokio::process::Command::new("bash")
.args(&["-c", "exit 3"])
.spawn()
.expect("failed to invoke bash");
let pid = child_process.id().unwrap();
println!("launched child process {}", pid);

// The only way we have to wait for the process to exit also consumes
// its exit status.
let result = child_process
.wait()
.await
.expect("failed to wait for child process completion");
let exit_status = super::interpret_exit(result);

println!("process exited: {:?}", exit_status);
assert!(matches!(exit_status,
CockroachStartError::Exited { exit_code } if exit_code == 3
));
}

// Tests the way `process_exited()` checks and interprets the exit status
// for abnormal process termination (by a signal).
#[tokio::test]
async fn test_process_exit_abnormal() {
// Launch a process that will hang until we can kill it with a known
// signal.
let mut child_process = tokio::process::Command::new("cat")
.stdin(Stdio::piped())
.spawn()
.expect("failed to invoke cat");
let pid = child_process.id().unwrap();
println!("launched child process {}", pid);

// The child must not have exited yet because it's waiting on stdin.
let exited = process_exited(&mut child_process);
assert!(exited.is_none());

// Kill the child process with a known signal.
child_process.start_kill().unwrap();

// The only way we have to wait for the process to exit also consumes
// its exit status.
let result = child_process
.wait()
.await
.expect("failed to wait for child process completion");
let exit_status = super::interpret_exit(result);

println!("process exited: {:?}", exit_status);
assert!(matches!(exit_status,
CockroachStartError::Signaled { signal } if signal == libc::SIGKILL
));
}
}