Skip to content

Commit 6772610

Browse files
committed
wip
1 parent f60dc8f commit 6772610

File tree

1 file changed

+133
-26
lines changed

1 file changed

+133
-26
lines changed

bin/ev-reth/src/main.rs

Lines changed: 133 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -63,26 +63,53 @@ struct NodeConfig {
6363
}
6464

6565
impl NodeConfig {
66-
/// Minimum allowed shutdown timeout in seconds
67-
const MIN_SHUTDOWN_TIMEOUT_SECS: u64 = 1;
68-
69-
/// Default shutdown timeout in seconds (optimized for containers)
70-
const DEFAULT_SHUTDOWN_TIMEOUT_SECS: u64 = 15;
71-
72-
/// Maximum allowed shutdown timeout in seconds (5 minutes)
73-
const MAX_SHUTDOWN_TIMEOUT_SECS: u64 = 300;
74-
75-
/// Minimum allowed status check interval in seconds
76-
const MIN_STATUS_CHECK_INTERVAL_SECS: u64 = 1;
77-
78-
/// Default status check interval in seconds (1 hour)
79-
const DEFAULT_STATUS_CHECK_INTERVAL_SECS: u64 = 3600;
80-
81-
/// Maximum allowed status check interval in seconds (6 hours)
82-
const MAX_STATUS_CHECK_INTERVAL_SECS: u64 = 21600;
83-
84-
/// Default maximum number of fallback status checks
85-
const DEFAULT_MAX_FALLBACK_CHECKS: u64 = 24;
66+
/// Minimum shutdown timeout prevents immediate termination that could cause data corruption
67+
///
68+
/// Set to 1 second to ensure basic cleanup operations can complete while preventing
69+
/// indefinite hangs during shutdown sequences.
70+
pub const MIN_SHUTDOWN_TIMEOUT_SECS: u64 = 1;
71+
72+
/// Default timeout optimized for Kubernetes pod termination grace period (30s).
73+
/// Set to 15s to allow cleanup before SIGKILL
74+
///
75+
/// This provides sufficient time for graceful shutdown in containerized environments
76+
/// while leaving buffer time before the container orchestrator sends SIGKILL.
77+
/// The 15-second timeout allows for connection draining, state persistence, and
78+
/// cleanup operations to complete normally.
79+
pub const DEFAULT_SHUTDOWN_TIMEOUT_SECS: u64 = 15;
80+
81+
/// Maximum shutdown timeout prevents indefinite hangs during node termination
82+
///
83+
/// Set to 5 minutes (300s) to handle complex shutdown scenarios while ensuring
84+
/// the node doesn't hang indefinitely. This upper bound protects against
85+
/// deadlocks or resource contention that could prevent clean shutdown.
86+
pub const MAX_SHUTDOWN_TIMEOUT_SECS: u64 = 300;
87+
88+
/// Minimum status check interval ensures reasonable monitoring frequency
89+
///
90+
/// Set to 1 second to prevent excessive CPU usage from overly frequent status checks
91+
/// while still allowing responsive monitoring when needed.
92+
pub const MIN_STATUS_CHECK_INTERVAL_SECS: u64 = 1;
93+
94+
/// Default status check interval balances monitoring with resource efficiency
95+
///
96+
/// Set to 1 hour (3600s) to provide periodic health status logging without
97+
/// overwhelming logs or consuming excessive resources. This interval is suitable
98+
/// for long-running production deployments where occasional status updates are sufficient.
99+
pub const DEFAULT_STATUS_CHECK_INTERVAL_SECS: u64 = 3600;
100+
101+
/// Maximum status check interval prevents excessively sparse monitoring
102+
///
103+
/// Set to 6 hours (21600s) to ensure status checks occur at least 4 times per day,
104+
/// providing minimum visibility into node health for operational monitoring.
105+
pub const MAX_STATUS_CHECK_INTERVAL_SECS: u64 = 21600;
106+
107+
/// Default maximum fallback status checks limits resource usage during fallback mode
108+
///
109+
/// Set to 24 checks to provide up to 24 hours of status logging (at default 1-hour intervals)
110+
/// before switching to efficient indefinite waiting. This prevents log spam while
111+
/// maintaining visibility during extended fallback periods.
112+
pub const DEFAULT_MAX_FALLBACK_CHECKS: u64 = 24;
86113

87114
/// Load configuration from environment variables with validation
88115
fn from_env() -> Self {
@@ -215,7 +242,7 @@ async fn signal_fallback_mechanism(config: &NodeConfig) {
215242
}
216243

217244
/// Handle shutdown signals with optimized, non-redundant signal handling
218-
async fn handle_shutdown_signals() {
245+
async fn handle_shutdown_signals(config: &NodeConfig) {
219246
#[cfg(unix)]
220247
{
221248
// On Unix systems, handle SIGTERM and SIGINT separately to avoid redundancy
@@ -261,8 +288,7 @@ async fn handle_shutdown_signals() {
261288
tracing::warn!(
262289
"No signal handling available, shutdown will only occur on natural node exit"
263290
);
264-
let config = NodeConfig::from_env();
265-
signal_fallback_mechanism(&config).await;
291+
signal_fallback_mechanism(config).await;
266292
}
267293
}
268294
}
@@ -275,8 +301,7 @@ async fn handle_shutdown_signals() {
275301
tracing::warn!(
276302
"No signal handling available, shutdown will only occur on natural node exit"
277303
);
278-
let config = NodeConfig::from_env();
279-
signal_fallback_mechanism(&config).await;
304+
signal_fallback_mechanism(config).await;
280305
} else {
281306
tracing::info!("Received SIGINT/Ctrl+C, initiating graceful shutdown");
282307
}
@@ -382,9 +407,91 @@ where
382407
}
383408
}
384409

410+
/// Validate critical environment variables at startup to prevent runtime issues
411+
fn validate_env_vars() -> Result<(), String> {
412+
// Validate shutdown timeout
413+
if let Ok(val) = std::env::var("EV_RETH_SHUTDOWN_TIMEOUT") {
414+
let timeout = val.parse::<u64>().map_err(|_| {
415+
format!("Invalid EV_RETH_SHUTDOWN_TIMEOUT: '{}' - must be a valid number", val)
416+
})?;
417+
418+
if timeout < NodeConfig::MIN_SHUTDOWN_TIMEOUT_SECS {
419+
return Err(format!(
420+
"EV_RETH_SHUTDOWN_TIMEOUT: {} is below minimum of {}s",
421+
timeout, NodeConfig::MIN_SHUTDOWN_TIMEOUT_SECS
422+
));
423+
}
424+
425+
if timeout > NodeConfig::MAX_SHUTDOWN_TIMEOUT_SECS {
426+
return Err(format!(
427+
"EV_RETH_SHUTDOWN_TIMEOUT: {} exceeds maximum of {}s",
428+
timeout, NodeConfig::MAX_SHUTDOWN_TIMEOUT_SECS
429+
));
430+
}
431+
}
432+
433+
// Validate status check interval
434+
if let Ok(val) = std::env::var("EV_RETH_STATUS_CHECK_INTERVAL") {
435+
let interval = val.parse::<u64>().map_err(|_| {
436+
format!("Invalid EV_RETH_STATUS_CHECK_INTERVAL: '{}' - must be a valid number", val)
437+
})?;
438+
439+
if interval < NodeConfig::MIN_STATUS_CHECK_INTERVAL_SECS {
440+
return Err(format!(
441+
"EV_RETH_STATUS_CHECK_INTERVAL: {} is below minimum of {}s",
442+
interval, NodeConfig::MIN_STATUS_CHECK_INTERVAL_SECS
443+
));
444+
}
445+
446+
if interval > NodeConfig::MAX_STATUS_CHECK_INTERVAL_SECS {
447+
return Err(format!(
448+
"EV_RETH_STATUS_CHECK_INTERVAL: {} exceeds maximum of {}s",
449+
interval, NodeConfig::MAX_STATUS_CHECK_INTERVAL_SECS
450+
));
451+
}
452+
}
453+
454+
// Validate fallback status checks flag
455+
if let Ok(val) = std::env::var("EV_RETH_ENABLE_FALLBACK_STATUS_CHECKS") {
456+
let normalized = val.to_lowercase();
457+
if normalized != "true" && normalized != "false" {
458+
return Err(format!(
459+
"Invalid EV_RETH_ENABLE_FALLBACK_STATUS_CHECKS: '{}' - must be 'true' or 'false'",
460+
val
461+
));
462+
}
463+
}
464+
465+
// Validate max fallback checks
466+
if let Ok(val) = std::env::var("EV_RETH_MAX_FALLBACK_CHECKS") {
467+
val.parse::<u64>().map_err(|_| {
468+
format!("Invalid EV_RETH_MAX_FALLBACK_CHECKS: '{}' - must be a valid number", val)
469+
})?;
470+
}
471+
472+
// Validate RUST_BACKTRACE if set by user (we set it ourselves if not present)
473+
if let Ok(val) = std::env::var("RUST_BACKTRACE") {
474+
let normalized = val.to_lowercase();
475+
if normalized != "0" && normalized != "1" && normalized != "full" {
476+
return Err(format!(
477+
"Invalid RUST_BACKTRACE: '{}' - must be '0', '1', or 'full'",
478+
val
479+
));
480+
}
481+
}
482+
483+
Ok(())
484+
}
485+
385486
fn main() {
386487
tracing::info!("=== EV-RETH NODE STARTING ===");
387488

489+
// Validate environment variables early to catch configuration issues
490+
if let Err(err) = validate_env_vars() {
491+
eprintln!("Environment variable validation failed: {}", err);
492+
std::process::exit(1);
493+
}
494+
388495
reth_cli_util::sigsegv_handler::install();
389496

390497
// Enable backtraces unless a RUST_BACKTRACE value has already been explicitly provided.
@@ -428,7 +535,7 @@ fn main() {
428535
tracing::info!("Node exited naturally");
429536
result
430537
}
431-
_ = handle_shutdown_signals() => {
538+
_ = handle_shutdown_signals(&config) => {
432539
tracing::info!("Shutdown signal received, initiating graceful shutdown");
433540

434541
// Structured shutdown phases for better observability (informational only)

0 commit comments

Comments
 (0)