diff --git a/ballista/scheduler/src/scheduler_process.rs b/ballista/scheduler/src/scheduler_process.rs index 24bc58b9b0..fcc5f74b2f 100644 --- a/ballista/scheduler/src/scheduler_process.rs +++ b/ballista/scheduler/src/scheduler_process.rs @@ -40,7 +40,7 @@ use datafusion_proto::logical_plan::AsLogicalPlan; use datafusion_proto::physical_plan::AsExecutionPlan; use datafusion_proto::protobuf::{LogicalPlanNode, PhysicalPlanNode}; use http::StatusCode; -use log::info; +use log::{info, warn}; use std::{net::SocketAddr, sync::Arc}; use tonic::service::RoutesBuilder; /// Creates as initialized scheduler service @@ -58,6 +58,17 @@ pub async fn create_scheduler< config.scheduling_policy ); + if config.bind_host != "127.0.0.1" && config.external_host == "localhost" { + warn!( + "Scheduler is bound to {} but --external-host is still the default 'localhost'. \ + Executors will be told to call back to 'localhost:{}' for task status and \ + heartbeats, which is unlikely to be reachable from other hosts or pods. \ + Set --external-host to a hostname or IP that executors can resolve to this \ + scheduler.", + config.bind_host, config.bind_port + ); + } + let codec_logical = config .override_logical_codec .clone() diff --git a/docker-compose.yml b/docker-compose.yml index c21230ec0e..87c57568af 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,7 +21,7 @@ services: build: dockerfile: dev/docker/ballista-scheduler.Dockerfile context: . 
- command: "--bind-host 0.0.0.0" + command: "--bind-host 0.0.0.0 --external-host ballista-scheduler" ports: - "50050:50050" environment: diff --git a/docs/source/user-guide/deployment/docker-compose.md b/docs/source/user-guide/deployment/docker-compose.md index ff3c533611..147f92e284 100644 --- a/docs/source/user-guide/deployment/docker-compose.md +++ b/docs/source/user-guide/deployment/docker-compose.md @@ -48,6 +48,12 @@ ballista-executor_1 | INFO ballista_executor: Ballista v52.0.0 Rust Executor l The scheduler listens on port 50050 and this is the port that clients will need to connect to. +Note that the scheduler is started with `--external-host ballista-scheduler` so +that it advertises the Compose service name to executors. Executors call back to +that address to report task status; if it is left at the default of `localhost`, +status updates fail because executors try to dial `localhost:50050` inside their +own container. + ## Connect from the Ballista CLI ```shell diff --git a/docs/source/user-guide/deployment/kubernetes.md b/docs/source/user-guide/deployment/kubernetes.md index a979a6c8c5..f18b0b61b7 100644 --- a/docs/source/user-guide/deployment/kubernetes.md +++ b/docs/source/user-guide/deployment/kubernetes.md @@ -106,6 +106,14 @@ persistentvolumeclaim/data-pv-claim created Copy the following yaml to a `cluster.yaml` file and change `<your-image>` with the name of your Ballista Docker image. +The scheduler is started with `--external-host=ballista-scheduler` so that it +advertises the cluster-internal Service DNS name to executors. Executors call +back to that address to report task status and heartbeats; if it is left at the +default of `localhost`, executors will try to dial `localhost:50050` inside +their own pod and fail with `Fail to connect to scheduler localhost:50050`. +Replace `ballista-scheduler` with whatever Service name resolves to the +scheduler in your namespace. 
+ ```yaml apiVersion: v1 kind: Service @@ -140,7 +148,9 @@ spec: containers: - name: ballista-scheduler image: <your-repo>/datafusion-ballista-scheduler:latest - args: ["--bind-port=50050"] + args: + - "--bind-port=50050" + - "--external-host=ballista-scheduler" ports: - containerPort: 50050 name: flight