# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14+ import copy
1415import logging
16+ import re
1517import time
1618from abc import abstractmethod
1719from enum import Enum
@@ -89,8 +91,29 @@ class PV_NAME(Enum):
8991]
9092
9193
def uuid4_to_rfc1123(uuid_str: str) -> str:
    """Sanitize a UUID-like string into a valid RFC 1123 (DNS-1123) label.

    Kubernetes pod names must be DNS-1123 labels: lowercase alphanumerics and
    hyphens, at most 63 characters, starting AND ending with an alphanumeric
    character.

    Args:
        uuid_str: raw identifier, typically a uuid4 string.

    Returns:
        A sanitized name usable as a Kubernetes pod name.  May be empty if
        the input contains no usable characters.
    """
    name = uuid_str.lower()
    # Strip any chars that aren't alphanumeric or hyphen.
    name = re.sub(r"[^a-z0-9-]", "", name)
    # A label must start with an alphanumeric, so drop leading hyphens
    # (previously "-abc" slipped through as an invalid name).
    name = name.lstrip("-")
    # Prefix with a letter if it starts with a digit (stricter than DNS-1123
    # requires, but kept for DNS-1035 contexts such as service names).
    if name and name[0].isdigit():
        name = "j" + name
    # Kubernetes label limit: 63 chars; strip trailing hyphens after truncation
    # (truncation can expose a hyphen that was interior before slicing).
    return name[:63].rstrip("-")
104+
105+
92106class K8sJobHandle (JobHandleSpec ):
93- def __init__ (self , job_id : str , api_instance : core_v1_api , job_config : dict , namespace = "default" , timeout = None ):
107+ def __init__ (
108+ self ,
109+ job_id : str ,
110+ api_instance : core_v1_api ,
111+ job_config : dict ,
112+ namespace = "default" ,
113+ timeout = None ,
114+ pending_timeout = 30 ,
115+ python_path = "/usr/local/bin/python" ,
116+ ):
94117 super ().__init__ ()
95118 self .job_id = job_id
96119 self .timeout = timeout
@@ -113,8 +136,7 @@ def __init__(self, job_id: str, api_instance: core_v1_api, job_config: dict, nam
113136 {
114137 "image" : None ,
115138 "name" : None ,
116- "resources" : None ,
117- "command" : ["/usr/local/bin/python" ],
139+ "command" : [python_path ],
118140 "args" : None , # args_list + args_dict + args_sets
119141 "volumeMounts" : None , # volume_mount_list
120142 "imagePullPolicy" : "Always" ,
@@ -127,14 +149,13 @@ def __init__(self, job_id: str, api_instance: core_v1_api, job_config: dict, nam
127149 self .container_volume_mount_list = []
128150 self ._make_manifest (job_config )
129151 self ._stuck_count = 0
130- self ._stuck_grace_period = 10 # seconds to wait before counting Pending as stuck
131- self ._max_stuck_count = (self .timeout + self ._stuck_grace_period ) if self .timeout is not None else None
152+ self ._max_stuck_count = self .timeout if self .timeout is not None else pending_timeout
132153 self .logger = logging .getLogger (self .__class__ .__name__ )
133154
134155 def _make_manifest (self , job_config ):
135156 self .container_volume_mount_list .extend (job_config .get ("volume_mount_list" , []))
136157 set_list = job_config .get ("set_list" )
137- if set_list is None :
158+ if not set_list :
138159 self .container_args_module_args_sets = list ()
139160 else :
140161 self .container_args_module_args_sets = ["--set" ] + set_list
@@ -147,57 +168,64 @@ def _make_manifest(self, job_config):
147168 if v is None :
148169 continue
149170 self .container_args_module_args_dict_as_list .append (k )
150- self .container_args_module_args_dict_as_list .append (v )
171+ self .container_args_module_args_dict_as_list .append (str ( v ) )
151172 self .volume_list .extend (job_config .get ("volume_list" , []))
152173 self .pod_manifest ["metadata" ]["name" ] = job_config .get ("name" )
153174 self .pod_manifest ["spec" ]["containers" ] = self .container_list
154175 self .pod_manifest ["spec" ]["volumes" ] = self .volume_list
155176
156- self .container_list [0 ]["image" ] = job_config .get ("image" , "nvflare/nvflare:2.8.0" )
177+ image = job_config .get ("image" )
178+ if not image :
179+ raise ValueError ("job_config must contain a non-empty 'image' key" )
180+ self .container_list [0 ]["image" ] = image
157181 self .container_list [0 ]["name" ] = job_config .get ("container_name" , "nvflare_job" )
158182 self .container_list [0 ]["args" ] = (
159183 self .container_args_python_args_list
160184 + self .container_args_module_args_dict_as_list
161185 + self .container_args_module_args_sets
162186 )
163187 self .container_list [0 ]["volumeMounts" ] = self .container_volume_mount_list
164- if job_config .get ("resources" , {}).get ("limits" , {}).get ("nvidia.com/gpu" ) is not None :
188+ if job_config .get ("resources" , {}).get ("limits" , {}).get ("nvidia.com/gpu" ):
165189 self .container_list [0 ]["resources" ] = job_config .get ("resources" )
166190
def get_manifest(self):
    """Return a deep copy of the pod manifest.

    A copy is handed out so callers cannot mutate the handle's internal
    manifest state.
    """
    manifest_snapshot = copy.deepcopy(self.pod_manifest)
    return manifest_snapshot
169193
def enter_states(self, job_states_to_enter: list):
    """Poll the pod until it enters one of the requested job states.

    Returns True once a requested state is observed.  Returns False when the
    pod is stuck in Pending (pod is terminated), when it reaches a terminal
    phase (Failed/Succeeded) that was not requested, or when the configured
    timeout elapses (pod is terminated).

    Raises:
        ValueError: if any entry is not a JobState.
    """
    poll_start = time.time()
    if not isinstance(job_states_to_enter, (list, tuple)):
        job_states_to_enter = [job_states_to_enter]
    if any(not isinstance(js, JobState) for js in job_states_to_enter):
        raise ValueError(f"expect job_states_to_enter with valid values, but get {job_states_to_enter}")
    terminal_phases = (POD_Phase.FAILED.value, POD_Phase.SUCCEEDED.value)
    while True:
        pod_phase = self._query_phase()
        if self._stuck_in_pending(pod_phase):
            # Pending for too long: give up and clean up the pod.
            self.terminate()
            return False
        job_state = POD_STATE_MAPPING.get(pod_phase, JobState.UNKNOWN)
        if job_state in job_states_to_enter:
            return True
        if pod_phase in terminal_phases:
            # Pod finished in a state the caller was not waiting for.
            self.terminal_state = job_state
            return False
        if self.timeout is not None and time.time() - poll_start > self.timeout:
            self.terminate()
            return False
        time.sleep(1)
187215
def terminate(self):
    """Delete the pod and mark this handle as terminated.

    The handle is marked TERMINATED on every path: a 404 means the pod is
    already gone, and any other failure is logged but still treated as
    terminated so callers do not keep waiting on a pod we can no longer
    manage.
    """
    try:
        self.api_instance.delete_namespaced_pod(name=self.job_id, namespace=self.namespace, grace_period_seconds=0)
    except ApiException as e:
        if getattr(e, "status", None) == 404:
            self.logger.info(f"job {self.job_id} pod not found during termination; assuming terminated")
        else:
            self.logger.error(f"failed to terminate job {self.job_id}: {e}")
    except Exception as e:
        self.logger.error(f"unexpected error terminating job {self.job_id}: {e}")
    # Every branch above ends the same way, so assign once here.
    self.terminal_state = JobState.TERMINATED
    return None
202230
203231 def poll (self ):
def _query_phase(self):
    """Query Kubernetes for the pod's current phase.

    Returns:
        The phase string reported by the API server, or
        ``POD_Phase.UNKNOWN.value`` when the pod cannot be read for any
        reason (the error is logged at warning level).
    """
    try:
        pod = self.api_instance.read_namespaced_pod(name=self.job_id, namespace=self.namespace)
    except ApiException as e:
        self.logger.warning(f"failed to query pod phase {self.job_id}: {e}")
        return POD_Phase.UNKNOWN.value
    except Exception as e:
        self.logger.warning(f"unexpected error querying pod phase {self.job_id}: {e}")
        return POD_Phase.UNKNOWN.value
    else:
        return pod.status.phase
215247
def _query_state(self):
    """Map the pod's current phase onto a JobState (UNKNOWN if unmapped)."""
    return POD_STATE_MAPPING.get(self._query_phase(), JobState.UNKNOWN)
219251
def _stuck_in_pending(self, current_phase):
    """Track consecutive Pending observations and report when the limit hits.

    Increments an internal counter each time the pod is observed Pending and
    resets it on any other phase.  Returns True once the counter reaches the
    configured maximum (when one is set), meaning the pod should be treated
    as stuck.
    """
    if current_phase != POD_Phase.PENDING.value:
        # Any phase other than Pending clears the streak.
        self._stuck_count = 0
        return False
    self._stuck_count += 1
    limit = self._max_stuck_count
    return limit is not None and self._stuck_count >= limit
228260
229261 def wait (self ):
@@ -246,6 +278,8 @@ def __init__(
246278 data_pvc_file_path : str ,
247279 timeout = None ,
248280 namespace = "default" ,
281+ pending_timeout = 30 ,
282+ python_path = "/usr/local/bin/python" ,
249283 ):
250284 super ().__init__ ()
251285 self .logger = logging .getLogger (self .__class__ .__name__ )
@@ -255,15 +289,18 @@ def __init__(
255289 self .data_pvc_file_path = data_pvc_file_path
256290 self .timeout = timeout
257291 self .namespace = namespace
292+ self .pending_timeout = pending_timeout
293+ self .python_path = python_path
258294 with open (data_pvc_file_path , "rt" ) as f :
259295 data_pvc_dict = yaml .safe_load (f )
260296 if not data_pvc_dict :
261297 raise ValueError (f"data_pvc_file_path '{ data_pvc_file_path } ' is empty or contains no PVC entries." )
262298 # data_pvc_dict will be pvc: mountPath
263299 # currently, support one pvc and always mount to /var/tmp/nvflare/data
264300 # ie, ignore the mountPath in data_pvc_dict
301+ if not isinstance (data_pvc_dict , dict ):
302+ raise ValueError (f"file at data_pvc_file_path '{ data_pvc_file_path } ' does not contain a dictionary." )
265303 self .data_pvc = list (data_pvc_dict .keys ())[0 ]
266-
267304 config .load_kube_config (config_file_path )
268305 try :
269306 c = Configuration ().get_default_copy ()
@@ -276,17 +313,22 @@ def __init__(
276313
277314 def launch_job (self , job_meta : dict , fl_ctx : FLContext ) -> JobHandleSpec :
278315 site_name = fl_ctx .get_identity_name ()
279- job_id = job_meta .get (JobConstants .JOB_ID )
316+ raw_job_id = job_meta .get (JobConstants .JOB_ID )
317+ if not raw_job_id :
318+ raise RuntimeError (f"missing { JobConstants .JOB_ID } in job_meta" )
319+ job_id = uuid4_to_rfc1123 (raw_job_id )
280320 args = fl_ctx .get_prop (FLContextKey .ARGS )
281321 job_image = extract_job_image (job_meta , site_name )
282322 site_resources = job_meta .get (JobMetaKey .RESOURCE_SPEC .value , {}).get (site_name , {})
283323 job_resource = site_resources .get ("num_of_gpus" , None )
284-
285324 job_args = fl_ctx .get_prop (FLContextKey .JOB_PROCESS_ARGS )
286325 if not job_args :
287326 raise RuntimeError (f"missing { FLContextKey .JOB_PROCESS_ARGS } in FLContext" )
288327
289- _ , job_cmd = job_args [JobProcessArgs .EXE_MODULE ]
328+ exe_module_entry = job_args .get (JobProcessArgs .EXE_MODULE )
329+ if not exe_module_entry :
330+ raise RuntimeError (f"missing { JobProcessArgs .EXE_MODULE } in { FLContextKey .JOB_PROCESS_ARGS } " )
331+ _ , job_cmd = exe_module_entry
290332 job_config = {
291333 "name" : job_id ,
292334 "image" : job_image ,
@@ -299,21 +341,36 @@ def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec:
299341 {"name" : PV_NAME .ETC .value , "persistentVolumeClaim" : {"claimName" : self .etc_pvc }},
300342 ],
301343 "module_args" : self .get_module_args (job_id , fl_ctx ),
302- "set_list" : args .set ,
303- "resources" : {"limits" : {"nvidia.com/gpu" : job_resource }},
304344 }
305-
306- job_handle = K8sJobHandle (job_id , self .core_v1 , job_config , namespace = self .namespace , timeout = self .timeout )
345+ if args is not None and getattr (args , "set" , None ) is not None :
346+ job_config .update ({"set_list" : args .set })
347+ if job_resource :
348+ job_config .update ({"resources" : {"limits" : {"nvidia.com/gpu" : job_resource }}})
349+ job_handle = K8sJobHandle (
350+ job_id ,
351+ self .core_v1 ,
352+ job_config ,
353+ namespace = self .namespace ,
354+ timeout = self .timeout ,
355+ pending_timeout = self .pending_timeout ,
356+ python_path = self .python_path ,
357+ )
307358 pod_manifest = job_handle .get_manifest ()
308359 self .logger .debug (f"launch job with k8s_launcher. { pod_manifest = } " )
309360 try :
310361 self .core_v1 .create_namespaced_pod (body = pod_manifest , namespace = self .namespace )
311- job_handle .enter_states ([JobState .RUNNING ], timeout = self .timeout )
362+ except Exception as e :
363+ self .logger .error (f"failed to launch job { job_id } : { e } " )
364+ job_handle .terminal_state = JobState .TERMINATED
312365 return job_handle
313- except ApiException as e :
314- self .logger .error (f"failed to launch job { self .job_id } : { e } " )
366+ try :
367+ entered_running = job_handle .enter_states ([JobState .RUNNING ])
368+ except BaseException :
315369 job_handle .terminate ()
316- return job_handle
370+ raise
371+ if not entered_running :
372+ self .logger .warning (f"unable to enter running phase { job_id } " )
373+ return job_handle
317374
318375 def handle_event (self , event_type : str , fl_ctx : FLContext ):
319376 if event_type == EventType .BEFORE_JOB_LAUNCH :
0 commit comments