Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/reference/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ _Appears in:_


DeletionCondition specifies the trigger conditions for a deletion action.
Exactly one of JobStatus or JobDeploymentStatus must be specified:
- JobStatus (application-level): Match the Ray job execution status.
- JobDeploymentStatus (infrastructure-level): Match the RayJob deployment lifecycle status. This is particularly useful for cleaning up resources when Ray jobs fail to be submitted.



Expand All @@ -116,7 +119,7 @@ _Appears in:_

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `ttlSeconds` _integer_ | TTLSeconds is the time in seconds from when the JobStatus<br />reaches the specified terminal state to when this deletion action should be triggered.<br />The value must be a non-negative integer. | 0 | Minimum: 0 <br /> |
| `ttlSeconds` _integer_ | TTLSeconds is the time in seconds from when the JobStatus or JobDeploymentStatus<br />reaches the specified terminal state to when this deletion action should be triggered.<br />The value must be a non-negative integer. | 0 | Minimum: 0 <br /> |


#### DeletionPolicy
Expand Down
13 changes: 11 additions & 2 deletions helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 16 additions & 3 deletions ray-operator/apis/ray/v1/rayjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,26 @@ type DeletionRule struct {
}

// DeletionCondition specifies the trigger conditions for a deletion action.
// Exactly one of JobStatus or JobDeploymentStatus must be specified:
// - JobStatus (application-level): Match the Ray job execution status.
// - JobDeploymentStatus (infrastructure-level): Match the RayJob deployment lifecycle status. This is particularly useful for cleaning up resources when Ray jobs fail to be submitted.
//
// +kubebuilder:validation:XValidation:rule="!(has(self.jobStatus) && has(self.jobDeploymentStatus))",message="JobStatus and JobDeploymentStatus cannot be used together within the same deletion condition."
// +kubebuilder:validation:XValidation:rule="has(self.jobStatus) || has(self.jobDeploymentStatus)",message="the deletion condition requires either the JobStatus or the JobDeploymentStatus field."
type DeletionCondition struct {
// JobStatus is the terminal status of the RayJob that triggers this condition. This field is required.
// JobStatus is the terminal status of the RayJob that triggers this condition.
// For the initial implementation, only "SUCCEEDED" and "FAILED" are supported.
// +kubebuilder:validation:Enum=SUCCEEDED;FAILED
JobStatus JobStatus `json:"jobStatus"`
// +optional
JobStatus *JobStatus `json:"jobStatus,omitempty"`

// JobDeploymentStatus is the terminal status of the RayJob deployment that triggers this condition.
// For the initial implementation, only "Failed" is supported.
// +kubebuilder:validation:Enum=Failed
// +optional
JobDeploymentStatus *JobDeploymentStatus `json:"jobDeploymentStatus,omitempty"`

// TTLSeconds is the time in seconds from when the JobStatus
// TTLSeconds is the time in seconds from when the JobStatus or JobDeploymentStatus
// reaches the specified terminal state to when this deletion action should be triggered.
// The value must be a non-negative integer.
// +kubebuilder:default=0
Expand Down
16 changes: 14 additions & 2 deletions ray-operator/apis/ray/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 11 additions & 2 deletions ray-operator/config/crd/bases/ray.io_rayjobs.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions ray-operator/config/samples/ray-job.deletion-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ spec:
# DeletionStrategy defines the deletion policies for a RayJob.
# It allows for fine-grained control over resource cleanup after a job finishes.
# DeletionRules is a list of deletion rules, processed based on their trigger conditions.
# Currently, both JobStatus and JobDeploymentStatus are supported as deletion conditions:
# - JobStatus (application-level): Match the Ray job execution status.
# - Currently, only "SUCCEEDED" and "FAILED" are supported.
# - JobDeploymentStatus (infrastructure-level): Match the RayJob deployment lifecycle status. This is particularly useful for cleaning up resources when Ray jobs fail to be submitted.
# - Currently, only "Failed" is supported.
# For each deletion rule, exactly one of JobStatus or JobDeploymentStatus must be specified.
# While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
# the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings.
deletionStrategy:
Expand Down
15 changes: 13 additions & 2 deletions ray-operator/controllers/ray/rayjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1216,8 +1216,8 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv

// Categorize all applicable and incomplete rules into "overdue" or "pending".
for _, rule := range rayJob.Spec.DeletionStrategy.DeletionRules {
// Skip rules that don't match the current job status.
if rule.Condition.JobStatus != rayJob.Status.JobStatus {
// Skip rules that don't match the current job status or job deployment status.
if !isDeletionRuleMatched(rule, rayJob) {
continue
}

Expand Down Expand Up @@ -1382,6 +1382,17 @@ func (r *RayJobReconciler) executeDeletionPolicy(ctx context.Context, rayJob *ra
return ctrl.Result{}, nil
}

// isDeletionRuleMatched reports whether the rule's condition matches the RayJob's current state.
// When the condition sets JobStatus, it is compared against rayJob.Status.JobStatus; that check
// takes precedence. Otherwise, when JobDeploymentStatus is set, it is compared against
// rayJob.Status.JobDeploymentStatus. A condition with neither field set never matches.
func isDeletionRuleMatched(rule rayv1.DeletionRule, rayJob *rayv1.RayJob) bool {
	cond := rule.Condition
	switch {
	case cond.JobStatus != nil:
		// Application-level match on the Ray job status.
		return *cond.JobStatus == rayJob.Status.JobStatus
	case cond.JobDeploymentStatus != nil:
		// Deployment-level match on the RayJob deployment status.
		return *cond.JobDeploymentStatus == rayJob.Status.JobDeploymentStatus
	default:
		return false
	}
}

// isDeletionActionCompleted checks if the state corresponding to a deletion policy is already achieved.
// This is crucial for making the reconciliation loop idempotent by checking the actual cluster state.
func (r *RayJobReconciler) isDeletionActionCompleted(ctx context.Context, rayJob *rayv1.RayJob, policy rayv1.DeletionPolicyType) (bool, error) {
Expand Down
Loading
Loading