diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h
index 1a0da50fa07a..471a883940ff 100644
--- a/include/xgboost/host_device_vector.h
+++ b/include/xgboost/host_device_vector.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2017-2019 XGBoost contributors
+/**
+ * Copyright 2017-2026, XGBoost contributors
  */
 
 /**
@@ -64,7 +64,8 @@ namespace xgboost {
 void SetCudaSetDeviceHandler(void (*handler)(int));
 #endif  // __CUDACC__
 
-template <typename T> struct HostDeviceVectorImpl;
+template <typename T>
+struct HostDeviceVectorImpl;
 
 /*!
  * \brief Controls data access from the GPU.
@@ -78,7 +79,8 @@ template <typename T> struct HostDeviceVectorImpl;
  *   - Data is being manipulated on the host. Host has write access, device doesn't have access.
  */
 enum GPUAccess {
-  kNone, kRead,
+  kNone,
+  kRead,
   // write implies read
   kWrite
 };
@@ -88,9 +90,15 @@ class HostDeviceVector {
   static_assert(std::is_standard_layout_v<T>, "HostDeviceVector admits only POD types");
 
  public:
-  explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU());
-  HostDeviceVector(std::initializer_list<T> init, DeviceOrd device = DeviceOrd::CPU());
-  explicit HostDeviceVector(const std::vector<T>& init, DeviceOrd device = DeviceOrd::CPU());
+  using value_type = T;  // NOLINT
+
+ public:
+  explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU(),
+                            Context const* ctx = nullptr);
+  HostDeviceVector(std::initializer_list<T> init, DeviceOrd device = DeviceOrd::CPU(),
+                   Context const* ctx = nullptr);
+  explicit HostDeviceVector(const std::vector<T>& init, DeviceOrd device = DeviceOrd::CPU(),
+                            Context const* ctx = nullptr);
   ~HostDeviceVector();
 
   HostDeviceVector(const HostDeviceVector<T>&) = delete;
@@ -103,44 +111,54 @@ class HostDeviceVector {
   [[nodiscard]] std::size_t Size() const;
   [[nodiscard]] std::size_t SizeBytes() const { return this->Size() * sizeof(T); }
   [[nodiscard]] DeviceOrd Device() const;
-  common::Span<T> DeviceSpan();
-  common::Span<const T> ConstDeviceSpan() const;
-  common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
-  T* DevicePointer();
-  const T* ConstDevicePointer() const;
-  const T* DevicePointer() const { return ConstDevicePointer(); }
-
-  T* HostPointer() { return HostVector().data(); }
-  common::Span<T> HostSpan() { return common::Span<T>{HostVector()}; }
-  common::Span<T const> HostSpan() const { return common::Span<T const>{HostVector()}; }
-  common::Span<T const> ConstHostSpan() const { return HostSpan(); }
-  const T* ConstHostPointer() const { return ConstHostVector().data(); }
-  const T* HostPointer() const { return ConstHostPointer(); }
-
-  void Fill(T v);
-  void Copy(const HostDeviceVector<T>& other);
-  void Copy(const std::vector<T>& other);
-  void Copy(std::initializer_list<T> other);
-
-  void Extend(const HostDeviceVector<T>& other);
-
-  std::vector<T>& HostVector();
-  const std::vector<T>& ConstHostVector() const;
-  const std::vector<T>& HostVector() const {return ConstHostVector(); }
+  common::Span<T> DeviceSpan(Context const* ctx = nullptr);
+  common::Span<const T> ConstDeviceSpan(Context const* ctx = nullptr) const;
+  common::Span<const T> DeviceSpan(Context const* ctx = nullptr) const {
+    return ConstDeviceSpan(ctx);
+  }
+  T* DevicePointer(Context const* ctx = nullptr);
+  const T* ConstDevicePointer(Context const* ctx = nullptr) const;
+  const T* DevicePointer(Context const* ctx = nullptr) const { return ConstDevicePointer(ctx); }
+
+  T* HostPointer(Context const* ctx = nullptr) { return HostVector(ctx).data(); }
+  common::Span<T> HostSpan(Context const* ctx = nullptr) {
+    return common::Span<T>{HostVector(ctx)};
+  }
+  common::Span<T const> HostSpan(Context const* ctx = nullptr) const {
+    return common::Span<T const>{HostVector(ctx)};
+  }
+  common::Span<T const> ConstHostSpan(Context const* ctx = nullptr) const { return HostSpan(ctx); }
+  const T* ConstHostPointer(Context const* ctx = nullptr) const {
+    return ConstHostVector(ctx).data();
+  }
+  const T* HostPointer(Context const* ctx = nullptr) const { return ConstHostPointer(ctx); }
+
+  void Fill(T v, Context const* ctx = nullptr);
+  void Copy(const HostDeviceVector<T>& other, Context const* ctx = nullptr);
+  void Copy(const std::vector<T>& other, Context const* ctx = nullptr);
+  void Copy(std::initializer_list<T> other, Context const* ctx = nullptr);
+
+  void Extend(const HostDeviceVector<T>& other, Context const* ctx = nullptr);
+
+  std::vector<T>& HostVector(Context const* ctx = nullptr);
+  const std::vector<T>& ConstHostVector(Context const* ctx = nullptr) const;
+  const std::vector<T>& HostVector(Context const* ctx = nullptr) const {
+    return ConstHostVector(ctx);
+  }
 
   [[nodiscard]] bool HostCanRead() const;
   [[nodiscard]] bool HostCanWrite() const;
   [[nodiscard]] bool DeviceCanRead() const;
   [[nodiscard]] bool DeviceCanWrite() const;
   [[nodiscard]] GPUAccess DeviceAccess() const;
-
-  void SetDevice(DeviceOrd device) const;
+  // FIXME(jiamingy): Until we can fully unify the context, we will have both ctx and device here.
+  void SetDevice(DeviceOrd device, Context const* ctx = nullptr) const;
 
   void Resize(std::size_t new_size);
-  /** @brief Resize and initialize the data if the new size is larger than the old size. */
-  void Resize(std::size_t new_size, T v);
+  void Resize(Context const* ctx, std::size_t new_size);
 
-  using value_type = T;  // NOLINT
+  /** @brief Resize and initialize the data if the new size is larger than the old size. */
+  void Resize(Context const* ctx, std::size_t new_size, T v);
 
  private:
   HostDeviceVectorImpl<T>* impl_;
diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h
index 4d4412ea31e0..9eed8dce5091 100644
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -595,13 +595,13 @@ auto MakeTensorView(Context const *ctx, Order order, common::Span<T, ext> data,
 
 template <typename T, typename... S>
 auto MakeTensorView(Context const *ctx, HostDeviceVector<T> *data, S &&...shape) {
-  auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan();
+  auto span = ctx->IsCPU() ? data->HostSpan(ctx) : data->DeviceSpan(ctx);
   return MakeTensorView(ctx->Device(), span, std::forward<S>(shape)...);
 }
 
 template <typename T, typename... S>
 auto MakeTensorView(Context const *ctx, HostDeviceVector<T> const *data, S &&...shape) {
-  auto span = ctx->IsCPU() ? data->ConstHostSpan() : data->ConstDeviceSpan();
+  auto span = ctx->IsCPU() ? data->ConstHostSpan(ctx) : data->ConstDeviceSpan(ctx);
   return MakeTensorView(ctx->Device(), span, std::forward<S>(shape)...);
 }
 
@@ -768,15 +768,15 @@ class Tensor {
   Order order_{Order::kC};
 
   template <typename I, std::int32_t D>
-  void Initialize(I const (&shape)[D], DeviceOrd device) {
+  void Initialize(I const (&shape)[D], DeviceOrd device, Context const *ctx = nullptr) {
     static_assert(D <= kDim, "Invalid shape.");
     std::copy(shape, shape + D, shape_);
     for (auto i = D; i < kDim; ++i) {
       shape_[i] = 1;
     }
     if (!device.IsCPU()) {
-      data_.SetDevice(device);
-      data_.ConstDevicePointer();  // Pull to device;
+      data_.SetDevice(device, ctx);
+      data_.ConstDevicePointer(ctx);  // Pull to device;
     }
     CHECK_EQ(data_.Size(), detail::CalcSize(shape_));
   }
@@ -791,11 +791,13 @@ class Tensor {
    * See \ref TensorView for parameters of this constructor.
    */
   template <typename I, int32_t D>
-  explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC)
-      : Tensor{common::Span<I const, D>{shape}, device, order} {}
+  explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC,
+                  Context const *ctx = nullptr)
+      : Tensor{common::Span<I const, D>{shape}, device, order, ctx} {}
 
   template <typename I, size_t D>
-  explicit Tensor(common::Span<I const, D> shape, DeviceOrd device, Order order = kC)
+  explicit Tensor(common::Span<I const, D> shape, DeviceOrd device, Order order = kC,
+                  Context const *ctx = nullptr)
       : order_{order} {
     // No device unroll as this is a host only function.
     std::copy(shape.data(), shape.data() + D, shape_);
@@ -804,33 +806,34 @@ class Tensor {
     }
     auto size = detail::CalcSize(shape_);
     if (!device.IsCPU()) {
-      data_.SetDevice(device);
+      data_.SetDevice(device, ctx);
     }
-    data_.Resize(size);
+    data_.Resize(ctx, size);
     if (!device.IsCPU()) {
-      data_.DevicePointer();  // Pull to device
+      data_.DevicePointer(ctx);  // Pull to device
     }
   }
   /**
    * Initialize from 2 host iterators.
    */
   template <typename It, typename I, int32_t D>
-  explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC)
+  explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC,
+                  Context const *ctx = nullptr)
       : order_{order} {
     auto &h_vec = data_.HostVector();
     h_vec.insert(h_vec.begin(), begin, end);
     // shape
-    this->Initialize(shape, device);
+    this->Initialize(shape, device, ctx);
   }
 
   template <typename I, int32_t D>
   explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
-                  Order order = kC)
+                  Order order = kC, Context const *ctx = nullptr)
       : order_{order} {
     auto &h_vec = data_.HostVector();
     h_vec = data;
     // shape
-    this->Initialize(shape, device);
+    this->Initialize(shape, device, ctx);
   }
   /**
    * \brief Index operator. Not thread safe, should not be used in performance critical
@@ -852,29 +855,29 @@ class Tensor {
   /**
    * @brief Get a @ref TensorView for this tensor.
    */
-  auto View(DeviceOrd device) {
+  auto View(DeviceOrd device, Context const *ctx = nullptr) {
     if (device.IsCPU()) {
-      auto span = data_.HostSpan();
+      auto span = data_.HostSpan(ctx);
       return TensorView<T, kDim>{span, shape_, device, order_};
     } else {
-      data_.SetDevice(device);
-      auto span = data_.DeviceSpan();
+      data_.SetDevice(device, ctx);
+      auto span = data_.DeviceSpan(ctx);
       return TensorView<T, kDim>{span, shape_, device, order_};
     }
   }
-  auto View(DeviceOrd device) const {
+  auto View(DeviceOrd device, Context const *ctx = nullptr) const {
     if (device.IsCPU()) {
-      auto span = data_.ConstHostSpan();
+      auto span = data_.ConstHostSpan(ctx);
       return TensorView<T const, kDim>{span, shape_, device, order_};
     } else {
-      data_.SetDevice(device);
-      auto span = data_.ConstDeviceSpan();
+      data_.SetDevice(device, ctx);
+      auto span = data_.ConstDeviceSpan(ctx);
       return TensorView<T const, kDim>{span, shape_, device, order_};
     }
   }
 
-  auto HostView() { return this->View(DeviceOrd::CPU()); }
-  auto HostView() const { return this->View(DeviceOrd::CPU()); }
+  auto HostView(Context const *ctx = nullptr) { return this->View(DeviceOrd::CPU(), ctx); }
+  auto HostView(Context const *ctx = nullptr) const { return this->View(DeviceOrd::CPU(), ctx); }
 
   [[nodiscard]] std::size_t Size() const { return data_.Size(); }
   [[nodiscard]] bool Empty() const { return Size() == 0; }
@@ -950,7 +953,9 @@ class Tensor {
   /**
    * \brief Set device ordinal for this tensor.
    */
-  void SetDevice(DeviceOrd device) const { data_.SetDevice(device); }
+  void SetDevice(DeviceOrd device, Context const *ctx = nullptr) const {
+    data_.SetDevice(device, ctx);
+  }
   [[nodiscard]] DeviceOrd Device() const { return data_.Device(); }
 };
 
@@ -966,7 +971,7 @@ using Vector = Tensor<T, 1>;
 template <typename T, typename... Index>
 auto Empty(Context const *ctx, Index &&...index) {
   Tensor<T, sizeof...(Index)> t;
-  t.SetDevice(ctx->Device());
+  t.SetDevice(ctx->Device(), ctx);
   t.Reshape(index...);
   return t;
 }
@@ -977,7 +982,7 @@ auto Empty(Context const *ctx, Index &&...index) {
 template <typename T, std::int32_t kDim>
 auto EmptyLike(Context const *ctx, Tensor<T, kDim> const &in) {
   Tensor<T, kDim> t;
-  t.SetDevice(ctx->Device());
+  t.SetDevice(ctx->Device(), ctx);
   t.Reshape(in.Shape());
   return t;
 }
@@ -988,9 +993,9 @@ auto EmptyLike(Context const *ctx, Tensor<T, kDim> const &in) {
 template <typename T, typename... Index>
 auto Constant(Context const *ctx, T v, Index &&...index) {
   Tensor<T, sizeof...(Index)> t;
-  t.SetDevice(ctx->Device());
+  t.SetDevice(ctx->Device(), ctx);
   t.Reshape(index...);
-  t.Data()->Fill(std::move(v));
+  t.Data()->Fill(std::move(v), ctx);
   return t;
 }
 
diff --git a/include/xgboost/multi_target_tree_model.h b/include/xgboost/multi_target_tree_model.h
index 35a4ddecc1ba..cbe94f89126d 100644
--- a/include/xgboost/multi_target_tree_model.h
+++ b/include/xgboost/multi_target_tree_model.h
@@ -90,12 +90,12 @@ class MultiTargetTree : public Model {
    * @param weight   The weight vector for the root node.
    * @param sum_hess The sum of hessians for the root node (coverage).
    */
-  void SetRoot(linalg::VectorView<float const> weight, float sum_hess);
+  void SetRoot(Context const* ctx, linalg::VectorView<float const> weight, float sum_hess);
   /**
    * @brief Expand a leaf into split node.
    */
-  void Expand(bst_node_t nidx, bst_feature_t split_idx, float split_cond, bool default_left,
-              linalg::VectorView<float const> base_weight,
+  void Expand(Context const* ctx, bst_node_t nidx, bst_feature_t split_idx, float split_cond,
+              bool default_left, linalg::VectorView<float const> base_weight,
               linalg::VectorView<float const> left_weight,
               linalg::VectorView<float const> right_weight, float loss_chg, float sum_hess,
               float left_sum, float right_sum);
diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h
index e08473e64a6e..e39875c9c064 100644
--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -315,8 +315,8 @@ class RegTree : public Model {
    * @param left_sum  The sum of hessians for the left child (coverage).
    * @param right_sum The sum of hessians for the right child (coverage).
    */
-  void ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split_cond, bool default_left,
-                  linalg::VectorView<float const> base_weight,
+  void ExpandNode(Context const* ctx, bst_node_t nidx, bst_feature_t split_index, float split_cond,
+                  bool default_left, linalg::VectorView<float const> base_weight,
                   linalg::VectorView<float const> left_weight,
                   linalg::VectorView<float const> right_weight, float loss_chg, float sum_hess,
                   float left_sum, float right_sum);
@@ -355,7 +355,7 @@ class RegTree : public Model {
   /**
    * @brief Expands a leaf node with categories for a multi-target tree.
    */
-  void ExpandCategorical(bst_node_t nidx, bst_feature_t split_index,
+  void ExpandCategorical(Context const* ctx, bst_node_t nidx, bst_feature_t split_index,
                          common::Span<const uint32_t> split_cat, bool default_left,
                          linalg::VectorView<float const> base_weight,
                          linalg::VectorView<float const> left_weight,
@@ -414,9 +414,9 @@ class RegTree : public Model {
    * @param weight   Internal split weight, with size equals to reduced targets.
    * @param sum_hess The sum of hessians for the root node (coverage).
    */
-  void SetRoot(linalg::VectorView<float const> weight, float sum_hess) {
+  void SetRoot(Context const* ctx, linalg::VectorView<float const> weight, float sum_hess) {
     CHECK(IsMultiTarget());
-    return this->p_mt_tree_->SetRoot(weight, sum_hess);
+    return this->p_mt_tree_->SetRoot(ctx, weight, sum_hess);
   }
   /**
    * @brief Get the maximum depth.
diff --git a/plugin/sycl/common/host_device_vector.cc b/plugin/sycl/common/host_device_vector.cc
index 0a32fae40279..e1706dc356ab 100644
--- a/plugin/sycl/common/host_device_vector.cc
+++ b/plugin/sycl/common/host_device_vector.cc
@@ -14,8 +14,8 @@
 #include "xgboost/host_device_vector.h"
 #pragma GCC diagnostic pop
 
-#include "../device_manager.h"
 #include "../data.h"
+#include "../device_manager.h"
 #include "../predictor/node.h"
 
 namespace xgboost {
@@ -46,10 +46,11 @@ class HostDeviceVectorImpl {
     }
   }
 
-  HostDeviceVectorImpl(HostDeviceVectorImpl<T>&& that) : device_{that.device_},
-                                                         data_h_{std::move(that.data_h_)},
-                                                         data_d_{std::move(that.data_d_)},
-                                                         device_access_{that.device_access_} {}
+  HostDeviceVectorImpl(HostDeviceVectorImpl<T>&& that)
+      : device_{that.device_},
+        data_h_{std::move(that.data_h_)},
+        data_d_{std::move(that.data_d_)},
+        device_access_{that.device_access_} {}
 
   std::vector<T>& HostVector() {
     SyncHost(GPUAccess::kNone);
@@ -62,14 +63,15 @@ class HostDeviceVectorImpl {
   }
 
   void SetDevice(DeviceOrd device) {
-    if (device_ == device) { return; }
+    if (device_ == device) {
+      return;
+    }
     if (device_.IsSycl()) {
       SyncHost(GPUAccess::kNone);
     }
 
     if (device_.IsSycl() && device.IsSycl()) {
-      CHECK_EQ(device_, device)
-          << "New device is different from previous one.";
+      CHECK_EQ(device_, device) << "New device is different from previous one.";
     }
     device_ = device;
     if (device_.IsSycl()) {
@@ -97,20 +99,26 @@ class HostDeviceVectorImpl {
   }
 
   void SyncHost(GPUAccess access) {
-    if (HostCanAccess(access)) { return; }
+    if (HostCanAccess(access)) {
+      return;
+    }
     if (HostCanRead()) {
       // data is present, just need to deny access to the device
       device_access_ = access;
       return;
     }
     device_access_ = access;
-    if (data_h_.size() != data_d_->Size()) { data_h_.resize(data_d_->Size()); }
+    if (data_h_.size() != data_d_->Size()) {
+      data_h_.resize(data_d_->Size());
+    }
     SetDevice();
     qu_->memcpy(data_h_.data(), data_d_->Data(), data_d_->Size() * sizeof(T)).wait();
   }
 
   void SyncDevice(GPUAccess access) {
-    if (DeviceCanAccess(access)) { return; }
+    if (DeviceCanAccess(access)) {
+      return;
+    }
     if (DeviceCanRead()) {
       device_access_ = access;
       return;
@@ -130,9 +138,7 @@ class HostDeviceVectorImpl {
   bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
   GPUAccess Access() const { return device_access_; }
 
-  size_t Size() const {
-    return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->Size() : 0;
-  }
+  size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->Size() : 0; }
 
   DeviceOrd Device() const { return device_; }
 
@@ -214,7 +220,9 @@ class HostDeviceVectorImpl {
 
  private:
   void ResizeDevice(size_t new_size) {
-    if (data_d_ && new_size == data_d_->Size()) { return; }
+    if (data_d_ && new_size == data_d_->Size()) {
+      return;
+    }
     SetDevice();
     data_d_->Resize(qu_, new_size);
   }
@@ -254,20 +262,21 @@ class HostDeviceVectorImpl {
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
-  : impl_(nullptr) {
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device, Context const*)
+    : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(size, v, device);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
-  : impl_(nullptr) {
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device,
+                                      Context const*)
+    : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init, device);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
-  : impl_(nullptr) {
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device, Context const*)
+    : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init, device);
 }
 
@@ -278,7 +287,9 @@ HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& that) {
 
 template <typename T>
 HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& that) {
-  if (this == &that) { return *this; }
+  if (this == &that) {
+    return *this;
+  }
 
   std::unique_ptr<HostDeviceVectorImpl<T>> new_impl(
       new HostDeviceVectorImpl<T>(std::move(*that.impl_)));
@@ -294,7 +305,9 @@ HostDeviceVector<T>::~HostDeviceVector() {
 }
 
 template <typename T>
-size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
+size_t HostDeviceVector<T>::Size() const {
+  return impl_->Size();
+}
 
 template <typename T>
 DeviceOrd HostDeviceVector<T>::Device() const {
@@ -302,65 +315,72 @@ DeviceOrd HostDeviceVector<T>::Device() const {
 }
 
 template <typename T>
-T* HostDeviceVector<T>::DevicePointer() {
+T* HostDeviceVector<T>::DevicePointer(Context const*) {
   return impl_->DevicePointer();
 }
 
 template <typename T>
-const T* HostDeviceVector<T>::ConstDevicePointer() const {
+const T* HostDeviceVector<T>::ConstDevicePointer(Context const*) const {
   return impl_->ConstDevicePointer();
 }
 
 template <typename T>
-common::Span<T> HostDeviceVector<T>::DeviceSpan() {
+common::Span<T> HostDeviceVector<T>::DeviceSpan(Context const*) {
   return impl_->DeviceSpan();
 }
 
 template <typename T>
-common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
+common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(Context const*) const {
   return impl_->ConstDeviceSpan();
 }
 
 template <typename T>
-std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }
+std::vector<T>& HostDeviceVector<T>::HostVector(Context const*) {
+  return impl_->HostVector();
+}
 
 template <typename T>
-const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
+const std::vector<T>& HostDeviceVector<T>::ConstHostVector(Context const*) const {
   return impl_->ConstHostVector();
 }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v) {
+void HostDeviceVector<T>::Resize(Context const*, std::size_t new_size, T v) {
   impl_->Resize(new_size, v);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size) {
+void HostDeviceVector<T>::Resize(std::size_t new_size) {
+  impl_->Resize(new_size);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Resize(Context const*, std::size_t new_size) {
   impl_->Resize(new_size);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Fill(T v) {
+void HostDeviceVector<T>::Fill(T v, Context const*) {
   impl_->Fill(v);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
+void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other, Context const*) {
   impl_->Copy(other.impl_);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
+void HostDeviceVector<T>::Copy(const std::vector<T>& other, Context const*) {
   impl_->Copy(other);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
+void HostDeviceVector<T>::Copy(std::initializer_list<T> other, Context const*) {
   impl_->Copy(other);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Extend(HostDeviceVector const& other) {
+void HostDeviceVector<T>::Extend(HostDeviceVector const& other, Context const*) {
   impl_->Extend(other.impl_);
 }
 
@@ -390,7 +410,7 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
 }
 
 template <typename T>
-void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
+void HostDeviceVector<T>::SetDevice(DeviceOrd device, Context const*) const {
   impl_->SetDevice(device);
 }
 
@@ -399,7 +419,7 @@ template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<double>;
 template class HostDeviceVector<GradientPair>;
 template class HostDeviceVector<GradientPairPrecise>;
-template class HostDeviceVector<std::int32_t>;   // bst_node_t
+template class HostDeviceVector<std::int32_t>;  // bst_node_t
 template class HostDeviceVector<std::uint8_t>;
 template class HostDeviceVector<std::int8_t>;
 template class HostDeviceVector<FeatureType>;
diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc
index 4f29f2d28fc6..c2ec9fd408e7 100644
--- a/plugin/sycl/data/gradient_index.cc
+++ b/plugin/sycl/data/gradient_index.cc
@@ -2,13 +2,12 @@
  * Copyright 2017-2024 by Contributors
  * \file gradient_index.cc
  */
-#include <vector>
-#include <limits>
-#include <algorithm>
-
 #include "gradient_index.h"
 
+#include <algorithm>
+#include <limits>
 #include <sycl/sycl.hpp>
+#include <vector>
 
 namespace xgboost {
 namespace sycl {
@@ -49,10 +48,8 @@ void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) {
 }
 
 template <typename BinIdxType, bool isDense>
-void GHistIndexMatrix::SetIndexData(::sycl::queue* qu,
-                                    Context const * ctx,
-                                    BinIdxType* index_data,
-                                    DMatrix *dmat) {
+void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, Context const* ctx, BinIdxType* index_data,
+                                    DMatrix* dmat) {
   if (nbins == 0) return;
   const bst_float* cut_values = cut.cut_values_.ConstDevicePointer();
   const uint32_t* cut_ptrs = cut.cut_ptrs_.ConstDevicePointer();
@@ -60,11 +57,11 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue* qu,
 
   BinIdxType* sort_data = reinterpret_cast<BinIdxType*>(sort_buff.Data());
 
-  for (auto &batch : dmat->GetBatches<SparsePage>()) {
+  for (auto& batch : dmat->GetBatches<SparsePage>()) {
     batch.data.SetDevice(ctx->Device());
     batch.offset.SetDevice(ctx->Device());
-    const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer();
-    const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer();
+    const xgboost::Entry* data_ptr = batch.data.ConstDevicePointer();
+    const bst_idx_t* offset_vec = batch.offset.ConstDevicePointer();
     size_t batch_size = batch.Size();
     if (batch_size > 0) {
       const auto base_rowid = batch.base_rowid;
@@ -101,8 +98,9 @@ void GHistIndexMatrix::ResizeIndex(::sycl::queue* qu, size_t n_index) {
   if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense_) {
     index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize);
     index.Resize(qu, (sizeof(uint8_t)) * n_index);
-  } else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max())  &&
-    max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && isDense_) {
+  } else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
+              max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
+             isDense_) {
     index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize);
     index.Resize(qu, (sizeof(uint16_t)) * n_index);
   } else {
@@ -111,10 +109,7 @@ void GHistIndexMatrix::ResizeIndex(::sycl::queue* qu, size_t n_index) {
   }
 }
 
-void GHistIndexMatrix::Init(::sycl::queue* qu,
-                            Context const * ctx,
-                            DMatrix *dmat,
-                            int max_bins) {
+void GHistIndexMatrix::Init(::sycl::queue* qu, Context const* ctx, DMatrix* dmat, int max_bins) {
   nfeatures = dmat->Info().num_col_;
 
   cut = xgboost::common::SketchOnDMatrix(ctx, dmat, max_bins);
@@ -132,7 +127,7 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
   }
 
   hit_count.SetDevice(ctx->Device());
-  hit_count.Resize(nbins, 0);
+  hit_count.Resize(ctx, nbins, 0);
 
   const bool isDense = dmat->IsDense();
   this->isDense_ = isDense;
@@ -168,7 +163,7 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
       CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize);
       SetIndexData<uint32_t, true>(qu, ctx, index.data<uint32_t>(), dmat);
     }
-  /* For sparse DMatrix we have to store index of feature for each bin
+    /* For sparse DMatrix we have to store index of feature for each bin
      in index field to chose right offset. So offset is nullptr and index is not reduced */
   } else {
     sort_buff.Resize(qu, n_rows * row_stride * sizeof(uint32_t));
diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh
index a275c34370ad..61040c2e5d9b 100644
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@@ -344,7 +344,7 @@ template <typename... Args>
 void RunLengthEncode(curt::StreamRef stream, Args &&...args) {
   std::size_t n_bytes = 0;
   dh::safe_cuda(cub::DeviceRunLengthEncode::Encode(nullptr, n_bytes, args..., stream));
-  dh::CachingDeviceUVector<char> tmp(n_bytes);
+  dh::CachingDeviceUVector<char> tmp(n_bytes, stream);
   dh::safe_cuda(cub::DeviceRunLengthEncode::Encode(tmp.data(), n_bytes, args..., stream));
 }
 
@@ -352,7 +352,7 @@ template <typename... Args>
 void SegmentedSum(curt::StreamRef stream, Args &&...args) {
   std::size_t n_bytes = 0;
   dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, n_bytes, args..., stream));
-  dh::CachingDeviceUVector<char> tmp(n_bytes);
+  dh::CachingDeviceUVector<char> tmp(n_bytes, stream);
   dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(tmp.data(), n_bytes, args..., stream));
 }
 
diff --git a/src/common/device_compression.cu b/src/common/device_compression.cu
index 7f688bb7deeb..497438311d1c 100644
--- a/src/common/device_compression.cu
+++ b/src/common/device_compression.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2025, XGBoost contributors
+ * Copyright 2025-2026, XGBoost contributors
  *
  * We use NVComp to perform compression and access the DE API directly for
  * decompression. Invoking the DE directly can help us avoid unnecessary kernal launches
@@ -172,7 +172,7 @@ SnappyDecomprMgrImpl::SnappyDecomprMgrImpl(curt::StreamRef s,
   std::vector<std::size_t> in_chunk_sizes(n_chunks);
   std::vector<std::size_t> out_chunk_sizes(n_chunks);
 
-  dh::DeviceUVector<std::int32_t> status(n_chunks);
+  dh::DeviceUVector<std::int32_t> status(n_chunks, s);
   for (std::size_t i = 0; i < n_chunks; ++i) {
     in_chunk_ptrs[i] = in_compressed_data.subspan(last_in, params[i].src_act_nbytes).data();
     in_chunk_sizes[i] = params[i].src_act_nbytes;
@@ -195,8 +195,8 @@ SnappyDecomprMgrImpl::SnappyDecomprMgrImpl(curt::StreamRef s,
     std::memset(this->de_params.data() + i, 0, sizeof(CUmemDecompressParams));
   }
 
-  FillDecompParams(d_in_chunk_ptrs.data().get(), d_in_chunk_sizes.data().get(), de_params.ToSpan(),
-                   this->act_nbytes.data().get(), d_out_chunk_sizes.data().get(), status.data(), s);
+  FillDecompParams(d_in_chunk_ptrs.data(), d_in_chunk_sizes.data(), de_params.ToSpan(),
+                   this->act_nbytes.data().get(), d_out_chunk_sizes.data(), status.data(), s);
   dh::XGBCachingDeviceAllocator<char> alloc;
   bool valid = thrust::all_of(thrust::cuda::par_nosync(alloc).on(s), status.cbegin(), status.cend(),
                               ChkOp{});
@@ -301,8 +301,8 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr,
                                   dh::ToSpan(d_out_ptrs).size_bytes(), cudaMemcpyDefault, stream));
     // Run nvcomp
     SafeNvComp(nvcompBatchedSnappyDecompressAsync(
-        mgr_impl->d_in_chunk_ptrs.data().get(), mgr_impl->d_in_chunk_sizes.data().get(),
-        mgr_impl->d_out_chunk_sizes.data().get(), mgr_impl->act_nbytes.data().get(), n_chunks,
+        mgr_impl->d_in_chunk_ptrs.data(), mgr_impl->d_in_chunk_sizes.data(),
+        mgr_impl->d_out_chunk_sizes.data(), mgr_impl->act_nbytes.data().get(), n_chunks,
         tmp.data().get(), n_tmp_bytes, d_out_ptrs.data().get(),
         nvcompBatchedSnappyDecompressDefaultOpts, status.data().get(), stream));
   }
@@ -326,7 +326,7 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr,
    */
   std::size_t n_chunks = (in.size() + chunk_size - 1) / chunk_size;
   if (n_chunks == 0) {
-    p_out->clear();
+    p_out->clear(ctx->CUDACtx()->Stream());
     return {};
   }
   std::size_t last = 0;
@@ -343,11 +343,11 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr,
   }
   CHECK_EQ(last, in.size());
 
-  dh::DeviceUVector<void const*> in_ptrs(h_in_ptrs.size());
+  dh::DeviceUVector<void const*> in_ptrs(h_in_ptrs.size(), cuctx->Stream());
   dh::safe_cuda(cudaMemcpyAsync(in_ptrs.data(), h_in_ptrs.data(),
                                 common::Span{h_in_ptrs}.size_bytes(), cudaMemcpyDefault,
                                 cuctx->Stream()));
-  dh::DeviceUVector<std::size_t> in_sizes(h_in_sizes.size());
+  dh::DeviceUVector<std::size_t> in_sizes(h_in_sizes.size(), cuctx->Stream());
   dh::safe_cuda(cudaMemcpyAsync(in_sizes.data(), h_in_sizes.data(),
                                 common::Span{h_in_sizes}.size_bytes(), cudaMemcpyDefault,
                                 cuctx->Stream()));
@@ -363,12 +363,12 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr,
       n_chunks, chunk_size, nvcomp_batched_snappy_opts, &comp_temp_bytes,
       /*max_total_uncompressed_bytes=*/in.size()));
   CHECK_EQ(comp_temp_bytes, 0);
-  dh::DeviceUVector<char> comp_tmp(comp_temp_bytes);
+  dh::DeviceUVector<char> comp_tmp(comp_temp_bytes, cuctx->Stream());
 
   std::size_t max_out_nbytes = 0;
   SafeNvComp(nvcompBatchedSnappyCompressGetMaxOutputChunkSize(
       std::min(max_in_nbytes, chunk_size), nvcomp_batched_snappy_opts, &max_out_nbytes));
-  p_out->resize(max_out_nbytes * n_chunks);
+  p_out->resize(max_out_nbytes * n_chunks, cuctx->Stream());
   std::vector<void*> h_out_ptrs(n_chunks);
   std::vector<std::size_t> h_out_sizes(n_chunks);
   auto s_out = dh::ToSpan(*p_out);
@@ -377,12 +377,14 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr,
     h_out_ptrs[i] = chunk.data();
     h_out_sizes[i] = chunk.size();
   }
-  dh::DeviceUVector<void*> out_ptrs(h_out_ptrs.size());
+  dh::DeviceUVector<void*> out_ptrs(h_out_ptrs.size(), cuctx->Stream());
   dh::safe_cuda(cudaMemcpyAsync(out_ptrs.data(), h_out_ptrs.data(),
-                                common::Span{h_out_ptrs}.size_bytes(), cudaMemcpyDefault));
-  dh::DeviceUVector<std::size_t> out_sizes(h_out_sizes.size());
+                                common::Span{h_out_ptrs}.size_bytes(), cudaMemcpyDefault,
+                                cuctx->Stream()));
+  dh::DeviceUVector<std::size_t> out_sizes(h_out_sizes.size(), cuctx->Stream());
   dh::safe_cuda(cudaMemcpyAsync(out_sizes.data(), h_out_sizes.data(),
-                                common::Span{h_out_sizes}.size_bytes(), cudaMemcpyDefault));
+                                common::Span{h_out_sizes}.size_bytes(), cudaMemcpyDefault,
+                                cuctx->Stream()));
 
   /**
    * Compress
diff --git a/src/common/device_compression.cuh b/src/common/device_compression.cuh
index 6ab3e62719d4..2102adfba872 100644
--- a/src/common/device_compression.cuh
+++ b/src/common/device_compression.cuh
@@ -65,11 +65,11 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr,
 struct SnappyDecomprMgrImpl {
   std::size_t n_dst_bytes{0};
   // src of the CUmemDecompressParams
-  dh::device_vector<void const*> d_in_chunk_ptrs;
+  dh::DeviceUVector<void const*> d_in_chunk_ptrs;
   // srcNumBytes of the CUmemDecompressParams
-  dh::device_vector<std::size_t> d_in_chunk_sizes;
+  dh::DeviceUVector<std::size_t> d_in_chunk_sizes;
   // dstNumBytes of the CUmemDecompressParams
-  dh::device_vector<std::size_t> d_out_chunk_sizes;
+  dh::DeviceUVector<std::size_t> d_out_chunk_sizes;
   // dstActBytes of the CUmemDecompressParams
   dh::device_vector<std::size_t> act_nbytes;
 
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 8b9829217dac..eb4f81744c95 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -709,10 +709,10 @@ template <class Src, class Dst>
 void CopyTo(Src const &src, Dst *dst,
             ::xgboost::curt::StreamRef stream = ::xgboost::curt::DefaultStream()) {
   if (src.empty()) {
-    dst->clear();
+    dst->clear(stream);
     return;
   }
-  dst->resize(src.size());
+  dst->resize(src.size(), stream);
   using SVT = std::remove_cv_t<typename Src::value_type>;
   using DVT = std::remove_cv_t<typename Dst::value_type>;
   static_assert(std::is_same_v<SVT, DVT>, "Host and device containers must have same value type.");
diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh
index 2572132cfbfe..6ef3ccd2fc4b 100644
--- a/src/common/device_vector.cuh
+++ b/src/common/device_vector.cuh
@@ -268,10 +268,12 @@ class ThrustAllocMrAdapter : public rmm::mr::thrust_allocator<T> {
     using other = ThrustAllocMrAdapter<U>;  // NOLINT(readability-identifier-naming)
   };
 
-
+  // Similar to RMM's thrust adapter, it uses the default stream.
   ThrustAllocMrAdapter()
       : rmm::mr::thrust_allocator<T>{
             rmm::cuda_stream_view{cudaStream_t{xgboost::curt::DefaultStream()}}} {};
+
+  void SetStream([[maybe_unused]] cudaStream_t s) {}
 };
 
 template <typename T>
@@ -289,6 +291,7 @@ class XGBAsyncPoolAllocator : public thrust::device_malloc_allocator<T> {
   // entirely on Windows.
   std::int32_t use_async_pool_;
 #endif
+  ::xgboost::curt::StreamRef stream_{::xgboost::curt::DefaultStream()};
 
  public:
   using Super = thrust::device_malloc_allocator<T>;
@@ -307,6 +310,8 @@ class XGBAsyncPoolAllocator : public thrust::device_malloc_allocator<T> {
     using other = XGBAsyncPoolAllocator<U>;  // NOLINT(readability-identifier-naming)
   };
 
+  void SetStream(::xgboost::curt::StreamRef stream) { this->stream_ = stream; }
+
   pointer allocate(std::size_t n) {  // NOLINT
 #if defined(xgboost_IS_WIN)
     return Super::allocate(n);
@@ -317,7 +322,7 @@ class XGBAsyncPoolAllocator : public thrust::device_malloc_allocator<T> {
 
     T *raw_ptr = nullptr;
     auto n_bytes = xgboost::common::SizeBytes<T>(n);
-    safe_cuda(cudaMallocAsync(&raw_ptr, n_bytes, xgboost::curt::DefaultStream()));
+    safe_cuda(cudaMallocAsync(&raw_ptr, n_bytes, this->stream_));
     return thrust::device_pointer_cast(raw_ptr);
 #endif
   }
@@ -330,7 +335,7 @@ class XGBAsyncPoolAllocator : public thrust::device_malloc_allocator<T> {
       return Super::deallocate(ptr, n);
     }
 
-    safe_cuda(cudaFreeAsync(thrust::raw_pointer_cast(ptr), xgboost::curt::DefaultStream()));
+    safe_cuda(cudaFreeAsync(thrust::raw_pointer_cast(ptr), this->stream_));
 #endif
   }
 
@@ -490,7 +495,9 @@ class DeviceUVectorImpl {
 
  public:
   DeviceUVectorImpl() = default;
-  explicit DeviceUVectorImpl(std::size_t n) { this->resize(n); }
+  explicit DeviceUVectorImpl(std::size_t n, ::xgboost::curt::StreamRef stream) {
+    this->resize(n, stream);
+  }
   DeviceUVectorImpl(DeviceUVectorImpl const &that) = delete;
   DeviceUVectorImpl &operator=(DeviceUVectorImpl const &that) = delete;
   DeviceUVectorImpl(DeviceUVectorImpl &&that) = default;
@@ -499,7 +506,7 @@ class DeviceUVectorImpl {
   [[nodiscard]] std::size_t Capacity() const { return this->capacity_; }
 
   // Resize without init.
-  void resize(std::size_t n) {  // NOLINT
+  void resize(std::size_t n, ::xgboost::curt::StreamRef stream) {  // NOLINT
     using ::xgboost::common::SizeBytes;
 
     if (n <= this->Capacity()) {
@@ -509,6 +516,7 @@ class DeviceUVectorImpl {
     }
     CHECK_LE(this->size(), this->Capacity());
 
+    this->alloc_.SetStream(stream);
     Alloc alloc = this->alloc_;
     decltype(data_) new_ptr{thrust::raw_pointer_cast(this->alloc_.allocate(n)),
                             [=](T *ptr) mutable {
@@ -518,9 +526,8 @@ class DeviceUVectorImpl {
                             }};
     CHECK(new_ptr.get());
 
-    auto s = ::xgboost::curt::DefaultStream();
     safe_cuda(cudaMemcpyAsync(new_ptr.get(), this->data(), SizeBytes<T>(this->size()),
-                              cudaMemcpyDefault, s));
+                              cudaMemcpyDefault, stream));
     this->size_ = n;
     this->capacity_ = n;
 
@@ -529,17 +536,16 @@ class DeviceUVectorImpl {
     // std::swap(this->data_, new_ptr);
   }
   // Resize with init
-  void resize(std::size_t n, T const &v) {  // NOLINT
+  void resize(std::size_t n, T const &v, ::xgboost::curt::StreamRef stream) {  // NOLINT
     auto orig = this->size();
-    this->resize(n);
+    this->resize(n, stream);
     if (orig < n) {
-      auto exec = thrust::cuda::par_nosync.on(::xgboost::curt::DefaultStream());
+      auto exec = thrust::cuda::par_nosync.on(cudaStream_t{stream});
       thrust::fill(exec, this->begin() + orig, this->end(), v);
     }
   }
-
-  void clear() {  // NOLINT
-    this->resize(0);
+  void clear(::xgboost::curt::StreamRef stream) {  // NOLINT
+    this->resize(0, stream);
   }
 
   [[nodiscard]] std::size_t size() const { return this->size_; }  // NOLINT
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index d059888f5643..7960d5f01880 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -169,7 +169,7 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
       sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
         return {0, e.index, e.fvalue};  // row_idx is not needed for scaning column size.
       });
-  detail::GetColumnSizesScan(ctx->CUDACtx(), ctx->Device(), info.num_col_, num_cuts_per_feature,
+  detail::GetColumnSizesScan(ctx, info.num_col_, num_cuts_per_feature,
                              IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
                              &column_sizes_scan);
   auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@@ -270,7 +270,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
   info.weights_.SetDevice(ctx->Device());
   auto d_weight = UnifyWeight(cuctx, info, hessian, &weight);
 
-  SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, ctx->Device());
+  SketchContainer sketch_container(ctx, info.feature_types, max_bin, info.num_col_);
   CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
   for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
     std::size_t page_nnz = page.data.Size();
diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh
index 3b1ad6312584..c89afa736248 100644
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2025, XGBoost contributors
+ * Copyright 2020-2026, XGBoost contributors
  *
  * \brief Front end and utilities for GPU based sketching.  Works on sliding window
  *        instead of stream.
@@ -138,24 +138,25 @@ void LaunchGetColumnSizeKernel(CUDAContext const* cuctx, DeviceOrd device,
 }
 
 template <typename BatchIt>
-void GetColumnSizesScan(CUDAContext const* cuctx, DeviceOrd device, size_t num_columns,
+void GetColumnSizesScan(Context const* ctx, size_t num_columns,
                         std::size_t num_cuts_per_feature, IterSpan<BatchIt> batch_iter,
                         data::IsValidFunctor is_valid,
                         HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
                         dh::caching_device_vector<size_t>* column_sizes_scan) {
+  auto cuctx = ctx->CUDACtx();
   column_sizes_scan->resize(num_columns + 1);
-  cuts_ptr->SetDevice(device);
-  cuts_ptr->Resize(num_columns + 1, 0);
+  cuts_ptr->SetDevice(ctx->Device());
+  cuts_ptr->Resize(ctx, num_columns + 1, 0);
 
   auto d_column_sizes_scan = dh::ToSpan(*column_sizes_scan);
-  LaunchGetColumnSizeKernel(cuctx, device, batch_iter, is_valid, d_column_sizes_scan);
+  LaunchGetColumnSizeKernel(cuctx, ctx->Device(), batch_iter, is_valid, d_column_sizes_scan);
   // Calculate cuts CSC pointer
   auto cut_ptr_it = dh::MakeTransformIterator<size_t>(
       column_sizes_scan->begin(), [=] __device__(size_t column_size) {
         return thrust::min(num_cuts_per_feature, column_size);
       });
-  thrust::exclusive_scan(cuctx->CTP(), cut_ptr_it,
-                         cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
+  thrust::exclusive_scan(cuctx->CTP(), cut_ptr_it, cut_ptr_it + column_sizes_scan->size(),
+                         cuts_ptr->DevicePointer());
   thrust::exclusive_scan(cuctx->CTP(), column_sizes_scan->begin(), column_sizes_scan->end(),
                          column_sizes_scan->begin());
 }
@@ -168,9 +169,8 @@ size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows);
 
 // Count the valid entries in each column and copy them out.
 template <typename AdapterBatch, typename BatchIter>
-void MakeEntriesFromAdapter(CUDAContext const* cuctx, AdapterBatch const& batch,
-                            BatchIter batch_iter, Range1d range, float missing, size_t columns,
-                            size_t cuts_per_feature, DeviceOrd device,
+void MakeEntriesFromAdapter(Context const* ctx, AdapterBatch const& batch, BatchIter batch_iter,
+                            Range1d range, float missing, size_t columns, size_t cuts_per_feature,
                             HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
                             dh::caching_device_vector<size_t>* column_sizes_scan,
                             dh::device_vector<Entry>* sorted_entries) {
@@ -182,13 +182,13 @@ void MakeEntriesFromAdapter(CUDAContext const* cuctx, AdapterBatch const& batch,
   auto span = IterSpan{batch_iter + range.begin(), n};
   data::IsValidFunctor is_valid(missing);
   // Work out how many valid entries we have in each column
-  GetColumnSizesScan(cuctx, device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan,
+  GetColumnSizesScan(ctx, columns, cuts_per_feature, span, is_valid, cut_sizes_scan,
                      column_sizes_scan);
   size_t num_valid = column_sizes_scan->back();
   // Copy current subset of valid elements into temporary storage and sort
   sorted_entries->resize(num_valid);
-  CopyIf(cuctx, entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(),
-         is_valid);
+  CopyIf(ctx->CUDACtx(), entry_iter + range.begin(), entry_iter + range.end(),
+         sorted_entries->begin(), is_valid);
 }
 
 void SortByWeight(Context const* ctx, dh::device_vector<float>* weights,
@@ -242,9 +242,8 @@ void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInf
   HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
   cuts_ptr.SetDevice(ctx->Device());
   CUDAContext const* cuctx = ctx->CUDACtx();
-  detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, n_features,
-                                 num_cuts, ctx->Device(), &cuts_ptr, &column_sizes_scan,
-                                 &sorted_entries);
+  detail::MakeEntriesFromAdapter(ctx, batch, batch_iter, {begin, end}, missing, n_features,
+                                 num_cuts, &cuts_ptr, &column_sizes_scan, &sorted_entries);
   thrust::sort(cuctx->TP(), sorted_entries.begin(), sorted_entries.end(), detail::EntryCompareOp());
 
   if (sketch_container->HasCategorical()) {
@@ -279,8 +278,8 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons
   dh::device_vector<Entry> sorted_entries;
   dh::caching_device_vector<size_t> column_sizes_scan;
   HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
-  detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns,
-                                 num_cuts_per_feature, ctx->Device(), &cuts_ptr, &column_sizes_scan,
+  detail::MakeEntriesFromAdapter(ctx, batch, batch_iter, {begin, end}, missing, columns,
+                                 num_cuts_per_feature, &cuts_ptr, &column_sizes_scan,
                                  &sorted_entries);
   data::IsValidFunctor is_valid(missing);
 
@@ -299,24 +298,20 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons
           bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx);
           return weights[group_idx];
         });
-    auto retit = thrust::copy_if(cuctx->CTP(),
-                                 weight_iter + begin, weight_iter + end,
-                                 batch_iter + begin,
-                                 d_temp_weights.data(),  // output
-                                 is_valid);
+    auto retit =
+        thrust::copy_if(cuctx->CTP(), weight_iter + begin, weight_iter + end, batch_iter + begin,
+                        d_temp_weights.data(),  // output
+                        is_valid);
     CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
   } else {
     CHECK_EQ(batch.NumRows(), weights.size());
     auto const weight_iter = dh::MakeTransformIterator<float>(
         thrust::make_counting_iterator(0lu),
-        [=]__device__(size_t idx) -> float {
-          return weights[batch.GetElement(idx).row_idx];
-        });
-    auto retit = thrust::copy_if(cuctx->CTP(),
-                                 weight_iter + begin, weight_iter + end,
-                                 batch_iter + begin,
-                                 d_temp_weights.data(),  // output
-                                 is_valid);
+        [=] __device__(size_t idx) -> float { return weights[batch.GetElement(idx).row_idx]; });
+    auto retit =
+        thrust::copy_if(cuctx->CTP(), weight_iter + begin, weight_iter + end, batch_iter + begin,
+                        d_temp_weights.data(),  // output
+                        is_valid);
     CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
   }
 
diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
index a1201258da49..c04d42a9c1b8 100644
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -6,13 +6,16 @@
 
 // dummy implementation of HostDeviceVector in case CUDA is not used
 
+#include "xgboost/host_device_vector.h"
+
 #include <xgboost/base.h>
 #include <xgboost/data.h>
+
 #include <cstdint>
 #include <memory>
 #include <utility>
+
 #include "xgboost/tree_model.h"
-#include "xgboost/host_device_vector.h"
 
 namespace xgboost {
 
@@ -20,12 +23,10 @@ template <typename T>
 struct HostDeviceVectorImpl {
   explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
   HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
-  explicit HostDeviceVectorImpl(std::vector<T>  init) : data_h_(std::move(init)) {}
+  explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)) {}
   HostDeviceVectorImpl(HostDeviceVectorImpl&& that) : data_h_(std::move(that.data_h_)) {}
 
-  void Swap(HostDeviceVectorImpl &other) {
-     data_h_.swap(other.data_h_);
-  }
+  void Swap(HostDeviceVectorImpl& other) { data_h_.swap(other.data_h_); }
 
   std::vector<T>& Vec() { return data_h_; }
 
@@ -34,20 +35,20 @@ struct HostDeviceVectorImpl {
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
-  : impl_(nullptr) {
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd, Context const*)
+    : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(size, v);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
-  : impl_(nullptr) {
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd, Context const*)
+    : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
-  : impl_(nullptr) {
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd, Context const*)
+    : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
@@ -58,7 +59,9 @@ HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& that) {
 
 template <typename T>
 HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& that) {
-  if (this == &that) { return *this; }
+  if (this == &that) {
+    return *this;
+  }
 
   std::unique_ptr<HostDeviceVectorImpl<T>> new_impl(
       new HostDeviceVectorImpl<T>(std::move(*that.impl_)));
@@ -79,72 +82,85 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
 }
 
 template <typename T>
-size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
+size_t HostDeviceVector<T>::Size() const {
+  return impl_->Vec().size();
+}
 
 template <typename T>
-DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }
+DeviceOrd HostDeviceVector<T>::Device() const {
+  return DeviceOrd::CPU();
+}
 
 template <typename T>
-T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
+T* HostDeviceVector<T>::DevicePointer(Context const*) {
+  return nullptr;
+}
 
 template <typename T>
-const T* HostDeviceVector<T>::ConstDevicePointer() const {
+const T* HostDeviceVector<T>::ConstDevicePointer(Context const*) const {
   return nullptr;
 }
 
 template <typename T>
-common::Span<T> HostDeviceVector<T>::DeviceSpan() {
+common::Span<T> HostDeviceVector<T>::DeviceSpan(Context const*) {
   return common::Span<T>();
 }
 
 template <typename T>
-common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
+common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(Context const*) const {
   return common::Span<const T>();
 }
 
 template <typename T>
-std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->Vec(); }
+std::vector<T>& HostDeviceVector<T>::HostVector(Context const*) {
+  return impl_->Vec();
+}
 
 template <typename T>
-const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
+const std::vector<T>& HostDeviceVector<T>::ConstHostVector(Context const*) const {
   return impl_->Vec();
 }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v) {
+void HostDeviceVector<T>::Resize(std::size_t new_size) {
+  impl_->Vec().resize(new_size, T{});
+}
+
+template <typename T>
+void HostDeviceVector<T>::Resize(Context const*, std::size_t new_size, T v) {
   impl_->Vec().resize(new_size, v);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size) {
+void HostDeviceVector<T>::Resize(Context const*, std::size_t new_size) {
   impl_->Vec().resize(new_size, T{});
 }
 
 template <typename T>
-void HostDeviceVector<T>::Fill(T v) {
+void HostDeviceVector<T>::Fill(T v, Context const*) {
   std::fill(HostVector().begin(), HostVector().end(), v);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
+void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other, Context const*) {
   CHECK_EQ(Size(), other.Size());
   std::copy(other.HostVector().begin(), other.HostVector().end(), HostVector().begin());
 }
 
 template <typename T>
-void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
+void HostDeviceVector<T>::Copy(const std::vector<T>& other, Context const*) {
   CHECK_EQ(Size(), other.size());
   std::copy(other.begin(), other.end(), HostVector().begin());
 }
 
 template <typename T>
-void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
+void HostDeviceVector<T>::Copy(std::initializer_list<T> other, Context const*) {
   CHECK_EQ(Size(), other.size());
   std::copy(other.begin(), other.end(), HostVector().begin());
 }
 
 template <typename T>
-void HostDeviceVector<T>::Extend(HostDeviceVector const& other) {
+void HostDeviceVector<T>::Extend(HostDeviceVector const& other, Context const*) {
   auto ori_size = this->Size();
   this->HostVector().resize(ori_size + other.Size());
   std::copy(other.ConstHostVector().cbegin(), other.ConstHostVector().cend(),
@@ -172,14 +188,14 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
 }
 
 template <typename T>
-void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
+void HostDeviceVector<T>::SetDevice(DeviceOrd, Context const*) const {}
 
 // explicit instantiations are required, as HostDeviceVector isn't header-only
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<double>;
 template class HostDeviceVector<GradientPair>;
 template class HostDeviceVector<GradientPairPrecise>;
-template class HostDeviceVector<std::int32_t>;   // bst_node_t
+template class HostDeviceVector<std::int32_t>;  // bst_node_t
 template class HostDeviceVector<std::uint8_t>;
 template class HostDeviceVector<std::int8_t>;
 template class HostDeviceVector<FeatureType>;
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index d492285cb01a..c1c4c763bfb7 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2017-2025, XGBoost contributors
+ * Copyright 2017-2026, XGBoost contributors
  */
 #include <thrust/fill.h>
 
@@ -7,7 +7,8 @@
 #include <cstddef>  // for size_t
 #include <cstdint>
 
-#include "cuda_stream.h"  // for DefaultStream
+#include "cuda_context.cuh"  // for CUDAContext
+#include "cuda_stream.h"     // for DefaultStream
 #include "device_helpers.cuh"
 #include "device_vector.cuh"  // for DeviceUVector
 #include "xgboost/data.h"
@@ -19,18 +20,27 @@ namespace xgboost {
 // the handler to call instead of cudaSetDevice; only used for testing
 static void (*cudaSetDeviceHandler)(int) = nullptr;  // NOLINT
 
-void SetCudaSetDeviceHandler(void (*handler)(int)) {
-  cudaSetDeviceHandler = handler;
+void SetCudaSetDeviceHandler(void (*handler)(int)) { cudaSetDeviceHandler = handler; }
+
+namespace {
+curt::StreamRef GetStream(CUDAContext const* ctx) {
+  return ctx ? ctx->Stream() : curt::DefaultStream();
+}
+
+CUDAContext const* GetCUDACtx(Context const* ctx) {
+  return ctx && ctx->IsCUDA() ? ctx->CUDACtx() : nullptr;
 }
+}  // namespace
 
 template <typename T>
 class HostDeviceVectorImpl {
  public:
-  HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
+  HostDeviceVectorImpl(CUDAContext const* ctx, size_t size, T v, DeviceOrd device)
+      : device_(device) {
     if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->resize(size, v);
+      data_d_->resize(size, v, GetStream(ctx));
     } else {
       data_h_.resize(size, v);
     }
@@ -38,21 +48,22 @@ class HostDeviceVectorImpl {
 
   // Initializer can be std::vector<T> or std::initializer_list<T>
   template <class Initializer>
-  HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
+  HostDeviceVectorImpl(CUDAContext const* ctx, const Initializer& init, DeviceOrd device)
+      : device_(device) {
     if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
-      LazyResizeDevice(init.size());
-      Copy(init);
+      LazyResizeDevice(init.size(), ctx);
+      Copy(ctx, init);
     } else {
       data_h_ = init;
     }
   }
 
-  HostDeviceVectorImpl(HostDeviceVectorImpl<T>&& that) :
-    device_{that.device_},
-    data_h_{std::move(that.data_h_)},
-    data_d_{std::move(that.data_d_)},
-    gpu_access_{that.gpu_access_} {}
+  HostDeviceVectorImpl(HostDeviceVectorImpl<T>&& that)
+      : device_{that.device_},
+        data_h_{std::move(that.data_h_)},
+        data_d_{std::move(that.data_d_)},
+        gpu_access_{that.gpu_access_} {}
 
   ~HostDeviceVectorImpl() {
     if (device_.IsCUDA()) {
@@ -66,100 +77,101 @@ class HostDeviceVectorImpl {
 
   [[nodiscard]] DeviceOrd Device() const { return device_; }
 
-  T* DevicePointer() {
-    LazySyncDevice(GPUAccess::kWrite);
+  T* DevicePointer(CUDAContext const* ctx) {
+    LazySyncDevice(ctx, GPUAccess::kWrite);
     return data_d_->data();
   }
 
-  const T* ConstDevicePointer() {
-    LazySyncDevice(GPUAccess::kRead);
+  const T* ConstDevicePointer(CUDAContext const* ctx) {
+    LazySyncDevice(ctx, GPUAccess::kRead);
     return data_d_->data();
   }
 
-  common::Span<T> DeviceSpan() {
-    LazySyncDevice(GPUAccess::kWrite);
-    return {this->DevicePointer(), Size()};
+  common::Span<T> DeviceSpan(CUDAContext const* ctx) {
+    LazySyncDevice(ctx, GPUAccess::kWrite);
+    return {this->DevicePointer(ctx), Size()};
   }
 
-  common::Span<const T> ConstDeviceSpan() {
-    LazySyncDevice(GPUAccess::kRead);
-    return {this->ConstDevicePointer(), Size()};
+  common::Span<const T> ConstDeviceSpan(CUDAContext const* ctx) {
+    LazySyncDevice(ctx, GPUAccess::kRead);
+    return {this->ConstDevicePointer(ctx), Size()};
   }
 
-  void Fill(T v) {  // NOLINT
+  void Fill(T v, CUDAContext const* ctx) {  // NOLINT
     if (HostCanWrite()) {
       std::fill(data_h_.begin(), data_h_.end(), v);
     } else {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       auto s_data = dh::ToSpan(*data_d_);
-      dh::LaunchN(data_d_->size(), curt::DefaultStream(),
-                  [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; });
+      dh::LaunchN(data_d_->size(), GetStream(ctx), [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; });
     }
   }
 
-  void Copy(HostDeviceVectorImpl<T>* other) {
+  void Copy(CUDAContext const* ctx, HostDeviceVectorImpl<T>* other) {
     CHECK_EQ(Size(), other->Size());
-    SetDevice(other->device_);
+    SetDevice(other->device_, ctx);
     // Data is on host.
     if (HostCanWrite() && other->HostCanWrite()) {
       std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
       return;
     }
     SetDevice();
-    CopyToDevice(other);
+    CopyToDevice(ctx, other);
   }
 
-  void Copy(const std::vector<T>& other) {
+  void Copy(CUDAContext const* ctx, const std::vector<T>& other) {
     CHECK_EQ(Size(), other.size());
     if (HostCanWrite()) {
       std::copy(other.begin(), other.end(), data_h_.begin());
     } else {
-      CopyToDevice(other.data());
+      CopyToDevice(ctx, other.data());
     }
   }
 
-  void Copy(std::initializer_list<T> other) {
+  void Copy(CUDAContext const* ctx, std::initializer_list<T> other) {
     CHECK_EQ(Size(), other.size());
     if (HostCanWrite()) {
       std::copy(other.begin(), other.end(), data_h_.begin());
     } else {
-      CopyToDevice(other.begin());
+      CopyToDevice(ctx, other.begin());
     }
   }
 
-  void Extend(HostDeviceVectorImpl* other) {
+  void Extend(CUDAContext const* ctx, HostDeviceVectorImpl* other) {
     auto ori_size = this->Size();
-    this->Resize(ori_size + other->Size(), T{});
+    this->Resize(ctx, ori_size + other->Size(), T{});
     if (HostCanWrite() && other->HostCanRead()) {
-      auto& h_vec = this->HostVector();
-      auto& other_vec = other->HostVector();
+      auto& h_vec = this->HostVector(ctx);
+      auto& other_vec = other->HostVector(ctx);
       CHECK_EQ(h_vec.size(), ori_size + other->Size());
       std::copy(other_vec.cbegin(), other_vec.cend(), h_vec.begin() + ori_size);
     } else {
-      auto ptr = other->ConstDevicePointer();
+      auto ptr = other->ConstDevicePointer(ctx);
       SetDevice();
       CHECK_EQ(this->Device(), other->Device());
-      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr,
+      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer(ctx) + ori_size, ptr,
                                     other->Size() * sizeof(T), cudaMemcpyDeviceToDevice,
-                                    curt::DefaultStream()));
+                                    GetStream(ctx)));
     }
   }
 
-  std::vector<T>& HostVector() {
-    LazySyncHost(GPUAccess::kNone);
+  std::vector<T>& HostVector(CUDAContext const* ctx) {
+    LazySyncHost(ctx, GPUAccess::kNone);
     return data_h_;
   }
 
-  const std::vector<T>& ConstHostVector() {
-    LazySyncHost(GPUAccess::kRead);
+  const std::vector<T>& ConstHostVector(CUDAContext const* ctx) {
+    LazySyncHost(ctx, GPUAccess::kRead);
     return data_h_;
   }
 
-  void SetDevice(DeviceOrd device) {
-    if (device_ == device) { return; }
+  void SetDevice(DeviceOrd device, CUDAContext const* ctx) {
+    if (device_ == device) {
+      return;
+    }
     if (device_.IsCUDA()) {
-      LazySyncHost(GPUAccess::kNone);
+      LazySyncHost(ctx, GPUAccess::kNone);
     }
 
     if (device_.IsCUDA() && device.IsCUDA()) {
@@ -168,12 +180,12 @@ class HostDeviceVectorImpl {
     }
     device_ = device;
     if (device_.IsCUDA()) {
-      LazyResizeDevice(data_h_.size());
+      LazyResizeDevice(data_h_.size(), ctx);
     }
   }
 
   template <typename... U>
-  auto Resize(std::size_t new_size, U&&... args) {
+  auto Resize(CUDAContext const* ctx, std::size_t new_size, U&&... args) {
     if (new_size == Size()) {
       return;
     }
@@ -182,41 +194,47 @@ class HostDeviceVectorImpl {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       auto old_size = data_d_->size();
-      data_d_->resize(new_size, std::forward<U>(args)...);
+      data_d_->resize(new_size, std::forward<U>(args)..., GetStream(ctx));
     } else {
       // resize on host
-      LazySyncHost(GPUAccess::kNone);
+      LazySyncHost(ctx, GPUAccess::kNone);
       auto old_size = data_h_.size();
       data_h_.resize(new_size, std::forward<U>(args)...);
     }
   }
 
-  void LazySyncHost(GPUAccess access) {
-    if (HostCanAccess(access)) { return; }
+  void LazySyncHost(CUDAContext const* ctx, GPUAccess access) {
+    if (HostCanAccess(access)) {
+      return;
+    }
     if (HostCanRead()) {
-      // data is present, just need to deny access to the device
       gpu_access_ = access;
       return;
     }
     gpu_access_ = access;
-    if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
+    if (data_h_.size() != data_d_->size()) {
+      data_h_.resize(data_d_->size());
+    }
     SetDevice();
-    dh::safe_cuda(cudaMemcpy(data_h_.data(), data_d_->data(), data_d_->size() * sizeof(T),
-                             cudaMemcpyDeviceToHost));
+    auto stream = GetStream(ctx);
+    dh::safe_cuda(cudaMemcpyAsync(data_h_.data(), data_d_->data(), data_d_->size() * sizeof(T),
+                                  cudaMemcpyDeviceToHost, stream));
+    dh::safe_cuda(cudaStreamSynchronize(stream));
   }
 
-  void LazySyncDevice(GPUAccess access) {
-    if (DeviceCanAccess(access)) { return; }
+  void LazySyncDevice(CUDAContext const* ctx, GPUAccess access) {
+    if (DeviceCanAccess(access)) {
+      return;
+    }
     if (DeviceCanRead()) {
-      // deny read to the host
       gpu_access_ = access;
       return;
     }
     // data is on the host
-    LazyResizeDevice(data_h_.size());
+    LazyResizeDevice(data_h_.size(), ctx);
     SetDevice();
     dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), data_h_.data(), data_d_->size() * sizeof(T),
-                                  cudaMemcpyHostToDevice, curt::DefaultStream()));
+                                  cudaMemcpyHostToDevice, GetStream(ctx)));
     gpu_access_ = access;
   }
 
@@ -234,31 +252,33 @@ class HostDeviceVectorImpl {
   std::unique_ptr<dh::DeviceUVector<T>> data_d_{};
   GPUAccess gpu_access_{GPUAccess::kNone};
 
-  void CopyToDevice(HostDeviceVectorImpl* other) {
+  void CopyToDevice(CUDAContext const* ctx, HostDeviceVectorImpl* other) {
     if (other->HostCanWrite()) {
-      CopyToDevice(other->data_h_.data());
+      CopyToDevice(ctx, other->data_h_.data());
     } else {
-      LazyResizeDevice(Size());
+      LazyResizeDevice(Size(), ctx);
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), other->data_d_->data(),
                                     data_d_->size() * sizeof(T), cudaMemcpyDefault,
-                                    curt::DefaultStream()));
+                                    GetStream(ctx)));
     }
   }
 
-  void CopyToDevice(const T* begin) {
-    LazyResizeDevice(Size());
+  void CopyToDevice(CUDAContext const* ctx, const T* begin) {
+    LazyResizeDevice(Size(), ctx);
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
     dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), begin, data_d_->size() * sizeof(T),
-                                  cudaMemcpyDefault, curt::DefaultStream()));
+                                  cudaMemcpyDefault, GetStream(ctx)));
   }
 
-  void LazyResizeDevice(size_t new_size) {
-    if (data_d_ && new_size == data_d_->size()) { return; }
+  void LazyResizeDevice(size_t new_size, CUDAContext const* ctx) {
+    if (data_d_ && new_size == data_d_->size()) {
+      return;
+    }
     SetDevice();
-    data_d_->resize(new_size);
+    data_d_->resize(new_size, GetStream(ctx));
   }
 
   void SetDevice() {
@@ -275,17 +295,19 @@ class HostDeviceVectorImpl {
   }
 };
 
-template<typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
-    : impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device, Context const* ctx)
+    : impl_(new HostDeviceVectorImpl<T>(GetCUDACtx(ctx), size, v, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
-    : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device,
+                                      Context const* ctx)
+    : impl_(new HostDeviceVectorImpl<T>(GetCUDACtx(ctx), init, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
-    : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device,
+                                      Context const* ctx)
+    : impl_(new HostDeviceVectorImpl<T>(GetCUDACtx(ctx), init, device)) {}
 
 template <typename T>
 HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& other)
@@ -293,7 +315,9 @@ HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& other)
 
 template <typename T>
 HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& other) {
-  if (this == &other) { return *this; }
+  if (this == &other) {
+    return *this;
+  }
 
   std::unique_ptr<HostDeviceVectorImpl<T>> new_impl(
       new HostDeviceVectorImpl<T>(std::move(*other.impl_)));
@@ -309,7 +333,9 @@ HostDeviceVector<T>::~HostDeviceVector() {
 }
 
 template <typename T>
-size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
+size_t HostDeviceVector<T>::Size() const {
+  return impl_->Size();
+}
 
 template <typename T>
 DeviceOrd HostDeviceVector<T>::Device() const {
@@ -317,56 +343,58 @@ DeviceOrd HostDeviceVector<T>::Device() const {
 }
 
 template <typename T>
-T* HostDeviceVector<T>::DevicePointer() {
-  return impl_->DevicePointer();
+T* HostDeviceVector<T>::DevicePointer(Context const* ctx) {
+  return impl_->DevicePointer(GetCUDACtx(ctx));
 }
 
 template <typename T>
-const T* HostDeviceVector<T>::ConstDevicePointer() const {
-  return impl_->ConstDevicePointer();
+const T* HostDeviceVector<T>::ConstDevicePointer(Context const* ctx) const {
+  return impl_->ConstDevicePointer(GetCUDACtx(ctx));
 }
 
 template <typename T>
-common::Span<T> HostDeviceVector<T>::DeviceSpan() {
-  return impl_->DeviceSpan();
+common::Span<T> HostDeviceVector<T>::DeviceSpan(Context const* ctx) {
+  return impl_->DeviceSpan(GetCUDACtx(ctx));
 }
 
 template <typename T>
-common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
-  return impl_->ConstDeviceSpan();
+common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(Context const* ctx) const {
+  return impl_->ConstDeviceSpan(GetCUDACtx(ctx));
 }
 
 template <typename T>
-void HostDeviceVector<T>::Fill(T v) {
-  impl_->Fill(v);
+void HostDeviceVector<T>::Fill(T v, Context const* ctx) {
+  impl_->Fill(v, GetCUDACtx(ctx));
 }
 
 template <typename T>
-void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
-  impl_->Copy(other.impl_);
+void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other, Context const* ctx) {
+  impl_->Copy(GetCUDACtx(ctx), other.impl_);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
-  impl_->Copy(other);
+void HostDeviceVector<T>::Copy(const std::vector<T>& other, Context const* ctx) {
+  impl_->Copy(GetCUDACtx(ctx), other);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
-  impl_->Copy(other);
+void HostDeviceVector<T>::Copy(std::initializer_list<T> other, Context const* ctx) {
+  impl_->Copy(GetCUDACtx(ctx), other);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Extend(HostDeviceVector const& other) {
-  impl_->Extend(other.impl_);
+void HostDeviceVector<T>::Extend(HostDeviceVector const& other, Context const* ctx) {
+  impl_->Extend(GetCUDACtx(ctx), other.impl_);
 }
 
 template <typename T>
-std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }
+std::vector<T>& HostDeviceVector<T>::HostVector(Context const* ctx) {
+  return impl_->HostVector(GetCUDACtx(ctx));
+}
 
 template <typename T>
-const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
-  return impl_->ConstHostVector();
+const std::vector<T>& HostDeviceVector<T>::ConstHostVector(Context const* ctx) const {
+  return impl_->ConstHostVector(GetCUDACtx(ctx));
 }
 
 template <typename T>
@@ -395,18 +423,23 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
 }
 
 template <typename T>
-void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
-  impl_->SetDevice(device);
+void HostDeviceVector<T>::SetDevice(DeviceOrd device, Context const* ctx) const {
+  impl_->SetDevice(device, GetCUDACtx(ctx));
 }
 
 template <typename T>
 void HostDeviceVector<T>::Resize(std::size_t new_size) {
-  impl_->Resize(new_size);
+  impl_->Resize(nullptr, new_size);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Resize(Context const* ctx, std::size_t new_size) {
+  impl_->Resize(GetCUDACtx(ctx), new_size);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(std::size_t new_size, T v) {
-  impl_->Resize(new_size, v);
+void HostDeviceVector<T>::Resize(Context const* ctx, std::size_t new_size, T v) {
+  impl_->Resize(GetCUDACtx(ctx), new_size, v);
 }
 
 // explicit instantiations are required, as HostDeviceVector isn't header-only
@@ -415,7 +448,7 @@ template class HostDeviceVector<double>;
 template class HostDeviceVector<GradientPair>;
 template class HostDeviceVector<GradientPairPrecise>;
 template class HostDeviceVector<GradientPairInt64>;
-template class HostDeviceVector<std::int32_t>;   // bst_node_t
+template class HostDeviceVector<std::int32_t>;  // bst_node_t
 template class HostDeviceVector<std::uint8_t>;
 template class HostDeviceVector<std::int8_t>;
 template class HostDeviceVector<FeatureType>;
diff --git a/src/common/linalg_op.cu b/src/common/linalg_op.cu
index 718f0193300c..53a517d748a3 100644
--- a/src/common/linalg_op.cu
+++ b/src/common/linalg_op.cu
@@ -18,7 +18,7 @@ void SmallHistogram(Context const* ctx, linalg::MatrixView<float const> indices,
   auto n_bins = bins.Size();
   auto cuctx = ctx->CUDACtx();
   // Sort for segmented sum
-  dh::DeviceUVector<std::size_t> sorted_idx(indices.Size());
+  dh::DeviceUVector<std::size_t> sorted_idx(indices.Size(), cuctx->Stream());
   common::ArgSort<true>(ctx, indices.Values(), dh::ToSpan(sorted_idx));
   auto d_sorted_idx = dh::ToSpan(sorted_idx);
 
@@ -27,8 +27,8 @@ void SmallHistogram(Context const* ctx, linalg::MatrixView<float const> indices,
 
   dh::device_vector<std::size_t> counts_out(n_bins + 1, 0);
   // Obtain the segment boundaries for the segmented sum.
-  dh::DeviceUVector<float> unique(n_bins);
-  dh::CachingDeviceUVector<std::size_t> num_runs(1);
+  dh::DeviceUVector<float> unique(n_bins, cuctx->Stream());
+  dh::CachingDeviceUVector<std::size_t> num_runs(1, cuctx->Stream());
   common::RunLengthEncode(cuctx->Stream(), key_it, unique.begin(), counts_out.begin() + 1,
                           num_runs.begin(), indices.Size());
   thrust::inclusive_scan(cuctx->CTP(), counts_out.begin(), counts_out.end(), counts_out.begin());
diff --git a/src/common/quantile.cu b/src/common/quantile.cu
index 776de3682e48..6c436711d1c9 100644
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -607,7 +607,7 @@ void SketchContainer::AllReduce(Context const *ctx, bool is_column_split) {
   timer_.Stop(__func__);
 
   // Merge them into a new sketch.
-  SketchContainer new_sketch(this->feature_types_, num_bins_, this->num_columns_, ctx->Device());
+  SketchContainer new_sketch(ctx, this->feature_types_, num_bins_, this->num_columns_);
   for (size_t i = 0; i < allworkers.size(); ++i) {
     auto worker = allworkers[i];
     auto worker_ptr =
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh
index 50e65fd0c032..11f34ea237b4 100644
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2025, XGBoost Contributors
+ * Copyright 2020-2026, XGBoost Contributors
  */
 #ifndef XGBOOST_COMMON_QUANTILE_CUH_
 #define XGBOOST_COMMON_QUANTILE_CUH_
@@ -7,13 +7,10 @@
 #include <thrust/logical.h>  // for any_of
 
 #include <algorithm>
-#include <cstddef>     // for size_t
-#include <functional>  // for equal_to
+#include <cstddef>  // for size_t
 
 #include "categorical.h"
-#include "common.h"          // for HumanMemUnit
-#include "cuda_context.cuh"  // for CUDAContext
-#include "cuda_rt_utils.h"   // for SetDevice
+#include "common.h"  // for HumanMemUnit
 #include "device_helpers.cuh"
 #include "error_msg.h"  // for InvalidMaxBin
 #include "quantile.h"
@@ -77,7 +74,7 @@ class SketchContainer {
   Span<SketchEntry> Column(bst_feature_t i) {
     auto data = dh::ToSpan(this->entries_);
     auto h_ptr = columns_ptr_.ConstHostSpan();
-    auto c = data.subspan(h_ptr[i], h_ptr[i+1] - h_ptr[i]);
+    auto c = data.subspan(h_ptr[i], h_ptr[i + 1] - h_ptr[i]);
     return c;
   }
 
@@ -88,20 +85,20 @@ class SketchContainer {
    * \param num_columns Total number of columns in dataset.
    * \param device      GPU ID.
    */
-  SketchContainer(HostDeviceVector<FeatureType> const& feature_types, bst_bin_t max_bin,
-                  bst_feature_t num_columns, DeviceOrd device)
+  SketchContainer(Context const* ctx, HostDeviceVector<FeatureType> const& feature_types,
+                  bst_bin_t max_bin, bst_feature_t num_columns)
       : num_columns_{num_columns}, num_bins_{max_bin} {
-    CHECK(device.IsCUDA());
+    CHECK(ctx->IsCUDA());
     // Initialize Sketches for this dmatrix
-    this->columns_ptr_.SetDevice(device);
-    this->columns_ptr_.Resize(num_columns + 1, 0);
-    this->columns_ptr_tmp_.SetDevice(device);
-    this->columns_ptr_tmp_.Resize(num_columns + 1, 0);
+    this->columns_ptr_.SetDevice(ctx->Device());
+    this->columns_ptr_.Resize(ctx, num_columns + 1, 0);
+    this->columns_ptr_tmp_.SetDevice(ctx->Device());
+    this->columns_ptr_tmp_.Resize(ctx, num_columns + 1, 0);
 
     this->feature_types_.Resize(feature_types.Size());
     this->feature_types_.Copy(feature_types);
     // Pull to device.
-    this->feature_types_.SetDevice(device);
+    this->feature_types_.SetDevice(ctx->Device());
     this->feature_types_.ConstDeviceSpan();
     this->feature_types_.ConstHostSpan();
 
@@ -118,9 +115,9 @@ class SketchContainer {
    */
   [[nodiscard]] std::size_t MemCapacityBytes() const {
     auto constexpr kE = sizeof(typename decltype(this->entries_)::value_type);
-    auto n_bytes =
-        (this->entries_.capacity() + this->entries_tmp_.capacity() + this->prune_buffer_.capacity()) *
-        kE;
+    auto n_bytes = (this->entries_.capacity() + this->entries_tmp_.capacity() +
+                    this->prune_buffer_.capacity()) *
+                   kE;
     n_bytes += (this->columns_ptr_.Size() + this->columns_ptr_tmp_.Size()) * sizeof(OffsetT);
     n_bytes += this->feature_types_.Size() * sizeof(FeatureType);
 
@@ -194,7 +191,6 @@ class SketchContainer {
 
   SketchContainer(const SketchContainer&) = delete;
   SketchContainer& operator=(const SketchContainer&) = delete;
-
 };
 }  // namespace xgboost::common
 
diff --git a/src/common/ranking_utils.cc b/src/common/ranking_utils.cc
index d477225a4efe..d5a63d045507 100644
--- a/src/common/ranking_utils.cc
+++ b/src/common/ranking_utils.cc
@@ -1,13 +1,13 @@
 /**
- * Copyright 2023 by XGBoost contributors
+ * Copyright 2023-2026, XGBoost contributors
  */
 #include "ranking_utils.h"
 
-#include <algorithm>          // for copy_n, max, min, none_of, all_of
-#include <cstddef>            // for size_t
-#include <cstdio>             // for sscanf
-#include <functional>         // for greater
-#include <string>             // for char_traits, string
+#include <algorithm>   // for copy_n, max, min, none_of, all_of
+#include <cstddef>     // for size_t
+#include <cstdio>      // for sscanf
+#include <functional>  // for greater
+#include <string>      // for char_traits, string
 
 #include "algorithm.h"        // for ArgSort
 #include "linalg_op.h"        // for cbegin, cend
@@ -22,7 +22,7 @@
 namespace xgboost::ltr {
 void RankingCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
   if (info.group_ptr_.empty()) {
-    group_ptr_.Resize(2, 0);
+    group_ptr_.Resize(ctx, 2, 0);
     group_ptr_.HostVector()[1] = info.num_row_;
   } else {
     group_ptr_.HostVector() = info.group_ptr_;
@@ -75,7 +75,7 @@ common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
 void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
   auto const h_group_ptr = this->DataGroupPtr(ctx);
 
-  discounts_.Resize(MaxGroupSize(), 0);
+  discounts_.Resize(ctx, MaxGroupSize(), 0);
   auto& h_discounts = discounts_.HostVector();
   for (std::size_t i = 0; i < MaxGroupSize(); ++i) {
     h_discounts[i] = CalcDCGDiscount(i);
diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu
index 28e75dca47ea..003b7674b5fd 100644
--- a/src/common/ranking_utils.cu
+++ b/src/common/ranking_utils.cu
@@ -135,7 +135,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
 
   group_ptr_.SetDevice(ctx->Device());
   if (info.group_ptr_.empty()) {
-    group_ptr_.Resize(2, 0);
+    group_ptr_.Resize(ctx, 2, bst_group_t{0});
     group_ptr_.HostVector()[1] = info.num_row_;
   } else {
     auto const& h_group_ptr = info.group_ptr_;
@@ -154,7 +154,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
       thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum<std::size_t>{});
 
   threads_group_ptr_.SetDevice(ctx->Device());
-  threads_group_ptr_.Resize(n_groups + 1, 0);
+  threads_group_ptr_.Resize(ctx, n_groups + 1, std::size_t{0});
   auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
   if (param_.HasTruncation()) {
     n_cuda_threads_ =
@@ -169,7 +169,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
   }
 
   sorted_idx_cache_.SetDevice(ctx->Device());
-  sorted_idx_cache_.Resize(info.labels.Size(), 0);
+  sorted_idx_cache_.Resize(ctx, info.labels.Size(), std::size_t{0});
 
   auto weight = common::MakeOptionalWeights(ctx->Device(), info.weights_);
   auto w_it =
diff --git a/src/common/ref_resource_view.cuh b/src/common/ref_resource_view.cuh
index bc8b7d7c010a..6fcb9710f88e 100644
--- a/src/common/ref_resource_view.cuh
+++ b/src/common/ref_resource_view.cuh
@@ -1,5 +1,5 @@
 /**
- * Copyright 2024-2025, XGBoost Contributors
+ * Copyright 2024-2026, XGBoost Contributors
  */
 #pragma once
 
@@ -17,8 +17,11 @@ namespace xgboost::common {
  * @brief Make a fixed size `RefResourceView` with cudaMalloc resource.
  */
 template <typename T>
-[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(std::size_t n_elements) {
-  auto resource = std::make_shared<common::CudaMallocResource>(n_elements * sizeof(T));
+[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
+                                                            std::size_t n_elements) {
+  CHECK(ctx);
+  auto resource = std::make_shared<common::CudaMallocResource>(n_elements * sizeof(T),
+                                                               ctx->CUDACtx()->Stream());
   auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
   return ref;
 }
@@ -36,7 +39,7 @@ template <typename T>
 template <typename T>
 [[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
                                                             std::size_t n_elements, T const& init) {
-  auto ref = MakeFixedVecWithCudaMalloc<T>(n_elements);
+  auto ref = MakeFixedVecWithCudaMalloc<T>(ctx, n_elements);
   thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init);
   return ref;
 }
diff --git a/src/common/resource.cuh b/src/common/resource.cuh
index 0760ec4fa81f..588711678c80 100644
--- a/src/common/resource.cuh
+++ b/src/common/resource.cuh
@@ -18,18 +18,22 @@ namespace xgboost::common {
  */
 class CudaMallocResource : public ResourceHandler {
   dh::DeviceUVector<std::byte> storage_;
+  curt::StreamRef stream_;
 
-  void Clear() noexcept(true) { this->Resize(0); }
+  void Clear() noexcept(true) { this->Resize(0, this->stream_); }
 
  public:
-  explicit CudaMallocResource(std::size_t n_bytes) : ResourceHandler{kCudaMalloc} {
-    this->Resize(n_bytes);
+  explicit CudaMallocResource(std::size_t n_bytes, curt::StreamRef stream)
+      : ResourceHandler{kCudaMalloc}, stream_{stream} {
+    this->Resize(n_bytes, stream);
   }
   ~CudaMallocResource() noexcept(true) override { this->Clear(); }
 
   [[nodiscard]] void* Data() override { return storage_.data(); }
   [[nodiscard]] std::size_t Size() const override { return storage_.size(); }
-  void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
+  void Resize(std::size_t n_bytes, curt::StreamRef stream) {
+    this->storage_.resize(n_bytes, stream);
+  }
 };
 
 /**
diff --git a/src/data/cat_container.cu b/src/data/cat_container.cu
index d957089b8ea1..adbcd39b2df3 100644
--- a/src/data/cat_container.cu
+++ b/src/data/cat_container.cu
@@ -137,7 +137,7 @@ struct CatContainerImpl {
 
 [[nodiscard]] std::tuple<CatAccessor, dh::DeviceUVector<std::int32_t>> MakeCatAccessor(
     Context const* ctx, enc::DeviceColumnsView const& new_enc, CatContainer const* orig_cats) {
-  dh::DeviceUVector<std::int32_t> mapping(new_enc.n_total_cats);
+  dh::DeviceUVector<std::int32_t> mapping(new_enc.n_total_cats, ctx->CUDACtx()->Stream());
   auto d_sorted_idx = orig_cats->RefSortedIndex(ctx);
   auto orig_enc = orig_cats->DeviceView(ctx);
   enc::Recode(EncPolicy, orig_enc, d_sorted_idx, new_enc, dh::ToSpan(mapping));
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 60db1ab7b9e2..c0d70f0ecdcf 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -158,7 +158,7 @@ __global__ void CompressBinEllpackKernel(
       [=] XGBOOST_DEVICE(std::size_t i) { return dptrs[i] - dptrs[i - 1]; });
   CHECK_GE(dptrs.size(), 2);
   auto max_it = thrust::max_element(cuctx->CTP(), it, it + dptrs.size() - 1);
-  dh::CachingDeviceUVector<PtrT> max_element(1);
+  dh::CachingDeviceUVector<PtrT> max_element(1, cuctx->Stream());
   auto d_me = max_element.data();
   dh::LaunchN(1, cuctx->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_me[i] = *max_it; });
   dh::safe_cuda(cudaMemcpyAsync(&n_symbols_dense, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost,
@@ -595,7 +595,7 @@ void EllpackPageImpl::CreateHistIndices(Context const* ctx, const SparsePage& ro
 
     /*! \brief row offset in SparsePage (the input data). */
     using OffT = typename std::remove_reference_t<decltype(offset_vec)>::value_type;
-    dh::DeviceUVector<OffT> row_ptrs(batch_nrows + 1);
+    dh::DeviceUVector<OffT> row_ptrs(batch_nrows + 1, ctx->CUDACtx()->Stream());
     auto size =
         std::distance(offset_vec.data() + batch_row_begin, offset_vec.data() + batch_row_end + 1);
     dh::safe_cuda(cudaMemcpyAsync(row_ptrs.data(), offset_vec.data() + batch_row_begin,
@@ -604,7 +604,7 @@ void EllpackPageImpl::CreateHistIndices(Context const* ctx, const SparsePage& ro
 
     // number of entries in this batch.
     size_t n_entries = ent_cnt_end - ent_cnt_begin;
-    dh::DeviceUVector<Entry> entries_d(n_entries);
+    dh::DeviceUVector<Entry> entries_d(n_entries, ctx->CUDACtx()->Stream());
     // copy data entries to device.
     if (row_batch.data.DeviceCanRead()) {
       auto const& d_data = row_batch.data.ConstDeviceSpan();
diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu
index a5a2b3748100..7696100f7215 100644
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2025, XGBoost contributors
+ * Copyright 2019-2026, XGBoost contributors
  */
 #include <dmlc/registry.h>
 
@@ -21,7 +21,7 @@ DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);
 namespace {
 // Function to support system without HMM or ATS
 template <typename T>
-[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi,
+[[nodiscard]] bool ReadDeviceVec(Context const* ctx, common::AlignedResourceReadStream* fi,
                                  common::RefResourceView<T>* vec) {
   xgboost_NVTX_FN_RANGE();
 
@@ -40,9 +40,9 @@ template <typename T>
     return false;
   }
 
-  *vec = common::MakeFixedVecWithCudaMalloc<T>(n);
+  *vec = common::MakeFixedVecWithCudaMalloc<T>(ctx, n);
   dh::safe_cuda(
-      cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, curt::DefaultStream()));
+      cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, ctx->CUDACtx()->Stream()));
   return true;
 }
 }  // namespace
@@ -61,8 +61,10 @@ template <typename T>
   RET_IF_NOT(fi->Read(&impl->is_dense));
   RET_IF_NOT(fi->Read(&impl->info.row_stride));
 
+  Context ctx = Context{}.MakeCUDA(curt::CurrentDevice());
+
   if (this->param_.prefetch_copy || !has_hmm_ats_) {
-    RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer));
+    RET_IF_NOT(ReadDeviceVec(&ctx, fi, &impl->gidx_buffer));
   } else {
     RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
   }
@@ -73,7 +75,7 @@ template <typename T>
 
   impl->SetCuts(this->cuts_);
 
-  curt::DefaultStream().Sync();
+  ctx.CUDACtx()->Stream().Sync();
   return true;
 }
 
@@ -94,7 +96,7 @@ template <typename T>
   bytes += fo->Write(impl->base_rowid);
   bytes += fo->Write(impl->NumSymbols());
 
-  curt::DefaultStream().Sync();
+  ctx.CUDACtx()->Stream().Sync();
   return bytes;
 }
 
@@ -128,7 +130,7 @@ template <typename T>
     dispatch();
   }
 
-  curt::DefaultStream().Sync();
+  ctx.CUDACtx()->Stream().Sync();
 
   return true;
 }
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index 5cddd94996da..2c74ce266863 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -228,7 +228,7 @@ class EllpackHostCacheStreamImpl {
 
       // Device cache
       auto remaining = old_impl->gidx_buffer.size_bytes() - n_bytes;
-      auto d_page = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(remaining);
+      auto d_page = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(&ctx, remaining);
       if (remaining > 0) {
         dh::safe_cuda(cudaMemcpyAsync(d_page.data(), old_impl->gidx_buffer.data() + n_bytes,
                                       remaining, cudaMemcpyDefault));
@@ -323,7 +323,8 @@ class EllpackHostCacheStreamImpl {
       // Copy the data in the same order as written
       // Normal host cache
       auto n_bytes = this->cache_->GidxSizeBytes(this->ptr_);
-      out_impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(n_bytes);
+      out_impl->gidx_buffer =
+          common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(ctx, n_bytes);
       if (!h_page->gidx_buffer.empty()) {
         dh::safe_cuda(cudaMemcpyAsync(out_impl->gidx_buffer.data(), h_page->gidx_buffer.data(),
                                       h_page->gidx_buffer.size_bytes(), cudaMemcpyDefault,
diff --git a/src/data/quantile_dmatrix.cu b/src/data/quantile_dmatrix.cu
index 82ba59cb4bc3..0b830223c642 100644
--- a/src/data/quantile_dmatrix.cu
+++ b/src/data/quantile_dmatrix.cu
@@ -78,8 +78,8 @@ void MakeSketches(Context const* ctx,
      */
     if (!ref) {
       if (!sketch) {
-        sketch = std::make_unique<common::SketchContainer>(proxy->Info().feature_types, p.max_bin,
-                                                           ext_info.n_features, dh::GetDevice(ctx));
+        sketch = std::make_unique<common::SketchContainer>(p_ctx, proxy->Info().feature_types,
+                                                           p.max_bin, ext_info.n_features);
       }
       proxy->Info().weights_.SetDevice(dh::GetDevice(ctx));
       DispatchAny(proxy, [&](auto const& value) {
@@ -113,8 +113,8 @@ void MakeSketches(Context const* ctx,
   if (!ref) {
     if (!sketch) {
       // Empty local input can happen in distributed settings.
-      sketch = std::make_unique<common::SketchContainer>(proxy->Info().feature_types, p.max_bin,
-                                                         ext_info.n_features, dh::GetDevice(ctx));
+      sketch = std::make_unique<common::SketchContainer>(p_ctx, proxy->Info().feature_types,
+                                                         p.max_bin, ext_info.n_features);
     }
     *cuts = sketch->MakeCuts(ctx, info.IsColumnSplit());
     sketch.reset();
diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu
index 2e0a683de9c7..070be130df16 100644
--- a/src/objective/adaptive.cu
+++ b/src/objective/adaptive.cu
@@ -3,7 +3,7 @@
  */
 #include <thrust/sort.h>
 
-#include <cub/cub.cuh>         // NOLINT
+#include <cub/cub.cuh>  // NOLINT
 
 #include "../collective/aggregator.h"
 #include "../common/cuda_context.cuh"  // CUDAContext
@@ -95,7 +95,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
   auto d_node_idx = nidx.DeviceSpan();
 
   nptr.SetDevice(ctx->Device());
-  nptr.Resize(n_leaf + 1, 0);
+  nptr.Resize(ctx, n_leaf + 1, bst_idx_t{0});
   auto d_node_ptr = nptr.DeviceSpan();
 
   dh::LaunchN(n_leaf, [=] XGBOOST_DEVICE(size_t i) {
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index f4c0d00c6aeb..7e6df62a96c7 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -258,10 +258,11 @@ struct CopyViews {
 
   void operator()(dh::DeviceUVector<TreeViewVar>* p_dst, std::vector<TreeViewVar>&& src) {
     xgboost_NVTX_FN_RANGE();
-    p_dst->resize(src.size());
+    auto stream = ctx->CUDACtx()->Stream();
+    p_dst->resize(src.size(), stream);
     auto d_dst = dh::ToSpan(*p_dst);
-    dh::safe_cuda(cudaMemcpyAsync(d_dst.data(), src.data(), d_dst.size_bytes(), cudaMemcpyDefault,
-                                  ctx->CUDACtx()->Stream()));
+    dh::safe_cuda(
+        cudaMemcpyAsync(d_dst.data(), src.data(), d_dst.size_bytes(), cudaMemcpyDefault, stream));
   }
 };
 
diff --git a/src/predictor/interpretability/shap.cu b/src/predictor/interpretability/shap.cu
index 50d680f10a77..6e48e8005405 100644
--- a/src/predictor/interpretability/shap.cu
+++ b/src/predictor/interpretability/shap.cu
@@ -59,10 +59,11 @@ struct CopyViews {
 
   void operator()(dh::DeviceUVector<TreeViewVar>* p_dst, std::vector<TreeViewVar>&& src) {
     xgboost_NVTX_FN_RANGE();
-    p_dst->resize(src.size());
+    auto stream = ctx->CUDACtx()->Stream();
+    p_dst->resize(src.size(), stream);
     auto d_dst = dh::ToSpan(*p_dst);
-    dh::safe_cuda(cudaMemcpyAsync(d_dst.data(), src.data(), d_dst.size_bytes(), cudaMemcpyDefault,
-                                  ctx->CUDACtx()->Stream()));
+    dh::safe_cuda(
+        cudaMemcpyAsync(d_dst.data(), src.data(), d_dst.size_bytes(), cudaMemcpyDefault, stream));
   }
 };
 
@@ -213,7 +214,7 @@ void ExtractPaths(Context const* ctx,
     });
     auto max_cat_it =
         thrust::max_element(ctx->CUDACtx()->CTP(), max_elem_it, max_elem_it + d_model.n_nodes);
-    dh::CachingDeviceUVector<std::size_t> d_max_cat(1);
+    dh::CachingDeviceUVector<std::size_t> d_max_cat(1, ctx->CUDACtx()->Stream());
     auto s_max_cat = dh::ToSpan(d_max_cat);
     dh::LaunchN(1, ctx->CUDACtx()->Stream(),
                 [=] __device__(std::size_t) { s_max_cat[0] = *max_cat_it; });
diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc
index 592fb3e02069..6147249c23ec 100644
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2017-2025, XGBoost Contributors
+ * Copyright 2017-2026, XGBoost Contributors
  */
 #include "xgboost/predictor.h"
 
@@ -58,7 +58,7 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<float>
   CHECK_NE(model.learner_model_param->num_output_group, 0);
 
   if (!ctx_->Device().IsCPU()) {
-    out_preds->SetDevice(ctx_->Device());
+    out_preds->SetDevice(ctx_->Device(), this->ctx_);
   }
 
   // Cannot rely on the Resize to fill as it might skip if the size is already correct.
@@ -69,14 +69,14 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<float>
   if (!base_margin->Empty()) {
     ValidateBaseMarginShape(info.base_margin_, info.num_row_,
                             model.learner_model_param->OutputLength());
-    out_preds->Copy(*base_margin);
+    out_preds->Copy(*base_margin, this->ctx_);
     return;
   }
 
   auto base_score = model.learner_model_param->BaseScore(this->ctx_->Device());
   if (base_score.Size() == 1) {
     // Fill a scalar
-    out_preds->Fill(model.learner_model_param->BaseScore(DeviceOrd::CPU())(0));
+    out_preds->Fill(model.learner_model_param->BaseScore(DeviceOrd::CPU())(0), this->ctx_);
     return;
   }
 
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index 43fff2beb9b5..1219a46c2005 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -471,15 +471,16 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_
 
 GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
                                                      EvaluateSplitSharedInputs shared_inputs) {
-  dh::CachingDeviceUVector<EvaluateSplitInputs> inputs(1);
-  dh::safe_cuda(cudaMemcpyAsync(inputs.data(), &input, sizeof(input), cudaMemcpyDefault));
+  auto stream = ctx->CUDACtx()->Stream();
+  dh::CachingDeviceUVector<EvaluateSplitInputs> inputs(1, stream);
+  dh::safe_cuda(cudaMemcpyAsync(inputs.data(), &input, sizeof(input), cudaMemcpyDefault, stream));
 
   dh::TemporaryArray<GPUExpandEntry> out_entries(1);
   this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
                        shared_inputs, dh::ToSpan(out_entries));
   GPUExpandEntry root_entry;
   dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
-                                cudaMemcpyDeviceToHost));
+                                cudaMemcpyDeviceToHost, stream));
   return root_entry;
 }
 }  // namespace xgboost::tree
diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh
index e78e89cd4f7b..ce22ae69243a 100644
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -134,8 +134,8 @@ class GPUHistEvaluator {
   }
 
  public:
-  GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, DeviceOrd device)
-      : tree_evaluator_{param, n_features, device}, param_{param} {}
+  GPUHistEvaluator(Context const *ctx, TrainParam const &param, bst_feature_t n_features)
+      : tree_evaluator_{ctx, param, n_features}, param_{param} {}
   /**
    * \brief Reset the evaluator, should be called before any use.
    */
@@ -166,12 +166,12 @@ class GPUHistEvaluator {
   /**
    * \brief Add a split to the internal tree evaluator.
    */
-  void ApplyTreeSplit(GPUExpandEntry const &candidate, RegTree *p_tree) {
+  void ApplyTreeSplit(Context const *ctx, GPUExpandEntry const &candidate, RegTree *p_tree) {
     auto &tree = *p_tree;
     // Set up child constraints
     auto left_child = tree[candidate.nidx].LeftChild();
     auto right_child = tree[candidate.nidx].RightChild();
-    tree_evaluator_.AddSplit(candidate.nidx, left_child, right_child,
+    tree_evaluator_.AddSplit(ctx, candidate.nidx, left_child, right_child,
                              tree[candidate.nidx].SplitIndex(), candidate.left_weight,
                              candidate.right_weight);
   }
diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu
index 945069773d06..815c48af39a7 100644
--- a/src/tree/gpu_hist/evaluator.cu
+++ b/src/tree/gpu_hist/evaluator.cu
@@ -20,7 +20,7 @@ void GPUHistEvaluator::Reset(Context const *ctx, common::HistogramCuts const &cu
                              common::Span<FeatureType const> ft, bst_feature_t n_features,
                              TrainParam const &param, bool is_column_split) {
   param_ = param;
-  tree_evaluator_ = TreeEvaluator{param, n_features, ctx->Device()};
+  tree_evaluator_ = TreeEvaluator{ctx, param, n_features};
   has_categoricals_ = cuts.HasCategorical();
   if (cuts.HasCategorical()) {
     auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cu b/src/tree/gpu_hist/multi_evaluate_splits.cu
index 4b8c2b65da03..599d51222513 100644
--- a/src/tree/gpu_hist/multi_evaluate_splits.cu
+++ b/src/tree/gpu_hist/multi_evaluate_splits.cu
@@ -297,9 +297,10 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
     return;
   }
 
+  auto stream = ctx->CUDACtx()->Stream();
   // Allocate weight and split sum storage on demand for the maximum node ID being evaluated.
-  this->AllocNodeWeight(max_nidx, n_targets);
-  this->split_sums_.Alloc(max_nidx, n_targets);
+  this->AllocNodeWeight(max_nidx, n_targets, stream);
+  this->split_sums_.Alloc(max_nidx, n_targets, stream);
 
   // Calculate total scan buffer size needed for all nodes
   auto node_hist_size = n_targets * shared_inputs.Features() * n_bins_per_feat_tar;
@@ -307,7 +308,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
 
   // Scan the histograms. One for forward and the other for backward.
   // Since there's only store op on the scan buffer, no need to initialize it.
-  this->scan_buffer_.resize(total_hist_size * 2);
+  this->scan_buffer_.resize(total_hist_size * 2, stream);
 
   // Create spans for each node's scan results
   std::vector<common::Span<GradientPairInt64>> h_scans(n_nodes);
@@ -444,7 +445,7 @@ void MultiHistEvaluator::ApplyTreeSplit(Context const *ctx, RegTree const *p_tre
   auto max_node = thrust::reduce(
       ctx->CUDACtx()->CTP(), max_in_it, max_in_it + d_candidates.size(), 0,
       [=] XGBOOST_DEVICE(bst_node_t l, bst_node_t r) { return cuda::std::max(l, r); });
-  this->AllocNodeSum(max_node, n_targets);
+  this->AllocNodeSum(max_node, n_targets, ctx->CUDACtx()->Stream());
 
   auto node_sums = this->node_sums_.View();
   // Use the internal split sums buffer instead of candidate.split.child_sum . It may be
diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cuh b/src/tree/gpu_hist/multi_evaluate_splits.cuh
index 0628010f90db..9c1006648383 100644
--- a/src/tree/gpu_hist/multi_evaluate_splits.cuh
+++ b/src/tree/gpu_hist/multi_evaluate_splits.cuh
@@ -48,10 +48,10 @@ class MultiHistEvaluator {
     /**
      * @brief Allocate storage for node sums up to the given node ID.
      */
-    void Alloc(bst_node_t nidx, bst_target_t n_targets) {
+    void Alloc(bst_node_t nidx, bst_target_t n_targets, curt::StreamRef stream) {
       auto end = (nidx + 1) * n_targets;
       if (this->node_sums.size() < end) {
-        this->node_sums.resize(end);
+        this->node_sums.resize(end, stream);
       }
     }
     [[nodiscard]] common::Span<GradientPairInt64> GetNode(bst_node_t nidx, bst_target_t n_targets) {
@@ -98,8 +98,8 @@ class MultiHistEvaluator {
   /**
    * @brief Allocate storage for node sums up to the given node ID.
    */
-  void AllocNodeSum(bst_node_t nidx, bst_target_t n_targets) {
-    this->node_sums_.Alloc(nidx, n_targets);
+  void AllocNodeSum(bst_node_t nidx, bst_target_t n_targets, curt::StreamRef stream) {
+    this->node_sums_.Alloc(nidx, n_targets, stream);
   }
   [[nodiscard]] common::Span<GradientPairInt64> GetNodeSum(bst_node_t nidx,
                                                            bst_target_t n_targets) {
@@ -109,10 +109,10 @@ class MultiHistEvaluator {
   /**
    * @brief Allocate storage for weights up to the given node ID.
    */
-  void AllocNodeWeight(bst_node_t nidx, bst_target_t n_targets) {
+  void AllocNodeWeight(bst_node_t nidx, bst_target_t n_targets, curt::StreamRef stream) {
     auto required = (nidx + 1) * n_targets * NodeWeightBuffer::kWeightsPerNode;
     if (this->node_weights_.size() < required) {
-      this->node_weights_.resize(required);
+      this->node_weights_.resize(required, stream);
     }
   }
   [[nodiscard]] NodeWeightBuffer GetNodeWeights(bst_target_t n_targets) {
diff --git a/src/tree/gpu_hist/quantiser.cu b/src/tree/gpu_hist/quantiser.cu
index 17105c94bc61..d62f29a3557d 100644
--- a/src/tree/gpu_hist/quantiser.cu
+++ b/src/tree/gpu_hist/quantiser.cu
@@ -116,10 +116,11 @@ GradientQuantiserGroup::GradientQuantiserGroup(Context const* ctx,
   }
 
   // Copy to device.
-  d_quantizers_.resize(n_targets);
+  auto stream = ctx->CUDACtx()->Stream();
+  d_quantizers_.resize(n_targets, stream);
   dh::safe_cuda(cudaMemcpyAsync(d_quantizers_.data(), h_quantizers_.data(),
                                 n_targets * sizeof(GradientQuantiser), cudaMemcpyHostToDevice,
-                                ctx->CUDACtx()->Stream()));
+                                stream));
 }
 
 GradientQuantiserGroup::GradientQuantiserGroup(Context const* ctx,
diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu
index a54e854173f6..7e43a2b4694b 100644
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -11,9 +11,10 @@
 
 namespace xgboost::tree {
 void RowPartitioner::Reset(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid) {
+  auto stream = ctx->CUDACtx()->Stream();
   ridx_segments_.clear();
-  ridx_.resize(n_samples);
-  tmp_.clear();
+  ridx_.resize(n_samples, stream);
+  tmp_.clear(stream);
   n_nodes_ = 1;  // Root
 
   CHECK_LE(n_samples, std::numeric_limits<cuda_impl::RowIndexT>::max());
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index f7fa5d87150d..e4f16a4ff8d3 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -180,7 +180,7 @@ void SortPositionBatch(Context const* ctx, common::Span<const PerNodeData<OpData
                                                                       total_rows),
                                                                   ctx->CUDACtx()->Stream());
     dh::safe_cuda(ret);
-    tmp->resize(n_bytes);
+    tmp->resize(n_bytes, ctx->CUDACtx()->Stream());
   }
   n_bytes = tmp->size();
   auto ret =
@@ -473,7 +473,7 @@ class RowPartitionerBatches {
       CHECK_LE(n_samples, std::numeric_limits<cuda_impl::RowIndexT>::max());
       n_max_samples = std::max(n_samples, n_max_samples);
     }
-    this->ridx_tmp_.resize(n_max_samples);
+    this->ridx_tmp_.resize(n_max_samples, ctx->CUDACtx()->Stream());
   }
 
   // Accessors
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index 0b9eed3f3c33..8a2759e5e115 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -438,7 +438,7 @@ class HistEvaluator {
     // Set up child constraints
     auto left_child = tree[candidate.nid].LeftChild();
     auto right_child = tree[candidate.nid].RightChild();
-    tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
+    tree_evaluator_.AddSplit(ctx_, candidate.nid, left_child, right_child,
                              tree[candidate.nid].SplitIndex(), left_weight, right_weight);
     evaluator = tree_evaluator_.GetEvaluator();
 
@@ -476,7 +476,7 @@ class HistEvaluator {
       : ctx_{ctx},
         param_{param},
         column_sampler_{std::move(sampler)},
-        tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), DeviceOrd::CPU()},
+        tree_evaluator_{ctx, *param, static_cast<bst_feature_t>(info.num_col_)},
         is_col_split_{info.IsColumnSplit()} {
     interaction_constraints_.Configure(*param, info.num_col_);
     column_sampler_->Init(ctx, info.num_col_, info.feature_weights, param_->colsample_bynode,
@@ -751,14 +751,15 @@ class HistMultiEvaluator {
     float sum_hess = left_sum_hess + right_sum_hess;
 
     if (candidate.split.is_cat) {
-      p_tree->ExpandCategorical(candidate.nid, candidate.split.SplitIndex(),
+      p_tree->ExpandCategorical(ctx_, candidate.nid, candidate.split.SplitIndex(),
                                 candidate.split.cat_bits, candidate.split.DefaultLeft(),
                                 base_weight, left_weight, right_weight, loss_chg, sum_hess,
                                 left_sum_hess, right_sum_hess);
     } else {
-      p_tree->ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value,
-                         candidate.split.DefaultLeft(), base_weight, left_weight, right_weight,
-                         loss_chg, sum_hess, left_sum_hess, right_sum_hess);
+      p_tree->ExpandNode(ctx_, candidate.nid, candidate.split.SplitIndex(),
+                         candidate.split.split_value, candidate.split.DefaultLeft(), base_weight,
+                         left_weight, right_weight, loss_chg, sum_hess, left_sum_hess,
+                         right_sum_hess);
     }
 
     CHECK(p_tree->IsMultiTarget());
diff --git a/src/tree/multi_target_tree_model.cc b/src/tree/multi_target_tree_model.cc
index 21642ac9854b..83d46152d857 100644
--- a/src/tree/multi_target_tree_model.cc
+++ b/src/tree/multi_target_tree_model.cc
@@ -57,12 +57,13 @@ MultiTargetTree::MultiTargetTree(MultiTargetTree const& that)
   this->sum_hess_.Copy(that.sum_hess_);
 }
 
-void MultiTargetTree::SetRoot(linalg::VectorView<float const> weight, float sum_hess) {
+void MultiTargetTree::SetRoot(Context const* ctx, linalg::VectorView<float const> weight,
+                              float sum_hess) {
   CHECK(!weight.Empty());
   auto const next_nidx = RegTree::kRoot + 1;
 
   this->weights_.SetDevice(weight.Device());
-  this->weights_.Resize(weight.Size(), DftBadValue());
+  this->weights_.Resize(ctx, weight.Size(), DftBadValue());
 
   CHECK_LE(weight.Size(), this->NumTargets());
   CHECK_GE(weights_.Size(), next_nidx * weight.Size());
@@ -80,16 +81,17 @@ void MultiTargetTree::SetRoot(linalg::VectorView<float const> weight, float sum_
   }
 
   // Set root statistics
-  sum_hess_.Resize(next_nidx, 0.0f);
+  sum_hess_.Resize(ctx, next_nidx, 0.0f);
   sum_hess_.HostVector()[RegTree::kRoot] = sum_hess;
-  loss_chg_.Resize(next_nidx, 0.0f);
+  loss_chg_.Resize(ctx, next_nidx, 0.0f);
 
   CHECK_EQ(this->param_->num_nodes, 1);
   CHECK_EQ(this->NumSplitTargets(), weight.Size());
 }
 
-void MultiTargetTree::Expand(bst_node_t nidx, bst_feature_t split_idx, float split_cond,
-                             bool default_left, linalg::VectorView<float const> base_weight,
+void MultiTargetTree::Expand(Context const* ctx, bst_node_t nidx, bst_feature_t split_idx,
+                             float split_cond, bool default_left,
+                             linalg::VectorView<float const> base_weight,
                              linalg::VectorView<float const> left_weight,
                              linalg::VectorView<float const> right_weight, float loss_chg,
                              float sum_hess, float left_sum, float right_sum) {
@@ -102,9 +104,9 @@ void MultiTargetTree::Expand(bst_node_t nidx, bst_feature_t split_idx, float spl
 
   std::size_t n = param_->num_nodes + 2;
   CHECK_LT(split_idx, this->param_->num_feature);
-  left_.Resize(n, InvalidNodeId());
-  right_.Resize(n, InvalidNodeId());
-  parent_.Resize(n, InvalidNodeId());
+  left_.Resize(ctx, n, InvalidNodeId());
+  right_.Resize(ctx, n, InvalidNodeId());
+  parent_.Resize(ctx, n, InvalidNodeId());
 
   auto left_child = parent_.Size() - 2;
   auto right_child = parent_.Size() - 1;
@@ -124,7 +126,7 @@ void MultiTargetTree::Expand(bst_node_t nidx, bst_feature_t split_idx, float spl
   split_index_.Resize(n);
   split_index_.HostVector()[nidx] = split_idx;
 
-  split_conds_.Resize(n, DftBadValue());
+  split_conds_.Resize(ctx, n, DftBadValue());
   split_conds_.HostVector()[nidx] = split_cond;
 
   default_left_.Resize(n);
@@ -148,10 +150,10 @@ void MultiTargetTree::Expand(bst_node_t nidx, bst_feature_t split_idx, float spl
     r_weight(i) = right_weight(i);
   }
 
-  loss_chg_.Resize(n, 0.0f);
+  loss_chg_.Resize(ctx, n, 0.0f);
   loss_chg_.HostVector()[nidx] = loss_chg;
 
-  sum_hess_.Resize(n, 0.0f);
+  sum_hess_.Resize(ctx, n, 0.0f);
   auto& h_hess = sum_hess_.HostVector();
   h_hess[nidx] = sum_hess;
   h_hess[left_child] = left_sum;
diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h
index 43e093b31370..558527677fc4 100644
--- a/src/tree/split_evaluator.h
+++ b/src/tree/split_evaluator.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2018-2023 by Contributors
+ * Copyright 2018-2026, XGBoost Contributors
  * \file split_evaluator.h
  * \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation.
  * \author Henry Gouk
@@ -33,12 +33,12 @@ class TreeEvaluator {
   bool has_constraint_;
 
  public:
-  TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) {
-    device_ = device;
-    if (device.IsCUDA()) {
-      lower_bounds_.SetDevice(device);
-      upper_bounds_.SetDevice(device);
-      monotone_.SetDevice(device);
+  TreeEvaluator(Context const* ctx, TrainParam const& p, bst_feature_t n_features)
+      : device_{ctx->Device()} {
+    if (device_.IsCUDA()) {
+      lower_bounds_.SetDevice(device_);
+      upper_bounds_.SetDevice(device_);
+      monotone_.SetDevice(device_);
     }
 
     if (p.monotone_constraints.empty()) {
@@ -50,8 +50,8 @@ class TreeEvaluator {
       monotone_.HostVector() = p.monotone_constraints;
       monotone_.HostVector().resize(n_features, 0);
       // Initialised to some small size, can grow if needed
-      lower_bounds_.Resize(256, -std::numeric_limits<float>::max());
-      upper_bounds_.Resize(256, std::numeric_limits<float>::max());
+      lower_bounds_.Resize(ctx, 256, -std::numeric_limits<float>::max());
+      upper_bounds_.Resize(ctx, 256, std::numeric_limits<float>::max());
       has_constraint_ = true;
     }
 
@@ -79,7 +79,7 @@ class TreeEvaluator {
       float wright = this->CalcWeight(nidx, param, right);
 
       float gain = this->CalcGainGivenWeight(param, left, wleft) +
-                    this->CalcGainGivenWeight(param, right, wright);
+                   this->CalcGainGivenWeight(param, right, wright);
 
       if (constraint == 0) {
         // no constraint
@@ -92,7 +92,7 @@ class TreeEvaluator {
     }
 
     template <typename GradientSumT>
-    XGBOOST_DEVICE float CalcWeight(bst_node_t nodeid, const ParamT &param,
+    XGBOOST_DEVICE float CalcWeight(bst_node_t nodeid, const ParamT& param,
                                     GradientSumT const& stats) const {
       float w = ::xgboost::tree::CalcWeight(param, stats);
       if (!has_constraint) {
@@ -138,11 +138,10 @@ class TreeEvaluator {
         return Divide(common::Sqr(ThresholdL1(stats.GetGrad(), p.reg_alpha)),
                       (stats.GetHess() + p.reg_lambda));
       }
-      return tree::CalcGainGivenWeight<ParamT, float>(p, stats.GetGrad(),
-                                                      stats.GetHess(), w);
+      return tree::CalcGainGivenWeight<ParamT, float>(p, stats.GetGrad(), stats.GetHess(), w);
     }
     template <typename GradientSumT>
-    XGBOOST_DEVICE float CalcGain(bst_node_t nid, ParamT const &p,
+    XGBOOST_DEVICE float CalcGain(bst_node_t nid, ParamT const& p,
                                   GradientSumT const& stats) const {
       return this->CalcGainGivenWeight(p, stats, this->CalcWeight(nid, p, stats));
     }
@@ -150,7 +149,8 @@ class TreeEvaluator {
 
  public:
   /* Get a view to the evaluator that can be passed down to device. */
-  template <typename ParamT = TrainParam> auto GetEvaluator() const {
+  template <typename ParamT = TrainParam>
+  auto GetEvaluator() const {
     if (device_.IsCUDA()) {
       auto constraints = monotone_.ConstDevicePointer();
       return SplitEvaluator<ParamT>{constraints, lower_bounds_.ConstDevicePointer(),
@@ -163,7 +163,7 @@ class TreeEvaluator {
   }
 
   template <bool CompiledWithCuda = WITH_CUDA()>
-  void AddSplit(bst_node_t nodeid, bst_node_t leftid, bst_node_t rightid,
+  void AddSplit(Context const* ctx, bst_node_t nodeid, bst_node_t leftid, bst_node_t rightid,
                 bst_feature_t f, float left_weight, float right_weight) {
     if (!has_constraint_) {
       return;
@@ -171,15 +171,14 @@ class TreeEvaluator {
 
     size_t max_nidx = std::max(leftid, rightid);
     if (lower_bounds_.Size() <= max_nidx) {
-      lower_bounds_.Resize(max_nidx * 2 + 1, -std::numeric_limits<float>::max());
+      lower_bounds_.Resize(ctx, max_nidx * 2 + 1, -std::numeric_limits<float>::max());
     }
     if (upper_bounds_.Size() <= max_nidx) {
-      upper_bounds_.Resize(max_nidx * 2 + 1, std::numeric_limits<float>::max());
+      upper_bounds_.Resize(ctx, max_nidx * 2 + 1, std::numeric_limits<float>::max());
     }
 
     common::Transform<>::Init(
-        [=] XGBOOST_DEVICE(size_t, common::Span<float> lower,
-                           common::Span<float> upper,
+        [=] XGBOOST_DEVICE(size_t, common::Span<float> lower, common::Span<float> upper,
                            common::Span<int> monotone) {
           lower[leftid] = lower[nodeid];
           upper[leftid] = upper[nodeid];
diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc
index 519cc15f800d..32c3f82d7377 100644
--- a/src/tree/tree_model.cc
+++ b/src/tree/tree_model.cc
@@ -829,8 +829,9 @@ void RegTree::ExpandNode(bst_node_t nid, unsigned split_index, bst_float split_v
   this->split_types_.HostVector().at(nid) = FeatureType::kNumerical;
 }
 
-void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split_cond,
-                         bool default_left, linalg::VectorView<float const> base_weight,
+void RegTree::ExpandNode(Context const* ctx, bst_node_t nidx, bst_feature_t split_index,
+                         float split_cond, bool default_left,
+                         linalg::VectorView<float const> base_weight,
                          linalg::VectorView<float const> left_weight,
                          linalg::VectorView<float const> right_weight, float loss_chg,
                          float sum_hess, float left_sum, float right_sum) {
@@ -839,8 +840,8 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split
   CHECK(this->p_mt_tree_);
   CHECK_GT(param_.size_leaf_vector, 1);
 
-  this->p_mt_tree_->Expand(nidx, split_index, split_cond, default_left, base_weight, left_weight,
-                           right_weight, loss_chg, sum_hess, left_sum, right_sum);
+  this->p_mt_tree_->Expand(ctx, nidx, split_index, split_cond, default_left, base_weight,
+                           left_weight, right_weight, loss_chg, sum_hess, left_sum, right_sum);
 
   split_types_.HostVector().resize(this->Size(), FeatureType::kNumerical);
   split_categories_segments_.HostVector().resize(this->Size());
@@ -877,14 +878,14 @@ void RegTree::ExpandCategorical(bst_node_t nidx, bst_feature_t split_index,
   h_split_categories_segments.at(nidx).size = split_cat.size();
 }
 
-void RegTree::ExpandCategorical(bst_node_t nidx, bst_feature_t split_index,
+void RegTree::ExpandCategorical(Context const* ctx, bst_node_t nidx, bst_feature_t split_index,
                                 common::Span<common::KCatBitField::value_type> split_cat,
                                 bool default_left, linalg::VectorView<float const> base_weight,
                                 linalg::VectorView<float const> left_weight,
                                 linalg::VectorView<float const> right_weight, float loss_chg,
                                 float sum_hess, float left_sum, float right_sum) {
   CHECK(IsMultiTarget());
-  this->ExpandNode(nidx, split_index, DftBadValue(), default_left, base_weight, left_weight,
+  this->ExpandNode(ctx, nidx, split_index, DftBadValue(), default_left, base_weight, left_weight,
                    right_weight, loss_chg, sum_hess, left_sum, right_sum);
 
   auto& h_split_categories = split_categories_.HostVector();
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 09eb5f58c762..974631067ed5 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -164,7 +164,7 @@ class ColMaker : public TreeUpdater {
           colmaker_train_param_{colmaker_train_param},
           ctx_{ctx},
           column_sampler_{std::move(column_sampler)},
-          tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()),
+          tree_evaluator_(ctx, param_, column_densities.size()),
           interaction_constraints_{std::move(_interaction_constraints)},
           column_densities_(column_densities) {}
     // update one tree, growing
@@ -186,7 +186,7 @@ class ColMaker : public TreeUpdater {
           int cleft = (*p_tree)[nid].LeftChild();
           int cright = (*p_tree)[nid].RightChild();
 
-          tree_evaluator_.AddSplit(nid, cleft, cright, snode_[nid].best.SplitIndex(),
+          tree_evaluator_.AddSplit(ctx_, nid, cleft, cright, snode_[nid].best.SplitIndex(),
                                    snode_[cleft].weight, snode_[cright].weight);
           interaction_constraints_.Split(nid, snode_[nid].best.SplitIndex(), cleft, cright);
         }
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 9ebdf222d57c..57f77f96901a 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -145,7 +145,7 @@ struct GPUHistMakerDevice {
                      std::shared_ptr<common::ColumnSampler> column_sampler, MetaInfo const& info,
                      std::vector<bst_idx_t> batch_ptr,
                      std::shared_ptr<common::HistogramCuts const> cuts, bool dense_compressed)
-      : evaluator_{_param, static_cast<bst_feature_t>(info.num_col_), ctx->Device()},
+      : evaluator_{ctx, _param, static_cast<bst_feature_t>(info.num_col_)},
         ctx_{ctx},
         column_sampler_{std::move(column_sampler)},
         batch_ptr_{std::move(batch_ptr)},
@@ -647,7 +647,7 @@ struct GPUHistMakerDevice {
                       candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight,
                       candidate.split.loss_chg, parent_hess, left_hess, right_hess);
     }
-    evaluator_.ApplyTreeSplit(candidate, p_tree);
+    evaluator_.ApplyTreeSplit(ctx_, candidate, p_tree);
 
     const auto& parent = tree[candidate.nidx];
     interaction_constraints.Split(candidate.nidx, parent.SplitIndex(), parent.LeftChild(),
diff --git a/src/tree/updater_gpu_hist.cuh b/src/tree/updater_gpu_hist.cuh
index b0c0e00fb521..d5acefb89aee 100644
--- a/src/tree/updater_gpu_hist.cuh
+++ b/src/tree/updater_gpu_hist.cuh
@@ -235,7 +235,7 @@ class MultiTargetHistMaker {
     auto n_targets = d_gpair.Shape(1);
 
     // Calculate the root sum
-    this->evaluator_.AllocNodeSum(RegTree::kRoot, n_targets);
+    this->evaluator_.AllocNodeSum(RegTree::kRoot, n_targets, ctx_->CUDACtx()->Stream());
     auto d_root_sum = this->evaluator_.GetNodeSum(RegTree::kRoot, n_targets);
     CalcRootSum(this->ctx_, d_gpair, d_root_sum);
 
@@ -262,7 +262,7 @@ class MultiTargetHistMaker {
     auto weights = this->evaluator_.GetNodeWeights(n_targets);
     // Root's sum_hess is the sum of left and right child hessians
     float root_sum_hess = static_cast<float>(entry.left_sum + entry.right_sum);
-    p_tree->SetRoot(linalg::MakeVec(this->ctx_->Device(), weights.Base(RegTree::kRoot)),
+    p_tree->SetRoot(this->ctx_, linalg::MakeVec(this->ctx_->Device(), weights.Base(RegTree::kRoot)),
                     root_sum_hess);
 
     return entry;
@@ -286,9 +286,10 @@ class MultiTargetHistMaker {
       float left_sum = static_cast<float>(candidate.left_sum);
       float right_sum = static_cast<float>(candidate.right_sum);
       float sum_hess = left_sum + right_sum;
-      p_tree->ExpandNode(candidate.nidx, candidate.split.findex, candidate.split.fvalue,
-                         candidate.split.dir == kLeftDir, linalg::MakeVec(h_base_weight),
-                         linalg::MakeVec(h_left_weight), linalg::MakeVec(h_right_weight), loss_chg,
+      p_tree->ExpandNode(this->ctx_, candidate.nidx, candidate.split.findex,
+                         candidate.split.fvalue, candidate.split.dir == kLeftDir,
+                         linalg::MakeVec(h_base_weight), linalg::MakeVec(h_left_weight),
+                         linalg::MakeVec(h_right_weight), loss_chg,
                          sum_hess, left_sum, right_sum);
     }
 
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 988a493c102f..d83ce2be1608 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -253,7 +253,7 @@ class MultiTargetHistBuilder {
     for (bst_target_t t{0}; t < n_targets; ++t) {
       root_sum_hess += static_cast<float>(h_root_sum(t).GetHess());
     }
-    p_tree->SetRoot(weight_t, root_sum_hess);
+    p_tree->SetRoot(ctx_, weight_t, root_sum_hess);
     std::vector<BoundedHistCollection const *> hists;
     std::vector<MultiExpandEntry> nodes{{RegTree::kRoot, 0}};
 
diff --git a/tests/cpp/common/test_device_compression.cu b/tests/cpp/common/test_device_compression.cu
index 99d246b0325b..9a44fd3da21b 100644
--- a/tests/cpp/common/test_device_compression.cu
+++ b/tests/cpp/common/test_device_compression.cu
@@ -26,7 +26,7 @@ TEST(NvComp, Snappy) {
 #endif
   auto ctx = MakeCUDACtx(0);
   auto cuctx = ctx.CUDACtx();
-  dh::DeviceUVector<common::CompressedByteT> in(1024);
+  dh::DeviceUVector<common::CompressedByteT> in(1024, cuctx->Stream());
   thrust::sequence(ctx.CUDACtx()->CTP(), in.begin(), in.end(), 0);
   dh::DeviceUVector<std::uint8_t> compr;
 
@@ -57,7 +57,7 @@ class TestNvComp : public ::testing::TestWithParam<std::tuple<std::size_t, std::
     auto ctx = MakeCUDACtx(0);
     auto cuctx = ctx.CUDACtx();
 
-    dh::DeviceUVector<common::CompressedByteT> in(n_bytes);
+    dh::DeviceUVector<common::CompressedByteT> in(n_bytes, cuctx->Stream());
     thrust::sequence(ctx.CUDACtx()->CTP(), in.begin(), in.end(), 0);
     dh::DeviceUVector<std::uint8_t> compr;
 
diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu
index 16a847648eb7..9313f9584082 100644
--- a/tests/cpp/common/test_device_vector.cu
+++ b/tests/cpp/common/test_device_vector.cu
@@ -33,33 +33,35 @@ TEST(AsyncPoolAllocator, Basic) {
 #endif  // !defined(XGBOOST_USE_RMM)
 
 TEST(DeviceUVector, Basic) {
+  auto stream = xgboost::curt::DefaultStream();
+
   GlobalMemoryLogger().Clear();
   std::int32_t verbosity{3};
   std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
   DeviceUVector<float> uvec;
-  uvec.resize(12);
+  uvec.resize(12, stream);
   auto peak = GlobalMemoryLogger().PeakMemory();
   auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size();
   ASSERT_EQ(peak, n_bytes);
   std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
 
-  DeviceUVector<double> uvec1{16};
+  DeviceUVector<double> uvec1{16, stream};
   ASSERT_EQ(uvec1.size(), 16);
-  uvec1.resize(3);
+  uvec1.resize(3, stream);
   ASSERT_EQ(uvec1.size(), 3);
   ASSERT_EQ(uvec1.Capacity(), 16);
   ASSERT_EQ(std::distance(uvec1.begin(), uvec1.end()), uvec1.size());
   auto orig = uvec1.size();
 
   thrust::sequence(dh::CachingThrustPolicy(), uvec1.begin(), uvec1.end(), 0);
-  uvec1.resize(32);
+  uvec1.resize(32, stream);
   ASSERT_EQ(uvec1.size(), 32);
   ASSERT_EQ(uvec1.Capacity(), 32);
   auto eq = thrust::equal(dh::CachingThrustPolicy(), uvec1.cbegin(), uvec1.cbegin() + orig,
                           thrust::make_counting_iterator(0));
   ASSERT_TRUE(eq);
 
-  uvec1.clear();
+  uvec1.clear(stream);
   ASSERT_EQ(uvec1.size(), 0);
   ASSERT_EQ(uvec1.Capacity(), 32);
 }
@@ -151,7 +153,7 @@ TEST(TestVirtualMem, Version) {
   PinnedMemory pinned;
 #if defined(xgboost_IS_WIN)
   ASSERT_FALSE(pinned.IsVm());
-#else  // defined(xgboost_IS_WIN)
+#else   // defined(xgboost_IS_WIN)
   if (major == 12 && minor >= 5 || major > 12) {
     ASSERT_TRUE(pinned.IsVm());
   } else {
diff --git a/tests/cpp/common/test_gpu_compressed_iterator.cu b/tests/cpp/common/test_gpu_compressed_iterator.cu
index e633eefb9599..c5d149956a9e 100644
--- a/tests/cpp/common/test_gpu_compressed_iterator.cu
+++ b/tests/cpp/common/test_gpu_compressed_iterator.cu
@@ -32,11 +32,9 @@ struct ReadSymbolFunction {
   CompressedIterator<int> ci;
   int* output_data_d;
   ReadSymbolFunction(CompressedIterator<int> ci, int* output_data_d)
-    : ci(ci), output_data_d(output_data_d) {}
+      : ci(ci), output_data_d(output_data_d) {}
 
-  __device__ void operator()(size_t i) {
-    output_data_d[i] = ci[i];
-  }
+  __device__ void operator()(size_t i) { output_data_d[i] = ci[i]; }
 };
 
 TEST(CompressedIterator, TestGPU) {
@@ -49,20 +47,17 @@ TEST(CompressedIterator, TestGPU) {
   for (auto alphabet_size : test_cases) {
     for (int i = 0; i < repetitions; i++) {
       std::vector<int> input(num_elements);
-      std::generate(input.begin(), input.end(),
-        [=]() { return rand() % alphabet_size; });
+      std::generate(input.begin(), input.end(), [=]() { return rand() % alphabet_size; });
       CompressedBufferWriter cbw(alphabet_size);
       thrust::device_vector<int> input_d(input);
 
       thrust::device_vector<unsigned char> buffer_d(
-        CompressedBufferWriter::CalculateBufferSize(input.size(),
-          alphabet_size));
+          CompressedBufferWriter::CalculateBufferSize(input.size(), alphabet_size));
 
       // write the data on device
       auto input_data_d = input_d.data().get();
       auto buffer_data_d = buffer_d.data().get();
-      dh::LaunchN(input_d.size(),
-                  WriteSymbolFunction(cbw, buffer_data_d, input_data_d));
+      dh::LaunchN(input_d.size(), WriteSymbolFunction(cbw, buffer_data_d, input_data_d));
 
       // read the data on device
       CompressedIterator<int> ci(buffer_d.data().get(), alphabet_size);
@@ -89,7 +84,7 @@ class TestDoubleCompressedIter : public ::testing::TestWithParam<std::size_t> {
   std::size_t n_symbols_{11};
 
   void SetUp() override {
-    input_.resize(n_symbols_ * 3);
+    input_.resize(n_symbols_ * 3, ctx_.CUDACtx()->Stream());
     auto policy = ctx_.CUDACtx()->CTP();
     for (std::size_t i = 0; i < 3; ++i) {
       auto beg = input_.begin() + n_symbols_ * i;
diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu
index 6c5763c58cab..25ca3b83f120 100644
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -314,7 +314,7 @@ template <typename Adapter>
 auto MakeUnweightedCutsForTest(Context const* ctx, Adapter adapter, int32_t num_bins,
                                float missing) {
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), DeviceOrd::CUDA(0));
+  SketchContainer sketch_container(ctx, ft, num_bins, adapter.NumColumns());
   MetaInfo info;
   AdapterDeviceSketch(ctx, adapter.Value(), num_bins, info, missing, &sketch_container);
   return sketch_container.MakeCuts(ctx, info.IsColumnSplit());
@@ -371,7 +371,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, int32_t num_b
   }
 
   ASSERT_EQ(info.feature_types.Size(), 1);
-  SketchContainer container(info.feature_types, num_bins, 1, DeviceOrd::CUDA(0));
+  SketchContainer container(&ctx, info.feature_types, num_bins, 1);
   AdapterDeviceSketch(&ctx, adapter.Value(), num_bins, info,
                       std::numeric_limits<float>::quiet_NaN(), &container);
   auto cuts = container.MakeCuts(&ctx, info.IsColumnSplit());
@@ -590,7 +590,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
   data::CupyAdapter adapter(m);
   auto const& batch = adapter.Value();
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_container(ft, kBins, kCols, DeviceOrd::CUDA(0));
+  SketchContainer sketch_container(&ctx, ft, kBins, kCols);
   AdapterDeviceSketch(&ctx, adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
                       &sketch_container);
 
@@ -629,7 +629,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
       // https://github.com/dmlc/xgboost/issues/7946
       h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups);
     }
-    SketchContainer sketch_container{ft, kBins, kCols, DeviceOrd::CUDA(0)};
+    SketchContainer sketch_container{&ctx, ft, kBins, kCols};
     AdapterDeviceSketch(&ctx, adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
                         &sketch_container);
     weighted = sketch_container.MakeCuts(&ctx, info.IsColumnSplit());
diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu
index 7c3c2cd070cf..4066919c0723 100644
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -1,9 +1,10 @@
 /**
- * Copyright 2018-2024, XGBoost contributors
+ * Copyright 2018-2026, XGBoost contributors
  */
 #include <gtest/gtest.h>
 #include <thrust/equal.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <xgboost/context.h>
 #include <xgboost/host_device_vector.h>
 
 #include "../../../src/common/cuda_rt_utils.h"  // for SetDevice
@@ -25,28 +26,22 @@ struct HostDeviceVectorSetDeviceHandler {
     SetCudaSetDeviceHandler(f);
   }
 
-  ~HostDeviceVectorSetDeviceHandler() {
-    SetCudaSetDeviceHandler(nullptr);
-  }
+  ~HostDeviceVectorSetDeviceHandler() { SetCudaSetDeviceHandler(nullptr); }
 };
 
-void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<int> *v) {
-  // create the vector
-  v->SetDevice(device);
+void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<int>* v,
+                          Context const* ctx) {
+  v->SetDevice(device, ctx);
   v->Resize(n);
 
   ASSERT_EQ(v->Size(), n);
   ASSERT_EQ(v->Device(), device);
-  // ensure that the device have read-write access
   ASSERT_TRUE(v->DeviceCanRead());
   ASSERT_TRUE(v->DeviceCanWrite());
-  // ensure that the host has no access
   ASSERT_FALSE(v->HostCanRead());
   ASSERT_FALSE(v->HostCanWrite());
 
-  // fill in the data on the host
-  std::vector<int>& data_h = v->HostVector();
-  // ensure that the host has full access, while the device have none
+  std::vector<int>& data_h = v->HostVector(ctx);
   ASSERT_TRUE(v->HostCanRead());
   ASSERT_TRUE(v->HostCanWrite());
   ASSERT_FALSE(v->DeviceCanRead());
@@ -55,137 +50,134 @@ void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<int> *v)
   std::copy_n(thrust::make_counting_iterator(0), n, data_h.begin());
 }
 
-void PlusOne(HostDeviceVector<int> *v) {
+void PlusOne(HostDeviceVector<int>* v) {
   auto device = v->Device();
   SetDeviceForTest(device);
   thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v),
-                    [=]__device__(unsigned int a){ return a + 1; });
+                    [=] __device__(unsigned int a) { return a + 1; });
   ASSERT_TRUE(v->DeviceCanWrite());
 }
 
-void CheckDevice(HostDeviceVector<int>* v,
-                 size_t size,
-                 unsigned int first,
-                 GPUAccess access) {
+void CheckDevice(HostDeviceVector<int>* v, size_t size, unsigned int first, GPUAccess access) {
   ASSERT_EQ(v->Size(), size);
   SetDeviceForTest(v->Device());
 
-  ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v),
-                            thrust::make_counting_iterator(first)));
+  ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v), thrust::make_counting_iterator(first)));
   ASSERT_TRUE(v->DeviceCanRead());
-  // ensure that the device has at most the access specified by access
   ASSERT_EQ(v->DeviceCanWrite(), access == GPUAccess::kWrite);
   ASSERT_EQ(v->HostCanRead(), access == GPUAccess::kRead);
   ASSERT_FALSE(v->HostCanWrite());
 
-  ASSERT_TRUE(thrust::equal(dh::tbegin(*v), dh::tend(*v),
-                            thrust::make_counting_iterator(first)));
+  ASSERT_TRUE(thrust::equal(dh::tbegin(*v), dh::tend(*v), thrust::make_counting_iterator(first)));
   ASSERT_TRUE(v->DeviceCanRead());
   ASSERT_TRUE(v->DeviceCanWrite());
   ASSERT_FALSE(v->HostCanRead());
   ASSERT_FALSE(v->HostCanWrite());
 }
 
-void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
-  const std::vector<int>& data_h = access == GPUAccess::kNone ?
-    v->HostVector() : v->ConstHostVector();
+void CheckHost(HostDeviceVector<int>* v, GPUAccess access, Context const* ctx) {
+  const std::vector<int>& data_h =
+      access == GPUAccess::kNone ? v->HostVector(ctx) : v->ConstHostVector(ctx);
   for (size_t i = 0; i < v->Size(); ++i) {
     ASSERT_EQ(data_h.at(i), i + 1);
   }
   ASSERT_TRUE(v->HostCanRead());
   ASSERT_EQ(v->HostCanWrite(), access == GPUAccess::kNone);
   ASSERT_EQ(v->DeviceCanRead(), access == GPUAccess::kRead);
-  // the devices should have no write access
   ASSERT_FALSE(v->DeviceCanWrite());
 }
 
-void TestHostDeviceVector(size_t n, DeviceOrd device) {
+void TestHostDeviceVector(size_t n, Context const* ctx) {
   HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
   HostDeviceVector<int> v;
-  InitHostDeviceVector(n, device, &v);
+  InitHostDeviceVector(n, ctx->Device(), &v, ctx);
   CheckDevice(&v, n, 0, GPUAccess::kRead);
   PlusOne(&v);
   CheckDevice(&v, n, 1, GPUAccess::kWrite);
-  CheckHost(&v, GPUAccess::kRead);
-  CheckHost(&v, GPUAccess::kNone);
+  CheckHost(&v, GPUAccess::kRead, ctx);
+  CheckHost(&v, GPUAccess::kNone, ctx);
 }
 
 TEST(HostDeviceVector, Basic) {
   size_t n = 1001;
-  DeviceOrd device = DeviceOrd::CUDA(0);
-  TestHostDeviceVector(n, device);
+  auto ctx = Context{}.MakeCUDA(0);
+  TestHostDeviceVector(n, &ctx);
 }
 
 TEST(HostDeviceVector, Copy) {
   size_t n = 1001;
-  auto device = DeviceOrd::CUDA(0);
+  auto ctx = Context{}.MakeCUDA(0);
   HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
 
   HostDeviceVector<int> v;
   {
-    // a separate scope to ensure that v1 is gone before further checks
     HostDeviceVector<int> v1;
-    InitHostDeviceVector(n, device, &v1);
+    InitHostDeviceVector(n, ctx.Device(), &v1, &ctx);
     v.Resize(v1.Size());
-    v.Copy(v1);
+    v.Copy(v1, &ctx);
   }
   CheckDevice(&v, n, 0, GPUAccess::kRead);
   PlusOne(&v);
   CheckDevice(&v, n, 1, GPUAccess::kWrite);
-  CheckHost(&v, GPUAccess::kRead);
-  CheckHost(&v, GPUAccess::kNone);
+  CheckHost(&v, GPUAccess::kRead, &ctx);
+  CheckHost(&v, GPUAccess::kNone, &ctx);
 }
 
 TEST(HostDeviceVector, SetDevice) {
-  std::vector<int> h_vec (2345);
+  auto ctx = Context{}.MakeCUDA(0);
+
+  std::vector<int> h_vec(2345);
   for (size_t i = 0; i < h_vec.size(); ++i) {
     h_vec[i] = i;
   }
-  HostDeviceVector<int> vec (h_vec);
-  auto device = DeviceOrd::CUDA(0);
+  HostDeviceVector<int> vec(h_vec);
 
-  vec.SetDevice(device);
+  vec.SetDevice(ctx.Device(), &ctx);
   ASSERT_EQ(vec.Size(), h_vec.size());
-  vec.DeviceSpan();  // sync to device
+  vec.DeviceSpan(&ctx);  // sync to device
 
-  vec.SetDevice(DeviceOrd::CPU());  // pull back to cpu.
+  vec.SetDevice(DeviceOrd::CPU(), &ctx);  // pull back to cpu.
   ASSERT_EQ(vec.Size(), h_vec.size());
   ASSERT_EQ(vec.Device(), DeviceOrd::CPU());
 
-  auto h_vec_1 = vec.HostVector();
+  auto h_vec_1 = vec.HostVector(&ctx);
   ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
 }
 
 TEST(HostDeviceVector, Span) {
-  HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
-  vec.SetDevice(DeviceOrd::CUDA(0));
-  auto span = vec.DeviceSpan();
+  auto ctx = Context{}.MakeCUDA(0);
+
+  HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
+  vec.SetDevice(ctx.Device(), &ctx);
+  auto span = vec.DeviceSpan(&ctx);
   ASSERT_EQ(vec.Size(), span.size());
-  ASSERT_EQ(vec.DevicePointer(), span.data());
-  auto const_span = vec.ConstDeviceSpan();
+  ASSERT_EQ(vec.DevicePointer(&ctx), span.data());
+  auto const_span = vec.ConstDeviceSpan(&ctx);
   ASSERT_EQ(vec.Size(), const_span.size());
-  ASSERT_EQ(vec.ConstDevicePointer(), const_span.data());
+  ASSERT_EQ(vec.ConstDevicePointer(&ctx), const_span.data());
 
-  auto h_span = vec.ConstHostSpan();
+  auto h_span = vec.ConstHostSpan(&ctx);
   ASSERT_TRUE(vec.HostCanRead());
   ASSERT_FALSE(vec.HostCanWrite());
   ASSERT_EQ(h_span.size(), vec.Size());
-  ASSERT_EQ(h_span.data(), vec.ConstHostPointer());
+  ASSERT_EQ(h_span.data(), vec.ConstHostPointer(&ctx));
 
-  h_span = vec.HostSpan();
+  h_span = vec.HostSpan(&ctx);
   ASSERT_TRUE(vec.HostCanWrite());
 }
 
 TEST(HostDeviceVector, Empty) {
-  HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
-  HostDeviceVector<float> another { std::move(vec) };
+  HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
+  HostDeviceVector<float> another{std::move(vec)};
   ASSERT_FALSE(another.Empty());
   ASSERT_TRUE(vec.Empty());
 }
 
 TEST(HostDeviceVector, Resize) {
+  auto ctx = Context{}.MakeCUDA(0);
+
   auto check = [&](HostDeviceVector<float> const& vec) {
-    auto const& h_vec = vec.ConstHostSpan();
+    auto const& h_vec = vec.ConstHostSpan(&ctx);
     for (std::size_t i = 0; i < 4; ++i) {
       ASSERT_EQ(h_vec[i], i + 1);
     }
@@ -195,26 +187,26 @@ TEST(HostDeviceVector, Resize) {
   };
   {
     HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
-    vec.SetDevice(DeviceOrd::CUDA(0));
-    vec.ConstDeviceSpan();
+    vec.SetDevice(ctx.Device(), &ctx);
+    vec.ConstDeviceSpan(&ctx);
     ASSERT_TRUE(vec.DeviceCanRead());
     ASSERT_FALSE(vec.DeviceCanWrite());
-    vec.DeviceSpan();
-    vec.Resize(7, 3.0f);
+    vec.DeviceSpan(&ctx);
+    vec.Resize(&ctx, 7, 3.0f);
     ASSERT_TRUE(vec.DeviceCanWrite());
     check(vec);
   }
   {
-    HostDeviceVector<float> vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)};
+    HostDeviceVector<float> vec{{1.0f, 2.0f, 3.0f, 4.0f}, ctx.Device(), &ctx};
     ASSERT_TRUE(vec.DeviceCanWrite());
-    vec.Resize(7, 3.0f);
+    vec.Resize(&ctx, 7, 3.0f);
     ASSERT_TRUE(vec.DeviceCanWrite());
     check(vec);
   }
   {
     HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
     ASSERT_TRUE(vec.HostCanWrite());
-    vec.Resize(7, 3.0f);
+    vec.Resize(&ctx, 7, 3.0f);
     ASSERT_TRUE(vec.HostCanWrite());
     check(vec);
   }
diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu
index 6a34513db5b1..28060adca8d9 100644
--- a/tests/cpp/common/test_linalg.cu
+++ b/tests/cpp/common/test_linalg.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021-2025, XGBoost Contributors
+ * Copyright 2021-2026, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <thrust/equal.h>                       // for equal
@@ -21,17 +21,17 @@ namespace {
 void TestElementWiseKernel() {
   auto ctx = MakeCUDACtx(0);
   auto device = ctx.Device();
-  Tensor<float, 3> l{{2, 3, 4}, device};
+  Tensor<float, 3> l{{2, 3, 4}, device, kC, &ctx};
   {
     /**
      * Non-contiguous
      */
     // GPU view
-    auto t = l.View(device).Slice(linalg::All(), 1, linalg::All());
+    auto t = l.View(device, &ctx).Slice(linalg::All(), 1, linalg::All());
     ASSERT_FALSE(t.CContiguous());
     cuda_impl::TransformIdxKernel(&ctx, t, [] XGBOOST_DEVICE(std::size_t i, float) { return i; });
     // CPU view
-    t = l.View(DeviceOrd::CPU()).Slice(linalg::All(), 1, linalg::All());
+    t = l.View(DeviceOrd::CPU(), &ctx).Slice(linalg::All(), 1, linalg::All());
     std::size_t k = 0;
     for (size_t i = 0; i < l.Shape(0); ++i) {
       for (size_t j = 0; j < l.Shape(2); ++j) {
@@ -39,7 +39,7 @@ void TestElementWiseKernel() {
       }
     }
 
-    t = l.View(device).Slice(linalg::All(), 1, linalg::All());
+    t = l.View(device, &ctx).Slice(linalg::All(), 1, linalg::All());
     cuda_impl::ElementWiseKernel(
         t, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable { t(i, j) = i + j; });
 
@@ -55,11 +55,11 @@ void TestElementWiseKernel() {
     /**
      * Contiguous
      */
-    auto t = l.View(device);
+    auto t = l.View(device, &ctx);
     cuda_impl::TransformIdxKernel(&ctx, t, [] XGBOOST_DEVICE(size_t i, float) { return i; });
     ASSERT_TRUE(t.CContiguous());
     // CPU view
-    t = l.View(DeviceOrd::CPU());
+    t = l.View(DeviceOrd::CPU(), &ctx);
 
     size_t ind = 0;
     for (size_t i = 0; i < l.Shape(0); ++i) {
@@ -138,7 +138,7 @@ TEST(Linalg, SmallHistogram) {
 
   linalg::MatrixView<float> indices =
       linalg::MakeTensorView(&ctx, dh::ToSpan(values), values.size(), 1);
-  dh::CachingDeviceUVector<float> bins(n_bins);
+  dh::CachingDeviceUVector<float> bins(n_bins, ctx.CUDACtx()->Stream());
   HostDeviceVector<float> weights;
   SmallHistogram(&ctx, indices, common::MakeOptionalWeights(ctx.Device(), weights),
                  linalg::MakeTensorView(&ctx, dh::ToSpan(bins), bins.size()));
diff --git a/tests/cpp/common/test_linalg.h b/tests/cpp/common/test_linalg.h
index d79ed1422931..8ebeab5add3a 100644
--- a/tests/cpp/common/test_linalg.h
+++ b/tests/cpp/common/test_linalg.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2025, XGBoost Contributors
+ * Copyright 2025-2026, XGBoost Contributors
  */
 #pragma once
 
@@ -19,8 +19,9 @@ void TestLinalgDispatch(Context const* ctx, Fn&& fn) {
   std::iota(data.begin(), data.end(), 0.0);
   Vector<double> vec(data.begin(), data.end(), {data.size()}, DeviceOrd::CPU());
 
-  TransformKernel(ctx, vec.View(ctx->Device()), [=] XGBOOST_DEVICE(double v) { return fn(v); });
-  auto h_v = vec.HostView();
+  TransformKernel(ctx, vec.View(ctx->Device(), ctx),
+                  [=] XGBOOST_DEVICE(double v) { return fn(v); });
+  auto h_v = vec.HostView(ctx);
   for (std::size_t i = 0; i < h_v.Size(); ++i) {
     ASSERT_EQ(h_v(i), fn(i));
   }
diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu
index 43090312cef5..47a4ac5def83 100644
--- a/tests/cpp/common/test_quantile.cu
+++ b/tests/cpp/common/test_quantile.cu
@@ -40,7 +40,7 @@ TEST(GPUQuantile, Basic) {
   auto ctx = MakeCUDACtx(0);
   constexpr size_t kCols = 100, kBins = 256;
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch(ft, kBins, kCols, ctx.Device());
+  SketchContainer sketch(&ctx, ft, kBins, kCols);
   dh::caching_device_vector<Entry> entries;
   dh::device_vector<bst_idx_t> cuts_ptr(kCols + 1);
   thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
@@ -86,7 +86,7 @@ TEST(GPUQuantile, Prune) {
   RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) {
     auto ctx = MakeCUDACtx(0);
     HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch(ft, n_bins, kCols, ctx.Device());
+    SketchContainer sketch(&ctx, ft, n_bins, kCols);
 
     HostDeviceVector<float> storage;
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
@@ -124,7 +124,7 @@ TEST(GPUQuantile, PruneDuplicated) {
   RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) {
     auto ctx = MakeCUDACtx(0);
     HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch(ft, n_bins, kCols, ctx.Device());
+    SketchContainer sketch(&ctx, ft, n_bins, kCols);
 
     HostDeviceVector<float> storage;
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
@@ -165,7 +165,7 @@ TEST(GPUQuantile, MergeEmpty) {
   size_t n_bins = 10;
   auto ctx = MakeCUDACtx(0);
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device());
+  SketchContainer sketch_0(&ctx, ft, n_bins, kCols);
   HostDeviceVector<float> storage_0;
   std::string interface_str_0 =
       RandomDataGenerator{kRows, kCols, 0}.Device(ctx.Device()).GenerateArrayInterface(&storage_0);
@@ -205,7 +205,7 @@ TEST(GPUQuantile, MergeBasic) {
   RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) {
     auto ctx = MakeCUDACtx(0);
     HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device());
+    SketchContainer sketch_0(&ctx, ft, n_bins, kCols);
     HostDeviceVector<float> storage_0;
     std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}
                                       .Device(ctx.Device())
@@ -215,7 +215,7 @@ TEST(GPUQuantile, MergeBasic) {
     AdapterDeviceSketch(&ctx, adapter_0.Value(), n_bins, info,
                         std::numeric_limits<float>::quiet_NaN(), &sketch_0);
 
-    SketchContainer sketch_1(ft, n_bins, kCols, ctx.Device());
+    SketchContainer sketch_1(&ctx, ft, n_bins, kCols);
     HostDeviceVector<float> storage_1;
     std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0}
                                       .Device(ctx.Device())
@@ -249,7 +249,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
   MetaInfo info;
   int32_t seed = 0;
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_0(ft, n_bins, cols, ctx.Device());
+  SketchContainer sketch_0(&ctx, ft, n_bins, cols);
   HostDeviceVector<float> storage_0;
   std::string interface_str_0 = RandomDataGenerator{rows, cols, 0}
                                     .Device(ctx.Device())
@@ -260,7 +260,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
                       std::numeric_limits<float>::quiet_NaN(), &sketch_0);
 
   size_t f_rows = rows * frac;
-  SketchContainer sketch_1(ft, n_bins, cols, ctx.Device());
+  SketchContainer sketch_1(&ctx, ft, n_bins, cols);
   HostDeviceVector<float> storage_1;
   std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0}
                                     .Device(ctx.Device())
@@ -316,8 +316,8 @@ TEST(GPUQuantile, MergeCategorical) {
 
   HostDeviceVector<FeatureType> ft;
   ft.HostVector() = {FeatureType::kCategorical, FeatureType::kNumerical};
-  SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device());
-  SketchContainer sketch_1(ft, n_bins, kCols, ctx.Device());
+  SketchContainer sketch_0(&ctx, ft, n_bins, kCols);
+  SketchContainer sketch_1(&ctx, ft, n_bins, kCols);
 
   std::vector<Entry> entries_0{{0, 0.0f}, {0, 0.0f}, {0, 1.0f}, {0, 2.0f},
                                {0, 2.0f}, {1, 0.1f}, {1, 0.2f}, {1, 0.4f}};
@@ -358,7 +358,7 @@ TEST(GPUQuantile, MultiMerge) {
     // Set up single node version
     HostDeviceVector<FeatureType> ft;
     auto ctx = MakeCUDACtx(0);
-    SketchContainer sketch_on_single_node(ft, n_bins, kCols, ctx.Device());
+    SketchContainer sketch_on_single_node(&ctx, ft, n_bins, kCols);
 
     size_t intermediate_num_cuts =
         std::min(kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
@@ -371,7 +371,7 @@ TEST(GPUQuantile, MultiMerge) {
                                       .GenerateArrayInterface(&storage);
       data::CupyAdapter adapter(interface_str);
       HostDeviceVector<FeatureType> ft;
-      containers.emplace_back(ft, n_bins, kCols, ctx.Device());
+      containers.emplace_back(&ctx, ft, n_bins, kCols);
       AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info,
                           std::numeric_limits<float>::quiet_NaN(), &containers.back());
     }
@@ -421,7 +421,7 @@ void TestAllReduceBasic() {
      * Set up single node version.
      */
     HostDeviceVector<FeatureType> ft({}, device);
-    SketchContainer sketch_on_single_node(ft, n_bins, kCols, device);
+    SketchContainer sketch_on_single_node(&ctx, ft, n_bins, kCols);
 
     size_t intermediate_num_cuts =
         std::min(kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
@@ -434,7 +434,7 @@ void TestAllReduceBasic() {
                                       .GenerateArrayInterface(&storage);
       data::CupyAdapter adapter(interface_str);
       HostDeviceVector<FeatureType> ft({}, device);
-      containers.emplace_back(ft, n_bins, kCols, device);
+      containers.emplace_back(&ctx, ft, n_bins, kCols);
       AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info,
                           std::numeric_limits<float>::quiet_NaN(), &containers.back());
     }
@@ -450,7 +450,7 @@ void TestAllReduceBasic() {
      * the exact same copy of data.
      */
     auto rank = collective::GetRank();
-    SketchContainer sketch_distributed(ft, n_bins, kCols, device);
+    SketchContainer sketch_distributed(&ctx, ft, n_bins, kCols);
     HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
                                     .Device(device)
@@ -561,7 +561,7 @@ void TestSameOnAllWorkers() {
     auto const device = DeviceOrd::CUDA(GPUIDX);
     Context ctx = MakeCUDACtx(device.ordinal);
     HostDeviceVector<FeatureType> ft({}, device);
-    SketchContainer sketch_distributed(ft, n_bins, kCols, device);
+    SketchContainer sketch_distributed(&ctx, ft, n_bins, kCols);
     HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
                                     .Device(device)
@@ -637,7 +637,7 @@ TEST(GPUQuantile, Push) {
   columns_ptr[1] = kRows;
 
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch(ft, n_bins, kCols, ctx.Device());
+  SketchContainer sketch(&ctx, ft, n_bins, kCols);
   sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows,
               kRows, {});
 
@@ -678,7 +678,7 @@ TEST(GPUQuantile, MultiColPush) {
 
   int32_t n_bins = 16;
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch(ft, n_bins, kCols, ctx.Device());
+  SketchContainer sketch(&ctx, ft, n_bins, kCols);
   dh::device_vector<Entry> d_entries{entries};
 
   dh::device_vector<size_t> columns_ptr(kCols + 1, 0);
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 3cb357845a0b..cd30f91fa8fc 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -249,7 +249,7 @@ void RandomDataGenerator::GenerateDense(HostDeviceVector<float>* out) const {
   CHECK(out);
 
   SimpleLCG lcg{lcg_};
-  out->Resize(rows_ * cols_, 0);
+  out->Resize(rows_ * cols_);
   auto& h_data = out->HostVector();
   float sparsity = sparsity_ * (upper_ - lower_) + lower_;
   for (auto& v : h_data) {
diff --git a/tests/cpp/objective/test_objective.cc b/tests/cpp/objective/test_objective.cc
index 5df765789c3c..f98159654726 100644
--- a/tests/cpp/objective/test_objective.cc
+++ b/tests/cpp/objective/test_objective.cc
@@ -27,6 +27,7 @@ TEST(Objective, PredTransform) {
   xgboost::Context tparam;
   tparam.UpdateAllowUnknown(Args{{"device", "cuda"}});
   size_t n = 100;
+  Context ctx;
 
   for (const auto& entry : ::dmlc::Registry<::xgboost::ObjFunctionReg>::List()) {
     std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create(entry->name, &tparam)};
@@ -40,7 +41,7 @@ TEST(Objective, PredTransform) {
       obj->Configure(Args{{"expectile_alpha", "0.5"}});
     }
     HostDeviceVector<float> predts;
-    predts.Resize(n, 3.14f);  // prediction is performed on host.
+    predts.Resize(&ctx, n, 3.14f);  // prediction is performed on host.
     ASSERT_FALSE(predts.DeviceCanRead());
     obj->PredTransform(&predts);
     ASSERT_FALSE(predts.DeviceCanRead());
diff --git a/tests/cpp/objective/test_objective_helpers.h b/tests/cpp/objective/test_objective_helpers.h
index 4d049772b98f..eaaf25c3b744 100644
--- a/tests/cpp/objective/test_objective_helpers.h
+++ b/tests/cpp/objective/test_objective_helpers.h
@@ -14,7 +14,7 @@ namespace xgboost {
 inline auto MakePositionsForTest(bst_idx_t n_samples, bst_node_t left_nidx, bst_node_t right_nidx,
                                  HostDeviceVector<bst_node_t>* p_position) {
   HostDeviceVector<bst_node_t>& position = *p_position;
-  position.Resize(n_samples, 0);
+  position.Resize(n_samples);
   auto& h_position = position.HostVector();
   for (size_t i = 0; i < n_samples; ++i) {
     if (i < n_samples / 2) {
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index b8b97c701611..a1457490fa38 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -215,7 +215,7 @@ void TestUpdatePredictionCache(bool use_subsampling) {
   }
 
   PredictionCacheEntry predtion_cache;
-  predtion_cache.predictions.Resize(kRows * kClasses, 0);
+  predtion_cache.predictions.Resize(&ctx, kRows * kClasses, 0.0f);
   // after one training iteration predtion_cache is filled with cached in QuantileHistMaker
   // prediction values
   gbm->DoBoost(dmat.get(), &gpair, &predtion_cache, nullptr);
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index 5cb8e7df425d..4d3e7c7523fb 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -768,8 +768,8 @@ void TestVectorLeafPrediction(Context const *ctx) {
   std::vector<float> r_w(mparam.LeafLength(), 2.0f);
 
   auto &tree = trees.front();
-  tree->SetRoot(linalg::MakeVec(p_w.data(), p_w.size()), /*sum_hess=*/1.0f);
-  tree->ExpandNode(0, static_cast<bst_feature_t>(1), 2.0, true,
+  tree->SetRoot(ctx, linalg::MakeVec(p_w.data(), p_w.size()), /*sum_hess=*/1.0f);
+  tree->ExpandNode(ctx, 0, static_cast<bst_feature_t>(1), 2.0, true,
                    linalg::MakeVec(p_w.data(), p_w.size()), linalg::MakeVec(l_w.data(), l_w.size()),
                    linalg::MakeVec(r_w.data(), r_w.size()), /*loss_chg=*/0.5f, /*sum_hess=*/1.0f,
                    /*left_sum=*/0.6f, /*right_sum=*/0.4f);
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index c4c34ecafc61..ed0172c30749 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -57,7 +57,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
                                           cuts_.cut_values_.ConstDeviceSpan(),
                                           false};
 
-  GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  GPUHistEvaluator evaluator{&ctx, param_, static_cast<bst_feature_t>(feature_set.size())};
 
   evaluator.Reset(&ctx, cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false);
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
@@ -98,7 +98,7 @@ TEST(GpuHist, PartitionBasic) {
       false,
   };
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  GPUHistEvaluator evaluator{&ctx, tparam, static_cast<bst_feature_t>(feature_set.size())};
   evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   {
@@ -208,7 +208,7 @@ TEST(GpuHist, PartitionTwoFeatures) {
                                           cuts.cut_values_.ConstDeviceSpan(),
                                           false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  GPUHistEvaluator evaluator{&ctx, tparam, static_cast<bst_feature_t>(feature_set.size())};
   evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   {
@@ -266,7 +266,7 @@ TEST(GpuHist, PartitionTwoNodes) {
                                           cuts.cut_values_.ConstDeviceSpan(),
                                           false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  GPUHistEvaluator evaluator{&ctx, tparam, static_cast<bst_feature_t>(feature_set.size())};
   evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   {
@@ -321,7 +321,7 @@ void TestEvaluateSingleSplit(bool is_categorical) {
                                           cuts.cut_values_.ConstDeviceSpan(),
                                           false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  GPUHistEvaluator evaluator{&ctx, tparam, static_cast<bst_feature_t>(feature_set.size())};
   evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
 
@@ -354,7 +354,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
   EvaluateSplitSharedInputs shared_inputs{
       param, quantiser, {}, dh::ToSpan(feature_segments), dh::ToSpan(feature_values), false};
 
-  GPUHistEvaluator evaluator(tparam, feature_set.size(), FstCU());
+  GPUHistEvaluator evaluator(&ctx, tparam, feature_set.size());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
 
   EXPECT_EQ(result.findex, 0);
@@ -367,7 +367,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
 TEST(GpuHist, EvaluateSingleSplitEmpty) {
   auto ctx = MakeCUDACtx(0);
   TrainParam tparam = ZeroParam();
-  GPUHistEvaluator evaluator(tparam, 1, FstCU());
+  GPUHistEvaluator evaluator(&ctx, tparam, 1);
   DeviceSplitCandidate result =
       evaluator
           .EvaluateSingleSplit(
@@ -398,7 +398,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
   EvaluateSplitSharedInputs shared_inputs{
       param, quantiser, {}, dh::ToSpan(feature_segments), dh::ToSpan(feature_values), false};
 
-  GPUHistEvaluator evaluator(tparam, 2, FstCU());
+  GPUHistEvaluator evaluator(&ctx, tparam, 2);
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
 
   EXPECT_EQ(result.findex, 1);
@@ -426,7 +426,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
   EvaluateSplitSharedInputs shared_inputs{
       param, quantiser, {}, dh::ToSpan(feature_segments), dh::ToSpan(feature_values), false};
 
-  GPUHistEvaluator evaluator(tparam, 2, FstCU());
+  GPUHistEvaluator evaluator(&ctx, tparam, 2);
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
 
   EXPECT_EQ(result.findex, 0);
@@ -456,7 +456,7 @@ TEST(GpuHist, EvaluateSplits) {
   EvaluateSplitSharedInputs shared_inputs{
       param, quantiser, {}, dh::ToSpan(feature_segments), dh::ToSpan(feature_values), false};
 
-  GPUHistEvaluator evaluator{tparam, 2, FstCU()};
+  GPUHistEvaluator evaluator{&ctx, tparam, 2};
   dh::device_vector<EvaluateSplitInputs> inputs =
       std::vector<EvaluateSplitInputs>{input_left, input_right};
   evaluator.LaunchEvaluateSplits(&ctx, input_left.feature_set.size(), dh::ToSpan(inputs),
@@ -474,7 +474,7 @@ TEST(GpuHist, EvaluateSplits) {
 TEST_F(TestPartitionBasedSplit, GpuHist) {
   auto ctx = MakeCUDACtx(0);
   dh::device_vector<FeatureType> ft{std::vector<FeatureType>{FeatureType::kCategorical}};
-  GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(info_.num_col_), ctx.Device()};
+  GPUHistEvaluator evaluator{&ctx, param_, static_cast<bst_feature_t>(info_.num_col_)};
 
   cuts_.cut_ptrs_.SetDevice(ctx.Device());
   cuts_.cut_values_.SetDevice(ctx.Device());
@@ -539,7 +539,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
                                           cuts.cut_values_.ConstDeviceSpan(),
                                           false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  GPUHistEvaluator evaluator{&ctx, tparam, static_cast<bst_feature_t>(feature_set.size())};
   evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true);
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
 
diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
index 15aca958504a..3ade5c842d45 100644
--- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
+++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
@@ -32,7 +32,7 @@ void TestUpdatePositionBatch() {
     EXPECT_EQ(rows[i], i);
   }
   std::vector<int> extra_data = {0};
-  dh::DeviceUVector<cuda_impl::RowIndexT> ridx_tmp(kNumRows);
+  dh::DeviceUVector<cuda_impl::RowIndexT> ridx_tmp(kNumRows, ctx.CUDACtx()->Stream());
   // Send the first five training instances to the right node
   // and the second 5 to the left node
   rp.UpdatePositionBatch(
@@ -65,7 +65,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
 
   auto op = [=] __device__(auto ridx, int split_index, int data) {
     return ridx % 2 == 0;
-  };
+  };  // NOLINT
   std::vector<int> op_data(segments.size());
   std::vector<PerNodeData<int>> h_batch_info(segments.size());
   dh::TemporaryArray<PerNodeData<int>> d_batch_info(segments.size());
@@ -85,7 +85,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
 
   auto op_without_data = [=] __device__(auto ridx) {
     return ridx % 2 == 0;
-  };
+  };  // NOLINT
   for (size_t i = 0; i < segments.size(); i++) {
     auto begin = ridx.begin() + segments[i].begin;
     auto end = ridx.begin() + segments[i].end;
@@ -162,7 +162,7 @@ void TestExternalMemory() {
 
     partitioners.emplace_back(std::make_unique<RowPartitioner>());
     partitioners.back()->Reset(&ctx, page.Size(), page.BaseRowId());
-    dh::DeviceUVector<cuda_impl::RowIndexT> ridx_tmp(page.Size());
+    dh::DeviceUVector<cuda_impl::RowIndexT> ridx_tmp(page.Size(), ctx.CUDACtx()->Stream());
     std::vector<RegTree::Node> splits{tree[0]};
     page.Impl()->Visit(&ctx, {}, [&](auto&& acc) {
       partitioners.back()->UpdatePositionBatch(&ctx, {0}, {1}, {2}, splits, dh::ToSpan(ridx_tmp),
@@ -205,7 +205,7 @@ void TestEmptyNode(std::int32_t n_workers) {
     bst_idx_t base_rowid = 0;
     partitioner.Reset(&ctx, n_samples, base_rowid);
     std::vector<RegTree::Node> splits(1);
-    dh::DeviceUVector<cuda_impl::RowIndexT> ridx_tmp(n_samples);
+    dh::DeviceUVector<cuda_impl::RowIndexT> ridx_tmp(n_samples, ctx.CUDACtx()->Stream());
     partitioner.UpdatePositionBatch(
         &ctx, {0}, {1}, {2}, splits, dh::ToSpan(ridx_tmp),
         [] XGBOOST_DEVICE(bst_idx_t ridx, std::int32_t /*nidx_in_batch*/, RegTree::Node) {
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc
index 4296ba49a3f4..a7328e86eca6 100644
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -57,11 +57,11 @@ void TestPartitionBasedSplit::SetUp() {
     total_gpair_ += e;
   }
 
-  auto enumerate = [this, n_feat = info_.num_col_](common::GHistRow hist,
-                                                   GradientPairPrecise parent_sum) {
+  auto enumerate = [this, &ctx, n_feat = info_.num_col_](common::GHistRow hist,
+                                                         GradientPairPrecise parent_sum) {
     int32_t best_thresh = -1;
     float best_score{-std::numeric_limits<float>::infinity()};
-    TreeEvaluator evaluator{param_, static_cast<bst_feature_t>(n_feat), DeviceOrd::CPU()};
+    TreeEvaluator evaluator{&ctx, param_, static_cast<bst_feature_t>(n_feat)};
     auto tree_evaluator = evaluator.GetEvaluator<TrainParam>();
     GradientPairPrecise left_sum;
     auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_});
@@ -205,7 +205,7 @@ TEST(HistMultiEvaluator, Evaluate) {
   for (bst_target_t t{0}; t < n_targets; ++t) {
     root_sum_hess += static_cast<float>(root_sum.HostView()(t).GetHess());
   }
-  tree.SetRoot(weight.HostView(), root_sum_hess);
+  tree.SetRoot(&ctx, weight.HostView(), root_sum_hess);
   auto w = weight.HostView();
   ASSERT_EQ(w.Size(), n_targets);
   ASSERT_EQ(w(0), -1.5);
@@ -410,7 +410,7 @@ TEST(HistMultiEvaluator, CategoricalOneHot) {
   for (bst_target_t t = 0; t < n_targets; ++t) {
     root_sum_hess += static_cast<float>(root_sum.HostView()(t).GetHess());
   }
-  tree.SetRoot(weight.HostView(), root_sum_hess);
+  tree.SetRoot(&ctx, weight.HostView(), root_sum_hess);
 
   std::vector<MultiExpandEntry> entries(1, {0, 0});
   std::vector<BoundedHistCollection const *> ptrs;
diff --git a/tests/cpp/tree/test_multi_target_tree_model.cc b/tests/cpp/tree/test_multi_target_tree_model.cc
index 88118ca31136..892381a93366 100644
--- a/tests/cpp/tree/test_multi_target_tree_model.cc
+++ b/tests/cpp/tree/test_multi_target_tree_model.cc
@@ -28,11 +28,12 @@ std::unique_ptr<RegTree> MakeMtTreeForTest(bst_target_t n_targets) {
     std::iota(h_data.begin(), h_data.end(), init);
   };
 
+  Context ctx;
   linalg::Vector<float> base_weight;
   base_weight.ModifyInplace([&](HostDeviceVector<float>* data, common::Span<std::size_t> shape) {
     iota_weights(1.0f, data, shape);
   });
-  tree->SetRoot(base_weight.HostView(), /*sum_hess=*/1.0f);
+  tree->SetRoot(&ctx, base_weight.HostView(), /*sum_hess=*/1.0f);
 
   linalg::Vector<float> left_weight;
   left_weight.ModifyInplace([&](HostDeviceVector<float>* data, common::Span<std::size_t> shape) {
@@ -43,7 +44,7 @@ std::unique_ptr<RegTree> MakeMtTreeForTest(bst_target_t n_targets) {
     iota_weights(3.0f, data, shape);
   });
 
-  tree->ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(),
+  tree->ExpandNode(&ctx, RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(),
                    left_weight.HostView(), right_weight.HostView(), /*loss_chg=*/0.5f,
                    /*sum_hess=*/1.0f, /*left_sum=*/0.6f, /*right_sum=*/0.4f);
   tree->GetMultiTargetTree()->SetLeaves();
@@ -109,11 +110,12 @@ void TestTreeDump(std::string format, std::string leaf_key) {
 
   {
     // Test the "..."
+    Context ctx;
     bst_target_t n_targets{4};
     RegTree tree{n_targets, n_features};
     linalg::Vector<float> weight{{1.0f, 2.0f, 3.0f, 4.0f}, {4ul}, DeviceOrd::CPU()};
-    tree.SetRoot(weight.HostView(), /*sum_hess=*/1.0f);
-    tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, weight.HostView(),
+    tree.SetRoot(&ctx, weight.HostView(), /*sum_hess=*/1.0f);
+    tree.ExpandNode(&ctx, RegTree::kRoot, /*split_idx=*/1, 0.5f, true, weight.HostView(),
                     weight.HostView(), weight.HostView(), /*loss_chg=*/0.5f, /*sum_hess=*/1.0f,
                     /*left_sum=*/0.6f, /*right_sum=*/0.4f);
     tree.GetMultiTargetTree()->SetLeaves();
@@ -144,13 +146,14 @@ TEST(MultiTargetTree, SetLeaves) {
   std::unique_ptr<RegTree> tree{std::make_unique<RegTree>(n_targets, n_features)};
   CHECK(tree->IsMultiTarget());
   // Reduce to 2 targets
+  Context ctx;
   linalg::Vector<float> base_weight{{1.0f, 2.0f}, {2ul}, DeviceOrd::CPU()};
-  tree->SetRoot(base_weight.HostView(), /*sum_hess=*/1.0f);
+  tree->SetRoot(&ctx, base_weight.HostView(), /*sum_hess=*/1.0f);
   ASSERT_EQ(tree->GetMultiTargetTree()->NumSplitTargets(), 2);
 
   linalg::Vector<float> left_weight{{2.0f, 3.0f}, {2ul}, DeviceOrd::CPU()};
   linalg::Vector<float> right_weight{{3.0f, 4.0f}, {2ul}, DeviceOrd::CPU()};
-  tree->ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(),
+  tree->ExpandNode(&ctx, RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(),
                    left_weight.HostView(), right_weight.HostView(), /*loss_chg=*/0.5f,
                    /*sum_hess=*/1.0f, /*left_sum=*/0.6f, /*right_sum=*/0.4f);
 
diff --git a/tests/cpp/tree/test_partitioner.h b/tests/cpp/tree/test_partitioner.h
index 4afeb540670d..ae717945831e 100644
--- a/tests/cpp/tree/test_partitioner.h
+++ b/tests/cpp/tree/test_partitioner.h
@@ -1,14 +1,13 @@
 /**
- * Copyright 2021-2026 by XGBoost contributors.
+ * Copyright 2021-2026, XGBoost contributors.
  */
-#ifndef XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_
-#define XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_
-#include <xgboost/context.h>                      // for Context
-#include <xgboost/linalg.h>                       // for Constant, Vector
-#include <xgboost/logging.h>                      // for CHECK
-#include <xgboost/tree_model.h>                   // for RegTree
+#pragma once
+#include <xgboost/context.h>     // for Context
+#include <xgboost/linalg.h>      // for Constant, Vector
+#include <xgboost/logging.h>     // for CHECK
+#include <xgboost/tree_model.h>  // for RegTree
 
-#include <vector>                                 // for vector
+#include <vector>  // for vector
 
 #include "../../../src/tree/hist/expand_entry.h"  // for CPUExpandEntry, MultiExpandEntry
 
@@ -33,8 +32,8 @@ inline void GetMultiSplitForTest(RegTree *tree, float split_value,
   linalg::Vector<float> base_weight{linalg::Constant(&ctx, 0.0f, n_targets)};
   linalg::Vector<float> left_weight{linalg::Constant(&ctx, 0.0f, n_targets)};
   linalg::Vector<float> right_weight{linalg::Constant(&ctx, 0.0f, n_targets)};
-  tree->SetRoot(base_weight.HostView(), /*sum_hess=*/0.0f);
-  tree->ExpandNode(/*nidx=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value,
+  tree->SetRoot(&ctx, base_weight.HostView(), /*sum_hess=*/0.0f);
+  tree->ExpandNode(&ctx, /*nidx=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value,
                    /*default_left=*/true, base_weight.HostView(), left_weight.HostView(),
                    right_weight.HostView(), /*loss_chg=*/0.0f, /*sum_hess=*/0.0f, /*left_sum=*/0.0f,
                    /*right_sum=*/0.0f);
@@ -44,4 +43,3 @@ inline void GetMultiSplitForTest(RegTree *tree, float split_value,
   tree->GetMultiTargetTree()->SetLeaves();
 }
 }  // namespace xgboost::tree
-#endif  // XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_