diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index 1a0da50fa07a..471a883940ff 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2017-2019 XGBoost contributors +/** + * Copyright 2017-2026, XGBoost contributors */ /** @@ -64,7 +64,8 @@ namespace xgboost { void SetCudaSetDeviceHandler(void (*handler)(int)); #endif // __CUDACC__ -template struct HostDeviceVectorImpl; +template +struct HostDeviceVectorImpl; /*! * \brief Controls data access from the GPU. @@ -78,7 +79,8 @@ template struct HostDeviceVectorImpl; * - Data is being manipulated on the host. Host has write access, device doesn't have access. */ enum GPUAccess { - kNone, kRead, + kNone, + kRead, // write implies read kWrite }; @@ -88,9 +90,15 @@ class HostDeviceVector { static_assert(std::is_standard_layout_v, "HostDeviceVector admits only POD types"); public: - explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU()); - HostDeviceVector(std::initializer_list init, DeviceOrd device = DeviceOrd::CPU()); - explicit HostDeviceVector(const std::vector& init, DeviceOrd device = DeviceOrd::CPU()); + using value_type = T; // NOLINT + + public: + explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU(), + Context const* ctx = nullptr); + HostDeviceVector(std::initializer_list init, DeviceOrd device = DeviceOrd::CPU(), + Context const* ctx = nullptr); + explicit HostDeviceVector(const std::vector& init, DeviceOrd device = DeviceOrd::CPU(), + Context const* ctx = nullptr); ~HostDeviceVector(); HostDeviceVector(const HostDeviceVector&) = delete; @@ -103,44 +111,54 @@ class HostDeviceVector { [[nodiscard]] std::size_t Size() const; [[nodiscard]] std::size_t SizeBytes() const { return this->Size() * sizeof(T); } [[nodiscard]] DeviceOrd Device() const; - common::Span DeviceSpan(); - common::Span ConstDeviceSpan() const; - common::Span DeviceSpan() const { return ConstDeviceSpan(); } - T* DevicePointer(); - const T* ConstDevicePointer() const; - const T* DevicePointer() const { return ConstDevicePointer(); } - - T* HostPointer() { return HostVector().data(); } - common::Span HostSpan() { return common::Span{HostVector()}; } - common::Span HostSpan() const { return common::Span{HostVector()}; } - common::Span ConstHostSpan() const { return HostSpan(); } - const T* ConstHostPointer() const { return ConstHostVector().data(); } - const T* HostPointer() const { return ConstHostPointer(); } - - void Fill(T v); - void Copy(const HostDeviceVector& other); - void Copy(const std::vector& other); - void Copy(std::initializer_list other); - - void Extend(const HostDeviceVector& other); - - std::vector& HostVector(); - const std::vector& ConstHostVector() const; - const std::vector& HostVector() const {return ConstHostVector(); } + common::Span DeviceSpan(Context const* ctx = nullptr); + common::Span ConstDeviceSpan(Context const* ctx = nullptr) const; + common::Span DeviceSpan(Context const* ctx = nullptr) const { + return ConstDeviceSpan(ctx); + } + T* DevicePointer(Context const* ctx = nullptr); + const T* ConstDevicePointer(Context const* ctx = nullptr) const; + const T* DevicePointer(Context const* ctx = nullptr) const { return ConstDevicePointer(ctx); } + + T* HostPointer(Context const* ctx = nullptr) { return HostVector(ctx).data(); } + common::Span HostSpan(Context const* ctx = nullptr) { + return common::Span{HostVector(ctx)}; + } + common::Span HostSpan(Context const* ctx = nullptr) const { + return common::Span{HostVector(ctx)}; + } + common::Span ConstHostSpan(Context const* ctx = nullptr) const { return HostSpan(ctx); } + const T* ConstHostPointer(Context const* ctx = nullptr) const { + return ConstHostVector(ctx).data(); + } + const T* HostPointer(Context const* ctx = nullptr) const { return ConstHostPointer(ctx); } + + void Fill(T v, Context const* ctx = nullptr); + void Copy(const HostDeviceVector& other, Context const* ctx = nullptr); + void Copy(const std::vector& other, Context const* ctx = nullptr); + void Copy(std::initializer_list other, Context const* ctx = nullptr); + + void Extend(const HostDeviceVector& other, Context const* ctx = nullptr); + + std::vector& HostVector(Context const* ctx = nullptr); + const std::vector& ConstHostVector(Context const* ctx = nullptr) const; + const std::vector& HostVector(Context const* ctx = nullptr) const { + return ConstHostVector(ctx); + } [[nodiscard]] bool HostCanRead() const; [[nodiscard]] bool HostCanWrite() const; [[nodiscard]] bool DeviceCanRead() const; [[nodiscard]] bool DeviceCanWrite() const; [[nodiscard]] GPUAccess DeviceAccess() const; - - void SetDevice(DeviceOrd device) const; + // FIXME(jiamingy): Until we can fully unify the context, we will have both ctx and device here. + void SetDevice(DeviceOrd device, Context const* ctx = nullptr) const; void Resize(std::size_t new_size); - /** @brief Resize and initialize the data if the new size is larger than the old size. */ - void Resize(std::size_t new_size, T v); + void Resize(Context const* ctx, std::size_t new_size); - using value_type = T; // NOLINT + /** @brief Resize and initialize the data if the new size is larger than the old size. */ + void Resize(Context const* ctx, std::size_t new_size, T v); private: HostDeviceVectorImpl* impl_; diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 4d4412ea31e0..9eed8dce5091 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -595,13 +595,13 @@ auto MakeTensorView(Context const *ctx, Order order, common::Span data, template auto MakeTensorView(Context const *ctx, HostDeviceVector *data, S &&...shape) { - auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan(); + auto span = ctx->IsCPU() ? data->HostSpan(ctx) : data->DeviceSpan(ctx); return MakeTensorView(ctx->Device(), span, std::forward(shape)...); } template auto MakeTensorView(Context const *ctx, HostDeviceVector const *data, S &&...shape) { - auto span = ctx->IsCPU() ? data->ConstHostSpan() : data->ConstDeviceSpan(); + auto span = ctx->IsCPU() ? data->ConstHostSpan(ctx) : data->ConstDeviceSpan(ctx); return MakeTensorView(ctx->Device(), span, std::forward(shape)...); } @@ -768,15 +768,15 @@ class Tensor { Order order_{Order::kC}; template - void Initialize(I const (&shape)[D], DeviceOrd device) { + void Initialize(I const (&shape)[D], DeviceOrd device, Context const *ctx = nullptr) { static_assert(D <= kDim, "Invalid shape."); std::copy(shape, shape + D, shape_); for (auto i = D; i < kDim; ++i) { shape_[i] = 1; } if (!device.IsCPU()) { - data_.SetDevice(device); - data_.ConstDevicePointer(); // Pull to device; + data_.SetDevice(device, ctx); + data_.ConstDevicePointer(ctx); // Pull to device; } CHECK_EQ(data_.Size(), detail::CalcSize(shape_)); } @@ -791,11 +791,13 @@ class Tensor { * See \ref TensorView for parameters of this constructor. */ template - explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC) - : Tensor{common::Span{shape}, device, order} {} + explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC, + Context const *ctx = nullptr) + : Tensor{common::Span{shape}, device, order, ctx} {} template - explicit Tensor(common::Span shape, DeviceOrd device, Order order = kC) + explicit Tensor(common::Span shape, DeviceOrd device, Order order = kC, + Context const *ctx = nullptr) : order_{order} { // No device unroll as this is a host only function. std::copy(shape.data(), shape.data() + D, shape_); @@ -804,33 +806,34 @@ class Tensor { } auto size = detail::CalcSize(shape_); if (!device.IsCPU()) { - data_.SetDevice(device); + data_.SetDevice(device, ctx); } - data_.Resize(size); + data_.Resize(ctx, size); if (!device.IsCPU()) { - data_.DevicePointer(); // Pull to device + data_.DevicePointer(ctx); // Pull to device } } /** * Initialize from 2 host iterators. */ template - explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC) + explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC, + Context const *ctx = nullptr) : order_{order} { auto &h_vec = data_.HostVector(); h_vec.insert(h_vec.begin(), begin, end); // shape - this->Initialize(shape, device); + this->Initialize(shape, device, ctx); } template explicit Tensor(std::initializer_list data, I const (&shape)[D], DeviceOrd device, - Order order = kC) + Order order = kC, Context const *ctx = nullptr) : order_{order} { auto &h_vec = data_.HostVector(); h_vec = data; // shape - this->Initialize(shape, device); + this->Initialize(shape, device, ctx); } /** * \brief Index operator. Not thread safe, should not be used in performance critical @@ -852,29 +855,29 @@ class Tensor { /** * @brief Get a @ref TensorView for this tensor. */ - auto View(DeviceOrd device) { + auto View(DeviceOrd device, Context const *ctx = nullptr) { if (device.IsCPU()) { - auto span = data_.HostSpan(); + auto span = data_.HostSpan(ctx); return TensorView{span, shape_, device, order_}; } else { - data_.SetDevice(device); - auto span = data_.DeviceSpan(); + data_.SetDevice(device, ctx); + auto span = data_.DeviceSpan(ctx); return TensorView{span, shape_, device, order_}; } } - auto View(DeviceOrd device) const { + auto View(DeviceOrd device, Context const *ctx = nullptr) const { if (device.IsCPU()) { - auto span = data_.ConstHostSpan(); + auto span = data_.ConstHostSpan(ctx); return TensorView{span, shape_, device, order_}; } else { - data_.SetDevice(device); - auto span = data_.ConstDeviceSpan(); + data_.SetDevice(device, ctx); + auto span = data_.ConstDeviceSpan(ctx); return TensorView{span, shape_, device, order_}; } } - auto HostView() { return this->View(DeviceOrd::CPU()); } - auto HostView() const { return this->View(DeviceOrd::CPU()); } + auto HostView(Context const *ctx = nullptr) { return this->View(DeviceOrd::CPU(), ctx); } + auto HostView(Context const *ctx = nullptr) const { return this->View(DeviceOrd::CPU(), ctx); } [[nodiscard]] std::size_t Size() const { return data_.Size(); } [[nodiscard]] bool Empty() const { return Size() == 0; } @@ -950,7 +953,9 @@ class Tensor { /** * \brief Set device ordinal for this tensor. */ - void SetDevice(DeviceOrd device) const { data_.SetDevice(device); } + void SetDevice(DeviceOrd device, Context const *ctx = nullptr) const { + data_.SetDevice(device, ctx); + } [[nodiscard]] DeviceOrd Device() const { return data_.Device(); } }; @@ -966,7 +971,7 @@ using Vector = Tensor; template auto Empty(Context const *ctx, Index &&...index) { Tensor t; - t.SetDevice(ctx->Device()); + t.SetDevice(ctx->Device(), ctx); t.Reshape(index...); return t; } @@ -977,7 +982,7 @@ auto Empty(Context const *ctx, Index &&...index) { template auto EmptyLike(Context const *ctx, Tensor const &in) { Tensor t; - t.SetDevice(ctx->Device()); + t.SetDevice(ctx->Device(), ctx); t.Reshape(in.Shape()); return t; } @@ -988,9 +993,9 @@ auto EmptyLike(Context const *ctx, Tensor const &in) { template auto Constant(Context const *ctx, T v, Index &&...index) { Tensor t; - t.SetDevice(ctx->Device()); + t.SetDevice(ctx->Device(), ctx); t.Reshape(index...); - t.Data()->Fill(std::move(v)); + t.Data()->Fill(std::move(v), ctx); return t; } diff --git a/include/xgboost/multi_target_tree_model.h b/include/xgboost/multi_target_tree_model.h index 35a4ddecc1ba..cbe94f89126d 100644 --- a/include/xgboost/multi_target_tree_model.h +++ b/include/xgboost/multi_target_tree_model.h @@ -90,12 +90,12 @@ class MultiTargetTree : public Model { * @param weight The weight vector for the root node. * @param sum_hess The sum of hessians for the root node (coverage). */ - void SetRoot(linalg::VectorView weight, float sum_hess); + void SetRoot(Context const* ctx, linalg::VectorView weight, float sum_hess); /** * @brief Expand a leaf into split node. */ - void Expand(bst_node_t nidx, bst_feature_t split_idx, float split_cond, bool default_left, - linalg::VectorView base_weight, + void Expand(Context const* ctx, bst_node_t nidx, bst_feature_t split_idx, float split_cond, + bool default_left, linalg::VectorView base_weight, linalg::VectorView left_weight, linalg::VectorView right_weight, float loss_chg, float sum_hess, float left_sum, float right_sum); diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index e08473e64a6e..e39875c9c064 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -315,8 +315,8 @@ class RegTree : public Model { * @param left_sum The sum of hessians for the left child (coverage). * @param right_sum The sum of hessians for the right child (coverage). */ - void ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split_cond, bool default_left, - linalg::VectorView base_weight, + void ExpandNode(Context const* ctx, bst_node_t nidx, bst_feature_t split_index, float split_cond, + bool default_left, linalg::VectorView base_weight, linalg::VectorView left_weight, linalg::VectorView right_weight, float loss_chg, float sum_hess, float left_sum, float right_sum); @@ -355,7 +355,7 @@ class RegTree : public Model { /** * @brief Expands a leaf node with categories for a multi-target tree. */ - void ExpandCategorical(bst_node_t nidx, bst_feature_t split_index, + void ExpandCategorical(Context const* ctx, bst_node_t nidx, bst_feature_t split_index, common::Span split_cat, bool default_left, linalg::VectorView base_weight, linalg::VectorView left_weight, @@ -414,9 +414,9 @@ class RegTree : public Model { * @param weight Internal split weight, with size equals to reduced targets. * @param sum_hess The sum of hessians for the root node (coverage). */ - void SetRoot(linalg::VectorView weight, float sum_hess) { + void SetRoot(Context const* ctx, linalg::VectorView weight, float sum_hess) { CHECK(IsMultiTarget()); - return this->p_mt_tree_->SetRoot(weight, sum_hess); + return this->p_mt_tree_->SetRoot(ctx, weight, sum_hess); } /** * @brief Get the maximum depth. diff --git a/plugin/sycl/common/host_device_vector.cc b/plugin/sycl/common/host_device_vector.cc index 0a32fae40279..e1706dc356ab 100644 --- a/plugin/sycl/common/host_device_vector.cc +++ b/plugin/sycl/common/host_device_vector.cc @@ -14,8 +14,8 @@ #include "xgboost/host_device_vector.h" #pragma GCC diagnostic pop -#include "../device_manager.h" #include "../data.h" +#include "../device_manager.h" #include "../predictor/node.h" namespace xgboost { @@ -46,10 +46,11 @@ class HostDeviceVectorImpl { } } - HostDeviceVectorImpl(HostDeviceVectorImpl&& that) : device_{that.device_}, - data_h_{std::move(that.data_h_)}, - data_d_{std::move(that.data_d_)}, - device_access_{that.device_access_} {} + HostDeviceVectorImpl(HostDeviceVectorImpl&& that) + : device_{that.device_}, + data_h_{std::move(that.data_h_)}, + data_d_{std::move(that.data_d_)}, + device_access_{that.device_access_} {} std::vector& HostVector() { SyncHost(GPUAccess::kNone); @@ -62,14 +63,15 @@ class HostDeviceVectorImpl { } void SetDevice(DeviceOrd device) { - if (device_ == device) { return; } + if (device_ == device) { + return; + } if (device_.IsSycl()) { SyncHost(GPUAccess::kNone); } if (device_.IsSycl() && device.IsSycl()) { - CHECK_EQ(device_, device) - << "New device is different from previous one."; + CHECK_EQ(device_, device) << "New device is different from previous one."; } device_ = device; if (device_.IsSycl()) { @@ -97,20 +99,26 @@ class HostDeviceVectorImpl { } void SyncHost(GPUAccess access) { - if (HostCanAccess(access)) { return; } + if (HostCanAccess(access)) { + return; + } if (HostCanRead()) { // data is present, just need to deny access to the device device_access_ = access; return; } device_access_ = access; - if (data_h_.size() != data_d_->Size()) { data_h_.resize(data_d_->Size()); } + if (data_h_.size() != data_d_->Size()) { + data_h_.resize(data_d_->Size()); + } SetDevice(); qu_->memcpy(data_h_.data(), data_d_->Data(), data_d_->Size() * sizeof(T)).wait(); } void SyncDevice(GPUAccess access) { - if (DeviceCanAccess(access)) { return; } + if (DeviceCanAccess(access)) { + return; + } if (DeviceCanRead()) { device_access_ = access; return; @@ -130,9 +138,7 @@ class HostDeviceVectorImpl { bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } GPUAccess Access() const { return device_access_; } - size_t Size() const { - return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->Size() : 0; - } + size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->Size() : 0; } DeviceOrd Device() const { return device_; } @@ -214,7 +220,9 @@ class HostDeviceVectorImpl { private: void ResizeDevice(size_t new_size) { - if (data_d_ && new_size == data_d_->Size()) { return; } + if (data_d_ && new_size == data_d_->Size()) { + return; + } SetDevice(); data_d_->Resize(qu_, new_size); } @@ -254,20 +262,21 @@ class HostDeviceVectorImpl { }; template -HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd device) - : impl_(nullptr) { +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd device, Context const*) + : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(size, v, device); } template -HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd device) - : impl_(nullptr) { +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd device, + Context const*) + : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(init, device); } template -HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd device) - : impl_(nullptr) { +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd device, Context const*) + : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(init, device); } @@ -278,7 +287,9 @@ HostDeviceVector::HostDeviceVector(HostDeviceVector&& that) { template HostDeviceVector& HostDeviceVector::operator=(HostDeviceVector&& that) { - if (this == &that) { return *this; } + if (this == &that) { + return *this; + } std::unique_ptr> new_impl( new HostDeviceVectorImpl(std::move(*that.impl_))); @@ -294,7 +305,9 @@ HostDeviceVector::~HostDeviceVector() { } template -size_t HostDeviceVector::Size() const { return impl_->Size(); } +size_t HostDeviceVector::Size() const { + return impl_->Size(); +} template DeviceOrd HostDeviceVector::Device() const { @@ -302,65 +315,72 @@ DeviceOrd HostDeviceVector::Device() const { } template -T* HostDeviceVector::DevicePointer() { +T* HostDeviceVector::DevicePointer(Context const*) { return impl_->DevicePointer(); } template -const T* HostDeviceVector::ConstDevicePointer() const { +const T* HostDeviceVector::ConstDevicePointer(Context const*) const { return impl_->ConstDevicePointer(); } template -common::Span HostDeviceVector::DeviceSpan() { +common::Span HostDeviceVector::DeviceSpan(Context const*) { return impl_->DeviceSpan(); } template -common::Span HostDeviceVector::ConstDeviceSpan() const { +common::Span HostDeviceVector::ConstDeviceSpan(Context const*) const { return impl_->ConstDeviceSpan(); } template -std::vector& HostDeviceVector::HostVector() { return impl_->HostVector(); } +std::vector& HostDeviceVector::HostVector(Context const*) { + return impl_->HostVector(); +} template -const std::vector& HostDeviceVector::ConstHostVector() const { +const std::vector& HostDeviceVector::ConstHostVector(Context const*) const { return impl_->ConstHostVector(); } template -void HostDeviceVector::Resize(size_t new_size, T v) { +void HostDeviceVector::Resize(Context const*, std::size_t new_size, T v) { impl_->Resize(new_size, v); } template -void HostDeviceVector::Resize(size_t new_size) { +void HostDeviceVector::Resize(std::size_t new_size) { + impl_->Resize(new_size); +} + +template +void HostDeviceVector::Resize(Context const*, std::size_t new_size) { impl_->Resize(new_size); } template -void HostDeviceVector::Fill(T v) { +void HostDeviceVector::Fill(T v, Context const*) { impl_->Fill(v); } template -void HostDeviceVector::Copy(const HostDeviceVector& other) { +void HostDeviceVector::Copy(const HostDeviceVector& other, Context const*) { impl_->Copy(other.impl_); } template -void HostDeviceVector::Copy(const std::vector& other) { +void HostDeviceVector::Copy(const std::vector& other, Context const*) { impl_->Copy(other); } template -void HostDeviceVector::Copy(std::initializer_list other) { +void HostDeviceVector::Copy(std::initializer_list other, Context const*) { impl_->Copy(other); } template -void HostDeviceVector::Extend(HostDeviceVector const& other) { +void HostDeviceVector::Extend(HostDeviceVector const& other, Context const*) { impl_->Extend(other.impl_); } @@ -390,7 +410,7 @@ GPUAccess HostDeviceVector::DeviceAccess() const { } template -void HostDeviceVector::SetDevice(DeviceOrd device) const { +void HostDeviceVector::SetDevice(DeviceOrd device, Context const*) const { impl_->SetDevice(device); } @@ -399,7 +419,7 @@ template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; -template class HostDeviceVector; // bst_node_t +template class HostDeviceVector; // bst_node_t template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc index 4f29f2d28fc6..c2ec9fd408e7 100644 --- a/plugin/sycl/data/gradient_index.cc +++ b/plugin/sycl/data/gradient_index.cc @@ -2,13 +2,12 @@ * Copyright 2017-2024 by Contributors * \file gradient_index.cc */ -#include -#include -#include - #include "gradient_index.h" +#include +#include #include +#include namespace xgboost { namespace sycl { @@ -49,10 +48,8 @@ void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) { } template -void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, - Context const * ctx, - BinIdxType* index_data, - DMatrix *dmat) { +void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, Context const* ctx, BinIdxType* index_data, + DMatrix* dmat) { if (nbins == 0) return; const bst_float* cut_values = cut.cut_values_.ConstDevicePointer(); const uint32_t* cut_ptrs = cut.cut_ptrs_.ConstDevicePointer(); @@ -60,11 +57,11 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, BinIdxType* sort_data = reinterpret_cast(sort_buff.Data()); - for (auto &batch : dmat->GetBatches()) { + for (auto& batch : dmat->GetBatches()) { batch.data.SetDevice(ctx->Device()); batch.offset.SetDevice(ctx->Device()); - const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer(); - const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer(); + const xgboost::Entry* data_ptr = batch.data.ConstDevicePointer(); + const bst_idx_t* offset_vec = batch.offset.ConstDevicePointer(); size_t batch_size = batch.Size(); if (batch_size > 0) { const auto base_rowid = batch.base_rowid; @@ -101,8 +98,9 @@ void GHistIndexMatrix::ResizeIndex(::sycl::queue* qu, size_t n_index) { if ((max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense_) { index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize); index.Resize(qu, (sizeof(uint8_t)) * n_index); - } else if ((max_num_bins - 1 > static_cast(std::numeric_limits::max()) && - max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense_) { + } else if ((max_num_bins - 1 > static_cast(std::numeric_limits::max()) && + max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && + isDense_) { index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize); index.Resize(qu, (sizeof(uint16_t)) * n_index); } else { @@ -111,10 +109,7 @@ void GHistIndexMatrix::ResizeIndex(::sycl::queue* qu, size_t n_index) { } } -void GHistIndexMatrix::Init(::sycl::queue* qu, - Context const * ctx, - DMatrix *dmat, - int max_bins) { +void GHistIndexMatrix::Init(::sycl::queue* qu, Context const* ctx, DMatrix* dmat, int max_bins) { nfeatures = dmat->Info().num_col_; cut = xgboost::common::SketchOnDMatrix(ctx, dmat, max_bins); @@ -132,7 +127,7 @@ void GHistIndexMatrix::Init(::sycl::queue* qu, } hit_count.SetDevice(ctx->Device()); - hit_count.Resize(nbins, 0); + hit_count.Resize(ctx, nbins, 0); const bool isDense = dmat->IsDense(); this->isDense_ = isDense; @@ -168,7 +163,7 @@ void GHistIndexMatrix::Init(::sycl::queue* qu, CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize); SetIndexData(qu, ctx, index.data(), dmat); } - /* For sparse DMatrix we have to store index of feature for each bin + /* For sparse DMatrix we have to store index of feature for each bin in index field to chose right offset. So offset is nullptr and index is not reduced */ } else { sort_buff.Resize(qu, n_rows * row_stride * sizeof(uint32_t)); diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh index a275c34370ad..61040c2e5d9b 100644 --- a/src/common/algorithm.cuh +++ b/src/common/algorithm.cuh @@ -344,7 +344,7 @@ template void RunLengthEncode(curt::StreamRef stream, Args &&...args) { std::size_t n_bytes = 0; dh::safe_cuda(cub::DeviceRunLengthEncode::Encode(nullptr, n_bytes, args..., stream)); - dh::CachingDeviceUVector tmp(n_bytes); + dh::CachingDeviceUVector tmp(n_bytes, stream); dh::safe_cuda(cub::DeviceRunLengthEncode::Encode(tmp.data(), n_bytes, args..., stream)); } @@ -352,7 +352,7 @@ template void SegmentedSum(curt::StreamRef stream, Args &&...args) { std::size_t n_bytes = 0; dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, n_bytes, args..., stream)); - dh::CachingDeviceUVector tmp(n_bytes); + dh::CachingDeviceUVector tmp(n_bytes, stream); dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(tmp.data(), n_bytes, args..., stream)); } diff --git a/src/common/device_compression.cu b/src/common/device_compression.cu index 7f688bb7deeb..497438311d1c 100644 --- a/src/common/device_compression.cu +++ b/src/common/device_compression.cu @@ -1,5 +1,5 @@ /** - * Copyright 2025, XGBoost contributors + * Copyright 2025-2026, XGBoost contributors * * We use NVComp to perform compression and access the DE API directly for * decompression. Invoking the DE directly can help us avoid unnecessary kernal launches @@ -172,7 +172,7 @@ SnappyDecomprMgrImpl::SnappyDecomprMgrImpl(curt::StreamRef s, std::vector in_chunk_sizes(n_chunks); std::vector out_chunk_sizes(n_chunks); - dh::DeviceUVector status(n_chunks); + dh::DeviceUVector status(n_chunks, s); for (std::size_t i = 0; i < n_chunks; ++i) { in_chunk_ptrs[i] = in_compressed_data.subspan(last_in, params[i].src_act_nbytes).data(); in_chunk_sizes[i] = params[i].src_act_nbytes; @@ -195,8 +195,8 @@ SnappyDecomprMgrImpl::SnappyDecomprMgrImpl(curt::StreamRef s, std::memset(this->de_params.data() + i, 0, sizeof(CUmemDecompressParams)); } - FillDecompParams(d_in_chunk_ptrs.data().get(), d_in_chunk_sizes.data().get(), de_params.ToSpan(), - this->act_nbytes.data().get(), d_out_chunk_sizes.data().get(), status.data(), s); + FillDecompParams(d_in_chunk_ptrs.data(), d_in_chunk_sizes.data(), de_params.ToSpan(), + this->act_nbytes.data().get(), d_out_chunk_sizes.data(), status.data(), s); dh::XGBCachingDeviceAllocator alloc; bool valid = thrust::all_of(thrust::cuda::par_nosync(alloc).on(s), status.cbegin(), status.cend(), ChkOp{}); @@ -301,8 +301,8 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr, dh::ToSpan(d_out_ptrs).size_bytes(), cudaMemcpyDefault, stream)); // Run nvcomp SafeNvComp(nvcompBatchedSnappyDecompressAsync( - mgr_impl->d_in_chunk_ptrs.data().get(), mgr_impl->d_in_chunk_sizes.data().get(), - mgr_impl->d_out_chunk_sizes.data().get(), mgr_impl->act_nbytes.data().get(), n_chunks, + mgr_impl->d_in_chunk_ptrs.data(), mgr_impl->d_in_chunk_sizes.data(), + mgr_impl->d_out_chunk_sizes.data(), mgr_impl->act_nbytes.data().get(), n_chunks, tmp.data().get(), n_tmp_bytes, d_out_ptrs.data().get(), nvcompBatchedSnappyDecompressDefaultOpts, status.data().get(), stream)); } @@ -326,7 +326,7 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr, */ std::size_t n_chunks = (in.size() + chunk_size - 1) / chunk_size; if (n_chunks == 0) { - p_out->clear(); + p_out->clear(ctx->CUDACtx()->Stream()); return {}; } std::size_t last = 0; @@ -343,11 +343,11 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr, } CHECK_EQ(last, in.size()); - dh::DeviceUVector in_ptrs(h_in_ptrs.size()); + dh::DeviceUVector in_ptrs(h_in_ptrs.size(), cuctx->Stream()); dh::safe_cuda(cudaMemcpyAsync(in_ptrs.data(), h_in_ptrs.data(), common::Span{h_in_ptrs}.size_bytes(), cudaMemcpyDefault, cuctx->Stream())); - dh::DeviceUVector in_sizes(h_in_sizes.size()); + dh::DeviceUVector in_sizes(h_in_sizes.size(), cuctx->Stream()); dh::safe_cuda(cudaMemcpyAsync(in_sizes.data(), h_in_sizes.data(), common::Span{h_in_sizes}.size_bytes(), cudaMemcpyDefault, cuctx->Stream())); @@ -363,12 +363,12 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr, n_chunks, chunk_size, nvcomp_batched_snappy_opts, &comp_temp_bytes, /*max_total_uncompressed_bytes=*/in.size())); CHECK_EQ(comp_temp_bytes, 0); - dh::DeviceUVector comp_tmp(comp_temp_bytes); + dh::DeviceUVector comp_tmp(comp_temp_bytes, cuctx->Stream()); std::size_t max_out_nbytes = 0; SafeNvComp(nvcompBatchedSnappyCompressGetMaxOutputChunkSize( std::min(max_in_nbytes, chunk_size), nvcomp_batched_snappy_opts, &max_out_nbytes)); - p_out->resize(max_out_nbytes * n_chunks); + p_out->resize(max_out_nbytes * n_chunks, cuctx->Stream()); std::vector h_out_ptrs(n_chunks); std::vector h_out_sizes(n_chunks); auto s_out = dh::ToSpan(*p_out); @@ -377,12 +377,14 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr, h_out_ptrs[i] = chunk.data(); h_out_sizes[i] = chunk.size(); } - dh::DeviceUVector out_ptrs(h_out_ptrs.size()); + dh::DeviceUVector out_ptrs(h_out_ptrs.size(), cuctx->Stream()); dh::safe_cuda(cudaMemcpyAsync(out_ptrs.data(), h_out_ptrs.data(), - common::Span{h_out_ptrs}.size_bytes(), cudaMemcpyDefault)); - dh::DeviceUVector out_sizes(h_out_sizes.size()); + common::Span{h_out_ptrs}.size_bytes(), cudaMemcpyDefault, + cuctx->Stream())); + dh::DeviceUVector out_sizes(h_out_sizes.size(), cuctx->Stream()); dh::safe_cuda(cudaMemcpyAsync(out_sizes.data(), h_out_sizes.data(), - common::Span{h_out_sizes}.size_bytes(), cudaMemcpyDefault)); + common::Span{h_out_sizes}.size_bytes(), cudaMemcpyDefault, + cuctx->Stream())); /** * Compress diff --git a/src/common/device_compression.cuh b/src/common/device_compression.cuh index 6ab3e62719d4..2102adfba872 100644 --- a/src/common/device_compression.cuh +++ b/src/common/device_compression.cuh @@ -65,11 +65,11 @@ void DecompressSnappy(curt::StreamRef stream, SnappyDecomprMgr const& mgr, struct SnappyDecomprMgrImpl { std::size_t n_dst_bytes{0}; // src of the CUmemDecompressParams - dh::device_vector d_in_chunk_ptrs; + dh::DeviceUVector d_in_chunk_ptrs; // srcNumBytes of the CUmemDecompressParams - dh::device_vector d_in_chunk_sizes; + dh::DeviceUVector d_in_chunk_sizes; // dstNumBytes of the CUmemDecompressParams - dh::device_vector d_out_chunk_sizes; + dh::DeviceUVector d_out_chunk_sizes; // dstActBytes of the CUmemDecompressParams dh::device_vector act_nbytes; diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 8b9829217dac..eb4f81744c95 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -709,10 +709,10 @@ template void CopyTo(Src const &src, Dst *dst, ::xgboost::curt::StreamRef stream = ::xgboost::curt::DefaultStream()) { if (src.empty()) { - dst->clear(); + dst->clear(stream); return; } - dst->resize(src.size()); + dst->resize(src.size(), stream); using SVT = std::remove_cv_t; using DVT = std::remove_cv_t; static_assert(std::is_same_v, "Host and device containers must have same value type."); diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index 2572132cfbfe..6ef3ccd2fc4b 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -268,10 +268,12 @@ class ThrustAllocMrAdapter : public rmm::mr::thrust_allocator { using other = ThrustAllocMrAdapter; // NOLINT(readability-identifier-naming) }; - + // Similar to RMM's thrust adapter, it uses the default stream. ThrustAllocMrAdapter() : rmm::mr::thrust_allocator{ rmm::cuda_stream_view{cudaStream_t{xgboost::curt::DefaultStream()}}} {}; + + void SetStream([[maybe_unused]] cudaStream_t s) {} }; template @@ -289,6 +291,7 @@ class XGBAsyncPoolAllocator : public thrust::device_malloc_allocator { // entirely on Windows. std::int32_t use_async_pool_; #endif + ::xgboost::curt::StreamRef stream_{::xgboost::curt::DefaultStream()}; public: using Super = thrust::device_malloc_allocator; @@ -307,6 +310,8 @@ class XGBAsyncPoolAllocator : public thrust::device_malloc_allocator { using other = XGBAsyncPoolAllocator; // NOLINT(readability-identifier-naming) }; + void SetStream(::xgboost::curt::StreamRef stream) { this->stream_ = stream; } + pointer allocate(std::size_t n) { // NOLINT #if defined(xgboost_IS_WIN) return Super::allocate(n); @@ -317,7 +322,7 @@ class XGBAsyncPoolAllocator : public thrust::device_malloc_allocator { T *raw_ptr = nullptr; auto n_bytes = xgboost::common::SizeBytes(n); - safe_cuda(cudaMallocAsync(&raw_ptr, n_bytes, xgboost::curt::DefaultStream())); + safe_cuda(cudaMallocAsync(&raw_ptr, n_bytes, this->stream_)); return thrust::device_pointer_cast(raw_ptr); #endif } @@ -330,7 +335,7 @@ class XGBAsyncPoolAllocator : public thrust::device_malloc_allocator { return Super::deallocate(ptr, n); } - safe_cuda(cudaFreeAsync(thrust::raw_pointer_cast(ptr), xgboost::curt::DefaultStream())); + safe_cuda(cudaFreeAsync(thrust::raw_pointer_cast(ptr), this->stream_)); #endif } @@ -490,7 +495,9 @@ class DeviceUVectorImpl { public: DeviceUVectorImpl() = default; - explicit DeviceUVectorImpl(std::size_t n) { this->resize(n); } + explicit DeviceUVectorImpl(std::size_t n, ::xgboost::curt::StreamRef stream) { + this->resize(n, stream); + } DeviceUVectorImpl(DeviceUVectorImpl const &that) = delete; DeviceUVectorImpl &operator=(DeviceUVectorImpl const &that) = delete; DeviceUVectorImpl(DeviceUVectorImpl &&that) = default; @@ -499,7 +506,7 @@ class DeviceUVectorImpl { [[nodiscard]] std::size_t Capacity() const { return this->capacity_; } // Resize without init. - void resize(std::size_t n) { // NOLINT + void resize(std::size_t n, ::xgboost::curt::StreamRef stream) { // NOLINT using ::xgboost::common::SizeBytes; if (n <= this->Capacity()) { @@ -509,6 +516,7 @@ class DeviceUVectorImpl { } CHECK_LE(this->size(), this->Capacity()); + this->alloc_.SetStream(stream); Alloc alloc = this->alloc_; decltype(data_) new_ptr{thrust::raw_pointer_cast(this->alloc_.allocate(n)), [=](T *ptr) mutable { @@ -518,9 +526,8 @@ class DeviceUVectorImpl { }}; CHECK(new_ptr.get()); - auto s = ::xgboost::curt::DefaultStream(); safe_cuda(cudaMemcpyAsync(new_ptr.get(), this->data(), SizeBytes(this->size()), - cudaMemcpyDefault, s)); + cudaMemcpyDefault, stream)); this->size_ = n; this->capacity_ = n; @@ -529,17 +536,16 @@ class DeviceUVectorImpl { // std::swap(this->data_, new_ptr); } // Resize with init - void resize(std::size_t n, T const &v) { // NOLINT + void resize(std::size_t n, T const &v, ::xgboost::curt::StreamRef stream) { // NOLINT auto orig = this->size(); - this->resize(n); + this->resize(n, stream); if (orig < n) { - auto exec = thrust::cuda::par_nosync.on(::xgboost::curt::DefaultStream()); + auto exec = thrust::cuda::par_nosync.on(cudaStream_t{stream}); thrust::fill(exec, this->begin() + orig, this->end(), v); } } - - void clear() { // NOLINT - this->resize(0); + void clear(::xgboost::curt::StreamRef stream) { // NOLINT + this->resize(0, stream); } [[nodiscard]] std::size_t size() const { return this->size_; } // NOLINT diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index d059888f5643..7960d5f01880 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -169,7 +169,7 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple { return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size. }); - detail::GetColumnSizesScan(ctx->CUDACtx(), ctx->Device(), info.num_col_, num_cuts_per_feature, + detail::GetColumnSizesScan(ctx, info.num_col_, num_cuts_per_feature, IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, &column_sizes_scan); auto d_cuts_ptr = cuts_ptr.DeviceSpan(); @@ -270,7 +270,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b info.weights_.SetDevice(ctx->Device()); auto d_weight = UnifyWeight(cuctx, info, hessian, &weight); - SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, ctx->Device()); + SketchContainer sketch_container(ctx, info.feature_types, max_bin, info.num_col_); CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty()); for (const auto& page : p_fmat->GetBatches()) { std::size_t page_nnz = page.data.Size(); diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 3b1ad6312584..c89afa736248 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2020-2025, XGBoost contributors + * Copyright 2020-2026, XGBoost contributors * * \brief Front end and utilities for GPU based sketching. Works on sliding window * instead of stream. @@ -138,24 +138,25 @@ void LaunchGetColumnSizeKernel(CUDAContext const* cuctx, DeviceOrd device, } template -void GetColumnSizesScan(CUDAContext const* cuctx, DeviceOrd device, size_t num_columns, +void GetColumnSizesScan(Context const* ctx, size_t num_columns, std::size_t num_cuts_per_feature, IterSpan batch_iter, data::IsValidFunctor is_valid, HostDeviceVector* cuts_ptr, dh::caching_device_vector* column_sizes_scan) { + auto cuctx = ctx->CUDACtx(); column_sizes_scan->resize(num_columns + 1); - cuts_ptr->SetDevice(device); - cuts_ptr->Resize(num_columns + 1, 0); + cuts_ptr->SetDevice(ctx->Device()); + cuts_ptr->Resize(ctx, num_columns + 1, 0); auto d_column_sizes_scan = dh::ToSpan(*column_sizes_scan); - LaunchGetColumnSizeKernel(cuctx, device, batch_iter, is_valid, d_column_sizes_scan); + LaunchGetColumnSizeKernel(cuctx, ctx->Device(), batch_iter, is_valid, d_column_sizes_scan); // Calculate cuts CSC pointer auto cut_ptr_it = dh::MakeTransformIterator( column_sizes_scan->begin(), [=] __device__(size_t column_size) { return thrust::min(num_cuts_per_feature, column_size); }); - thrust::exclusive_scan(cuctx->CTP(), cut_ptr_it, - cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer()); + thrust::exclusive_scan(cuctx->CTP(), cut_ptr_it, cut_ptr_it + column_sizes_scan->size(), + cuts_ptr->DevicePointer()); thrust::exclusive_scan(cuctx->CTP(), column_sizes_scan->begin(), column_sizes_scan->end(), column_sizes_scan->begin()); } @@ -168,9 +169,8 @@ size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows); // Count the valid entries in each column and copy them out. template -void MakeEntriesFromAdapter(CUDAContext const* cuctx, AdapterBatch const& batch, - BatchIter batch_iter, Range1d range, float missing, size_t columns, - size_t cuts_per_feature, DeviceOrd device, +void MakeEntriesFromAdapter(Context const* ctx, AdapterBatch const& batch, BatchIter batch_iter, + Range1d range, float missing, size_t columns, size_t cuts_per_feature, HostDeviceVector* cut_sizes_scan, dh::caching_device_vector* column_sizes_scan, dh::device_vector* sorted_entries) { @@ -182,13 +182,13 @@ void MakeEntriesFromAdapter(CUDAContext const* cuctx, AdapterBatch const& batch, auto span = IterSpan{batch_iter + range.begin(), n}; data::IsValidFunctor is_valid(missing); // Work out how many valid entries we have in each column - GetColumnSizesScan(cuctx, device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan, + GetColumnSizesScan(ctx, columns, cuts_per_feature, span, is_valid, cut_sizes_scan, column_sizes_scan); size_t num_valid = column_sizes_scan->back(); // Copy current subset of valid elements into temporary storage and sort sorted_entries->resize(num_valid); - CopyIf(cuctx, entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(), - is_valid); + CopyIf(ctx->CUDACtx(), entry_iter + range.begin(), entry_iter + range.end(), + sorted_entries->begin(), is_valid); } void SortByWeight(Context const* ctx, dh::device_vector* weights, @@ -242,9 +242,8 @@ void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInf HostDeviceVector cuts_ptr; cuts_ptr.SetDevice(ctx->Device()); CUDAContext const* cuctx = ctx->CUDACtx(); - detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, n_features, - num_cuts, ctx->Device(), &cuts_ptr, &column_sizes_scan, - &sorted_entries); + detail::MakeEntriesFromAdapter(ctx, batch, batch_iter, {begin, end}, missing, n_features, + num_cuts, &cuts_ptr, &column_sizes_scan, &sorted_entries); thrust::sort(cuctx->TP(), sorted_entries.begin(), sorted_entries.end(), detail::EntryCompareOp()); if (sketch_container->HasCategorical()) { @@ -279,8 +278,8 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons dh::device_vector sorted_entries; dh::caching_device_vector column_sizes_scan; HostDeviceVector cuts_ptr; - detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns, - num_cuts_per_feature, ctx->Device(), &cuts_ptr, &column_sizes_scan, + detail::MakeEntriesFromAdapter(ctx, batch, batch_iter, {begin, end}, missing, columns, + num_cuts_per_feature, &cuts_ptr, &column_sizes_scan, &sorted_entries); data::IsValidFunctor is_valid(missing); @@ -299,24 +298,20 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx); return weights[group_idx]; }); - auto retit = thrust::copy_if(cuctx->CTP(), - weight_iter + begin, weight_iter + end, - batch_iter + begin, - d_temp_weights.data(), // output - is_valid); + auto retit = + thrust::copy_if(cuctx->CTP(), weight_iter + begin, weight_iter + end, batch_iter + begin, + d_temp_weights.data(), // output + is_valid); CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size()); } else { CHECK_EQ(batch.NumRows(), weights.size()); auto const weight_iter = dh::MakeTransformIterator( thrust::make_counting_iterator(0lu), - [=]__device__(size_t idx) -> float { - return weights[batch.GetElement(idx).row_idx]; - }); - auto retit = thrust::copy_if(cuctx->CTP(), - weight_iter + begin, weight_iter + end, - batch_iter + begin, - d_temp_weights.data(), // output - is_valid); + [=] __device__(size_t idx) -> float { return weights[batch.GetElement(idx).row_idx]; }); + auto retit = + thrust::copy_if(cuctx->CTP(), weight_iter + begin, weight_iter + end, batch_iter + begin, + d_temp_weights.data(), // output + is_valid); CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size()); } diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index a1201258da49..c04d42a9c1b8 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -6,13 +6,16 @@ // dummy implementation of HostDeviceVector in case CUDA is not used +#include "xgboost/host_device_vector.h" + #include #include + #include #include #include + #include "xgboost/tree_model.h" -#include "xgboost/host_device_vector.h" namespace xgboost { @@ -20,12 +23,10 @@ template struct HostDeviceVectorImpl { explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {} HostDeviceVectorImpl(std::initializer_list init) : data_h_(init) {} - explicit HostDeviceVectorImpl(std::vector init) : data_h_(std::move(init)) {} + explicit HostDeviceVectorImpl(std::vector init) : data_h_(std::move(init)) {} HostDeviceVectorImpl(HostDeviceVectorImpl&& that) : data_h_(std::move(that.data_h_)) {} - void Swap(HostDeviceVectorImpl &other) { - data_h_.swap(other.data_h_); - } + void Swap(HostDeviceVectorImpl& other) { data_h_.swap(other.data_h_); } std::vector& Vec() { return data_h_; } @@ -34,20 +35,20 @@ struct HostDeviceVectorImpl { }; template -HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd) - : impl_(nullptr) { +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd, Context const*) + : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(size, v); } template -HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd) - : impl_(nullptr) { +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd, Context const*) + : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(init); } template -HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd) - : impl_(nullptr) { +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd, Context const*) + : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(init); } @@ -58,7 +59,9 @@ HostDeviceVector::HostDeviceVector(HostDeviceVector&& that) { template HostDeviceVector& HostDeviceVector::operator=(HostDeviceVector&& that) { - if (this == &that) { return *this; } + if (this == &that) { + return *this; + } std::unique_ptr> new_impl( new HostDeviceVectorImpl(std::move(*that.impl_))); @@ -79,72 +82,85 @@ GPUAccess HostDeviceVector::DeviceAccess() const { } template -size_t HostDeviceVector::Size() const { return impl_->Vec().size(); } +size_t HostDeviceVector::Size() const { + return impl_->Vec().size(); +} template -DeviceOrd HostDeviceVector::Device() const { return DeviceOrd::CPU(); } +DeviceOrd HostDeviceVector::Device() const { + return DeviceOrd::CPU(); +} template -T* HostDeviceVector::DevicePointer() { return nullptr; } +T* HostDeviceVector::DevicePointer(Context const*) { + return nullptr; +} template -const T* HostDeviceVector::ConstDevicePointer() const { +const T* HostDeviceVector::ConstDevicePointer(Context const*) const { return nullptr; } template -common::Span HostDeviceVector::DeviceSpan() { +common::Span HostDeviceVector::DeviceSpan(Context const*) { return common::Span(); } template -common::Span HostDeviceVector::ConstDeviceSpan() const { +common::Span HostDeviceVector::ConstDeviceSpan(Context const*) const { return common::Span(); } template -std::vector& HostDeviceVector::HostVector() { return impl_->Vec(); } +std::vector& HostDeviceVector::HostVector(Context const*) { + return impl_->Vec(); +} template -const std::vector& HostDeviceVector::ConstHostVector() const { +const std::vector& HostDeviceVector::ConstHostVector(Context const*) const { return impl_->Vec(); } template -void HostDeviceVector::Resize(size_t new_size, T v) { +void HostDeviceVector::Resize(std::size_t new_size) { + impl_->Vec().resize(new_size, T{}); +} + +template +void HostDeviceVector::Resize(Context const*, std::size_t new_size, T v) { impl_->Vec().resize(new_size, v); } template -void HostDeviceVector::Resize(size_t new_size) { +void HostDeviceVector::Resize(Context const*, std::size_t new_size) { impl_->Vec().resize(new_size, T{}); } template -void HostDeviceVector::Fill(T v) { +void HostDeviceVector::Fill(T v, Context const*) { std::fill(HostVector().begin(), HostVector().end(), v); } template -void HostDeviceVector::Copy(const HostDeviceVector& other) { +void HostDeviceVector::Copy(const HostDeviceVector& other, Context const*) { CHECK_EQ(Size(), other.Size()); std::copy(other.HostVector().begin(), other.HostVector().end(), HostVector().begin()); } template -void HostDeviceVector::Copy(const std::vector& other) { +void HostDeviceVector::Copy(const std::vector& other, Context const*) { CHECK_EQ(Size(), other.size()); std::copy(other.begin(), other.end(), HostVector().begin()); } template -void HostDeviceVector::Copy(std::initializer_list other) { +void HostDeviceVector::Copy(std::initializer_list other, Context const*) { CHECK_EQ(Size(), other.size()); std::copy(other.begin(), other.end(), HostVector().begin()); } template -void HostDeviceVector::Extend(HostDeviceVector const& other) { +void HostDeviceVector::Extend(HostDeviceVector const& other, Context const*) { auto ori_size = this->Size(); this->HostVector().resize(ori_size + other.Size()); std::copy(other.ConstHostVector().cbegin(), other.ConstHostVector().cend(), @@ -172,14 +188,14 @@ bool HostDeviceVector::DeviceCanWrite() const { } template -void HostDeviceVector::SetDevice(DeviceOrd) const {} +void HostDeviceVector::SetDevice(DeviceOrd, Context const*) const {} // explicit instantiations are required, as HostDeviceVector isn't header-only template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; -template class HostDeviceVector; // bst_node_t +template class HostDeviceVector; // bst_node_t template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index d492285cb01a..c1c4c763bfb7 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -1,5 +1,5 @@ /** - * Copyright 2017-2025, XGBoost contributors + * Copyright 2017-2026, XGBoost contributors */ #include @@ -7,7 +7,8 @@ #include // for size_t #include -#include "cuda_stream.h" // for DefaultStream +#include "cuda_context.cuh" // for CUDAContext +#include "cuda_stream.h" // for DefaultStream #include "device_helpers.cuh" #include "device_vector.cuh" // for DeviceUVector #include "xgboost/data.h" @@ -19,18 +20,27 @@ namespace xgboost { // the handler to call instead of cudaSetDevice; only used for testing static void (*cudaSetDeviceHandler)(int) = nullptr; // NOLINT -void SetCudaSetDeviceHandler(void (*handler)(int)) { - cudaSetDeviceHandler = handler; +void SetCudaSetDeviceHandler(void (*handler)(int)) { cudaSetDeviceHandler = handler; } + +namespace { +curt::StreamRef GetStream(CUDAContext const* ctx) { + return ctx ? ctx->Stream() : curt::DefaultStream(); +} + +CUDAContext const* GetCUDACtx(Context const* ctx) { + return ctx && ctx->IsCUDA() ? ctx->CUDACtx() : nullptr; } +} // namespace template class HostDeviceVectorImpl { public: - HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) { + HostDeviceVectorImpl(CUDAContext const* ctx, size_t size, T v, DeviceOrd device) + : device_(device) { if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->resize(size, v); + data_d_->resize(size, v, GetStream(ctx)); } else { data_h_.resize(size, v); } @@ -38,21 +48,22 @@ class HostDeviceVectorImpl { // Initializer can be std::vector or std::initializer_list template - HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) { + HostDeviceVectorImpl(CUDAContext const* ctx, const Initializer& init, DeviceOrd device) + : device_(device) { if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; - LazyResizeDevice(init.size()); - Copy(init); + LazyResizeDevice(init.size(), ctx); + Copy(ctx, init); } else { data_h_ = init; } } - HostDeviceVectorImpl(HostDeviceVectorImpl&& that) : - device_{that.device_}, - data_h_{std::move(that.data_h_)}, - data_d_{std::move(that.data_d_)}, - gpu_access_{that.gpu_access_} {} + HostDeviceVectorImpl(HostDeviceVectorImpl&& that) + : device_{that.device_}, + data_h_{std::move(that.data_h_)}, + data_d_{std::move(that.data_d_)}, + gpu_access_{that.gpu_access_} {} ~HostDeviceVectorImpl() { if (device_.IsCUDA()) { @@ -66,100 +77,101 @@ class HostDeviceVectorImpl { [[nodiscard]] DeviceOrd Device() const { return device_; } - T* DevicePointer() { - LazySyncDevice(GPUAccess::kWrite); + T* DevicePointer(CUDAContext const* ctx) { + LazySyncDevice(ctx, GPUAccess::kWrite); return data_d_->data(); } - const T* ConstDevicePointer() { - LazySyncDevice(GPUAccess::kRead); + const T* ConstDevicePointer(CUDAContext const* ctx) { + LazySyncDevice(ctx, GPUAccess::kRead); return data_d_->data(); } - common::Span DeviceSpan() { - LazySyncDevice(GPUAccess::kWrite); - return {this->DevicePointer(), Size()}; + common::Span DeviceSpan(CUDAContext const* ctx) { + LazySyncDevice(ctx, GPUAccess::kWrite); + return {this->DevicePointer(ctx), Size()}; } - common::Span ConstDeviceSpan() { - LazySyncDevice(GPUAccess::kRead); - return {this->ConstDevicePointer(), Size()}; + common::Span ConstDeviceSpan(CUDAContext const* ctx) { + LazySyncDevice(ctx, GPUAccess::kRead); + return {this->ConstDevicePointer(ctx), Size()}; } - void Fill(T v) { // NOLINT + void Fill(T v, CUDAContext const* ctx) { // NOLINT if (HostCanWrite()) { std::fill(data_h_.begin(), data_h_.end(), v); } else { gpu_access_ = GPUAccess::kWrite; SetDevice(); auto s_data = dh::ToSpan(*data_d_); - dh::LaunchN(data_d_->size(), curt::DefaultStream(), - [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; }); + dh::LaunchN(data_d_->size(), GetStream(ctx), [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; }); } } - void Copy(HostDeviceVectorImpl* other) { + void Copy(CUDAContext const* ctx, HostDeviceVectorImpl* other) { CHECK_EQ(Size(), other->Size()); - SetDevice(other->device_); + SetDevice(other->device_, ctx); // Data is on host. if (HostCanWrite() && other->HostCanWrite()) { std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin()); return; } SetDevice(); - CopyToDevice(other); + CopyToDevice(ctx, other); } - void Copy(const std::vector& other) { + void Copy(CUDAContext const* ctx, const std::vector& other) { CHECK_EQ(Size(), other.size()); if (HostCanWrite()) { std::copy(other.begin(), other.end(), data_h_.begin()); } else { - CopyToDevice(other.data()); + CopyToDevice(ctx, other.data()); } } - void Copy(std::initializer_list other) { + void Copy(CUDAContext const* ctx, std::initializer_list other) { CHECK_EQ(Size(), other.size()); if (HostCanWrite()) { std::copy(other.begin(), other.end(), data_h_.begin()); } else { - CopyToDevice(other.begin()); + CopyToDevice(ctx, other.begin()); } } - void Extend(HostDeviceVectorImpl* other) { + void Extend(CUDAContext const* ctx, HostDeviceVectorImpl* other) { auto ori_size = this->Size(); - this->Resize(ori_size + other->Size(), T{}); + this->Resize(ctx, ori_size + other->Size(), T{}); if (HostCanWrite() && other->HostCanRead()) { - auto& h_vec = this->HostVector(); - auto& other_vec = other->HostVector(); + auto& h_vec = this->HostVector(ctx); + auto& other_vec = other->HostVector(ctx); CHECK_EQ(h_vec.size(), ori_size + other->Size()); std::copy(other_vec.cbegin(), other_vec.cend(), h_vec.begin() + ori_size); } else { - auto ptr = other->ConstDevicePointer(); + auto ptr = other->ConstDevicePointer(ctx); SetDevice(); CHECK_EQ(this->Device(), other->Device()); - dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr, + dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer(ctx) + ori_size, ptr, other->Size() * sizeof(T), cudaMemcpyDeviceToDevice, - curt::DefaultStream())); + GetStream(ctx))); } } - std::vector& HostVector() { - LazySyncHost(GPUAccess::kNone); + std::vector& HostVector(CUDAContext const* ctx) { + LazySyncHost(ctx, GPUAccess::kNone); return data_h_; } - const std::vector& ConstHostVector() { - LazySyncHost(GPUAccess::kRead); + const std::vector& ConstHostVector(CUDAContext const* ctx) { + LazySyncHost(ctx, GPUAccess::kRead); return data_h_; } - void SetDevice(DeviceOrd device) { - if (device_ == device) { return; } + void SetDevice(DeviceOrd device, CUDAContext const* ctx) { + if (device_ == device) { + return; + } if (device_.IsCUDA()) { - LazySyncHost(GPUAccess::kNone); + LazySyncHost(ctx, GPUAccess::kNone); } if (device_.IsCUDA() && device.IsCUDA()) { @@ -168,12 +180,12 @@ class HostDeviceVectorImpl { } device_ = device; if (device_.IsCUDA()) { - LazyResizeDevice(data_h_.size()); + LazyResizeDevice(data_h_.size(), ctx); } } template - auto Resize(std::size_t new_size, U&&... args) { + auto Resize(CUDAContext const* ctx, std::size_t new_size, U&&... args) { if (new_size == Size()) { return; } @@ -182,41 +194,47 @@ class HostDeviceVectorImpl { gpu_access_ = GPUAccess::kWrite; SetDevice(); auto old_size = data_d_->size(); - data_d_->resize(new_size, std::forward(args)...); + data_d_->resize(new_size, std::forward(args)..., GetStream(ctx)); } else { // resize on host - LazySyncHost(GPUAccess::kNone); + LazySyncHost(ctx, GPUAccess::kNone); auto old_size = data_h_.size(); data_h_.resize(new_size, std::forward(args)...); } } - void LazySyncHost(GPUAccess access) { - if (HostCanAccess(access)) { return; } + void LazySyncHost(CUDAContext const* ctx, GPUAccess access) { + if (HostCanAccess(access)) { + return; + } if (HostCanRead()) { - // data is present, just need to deny access to the device gpu_access_ = access; return; } gpu_access_ = access; - if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); } + if (data_h_.size() != data_d_->size()) { + data_h_.resize(data_d_->size()); + } SetDevice(); - dh::safe_cuda(cudaMemcpy(data_h_.data(), data_d_->data(), data_d_->size() * sizeof(T), - cudaMemcpyDeviceToHost)); + auto stream = GetStream(ctx); + dh::safe_cuda(cudaMemcpyAsync(data_h_.data(), data_d_->data(), data_d_->size() * sizeof(T), + cudaMemcpyDeviceToHost, stream)); + dh::safe_cuda(cudaStreamSynchronize(stream)); } - void LazySyncDevice(GPUAccess access) { - if (DeviceCanAccess(access)) { return; } + void LazySyncDevice(CUDAContext const* ctx, GPUAccess access) { + if (DeviceCanAccess(access)) { + return; + } if (DeviceCanRead()) { - // deny read to the host gpu_access_ = access; return; } // data is on the host - LazyResizeDevice(data_h_.size()); + LazyResizeDevice(data_h_.size(), ctx); SetDevice(); dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), data_h_.data(), data_d_->size() * sizeof(T), - cudaMemcpyHostToDevice, curt::DefaultStream())); + cudaMemcpyHostToDevice, GetStream(ctx))); gpu_access_ = access; } @@ -234,31 +252,33 @@ class HostDeviceVectorImpl { std::unique_ptr> data_d_{}; GPUAccess gpu_access_{GPUAccess::kNone}; - void CopyToDevice(HostDeviceVectorImpl* other) { + void CopyToDevice(CUDAContext const* ctx, HostDeviceVectorImpl* other) { if (other->HostCanWrite()) { - CopyToDevice(other->data_h_.data()); + CopyToDevice(ctx, other->data_h_.data()); } else { - LazyResizeDevice(Size()); + LazyResizeDevice(Size(), ctx); gpu_access_ = GPUAccess::kWrite; SetDevice(); dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), other->data_d_->data(), data_d_->size() * sizeof(T), cudaMemcpyDefault, - curt::DefaultStream())); + GetStream(ctx))); } } - void CopyToDevice(const T* begin) { - LazyResizeDevice(Size()); + void CopyToDevice(CUDAContext const* ctx, const T* begin) { + LazyResizeDevice(Size(), ctx); gpu_access_ = GPUAccess::kWrite; SetDevice(); dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), begin, data_d_->size() * sizeof(T), - cudaMemcpyDefault, curt::DefaultStream())); + cudaMemcpyDefault, GetStream(ctx))); } - void LazyResizeDevice(size_t new_size) { - if (data_d_ && new_size == data_d_->size()) { return; } + void LazyResizeDevice(size_t new_size, CUDAContext const* ctx) { + if (data_d_ && new_size == data_d_->size()) { + return; + } SetDevice(); - data_d_->resize(new_size); + data_d_->resize(new_size, GetStream(ctx)); } void SetDevice() { @@ -275,17 +295,19 @@ class HostDeviceVectorImpl { } }; -template -HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd device) - : impl_(new HostDeviceVectorImpl(size, v, device)) {} +template +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd device, Context const* ctx) + : impl_(new HostDeviceVectorImpl(GetCUDACtx(ctx), size, v, device)) {} template -HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd device) - : impl_(new HostDeviceVectorImpl(init, device)) {} +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd device, + Context const* ctx) + : impl_(new HostDeviceVectorImpl(GetCUDACtx(ctx), init, device)) {} template -HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd device) - : impl_(new HostDeviceVectorImpl(init, device)) {} +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd device, + Context const* ctx) + : impl_(new HostDeviceVectorImpl(GetCUDACtx(ctx), init, device)) {} template HostDeviceVector::HostDeviceVector(HostDeviceVector&& other) @@ -293,7 +315,9 @@ HostDeviceVector::HostDeviceVector(HostDeviceVector&& other) template HostDeviceVector& HostDeviceVector::operator=(HostDeviceVector&& other) { - if (this == &other) { return *this; } + if (this == &other) { + return *this; + } std::unique_ptr> new_impl( new HostDeviceVectorImpl(std::move(*other.impl_))); @@ -309,7 +333,9 @@ HostDeviceVector::~HostDeviceVector() { } template -size_t HostDeviceVector::Size() const { return impl_->Size(); } +size_t HostDeviceVector::Size() const { + return impl_->Size(); +} template DeviceOrd HostDeviceVector::Device() const { @@ -317,56 +343,58 @@ DeviceOrd HostDeviceVector::Device() const { } template -T* HostDeviceVector::DevicePointer() { - return impl_->DevicePointer(); +T* HostDeviceVector::DevicePointer(Context const* ctx) { + return impl_->DevicePointer(GetCUDACtx(ctx)); } template -const T* HostDeviceVector::ConstDevicePointer() const { - return impl_->ConstDevicePointer(); +const T* HostDeviceVector::ConstDevicePointer(Context const* ctx) const { + return impl_->ConstDevicePointer(GetCUDACtx(ctx)); } template -common::Span HostDeviceVector::DeviceSpan() { - return impl_->DeviceSpan(); +common::Span HostDeviceVector::DeviceSpan(Context const* ctx) { + return impl_->DeviceSpan(GetCUDACtx(ctx)); } template -common::Span HostDeviceVector::ConstDeviceSpan() const { - return impl_->ConstDeviceSpan(); +common::Span HostDeviceVector::ConstDeviceSpan(Context const* ctx) const { + return impl_->ConstDeviceSpan(GetCUDACtx(ctx)); } template -void HostDeviceVector::Fill(T v) { - impl_->Fill(v); +void HostDeviceVector::Fill(T v, Context const* ctx) { + impl_->Fill(v, GetCUDACtx(ctx)); } template -void HostDeviceVector::Copy(const HostDeviceVector& other) { - impl_->Copy(other.impl_); +void HostDeviceVector::Copy(const HostDeviceVector& other, Context const* ctx) { + impl_->Copy(GetCUDACtx(ctx), other.impl_); } template -void HostDeviceVector::Copy(const std::vector& other) { - impl_->Copy(other); +void HostDeviceVector::Copy(const std::vector& other, Context const* ctx) { + impl_->Copy(GetCUDACtx(ctx), other); } template -void HostDeviceVector::Copy(std::initializer_list other) { - impl_->Copy(other); +void HostDeviceVector::Copy(std::initializer_list other, Context const* ctx) { + impl_->Copy(GetCUDACtx(ctx), other); } template -void HostDeviceVector::Extend(HostDeviceVector const& other) { - impl_->Extend(other.impl_); +void HostDeviceVector::Extend(HostDeviceVector const& other, Context const* ctx) { + impl_->Extend(GetCUDACtx(ctx), other.impl_); } template -std::vector& HostDeviceVector::HostVector() { return impl_->HostVector(); } +std::vector& HostDeviceVector::HostVector(Context const* ctx) { + return impl_->HostVector(GetCUDACtx(ctx)); +} template -const std::vector& HostDeviceVector::ConstHostVector() const { - return impl_->ConstHostVector(); +const std::vector& HostDeviceVector::ConstHostVector(Context const* ctx) const { + return impl_->ConstHostVector(GetCUDACtx(ctx)); } template @@ -395,18 +423,23 @@ GPUAccess HostDeviceVector::DeviceAccess() const { } template -void HostDeviceVector::SetDevice(DeviceOrd device) const { - impl_->SetDevice(device); +void HostDeviceVector::SetDevice(DeviceOrd device, Context const* ctx) const { + impl_->SetDevice(device, GetCUDACtx(ctx)); } template void HostDeviceVector::Resize(std::size_t new_size) { - impl_->Resize(new_size); + impl_->Resize(nullptr, new_size); +} + +template +void HostDeviceVector::Resize(Context const* ctx, std::size_t new_size) { + impl_->Resize(GetCUDACtx(ctx), new_size); } template -void HostDeviceVector::Resize(std::size_t new_size, T v) { - impl_->Resize(new_size, v); +void HostDeviceVector::Resize(Context const* ctx, std::size_t new_size, T v) { + impl_->Resize(GetCUDACtx(ctx), new_size, v); } // explicit instantiations are required, as HostDeviceVector isn't header-only @@ -415,7 +448,7 @@ template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; -template class HostDeviceVector; // bst_node_t +template class HostDeviceVector; // bst_node_t template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; diff --git a/src/common/linalg_op.cu b/src/common/linalg_op.cu index 718f0193300c..53a517d748a3 100644 --- a/src/common/linalg_op.cu +++ b/src/common/linalg_op.cu @@ -18,7 +18,7 @@ void SmallHistogram(Context const* ctx, linalg::MatrixView indices, auto n_bins = bins.Size(); auto cuctx = ctx->CUDACtx(); // Sort for segmented sum - dh::DeviceUVector sorted_idx(indices.Size()); + dh::DeviceUVector sorted_idx(indices.Size(), cuctx->Stream()); common::ArgSort(ctx, indices.Values(), dh::ToSpan(sorted_idx)); auto d_sorted_idx = dh::ToSpan(sorted_idx); @@ -27,8 +27,8 @@ void SmallHistogram(Context const* ctx, linalg::MatrixView indices, dh::device_vector counts_out(n_bins + 1, 0); // Obtain the segment boundaries for the segmented sum. - dh::DeviceUVector unique(n_bins); - dh::CachingDeviceUVector num_runs(1); + dh::DeviceUVector unique(n_bins, cuctx->Stream()); + dh::CachingDeviceUVector num_runs(1, cuctx->Stream()); common::RunLengthEncode(cuctx->Stream(), key_it, unique.begin(), counts_out.begin() + 1, num_runs.begin(), indices.Size()); thrust::inclusive_scan(cuctx->CTP(), counts_out.begin(), counts_out.end(), counts_out.begin()); diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 776de3682e48..6c436711d1c9 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -607,7 +607,7 @@ void SketchContainer::AllReduce(Context const *ctx, bool is_column_split) { timer_.Stop(__func__); // Merge them into a new sketch. - SketchContainer new_sketch(this->feature_types_, num_bins_, this->num_columns_, ctx->Device()); + SketchContainer new_sketch(ctx, this->feature_types_, num_bins_, this->num_columns_); for (size_t i = 0; i < allworkers.size(); ++i) { auto worker = allworkers[i]; auto worker_ptr = diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 50e65fd0c032..11f34ea237b4 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2020-2025, XGBoost Contributors + * Copyright 2020-2026, XGBoost Contributors */ #ifndef XGBOOST_COMMON_QUANTILE_CUH_ #define XGBOOST_COMMON_QUANTILE_CUH_ @@ -7,13 +7,10 @@ #include // for any_of #include -#include // for size_t -#include // for equal_to +#include // for size_t #include "categorical.h" -#include "common.h" // for HumanMemUnit -#include "cuda_context.cuh" // for CUDAContext -#include "cuda_rt_utils.h" // for SetDevice +#include "common.h" // for HumanMemUnit #include "device_helpers.cuh" #include "error_msg.h" // for InvalidMaxBin #include "quantile.h" @@ -77,7 +74,7 @@ class SketchContainer { Span Column(bst_feature_t i) { auto data = dh::ToSpan(this->entries_); auto h_ptr = columns_ptr_.ConstHostSpan(); - auto c = data.subspan(h_ptr[i], h_ptr[i+1] - h_ptr[i]); + auto c = data.subspan(h_ptr[i], h_ptr[i + 1] - h_ptr[i]); return c; } @@ -88,20 +85,20 @@ class SketchContainer { * \param num_columns Total number of columns in dataset. * \param device GPU ID. */ - SketchContainer(HostDeviceVector const& feature_types, bst_bin_t max_bin, - bst_feature_t num_columns, DeviceOrd device) + SketchContainer(Context const* ctx, HostDeviceVector const& feature_types, + bst_bin_t max_bin, bst_feature_t num_columns) : num_columns_{num_columns}, num_bins_{max_bin} { - CHECK(device.IsCUDA()); + CHECK(ctx->IsCUDA()); // Initialize Sketches for this dmatrix - this->columns_ptr_.SetDevice(device); - this->columns_ptr_.Resize(num_columns + 1, 0); - this->columns_ptr_tmp_.SetDevice(device); - this->columns_ptr_tmp_.Resize(num_columns + 1, 0); + this->columns_ptr_.SetDevice(ctx->Device()); + this->columns_ptr_.Resize(ctx, num_columns + 1, 0); + this->columns_ptr_tmp_.SetDevice(ctx->Device()); + this->columns_ptr_tmp_.Resize(ctx, num_columns + 1, 0); this->feature_types_.Resize(feature_types.Size()); this->feature_types_.Copy(feature_types); // Pull to device. - this->feature_types_.SetDevice(device); + this->feature_types_.SetDevice(ctx->Device()); this->feature_types_.ConstDeviceSpan(); this->feature_types_.ConstHostSpan(); @@ -118,9 +115,9 @@ class SketchContainer { */ [[nodiscard]] std::size_t MemCapacityBytes() const { auto constexpr kE = sizeof(typename decltype(this->entries_)::value_type); - auto n_bytes = - (this->entries_.capacity() + this->entries_tmp_.capacity() + this->prune_buffer_.capacity()) * - kE; + auto n_bytes = (this->entries_.capacity() + this->entries_tmp_.capacity() + + this->prune_buffer_.capacity()) * + kE; n_bytes += (this->columns_ptr_.Size() + this->columns_ptr_tmp_.Size()) * sizeof(OffsetT); n_bytes += this->feature_types_.Size() * sizeof(FeatureType); @@ -194,7 +191,6 @@ class SketchContainer { SketchContainer(const SketchContainer&) = delete; SketchContainer& operator=(const SketchContainer&) = delete; - }; } // namespace xgboost::common diff --git a/src/common/ranking_utils.cc b/src/common/ranking_utils.cc index d477225a4efe..d5a63d045507 100644 --- a/src/common/ranking_utils.cc +++ b/src/common/ranking_utils.cc @@ -1,13 +1,13 @@ /** - * Copyright 2023 by XGBoost contributors + * Copyright 2023-2026, XGBoost contributors */ #include "ranking_utils.h" -#include // for copy_n, max, min, none_of, all_of -#include // for size_t -#include // for sscanf -#include // for greater -#include // for char_traits, string +#include // for copy_n, max, min, none_of, all_of +#include // for size_t +#include // for sscanf +#include // for greater +#include // for char_traits, string #include "algorithm.h" // for ArgSort #include "linalg_op.h" // for cbegin, cend @@ -22,7 +22,7 @@ namespace xgboost::ltr { void RankingCache::InitOnCPU(Context const* ctx, MetaInfo const& info) { if (info.group_ptr_.empty()) { - group_ptr_.Resize(2, 0); + group_ptr_.Resize(ctx, 2, 0); group_ptr_.HostVector()[1] = info.num_row_; } else { group_ptr_.HostVector() = info.group_ptr_; @@ -75,7 +75,7 @@ common::Span RankingCache::MakeRankOnCUDA(Context const*, void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) { auto const h_group_ptr = this->DataGroupPtr(ctx); - discounts_.Resize(MaxGroupSize(), 0); + discounts_.Resize(ctx, MaxGroupSize(), 0); auto& h_discounts = discounts_.HostVector(); for (std::size_t i = 0; i < MaxGroupSize(); ++i) { h_discounts[i] = CalcDCGDiscount(i); diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu index 28e75dca47ea..003b7674b5fd 100644 --- a/src/common/ranking_utils.cu +++ b/src/common/ranking_utils.cu @@ -135,7 +135,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { group_ptr_.SetDevice(ctx->Device()); if (info.group_ptr_.empty()) { - group_ptr_.Resize(2, 0); + group_ptr_.Resize(ctx, 2, bst_group_t{0}); group_ptr_.HostVector()[1] = info.num_row_; } else { auto const& h_group_ptr = info.group_ptr_; @@ -154,7 +154,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum{}); threads_group_ptr_.SetDevice(ctx->Device()); - threads_group_ptr_.Resize(n_groups + 1, 0); + threads_group_ptr_.Resize(ctx, n_groups + 1, std::size_t{0}); auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan(); if (param_.HasTruncation()) { n_cuda_threads_ = @@ -169,7 +169,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { } sorted_idx_cache_.SetDevice(ctx->Device()); - sorted_idx_cache_.Resize(info.labels.Size(), 0); + sorted_idx_cache_.Resize(ctx, info.labels.Size(), std::size_t{0}); auto weight = common::MakeOptionalWeights(ctx->Device(), info.weights_); auto w_it = diff --git a/src/common/ref_resource_view.cuh b/src/common/ref_resource_view.cuh index bc8b7d7c010a..6fcb9710f88e 100644 --- a/src/common/ref_resource_view.cuh +++ b/src/common/ref_resource_view.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2024-2025, XGBoost Contributors + * Copyright 2024-2026, XGBoost Contributors */ #pragma once @@ -17,8 +17,11 @@ namespace xgboost::common { * @brief Make a fixed size `RefResourceView` with cudaMalloc resource. */ template -[[nodiscard]] RefResourceView MakeFixedVecWithCudaMalloc(std::size_t n_elements) { - auto resource = std::make_shared(n_elements * sizeof(T)); +[[nodiscard]] RefResourceView MakeFixedVecWithCudaMalloc(Context const* ctx, + std::size_t n_elements) { + CHECK(ctx); + auto resource = std::make_shared(n_elements * sizeof(T), + ctx->CUDACtx()->Stream()); auto ref = RefResourceView{resource->DataAs(), n_elements, resource}; return ref; } @@ -36,7 +39,7 @@ template template [[nodiscard]] RefResourceView MakeFixedVecWithCudaMalloc(Context const* ctx, std::size_t n_elements, T const& init) { - auto ref = MakeFixedVecWithCudaMalloc(n_elements); + auto ref = MakeFixedVecWithCudaMalloc(ctx, n_elements); thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init); return ref; } diff --git a/src/common/resource.cuh b/src/common/resource.cuh index 0760ec4fa81f..588711678c80 100644 --- a/src/common/resource.cuh +++ b/src/common/resource.cuh @@ -18,18 +18,22 @@ namespace xgboost::common { */ class CudaMallocResource : public ResourceHandler { dh::DeviceUVector storage_; + curt::StreamRef stream_; - void Clear() noexcept(true) { this->Resize(0); } + void Clear() noexcept(true) { this->Resize(0, this->stream_); } public: - explicit CudaMallocResource(std::size_t n_bytes) : ResourceHandler{kCudaMalloc} { - this->Resize(n_bytes); + explicit CudaMallocResource(std::size_t n_bytes, curt::StreamRef stream) + : ResourceHandler{kCudaMalloc}, stream_{stream} { + this->Resize(n_bytes, stream); } ~CudaMallocResource() noexcept(true) override { this->Clear(); } [[nodiscard]] void* Data() override { return storage_.data(); } [[nodiscard]] std::size_t Size() const override { return storage_.size(); } - void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); } + void Resize(std::size_t n_bytes, curt::StreamRef stream) { + this->storage_.resize(n_bytes, stream); + } }; /** diff --git a/src/data/cat_container.cu b/src/data/cat_container.cu index d957089b8ea1..adbcd39b2df3 100644 --- a/src/data/cat_container.cu +++ b/src/data/cat_container.cu @@ -137,7 +137,7 @@ struct CatContainerImpl { [[nodiscard]] std::tuple> MakeCatAccessor( Context const* ctx, enc::DeviceColumnsView const& new_enc, CatContainer const* orig_cats) { - dh::DeviceUVector mapping(new_enc.n_total_cats); + dh::DeviceUVector mapping(new_enc.n_total_cats, ctx->CUDACtx()->Stream()); auto d_sorted_idx = orig_cats->RefSortedIndex(ctx); auto orig_enc = orig_cats->DeviceView(ctx); enc::Recode(EncPolicy, orig_enc, d_sorted_idx, new_enc, dh::ToSpan(mapping)); diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 60db1ab7b9e2..c0d70f0ecdcf 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -158,7 +158,7 @@ __global__ void CompressBinEllpackKernel( [=] XGBOOST_DEVICE(std::size_t i) { return dptrs[i] - dptrs[i - 1]; }); CHECK_GE(dptrs.size(), 2); auto max_it = thrust::max_element(cuctx->CTP(), it, it + dptrs.size() - 1); - dh::CachingDeviceUVector max_element(1); + dh::CachingDeviceUVector max_element(1, cuctx->Stream()); auto d_me = max_element.data(); dh::LaunchN(1, cuctx->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_me[i] = *max_it; }); dh::safe_cuda(cudaMemcpyAsync(&n_symbols_dense, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost, @@ -595,7 +595,7 @@ void EllpackPageImpl::CreateHistIndices(Context const* ctx, const SparsePage& ro /*! \brief row offset in SparsePage (the input data). */ using OffT = typename std::remove_reference_t::value_type; - dh::DeviceUVector row_ptrs(batch_nrows + 1); + dh::DeviceUVector row_ptrs(batch_nrows + 1, ctx->CUDACtx()->Stream()); auto size = std::distance(offset_vec.data() + batch_row_begin, offset_vec.data() + batch_row_end + 1); dh::safe_cuda(cudaMemcpyAsync(row_ptrs.data(), offset_vec.data() + batch_row_begin, @@ -604,7 +604,7 @@ void EllpackPageImpl::CreateHistIndices(Context const* ctx, const SparsePage& ro // number of entries in this batch. size_t n_entries = ent_cnt_end - ent_cnt_begin; - dh::DeviceUVector entries_d(n_entries); + dh::DeviceUVector entries_d(n_entries, ctx->CUDACtx()->Stream()); // copy data entries to device. if (row_batch.data.DeviceCanRead()) { auto const& d_data = row_batch.data.ConstDeviceSpan(); diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index a5a2b3748100..7696100f7215 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -1,5 +1,5 @@ /** - * Copyright 2019-2025, XGBoost contributors + * Copyright 2019-2026, XGBoost contributors */ #include @@ -21,7 +21,7 @@ DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format); namespace { // Function to support system without HMM or ATS template -[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, +[[nodiscard]] bool ReadDeviceVec(Context const* ctx, common::AlignedResourceReadStream* fi, common::RefResourceView* vec) { xgboost_NVTX_FN_RANGE(); @@ -40,9 +40,9 @@ template return false; } - *vec = common::MakeFixedVecWithCudaMalloc(n); + *vec = common::MakeFixedVecWithCudaMalloc(ctx, n); dh::safe_cuda( - cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, curt::DefaultStream())); + cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, ctx->CUDACtx()->Stream())); return true; } } // namespace @@ -61,8 +61,10 @@ template RET_IF_NOT(fi->Read(&impl->is_dense)); RET_IF_NOT(fi->Read(&impl->info.row_stride)); + Context ctx = Context{}.MakeCUDA(curt::CurrentDevice()); + if (this->param_.prefetch_copy || !has_hmm_ats_) { - RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer)); + RET_IF_NOT(ReadDeviceVec(&ctx, fi, &impl->gidx_buffer)); } else { RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer)); } @@ -73,7 +75,7 @@ template impl->SetCuts(this->cuts_); - curt::DefaultStream().Sync(); + ctx.CUDACtx()->Stream().Sync(); return true; } @@ -94,7 +96,7 @@ template bytes += fo->Write(impl->base_rowid); bytes += fo->Write(impl->NumSymbols()); - curt::DefaultStream().Sync(); + ctx.CUDACtx()->Stream().Sync(); return bytes; } @@ -128,7 +130,7 @@ template dispatch(); } - curt::DefaultStream().Sync(); + ctx.CUDACtx()->Stream().Sync(); return true; } diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 5cddd94996da..2c74ce266863 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -228,7 +228,7 @@ class EllpackHostCacheStreamImpl { // Device cache auto remaining = old_impl->gidx_buffer.size_bytes() - n_bytes; - auto d_page = common::MakeFixedVecWithCudaMalloc(remaining); + auto d_page = common::MakeFixedVecWithCudaMalloc(&ctx, remaining); if (remaining > 0) { dh::safe_cuda(cudaMemcpyAsync(d_page.data(), old_impl->gidx_buffer.data() + n_bytes, remaining, cudaMemcpyDefault)); @@ -323,7 +323,8 @@ class EllpackHostCacheStreamImpl { // Copy the data in the same order as written // Normal host cache auto n_bytes = this->cache_->GidxSizeBytes(this->ptr_); - out_impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc(n_bytes); + out_impl->gidx_buffer = + common::MakeFixedVecWithCudaMalloc(ctx, n_bytes); if (!h_page->gidx_buffer.empty()) { dh::safe_cuda(cudaMemcpyAsync(out_impl->gidx_buffer.data(), h_page->gidx_buffer.data(), h_page->gidx_buffer.size_bytes(), cudaMemcpyDefault, diff --git a/src/data/quantile_dmatrix.cu b/src/data/quantile_dmatrix.cu index 82ba59cb4bc3..0b830223c642 100644 --- a/src/data/quantile_dmatrix.cu +++ b/src/data/quantile_dmatrix.cu @@ -78,8 +78,8 @@ void MakeSketches(Context const* ctx, */ if (!ref) { if (!sketch) { - sketch = std::make_unique(proxy->Info().feature_types, p.max_bin, - ext_info.n_features, dh::GetDevice(ctx)); + sketch = std::make_unique(p_ctx, proxy->Info().feature_types, + p.max_bin, ext_info.n_features); } proxy->Info().weights_.SetDevice(dh::GetDevice(ctx)); DispatchAny(proxy, [&](auto const& value) { @@ -113,8 +113,8 @@ void MakeSketches(Context const* ctx, if (!ref) { if (!sketch) { // Empty local input can happen in distributed settings. - sketch = std::make_unique(proxy->Info().feature_types, p.max_bin, - ext_info.n_features, dh::GetDevice(ctx)); + sketch = std::make_unique(p_ctx, proxy->Info().feature_types, + p.max_bin, ext_info.n_features); } *cuts = sketch->MakeCuts(ctx, info.IsColumnSplit()); sketch.reset(); diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu index 2e0a683de9c7..070be130df16 100644 --- a/src/objective/adaptive.cu +++ b/src/objective/adaptive.cu @@ -3,7 +3,7 @@ */ #include -#include // NOLINT +#include // NOLINT #include "../collective/aggregator.h" #include "../common/cuda_context.cuh" // CUDAContext @@ -95,7 +95,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos auto d_node_idx = nidx.DeviceSpan(); nptr.SetDevice(ctx->Device()); - nptr.Resize(n_leaf + 1, 0); + nptr.Resize(ctx, n_leaf + 1, bst_idx_t{0}); auto d_node_ptr = nptr.DeviceSpan(); dh::LaunchN(n_leaf, [=] XGBOOST_DEVICE(size_t i) { diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index f4c0d00c6aeb..7e6df62a96c7 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -258,10 +258,11 @@ struct CopyViews { void operator()(dh::DeviceUVector* p_dst, std::vector&& src) { xgboost_NVTX_FN_RANGE(); - p_dst->resize(src.size()); + auto stream = ctx->CUDACtx()->Stream(); + p_dst->resize(src.size(), stream); auto d_dst = dh::ToSpan(*p_dst); - dh::safe_cuda(cudaMemcpyAsync(d_dst.data(), src.data(), d_dst.size_bytes(), cudaMemcpyDefault, - ctx->CUDACtx()->Stream())); + dh::safe_cuda( + cudaMemcpyAsync(d_dst.data(), src.data(), d_dst.size_bytes(), cudaMemcpyDefault, stream)); } }; diff --git a/src/predictor/interpretability/shap.cu b/src/predictor/interpretability/shap.cu index 50d680f10a77..6e48e8005405 100644 --- a/src/predictor/interpretability/shap.cu +++ b/src/predictor/interpretability/shap.cu @@ -59,10 +59,11 @@ struct CopyViews { void operator()(dh::DeviceUVector* p_dst, std::vector&& src) { xgboost_NVTX_FN_RANGE(); - p_dst->resize(src.size()); + auto stream = ctx->CUDACtx()->Stream(); + p_dst->resize(src.size(), stream); auto d_dst = dh::ToSpan(*p_dst); - dh::safe_cuda(cudaMemcpyAsync(d_dst.data(), src.data(), d_dst.size_bytes(), cudaMemcpyDefault, - ctx->CUDACtx()->Stream())); + dh::safe_cuda( + cudaMemcpyAsync(d_dst.data(), src.data(), d_dst.size_bytes(), cudaMemcpyDefault, stream)); } }; @@ -213,7 +214,7 @@ void ExtractPaths(Context const* ctx, }); auto max_cat_it = thrust::max_element(ctx->CUDACtx()->CTP(), max_elem_it, max_elem_it + d_model.n_nodes); - dh::CachingDeviceUVector d_max_cat(1); + dh::CachingDeviceUVector d_max_cat(1, ctx->CUDACtx()->Stream()); auto s_max_cat = dh::ToSpan(d_max_cat); dh::LaunchN(1, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t) { s_max_cat[0] = *max_cat_it; }); diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc index 592fb3e02069..6147249c23ec 100644 --- a/src/predictor/predictor.cc +++ b/src/predictor/predictor.cc @@ -1,5 +1,5 @@ /** - * Copyright 2017-2025, XGBoost Contributors + * Copyright 2017-2026, XGBoost Contributors */ #include "xgboost/predictor.h" @@ -58,7 +58,7 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector CHECK_NE(model.learner_model_param->num_output_group, 0); if (!ctx_->Device().IsCPU()) { - out_preds->SetDevice(ctx_->Device()); + out_preds->SetDevice(ctx_->Device(), this->ctx_); } // Cannot rely on the Resize to fill as it might skip if the size is already correct. @@ -69,14 +69,14 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector if (!base_margin->Empty()) { ValidateBaseMarginShape(info.base_margin_, info.num_row_, model.learner_model_param->OutputLength()); - out_preds->Copy(*base_margin); + out_preds->Copy(*base_margin, this->ctx_); return; } auto base_score = model.learner_model_param->BaseScore(this->ctx_->Device()); if (base_score.Size() == 1) { // Fill a scalar - out_preds->Fill(model.learner_model_param->BaseScore(DeviceOrd::CPU())(0)); + out_preds->Fill(model.learner_model_param->BaseScore(DeviceOrd::CPU())(0), this->ctx_); return; } diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 43fff2beb9b5..1219a46c2005 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -471,15 +471,16 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector inputs(1); - dh::safe_cuda(cudaMemcpyAsync(inputs.data(), &input, sizeof(input), cudaMemcpyDefault)); + auto stream = ctx->CUDACtx()->Stream(); + dh::CachingDeviceUVector inputs(1, stream); + dh::safe_cuda(cudaMemcpyAsync(inputs.data(), &input, sizeof(input), cudaMemcpyDefault, stream)); dh::TemporaryArray out_entries(1); this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs, dh::ToSpan(out_entries)); GPUExpandEntry root_entry; dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry), - cudaMemcpyDeviceToHost)); + cudaMemcpyDeviceToHost, stream)); return root_entry; } } // namespace xgboost::tree diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index e78e89cd4f7b..ce22ae69243a 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -134,8 +134,8 @@ class GPUHistEvaluator { } public: - GPUHistEvaluator(TrainParam const ¶m, bst_feature_t n_features, DeviceOrd device) - : tree_evaluator_{param, n_features, device}, param_{param} {} + GPUHistEvaluator(Context const *ctx, TrainParam const ¶m, bst_feature_t n_features) + : tree_evaluator_{ctx, param, n_features}, param_{param} {} /** * \brief Reset the evaluator, should be called before any use. */ @@ -166,12 +166,12 @@ class GPUHistEvaluator { /** * \brief Add a split to the internal tree evaluator. */ - void ApplyTreeSplit(GPUExpandEntry const &candidate, RegTree *p_tree) { + void ApplyTreeSplit(Context const *ctx, GPUExpandEntry const &candidate, RegTree *p_tree) { auto &tree = *p_tree; // Set up child constraints auto left_child = tree[candidate.nidx].LeftChild(); auto right_child = tree[candidate.nidx].RightChild(); - tree_evaluator_.AddSplit(candidate.nidx, left_child, right_child, + tree_evaluator_.AddSplit(ctx, candidate.nidx, left_child, right_child, tree[candidate.nidx].SplitIndex(), candidate.left_weight, candidate.right_weight); } diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 945069773d06..815c48af39a7 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -20,7 +20,7 @@ void GPUHistEvaluator::Reset(Context const *ctx, common::HistogramCuts const &cu common::Span ft, bst_feature_t n_features, TrainParam const ¶m, bool is_column_split) { param_ = param; - tree_evaluator_ = TreeEvaluator{param, n_features, ctx->Device()}; + tree_evaluator_ = TreeEvaluator{ctx, param, n_features}; has_categoricals_ = cuts.HasCategorical(); if (cuts.HasCategorical()) { auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cu b/src/tree/gpu_hist/multi_evaluate_splits.cu index 4b8c2b65da03..599d51222513 100644 --- a/src/tree/gpu_hist/multi_evaluate_splits.cu +++ b/src/tree/gpu_hist/multi_evaluate_splits.cu @@ -297,9 +297,10 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx, return; } + auto stream = ctx->CUDACtx()->Stream(); // Allocate weight and split sum storage on demand for the maximum node ID being evaluated. - this->AllocNodeWeight(max_nidx, n_targets); - this->split_sums_.Alloc(max_nidx, n_targets); + this->AllocNodeWeight(max_nidx, n_targets, stream); + this->split_sums_.Alloc(max_nidx, n_targets, stream); // Calculate total scan buffer size needed for all nodes auto node_hist_size = n_targets * shared_inputs.Features() * n_bins_per_feat_tar; @@ -307,7 +308,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx, // Scan the histograms. One for forward and the other for backward. // Since there's only store op on the scan buffer, no need to initialize it. - this->scan_buffer_.resize(total_hist_size * 2); + this->scan_buffer_.resize(total_hist_size * 2, stream); // Create spans for each node's scan results std::vector> h_scans(n_nodes); @@ -444,7 +445,7 @@ void MultiHistEvaluator::ApplyTreeSplit(Context const *ctx, RegTree const *p_tre auto max_node = thrust::reduce( ctx->CUDACtx()->CTP(), max_in_it, max_in_it + d_candidates.size(), 0, [=] XGBOOST_DEVICE(bst_node_t l, bst_node_t r) { return cuda::std::max(l, r); }); - this->AllocNodeSum(max_node, n_targets); + this->AllocNodeSum(max_node, n_targets, ctx->CUDACtx()->Stream()); auto node_sums = this->node_sums_.View(); // Use the internal split sums buffer instead of candidate.split.child_sum . It may be diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cuh b/src/tree/gpu_hist/multi_evaluate_splits.cuh index 0628010f90db..9c1006648383 100644 --- a/src/tree/gpu_hist/multi_evaluate_splits.cuh +++ b/src/tree/gpu_hist/multi_evaluate_splits.cuh @@ -48,10 +48,10 @@ class MultiHistEvaluator { /** * @brief Allocate storage for node sums up to the given node ID. */ - void Alloc(bst_node_t nidx, bst_target_t n_targets) { + void Alloc(bst_node_t nidx, bst_target_t n_targets, curt::StreamRef stream) { auto end = (nidx + 1) * n_targets; if (this->node_sums.size() < end) { - this->node_sums.resize(end); + this->node_sums.resize(end, stream); } } [[nodiscard]] common::Span GetNode(bst_node_t nidx, bst_target_t n_targets) { @@ -98,8 +98,8 @@ class MultiHistEvaluator { /** * @brief Allocate storage for node sums up to the given node ID. */ - void AllocNodeSum(bst_node_t nidx, bst_target_t n_targets) { - this->node_sums_.Alloc(nidx, n_targets); + void AllocNodeSum(bst_node_t nidx, bst_target_t n_targets, curt::StreamRef stream) { + this->node_sums_.Alloc(nidx, n_targets, stream); } [[nodiscard]] common::Span GetNodeSum(bst_node_t nidx, bst_target_t n_targets) { @@ -109,10 +109,10 @@ class MultiHistEvaluator { /** * @brief Allocate storage for weights up to the given node ID. */ - void AllocNodeWeight(bst_node_t nidx, bst_target_t n_targets) { + void AllocNodeWeight(bst_node_t nidx, bst_target_t n_targets, curt::StreamRef stream) { auto required = (nidx + 1) * n_targets * NodeWeightBuffer::kWeightsPerNode; if (this->node_weights_.size() < required) { - this->node_weights_.resize(required); + this->node_weights_.resize(required, stream); } } [[nodiscard]] NodeWeightBuffer GetNodeWeights(bst_target_t n_targets) { diff --git a/src/tree/gpu_hist/quantiser.cu b/src/tree/gpu_hist/quantiser.cu index 17105c94bc61..d62f29a3557d 100644 --- a/src/tree/gpu_hist/quantiser.cu +++ b/src/tree/gpu_hist/quantiser.cu @@ -116,10 +116,11 @@ GradientQuantiserGroup::GradientQuantiserGroup(Context const* ctx, } // Copy to device. - d_quantizers_.resize(n_targets); + auto stream = ctx->CUDACtx()->Stream(); + d_quantizers_.resize(n_targets, stream); dh::safe_cuda(cudaMemcpyAsync(d_quantizers_.data(), h_quantizers_.data(), n_targets * sizeof(GradientQuantiser), cudaMemcpyHostToDevice, - ctx->CUDACtx()->Stream())); + stream)); } GradientQuantiserGroup::GradientQuantiserGroup(Context const* ctx, diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index a54e854173f6..7e43a2b4694b 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -11,9 +11,10 @@ namespace xgboost::tree { void RowPartitioner::Reset(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid) { + auto stream = ctx->CUDACtx()->Stream(); ridx_segments_.clear(); - ridx_.resize(n_samples); - tmp_.clear(); + ridx_.resize(n_samples, stream); + tmp_.clear(stream); n_nodes_ = 1; // Root CHECK_LE(n_samples, std::numeric_limits::max()); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index f7fa5d87150d..e4f16a4ff8d3 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -180,7 +180,7 @@ void SortPositionBatch(Context const* ctx, common::SpanCUDACtx()->Stream()); dh::safe_cuda(ret); - tmp->resize(n_bytes); + tmp->resize(n_bytes, ctx->CUDACtx()->Stream()); } n_bytes = tmp->size(); auto ret = @@ -473,7 +473,7 @@ class RowPartitionerBatches { CHECK_LE(n_samples, std::numeric_limits::max()); n_max_samples = std::max(n_samples, n_max_samples); } - this->ridx_tmp_.resize(n_max_samples); + this->ridx_tmp_.resize(n_max_samples, ctx->CUDACtx()->Stream()); } // Accessors diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 0b9eed3f3c33..8a2759e5e115 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -438,7 +438,7 @@ class HistEvaluator { // Set up child constraints auto left_child = tree[candidate.nid].LeftChild(); auto right_child = tree[candidate.nid].RightChild(); - tree_evaluator_.AddSplit(candidate.nid, left_child, right_child, + tree_evaluator_.AddSplit(ctx_, candidate.nid, left_child, right_child, tree[candidate.nid].SplitIndex(), left_weight, right_weight); evaluator = tree_evaluator_.GetEvaluator(); @@ -476,7 +476,7 @@ class HistEvaluator { : ctx_{ctx}, param_{param}, column_sampler_{std::move(sampler)}, - tree_evaluator_{*param, static_cast(info.num_col_), DeviceOrd::CPU()}, + tree_evaluator_{ctx, *param, static_cast(info.num_col_)}, is_col_split_{info.IsColumnSplit()} { interaction_constraints_.Configure(*param, info.num_col_); column_sampler_->Init(ctx, info.num_col_, info.feature_weights, param_->colsample_bynode, @@ -751,14 +751,15 @@ class HistMultiEvaluator { float sum_hess = left_sum_hess + right_sum_hess; if (candidate.split.is_cat) { - p_tree->ExpandCategorical(candidate.nid, candidate.split.SplitIndex(), + p_tree->ExpandCategorical(ctx_, candidate.nid, candidate.split.SplitIndex(), candidate.split.cat_bits, candidate.split.DefaultLeft(), base_weight, left_weight, right_weight, loss_chg, sum_hess, left_sum_hess, right_sum_hess); } else { - p_tree->ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value, - candidate.split.DefaultLeft(), base_weight, left_weight, right_weight, - loss_chg, sum_hess, left_sum_hess, right_sum_hess); + p_tree->ExpandNode(ctx_, candidate.nid, candidate.split.SplitIndex(), + candidate.split.split_value, candidate.split.DefaultLeft(), base_weight, + left_weight, right_weight, loss_chg, sum_hess, left_sum_hess, + right_sum_hess); } CHECK(p_tree->IsMultiTarget()); diff --git a/src/tree/multi_target_tree_model.cc b/src/tree/multi_target_tree_model.cc index 21642ac9854b..83d46152d857 100644 --- a/src/tree/multi_target_tree_model.cc +++ b/src/tree/multi_target_tree_model.cc @@ -57,12 +57,13 @@ MultiTargetTree::MultiTargetTree(MultiTargetTree const& that) this->sum_hess_.Copy(that.sum_hess_); } -void MultiTargetTree::SetRoot(linalg::VectorView weight, float sum_hess) { +void MultiTargetTree::SetRoot(Context const* ctx, linalg::VectorView weight, + float sum_hess) { CHECK(!weight.Empty()); auto const next_nidx = RegTree::kRoot + 1; this->weights_.SetDevice(weight.Device()); - this->weights_.Resize(weight.Size(), DftBadValue()); + this->weights_.Resize(ctx, weight.Size(), DftBadValue()); CHECK_LE(weight.Size(), this->NumTargets()); CHECK_GE(weights_.Size(), next_nidx * weight.Size()); @@ -80,16 +81,17 @@ void MultiTargetTree::SetRoot(linalg::VectorView weight, float sum_ } // Set root statistics - sum_hess_.Resize(next_nidx, 0.0f); + sum_hess_.Resize(ctx, next_nidx, 0.0f); sum_hess_.HostVector()[RegTree::kRoot] = sum_hess; - loss_chg_.Resize(next_nidx, 0.0f); + loss_chg_.Resize(ctx, next_nidx, 0.0f); CHECK_EQ(this->param_->num_nodes, 1); CHECK_EQ(this->NumSplitTargets(), weight.Size()); } -void MultiTargetTree::Expand(bst_node_t nidx, bst_feature_t split_idx, float split_cond, - bool default_left, linalg::VectorView base_weight, +void MultiTargetTree::Expand(Context const* ctx, bst_node_t nidx, bst_feature_t split_idx, + float split_cond, bool default_left, + linalg::VectorView base_weight, linalg::VectorView left_weight, linalg::VectorView right_weight, float loss_chg, float sum_hess, float left_sum, float right_sum) { @@ -102,9 +104,9 @@ void MultiTargetTree::Expand(bst_node_t nidx, bst_feature_t split_idx, float spl std::size_t n = param_->num_nodes + 2; CHECK_LT(split_idx, this->param_->num_feature); - left_.Resize(n, InvalidNodeId()); - right_.Resize(n, InvalidNodeId()); - parent_.Resize(n, InvalidNodeId()); + left_.Resize(ctx, n, InvalidNodeId()); + right_.Resize(ctx, n, InvalidNodeId()); + parent_.Resize(ctx, n, InvalidNodeId()); auto left_child = parent_.Size() - 2; auto right_child = parent_.Size() - 1; @@ -124,7 +126,7 @@ void MultiTargetTree::Expand(bst_node_t nidx, bst_feature_t split_idx, float spl split_index_.Resize(n); split_index_.HostVector()[nidx] = split_idx; - split_conds_.Resize(n, DftBadValue()); + split_conds_.Resize(ctx, n, DftBadValue()); split_conds_.HostVector()[nidx] = split_cond; default_left_.Resize(n); @@ -148,10 +150,10 @@ void MultiTargetTree::Expand(bst_node_t nidx, bst_feature_t split_idx, float spl r_weight(i) = right_weight(i); } - loss_chg_.Resize(n, 0.0f); + loss_chg_.Resize(ctx, n, 0.0f); loss_chg_.HostVector()[nidx] = loss_chg; - sum_hess_.Resize(n, 0.0f); + sum_hess_.Resize(ctx, n, 0.0f); auto& h_hess = sum_hess_.HostVector(); h_hess[nidx] = sum_hess; h_hess[left_child] = left_sum; diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h index 43e093b31370..558527677fc4 100644 --- a/src/tree/split_evaluator.h +++ b/src/tree/split_evaluator.h @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 by Contributors + * Copyright 2018-2026, XGBoost Contributors * \file split_evaluator.h * \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation. * \author Henry Gouk @@ -33,12 +33,12 @@ class TreeEvaluator { bool has_constraint_; public: - TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) { - device_ = device; - if (device.IsCUDA()) { - lower_bounds_.SetDevice(device); - upper_bounds_.SetDevice(device); - monotone_.SetDevice(device); + TreeEvaluator(Context const* ctx, TrainParam const& p, bst_feature_t n_features) + : device_{ctx->Device()} { + if (device_.IsCUDA()) { + lower_bounds_.SetDevice(device_); + upper_bounds_.SetDevice(device_); + monotone_.SetDevice(device_); } if (p.monotone_constraints.empty()) { @@ -50,8 +50,8 @@ class TreeEvaluator { monotone_.HostVector() = p.monotone_constraints; monotone_.HostVector().resize(n_features, 0); // Initialised to some small size, can grow if needed - lower_bounds_.Resize(256, -std::numeric_limits::max()); - upper_bounds_.Resize(256, std::numeric_limits::max()); + lower_bounds_.Resize(ctx, 256, -std::numeric_limits::max()); + upper_bounds_.Resize(ctx, 256, std::numeric_limits::max()); has_constraint_ = true; } @@ -79,7 +79,7 @@ class TreeEvaluator { float wright = this->CalcWeight(nidx, param, right); float gain = this->CalcGainGivenWeight(param, left, wleft) + - this->CalcGainGivenWeight(param, right, wright); + this->CalcGainGivenWeight(param, right, wright); if (constraint == 0) { // no constraint @@ -92,7 +92,7 @@ class TreeEvaluator { } template - XGBOOST_DEVICE float CalcWeight(bst_node_t nodeid, const ParamT ¶m, + XGBOOST_DEVICE float CalcWeight(bst_node_t nodeid, const ParamT& param, GradientSumT const& stats) const { float w = ::xgboost::tree::CalcWeight(param, stats); if (!has_constraint) { @@ -138,11 +138,10 @@ class TreeEvaluator { return Divide(common::Sqr(ThresholdL1(stats.GetGrad(), p.reg_alpha)), (stats.GetHess() + p.reg_lambda)); } - return tree::CalcGainGivenWeight(p, stats.GetGrad(), - stats.GetHess(), w); + return tree::CalcGainGivenWeight(p, stats.GetGrad(), stats.GetHess(), w); } template - XGBOOST_DEVICE float CalcGain(bst_node_t nid, ParamT const &p, + XGBOOST_DEVICE float CalcGain(bst_node_t nid, ParamT const& p, GradientSumT const& stats) const { return this->CalcGainGivenWeight(p, stats, this->CalcWeight(nid, p, stats)); } @@ -150,7 +149,8 @@ class TreeEvaluator { public: /* Get a view to the evaluator that can be passed down to device. */ - template auto GetEvaluator() const { + template + auto GetEvaluator() const { if (device_.IsCUDA()) { auto constraints = monotone_.ConstDevicePointer(); return SplitEvaluator{constraints, lower_bounds_.ConstDevicePointer(), @@ -163,7 +163,7 @@ class TreeEvaluator { } template - void AddSplit(bst_node_t nodeid, bst_node_t leftid, bst_node_t rightid, + void AddSplit(Context const* ctx, bst_node_t nodeid, bst_node_t leftid, bst_node_t rightid, bst_feature_t f, float left_weight, float right_weight) { if (!has_constraint_) { return; @@ -171,15 +171,14 @@ class TreeEvaluator { size_t max_nidx = std::max(leftid, rightid); if (lower_bounds_.Size() <= max_nidx) { - lower_bounds_.Resize(max_nidx * 2 + 1, -std::numeric_limits::max()); + lower_bounds_.Resize(ctx, max_nidx * 2 + 1, -std::numeric_limits::max()); } if (upper_bounds_.Size() <= max_nidx) { - upper_bounds_.Resize(max_nidx * 2 + 1, std::numeric_limits::max()); + upper_bounds_.Resize(ctx, max_nidx * 2 + 1, std::numeric_limits::max()); } common::Transform<>::Init( - [=] XGBOOST_DEVICE(size_t, common::Span lower, - common::Span upper, + [=] XGBOOST_DEVICE(size_t, common::Span lower, common::Span upper, common::Span monotone) { lower[leftid] = lower[nodeid]; upper[leftid] = upper[nodeid]; diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 519cc15f800d..32c3f82d7377 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -829,8 +829,9 @@ void RegTree::ExpandNode(bst_node_t nid, unsigned split_index, bst_float split_v this->split_types_.HostVector().at(nid) = FeatureType::kNumerical; } -void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split_cond, - bool default_left, linalg::VectorView base_weight, +void RegTree::ExpandNode(Context const* ctx, bst_node_t nidx, bst_feature_t split_index, + float split_cond, bool default_left, + linalg::VectorView base_weight, linalg::VectorView left_weight, linalg::VectorView right_weight, float loss_chg, float sum_hess, float left_sum, float right_sum) { @@ -839,8 +840,8 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split CHECK(this->p_mt_tree_); CHECK_GT(param_.size_leaf_vector, 1); - this->p_mt_tree_->Expand(nidx, split_index, split_cond, default_left, base_weight, left_weight, - right_weight, loss_chg, sum_hess, left_sum, right_sum); + this->p_mt_tree_->Expand(ctx, nidx, split_index, split_cond, default_left, base_weight, + left_weight, right_weight, loss_chg, sum_hess, left_sum, right_sum); split_types_.HostVector().resize(this->Size(), FeatureType::kNumerical); split_categories_segments_.HostVector().resize(this->Size()); @@ -877,14 +878,14 @@ void RegTree::ExpandCategorical(bst_node_t nidx, bst_feature_t split_index, h_split_categories_segments.at(nidx).size = split_cat.size(); } -void RegTree::ExpandCategorical(bst_node_t nidx, bst_feature_t split_index, +void RegTree::ExpandCategorical(Context const* ctx, bst_node_t nidx, bst_feature_t split_index, common::Span split_cat, bool default_left, linalg::VectorView base_weight, linalg::VectorView left_weight, linalg::VectorView right_weight, float loss_chg, float sum_hess, float left_sum, float right_sum) { CHECK(IsMultiTarget()); - this->ExpandNode(nidx, split_index, DftBadValue(), default_left, base_weight, left_weight, + this->ExpandNode(ctx, nidx, split_index, DftBadValue(), default_left, base_weight, left_weight, right_weight, loss_chg, sum_hess, left_sum, right_sum); auto& h_split_categories = split_categories_.HostVector(); diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 09eb5f58c762..974631067ed5 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -164,7 +164,7 @@ class ColMaker : public TreeUpdater { colmaker_train_param_{colmaker_train_param}, ctx_{ctx}, column_sampler_{std::move(column_sampler)}, - tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()), + tree_evaluator_(ctx, param_, column_densities.size()), interaction_constraints_{std::move(_interaction_constraints)}, column_densities_(column_densities) {} // update one tree, growing @@ -186,7 +186,7 @@ class ColMaker : public TreeUpdater { int cleft = (*p_tree)[nid].LeftChild(); int cright = (*p_tree)[nid].RightChild(); - tree_evaluator_.AddSplit(nid, cleft, cright, snode_[nid].best.SplitIndex(), + tree_evaluator_.AddSplit(ctx_, nid, cleft, cright, snode_[nid].best.SplitIndex(), snode_[cleft].weight, snode_[cright].weight); interaction_constraints_.Split(nid, snode_[nid].best.SplitIndex(), cleft, cright); } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 9ebdf222d57c..57f77f96901a 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -145,7 +145,7 @@ struct GPUHistMakerDevice { std::shared_ptr column_sampler, MetaInfo const& info, std::vector batch_ptr, std::shared_ptr cuts, bool dense_compressed) - : evaluator_{_param, static_cast(info.num_col_), ctx->Device()}, + : evaluator_{ctx, _param, static_cast(info.num_col_)}, ctx_{ctx}, column_sampler_{std::move(column_sampler)}, batch_ptr_{std::move(batch_ptr)}, @@ -647,7 +647,7 @@ struct GPUHistMakerDevice { candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_hess, left_hess, right_hess); } - evaluator_.ApplyTreeSplit(candidate, p_tree); + evaluator_.ApplyTreeSplit(ctx_, candidate, p_tree); const auto& parent = tree[candidate.nidx]; interaction_constraints.Split(candidate.nidx, parent.SplitIndex(), parent.LeftChild(), diff --git a/src/tree/updater_gpu_hist.cuh b/src/tree/updater_gpu_hist.cuh index b0c0e00fb521..d5acefb89aee 100644 --- a/src/tree/updater_gpu_hist.cuh +++ b/src/tree/updater_gpu_hist.cuh @@ -235,7 +235,7 @@ class MultiTargetHistMaker { auto n_targets = d_gpair.Shape(1); // Calculate the root sum - this->evaluator_.AllocNodeSum(RegTree::kRoot, n_targets); + this->evaluator_.AllocNodeSum(RegTree::kRoot, n_targets, ctx_->CUDACtx()->Stream()); auto d_root_sum = this->evaluator_.GetNodeSum(RegTree::kRoot, n_targets); CalcRootSum(this->ctx_, d_gpair, d_root_sum); @@ -262,7 +262,7 @@ class MultiTargetHistMaker { auto weights = this->evaluator_.GetNodeWeights(n_targets); // Root's sum_hess is the sum of left and right child hessians float root_sum_hess = static_cast(entry.left_sum + entry.right_sum); - p_tree->SetRoot(linalg::MakeVec(this->ctx_->Device(), weights.Base(RegTree::kRoot)), + p_tree->SetRoot(this->ctx_, linalg::MakeVec(this->ctx_->Device(), weights.Base(RegTree::kRoot)), root_sum_hess); return entry; @@ -286,9 +286,10 @@ class MultiTargetHistMaker { float left_sum = static_cast(candidate.left_sum); float right_sum = static_cast(candidate.right_sum); float sum_hess = left_sum + right_sum; - p_tree->ExpandNode(candidate.nidx, candidate.split.findex, candidate.split.fvalue, - candidate.split.dir == kLeftDir, linalg::MakeVec(h_base_weight), - linalg::MakeVec(h_left_weight), linalg::MakeVec(h_right_weight), loss_chg, + p_tree->ExpandNode(this->ctx_, candidate.nidx, candidate.split.findex, + candidate.split.fvalue, candidate.split.dir == kLeftDir, + linalg::MakeVec(h_base_weight), linalg::MakeVec(h_left_weight), + linalg::MakeVec(h_right_weight), loss_chg, sum_hess, left_sum, right_sum); } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 988a493c102f..d83ce2be1608 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -253,7 +253,7 @@ class MultiTargetHistBuilder { for (bst_target_t t{0}; t < n_targets; ++t) { root_sum_hess += static_cast(h_root_sum(t).GetHess()); } - p_tree->SetRoot(weight_t, root_sum_hess); + p_tree->SetRoot(ctx_, weight_t, root_sum_hess); std::vector hists; std::vector nodes{{RegTree::kRoot, 0}}; diff --git a/tests/cpp/common/test_device_compression.cu b/tests/cpp/common/test_device_compression.cu index 99d246b0325b..9a44fd3da21b 100644 --- a/tests/cpp/common/test_device_compression.cu +++ b/tests/cpp/common/test_device_compression.cu @@ -26,7 +26,7 @@ TEST(NvComp, Snappy) { #endif auto ctx = MakeCUDACtx(0); auto cuctx = ctx.CUDACtx(); - dh::DeviceUVector in(1024); + dh::DeviceUVector in(1024, cuctx->Stream()); thrust::sequence(ctx.CUDACtx()->CTP(), in.begin(), in.end(), 0); dh::DeviceUVector compr; @@ -57,7 +57,7 @@ class TestNvComp : public ::testing::TestWithParam in(n_bytes); + dh::DeviceUVector in(n_bytes, cuctx->Stream()); thrust::sequence(ctx.CUDACtx()->CTP(), in.begin(), in.end(), 0); dh::DeviceUVector compr; diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index 16a847648eb7..9313f9584082 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -33,33 +33,35 @@ TEST(AsyncPoolAllocator, Basic) { #endif // !defined(XGBOOST_USE_RMM) TEST(DeviceUVector, Basic) { + auto stream = xgboost::curt::DefaultStream(); + GlobalMemoryLogger().Clear(); std::int32_t verbosity{3}; std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); DeviceUVector uvec; - uvec.resize(12); + uvec.resize(12, stream); auto peak = GlobalMemoryLogger().PeakMemory(); auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size(); ASSERT_EQ(peak, n_bytes); std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); - DeviceUVector uvec1{16}; + DeviceUVector uvec1{16, stream}; ASSERT_EQ(uvec1.size(), 16); - uvec1.resize(3); + uvec1.resize(3, stream); ASSERT_EQ(uvec1.size(), 3); ASSERT_EQ(uvec1.Capacity(), 16); ASSERT_EQ(std::distance(uvec1.begin(), uvec1.end()), uvec1.size()); auto orig = uvec1.size(); thrust::sequence(dh::CachingThrustPolicy(), uvec1.begin(), uvec1.end(), 0); - uvec1.resize(32); + uvec1.resize(32, stream); ASSERT_EQ(uvec1.size(), 32); ASSERT_EQ(uvec1.Capacity(), 32); auto eq = thrust::equal(dh::CachingThrustPolicy(), uvec1.cbegin(), uvec1.cbegin() + orig, thrust::make_counting_iterator(0)); ASSERT_TRUE(eq); - uvec1.clear(); + uvec1.clear(stream); ASSERT_EQ(uvec1.size(), 0); ASSERT_EQ(uvec1.Capacity(), 32); } @@ -151,7 +153,7 @@ TEST(TestVirtualMem, Version) { PinnedMemory pinned; #if defined(xgboost_IS_WIN) ASSERT_FALSE(pinned.IsVm()); -#else // defined(xgboost_IS_WIN) +#else // defined(xgboost_IS_WIN) if (major == 12 && minor >= 5 || major > 12) { ASSERT_TRUE(pinned.IsVm()); } else { diff --git a/tests/cpp/common/test_gpu_compressed_iterator.cu b/tests/cpp/common/test_gpu_compressed_iterator.cu index e633eefb9599..c5d149956a9e 100644 --- a/tests/cpp/common/test_gpu_compressed_iterator.cu +++ b/tests/cpp/common/test_gpu_compressed_iterator.cu @@ -32,11 +32,9 @@ struct ReadSymbolFunction { CompressedIterator ci; int* output_data_d; ReadSymbolFunction(CompressedIterator ci, int* output_data_d) - : ci(ci), output_data_d(output_data_d) {} + : ci(ci), output_data_d(output_data_d) {} - __device__ void operator()(size_t i) { - output_data_d[i] = ci[i]; - } + __device__ void operator()(size_t i) { output_data_d[i] = ci[i]; } }; TEST(CompressedIterator, TestGPU) { @@ -49,20 +47,17 @@ TEST(CompressedIterator, TestGPU) { for (auto alphabet_size : test_cases) { for (int i = 0; i < repetitions; i++) { std::vector input(num_elements); - std::generate(input.begin(), input.end(), - [=]() { return rand() % alphabet_size; }); + std::generate(input.begin(), input.end(), [=]() { return rand() % alphabet_size; }); CompressedBufferWriter cbw(alphabet_size); thrust::device_vector input_d(input); thrust::device_vector buffer_d( - CompressedBufferWriter::CalculateBufferSize(input.size(), - alphabet_size)); + CompressedBufferWriter::CalculateBufferSize(input.size(), alphabet_size)); // write the data on device auto input_data_d = input_d.data().get(); auto buffer_data_d = buffer_d.data().get(); - dh::LaunchN(input_d.size(), - WriteSymbolFunction(cbw, buffer_data_d, input_data_d)); + dh::LaunchN(input_d.size(), WriteSymbolFunction(cbw, buffer_data_d, input_data_d)); // read the data on device CompressedIterator ci(buffer_d.data().get(), alphabet_size); @@ -89,7 +84,7 @@ class TestDoubleCompressedIter : public ::testing::TestWithParam { std::size_t n_symbols_{11}; void SetUp() override { - input_.resize(n_symbols_ * 3); + input_.resize(n_symbols_ * 3, ctx_.CUDACtx()->Stream()); auto policy = ctx_.CUDACtx()->CTP(); for (std::size_t i = 0; i < 3; ++i) { auto beg = input_.begin() + n_symbols_ * i; diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 6c5763c58cab..25ca3b83f120 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -314,7 +314,7 @@ template auto MakeUnweightedCutsForTest(Context const* ctx, Adapter adapter, int32_t num_bins, float missing) { HostDeviceVector ft; - SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), DeviceOrd::CUDA(0)); + SketchContainer sketch_container(ctx, ft, num_bins, adapter.NumColumns()); MetaInfo info; AdapterDeviceSketch(ctx, adapter.Value(), num_bins, info, missing, &sketch_container); return sketch_container.MakeCuts(ctx, info.IsColumnSplit()); @@ -371,7 +371,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, int32_t num_b } ASSERT_EQ(info.feature_types.Size(), 1); - SketchContainer container(info.feature_types, num_bins, 1, DeviceOrd::CUDA(0)); + SketchContainer container(&ctx, info.feature_types, num_bins, 1); AdapterDeviceSketch(&ctx, adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &container); auto cuts = container.MakeCuts(&ctx, info.IsColumnSplit()); @@ -590,7 +590,7 @@ void TestAdapterSketchFromWeights(bool with_group) { data::CupyAdapter adapter(m); auto const& batch = adapter.Value(); HostDeviceVector ft; - SketchContainer sketch_container(ft, kBins, kCols, DeviceOrd::CUDA(0)); + SketchContainer sketch_container(&ctx, ft, kBins, kCols); AdapterDeviceSketch(&ctx, adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); @@ -629,7 +629,7 @@ void TestAdapterSketchFromWeights(bool with_group) { // https://github.com/dmlc/xgboost/issues/7946 h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast(kGroups); } - SketchContainer sketch_container{ft, kBins, kCols, DeviceOrd::CUDA(0)}; + SketchContainer sketch_container{&ctx, ft, kBins, kCols}; AdapterDeviceSketch(&ctx, adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); weighted = sketch_container.MakeCuts(&ctx, info.IsColumnSplit()); diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index 7c3c2cd070cf..4066919c0723 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -1,9 +1,10 @@ /** - * Copyright 2018-2024, XGBoost contributors + * Copyright 2018-2026, XGBoost contributors */ #include #include #include +#include #include #include "../../../src/common/cuda_rt_utils.h" // for SetDevice @@ -25,28 +26,22 @@ struct HostDeviceVectorSetDeviceHandler { SetCudaSetDeviceHandler(f); } - ~HostDeviceVectorSetDeviceHandler() { - SetCudaSetDeviceHandler(nullptr); - } + ~HostDeviceVectorSetDeviceHandler() { SetCudaSetDeviceHandler(nullptr); } }; -void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector *v) { - // create the vector - v->SetDevice(device); +void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector* v, + Context const* ctx) { + v->SetDevice(device, ctx); v->Resize(n); ASSERT_EQ(v->Size(), n); ASSERT_EQ(v->Device(), device); - // ensure that the device have read-write access ASSERT_TRUE(v->DeviceCanRead()); ASSERT_TRUE(v->DeviceCanWrite()); - // ensure that the host has no access ASSERT_FALSE(v->HostCanRead()); ASSERT_FALSE(v->HostCanWrite()); - // fill in the data on the host - std::vector& data_h = v->HostVector(); - // ensure that the host has full access, while the device have none + std::vector& data_h = v->HostVector(ctx); ASSERT_TRUE(v->HostCanRead()); ASSERT_TRUE(v->HostCanWrite()); ASSERT_FALSE(v->DeviceCanRead()); @@ -55,137 +50,134 @@ void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector *v) std::copy_n(thrust::make_counting_iterator(0), n, data_h.begin()); } -void PlusOne(HostDeviceVector *v) { +void PlusOne(HostDeviceVector* v) { auto device = v->Device(); SetDeviceForTest(device); thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v), - [=]__device__(unsigned int a){ return a + 1; }); + [=] __device__(unsigned int a) { return a + 1; }); ASSERT_TRUE(v->DeviceCanWrite()); } -void CheckDevice(HostDeviceVector* v, - size_t size, - unsigned int first, - GPUAccess access) { +void CheckDevice(HostDeviceVector* v, size_t size, unsigned int first, GPUAccess access) { ASSERT_EQ(v->Size(), size); SetDeviceForTest(v->Device()); - ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v), - thrust::make_counting_iterator(first))); + ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v), thrust::make_counting_iterator(first))); ASSERT_TRUE(v->DeviceCanRead()); - // ensure that the device has at most the access specified by access ASSERT_EQ(v->DeviceCanWrite(), access == GPUAccess::kWrite); ASSERT_EQ(v->HostCanRead(), access == GPUAccess::kRead); ASSERT_FALSE(v->HostCanWrite()); - ASSERT_TRUE(thrust::equal(dh::tbegin(*v), dh::tend(*v), - thrust::make_counting_iterator(first))); + ASSERT_TRUE(thrust::equal(dh::tbegin(*v), dh::tend(*v), thrust::make_counting_iterator(first))); ASSERT_TRUE(v->DeviceCanRead()); ASSERT_TRUE(v->DeviceCanWrite()); ASSERT_FALSE(v->HostCanRead()); ASSERT_FALSE(v->HostCanWrite()); } -void CheckHost(HostDeviceVector *v, GPUAccess access) { - const std::vector& data_h = access == GPUAccess::kNone ? - v->HostVector() : v->ConstHostVector(); +void CheckHost(HostDeviceVector* v, GPUAccess access, Context const* ctx) { + const std::vector& data_h = + access == GPUAccess::kNone ? v->HostVector(ctx) : v->ConstHostVector(ctx); for (size_t i = 0; i < v->Size(); ++i) { ASSERT_EQ(data_h.at(i), i + 1); } ASSERT_TRUE(v->HostCanRead()); ASSERT_EQ(v->HostCanWrite(), access == GPUAccess::kNone); ASSERT_EQ(v->DeviceCanRead(), access == GPUAccess::kRead); - // the devices should have no write access ASSERT_FALSE(v->DeviceCanWrite()); } -void TestHostDeviceVector(size_t n, DeviceOrd device) { +void TestHostDeviceVector(size_t n, Context const* ctx) { HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice); HostDeviceVector v; - InitHostDeviceVector(n, device, &v); + InitHostDeviceVector(n, ctx->Device(), &v, ctx); CheckDevice(&v, n, 0, GPUAccess::kRead); PlusOne(&v); CheckDevice(&v, n, 1, GPUAccess::kWrite); - CheckHost(&v, GPUAccess::kRead); - CheckHost(&v, GPUAccess::kNone); + CheckHost(&v, GPUAccess::kRead, ctx); + CheckHost(&v, GPUAccess::kNone, ctx); } TEST(HostDeviceVector, Basic) { size_t n = 1001; - DeviceOrd device = DeviceOrd::CUDA(0); - TestHostDeviceVector(n, device); + auto ctx = Context{}.MakeCUDA(0); + TestHostDeviceVector(n, &ctx); } TEST(HostDeviceVector, Copy) { size_t n = 1001; - auto device = DeviceOrd::CUDA(0); + auto ctx = Context{}.MakeCUDA(0); HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice); HostDeviceVector v; { - // a separate scope to ensure that v1 is gone before further checks HostDeviceVector v1; - InitHostDeviceVector(n, device, &v1); + InitHostDeviceVector(n, ctx.Device(), &v1, &ctx); v.Resize(v1.Size()); - v.Copy(v1); + v.Copy(v1, &ctx); } CheckDevice(&v, n, 0, GPUAccess::kRead); PlusOne(&v); CheckDevice(&v, n, 1, GPUAccess::kWrite); - CheckHost(&v, GPUAccess::kRead); - CheckHost(&v, GPUAccess::kNone); + CheckHost(&v, GPUAccess::kRead, &ctx); + CheckHost(&v, GPUAccess::kNone, &ctx); } TEST(HostDeviceVector, SetDevice) { - std::vector h_vec (2345); + auto ctx = Context{}.MakeCUDA(0); + + std::vector h_vec(2345); for (size_t i = 0; i < h_vec.size(); ++i) { h_vec[i] = i; } - HostDeviceVector vec (h_vec); - auto device = DeviceOrd::CUDA(0); + HostDeviceVector vec(h_vec); - vec.SetDevice(device); + vec.SetDevice(ctx.Device(), &ctx); ASSERT_EQ(vec.Size(), h_vec.size()); - vec.DeviceSpan(); // sync to device + vec.DeviceSpan(&ctx); // sync to device - vec.SetDevice(DeviceOrd::CPU()); // pull back to cpu. + vec.SetDevice(DeviceOrd::CPU(), &ctx); // pull back to cpu. ASSERT_EQ(vec.Size(), h_vec.size()); ASSERT_EQ(vec.Device(), DeviceOrd::CPU()); - auto h_vec_1 = vec.HostVector(); + auto h_vec_1 = vec.HostVector(&ctx); ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); } TEST(HostDeviceVector, Span) { - HostDeviceVector vec {1.0f, 2.0f, 3.0f, 4.0f}; - vec.SetDevice(DeviceOrd::CUDA(0)); - auto span = vec.DeviceSpan(); + auto ctx = Context{}.MakeCUDA(0); + + HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; + vec.SetDevice(ctx.Device(), &ctx); + auto span = vec.DeviceSpan(&ctx); ASSERT_EQ(vec.Size(), span.size()); - ASSERT_EQ(vec.DevicePointer(), span.data()); - auto const_span = vec.ConstDeviceSpan(); + ASSERT_EQ(vec.DevicePointer(&ctx), span.data()); + auto const_span = vec.ConstDeviceSpan(&ctx); ASSERT_EQ(vec.Size(), const_span.size()); - ASSERT_EQ(vec.ConstDevicePointer(), const_span.data()); + ASSERT_EQ(vec.ConstDevicePointer(&ctx), const_span.data()); - auto h_span = vec.ConstHostSpan(); + auto h_span = vec.ConstHostSpan(&ctx); ASSERT_TRUE(vec.HostCanRead()); ASSERT_FALSE(vec.HostCanWrite()); ASSERT_EQ(h_span.size(), vec.Size()); - ASSERT_EQ(h_span.data(), vec.ConstHostPointer()); + ASSERT_EQ(h_span.data(), vec.ConstHostPointer(&ctx)); - h_span = vec.HostSpan(); + h_span = vec.HostSpan(&ctx); ASSERT_TRUE(vec.HostCanWrite()); } TEST(HostDeviceVector, Empty) { - HostDeviceVector vec {1.0f, 2.0f, 3.0f, 4.0f}; - HostDeviceVector another { std::move(vec) }; + HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; + HostDeviceVector another{std::move(vec)}; ASSERT_FALSE(another.Empty()); ASSERT_TRUE(vec.Empty()); } TEST(HostDeviceVector, Resize) { + auto ctx = Context{}.MakeCUDA(0); + auto check = [&](HostDeviceVector const& vec) { - auto const& h_vec = vec.ConstHostSpan(); + auto const& h_vec = vec.ConstHostSpan(&ctx); for (std::size_t i = 0; i < 4; ++i) { ASSERT_EQ(h_vec[i], i + 1); } @@ -195,26 +187,26 @@ TEST(HostDeviceVector, Resize) { }; { HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; - vec.SetDevice(DeviceOrd::CUDA(0)); - vec.ConstDeviceSpan(); + vec.SetDevice(ctx.Device(), &ctx); + vec.ConstDeviceSpan(&ctx); ASSERT_TRUE(vec.DeviceCanRead()); ASSERT_FALSE(vec.DeviceCanWrite()); - vec.DeviceSpan(); - vec.Resize(7, 3.0f); + vec.DeviceSpan(&ctx); + vec.Resize(&ctx, 7, 3.0f); ASSERT_TRUE(vec.DeviceCanWrite()); check(vec); } { - HostDeviceVector vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)}; + HostDeviceVector vec{{1.0f, 2.0f, 3.0f, 4.0f}, ctx.Device(), &ctx}; ASSERT_TRUE(vec.DeviceCanWrite()); - vec.Resize(7, 3.0f); + vec.Resize(&ctx, 7, 3.0f); ASSERT_TRUE(vec.DeviceCanWrite()); check(vec); } { HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; ASSERT_TRUE(vec.HostCanWrite()); - vec.Resize(7, 3.0f); + vec.Resize(&ctx, 7, 3.0f); ASSERT_TRUE(vec.HostCanWrite()); check(vec); } diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu index 6a34513db5b1..28060adca8d9 100644 --- a/tests/cpp/common/test_linalg.cu +++ b/tests/cpp/common/test_linalg.cu @@ -1,5 +1,5 @@ /** - * Copyright 2021-2025, XGBoost Contributors + * Copyright 2021-2026, XGBoost Contributors */ #include #include // for equal @@ -21,17 +21,17 @@ namespace { void TestElementWiseKernel() { auto ctx = MakeCUDACtx(0); auto device = ctx.Device(); - Tensor l{{2, 3, 4}, device}; + Tensor l{{2, 3, 4}, device, kC, &ctx}; { /** * Non-contiguous */ // GPU view - auto t = l.View(device).Slice(linalg::All(), 1, linalg::All()); + auto t = l.View(device, &ctx).Slice(linalg::All(), 1, linalg::All()); ASSERT_FALSE(t.CContiguous()); cuda_impl::TransformIdxKernel(&ctx, t, [] XGBOOST_DEVICE(std::size_t i, float) { return i; }); // CPU view - t = l.View(DeviceOrd::CPU()).Slice(linalg::All(), 1, linalg::All()); + t = l.View(DeviceOrd::CPU(), &ctx).Slice(linalg::All(), 1, linalg::All()); std::size_t k = 0; for (size_t i = 0; i < l.Shape(0); ++i) { for (size_t j = 0; j < l.Shape(2); ++j) { @@ -39,7 +39,7 @@ void TestElementWiseKernel() { } } - t = l.View(device).Slice(linalg::All(), 1, linalg::All()); + t = l.View(device, &ctx).Slice(linalg::All(), 1, linalg::All()); cuda_impl::ElementWiseKernel( t, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable { t(i, j) = i + j; }); @@ -55,11 +55,11 @@ void TestElementWiseKernel() { /** * Contiguous */ - auto t = l.View(device); + auto t = l.View(device, &ctx); cuda_impl::TransformIdxKernel(&ctx, t, [] XGBOOST_DEVICE(size_t i, float) { return i; }); ASSERT_TRUE(t.CContiguous()); // CPU view - t = l.View(DeviceOrd::CPU()); + t = l.View(DeviceOrd::CPU(), &ctx); size_t ind = 0; for (size_t i = 0; i < l.Shape(0); ++i) { @@ -138,7 +138,7 @@ TEST(Linalg, SmallHistogram) { linalg::MatrixView indices = linalg::MakeTensorView(&ctx, dh::ToSpan(values), values.size(), 1); - dh::CachingDeviceUVector bins(n_bins); + dh::CachingDeviceUVector bins(n_bins, ctx.CUDACtx()->Stream()); HostDeviceVector weights; SmallHistogram(&ctx, indices, common::MakeOptionalWeights(ctx.Device(), weights), linalg::MakeTensorView(&ctx, dh::ToSpan(bins), bins.size())); diff --git a/tests/cpp/common/test_linalg.h b/tests/cpp/common/test_linalg.h index d79ed1422931..8ebeab5add3a 100644 --- a/tests/cpp/common/test_linalg.h +++ b/tests/cpp/common/test_linalg.h @@ -1,5 +1,5 @@ /** - * Copyright 2025, XGBoost Contributors + * Copyright 2025-2026, XGBoost Contributors */ #pragma once @@ -19,8 +19,9 @@ void TestLinalgDispatch(Context const* ctx, Fn&& fn) { std::iota(data.begin(), data.end(), 0.0); Vector vec(data.begin(), data.end(), {data.size()}, DeviceOrd::CPU()); - TransformKernel(ctx, vec.View(ctx->Device()), [=] XGBOOST_DEVICE(double v) { return fn(v); }); - auto h_v = vec.HostView(); + TransformKernel(ctx, vec.View(ctx->Device(), ctx), + [=] XGBOOST_DEVICE(double v) { return fn(v); }); + auto h_v = vec.HostView(ctx); for (std::size_t i = 0; i < h_v.Size(); ++i) { ASSERT_EQ(h_v(i), fn(i)); } diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 43090312cef5..47a4ac5def83 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -40,7 +40,7 @@ TEST(GPUQuantile, Basic) { auto ctx = MakeCUDACtx(0); constexpr size_t kCols = 100, kBins = 256; HostDeviceVector ft; - SketchContainer sketch(ft, kBins, kCols, ctx.Device()); + SketchContainer sketch(&ctx, ft, kBins, kCols); dh::caching_device_vector entries; dh::device_vector cuts_ptr(kCols + 1); thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0); @@ -86,7 +86,7 @@ TEST(GPUQuantile, Prune) { RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { auto ctx = MakeCUDACtx(0); HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); + SketchContainer sketch(&ctx, ft, n_bins, kCols); HostDeviceVector storage; std::string interface_str = RandomDataGenerator{kRows, kCols, 0} @@ -124,7 +124,7 @@ TEST(GPUQuantile, PruneDuplicated) { RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { auto ctx = MakeCUDACtx(0); HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); + SketchContainer sketch(&ctx, ft, n_bins, kCols); HostDeviceVector storage; std::string interface_str = RandomDataGenerator{kRows, kCols, 0} @@ -165,7 +165,7 @@ TEST(GPUQuantile, MergeEmpty) { size_t n_bins = 10; auto ctx = MakeCUDACtx(0); HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device()); + SketchContainer sketch_0(&ctx, ft, n_bins, kCols); HostDeviceVector storage_0; std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}.Device(ctx.Device()).GenerateArrayInterface(&storage_0); @@ -205,7 +205,7 @@ TEST(GPUQuantile, MergeBasic) { RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { auto ctx = MakeCUDACtx(0); HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device()); + SketchContainer sketch_0(&ctx, ft, n_bins, kCols); HostDeviceVector storage_0; std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0} .Device(ctx.Device()) @@ -215,7 +215,7 @@ TEST(GPUQuantile, MergeBasic) { AdapterDeviceSketch(&ctx, adapter_0.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_0); - SketchContainer sketch_1(ft, n_bins, kCols, ctx.Device()); + SketchContainer sketch_1(&ctx, ft, n_bins, kCols); HostDeviceVector storage_1; std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0} .Device(ctx.Device()) @@ -249,7 +249,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { MetaInfo info; int32_t seed = 0; HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, cols, ctx.Device()); + SketchContainer sketch_0(&ctx, ft, n_bins, cols); HostDeviceVector storage_0; std::string interface_str_0 = RandomDataGenerator{rows, cols, 0} .Device(ctx.Device()) @@ -260,7 +260,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { std::numeric_limits::quiet_NaN(), &sketch_0); size_t f_rows = rows * frac; - SketchContainer sketch_1(ft, n_bins, cols, ctx.Device()); + SketchContainer sketch_1(&ctx, ft, n_bins, cols); HostDeviceVector storage_1; std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0} .Device(ctx.Device()) @@ -316,8 +316,8 @@ TEST(GPUQuantile, MergeCategorical) { HostDeviceVector ft; ft.HostVector() = {FeatureType::kCategorical, FeatureType::kNumerical}; - SketchContainer sketch_0(ft, n_bins, kCols, ctx.Device()); - SketchContainer sketch_1(ft, n_bins, kCols, ctx.Device()); + SketchContainer sketch_0(&ctx, ft, n_bins, kCols); + SketchContainer sketch_1(&ctx, ft, n_bins, kCols); std::vector entries_0{{0, 0.0f}, {0, 0.0f}, {0, 1.0f}, {0, 2.0f}, {0, 2.0f}, {1, 0.1f}, {1, 0.2f}, {1, 0.4f}}; @@ -358,7 +358,7 @@ TEST(GPUQuantile, MultiMerge) { // Set up single node version HostDeviceVector ft; auto ctx = MakeCUDACtx(0); - SketchContainer sketch_on_single_node(ft, n_bins, kCols, ctx.Device()); + SketchContainer sketch_on_single_node(&ctx, ft, n_bins, kCols); size_t intermediate_num_cuts = std::min(kRows * world, static_cast(n_bins * WQSketch::kFactor)); @@ -371,7 +371,7 @@ TEST(GPUQuantile, MultiMerge) { .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); HostDeviceVector ft; - containers.emplace_back(ft, n_bins, kCols, ctx.Device()); + containers.emplace_back(&ctx, ft, n_bins, kCols); AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &containers.back()); } @@ -421,7 +421,7 @@ void TestAllReduceBasic() { * Set up single node version. */ HostDeviceVector ft({}, device); - SketchContainer sketch_on_single_node(ft, n_bins, kCols, device); + SketchContainer sketch_on_single_node(&ctx, ft, n_bins, kCols); size_t intermediate_num_cuts = std::min(kRows * world, static_cast(n_bins * WQSketch::kFactor)); @@ -434,7 +434,7 @@ void TestAllReduceBasic() { .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); HostDeviceVector ft({}, device); - containers.emplace_back(ft, n_bins, kCols, device); + containers.emplace_back(&ctx, ft, n_bins, kCols); AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &containers.back()); } @@ -450,7 +450,7 @@ void TestAllReduceBasic() { * the exact same copy of data. */ auto rank = collective::GetRank(); - SketchContainer sketch_distributed(ft, n_bins, kCols, device); + SketchContainer sketch_distributed(&ctx, ft, n_bins, kCols); HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} .Device(device) @@ -561,7 +561,7 @@ void TestSameOnAllWorkers() { auto const device = DeviceOrd::CUDA(GPUIDX); Context ctx = MakeCUDACtx(device.ordinal); HostDeviceVector ft({}, device); - SketchContainer sketch_distributed(ft, n_bins, kCols, device); + SketchContainer sketch_distributed(&ctx, ft, n_bins, kCols); HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} .Device(device) @@ -637,7 +637,7 @@ TEST(GPUQuantile, Push) { columns_ptr[1] = kRows; HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); + SketchContainer sketch(&ctx, ft, n_bins, kCols); sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, kRows, {}); @@ -678,7 +678,7 @@ TEST(GPUQuantile, MultiColPush) { int32_t n_bins = 16; HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, ctx.Device()); + SketchContainer sketch(&ctx, ft, n_bins, kCols); dh::device_vector d_entries{entries}; dh::device_vector columns_ptr(kCols + 1, 0); diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 3cb357845a0b..cd30f91fa8fc 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -249,7 +249,7 @@ void RandomDataGenerator::GenerateDense(HostDeviceVector* out) const { CHECK(out); SimpleLCG lcg{lcg_}; - out->Resize(rows_ * cols_, 0); + out->Resize(rows_ * cols_); auto& h_data = out->HostVector(); float sparsity = sparsity_ * (upper_ - lower_) + lower_; for (auto& v : h_data) { diff --git a/tests/cpp/objective/test_objective.cc b/tests/cpp/objective/test_objective.cc index 5df765789c3c..f98159654726 100644 --- a/tests/cpp/objective/test_objective.cc +++ b/tests/cpp/objective/test_objective.cc @@ -27,6 +27,7 @@ TEST(Objective, PredTransform) { xgboost::Context tparam; tparam.UpdateAllowUnknown(Args{{"device", "cuda"}}); size_t n = 100; + Context ctx; for (const auto& entry : ::dmlc::Registry<::xgboost::ObjFunctionReg>::List()) { std::unique_ptr obj{xgboost::ObjFunction::Create(entry->name, &tparam)}; @@ -40,7 +41,7 @@ TEST(Objective, PredTransform) { obj->Configure(Args{{"expectile_alpha", "0.5"}}); } HostDeviceVector predts; - predts.Resize(n, 3.14f); // prediction is performed on host. + predts.Resize(&ctx, n, 3.14f); // prediction is performed on host. ASSERT_FALSE(predts.DeviceCanRead()); obj->PredTransform(&predts); ASSERT_FALSE(predts.DeviceCanRead()); diff --git a/tests/cpp/objective/test_objective_helpers.h b/tests/cpp/objective/test_objective_helpers.h index 4d049772b98f..eaaf25c3b744 100644 --- a/tests/cpp/objective/test_objective_helpers.h +++ b/tests/cpp/objective/test_objective_helpers.h @@ -14,7 +14,7 @@ namespace xgboost { inline auto MakePositionsForTest(bst_idx_t n_samples, bst_node_t left_nidx, bst_node_t right_nidx, HostDeviceVector* p_position) { HostDeviceVector& position = *p_position; - position.Resize(n_samples, 0); + position.Resize(n_samples); auto& h_position = position.HostVector(); for (size_t i = 0; i < n_samples; ++i) { if (i < n_samples / 2) { diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index b8b97c701611..a1457490fa38 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -215,7 +215,7 @@ void TestUpdatePredictionCache(bool use_subsampling) { } PredictionCacheEntry predtion_cache; - predtion_cache.predictions.Resize(kRows * kClasses, 0); + predtion_cache.predictions.Resize(&ctx, kRows * kClasses, 0.0f); // after one training iteration predtion_cache is filled with cached in QuantileHistMaker // prediction values gbm->DoBoost(dmat.get(), &gpair, &predtion_cache, nullptr); diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index 5cb8e7df425d..4d3e7c7523fb 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -768,8 +768,8 @@ void TestVectorLeafPrediction(Context const *ctx) { std::vector r_w(mparam.LeafLength(), 2.0f); auto &tree = trees.front(); - tree->SetRoot(linalg::MakeVec(p_w.data(), p_w.size()), /*sum_hess=*/1.0f); - tree->ExpandNode(0, static_cast(1), 2.0, true, + tree->SetRoot(ctx, linalg::MakeVec(p_w.data(), p_w.size()), /*sum_hess=*/1.0f); + tree->ExpandNode(ctx, 0, static_cast(1), 2.0, true, linalg::MakeVec(p_w.data(), p_w.size()), linalg::MakeVec(l_w.data(), l_w.size()), linalg::MakeVec(r_w.data(), r_w.size()), /*loss_chg=*/0.5f, /*sum_hess=*/1.0f, /*left_sum=*/0.6f, /*right_sum=*/0.4f); diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index c4c34ecafc61..ed0172c30749 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -57,7 +57,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { cuts_.cut_values_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), ctx.Device()}; + GPUHistEvaluator evaluator{&ctx, param_, static_cast(feature_set.size())}; evaluator.Reset(&ctx, cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; @@ -98,7 +98,7 @@ TEST(GpuHist, PartitionBasic) { false, }; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + GPUHistEvaluator evaluator{&ctx, tparam, static_cast(feature_set.size())}; evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false); { @@ -208,7 +208,7 @@ TEST(GpuHist, PartitionTwoFeatures) { cuts.cut_values_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + GPUHistEvaluator evaluator{&ctx, tparam, static_cast(feature_set.size())}; evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false); { @@ -266,7 +266,7 @@ TEST(GpuHist, PartitionTwoNodes) { cuts.cut_values_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + GPUHistEvaluator evaluator{&ctx, tparam, static_cast(feature_set.size())}; evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false); { @@ -321,7 +321,7 @@ void TestEvaluateSingleSplit(bool is_categorical) { cuts.cut_values_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + GPUHistEvaluator evaluator{&ctx, tparam, static_cast(feature_set.size())}; evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; @@ -354,7 +354,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { EvaluateSplitSharedInputs shared_inputs{ param, quantiser, {}, dh::ToSpan(feature_segments), dh::ToSpan(feature_values), false}; - GPUHistEvaluator evaluator(tparam, feature_set.size(), FstCU()); + GPUHistEvaluator evaluator(&ctx, tparam, feature_set.size()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.findex, 0); @@ -367,7 +367,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { TEST(GpuHist, EvaluateSingleSplitEmpty) { auto ctx = MakeCUDACtx(0); TrainParam tparam = ZeroParam(); - GPUHistEvaluator evaluator(tparam, 1, FstCU()); + GPUHistEvaluator evaluator(&ctx, tparam, 1); DeviceSplitCandidate result = evaluator .EvaluateSingleSplit( @@ -398,7 +398,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { EvaluateSplitSharedInputs shared_inputs{ param, quantiser, {}, dh::ToSpan(feature_segments), dh::ToSpan(feature_values), false}; - GPUHistEvaluator evaluator(tparam, 2, FstCU()); + GPUHistEvaluator evaluator(&ctx, tparam, 2); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.findex, 1); @@ -426,7 +426,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { EvaluateSplitSharedInputs shared_inputs{ param, quantiser, {}, dh::ToSpan(feature_segments), dh::ToSpan(feature_values), false}; - GPUHistEvaluator evaluator(tparam, 2, FstCU()); + GPUHistEvaluator evaluator(&ctx, tparam, 2); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.findex, 0); @@ -456,7 +456,7 @@ TEST(GpuHist, EvaluateSplits) { EvaluateSplitSharedInputs shared_inputs{ param, quantiser, {}, dh::ToSpan(feature_segments), dh::ToSpan(feature_values), false}; - GPUHistEvaluator evaluator{tparam, 2, FstCU()}; + GPUHistEvaluator evaluator{&ctx, tparam, 2}; dh::device_vector inputs = std::vector{input_left, input_right}; evaluator.LaunchEvaluateSplits(&ctx, input_left.feature_set.size(), dh::ToSpan(inputs), @@ -474,7 +474,7 @@ TEST(GpuHist, EvaluateSplits) { TEST_F(TestPartitionBasedSplit, GpuHist) { auto ctx = MakeCUDACtx(0); dh::device_vector ft{std::vector{FeatureType::kCategorical}}; - GPUHistEvaluator evaluator{param_, static_cast(info_.num_col_), ctx.Device()}; + GPUHistEvaluator evaluator{&ctx, param_, static_cast(info_.num_col_)}; cuts_.cut_ptrs_.SetDevice(ctx.Device()); cuts_.cut_values_.SetDevice(ctx.Device()); @@ -539,7 +539,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) { cuts.cut_values_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + GPUHistEvaluator evaluator{&ctx, tparam, static_cast(feature_set.size())}; evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 15aca958504a..3ade5c842d45 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -32,7 +32,7 @@ void TestUpdatePositionBatch() { EXPECT_EQ(rows[i], i); } std::vector extra_data = {0}; - dh::DeviceUVector ridx_tmp(kNumRows); + dh::DeviceUVector ridx_tmp(kNumRows, ctx.CUDACtx()->Stream()); // Send the first five training instances to the right node // and the second 5 to the left node rp.UpdatePositionBatch( @@ -65,7 +65,7 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector op_data(segments.size()); std::vector> h_batch_info(segments.size()); dh::TemporaryArray> d_batch_info(segments.size()); @@ -85,7 +85,7 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector()); partitioners.back()->Reset(&ctx, page.Size(), page.BaseRowId()); - dh::DeviceUVector ridx_tmp(page.Size()); + dh::DeviceUVector ridx_tmp(page.Size(), ctx.CUDACtx()->Stream()); std::vector splits{tree[0]}; page.Impl()->Visit(&ctx, {}, [&](auto&& acc) { partitioners.back()->UpdatePositionBatch(&ctx, {0}, {1}, {2}, splits, dh::ToSpan(ridx_tmp), @@ -205,7 +205,7 @@ void TestEmptyNode(std::int32_t n_workers) { bst_idx_t base_rowid = 0; partitioner.Reset(&ctx, n_samples, base_rowid); std::vector splits(1); - dh::DeviceUVector ridx_tmp(n_samples); + dh::DeviceUVector ridx_tmp(n_samples, ctx.CUDACtx()->Stream()); partitioner.UpdatePositionBatch( &ctx, {0}, {1}, {2}, splits, dh::ToSpan(ridx_tmp), [] XGBOOST_DEVICE(bst_idx_t ridx, std::int32_t /*nidx_in_batch*/, RegTree::Node) { diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 4296ba49a3f4..a7328e86eca6 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -57,11 +57,11 @@ void TestPartitionBasedSplit::SetUp() { total_gpair_ += e; } - auto enumerate = [this, n_feat = info_.num_col_](common::GHistRow hist, - GradientPairPrecise parent_sum) { + auto enumerate = [this, &ctx, n_feat = info_.num_col_](common::GHistRow hist, + GradientPairPrecise parent_sum) { int32_t best_thresh = -1; float best_score{-std::numeric_limits::infinity()}; - TreeEvaluator evaluator{param_, static_cast(n_feat), DeviceOrd::CPU()}; + TreeEvaluator evaluator{&ctx, param_, static_cast(n_feat)}; auto tree_evaluator = evaluator.GetEvaluator(); GradientPairPrecise left_sum; auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_}); @@ -205,7 +205,7 @@ TEST(HistMultiEvaluator, Evaluate) { for (bst_target_t t{0}; t < n_targets; ++t) { root_sum_hess += static_cast(root_sum.HostView()(t).GetHess()); } - tree.SetRoot(weight.HostView(), root_sum_hess); + tree.SetRoot(&ctx, weight.HostView(), root_sum_hess); auto w = weight.HostView(); ASSERT_EQ(w.Size(), n_targets); ASSERT_EQ(w(0), -1.5); @@ -410,7 +410,7 @@ TEST(HistMultiEvaluator, CategoricalOneHot) { for (bst_target_t t = 0; t < n_targets; ++t) { root_sum_hess += static_cast(root_sum.HostView()(t).GetHess()); } - tree.SetRoot(weight.HostView(), root_sum_hess); + tree.SetRoot(&ctx, weight.HostView(), root_sum_hess); std::vector entries(1, {0, 0}); std::vector ptrs; diff --git a/tests/cpp/tree/test_multi_target_tree_model.cc b/tests/cpp/tree/test_multi_target_tree_model.cc index 88118ca31136..892381a93366 100644 --- a/tests/cpp/tree/test_multi_target_tree_model.cc +++ b/tests/cpp/tree/test_multi_target_tree_model.cc @@ -28,11 +28,12 @@ std::unique_ptr MakeMtTreeForTest(bst_target_t n_targets) { std::iota(h_data.begin(), h_data.end(), init); }; + Context ctx; linalg::Vector base_weight; base_weight.ModifyInplace([&](HostDeviceVector* data, common::Span shape) { iota_weights(1.0f, data, shape); }); - tree->SetRoot(base_weight.HostView(), /*sum_hess=*/1.0f); + tree->SetRoot(&ctx, base_weight.HostView(), /*sum_hess=*/1.0f); linalg::Vector left_weight; left_weight.ModifyInplace([&](HostDeviceVector* data, common::Span shape) { @@ -43,7 +44,7 @@ std::unique_ptr MakeMtTreeForTest(bst_target_t n_targets) { iota_weights(3.0f, data, shape); }); - tree->ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(), + tree->ExpandNode(&ctx, RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(), left_weight.HostView(), right_weight.HostView(), /*loss_chg=*/0.5f, /*sum_hess=*/1.0f, /*left_sum=*/0.6f, /*right_sum=*/0.4f); tree->GetMultiTargetTree()->SetLeaves(); @@ -109,11 +110,12 @@ void TestTreeDump(std::string format, std::string leaf_key) { { // Test the "..." + Context ctx; bst_target_t n_targets{4}; RegTree tree{n_targets, n_features}; linalg::Vector weight{{1.0f, 2.0f, 3.0f, 4.0f}, {4ul}, DeviceOrd::CPU()}; - tree.SetRoot(weight.HostView(), /*sum_hess=*/1.0f); - tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, weight.HostView(), + tree.SetRoot(&ctx, weight.HostView(), /*sum_hess=*/1.0f); + tree.ExpandNode(&ctx, RegTree::kRoot, /*split_idx=*/1, 0.5f, true, weight.HostView(), weight.HostView(), weight.HostView(), /*loss_chg=*/0.5f, /*sum_hess=*/1.0f, /*left_sum=*/0.6f, /*right_sum=*/0.4f); tree.GetMultiTargetTree()->SetLeaves(); @@ -144,13 +146,14 @@ TEST(MultiTargetTree, SetLeaves) { std::unique_ptr tree{std::make_unique(n_targets, n_features)}; CHECK(tree->IsMultiTarget()); // Reduce to 2 targets + Context ctx; linalg::Vector base_weight{{1.0f, 2.0f}, {2ul}, DeviceOrd::CPU()}; - tree->SetRoot(base_weight.HostView(), /*sum_hess=*/1.0f); + tree->SetRoot(&ctx, base_weight.HostView(), /*sum_hess=*/1.0f); ASSERT_EQ(tree->GetMultiTargetTree()->NumSplitTargets(), 2); linalg::Vector left_weight{{2.0f, 3.0f}, {2ul}, DeviceOrd::CPU()}; linalg::Vector right_weight{{3.0f, 4.0f}, {2ul}, DeviceOrd::CPU()}; - tree->ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(), + tree->ExpandNode(&ctx, RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(), left_weight.HostView(), right_weight.HostView(), /*loss_chg=*/0.5f, /*sum_hess=*/1.0f, /*left_sum=*/0.6f, /*right_sum=*/0.4f); diff --git a/tests/cpp/tree/test_partitioner.h b/tests/cpp/tree/test_partitioner.h index 4afeb540670d..ae717945831e 100644 --- a/tests/cpp/tree/test_partitioner.h +++ b/tests/cpp/tree/test_partitioner.h @@ -1,14 +1,13 @@ /** - * Copyright 2021-2026 by XGBoost contributors. + * Copyright 2021-2026, XGBoost contributors. */ -#ifndef XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_ -#define XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_ -#include // for Context -#include // for Constant, Vector -#include // for CHECK -#include // for RegTree +#pragma once +#include // for Context +#include // for Constant, Vector +#include // for CHECK +#include // for RegTree -#include // for vector +#include // for vector #include "../../../src/tree/hist/expand_entry.h" // for CPUExpandEntry, MultiExpandEntry @@ -33,8 +32,8 @@ inline void GetMultiSplitForTest(RegTree *tree, float split_value, linalg::Vector base_weight{linalg::Constant(&ctx, 0.0f, n_targets)}; linalg::Vector left_weight{linalg::Constant(&ctx, 0.0f, n_targets)}; linalg::Vector right_weight{linalg::Constant(&ctx, 0.0f, n_targets)}; - tree->SetRoot(base_weight.HostView(), /*sum_hess=*/0.0f); - tree->ExpandNode(/*nidx=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value, + tree->SetRoot(&ctx, base_weight.HostView(), /*sum_hess=*/0.0f); + tree->ExpandNode(&ctx, /*nidx=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value, /*default_left=*/true, base_weight.HostView(), left_weight.HostView(), right_weight.HostView(), /*loss_chg=*/0.0f, /*sum_hess=*/0.0f, /*left_sum=*/0.0f, /*right_sum=*/0.0f); @@ -44,4 +43,3 @@ inline void GetMultiSplitForTest(RegTree *tree, float split_value, tree->GetMultiTargetTree()->SetLeaves(); } } // namespace xgboost::tree -#endif // XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_