alibaba · poor-circle · Mar 4, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,11 +1,28 @@
 cmake_minimum_required(VERSION 3.15)
+# Opt-in switch for CUDA support. It defaults to OFF; configure with
+# -DYLT_ENABLE_CUDA=ON to enable it.
+option(YLT_ENABLE_CUDA "Enable CUDA support" OFF)
+
+# Pick the project languages from the option. This has to happen before
+# project(), which consumes the language list.
+if(YLT_ENABLE_CUDA)
+    # Use architecture 86 only as a fallback: keep a value supplied through
+    # -DCMAKE_CUDA_ARCHITECTURES=... or the CUDAARCHS environment variable
+    # instead of silently overwriting it.
+    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT DEFINED ENV{CUDAARCHS})
+        set(CMAKE_CUDA_ARCHITECTURES 86)
+    endif()
+    set(PROJECT_LANGUAGES CXX CUDA)
+else()
+    set(PROJECT_LANGUAGES CXX)
+endif()
+
 project(yaLanTingLibs
         VERSION 0.5.8
         DESCRIPTION "yaLanTingLibs"
         HOMEPAGE_URL "https://github.com/alibaba/yalantinglibs"
-        LANGUAGES CXX
+        LANGUAGES ${PROJECT_LANGUAGES}
         )
-
 # load pack finder
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Find/)
 

diff --git a/cmake/config.cmake b/cmake/config.cmake
@@ -78,6 +78,26 @@ if (YLT_ENABLE_IBV)
         target_link_libraries(${ylt_target_name} INTERFACE -libverbs)
     endif ()
 endif ()
+# Optional CUDA support: defines YLT_ENABLE_CUDA for the code and links the
+# CUDA driver library through the CUDA::cuda_driver imported target.
+if (YLT_ENABLE_CUDA)
+    message(STATUS "Enable cuda support")
+    # The FindCUDAToolkit module that provides CUDA::cuda_driver was added in
+    # CMake 3.17, so report a clear error on older CMake versions.
+    if (CMAKE_VERSION VERSION_LESS 3.17)
+        message(FATAL_ERROR "YLT_ENABLE_CUDA requires CMake 3.17 or newer, found ${CMAKE_VERSION}")
+    endif ()
+    find_package(CUDAToolkit REQUIRED)
+    if(CMAKE_PROJECT_NAME STREQUAL "yaLanTingLibs")
+        # yaLanTingLibs is the top-level project: use directory-scoped settings.
+        add_compile_definitions("YLT_ENABLE_CUDA")
+        link_libraries(CUDA::cuda_driver)
+    else ()
+        # Otherwise attach them to ${ylt_target_name} as INTERFACE requirements.
+        target_compile_definitions(${ylt_target_name} INTERFACE "YLT_ENABLE_CUDA")
+        target_link_libraries(${ylt_target_name} INTERFACE CUDA::cuda_driver)
+    endif ()
+endif()
 
 option(YLT_ENABLE_PMR "Enable pmr support" OFF)
 message(STATUS "YLT_ENABLE_PMR: ${YLT_ENABLE_PMR}")

diff --git a/cmake/develop.cmake b/cmake/develop.cmake
@@ -53,7 +53,13 @@ if(ENABLE_SANITIZER AND NOT MSVC)
         if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
             check_asan(HAS_ASAN)
             if(HAS_ASAN)
-                set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address")
+                # Append the AddressSanitizer flag only for builds without CUDA;
+                # with YLT_ENABLE_CUDA on, just report that it is skipped.
+                if (NOT YLT_ENABLE_CUDA)
+                    string(APPEND CMAKE_CXX_FLAGS " -fsanitize=address")
+                else()
+                    message(STATUS "address sanitizer is disabled when using CUDA")
+                endif()
             else()
                 message(WARNING "address sanitizer is no supported with current tool-chains")
             endif()

diff --git a/include/ylt/coro_io/coro_io.hpp b/include/ylt/coro_io/coro_io.hpp
@@ -44,6 +44,7 @@
 
 #include <asio/connect.hpp>
 #include <asio/experimental/channel.hpp>
+#include <asio/high_resolution_timer.hpp>
 #include <asio/ip/tcp.hpp>
 #include <asio/read.hpp>
 #include <asio/read_at.hpp>
@@ -627,6 +628,31 @@ class period_timer : public asio::steady_timer {
   }
 };
 
+// Timer built on asio::high_resolution_timer that can be awaited from an
+// async_simple coroutine.
+class high_resolution_timer : public asio::high_resolution_timer {
+ public:
+  // Inherit every constructor of the asio base timer.
+  using asio::high_resolution_timer::high_resolution_timer;
+
+  // Builds the timer on the asio executor held by a coro_io::ExecutorWrapper.
+  template <typename T>
+  high_resolution_timer(coro_io::ExecutorWrapper<T> *executor)
+      : asio::high_resolution_timer(executor->get_asio_executor()) {}
+
+  // Starts an asynchronous wait on this timer and suspends until it
+  // completes. Yields true when the wait finished without an error code and
+  // false otherwise.
+  async_simple::coro::Lazy<bool> async_await() noexcept {
+    auto start_wait = [this](auto &&handler) {
+      this->async_wait(std::move(handler));
+    };
+    auto ec =
+        co_await async_io<std::error_code>(std::move(start_wait), *this);
+    co_return !ec;
+  }
+};
+
 template <typename Duration, typename Executor>
 inline async_simple::coro::Lazy<bool> sleep_for(Duration d, Executor *e) {
   coro_io::period_timer timer(e);

diff --git a/include/ylt/coro_io/cuda/cuda_device.hpp b/include/ylt/coro_io/cuda/cuda_device.hpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2026, Alibaba Group Holding Limited;
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <memory>
+#include <mutex>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "async_simple/coro/ConditionVariable.h"
+#include "async_simple/coro/Lazy.h"
+#include "async_simple/coro/Mutex.h"
+#include "cuda.h"
+#include "ylt/coro_io/detail/client_queue.hpp"
+#include "ylt/easylog.hpp"
+
+// Checks the result of a CUDA driver API call. `err` is evaluated exactly
+// once; any result other than CUDA_SUCCESS or CUDA_ERROR_DEINITIALIZED is
+// logged and thrown as std::runtime_error. A missing driver description is
+// replaced by "unknown error".
+#define YLT_CHECK_CUDA_ERR(err)                                        \
+  do {                                                                 \
+    const CUresult ylt_cuda_err_ = (err);                              \
+    if (ylt_cuda_err_ != CUDA_SUCCESS &&                               \
+        ylt_cuda_err_ != CUDA_ERROR_DEINITIALIZED) {                   \
+      const char* ylt_cuda_err_str_ = nullptr;                         \
+      cuGetErrorString(ylt_cuda_err_, &ylt_cuda_err_str_);             \
+      std::string ylt_cuda_err_msg_ =                                  \
+          std::string("CUDA Driver error: ") +                         \
+          (ylt_cuda_err_str_ ? ylt_cuda_err_str_ : "unknown error");   \
+      ELOG_ERROR << ylt_cuda_err_msg_;                                 \
+      throw std::runtime_error(ylt_cuda_err_msg_);                     \
+    }                                                                  \
+  } while (0)
+
+namespace coro_io {
+// Handle for one CUDA device: the constructor retains the device's primary
+// context and the destructor releases it. Instances are non-copyable and
+// non-movable and are handed out as shared_ptr by the static accessors.
+class cuda_device_t : public std::enable_shared_from_this<cuda_device_t> {
+ public:
+  // Returns the shared list of all devices. The list is built once, on the
+  // first call, by get_cuda_devices_impl().
+  static std::shared_ptr<std::vector<std::shared_ptr<cuda_device_t>>>
+  get_cuda_devices() {
+    static auto device =
+        std::make_shared<std::vector<std::shared_ptr<cuda_device_t>>>(
+            get_cuda_devices_impl());
+    return device;
+  }
+
+  // Returns the device at index gpu_id.
+  // Throws std::logic_error when gpu_id is negative or not a valid index.
+  static std::shared_ptr<cuda_device_t> get_cuda_device(int gpu_id) {
+    static auto devices = get_cuda_devices();
+    // Check the sign before casting so the comparison is never mixed-sign.
+    if (gpu_id < 0 || static_cast<std::size_t>(gpu_id) >= devices->size())
+        [[unlikely]] {
+      throw std::logic_error("Out of cuda devices index");
+    }
+    return (*devices)[gpu_id];
+  }
+
+  // Reports whether cuDeviceCanAccessPeer allows src_gpu_id to access
+  // dst_gpu_id; a device is always reported as linkable with itself.
+  // Throws std::logic_error when either index is out of range.
+  static bool get_cuda_p2p_linkable(int src_gpu_id, int dst_gpu_id) {
+    auto topo = get_cuda_p2p_topo();
+    // The matrix is square, so one size bounds both indices.
+    const auto in_range = [&topo](int id) {
+      return id >= 0 && static_cast<std::size_t>(id) < topo.size();
+    };
+    if (!in_range(src_gpu_id) || !in_range(dst_gpu_id)) [[unlikely]] {
+      throw std::logic_error("Out of cuda devices index");
+    }
+    return topo[src_gpu_id][dst_gpu_id];
+  }
+
+  // Implicit conversion to the retained primary context handle.
+  operator CUcontext() const noexcept { return context_; }
+  cuda_device_t(const cuda_device_t&) = delete;
+  cuda_device_t(cuda_device_t&&) = delete;
+  cuda_device_t& operator=(const cuda_device_t&) = delete;
+  cuda_device_t& operator=(cuda_device_t&&) = delete;
+  // Releases the primary context retained by the constructor.
+  ~cuda_device_t() {
+    ELOG_INFO << "release cuda device:" << name_ << "(" << gpu_id_ << ")";
+    cuDevicePrimaryCtxRelease(device_);
+  }
+
+  // Currently does nothing.
+  void close() {}
+
+  // Makes this device's context current on the calling thread. The last
+  // context set through this function is cached per thread, so repeated calls
+  // for the same device skip cuCtxSetCurrent.
+  // NOTE(review): the cache is not updated when the current context is changed
+  // by other means - confirm callers only switch contexts through here.
+  void set_context() {
+    static thread_local CUcontext ctx = nullptr;
+    if (ctx != context_) {
+      YLT_CHECK_CUDA_ERR(cuCtxSetCurrent(context_));
+      ctx = context_;
+    }
+  }
+
+ private:
+  // Initializes the driver API and creates one cuda_device_t per device.
+  static std::vector<std::shared_ptr<cuda_device_t>> get_cuda_devices_impl() {
+    YLT_CHECK_CUDA_ERR(cuInit(0));
+    int device_count = 0;
+    YLT_CHECK_CUDA_ERR(cuDeviceGetCount(&device_count));
+    std::vector<std::shared_ptr<cuda_device_t>> devices;
+    devices.reserve(device_count);
+    for (int i = 0; i < device_count; ++i) {
+      devices.emplace_back(std::make_shared<cuda_device_t>(i));
+    }
+    return devices;
+  }
+  // Builds the device-by-device peer-access matrix: topo[i][j] is true when
+  // i == j or cuDeviceCanAccessPeer reports that device i can access device j.
+  static std::vector<std::vector<bool>> get_cuda_p2p_topo_impl() {
+    auto devices = get_cuda_devices();
+    size_t num_devices = devices->size();
+    std::vector<std::vector<bool>> topo(num_devices,
+                                        std::vector<bool>(num_devices, false));
+
+    for (size_t i = 0; i < num_devices; ++i) {
+      for (size_t j = 0; j < num_devices; ++j) {
+        if (i == j) {
+          topo[i][j] = true;  // A device can always access itself
+          continue;
+        }
+        int canAccessPeer = 0;
+        YLT_CHECK_CUDA_ERR(cuDeviceCanAccessPeer(
+            &canAccessPeer, (*devices)[i]->device_, (*devices)[j]->device_));
+        topo[i][j] = static_cast<bool>(canAccessPeer);
+      }
+    }
+    return topo;
+  }
+  // Returns a view of the peer-access matrix, computed once on first use.
+  static std::span<std::vector<bool>> get_cuda_p2p_topo() {
+    static std::vector<std::vector<bool>> topo = get_cuda_p2p_topo_impl();
+    return topo;
+  }
+
+ public:
+  // Looks up device gpu_id, retains its primary context and reads its name.
+  // Throws std::runtime_error (via YLT_CHECK_CUDA_ERR) when a driver call
+  // fails.
+  cuda_device_t(int gpu_id) : gpu_id_(gpu_id) {
+    YLT_CHECK_CUDA_ERR(cuDeviceGet(&device_, gpu_id_));
+    YLT_CHECK_CUDA_ERR(cuDevicePrimaryCtxRetain(&context_, device_));
+    try {
+      name_.resize(256);
+      YLT_CHECK_CUDA_ERR(cuDeviceGetName(name_.data(), 256, device_));
+    } catch (...) {
+      // The destructor does not run for a partially constructed object, so
+      // give the retained context back before propagating the failure.
+      cuDevicePrimaryCtxRelease(device_);
+      throw;
+    }
+    // Drop the padding NULs left over from the fixed-size name buffer.
+    auto pos = name_.find_last_not_of('\0');
+    if (pos != std::string::npos) {
+      name_.erase(pos + 1);
+    }
+    else {
+      name_.clear();
+    }
+    ELOG_INFO << "Get cuda device(" << gpu_id_ << "): " << name_;
+  }
+  int get_gpu_id() const noexcept { return gpu_id_; }
+  std::string_view name() const noexcept { return name_; }
+
+ private:
+  std::string name_;
+  int gpu_id_;
+  CUcontext context_ = nullptr;
+  CUdevice device_{};
+};
+}  // namespace coro_io
diff --git a/include/ylt/coro_io/cuda/cuda_helper.hpp b/include/ylt/coro_io/cuda/cuda_helper.hpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2026, Alibaba Group Holding Limited;
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "cuda.h"
+namespace coro_io {
+// Returns the CUDA driver's description of `err` as reported by
+// cuGetErrorString, or "unknown error" when that call does not succeed.
+inline const char* to_string(CUresult err) {
+  const char* description = nullptr;
+  if (cuGetErrorString(err, &description) == CUDA_SUCCESS) {
+    return description;
+  }
+  return "unknown error";
+}
+}  // namespace coro_io