Skip to content

Commit d8c7a01

Browse files
pavanbalaji authored and meta-codesync[bot] committed
Add backtrace to NCCLXException message (#744)
Summary: Pull Request resolved: #744. Embed a symbolized stack trace in the NCCLXException message using folly::symbolizer::getStackTraceStr(). Since NCCLXException is never caught within the torchcomms ncclx layer and propagates to upstream callers, embedding the backtrace directly in the message ensures it is always visible via e.what() without requiring the caller to use a separate exception tracer. Added the NCCLXExceptionIncludesBacktrace unit test to verify that the backtrace is present in the exception message. Reviewed By: mingrany. Differential Revision: D93783839. fbshipit-source-id: d8be63b2daf97503740538849654bb791c908b7d
1 parent ff5a65c commit d8c7a01

2 files changed

Lines changed: 26 additions & 1 deletion

File tree

comms/torchcomms/ncclx/NcclxApi.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
#include "comms/torchcomms/ncclx/NcclxApi.hpp"
44

5+
#include <folly/debugging/symbolizer/Symbolizer.h>
6+
57
// Check NCCL version at compile time
68
#if NCCL_VERSION_CODE < NCCL_VERSION(2, 25, 0)
79
#error \
@@ -19,7 +21,8 @@ NCCLXException::NCCLXException(
1921
ncclComm_t comm)
2022
: message_(
2123
message + ": " + nccl_api.getErrorString(result) +
22-
" \nNCCL Last Error: " + nccl_api.getLastError(comm)),
24+
" \nNCCL Last Error: " + nccl_api.getLastError(comm) +
25+
" \nBacktrace:\n" + folly::symbolizer::getStackTraceStr()),
2326
result_(result) {}
2427

2528
const char* NCCLXException::what() const noexcept {

comms/torchcomms/ncclx/tests/unit/cpp/TorchCommNCCLXTest.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1484,6 +1484,28 @@ TEST_F(TorchCommNCCLXTest, NCCLXExceptionIncludesLastErrorString) {
14841484
EXPECT_EQ(exception.getResult(), ncclInternalError);
14851485
}
14861486

1487+
TEST_F(TorchCommNCCLXTest, NCCLXExceptionIncludesBacktrace) {
1488+
// Test that NCCLXException message includes a backtrace
1489+
1490+
nccl_mock_->setupDefaultBehaviors();
1491+
1492+
EXPECT_CALL(*nccl_mock_, getErrorString(ncclInternalError))
1493+
.WillOnce(Return("internal error"));
1494+
EXPECT_CALL(*nccl_mock_, getLastError(_)).WillOnce(Return("some error"));
1495+
1496+
ncclComm_t mock_comm = reinterpret_cast<ncclComm_t>(0x3000);
1497+
NCCLXException exception(
1498+
*nccl_mock_, "Test operation failed", ncclInternalError, mock_comm);
1499+
1500+
std::string what_message = exception.what();
1501+
EXPECT_TRUE(what_message.find("Backtrace:") != std::string::npos)
1502+
<< "Exception message should contain 'Backtrace:' label";
1503+
EXPECT_TRUE(
1504+
what_message.find("NCCLXException") != std::string::npos ||
1505+
what_message.find("getStackTraceStr") != std::string::npos)
1506+
<< "Backtrace should contain a recognizable frame: " << what_message;
1507+
}
1508+
14871509
TEST_F(TorchCommNCCLXTest, NCCLXExceptionFromFailedSendIncludesLastError) {
14881510
// Test that when send() fails, the thrown NCCLXException includes
14891511
// the NCCL last error string

0 commit comments

Comments
 (0)