7 changes: 7 additions & 0 deletions paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc
@@ -56,6 +56,13 @@ void FusedLayerNormKernel(const Context& dev_ctx,
dev_ctx.template Alloc<float>(mean);
dev_ctx.template Alloc<float>(variance);

if (m * n == 0) {
if (residual) {
dev_ctx.template Alloc<T>(residual_out);
}
return;
}

DenseTensor residual_alpha_tmp;
residual_alpha_tmp.Resize({1});

6 changes: 5 additions & 1 deletion paddle/phi/kernels/xpu/activation_kernel.cc
@@ -239,7 +239,7 @@ void PowKernel(const Context& dev_ctx,
const XPUType* x_data = reinterpret_cast<const XPUType*>(x.data<T>());
XPUType* y_data = reinterpret_cast<XPUType*>(out->data<T>());
XPUType pow_factor = static_cast<XPUType>(factor.to<T>());

if (x.numel() == 0) return;
auto xpu_context = dev_ctx.x_context();

int r = xpu::pow_tensor_scalar(
@@ -439,6 +439,10 @@ void SwishKernel(const Context& dev_ctx,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
dev_ctx.template Alloc<T>(out);
if (out->numel() == 0) {
return;
}

int r = xpu::swish(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
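The two guards above cover zero-size activation inputs; a minimal dynamic-graph sketch of that case, assuming an XPU build of Paddle (device, dtype, and shapes here are illustrative, not part of the diff):

import paddle
import paddle.nn.functional as F

paddle.set_device('xpu')                    # assumption: an XPU device is available
x = paddle.zeros([0, 4], dtype='float32')   # zero-size input tensor
print(paddle.pow(x, 2.0).shape)             # [0, 4]; PowKernel now returns before pow_tensor_scalar
print(F.swish(x).shape)                     # [0, 4]; SwishKernel allocates the output and returns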
2 changes: 1 addition & 1 deletion paddle/phi/kernels/xpu/atan_kernel.cc
@@ -23,7 +23,7 @@ void AtanKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;

if (x.numel() == 0) return;
const T* x_ptr = x.data<T>();
T* out_ptr = dev_ctx.template Alloc<T>(out);

4 changes: 4 additions & 0 deletions paddle/phi/kernels/xpu/bce_loss_grad_kernel.cc
@@ -30,6 +30,10 @@ void BCELossGradKernel(const Context& dev_ctx,
dev_ctx.template Alloc<T>(input_grad);

auto x_numel = input.numel();
if (x_numel == 0) {
dev_ctx.template Alloc<T>(input_grad);
return;
}
int r = xpu::bce_loss_grad<XPUType>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(input.data<T>()),
1 change: 1 addition & 0 deletions paddle/phi/kernels/xpu/bitwise.cc
@@ -26,6 +26,7 @@ void BitwiseNotKernel(const Context& dev_ctx,
DenseTensor* out) {
using XPUDataType = typename XPUTypeTrait<T>::Type;
dev_ctx.template Alloc<T>(out);
if (out && out->numel() == 0) return;
int r = xpu::logical_not(dev_ctx.x_context(),
reinterpret_cast<const XPUDataType*>(x.data<T>()),
reinterpret_cast<XPUDataType*>(out->data<T>()),
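A short sketch of the empty-tensor case the new guard in BitwiseNotKernel short-circuits (XPU device and shapes assumed for illustration):

import paddle

paddle.set_device('xpu')                    # assumption: an XPU device is available
x = paddle.zeros([0, 8], dtype='int32')     # no elements
y = paddle.bitwise_not(x)                   # kernel returns before calling xpu::logical_not
print(y.shape)                              # [0, 8]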
4 changes: 4 additions & 0 deletions paddle/phi/kernels/xpu/conv_kernel.cc
@@ -183,6 +183,10 @@ void Conv3DKernel(const Context& dev_ctx,
const std::vector<int>& dilations_t,
const std::string& data_format,
DenseTensor* out) {
if (input.numel() == 0 || out->numel() == 0) {
dev_ctx.template Alloc<T>(out);
return;
}
using XPUType = typename XPUTypeTrait<T>::Type;
std::vector<int64_t> paddings(paddings_t.begin(), paddings_t.end());
std::vector<int64_t> dilations(dilations_t.begin(), dilations_t.end());
3 changes: 3 additions & 0 deletions paddle/phi/kernels/xpu/cumprod_kernel.cc
@@ -33,6 +33,9 @@ void CumprodKernel(const Context& dev_ctx,
DDim shape = x->dims();
std::vector<int64_t> xshape = vectorize<int64_t>(shape);

if (input.numel() == 0) {
return;
}
if (dim < 0) dim += xshape.size();
if (shape.size() == 0) {
int r =
8 changes: 8 additions & 0 deletions paddle/phi/kernels/xpu/diag_kernel.cc
@@ -38,6 +38,14 @@ void DiagKernel(const Context& dev_ctx,
if (x.dims().size() == 0) {
x_shape = std::vector<int64_t>({1});
}
if (x.numel() == 0) {
int r_fill = xpu::constant<XPUType>(dev_ctx.x_context(),
out_data,
out->numel(),
static_cast<XPUType>(padding_value));
PADDLE_ENFORCE_XDNN_SUCCESS(r_fill, "constant");
return;
}

int r = xpu::diag<XPUType>(dev_ctx.x_context(),
x_data,
3 changes: 3 additions & 0 deletions paddle/phi/kernels/xpu/embedding_grad_kernel.cc
@@ -60,6 +60,9 @@ void EmbeddingGradKernel(const Context& dev_ctx,
int64_t ym = ids_numel;
int64_t n = d_table_t->dims()[1];

if (xm == 0 || ym == 0 || n == 0) {
return;
}
int r = xpu::embedding_grad<XPUType, int64_t>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(d_output_data),
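A rough sketch of the empty-ids backward pass the early return covers; the table shape and the dygraph backward call are illustrative assumptions, not part of the change:

import paddle

paddle.set_device('xpu')                            # assumption: an XPU device is available
ids = paddle.to_tensor([], dtype='int64')           # zero lookups
table = paddle.randn([16, 4], dtype='float32')
table.stop_gradient = False
out = paddle.nn.functional.embedding(ids, table)    # shape [0, 4]
out.sum().backward()                                # grad kernel sees ym == 0 and returns early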
4 changes: 3 additions & 1 deletion paddle/phi/kernels/xpu/gather_nd_grad_kernel.cc
@@ -28,7 +28,9 @@ void GatherNdGradKernel(const Context &dev_ctx,
DenseTensor *x_grad) {
using XPUType = typename XPUTypeTrait<T>::Type;
dev_ctx.template Alloc<T>(x_grad);

if (x_grad->numel() == 0) {
return;
}
int r = 0;
XPUType *dx_data = reinterpret_cast<XPUType *>(x_grad->data<T>());
r = xpu::constant<XPUType>(
1 change: 1 addition & 0 deletions paddle/phi/kernels/xpu/huber_loss_kernel.cc
@@ -28,6 +28,7 @@ void HuberLossKernel(const Context& dev_ctx,
DenseTensor* residual) {
auto residual_data = dev_ctx.template Alloc<T>(residual);
auto out_data = dev_ctx.template Alloc<T>(out);
if (input.numel() == 0) return;
auto in0_data = input.data<T>();
auto in1_data = label.data<T>();

4 changes: 4 additions & 0 deletions paddle/phi/kernels/xpu/label_smooth_kernel.cc
@@ -23,6 +23,10 @@ void LabelSmoothKernel(const Context& dev_ctx,
float epsilon,
DenseTensor* out) {
auto label_dim = label.dims()[label.dims().size() - 1];
if (label.numel() == 0) {
dev_ctx.template Alloc<T>(out);
return;
}
auto ptr = dev_ctx.template Alloc<T>(out);
if (prior_dist.is_initialized()) {
PADDLE_THROW(
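A minimal sketch of the zero-size label case, which now just allocates the output and returns (shapes are illustrative):

import paddle

paddle.set_device('xpu')                                       # assumption: an XPU device is available
label = paddle.zeros([0, 10], dtype='float32')                 # empty batch
out = paddle.nn.functional.label_smooth(label, epsilon=0.1)
print(out.shape)                                               # [0, 10]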
8 changes: 6 additions & 2 deletions paddle/phi/kernels/xpu/logical_kernel.cc
@@ -40,12 +40,16 @@ void LogicalBinaryKernel(
xpu::Context*, const XPUType*, const XPUType*, bool*, int64_t)> func,
std::string funcname = "logical") {
dev_ctx.template Alloc<bool>(out);

if (out->numel() == 0) {
return;
}
int r = 0;
const auto* x_data = x.data<T>();
const auto* y_data = y.data<T>();
auto* out_data = out->data<T>();

if (x.numel() == 0 || y.numel() == 0) {
return;
}
if (x.numel() == out->numel() && y.numel() == out->numel()) {
r = func(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x_data),
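A short sketch of the zero-size binary-logical case handled by the guard that now sits right after the output allocation (device and shapes assumed):

import paddle

paddle.set_device('xpu')                    # assumption: an XPU device is available
x = paddle.zeros([0, 3], dtype='bool')
y = paddle.zeros([0, 3], dtype='bool')
out = paddle.logical_and(x, y)              # returns an empty bool tensor without launching the XPU func
print(out.shape)                            # [0, 3]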
5 changes: 5 additions & 0 deletions paddle/phi/kernels/xpu/pad_kernel.cc
@@ -62,6 +62,11 @@ void PadKernel<phi::complex64, XPUContext>(const XPUContext& dev_ctx,
DenseTensor* out) {
using T = phi::complex64;
dev_ctx.template Alloc<T>(out);
if (x.numel() == 0) {
phi::Full<T, XPUContext>(
dev_ctx, phi::IntArray(common::vectorize(out->dims())), pad_value, out);
return;
}
std::vector<int64_t> pad_left, pad_right;
std::vector<int64_t> xshape = vectorize<int64_t>(x.dims());

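A hedged sketch of the empty complex64 input path, where the output is simply filled with the pad value; complex64 acceptance by the Python pad API and the per-dimension pad order are assumptions here:

import paddle

paddle.set_device('xpu')                              # assumption: an XPU device is available
x = paddle.zeros([0, 4], dtype='complex64')           # no input elements
out = paddle.nn.functional.pad(x, [1, 1, 1, 1], mode='constant', value=0)
print(out.shape)                                      # expected [2, 6]; every element equals the pad value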
1 change: 1 addition & 0 deletions paddle/phi/kernels/xpu/prelu_grad_kernel.cc
@@ -34,6 +34,7 @@ void PReluGradKernel(const Context& dev_ctx,
Full<T, Context>(dev_ctx, alpha_grad->dims(), 0, alpha_grad);
}
}
if (x.numel() == 0) return;
const T* x_ptr = x.data<T>();
const T* alpha_ptr = alpha.data<T>();
const T* out_grad_ptr = out_grad.data<T>();
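A minimal backward sketch for the zero-size PReLU input the new guard skips; the single-element weight is an illustrative choice:

import paddle

paddle.set_device('xpu')                        # assumption: an XPU device is available
x = paddle.zeros([0, 3, 4, 4], dtype='float32')
x.stop_gradient = False
w = paddle.to_tensor([0.25], dtype='float32')   # shared alpha
w.stop_gradient = False
out = paddle.nn.functional.prelu(x, w)
out.sum().backward()                            # alpha_grad is zero-filled above; x.grad keeps shape [0, 3, 4, 4]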
14 changes: 14 additions & 0 deletions paddle/phi/kernels/xpu/scatter_grad_kernel.cc
@@ -37,6 +37,20 @@ void ScatterGradKernel(const Context &dev_ctx,
}
return;
}
if (index.numel() == 0) {
if (x_grad) {
phi::Copy<Context>(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
}
if (updates_grad) {
dev_ctx.template Alloc<T>(updates_grad);
phi::Full<T, Context>(
dev_ctx,
phi::IntArray(common::vectorize(updates_grad->dims())),
0,
updates_grad);
}
return;
}
using XPUType = typename XPUTypeTrait<T>::Type;

const auto &index_type = index.dtype();
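A rough sketch of the empty-index scatter backward the new branch implements (x_grad copies out_grad, updates_grad is zero-filled); shapes and the default overwrite mode are assumptions:

import paddle

paddle.set_device('xpu')                        # assumption: an XPU device is available
x = paddle.randn([6, 3], dtype='float32')
x.stop_gradient = False
index = paddle.to_tensor([], dtype='int64')     # nothing to scatter
updates = paddle.zeros([0, 3], dtype='float32')
updates.stop_gradient = False
out = paddle.scatter(x, index, updates)
out.sum().backward()                            # x.grad is all ones; updates.grad has shape [0, 3]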
3 changes: 3 additions & 0 deletions paddle/phi/kernels/xpu/sign_kernel.cc
@@ -24,6 +24,9 @@ void SignKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
if (x.numel() == 0) {
return;
}
auto xpu_context = dev_ctx.x_context();
int r = xpu::sign(xpu_context, x.data<T>(), out->data<T>(), x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign");
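A one-line sketch of the zero-size sign case (XPU device assumed):

import paddle

paddle.set_device('xpu')                    # assumption: an XPU device is available
x = paddle.zeros([0, 5], dtype='float32')
print(paddle.sign(x).shape)                 # [0, 5]; the kernel allocates the output and returns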
4 changes: 3 additions & 1 deletion paddle/phi/kernels/xpu/take_along_axis_grad_kernel.cc
@@ -50,7 +50,9 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
x_grad->numel(),
XPUType(0));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");

if (out_grad.numel() == 0 || index.numel() == 0) {
return;
}
auto x_shape = vectorize<int64_t>(x.dims());
auto out_grad_shape = vectorize<int64_t>(out_grad.dims());
auto index_shape = vectorize<int64_t>(index.dims());
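A hedged backward sketch of the empty index case: x_grad is zero-filled by the constant call above and the scatter-back launch is skipped; axis and shapes are illustrative:

import paddle

paddle.set_device('xpu')                      # assumption: an XPU device is available
x = paddle.randn([3, 5], dtype='float32')
x.stop_gradient = False
idx = paddle.zeros([3, 0], dtype='int64')     # empty index along axis 1
out = paddle.take_along_axis(x, idx, axis=1)  # shape [3, 0]
out.sum().backward()                          # x.grad is all zeros with shape [3, 5]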
16 changes: 16 additions & 0 deletions test/legacy_test/test_bce_loss.py
@@ -330,6 +330,22 @@ def init_test_cast(self):
self.shape = [0]


class TestBCELossWithZeroSizeTensor(unittest.TestCase):
def test_bce_loss_with_zero_size_tensor(self):
paddle.disable_static()
input = paddle.to_tensor([], dtype='float32').reshape([0, 13125, 1])
label = paddle.to_tensor([], dtype='float32').reshape([0, 13125, 1])
input.stop_gradient = False
out = paddle.nn.functional.binary_cross_entropy(
input, label, reduction='sum'
)
loss = out.sum()
loss.backward()
self.assertEqual(loss.shape, [])
self.assertEqual(float(loss), 0.0)
paddle.enable_static()


if __name__ == "__main__":
paddle.enable_static()
unittest.main()
24 changes: 24 additions & 0 deletions test/legacy_test/test_conv3d_layer.py
@@ -264,6 +264,7 @@ def add_cases(suite):
padding="valid",
)
)
suite.addTest(Conv3DZeroSizeXPUTestCase(methodName='runTest'))


def add_error_cases(suite):
@@ -399,5 +400,28 @@ def test_static_Compatibility(self):
)


class Conv3DZeroSizeXPUTestCase(unittest.TestCase):
def runTest(self):
if not core.is_compiled_with_xpu():
return
paddle.device.set_device('xpu')
paddle.disable_static()
x = paddle.randn([4, 3, 0, 8, 8], dtype='float32')
w = paddle.randn([5, 3, 3, 3, 3], dtype='float32')
b = paddle.randn([5], dtype='float32')
out = paddle.nn.functional.conv3d(
x,
w,
b,
padding=[[0, 0], [0, 0], [1, 1], [2, 2], [2, 2]],
stride=1,
dilation=1,
groups=1,
data_format='NCDHW',
)
self.assertEqual(list(out.shape), [4, 5, 0, 10, 10])
paddle.enable_static()


if __name__ == '__main__':
unittest.main()
18 changes: 18 additions & 0 deletions test/legacy_test/test_smooth_l1_loss.py
@@ -490,6 +490,24 @@ def test_HuberLoss_class_api(self):
np.testing.assert_allclose(dy_ret.numpy(), self.expected, rtol=1e-05)


class SmoothL1Loss_ZeroSize_XPUSum(unittest.TestCase):
def setUp(self):
np.random.seed(123)

def test_smooth_l1_loss_sum(self):
input_np = np.random.random([0, 102, 8]).astype(np.float32)
label_np = np.random.random([0, 102, 8]).astype(np.float32)
expected = smooth_l1_loss_np(input_np, label_np, reduction='sum')

paddle.disable_static()
smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='sum')
input = paddle.to_tensor(input_np)
label = paddle.to_tensor(label_np)
dy_ret = smooth_l1_loss(input, label)
np.testing.assert_allclose(dy_ret.numpy(), expected, rtol=1e-05)
paddle.enable_static()


if __name__ == "__main__":
paddle.enable_static()
unittest.main()
10 changes: 10 additions & 0 deletions test/xpu/test_atan_op_xpu.py
@@ -70,6 +70,16 @@ def init_input_shape(self):
self.x_shape = [1]


class TestAtanEmptyXPU(unittest.TestCase):
def test_atan_empty_tensor(self):
paddle.disable_static()
paddle.set_device('xpu')
x = paddle.empty([0, 16, 32], dtype='float16')
out = paddle.atan(x)
self.assertEqual(list(out.shape), [0, 16, 32])
paddle.enable_static()


support_types = get_xpu_op_support_types("atan")
for stype in support_types:
create_test_class(globals(), XPUTestAtanOp, stype)
12 changes: 12 additions & 0 deletions test/xpu/test_cumprod_op_xpu.py
@@ -169,6 +169,18 @@ def run(place):
run(place)


class TestCumprodEmptyXPU(unittest.TestCase):
def test_cumprod_empty_tensor(self):
paddle.disable_static()
try:
paddle.set_device('xpu')
x = paddle.empty([0], dtype='float32')
out = paddle.cumprod(x, -1)
self.assertEqual(list(out.shape), [0])
finally:
paddle.enable_static()


support_types = get_xpu_op_support_types('cumprod')
for stype in support_types:
create_test_class(globals(), XPUTestCumprodOP, stype)
9 changes: 9 additions & 0 deletions test/xpu/test_diag_v2_op_xpu.py
@@ -308,6 +308,15 @@ def test_xpu(self):
with base.program_guard(base.Program()):
self.run_static()

class TestDiagV2OpEmpty1DOffsetOutOfRange(TestDiagV2Op):
def init_config(self):
self.x = np.array([], dtype=self.dtype)
self.offset = 2
self.padding_value = 0.0
n = self.x.size
dim = n + abs(self.offset)
self.out = np.zeros((dim, dim), dtype=self.dtype)


support_types = get_xpu_op_support_types('diag_v2')
for stype in support_types:
10 changes: 10 additions & 0 deletions test/xpu/test_fft_xpu.py
@@ -325,6 +325,16 @@ def test_fftn(self):
# ('test_axis_not_default', rand_x(5), None, (1, 2), 'backward'),
# ('test_norm_forward', rand_x(5), None, (1, 2), 'forward'),
# ('test_norm_ortho', rand_x(5), None, (1, 2), 'ortho'),
(
'test_xpu_0size_pad',
(
np.random.randn(50, 8, 0, 14, 14)
+ 1j * np.random.randn(50, 8, 0, 14, 14)
).astype(np.complex64),
(39, 14, 14),
None,
'backward',
),
],
)
class TestIFftn(unittest.TestCase):
19 changes: 19 additions & 0 deletions test/xpu/test_gather_nd_op_xpu.py
@@ -202,5 +202,24 @@ def test_2(self):
self.assertEqual(output.shape, [2, 0, 3, 2])


class TestGatherNdEmptyXPU(unittest.TestCase):
def test_gather_nd_with_empty_index(self):
paddle.disable_static()
try:
paddle.set_device('xpu')
x = paddle.rand([1, 20, 0], dtype='float32')
x.stop_gradient = False
index = paddle.to_tensor([[1, 2]], dtype='int64')
out = paddle.gather_nd(
x,
index,
)
self.assertEqual(list(out.shape), [1, 0])
loss = out.sum()
loss.backward()
finally:
paddle.enable_static()


if __name__ == "__main__":
unittest.main()