@@ -686,6 +686,22 @@ struct __parallel_radix_sort_iteration
686686 __scan_wg_size =
687687 sycl::min (__scan_wg_size, oneapi::dpl::__internal::__kernel_work_group_size (__q, __local_scan_kernel));
688688 __count_wg_size = sycl::max (__count_sg_size, __reorder_sg_size);
689+ #else
690+ // When kernel compilation is disabled, use conservative fallback values
691+ // Get device sub-group sizes and pick a suitable one for radix sort
692+ const auto __subgroup_sizes = __q.get_device ().template get_info <sycl::info::device::sub_group_sizes>();
693+ // The radix sort kernels are optimized for sub-group size 16 to avoid register spills
694+ // and efficiently handle 4-bit radix (16 buckets). Prefer 16, then 32, then 8.
695+ if (std::find (__subgroup_sizes.begin (), __subgroup_sizes.end (), 16 ) != __subgroup_sizes.end ())
696+ __reorder_sg_size = 16 ;
697+ else if (std::find (__subgroup_sizes.begin (), __subgroup_sizes.end (), 32 ) != __subgroup_sizes.end ())
698+ __reorder_sg_size = 32 ;
699+ else if (std::find (__subgroup_sizes.begin (), __subgroup_sizes.end (), 8 ) != __subgroup_sizes.end ())
700+ __reorder_sg_size = 8 ;
701+ // else keep __reorder_sg_size = __max_sg_size
702+
703+ // For __count_wg_size, use the maximum of the current value and __reorder_sg_size
704+ __count_wg_size = sycl::max (__count_wg_size, __reorder_sg_size);
689705#endif
690706 const ::std::uint32_t __radix_states = 1 << __radix_bits;
691707
0 commit comments