Skip to content

Commit 5160d80

Browse files
Switched from separate sin and cos functions to sincos in CUDA, achieving around 30% speedup in local benchmark.
1 parent 721402d commit 5160d80

2 files changed

Lines changed: 10 additions & 4 deletions

File tree

src/cuda/rho1d_kernel.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ __global__ void rho1D_kernel_add(const double* proj, size_t N,
1717

1818
for (size_t i = 0; i < N; ++i) {
1919
double phase = proj[i] * q;
20-
sum = cuCadd(sum, make_cuDoubleComplex(cos(phase), -sin(phase)));
20+
double sPhase, cPhase;
21+
sincos(phase, &sPhase, &cPhase);
22+
sum = cuCadd(sum, make_cuDoubleComplex(cPhase, -sPhase));
2123
}
2224

2325
rho[qq] = cuCadd(rho[qq], sum);

src/cuda/rho2d_kernel.cu

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#include <cuComplex.h>
22
#include <math.h>
33

4-
extern "C" __global__ void rho2D_kernel_add(const double* proj1, const double* proj2, size_t N,
4+
extern "C" __global__ void rho2D_kernel_add(const double* proj1,
5+
const double* proj2, size_t N,
56
const double* q1vals, size_t M1,
67
const double* q2vals, size_t M2,
78
cuDoubleComplex* pos_pos,
@@ -24,8 +25,11 @@ extern "C" __global__ void rho2D_kernel_add(const double* proj1, const double* p
2425
double anglePP = phase1 + phase2;
2526
double anglePN = phase1 - phase2;
2627

27-
sumPP = cuCadd(sumPP, make_cuDoubleComplex(cos(anglePP), -sin(anglePP)));
28-
sumPN = cuCadd(sumPN, make_cuDoubleComplex(cos(anglePN), -sin(anglePN)));
28+
double sPP, cPP, sPN, cPN;
29+
sincos(anglePP, &sPP, &cPP);
30+
sincos(anglePN, &sPN, &cPN);
31+
sumPP = cuCadd(sumPP, make_cuDoubleComplex(cPP, -sPP));
32+
sumPN = cuCadd(sumPN, make_cuDoubleComplex(cPN, -sPN));
2933
}
3034

3135
int idx = qq1 * M2 + qq2;

0 commit comments

Comments
 (0)