Skip to content

Commit 00ea77e

Browse files
wewe5215 and mr-c committed
arm neon: fix cmla{_rot{90,180,270},}_lane with correct test-suite on ARMv8.3 system
Formatting normalization. Put similar functions in the same order. Replace the last usage of SIMDE_ARCH_ARM_CHECK(8, n) with feature checks. Co-Authored-By: Michael R. Crusoe <[email protected]>
1 parent afd77a9 commit 00ea77e

16 files changed

+2793
-2529
lines changed

simde/arm/neon/cadd_rot270.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t
5252
vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \
5353
__riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4);
5454
r_.sv64 = __riscv_vfadd_vv_f16m1(op1, a_.sv64, 4);
55-
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
55+
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
5656
((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
5757
b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 5, 0, 7, 2);
5858
r_.values = b_.values + a_.values;
@@ -91,7 +91,7 @@ simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t
9191
vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \
9292
__riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8));
9393
r_.sv128 = __riscv_vfadd_vv_f16m1(op1, a_.sv128, 8);
94-
#elif defined(SIMDE_SHUFFLE_VECTOR_) && \
94+
#elif defined(SIMDE_SHUFFLE_VECTOR_) && \
9595
((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
9696
b_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, -b_.values, b_.values, 9, 0, 11, 2, 13, 4, 15, 6);
9797
r_.values = b_.values + a_.values;
@@ -191,7 +191,7 @@ simde_float32x4_t simde_vcaddq_rot270_f32(simde_float32x4_t a, simde_float32x4_t
191191
SIMDE_FUNCTION_ATTRIBUTES
192192
simde_float64x2_t simde_vcaddq_rot270_f64(simde_float64x2_t a, simde_float64x2_t b)
193193
{
194-
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \
194+
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \
195195
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
196196
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0))
197197
return vcaddq_rot270_f64(a, b);

simde/arm/neon/cadd_rot90.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ simde_float16x4_t simde_vcadd_rot90_f16(simde_float16x4_t a, simde_float16x4_t b
5252
vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \
5353
__riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4);
5454
r_.sv64 = __riscv_vfadd_vv_f16m1(op1, a_.sv64, 4);
55-
#elif defined(SIMDE_SHUFFLE_VECTOR_) && \
55+
#elif defined(SIMDE_SHUFFLE_VECTOR_) && \
5656
((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
5757
b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 1, 4, 3, 6);
5858
r_.values = b_.values + a_.values;
@@ -91,7 +91,7 @@ simde_float16x8_t simde_vcaddq_rot90_f16(simde_float16x8_t a, simde_float16x8_t
9191
vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \
9292
__riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8));
9393
r_.sv128 = __riscv_vfadd_vv_f16m1(op1, a_.sv128, 8);
94-
#elif defined(SIMDE_SHUFFLE_VECTOR_) && \
94+
#elif defined(SIMDE_SHUFFLE_VECTOR_) && \
9595
((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
9696
b_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, -b_.values, b_.values, 1, 8, 3, 10, 5, 12, 7, 14);
9797
r_.values = b_.values + a_.values;
@@ -191,7 +191,7 @@ simde_float32x4_t simde_vcaddq_rot90_f32(simde_float32x4_t a, simde_float32x4_t
191191
SIMDE_FUNCTION_ATTRIBUTES
192192
simde_float64x2_t simde_vcaddq_rot90_f64(simde_float64x2_t a, simde_float64x2_t b)
193193
{
194-
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \
194+
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \
195195
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
196196
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0))
197197
return vcaddq_rot90_f64(a, b);

simde/arm/neon/cmla_lane.h

Lines changed: 186 additions & 220 deletions
Large diffs are not rendered by default.

simde/arm/neon/cmla_rot180.h

Lines changed: 44 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -71,13 +71,48 @@ simde_vcmla_rot180_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4
7171
#define vcmla_rot180_f16(r, a, b) simde_vcmla_rot180_f16(r, a, b)
7272
#endif
7373

74+
SIMDE_FUNCTION_ATTRIBUTES
75+
simde_float32x2_t
76+
simde_vcmla_rot180_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) {
77+
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \
78+
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
79+
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
80+
return vcmla_rot180_f32(r, a, b);
81+
#else
82+
simde_float32x2_private
83+
r_ = simde_float32x2_to_private(r),
84+
a_ = simde_float32x2_to_private(a),
85+
b_ = simde_float32x2_to_private(b);
86+
87+
#if defined(SIMDE_SHUFFLE_VECTOR_)
88+
a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0);
89+
b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1);
90+
r_.values += b_.values * a_.values;
91+
#else
92+
SIMDE_VECTORIZE
93+
for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
94+
r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i];
95+
r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i];
96+
}
97+
#endif
98+
99+
return simde_float32x2_from_private(r_);
100+
#endif
101+
}
102+
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \
103+
!(defined(SIMDE_ARCH_ARM_COMPLEX) && \
104+
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
105+
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))))
106+
#undef vcmla_rot180_f32
107+
#define vcmla_rot180_f32(r, a, b) simde_vcmla_rot180_f32(r, a, b)
108+
#endif
109+
74110
SIMDE_FUNCTION_ATTRIBUTES
75111
simde_float16x8_t
76112
simde_vcmlaq_rot180_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) {
77-
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
113+
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \
78114
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
79-
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
80-
defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX)
115+
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
81116
return vcmlaq_rot180_f16(r, a, b);
82117
#else
83118
simde_float16x8_private
@@ -101,51 +136,13 @@ simde_vcmlaq_rot180_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x
101136
#endif
102137
}
103138
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \
104-
!((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
105-
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
106-
defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX)))
139+
!(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \
140+
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
141+
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))))
107142
#undef vcmlaq_rot180_f16
108143
#define vcmlaq_rot180_f16(r, a, b) simde_vcmlaq_rot180_f16(r, a, b)
109144
#endif
110145

111-
112-
SIMDE_FUNCTION_ATTRIBUTES
113-
simde_float32x2_t
114-
simde_vcmla_rot180_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) {
115-
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
116-
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
117-
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
118-
defined(SIMDE_ARCH_ARM_COMPLEX)
119-
return vcmla_rot180_f32(r, a, b);
120-
#else
121-
simde_float32x2_private
122-
r_ = simde_float32x2_to_private(r),
123-
a_ = simde_float32x2_to_private(a),
124-
b_ = simde_float32x2_to_private(b);
125-
126-
#if defined(SIMDE_SHUFFLE_VECTOR_)
127-
a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0);
128-
b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1);
129-
r_.values += b_.values * a_.values;
130-
#else
131-
SIMDE_VECTORIZE
132-
for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
133-
r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i];
134-
r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i];
135-
}
136-
#endif
137-
138-
return simde_float32x2_from_private(r_);
139-
#endif
140-
}
141-
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \
142-
!((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
143-
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
144-
defined(SIMDE_ARCH_ARM_COMPLEX)))
145-
#undef vcmla_rot180_f32
146-
#define vcmla_rot180_f32(r, a, b) simde_vcmla_rot180_f32(r, a, b)
147-
#endif
148-
149146
SIMDE_FUNCTION_ATTRIBUTES
150147
simde_float32x4_t
151148
simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) {
@@ -180,9 +177,9 @@ simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x
180177
#endif
181178
}
182179
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \
183-
!((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
184-
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
185-
defined(SIMDE_ARCH_ARM_COMPLEX)))
180+
!(defined(SIMDE_ARCH_ARM_COMPLEX) && \
181+
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
182+
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))))
186183
#undef vcmlaq_rot180_f32
187184
#define vcmlaq_rot180_f32(r, a, b) simde_vcmlaq_rot180_f32(r, a, b)
188185
#endif

0 commit comments

Comments
 (0)