; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64LE

; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64

; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_32
; The current lowering compares the vector in register v2 against the all-zero vector in v3; the
; comparison result is then negated, converting each lane:
;   0xFFFF -> 0
;   0      -> 1
; An optimized version will follow this NFC patch.
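;
; For reference, a rough scalar C equivalent of the IR below (an editorial sketch of what the
; reduction computes, not the original source this test was reduced from):
;
;   #include <stdint.h>
;
;   /* Count how many of the four 16-bit lanes are non-zero. */
;   uint32_t cols_needed(const uint16_t wide_load[4]) {
;     uint32_t sum = 0;
;     for (int i = 0; i < 4; ++i)
;       sum += (wide_load[i] != 0) ? 1u : 0u; /* icmp ne + zext to i32 */
;     return sum;                             /* llvm.vector.reduce.add */
;   }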

define i32 @cols_needed(<4 x i16> %wide.load) {
; POWERPC_64LE-LABEL: cols_needed:
; POWERPC_64LE: # %bb.0: # %entry
; POWERPC_64LE-NEXT: xxlxor v3, v3, v3
; POWERPC_64LE-NEXT: li r3, 0
; POWERPC_64LE-NEXT: vcmpequh v2, v2, v3
; POWERPC_64LE-NEXT: xxleqv v3, v3, v3
; POWERPC_64LE-NEXT: vmrglh v2, v2, v2
; POWERPC_64LE-NEXT: vsubuwm v2, v2, v3
; POWERPC_64LE-NEXT: xxswapd v3, v2
; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
; POWERPC_64LE-NEXT: xxspltw v3, v2, 2
; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
; POWERPC_64LE-NEXT: vextuwrx r3, r3, v2
; POWERPC_64LE-NEXT: blr
;
; POWERPC_64-LABEL: cols_needed:
; POWERPC_64: # %bb.0: # %entry
; POWERPC_64-NEXT: xxlxor v3, v3, v3
; POWERPC_64-NEXT: li r3, 0
; POWERPC_64-NEXT: vcmpequh v2, v2, v3
; POWERPC_64-NEXT: xxleqv v3, v3, v3
; POWERPC_64-NEXT: vmrghh v2, v2, v2
; POWERPC_64-NEXT: vsubuwm v2, v2, v3
; POWERPC_64-NEXT: xxswapd v3, v2
; POWERPC_64-NEXT: vadduwm v2, v2, v3
; POWERPC_64-NEXT: xxspltw v3, v2, 1
; POWERPC_64-NEXT: vadduwm v2, v2, v3
; POWERPC_64-NEXT: vextuwlx r3, r3, v2
; POWERPC_64-NEXT: blr
;
; POWERPC_32-LABEL: cols_needed:
; POWERPC_32: # %bb.0: # %entry
; POWERPC_32-NEXT: xxlxor v3, v3, v3
; POWERPC_32-NEXT: vcmpequh v2, v2, v3
; POWERPC_32-NEXT: xxleqv v3, v3, v3
; POWERPC_32-NEXT: vmrghh v2, v2, v2
; POWERPC_32-NEXT: vsubuwm v2, v2, v3
; POWERPC_32-NEXT: xxswapd v3, v2
; POWERPC_32-NEXT: vadduwm v2, v2, v3
; POWERPC_32-NEXT: xxspltw v3, v2, 1
; POWERPC_32-NEXT: vadduwm v2, v2, v3
; POWERPC_32-NEXT: stxv v2, -16(r1)
; POWERPC_32-NEXT: lwz r3, -16(r1)
; POWERPC_32-NEXT: blr
entry:
  %0 = icmp ne <4 x i16> %wide.load, zeroinitializer
  %1 = zext <4 x i1> %0 to <4 x i32>
  %2 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #0

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }