|
| 1 | +/*************************************************************************** |
| 2 | +Copyright (c) 2017, The OpenBLAS Project |
| 3 | +All rights reserved. |
| 4 | +Redistribution and use in source and binary forms, with or without |
| 5 | +modification, are permitted provided that the following conditions are |
| 6 | +met: |
| 7 | +1. Redistributions of source code must retain the above copyright |
| 8 | +notice, this list of conditions and the following disclaimer. |
| 9 | +2. Redistributions in binary form must reproduce the above copyright |
| 10 | +notice, this list of conditions and the following disclaimer in |
| 11 | +the documentation and/or other materials provided with the |
| 12 | +distribution. |
| 13 | +3. Neither the name of the OpenBLAS project nor the names of |
| 14 | +its contributors may be used to endorse or promote products |
| 15 | +derived from this software without specific prior written permission. |
| 16 | +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE |
| 20 | +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 21 | +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| 22 | +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| 23 | +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 24 | +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
| 25 | +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 26 | +*****************************************************************************/ |
| 27 | + |
| 28 | +#include "common.h" |
| 29 | + |
| 30 | +#include <arm_neon.h> |
| 31 | + |
| 32 | +#define N "x0" /* vector length */ |
| 33 | +#define X "x1" /* "X" vector address */ |
| 34 | +#define INC_X "x2" /* "X" stride */ |
| 35 | +#define J "x5" /* loop variable */ |
| 36 | + |
| 37 | +#define REG0 "xzr" |
| 38 | +#define SUMF "d0" |
| 39 | +#define TMPF "d1" |
| 40 | + |
| 41 | +/******************************************************************************/ |
| 42 | + |
| 43 | +#define KERNEL_F1 \ |
| 44 | + "ldr "TMPF", ["X"] \n" \ |
| 45 | + "add "X", "X", #8 \n" \ |
| 46 | + "fadd "SUMF", "SUMF", "TMPF" \n" |
| 47 | + |
| 48 | +#define KERNEL_F32 \ |
| 49 | + "ldr q16, ["X"] \n" \ |
| 50 | + "ldr q17, ["X", #16] \n" \ |
| 51 | + "ldr q18, ["X", #32] \n" \ |
| 52 | + "ldr q19, ["X", #48] \n" \ |
| 53 | + "ldp q20, q21, ["X", #64] \n" \ |
| 54 | + "ldp q22, q23, ["X", #96] \n" \ |
| 55 | + "ldp q24, q25, ["X", #128] \n" \ |
| 56 | + "ldp q26, q27, ["X", #160] \n" \ |
| 57 | + "fadd v16.2d, v16.2d, v17.2d \n" \ |
| 58 | + "fadd v18.2d, v18.2d, v19.2d \n" \ |
| 59 | + "ldp q28, q29, ["X", #192] \n" \ |
| 60 | + "ldp q30, q31, ["X", #224] \n" \ |
| 61 | + "add "X", "X", #256 \n" \ |
| 62 | + "fadd v20.2d, v20.2d, v21.2d \n" \ |
| 63 | + "fadd v22.2d, v22.2d, v23.2d \n" \ |
| 64 | + "PRFM PLDL1KEEP, ["X", #1024] \n" \ |
| 65 | + "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ |
| 66 | + "fadd v24.2d, v24.2d, v25.2d \n" \ |
| 67 | + "fadd v26.2d, v26.2d, v27.2d \n" \ |
| 68 | + "fadd v28.2d, v28.2d, v29.2d \n" \ |
| 69 | + "fadd v30.2d, v30.2d, v31.2d \n" \ |
| 70 | + "fadd v0.2d, v0.2d, v16.2d \n" \ |
| 71 | + "fadd v1.2d, v1.2d, v18.2d \n" \ |
| 72 | + "fadd v2.2d, v2.2d, v20.2d \n" \ |
| 73 | + "fadd v3.2d, v3.2d, v22.2d \n" \ |
| 74 | + "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ |
| 75 | + "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ |
| 76 | + "fadd v4.2d, v4.2d, v24.2d \n" \ |
| 77 | + "fadd v5.2d, v5.2d, v26.2d \n" \ |
| 78 | + "fadd v6.2d, v6.2d, v28.2d \n" \ |
| 79 | + "fadd v7.2d, v7.2d, v30.2d \n" |
| 80 | + |
| 81 | +#define KERNEL_F32_FINALIZE \ |
| 82 | + "fadd v0.2d, v0.2d, v1.2d \n" \ |
| 83 | + "fadd v2.2d, v2.2d, v3.2d \n" \ |
| 84 | + "fadd v4.2d, v4.2d, v5.2d \n" \ |
| 85 | + "fadd v6.2d, v6.2d, v7.2d \n" \ |
| 86 | + "fadd v0.2d, v0.2d, v2.2d \n" \ |
| 87 | + "fadd v4.2d, v4.2d, v6.2d \n" \ |
| 88 | + "fadd v0.2d, v0.2d, v4.2d \n" \ |
| 89 | + "faddp "SUMF", v0.2d \n" |
| 90 | + |
| 91 | +#define INIT_S \ |
| 92 | + "lsl "INC_X", "INC_X", #3 \n" |
| 93 | + |
| 94 | +#define KERNEL_S1 \ |
| 95 | + "ldr "TMPF", ["X"] \n" \ |
| 96 | + "add "X", "X", "INC_X" \n" \ |
| 97 | + "fadd "SUMF", "SUMF", "TMPF" \n" |
| 98 | + |
| 99 | + |
| 100 | +#if defined(SMP) |
| 101 | +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, |
| 102 | + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, |
| 103 | + void *c, BLASLONG ldc, int (*function)(), int nthreads); |
| 104 | +#endif |
| 105 | + |
| 106 | + |
| 107 | +static FLOAT dsum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
| 108 | +{ |
| 109 | + FLOAT dsum = 0.0 ; |
| 110 | + |
| 111 | + if ( n < 0 ) return(dsum); |
| 112 | + |
| 113 | + __asm__ __volatile__ ( |
| 114 | + " mov "N", %[N_] \n" |
| 115 | + " mov "X", %[X_] \n" |
| 116 | + " mov "INC_X", %[INCX_] \n" |
| 117 | + " fmov "SUMF", "REG0" \n" |
| 118 | + " fmov d1, "REG0" \n" |
| 119 | + " fmov d2, "REG0" \n" |
| 120 | + " fmov d3, "REG0" \n" |
| 121 | + " fmov d4, "REG0" \n" |
| 122 | + " fmov d5, "REG0" \n" |
| 123 | + " fmov d6, "REG0" \n" |
| 124 | + " fmov d7, "REG0" \n" |
| 125 | + " cmp "N", xzr \n" |
| 126 | + " ble 9f //dsum_kernel_L999 \n" |
| 127 | + " cmp "INC_X", xzr \n" |
| 128 | + " ble 9f //dsum_kernel_L999 \n" |
| 129 | + " cmp "INC_X", #1 \n" |
| 130 | + " bne 5f //dsum_kernel_S_BEGIN \n" |
| 131 | + |
| 132 | + "1: //dsum_kernel_F_BEGIN: \n" |
| 133 | + " asr "J", "N", #5 \n" |
| 134 | + " cmp "J", xzr \n" |
| 135 | + " beq 3f //dsum_kernel_F1 \n" |
| 136 | + |
| 137 | +#if !(defined(__clang__) && defined(OS_WINDOWS)) |
| 138 | + ".align 5 \n" |
| 139 | +#endif |
| 140 | + "2: //dsum_kernel_F32: \n" |
| 141 | + " "KERNEL_F32" \n" |
| 142 | + " subs "J", "J", #1 \n" |
| 143 | + " bne 2b //dsum_kernel_F32 \n" |
| 144 | + " "KERNEL_F32_FINALIZE" \n" |
| 145 | + |
| 146 | + "3: //dsum_kernel_F1: \n" |
| 147 | + " ands "J", "N", #31 \n" |
| 148 | + " ble 9f //dsum_kernel_L999 \n" |
| 149 | + |
| 150 | + "4: //dsum_kernel_F10: \n" |
| 151 | + " "KERNEL_F1" \n" |
| 152 | + " subs "J", "J", #1 \n" |
| 153 | + " bne 4b //dsum_kernel_F10 \n" |
| 154 | + " b 9f //dsum_kernel_L999 \n" |
| 155 | + |
| 156 | + "5: //dsum_kernel_S_BEGIN: \n" |
| 157 | + " "INIT_S" \n" |
| 158 | + " asr "J", "N", #2 \n" |
| 159 | + " cmp "J", xzr \n" |
| 160 | + " ble 7f //dsum_kernel_S1 \n" |
| 161 | + |
| 162 | + "6: //dsum_kernel_S4: \n" |
| 163 | + " "KERNEL_S1" \n" |
| 164 | + " "KERNEL_S1" \n" |
| 165 | + " "KERNEL_S1" \n" |
| 166 | + " "KERNEL_S1" \n" |
| 167 | + " subs "J", "J", #1 \n" |
| 168 | + " bne 6b //dsum_kernel_S4 \n" |
| 169 | + |
| 170 | + "7: //dsum_kernel_S1: \n" |
| 171 | + " ands "J", "N", #3 \n" |
| 172 | + " ble 9f //dsum_kernel_L999 \n" |
| 173 | + |
| 174 | + "8: //dsum_kernel_S10: \n" |
| 175 | + " "KERNEL_S1" \n" |
| 176 | + " subs "J", "J", #1 \n" |
| 177 | + " bne 8b //dsum_kernel_S10 \n" |
| 178 | + |
| 179 | + "9: //dsum_kernel_L999: \n" |
| 180 | + " fmov %[DSUM_], "SUMF" \n" |
| 181 | + |
| 182 | + : [DSUM_] "=r" (dsum) //%0 |
| 183 | + : [N_] "r" (n), //%1 |
| 184 | + [X_] "r" (x), //%2 |
| 185 | + [INCX_] "r" (inc_x) //%3 |
| 186 | + : "cc", |
| 187 | + "memory", |
| 188 | + "x0", "x1", "x2", "x3", "x4", "x5", |
| 189 | + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" |
| 190 | + ); |
| 191 | + |
| 192 | + return dsum; |
| 193 | +} |
| 194 | + |
| 195 | +#if defined(SMP) |
| 196 | +static int dsum_thread_function(BLASLONG n, BLASLONG dummy0, |
| 197 | + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, |
| 198 | + BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) |
| 199 | +{ |
| 200 | + *result = dsum_compute(n, x, inc_x); |
| 201 | + |
| 202 | + return 0; |
| 203 | +} |
| 204 | +#endif |
| 205 | + |
| 206 | +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
| 207 | +{ |
| 208 | +#if defined(SMP) |
| 209 | + int nthreads; |
| 210 | + FLOAT dummy_alpha; |
| 211 | +#endif |
| 212 | + FLOAT dsum = 0.0; |
| 213 | + |
| 214 | +#if defined(SMP) |
| 215 | + if (inc_x == 0 || n <= 10000) |
| 216 | + nthreads = 1; |
| 217 | + else |
| 218 | + nthreads = num_cpu_avail(1); |
| 219 | + |
| 220 | + if (nthreads == 1) { |
| 221 | + dsum = dsum_compute(n, x, inc_x); |
| 222 | + } else { |
| 223 | + int mode, i; |
| 224 | + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; |
| 225 | + FLOAT *ptr; |
| 226 | + |
| 227 | + mode = BLAS_DOUBLE; |
| 228 | + |
| 229 | + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, |
| 230 | + x, inc_x, NULL, 0, result, 0, |
| 231 | + ( void *)dsum_thread_function, nthreads); |
| 232 | + |
| 233 | + ptr = (FLOAT *)result; |
| 234 | + for (i = 0; i < nthreads; i++) { |
| 235 | + dsum = dsum + (*ptr); |
| 236 | + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); |
| 237 | + } |
| 238 | + } |
| 239 | +#else |
| 240 | + dsum = dsum_compute(n, x, inc_x); |
| 241 | +#endif |
| 242 | + |
| 243 | + return dsum; |
| 244 | +} |
0 commit comments