diff --git a/src/nmod_poly/evaluate_nmod.c b/src/nmod_poly/evaluate_nmod.c index 5e843005da..4f0b67313f 100644 --- a/src/nmod_poly/evaluate_nmod.c +++ b/src/nmod_poly/evaluate_nmod.c @@ -1,6 +1,6 @@ /* Copyright (C) 2010 William Hart - Copyright (C) 2024 Vincent Neiger + Copyright (C) 2025 Vincent Neiger This file is part of FLINT. @@ -15,91 +15,680 @@ #include "nmod_poly.h" #include "nmod.h" -ulong -_nmod_poly_evaluate_nmod(nn_srcptr poly, slong len, ulong c, nmod_t mod) +/*--------------------------------------*/ +/* special value: 1 (one) and -1 (mone) */ +/*--------------------------------------*/ + +/* general */ +static ulong _nmod_poly_evaluate_one(nn_srcptr poly, slong len, ulong modn) { - slong m; - ulong val; + if (len == 0) + return 0; + + if (len <= 3) + { + ulong val = poly[0]; + for (slong m = 1; m < len; m++) + val = n_addmod(val, poly[m], modn); + return val; + } + + ulong val0, val1, val2, val3; + + val0 = poly[0]; + val1 = poly[1]; + val2 = poly[2]; + val3 = poly[3]; + + slong m = 4; + + for ( ; m+3 < len; m += 4) + { + val0 = n_addmod(val0, poly[m+0], modn); + val1 = n_addmod(val1, poly[m+1], modn); + val2 = n_addmod(val2, poly[m+2], modn); + val3 = n_addmod(val3, poly[m+3], modn); + } + + /* gather results */ + val2 = n_addmod(val2, val3, modn); + val0 = n_addmod(val0, val1, modn); + val0 = n_addmod(val0, val2, modn); + + /* last few terms */ + for ( ; m < len; m++) + val0 = n_addmod(val0, poly[m], modn); + return val0; +} + +/* requires nbits(modn) <= FLINT_BITS - 1 */ +static ulong _nmod_poly_evaluate_one1(nn_srcptr poly, slong len, ulong modn) +{ if (len == 0) return 0; - if (len == 1 || c == 0) + if (len <= 3) + { + ulong val = poly[0]; + for (slong m = 1; m < len; m++) + val = n_addmod(val, poly[m], modn); + return val; + } + + ulong val0, val1, val2, val3; + + val0 = poly[0]; + val1 = poly[1]; + val2 = poly[2]; + val3 = poly[3]; + + slong m = 4; + + for ( ; m+3 < len; m += 4) + { + val0 = val0 + poly[m+0]; + val1 = val1 + poly[m+1]; + val2 = val2 + poly[m+2]; + val3 = val3 + poly[m+3]; + if (val0 >= modn) + val0 -= modn; + if (val1 >= modn) + val1 -= modn; + if (val2 >= modn) + val2 -= modn; + if (val3 >= modn) + val3 -= modn; + } + + /* handle 2 more terms if they exist */ + if (m+1 < len) + { + val0 = val0 + poly[m+0]; + val1 = val1 + poly[m+1]; + if (val0 >= modn) + val0 -= modn; + if (val1 >= modn) + val1 -= modn; + m += 2; + } + + /* remains 0 or 1 term */ + if (m < len) + { + val0 = val0 + poly[m+0]; + if (val0 >= modn) + val0 -= modn; + } + + /* gather results */ + val2 = n_addmod(val2, val3, modn); + val0 = n_addmod(val0, val1, modn); + val0 = n_addmod(val0, val2, modn); + + return val0; +} + +/* requires nbits(modn) <= FLINT_BITS - 2 */ +static ulong _nmod_poly_evaluate_one2(nn_srcptr poly, slong len, ulong modn) +{ + if (len == 0) + return 0; + + if (len <= 7) + { + ulong val = poly[0]; + for (slong m = 1; m < len; m++) + val = n_addmod(val, poly[m], modn); + return val; + } + + ulong val0, val1, val2, val3; + + /* initial values, in [0, 2*modn) */ + val0 = poly[0] + poly[4]; + val1 = poly[1] + poly[5]; + val2 = poly[2] + poly[6]; + val3 = poly[3] + poly[7]; + + slong m = 8; + for ( ; m+7 < len; m += 8) + { + /* add new values */ + val0 += poly[m+0] + poly[m+4]; + val1 += poly[m+1] + poly[m+5]; + val2 += poly[m+2] + poly[m+6]; + val3 += poly[m+3] + poly[m+7]; + + /* reduce to [0, 2*modn) */ + if (val0 >= 2*modn) + val0 -= 2*modn; + if (val1 >= 2*modn) + val1 -= 2*modn; + if (val2 >= 2*modn) + val2 -= 2*modn; + if (val3 >= 2*modn) + val3 -= 2*modn; + } + + /* handle 4 more terms if they exist */ + if (m+3 < len) + { + val0 += poly[m+0]; + val1 += poly[m+1]; + val2 += poly[m+2]; + val3 += poly[m+3]; + m += 4; + } + + /* handle 2 more terms if they exist */ + if (m+1 < len) + { + val0 += poly[m+0]; + val1 += poly[m+1]; + m += 2; + } + + /* remains 0 or 1 term */ + if (m < len) + val2 += poly[m+0]; + + /* reduce to [0, 2*modn) */ + if (val0 >= 2*modn) + val0 -= 2*modn; + if (val1 >= 2*modn) + val1 -= 2*modn; + if (val2 >= 2*modn) + val2 -= 2*modn; + if (val3 >= 2*modn) + val3 -= 2*modn; + + /* gather results */ + val0 = val0 + val1; + if (val0 >= 2*modn) + val0 -= 2*modn; + val2 = val2 + val3; + if (val2 >= 2*modn) + val2 -= 2*modn; + val0 = val0 + val2; + if (val0 >= 2*modn) + val0 -= 2*modn; + if (val0 >= modn) + val0 -= modn; + + return val0; +} + +/* general */ +static ulong _nmod_poly_evaluate_mone(nn_srcptr poly, slong len, ulong modn) +{ + if (len == 0) + return 0; + + if (len == 1) return poly[0]; - m = len - 1; + if (len == 2) + return n_submod(poly[0], poly[1], modn); - val = poly[m]; - m--; + if (len == 3) + return n_addmod(n_submod(poly[0], poly[1], modn), + poly[2], modn); - for ( ; m >= 0; m--) + ulong val0, val1, val2, val3; + + val0 = poly[0]; + val1 = poly[1]; + val2 = poly[2]; + val3 = poly[3]; + + slong m = 4; + + for ( ; m+3 < len; m += 4) + { + val0 = n_addmod(val0, poly[m+0], modn); + val1 = n_addmod(val1, poly[m+1], modn); + val2 = n_addmod(val2, poly[m+2], modn); + val3 = n_addmod(val3, poly[m+3], modn); + } + + /* handle 2 more terms if they exist */ + if (m+1 < len) { - val = nmod_mul(val, c, mod); - val = n_addmod(val, poly[m], mod.n); + val0 = n_addmod(val0, poly[m+0], modn); + val1 = n_addmod(val1, poly[m+1], modn); + m += 2; } - return val; + /* remains 0 or 1 term */ + if (m < len) + val0 = n_addmod(val0, poly[m+0], modn); + + /* gather results */ + val2 = n_submod(val2, val3, modn); + val0 = n_submod(val0, val1, modn); + val0 = n_addmod(val0, val2, modn); + + return val0; } -ulong -_nmod_poly_evaluate_nmod_precomp(nn_srcptr poly, slong len, ulong c, ulong c_precomp, ulong modn) +/* requires nbits(modn) <= FLINT_BITS - 1 */ +static ulong _nmod_poly_evaluate_mone1(nn_srcptr poly, slong len, ulong modn) { - slong m; - ulong val; + if (len == 0) + return 0; + + if (len == 1) + return poly[0]; + + if (len == 2) + return n_submod(poly[0], poly[1], modn); + + if (len == 3) + return n_addmod(n_submod(poly[0], poly[1], modn), + poly[2], modn); + + ulong val0, val1, val2, val3; + + val0 = poly[0]; + val1 = poly[1]; + val2 = poly[2]; + val3 = poly[3]; + slong m = 4; + + for ( ; m+3 < len; m += 4) + { + val0 = val0 + poly[m+0]; + val1 = val1 + poly[m+1]; + val2 = val2 + poly[m+2]; + val3 = val3 + poly[m+3]; + if (val0 >= modn) + val0 -= modn; + if (val1 >= modn) + val1 -= modn; + if (val2 >= modn) + val2 -= modn; + if (val3 >= modn) + val3 -= modn; + } + + /* handle 2 more terms if they exist */ + if (m+1 < len) + { + val0 = val0 + poly[m+0]; + val1 = val1 + poly[m+1]; + if (val0 >= modn) + val0 -= modn; + if (val1 >= modn) + val1 -= modn; + m += 2; + } + + /* remains 0 or 1 term */ + if (m < len) + { + val0 = val0 + poly[m+0]; + if (val0 >= modn) + val0 -= modn; + } + + /* gather results */ + val2 = n_submod(val2, val3, modn); + val0 = n_submod(val0, val1, modn); + val0 = n_addmod(val0, val2, modn); + + return val0; +} + +/* requires nbits(modn) <= FLINT_BITS - 2 */ +static ulong _nmod_poly_evaluate_mone2(nn_srcptr poly, slong len, ulong modn) +{ + if (len == 0) + return 0; + + if (len == 1) + return poly[0]; + + if (len == 2) + return n_submod(poly[0], poly[1], modn); + + if (len == 3) + return n_addmod(n_submod(poly[0], poly[1], modn), + poly[2], modn); + + if (len <= 7) + { + ulong val0, val1; + + val0 = poly[0] + poly[2]; + val1 = poly[1] + poly[3]; + + /* handle 2 more terms if they exist */ + slong m = 4; + if (m+1 < len) + { + val0 += poly[m+0]; + val1 += poly[m+1]; + m += 2; + } + + if (m < len) + val0 += poly[m]; + + /* remains 0 or 1 term */ + + if (val0 >= 2*modn) + val0 -= 2*modn; + if (val1 >= 2*modn) + val1 -= 2*modn; + val0 = val0 + 2*modn - val1; + if (val0 >= 2*modn) + val0 -= 2*modn; + if (val0 >= modn) + val0 -= modn; + + return val0; + } + + ulong val0, val1, val2, val3; + + /* initial values, in [0, 2*modn) */ + val0 = poly[0] + poly[4]; + val1 = poly[1] + poly[5]; + val2 = poly[2] + poly[6]; + val3 = poly[3] + poly[7]; + + slong m = 8; + for ( ; m+7 < len; m += 8) + { + /* add new values */ + val0 += poly[m+0] + poly[m+4]; + val1 += poly[m+1] + poly[m+5]; + val2 += poly[m+2] + poly[m+6]; + val3 += poly[m+3] + poly[m+7]; + + /* reduce to [0, 2*modn) */ + if (val0 >= 2*modn) + val0 -= 2*modn; + if (val1 >= 2*modn) + val1 -= 2*modn; + if (val2 >= 2*modn) + val2 -= 2*modn; + if (val3 >= 2*modn) + val3 -= 2*modn; + } + + /* handle 4 more terms if they exist */ + if (m+3 < len) + { + val0 += poly[m+0]; + val1 += poly[m+1]; + val2 += poly[m+2]; + val3 += poly[m+3]; + m += 4; + } + + /* handle 2 more terms if they exist */ + if (m+1 < len) + { + val0 += poly[m+0]; + val1 += poly[m+1]; + m += 2; + } + + /* remains 0 or 1 term */ + if (m < len) + val2 += poly[m+0]; + + /* reduce to [0, 2*modn) */ + if (val0 >= 2*modn) + val0 -= 2*modn; + if (val1 >= 2*modn) + val1 -= 2*modn; + if (val2 >= 2*modn) + val2 -= 2*modn; + if (val3 >= 2*modn) + val3 -= 2*modn; + + /* gather results */ + val0 = val0 + val2; + if (val0 >= 2*modn) + val0 -= 2*modn; + val1 = val1 + val3; + if (val1 >= 2*modn) + val1 -= 2*modn; + val0 = val0 + 2*modn - val1; + if (val0 >= 2*modn) + val0 -= 2*modn; + if (val0 >= modn) + val0 -= modn; + + return val0; +} + +/*-----------*/ +/* general c */ +/*-----------*/ + +ulong _nmod_poly_evaluate_nmod(nn_srcptr poly, slong len, ulong c, nmod_t mod) +{ if (len == 0) return 0; if (len == 1 || c == 0) return poly[0]; + if (len <= 11) + { + slong m = len - 1; + ulong val = poly[m]; + m -= 1; + for ( ; m >= 0; m--) + { + val = nmod_mul(val, c, mod); + val = n_addmod(val, poly[m], mod.n); + } + return val; + } + + ulong c4, val0, val1, val2, val3; + slong m; + + c4 = nmod_mul(c, c, mod); + c4 = nmod_mul(c4, c4, mod); + m = len - 1; + val0 = poly[m-3]; + val1 = poly[m-2]; + val2 = poly[m-1]; + val3 = poly[m-0]; + m -= 4; - val = poly[m]; - m--; + for ( ; m-3 >= 0; m -= 4) + { + val0 = nmod_mul(val0, c4, mod); + val0 = n_addmod(val0, poly[m-3], mod.n); + val1 = nmod_mul(val1, c4, mod); + val1 = n_addmod(val1, poly[m-2], mod.n); + val2 = nmod_mul(val2, c4, mod); + val2 = n_addmod(val2, poly[m-1], mod.n); + val3 = nmod_mul(val3, c4, mod); + val3 = n_addmod(val3, poly[m-0], mod.n); + } + + /* gather results with Horner */ + val2 = n_addmod(val2, nmod_mul(val3, c, mod), mod.n); + val1 = n_addmod(val1, nmod_mul(val2, c, mod), mod.n); + val0 = n_addmod(val0, nmod_mul(val1, c, mod), mod.n); + /* last few terms */ for ( ; m >= 0; m--) { - val = n_mulmod_shoup(c, val, c_precomp, modn); - val = n_addmod(val, poly[m], modn); + val0 = nmod_mul(val0, c, mod); + val0 = n_addmod(val0, poly[m], mod.n); } - return val; + return val0; } -ulong -_nmod_poly_evaluate_nmod_precomp_lazy(nn_srcptr poly, slong len, ulong c, ulong c_precomp, ulong modn) +ulong _nmod_poly_evaluate_nmod_precomp(nn_srcptr poly, slong len, ulong c, ulong c_precomp, ulong modn) { + if (len == 0) + return 0; + + if (len == 1 || c == 0) + return poly[0]; + + if (len <= 11) + { + slong m = len - 1; + ulong val = poly[m]; + m -= 1; + for ( ; m >= 0; m--) + { + val = n_mulmod_shoup(c, val, c_precomp, modn); + val = n_addmod(val, poly[m], modn); + } + return val; + } + + ulong val0, val1, val2, val3; slong m; - ulong val, p_hi, p_lo; + /* precomputations for c**4 */ + ulong c4, c4_precomp; + val0 = n_mulmod_precomp_shoup_rem_from_quo(c_precomp, modn); + n_mulmod_and_precomp_shoup(&c4, &c4_precomp, c, c, c_precomp, val0, c_precomp, modn); + val0 = n_mulmod_precomp_shoup_rem_from_quo(c4_precomp, modn); + n_mulmod_and_precomp_shoup(&c4, &c4_precomp, c4, c4, c4_precomp, val0, c4_precomp, modn); + + m = len - 1; + val0 = poly[m-3]; + val1 = poly[m-2]; + val2 = poly[m-1]; + val3 = poly[m-0]; + m -= 4; + + for ( ; m-3 >= 0; m -= 4) + { + val0 = n_mulmod_shoup(c4, val0, c4_precomp, modn); + val0 = n_addmod(val0, poly[m-3], modn); + val1 = n_mulmod_shoup(c4, val1, c4_precomp, modn); + val1 = n_addmod(val1, poly[m-2], modn); + val2 = n_mulmod_shoup(c4, val2, c4_precomp, modn); + val2 = n_addmod(val2, poly[m-1], modn); + val3 = n_mulmod_shoup(c4, val3, c4_precomp, modn); + val3 = n_addmod(val3, poly[m-0], modn); + } + + /* gather results with Horner */ + val3 = n_mulmod_shoup(c, val3, c_precomp, modn); + val2 = n_addmod(val2, val3, modn); + val2 = n_mulmod_shoup(c, val2, c_precomp, modn); + val1 = n_addmod(val1, val2, modn); + val1 = n_mulmod_shoup(c, val1, c_precomp, modn); + val0 = n_addmod(val0, val1, modn); + + /* last few terms */ + for ( ; m >= 0; m--) + { + val0 = n_mulmod_shoup(c, val0, c_precomp, modn); + val0 = n_addmod(val0, poly[m], modn); + } + + return val0; +} + +ulong _nmod_poly_evaluate_nmod_precomp_lazy(nn_srcptr poly, slong len, ulong c, ulong c_precomp, ulong modn) +{ if (len == 0) return 0; if (len == 1 || c == 0) return poly[0]; + if (len <= 15) + { + ulong p_hi, p_lo; + slong m = len - 1; + ulong val = poly[m]; + m -= 1; + for ( ; m >= 0; m--) + { + umul_ppmm(p_hi, p_lo, c_precomp, val); + val = poly[m] + c * val - p_hi * modn; + } + return val; + } + + ulong p_hi, p_lo; + ulong val0, val1, val2, val3; + slong m; + + /* precomputations for c**4 */ + ulong c4, c4_precomp; + val0 = n_mulmod_precomp_shoup_rem_from_quo(c_precomp, modn); + n_mulmod_and_precomp_shoup(&c4, &c4_precomp, c, c, c_precomp, val0, c_precomp, modn); + val0 = n_mulmod_precomp_shoup_rem_from_quo(c4_precomp, modn); + n_mulmod_and_precomp_shoup(&c4, &c4_precomp, c4, c4, c4_precomp, val0, c4_precomp, modn); + m = len - 1; + val0 = poly[m-3]; + val1 = poly[m-2]; + val2 = poly[m-1]; + val3 = poly[m-0]; + m -= 4; - val = poly[m]; - m--; + for ( ; m-3 >= 0; m -= 4) + { + umul_ppmm(p_hi, p_lo, c4_precomp, val0); + val0 = poly[m-3] + c4 * val0 - p_hi * modn; + umul_ppmm(p_hi, p_lo, c4_precomp, val1); + val1 = poly[m-2] + c4 * val1 - p_hi * modn; + umul_ppmm(p_hi, p_lo, c4_precomp, val2); + val2 = poly[m-1] + c4 * val2 - p_hi * modn; + umul_ppmm(p_hi, p_lo, c4_precomp, val3); + val3 = poly[m-0] + c4 * val3 - p_hi * modn; + } + + /* each val is in [0, max(poly) + 2*modn) */ + /* --> reduce to [0, max(poly)] */ + if (val2 >= 2*modn) + val2 -= 2*modn; + else if (val2 >= modn) + val2 -= modn; + if (val1 >= 2*modn) + val1 -= 2*modn; + else if (val1 >= modn) + val1 -= modn; + if (val0 >= 2*modn) + val0 -= 2*modn; + else if (val0 >= modn) + val0 -= modn; + /* gather results with Horner */ + umul_ppmm(p_hi, p_lo, c_precomp, val3); + val2 = val2 + c * val3 - p_hi * modn; + umul_ppmm(p_hi, p_lo, c_precomp, val2); + val1 = val1 + c * val2 - p_hi * modn; + umul_ppmm(p_hi, p_lo, c_precomp, val1); + val0 = val0 + c * val1 - p_hi * modn; + /* last few terms */ for ( ; m >= 0; m--) { - // computes either val = (c*val mod n) or val = (c*val mod n) + n - // see documentation of ulong_extras / n_mulmod_shoup for details - umul_ppmm(p_hi, p_lo, c_precomp, val); - val = c * val - p_hi * modn; - // lazy addition, yields val in [0..k+2n-1), where max(poly) < k - // --> if k == n (poly is reduced mod n), constraint: 3n-1 <= 2**FLINT_BITS - val += poly[m]; + umul_ppmm(p_hi, p_lo, c_precomp, val0); + val0 = poly[m] + c * val0 - p_hi * modn; } - return val; + return val0; } -ulong -nmod_poly_evaluate_nmod(const nmod_poly_t poly, ulong c) +/*----------------*/ +/* main interface */ +/*----------------*/ + +ulong nmod_poly_evaluate_nmod(const nmod_poly_t poly, ulong c) { if (poly->length == 0) return 0; @@ -107,6 +696,26 @@ nmod_poly_evaluate_nmod(const nmod_poly_t poly, ulong c) if (poly->length == 1 || c == 0) return poly->coeffs[0]; + if (c == 1) + { + if (poly->mod.norm >= 2) + return _nmod_poly_evaluate_one2(poly->coeffs, poly->length, poly->mod.n); + else if (poly->mod.norm == 1) + return _nmod_poly_evaluate_one1(poly->coeffs, poly->length, poly->mod.n); + else + return _nmod_poly_evaluate_one(poly->coeffs, poly->length, poly->mod.n); + } + + if (c == poly->mod.n - 1) + { + if (poly->mod.norm >= 2) + return _nmod_poly_evaluate_mone2(poly->coeffs, poly->length, poly->mod.n); + else if (poly->mod.norm == 1) + return _nmod_poly_evaluate_mone1(poly->coeffs, poly->length, poly->mod.n); + else + return _nmod_poly_evaluate_mone(poly->coeffs, poly->length, poly->mod.n); + } + // if degree below the n_mulmod_shoup threshold // or modulus forbids n_mulmod_shoup usage, use nmod_mul #if FLINT_MULMOD_SHOUP_THRESHOLD <= 2 diff --git a/src/nmod_poly/profile/p-evaluate_nmod.c b/src/nmod_poly/profile/p-evaluate_nmod.c index b9854c32c0..f4d5e46ca8 100644 --- a/src/nmod_poly/profile/p-evaluate_nmod.c +++ b/src/nmod_poly/profile/p-evaluate_nmod.c @@ -15,10 +15,26 @@ #include "nmod_poly.h" #include "nmod_vec.h" +#define NB_LENS 20 +#define NB_REPS 10 + +/* only set if wanting the full bench with static functions */ +/* --> this implies temporarily making non-static in the relevant files */ +#define __DO_BENCH_STATIC 0 +#if __DO_BENCH_STATIC /* declare prototypes */ +ulong _nmod_poly_evaluate_one(nn_srcptr poly, slong len, ulong modn); +ulong _nmod_poly_evaluate_one1(nn_srcptr poly, slong len, ulong modn); +ulong _nmod_poly_evaluate_one2(nn_srcptr poly, slong len, ulong modn); +ulong _nmod_poly_evaluate_mone(nn_srcptr poly, slong len, ulong modn); +ulong _nmod_poly_evaluate_mone1(nn_srcptr poly, slong len, ulong modn); +ulong _nmod_poly_evaluate_mone2(nn_srcptr poly, slong len, ulong modn); +#endif + typedef struct { flint_bitcnt_t bits; slong length; + slong ctype; /* -1: c == -1; +1: c == +1; else: c random */ } info_t; void sample_interface(void * arg, ulong count) @@ -27,6 +43,7 @@ void sample_interface(void * arg, ulong count) info_t * info = (info_t *) arg; flint_bitcnt_t bits = info->bits; slong length = info->length; + slong ctype = info->ctype; slong i, j; nmod_poly_t poly; @@ -34,7 +51,7 @@ void sample_interface(void * arg, ulong count) FLINT_TEST_INIT(state); - for (i = 0; i < count; i++) + for (i = 0; i < (slong)count; i++) { n = n_randbits(state, bits); if (n == UWORD(0)) n++; @@ -44,10 +61,15 @@ void sample_interface(void * arg, ulong count) for (j = 0; j < length; j++) poly->coeffs[j] = n_randint(state, n); - pt = n_randint(state, n); + if (ctype == -1) + pt = n-1; + else if (ctype == 1) + pt = UWORD(1); + else + pt = n_randint(state, n); prof_start(); - for (j = 0; j < 100; j++) + for (j = 0; j < NB_REPS; j++) nmod_poly_evaluate_nmod(poly, pt); prof_stop(); @@ -64,6 +86,7 @@ void sample_generic(void * arg, ulong count) info_t * info = (info_t *) arg; flint_bitcnt_t bits = info->bits; slong length = info->length; + slong ctype = info->ctype; slong i, j; ulong * poly; @@ -73,7 +96,7 @@ void sample_generic(void * arg, ulong count) poly = _nmod_vec_init(length); - for (i = 0; i < count; i++) + for (i = 0; i < (slong)count; i++) { n = n_randbits(state, bits); if (n == UWORD(0)) n++; @@ -83,10 +106,15 @@ void sample_generic(void * arg, ulong count) for (j = 0; j < length; j++) poly[j] = n_randint(state, n); - pt = n_randint(state, n); + if (ctype == -1) + pt = n-1; + else if (ctype == 1) + pt = UWORD(1); + else + pt = n_randint(state, n); prof_start(); - for (j = 0; j < 100; j++) + for (j = 0; j < NB_REPS; j++) _nmod_poly_evaluate_nmod(poly, length, pt, mod); prof_stop(); } @@ -103,6 +131,7 @@ void sample_precomp(void * arg, ulong count) info_t * info = (info_t *) arg; flint_bitcnt_t bits = info->bits; slong length = info->length; + slong ctype = info->ctype; slong i, j; ulong * poly; @@ -112,7 +141,7 @@ void sample_precomp(void * arg, ulong count) poly = _nmod_vec_init(length); - for (i = 0; i < count; i++) + for (i = 0; i < (slong)count; i++) { n = n_randbits(state, bits); if (n == UWORD(0)) n++; @@ -122,10 +151,15 @@ void sample_precomp(void * arg, ulong count) for (j = 0; j < length; j++) poly[j] = n_randint(state, n); - pt = n_randint(state, n); + if (ctype == -1) + pt = n-1; + else if (ctype == 1) + pt = UWORD(1); + else + pt = n_randint(state, n); prof_start(); - for (j = 0; j < 100; j++) + for (j = 0; j < NB_REPS; j++) { const ulong pt_precomp = n_mulmod_precomp_shoup(pt, n); _nmod_poly_evaluate_nmod_precomp(poly, length, pt, pt_precomp, n); @@ -145,6 +179,7 @@ void sample_precomp_lazy(void * arg, ulong count) info_t * info = (info_t *) arg; flint_bitcnt_t bits = info->bits; slong length = info->length; + slong ctype = info->ctype; slong i, j; ulong * poly; @@ -154,7 +189,7 @@ void sample_precomp_lazy(void * arg, ulong count) poly = _nmod_vec_init(length); - for (i = 0; i < count; i++) + for (i = 0; i < (slong)count; i++) { n = n_randbits(state, bits); if (n == UWORD(0)) n++; @@ -164,10 +199,15 @@ void sample_precomp_lazy(void * arg, ulong count) for (j = 0; j < length; j++) poly[j] = n_randint(state, n); - pt = n_randint(state, n); + if (ctype == -1) + pt = n-1; + else if (ctype == 1) + pt = UWORD(1); + else + pt = n_randint(state, n); prof_start(); - for (j = 0; j < 100; j++) + for (j = 0; j < NB_REPS; j++) { const ulong pt_precomp = n_mulmod_precomp_shoup(pt, n); ulong val = _nmod_poly_evaluate_nmod_precomp_lazy(poly, length, pt, pt_precomp, n); @@ -184,17 +224,74 @@ void sample_precomp_lazy(void * arg, ulong count) FLINT_TEST_CLEAR(state); } +#if __DO_BENCH_STATIC + +#define SAMPLE_ONE_MONE(variant) \ +void sample_##variant(void * arg, ulong count) \ +{ \ + ulong n; \ + info_t * info = (info_t *) arg; \ + flint_bitcnt_t bits = info->bits; \ + slong length = info->length; \ + slong i, j; \ + \ + ulong * poly; \ + \ + FLINT_TEST_INIT(state); \ + \ + poly = _nmod_vec_init(length); \ + \ + for (i = 0; i < (slong)count; i++) \ + { \ + n = n_randbits(state, bits); \ + if (n == UWORD(0)) n++; \ + \ + for (j = 0; j < length; j++) \ + poly[j] = n_randint(state, n); \ + \ + prof_start(); \ + for (j = 0; j < NB_REPS; j++) \ + _nmod_poly_evaluate_##variant( \ + poly, length, n); \ + prof_stop(); \ + } \ + \ + _nmod_vec_clear(poly); \ + \ + FLINT_TEST_CLEAR(state); \ +} + +SAMPLE_ONE_MONE(one) +SAMPLE_ONE_MONE(one1) +SAMPLE_ONE_MONE(one2) +SAMPLE_ONE_MONE(mone) +SAMPLE_ONE_MONE(mone1) +SAMPLE_ONE_MONE(mone2) +#endif + int main(void) { - slong lengths[18] = {1, 2, 3, 4, 6, 8, - 10, 12, 16, 20, 32, 45, - 64, 128, 256, 1024, 8192, 65536}; + slong lengths[NB_LENS] = + { 1, 2, 3, 4, 6, 8, + 10, 12, 16, 20, 32, 45, + 64, 128, 256, 1024, + 8192, 65536, 200000, 1000000}; double min, max; - double mins[18]; // note: max seems to be consistently identical or extremely close to min - double mins_generic[18]; - double mins_precomp[18]; - double mins_precomp_lazy[18]; + double mins_interface[NB_LENS]; + double mins_interface_one[NB_LENS]; + double mins_interface_mone[NB_LENS]; + double mins_generic[NB_LENS]; + double mins_precomp[NB_LENS]; + double mins_precomp_lazy[NB_LENS]; +#if __DO_BENCH_STATIC + double mins_one[NB_LENS]; + double mins_one1[NB_LENS]; + double mins_one2[NB_LENS]; + double mins_mone[NB_LENS]; + double mins_mone1[NB_LENS]; + double mins_mone2[NB_LENS]; +#endif info_t info; flint_bitcnt_t i; @@ -206,54 +303,113 @@ int main(void) info.bits = i; printf("nbits = %ld\n", i); - for (int len = 0; len < 18; ++len) + for (int len = 0; len < NB_LENS; ++len) { info.length = lengths[len]; + info.ctype = 0; prof_repeat(&min, &max, sample_interface, (void *) &info); - mins[len-1] = min; + mins_interface[len-1] = min; prof_repeat(&min, &max, sample_generic, (void *) &info); mins_generic[len-1] = min; prof_repeat(&min, &max, sample_precomp, (void *) &info); mins_precomp[len-1] = min; prof_repeat(&min, &max, sample_precomp_lazy, (void *) &info); mins_precomp_lazy[len-1] = min; + + info.ctype = 1; + prof_repeat(&min, &max, sample_interface, (void *) &info); + mins_interface_one[len-1] = min; + + info.ctype = -1; + prof_repeat(&min, &max, sample_interface, (void *) &info); + mins_interface_mone[len-1] = min; + +#if __DO_BENCH_STATIC + prof_repeat(&min, &max, sample_one, (void *) &info); + mins_one[len-1] = min; + prof_repeat(&min, &max, sample_one1, (void *) &info); + mins_one1[len-1] = min; + prof_repeat(&min, &max, sample_one2, (void *) &info); + mins_one2[len-1] = min; + prof_repeat(&min, &max, sample_mone, (void *) &info); + mins_mone[len-1] = min; + prof_repeat(&min, &max, sample_mone1, (void *) &info); + mins_mone1[len-1] = min; + prof_repeat(&min, &max, sample_mone2, (void *) &info); + mins_mone2[len-1] = min; +#endif } +#if __DO_BENCH_STATIC if (i < FLINT_BITS-1) { - for (int len = 0; len < 18; ++len) + for (int len = 0; len < NB_LENS; ++len) { - flint_printf(" len %ld\t%.2lf\t%.2lf\t%.2lf\t%.2lf", + double fac = NB_REPS * (double)lengths[len] * (double)FLINT_CLOCK_SCALE_FACTOR; + flint_printf(" len %ld\t\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf", lengths[len], - (mins[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR)/(lengths[len]*100), - (mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR)/(lengths[len]*100), - (mins_precomp[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR)/(lengths[len]*100), - (mins_precomp_lazy[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR)/(lengths[len]*100)); + mins_interface[len-1]/fac, + mins_generic[len-1]/fac, + mins_precomp[len-1]/fac, + mins_precomp_lazy[len-1]/fac, + mins_one[len-1]/fac, + mins_one1[len-1]/fac, + mins_one2[len-1]/fac, + mins_mone[len-1]/fac, + mins_mone1[len-1]/fac, + mins_mone2[len-1]/fac); flint_printf("\n"); } } - else if (i < FLINT_BITS) +#else + if (i < FLINT_BITS-1) { - for (int len = 0; len < 18; ++len) + flint_printf(" interf generic precomp lazy one mone\n"); + for (int len = 0; len < NB_LENS; ++len) { - flint_printf(" len %ld\t%.2lf\t%.2lf\t%.2lf\t na", + double fac = NB_REPS * (double)lengths[len] * (double)FLINT_CLOCK_SCALE_FACTOR; + flint_printf(" len %ld\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf", lengths[len], - (mins[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR)/(lengths[len]*100), - (mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR)/(lengths[len]*100), - (mins_precomp[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR)/(lengths[len]*100), - (mins_precomp_lazy[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR)/(lengths[len]*100)); + mins_interface[len-1]/fac, + mins_generic[len-1]/fac, + mins_precomp[len-1]/fac, + mins_precomp_lazy[len-1]/fac, + mins_interface_one[len-1]/fac, + mins_interface_mone[len-1]/fac); flint_printf("\n"); } } - else // i == FLINT_BITS +#endif + if (i == FLINT_BITS-1) { - for (int len = 0; len < 18; ++len) + flint_printf(" interf generic precomp one mone\n"); + for (int len = 0; len < NB_LENS; ++len) { - flint_printf(" len %ld\t%.2lf\t%.2lf\t na \t na", + double fac = NB_REPS * (double)lengths[len] * (double)FLINT_CLOCK_SCALE_FACTOR; + flint_printf(" len %ld\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf", lengths[len], - (mins[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR)/(lengths[len]*100), - (mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR)/(lengths[len]*100)); + mins_interface[len-1]/fac, + mins_generic[len-1]/fac, + mins_precomp[len-1]/fac, + mins_interface_one[len-1]/fac, + mins_interface_mone[len-1]/fac); + flint_printf("\n"); + } + } + + if (i == FLINT_BITS) + { + flint_printf(" interf generic one mone\n"); + for (int len = 0; len < NB_LENS; ++len) + { + double fac = NB_REPS * (double)lengths[len] * (double)FLINT_CLOCK_SCALE_FACTOR; + flint_printf(" len %ld\t%.2lf\t%.2lf\t%.2lf\t%.2lf", + lengths[len], + mins_interface[len-1]/fac, + mins_generic[len-1]/fac, + mins_interface_one[len-1]/fac, + mins_interface_mone[len-1]/fac); flint_printf("\n"); } } @@ -263,3 +419,5 @@ int main(void) return 0; } + +#undef __DO_BENCH_STATIC diff --git a/src/nmod_poly/test/t-evaluate_nmod.c b/src/nmod_poly/test/t-evaluate_nmod.c index 95c3196ed2..8c455c4a6f 100644 --- a/src/nmod_poly/test/t-evaluate_nmod.c +++ b/src/nmod_poly/test/t-evaluate_nmod.c @@ -1,5 +1,6 @@ /* Copyright (C) 2010 William Hart + Copyright (C) 2025 Vincent Neiger This file is part of FLINT. @@ -13,19 +14,49 @@ #include "ulong_extras.h" #include "nmod_poly.h" +/* only set to 1 if wanting the full test with static functions */ +/* --> this implies temporarily making non-static in the relevant files, */ +/* and recompiling flint */ +#define __DO_TEST_STATIC 0 +#if __DO_TEST_STATIC /* declare prototypes */ +ulong _nmod_poly_evaluate_one(nn_srcptr poly, slong len, ulong modn); +ulong _nmod_poly_evaluate_one1(nn_srcptr poly, slong len, ulong modn); +ulong _nmod_poly_evaluate_one2(nn_srcptr poly, slong len, ulong modn); +ulong _nmod_poly_evaluate_mone(nn_srcptr poly, slong len, ulong modn); +ulong _nmod_poly_evaluate_mone1(nn_srcptr poly, slong len, ulong modn); +ulong _nmod_poly_evaluate_mone2(nn_srcptr poly, slong len, ulong modn); +#endif + TEST_FUNCTION_START(nmod_poly_evaluate_nmod, state) { int i, j, result = 1; /* Check evaluation at 1 gives sum of coeffs */ - for (i = 0; i < 1000 * flint_test_multiplier(); i++) + for (i = 0; i < 5000 * flint_test_multiplier(); i++) { + ulong n; nmod_poly_t a; - ulong n = n_randtest_not_zero(state); ulong sum, eval, eval_v1, eval_v2, eval_v3; +#if __DO_TEST_STATIC + ulong eval_one, eval_one1, eval_one2; +#endif + + /* 50% tests with random modulus; 50% with large ones */ + if (i < 2500 * flint_test_multiplier()) + n = n_randtest_not_zero(state); + else if (i < 3500 * flint_test_multiplier()) + n = n_randbits(state, 62); + else if (i < 4500 * flint_test_multiplier()) + n = n_randbits(state, 63); + else +#if FLINT_BITS == 64 + n = UWORD(6148914691236517205); +#else // FLINT_BITS == 32 + n = UWORD(1431655765); +#endif nmod_poly_init(a, n); - nmod_poly_randtest(a, state, n_randint(state, 100)); + nmod_poly_randtest(a, state, n_randint(state, 150)); // main function eval = nmod_poly_evaluate_nmod(a, UWORD(1)); @@ -76,21 +107,78 @@ TEST_FUNCTION_START(nmod_poly_evaluate_nmod, state) flint_abort(); } +#if __DO_TEST_STATIC + eval_one = _nmod_poly_evaluate_one(a->coeffs, a->length, a->mod.n); + + if (FLINT_BIT_COUNT(a->mod.n) <= FLINT_BITS - 2) + { + eval_one1 = _nmod_poly_evaluate_one1(a->coeffs, a->length, a->mod.n); + eval_one2 = _nmod_poly_evaluate_one2(a->coeffs, a->length, a->mod.n); + } + else if (FLINT_BIT_COUNT(a->mod.n) == FLINT_BITS - 1) + { + eval_one1 = _nmod_poly_evaluate_one1(a->coeffs, a->length, a->mod.n); + eval_one2 = eval_one; + } + else + { + eval_one1 = eval_one; + eval_one2 = eval_one; + } + + result = (sum == eval_one) && (eval_one == eval_one1) && (eval_one == eval_one2); + if (!result) + { + flint_printf("FAIL (one):\n"); + flint_printf("a->length = %wd, n = %wu\n", a->length, a->mod.n); + flint_printf("sum = %wu, eval_one = %wu, eval_one1 = %wu, eval_one2 = %wu\n", + sum, eval_one, eval_one1, eval_one2); + nmod_poly_print(a), flint_printf("\n\n"); + fflush(stdout); + flint_abort(); + } +#endif + nmod_poly_clear(a); } /* Check a(c) + b(c) = (a + b)(c) */ - for (i = 0; i < 1000 * flint_test_multiplier(); i++) + for (i = 0; i < 5000 * flint_test_multiplier(); i++) { + ulong n; nmod_poly_t a, b, acopy; - ulong n = n_randtest_not_zero(state); ulong eval1, eval2, c, eval1_v1, eval1_v2, eval1_v3, eval2_v1, eval2_v2, eval2_v3; + if (i < 2500 * flint_test_multiplier()) + n = n_randtest_not_zero(state); + else if (i < 3500 * flint_test_multiplier()) + n = n_randbits(state, 62); + else if (i < 4500 * flint_test_multiplier()) + n = n_randbits(state, 63); + else +#if FLINT_BITS == 64 + n = UWORD(6148914691236517205); +#else // FLINT_BITS == 32 + n = UWORD(1431655765); +#endif + + nmod_poly_init(a, n); nmod_poly_init(acopy, n); nmod_poly_init(b, n); - nmod_poly_randtest(a, state, n_randint(state, 100)); - nmod_poly_randtest(b, state, n_randint(state, 100)); + + if (i % 8 != 0) + nmod_poly_randtest(a, state, n_randint(state, 150)); + else + { + /* really stress the overflow-related issues for lazy variant */ + nmod_poly_fit_length(a, n_randint(state, 150)); + _nmod_poly_set_length(a, a->alloc); + for (slong kk = 0; kk < a->length; kk++) + a->coeffs[kk] = n-1; + _nmod_poly_normalise(a); + } + nmod_poly_randtest(b, state, n_randint(state, 150)); nmod_poly_set(acopy, a); c = n_randint(state, n); @@ -111,7 +199,7 @@ TEST_FUNCTION_START(nmod_poly_evaluate_nmod, state) { ulong c_precomp = n_mulmod_precomp_shoup(c, n); eval1_v2 = _nmod_poly_evaluate_nmod_precomp(a->coeffs, a->length, c, c_precomp, n); - eval1_v2 = n_addmod(eval1_v2, + eval1_v2 = n_addmod(eval1_v2, _nmod_poly_evaluate_nmod_precomp(b->coeffs, b->length, c, c_precomp, n), n); @@ -178,11 +266,12 @@ TEST_FUNCTION_START(nmod_poly_evaluate_nmod, state) if (!result) { flint_printf("FAIL:\n"); + flint_printf("modulus: %wu, alength: %wu, blength: %wu\n", n, acopy->length, b->length); flint_printf("eval1 = %wu, eval2 = %wu\n", eval1, eval2); flint_printf("eval1_v1 = %wu, eval2_v1 = %wu\n", eval1_v1, eval2_v1); flint_printf("eval1_v2 = %wu, eval2_v2 = %wu\n", eval1_v2, eval2_v2); flint_printf("eval1_v3 = %wu, eval2_v3 = %wu\n", eval1_v3, eval2_v3); - nmod_poly_print(a), flint_printf("\n\n"); + nmod_poly_print(acopy), flint_printf("\n\n"); nmod_poly_print(b), flint_printf("\n\n"); fflush(stdout); flint_abort(); @@ -193,5 +282,98 @@ TEST_FUNCTION_START(nmod_poly_evaluate_nmod, state) nmod_poly_clear(b); } +#if __DO_TEST_STATIC + /* Check evaluation at 1 and -1, specialized functions */ + for (i = 0; i < 5000 * flint_test_multiplier(); i++) + { + ulong n; + nmod_poly_t a; + ulong eval; + ulong eval_one, eval_one1, eval_one2; + ulong eval_mone, eval_mone1, eval_mone2; + + /* 50% tests with random modulus; 50% with large ones */ + if (i < 2500 * flint_test_multiplier()) + n = n_randtest_not_zero(state); + else if (i < 3500 * flint_test_multiplier()) + n = n_randbits(state, 62); + else + n = n_randbits(state, 63); + + nmod_poly_init(a, n); + nmod_poly_randtest(a, state, n_randint(state, 150)); + + // general function, not specialized + eval = _nmod_poly_evaluate_nmod(a->coeffs, a->length, UWORD(1), a->mod); + + eval_one = _nmod_poly_evaluate_one(a->coeffs, a->length, a->mod.n); + + if (FLINT_BIT_COUNT(a->mod.n) <= FLINT_BITS - 2) + { + eval_one1 = _nmod_poly_evaluate_one1(a->coeffs, a->length, a->mod.n); + eval_one2 = _nmod_poly_evaluate_one2(a->coeffs, a->length, a->mod.n); + } + else if (FLINT_BIT_COUNT(a->mod.n) == FLINT_BITS - 1) + { + eval_one1 = _nmod_poly_evaluate_one1(a->coeffs, a->length, a->mod.n); + eval_one2 = eval_one; + } + else + { + eval_one1 = eval_one; + eval_one2 = eval_one; + } + + result = (eval == eval_one) && (eval_one == eval_one1) && (eval_one == eval_one2); + if (!result) + { + flint_printf("FAIL (one):\n"); + flint_printf("a->length = %wd, n = %wu\n", a->length, a->mod.n); + flint_printf("sum = %wu, eval_one = %wu, eval_one1 = %wu, eval_one2 = %wu\n", + eval, eval_one, eval_one1, eval_one2); + nmod_poly_print(a), flint_printf("\n\n"); + fflush(stdout); + flint_abort(); + } + + // general function, not specialized + eval = _nmod_poly_evaluate_nmod(a->coeffs, a->length, n - UWORD(1), a->mod); + + eval_mone = _nmod_poly_evaluate_mone(a->coeffs, a->length, a->mod.n); + + if (FLINT_BIT_COUNT(a->mod.n) <= FLINT_BITS - 2) + { + eval_mone1 = _nmod_poly_evaluate_mone1(a->coeffs, a->length, a->mod.n); + eval_mone2 = _nmod_poly_evaluate_mone2(a->coeffs, a->length, a->mod.n); + } + else if (FLINT_BIT_COUNT(a->mod.n) == FLINT_BITS - 1) + { + eval_mone1 = _nmod_poly_evaluate_mone1(a->coeffs, a->length, a->mod.n); + eval_mone2 = eval_mone; + } + else + { + eval_mone1 = eval_mone; + eval_mone2 = eval_mone; + } + + result = (eval == eval_mone) && (eval_mone == eval_mone1) && (eval_mone == eval_mone2); + if (!result) + { + flint_printf("FAIL (mone):\n"); + flint_printf("a->length = %wd, n = %wu\n", a->length, a->mod.n); + flint_printf("correct = %wu, eval_mone = %wu, eval_mone1 = %wu, eval_mone2 = %wu\n", + eval, eval_mone, eval_mone1, eval_mone2); + nmod_poly_print(a), flint_printf("\n\n"); + fflush(stdout); + flint_abort(); + } + + nmod_poly_clear(a); + } +#endif + TEST_FUNCTION_END(state); } + +#undef __DO_TEST_STATIC