#define PROBLEM "https://judge.yosupo.jp/problem/binomial_coefficient" // #include "../../template/template.hpp" // #include "../../modulo/arbitrary-mod-binomial-large.hpp" // #include "../../misc/fastio.hpp" using namespace Nyaan; void Nyaan::solve() { int T, m; rd(T, m); arbitrary_mod_binomial C(m); while (T--) { unsigned long long n, k; rd(n, k); auto ans = C.C(n, k); wtn(ans); } } /* #include "misc/rng.hpp" #include "misc/timer.hpp" using mint = LazyMontgomeryModInt<998244353>; void verify() { Timer timer; simd_prime_binomial C(998244353); cerr << "time:" << timer.elapsed() << endl; mint f = 1, a = 1, one = 1; for (int i = 1; i < 998244353; i++) { f *= a; a += one; if (rng() % (1u << 16) == 0) { mint f2 = C.fac(i); if (f != f2) exit(1); } } out("OK"); cout.flush(); exit(0); } */
#line 1 "verify/verify-yosupo-math/yosupo-binomial-coefficient-large.test.cpp" #define PROBLEM "https://judge.yosupo.jp/problem/binomial_coefficient" // #line 2 "template/template.hpp" using namespace std; // intrinstic #include <immintrin.h> #include <algorithm> #include <array> #include <bitset> #include <cassert> #include <cctype> #include <cfenv> #include <cfloat> #include <chrono> #include <cinttypes> #include <climits> #include <cmath> #include <complex> #include <cstdarg> #include <cstddef> #include <cstdint> #include <cstdio> #include <cstdlib> #include <cstring> #include <deque> #include <fstream> #include <functional> #include <initializer_list> #include <iomanip> #include <ios> #include <iostream> #include <istream> #include <iterator> #include <limits> #include <list> #include <map> #include <memory> #include <new> #include <numeric> #include <ostream> #include <queue> #include <random> #include <set> #include <sstream> #include <stack> #include <streambuf> #include <string> #include <tuple> #include <type_traits> #include <typeinfo> #include <unordered_map> #include <unordered_set> #include <utility> #include <vector> // utility #line 1 "template/util.hpp" namespace Nyaan { using ll = long long; using i64 = long long; using u64 = unsigned long long; using i128 = __int128_t; using u128 = __uint128_t; template <typename T> using V = vector<T>; template <typename T> using VV = vector<vector<T>>; using vi = vector<int>; using vl = vector<long long>; using vd = V<double>; using vs = V<string>; using vvi = vector<vector<int>>; using vvl = vector<vector<long long>>; template <typename T> using minpq = priority_queue<T, vector<T>, greater<T>>; template <typename T, typename U> struct P : pair<T, U> { template <typename... Args> P(Args... args) : pair<T, U>(args...) {} using pair<T, U>::first; using pair<T, U>::second; P &operator+=(const P &r) { first += r.first; second += r.second; return *this; } P &operator-=(const P &r) { first -= r.first; second -= r.second; return *this; } P &operator*=(const P &r) { first *= r.first; second *= r.second; return *this; } template <typename S> P &operator*=(const S &r) { first *= r, second *= r; return *this; } P operator+(const P &r) const { return P(*this) += r; } P operator-(const P &r) const { return P(*this) -= r; } P operator*(const P &r) const { return P(*this) *= r; } template <typename S> P operator*(const S &r) const { return P(*this) *= r; } P operator-() const { return P{-first, -second}; } }; using pl = P<ll, ll>; using pi = P<int, int>; using vp = V<pl>; constexpr int inf = 1001001001; constexpr long long infLL = 4004004004004004004LL; template <typename T> int sz(const T &t) { return t.size(); } template <typename T, typename U> inline bool amin(T &x, U y) { return (y < x) ? (x = y, true) : false; } template <typename T, typename U> inline bool amax(T &x, U y) { return (x < y) ? (x = y, true) : false; } template <typename T> inline T Max(const vector<T> &v) { return *max_element(begin(v), end(v)); } template <typename T> inline T Min(const vector<T> &v) { return *min_element(begin(v), end(v)); } template <typename T> inline long long Sum(const vector<T> &v) { return accumulate(begin(v), end(v), 0LL); } template <typename T> int lb(const vector<T> &v, const T &a) { return lower_bound(begin(v), end(v), a) - begin(v); } template <typename T> int ub(const vector<T> &v, const T &a) { return upper_bound(begin(v), end(v), a) - begin(v); } constexpr long long TEN(int n) { long long ret = 1, x = 10; for (; n; x *= x, n >>= 1) ret *= (n & 1 ? x : 1); return ret; } template <typename T, typename U> pair<T, U> mkp(const T &t, const U &u) { return make_pair(t, u); } template <typename T> vector<T> mkrui(const vector<T> &v, bool rev = false) { vector<T> ret(v.size() + 1); if (rev) { for (int i = int(v.size()) - 1; i >= 0; i--) ret[i] = v[i] + ret[i + 1]; } else { for (int i = 0; i < int(v.size()); i++) ret[i + 1] = ret[i] + v[i]; } return ret; }; template <typename T> vector<T> mkuni(const vector<T> &v) { vector<T> ret(v); sort(ret.begin(), ret.end()); ret.erase(unique(ret.begin(), ret.end()), ret.end()); return ret; } template <typename F> vector<int> mkord(int N, F f) { vector<int> ord(N); iota(begin(ord), end(ord), 0); sort(begin(ord), end(ord), f); return ord; } template <typename T> vector<int> mkinv(vector<T> &v) { int max_val = *max_element(begin(v), end(v)); vector<int> inv(max_val + 1, -1); for (int i = 0; i < (int)v.size(); i++) inv[v[i]] = i; return inv; } vector<int> mkiota(int n) { vector<int> ret(n); iota(begin(ret), end(ret), 0); return ret; } template <typename T> T mkrev(const T &v) { T w{v}; reverse(begin(w), end(w)); return w; } template <typename T> bool nxp(T &v) { return next_permutation(begin(v), end(v)); } // 返り値の型は入力の T に依存 // i 要素目 : [0, a[i]) template <typename T> vector<vector<T>> product(const vector<T> &a) { vector<vector<T>> ret; vector<T> v; auto dfs = [&](auto rc, int i) -> void { if (i == (int)a.size()) { ret.push_back(v); return; } for (int j = 0; j < a[i]; j++) v.push_back(j), rc(rc, i + 1), v.pop_back(); }; dfs(dfs, 0); return ret; } // F : function(void(T&)), mod を取る操作 // T : 整数型のときはオーバーフローに注意する template <typename T> T Power(T a, long long n, const T &I, const function<void(T &)> &f) { T res = I; for (; n; f(a = a * a), n >>= 1) { if (n & 1) f(res = res * a); } return res; } // T : 整数型のときはオーバーフローに注意する template <typename T> T Power(T a, long long n, const T &I = T{1}) { return Power(a, n, I, function<void(T &)>{[](T &) -> void {}}); } template <typename T> T Rev(const T &v) { T res = v; reverse(begin(res), end(res)); return res; } template <typename T> vector<T> Transpose(const vector<T> &v) { using U = typename T::value_type; if(v.empty()) return {}; int H = v.size(), W = v[0].size(); vector res(W, T(H, U{})); for (int i = 0; i < H; i++) { for (int j = 0; j < W; j++) { res[j][i] = v[i][j]; } } return res; } template <typename T> vector<T> Rotate(const vector<T> &v, int clockwise = true) { using U = typename T::value_type; int H = v.size(), W = v[0].size(); vector res(W, T(H, U{})); for (int i = 0; i < H; i++) { for (int j = 0; j < W; j++) { if (clockwise) { res[W - 1 - j][i] = v[i][j]; } else { res[j][H - 1 - i] = v[i][j]; } } } return res; } } // namespace Nyaan #line 58 "template/template.hpp" // bit operation #line 1 "template/bitop.hpp" namespace Nyaan { __attribute__((target("popcnt"))) inline int popcnt(const u64 &a) { return __builtin_popcountll(a); } inline int lsb(const u64 &a) { return a ? __builtin_ctzll(a) : 64; } inline int ctz(const u64 &a) { return a ? __builtin_ctzll(a) : 64; } inline int msb(const u64 &a) { return a ? 63 - __builtin_clzll(a) : -1; } template <typename T> inline int gbit(const T &a, int i) { return (a >> i) & 1; } template <typename T> inline void sbit(T &a, int i, bool b) { if (gbit(a, i) != b) a ^= T(1) << i; } constexpr long long PW(int n) { return 1LL << n; } constexpr long long MSK(int n) { return (1LL << n) - 1; } } // namespace Nyaan #line 61 "template/template.hpp" // inout #line 1 "template/inout.hpp" namespace Nyaan { template <typename T, typename U> ostream &operator<<(ostream &os, const pair<T, U> &p) { os << p.first << " " << p.second; return os; } template <typename T, typename U> istream &operator>>(istream &is, pair<T, U> &p) { is >> p.first >> p.second; return is; } template <typename T> ostream &operator<<(ostream &os, const vector<T> &v) { int s = (int)v.size(); for (int i = 0; i < s; i++) os << (i ? " " : "") << v[i]; return os; } template <typename T> istream &operator>>(istream &is, vector<T> &v) { for (auto &x : v) is >> x; return is; } istream &operator>>(istream &is, __int128_t &x) { string S; is >> S; x = 0; int flag = 0; for (auto &c : S) { if (c == '-') { flag = true; continue; } x *= 10; x += c - '0'; } if (flag) x = -x; return is; } istream &operator>>(istream &is, __uint128_t &x) { string S; is >> S; x = 0; for (auto &c : S) { x *= 10; x += c - '0'; } return is; } ostream &operator<<(ostream &os, __int128_t x) { if (x == 0) return os << 0; if (x < 0) os << '-', x = -x; string S; while (x) S.push_back('0' + x % 10), x /= 10; reverse(begin(S), end(S)); return os << S; } ostream &operator<<(ostream &os, __uint128_t x) { if (x == 0) return os << 0; string S; while (x) S.push_back('0' + x % 10), x /= 10; reverse(begin(S), end(S)); return os << S; } void in() {} template <typename T, class... U> void in(T &t, U &...u) { cin >> t; in(u...); } void out() { cout << "\n"; } template <typename T, class... U, char sep = ' '> void out(const T &t, const U &...u) { cout << t; if (sizeof...(u)) cout << sep; out(u...); } struct IoSetupNya { IoSetupNya() { cin.tie(nullptr); ios::sync_with_stdio(false); cout << fixed << setprecision(15); cerr << fixed << setprecision(7); } } iosetupnya; } // namespace Nyaan #line 64 "template/template.hpp" // debug #line 1 "template/debug.hpp" namespace DebugImpl { template <typename U, typename = void> struct is_specialize : false_type {}; template <typename U> struct is_specialize< U, typename conditional<false, typename U::iterator, void>::type> : true_type {}; template <typename U> struct is_specialize< U, typename conditional<false, decltype(U::first), void>::type> : true_type {}; template <typename U> struct is_specialize<U, enable_if_t<is_integral<U>::value, void>> : true_type { }; void dump(const char& t) { cerr << t; } void dump(const string& t) { cerr << t; } void dump(const bool& t) { cerr << (t ? "true" : "false"); } void dump(__int128_t t) { if (t == 0) cerr << 0; if (t < 0) cerr << '-', t = -t; string S; while (t) S.push_back('0' + t % 10), t /= 10; reverse(begin(S), end(S)); cerr << S; } void dump(__uint128_t t) { if (t == 0) cerr << 0; string S; while (t) S.push_back('0' + t % 10), t /= 10; reverse(begin(S), end(S)); cerr << S; } template <typename U, enable_if_t<!is_specialize<U>::value, nullptr_t> = nullptr> void dump(const U& t) { cerr << t; } template <typename T> void dump(const T& t, enable_if_t<is_integral<T>::value>* = nullptr) { string res; if (t == Nyaan::inf) res = "inf"; if constexpr (is_signed<T>::value) { if (t == -Nyaan::inf) res = "-inf"; } if constexpr (sizeof(T) == 8) { if (t == Nyaan::infLL) res = "inf"; if constexpr (is_signed<T>::value) { if (t == -Nyaan::infLL) res = "-inf"; } } if (res.empty()) res = to_string(t); cerr << res; } template <typename T, typename U> void dump(const pair<T, U>&); template <typename T> void dump(const pair<T*, int>&); template <typename T> void dump(const T& t, enable_if_t<!is_void<typename T::iterator>::value>* = nullptr) { cerr << "[ "; for (auto it = t.begin(); it != t.end();) { dump(*it); cerr << (++it == t.end() ? "" : ", "); } cerr << " ]"; } template <typename T, typename U> void dump(const pair<T, U>& t) { cerr << "( "; dump(t.first); cerr << ", "; dump(t.second); cerr << " )"; } template <typename T> void dump(const pair<T*, int>& t) { cerr << "[ "; for (int i = 0; i < t.second; i++) { dump(t.first[i]); cerr << (i == t.second - 1 ? "" : ", "); } cerr << " ]"; } void trace() { cerr << endl; } template <typename Head, typename... Tail> void trace(Head&& head, Tail&&... tail) { cerr << " "; dump(head); if (sizeof...(tail) != 0) cerr << ","; trace(std::forward<Tail>(tail)...); } } // namespace DebugImpl #ifdef NyaanDebug #define trc(...) \ do { \ cerr << "## " << #__VA_ARGS__ << " = "; \ DebugImpl::trace(__VA_ARGS__); \ } while (0) #else #define trc(...) (void(0)) #endif #ifdef NyaanLocal #define trc2(...) \ do { \ cerr << "## " << #__VA_ARGS__ << " = "; \ DebugImpl::trace(__VA_ARGS__); \ } while (0) #else #define trc2(...) (void(0)) #endif #line 67 "template/template.hpp" // macro #line 1 "template/macro.hpp" #define each(x, v) for (auto&& x : v) #define each2(x, y, v) for (auto&& [x, y] : v) #define all(v) (v).begin(), (v).end() #define rep(i, N) for (long long i = 0; i < (long long)(N); i++) #define repr(i, N) for (long long i = (long long)(N)-1; i >= 0; i--) #define rep1(i, N) for (long long i = 1; i <= (long long)(N); i++) #define repr1(i, N) for (long long i = (N); (long long)(i) > 0; i--) #define reg(i, a, b) for (long long i = (a); i < (b); i++) #define regr(i, a, b) for (long long i = (b)-1; i >= (a); i--) #define fi first #define se second #define ini(...) \ int __VA_ARGS__; \ in(__VA_ARGS__) #define inl(...) \ long long __VA_ARGS__; \ in(__VA_ARGS__) #define ins(...) \ string __VA_ARGS__; \ in(__VA_ARGS__) #define in2(s, t) \ for (int i = 0; i < (int)s.size(); i++) { \ in(s[i], t[i]); \ } #define in3(s, t, u) \ for (int i = 0; i < (int)s.size(); i++) { \ in(s[i], t[i], u[i]); \ } #define in4(s, t, u, v) \ for (int i = 0; i < (int)s.size(); i++) { \ in(s[i], t[i], u[i], v[i]); \ } #define die(...) \ do { \ Nyaan::out(__VA_ARGS__); \ return; \ } while (0) #line 70 "template/template.hpp" namespace Nyaan { void solve(); } int main() { Nyaan::solve(); } #line 4 "verify/verify-yosupo-math/yosupo-binomial-coefficient-large.test.cpp" // #line 2 "modulo/arbitrary-mod-binomial-large.hpp" #line 1 "atcoder/math.hpp" #line 8 "atcoder/math.hpp" #line 1 "atcoder/internal_math.hpp" #line 5 "atcoder/internal_math.hpp" #ifdef _MSC_VER #include <intrin.h> #endif namespace atcoder { namespace internal { // @param m `1 <= m` // @return x mod m constexpr long long safe_mod(long long x, long long m) { x %= m; if (x < 0) x += m; return x; } // Fast modular multiplication by barrett reduction // Reference: https://en.wikipedia.org/wiki/Barrett_reduction // NOTE: reconsider after Ice Lake struct barrett { unsigned int _m; unsigned long long im; // @param m `1 <= m < 2^31` barrett(unsigned int m) : _m(m), im((unsigned long long)(-1) / m + 1) {} // @return m unsigned int umod() const { return _m; } // @param a `0 <= a < m` // @param b `0 <= b < m` // @return `a * b % m` unsigned int mul(unsigned int a, unsigned int b) const { // [1] m = 1 // a = b = im = 0, so okay // [2] m >= 2 // im = ceil(2^64 / m) // -> im * m = 2^64 + r (0 <= r < m) // let z = a*b = c*m + d (0 <= c, d < m) // a*b * im = (c*m + d) * im = c*(im*m) + d*im = c*2^64 + c*r + d*im // c*r + d*im < m * m + m * im < m * m + 2^64 + m <= 2^64 + m * (m + 1) < 2^64 * 2 // ((ab * im) >> 64) == c or c + 1 unsigned long long z = a; z *= b; #ifdef _MSC_VER unsigned long long x; _umul128(z, im, &x); #else unsigned long long x = (unsigned long long)(((unsigned __int128)(z)*im) >> 64); #endif unsigned int v = (unsigned int)(z - x * _m); if (_m <= v) v += _m; return v; } }; // @param n `0 <= n` // @param m `1 <= m` // @return `(x ** n) % m` constexpr long long pow_mod_constexpr(long long x, long long n, int m) { if (m == 1) return 0; unsigned int _m = (unsigned int)(m); unsigned long long r = 1; unsigned long long y = safe_mod(x, m); while (n) { if (n & 1) r = (r * y) % _m; y = (y * y) % _m; n >>= 1; } return r; } // Reference: // M. Forisek and J. Jancina, // Fast Primality Testing for Integers That Fit into a Machine Word // @param n `0 <= n` constexpr bool is_prime_constexpr(int n) { if (n <= 1) return false; if (n == 2 || n == 7 || n == 61) return true; if (n % 2 == 0) return false; long long d = n - 1; while (d % 2 == 0) d /= 2; constexpr long long bases[3] = {2, 7, 61}; for (long long a : bases) { long long t = d; long long y = pow_mod_constexpr(a, t, n); while (t != n - 1 && y != 1 && y != n - 1) { y = y * y % n; t <<= 1; } if (y != n - 1 && t % 2 == 0) { return false; } } return true; } template <int n> constexpr bool is_prime = is_prime_constexpr(n); // @param b `1 <= b` // @return pair(g, x) s.t. g = gcd(a, b), xa = g (mod b), 0 <= x < b/g constexpr std::pair<long long, long long> inv_gcd(long long a, long long b) { a = safe_mod(a, b); if (a == 0) return {b, 0}; // Contracts: // [1] s - m0 * a = 0 (mod b) // [2] t - m1 * a = 0 (mod b) // [3] s * |m1| + t * |m0| <= b long long s = b, t = a; long long m0 = 0, m1 = 1; while (t) { long long u = s / t; s -= t * u; m0 -= m1 * u; // |m1 * u| <= |m1| * s <= b // [3]: // (s - t * u) * |m1| + t * |m0 - m1 * u| // <= s * |m1| - t * u * |m1| + t * (|m0| + |m1| * u) // = s * |m1| + t * |m0| <= b auto tmp = s; s = t; t = tmp; tmp = m0; m0 = m1; m1 = tmp; } // by [3]: |m0| <= b/g // by g != b: |m0| < b/g if (m0 < 0) m0 += b / s; return {s, m0}; } // Compile time primitive root // @param m must be prime // @return primitive root (and minimum in now) constexpr int primitive_root_constexpr(int m) { if (m == 2) return 1; if (m == 167772161) return 3; if (m == 469762049) return 3; if (m == 754974721) return 11; if (m == 998244353) return 3; int divs[20] = {}; divs[0] = 2; int cnt = 1; int x = (m - 1) / 2; while (x % 2 == 0) x /= 2; for (int i = 3; (long long)(i)*i <= x; i += 2) { if (x % i == 0) { divs[cnt++] = i; while (x % i == 0) { x /= i; } } } if (x > 1) { divs[cnt++] = x; } for (int g = 2;; g++) { bool ok = true; for (int i = 0; i < cnt; i++) { if (pow_mod_constexpr(g, (m - 1) / divs[i], m) == 1) { ok = false; break; } } if (ok) return g; } } template <int m> constexpr int primitive_root = primitive_root_constexpr(m); } // namespace internal } // namespace atcoder #line 10 "atcoder/math.hpp" namespace atcoder { long long pow_mod(long long x, long long n, int m) { assert(0 <= n && 1 <= m); if (m == 1) return 0; internal::barrett bt((unsigned int)(m)); unsigned int r = 1, y = (unsigned int)(internal::safe_mod(x, m)); while (n) { if (n & 1) r = bt.mul(r, y); y = bt.mul(y, y); n >>= 1; } return r; } long long inv_mod(long long x, long long m) { assert(1 <= m); auto z = internal::inv_gcd(x, m); assert(z.first == 1); return z.second; } // (rem, mod) std::pair<long long, long long> crt(const std::vector<long long>& r, const std::vector<long long>& m) { assert(r.size() == m.size()); int n = int(r.size()); // Contracts: 0 <= r0 < m0 long long r0 = 0, m0 = 1; for (int i = 0; i < n; i++) { assert(1 <= m[i]); long long r1 = internal::safe_mod(r[i], m[i]), m1 = m[i]; if (m0 < m1) { std::swap(r0, r1); std::swap(m0, m1); } if (m0 % m1 == 0) { if (r0 % m1 != r1) return {0, 0}; continue; } // assume: m0 > m1, lcm(m0, m1) >= 2 * max(m0, m1) // (r0, m0), (r1, m1) -> (r2, m2 = lcm(m0, m1)); // r2 % m0 = r0 // r2 % m1 = r1 // -> (r0 + x*m0) % m1 = r1 // -> x*u0*g % (u1*g) = (r1 - r0) (u0*g = m0, u1*g = m1) // -> x = (r1 - r0) / g * inv(u0) (mod u1) // im = inv(u0) (mod u1) (0 <= im < u1) long long g, im; std::tie(g, im) = internal::inv_gcd(m0, m1); long long u1 = (m1 / g); // |r1 - r0| < (m0 + m1) <= lcm(m0, m1) if ((r1 - r0) % g) return {0, 0}; // u1 * u1 <= m1 * m1 / g / g <= m0 * m1 / g = lcm(m0, m1) long long x = (r1 - r0) / g % u1 * im % u1; // |r0| + |m0 * x| // < m0 + m0 * (u1 - 1) // = m0 + m0 * m1 / g - m0 // = lcm(m0, m1) r0 += x * m0; m0 *= u1; // -> lcm(m0, m1) if (r0 < 0) r0 += m0; } return {r0, m0}; } long long floor_sum(long long n, long long m, long long a, long long b) { long long ans = 0; if (a < 0) { unsigned long long a2 = internal::safe_mod(a, m); ans -= 1ULL * n * (n - 1) / 2 * ((a2 - a) / m); a = a2; } if (b < 0) { unsigned long long b2 = internal::safe_mod(b, m); ans -= 1ULL * n * ((b2 - b) / m); b = b2; } if (a >= m) { ans += (n - 1) * n * (a / m) / 2; a %= m; } if (b >= m) { ans += n * (b / m); b %= m; } long long y_max = (a * n + b) / m, x_max = (y_max * m - b); if (y_max == 0) return ans; ans += (n - (x_max + a - 1) / a) * y_max; ans += floor_sum(y_max, a, m, (a - x_max % a) % a); return ans; } } // namespace atcoder #line 2 "modint/barrett-reduction.hpp" #line 4 "modint/barrett-reduction.hpp" using namespace std; struct Barrett { using u32 = unsigned int; using i64 = long long; using u64 = unsigned long long; u32 m; u64 im; Barrett() : m(), im() {} Barrett(int n) : m(n), im(u64(-1) / m + 1) {} constexpr inline i64 quo(u64 n) { u64 x = u64((__uint128_t(n) * im) >> 64); u32 r = n - x * m; return m <= r ? x - 1 : x; } constexpr inline i64 rem(u64 n) { u64 x = u64((__uint128_t(n) * im) >> 64); u32 r = n - x * m; return m <= r ? r + m : r; } constexpr inline pair<i64, int> quorem(u64 n) { u64 x = u64((__uint128_t(n) * im) >> 64); u32 r = n - x * m; if (m <= r) return {x - 1, r + m}; return {x, r}; } constexpr inline i64 pow(u64 n, i64 p) { u32 a = rem(n), r = m == 1 ? 0 : 1; while (p) { if (p & 1) r = rem(u64(r) * a); a = rem(u64(a) * a); p >>= 1; } return r; } }; #line 5 "modulo/arbitrary-mod-binomial-large.hpp" #define PRIME_POWER_BINOMIAL_M_MAX ((1LL << 30) - 1) #define PRIME_POWER_BINOMIAL_N_MAX 20000000 struct simd_prime_binomial { using u32 = unsigned int; using i64 = long long; using u64 = unsigned long long; using m256 = __m256i; u32 get_r(u32 _mod) { u32 ret = _mod; for (int i = 0; i < 4; ++i) ret *= 2 - _mod * ret; return ret; } inline u32 reduce(const u64& b) { return (b + u64(u32(b) * u32(-r)) * mod) >> 32; } inline u32 mul(const u32& a, const u32& b) { return reduce(u64(a) * b); } inline u32 add(const u32& a, const u32& b) { u32 c = a + b - 2 * mod; if (c > 2 * mod) c += 2 * mod; return c; } inline u32 cast(const i64& b) { return reduce(u64(b % mod + mod) * n2); } inline u32 raw_cast(const u64& b) { return reduce(b * n2); } u64 get(const u32& b) { u32 a = reduce(b); return a >= mod ? a - mod : a; } u32 inv(u32 b) { u32 e = mod - 2, a = raw_cast(1); while (e) { if (e & 1) a = mul(a, b); b = mul(b, b); e >>= 1; } return a; } __attribute__((target("avx2"), optimize("O3", "unroll-loops"))) inline m256 simd_mulhi(const m256& a, const m256& b) { m256 a13 = _mm256_shuffle_epi32(a, 0xF5); m256 b13 = _mm256_shuffle_epi32(b, 0xF5); m256 prod02 = _mm256_mul_epu32(a, b); m256 prod13 = _mm256_mul_epu32(a13, b13); m256 unpalo = _mm256_unpacklo_epi32(prod02, prod13); m256 unpahi = _mm256_unpackhi_epi32(prod02, prod13); m256 prod = _mm256_unpackhi_epi64(unpalo, unpahi); return prod; } __attribute__((target("avx2"), optimize("O3", "unroll-loops"))) inline m256 simd_sub(const m256& a, const m256& b) { m256 ret = _mm256_sub_epi32(a, b); m256 cmp = _mm256_cmpgt_epi32(M0, ret); m256 add = _mm256_and_si256(cmp, M2); return _mm256_add_epi32(add, ret); } __attribute__((target("avx2"), optimize("O3", "unroll-loops"))) inline m256 simd_mul(const m256& A, const m256& B) { m256 a13 = _mm256_shuffle_epi32(A, 0xF5); m256 b13 = _mm256_shuffle_epi32(B, 0xF5); m256 prod02 = _mm256_mul_epu32(A, B); m256 prod13 = _mm256_mul_epu32(a13, b13); m256 unpalo = _mm256_unpacklo_epi32(prod02, prod13); m256 unpahi = _mm256_unpackhi_epi32(prod02, prod13); m256 prodlo = _mm256_unpacklo_epi64(unpalo, unpahi); m256 prodhi = _mm256_unpackhi_epi64(unpalo, unpahi); m256 hiplm1 = _mm256_add_epi32(prodhi, M1); m256 lomulr = _mm256_mullo_epi32(prodlo, R); m256 lomulrmulm1 = simd_mulhi(lomulr, M1); return _mm256_sub_epi32(hiplm1, lomulrmulm1); } __attribute__((target("avx2"), optimize("O3", "unroll-loops"))) inline void transpose8_ps(__m256& row0, __m256& row1, __m256& row2, __m256& row3, __m256& row4, __m256& row5, __m256& row6, __m256& row7) { __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; __t0 = _mm256_unpacklo_ps(row0, row1); __t1 = _mm256_unpackhi_ps(row0, row1); __t2 = _mm256_unpacklo_ps(row2, row3); __t3 = _mm256_unpackhi_ps(row2, row3); __t4 = _mm256_unpacklo_ps(row4, row5); __t5 = _mm256_unpackhi_ps(row4, row5); __t6 = _mm256_unpacklo_ps(row6, row7); __t7 = _mm256_unpackhi_ps(row6, row7); __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); } __attribute__((target("avx2"), optimize("O3", "unroll-loops"))) void precalc() { __attribute__((aligned(32))) u32 b1[32]; __attribute__((aligned(32))) __m256 b2[8]; int max = ((mod / 2) / v + 12) / 8 * 8; f.resize(max + 1, raw_cast(1)); for (int i = 0; i < 32; i++) b1[i] = raw_cast(i + 1); m256 A0 = _mm256_set1_epi32(b1[0]); m256 A1 = _mm256_set1_epi32(b1[0]); m256 A2 = _mm256_set1_epi32(b1[0]); m256 A3 = _mm256_set1_epi32(b1[0]); m256 B0 = _mm256_load_si256((m256*)(b1 + 0)); m256 B1 = _mm256_load_si256((m256*)(b1 + 8)); m256 B2 = _mm256_load_si256((m256*)(b1 + 16)); m256 B3 = _mm256_load_si256((m256*)(b1 + 24)); m256 DI = _mm256_set1_epi32(mod * 2 - b1[31]); for (int i = 1; i < (int)f.size(); i += 8) { for (int j = 0; j < 8; j++) { for (u32 loop = 0; loop < v / 32; loop++) { A0 = simd_mul(A0, B0), A1 = simd_mul(A1, B1); A2 = simd_mul(A2, B2), A3 = simd_mul(A3, B3); B0 = simd_sub(B0, DI), B1 = simd_sub(B1, DI); B2 = simd_sub(B2, DI), B3 = simd_sub(B3, DI); } m256 C0 = simd_mul(A0, A1); m256 C1 = simd_mul(A2, A3); m256 C2 = simd_mul(C0, C1); _mm256_store_si256((m256*)(b2 + j), C2); } transpose8_ps(b2[0], b2[1], b2[2], b2[3], b2[4], b2[5], b2[6], b2[7]); m256 D0 = simd_mul(m256(b2[0]), m256(b2[1])); m256 D1 = simd_mul(m256(b2[2]), m256(b2[3])); m256 D2 = simd_mul(m256(b2[4]), m256(b2[5])); m256 D3 = simd_mul(m256(b2[6]), m256(b2[7])); m256 D4 = simd_mul(D0, D1); m256 D5 = simd_mul(D2, D3); m256 D6 = simd_mul(D4, D5); _mm256_storeu_si256((m256*)(f.data() + i), D6); } } u32 mod, r, n2; Barrett bm; m256 R, M0, M1, M2; static constexpr u32 v = 128; static_assert(v % 32 == 0); vector<u32> f; simd_prime_binomial() = default; __attribute__((target("avx2"), optimize("O3", "unroll-loops"))) simd_prime_binomial(u32 _mod) : mod(_mod) { assert(2 < mod && mod < (1u << 30)); assert(mod % 2 != 0); r = get_r(mod); n2 = -u64(mod) % mod; bm = Barrett(mod); R = _mm256_set1_epi32(r); M0 = _mm256_set1_epi32(0); M1 = _mm256_set1_epi32(mod); M2 = _mm256_set1_epi32(mod * 2); precalc(); } u32 raw_fac(u64 n) { assert(n < mod); if (n * 2 > mod + 2) { u64 x = raw_fac(mod - 1 - n); if (n % 2 == 0) x = mod * 2 - x; return inv(x); } u32 a = f[n / v], i = n / v * v + 1; u32 j = raw_cast(i), o = raw_cast(1); while (i++ <= n) { a = mul(a, j), j = add(j, o); } return a; } inline u32 fac(u64 n) { if (n >= mod) return 0; return get(raw_fac(n)); } u32 C(long long n, long long m) { if (n < 0 or m < 0 or n < m) return 0; u32 num = raw_cast(1), denom = raw_cast(1); while (n) { long long n0, m0; tie(n, n0) = bm.quorem(n); tie(m, m0) = bm.quorem(m); if (n0 < m0) return 0; num = mul(num, raw_fac(n0)); denom = mul(denom, raw_fac(n0 - m0)); denom = mul(denom, raw_fac(m0)); } return get(mul(num, inv(denom))); } }; struct prime_power_binomial { int p, q, M; vector<int> fac, ifac, inv; int delta; Barrett bm, bp; prime_power_binomial() = default; prime_power_binomial(int _p, int _q) : p(_p), q(_q) { assert(1 < p && p <= PRIME_POWER_BINOMIAL_M_MAX); assert(_q > 0); long long m = 1; while (_q--) { m *= p; assert(m <= PRIME_POWER_BINOMIAL_M_MAX); } M = m; bm = Barrett(M), bp = Barrett(p); enumerate(); delta = (p == 2 && q >= 3) ? 1 : M - 1; } void enumerate() { int MX = min<int>(M, PRIME_POWER_BINOMIAL_N_MAX + 10); fac.resize(MX); ifac.resize(MX); inv.resize(MX); fac[0] = ifac[0] = inv[0] = 1; fac[1] = ifac[1] = inv[1] = 1; for (int i = 2; i < MX; i++) { if (i % p == 0) { fac[i] = fac[i - 1]; fac[i + 1] = bm.rem(1LL * fac[i - 1] * (i + 1)); i++; } else { fac[i] = bm.rem(1LL * fac[i - 1] * i); } } ifac[MX - 1] = bm.pow(fac[MX - 1], M / p * (p - 1) - 1); for (int i = MX - 2; i > 1; --i) { if (i % p == 0) { ifac[i] = bm.rem(1LL * ifac[i + 1] * (i + 1)); ifac[i - 1] = ifac[i]; i--; } else { ifac[i] = bm.rem(1LL * ifac[i + 1] * (i + 1)); } } } long long Lucas(long long n, long long m) { int res = 1; while (n) { int n0, m0; tie(n, n0) = bp.quorem(n); tie(m, m0) = bp.quorem(m); if (n0 < m0) return 0; res = bm.rem(1LL * res * fac[n0]); int buf = bm.rem(1LL * ifac[n0 - m0] * ifac[m0]); res = bm.rem(1LL * res * buf); } return res; } long long C(long long n, long long m) { if (n < m || n < 0 || m < 0) return 0; if (q == 1) return Lucas(n, m); long long r = n - m; int e0 = 0, eq = 0, i = 0; int res = 1; while (n) { res = bm.rem(1LL * res * fac[bm.rem(n)]); res = bm.rem(1LL * res * ifac[bm.rem(m)]); res = bm.rem(1LL * res * ifac[bm.rem(r)]); n = bp.quo(n); m = bp.quo(m); r = bp.quo(r); int eps = n - m - r; e0 += eps; if (e0 >= q) return 0; if (++i >= q) eq += eps; } if (eq & 1) res = bm.rem(1LL * res * delta); res = bm.rem(1LL * res * bm.pow(p, e0)); return res; } }; // constraints: // M <= 1e9 and max(N) <= 1e18 struct arbitrary_mod_binomial { int mod; vector<int> M1, M2; vector<prime_power_binomial> cs1; vector<simd_prime_binomial> cs2; void push_cs(int i, int j, int k) { if (i < PRIME_POWER_BINOMIAL_N_MAX || j != 1) { cs1.emplace_back(i, j); M1.push_back(k); } else { assert(j == 1); cs2.emplace_back(i); M2.push_back(k); } } arbitrary_mod_binomial(long long md) : mod(md) { assert(1 <= md); assert(md <= PRIME_POWER_BINOMIAL_M_MAX); for (int i = 2; i * i <= md; i++) { if (md % i == 0) { int j = 0, k = 1; while (md % i == 0) md /= i, j++, k *= i; push_cs(i, j, k); } } if (md != 1) push_cs(md, 1, md); assert(M1.size() == cs1.size()); assert(M2.size() == cs2.size()); } long long C(long long n, long long m) { if (mod == 1) return 0; vector<long long> rem, d; for (int i = 0; i < (int)cs1.size(); i++) { rem.push_back(cs1[i].C(n, m)); d.push_back(M1[i]); } for (int i = 0; i < (int)cs2.size(); i++) { rem.push_back(cs2[i].C(n, m)); d.push_back(M2[i]); } return atcoder::crt(rem, d).first; } }; #undef PRIME_POWER_BINOMIAL_M_MAX #undef PRIME_POWER_BINOMIAL_N_MAX #line 6 "verify/verify-yosupo-math/yosupo-binomial-coefficient-large.test.cpp" // #line 2 "misc/fastio.hpp" #line 8 "misc/fastio.hpp" using namespace std; #line 2 "internal/internal-type-traits.hpp" #line 4 "internal/internal-type-traits.hpp" using namespace std; namespace internal { template <typename T> using is_broadly_integral = typename conditional_t<is_integral_v<T> || is_same_v<T, __int128_t> || is_same_v<T, __uint128_t>, true_type, false_type>::type; template <typename T> using is_broadly_signed = typename conditional_t<is_signed_v<T> || is_same_v<T, __int128_t>, true_type, false_type>::type; template <typename T> using is_broadly_unsigned = typename conditional_t<is_unsigned_v<T> || is_same_v<T, __uint128_t>, true_type, false_type>::type; #define ENABLE_VALUE(x) \ template <typename T> \ constexpr bool x##_v = x<T>::value; ENABLE_VALUE(is_broadly_integral); ENABLE_VALUE(is_broadly_signed); ENABLE_VALUE(is_broadly_unsigned); #undef ENABLE_VALUE #define ENABLE_HAS_TYPE(var) \ template <class, class = void> \ struct has_##var : false_type {}; \ template <class T> \ struct has_##var<T, void_t<typename T::var>> : true_type {}; \ template <class T> \ constexpr auto has_##var##_v = has_##var<T>::value; #define ENABLE_HAS_VAR(var) \ template <class, class = void> \ struct has_##var : false_type {}; \ template <class T> \ struct has_##var<T, void_t<decltype(T::var)>> : true_type {}; \ template <class T> \ constexpr auto has_##var##_v = has_##var<T>::value; } // namespace internal #line 12 "misc/fastio.hpp" namespace fastio { static constexpr int SZ = 1 << 17; static constexpr int offset = 64; char inbuf[SZ], outbuf[SZ]; int in_left = 0, in_right = 0, out_right = 0; struct Pre { char num[40000]; constexpr Pre() : num() { for (int i = 0; i < 10000; i++) { int n = i; for (int j = 3; j >= 0; j--) { num[i * 4 + j] = n % 10 + '0'; n /= 10; } } } } constexpr pre; void load() { int len = in_right - in_left; memmove(inbuf, inbuf + in_left, len); in_right = len + fread(inbuf + len, 1, SZ - len, stdin); in_left = 0; } void flush() { fwrite(outbuf, 1, out_right, stdout); out_right = 0; } void skip_space() { if (in_left + offset > in_right) load(); while (inbuf[in_left] <= ' ') in_left++; } void single_read(char& c) { if (in_left + offset > in_right) load(); skip_space(); c = inbuf[in_left++]; } void single_read(string& S) { skip_space(); while (true) { if (in_left == in_right) load(); int i = in_left; for (; i != in_right; i++) { if (inbuf[i] <= ' ') break; } copy(inbuf + in_left, inbuf + i, back_inserter(S)); in_left = i; if (i != in_right) break; } } template <typename T, enable_if_t<internal::is_broadly_integral_v<T>>* = nullptr> void single_read(T& x) { if (in_left + offset > in_right) load(); skip_space(); char c = inbuf[in_left++]; [[maybe_unused]] bool minus = false; if constexpr (internal::is_broadly_signed_v<T>) { if (c == '-') minus = true, c = inbuf[in_left++]; } x = 0; while (c >= '0') { x = x * 10 + (c & 15); c = inbuf[in_left++]; } if constexpr (internal::is_broadly_signed_v<T>) { if (minus) x = -x; } } void rd() {} template <typename Head, typename... Tail> void rd(Head& head, Tail&... tail) { single_read(head); rd(tail...); } void single_write(const char& c) { if (out_right > SZ - offset) flush(); outbuf[out_right++] = c; } void single_write(const bool& b) { if (out_right > SZ - offset) flush(); outbuf[out_right++] = b ? '1' : '0'; } void single_write(const string& S) { flush(), fwrite(S.data(), 1, S.size(), stdout); } void single_write(const char* p) { flush(), fwrite(p, 1, strlen(p), stdout); } template <typename T, enable_if_t<internal::is_broadly_integral_v<T>>* = nullptr> void single_write(const T& _x) { if (out_right > SZ - offset) flush(); if (_x == 0) { outbuf[out_right++] = '0'; return; } T x = _x; if constexpr (internal::is_broadly_signed_v<T>) { if (x < 0) outbuf[out_right++] = '-', x = -x; } constexpr int buffer_size = sizeof(T) * 10 / 4; char buf[buffer_size]; int i = buffer_size; while (x >= 10000) { i -= 4; memcpy(buf + i, pre.num + (x % 10000) * 4, 4); x /= 10000; } if (x < 100) { if (x < 10) { outbuf[out_right] = '0' + x; ++out_right; } else { uint32_t q = (uint32_t(x) * 205) >> 11; uint32_t r = uint32_t(x) - q * 10; outbuf[out_right] = '0' + q; outbuf[out_right + 1] = '0' + r; out_right += 2; } } else { if (x < 1000) { memcpy(outbuf + out_right, pre.num + (x << 2) + 1, 3); out_right += 3; } else { memcpy(outbuf + out_right, pre.num + (x << 2), 4); out_right += 4; } } memcpy(outbuf + out_right, buf + i, buffer_size - i); out_right += buffer_size - i; } void wt() {} template <typename Head, typename... Tail> void wt(const Head& head, const Tail&... tail) { single_write(head); wt(std::forward<const Tail>(tail)...); } template <typename... Args> void wtn(const Args&... x) { wt(std::forward<const Args>(x)...); wt('\n'); } struct Dummy { Dummy() { atexit(flush); } } dummy; } // namespace fastio using fastio::rd; using fastio::skip_space; using fastio::wt; using fastio::wtn; #line 8 "verify/verify-yosupo-math/yosupo-binomial-coefficient-large.test.cpp" using namespace Nyaan; void Nyaan::solve() { int T, m; rd(T, m); arbitrary_mod_binomial C(m); while (T--) { unsigned long long n, k; rd(n, k); auto ans = C.C(n, k); wtn(ans); } } /* #include "misc/rng.hpp" #include "misc/timer.hpp" using mint = LazyMontgomeryModInt<998244353>; void verify() { Timer timer; simd_prime_binomial C(998244353); cerr << "time:" << timer.elapsed() << endl; mint f = 1, a = 1, one = 1; for (int i = 1; i < 998244353; i++) { f *= a; a += one; if (rng() % (1u << 16) == 0) { mint f2 = C.fac(i); if (f != f2) exit(1); } } out("OK"); cout.flush(); exit(0); } */