#define PROBLEM "https://judge.yosupo.jp/problem/convolution_mod_1000000007" // #include "../../template/template.hpp" // #include "../../misc/fastio.hpp" #include "../../ntt/complex-fft.hpp" namespace ArbitraryModConvolution { // naive Toom-3 template <unsigned int MOD> vector<int> toom_3(const vector<int>& a, const vector<int>& b) { auto precalc = [](const vector<int>& _a) -> array<vector<int>, 5> { int n = _a.size(); vector<int> p0(n), p1(n), pm1(n), pm2(n), pinf(n); for (int i = 0; i < n; i++) { int m0 = _a[i] & 1023; int m1 = (_a[i] >> 10) & 1023; int m2 = (_a[i] >> 20) & 1023; p0[i] = m0; p1[i] = m0 + m1 + m2; pm1[i] = m0 - m1 + m2; pm2[i] = m0 - 2 * m1 + 4 * m2; pinf[i] = m2; } return {{p0, p1, pm1, pm2, pinf}}; }; auto [a0, a1, am1, am2, ainf] = precalc(a); auto [b0, b1, bm1, bm2, binf] = precalc(b); auto c0 = CooleyTukey::multiply(a0, b0); auto c1 = CooleyTukey::multiply(a1, b1); auto cm1 = CooleyTukey::multiply(am1, bm1); auto cm2 = CooleyTukey::multiply(am2, bm2); auto cinf = CooleyTukey::multiply(ainf, binf); vector<int> c(c0.size()); for (int i = 0; i < (int)c.size(); i++) { long long r0 = c0[i]; long long r4 = cinf[i]; long long r3 = (cm2[i] - c1[i]) / 3; long long r1 = (c1[i] - cm1[i]) / 2; long long r2 = cm1[i] - c0[i]; r3 = (r2 - r3) / 2 + r4 * 2; r2 += r1 - r4; r1 -= r3; long long ret = r4 % MOD * 1048576; ret += r3 % MOD * 1024 + r2; ret = ret % MOD * 1048576; ret += r1 % MOD * 1024 + r0; ret %= MOD; if (ret < 0) ret += MOD; c[i] = ret; } return c; } } // namespace ArbitraryModConvolution void Nyaan::solve() { using namespace ArbitraryModConvolution; int N, M; rd(N, M); vector<int> a(N), b(M); for (auto& x : a) rd(x); for (auto& x : b) rd(x); auto c = toom_3<1000000007>(a, b); // auto c = CooleyTukey::karatsuba<1000000007>(a, b); for (int i = 0; i < N + M - 1; i++) { wt(c[i], " \n"[i == N + M - 2]); } }
// Bundled (single translation unit) form of the Toom-3 convolution test.
// The #line directives record which library header each part came from:
// the common template (util / bitop / inout / debug / macro), the fast
// I/O helpers, the complex FFT, and finally the test itself.
#line 1 "verify/verify-yosupo-ntt/yosupo-convolution-real-fft-toom-3.test.cpp"
#define PROBLEM "https://judge.yosupo.jp/problem/convolution_mod_1000000007"
//
#line 2 "template/template.hpp"
using namespace std;

// intrinsic
#include <immintrin.h>

#include <algorithm>
#include <array>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cfenv>
#include <cfloat>
#include <chrono>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <complex>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <deque>
#include <fstream>
#include <functional>
#include <initializer_list>
#include <iomanip>
#include <ios>
#include <iostream>
#include <istream>
#include <iterator>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <new>
#include <numeric>
#include <ostream>
#include <queue>
#include <random>
#include <set>
#include <sstream>
#include <stack>
#include <streambuf>
#include <string>
#include <tuple>
#include <type_traits>
#include <typeinfo>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

// utility
#line 1 "template/util.hpp"
namespace Nyaan {
// Short aliases for common integer and container types.
using ll = long long;
using i64 = long long;
using u64 = unsigned long long;
using i128 = __int128_t;
using u128 = __uint128_t;

template <typename T>
using V = vector<T>;
template <typename T>
using VV = vector<vector<T>>;
using vi = vector<int>;
using vl = vector<long long>;
using vd = V<double>;
using vs = V<string>;
using vvi = vector<vector<int>>;
using vvl = vector<vector<long long>>;
// Priority queue whose top() is the smallest element.
template <typename T>
using minpq = priority_queue<T, vector<T>, greater<T>>;

// pair<T, U> extended with component-wise arithmetic operators.
template <typename T, typename U>
struct P : pair<T, U> {
  template <typename... Args>
  P(Args... args) : pair<T, U>(args...) {}

  using pair<T, U>::first;
  using pair<T, U>::second;

  P &operator+=(const P &r) {
    first += r.first;
    second += r.second;
    return *this;
  }
  P &operator-=(const P &r) {
    first -= r.first;
    second -= r.second;
    return *this;
  }
  P &operator*=(const P &r) {
    first *= r.first;
    second *= r.second;
    return *this;
  }
  // Scales both components by the scalar r.
  template <typename S>
  P &operator*=(const S &r) {
    first *= r, second *= r;
    return *this;
  }
  P operator+(const P &r) const { return P(*this) += r; }
  P operator-(const P &r) const { return P(*this) -= r; }
  P operator*(const P &r) const { return P(*this) *= r; }
  template <typename S>
  P operator*(const S &r) const {
    return P(*this) *= r;
  }
  P operator-() const { return P{-first, -second}; }
};

using pl = P<ll, ll>;
using pi = P<int, int>;
using vp = V<pl>;

// Sentinel "infinity" values; the debug dump prints them as inf / -inf.
constexpr int inf = 1001001001;
constexpr long long infLL = 4004004004004004004LL;

// Container size as int.
template <typename T>
int sz(const T &t) {
  return t.size();
}

// x = min(x, y); returns true when x was changed.
template <typename T, typename U>
inline bool amin(T &x, U y) {
  return (y < x) ? (x = y, true) : false;
}
// x = max(x, y); returns true when x was changed.
template <typename T, typename U>
inline bool amax(T &x, U y) {
  return (x < y) ? (x = y, true) : false;
}

// Largest / smallest element and long long sum of a vector.
// Max and Min dereference the result iterator, so v must be non-empty.
template <typename T>
inline T Max(const vector<T> &v) {
  return *max_element(begin(v), end(v));
}
template <typename T>
inline T Min(const vector<T> &v) {
  return *min_element(begin(v), end(v));
}
template <typename T>
inline long long Sum(const vector<T> &v) {
  return accumulate(begin(v), end(v), 0LL);
}

// Index returned by lower_bound / upper_bound for a in v.
template <typename T>
int lb(const vector<T> &v, const T &a) {
  return lower_bound(begin(v), end(v), a) - begin(v);
}
template <typename T>
int ub(const vector<T> &v, const T &a) {
  return upper_bound(begin(v), end(v), a) - begin(v);
}

// 10^n computed by repeated squaring.
constexpr long long TEN(int n) {
  long long ret = 1, x = 10;
  for (; n; x *= x, n >>= 1) ret *= (n & 1 ? x : 1);
  return ret;
}

template <typename T, typename U>
pair<T, U> mkp(const T &t, const U &u) {
  return make_pair(t, u);
}

// Cumulative sums of v in a vector of size v.size() + 1: prefix sums with
// ret[0] = 0, or suffix sums with ret[v.size()] = 0 when rev is true.
template <typename T>
vector<T> mkrui(const vector<T> &v, bool rev = false) {
  vector<T> ret(v.size() + 1);
  if (rev) {
    for (int i = int(v.size()) - 1; i >= 0; i--) ret[i] = v[i] + ret[i + 1];
  } else {
    for (int i = 0; i < int(v.size()); i++) ret[i + 1] = ret[i] + v[i];
  }
  return ret;
};

// Sorted copy of v with duplicates removed.
template <typename T>
vector<T> mkuni(const vector<T> &v) {
  vector<T> ret(v);
  sort(ret.begin(), ret.end());
  ret.erase(unique(ret.begin(), ret.end()), ret.end());
  return ret;
}

// Indices 0..N-1 sorted with the comparator f.
template <typename F>
vector<int> mkord(int N, F f) {
  vector<int> ord(N);
  iota(begin(ord), end(ord), 0);
  sort(begin(ord), end(ord), f);
  return ord;
}

// Inverse lookup table: inv[v[i]] = i, and -1 for values not present in v.
// The values are used directly as indices; v must be non-empty.
template <typename T>
vector<int> mkinv(vector<T> &v) {
  int max_val = *max_element(begin(v), end(v));
  vector<int> inv(max_val + 1, -1);
  for (int i = 0; i < (int)v.size(); i++) inv[v[i]] = i;
  return inv;
}

// The vector {0, 1, ..., n - 1}.
vector<int> mkiota(int n) {
  vector<int> ret(n);
  iota(begin(ret), end(ret), 0);
  return ret;
}

// Reversed copy of v.
template <typename T>
T mkrev(const T &v) {
  T w{v};
  reverse(begin(w), end(w));
  return w;
}

// Advances v to its next permutation; forwards next_permutation's result.
template <typename T>
bool nxp(vector<T> &v) {
  return next_permutation(begin(v), end(v));
}

// The element type of the result depends on the input type T.
// i-th element ranges over [0, a[i])
template <typename T>
vector<vector<T>> product(const vector<T> &a) {
  vector<vector<T>> ret;
  vector<T> v;
  // Recursive lambda: rc is the lambda itself, i the position being filled.
  auto dfs = [&](auto rc, int i) -> void {
    if (i == (int)a.size()) {
      ret.push_back(v);
      return;
    }
    for (int j = 0; j < a[i]; j++) v.push_back(j), rc(rc, i + 1), v.pop_back();
  };
  dfs(dfs, 0);
  return ret;
}

// F : function(void(T&)), an operation that takes the mod
// T : beware of overflow when T is an integer type
// Computes a^n by repeated squaring starting from the identity I; f is
// applied to every intermediate product.
template <typename T>
T Power(T a, long long n, const T &I, const function<void(T &)> &f) {
  T res = I;
  for (; n; f(a = a * a), n >>= 1) {
    if (n & 1) f(res = res * a);
  }
  return res;
}
// T : beware of overflow when T is an integer type
// Same as above with no reduction step.
template <typename T>
T Power(T a, long long n, const T &I) {
  return Power(a, n, I, function<void(T &)>{[](T &) -> void {}});
}

}  // namespace Nyaan
#line 58 "template/template.hpp"

// bit operation
#line 1 "template/bitop.hpp"
namespace Nyaan {
// Number of set bits of a.
__attribute__((target("popcnt"))) inline int popcnt(const u64 &a) {
  return _mm_popcnt_u64(a);
}
// Index of the lowest set bit, 64 when a == 0 (lsb and ctz are identical).
inline int lsb(const u64 &a) { return a ? __builtin_ctzll(a) : 64; }
inline int ctz(const u64 &a) { return a ? __builtin_ctzll(a) : 64; }
// Index of the highest set bit, -1 when a == 0.
inline int msb(const u64 &a) { return a ? 63 - __builtin_clzll(a) : -1; }
// Value of bit i of a.
template <typename T>
inline int gbit(const T &a, int i) {
  return (a >> i) & 1;
}
// Sets bit i of a to b.
template <typename T>
inline void sbit(T &a, int i, bool b) {
  if (gbit(a, i) != b) a ^= T(1) << i;
}
// 2^n and the mask of the n lowest bits.
constexpr long long PW(int n) { return 1LL << n; }
constexpr long long MSK(int n) { return (1LL << n) - 1; }
}  // namespace Nyaan
#line 61 "template/template.hpp"

// inout
#line 1 "template/inout.hpp"
namespace Nyaan {

// Stream operators for pair and vector: elements are separated by a single
// space on output and read one after another on input.
template <typename T, typename U>
ostream &operator<<(ostream &os, const pair<T, U> &p) {
  os << p.first << " " << p.second;
  return os;
}
template <typename T, typename U>
istream &operator>>(istream &is, pair<T, U> &p) {
  is >> p.first >> p.second;
  return is;
}

template <typename T>
ostream &operator<<(ostream &os, const vector<T> &v) {
  int s = (int)v.size();
  for (int i = 0; i < s; i++) os << (i ? " " : "") << v[i];
  return os;
}
template <typename T>
istream &operator>>(istream &is, vector<T> &v) {
  for (auto &x : v) is >> x;
  return is;
}

// Decimal input for 128-bit integers: reads one token and accumulates its
// digits; a '-' character marks the signed value as negative.
istream &operator>>(istream &is, __int128_t &x) {
  string S;
  is >> S;
  x = 0;
  int flag = 0;
  for (auto &c : S) {
    if (c == '-') {
      flag = true;
      continue;
    }
    x *= 10;
    x += c - '0';
  }
  if (flag) x = -x;
  return is;
}

istream &operator>>(istream &is, __uint128_t &x) {
  string S;
  is >> S;
  x = 0;
  for (auto &c : S) {
    x *= 10;
    x += c - '0';
  }
  return is;
}

// Decimal output for 128-bit integers: digits are produced least
// significant first and reversed before printing.
ostream &operator<<(ostream &os, __int128_t x) {
  if (x == 0) return os << 0;
  if (x < 0) os << '-', x = -x;
  string S;
  while (x) S.push_back('0' + x % 10), x /= 10;
  reverse(begin(S), end(S));
  return os << S;
}
ostream &operator<<(ostream &os, __uint128_t x) {
  if (x == 0) return os << 0;
  string S;
  while (x) S.push_back('0' + x % 10), x /= 10;
  reverse(begin(S), end(S));
  return os << S;
}

// in(a, b, ...) reads every argument from cin in order.
void in() {}
template <typename T, class... U>
void in(T &t, U &...u) {
  cin >> t;
  in(u...);
}

// out(a, b, ...) prints the arguments separated by sep, then a newline.
void out() { cout << "\n"; }
template <typename T, class... U, char sep = ' '>
void out(const T &t, const U &...u) {
  cout << t;
  if (sizeof...(u)) cout << sep;
  out(u...);
}

// Configures the standard streams once, when this global is constructed.
struct IoSetupNya {
  IoSetupNya() {
    cin.tie(nullptr);
    ios::sync_with_stdio(false);
    cout << fixed << setprecision(15);
    cerr << fixed << setprecision(7);
  }
} iosetupnya;

}  // namespace Nyaan
#line 64 "template/template.hpp"

// debug
#line 1 "template/debug.hpp"
namespace DebugImpl {

// True for types that get a dedicated dump overload below: types with a
// nested ::iterator, types with a ::first member, and integral types.
template <typename U, typename = void>
struct is_specialize : false_type {};
template <typename U>
struct is_specialize<
    U, typename conditional<false, typename U::iterator, void>::type>
    : true_type {};
template <typename U>
struct is_specialize<
    U, typename conditional<false, decltype(U::first), void>::type>
    : true_type {};
template <typename U>
struct is_specialize<U, enable_if_t<is_integral<U>::value, void>> : true_type {
};

// dump(x) writes a readable form of x to cerr.
void dump(const char& t) { cerr << t; }

void dump(const string& t) { cerr << t; }

void dump(const bool& t) { cerr << (t ? "true" : "false"); }

void dump(__int128_t t) {
  if (t == 0) cerr << 0;
  if (t < 0) cerr << '-', t = -t;
  string S;
  while (t) S.push_back('0' + t % 10), t /= 10;
  reverse(begin(S), end(S));
  cerr << S;
}

void dump(__uint128_t t) {
  if (t == 0) cerr << 0;
  string S;
  while (t) S.push_back('0' + t % 10), t /= 10;
  reverse(begin(S), end(S));
  cerr << S;
}

// Fallback: anything without a dedicated overload is streamed directly.
template <typename U,
          enable_if_t<!is_specialize<U>::value, nullptr_t> = nullptr>
void dump(const U& t) {
  cerr << t;
}

// Integers: the Nyaan::inf / Nyaan::infLL sentinels print as inf / -inf.
template <typename T>
void dump(const T& t, enable_if_t<is_integral<T>::value>* = nullptr) {
  string res;
  if (t == Nyaan::inf) res = "inf";
  if constexpr (is_signed<T>::value) {
    if (t == -Nyaan::inf) res = "-inf";
  }
  if constexpr (sizeof(T) == 8) {
    if (t == Nyaan::infLL) res = "inf";
    if constexpr (is_signed<T>::value) {
      if (t == -Nyaan::infLL) res = "-inf";
    }
  }
  if (res.empty()) res = to_string(t);
  cerr << res;
}

// Forward declarations so that containers of pairs can be dumped.
template <typename T, typename U>
void dump(const pair<T, U>&);
template <typename T>
void dump(const pair<T*, int>&);

// Containers: "[ e0, e1, ... ]".
template <typename T>
void dump(const T& t,
          enable_if_t<!is_void<typename T::iterator>::value>* = nullptr) {
  cerr << "[ ";
  for (auto it = t.begin(); it != t.end();) {
    dump(*it);
    cerr << (++it == t.end() ? "" : ", ");
  }
  cerr << " ]";
}

// Pairs: "( first, second )".
template <typename T, typename U>
void dump(const pair<T, U>& t) {
  cerr << "( ";
  dump(t.first);
  cerr << ", ";
  dump(t.second);
  cerr << " )";
}

// (pointer, length) pairs are dumped as an array of that many elements.
template <typename T>
void dump(const pair<T*, int>& t) {
  cerr << "[ ";
  for (int i = 0; i < t.second; i++) {
    dump(t.first[i]);
    cerr << (i == t.second - 1 ? "" : ", ");
  }
  cerr << " ]";
}

// trace(a, b, ...) dumps all arguments separated by commas, then ends the
// line.
void trace() { cerr << endl; }
template <typename Head, typename... Tail>
void trace(Head&& head, Tail&&... tail) {
  cerr << " ";
  dump(head);
  if (sizeof...(tail) != 0) cerr << ",";
  trace(forward<Tail>(tail)...);
}

}  // namespace DebugImpl

// trc(...) prints "## <expressions> = <values>" when NyaanDebug is defined
// and expands to a no-op otherwise; trc2 does the same for NyaanLocal.
#ifdef NyaanDebug
#define trc(...)                            \
  do {                                      \
    cerr << "## " << #__VA_ARGS__ << " = "; \
    DebugImpl::trace(__VA_ARGS__);          \
  } while (0)
#else
#define trc(...) (void(0))
#endif

#ifdef NyaanLocal
#define trc2(...)                           \
  do {                                      \
    cerr << "## " << #__VA_ARGS__ << " = "; \
    DebugImpl::trace(__VA_ARGS__);          \
  } while (0)
#else
#define trc2(...) (void(0))
#endif
#line 67 "template/template.hpp"

// macro
#line 1 "template/macro.hpp"
// Loop shorthands.  rep / rep1 count upwards from 0 / 1, repr / repr1 count
// downwards, reg / regr iterate over [a, b) upwards / downwards.
#define each(x, v) for (auto&& x : v)
#define each2(x, y, v) for (auto&& [x, y] : v)
#define all(v) (v).begin(), (v).end()
#define rep(i, N) for (long long i = 0; i < (long long)(N); i++)
#define repr(i, N) for (long long i = (long long)(N)-1; i >= 0; i--)
#define rep1(i, N) for (long long i = 1; i <= (long long)(N); i++)
#define repr1(i, N) for (long long i = (N); (long long)(i) > 0; i--)
#define reg(i, a, b) for (long long i = (a); i < (b); i++)
#define regr(i, a, b) for (long long i = (b)-1; i >= (a); i--)
#define fi first
#define se second
// Declare variables of the given type and read them with in().
#define ini(...)   \
  int __VA_ARGS__; \
  in(__VA_ARGS__)
#define inl(...)         \
  long long __VA_ARGS__; \
  in(__VA_ARGS__)
#define ins(...)      \
  string __VA_ARGS__; \
  in(__VA_ARGS__)
// Read several equally long containers element by element, interleaved.
#define in2(s, t)                           \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i]);                         \
  }
#define in3(s, t, u)                        \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i], u[i]);                   \
  }
#define in4(s, t, u, v)                     \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i], u[i], v[i]);             \
  }
// Print the arguments with Nyaan::out and return from the current function.
#define die(...)             \
  do {                       \
    Nyaan::out(__VA_ARGS__); \
    return;                  \
  } while (0)
#line 70 "template/template.hpp"

namespace Nyaan {
void solve();
}
// Program entry point: all work happens in Nyaan::solve (defined at the
// end of this file).
int main() { Nyaan::solve(); }
#line 4 "verify/verify-yosupo-ntt/yosupo-convolution-real-fft-toom-3.test.cpp"
//
#line 2 "misc/fastio.hpp"

#line 8 "misc/fastio.hpp"

using namespace std;

#line 2 "internal/internal-type-traits.hpp"

#line 4 "internal/internal-type-traits.hpp"
using namespace std;

namespace internal {
// Variants of is_integral / is_signed / is_unsigned that also accept the
// 128-bit integer types.
template <typename T>
using is_broadly_integral =
    typename conditional_t<is_integral_v<T> || is_same_v<T, __int128_t> ||
                               is_same_v<T, __uint128_t>,
                           true_type, false_type>::type;

template <typename T>
using is_broadly_signed =
    typename conditional_t<is_signed_v<T> || is_same_v<T, __int128_t>,
                           true_type, false_type>::type;

template <typename T>
using is_broadly_unsigned =
    typename conditional_t<is_unsigned_v<T> || is_same_v<T, __uint128_t>,
                           true_type, false_type>::type;

// Defines the matching `<trait>_v` variable template for a trait.
#define ENABLE_VALUE(x) \
  template <typename T> \
  constexpr bool x##_v = x<T>::value;

ENABLE_VALUE(is_broadly_integral);
ENABLE_VALUE(is_broadly_signed);
ENABLE_VALUE(is_broadly_unsigned);
#undef ENABLE_VALUE

// ENABLE_HAS_TYPE(var) / ENABLE_HAS_VAR(var) generate a detection trait
// has_<var> that is true when T has a nested type / a member named var.
#define ENABLE_HAS_TYPE(var)                                   \
  template <class, class = void>                               \
  struct has_##var : false_type {};                            \
  template <class T>                                           \
  struct has_##var<T, void_t<typename T::var>> : true_type {}; \
  template <class T>                                           \
  constexpr auto has_##var##_v = has_##var<T>::value;

#define ENABLE_HAS_VAR(var)                                     \
  template <class, class = void>                                \
  struct has_##var : false_type {};                             \
  template <class T>                                            \
  struct has_##var<T, void_t<decltype(T::var)>> : true_type {}; \
  template <class T>                                            \
  constexpr auto has_##var##_v = has_##var<T>::value;

}  // namespace internal
#line 12 "misc/fastio.hpp"

// Buffered reader / writer on top of fread / fwrite.
namespace fastio {
// SZ is the size of both buffers.  `offset` is the slack kept free: reads
// refill when fewer than `offset` bytes are left, writes flush when fewer
// than `offset` bytes of room remain.
static constexpr int SZ = 1 << 17;
static constexpr int offset = 64;
char inbuf[SZ], outbuf[SZ];
// Unread input is inbuf[in_left, in_right); pending output is
// outbuf[0, out_right).
int in_left = 0, in_right = 0, out_right = 0;

// Lookup table with the zero-padded 4-digit decimal text of 0..9999.
struct Pre {
  char num[40000];
  constexpr Pre() : num() {
    for (int i = 0; i < 10000; i++) {
      int n = i;
      for (int j = 3; j >= 0; j--) {
        num[i * 4 + j] = n % 10 + '0';
        n /= 10;
      }
    }
  }
} constexpr pre;

// Moves the unread bytes to the front of inbuf and refills the rest from
// stdin.
void load() {
  int len = in_right - in_left;
  memmove(inbuf, inbuf + in_left, len);
  in_right = len + fread(inbuf + len, 1, SZ - len, stdin);
  in_left = 0;
}
// Writes the pending output to stdout and empties the buffer.
void flush() {
  fwrite(outbuf, 1, out_right, stdout);
  out_right = 0;
}
// Advances past every byte <= ' '.
void skip_space() {
  if (in_left + offset > in_right) load();
  while (inbuf[in_left] <= ' ') in_left++;
}

// Reads the next non-space character.
void single_read(char& c) {
  if (in_left + offset > in_right) load();
  skip_space();
  c = inbuf[in_left++];
}
// Appends the next whitespace-delimited token to S, refilling the buffer
// as often as needed.
void single_read(string& S) {
  skip_space();
  while (true) {
    if (in_left == in_right) load();
    int i = in_left;
    for (; i != in_right; i++) {
      if (inbuf[i] <= ' ') break;
    }
    copy(inbuf + in_left, inbuf + i, back_inserter(S));
    in_left = i;
    if (i != in_right) break;
  }
}
// Reads a decimal integer (with optional leading '-' for signed types).
// Parsing stops at the first byte below '0', which is consumed.
template <typename T,
          enable_if_t<internal::is_broadly_integral_v<T>>* = nullptr>
void single_read(T& x) {
  if (in_left + offset > in_right) load();
  skip_space();
  char c = inbuf[in_left++];
  [[maybe_unused]] bool minus = false;
  if constexpr (internal::is_broadly_signed_v<T>) {
    if (c == '-') minus = true, c = inbuf[in_left++];
  }
  x = 0;
  while (c >= '0') {
    x = x * 10 + (c & 15);
    c = inbuf[in_left++];
  }
  if constexpr (internal::is_broadly_signed_v<T>) {
    if (minus) x = -x;
  }
}
// rd(a, b, ...) reads every argument in order.
void rd() {}
template <typename Head, typename... Tail>
void rd(Head& head, Tail&... tail) {
  single_read(head);
  rd(tail...);
}

void single_write(const char& c) {
  if (out_right > SZ - offset) flush();
  outbuf[out_right++] = c;
}
// Booleans are written as '1' / '0'.
void single_write(const bool& b) {
  if (out_right > SZ - offset) flush();
  outbuf[out_right++] = b ? '1' : '0';
}
// Strings bypass the buffer: pending output is flushed first so that the
// order is preserved.
void single_write(const string& S) {
  flush(), fwrite(S.data(), 1, S.size(), stdout);
}
void single_write(const char* p) { flush(), fwrite(p, 1, strlen(p), stdout); }
// Writes an integer in decimal.  The low digits are produced four at a
// time from the `pre` table into buf (filled from its end); the leading
// 1-4 digits go straight to outbuf, followed by the contents of buf.
template <typename T,
          enable_if_t<internal::is_broadly_integral_v<T>>* = nullptr>
void single_write(const T& _x) {
  if (out_right > SZ - offset) flush();
  if (_x == 0) {
    outbuf[out_right++] = '0';
    return;
  }
  T x = _x;
  if constexpr (internal::is_broadly_signed_v<T>) {
    if (x < 0) outbuf[out_right++] = '-', x = -x;
  }
  constexpr int buffer_size = sizeof(T) * 10 / 4;
  char buf[buffer_size];
  int i = buffer_size;
  while (x >= 10000) {
    i -= 4;
    memcpy(buf + i, pre.num + (x % 10000) * 4, 4);
    x /= 10000;
  }
  if (x < 100) {
    if (x < 10) {
      outbuf[out_right] = '0' + x;
      ++out_right;
    } else {
      // q = x / 10 for two-digit x, computed by multiply and shift.
      uint32_t q = (uint32_t(x) * 205) >> 11;
      uint32_t r = uint32_t(x) - q * 10;
      outbuf[out_right] = '0' + q;
      outbuf[out_right + 1] = '0' + r;
      out_right += 2;
    }
  } else {
    if (x < 1000) {
      // Skip the leading '0' of the 4-character table entry.
      memcpy(outbuf + out_right, pre.num + (x << 2) + 1, 3);
      out_right += 3;
    } else {
      memcpy(outbuf + out_right, pre.num + (x << 2), 4);
      out_right += 4;
    }
  }
  memcpy(outbuf + out_right, buf + i, buffer_size - i);
  out_right += buffer_size - i;
}
// wt(a, b, ...) writes every argument in order; wtn appends a newline.
void wt() {}
template <typename Head, typename... Tail>
void wt(const Head& head, const Tail&... tail) {
  single_write(head);
  wt(forward<const Tail>(tail)...);
}
template <typename... Args>
void wtn(const Args&... x) {
  wt(forward<const Args>(x)...);
  wt('\n');
}

// Registers flush() with atexit so buffered output is written on exit.
struct Dummy {
  Dummy() { atexit(flush); }
} dummy;

}  // namespace fastio
using fastio::rd;
using fastio::skip_space;
using fastio::wt;
using fastio::wtn;
#line 2 "ntt/complex-fft.hpp"

namespace ArbitraryModConvolution {

// Minimal complex number with real part x and imaginary part y.
template <typename T>
struct Cp {
  T x, y;
  constexpr Cp() : x(0), y(0) {}
  constexpr Cp(T _x, T _y) : x(_x), y(_y) {}
  constexpr inline Cp operator+(const Cp& c) const {
    return Cp(x + c.x, y + c.y);
  }
  constexpr inline Cp operator-(const Cp& c) const {
    return Cp(x - c.x, y - c.y);
  }
  constexpr inline Cp operator*(const Cp& c) const {
    return Cp(x * c.x - y * c.y, x * c.y + y * c.x);
  }
  constexpr inline Cp operator-() const { return Cp(-x, -y); }
  // Complex conjugate.
  constexpr inline Cp conj() const { return Cp(x, -y); }
  // Multiplication by the imaginary unit i.
  constexpr inline Cp rotl() const { return Cp(-y, x); }

  friend ostream& operator<<(ostream& os, const Cp& c) {
    os << "(" << c.x << ", " << c.y << ")" << endl;
    return os;
  }
};

using C = Cp<double>;
const long double PI = acosl(-1);

// Complex FFT with a shared twiddle table and convolution routines built
// on top of it.
struct CooleyTukey {
  // Twiddle factors shared by all transforms; filled by setw.
  static vector<C> w;

  // Makes sure w holds at least 2^(k-1) entries.  The values are computed
  // in long double and stored as double.
  static void setw(int k) {
    --k;
    if ((int)w.size() >= (1 << k)) return;
    w.resize(1 << k);
    vector<Cp<long double>> base(k);
    const long double arg = PI / (1 << k);
    for (int i = 0, j = 1 << (k - 1); j; i++, j >>= 1) {
      complex<long double> z = exp(complex<long double>(1i) * (arg * j));
      base[i] = Cp<long double>{z.real(), z.imag()};
    }
    genw(0, k - 1, Cp<long double>{1, 0}, base);
  }

  // Recursively fills w: w[i] becomes the product of base[b] over all set
  // bits b of i.
  static void genw(int i, int b, Cp<long double> z,
                   const vector<Cp<long double>>& base) {
    if (b == -1) {
      w[i].x = z.x, w[i].y = z.y;
    } else {
      genw(i, b - 1, z, base);
      genw(i | (1 << b), b - 1, z * base[b], base);
    }
  }

  // In-place forward transform of the first 2^k elements of a.  One
  // radix-2 pass is done when k is odd, followed by radix-4 passes.
  // NOTE(review): no reordering pass is performed, so the output is
  // presumably not in natural order - confirm before using the spectrum
  // directly.
  static void fft(vector<C>& a, int k) {
    if (k <= 0) return;
    if (k == 1) {
      C a1 = a[1];
      a[1] = a[0] - a[1];
      a[0] = a[0] + a1;
      return;
    }
    if (k & 1) {
      int v = 1 << (k - 1);
      for (int j = 0; j < v; ++j) {
        C ajv = a[j + v];
        a[j + v] = a[j] - ajv;
        a[j] = a[j] + ajv;
      }
    }
    int u = 1 << (k & 1), v = 1 << (k - 2 - (k & 1));
    while (v) {
      // First block (jh = 0): no twiddle multiplications on the inputs.
      {
        int j0 = 0;
        int j1 = v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        int je = v;
        for (; j0 < je; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1], t2 = a[j2], t3 = a[j3];
          C t0p2 = t0 + t2, t1p3 = t1 + t3;
          C t0m2 = t0 - t2, t1m3 = (t1 - t3) * w[1];
          a[j0] = t0p2 + t1p3, a[j1] = t0p2 - t1p3;
          a[j2] = t0m2 + t1m3, a[j3] = t0m2 - t1m3;
        }
      }
      // jh >= 1
      for (int jh = 1; jh < u; ++jh) {
        int j0 = jh * v * 4;
        int j1 = j0 + v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        int je = j1;
        C ww = w[jh];
        C xx = w[jh << 1];
        C wx = ww * xx;
        for (; j0 < je; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1] * xx, t2 = a[j2] * ww, t3 = a[j3] * wx;
          C t0p2 = t0 + t2, t1p3 = t1 + t3;
          C t0m2 = t0 - t2, t1m3 = (t1 - t3) * w[1];
          a[j0] = t0p2 + t1p3, a[j1] = t0p2 - t1p3;
          a[j2] = t0m2 + t1m3, a[j3] = t0m2 - t1m3;
        }
      }
      u <<= 2, v >>= 2;
    }
  }

  // In-place inverse transform of the first 2^k elements of a, using the
  // conjugated twiddles.  The result is not divided by the length; the
  // callers in this file fold the scaling into their final division.
  static void ifft(vector<C>& a, int k) {
    if ((int)a.size() <= 1) return;
    if (k == 1) {
      C a1 = a[1];
      a[1] = a[0] - a[1];
      a[0] = a[0] + a1;
      return;
    }
    int u = 1 << (k - 2);
    int v = 1;
    while (u) {
      // jh = 0
      {
        int j0 = 0;
        int j1 = v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        for (; j0 < v; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1], t2 = a[j2], t3 = a[j3];
          C t0p1 = t0 + t1, t2p3 = t2 + t3;
          C t0m1 = t0 - t1, t2m3 = (t2 - t3) * w[1].conj();
          a[j0] = t0p1 + t2p3, a[j2] = t0p1 - t2p3;
          a[j1] = t0m1 + t2m3, a[j3] = t0m1 - t2m3;
        }
      }
      // jh >= 1
      for (int jh = 1; jh < u; ++jh) {
        int j0 = (jh * v) << 2;
        int j1 = j0 + v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        int je = j1;
        C ww = w[jh].conj();
        C xx = w[jh << 1].conj();
        C yy = w[(jh << 1) + 1].conj();
        for (; j0 < je; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1], t2 = a[j2], t3 = a[j3];
          C t0p1 = t0 + t1, t2p3 = t2 + t3;
          C t0m1 = (t0 - t1) * xx, t2m3 = (t2 - t3) * yy;
          a[j0] = t0p1 + t2p3, a[j2] = (t0p1 - t2p3) * ww;
          a[j1] = t0m1 + t2m3, a[j3] = (t0m1 - t2m3) * ww;
        }
      }
      u >>= 2;
      v <<= 2;
    }
    // Final radix-2 pass when k is odd.
    if (k & 1) {
      u = 1 << (k - 1);
      for (int j = 0; j < u; j++) {
        C ajv = a[j] - a[j + u];
        a[j] = a[j] + a[j + u];
        a[j + u] = ajv;
      }
    }
  }

  // Transforms AL (two real sequences packed as real / imaginary parts)
  // and separates the result: AL receives the part belonging to the real
  // components, AH the part belonging to the imaginary components, each
  // scaled by 2.
  static void fft_real(vector<C>& AL, vector<C>& AH, int k) {
    fft(AL, k);
    AH[0] = C{AL[0].y * 2.0, 0};
    AL[0] = C{AL[0].x * 2.0, 0};
    AH[1] = C{AL[1].y * 2.0, 0};
    AL[1] = C{AL[1].x * 2.0, 0};

    for (int i = 2, y = 2; y < (1 << k); y <<= 1) {
      for (; i < 2 * y; i += 2) {
        // j is the index paired with i in the transform's output order.
        int j = i ^ (y - 1);
        AH[i] = (AL[j].conj() - AL[i]).rotl();
        AL[i] = (AL[j].conj() + AL[i]);
        AH[j] = AH[i].conj();
        AL[j] = AL[i].conj();
      }
    }
  }

  // naive convolution for int
  // Exact integer convolution of s and t, returned as long long values of
  // length s.size() + t.size() - 1.  Short inputs (min length <= 40) use
  // the schoolbook product.  Otherwise s is packed into the real parts and
  // t into the imaginary parts of one complex array, which needs a single
  // forward transform of size M and one inverse transform of size M / 2;
  // the results are rounded to the nearest integer.
  template <typename T, enable_if_t<is_integral<T>::value, nullptr_t> = nullptr>
  static vector<long long> multiply(const vector<T>& s, const vector<T>& t) {
    int l = s.size() + t.size() - 1;
    if (min(s.size(), t.size()) <= 40) {
      vector<long long> u(l);
      for (int i = 0; i < (int)s.size(); i++) {
        for (int j = 0; j < (int)t.size(); j++) u[i + j] += 1LL * s[i] * t[j];
      }
      return u;
    }

    // M = 2^k is the smallest power of two >= max(l, 4).
    int k = 2, M = 4;
    while (M < l) M <<= 1, ++k;
    setw(k);
    auto round = [](double x) -> long long {
      return (long long)(x + (x > 0 ? 0.5 : -0.5));
    };

    vector<C> a(M);
    for (int i = 0; i < (int)s.size(); i++) a[i].x = s[i];
    for (int i = 0; i < (int)t.size(); i++) a[i].y = t[i];
    fft(a, k);

    // Turn the packed spectrum into the spectrum of the product.
    a[0].y = 4.0 * a[0].x * a[0].y;
    a[1].y = 4.0 * a[1].x * a[1].y;
    a[0].x = a[1].x = 0.0;
    for (int i = 2; i < M; i += 2) {
      int c = 1 << (31 - __builtin_clz(i));
      int j = i ^ (c - 1);
      a[i] = (a[i] + a[j].conj()) * (a[i] - a[j].conj());
      a[j] = -a[i].conj();
    }

    // Fold the size-M spectrum into size M / 2 for the inverse transform.
    vector<C> b(M / 2);
    for (int j = 0; j < M / 2; j++) {
      C tmp1 = a[j * 2 + 0] + a[j * 2 + 1];
      C tmp2 = (a[j * 2 + 0] - a[j * 2 + 1]) * w[j].conj();
      b[j] = tmp1 + tmp2.rotl();
    }
    ifft(b, k - 1);

    // Even output indices come from the imaginary parts, odd ones from the
    // negated real parts; 4 * M undoes the accumulated scaling.
    vector<long long> u(l);
    for (int i = 0; i < l; i++) {
      if (i & 1) {
        u[i] = round(-b[i >> 1].x / (4.0 * M));
      } else {
        u[i] = round(b[i >> 1].y / (4.0 * M));
      }
    }
    return u;
  }

  // Same algorithm for double inputs; the result is returned unrounded.
  static vector<double> multiply(const vector<double>& s,
                                 const vector<double>& t) {
    int l = s.size() + t.size() - 1;
    if (min(s.size(), t.size()) <= 40) {
      vector<double> u(l);
      for (int i = 0; i < (int)s.size(); i++) {
        for (int j = 0; j < (int)t.size(); j++) u[i + j] += s[i] * t[j];
      }
      return u;
    }

    int k = 2, M = 4;
    while (M < l) M <<= 1, ++k;
    setw(k);

    vector<C> a(M);
    for (int i = 0; i < (int)s.size(); i++) a[i].x = s[i];
    for (int i = 0; i < (int)t.size(); i++) a[i].y = t[i];
    fft(a, k);

    a[0].y = 4.0 * a[0].x * a[0].y;
    a[1].y = 4.0 * a[1].x * a[1].y;
    a[0].x = a[1].x = 0.0;
    for (int i = 2; i < M; i += 2) {
      int c = 1 << (31 - __builtin_clz(i));
      int j = i ^ (c - 1);
      a[i] = (a[i] + a[j].conj()) * (a[i] - a[j].conj());
      a[j] = -a[i].conj();
    }

    vector<C> b(M / 2);
    for (int j = 0; j < M / 2; j++) {
      C tmp1 = a[j * 2 + 0] + a[j * 2 + 1];
      C tmp2 = (a[j * 2 + 0] - a[j * 2 + 1]) * w[j].conj();
      b[j] = tmp1 + tmp2.rotl();
    }
    ifft(b, k - 1);

    vector<double> u(l);
    for (int i = 0; i < l; i++) {
      if (i & 1) {
        u[i] = -b[i >> 1].x / (4.0 * M);
      } else {
        u[i] = b[i >> 1].y / (4.0 * M);
      }
    }
    return u;
  }

  // Convolution with every input value split as lo + hi * B, B = 32000.
  // With the default MOD (-1u) the exact result is returned as
  // __uint128_t; otherwise the result is reduced modulo MOD and returned
  // as int.  The rounding lambda converts to unsigned, so inputs are
  // presumably expected to be non-negative - confirm with callers.
  template <unsigned int MOD = -1u>
  static conditional_t<MOD == -1u, vector<__uint128_t>, vector<int>>
  multiply_15bit(const vector<int>& a, const vector<int>& b) {
    using u64 = unsigned long long;
    constexpr u64 B = 32000;

    int l = a.size() + b.size() - 1;
    int k = 2, M = 4;
    while (M < l) M <<= 1, ++k;
    setw(k);
    auto round = [](double x) -> u64 { return u64(x + 0.5); };

    // Low parts go into the real components, high parts into the imaginary
    // components; fft_real then separates them into AL/AH and BL/BH.
    vector<C> AL(M), AH(M), BL(M), BH(M);
    for (int i = 0; i < (int)a.size(); i++) {
      AL[i] = C{double(a[i] % B), double(a[i] / B)};
    }
    for (int i = 0; i < (int)b.size(); i++) {
      BL[i] = C{double(b[i] % B), double(b[i] / B)};
    }

    fft_real(AL, AH, k);
    fft_real(BL, BH, k);

    // Pack the four partial products into two spectra: BL carries lo*lo
    // (real) and hi*hi (imaginary), BH carries the two cross terms.
    for (int i = 0; i < M; i++) {
      C tmp = AL[i] * BL[i] + (AH[i] * BH[i]).rotl();
      BH[i] = AL[i] * BH[i] + (AH[i] * BL[i]).rotl();
      BL[i] = tmp;
    }

    ifft(BL, k);
    ifft(BH, k);

    using return_type = conditional_t<MOD + 1u == 0u, vector<__uint128_t>, vector<int>>;
    return_type u(l);
    double im = 1.0 / (4.0 * M);
    for (int i = 0; i < l; i++) {
      BL[i].x *= im, BL[i].y *= im;
      BH[i].x *= im, BH[i].y *= im;
      // s1 = lo*lo, s2 = lo*hi + hi*lo, s3 = hi*hi.
      u64 s1 = round(BL[i].x);
      u64 s2 = round(BH[i].x) + round(BH[i].y);
      u64 s3 = round(BL[i].y);

      if constexpr (MOD == -1u) {
        u[i] += __uint128_t(s1);
        u[i] += __uint128_t(s2) * B;
        u[i] += __uint128_t(s3) * B * B;
      } else {
        u[i] += s1 % MOD;
        u[i] += s2 % MOD * B % MOD;
        if (u[i] >= MOD) u[i] -= MOD;
        u[i] += s3 % MOD * (B * B % MOD) % MOD;
        if (u[i] >= MOD) u[i] -= MOD;
      }
    }
    return u;
  }
};
vector<C> CooleyTukey::w;

}  // namespace ArbitraryModConvolution
#line 7 "verify/verify-yosupo-ntt/yosupo-convolution-real-fft-toom-3.test.cpp"

namespace ArbitraryModConvolution {

// naive Toom-3
// Convolution of a and b modulo MOD.  Every value is cut into three 10-bit
// limbs (a quadratic in 2^10); the limb polynomials are evaluated at
// 0, 1, -1, -2 and infinity, the evaluated sequences are multiplied with
// CooleyTukey::multiply, and the five product coefficients are recovered
// by interpolation and recombined modulo MOD.
template <unsigned int MOD>
vector<int> toom_3(const vector<int>& a, const vector<int>& b) {
  // Returns the five evaluations of every element, in the order
  // p(0), p(1), p(-1), p(-2), p(inf).
  auto precalc = [](const vector<int>& _a) -> array<vector<int>, 5> {
    int n = _a.size();
    vector<int> p0(n), p1(n), pm1(n), pm2(n), pinf(n);
    for (int i = 0; i < n; i++) {
      int m0 = _a[i] & 1023;
      int m1 = (_a[i] >> 10) & 1023;
      int m2 = (_a[i] >> 20) & 1023;
      p0[i] = m0;
      p1[i] = m0 + m1 + m2;
      pm1[i] = m0 - m1 + m2;
      pm2[i] = m0 - 2 * m1 + 4 * m2;
      pinf[i] = m2;
    }
    return {{p0, p1, pm1, pm2, pinf}};
  };

  auto [a0, a1, am1, am2, ainf] = precalc(a);
  auto [b0, b1, bm1, bm2, binf] = precalc(b);

  auto c0 = CooleyTukey::multiply(a0, b0);
  auto c1 = CooleyTukey::multiply(a1, b1);
  auto cm1 = CooleyTukey::multiply(am1, bm1);
  auto cm2 = CooleyTukey::multiply(am2, bm2);
  auto cinf = CooleyTukey::multiply(ainf, binf);

  vector<int> c(c0.size());
  for (int i = 0; i < (int)c.size(); i++) {
    // Interpolation: r0..r4 are the coefficients of the degree-4 product.
    long long r0 = c0[i];
    long long r4 = cinf[i];
    long long r3 = (cm2[i] - c1[i]) / 3;
    long long r1 = (c1[i] - cm1[i]) / 2;
    long long r2 = cm1[i] - c0[i];
    r3 = (r2 - r3) / 2 + r4 * 2;
    r2 += r1 - r4;
    r1 -= r3;

    // r0 + r1*2^10 + r2*2^20 + r3*2^30 + r4*2^40 modulo MOD, evaluated in
    // two steps of 2^20 (1048576) with a reduction in between.
    long long ret = r4 % MOD * 1048576;
    ret += r3 % MOD * 1024 + r2;
    ret = ret % MOD * 1048576;
    ret += r1 % MOD * 1024 + r0;
    ret %= MOD;
    // The remainder can be negative because r1 and r3 can be.
    if (ret < 0) ret += MOD;
    c[i] = ret;
  }
  return c;
}

}  // namespace ArbitraryModConvolution

// Reads N, M and both sequences, then prints the N + M - 1 coefficients of
// their convolution modulo 1000000007, space separated with a final
// newline.
void Nyaan::solve() {
  using namespace ArbitraryModConvolution;

  int N, M;
  rd(N, M);
  vector<int> a(N), b(M);
  for (auto& x : a) rd(x);
  for (auto& x : b) rd(x);
  auto c = toom_3<1000000007>(a, b);
  // auto c = CooleyTukey::karatsuba<1000000007>(a, b);
  for (int i = 0; i < N + M - 1; i++) {
    wt(c[i], " \n"[i == N + M - 2]);
  }
}