Nyaan's Library

This documentation is automatically generated by online-judge-tools/verification-helper

View on GitHub

:heavy_check_mark: ntt/complex-fft.hpp

Verified with

Code

#pragma once

namespace ArbitraryModConvolution {

template <typename T>
struct Cp {
  T x, y;
  constexpr Cp() : x(0), y(0) {}
  constexpr Cp(T _x, T _y) : x(_x), y(_y) {}
  constexpr inline Cp operator+(const Cp& c) const {
    return Cp(x + c.x, y + c.y);
  }
  constexpr inline Cp operator-(const Cp& c) const {
    return Cp(x - c.x, y - c.y);
  }
  constexpr inline Cp operator*(const Cp& c) const {
    return Cp(x * c.x - y * c.y, x * c.y + y * c.x);
  }
  constexpr inline Cp operator-() const { return Cp(-x, -y); }
  constexpr inline Cp conj() const { return Cp(x, -y); }
  constexpr inline Cp rotl() const { return Cp(-y, x); }
  friend ostream& operator<<(ostream& os, const Cp& c) {
    os << "(" << c.x << ", " << c.y << ")" << endl;
    return os;
  }
};

using C = Cp<double>;
const long double PI = acosl(-1);

struct CooleyTukey {
  static vector<C> w;

  static void setw(int k) {
    --k;
    if ((int)w.size() >= (1 << k)) return;
    w.resize(1 << k);
    vector<Cp<long double>> base(k);
    const long double arg = PI / (1 << k);
    for (int i = 0, j = 1 << (k - 1); j; i++, j >>= 1) {
      complex<long double> z = exp(complex<long double>(1i) * (arg * j));
      base[i] = Cp<long double>{z.real(), z.imag()};
    }
    genw(0, k - 1, Cp<long double>{1, 0}, base);
  }

  static void genw(int i, int b, Cp<long double> z,
                   const vector<Cp<long double>>& base) {
    if (b == -1) {
      w[i].x = z.x, w[i].y = z.y;
    } else {
      genw(i, b - 1, z, base);
      genw(i | (1 << b), b - 1, z * base[b], base);
    }
  }

  static void fft(vector<C>& a, int k) {
    if (k <= 0) return;
    if (k == 1) {
      C a1 = a[1];
      a[1] = a[0] - a[1];
      a[0] = a[0] + a1;
      return;
    }
    if (k & 1) {
      int v = 1 << (k - 1);
      for (int j = 0; j < v; ++j) {
        C ajv = a[j + v];
        a[j + v] = a[j] - ajv;
        a[j] = a[j] + ajv;
      }
    }
    int u = 1 << (k & 1), v = 1 << (k - 2 - (k & 1));
    while (v) {
      {
        int j0 = 0;
        int j1 = v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        int je = v;
        for (; j0 < je; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1], t2 = a[j2], t3 = a[j3];
          C t0p2 = t0 + t2, t1p3 = t1 + t3;
          C t0m2 = t0 - t2, t1m3 = (t1 - t3) * w[1];
          a[j0] = t0p2 + t1p3, a[j1] = t0p2 - t1p3;
          a[j2] = t0m2 + t1m3, a[j3] = t0m2 - t1m3;
        }
      }
      // jh >= 1
      for (int jh = 1; jh < u; ++jh) {
        int j0 = jh * v * 4;
        int j1 = j0 + v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        int je = j1;
        C ww = w[jh];
        C xx = w[jh << 1];
        C wx = ww * xx;
        for (; j0 < je; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1] * xx, t2 = a[j2] * ww, t3 = a[j3] * wx;
          C t0p2 = t0 + t2, t1p3 = t1 + t3;
          C t0m2 = t0 - t2, t1m3 = (t1 - t3) * w[1];
          a[j0] = t0p2 + t1p3, a[j1] = t0p2 - t1p3;
          a[j2] = t0m2 + t1m3, a[j3] = t0m2 - t1m3;
        }
      }
      u <<= 2, v >>= 2;
    }
  }

  static void ifft(vector<C>& a, int k) {
    if ((int)a.size() <= 1) return;
    if (k == 1) {
      C a1 = a[1];
      a[1] = a[0] - a[1];
      a[0] = a[0] + a1;
      return;
    }
    int u = 1 << (k - 2);
    int v = 1;
    while (u) {
      // jh = 0
      {
        int j0 = 0;
        int j1 = v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        for (; j0 < v; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1], t2 = a[j2], t3 = a[j3];
          C t0p1 = t0 + t1, t2p3 = t2 + t3;
          C t0m1 = t0 - t1, t2m3 = (t2 - t3) * w[1].conj();
          a[j0] = t0p1 + t2p3, a[j2] = t0p1 - t2p3;
          a[j1] = t0m1 + t2m3, a[j3] = t0m1 - t2m3;
        }
      }
      // jh >= 1
      for (int jh = 1; jh < u; ++jh) {
        int j0 = (jh * v) << 2;
        int j1 = j0 + v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        int je = j1;
        C ww = w[jh].conj();
        C xx = w[jh << 1].conj();
        C yy = w[(jh << 1) + 1].conj();
        for (; j0 < je; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1], t2 = a[j2], t3 = a[j3];
          C t0p1 = t0 + t1, t2p3 = t2 + t3;
          C t0m1 = (t0 - t1) * xx, t2m3 = (t2 - t3) * yy;
          a[j0] = t0p1 + t2p3, a[j2] = (t0p1 - t2p3) * ww;
          a[j1] = t0m1 + t2m3, a[j3] = (t0m1 - t2m3) * ww;
        }
      }
      u >>= 2;
      v <<= 2;
    }
    if (k & 1) {
      u = 1 << (k - 1);
      for (int j = 0; j < u; j++) {
        C ajv = a[j] - a[j + u];
        a[j] = a[j] + a[j + u];
        a[j + u] = ajv;
      }
    }
  }

  static void fft_real(vector<C>& AL, vector<C>& AH, int k) {
    fft(AL, k);
    AH[0] = C{AL[0].y * 2.0, 0};
    AL[0] = C{AL[0].x * 2.0, 0};
    AH[1] = C{AL[1].y * 2.0, 0};
    AL[1] = C{AL[1].x * 2.0, 0};
    for (int i = 2, y = 2; y < (1 << k); y <<= 1) {
      for (; i < 2 * y; i += 2) {
        int j = i ^ (y - 1);
        AH[i] = (AL[j].conj() - AL[i]).rotl();
        AL[i] = (AL[j].conj() + AL[i]);
        AH[j] = AH[i].conj();
        AL[j] = AL[i].conj();
      }
    }
  }

  // naive convolution for int
  template <typename T, enable_if_t<is_integral<T>::value, nullptr_t> = nullptr>
  static vector<long long> multiply(const vector<T>& s, const vector<T>& t) {
    int l = s.size() + t.size() - 1;
    if (min(s.size(), t.size()) <= 40) {
      vector<long long> u(l);
      for (int i = 0; i < (int)s.size(); i++) {
        for (int j = 0; j < (int)t.size(); j++) u[i + j] += 1LL * s[i] * t[j];
      }
      return u;
    }

    int k = 2, M = 4;
    while (M < l) M <<= 1, ++k;
    setw(k);
    auto round = [](double x) -> long long {
      return (long long)(x + (x > 0 ? 0.5 : -0.5));
    };

    vector<C> a(M);
    for (int i = 0; i < (int)s.size(); i++) a[i].x = s[i];
    for (int i = 0; i < (int)t.size(); i++) a[i].y = t[i];
    fft(a, k);

    a[0].y = 4.0 * a[0].x * a[0].y;
    a[1].y = 4.0 * a[1].x * a[1].y;
    a[0].x = a[1].x = 0.0;
    for (int i = 2; i < M; i += 2) {
      int c = 1 << (31 - __builtin_clz(i));
      int j = i ^ (c - 1);
      a[i] = (a[i] + a[j].conj()) * (a[i] - a[j].conj());
      a[j] = -a[i].conj();
    }

    vector<C> b(M / 2);
    for (int j = 0; j < M / 2; j++) {
      C tmp1 = a[j * 2 + 0] + a[j * 2 + 1];
      C tmp2 = (a[j * 2 + 0] - a[j * 2 + 1]) * w[j].conj();
      b[j] = tmp1 + tmp2.rotl();
    }
    ifft(b, k - 1);

    vector<long long> u(l);
    for (int i = 0; i < l; i++) {
      if (i & 1) {
        u[i] = round(-b[i >> 1].x / (4.0 * M));
      } else {
        u[i] = round(b[i >> 1].y / (4.0 * M));
      }
    }
    return u;
  }

  static vector<double> multiply(const vector<double>& s,
                                 const vector<double>& t) {
    int l = s.size() + t.size() - 1;
    if (min(s.size(), t.size()) <= 40) {
      vector<double> u(l);
      for (int i = 0; i < (int)s.size(); i++) {
        for (int j = 0; j < (int)t.size(); j++) u[i + j] += s[i] * t[j];
      }
      return u;
    }

    int k = 2, M = 4;
    while (M < l) M <<= 1, ++k;
    setw(k);

    vector<C> a(M);
    for (int i = 0; i < (int)s.size(); i++) a[i].x = s[i];
    for (int i = 0; i < (int)t.size(); i++) a[i].y = t[i];
    fft(a, k);

    a[0].y = 4.0 * a[0].x * a[0].y;
    a[1].y = 4.0 * a[1].x * a[1].y;
    a[0].x = a[1].x = 0.0;
    for (int i = 2; i < M; i += 2) {
      int c = 1 << (31 - __builtin_clz(i));
      int j = i ^ (c - 1);
      a[i] = (a[i] + a[j].conj()) * (a[i] - a[j].conj());
      a[j] = -a[i].conj();
    }

    vector<C> b(M / 2);
    for (int j = 0; j < M / 2; j++) {
      C tmp1 = a[j * 2 + 0] + a[j * 2 + 1];
      C tmp2 = (a[j * 2 + 0] - a[j * 2 + 1]) * w[j].conj();
      b[j] = tmp1 + tmp2.rotl();
    }
    ifft(b, k - 1);

    vector<double> u(l);
    for (int i = 0; i < l; i++) {
      if (i & 1) {
        u[i] = -b[i >> 1].x / (4.0 * M);
      } else {
        u[i] = b[i >> 1].y / (4.0 * M);
      }
    }
    return u;
  }

  template <unsigned int MOD = -1u>
  static conditional_t<MOD == -1u, vector<__uint128_t>, vector<int>>
  multiply_15bit(const vector<int>& a, const vector<int>& b) {
    using u64 = unsigned long long;
    constexpr u64 B = 32000;
    int l = a.size() + b.size() - 1;
    int k = 2, M = 4;
    while (M < l) M <<= 1, ++k;
    setw(k);
    auto round = [](double x) -> u64 { return u64(x + 0.5); };

    vector<C> AL(M), AH(M), BL(M), BH(M);
    for (int i = 0; i < (int)a.size(); i++) {
      AL[i] = C{double(a[i] % B), double(a[i] / B)};
    }
    for (int i = 0; i < (int)b.size(); i++) {
      BL[i] = C{double(b[i] % B), double(b[i] / B)};
    }

    fft_real(AL, AH, k);
    fft_real(BL, BH, k);

    for (int i = 0; i < M; i++) {
      C tmp = AL[i] * BL[i] + (AH[i] * BH[i]).rotl();
      BH[i] = AL[i] * BH[i] + (AH[i] * BL[i]).rotl();
      BL[i] = tmp;
    }

    ifft(BL, k);
    ifft(BH, k);

    using return_type =
        conditional_t<MOD + 1u == 0u, vector<__uint128_t>, vector<int>>;

    return_type u(l);
    double im = 1.0 / (4.0 * M);
    for (int i = 0; i < l; i++) {
      BL[i].x *= im, BL[i].y *= im;
      BH[i].x *= im, BH[i].y *= im;
      u64 s1 = round(BL[i].x);
      u64 s2 = round(BH[i].x) + round(BH[i].y);
      u64 s3 = round(BL[i].y);

      if constexpr (MOD == -1u) {
        u[i] += __uint128_t(s1);
        u[i] += __uint128_t(s2) * B;
        u[i] += __uint128_t(s3) * B * B;
      } else {
        u[i] += s1 % MOD;
        u[i] += s2 % MOD * B % MOD;
        if (u[i] >= MOD) u[i] -= MOD;
        u[i] += s3 % MOD * (B * B % MOD) % MOD;
        if (u[i] >= MOD) u[i] -= MOD;
      }
    }
    return u;
  }
};
vector<C> CooleyTukey::w;

}  // namespace ArbitraryModConvolution
#line 2 "ntt/complex-fft.hpp"

namespace ArbitraryModConvolution {

template <typename T>
struct Cp {
  T x, y;
  constexpr Cp() : x(0), y(0) {}
  constexpr Cp(T _x, T _y) : x(_x), y(_y) {}
  constexpr inline Cp operator+(const Cp& c) const {
    return Cp(x + c.x, y + c.y);
  }
  constexpr inline Cp operator-(const Cp& c) const {
    return Cp(x - c.x, y - c.y);
  }
  constexpr inline Cp operator*(const Cp& c) const {
    return Cp(x * c.x - y * c.y, x * c.y + y * c.x);
  }
  constexpr inline Cp operator-() const { return Cp(-x, -y); }
  constexpr inline Cp conj() const { return Cp(x, -y); }
  constexpr inline Cp rotl() const { return Cp(-y, x); }
  friend ostream& operator<<(ostream& os, const Cp& c) {
    os << "(" << c.x << ", " << c.y << ")" << endl;
    return os;
  }
};

using C = Cp<double>;
const long double PI = acosl(-1);

struct CooleyTukey {
  static vector<C> w;

  static void setw(int k) {
    --k;
    if ((int)w.size() >= (1 << k)) return;
    w.resize(1 << k);
    vector<Cp<long double>> base(k);
    const long double arg = PI / (1 << k);
    for (int i = 0, j = 1 << (k - 1); j; i++, j >>= 1) {
      complex<long double> z = exp(complex<long double>(1i) * (arg * j));
      base[i] = Cp<long double>{z.real(), z.imag()};
    }
    genw(0, k - 1, Cp<long double>{1, 0}, base);
  }

  static void genw(int i, int b, Cp<long double> z,
                   const vector<Cp<long double>>& base) {
    if (b == -1) {
      w[i].x = z.x, w[i].y = z.y;
    } else {
      genw(i, b - 1, z, base);
      genw(i | (1 << b), b - 1, z * base[b], base);
    }
  }

  static void fft(vector<C>& a, int k) {
    if (k <= 0) return;
    if (k == 1) {
      C a1 = a[1];
      a[1] = a[0] - a[1];
      a[0] = a[0] + a1;
      return;
    }
    if (k & 1) {
      int v = 1 << (k - 1);
      for (int j = 0; j < v; ++j) {
        C ajv = a[j + v];
        a[j + v] = a[j] - ajv;
        a[j] = a[j] + ajv;
      }
    }
    int u = 1 << (k & 1), v = 1 << (k - 2 - (k & 1));
    while (v) {
      {
        int j0 = 0;
        int j1 = v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        int je = v;
        for (; j0 < je; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1], t2 = a[j2], t3 = a[j3];
          C t0p2 = t0 + t2, t1p3 = t1 + t3;
          C t0m2 = t0 - t2, t1m3 = (t1 - t3) * w[1];
          a[j0] = t0p2 + t1p3, a[j1] = t0p2 - t1p3;
          a[j2] = t0m2 + t1m3, a[j3] = t0m2 - t1m3;
        }
      }
      // jh >= 1
      for (int jh = 1; jh < u; ++jh) {
        int j0 = jh * v * 4;
        int j1 = j0 + v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        int je = j1;
        C ww = w[jh];
        C xx = w[jh << 1];
        C wx = ww * xx;
        for (; j0 < je; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1] * xx, t2 = a[j2] * ww, t3 = a[j3] * wx;
          C t0p2 = t0 + t2, t1p3 = t1 + t3;
          C t0m2 = t0 - t2, t1m3 = (t1 - t3) * w[1];
          a[j0] = t0p2 + t1p3, a[j1] = t0p2 - t1p3;
          a[j2] = t0m2 + t1m3, a[j3] = t0m2 - t1m3;
        }
      }
      u <<= 2, v >>= 2;
    }
  }

  static void ifft(vector<C>& a, int k) {
    if ((int)a.size() <= 1) return;
    if (k == 1) {
      C a1 = a[1];
      a[1] = a[0] - a[1];
      a[0] = a[0] + a1;
      return;
    }
    int u = 1 << (k - 2);
    int v = 1;
    while (u) {
      // jh = 0
      {
        int j0 = 0;
        int j1 = v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        for (; j0 < v; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1], t2 = a[j2], t3 = a[j3];
          C t0p1 = t0 + t1, t2p3 = t2 + t3;
          C t0m1 = t0 - t1, t2m3 = (t2 - t3) * w[1].conj();
          a[j0] = t0p1 + t2p3, a[j2] = t0p1 - t2p3;
          a[j1] = t0m1 + t2m3, a[j3] = t0m1 - t2m3;
        }
      }
      // jh >= 1
      for (int jh = 1; jh < u; ++jh) {
        int j0 = (jh * v) << 2;
        int j1 = j0 + v;
        int j2 = j1 + v;
        int j3 = j2 + v;
        int je = j1;
        C ww = w[jh].conj();
        C xx = w[jh << 1].conj();
        C yy = w[(jh << 1) + 1].conj();
        for (; j0 < je; ++j0, ++j1, ++j2, ++j3) {
          C t0 = a[j0], t1 = a[j1], t2 = a[j2], t3 = a[j3];
          C t0p1 = t0 + t1, t2p3 = t2 + t3;
          C t0m1 = (t0 - t1) * xx, t2m3 = (t2 - t3) * yy;
          a[j0] = t0p1 + t2p3, a[j2] = (t0p1 - t2p3) * ww;
          a[j1] = t0m1 + t2m3, a[j3] = (t0m1 - t2m3) * ww;
        }
      }
      u >>= 2;
      v <<= 2;
    }
    if (k & 1) {
      u = 1 << (k - 1);
      for (int j = 0; j < u; j++) {
        C ajv = a[j] - a[j + u];
        a[j] = a[j] + a[j + u];
        a[j + u] = ajv;
      }
    }
  }

  static void fft_real(vector<C>& AL, vector<C>& AH, int k) {
    fft(AL, k);
    AH[0] = C{AL[0].y * 2.0, 0};
    AL[0] = C{AL[0].x * 2.0, 0};
    AH[1] = C{AL[1].y * 2.0, 0};
    AL[1] = C{AL[1].x * 2.0, 0};
    for (int i = 2, y = 2; y < (1 << k); y <<= 1) {
      for (; i < 2 * y; i += 2) {
        int j = i ^ (y - 1);
        AH[i] = (AL[j].conj() - AL[i]).rotl();
        AL[i] = (AL[j].conj() + AL[i]);
        AH[j] = AH[i].conj();
        AL[j] = AL[i].conj();
      }
    }
  }

  // naive convolution for int
  template <typename T, enable_if_t<is_integral<T>::value, nullptr_t> = nullptr>
  static vector<long long> multiply(const vector<T>& s, const vector<T>& t) {
    int l = s.size() + t.size() - 1;
    if (min(s.size(), t.size()) <= 40) {
      vector<long long> u(l);
      for (int i = 0; i < (int)s.size(); i++) {
        for (int j = 0; j < (int)t.size(); j++) u[i + j] += 1LL * s[i] * t[j];
      }
      return u;
    }

    int k = 2, M = 4;
    while (M < l) M <<= 1, ++k;
    setw(k);
    auto round = [](double x) -> long long {
      return (long long)(x + (x > 0 ? 0.5 : -0.5));
    };

    vector<C> a(M);
    for (int i = 0; i < (int)s.size(); i++) a[i].x = s[i];
    for (int i = 0; i < (int)t.size(); i++) a[i].y = t[i];
    fft(a, k);

    a[0].y = 4.0 * a[0].x * a[0].y;
    a[1].y = 4.0 * a[1].x * a[1].y;
    a[0].x = a[1].x = 0.0;
    for (int i = 2; i < M; i += 2) {
      int c = 1 << (31 - __builtin_clz(i));
      int j = i ^ (c - 1);
      a[i] = (a[i] + a[j].conj()) * (a[i] - a[j].conj());
      a[j] = -a[i].conj();
    }

    vector<C> b(M / 2);
    for (int j = 0; j < M / 2; j++) {
      C tmp1 = a[j * 2 + 0] + a[j * 2 + 1];
      C tmp2 = (a[j * 2 + 0] - a[j * 2 + 1]) * w[j].conj();
      b[j] = tmp1 + tmp2.rotl();
    }
    ifft(b, k - 1);

    vector<long long> u(l);
    for (int i = 0; i < l; i++) {
      if (i & 1) {
        u[i] = round(-b[i >> 1].x / (4.0 * M));
      } else {
        u[i] = round(b[i >> 1].y / (4.0 * M));
      }
    }
    return u;
  }

  static vector<double> multiply(const vector<double>& s,
                                 const vector<double>& t) {
    int l = s.size() + t.size() - 1;
    if (min(s.size(), t.size()) <= 40) {
      vector<double> u(l);
      for (int i = 0; i < (int)s.size(); i++) {
        for (int j = 0; j < (int)t.size(); j++) u[i + j] += s[i] * t[j];
      }
      return u;
    }

    int k = 2, M = 4;
    while (M < l) M <<= 1, ++k;
    setw(k);

    vector<C> a(M);
    for (int i = 0; i < (int)s.size(); i++) a[i].x = s[i];
    for (int i = 0; i < (int)t.size(); i++) a[i].y = t[i];
    fft(a, k);

    a[0].y = 4.0 * a[0].x * a[0].y;
    a[1].y = 4.0 * a[1].x * a[1].y;
    a[0].x = a[1].x = 0.0;
    for (int i = 2; i < M; i += 2) {
      int c = 1 << (31 - __builtin_clz(i));
      int j = i ^ (c - 1);
      a[i] = (a[i] + a[j].conj()) * (a[i] - a[j].conj());
      a[j] = -a[i].conj();
    }

    vector<C> b(M / 2);
    for (int j = 0; j < M / 2; j++) {
      C tmp1 = a[j * 2 + 0] + a[j * 2 + 1];
      C tmp2 = (a[j * 2 + 0] - a[j * 2 + 1]) * w[j].conj();
      b[j] = tmp1 + tmp2.rotl();
    }
    ifft(b, k - 1);

    vector<double> u(l);
    for (int i = 0; i < l; i++) {
      if (i & 1) {
        u[i] = -b[i >> 1].x / (4.0 * M);
      } else {
        u[i] = b[i >> 1].y / (4.0 * M);
      }
    }
    return u;
  }

  template <unsigned int MOD = -1u>
  static conditional_t<MOD == -1u, vector<__uint128_t>, vector<int>>
  multiply_15bit(const vector<int>& a, const vector<int>& b) {
    using u64 = unsigned long long;
    constexpr u64 B = 32000;
    int l = a.size() + b.size() - 1;
    int k = 2, M = 4;
    while (M < l) M <<= 1, ++k;
    setw(k);
    auto round = [](double x) -> u64 { return u64(x + 0.5); };

    vector<C> AL(M), AH(M), BL(M), BH(M);
    for (int i = 0; i < (int)a.size(); i++) {
      AL[i] = C{double(a[i] % B), double(a[i] / B)};
    }
    for (int i = 0; i < (int)b.size(); i++) {
      BL[i] = C{double(b[i] % B), double(b[i] / B)};
    }

    fft_real(AL, AH, k);
    fft_real(BL, BH, k);

    for (int i = 0; i < M; i++) {
      C tmp = AL[i] * BL[i] + (AH[i] * BH[i]).rotl();
      BH[i] = AL[i] * BH[i] + (AH[i] * BL[i]).rotl();
      BL[i] = tmp;
    }

    ifft(BL, k);
    ifft(BH, k);

    using return_type =
        conditional_t<MOD + 1u == 0u, vector<__uint128_t>, vector<int>>;

    return_type u(l);
    double im = 1.0 / (4.0 * M);
    for (int i = 0; i < l; i++) {
      BL[i].x *= im, BL[i].y *= im;
      BH[i].x *= im, BH[i].y *= im;
      u64 s1 = round(BL[i].x);
      u64 s2 = round(BH[i].x) + round(BH[i].y);
      u64 s3 = round(BL[i].y);

      if constexpr (MOD == -1u) {
        u[i] += __uint128_t(s1);
        u[i] += __uint128_t(s2) * B;
        u[i] += __uint128_t(s3) * B * B;
      } else {
        u[i] += s1 % MOD;
        u[i] += s2 % MOD * B % MOD;
        if (u[i] >= MOD) u[i] -= MOD;
        u[i] += s3 % MOD * (B * B % MOD) % MOD;
        if (u[i] >= MOD) u[i] -= MOD;
      }
    }
    return u;
  }
};
vector<C> CooleyTukey::w;

}  // namespace ArbitraryModConvolution
Back to top page