#include #include #include #include #include #include namespace sangi {

// ========================================================================
// メイン sqrt 関数
// ========================================================================

Int IntSqrt::sqrt(const Int& value) {
    // 特殊状態の処理
    if (value.isNaN()) [[unlikely]] {
        return Int::NaN();
    }

    if (value.isNegative()) [[unlikely]] {
        // 負の数の平方根は NaN
        Int result = Int::NaN();
        result.setState(NumericState::NaN, NumericError::NegativeSqrt);
        return result;
    }

    if (value.getState() == NumericState::PositiveInfinity) [[unlikely]] {
        return Int::PositiveInfinity();
    }

    if (value.isZero()) [[unlikely]] {
        return Int::Zero();
    }

    if (value.isOne()) [[unlikely]] {
        return Int::One();
    }

    // 1 limb fast path: isqrt(uint64_t) を直接計算
    size_t an = value.size();
    if (an == 1) {
        uint64_t v = value.word(0);
        uint64_t s = static_cast (std::sqrt(static_cast (v)));
        // Newton 補正 (double の丸め誤差)
        if (s > 0 && s * s > v) s--;
        if ((s + 1) * (s + 1) <= v) s++;
        return Int(s);
    }

    // 2+ limbs: mpn レベル Newton sqrt
    {
        // value.data() を直接参照 (コピー不要 — sqrtrem は ap を書き換えないため)
        const uint64_t* ap = value.data();
        size_t sn = (an + 1) / 2;

        // スタックバッファで小サイズの arena アクセスを回避
        constexpr size_t STACK_LIMIT = 64;
        uint64_t sp_stack[STACK_LIMIT];
        size_t scratch_sz = mpn::sqrtrem_scratch_size(an);
        uint64_t scratch_stack[STACK_LIMIT * 4];

        uint64_t* sp;
        uint64_t* scratch;
        ScratchScope scope;
        if (sn <= STACK_LIMIT && scratch_sz <= STACK_LIMIT * 4) {
            sp = sp_stack;
            scratch = scratch_stack;
        } else {
            sp = getThreadArena().alloc_limbs(sn);
            scratch = getThreadArena().alloc_limbs(scratch_sz);
        }

        size_t result_n = mpn::sqrtrem(sp, nullptr, ap, an, scratch);
        if (result_n == 0) return Int::Zero();
        return Int::fromRawWords(std::span (sp, result_n), 1);
    }
}

// ========================================================================
// sqrt_small: double 精度での平方根計算
// ========================================================================

Int IntSqrt::sqrt_small(const Int& value) {
    // MSB < 52 ビットの場合、double で正確に計算できる
    double x = value.toDouble();
    double sqrt_x = std::sqrt(x);
    uint64_t result = static_cast (sqrt_x);
    return Int(result);
}

// ========================================================================
// sqrtRem: 平方根と余りを返す
// ========================================================================

Int IntSqrt::sqrtRem(const Int& value, Int& remainder) {
    // 特殊状態の処理
    if (value.isNaN()) {
        remainder = Int::NaN();
        return Int::NaN();
    }

    if (value.isNegative()) {
        Int result = Int::NaN();
        result.setState(NumericState::NaN, NumericError::NegativeSqrt);
        remainder = Int::NaN();
        return result;
    }

    if (value.getState() == NumericState::PositiveInfinity) {
        remainder = Int::NaN();  // 余りは定義できない
        return Int::PositiveInfinity();
    }

    if (value.isZero()) {
        remainder = Int::Zero();
        return Int::Zero();
    }

    if (value.isOne()) {
        remainder = Int::Zero();
        return Int::One();
    }

    // mpn レベル sqrtrem で平方根と余りを同時に計算
    size_t bit_length = value.bitLength();
    if (bit_length < 52) {
        Int s = sqrt_small(value);
        Int s_sq; IntOps::square(s, s_sq);
        remainder = value - s_sq;
        return s;
    }

    {
        size_t an = value.size();
        ScratchScope scope;
        uint64_t* ap = getThreadArena().alloc_limbs(an);
        for (size_t i = 0; i < an; i++) ap[i] = value.word(i);

        size_t sn = (an + 1) / 2;
        uint64_t* sp = getThreadArena().alloc_limbs(sn);
        uint64_t* rem_p = getThreadArena().alloc_limbs(an);
        std::memset(rem_p, 0, an * sizeof(uint64_t));
        size_t scratch_sz = mpn::sqrtrem_scratch_size(an);
        uint64_t* scratch = getThreadArena().alloc_limbs(scratch_sz);

        size_t result_n = mpn::sqrtrem(sp, rem_p, ap, an, scratch);

        size_t rem_n = mpn::normalized_size(rem_p, an);
        if (rem_n == 0) {
            remainder = Int::Zero();
        } else {
            remainder = Int::fromRawWords(std::span (rem_p, rem_n), 1);
        }

        if (result_n == 0) return Int::Zero();
        return Int::fromRawWords(std::span (sp, result_n), 1);
    }
}

// ========================================================================
// isSquare: 完全平方数判定
// GMP perfsqr.c に学んだ 3 層フィルタ:
//   Filter 1: up[0] % 256 ビットテーブル (O(1), 82.81% 棄却)
//   Filter 2: mod_34lsub1 + 小素数の平方剰余チェック (O(n), ~97.81% 追加棄却)
//   Filter 3: sqrtRem で最終検証
//   合計: 99.62% の非平方数を sqrtrem 前に棄却
// ========================================================================

// ================================================================
// consteval によるコンパイル時テーブル生成
// ================================================================

// mod p の平方剰余ビットマスクをコンパイル時に生成 (p < 64)
consteval uint64_t make_qr_mask(unsigned p) {
    uint64_t mask = 0;
    for (unsigned i = 0; i < p; i++)
        mask |= 1ULL << ((i * i) % p);
    return mask;
}

// mod p の平方剰余ビットマスクをコンパイル時に生成 (p >= 64, 2 limb)
struct QRMask128 { uint64_t lo; uint64_t hi; };
consteval QRMask128 make_qr_mask_wide(unsigned p) {
    uint64_t lo = 0, hi = 0;
    for (unsigned i = 0; i < p; i++) {
        unsigned r = (i * i) % p;
        if (r < 64) lo |= 1ULL << r;
        else        hi |= 1ULL << (r - 64);
    }
    return {lo, hi};
}

// up[0] % 256 の平方剰余ビットテーブル (GMP perfsqr.h の sq_res_0x100 相当)
// ビットが 1 なら平方剰余の可能性あり、0 なら確実に非平方数
consteval std::array make_sq_res_256() {
    std::array t{};
    for (unsigned i = 0; i < 256; i++) {
        unsigned r = (i * i) & 0xFF;
        t[r / 64] |= 1ULL << (r % 64);
    }
    return t;
}
static constexpr auto SQ_RES_256 = make_sq_res_256();

// 各素数の平方剰余マスク (コンパイル時検証可能)
static constexpr uint64_t QR_9  = make_qr_mask(9);   // 棄却率 55.56%
static constexpr uint64_t QR_5  = make_qr_mask(5);   // 棄却率 40%
static constexpr uint64_t QR_7  = make_qr_mask(7);   // 棄却率 42.86%
static constexpr uint64_t QR_13 = make_qr_mask(13);  // 棄却率 46.15%
static constexpr uint64_t QR_17 = make_qr_mask(17);  // 棄却率 47.06%
static constexpr auto QR_97 = make_qr_mask_wide(97);  // 棄却率 49.48%

// ★ GMP 風 packed 合成数テーブル (% 回数削減)
// 7*13=91 と 5*17=85 の平方剰余を packed ビットテーブルで一括チェック
static constexpr auto QR_91 = make_qr_mask_wide(91);  // 7*13
static constexpr auto QR_85 = make_qr_mask_wide(85);  // 5*17

// ★ 追加フィルタ: mod_34lsub1 以外の小素数でも検査
// 11, 19, 23, 29, 31 を追加 (総棄却率をさらに向上)
static constexpr uint64_t QR_11 = make_qr_mask(11);
static constexpr uint64_t QR_19 = make_qr_mask(19);
static constexpr uint64_t QR_23 = make_qr_mask(23);
static constexpr uint64_t QR_29 = make_qr_mask(29);
static constexpr uint64_t QR_31 = make_qr_mask(31);

// mod_34lsub1 の結果 r を小素数で平方剰余チェック
// ★ GMP 風 packed test: 6回→4回の % + 追加 5 素数
static bool check_sq_residue_mod34(uint64_t r) {
    constexpr uint64_t M48 = (1ULL << 48) - 1;
    r = (r & M48) + (r >> 48);

    // GMP 風 packed: 91=7*13, 85=5*17 で 2 回の % で 4 素数分をカバー
    if (((QR_9  >> (r % 9))  & 1) == 0) return false;
    { uint64_t r91 = r % 91;
      if (r91 < 64) { if (((QR_91.lo >> r91) & 1) == 0) return false; }
      else { if (((QR_91.hi >> (r91 - 64)) & 1) == 0) return false; }
    }
    { uint64_t r85 = r % 85;
      if (r85 < 64) { if (((QR_85.lo >> r85) & 1) == 0) return false; }
      else { if (((QR_85.hi >> (r85 - 64)) & 1) == 0) return false; }
    }
    { uint64_t r97 = r % 97;
      if (r97 < 64) { if (((QR_97.lo >> r97) & 1) == 0) return false; }
      else { if (((QR_97.hi >> (r97 - 64)) & 1) == 0) return false; }
    }
    return true;
}

// ★ 追加フィルタ: word(0) から直接小素数チェック (mod_34lsub1 不要)
[[maybe_unused]] static bool check_sq_residue_extra(uint64_t w0) {
    if (((QR_11 >> (w0 % 11)) & 1) == 0) return false;
    if (((QR_19 >> (w0 % 19)) & 1) == 0) return false;
    if (((QR_23 >> (w0 % 23)) & 1) == 0) return false;
    if (((QR_29 >> (w0 % 29)) & 1) == 0) return false;
    if (((QR_31 >> (w0 % 31)) & 1) == 0) return false;
    return true;
}

bool IntSqrt::isSquare(const Int& value, Int* pSqrt) {
    // 特殊状態の処理
    if (value.isNaN()) [[unlikely]] {
        if (pSqrt) *pSqrt = Int::NaN();
        return false;
    }

    if (value.isInfinite()) [[unlikely]] {
        if (pSqrt) *pSqrt = Int::NaN();
        return false;
    }

    // 負の数は平方数ではない
    if (value.isNegative()) [[unlikely]] {
        return false;
    }

    // 0 と 1 は平方数
    if (value.isZero()) [[unlikely]] {
        if (pSqrt) *pSqrt = Int::Zero();
        return true;
    }

    if (value.isOne()) [[unlikely]] {
        if (pSqrt) *pSqrt = Int::One();
        return true;
    }

    // Filter 1: up[0] % 256 ビットテーブル (O(1), 82.81% 棄却)
    {
        unsigned idx = static_cast (value.word(0) & 0xFF);
        if (((SQ_RES_256[idx / 64] >> (idx % 64)) & 1) == 0)
            return false;
    }

    // Filter 1b: 除去 — check_sq_residue_extra は word(0) % p を使うが
    // 多ワード数では value % p ≠ word(0) % p なので偽陰性が発生する

    // Filter 2: mod_34lsub1 ベースの平方剰余フィルタ (O(n), GMP 風 packed test)
    {
        size_t vn = value.size();
        constexpr size_t BUF = 128;
        uint64_t stack_buf[BUF];
        uint64_t* wp = (vn <= BUF) ? stack_buf : new uint64_t[vn];
        for (size_t i = 0; i < vn; i++) wp[i] = value.word(i);
        uint64_t r = mpn::mod_34lsub1(wp, vn);
        if (wp != stack_buf) delete[] wp;
        if (!check_sq_residue_mod34(r))
            return false;
    }

    // Filter 3: mpn::sqrtrem_check_exact で最終検証
    // ★ Codex 指摘: sqrtRem (Int 構築) ではなく bool 専用パスを使用
    {
        size_t an = value.size();
        ScratchScope scope;
        uint64_t* ap = getThreadArena().alloc_limbs(an);
        for (size_t i = 0; i < an; i++) ap[i] = value.word(i);
        size_t sn = (an + 1) / 2;
        uint64_t* sp = getThreadArena().alloc_limbs(sn);
        size_t scratch_sz = mpn::sqrtrem_scratch_size(an);
        uint64_t* scratch = getThreadArena().alloc_limbs(scratch_sz);
        auto [result_n, is_exact] = mpn::sqrtrem_check_exact(sp, ap, an, scratch);
        if (pSqrt && is_exact && result_n > 0) {
            *pSqrt = Int::fromRawWords(std::span (sp, result_n), 1);
        }
        return is_exact;
    }
}


// ★ mpn 直接べき乗: pow(x, n-1) を Int::pow より高速に計算
// n=3: x^2, n=5: x^4, n=7: x^6 を addition chain で構築
// 戻り値: {x^(n-1), x^n} (x^n は optional)
// ★ limb 直接返し版: アロケータが返すポインタをそのまま返す (コピー不要)
// 呼び出し元がバッファの寿命を管理すること
struct MpnPowResult {
    const uint64_t* nm1; size_t nm1_n;  // x^(n-1)
    const uint64_t* pn;  size_t pn_n;   // x^n (need_xn のみ)
};

// ★ Tier 3: アロケータをテンプレート化 — arena でもローカル bump でも使える
template static MpnPowResult mpn_pow_for_root_impl(
    const uint64_t* xp, size_t xn_sz, uint32_t n, bool need_xn,
    AllocFn alloc_fn)
{
    auto alloc_and_square = [&](const uint64_t* a, size_t an) -> std::pair {
        size_t rn = 2 * an;
        uint64_t* r = alloc_fn(rn);
        uint64_t* sc = alloc_fn(mpn::square_scratch_size(an));
        mpn::square(r, a, an, sc);
        return {r, mpn::normalized_size(r, rn)};
    };
    auto alloc_and_mul = [&](const uint64_t* a, size_t an, const uint64_t* b, size_t bn) -> std::pair {
        size_t rn = an + bn;
        uint64_t* r = alloc_fn(rn);
        uint64_t* sc = alloc_fn(mpn::multiply_scratch_size(an, bn));
        mpn::multiply(r, a, an, b, bn, sc);
        return {r, mpn::normalized_size(r, rn)};
    };

    MpnPowResult res = {nullptr, 0, nullptr, 0};
    if (n == 3) {
        auto [x2, x2n] = alloc_and_square(xp, xn_sz);
        res.nm1 = x2; res.nm1_n = x2n;
        if (need_xn) { auto [x3, x3n] = alloc_and_mul(x2, x2n, xp, xn_sz); res.pn = x3; res.pn_n = x3n; }
    } else if (n == 5) {
        auto [x2, x2n] = alloc_and_square(xp, xn_sz);
        auto [x4, x4n] = alloc_and_square(x2, x2n);
        res.nm1 = x4; res.nm1_n = x4n;
        if (need_xn) { auto [x5, x5n] = alloc_and_mul(x4, x4n, xp, xn_sz); res.pn = x5; res.pn_n = x5n; }
    } else if (n == 7) {
        auto [x2, x2n] = alloc_and_square(xp, xn_sz);
        auto [x3, x3n] = alloc_and_mul(x2, x2n, xp, xn_sz);
        auto [x6, x6n] = alloc_and_square(x3, x3n);
        res.nm1 = x6; res.nm1_n = x6n;
        if (need_xn) { auto [x7, x7n] = alloc_and_mul(x6, x6n, xp, xn_sz); res.pn = x7; res.pn_n = x7n; }
    } else {
        // 一般 n: binary exponentiation で x^(n-1) を計算
        uint32_t e = n - 1;

        // result = x (初期値)
        size_t rn = xn_sz;
        uint64_t* rp = alloc_fn(xn_sz);
        std::memcpy(rp, xp, xn_sz * sizeof(uint64_t));

        int top_bit = 31 - std::countl_zero(e);
        for (int bit = top_bit - 1; bit >= 0; bit--) {
            auto [sq, sqn] = alloc_and_square(rp, rn);
            rp = sq; rn = sqn;

            if (e & (1u << bit)) {
                auto [prod, prodn] = alloc_and_mul(rp, rn, xp, xn_sz);
                rp = prod; rn = prodn;
            }
        }
        res.nm1 = rp; res.nm1_n = rn;
        if (need_xn) {
            auto [xn_p, xn_n] = alloc_and_mul(rp, rn, xp, xn_sz);
            res.pn = xn_p; res.pn_n = xn_n;
        }
    }
    return res;
}

// ★ arena 版 (PD ループ外・mpn_pow_for_root から使用)
static MpnPowResult mpn_pow_for_root_raw(
    const uint64_t* xp, size_t xn_sz, uint32_t n, bool need_xn)
{
    return mpn_pow_for_root_impl(xp, xn_sz, n, need_xn,
        [](size_t s) { return getThreadArena().alloc_limbs(s); });
}

// ★ Tier 3: PD ループ 1 ステップ分の最大 scratch サイズを計算
// pow(need_xn=false) + 除算 + 加算の全バッファを含む
static size_t pd_step_scratch_size(uint32_t n, size_t max_xn, size_t val_n) {
    auto a8 = [](size_t s) -> size_t { return (s + 7) & ~size_t(7); };
    size_t total = 0;

    // ap_buf (value >> shift)
    total += a8(val_n + 1);

    if (n == 3) {
        // x²
        total += a8(2 * max_xn);
        total += a8(mpn::square_scratch_size(max_xn));
        // quotient
        total += a8(val_n + 2);
        // divide_q_approx scratch (余り不要, 補正乗算なし)
        if (2 * max_xn >= 2)
            total += a8(mpn::divide_q_approx_scratch_size(val_n, 2 * max_xn));
        // tp (2*x + quot)
        total += a8(val_n + 4);
    } else {
        // pow(x, n-1) — bump allocator 上に展開される全中間結果
        // n=3,5,7 は専用パス、一般 n は binary exp
        if (n == 5) {
            total += a8(2 * max_xn) + a8(mpn::square_scratch_size(max_xn));     // x²
            total += a8(4 * max_xn) + a8(mpn::square_scratch_size(2 * max_xn)); // x⁴
        } else if (n == 7) {
            total += a8(2 * max_xn) + a8(mpn::square_scratch_size(max_xn));                    // x²
            total += a8(3 * max_xn) + a8(mpn::multiply_scratch_size(2 * max_xn, max_xn));      // x³
            total += a8(6 * max_xn) + a8(mpn::square_scratch_size(3 * max_xn));                 // x⁶
        } else {
            // binary exp of (n-1)
            uint32_t e = n - 1;
            int top_bit = 31 - std::countl_zero(e);
            total += a8(max_xn);  // initial copy
            size_t cur = max_xn;
            for (int bit = top_bit - 1; bit >= 0; bit--) {
                total += a8(2 * cur) + a8(mpn::square_scratch_size(cur));
                cur = 2 * cur;
                if (e & (1u << bit)) {
                    total += a8(cur + max_xn) + a8(mpn::multiply_scratch_size(cur, max_xn));
                    cur = cur + max_xn;
                }
            }
        }
        size_t max_pow_sz = (n - 1) * max_xn;
        // quotient
        total += a8(val_n + 2);
        // divide_q_approx scratch (余り不要, 補正乗算なし)
        if (max_pow_sz >= 2)
            total += a8(mpn::divide_q_approx_scratch_size(val_n, max_pow_sz));
        // tp ((n-1)*x + quot)
        total += a8(val_n + 4);
    }
    return total;
}

// Int 版ラッパー (PD ループ外で使用)
// 全ての n に対して mpn_pow_for_root_raw を使用 (Int::pow フォールバック不要)
static std::pair mpn_pow_for_root(const Int& x, uint32_t n, bool need_xn) {
    if (x.isZero() || x.isOne()) return {x, x};
    ScratchScope scope;
    size_t xn_sz = x.size();
    uint64_t* xbuf = getThreadArena().alloc_limbs(xn_sz);
    for (size_t i = 0; i < xn_sz; i++) xbuf[i] = x.word(i);
    auto r = mpn_pow_for_root_raw(xbuf, xn_sz, n, need_xn);
    Int pm1 = Int::fromRawWords(std::span (r.nm1, r.nm1_n), 1);
    Int pn = (need_xn && r.pn) ? Int::fromRawWords(std::span (r.pn, r.pn_n), 1) : Int(0);
    return {pm1, pn};
}

Int IntSqrt::nthRoot(const Int& value, uint32_t n) {
    // n = 0 または n = 1 の特殊ケース
    if (n == 0) {
        // n = 0 は無効な引数
        return Int::NaN();
    }

    if (n == 1) {
        return value;
    }

    // n = 2 の場合は sqrt を使用
    if (n == 2) {
        return sqrt(value);
    }

    // 特殊状態の処理
    if (value.isNaN()) {
        return Int::NaN();
    }

    // 負の値の処理
    if (value.isNegative()) {
        // n が奇数の場合は負の根を返す: -root(|value|, n)
        if (n & 1) {
            Int result = nthRoot(-value, n);
            return -result;
        } else {
            // n が偶数の場合は NaN（負の偶数乗根は実数で存在しない）
            Int result = Int::NaN();
            result.setState(NumericState::NaN, NumericError::NegativeSqrt);
            return result;
        }
    }

    if (value.getState() == NumericState::PositiveInfinity) {
        return Int::PositiveInfinity();
    }

    if (value.isZero()) {
        return Int::Zero();
    }

    if (value.isOne()) {
        return Int::One();
    }

    // Precision Doubling Newton + floor 検証
    //
    // アルゴリズム概要 (Brent & Zimmermann, Modern Computer Arithmetic §1.5.2):
    //   1. double 近似で ~53/n ビットの初期値を取得
    //   2. 精度倍増 Newton: 各ステップで正確ビット数を約2倍に拡大
    //      精度スケジュール: c_prev = ceil((c + L) / 2), L = 2*ceil(log2(n)) + 2
    //   3. 最終ステップで remainder 追跡による Newton 補正 + floor 保証
    //
    // 計算量:
    //   PD ループ: pow(x, n-1) を各ステップで計算。等比級数の性質により
    //   全ステップの合計コストは最終ステップの約2倍（≈ 2·M(P)·log(n)）。
    //   後処理: 最終ステップの pow を Newton 補正と floor 検証に再利用。
    //   Newton 補正 Q > 0 の場合のみ検証用 pow を1回追加。

    int B = static_cast (value.bitLength());
    int ni = static_cast (n);

    // value < 2^n の場合、root は 1
    if (B <= ni) {
        return Int::One();
    }

    // --- 初期近似 (double ベース) ---
    Int x;
    {
        int shift = (B > 53) ? (B - 53) : 0;
        double m_d = (shift > 0) ? (value >> shift).toDouble() : value.toDouble();
        int q = shift / ni;
        int r = shift % ni;
        double root_d = std::pow(m_d, 1.0 / n) * std::exp2(static_cast (r) / n);
        int64_t ri = std::max(static_cast (1), static_cast (root_d));
        x = (q > 0) ? (Int(ri) << q) : Int(ri);
    }
    if (x.isZero()) x = Int::One();

    const Int n_minus_1(static_cast (n - 1));
    const Int n_int(static_cast (n));

    int P = (B + ni - 1) / ni;     // root の目標ビット精度
    int x_bits = static_cast (x.bitLength());

    // double 近似の正確なビット数
    int P0 = std::min(x_bits, std::max(53 / ni - 1, 4));

    if (P > P0 + 10) {
        // === Precision Doubling Newton (Brent-Zimmermann 式) ===

        // ceil(log2(n))
        int logk = 0;
        { unsigned tmp = n - 1; while (tmp > 0) { logk++; tmp >>= 1; } }
        int L = 2 * logk + 2;

        // 精度スケジュールを P から逆向きに構築
        int sizes[64];
        int ns = 0;
        sizes[0] = P;
        while (sizes[ns] > P0 && ns < 60) {
            int next = (sizes[ns] + L + 1) / 2;
            if (next >= sizes[ns]) break;
            sizes[ns + 1] = next;
            ns++;
        }
        // 正順に反転: sizes[0]=最小, sizes[ns]=P
        for (int i = 0, j = ns; i < j; i++, j--) {
            std::swap(sizes[i], sizes[j]);
        }

        // 初期精度に切り詰め
        if (x_bits > sizes[0]) {
            x = x >> (x_bits - sizes[0]);
        }
        int cur_prec = static_cast (x.bitLength());

        // --- PD Newton ループ: step 1 .. ns-1 ---
        // ★ Tier 3: scratch 事前確保 — ループ内の TLS lookup と mark/rewind を完全除去
        {
            ScratchScope pd_scope;
            // x を limb 配列に展開
            size_t max_limbs = (P + 63) / 64 + 4;
            uint64_t* xp = getThreadArena().alloc_limbs(max_limbs);
            size_t xn = x.size();
            for (size_t i = 0; i < xn; i++) xp[i] = x.word(i);

            // ★ Tier 3: PD ループ全ステップ分の scratch を一括確保
            size_t val_n = value.size();
            size_t pd_scratch_sz = pd_step_scratch_size(n, max_limbs, val_n);
            uint64_t* pd_buf = getThreadArena().alloc_limbs(pd_scratch_sz);

            // ★ Tier 5: step ns を含む (最終 Newton ステップも PD ループ内で実行)
            // → 別途 Newton 補正 Q を計算する必要なし、ループ後は floor 検証のみ
            for (int step = 1; step <= ns; step++) {
                // ★ ローカル bump allocator: 毎ステップ先頭にリセット
                size_t buf_off = 0;
                auto balloc = [&](size_t s) -> uint64_t* {
                    if (s == 0) return pd_buf;  // 0 サイズは no-op
                    uint64_t* p = pd_buf + buf_off;
                    buf_off += (s + 7) & ~size_t(7);
                    return p;
                };

                int new_prec = sizes[step];
                int extend = new_prec - cur_prec;
                if (extend > 0) {
                    size_t limb_sh = extend / 64;
                    unsigned bit_sh = extend % 64;
                    size_t new_xn = xn + limb_sh + (bit_sh > 0 ? 1 : 0);
                    if (new_xn > max_limbs) new_xn = max_limbs;
                    if (bit_sh > 0) {
                        uint64_t carry = mpn::lshift(xp + limb_sh, xp, xn, bit_sh);
                        if (xn + limb_sh < max_limbs) xp[xn + limb_sh] = carry;
                    } else if (limb_sh > 0) {
                        for (size_t i = xn; i > 0; i--) xp[i - 1 + limb_sh] = xp[i - 1];
                    }
                    for (size_t i = 0; i < limb_sh; i++) xp[i] = 0;
                    xn = mpn::normalized_size(xp, new_xn);
                }

                // ★ Tier 1: trailing zero stripping
                // precision extension で追加されたゼロリムを検出し、
                // pow と除算を非ゼロ部分のみで実行（オペランド半減）
                size_t x_tz = 0;
                while (x_tz < xn && xp[x_tz] == 0) x_tz++;
                size_t xn_nz = xn - x_tz;
                if (xn_nz == 0) { cur_prec = 0; continue; }

                // a = value >> a_shift (mpn 直接シフト)
                int new_scale = P - new_prec;
                int a_shift = ni * new_scale;
                size_t a_limb_off = static_cast (a_shift) / 64;
                unsigned a_bit_off = static_cast (a_shift) % 64;
                size_t a_n = (a_limb_off < val_n) ? (val_n - a_limb_off) : 0;
                // a をローカルバッファ上に構築
                uint64_t* ap_buf = nullptr;
                if (a_n > 0 && a_shift > 0) {
                    ap_buf = balloc(a_n + 1);
                    for (size_t i = 0; i < a_n; i++) ap_buf[i] = value.word(i + a_limb_off);
                    if (a_bit_off > 0) mpn::rshift(ap_buf, ap_buf, a_n, a_bit_off);
                    a_n = mpn::normalized_size(ap_buf, a_n);
                } else if (a_n > 0) {
                    ap_buf = balloc(a_n);
                    for (size_t i = 0; i < a_n; i++) ap_buf[i] = value.word(i);
                }
                if (a_n == 0) { cur_prec = 0; continue; }

                // ★ Tier 1: a の下位を除算精度に影響しない範囲で除去
                // x^(n-1) の trailing zero = (n-1)*x_tz limbs 分を a からも除去
                size_t div_tz = (n == 3) ? 2 * x_tz : static_cast (n - 1) * x_tz;
                if (div_tz >= a_n) div_tz = (a_n > 1) ? a_n - 1 : 0;

              if (n == 3) {
                // ★ n=3 専用: x_new = (2*x + a/x²) / 3
                // x² は非ゼロ部分のみで計算 (trailing zero 暗黙)
                size_t x2n = 2 * xn_nz;
                uint64_t* x2p = balloc(x2n);
                uint64_t* sq_sc = balloc(mpn::square_scratch_size(xn_nz));
                mpn::square(x2p, xp + x_tz, xn_nz, sq_sc);
                x2n = mpn::normalized_size(x2p, x2n);

                // quot = a_eff / x²_nz (trailing zero 除去後)
                const uint64_t* a_div = ap_buf + div_tz;
                size_t a_div_n = a_n - div_tz;
                uint64_t* qp_div = nullptr; size_t qn_div = 0;
                if (a_div_n >= x2n && x2n >= 2) {
                    size_t q_sz = a_div_n - x2n + 1;
                    qp_div = balloc(q_sz + 2);
                    size_t dsz = mpn::divide_q_approx_scratch_size(a_div_n, x2n);
                    uint64_t* dsc = balloc(dsz);
                    qn_div = mpn::divide_q_approx(qp_div, a_div, a_div_n, x2p, x2n, dsc);
                    qn_div = mpn::normalized_size(qp_div, qn_div);
                } else if (x2n == 1 && a_div_n > 0) {
                    qp_div = balloc(a_div_n);
                    mpn::divmod_1(qp_div, a_div, a_div_n, x2p[0]);
                    qn_div = mpn::normalized_size(qp_div, a_div_n);
                }

                // tp = 2*x + quot (x はフルサイズ xn のまま)
                size_t tn = xn + 1;
                uint64_t* tp = balloc(std::max(tn, qn_div) + 2);
                tp[xn] = mpn::mul_1(tp, xp, xn, 2ULL); // 2*x
                tn = mpn::normalized_size(tp, tn);
                if (qn_div > 0 && qp_div) {
                    if (qn_div > tn) { for (size_t i = tn; i < qn_div; i++) tp[i] = 0; tn = qn_div; }
                    uint64_t carry = mpn::add(tp, tp, tn, qp_div, qn_div);
                    if (carry) { tp[tn] = carry; tn++; }
                }

                // xp = tp / 3
                mpn::divmod_1(xp, tp, tn, 3ULL);
                xn = mpn::normalized_size(xp, tn);
              } else {
                // 一般パス (n=5,7,11,...) — 非ゼロ部分のみで pow 実行
                MpnPowResult pw = mpn_pow_for_root_impl(xp + x_tz, xn_nz, n, false, balloc);

                // quot = a_eff / xpm1_nz (trailing zero 除去後)
                const uint64_t* a_div = ap_buf + div_tz;
                size_t a_div_n = a_n - div_tz;
                size_t p_sz = pw.nm1_n;
                uint64_t* qp_div = nullptr; size_t qn_div = 0;
                if (a_div_n >= p_sz && p_sz >= 2) {
                    size_t q_sz = a_div_n - p_sz + 1;
                    qp_div = balloc(q_sz + 2);
                    size_t dsz = mpn::divide_q_approx_scratch_size(a_div_n, p_sz);
                    uint64_t* dsc = balloc(dsz);
                    qn_div = mpn::divide_q_approx(qp_div, a_div, a_div_n, pw.nm1, p_sz, dsc);
                    qn_div = mpn::normalized_size(qp_div, qn_div);
                } else if (p_sz == 1 && a_div_n > 0) {
                    qp_div = balloc(a_div_n);
                    mpn::divmod_1(qp_div, a_div, a_div_n, pw.nm1[0]);
                    qn_div = mpn::normalized_size(qp_div, a_div_n);
                }

                uint64_t nm1_word = static_cast (n - 1);
                size_t tn = xn + 1;
                uint64_t* tp = balloc(std::max(tn, qn_div) + 2);
                tp[xn] = mpn::mul_1(tp, xp, xn, nm1_word);
                tn = mpn::normalized_size(tp, tn);

                // tp += quot (limb 直接)
                if (qn_div > 0 && qp_div) {
                    if (qn_div > tn) {
                        for (size_t i = tn; i < qn_div; i++) tp[i] = 0;
                        tn = qn_div;
                    }
                    uint64_t carry = mpn::add(tp, tp, tn, qp_div, qn_div);
                    if (carry) { tp[tn] = carry; tn++; }
                }

                // xp = tp / n (1 word 除算)
                mpn::divmod_1(xp, tp, tn, static_cast (n));
                xn = mpn::normalized_size(xp, tn);
              } // end else (general n)

                cur_prec = static_cast (xn * 64 - (xn > 0 ? std::countl_zero(xp[xn-1]) : 64));
            }

            // xp → x に戻す (step ns 完了後、full P bits)
            x = Int::fromRawWords(std::span (xp, xn), 1);
        }
        // ★ Tier 5: PD ループが step ns まで実行したため、
        // 別途 Newton 補正は不要。下の floor 検証に直接委譲。
    } else {
        // 小さい root: 標準 Newton（PD 不要）
        constexpr int MAX_ITER = 200;
        for (int iter = 0; iter < MAX_ITER; iter++) {
            auto [xpm1, _xpn_unused] = mpn_pow_for_root(x, n, false);
            Int quot;
            IntOps::divUnchecked(value, xpm1, quot);
            // x_new = (n_minus_1 * x + quot) / n_int (Unchecked: 特殊状態なし)
            Int tmp;
            IntOps::mulUnchecked(n_minus_1, x, tmp);
            IntOps::addUnchecked(tmp, quot, tmp);
            Int x_new;
            IntOps::divUnchecked(tmp, n_int, x_new);
            if (x_new == x) break;
            Int diff = (x_new > x) ? (x_new - x) : (x - x_new);
            if (diff <= Int::One()) {
                if (x_new < x) x = x_new;
                break;
            }
            x = x_new;
        }
    }

    // === 小さい root 用の最終検証 ===
    {
        auto [xpm1, _xpn_unused] = mpn_pow_for_root(x, n, false);
        Int R = value - xpm1 * x;

        while (R.isNegative()) {
            x = x - Int::One();
            if (x.isZero()) return x;
            xpm1 = mpn_pow_for_root(x, n, false).first;
            R = value - xpm1 * x;
        }

        // ★ Codex 改善 #6: pow(x+1,n) を差分判定に置換
        // (x+1)^n - x^n = Σ_{k=0}^{n-1} C(n,k) * x^k
        // 一次項 = n*x^(n-1), R >= 一次項 なら undershoot の可能性
        // 安全判定: R >= n*xpm1 なら (x+1)^n ≤ value かを mpn で確認
        if (R >= n_int * xpm1) {
            auto [_, xp1_n] = mpn_pow_for_root(x + Int::One(), n, true);
            if (xp1_n <= value) {
                x = x + Int::One();
            }
        }
    }

    return x;
}


// ========================================================================
// nthRootRem: n乗根と余りを同時に計算
// ========================================================================

Int IntSqrt::nthRootRem(const Int& value, uint32_t n, Int& remainder) {
    // root を計算
    Int root = nthRoot(value, n);

    // 特殊状態の処理
    if (root.isNaN()) {
        remainder = Int::NaN();
        return root;
    }

    if (root.isInfinite()) {
        remainder = Int::NaN();  // 余りは定義できない
        return root;
    }

    // remainder = value - root^n
    // ★ Codex 改善 #1: pow(root,n) を mpn_pow_for_root で高速計算
    // mpn_pow_for_root は符号を無視する (常に正を返す) ため、
    // 負の root の場合は |root| で計算して符号を補正する
    Int abs_root = root.isNegative() ? -root : root;
    auto [_, root_power] = mpn_pow_for_root(abs_root, n, true);
    // root が負で n が奇数 → root^n は負
    if (root.isNegative() && (n & 1)) {
        remainder = value + root_power;  // value - (-root_power)
    } else {
        remainder = value - root_power;
    }

    return root;
}

// =============================================================================
// isPerfectPower — value = b^k (k >= 2) となる k が存在するか判定
// =============================================================================

bool IntSqrt::isPerfectPower(const Int& value, Int* pBase, uint32_t* pExp) {
    if (value.isNaN() || value.isInfinite()) return false;
    if (value.isNegative()) return false;  // 負の完全冪は未対応
    if (value.isZero()) {
        if (pBase) *pBase = Int(0);
        if (pExp) *pExp = 2;
        return true;  // 0 = 0^2
    }
    if (value == Int::One()) {
        if (pBase) *pBase = Int(1);
        if (pExp) *pExp = 2;
        return true;  // 1 = 1^2
    }

    size_t bits = value.bitLength();

    // k の上限: 2^k <= value なので k <= bitLength
    uint32_t max_k = static_cast (bits);

    // 素数の指数のみ試行すれば十分 (e.g., a^6 = (a^2)^3 = (a^3)^2)
    // 小さい素数から順に試行
    static const uint32_t primes[] = {
        2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61
    };

    for (uint32_t p : primes) {
        if (p > max_k) break;

        Int root = nthRoot(value, p);
        if (pow(root, p) == value) {
            if (pBase) *pBase = root;
            if (pExp) *pExp = p;
            return true;
        }
    }

    return false;
}

} // namespace sangi