#include #include #include #include #include #include #include #include #include #include namespace calx {

    //=============================================================================
    // ユーティリティ関数
    //=============================================================================

    // Chooses the working precision (in decimal digits) for evaluating a
    // transcendental function.
    //
    // The result is always `requested_precision`. A transcendental function has
    // to deliver the requested number of correct digits even when its input
    // carries only a few significant bits (e.g. 1.5 is exact in 2 bits), so the
    // output precision is never limited by the input precision and no
    // "input precision + guard" shortcut is taken.
    //
    // `input_eff` (effective bits of the input) is kept in the signature for
    // callers but does not influence the result. No arithmetic is performed on
    // it, which also avoids signed overflow for values close to INT_MAX.
    static int effectiveComputePrecision(int input_eff, int requested_precision) {
        static_cast<void>(input_eff);  // intentionally unused
        return requested_precision;
    }

    // Stamps the requested precision on `result` and propagates the input's
    // effective bit count when that count is the tighter bound.
    //
    // `result.setResultPrecision(precision)` is always applied. The effective
    // bits are overwritten with `input_eff` only when `input_eff` is below both
    // INT_MAX and the bit width corresponding to `precision`; otherwise they
    // are left as they were.
    // (INT_MAX presumably marks an exact input - confirm against Float.)
    static void finalizeResult(Float& result, int input_eff, int precision) {
        result.setResultPrecision(precision);

        const int requested_bits = Float::precisionToBits(precision);
        const bool input_limits_result =
            !(input_eff >= INT_MAX || input_eff >= requested_bits);
        if (!input_limits_result) {
            return;
        }
        result.setEffectiveBits(input_eff);
    }

    //=============================================================================
    // Ziv の反復戦略 — 超越関数の correct rounding
    //=============================================================================

    // Decides whether rounding the mantissa of the approximation `y` to
    // `target_bits` bits has an unambiguous direction.
    //
    // The bits directly below the kept `target_bits` are the guard bits.
    // MARGIN (2) of the `guard_bits` are written off to cover the computation
    // error (assumed to be below 1 ulp); the remaining `guard_bits - MARGIN`
    // bits, counted downward from the topmost guard bit, are inspected. When
    // they are all 0 or all 1 the value is too close to a rounding boundary and
    // false is returned so that the caller can retry with more precision.
    //
    // Returns true immediately for zero / NaN / infinity and when the mantissa
    // already fits into `target_bits` (no rounding needed). Returns false when
    // fewer than one usable guard bit remains.
    //
    // NOTE(review): `mode` is not consulted; the same all-0 / all-1 test is
    // applied for every rounding mode. For round-to-nearest the critical
    // patterns are normally 100...0 / 011...1 - confirm this is intended.
    static bool canRoundCorrectly(const Float& y, int target_bits,
                                  int guard_bits, RoundingMode mode) {
        static_cast<void>(mode);
        if (y.isZero() || y.isNaN() || y.isInfinity()) return true;

        // Bind the mantissa once so the limb pointer taken below stays valid
        // even if mantissa() returns by value.
        auto&& mant = y.mantissa();

        const int bit_length = static_cast<int>(mant.bitLength());
        if (bit_length <= target_bits) return true;  // nothing to round away

        // Guard bits that can actually be trusted for the decision.
        constexpr int MARGIN = 2;
        const int usable = guard_bits - MARGIN;
        if (usable < 1) return false;

        // shift = number of mantissa bits below the kept part.
        const int shift = bit_length - target_bits;
        const uint64_t* data = mant.data();

        // Inspected range: bit (shift - 1) is the topmost guard bit,
        // bit (shift - usable) the lowest one (clamped to bit 0).
        const int hi_pos = shift - 1;
        int lo_pos = shift - usable;
        if (lo_pos < 0) lo_pos = 0;

        bool all_zero = true;
        bool all_one = true;

        // Scan limb by limb; bit position p lives in data[p / 64], bit p % 64.
        const size_t lo_word = static_cast<size_t>(lo_pos) / 64;
        const size_t hi_word = static_cast<size_t>(hi_pos) / 64;

        for (size_t w = lo_word; w <= hi_word; ++w) {
            const uint64_t word = data[w];

            // Mask selecting the inspected bits inside this limb.
            const unsigned lo_bit = (w == lo_word) ? static_cast<unsigned>(lo_pos % 64) : 0;
            const unsigned hi_bit = (w == hi_word) ? static_cast<unsigned>(hi_pos % 64) : 63;
            const unsigned width = hi_bit - lo_bit + 1;

            uint64_t mask;
            if (width >= 64) {
                mask = ~uint64_t(0);  // avoid the undefined 64-bit shift
            } else {
                mask = ((uint64_t(1) << width) - 1) << lo_bit;
            }

            const uint64_t bits = word & mask;
            if (bits != 0) all_zero = false;
            if (bits != mask) all_one = false;

            // A mix of 0s and 1s has been seen: the direction is determined.
            if (!all_zero && !all_one) return true;
        }

        // All inspected bits equal: too close to a boundary to decide.
        return false;
    }

    // Ziv の反復戦略テンプレート。
    // computeFunc(x, eff_x, precision) → Float を呼び、correct rounding を保証する。
    // computeFunc の precision は10進桁数。内部で +10 guard digits を付加する前提。
    template static Float zivRound(ComputeFunc&& computeFunc,
                          const Float& x, int eff_x, int precision) {
        constexpr int MAX_ITER = 6;
        int target_bits = Float::precisionToBits(precision);
        RoundingMode mode = Float::roundingMode();

        int extra_guard = 10;  // 初回: 10 guard digits ≈ 33 bits

        for (int iter = 0; iter < MAX_ITER; ++iter) {
            int wp = precision + extra_guard;
            Float y = computeFunc(x, eff_x, wp);

            int actual_bits = static_cast (y.mantissa().bitLength());
            int guard_bits = actual_bits - target_bits;

            if (guard_bits > 0 &&
                canRoundCorrectly(y, target_bits, guard_bits, mode)) {
                finalizeResult(y, eff_x, precision);
                return y;
            }

            // ガードビットを増やしてリトライ
            extra_guard *= 2;
        }

        // フォールバック: faithful rounding (実用上ほぼ到達しない)
        Float y = computeFunc(x, eff_x, precision + extra_guard);
        finalizeResult(y, eff_x, precision);
        return y;
    }

    //=============================================================================
    // mpn 直接計算 — Taylor 級数ループのアロケーション排除
    //=============================================================================

    // mpn 名前空間は calx::mpn (MpnOps.hpp で定義)

    // Lightweight floating-point value used by the mpn-level helpers:
    // a pointer to the mantissa limbs, the limb count and a binary exponent.
    // The struct does not own the limb storage.
    struct RawFloat {
        uint64_t* d;      // mantissa limbs (arena storage, d[0] = least significant)
        size_t nw;         // number of limbs in use (leading zero limbs removed)
        int64_t exp;       // binary exponent (value = mantissa * 2^exp)
    };

    // Normalizes the mantissa so that its most significant set bit sits at
    // bit 63 of the top limb, adjusting the exponent so the value is unchanged.
    //
    // Steps: drop leading zero limbs via mpn::normalized_size, then shift the
    // whole mantissa left by the number of leading zero bits of the top limb
    // and subtract that count from `exp`. A zero mantissa (nw == 0) is left
    // untouched. Called after multiplications / single-limb divisions to keep
    // mantissa and exponent consistent.
    static void rf_normalize(RawFloat& a) {
        if (a.nw == 0) return;

        // Limb level: strip leading zero limbs.
        a.nw = mpn::normalized_size(a.d, a.nw);
        if (a.nw == 0) return;

        // Bit level: shift left by the leading zeros of the top limb.
        const unsigned lz = static_cast<unsigned>(std::countl_zero(a.d[a.nw - 1]));
        if (lz == 0) return;  // already normalized; also keeps (64 - lz) < 64

        for (size_t i = a.nw - 1; i > 0; --i) {
            a.d[i] = (a.d[i] << lz) | (a.d[i - 1] >> (64 - lz));
        }
        a.d[0] <<= lz;
        a.exp -= static_cast<int64_t>(lz);
    }

    // 乗算 + 切り詰め: dst = a * b, 上位 target_nw ワードのみ保持
    static void rf_mul(RawFloat& dst, const RawFloat& a, const RawFloat& b,
                       uint64_t* prod_buf, uint64_t* scratch, size_t target_nw) {
        if (a.nw == 0 || b.nw == 0) {
            dst.nw = 0; dst.exp = 0; return;
        }

        int64_t prod_exp = a.exp + b.exp;

        // mulhigh_n 最適化: 両方 target_nw サイズなら上位のみ計算 (コスト ≈ 2/3)
        if (a.nw == target_nw && b.nw == target_nw && target_nw >= 8) {
            size_t mh_scratch = mpn::mulhigh_n_scratch_size(target_nw);
            // scratch が足りるか確認 (multiply_scratch_size >= mulhigh_scratch_size)
            mpn::mulhigh_n(prod_buf, a.d, b.d, target_nw, scratch);
            // mulhigh_n は rp[0..n-1] = 上位 n ワード (近似、O(1) 誤差)
            std::memcpy(dst.d, prod_buf, target_nw * sizeof(uint64_t));
            dst.nw = mpn::normalized_size(dst.d, target_nw);
            // mulhigh の結果は a*b >> (target_nw * 64) の近似
            dst.exp = prod_exp + static_cast (target_nw) * 64;
            rf_normalize(dst);
            return;
        }

        size_t prod_n = a.nw + b.nw;
        mpn::multiply(prod_buf, a.d, a.nw, b.d, b.nw, scratch);
        size_t actual = mpn::normalized_size(prod_buf, prod_n);

        if (actual > target_nw) {
            size_t drop = actual - target_nw;
            std::memcpy(dst.d, prod_buf + drop, target_nw * sizeof(uint64_t));
            dst.nw = mpn::normalized_size(dst.d, target_nw);
            dst.exp = prod_exp + static_cast (drop) * 64;
        } else {
            std::memcpy(dst.d, prod_buf, actual * sizeof(uint64_t));
            dst.nw = actual;
            dst.exp = prod_exp;
        }
        rf_normalize(dst);
    }

    // キャッシュ付き乗算: dst = a * b, b の forward NTT をキャッシュ
    // b が定数で繰り返し乗算する場合に forward NTT を 1 回省略できる
    static void rf_mul_cached(RawFloat& dst, const RawFloat& a, const RawFloat& b,
                       uint64_t* prod_buf, size_t target_nw,
                       prime_ntt::NttCache& cache) {
        if (a.nw == 0 || b.nw == 0) {
            dst.nw = 0; dst.exp = 0; return;
        }
        // NTT 閾値未満は通常乗算にフォールバック
        size_t min_nw = std::min(a.nw, b.nw);
        if (min_nw < 3000) {
            // scratch が必要 → thread_local で確保
            thread_local std::vector scratch_buf;
            size_t scratch_need = mpn::multiply_scratch_size(
                std::max(a.nw, b.nw), min_nw);
            if (scratch_buf.size() < scratch_need) scratch_buf.resize(scratch_need);
            rf_mul(dst, a, b, prod_buf, scratch_buf.data(), target_nw);
            return;
        }
        size_t prod_n = a.nw + b.nw;
        prime_ntt::mul_prime_ntt_cached(prod_buf, a.d, a.nw, b.d, b.nw, cache);
        size_t actual = mpn::normalized_size(prod_buf, prod_n);
        int64_t prod_exp = a.exp + b.exp;

        if (actual > target_nw) {
            size_t drop = actual - target_nw;
            std::memcpy(dst.d, prod_buf + drop, target_nw * sizeof(uint64_t));
            dst.nw = mpn::normalized_size(dst.d, target_nw);
            dst.exp = prod_exp + static_cast (drop) * 64;
        } else {
            std::memcpy(dst.d, prod_buf, actual * sizeof(uint64_t));
            dst.nw = actual;
            dst.exp = prod_exp;
        }
        rf_normalize(dst);
    }

    // 自乗 + 切り詰め: dst = a^2, 上位 target_nw ワードのみ保持
    // mpn::square は乗算より高速 (半対角の加算を省略)
    static void rf_sqr(RawFloat& dst, const RawFloat& a,
                       uint64_t* prod_buf, uint64_t* scratch, size_t target_nw) {
        if (a.nw == 0) {
            dst.nw = 0; dst.exp = 0; return;
        }
        size_t prod_n = 2 * a.nw;
        mpn::square(prod_buf, a.d, a.nw, scratch);
        size_t actual = mpn::normalized_size(prod_buf, prod_n);
        int64_t prod_exp = a.exp + a.exp;

        if (actual > target_nw) {
            size_t drop = actual - target_nw;
            std::memcpy(dst.d, prod_buf + drop, target_nw * sizeof(uint64_t));
            dst.nw = mpn::normalized_size(dst.d, target_nw);
            dst.exp = prod_exp + static_cast (drop) * 64;
        } else {
            std::memcpy(dst.d, prod_buf, actual * sizeof(uint64_t));
            dst.nw = actual;
            dst.exp = prod_exp;
        }
        rf_normalize(dst);
    }

    // In-place division by a single limb: a /= divisor.
    // The mantissa is divided with mpn::divmod_1 (whatever that call returns
    // is ignored) and then re-normalized so the exponent stays consistent.
    // A zero mantissa is left untouched.
    static void rf_divmod_1(RawFloat& a, uint64_t divisor) {
        if (a.nw != 0) {
            mpn::divmod_1(a.d, a.d, a.nw, divisor);
            rf_normalize(a);
        }
    }

    // In-place multiplication by a single limb: a *= multiplier.
    //
    // A zero operand on either side yields zero (nw = 0, exp = 0); a multiplier
    // of 1 leaves `a` unchanged. Otherwise the carry limb produced by
    // mpn::mul_1, if non-zero, is appended as a new top limb - so a.d must
    // have room for a.nw + 1 limbs - and the value is re-normalized.
    static void rf_mul_1(RawFloat& a, uint64_t multiplier) {
        const bool product_is_zero = (a.nw == 0) || (multiplier == 0);
        if (product_is_zero) {
            a.nw = 0;
            a.exp = 0;
            return;
        }
        if (multiplier != 1) {
            const uint64_t high_limb = mpn::mul_1(a.d, a.d, a.nw, multiplier);
            if (high_limb != 0) {
                a.d[a.nw] = high_limb;
                ++a.nw;
            }
            rf_normalize(a);
        }
    }

    // Adds (src >> shift_bits) to dst in place.
    //
    // Only the part of the shifted source that overlaps dst[0..dst_nw-1] is
    // added; limbs of the shifted source beyond dst_nw are ignored. When the
    // shift removes the whole source nothing is added.
    //
    // Returns the carry out of dst[dst_nw - 1] (0 or 1).
    static uint64_t shift_add(uint64_t* dst, size_t dst_nw,
                              const uint64_t* src, size_t src_nw,
                              size_t shift_bits) {
        const size_t word_shift = shift_bits / 64;
        const unsigned bit_shift = static_cast<unsigned>(shift_bits % 64);

        if (word_shift >= src_nw) return 0;  // source shifted out entirely

        const size_t remaining = src_nw - word_shift;
        const size_t overlap = std::min(remaining, dst_nw);
        if (overlap == 0) return 0;

        uint64_t carry = 0;

        if (bit_shift == 0) {
            // Limb-aligned: plain multi-limb addition.
            carry = mpn::add(dst, dst, overlap, src + word_shift, overlap);
        } else {
            for (size_t i = 0; i < overlap; ++i) {
                // Assemble limb i of (src >> shift_bits) from two source limbs.
                const uint64_t lo = src[word_shift + i] >> bit_shift;
                const uint64_t hi = (word_shift + i + 1 < src_nw) ?
                                    (src[word_shift + i + 1] << (64 - bit_shift)) : 0;
                const uint64_t shifted = lo | hi;

                // Add with carry-in; at most one of the two additions wraps.
                uint64_t sum = dst[i] + shifted;
                uint64_t c = (sum < dst[i]) ? 1ULL : 0ULL;
                sum += carry;
                c += (sum < carry) ? 1ULL : 0ULL;
                dst[i] = sum;
                carry = c;
            }
        }

        // Propagate the carry through the limbs above the overlap.
        for (size_t i = overlap; carry && i < dst_nw; ++i) {
            dst[i] += carry;
            carry = (dst[i] < carry) ? 1ULL : 0ULL;
        }

        return carry;
    }

    // Subtracts (src >> shift_bits) from dst in place.
    //
    // Only the part of the shifted source that overlaps dst[0..dst_nw-1] is
    // subtracted; limbs of the shifted source beyond dst_nw are ignored. When
    // the shift removes the whole source nothing is subtracted.
    //
    // Returns the borrow out of dst[dst_nw - 1] (0 or 1).
    static uint64_t shift_sub(uint64_t* dst, size_t dst_nw,
                              const uint64_t* src, size_t src_nw,
                              size_t shift_bits) {
        const size_t word_shift = shift_bits / 64;
        const unsigned bit_shift = static_cast<unsigned>(shift_bits % 64);

        if (word_shift >= src_nw) return 0;  // source shifted out entirely

        const size_t remaining = src_nw - word_shift;
        const size_t overlap = std::min(remaining, dst_nw);
        if (overlap == 0) return 0;

        uint64_t borrow = 0;

        if (bit_shift == 0) {
            // Limb-aligned: plain multi-limb subtraction.
            borrow = mpn::sub(dst, dst, overlap, src + word_shift, overlap);
        } else {
            for (size_t i = 0; i < overlap; ++i) {
                // Assemble limb i of (src >> shift_bits) from two source limbs.
                const uint64_t lo = src[word_shift + i] >> bit_shift;
                const uint64_t hi = (word_shift + i + 1 < src_nw) ?
                                    (src[word_shift + i + 1] << (64 - bit_shift)) : 0;
                const uint64_t shifted = lo | hi;

                // Subtract with borrow-in; at most one of the two steps wraps.
                const uint64_t diff = dst[i] - shifted;
                uint64_t b = (diff > dst[i]) ? 1ULL : 0ULL;
                const uint64_t diff2 = diff - borrow;
                b += (diff2 > diff) ? 1ULL : 0ULL;
                dst[i] = diff2;
                borrow = b;
            }
        }

        // Propagate the borrow through the limbs above the overlap.
        for (size_t i = overlap; borrow && i < dst_nw; ++i) {
            const uint64_t old_val = dst[i];
            dst[i] -= borrow;
            borrow = (dst[i] > old_val) ? 1ULL : 0ULL;
        }

        return borrow;
    }

    // In-place addition with bit-level alignment: result += term.
    //
    // The operands are aligned by their exponents (the exponent is the weight
    // of limb 0, bit 0). `nw_max` is the number of limbs `result.d` may grow
    // to; a carry out of the top limb is appended only while result.nw is
    // below nw_max and is otherwise dropped.
    //
    // Returns false when `term` contributes nothing (term is zero, or lies
    // entirely below result's lowest bit) - callers use this as a convergence
    // signal - and true otherwise.
    static bool rf_add(RawFloat& result, const RawFloat& term, size_t nw_max) {
        if (term.nw == 0) return false;
        if (result.nw == 0) {
            // result is zero: the sum is simply term.
            std::memcpy(result.d, term.d, term.nw * sizeof(uint64_t));
            result.nw = term.nw; result.exp = term.exp;
            return true;
        }

        const int64_t exp_diff = result.exp - term.exp;

        if (exp_diff < 0) {
            // term has the larger exponent:
            // result = term + (old result >> |exp_diff|).
            const size_t neg_diff = static_cast<size_t>(-exp_diff);
            if (neg_diff / 64 >= result.nw) {
                // Old result lies entirely below term's lowest bit: overwrite.
                std::memcpy(result.d, term.d, term.nw * sizeof(uint64_t));
                result.nw = term.nw; result.exp = term.exp;
                return true;
            }

            // Save the old result (heap copy; the original author expects this
            // path to be rare).
            const std::vector<uint64_t> old_data(result.d, result.d + result.nw);
            const size_t old_nw = result.nw;

            // result <- term, with the rest of the buffer cleared.
            std::memset(result.d, 0, nw_max * sizeof(uint64_t));
            std::memcpy(result.d, term.d, term.nw * sizeof(uint64_t));
            result.nw = term.nw;
            result.exp = term.exp;

            // result += (old result >> neg_diff)
            const uint64_t carry = shift_add(result.d, result.nw,
                                             old_data.data(), old_nw, neg_diff);
            if (carry && result.nw < nw_max) {
                result.d[result.nw] = carry;
                result.nw++;
            }
            return true;
        }

        // exp_diff >= 0: result += (term >> exp_diff)
        const size_t total_shift = static_cast<size_t>(exp_diff);
        if (total_shift / 64 >= term.nw) return false;  // negligible

        const uint64_t carry = shift_add(result.d, result.nw,
                                         term.d, term.nw, total_shift);
        if (carry && result.nw < nw_max) {
            result.d[result.nw] = carry;
            result.nw++;
        }
        return true;
    }

    // In-place subtraction: result -= term (used by the alternating series).
    //
    // Returns false when nothing was subtracted because `term` is zero,
    // `result` is zero, or `term` lies entirely below result's lowest bit.
    // Returns true otherwise.
    //
    // NOTE(review): when term has the larger exponent (exp_diff < 0) the
    // function returns true WITHOUT modifying result; the original author
    // marks this case as theoretically unreachable. The borrow returned by
    // shift_sub is ignored, and `nw_max` is unused.
    static bool rf_sub(RawFloat& result, const RawFloat& term, size_t nw_max) {
        static_cast<void>(nw_max);
        if (term.nw == 0) return false;
        if (result.nw == 0) return false;

        const int64_t exp_diff = result.exp - term.exp;
        if (exp_diff < 0) return true;  // term > result: not expected to happen

        const size_t total_shift = static_cast<size_t>(exp_diff);
        if (total_shift / 64 >= term.nw) return false;  // negligible

        shift_sub(result.d, result.nw, term.d, term.nw, total_shift);
        // The subtraction may have cleared the top limbs.
        result.nw = mpn::normalized_size(result.d, result.nw);
        return true;
    }

    // Converts a Float into a normalized RawFloat whose limbs live in `arena`.
    //
    // At most `nw` limbs are kept: when x's mantissa is longer, only its top
    // `nw` limbs are copied and the exponent is raised by 64 per dropped limb.
    // The sign of `x` is not carried over; callers handle it separately.
    static RawFloat rf_extract(const Float& x, int nw, ScratchArena& arena) {
        // Taken by value so the limbs stay valid for the copies below.
        auto x_vec = x.mantissa().words();
        const size_t x_nw_orig = x_vec.size();
        int64_t x_exp = x.exponent();

        uint64_t* x_d;
        size_t x_nw;
        if (x_nw_orig > static_cast<size_t>(nw)) {
            // Truncate: keep the most significant nw limbs.
            x_d = arena.alloc_limbs(nw);
            const size_t drop = x_nw_orig - nw;
            std::memcpy(x_d, x_vec.data() + drop, nw * sizeof(uint64_t));
            x_exp += static_cast<int64_t>(drop) * 64;
            x_nw = mpn::normalized_size(x_d, nw);
        } else {
            x_d = arena.alloc_limbs(x_nw_orig);
            std::memcpy(x_d, x_vec.data(), x_nw_orig * sizeof(uint64_t));
            x_nw = x_nw_orig;
        }
        RawFloat rf{x_d, x_nw, x_exp};
        rf_normalize(rf);
        return rf;
    }

    // Converts a RawFloat back into a Float.
    //
    // A zero mantissa yields a default-constructed Float. Otherwise the limbs
    // are copied into an Int, combined with the exponent and the `negative`
    // flag, marked with the bit width of `working_precision` as effective
    // bits, and truncated to roughly `working_precision` decimal digits.
    static Float rf_to_float(const RawFloat& rf, bool negative, int working_precision) {
        if (rf.nw == 0) return Float();

        const std::vector<uint64_t> words(rf.d, rf.d + rf.nw);
        // Second argument 1: presumably the (positive) sign - confirm against Int.
        Int mantissa = Int::fromRawWords(words, 1);
        Float result(mantissa, rf.exp, negative);

        const int wp_bits = Float::precisionToBits(working_precision);
        result.setEffectiveBits(wp_bits);
        result.truncateToApprox(working_precision);
        return result;
    }

    //=============================================================================
    // 指数関数 (exp) — Taylor級数 + 引数半減/二乗復元
    //=============================================================================

    // Single-limb division without bit-level normalization: a /= divisor.
    // Only leading zero limbs are removed afterwards; the caller decides when
    // to run rf_normalize (it can be deferred until just before an operation
    // that needs an accurate exponent).
    static void rf_divmod_1_no_norm(RawFloat& a, uint64_t divisor) {
        if (a.nw == 0) return;
        mpn::divmod_1(a.d, a.d, a.nw, divisor);
        a.nw = mpn::normalized_size(a.d, a.nw);
    }

    // Naive Taylor series for exp, producing a RawFloat and honoring the sign
    // of x: result = sum over n of (+/-) x^n / n!.
    //
    // `x_rf` presumably holds |x|; when `x_negative` is set, odd-order terms
    // are subtracted instead of added. `nw` is the working size in limbs;
    // `result.d` must provide nw + 2 limbs. `prod_buf` / `scratch` are work
    // buffers for rf_mul, and the running term is allocated from `arena`.
    //
    // The loop stops when the term becomes zero, when rf_add / rf_sub report
    // the term as negligible, or after working_precision * 1.2 + 10 terms.
    static void rf_exp_naive(RawFloat& result, const RawFloat& x_rf, bool x_negative,
                              int nw, uint64_t* prod_buf, uint64_t* scratch,
                              ScratchArena& arena, int working_precision) {
        uint64_t* term_d = arena.alloc_limbs(nw + 2);

        // term = 1.0: only bit 63 of limb nw-1 set, i.e. 2^(64*nw - 1), scaled
        // by the exponent -(64*nw - 1).
        std::memset(term_d, 0, (nw + 2) * sizeof(uint64_t));
        term_d[nw - 1] = uint64_t(1) << 63;
        RawFloat term{term_d, static_cast<size_t>(nw), -static_cast<int64_t>(nw * 64 - 1)};

        // result = 1.0 (the n = 0 term)
        std::memset(result.d, 0, (nw + 2) * sizeof(uint64_t));
        result.d[nw - 1] = uint64_t(1) << 63;
        result.nw = static_cast<size_t>(nw);
        result.exp = -static_cast<int64_t>(nw * 64 - 1);

        const int max_terms = static_cast<int>(working_precision * 1.2) + 10;
        for (int n = 1; n <= max_terms; ++n) {
            // term = term * x / n; rf_mul normalizes its result itself.
            rf_mul(term, term, x_rf, prod_buf, scratch, nw);
            // Divide without bit-level normalization, stripping zero limbs only.
            rf_divmod_1_no_norm(term, static_cast<uint64_t>(n));
            if (term.nw == 0) break;
            // rf_add / rf_sub align by exponent, so the exponent must be
            // accurate: normalize before accumulating.
            rf_normalize(term);
            if (x_negative && (n & 1)) {
                if (!rf_sub(result, term, nw + 1)) break;
            } else {
                if (!rf_add(result, term, nw + 1)) break;
            }
        }
    }

    // Taylor series for exp evaluated with the Paterson-Stockmeyer
    // (baby-step / giant-step) scheme; x > 0 is assumed.
    //
    // The powers R[i] = x^i for i = 0..m are precomputed, with m about
    // sqrt(estimated term count), clamped to [2, 256]. Each giant step
    // evaluates m consecutive terms with a Horner recurrence that only needs
    // single-limb divisions, then multiplies by rr = x^l / l! and accumulates
    // into `result`. This takes O(sqrt(l)) full-size multiplications plus O(l)
    // single-limb divisions. The original author notes this matches MPFR's
    // exp2_aux2.
    //
    // `nw` is the working size in limbs; `result.d` must provide nw + 2 limbs.
    // `prod_buf` / `scratch` are work buffers for the multiplications; all
    // temporaries are allocated from `arena`.
    //
    // NOTE(review): `working_precision` is not used by this function.
    static void rf_exp_ps(RawFloat& result, const RawFloat& x_rf, int nw,
                           uint64_t* prod_buf, uint64_t* scratch,
                           ScratchArena& arena, int working_precision) {
        static_cast<void>(working_precision);

        const int target_bits = nw * 64;
        // Term count estimate: |x| is about 2^x_msb, so each term shrinks by
        // roughly |x_msb| bits (at least 1 bit is assumed).
        const int64_t x_msb = x_rf.exp + static_cast<int64_t>(x_rf.nw) * 64;
        int x_log2 = (x_msb <= 0) ? static_cast<int>(-x_msb) : 1;
        if (x_log2 < 1) x_log2 = 1;
        const int l_est = target_bits / x_log2 + 10;

        int m = static_cast<int>(std::sqrt(static_cast<double>(l_est)));
        if (m < 2) m = 2;
        if (m > 256) m = 256;

        // Writes 1.0 into an already zeroed buffer of at least nw limbs:
        // bit 63 of limb nw-1 with exponent -(64*nw - 1).
        const auto set_one = [nw](RawFloat& v) {
            v.d[nw - 1] = uint64_t(1) << 63;
            v.nw = static_cast<size_t>(nw);
            v.exp = -static_cast<int64_t>(nw * 64 - 1);
        };

        // R[0..m] = x^0 .. x^m. A fixed-size array with arena-backed limbs
        // avoids heap allocation; m is capped at 256 above.
        constexpr int R_MAX = 260;
        RawFloat R[R_MAX];
        for (int i = 0; i <= m; ++i) {
            R[i].d = arena.alloc_limbs(nw + 2);
            std::memset(R[i].d, 0, (nw + 2) * sizeof(uint64_t));
            R[i].nw = 0;
            R[i].exp = 0;
        }
        // R[0] = 1.0
        set_one(R[0]);
        // R[1] = x
        std::memcpy(R[1].d, x_rf.d, x_rf.nw * sizeof(uint64_t));
        R[1].nw = x_rf.nw;
        R[1].exp = x_rf.exp;
        // R[2..m]: even powers by squaring R[i/2], odd powers as R[i-1] * R[1].
        // cache_r1 lets the odd-power products reuse R[1]'s transform.
        prime_ntt::NttCache cache_r1;
        rf_sqr(R[2], R[1], prod_buf, scratch, nw);
        for (int i = 3; i <= m; ++i) {
            if ((i & 1) == 0) {
                rf_sqr(R[i], R[i/2], prod_buf, scratch, nw);
            } else {
                rf_mul_cached(R[i], R[i-1], R[1], prod_buf, nw, cache_r1);
            }
        }

        // Cache for R[m], which is multiplied in on every giant step.
        prime_ntt::NttCache cache_rm;

        // rr tracks x^l / l!; it starts at 1.0.
        RawFloat rr;
        rr.d = arena.alloc_limbs(nw + 2);
        std::memset(rr.d, 0, (nw + 2) * sizeof(uint64_t));
        set_one(rr);

        // t is the baby-step accumulator.
        RawFloat t;
        t.d = arena.alloc_limbs(nw + 2);

        // result = 0
        std::memset(result.d, 0, (nw + 2) * sizeof(uint64_t));
        result.nw = 0;
        result.exp = 0;

        int l = 0;
        const int max_giant_steps = l_est / m + 5;

        for (int gs = 0; gs < max_giant_steps; ++gs) {
            // Baby step: Horner evaluation of
            // sum_{i=0}^{m-1} R[i] * l! / (l+i)!, starting from R[m-1].
            std::memcpy(t.d, R[m-1].d, R[m-1].nw * sizeof(uint64_t));
            t.nw = R[m-1].nw;
            t.exp = R[m-1].exp;

            for (int i = m - 2; i >= 0; --i) {
                rf_divmod_1(t, static_cast<uint64_t>(l + i + 1));
                if (t.nw == 0) {
                    // The accumulator vanished: restart from R[i].
                    std::memcpy(t.d, R[i].d, R[i].nw * sizeof(uint64_t));
                    t.nw = R[i].nw;
                    t.exp = R[i].exp;
                } else {
                    rf_add(t, R[i], nw + 1);
                }
            }

            // t *= rr (rr is 1 on the first giant step, so skip the product).
            if (gs > 0) {
                rf_mul(t, t, rr, prod_buf, scratch, nw);
            }
            if (t.nw == 0) break;

            // result += t
            rf_add(result, t, nw + 1);

            // rr <- rr * R[m] / ((l+1)(l+2)...(l+m))
            rf_mul_cached(rr, rr, R[m], prod_buf, nw, cache_rm);
            for (int i = 1; i <= m; ++i) {
                rf_divmod_1(rr, static_cast<uint64_t>(l + i));
            }
            l += m;
            if (rr.nw == 0) break;
        }
    }

    // sin/cos Taylor の項数推定 (Stirling 近似考慮)
    // u^l / (2l)! < 2^(-target_bits) を解く
    // log2(u^l / (2l)!) ≈ l*log2(u) - 2l*log2(2l/e) = -l*(u_log2 + 2*log2(2l/e))
    static int estimate_sincos_terms(int target_bits, int u_log2) {
        double tb = static_cast (target_bits);
        double ul = static_cast