#include #include #include namespace calx {

    // 計算ポリシーのベースクラス
    /// Common base for compute policies.
    ///
    /// Only publishes the element type and the index type that derived
    /// policies use; it has no data members and no behaviour of its own.
    ///
    /// @tparam T  Element (scalar) type the policy operates on.
    template <typename T>
    class ComputePolicyBase {
    public:
        using value_type = T;            ///< Element type.
        using size_type = std::size_t;   ///< Index / extent type.
    };

    // 前方宣言
    // Forward declarations of the available compute policies.
    // NOTE(review): each policy is assumed to take a single type parameter
    // (the element type), matching ComputePolicyBase — confirm against the
    // definitions of SIMDComputePolicy and MKLComputePolicy.
    template <typename T> class DefaultComputePolicy;
    template <typename T> class SIMDComputePolicy;
    template <typename T> class MKLComputePolicy;

    // デフォルト計算ポリシー
    template class DefaultComputePolicy : public ComputePolicyBase {
    public:
        using value_type = T;
        using size_type = std::size_t;

        // ベクトル加算
        template static void vector_add(const VecA& a, const VecB& b, VecResult& result) {
            if (a.size() != b.size() || a.size() != result.size()) {
                throw std::invalid_argument("Vector dimensions mismatch for addition");
            }

            for (size_type i = 0; i < a.size(); ++i) {
                result[i] = a[i] + b[i];

                // NumericState システムの適用
                if (numeric_state_traits ::is_supported) {
                    if (numeric_state_traits ::isSpecial(a[i]) ||
                        numeric_state_traits ::isSpecial(b[i])) {
                        // NumericState伝播処理
                    }
                }
            }
        }

        // ベクトル減算
        template static void vector_subtract(const VecA& a, const VecB& b, VecResult& result) {
            if (a.size() != b.size() || a.size() != result.size()) {
                throw std::invalid_argument("Vector dimensions mismatch for subtraction");
            }

            for (size_type i = 0; i < a.size(); ++i) {
                result[i] = a[i] - b[i];

                // NumericState システムの適用
                if (numeric_state_traits ::is_supported) {
                    if (numeric_state_traits ::isSpecial(a[i]) ||
                        numeric_state_traits ::isSpecial(b[i])) {
                        // NumericState伝播処理
                    }
                }
            }
        }

        // ベクトル内積
        template static T dot(const VecA& a, const VecB& b) {
            if (a.size() != b.size()) {
                throw std::invalid_argument("Vector dimensions mismatch for dot product");
            }

            T result = static_cast (0);
            for (size_type i = 0; i < a.size(); ++i) {
                result += a[i] * b[i];

                // NumericState システムの適用
                if (numeric_state_traits ::is_supported) {
                    if (numeric_state_traits ::isSpecial(a[i]) ||
                        numeric_state_traits ::isSpecial(b[i])) {
                        // NumericState伝播処理
                    }
                }
            }

            return result;
        }

        // ベクトルノルム（L2ノルム）
        template static T norm(const Vec& vec) {
            T sum_of_squares = static_cast (0);

            for (size_type i = 0; i < vec.size(); ++i) {
                sum_of_squares += vec[i] * vec[i];

                // NumericState システムの適用
                if (numeric_state_traits ::is_supported) {
                    if (numeric_state_traits ::isSpecial(vec[i])) {
                        // NumericState伝播処理
                    }
                }
            }

            return std::sqrt(sum_of_squares);
        }

        // ベクトルスカラー乗算
        template static void vector_scalar_multiply(const Vec& vec, const T& scalar, VecResult& result) {
            if (vec.size() != result.size()) {
                throw std::invalid_argument("Vector dimensions mismatch for scalar multiplication");
            }

            for (size_type i = 0; i < vec.size(); ++i) {
                result[i] = vec[i] * scalar;

                // NumericState システムの適用
                if (numeric_state_traits ::is_supported) {
                    if (numeric_state_traits ::isSpecial(vec[i]) ||
                        numeric_state_traits ::isSpecial(scalar)) {
                        // NumericState伝播処理
                    }
                }
            }
        }

        // ベクトル要素ごとの乗算
        template static void elementWiseMultiply(const VecA& a, const VecB& b, VecResult& result) {
            if (a.size() != b.size() || a.size() != result.size()) {
                throw std::invalid_argument("Vector dimensions mismatch for element-wise multiplication");
            }

            for (size_type i = 0; i < a.size(); ++i) {
                result[i] = a[i] * b[i];

                // NumericState システムの適用
                if (numeric_state_traits ::is_supported) {
                    if (numeric_state_traits ::isSpecial(a[i]) ||
                        numeric_state_traits ::isSpecial(b[i])) {
                        // NumericState伝播処理
                    }
                }
            }
        }

        // ベクトル要素ごとの除算
        template static void elementWiseDivide(const VecA& a, const VecB& b, VecResult& result) {
            if (a.size() != b.size() || a.size() != result.size()) {
                throw std::invalid_argument("Vector dimensions mismatch for element-wise division");
            }

            for (size_type i = 0; i < a.size(); ++i) {
                if (b[i] == static_cast (0)) {
                    // ゼロ除算を処理（特殊値を設定するか例外をスローする）
                    if (numeric_state_traits ::is_supported && numeric_state_traits ::isSpecial(a[i])) {
                        // 特殊状態を設定
                    }
                    else {
                        throw std::invalid_argument("Division by zero");
                    }
                }
                else {
                    result[i] = a[i] / b[i];

                    // NumericState システムの適用
                    if (numeric_state_traits ::is_supported) {
                        if (numeric_state_traits ::isSpecial(a[i]) ||
                            numeric_state_traits ::isSpecial(b[i])) {
                            // NumericState伝播処理
                        }
                    }
                }
            }
        }

        // 行列操作のための関数

        // 行列加算
        template static void add(const MatA& a, const MatB& b, MatResult& result) {
            if (a.rows() != b.rows() || a.cols() != b.cols() ||
                a.rows() != result.rows() || a.cols() != result.cols()) {
                throw std::invalid_argument("Matrix dimensions mismatch for addition");
            }

            for (size_type i = 0; i < a.rows(); ++i) {
                for (size_type j = 0; j < a.cols(); ++j) {
                    result(i, j) = a(i, j) + b(i, j);
                }
            }
        }

        // 行列減算
        template static void subtract(const MatA& a, const MatB& b, MatResult& result) {
            if (a.rows() != b.rows() || a.cols() != b.cols() ||
                a.rows() != result.rows() || a.cols() != result.cols()) {
                throw std::invalid_argument("Matrix dimensions mismatch for subtraction");
            }

            for (size_type i = 0; i < a.rows(); ++i) {
                for (size_type j = 0; j < a.cols(); ++j) {
                    result(i, j) = a(i, j) - b(i, j);
                }
            }
        }

        // 行列乗算
        template static void multiply(const MatA& a, const MatB& b, MatResult& result) {
            if (a.cols() != b.rows() || result.rows() != a.rows() || result.cols() != b.cols()) {
                throw std::invalid_argument("Matrix dimensions mismatch for multiplication");
            }

            const size_type M = a.rows();
            const size_type N = b.cols();
            const size_type K = a.cols();

            // ゼロ初期化
            T* c_data = result.data();
            const T zero_val = numeric_traits ::zero();
            for (size_type i = 0; i < M * N; ++i)
                c_data[i] = zero_val;

            // raw ポインタアクセス (行優先レイアウト前提)
            const T* a_data = a.data();
            const T* b_data = b.data();
            const size_type lda = a.cols();
            const size_type ldb = b.cols();
            const size_type ldc = result.cols();

            gemm_blocked(a_data, b_data, c_data, M, N, K, lda, ldb, ldc);
        }

    private:
        // ================================================================
        // ブロック gemm: タイリング + レジスタブロッキング + B パッキング
        // ================================================================
        //
        // MR × NR マイクロカーネル: C を YMM レジスタに保持し k ループで FMA 累積
        // → C の load/store が k あたり 1 回から全体で 1 回に削減 (BK 倍の帯域削減)
        //
        // B パッキング: B パネルを NR 幅の列パネルに再配置
        // → k 方向のストライドアクセス (ldb 飛び) を連続アクセスに変換

        static constexpr size_type MR = 4;   // register-block row count

        // NR: レジスタブロック列数 (SIMD パケット 2 本分)
        /// Register-block column count (NR) used by the GEMM packing and micro-kernel.
        ///
        /// @return Twice PacketTraits<T>::size (two SIMD packets) when
        ///         has_simd_packet_v<T> is true, otherwise 4.
        // NOTE(review): both traits are assumed to be keyed on T — confirm.
        static constexpr size_type gemm_NR() {
            if constexpr (has_simd_packet_v<T>)
                return PacketTraits<T>::size * 2;  // per original note: double -> 8, float -> 16
            else
                return 4;
        }

        static constexpr size_type BM = 64;    // row block size
        static constexpr size_type BN = 64;    // column block size (a multiple of NR)
        static constexpr size_type BK = 128;   // inner-product (k) block size (can be enlarged thanks to register blocking)

        // B パッキング: B[k0:k0+bk, j0:j0+bn] を NR 幅列パネルに再配置
        // 出力レイアウト: panel p の k 番目 → packB[p * bk * NR + k * NR + 0..NR-1]
        static void pack_b_panel(const T* b, T* packB,
                                  size_type k0, size_type bk,
                                  size_type j0, size_type bn,
                                  size_type ldb) {
            const size_type NR = gemm_NR();
            const size_type n_full = bn / NR;
            // フルパネル
            for (size_type p = 0; p < n_full; ++p) {
                T* dst = packB + p * bk * NR;
                for (size_type k = 0; k < bk; ++k) {
                    const T* src = &b[(k0 + k) * ldb + j0 + p * NR];
                    std::memcpy(dst + k * NR, src, NR * sizeof(T));
                }
            }
            // 残端パネル (NR 未満) — ゼロパディング
            const size_type rem = bn % NR;
            if (rem > 0) {
                T* dst = packB + n_full * bk * NR;
                for (size_type k = 0; k < bk; ++k) {
                    const T* src = &b[(k0 + k) * ldb + j0 + n_full * NR];
                    size_type r = 0;
                    for (; r < rem; ++r)
                        dst[k * NR + r] = src[r];
                    for (; r < NR; ++r)
                        dst[k * NR + r] = T{0};
                }
            }
        }

        // レジスタブロック マイクロカーネル (packed B 版): C[i:i+4, 0:NR] += A * packB
        // C は 8 本の YMM レジスタで保持、k ループ全体で累積
        static void gemm_micro_kernel(const T* a, const T* packB_panel, T* c,
                                       size_type i, size_type bk,
                                       size_type lda, size_type ldc) {
            if constexpr (has_simd_packet_v) {
                using PT = PacketTraits;
                constexpr size_type PS = PT::size;
                constexpr size_type NR = PS * 2;

                auto c00 = PT::load(&c[(i+0)*ldc]);
                auto c01 = PT::load(&c[(i+0)*ldc + PS]);
                auto c10 = PT::load(&c[(i+1)*ldc]);
                auto c11 = PT::load(&c[(i+1)*ldc + PS]);
                auto c20 = PT::load(&c[(i+2)*ldc]);
                auto c21 = PT::load(&c[(i+2)*ldc + PS]);
                auto c30 = PT::load(&c[(i+3)*ldc]);
                auto c31 = PT::load(&c[(i+3)*ldc + PS]);

                for (size_type k = 0; k < bk; ++k) {
                    const auto b0 = PT::load(&packB_panel[k * NR]);
                    const auto b1 = PT::load(&packB_panel[k * NR + PS]);

                    const auto a0 = PT::set1(a[(i+0)*lda + k]);
                    c00 = PT::fmadd(a0, b0, c00);
                    c01 = PT::fmadd(a0, b1, c01);

                    const auto a1 = PT::set1(a[(i+1)*lda + k]);
                    c10 = PT::fmadd(a1, b0, c10);
                    c11 = PT::fmadd(a1, b1, c11);

                    const auto a2 = PT::set1(a[(i+2)*lda + k]);
                    c20 = PT::fmadd(a2, b0, c20);
                    c21 = PT::fmadd(a2, b1, c21);

                    const auto a3 = PT::set1(a[(i+3)*lda + k]);
                    c30 = PT::fmadd(a3, b0, c30);
                    c31 = PT::fmadd(a3, b1, c31);
                }

                PT::store(&c[(i+0)*ldc], c00);
                PT::store(&c[(i+0)*ldc + PS], c01);
                PT::store(&c[(i+1)*ldc], c10);
                PT::store(&c[(i+1)*ldc + PS], c11);
                PT::store(&c[(i+2)*ldc], c20);
                PT::store(&c[(i+2)*ldc + PS], c21);
                PT::store(&c[(i+3)*ldc], c30);
                PT::store(&c[(i+3)*ldc + PS], c31);
            }
        }

        // レジスタブロック マイクロカーネル (直接 B 版): パッキングなし
        // 小行列でパッキングオーバーヘッドを回避
        static void gemm_micro_kernel_direct(const T* a, const T* b, T* c,
                                              size_type i, size_type j,
                                              size_type k0, size_type bk,
                                              size_type lda, size_type ldb, size_type ldc) {
            if constexpr (has_simd_packet_v) {
                using PT = PacketTraits