fix: MSVC C2026 string limit (#173), OpenCL batch-inv kernels, source graph tooling

- Split embedded OpenCL kernel_source into kernel_parts[] array so no single string literal exceeds MSVC's 65535-byte limit. clCreateProgramWithSource now receives multiple source strings.
- Added batch-inversion kernels (field_inv, affine_add, jac_to_affine) using per-workgroup Montgomery's trick with __local memory.
- OpenCL BIP352 benchmark scaffold and kernel stubs.
- Source graph kit for indexed codebase exploration.
- Assorted doc, benchmark, and audit report updates.
2026-03-19 16:43:55 +00:00 · 2026-03-19 16:43:55 +00:00 · fea2420fe7
commit fea2420fe7
parent cfda151728
33 changed files with 14406 additions and 245 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -78,6 +78,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `coin_address()` CASHADDR dispatch now correctly routes to `coin_address_cashaddr()` --
  Bitcoin Cash addresses generate via CashAddr instead of falling through to Base58Check.
 - All 28 coins now generate addresses correctly (was 27; BCH fixed, Tron added).
+- **ARM64 Android hash dispatch** -- `hash_accel` now routes `sha256_33`, `sha256_32`,
+  `hash160_33`, and `sha256_compress_dispatch` through ARMv8 SHA-256 instructions when
+  building for AArch64 targets with SHA2 support. On RK3588 / Android NDK r27.2 this reduced
+  `ecdsa_sign` from 25.89 us to 22.22 us, `schnorr_sign` (precomputed) from 17.73 us to 16.67 us,
+  and `ct::ecdsa_sign` from 70.50 us to 67.11 us, with verify paths remaining effectively flat.
+- **x86 Schnorr batch verify allocation path** -- `batch_verify.cpp` now reserves the
+  full batch size for the uncached x-only pubkey cache instead of capping capacity at 64.
+  Local i5-14400F reruns reduced uncached `schnorr_batch_verify` from 20.27 us/sig to about
+  19.94-20.06 us/sig at N=128 and from 18.56 us/sig to about 18.01-18.45 us/sig at N=192,
+  with `comprehensive` remaining green.

 ---

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,6 +63,9 @@ set(SECP256K1_MARCH "" CACHE STRING "x86-64 -march override (empty = auto-detect

 # Warning policy: promote warnings to errors (recommended for CI)
 option(SECP256K1_WERROR "Treat compiler warnings as errors (-Werror / /WX)" OFF)
+option(UFSECP_REFRESH_SOURCE_GRAPH "Refresh the repo source graph during builds" ON)  # enabled by default; pass -DUFSECP_REFRESH_SOURCE_GRAPH=OFF to opt out
+
+find_package(Python3 COMPONENTS Interpreter QUIET)  # QUIET and not REQUIRED: Python is optional, Python3_Interpreter_FOUND is tested before use

 # Global compile definitions
 if(SECP256K1_SPEED_FIRST)
@@ -345,6 +348,21 @@ if(SECP256K1_INSTALL)
 endif()

 # -- CPack packaging ---------------------------------------------------------
+set(UFSECP_SOURCE_GRAPH_TOOL "${CMAKE_CURRENT_SOURCE_DIR}/tools/source_graph_kit/source_graph.py")  # source graph script; this block is unrelated to the CPack settings that follow it
+if(UFSECP_REFRESH_SOURCE_GRAPH)  # user-facing switch declared with option()
+    if(Python3_Interpreter_FOUND AND EXISTS "${UFSECP_SOURCE_GRAPH_TOOL}")  # needs both a Python 3 interpreter and the script on disk
+        add_custom_target(ufsecp_source_graph_refresh ALL  # ALL: built as part of the default target; custom targets are always out of date, so this runs on every build
+            COMMAND "${Python3_EXECUTABLE}" "${UFSECP_SOURCE_GRAPH_TOOL}" build -i
+            WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"  # the tool is invoked from the source tree, not the build tree
+            COMMENT "Refreshing UltrafastSecp256k1 source graph incrementally"
+            USES_TERMINAL
+            VERBATIM
+        )
+    else()
+        message(STATUS "secp256k1-fast: source graph refresh disabled (missing Python3 interpreter or source_graph.py)")  # non-fatal: configuration continues without the refresh target
+    endif()
+endif()
+
 set(CPACK_PACKAGE_NAME "UltrafastSecp256k1")
 set(CPACK_PACKAGE_VERSION "${PROJECT_VERSION}")
 set(CPACK_PACKAGE_VENDOR "shrec")
--- a/benchmarks/comparison/bench_unified_full_local_20260317.json
+++ b/benchmarks/comparison/bench_unified_full_local_20260317.json
@@ -0,0 +1,229 @@
+{
+  "metadata": {
+    "cpu": "Intel(R) Core(TM) i5-14400F",
+    "compiler": "GCC 14.2.0",
+    "arch": "x86-64",
+    "timer": "RDTSCP",
+    "tsc_ghz": 2.496,
+    "passes": 11,
+    "warmup": 500,
+    "pool_size": 64
+  },
+  "results": [
+    {"section": "FIELD ARITHMETIC (Ultra)", "name": "field_mul", "ns": 10.78},
+    {"section": "FIELD ARITHMETIC (Ultra)", "name": "field_sqr", "ns": 10.06},
+    {"section": "FIELD ARITHMETIC (Ultra)", "name": "field_inv", "ns": 645.81},
+    {"section": "FIELD ARITHMETIC (Ultra)", "name": "field_add", "ns": 3.92},
+    {"section": "FIELD ARITHMETIC (Ultra)", "name": "field_sub", "ns": 4.18},
+    {"section": "FIELD ARITHMETIC (Ultra)", "name": "field_negate", "ns": 5.66},
+    {"section": "FIELD ARITHMETIC (Ultra)", "name": "field_from_bytes (32B)", "ns": 2.80},
+    {"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_mul", "ns": 19.96},
+    {"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_inv", "ns": 859.65},
+    {"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_add", "ns": 4.14},
+    {"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_negate", "ns": 2.35},
+    {"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_from_bytes (32B)", "ns": 2.56},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "pubkey_create (k*G)", "ns": 4750.01},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "scalar_mul (k*P)", "ns": 19404.73},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "scalar_mul_with_plan", "ns": 16596.88},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "dual_mul (a*G + b*P)", "ns": 18738.36},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "point_add (affine+affine)", "ns": 761.58},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "point_add (J+A mixed)", "ns": 118.54},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "point_dbl", "ns": 67.58},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "normalize (J->affine)", "ns": 2.63},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "batch_normalize /pt (N=64)", "ns": 8.15},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "next_inplace (+=G)", "ns": 132.52},
+    {"section": "POINT ARITHMETIC (Ultra)", "name": "KPlan::from_scalar(w=4)", "ns": 1103.67},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "to_compressed (33B)", "ns": 7.19},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "to_uncompressed (65B)", "ns": 6.98},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "x_only_bytes (32B)", "ns": 3.05},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "x_bytes_and_parity", "ns": 4.15},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "has_even_y", "ns": 1.74},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "batch_to_compressed /pt (N=64)", "ns": 2.03},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "batch_x_only_bytes /pt (N=64)", "ns": 1.71},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "msm /pt (N=128)", "ns": 6130.30},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "pippenger_msm /pt (N=128)", "ns": 6158.43},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "precompute_g_multiples /pt (N=64)", "ns": 248.30},
+    {"section": "POINT SERIALIZATION (Ultra)", "name": "precompute_point_multiples /pt (N=64)", "ns": 240.09},
+    {"section": "ECDSA -- Ultra FAST", "name": "ecdsa_sign", "ns": 6450.90},
+    {"section": "ECDSA -- Ultra FAST", "name": "ecdsa_sign_verified", "ns": 37580.34},
+    {"section": "ECDSA -- Ultra FAST", "name": "ecdsa_verify", "ns": 20846.59},
+    {"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_keypair_create", "ns": 5405.04},
+    {"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_sign", "ns": 5295.75},
+    {"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_sign_verified", "ns": 27132.24},
+    {"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_verify (cached xonly)", "ns": 20279.62},
+    {"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_verify (raw bytes)", "ns": 21640.76},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::from_bytes (32B->scalar)", "ns": 2.56},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::inverse (safegcd)", "ns": 849.50},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::mul", "ns": 19.74},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::negate", "ns": 2.40},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "glv_decompose", "ns": 74.70},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Point::dbl (jac52_double)", "ns": 57.55},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Point::add (J+A mixed)", "ns": 121.40},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "dual_scalar_mul_gen_point", "ns": 19001.51},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::from_4x64_limbs", "ns": 1.39},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::mul (52-bit)", "ns": 15.76},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::sqr (52-bit)", "ns": 13.50},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::inverse_safegcd", "ns": 725.37},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::inverse (Fermat)", "ns": 3828.50},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::add (52-bit)", "ns": 0.53},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::negate (52-bit)", "ns": 0.49},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::normalize", "ns": 3.50},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "SHA256 (BIP0340/challenge)", "ns": 107.37},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "tagged_hash (recompute tag)", "ns": 196.91},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "cached_tagged_hash (midstate)", "ns": 70.00},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "lift_x (4x64 sqrt)", "ns": 5094.45},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "lift_x (FE52 sqrt)", "ns": 3347.34},
+    {"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE::parse_bytes_strict", "ns": 3.41},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=4)", "ns": 78874.28},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "  -> per-sig amortized (N=4)", "ns": 19718.57},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=16)", "ns": 325401.49},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "  -> per-sig amortized (N=16)", "ns": 20337.59},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=64)", "ns": 1329107.06},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "  -> per-sig amortized (N=64)", "ns": 20767.30},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=192)", "ns": 3283487.41},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "  -> per-sig amortized (N=192)", "ns": 17101.50},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(repeated,N=192)", "ns": 2884848.94},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "  -> per-sig repeated (N=192)", "ns": 15025.25},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_seed_only(N=192)", "ns": 16218.78},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_weights_only(N=192)", "ns": 10063.15},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_R_lift_only(N=192)", "ns": 926909.98},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift_only(N=192)", "ns": 951004.08},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_challenge_only(N=192)", "ns": 16512.13},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_xonly_parse_only(N=192)", "ns": 659796.91},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift+challenge_only(N=192)", "ns": 937008.04},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_lift+challenge(N=192)", "ns": 1977220.42},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_setup_only(N=192)", "ns": 2008198.98},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "  -> setup per-sig (N=192)", "ns": 10459.37},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_seed_only(repeated,N=192)", "ns": 14453.32},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_weights_only(repeated,N=192)", "ns": 8768.24},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_R_lift_only(repeated,N=192)", "ns": 1004852.26},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift_only(repeated,N=192)", "ns": 945079.24},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_challenge_only(repeated,N=192)", "ns": 18516.63},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_xonly_parse_only(repeated,N=192)", "ns": 663956.75},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift+challenge_only(repeated,N=192)", "ns": 953751.95},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_lift+challenge(repeated,N=192)", "ns": 1912494.47},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_setup_only(repeated,N=192)", "ns": 1908150.13},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "  -> setup repeated per-sig (N=192)", "ns": 9938.28},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "ecdsa_batch_verify(N=4)", "ns": 76754.13},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "  -> per-sig amortized (N=4)", "ns": 19188.53},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "ecdsa_batch_verify(N=16)", "ns": 304265.14},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "  -> per-sig amortized (N=16)", "ns": 19016.57},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "ecdsa_batch_verify(N=64)", "ns": 1230289.00},
+    {"section": "BATCH VERIFICATION (FAST)", "name": "  -> per-sig amortized (N=64)", "ns": 19223.27},
+    {"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::scalar_inverse (SafeGCD)", "ns": 1351.46},
+    {"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::generator_mul (k*G)", "ns": 9533.74},
+    {"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::scalar_mul (k*P)", "ns": 21251.51},
+    {"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_dbl", "ns": 70.57},
+    {"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_add_complete (11M+6S)", "ns": 203.36},
+    {"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_add_mixed_complete (7M+5S)", "ns": 135.48},
+    {"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_add_mixed_unified (7M+5S)", "ns": 131.32},
+    {"section": "CT SIGNING (Ultra CT)", "name": "ct::ecdsa_sign", "ns": 12761.02},
+    {"section": "CT SIGNING (Ultra CT)", "name": "  CT overhead (ECDSA)", "ratio": 1.9782},
+    {"section": "CT SIGNING (Ultra CT)", "name": "ct::ecdsa_sign_verified", "ns": 43190.57},
+    {"section": "CT SIGNING (Ultra CT)", "name": "ct::schnorr_sign", "ns": 11070.78},
+    {"section": "CT SIGNING (Ultra CT)", "name": "  CT overhead (Schnorr)", "ratio": 2.0905},
+    {"section": "CT SIGNING (Ultra CT)", "name": "ct::schnorr_sign_verified", "ns": 33161.61},
+    {"section": "CT SIGNING (Ultra CT)", "name": "ct::schnorr_keypair_create", "ns": 12088.90},
+    {"section": "CT SIGNING (Ultra CT)", "name": "  CT overhead (keypair)", "ratio": 2.2366},
+    {"section": "ETHEREUM OPERATIONS", "name": "keccak256 (32B)", "ns": 254.19},
+    {"section": "ETHEREUM OPERATIONS", "name": "ethereum_address", "ns": 228.43},
+    {"section": "ETHEREUM OPERATIONS", "name": "eip191_hash", "ns": 225.03},
+    {"section": "ETHEREUM OPERATIONS", "name": "eth_sign_hash", "ns": 6525.31},
+    {"section": "ETHEREUM OPERATIONS", "name": "ecdsa_sign_recoverable", "ns": 6598.01},
+    {"section": "ETHEREUM OPERATIONS", "name": "ecrecover", "ns": 27095.12},
+    {"section": "ETHEREUM OPERATIONS", "name": "eth_personal_sign", "ns": 6787.62},
+    {"section": "ETHEREUM OPERATIONS", "name": "ethereum_address_eip55", "ns": 564.81},
+    {"section": "REAL-WORLD FLOWS", "name": "ecdh_compute (SHA256 shared secret)", "ns": 20215.17},
+    {"section": "REAL-WORLD FLOWS", "name": "ecdh_compute_raw (x-only shared)", "ns": 20134.59},
+    {"section": "REAL-WORLD FLOWS", "name": "taproot_output_key (BIP-341 key path)", "ns": 10438.59},
+    {"section": "REAL-WORLD FLOWS", "name": "taproot_tweak_privkey (BIP-341)", "ns": 11246.91},
+    {"section": "REAL-WORLD FLOWS", "name": "bip32_master_key (64B seed)", "ns": 933.32},
+    {"section": "REAL-WORLD FLOWS", "name": "bip32_coin_derive_key (BTC m/84'/0'/0'/0/0)", "ns": 77986.98},
+    {"section": "REAL-WORLD FLOWS", "name": "coin_address_from_seed (BTC end-to-end)", "ns": 91654.63},
+    {"section": "REAL-WORLD FLOWS", "name": "coin_address_from_seed (ETH end-to-end)", "ns": 91281.62},
+    {"section": "REAL-WORLD FLOWS", "name": "silent_payment_create_output", "ns": 24181.18},
+    {"section": "REAL-WORLD FLOWS", "name": "silent_payment_scan (single output set)", "ns": 34901.09},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "field_mul", "ns": 11.61},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "field_sqr", "ns": 10.51},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "field_inv_var", "ns": 833.17},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "field_add", "ns": 6.57},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "field_negate", "ns": 6.32},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "field_normalize", "ns": 7.41},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "field_from_bytes (set_b32)", "ns": 6.97},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_mul", "ns": 26.42},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_inverse (CT)", "ns": 1421.11},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_inverse_var", "ns": 856.24},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_add", "ns": 5.23},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_negate", "ns": 7.00},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_from_bytes (set_b32)", "ns": 5.01},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "point_dbl (gej_double_var)", "ns": 78.64},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "point_add (gej_add_ge_var)", "ns": 141.13},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "ecmult (a*P + b*G, Strauss)", "ns": 21020.33},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "ecmult_gen (k*G, comb)", "ns": 9723.23},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "generator_mul (ec_pubkey_create)", "ns": 11384.81},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_mul_P (k*P, tweak_mul)", "ns": 20135.59},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "serialize_compressed (33B)", "ns": 17.67},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "serialize_uncompressed (65B)", "ns": 22.52},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "point_add (pubkey_combine)", "ns": 1774.01},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "ecdsa_sign", "ns": 17203.14},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "ecdsa_verify", "ns": 22448.31},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "schnorr_keypair_create", "ns": 11751.95},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "schnorr_sign (BIP-340)", "ns": 13712.35},
+    {"section": "libsecp256k1 (bitcoin-core)", "name": "schnorr_verify (BIP-340)", "ns": 24529.62},
+    {"section": "OpenSSL (ECDSA, secp256k1)", "name": "generator_mul (EC_POINT_mul k*G)", "ns": 213014.57},
+    {"section": "OpenSSL (ECDSA, secp256k1)", "name": "ecdsa_sign (ECDSA_do_sign)", "ns": 222950.90},
+    {"section": "OpenSSL (ECDSA, secp256k1)", "name": "ecdsa_verify (ECDSA_do_verify)", "ns": 214672.40},
+    {"section": "FIELD ARITHMETIC", "name": "mul", "ns": 10.78},
+    {"section": "FIELD ARITHMETIC", "name": "sqr", "ns": 10.06},
+    {"section": "FIELD ARITHMETIC", "name": "inv", "ns": 645.81},
+    {"section": "FIELD ARITHMETIC", "name": "add", "ns": 3.92},
+    {"section": "FIELD ARITHMETIC", "name": "sub", "ns": 4.18},
+    {"section": "FIELD ARITHMETIC", "name": "negate", "ns": 5.66},
+    {"section": "FIELD ARITHMETIC", "name": "normalize (FE52)", "ns": 3.50},
+    {"section": "FIELD ARITHMETIC", "name": "from_bytes (32B)", "ns": 2.80},
+    {"section": "FIELD ARITHMETIC", "name": "FE52 add (hot path)", "ns": 0.53},
+    {"section": "FIELD ARITHMETIC", "name": "FE52 neg (hot path)", "ns": 0.49},
+    {"section": "SCALAR ARITHMETIC", "name": "mul", "ns": 19.96},
+    {"section": "SCALAR ARITHMETIC", "name": "inv (CT)", "ns": 849.50},
+    {"section": "SCALAR ARITHMETIC", "name": "inv (var-time)", "ns": 849.50},
+    {"section": "SCALAR ARITHMETIC", "name": "add", "ns": 4.14},
+    {"section": "SCALAR ARITHMETIC", "name": "negate", "ns": 2.35},
+    {"section": "SCALAR ARITHMETIC", "name": "from_bytes (32B)", "ns": 2.56},
+    {"section": "POINT ARITHMETIC", "name": "dbl (Jacobian)", "ns": 67.58},
+    {"section": "POINT ARITHMETIC", "name": "add (mixed J+A)", "ns": 118.54},
+    {"section": "POINT ARITHMETIC", "name": "ecmult (a*P+b*G)", "ns": 18738.36},
+    {"section": "POINT ARITHMETIC", "name": "ecmult_gen (k*G raw)", "ns": 4750.01},
+    {"section": "POINT ARITHMETIC", "name": "pubkey_create (API)", "ns": 4750.01},
+    {"section": "POINT ARITHMETIC", "name": "scalar_mul (k*P)", "ns": 19404.73},
+    {"section": "POINT ARITHMETIC", "name": "scalar_mul (KPlan)", "ns": 16596.88},
+    {"section": "POINT ARITHMETIC", "name": "point_add (combine)", "ns": 761.58},
+    {"section": "SERIALIZATION", "name": "compressed (33B)", "ns": 7.19},
+    {"section": "SERIALIZATION", "name": "uncompressed (65B)", "ns": 6.98},
+    {"section": "SIGNING (FAST vs libsecp CT)", "name": "ECDSA Sign", "ns": 6450.90},
+    {"section": "SIGNING (FAST vs libsecp CT)", "name": "Schnorr Sign", "ns": 5295.75},
+    {"section": "SIGNING (FAST vs libsecp CT)", "name": "Schnorr Keypair", "ns": 5405.04},
+    {"section": "VERIFICATION", "name": "ECDSA Verify", "ns": 20846.59},
+    {"section": "VERIFICATION", "name": "Schnorr Verify (cached)", "ns": 20279.62},
+    {"section": "VERIFICATION", "name": "Schnorr Verify (raw)", "ns": 21640.76},
+    {"section": "CT-vs-CT (fair signing)", "name": "ECDSA Sign", "ns": 12761.02},
+    {"section": "CT-vs-CT (fair signing)", "name": "Schnorr Sign", "ns": 11070.78},
+    {"section": "CT-vs-CT (fair signing)", "name": "ECDSA Verify", "ns": 20846.59},
+    {"section": "CT-vs-CT (fair signing)", "name": "Schnorr Verify", "ns": 21640.76},
+    {"section": "ETHEREUM / RECOVERY", "name": "sign_recoverable", "ns": 6598.01},
+    {"section": "ETHEREUM / RECOVERY", "name": "ecrecover", "ns": 27095.12},
+    {"section": "ETHEREUM / RECOVERY", "name": "eth_sign_hash", "ns": 6525.31},
+    {"section": "ETHEREUM / RECOVERY", "name": "eth_personal_sign", "ns": 6787.62},
+    {"section": "FAST path (Ultra FAST vs OpenSSL)", "name": "Generator * k", "ratio": 44.8451},
+    {"section": "FAST path (Ultra FAST vs OpenSSL)", "name": "ECDSA Sign", "ratio": 34.5612},
+    {"section": "FAST path (Ultra FAST vs OpenSSL)", "name": "ECDSA Verify", "ratio": 10.2977},
+    {"section": "CT path (Ultra CT vs OpenSSL)", "name": "ECDSA Sign (CT vs CT)", "ratio": 17.4712},
+    {"section": "CT path (Ultra CT vs OpenSSL)", "name": "ECDSA Verify", "ratio": 10.2977},
+    {"section": "ZK Proofs & Commitments", "name": "Pedersen commit", "ns": 30575.55},
+    {"section": "ZK Proofs & Commitments", "name": "Knowledge prove (sigma)", "ns": 20371.57},
+    {"section": "ZK Proofs & Commitments", "name": "Knowledge verify", "ns": 21392.29},
+    {"section": "ZK Proofs & Commitments", "name": "DLEQ prove", "ns": 44028.64},
+    {"section": "ZK Proofs & Commitments", "name": "DLEQ verify", "ns": 57020.44},
+    {"section": "ZK Proofs & Commitments", "name": "Bulletproof range_prove (64b)", "ns": 13055460.41},
+    {"section": "ZK Proofs & Commitments", "name": "Bulletproof range_verify (64b)", "ns": 1259727.10}
+  ]
+}
--- a/benchmarks/comparison/bench_unified_full_local_20260317.txt
+++ b/benchmarks/comparison/bench_unified_full_local_20260317.txt
@@ -0,0 +1,538 @@
+  CPU frequency warmup (3000 ms heavy load)... stable at 2.496 GHz (569198 k*G ops)
+Running integrity check... OK
+
+======================================================================
+  UltrafastSecp256k1 -- Unified Apple-to-Apple Benchmark
+======================================================================
+
+  CPU:       Intel(R) Core(TM) i5-14400F
+  TSC freq:  2.496 GHz
+  Core:      1 (pinned to core 0, priority elevated)
+  Compiler:  GCC 14.2.0
+  Arch:      x86-64
+  Ultra:     UltrafastSecp256k1
+  libsecp:   bitcoin-core libsecp256k1 v0.7.x
+  Harness:   3s CPU ramp-up, 500 warmup/op, 11 passes, IQR outlier removal, median
+  Timer:     RDTSCP
+  Pool:      64 independent key/msg/sig sets
+  NOTE:      Both Ultra and libsecp use IDENTICAL harness
+
+----------------------------------------------+------------+
+| FIELD ARITHMETIC (Ultra)                     |      ns/op |
+----------------------------------------------+------------+
+| field_mul                                    |       10.8 |
+| field_sqr                                    |       10.1 |
+| field_inv                                    |      645.8 |
+| field_add                                    |        3.9 |
+| field_sub                                    |        4.2 |
+| field_negate                                 |        5.7 |
+| field_from_bytes (32B)                       |        2.8 |
+----------------------------------------------+------------+
+
+----------------------------------------------+------------+
+| SCALAR ARITHMETIC (Ultra)                    |      ns/op |
+----------------------------------------------+------------+
+| scalar_mul                                   |       20.0 |
+| scalar_inv                                   |      859.7 |
+| scalar_add                                   |        4.1 |
+| scalar_negate                                |        2.3 |
+| scalar_from_bytes (32B)                      |        2.6 |
+----------------------------------------------+------------+
+
+----------------------------------------------+------------+
+| POINT ARITHMETIC (Ultra)                     |      ns/op |
+----------------------------------------------+------------+
+| pubkey_create (k*G)                          |     4750.0 |
+| scalar_mul (k*P)                             |    19404.7 |
+| scalar_mul_with_plan                         |    16596.9 |
+| dual_mul (a*G + b*P)                         |    18738.4 |
+| point_add (affine+affine)                    |      761.6 |
+| point_add (J+A mixed)                        |      118.5 |
+| point_dbl                                    |       67.6 |
+| normalize (J->affine)                        |        2.6 |
+| batch_normalize /pt (N=64)                   |        8.2 |
+| next_inplace (+=G)                           |      132.5 |
+| KPlan::from_scalar(w=4)                      |     1103.7 |
+----------------------------------------------+------------+
+
+----------------------------------------------+------------+
+| POINT SERIALIZATION (Ultra)                  |      ns/op |
+----------------------------------------------+------------+
+| to_compressed (33B)                          |        7.2 |
+| to_uncompressed (65B)                        |        7.0 |
+| x_only_bytes (32B)                           |        3.1 |
+| x_bytes_and_parity                           |        4.1 |
+| has_even_y                                   |        1.7 |
+| batch_to_compressed /pt (N=64)               |        2.0 |
+| batch_x_only_bytes /pt (N=64)                |        1.7 |
+| msm /pt (N=128)                              |     6130.3 |
+| pippenger_msm /pt (N=128)                    |     6158.4 |
+| precompute_g_multiples /pt (N=64)            |      248.3 |
+| precompute_point_multiples /pt (N=64)        |      240.1 |
+----------------------------------------------+------------+
+
+----------------------------------------------+------------+
+| ECDSA -- Ultra FAST                          |      ns/op |
+----------------------------------------------+------------+
+| ecdsa_sign                                   |     6450.9 |
+| ecdsa_sign_verified                          |    37580.3 |
+| ecdsa_verify                                 |    20846.6 |
+----------------------------------------------+------------+
+
+----------------------------------------------+------------+
+| SCHNORR / BIP-340 -- Ultra FAST              |      ns/op |
+----------------------------------------------+------------+
+| schnorr_keypair_create                       |     5405.0 |
+| schnorr_sign                                 |     5295.8 |
+| schnorr_sign_verified                        |    27132.2 |
+| schnorr_verify (cached xonly)                |    20279.6 |
+| schnorr_verify (raw bytes)                   |    21640.8 |
+----------------------------------------------+------------+
+
+----------------------------------------------+------------+
+| MICRO-DIAGNOSTICS (sub-ops)                  |      ns/op |
+----------------------------------------------+------------+
+| Scalar::from_bytes (32B->scalar)             |        2.6 |
+| Scalar::inverse (safegcd)                    |      849.5 |
+| Scalar::mul                                  |       19.7 |
+| Scalar::negate                               |        2.4 |
+| glv_decompose                                |       74.7 |
+| Point::dbl (jac52_double)                    |       57.6 |
+| Point::add (J+A mixed)                       |      121.4 |
+| dual_scalar_mul_gen_point                    |    19001.5 |
+| FE52::from_4x64_limbs                        |        1.4 |
+| FE52::mul (52-bit)                           |       15.8 |
+| FE52::sqr (52-bit)                           |       13.5 |
+| FE52::inverse_safegcd                        |      725.4 |
+| FE52::inverse (Fermat)                       |     3828.5 |
+|   -> SafeGCD/Fermat speedup                  |     5.28x  |
+| FE52::add (52-bit)                           |        0.5 |
+| FE52::negate (52-bit)                        |        0.5 |
+| FE52::normalize                              |        3.5 |
+| SHA256 (BIP0340/challenge)                   |      107.4 |
+| tagged_hash (recompute tag)                  |      196.9 |
+| cached_tagged_hash (midstate)                |       70.0 |
+|   -> midstate speedup                        |     2.81x  |
+| lift_x (4x64 sqrt)                           |     5094.4 |
+| lift_x (FE52 sqrt)                           |     3347.3 |
+|   -> FE52/4x64 speedup                       |     1.52x  |
+| FE::parse_bytes_strict                       |        3.4 |
+----------------------------------------------+------------+
+
+  ---- VERIFY COST DECOMPOSITION ----
+  ECDSA verify breakdown (estimated):
+    scalar_inv (1x):              849.5 ns
+    scalar_mul (2x):               39.5 ns
+    dual_scalar_mul:            19001.5 ns
+    from_bytes + overhead:          2.6 ns
+    --------------------------------
+    SUM (sub-ops):              19893.0 ns
+    MEASURED ecdsa_verify:      20846.6 ns
+    UNEXPLAINED gap:              953.6 ns  (4.6%)
+
+  Schnorr verify breakdown (estimated):
+    SHA256 challenge:          (included in total)
+    scalar_negate:                  2.4 ns
+    dual_scalar_mul:            19001.5 ns
+    lift_x (sqrt):             (included in total)
+    from_bytes:                     2.6 ns
+    --------------------------------
+    SUM (sub-ops, partial):     19006.5 ns
+    MEASURED schnorr_verify:    20279.6 ns
+    UNEXPLAINED gap:             1273.2 ns  (SHA256+lift_x+Z-check)
+
+  Verify vs libsecp breakdown:
+    Our dual_mul:               19001.5 ns
+    Our scalar_inv:               849.5 ns
+    Our dual+inv:               19851.0 ns
+    Total ECDSA verify:         20846.6 ns
+    Overhead (verify - d+i):      995.6 ns
+
+  ---- SIGN COST DECOMPOSITION (FAST path) ----
+  ecdsa_sign = RFC6979 + k*G + field_inv + scalar_inv + scalar_muls
+    k*G (generator_mul):         4750.0 ns
+    field_inv (R.x):              645.8 ns
+    scalar_inv (k^-1):            849.5 ns
+    scalar_mul (2x):               39.5 ns
+    --------------------------------
+    Core signing (no RFC6979):    6284.8 ns
+    MEASURED ecdsa_sign:          6450.9 ns
+    RFC6979 overhead:              166.1 ns  (2.6%)
+    MEASURED ecdsa_sign_verified:37580.3 ns
+    sign-then-verify overhead:   31129.4 ns  (pubkey + verify)
+
+----------------------------------------------+------------+
+| BATCH VERIFICATION (FAST)                    |      ns/op |
+----------------------------------------------+------------+
+| schnorr_batch_verify(N=4)                    |    78874.3 |
+|   -> per-sig amortized (N=4)                 |    19718.6 |
+|   -> speedup vs individual                   |     1.03x  |
+| schnorr_batch_verify(N=16)                   |   325401.5 |
+|   -> per-sig amortized (N=16)                |    20337.6 |
+|   -> speedup vs individual                   |     1.00x  |
+| schnorr_batch_verify(N=64)                   |  1329107.1 |
+|   -> per-sig amortized (N=64)                |    20767.3 |
+|   -> speedup vs individual                   |     0.98x  |
+| schnorr_batch_verify(N=192)                  |  3283487.4 |
+|   -> per-sig amortized (N=192)               |    17101.5 |
+|   -> speedup vs individual                   |     1.19x  |
+| schnorr_batch_verify(repeated,N=192)         |  2884848.9 |
+|   -> per-sig repeated (N=192)                |    15025.3 |
+|   -> repeated speedup vs individual          |     1.35x  |
+| schnorr_batch_seed_only(N=192)               |    16218.8 |
+| schnorr_batch_weights_only(N=192)            |    10063.2 |
+| schnorr_batch_R_lift_only(N=192)             |   926910.0 |
+| schnorr_batch_P_lift_only(N=192)             |   951004.1 |
+| schnorr_batch_challenge_only(N=192)          |    16512.1 |
+| schnorr_batch_xonly_parse_only(N=192)        |   659796.9 |
+| schnorr_batch_P_lift+challenge_only(N=192)   |   937008.0 |
+| schnorr_batch_lift+challenge(N=192)          |  1977220.4 |
+| schnorr_batch_setup_only(N=192)              |  2008199.0 |
+|   -> setup per-sig (N=192)                   |    10459.4 |
+|   -> setup share of full (N=192)             |    60.34% |
+| schnorr_batch_seed_only(repeated,N=192)      |    14453.3 |
+| schnorr_batch_weights_only(repeated,N=192)   |     8768.2 |
+| schnorr_batch_R_lift_only(repeated,N=192)    |  1004852.3 |
+| schnorr_batch_P_lift_only(repeated,N=192)    |   945079.2 |
+| schnorr_batch_challenge_only(repeated,N=192) |    18516.6 |
+| schnorr_batch_xonly_parse_only(repeated,N=192) |   663956.8 |
+| schnorr_batch_P_lift+challenge_only(repeated,N=192) |   953751.9 |
+| schnorr_batch_lift+challenge(repeated,N=192) |  1912494.5 |
+| schnorr_batch_setup_only(repeated,N=192)     |  1908150.1 |
+|   -> setup repeated per-sig (N=192)          |     9938.3 |
+|   -> setup share repeated (N=192)            |    65.68% |
+|                                              |            |
+| ecdsa_batch_verify(N=4)                      |    76754.1 |
+|   -> per-sig amortized (N=4)                 |    19188.5 |
+|   -> speedup vs individual                   |     1.09x  |
+| ecdsa_batch_verify(N=16)                     |   304265.1 |
+|   -> per-sig amortized (N=16)                |    19016.6 |
+|   -> speedup vs individual                   |     1.10x  |
+| ecdsa_batch_verify(N=64)                     |  1230289.0 |
+|   -> per-sig amortized (N=64)                |    19223.3 |
+|   -> speedup vs individual                   |     1.08x  |
+----------------------------------------------+------------+
+
+----------------------------------------------+------------+
+| CT POINT ARITHMETIC (sub-ops)                |      ns/op |
+----------------------------------------------+------------+
+| ct::scalar_inverse (SafeGCD)                 |     1351.5 |
+| ct::generator_mul (k*G)                      |     9533.7 |
+| ct::scalar_mul (k*P)                         |    21251.5 |
+| ct::point_dbl                                |       70.6 |
+| ct::point_add_complete (11M+6S)              |      203.4 |
+| ct::point_add_mixed_complete (7M+5S)         |      135.5 |
+| ct::point_add_mixed_unified (7M+5S)          |      131.3 |
+----------------------------------------------+------------+
+
+  ---- CT vs FAST point ops ----
+  FAST Point::dbl                          57.6 ns
+  FAST Point::add                         121.4 ns
+  FAST pubkey_create (k*G)               4750.0 ns
+  FAST scalar_mul (k*P)                 19404.7 ns
+  CT   generator_mul (k*G)               9533.7 ns
+  CT   scalar_mul (k*P)                 21251.5 ns
+  CT/FAST ratio (k*G):  2.01x overhead
+  CT/FAST ratio (k*P):  1.10x overhead
+
+----------------------------------------------+------------+
+| CT SIGNING (Ultra CT)                        |      ns/op |
+----------------------------------------------+------------+
+| ct::ecdsa_sign                               |    12761.0 |
+|   CT overhead (ECDSA)                        |      1.98x |
+| ct::ecdsa_sign_verified                      |    43190.6 |
+| ct::schnorr_sign                             |    11070.8 |
+|   CT overhead (Schnorr)                      |      2.09x |
+| ct::schnorr_sign_verified                    |    33161.6 |
+| ct::schnorr_keypair_create                   |    12088.9 |
+|   CT overhead (keypair)                      |      2.24x |
+----------------------------------------------+------------+
+
+  ---- CT ECDSA SIGN DECOMPOSITION ----
+    ct::generator_mul (R=k*G):   9533.7 ns
+    ct::scalar_inverse (k^-1):   1351.5 ns
+    field_inv (R.x affine):       645.8 ns
+    scalar_mul (2x):               39.5 ns
+    --------------------------------
+    SUM (sub-ops):              11570.5 ns
+    MEASURED ct::ecdsa_sign:    12761.0 ns
+    UNEXPLAINED gap:             1190.5 ns  (9.3%, RFC6979+checks)
+
+  ---- CT SCHNORR SIGN DECOMPOSITION ----
+    ct::generator_mul (R=k*G):   9533.7 ns
+    SHA256 (tag+nonce+msg):    (included in total)
+    scalar_mul + negate:           22.1 ns
+    --------------------------------
+    SUM (sub-ops, partial):      9555.9 ns
+    MEASURED ct::schnorr_sign:  11070.8 ns
+    UNEXPLAINED gap:             1514.9 ns  (SHA256+aux+serialize)
+
+  ---- CT vs libsecp (true apples-to-apples) ----
+  CT   ecdsa_sign                       12761.0 ns
+  lib  ecdsa_sign                      (measured after libsecp section)
+  CT   schnorr_sign                     11070.8 ns
+  lib  schnorr_sign                    (measured after libsecp section)
+
+----------------------------------------------+------------+
+| ETHEREUM OPERATIONS                          |      ns/op |
+----------------------------------------------+------------+
+| keccak256 (32B)                              |      254.2 |
+| ethereum_address                             |      228.4 |
+| eip191_hash                                  |      225.0 |
+| eth_sign_hash                                |     6525.3 |
+| ecdsa_sign_recoverable                       |     6598.0 |
+| ecrecover                                    |    27095.1 |
+| eth_personal_sign                            |     6787.6 |
+| ethereum_address_eip55                       |      564.8 |
+----------------------------------------------+------------+
+
+----------------------------------------------+------------+
+| REAL-WORLD FLOWS                             |      ns/op |
+----------------------------------------------+------------+
+| ecdh_compute (SHA256 shared secret)          |    20215.2 |
+| ecdh_compute_raw (x-only shared)             |    20134.6 |
+| taproot_output_key (BIP-341 key path)        |    10438.6 |
+| taproot_tweak_privkey (BIP-341)              |    11246.9 |
+| bip32_master_key (64B seed)                  |      933.3 |
+| bip32_coin_derive_key (BTC m/84'/0'/0'/0/0)  |    77987.0 |
+| coin_address_from_seed (BTC end-to-end)      |    91654.6 |
+| coin_address_from_seed (ETH end-to-end)      |    91281.6 |
+| silent_payment_create_output                 |    24181.2 |
+| silent_payment_scan (single output set)      |    34901.1 |
+----------------------------------------------+------------+
+
+Running libsecp256k1 benchmark (same harness: RDTSCP, 3s ramp-up, 500 warmup, 11 passes, IQR)...
+----------------------------------------------+------------+
+| libsecp256k1 (bitcoin-core)                  |      ns/op |
+----------------------------------------------+------------+
+| field_mul                                    |       11.6 |
+| field_sqr                                    |       10.5 |
+| field_inv_var                                |      833.2 |
+| field_add                                    |        6.6 |
+| field_negate                                 |        6.3 |
+| field_normalize                              |        7.4 |
+| field_from_bytes (set_b32)                   |        7.0 |
+| scalar_mul                                   |       26.4 |
+| scalar_inverse (CT)                          |     1421.1 |
+| scalar_inverse_var                           |      856.2 |
+| scalar_add                                   |        5.2 |
+| scalar_negate                                |        7.0 |
+| scalar_from_bytes (set_b32)                  |        5.0 |
+| point_dbl (gej_double_var)                   |       78.6 |
+| point_add (gej_add_ge_var)                   |      141.1 |
+| ecmult (a*P + b*G, Strauss)                  |    21020.3 |
+| ecmult_gen (k*G, comb)                       |     9723.2 |
+| generator_mul (ec_pubkey_create)             |    11384.8 |
+| scalar_mul_P (k*P, tweak_mul)                |    20135.6 |
+| serialize_compressed (33B)                   |       17.7 |
+| serialize_uncompressed (65B)                 |       22.5 |
+| point_add (pubkey_combine)                   |     1774.0 |
+| ecdsa_sign                                   |    17203.1 |
+| ecdsa_verify                                 |    22448.3 |
+| schnorr_keypair_create                       |    11751.9 |
+| schnorr_sign (BIP-340)                       |    13712.3 |
+| schnorr_verify (BIP-340)                     |    24529.6 |
+----------------------------------------------+------------+
+
+Running OpenSSL benchmark (OpenSSL 3.0.13 30 Jan 2024, same harness)...
+----------------------------------------------+------------+
+| OpenSSL (ECDSA, secp256k1)                   |      ns/op |
+----------------------------------------------+------------+
+| generator_mul (EC_POINT_mul k*G)             |   213014.6 |
+| ecdsa_sign (ECDSA_do_sign)                   |   222950.9 |
+| ecdsa_verify (ECDSA_do_verify)               |   214672.4 |
+----------------------------------------------+------------+
+  (OpenSSL has no BIP-340 Schnorr -- ECDSA-only comparison)
+
+======================================================================
+  HEAD-TO-HEAD: UltrafastSecp256k1 vs libsecp256k1
+  (ratio > 1.0 = Ultra wins, < 1.0 = libsecp wins)
+======================================================================
+
+------------------------------------+----------+----------+-----------+
+| FIELD ARITHMETIC                   | Ultra ns |  libsecp |     ratio |
+------------------------------------+----------+----------+-----------+
+| mul                                |     10.8 |     11.6 |     1.08x |
+| sqr                                |     10.1 |     10.5 |     1.04x |
+| inv                                |    645.8 |    833.2 |     1.29x |
+| add                                |      3.9 |      6.6 |     1.67x |
+| sub                                |      4.2 |      --- |       --- |
+| negate                             |      5.7 |      6.3 |     1.12x |
+| normalize (FE52)                   |      3.5 |      7.4 |     2.12x |
+| from_bytes (32B)                   |      2.8 |      7.0 |     2.49x |
+| FE52 add (hot path)                |      0.5 |      6.6 |    12.48x |
+| FE52 neg (hot path)                |      0.5 |      6.3 |    12.93x |
+------------------------------------+----------+----------+-----------+
+
+------------------------------------+----------+----------+-----------+
+| SCALAR ARITHMETIC                  | Ultra ns |  libsecp |     ratio |
+------------------------------------+----------+----------+-----------+
+| mul                                |     20.0 |     26.4 |     1.32x |
+| inv (CT)                           |    849.5 |   1421.1 |     1.67x |
+| inv (var-time)                     |    849.5 |    856.2 |     1.01x |
+| add                                |      4.1 |      5.2 |     1.26x |
+| negate                             |      2.3 |      7.0 |     2.98x |
+| from_bytes (32B)                   |      2.6 |      5.0 |     1.96x |
+------------------------------------+----------+----------+-----------+
+
+------------------------------------+----------+----------+-----------+
+| POINT ARITHMETIC                   | Ultra ns |  libsecp |     ratio |
+------------------------------------+----------+----------+-----------+
+| dbl (Jacobian)                     |     67.6 |     78.6 |     1.16x |
+| add (mixed J+A)                    |    118.5 |    141.1 |     1.19x |
+| ecmult (a*P+b*G)                   |  18738.4 |  21020.3 |     1.12x |
+| ecmult_gen (k*G raw)               |   4750.0 |   9723.2 |     2.05x |
+| pubkey_create (API)                |   4750.0 |  11384.8 |     2.40x |
+| scalar_mul (k*P)                   |  19404.7 |  20135.6 |     1.04x |
+| scalar_mul (KPlan)                 |  16596.9 |  20135.6 |     1.21x |
+| point_add (combine)                |    761.6 |   1774.0 |     2.33x |
+------------------------------------+----------+----------+-----------+
+
+------------------------------------+----------+----------+-----------+
+| SERIALIZATION                      | Ultra ns |  libsecp |     ratio |
+------------------------------------+----------+----------+-----------+
+| compressed (33B)                   |      7.2 |     17.7 |     2.46x |
+| uncompressed (65B)                 |      7.0 |     22.5 |     3.23x |
+------------------------------------+----------+----------+-----------+
+
+------------------------------------+----------+----------+-----------+
+| SIGNING (FAST vs libsecp CT)       | Ultra ns |  libsecp |     ratio |
+------------------------------------+----------+----------+-----------+
+| ECDSA Sign                         |   6450.9 |  17203.1 |     2.67x |
+| Schnorr Sign                       |   5295.8 |  13712.3 |     2.59x |
+| Schnorr Keypair                    |   5405.0 |  11751.9 |     2.17x |
+------------------------------------+----------+----------+-----------+
+
+------------------------------------+----------+----------+-----------+
+| VERIFICATION                       | Ultra ns |  libsecp |     ratio |
+------------------------------------+----------+----------+-----------+
+| ECDSA Verify                       |  20846.6 |  22448.3 |     1.08x |
+| Schnorr Verify (cached)            |  20279.6 |  24529.6 |     1.21x |
+| Schnorr Verify (raw)               |  21640.8 |  24529.6 |     1.13x |
+------------------------------------+----------+----------+-----------+
+
+------------------------------------+----------+----------+-----------+
+| CT-vs-CT (fair signing)            | Ultra ns |  libsecp |     ratio |
+------------------------------------+----------+----------+-----------+
+| ECDSA Sign                         |  12761.0 |  17203.1 |     1.35x |
+| Schnorr Sign                       |  11070.8 |  13712.3 |     1.24x |
+| ECDSA Verify                       |  20846.6 |  22448.3 |     1.08x |
+| Schnorr Verify                     |  21640.8 |  24529.6 |     1.13x |
+------------------------------------+----------+----------+-----------+
+
+------------------------------------+----------+----------+-----------+
+| ETHEREUM / RECOVERY                | Ultra ns |  libsecp |     ratio |
+------------------------------------+----------+----------+-----------+
+| sign_recoverable                   |   6598.0 |  15920.0 |     2.41x |
+| ecrecover                          |  27095.1 |  26314.1 |     0.97x |
+| eth_sign_hash                      |   6525.3 |  15920.0 |     2.44x |
+| eth_personal_sign                  |   6787.6 |  15920.0 |     2.35x |
+------------------------------------+----------+----------+-----------+
+
+======================================================================
+  APPLE-TO-APPLE: UltrafastSecp256k1 / OpenSSL
+  (ratio > 1.0 = Ultra wins, < 1.0 = OpenSSL wins)
+======================================================================
+
+----------------------------------------------+------------+
+| FAST path (Ultra FAST vs OpenSSL)            |      ratio |
+----------------------------------------------+------------+
+| Generator * k                                |     44.85x |
+| ECDSA Sign                                   |     34.56x |
+| ECDSA Verify                                 |     10.30x |
+----------------------------------------------+------------+
+
+----------------------------------------------+------------+
+| CT path (Ultra CT vs OpenSSL)                |      ratio |
+----------------------------------------------+------------+
+| ECDSA Sign (CT vs CT)                        |     17.47x |
+| ECDSA Verify                                 |     10.30x |
+----------------------------------------------+------------+
+
+----------------------------------------------+------------+
+| ZK Proofs & Commitments                      |      ns/op |
+----------------------------------------------+------------+
+| Pedersen commit                              |    30575.5 |
+| Knowledge prove (sigma)                      |    20371.6 |
+| Knowledge verify                             |    21392.3 |
+| DLEQ prove                                   |    44028.6 |
+| DLEQ verify                                  |    57020.4 |
+| Bulletproof range_prove (64b)                | 13055460.4 |
+| Bulletproof range_verify (64b)               |  1259727.1 |
+----------------------------------------------+------------+
+
+======================================================================
+  THROUGHPUT SUMMARY (1 core, pinned)
+======================================================================
+
+  --- Ultra FAST ---
+  ECDSA sign                                 6.45 us  ->     155.0 k op/s
+  ECDSA verify                              20.85 us  ->      48.0 k op/s
+  Schnorr sign                               5.30 us  ->     188.8 k op/s
+  Schnorr verify (cached)                   20.28 us  ->      49.3 k op/s
+  Schnorr verify (raw)                      21.64 us  ->      46.2 k op/s
+  pubkey_create (k*G)                        4.75 us  ->     210.5 k op/s
+  ECDH                                      20.22 us  ->      49.5 k op/s
+  Taproot output key                        10.44 us  ->      95.8 k op/s
+  BIP32 derive (BTC)                        77.99 us  ->      12.8 k op/s
+  Silent Payment sender                     24.18 us  ->      41.4 k op/s
+  Silent Payment scan                       34.90 us  ->      28.7 k op/s
+
+  --- Ultra CT ---
+  CT ECDSA sign                             12.76 us  ->      78.4 k op/s
+  CT Schnorr sign                           11.07 us  ->      90.3 k op/s
+
+  --- Ultra ZK ---
+  Pedersen commit                           30.58 us  ->      32.7 k op/s
+  Knowledge prove                           20.37 us  ->      49.1 k op/s
+  Knowledge verify                          21.39 us  ->      46.7 k op/s
+  DLEQ prove                                44.03 us  ->      22.7 k op/s
+  DLEQ verify                               57.02 us  ->      17.5 k op/s
+  Bulletproof range_prove                13055.46 us  ->        77   op/s
+  Bulletproof range_verify                1259.73 us  ->       794   op/s
+
+  --- libsecp256k1 ---
+  field_mul                                  0.01 us  ->     86.16 M op/s
+  field_sqr                                  0.01 us  ->     95.12 M op/s
+  field_inv_var                              0.83 us  ->      1.20 M op/s
+  scalar_mul                                 0.03 us  ->     37.85 M op/s
+  scalar_inverse (CT)                        1.42 us  ->     703.7 k op/s
+  scalar_inverse_var                         0.86 us  ->      1.17 M op/s
+  point_dbl                                  0.08 us  ->     12.72 M op/s
+  point_add (mixed)                          0.14 us  ->      7.09 M op/s
+  ecmult (a*P+b*G)                          21.02 us  ->      47.6 k op/s
+  ecmult_gen (k*G raw)                       9.72 us  ->     102.8 k op/s
+  generator_mul (API)                       11.38 us  ->      87.8 k op/s
+  scalar_mul_P (k*P)                        20.14 us  ->      49.7 k op/s
+  ECDSA sign                                17.20 us  ->      58.1 k op/s
+  ECDSA verify                              22.45 us  ->      44.5 k op/s
+  Schnorr sign                              13.71 us  ->      72.9 k op/s
+  Schnorr verify                            24.53 us  ->      40.8 k op/s
+
+  --- OpenSSL ---
+  ECDSA sign                               222.95 us  ->       4.5 k op/s
+  ECDSA verify                             214.67 us  ->       4.7 k op/s
+  generator_mul (k*G)                      213.01 us  ->       4.7 k op/s
+
+======================================================================
+  BITCOIN BLOCK VALIDATION ESTIMATES (1 core)
+======================================================================
+
+  Pre-Taproot block (~3000 ECDSA verify):
+    Wall time:     62.5 ms
+    Blocks/sec:    16.0
+
+  Taproot block (~2000 Schnorr + ~1000 ECDSA):
+    Wall time:     64.1 ms
+    Blocks/sec:    15.6
+
+  TX throughput (1 core):
+    ECDSA:       47969 tx/sec
+    Schnorr:     46209 tx/sec
+
+======================================================================
+  Intel(R) Core(TM) i5-14400F | 1 core pinned | GCC 14.2.0
+  UltrafastSecp256k1 vs libsecp256k1 vs OpenSSL -- Unified Benchmark
+======================================================================
+
+  JSON report written to: /tmp/bench_today.json
--- a/cpu/include/secp256k1/hash_accel.hpp
+++ b/cpu/include/secp256k1/hash_accel.hpp
@ -9,10 +9,11 @@
 // ## Three tiers of acceleration (runtime-detected):
 //
 //   Tier 0: SCALAR   -- Portable C++ (baseline, always available)
-//   Tier 1: SHA-NI   -- Intel SHA Extensions (single-message HW accel, ~3-5x)
-//   Tier 2: AVX2     -- 4-way multi-buffer SHA-256 (interleaved, ~8-12x)
+//   Tier 1: ARM SHA2 -- ARMv8 SHA-256 instructions (single-message HW accel)
+//   Tier 2: SHA-NI   -- Intel SHA Extensions (single-message HW accel, ~3-5x)
+//   Tier 3: AVX2     -- 4-way multi-buffer SHA-256 (interleaved, ~8-12x)
 //                       + optimized RIPEMD-160 with BMI/BMI2
-//   Tier 3: AVX-512  -- 8-way multi-buffer SHA-256 (if available, ~16x)
+//   Tier 4: AVX-512  -- 8-way multi-buffer SHA-256 (if available, ~16x)
 //
 // ## Hot-path API for search pipeline:
 //
@ -48,9 +49,10 @@ namespace secp256k1::hash {

 enum class HashTier : int {
    SCALAR  = 0,
-    SHA_NI  = 1,  // Intel SHA Extensions
-    AVX2    = 2,  // 4-way multi-buffer
-    AVX512  = 3,  // 8-way multi-buffer
+    ARM_SHA2 = 1, // ARMv8 SHA-256 instructions
+    SHA_NI  = 2,  // Intel SHA Extensions
+    AVX2    = 3,  // 4-way multi-buffer
+    AVX512  = 4,  // 8-way multi-buffer
 };

 /// Detect best available hashing tier at runtime.
--- a/cpu/src/batch_verify.cpp
+++ b/cpu/src/batch_verify.cpp
@ -227,7 +227,7 @@ std::vector<std::size_t> schnorr_batch_identify_invalid_impl(

 bool schnorr_batch_verify(const SchnorrBatchEntry* entries, std::size_t n) {
    std::vector<SchnorrXonlyPubkey> pubkey_cache;
-    pubkey_cache.reserve((n < 64) ? n : 64);
+    pubkey_cache.reserve(n);

    auto verify_one = [](const SchnorrBatchEntry& entry) {
        return schnorr_verify(entry.pubkey_x, entry.message, entry.signature);
--- a/cpu/src/field.cpp
+++ b/cpu/src/field.cpp
@ -1016,10 +1016,15 @@ limbs4 mul_impl(const limbs4& a, const limbs4& b) {
    arm64::field_mul_arm64(out.data(), a.data(), b.data());
    return out;
 #elif defined(SECP256K1_HAS_ASM) && (defined(__x86_64__) || defined(_M_X64))
-    // x86-64: Direct assembly call -- zero-copy, no FieldElement wrapper overhead
-    limbs4 out;
-    field_mul_full_asm(a.data(), b.data(), out.data());
-    return out;
+    // x86-64: Runtime dispatch — assembly requires BMI2+ADX (mulx/adcx/adox).
+    // Fall back to portable path on CPUs that lack these extensions (e.g. Jasper Lake).
+    static bool const asm_available = has_bmi2_support() && has_adx_support();
+    if (asm_available) {
+        limbs4 out;
+        field_mul_full_asm(a.data(), b.data(), out.data());
+        return out;
+    }
+    return reduce(mul_wide(a, b));
 #elif defined(SECP256K1_NO_ASM)
    // Generic no-asm fallback
    auto result = reduce(mul_wide(a, b));
@ -1055,10 +1060,15 @@ limbs4 square_impl(const limbs4& a) {
    arm64::field_sqr_arm64(out.data(), a.data());
    return out;
 #elif defined(SECP256K1_HAS_ASM) && (defined(__x86_64__) || defined(_M_X64))
-    // x86-64: Direct assembly call -- zero-copy, no FieldElement wrapper overhead
-    limbs4 out;
-    field_sqr_full_asm(a.data(), out.data());
-    return out;
+    // x86-64: Runtime dispatch — assembly requires BMI2+ADX (mulx/adcx/adox).
+    // Fall back to portable path on CPUs that lack these extensions (e.g. Jasper Lake).
+    static bool const asm_available = has_bmi2_support() && has_adx_support();
+    if (asm_available) {
+        limbs4 out;
+        field_sqr_full_asm(a.data(), out.data());
+        return out;
+    }
+    return reduce(mul_wide(a, a));
 #elif defined(SECP256K1_NO_ASM)
    // Generic no-asm fallback
    return reduce(mul_wide(a, a));
--- a/cpu/src/hash_accel.cpp
+++ b/cpu/src/hash_accel.cpp
@ -26,6 +26,11 @@
 #include <cstring>

 // Architecture detection
+#if defined(__aarch64__) || defined(_M_ARM64)
+    #define SECP256K1_ARM64_TARGET 1
+    #include <arm_neon.h>
+#endif
+
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
    #define SECP256K1_X86_TARGET 1
    #ifdef _MSC_VER
@ -105,10 +110,19 @@ bool avx512_available() noexcept {
 #endif
 }

+// Reports whether the ARMv8 SHA-256 code path was compiled into this build.
+// This is a compile-time decision: it is true only when targeting AArch64
+// (SECP256K1_ARM64_TARGET) with the compiler advertising __ARM_FEATURE_SHA2;
+// no CPU feature probing is performed here.
+bool arm_sha2_available() noexcept {
+#if !defined(SECP256K1_ARM64_TARGET) || !defined(__ARM_FEATURE_SHA2)
+    return false;
+#else
+    return true;
+#endif
+}
+
 HashTier detect_hash_tier() noexcept {
    // SHA-NI usually coexists with AVX2 on modern CPUs (Zen, Ice Lake+)
    // SHA-NI single-message is often faster than multi-buffer AVX2 for
    // sequential work. For batch, AVX2 multi-buffer wins.
+    if (arm_sha2_available()) return HashTier::ARM_SHA2;
    if (sha_ni_available()) return HashTier::SHA_NI;
    if (avx2_available())   return HashTier::AVX2;
    return HashTier::SCALAR;
@ -116,6 +130,7 @@ HashTier detect_hash_tier() noexcept {

 const char* hash_tier_name(HashTier tier) noexcept {
    switch (tier) {
+        case HashTier::ARM_SHA2: return "ARM SHA2";
        case HashTier::SHA_NI:  return "SHA-NI";
        case HashTier::AVX2:    return "AVX2";
        case HashTier::AVX512:  return "AVX-512";
@ -392,6 +407,90 @@ void hash160_33(const std::uint8_t* pubkey33, std::uint8_t* out20) noexcept {

 } // namespace scalar

+// ============================================================================
+// ARMv8 SHA2 -- Hardware-accelerated SHA-256
+// ============================================================================
+
+#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
+
+namespace armsha {
+
+// Runs one SHA-256 compression over a single 64-byte block using the ARMv8
+// SHA-256 instructions, updating `state` in place.
+//
+//   block  -- 64 input bytes; words are read with load_be32.
+//   state  -- 8 chaining words; state[0..3] and state[4..7] are loaded as the
+//             two 128-bit halves and written back after the feed-forward add.
+//
+// The message schedule is expanded with scalar code; only the 64 rounds use
+// the hardware instructions (4 rounds per loop iteration).
+void sha256_compress(const std::uint8_t block[64], std::uint32_t state[8]) noexcept {
+    std::uint32_t w[64];
+
+    // First 16 schedule words come straight from the block.
+    for (int i = 0; i < 16; ++i) {
+        w[i] = load_be32(block + static_cast<std::size_t>(i) * 4);
+    }
+    // Remaining 48 words: standard sigma0/sigma1 expansion.
+    for (int i = 16; i < 64; ++i) {
+        std::uint32_t const s0 = rotr32(w[i - 15], 7) ^ rotr32(w[i - 15], 18) ^ (w[i - 15] >> 3);
+        std::uint32_t const s1 = rotr32(w[i - 2], 17) ^ rotr32(w[i - 2], 19) ^ (w[i - 2] >> 10);
+        w[i] = w[i - 16] + s0 + w[i - 7] + s1;
+    }
+
+    uint32x4_t abcd = vld1q_u32(state + 0);
+    uint32x4_t efgh = vld1q_u32(state + 4);
+    uint32x4_t const abcd_save = abcd;
+    uint32x4_t const efgh_save = efgh;
+
+    for (int i = 0; i < 64; i += 4) {
+        uint32x4_t const msg = vld1q_u32(w + i);
+        uint32x4_t const k = vld1q_u32(SHA256_K + i);
+        uint32x4_t const wk = vaddq_u32(msg, k);
+        // SHA256H and SHA256H2 each compute the same four rounds and return
+        // one half of the result, so both must start from the SAME pre-round
+        // abcd/efgh pair. Snapshot abcd before it is overwritten; feeding the
+        // already-updated abcd into vsha256h2q_u32 yields a wrong digest.
+        uint32x4_t const abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd, efgh, wk);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, wk);
+    }
+
+    // Feed-forward: add the incoming chaining value.
+    abcd = vaddq_u32(abcd, abcd_save);
+    efgh = vaddq_u32(efgh, efgh_save);
+
+    vst1q_u32(state + 0, abcd);
+    vst1q_u32(state + 4, efgh);
+}
+
+// SHA-256 of exactly 33 input bytes, done as a single padded block.
+//
+//   pubkey33 -- 33 bytes to hash.
+//   out32    -- receives the 32-byte digest (eight words via store_be32).
+void sha256_33(const std::uint8_t* pubkey33, std::uint8_t* out32) noexcept {
+    alignas(16) std::uint8_t block[64];
+
+    // Layout: 33 message bytes | 0x80 | zeros | 64-bit big-endian bit length.
+    std::memcpy(block, pubkey33, 33);
+    block[33] = 0x80;
+    std::memset(block + 34, 0, 30);   // clears bytes 34..63 in one go
+    block[62] = 0x01;                 // 33 * 8 = 264 = 0x0108 bits
+    block[63] = 0x08;
+
+    std::uint32_t digest[8];
+    std::memcpy(digest, SHA256_IV, sizeof(digest));
+    sha256_compress(block, digest);
+
+    for (std::size_t word = 0; word < 8; ++word) {
+        store_be32(out32 + word * 4, digest[word]);
+    }
+}
+
+// SHA-256 of exactly 32 input bytes, done as a single padded block.
+//
+//   in32  -- 32 bytes to hash.
+//   out32 -- receives the 32-byte digest (eight words via store_be32).
+void sha256_32(const std::uint8_t* in32, std::uint8_t* out32) noexcept {
+    alignas(16) std::uint8_t block[64];
+
+    // Layout: 32 message bytes | 0x80 | zeros | 64-bit big-endian bit length.
+    std::memcpy(block, in32, 32);
+    block[32] = 0x80;
+    std::memset(block + 33, 0, 31);   // clears bytes 33..63 in one go
+    block[62] = 0x01;                 // 32 * 8 = 256 = 0x0100 bits
+
+    std::uint32_t digest[8];
+    std::memcpy(digest, SHA256_IV, sizeof(digest));
+    sha256_compress(block, digest);
+
+    for (std::size_t word = 0; word < 8; ++word) {
+        store_be32(out32 + word * 4, digest[word]);
+    }
+}
+
+// Two-stage hash of a 33-byte input: the ARM-accelerated sha256_33 first,
+// then scalar::ripemd160_32 over the 32-byte intermediate.
+//
+//   pubkey33 -- 33 input bytes.
+//   out20    -- output buffer handed to scalar::ripemd160_32.
+void hash160_33(const std::uint8_t* pubkey33, std::uint8_t* out20) noexcept {
+    std::uint8_t intermediate[32];
+    sha256_33(pubkey33, intermediate);
+    scalar::ripemd160_32(intermediate, out20);
+}
+
+} // namespace armsha
+
+#endif // SECP256K1_ARM64_TARGET && __ARM_FEATURE_SHA2
+
 // ============================================================================
 // SHA-NI (Intel SHA Extensions) -- Hardware-accelerated SHA-256
 // ============================================================================
@ -616,6 +715,12 @@ std::array<std::uint8_t, 32> sha256(const void* data, std::size_t len) noexcept
 }

 void sha256_33(const std::uint8_t* pubkey33, std::uint8_t* out32) noexcept {
+#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
+    if (arm_sha2_available()) {
+        armsha::sha256_33(pubkey33, out32);
+        return;
+    }
+#endif
 #ifdef SECP256K1_X86_TARGET
    if (sha_ni_available()) {
        shani::sha256_33(pubkey33, out32);
@ -626,6 +731,12 @@ void sha256_33(const std::uint8_t* pubkey33, std::uint8_t* out32) noexcept {
 }

 void sha256_32(const std::uint8_t* in32, std::uint8_t* out32) noexcept {
+#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
+    if (arm_sha2_available()) {
+        armsha::sha256_32(in32, out32);
+        return;
+    }
+#endif
 #ifdef SECP256K1_X86_TARGET
    if (sha_ni_available()) {
        shani::sha256_32(in32, out32);
@ -714,6 +825,12 @@ std::array<std::uint8_t, 20> hash160(const void* data, std::size_t len) noexcept
 }

 void hash160_33(const std::uint8_t* pubkey33, std::uint8_t* out20) noexcept {
+#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
+    if (arm_sha2_available()) {
+        armsha::hash160_33(pubkey33, out20);
+        return;
+    }
+#endif
 #ifdef SECP256K1_X86_TARGET
    if (sha_ni_available()) {
        shani::hash160_33(pubkey33, out20);
@ -775,6 +892,12 @@ namespace secp256k1::detail {

 void sha256_compress_dispatch(const std::uint8_t block[64],
                              std::uint32_t state[8]) noexcept {
+#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
+    if (secp256k1::hash::arm_sha2_available()) {
+        secp256k1::hash::armsha::sha256_compress(block, state);
+        return;
+    }
+#endif
 #ifdef SECP256K1_X86_TARGET
    if (secp256k1::hash::sha_ni_available()) {
        secp256k1::hash::shani::sha256_compress(block, state);
--- a/cuda/README.md
+++ b/cuda/README.md
@ -212,17 +212,25 @@ CPU-computed data transfers directly to GPU via `cudaMemcpy` (little-endian, sam

 | Operation | Time |
 |-----------|------|
-| field_mul (a*b mod p) | 85 ns |
-| field_sqr (a^2 mod p) | 66 ns |
-| field_add (a+b mod p) | 18 ns |
-| field_sub (a-b mod p) | 16 ns |
-| field_inverse | 2,621 ns |
-| **fast scalar_mul (k*G)** | **7.6 us** |
-| fast scalar_mul (k*P) | 77.6 us |
-| CT scalar_mul (k*G) | 545 us |
-| ECDH (full CT) | 545 us |
+| field_mul (a*b mod p) | 68.3 ns |
+| field_sqr (a^2 mod p) | 50 ns |
+| field_add (a+b mod p) | 8 ns |
+| field_inverse | 2 us |
+| **fast scalar_mul (k*G)** | **15.27 us** |
+| fast scalar_mul (k*P) | 130.33 us |
+| ECDSA sign | 22.22 us |
+| Schnorr sign (precomputed) | 16.67 us |
+| ECDSA verify | 150.13 us |

-> Backend: ARM64 inline assembly (MUL/UMULH). ~5x faster than generic C++.
+> Backend: ARM64 inline assembly (MUL/UMULH). Latest rerun kept the ARMv8 SHA2 dispatch win for signing-heavy paths on RK3588.
+
+### Latest RTX 5060 Ti Refresh
+
+- CUDA local rerun via `gpu_bench_unified`: `k*G = 129.5 ns` at TPB 256 on batch 65536.
+- OpenCL retained revalidation: `kG (batch=65536) = 115.1 ns`, `kP (batch=65536) = 263.1 ns`, `kG (kernel) = 98.7 ns`.
+- CUDA TPB 512 was not retained as a default because the same harness produced invalid CT timings while only marginally improving `k*G`.
+
+See `../docs/BENCHMARKS.md` for the current cross-platform benchmark matrix and retained-vs-rejected rerun notes.

 ---

--- a/cuda/src/bench_bip352.cu
+++ b/cuda/src/bench_bip352.cu
@ -50,7 +50,7 @@ using CpuKPlan  = secp256k1::fast::KPlan;
 // ============================================================================
 // Configuration
 // ============================================================================
-static constexpr int BENCH_N       = 10000;
+static constexpr int BENCH_N       = 500000;
 static constexpr int BENCH_WARMUP  = 3;
 static constexpr int BENCH_PASSES  = 11;
 static constexpr int DETAIL_N      = 1000;
--- a/docs/BENCHMARKING.md
+++ b/docs/BENCHMARKING.md
@ -82,27 +82,45 @@ build-bench\cpu\bench_unified.exe
 ### 2. ARM64 Android (Cross-compile via NDK)

 Requires:
- Android NDK (tested with r27, Clang 18.0.1)
+- Android NDK (tested with r27.2.12479018, Clang 18.0.3)
 - Android device/emulator (arm64-v8a)
 - ADB

 ```bash
-# Configure with NDK toolchain
-cmake -S . -B build-android -G Ninja \
+# Configure with the Android CMake entrypoint.
+# Use a clean Android-only build dir to avoid root/android cache mismatches.
+cmake -S android -B build-android-ndk-arm64 -G Ninja \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_HOME/build/cmake/android.toolchain.cmake \
  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_STL=c++_static \
  -DANDROID_PLATFORM=android-28
  
 # Build
-cmake --build build-android --target bench_hornet -j
+cmake --build build-android-ndk-arm64 --target bench_hornet -j

 # Deploy and run
-adb push build-android/android/test/bench_hornet /data/local/tmp/
-adb shell chmod +x /data/local/tmp/bench_hornet
-adb shell /data/local/tmp/bench_hornet
+adb shell 'mkdir -p /data/local/tmp/ufsecp'
+adb push build-android-ndk-arm64/bench_hornet /data/local/tmp/ufsecp/bench_hornet
+adb shell 'chmod 755 /data/local/tmp/ufsecp/bench_hornet && /data/local/tmp/ufsecp/bench_hornet'
 ```

+The measured Android rerun retained the ARMv8 SHA2 dispatch path in `cpu/src/hash_accel.cpp`.
+On RK3588 big cores this measurably sped up the signing-heavy hot paths while leaving verify
+and point arithmetic essentially flat:
+
+| Operation | Baseline | With ARM SHA2 dispatch | Delta |
+|-----------|----------|------------------------|-------|
+| ECDSA Sign | 25.89 us | 22.22 us | 1.17x faster |
+| Schnorr Sign (precomputed) | 17.73 us | 16.67 us | 1.06x faster |
+| Schnorr Sign (raw privkey) | 33.01 us | 31.99 us | 1.03x faster |
+| CT ECDSA Sign | 70.50 us | 67.11 us | 1.05x faster |
+| CT Schnorr Sign | 59.87 us | 59.10 us | 1.01x faster |
+
+Rejected Android ARM64 experiments from the same campaign: forcing `SECP256K1_USE_4X64_POINT_OPS`,
+changing `SECP256K1_GLV_WINDOW_WIDTH` to 4 or 6, and using default PGO as the shipped path.
+Those variants did not beat the retained source-level SHA2 dispatch win on the connected RK3588 device.
+
 ### 3. RISC-V 64 (Cross-compile for Milk-V Mars / SiFive U74)

 Requires:
--- a/docs/BENCHMARKS.md
+++ b/docs/BENCHMARKS.md
@ -11,14 +11,18 @@ Benchmark results for UltrafastSecp256k1 across all supported platforms.
 | **x86-64 (i5-14400F, Clang 19)** | **12.8 ns** | **6.7 us** | **17.6 us** | **21.3 us** | **24.3 us** | **1.09x** |
 | x86-64 (Clang 21, Win) | 17 ns (5x52) | 5 us | 25 us | -- | -- | -- |
 | RISC-V 64 (SiFive U74, Clang 21) | 176 ns | 40.2 us | 150.5 us | **181.8 us** | -- | **1.13x** |
-| ARM64 (RK3588, A76) | 74 ns | 14 us | 131 us | -- | -- | -- |
+| ARM64 (RK3588, A76, Android NDK r27.2) | 68.3 ns | 15.27 us | 130.33 us | **150.13 us** | -- | -- |
 | ESP32-S3 (LX7, 240 MHz) | 7,458 ns | 2,483 us | -- | -- | -- | -- |
 | ESP32 (LX6, 240 MHz) | 6,993 ns | 6,203 us | -- | -- | -- | -- |
 | STM32F103 (CM3, 72 MHz) | 15,331 ns | 37,982 us | -- | -- | -- | -- |
-| CUDA (RTX 5060 Ti) | 0.2 ns | 217.7 ns | 225.8 ns | -- | **263.7 ns** | -- |
-| OpenCL (RTX 5060 Ti) | 0.2 ns | 295.1 ns | -- | -- | -- | -- |
+| CUDA (RTX 5060 Ti) | 0.2 ns | 129.5 ns | 225.8 ns | -- | **263.7 ns** | -- |
+| OpenCL (RTX 5060 Ti) | 0.2 ns | 115.1 ns | 263.1 ns | -- | -- | -- |
 | Metal (Apple M3 Pro) | 1.9 ns | 3.00 us | 2.94 us | -- | -- | -- |

+GPU rows use the latest retained local rerun per backend. For OpenCL, the public
+GPU C ABI still covers 4 of the 6 first-wave operations; the missing two are
+batch ECDSA verify and batch Schnorr verify.
+
 ---

 ## Real-World Flow Coverage
@ -56,6 +60,46 @@ These values are mainly intended as workflow reference points. For publishable
 cross-machine comparisons, use the full pinned benchmark methodology and JSON
 artifacts from `bench_unified`.

+### x86-64 Batch Verify Rerun (2026-03-17)
+
+One retained low-risk x86 CPU improvement keeps the Schnorr batch pubkey cache
+capacity aligned with the full batch size in `cpu/src/batch_verify.cpp` instead of
+clamping reserve capacity to 64 entries. This prevents unnecessary vector reallocations
+when uncached batches grow beyond 64 signatures.
+
+Quick reruns on the local i5-14400F validation machine showed the improvement on the
+uncached Schnorr path while preserving correctness (`ctest -R 'comprehensive|multiscalar'` PASS):
+
+| Operation | Before | After | Delta |
+|-----------|--------|-------|-------|
+| Schnorr batch verify N=128 | 20.27 us/sig | 19.94-20.06 us/sig | up to 1.6% faster |
+| Schnorr batch verify N=192 | 18.56 us/sig | 18.01-18.45 us/sig | up to 3.0% faster |
+
+This change does not materially affect the cached-path benchmark; the measured win is specific to
+the uncached parse-and-resolve flow for larger Schnorr batches.
+
+### Cross-Platform Refresh Status (2026-03-18)
+
+Recent retained reruns and validation passes across the active optimization campaign:
+
+| Platform | Latest validated result | Status |
+|----------|-------------------------|--------|
+| x86-64 / Linux | Schnorr batch verify `N=128`: 19.94-20.06 us/sig, `N=192`: 18.01-18.45 us/sig | Retained low-risk pubkey-cache reserve improvement |
+| Android ARM64 / RK3588 | ECDSA Sign 22.22 us, Schnorr Sign (precomputed) 16.67 us, CT ECDSA Sign 67.11 us | Retained ARMv8 SHA2 dispatch win |
+| OpenCL / RTX 5060 Ti | `kG (batch=65536)` 115.1 ns, `kP (batch=65536)` 263.1 ns, `kG (kernel)` 98.7 ns | Revalidated retained tuning; `opencl_test` and `opencl_audit_runner` passed |
+| CUDA / RTX 5060 Ti | `k*G` 129.5 ns at TPB 256; TPB 512 reached 128.5 ns but CT rows became invalid in the same harness | No safe global retune retained yet |
+| RISC-V / Milk-V Mars | Latest native rerun remains the 2026-03-07 Mars baseline below | Current local environment has toolchain but no runnable board/emulator path |
+
+This page keeps the last trustworthy result per platform. When a rerun only proves that an
+experiment is unstable or not worth shipping, it is recorded here but not promoted as a retained
+default.
+
+OpenCL's current 4/6 C ABI status refers specifically to the generic GPU host ABI in
+`ufsecp_gpu.h`: `generator_mul_batch`, `ecdh_batch`, `hash160_pubkey_batch`, and
+`msm` are implemented on the OpenCL backend, while `ecdsa_verify_batch` and
+`schnorr_verify_batch` currently return `UFSECP_ERR_GPU_UNSUPPORTED` until the
+extended verify kernels are promoted into the backend bridge.
+
 ---

 ## x86-64 Benchmarks
@ -229,6 +273,20 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
 | Range Prove (64-bit) | 3,711,570 ns | 0.27 k/s | Bulletproof, CT path, batch 256 |
 | Range Verify (64-bit) | 764,649 ns | 1.3 k/s | Full IPA verification, batch 256 |

+### CUDA Launch-Width Triage (2026-03-18)
+
+The latest local rerun on the RTX 5060 Ti used `gpu_bench_unified` to check whether a global block-size
+retune should replace the current default. The answer was no: there is not yet a safe retained win.
+
+| TPB | k*G (generator) | CT k*G | CT k*P | Verdict |
+|-----|-----------------|--------|--------|---------|
+| 256 | 129.5 ns | 98.7 ns | 162.8 ns | Stable reference rerun |
+| 512 | 128.5 ns | invalid (`0.0 ns`) | invalid (`0.1 ns`) | Rejected; CT timing became unstable |
+
+The `512`-thread launch showed only a marginal `k*G` gain, while the same harness produced invalid
+constant-time timings. Until the CT timing methodology is tightened, no global CUDA TPB default change
+is retained from this sweep.
+
 **GPU vs CPU ZK Speedup (single-core throughput):**

 | Operation | CPU (i5-14400F) | GPU (RTX 5060 Ti) | GPU/CPU Speedup |
@ -249,6 +307,20 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
 **OpenCL:** 3.0 CUDA, Driver 580.126.09  
 **Build:** Clang 19, Release, -O3, PTX inline assembly  

+### OpenCL GPU C ABI Coverage (2026-03-18)
+
+| C ABI operation | OpenCL status | Notes |
+|-----------------|---------------|-------|
+| `ufsecp_gpu_generator_mul_batch` | Implemented | Uses `batch_scalar_mul_generator` + `batch_jacobian_to_affine` |
+| `ufsecp_gpu_ecdsa_verify_batch` | Missing | Returns `UFSECP_ERR_GPU_UNSUPPORTED` |
+| `ufsecp_gpu_schnorr_verify_batch` | Missing | Returns `UFSECP_ERR_GPU_UNSUPPORTED` |
+| `ufsecp_gpu_ecdh_batch` | Implemented | GPU scalar mul, CPU SHA-256 finalization |
+| `ufsecp_gpu_hash160_pubkey_batch` | Implemented | Public-data batch hashing |
+| `ufsecp_gpu_msm` | Implemented | GPU scalar mul + CPU-side affine reduction |
+
+The missing OpenCL pieces are therefore the two batch verify paths. Core ECC,
+ECDH, Hash160, and MSM are already wired through the backend-neutral C ABI.
+
 ### Kernel-Only Timing (no buffer alloc/copy overhead)

 | Operation | Time/Op | Throughput | Notes |
@ -260,7 +332,8 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
 | Field Inv | 14.3 ns | 69.97 M/s | batch 1M |
 | Point Double | 0.9 ns | 1,139 M/s | batch 256K |
 | Point Add | 1.6 ns | 630.6 M/s | batch 256K |
-| kG (kernel) | 295.1 ns | 3.39 M/s | batch 256K |
+| kG (kernel) | 98.7 ns | 10.13 M/s | batch 65K |
+| kP (kernel) | 238.1 ns | 4.20 M/s | batch 65K |

 ### End-to-End Timing (including buffer transfers)

@ -271,8 +344,10 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
 | Field Inv | 29.0 ns | 34.43 M/s | batch 1M |
 | Point Double | 58.4 ns | 17.11 M/s | batch 1M |
 | Point Add | 111.9 ns | 8.94 M/s | batch 1M |
-| kG (batch=65K) | 307.7 ns | 3.25 M/s | |
-| kG (batch=16K) | 311.6 ns | 3.21 M/s | |
+| kG (batch=65536) | 115.1 ns | 8.69 M/s | retained 2026-03-17 revalidation |
+| kP (batch=65536) | 263.1 ns | 3.80 M/s | retained 2026-03-17 revalidation |
+| kP upload | 6.7 ns | 149.25 M/s | host-to-device transfer slice |
+| kP readback | 12.4 ns | 80.65 M/s | device-to-host transfer slice |

 ### CUDA / OpenCL Configuration

@ -291,7 +366,7 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
 | Field Inv | 10.2 ns | 14.3 ns | **CUDA 1.40x** |
 | Point Double | 0.8 ns | 0.9 ns | CUDA 1.13x |
 | Point Add | 1.6 ns | 1.6 ns | Tie |
-| Scalar Mul (kG) | 217.7 ns | 295.1 ns | **CUDA 1.36x** |
+| Scalar Mul (kG) | 129.5 ns | 98.7 ns | **OpenCL 1.31x** |
 | ECDSA Sign | 204.8 ns | -- | CUDA only |
 | ECDSA Verify | 410.1 ns | -- | CUDA only |
 | Schnorr Sign | 273.4 ns | -- | CUDA only |
@ -301,6 +376,11 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
 | DLEQ Prove | 675.4 ns | -- | CUDA only |
 | DLEQ Verify | 1,912.0 ns | -- | CUDA only |

+`kG` above uses the latest retained local reruns on the same RTX 5060 Ti host:
+CUDA `gpu_bench_unified` at TPB 256 (`129.5 ns`) and OpenCL `opencl_benchmark`
+kernel timing (`98.7 ns`). CUDA still leads on verify and ZK because those paths
+are not yet exposed on OpenCL.
+
 ---

 ## Apple Metal Benchmarks
@ -353,30 +433,49 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.

 **Hardware:** RK3588 (Cortex-A76 @ 2.256 GHz, pinned to big cores)  
 **OS:** Android  
-**Compiler:** NDK r26, Clang 17.0.2  
+**Compiler:** NDK r27.2.12479018, Clang 18.0.3  
 **Assembly:** ARM64 inline (MUL/UMULH)  
 **Field:** 10x26 (optimal for ARM64)

 | Operation | Time | Notes |
 |-----------|------|-------|
-| Field Mul | 74 ns | ARM64 MUL/UMULH, 10x26 |
+| Field Mul | 68.3 ns | ARM64 MUL/UMULH, 10x26 |
 | Field Square | 50 ns | |
 | Field Add | 8 ns | |
 | Field Negate | 18 ns | |
 | Field Inverse | 2 us | Fermat's theorem |
 | Point Add | 992 ns | Jacobian coordinates |
 | Point Double | 548 ns | |
-| Generator Mul (kxG) | 14 us | Precomputed tables |
-| Scalar Mul (kxP) | 131 us | GLV + wNAF |
-| ECDSA Sign | 30 us | RFC 6979 |
-| ECDSA Verify | 153 us | Shamir + GLV |
-| Schnorr Sign (BIP-340) | 38 us | |
-| Schnorr Verify (BIP-340) | 173 us | |
+| Generator Mul (kxG) | 15.27 us | Precomputed tables |
+| Scalar Mul (kxP) | 130.33 us | GLV + wNAF |
+| ECDSA Sign | 22.22 us | ARMv8 SHA2 dispatch retained |
+| ECDSA Verify | 150.13 us | Shamir + GLV |
+| Schnorr Sign (BIP-340) | 16.67 us | Precomputed keypair path |
+| Schnorr Verify (BIP-340) | 153.63 us | Raw pubkey path is similar |
 | Batch Inverse (n=100) | 265 ns/elem | Montgomery's trick |
 | Batch Inverse (n=1000) | 240 ns/elem | |

 ARM64 10x26 representation with MUL/UMULH assembly provides optimal field arithmetic performance.

+### Android ARM64 Optimization Rerun (2026-03-17)
+
+This rerun used the connected RK3588 Android device and `android/test/bench_hornet_android.cpp`
+as the benchmark truth source. The retained code change was enabling the existing ARMv8 SHA-256
+instruction path in `hash_accel.cpp` for `sha256_33`, `sha256_32`, `hash160_33`, and
+`sha256_compress_dispatch`.
+
+| Operation | Baseline | Retained result | Delta |
+|-----------|----------|-----------------|-------|
+| ECDSA Sign | 25.89 us | 22.22 us | 14.2% faster |
+| Schnorr Sign (precomputed) | 17.73 us | 16.67 us | 6.0% faster |
+| Schnorr Sign (raw privkey) | 33.01 us | 31.99 us | 3.1% faster |
+| CT ECDSA Sign | 70.50 us | 67.11 us | 4.8% faster |
+| CT Schnorr Sign | 59.87 us | 59.10 us | 1.3% faster |
+
+No meaningful win was found from forcing `SECP256K1_USE_4X64_POINT_OPS`, from changing
+`SECP256K1_GLV_WINDOW_WIDTH` to 4 or 6, or from keeping PGO as the default Android path.
+Those variants were measured and rejected.
+
 ---

 ## ESP32-S3 Benchmarks (Embedded)
--- a/docs/README.md
+++ b/docs/README.md
@ -10,7 +10,7 @@
 |----------|-------------|
 | [API Reference](API_REFERENCE.md) | Complete CPU + CUDA + WASM function reference |
 | [Building](BUILDING.md) | Build instructions for all 10+ platforms |
-| [Benchmarks](BENCHMARKS.md) | Performance data: CPU, GPU, embedded, mobile |
+| [Benchmarks](BENCHMARKS.md) | Performance data: CPU, GPU, embedded, mobile, including the 2026-03 x86, Android, CUDA, and OpenCL refresh |
 | [ESP32 Setup](ESP32_SETUP.md) | ESP32-S3/PICO-D4 flashing & testing guide |
 | [RISC-V Optimizations](../RISCV_OPTIMIZATIONS.md) | RISC-V assembly & RVV details |
 | [Porting Guide](../PORTING.md) | Add new platforms, architectures, GPU backends |
--- a/docs/wiki/Android-Guide.md
+++ b/docs/wiki/Android-Guide.md
@ -58,17 +58,24 @@ cd libs\UltrafastSecp256k1\android\
 ### Build (Manual CMake)

 ```bash
-cmake -S android -B android/build-android-arm64 \
+cmake -S android -B build-android-ndk-arm64 \
    -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_HOME/build/cmake/android.toolchain.cmake \
    -DANDROID_ABI=arm64-v8a \
-    -DANDROID_PLATFORM=android-24 \
+    -DANDROID_PLATFORM=android-28 \
    -DANDROID_STL=c++_static \
    -DCMAKE_BUILD_TYPE=Release \
    -G Ninja

-cmake --build android/build-android-arm64 -j
+cmake --build build-android-ndk-arm64 --target bench_hornet -j
+
+adb shell 'mkdir -p /data/local/tmp/ufsecp'
+adb push build-android-ndk-arm64/bench_hornet /data/local/tmp/ufsecp/bench_hornet
+adb shell 'chmod 755 /data/local/tmp/ufsecp/bench_hornet && /data/local/tmp/ufsecp/bench_hornet'
 ```

+Use a clean Android-only build directory. Reusing a build directory first configured from the
+repository root can trigger a CMake source/cache mismatch when switching to `android/` as the source tree.
+
 ### Output

 ```
@ -217,6 +224,21 @@ NDK Clang additionally uses:

 \* CT mode uses generic C++ (for constant-time guarantees)

+### Android ARM64 on-device rerun: retained SHA2 dispatch
+
+Measured on the connected RK3588 Android device with `bench_hornet` after wiring the ARMv8 SHA2
+path into `hash_accel.cpp` hot wrappers:
+
+| Operation | Baseline | Retained result | Delta |
+|-----------|----------|-----------------|-------|
+| ECDSA sign | 25.89 us | 22.22 us | 14.2% faster |
+| Schnorr sign (precomputed) | 17.73 us | 16.67 us | 6.0% faster |
+| Schnorr sign (raw privkey) | 33.01 us | 31.99 us | 3.1% faster |
+| CT ECDSA sign | 70.50 us | 67.11 us | 4.8% faster |
+
+The same rerun rejected forced 4x64 point ops, GLV window retuning, and keeping Android PGO as the
+default path because they did not outperform the retained SHA2 dispatch result on this device.
+
 ### ARMv7 (32-bit) Limitations

 - No `__int128` -> `SECP256K1_NO_INT128` fallback (portable 64x64->128)
--- a/fix_alerts.py
+++ b/fix_alerts.py
@ -0,0 +1,649 @@
+#!/usr/bin/env python3
+"""Apply all readability-braces and misc-const-correctness fixes to ufsecp_impl.cpp"""
+
+import sys
+
+PATH = "include/ufsecp/ufsecp_impl.cpp"
+
+# Each entry: (old_string, new_string)
+REPLACEMENTS = [
+    # L1380: if (!ok)
+    (
+        "    if (!ok)\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid mnemonic\");",
+        "    if (!ok) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid mnemonic\");\n"
+        "    }",
+    ),
+    # L1382: if (*entropy_len < ent.length)
+    (
+        "    if (*entropy_len < ent.length)\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"entropy buffer too small\");",
+        "    if (*entropy_len < ent.length) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"entropy buffer too small\");\n"
+        "    }",
+    ),
+    # L1404: if (!FE::parse_bytes_strict(e, pk_fe)) in ufsecp_schnorr_batch_verify
+    (
+        "    for (size_t i = 0; i < n; ++i) {\n"
+        "        const uint8_t* e = entries + i * 128;\n"
+        "        // Strict: reject x-only pubkey >= p at ABI gate\n"
+        "        FE pk_fe;\n"
+        "        if (!FE::parse_bytes_strict(e, pk_fe))\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
+        "        std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
+        "        std::memcpy(batch[i].message.data(), e + 32, 32);\n"
+        "        if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature))\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");",
+        "    for (size_t i = 0; i < n; ++i) {\n"
+        "        const uint8_t* e = entries + i * 128;\n"
+        "        // Strict: reject x-only pubkey >= p at ABI gate\n"
+        "        FE pk_fe;\n"
+        "        if (!FE::parse_bytes_strict(e, pk_fe)) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
+        "        }\n"
+        "        std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
+        "        std::memcpy(batch[i].message.data(), e + 32, 32);\n"
+        "        if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature)) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");\n"
+        "        }",
+    ),
+    # L1448: if (!FE::parse_bytes_strict(e, pk_fe)) in ufsecp_schnorr_batch_identify_invalid
+    (
+        "    for (size_t i = 0; i < n; ++i) {\n"
+        "        const uint8_t* e = entries + i * 128;\n"
+        "        FE pk_fe;\n"
+        "        if (!FE::parse_bytes_strict(e, pk_fe))\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
+        "        std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
+        "        std::memcpy(batch[i].message.data(), e + 32, 32);\n"
+        "        if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature))\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");",
+        "    for (size_t i = 0; i < n; ++i) {\n"
+        "        const uint8_t* e = entries + i * 128;\n"
+        "        FE pk_fe;\n"
+        "        if (!FE::parse_bytes_strict(e, pk_fe)) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
+        "        }\n"
+        "        std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
+        "        std::memcpy(batch[i].message.data(), e + 32, 32);\n"
+        "        if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature)) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");\n"
+        "        }",
+    ),
+    # L1661: widening + braces — in ufsecp_musig2_start_sign_session
+    (
+        "    for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
+        "        Scalar s;\n"
+        "        if (!scalar_parse_strict(keyagg + 38 + i * 32, s))\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient in keyagg\");\n"
+        "        kagg.key_coefficients.push_back(s);\n"
+        "    }",
+        "    for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
+        "        Scalar s;\n"
+        "        if (!scalar_parse_strict(keyagg + 38 + static_cast<size_t>(i) * 32, s)) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient in keyagg\");\n"
+        "        }\n"
+        "        kagg.key_coefficients.push_back(s);\n"
+        "    }",
+    ),
+    # L1707: widening + braces — inside { } block in ufsecp_musig2_partial_sign
+    (
+        "        auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
+        "        for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
+        "            Scalar s; if (!scalar_parse_strict(keyagg + 38 + i * 32, s))\n"
+        "                return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
+        "            kagg.key_coefficients.push_back(s); } }\n"
+        "      secp256k1::MuSig2Session sess;",
+        "        auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
+        "        for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
+        "            Scalar s;\n"
+        "            if (!scalar_parse_strict(keyagg + 38 + static_cast<size_t>(i) * 32, s)) {\n"
+        "                return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
+        "            }\n"
+        "            kagg.key_coefficients.push_back(s);\n"
+        "        }\n"
+        "    }\n"
+        "      secp256k1::MuSig2Session sess;",
+    ),
+    # L1715: if (!scalar_parse_strict(session + 33, sess.b)) — in ufsecp_musig2_partial_sign
+    (
+        "      if (!scalar_parse_strict(session + 33, sess.b))\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
+        "      if (!scalar_parse_strict(session + 65, sess.e))\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
+        "      sess.R_negated = (session[97] != 0);\n"
+        "      auto psig = secp256k1::musig2_partial_sign",
+        "      if (!scalar_parse_strict(session + 33, sess.b)) {\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
+        "      }\n"
+        "      if (!scalar_parse_strict(session + 65, sess.e)) {\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
+        "      }\n"
+        "      sess.R_negated = (session[97] != 0);\n"
+        "      auto psig = secp256k1::musig2_partial_sign",
+    ),
+    # L1756: widening + braces — inside { } block in ufsecp_musig2_partial_verify
+    (
+        "        auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
+        "        for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
+        "            Scalar s; if (!scalar_parse_strict(keyagg + 38 + i * 32, s))\n"
+        "                return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
+        "            kagg.key_coefficients.push_back(s); } }\n"
+        "      secp256k1::MuSig2Session sess;\n"
+        "      sess.R = point_from_compressed(session);\n"
+        "      if (sess.R.is_infinity()) {\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session R point\");\n"
+        "      }\n"
+        "      if (!scalar_parse_strict(session + 33, sess.b))\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
+        "      if (!scalar_parse_strict(session + 65, sess.e))\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
+        "      sess.R_negated = (session[97] != 0);\n"
+        "      if (!secp256k1::musig2_partial_verify",
+        "        auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
+        "        for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
+        "            Scalar s;\n"
+        "            if (!scalar_parse_strict(keyagg + 38 + static_cast<size_t>(i) * 32, s)) {\n"
+        "                return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
+        "            }\n"
+        "            kagg.key_coefficients.push_back(s);\n"
+        "        }\n"
+        "    }\n"
+        "      secp256k1::MuSig2Session sess;\n"
+        "      sess.R = point_from_compressed(session);\n"
+        "      if (sess.R.is_infinity()) {\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session R point\");\n"
+        "      }\n"
+        "      if (!scalar_parse_strict(session + 33, sess.b)) {\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
+        "      }\n"
+        "      if (!scalar_parse_strict(session + 65, sess.e)) {\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
+        "      }\n"
+        "      sess.R_negated = (session[97] != 0);\n"
+        "      if (!secp256k1::musig2_partial_verify",
+    ),
+    # L1791+L1793: in ufsecp_musig2_partial_sig_agg
+    (
+        "      if (!scalar_parse_strict(session + 33, sess.b))\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
+        "      if (!scalar_parse_strict(session + 65, sess.e))\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
+        "      sess.R_negated = (session[97] != 0);\n"
+        "      auto final_sig",
+        "      if (!scalar_parse_strict(session + 33, sess.b)) {\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
+        "      }\n"
+        "      if (!scalar_parse_strict(session + 65, sess.e)) {\n"
+        "          return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
+        "      }\n"
+        "      sess.R_negated = (session[97] != 0);\n"
+        "      auto final_sig",
+    ),
+    # L1822+L1823: const for coeff_count and needed_commits
+    (
+        "    size_t coeff_count = commit.coeffs.size();\n"
+        "    size_t needed_commits = 8 + coeff_count * 33;",
+        "    const size_t coeff_count = commit.coeffs.size();\n"
+        "    const size_t needed_commits = 8 + coeff_count * 33;",
+    ),
+    # L1845: for (auto& s : shares) — erase in ufsecp_frost_keygen_begin
+    (
+        "    // Erase secret shares from memory\n"
+        "    for (auto& s : shares)\n"
+        "        secp256k1::detail::secure_erase(&s.value, sizeof(s.value));\n"
+        "    return UFSECP_OK;\n"
+        "}\n"
+        "\n"
+        "ufsecp_error_t ufsecp_frost_keygen_finalize(",
+        "    // Erase secret shares from memory\n"
+        "    for (auto& s : shares) {\n"
+        "        secp256k1::detail::secure_erase(&s.value, sizeof(s.value));\n"
+        "    }\n"
+        "    return UFSECP_OK;\n"
+        "}\n"
+        "\n"
+        "ufsecp_error_t ufsecp_frost_keygen_finalize(",
+    ),
+    # L1864: uint32_t cc; — init-variables
+    (
+        "        secp256k1::FrostCommitment fc;\n"
+        "        uint32_t cc;\n"
+        "        if (pos + 8 > commits_len)\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit header\");\n"
+        "        std::memcpy(&cc, all_commits + pos, 4); pos += 4;\n"
+        "        std::memcpy(&fc.from, all_commits + pos, 4); pos += 4;\n"
+        "        if (pos + static_cast<size_t>(cc) * 33 > commits_len)\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit coefficients\");\n"
+        "        for (uint32_t j = 0; j < cc; ++j) {\n"
+        "            auto pt = point_from_compressed(all_commits + pos);",
+        "        secp256k1::FrostCommitment fc;\n"
+        "        uint32_t cc = 0;\n"
+        "        if (pos + 8 > commits_len) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit header\");\n"
+        "        }\n"
+        "        std::memcpy(&cc, all_commits + pos, 4); pos += 4;\n"
+        "        std::memcpy(&fc.from, all_commits + pos, 4); pos += 4;\n"
+        "        if (pos + static_cast<size_t>(cc) * 33 > commits_len) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit coefficients\");\n"
+        "        }\n"
+        "        for (uint32_t j = 0; j < cc; ++j) {\n"
+        "            auto pt = point_from_compressed(all_commits + pos);",
+    ),
+    # L1889: if (!scalar_parse_strict(s + 4, v)) in ufsecp_frost_keygen_finalize
+    (
+        "        if (!scalar_parse_strict(s + 4, v))\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid share scalar\");",
+        "        if (!scalar_parse_strict(s + 4, v)) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid share scalar\");\n"
+        "        }",
+    ),
+    # L1895+L1898: if (!ok) + for (auto& s : shares) — erase in ufsecp_frost_keygen_finalize
+    (
+        "    if (!ok)\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"FROST keygen finalize failed\");\n"
+        "    // Erase secret shares\n"
+        "    for (auto& s : shares)\n"
+        "        secp256k1::detail::secure_erase(&s.value, sizeof(s.value));",
+        "    if (!ok) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"FROST keygen finalize failed\");\n"
+        "    }\n"
+        "    // Erase secret shares\n"
+        "    for (auto& s : shares) {\n"
+        "        secp256k1::detail::secure_erase(&s.value, sizeof(s.value));\n"
+        "    }",
+    ),
+    # L1955: if (!scalar_parse_strict(keypkg + 12, kp.signing_share))
+    (
+        "    if (!scalar_parse_strict(keypkg + 12, kp.signing_share))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"invalid signing share in keypkg\");",
+        "    if (!scalar_parse_strict(keypkg + 12, kp.signing_share)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"invalid signing share in keypkg\");\n"
+        "    }",
+    ),
+    # L1967+L1969: if (!scalar_parse_strict(nonce, h)) + if (!scalar_parse_strict(nonce + 32, b))
+    (
+        "    if (!scalar_parse_strict(nonce, h))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid hiding nonce\");\n"
+        "    if (!scalar_parse_strict(nonce + 32, b))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid binding nonce\");",
+        "    if (!scalar_parse_strict(nonce, h)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid hiding nonce\");\n"
+        "    }\n"
+        "    if (!scalar_parse_strict(nonce + 32, b)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid binding nonce\");\n"
+        "    }",
+    ),
+    # L2006+L2012: multi-line if null check + scalar parse in ufsecp_frost_verify_partial
+    (
+        "    if (!ctx || !partial_sig || !verification_share33 || !nonce_commits || !msg32 || !group_pubkey33)\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    ctx_clear_err(ctx);\n"
+        "    secp256k1::FrostPartialSig psig;\n"
+        "    std::memcpy(&psig.id, partial_sig, 4);\n"
+        "    Scalar z;\n"
+        "    if (!scalar_parse_strict(partial_sig + 4, z))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");",
+        "    if (!ctx || !partial_sig || !verification_share33 || !nonce_commits || !msg32 || !group_pubkey33) {\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    }\n"
+        "    ctx_clear_err(ctx);\n"
+        "    secp256k1::FrostPartialSig psig;\n"
+        "    std::memcpy(&psig.id, partial_sig, 4);\n"
+        "    Scalar z;\n"
+        "    if (!scalar_parse_strict(partial_sig + 4, z)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");\n"
+        "    }",
+    ),
+    # L2057+L2065: multi-line if null check + scalar parse in ufsecp_frost_aggregate
+    (
+        "    if (!ctx || !partial_sigs || !nonce_commits || !group_pubkey33 || !msg32 || !sig64_out)\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    ctx_clear_err(ctx);\n"
+        "    std::vector<secp256k1::FrostPartialSig> psigs(n);\n"
+        "    for (size_t i = 0; i < n; ++i) {\n"
+        "        const uint8_t* ps = partial_sigs + i * 36;\n"
+        "        std::memcpy(&psigs[i].id, ps, 4);\n"
+        "        Scalar z;\n"
+        "        if (!scalar_parse_strict(ps + 4, z))\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");",
+        "    if (!ctx || !partial_sigs || !nonce_commits || !group_pubkey33 || !msg32 || !sig64_out) {\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    }\n"
+        "    ctx_clear_err(ctx);\n"
+        "    std::vector<secp256k1::FrostPartialSig> psigs(n);\n"
+        "    for (size_t i = 0; i < n; ++i) {\n"
+        "        const uint8_t* ps = partial_sigs + i * 36;\n"
+        "        std::memcpy(&psigs[i].id, ps, 4);\n"
+        "        Scalar z;\n"
+        "        if (!scalar_parse_strict(ps + 4, z)) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");\n"
+        "        }",
+    ),
+    # L2144+L2150: in ufsecp_schnorr_adaptor_verify
+    (
+        "    if (!scalar_parse_strict(pre_sig + 33, shat))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
+        "    as.s_hat = shat;\n"
+        "    as.needs_negation = (pre_sig[65] != 0);\n"
+        "    // Strict: reject x-only pubkey >= p at ABI gate\n"
+        "    FE pk_fe;\n"
+        "    if (!FE::parse_bytes_strict(pubkey_x, pk_fe))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p)\");",
+        "    if (!scalar_parse_strict(pre_sig + 33, shat)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
+        "    }\n"
+        "    as.s_hat = shat;\n"
+        "    as.needs_negation = (pre_sig[65] != 0);\n"
+        "    // Strict: reject x-only pubkey >= p at ABI gate\n"
+        "    FE pk_fe;\n"
+        "    if (!FE::parse_bytes_strict(pubkey_x, pk_fe)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p)\");\n"
+        "    }",
+    ),
+    # L2176: in ufsecp_schnorr_adaptor_adapt
+    (
+        "    if (!scalar_parse_strict(pre_sig + 33, shat))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
+        "    as.s_hat = shat;\n"
+        "    as.needs_negation = (pre_sig[65] != 0);\n"
+        "    Scalar secret;\n"
+        "    if (!scalar_parse_strict_nonzero(adaptor_secret, secret))",
+        "    if (!scalar_parse_strict(pre_sig + 33, shat)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
+        "    }\n"
+        "    as.s_hat = shat;\n"
+        "    as.needs_negation = (pre_sig[65] != 0);\n"
+        "    Scalar secret;\n"
+        "    if (!scalar_parse_strict_nonzero(adaptor_secret, secret))",
+    ),
+    # L2203: in ufsecp_schnorr_adaptor_extract
+    (
+        "    if (!scalar_parse_strict(pre_sig + 33, shat))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
+        "    as.s_hat = shat;\n"
+        "    as.needs_negation = (pre_sig[65] != 0);\n"
+        "    secp256k1::SchnorrSignature sig;",
+        "    if (!scalar_parse_strict(pre_sig + 33, shat)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
+        "    }\n"
+        "    as.s_hat = shat;\n"
+        "    as.needs_negation = (pre_sig[65] != 0);\n"
+        "    secp256k1::SchnorrSignature sig;",
+    ),
+    # L2810+L2815+L2817: in ufsecp_silent_payment_address_create
+    (
+        "    if (!ctx || !scan_privkey || !spend_privkey || !scan_pubkey33_out ||\n"
+        "        !spend_pubkey33_out || !addr_out || !addr_len)\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    ctx_clear_err(ctx);\n"
+        "\n"
+        "    Scalar scan_sk, spend_sk;\n"
+        "    if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
+        "    if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");",
+        "    if (!ctx || !scan_privkey || !spend_privkey || !scan_pubkey33_out ||\n"
+        "        !spend_pubkey33_out || !addr_out || !addr_len) {\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    }\n"
+        "    ctx_clear_err(ctx);\n"
+        "\n"
+        "    Scalar scan_sk, spend_sk;\n"
+        "    if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
+        "    }\n"
+        "    if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");\n"
+        "    }",
+    ),
+    # L2827: if (addr_str.size() >= *addr_len)
+    (
+        "    if (addr_str.size() >= *addr_len)\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"address buffer too small\");",
+        "    if (addr_str.size() >= *addr_len) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"address buffer too small\");\n"
+        "    }",
+    ),
+    # L2846+L2855+L2864+L2868: in ufsecp_silent_payment_create_output
+    (
+        "    if (!ctx || !input_privkeys || n_inputs == 0 || !scan_pubkey33 ||\n"
+        "        !spend_pubkey33 || !output_pubkey33_out)\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    ctx_clear_err(ctx);\n"
+        "\n"
+        "    // Parse input private keys\n"
+        "    std::vector<Scalar> privkeys;\n"
+        "    privkeys.reserve(n_inputs);\n"
+        "    for (size_t i = 0; i < n_inputs; ++i) {\n"
+        "        Scalar sk;\n"
+        "        if (!scalar_parse_strict_nonzero(input_privkeys + i * 32, sk))\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"input privkey is zero or >= n\");\n"
+        "        privkeys.push_back(sk);\n"
+        "    }\n"
+        "\n"
+        "    // Parse recipient address\n"
+        "    secp256k1::SilentPaymentAddress recipient;\n"
+        "    recipient.scan_pubkey = point_from_compressed(scan_pubkey33);\n"
+        "    recipient.spend_pubkey = point_from_compressed(spend_pubkey33);\n"
+        "    if (recipient.scan_pubkey.is_infinity() || recipient.spend_pubkey.is_infinity())\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
+        "\n"
+        "    auto [output_point, tweak] = secp256k1::silent_payment_create_output(privkeys, recipient, k);\n"
+        "    if (output_point.is_infinity())\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_ARITH, \"output point is infinity\");",
+        "    if (!ctx || !input_privkeys || n_inputs == 0 || !scan_pubkey33 ||\n"
+        "        !spend_pubkey33 || !output_pubkey33_out) {\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    }\n"
+        "    ctx_clear_err(ctx);\n"
+        "\n"
+        "    // Parse input private keys\n"
+        "    std::vector<Scalar> privkeys;\n"
+        "    privkeys.reserve(n_inputs);\n"
+        "    for (size_t i = 0; i < n_inputs; ++i) {\n"
+        "        Scalar sk;\n"
+        "        if (!scalar_parse_strict_nonzero(input_privkeys + i * 32, sk)) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"input privkey is zero or >= n\");\n"
+        "        }\n"
+        "        privkeys.push_back(sk);\n"
+        "    }\n"
+        "\n"
+        "    // Parse recipient address\n"
+        "    secp256k1::SilentPaymentAddress recipient;\n"
+        "    recipient.scan_pubkey = point_from_compressed(scan_pubkey33);\n"
+        "    recipient.spend_pubkey = point_from_compressed(spend_pubkey33);\n"
+        "    if (recipient.scan_pubkey.is_infinity() || recipient.spend_pubkey.is_infinity()) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
+        "    }\n"
+        "\n"
+        "    auto [output_point, tweak] = secp256k1::silent_payment_create_output(privkeys, recipient, k);\n"
+        "    if (output_point.is_infinity()) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_ARITH, \"output point is infinity\");\n"
+        "    }",
+    ),
+    # L2879: for (auto& sk : privkeys) — erase in ufsecp_silent_payment_create_output
+    (
+        "    for (auto& sk : privkeys)\n"
+        "        secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
+        "    return UFSECP_OK;\n"
+        "}\n"
+        "\n"
+        "ufsecp_error_t ufsecp_silent_payment_scan(",
+        "    for (auto& sk : privkeys) {\n"
+        "        secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
+        "    }\n"
+        "    return UFSECP_OK;\n"
+        "}\n"
+        "\n"
+        "ufsecp_error_t ufsecp_silent_payment_scan(",
+    ),
+    # L2894+L2896+L2901+L2903+L2911: in ufsecp_silent_payment_scan
+    (
+        "    if (!ctx || !scan_privkey || !spend_privkey || !input_pubkeys33 ||\n"
+        "        !output_xonly32 || !n_found)\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    if (n_input_pubkeys == 0 || n_outputs == 0)\n"
+        "        return UFSECP_ERR_BAD_INPUT;\n"
+        "    ctx_clear_err(ctx);\n"
+        "\n"
+        "    Scalar scan_sk, spend_sk;\n"
+        "    if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
+        "    if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");\n"
+        "\n"
+        "    // Parse input pubkeys\n"
+        "    std::vector<Point> input_pks;\n"
+        "    input_pks.reserve(n_input_pubkeys);\n"
+        "    for (size_t i = 0; i < n_input_pubkeys; ++i) {\n"
+        "        auto pk = point_from_compressed(input_pubkeys33 + i * 33);\n"
+        "        if (pk.is_infinity())\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid input pubkey\");",
+        "    if (!ctx || !scan_privkey || !spend_privkey || !input_pubkeys33 ||\n"
+        "        !output_xonly32 || !n_found) {\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    }\n"
+        "    if (n_input_pubkeys == 0 || n_outputs == 0) {\n"
+        "        return UFSECP_ERR_BAD_INPUT;\n"
+        "    }\n"
+        "    ctx_clear_err(ctx);\n"
+        "\n"
+        "    Scalar scan_sk, spend_sk;\n"
+        "    if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
+        "    }\n"
+        "    if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");\n"
+        "    }\n"
+        "\n"
+        "    // Parse input pubkeys\n"
+        "    std::vector<Point> input_pks;\n"
+        "    input_pks.reserve(n_input_pubkeys);\n"
+        "    for (size_t i = 0; i < n_input_pubkeys; ++i) {\n"
+        "        auto pk = point_from_compressed(input_pubkeys33 + i * 33);\n"
+        "        if (pk.is_infinity()) {\n"
+        "            return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid input pubkey\");\n"
+        "        }",
+    ),
+    # L2953+L2964+L2968+L2972: in ufsecp_ecies_encrypt
+    (
+        "    if (!ctx || !recipient_pubkey33 || !plaintext || !envelope_out || !envelope_len)\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    if (plaintext_len == 0) {\n"
+        "        return UFSECP_ERR_BAD_INPUT;\n"
+        "    }\n"
+        "    ctx_clear_err(ctx);\n"
+        "\n"
+        "    if (plaintext_len > SIZE_MAX - UFSECP_ECIES_OVERHEAD) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"plaintext_len too large\");\n"
+        "    }\n"
+        "    size_t const needed = plaintext_len + UFSECP_ECIES_OVERHEAD;\n"
+        "    if (*envelope_len < needed)\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"envelope buffer too small\");\n"
+        "\n"
+        "    auto pk = point_from_compressed(recipient_pubkey33);\n"
+        "    if (pk.is_infinity())\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
+        "\n"
+        "    auto envelope = secp256k1::ecies_encrypt(pk, plaintext, plaintext_len);\n"
+        "    if (envelope.empty())\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"ECIES encryption failed\");",
+        "    if (!ctx || !recipient_pubkey33 || !plaintext || !envelope_out || !envelope_len) {\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    }\n"
+        "    if (plaintext_len == 0) {\n"
+        "        return UFSECP_ERR_BAD_INPUT;\n"
+        "    }\n"
+        "    ctx_clear_err(ctx);\n"
+        "\n"
+        "    if (plaintext_len > SIZE_MAX - UFSECP_ECIES_OVERHEAD) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"plaintext_len too large\");\n"
+        "    }\n"
+        "    size_t const needed = plaintext_len + UFSECP_ECIES_OVERHEAD;\n"
+        "    if (*envelope_len < needed) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"envelope buffer too small\");\n"
+        "    }\n"
+        "\n"
+        "    auto pk = point_from_compressed(recipient_pubkey33);\n"
+        "    if (pk.is_infinity()) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
+        "    }\n"
+        "\n"
+        "    auto envelope = secp256k1::ecies_encrypt(pk, plaintext, plaintext_len);\n"
+        "    if (envelope.empty()) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"ECIES encryption failed\");\n"
+        "    }",
+    ),
+    # L2985+L2987+L2992+L2996+L3002: in ufsecp_ecies_decrypt
+    (
+        "    if (!ctx || !privkey || !envelope || !plaintext_out || !plaintext_len)\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    if (envelope_len < 82) // min: 33 + 16 + 1 + 32\n"
+        "        return UFSECP_ERR_BAD_INPUT;\n"
+        "    ctx_clear_err(ctx);\n"
+        "\n"
+        "    size_t const expected_pt_len = envelope_len - UFSECP_ECIES_OVERHEAD;\n"
+        "    if (*plaintext_len < expected_pt_len)\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"plaintext buffer too small\");\n"
+        "\n"
+        "    Scalar sk;\n"
+        "    if (!scalar_parse_strict_nonzero(privkey, sk))\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"privkey is zero or >= n\");\n"
+        "\n"
+        "    auto pt = secp256k1::ecies_decrypt(sk, envelope, envelope_len);\n"
+        "    secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
+        "\n"
+        "    if (pt.empty())\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_VERIFY_FAIL, \"ECIES decryption failed (bad key or tampered)\");",
+        "    if (!ctx || !privkey || !envelope || !plaintext_out || !plaintext_len) {\n"
+        "        return UFSECP_ERR_NULL_ARG;\n"
+        "    }\n"
+        "    if (envelope_len < 82) { // min: 33 + 16 + 1 + 32\n"
+        "        return UFSECP_ERR_BAD_INPUT;\n"
+        "    }\n"
+        "    ctx_clear_err(ctx);\n"
+        "\n"
+        "    size_t const expected_pt_len = envelope_len - UFSECP_ECIES_OVERHEAD;\n"
+        "    if (*plaintext_len < expected_pt_len) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"plaintext buffer too small\");\n"
+        "    }\n"
+        "\n"
+        "    Scalar sk;\n"
+        "    if (!scalar_parse_strict_nonzero(privkey, sk)) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"privkey is zero or >= n\");\n"
+        "    }\n"
+        "\n"
+        "    auto pt = secp256k1::ecies_decrypt(sk, envelope, envelope_len);\n"
+        "    secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
+        "\n"
+        "    if (pt.empty()) {\n"
+        "        return ctx_set_err(ctx, UFSECP_ERR_VERIFY_FAIL, \"ECIES decryption failed (bad key or tampered)\");\n"
+        "    }",
+    ),
+]
+
+
+def main():
+    """Apply every (old, new) pair in REPLACEMENTS to the file at PATH, in order.
+
+    Each pair replaces only the first occurrence of ``old``. If an ``old``
+    string is absent the script reports it and exits with status 1 before
+    anything is written back; more than one occurrence only produces a warning.
+    """
+    with open(PATH, "r") as src:
+        text = src.read()
+
+    for position, pair in enumerate(REPLACEMENTS):
+        before, after = pair
+        label = position + 1  # replacements are reported 1-based
+        hits = text.count(before)
+        if hits == 0:
+            print(f"[FAIL] Replacement {label}: NOT FOUND")
+            print(f"  Looking for: {before[:80]!r}")
+            sys.exit(1)
+        if hits > 1:
+            print(f"[WARN] Replacement {label}: found {hits} occurrences, replacing first")
+        text = text.replace(before, after, 1)
+        print(f"[OK] Replacement {label} applied")
+
+    # Only reached when every replacement matched, so the file is never half-patched.
+    with open(PATH, "w") as dst:
+        dst.write(text)
+    print(f"\nAll {len(REPLACEMENTS)} replacements applied to {PATH}")
+
+
+# Run the replacements only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/fix_round4.py
+++ b/fix_round4.py
@ -0,0 +1,893 @@
+#!/usr/bin/env python3
+"""Fix all 211 code-scanning alerts across 13 files."""
+
+import re
+from pathlib import Path
+
+# Root directory that read() and save() resolve their relative paths against.
+# NOTE(review): this is an absolute path into one user's home directory, so the
+# script only works on a machine with the same checkout location.
+BASE = Path('/home/shrek/Secp256K1/Secp256K1fast/libs/UltrafastSecp256k1')
+
+
+def read(path):
+    """Return the text of ``BASE / path`` as a list of lines, terminators kept."""
+    source = BASE / path
+    text = source.read_text()
+    return text.splitlines(keepends=True)
+
+
+def save(path, lines):
+    """Concatenate ``lines`` and write the result to ``BASE / path``."""
+    target = BASE / path
+    payload = ''.join(lines)
+    target.write_text(payload)
+
+
+# ============================================================================
+# Algorithmic helpers
+# ============================================================================
+
+def _split_trailing_comment(text):
+    """Split one C++ line into ``(code, comment)`` at the first ``//`` outside a literal.
+
+    ``text`` must not contain its line terminator. ``code`` is returned without
+    trailing whitespace; ``comment`` starts at the ``//`` and is '' when the
+    line has no line comment. Only ``"..."`` and ``'...'`` literals with
+    backslash escapes are recognised; block comments are left in ``code``.
+    """
+    quote = None
+    i = 0
+    while i < len(text):
+        ch = text[i]
+        if quote is not None:
+            if ch == '\\':
+                i += 2  # skip the escaped character, e.g. \" inside a string
+                continue
+            if ch == quote:
+                quote = None
+        elif ch in '"\'':
+            quote = ch
+        elif text.startswith('//', i):
+            return text[:i].rstrip(), text[i:]
+        i += 1
+    return text, ''
+
+
+def _body_end(lines, body_idx):
+    """Return the index of the last line of the statement that starts at ``body_idx``.
+
+    A first line ending in ';' is a complete one-line statement. Otherwise the
+    scan continues until a line ends in ';' or '}' with no brace left open, so
+    wrapped calls and already-braced nested statements are kept whole.
+    Returns None when no such line exists. Braces are counted textually
+    (comments removed, string contents not), which is a heuristic.
+    """
+    depth = 0
+    for end_idx in range(body_idx, len(lines)):
+        code, _ = _split_trailing_comment(lines[end_idx].rstrip())
+        if end_idx == body_idx and code.endswith(';'):
+            return end_idx
+        depth += code.count('{') - code.count('}')
+        if depth <= 0 and code.endswith((';', '}')):
+            return end_idx
+    return None
+
+
+def add_braces(lines, alert_lines_1based, tag=''):
+    """Add { } around single-statement bodies. Process bottom-to-top.
+
+    lines              -- list of source lines (with terminators); edited in place.
+    alert_lines_1based -- 1-based numbers of the controlling-statement lines.
+    tag                -- prefix used in the progress messages.
+
+    Returns the same ``lines`` list. Alerts are handled from the highest line
+    number down so that inserting a closing brace never shifts a line that is
+    still to be processed. A line is skipped (with a message) when it is out of
+    range, already ends in '{', carries its body on the same line, or when the
+    body already starts with '{' or its end cannot be found.
+    """
+    fixed = 0
+    for lnum in sorted(alert_lines_1based, reverse=True):
+        idx = lnum - 1
+        # idx < 0 would silently index from the end of the list.
+        if idx < 0 or idx >= len(lines):
+            print(f'  SKIP {tag}L{lnum}: out of range ({len(lines)} lines)')
+            continue
+        line = lines[idx]
+        # Indentation of the controlling statement, reused for the closing brace
+        indent_str = line[:len(line) - len(line.lstrip())]
+        code, comment = _split_trailing_comment(line.rstrip())
+
+        # Skip if already has brace at end (a trailing comment does not hide it)
+        if code.endswith('{'):
+            print(f'  SKIP {tag}L{lnum}: already has {{')
+            continue
+
+        # Nothing to brace on an empty/comment-only line; a line ending in ';'
+        # has its body on the same line and appending '{' would corrupt it.
+        if not code.strip() or code.endswith(';'):
+            print(f'  SKIP {tag}L{lnum}: not a bare control statement')
+            continue
+
+        # Find next non-empty line (the body)
+        body_idx = idx + 1
+        while body_idx < len(lines) and lines[body_idx].strip() == '':
+            body_idx += 1
+
+        if body_idx >= len(lines):
+            print(f'  SKIP {tag}L{lnum}: no body line found')
+            continue
+
+        # Skip if body already starts with {
+        if lines[body_idx].lstrip().startswith('{'):
+            print(f'  SKIP {tag}L{lnum}: body already has {{')
+            continue
+
+        end_idx = _body_end(lines, body_idx)
+        if end_idx is None:
+            print(f'  SKIP {tag}L{lnum}: end of body not found')
+            continue
+
+        # Apply fix: the brace goes before any trailing // comment, otherwise
+        # the comment would swallow it.
+        lines[idx] = code + ' {' + (' ' + comment if comment else '') + '\n'
+        if not lines[end_idx].endswith('\n'):
+            lines[end_idx] += '\n'  # last line of a file without final newline
+        lines.insert(end_idx + 1, indent_str + '}\n')
+        fixed += 1
+
+    print(f'  -> {tag}braces fixed: {fixed}')
+    return lines
+
+
+def add_const_to_lines(lines, alert_lines_1based, tag=''):
+    """Prepend const to variable declarations, handling range-for loops.
+
+    lines              -- list of source lines (with terminators); edited in place.
+    alert_lines_1based -- 1-based numbers of the declaration lines.
+    tag                -- prefix used in the progress messages.
+
+    Returns the same ``lines`` list. A plain declaration gets ``const `` in
+    front of its first token. For ``for (TYPE var : range)`` the ``const`` is
+    placed after ``for (``. Lines that are blank, already const, or a ``for``
+    that is not range-based are skipped with a message. The number of lines
+    never changes and every line keeps its terminator.
+    """
+    fixed = 0
+    for lnum in sorted(alert_lines_1based, reverse=True):
+        idx = lnum - 1
+        # idx < 0 would silently index from the end of the list.
+        if idx < 0 or idx >= len(lines):
+            continue
+        line = lines[idx]
+        stripped = line.lstrip()
+        leading = line[:len(line) - len(stripped)]
+
+        if not stripped:
+            print(f'  SKIP {tag}L{lnum}: blank line')
+            continue
+
+        if stripped.startswith('const '):
+            print(f'  SKIP {tag}L{lnum}: already const')
+            continue
+
+        if stripped.startswith('for ('):
+            header = stripped[len('for ('):]
+            # The range-for separator is a single ':'; '::' is a scope operator.
+            colon = re.search(r'(?<!:):(?!:)', header)
+            # A ';' before the colon means a classic for (e.g. with a ?: in it).
+            if colon is None or ';' in header[:colon.start()]:
+                print(f'  SKIP {tag}L{lnum}: not a range-based for')
+                continue
+            if header.startswith('const '):
+                print(f'  SKIP {tag}L{lnum}: already const')
+                continue
+            # Slice rather than rebuild from regex groups so the rest of the
+            # line, including its terminator, is carried over untouched.
+            lines[idx] = leading + 'for (const ' + header
+            fixed += 1
+            continue
+
+        # Regular declaration
+        lines[idx] = leading + 'const ' + stripped
+        fixed += 1
+
+    print(f'  -> {tag}const fixed: {fixed}')
+    return lines
+
+
+# ============================================================================
+# File: include/ufsecp/ufsecp_impl.cpp
+# ============================================================================
+
+def fix_ufsecp_impl():
+    """Apply the code-scanning fixes for include/ufsecp/ufsecp_impl.cpp.
+
+    All rewrites that keep the line count unchanged (const, auto, init,
+    widening) run first; add_braces runs last because it inserts closing-brace
+    lines and would otherwise shift every alert below an insertion.
+    NOTE(review): this assumes all alert line numbers below refer to the file
+    exactly as read() returns it -- confirm against the scan report.
+    """
+    path = 'include/ufsecp/ufsecp_impl.cpp'
+    print(f'\n=== {path} ===')
+    lines = read(path)
+
+    # --- misc-const-correctness ---
+    const_lines = [1366, 1855, 1905, 2075, 3147, 3167, 3172]
+    lines = add_const_to_lines(lines, const_lines, 'ufsecp_impl/')
+
+    # --- modernize-use-auto ---
+    # L1573: uint32_t nk = static_cast<uint32_t>(...) -> auto nk = ...
+    # L1846: uint32_t cc32 = static_cast<uint32_t>(...) -> auto cc32 = ...
+    for lnum in [1573, 1846]:
+        idx = lnum - 1
+        line = lines[idx]
+        m = re.match(r'(\s*)uint32_t (\w+) = (static_cast<uint32_t>\(.+)', line)
+        if m:
+            # '.+' stops before the newline; re-append the unmatched tail so the
+            # line keeps its terminator instead of fusing with the next one.
+            lines[idx] = f'{m.group(1)}auto {m.group(2)} = {m.group(3)}' + line[m.end():]
+            print(f'  AUTO: L{lnum}')
+
+    # --- cppcoreguidelines-init-variables ---
+    # L1655: uint32_t nk; -> uint32_t nk = 0;
+    idx = 1655 - 1
+    if '    uint32_t nk;' in lines[idx]:
+        lines[idx] = lines[idx].replace('uint32_t nk;', 'uint32_t nk = 0;')
+        print('  INIT: L1655')
+
+    # L1706: { uint32_t nk; -> { uint32_t nk = 0;
+    idx = 1706 - 1
+    if 'uint32_t nk;' in lines[idx]:
+        lines[idx] = lines[idx].replace('uint32_t nk;', 'uint32_t nk = 0;')
+        print('  INIT: L1706')
+
+    # L1761: same pattern
+    idx = 1761 - 1
+    if 'uint32_t nk;' in lines[idx]:
+        lines[idx] = lines[idx].replace('uint32_t nk;', 'uint32_t nk = 0;')
+        print('  INIT: L1761')
+
+    # --- bugprone-implicit-widening-of-multiplication-result ---
+    # L1578: keyagg_out + 38 + i * 32 -> keyagg_out + 38 + static_cast<size_t>(i) * 32
+    idx = 1578 - 1
+    if 'i * 32' in lines[idx] and 'static_cast<size_t>(i)' not in lines[idx]:
+        lines[idx] = lines[idx].replace(
+            'keyagg_out + 38 + i * 32',
+            'keyagg_out + 38 + static_cast<size_t>(i) * 32'
+        )
+        print('  WIDENING: L1578')
+
+    # --- readability-braces-around-statements (59 alerts) ---
+    # Must stay the last step: it is the only one that changes the line count.
+    brace_lines = [
+        1242, 1245, 1248, 1260, 1274, 1277, 1281, 1294, 1297,
+        1300, 1314, 1318, 1322, 1340, 1343, 1345, 1355, 1368,
+        1412, 1415, 1431, 1435, 1438, 1457, 1462, 1477, 1481,
+        1486, 1514, 1516, 1519, 1522, 1525, 1542, 1545, 1549,
+        1567, 1577, 1594, 1691, 1695, 1699, 1701, 1749, 1753,
+        1787, 1801, 1831, 1834, 1844, 1856, 1974, 2047, 2068,
+        2071, 2076, 2138, 2832, 2834,
+    ]
+    lines = add_braces(lines, brace_lines, 'ufsecp_impl/')
+
+    save(path, lines)
+    print(f'  Saved {path} ({len(lines)} lines)')
+
+
+# ============================================================================
+# File: cpu/src/bip39.cpp
+# ============================================================================
+
+def fix_bip39():
+    """Apply the code-scanning fixes for cpu/src/bip39.cpp.
+
+    All rewrites that keep the line count unchanged (const, init, auto,
+    fclose) run first; add_braces runs last because it inserts closing-brace
+    lines and would otherwise shift every alert below an insertion.
+    NOTE(review): this assumes all alert line numbers below refer to the file
+    exactly as read() returns it -- confirm against the scan report.
+    """
+    path = 'cpu/src/bip39.cpp'
+    print(f'\n=== {path} ===')
+    lines = read(path)
+
+    const_lines = [33, 46, 47, 97, 126, 127, 128, 129, 136, 145,
+                   182, 183, 184, 185, 191, 193, 194, 199,
+                   255, 256, 257, 258, 264, 266, 267, 272]
+    lines = add_const_to_lines(lines, const_lines, 'bip39/')
+
+    # --- cppcoreguidelines-init-variables ---
+    # L137: uninitialized scalar declaration; the type is detected from the line
+    idx = 137 - 1
+    line = lines[idx]
+    # Pattern: TYPE var; (uninitialized) - add = 0 or = {} or = nullptr
+    m = re.match(r'(\s*)((?:int|uint\w*|size_t|bool|char|float|double)\s+\w+);(\s*(?://.*)?)\n', line)
+    if m:
+        type_and_var = m.group(2).rstrip()
+        # Determine default value
+        if 'bool' in type_and_var:
+            default = 'false'
+        elif 'float' in type_and_var or 'double' in type_and_var:
+            default = '0.0'
+        elif 'char*' in type_and_var or 'uint8_t*' in type_and_var:
+            default = 'nullptr'
+        else:
+            default = '0'
+        lines[idx] = f'{m.group(1)}{type_and_var} = {default};{m.group(3)}\n'
+        print(f'  INIT: L137 -> added = {default}')
+    else:
+        print(f'  INIT_SKIP: L137 pattern not matched: {repr(line[:60])}')
+
+    # --- modernize-use-auto ---
+    # L191 and L264: iterator/auto type replacement. Both lines are also in
+    # const_lines, so a leading 'const ' is captured with the indentation and
+    # kept; '.+' stops before the newline, so the unmatched tail of the line is
+    # re-appended to preserve the terminator.
+    for lnum in [191, 264]:
+        idx = lnum - 1
+        line = lines[idx]
+        # Pattern: SomeType::iterator it = or std::vector<...>::iterator it =
+        m = re.match(r'(\s*(?:const\s+)?)(\w[\w:<>, *]+::iterator)(\s+\w+\s*=.+)', line)
+        if m:
+            lines[idx] = f'{m.group(1)}auto{m.group(3)}' + line[m.end():]
+            print(f'  AUTO: L{lnum}')
+        else:
+            # Try: SomeType it = container.begin()
+            m2 = re.match(r'(\s*(?:const\s+)?)(\w[\w:<>, *]+\*?)(\s+\w+\s*=\s*\w.+\.begin\(\).+)', line)
+            if m2:
+                lines[idx] = f'{m2.group(1)}auto{m2.group(3)}' + line[m2.end():]
+                print(f'  AUTO: L{lnum}')
+            else:
+                print(f'  AUTO_SKIP: L{lnum}: {repr(line[:60])}')
+
+    # --- cert-err33-c (unchecked fclose return) ---
+    # L34: std::fclose(f); -> (void)std::fclose(f);
+    idx = 34 - 1
+    line = lines[idx]
+    if 'std::fclose' in line and '(void)' not in line:
+        lines[idx] = line.replace('std::fclose', '(void)std::fclose')
+        print('  ERR33: L34 fclose')
+
+    # --- readability-braces-around-statements ---
+    # Must stay the last step: it is the only one that changes the line count.
+    brace_lines = [49, 50, 93, 110, 117, 138, 140, 150, 171, 196, 200, 223,
+                   246, 269, 273]
+    lines = add_braces(lines, brace_lines, 'bip39/')
+
+    save(path, lines)
+    print(f'  Saved {path} ({len(lines)} lines)')
+
+
+# ============================================================================
+# File: cpu/src/zk.cpp
+# ============================================================================
+
+def fix_zk():
+    """Apply the code-scanning fixes for cpu/src/zk.cpp.
+
+    const is added first and add_braces runs last, because add_braces inserts
+    closing-brace lines and would otherwise shift the const alerts below them.
+    NOTE(review): this assumes all alert line numbers below refer to the file
+    exactly as read() returns it -- confirm against the scan report.
+    """
+    path = 'cpu/src/zk.cpp'
+    print(f'\n=== {path} ===')
+    lines = read(path)
+
+    const_lines = [359, 363, 446, 448, 500, 642, 661]
+    lines = add_const_to_lines(lines, const_lines, 'zk/')
+
+    # Must stay the last step: it is the only one that changes the line count.
+    brace_lines = [45, 68, 381, 415, 423, 481, 503, 610, 615, 619, 623,
+                   664, 668, 675, 686, 688, 720, 785]
+    lines = add_braces(lines, brace_lines, 'zk/')
+
+    save(path, lines)
+    print(f'  Saved {path} ({len(lines)} lines)')
+
+
+# ============================================================================
+# File: cpu/src/message_signing.cpp
+# ============================================================================
+
+def fix_message_signing():
+    """Apply the code-scanning fixes for cpu/src/message_signing.cpp.
+
+    const is added first and add_braces runs last, because add_braces inserts
+    closing-brace lines and would otherwise shift the const alerts below them.
+    NOTE(review): this assumes all alert line numbers below refer to the file
+    exactly as read() returns it -- confirm against the scan report.
+    """
+    path = 'cpu/src/message_signing.cpp'
+    print(f'\n=== {path} ===')
+    lines = read(path)
+
+    const_lines = [65, 152, 153, 154, 155, 159, 193, 196]
+    lines = add_const_to_lines(lines, const_lines, 'msg_signing/')
+
+    # Must stay the last step: it is the only one that changes the line count.
+    brace_lines = [30, 35]
+    lines = add_braces(lines, brace_lines, 'msg_signing/')
+
+    save(path, lines)
+    print(f'  Saved {path} ({len(lines)} lines)')
+
+
+# ============================================================================
+# File: cpu/src/eth_signing.cpp
+# ============================================================================
+
+def fix_eth_signing():
+    """Apply the code-scanning fixes for cpu/src/eth_signing.cpp.
+
+    Deletes the unused 'using fast::Point' declaration on line 16 when it is
+    found there, then adds const on the two flagged declarations, using line
+    numbers adjusted for the deleted line.
+    """
+    path = 'cpu/src/eth_signing.cpp'
+    print(f'\n=== {path} ===')
+    lines = read(path)
+
+    # --- misc-unused-using-decls: L16 'using fast::Point;' ---
+    using_idx = 16 - 1
+    found_using = 'using fast::Point' in lines[using_idx]
+    if not found_using:
+        print(f'  UNUSED-USING SKIP: L16: {lines[using_idx][:50]!r}')
+        const_lines = [96, 97]
+    else:
+        # Drop the line outright; everything below it moves up by one.
+        del lines[using_idx]
+        print('  UNUSED-USING: L16 removed')
+        const_lines = [95, 96]  # shifted from [96, 97]
+
+    lines = add_const_to_lines(lines, const_lines, 'eth_signing/')
+
+    save(path, lines)
+    print(f'  Saved {path} ({len(lines)} lines)')
+
+
+# ============================================================================
+# File: cpu/src/address.cpp
+# ============================================================================
+
+def fix_address():
+    """Patch cpu/src/address.cpp.
+
+    Alerts addressed:
+      * L516: for (char c : prefix)          -> const char c
+      * L527: std::uint8_t version_byte = .. -> const (misc-const-correctness)
+      * L527: modernize-use-auto             -> const auto version_byte
+
+    The const step runs first through add_const_to_lines(); the auto step
+    then rewrites 'const std::uint8_t version_byte' on L527, so it only
+    fires when the const step (or an earlier run) already added 'const'.
+    """
+    target = 'cpu/src/address.cpp'
+    print(f'\n=== {target} ===')
+
+    patched = add_const_to_lines(read(target), [516, 527], 'address/')
+
+    typed_decl = 'const std::uint8_t version_byte'
+    auto_decl = 'const auto version_byte'
+    pos = 527 - 1
+    if pos < len(patched):
+        current = patched[pos]
+        if typed_decl in current:
+            patched[pos] = current.replace(typed_decl, auto_decl)
+            print('  AUTO: L527')
+        elif auto_decl in current:
+            print('  AUTO: L527 already auto')
+        else:
+            print(f'  AUTO_SKIP: L527: {repr(current[:60])}')
+
+    save(target, patched)
+    print(f'  Saved {target} ({len(patched)} lines)')
+
+
+# ============================================================================
+# File: cpu/src/wallet.cpp
+# ============================================================================
+
+def fix_wallet():
+    """Patch cpu/src/wallet.cpp for bugprone-misplaced-widening-cast.
+
+    On L150 and L171 the sum '27 + rsig.recid' is cast to std::uint64_t
+    after the addition; the rewrite casts each operand before adding.
+    Lines past the end of the file are skipped without a message.
+    """
+    target = 'cpu/src/wallet.cpp'
+    print(f'\n=== {target} ===')
+    text_lines = read(target)
+
+    cast_after_add = 'static_cast<std::uint64_t>(27 + rsig.recid)'
+    cast_before_add = ('static_cast<std::uint64_t>(27) + '
+                       'static_cast<std::uint64_t>(rsig.recid)')
+
+    for lnum in (150, 171):
+        pos = lnum - 1
+        if pos < len(text_lines):
+            current = text_lines[pos]
+            if cast_after_add in current:
+                text_lines[pos] = current.replace(cast_after_add, cast_before_add)
+                print(f'  WIDEN: L{lnum}')
+            else:
+                print(f'  WIDEN_SKIP: L{lnum}: {repr(current[:60])}')
+
+    save(target, text_lines)
+    print(f'  Saved {target} ({len(text_lines)} lines)')
+
+
+# ============================================================================
+# File: cpu/src/coin_address.cpp
+# ============================================================================
+
+def fix_coin_address():
+    """Patch cpu/src/coin_address.cpp: make the L170 declaration const.
+
+    Target line: 'std::string prefix = testnet ? ...'.
+    """
+    target = 'cpu/src/coin_address.cpp'
+    print(f'\n=== {target} ===')
+
+    patched = add_const_to_lines(read(target), [170], 'coin_address/')
+
+    save(target, patched)
+    print(f'  Saved {target} ({len(patched)} lines)')
+
+
+# ============================================================================
+# File: cpu/tests/test_bip39.cpp
+# ============================================================================
+
+# C++ source templates used by fix_test_bip39(). Each *_SSCANF_* / *_OLD
+# string must match the text in cpu/tests/test_bip39.cpp exactly (whitespace
+# included) for the substring replacement to fire; the matching *_STRTOUL_* /
+# *_NEW string is what gets written in its place.
+
+# Current hex_to_bytes(): parses each byte with std::sscanf inside clang
+# deprecation-warning pragmas.
+HEX_TO_BYTES_SSCANF_BIP39 = '''\
+static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
+    for (size_t i = 0; i < len; ++i) {
+        unsigned int byte = 0;
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+        std::sscanf(hex + 2 * i, "%02x", &byte);
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+        out[i] = static_cast<uint8_t>(byte);
+    }
+}'''
+
+# Replacement hex_to_bytes(): copies two hex digits into a NUL-terminated
+# buffer and parses them with std::strtoul; a pair that does not parse fully
+# yields 0. The '\\0' below is emitted into the C++ text as '\0'.
+# NOTE(review): the replacement calls std::strtoul -- confirm the target file
+# already includes <cstdlib>; this script does not add the include.
+HEX_TO_BYTES_STRTOUL_BIP39 = '''\
+static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
+    for (size_t i = 0; i < len; ++i) {
+        char pair[3] = { hex[2 * i], hex[2 * i + 1], '\\0' };
+        char* endptr = nullptr;
+        const unsigned long val = std::strtoul(pair, &endptr, 16);
+        out[i] = (endptr == pair + 2) ? static_cast<uint8_t>(val) : 0;
+    }
+}'''
+
+# Current bytes_to_hex(): discards the std::snprintf return value implicitly.
+BYTES_TO_HEX_OLD = '''\
+static std::string bytes_to_hex(const uint8_t* data, size_t len) {
+    std::string result;
+    result.reserve(len * 2);
+    for (size_t i = 0; i < len; ++i) {
+        char buf[3];
+        std::snprintf(buf, sizeof(buf), "%02x", data[i]);
+        result += buf;
+    }
+    return result;
+}'''
+
+# Replacement bytes_to_hex(): identical except the snprintf result is
+# explicitly cast to void.
+BYTES_TO_HEX_NEW = '''\
+static std::string bytes_to_hex(const uint8_t* data, size_t len) {
+    std::string result;
+    result.reserve(len * 2);
+    for (size_t i = 0; i < len; ++i) {
+        char buf[3];
+        (void)std::snprintf(buf, sizeof(buf), "%02x", data[i]);
+        result += buf;
+    }
+    return result;
+}'''
+
+
+def fix_test_bip39():
+    """Patch cpu/tests/test_bip39.cpp.
+
+    Phase 1 works on the whole file text and applies three exact-substring
+    replacements, reporting a hit or a skip for each:
+      * cert-err33-c / cert-err34-c: hex_to_bytes sscanf -> strtoul
+      * cert-err33-c: bytes_to_hex snprintf -> (void)snprintf
+      * clang-analyzer-core.NullDereference: early return after the
+        'wl != nullptr' CHECK
+    Phase 2 re-reads the file as lines and adds 'const' by text pattern
+    rather than by line number, because phase 1 can change the line count.
+    """
+    target = 'cpu/tests/test_bip39.cpp'
+    print(f'\n=== {target} ===')
+    source_file = BASE / target
+    text = source_file.read_text()
+
+    null_before = '    CHECK(wl != nullptr, "wordlist not null");\n    CHECK(std::strcmp(wl[0]'
+    null_after = '    CHECK(wl != nullptr, "wordlist not null");\n    if (!wl) { return; }\n    CHECK(std::strcmp(wl[0]'
+
+    # (text to find, replacement, message on hit, message on miss)
+    swaps = [
+        (HEX_TO_BYTES_SSCANF_BIP39, HEX_TO_BYTES_STRTOUL_BIP39,
+         '  ERR34: hex_to_bytes sscanf -> strtoul',
+         '  ERR34_SKIP: hex_to_bytes sscanf pattern not found'),
+        (BYTES_TO_HEX_OLD, BYTES_TO_HEX_NEW,
+         '  ERR33: bytes_to_hex snprintf -> (void)snprintf',
+         '  ERR33_SKIP: bytes_to_hex pattern not found'),
+        (null_before, null_after,
+         '  NULL_DEREF: L99 added null guard',
+         '  NULL_DEREF_SKIP: pattern not found'),
+    ]
+    for before, after, hit_msg, miss_msg in swaps:
+        if before in text:
+            text = text.replace(before, after)
+            print(hit_msg)
+        else:
+            print(miss_msg)
+
+    source_file.write_text(text)
+    rows = read(target)
+
+    # std::string hex = bytes_to_hex(...) -> const std::string hex = ...
+    hex_decl = 'std::string hex = bytes_to_hex('
+    for pos, row in enumerate(rows):
+        body = row.lstrip()
+        if body.startswith(hex_decl):
+            indent = row[:len(row) - len(body)]
+            rows[pos] = indent + 'const ' + body
+            print(f'  CONST: L{pos+1} std::string hex')
+
+    # for (char c : mnemonic) -> for (const char c : mnemonic)
+    loop_before = 'for (char c : mnemonic)'
+    loop_after = 'for (const char c : mnemonic)'
+    for pos, row in enumerate(rows):
+        if loop_before in row:
+            rows[pos] = row.replace(loop_before, loop_after)
+            print(f'  CONST: L{pos+1} for (char c : mnemonic)')
+
+    save(target, rows)
+    print(f'  Saved {target} ({len(rows)} lines)')
+
+
+# ============================================================================
+# File: cpu/tests/test_ethereum.cpp
+# ============================================================================
+
+# C++ source templates used by fix_test_ethereum(). The *_SSCANF_* / *_OLD
+# strings must match cpu/tests/test_ethereum.cpp exactly for the substring
+# replacement to fire.
+
+# Current hex_to_bytes(): std::sscanf per byte, falling back to 0 when the
+# conversion count is not 1.
+HEX_TO_BYTES_SSCANF_ETH = '''\
+static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
+    for (size_t i = 0; i < len; ++i) {
+        unsigned int byte = 0;
+        if (std::sscanf(hex + i * 2, "%02x", &byte) != 1) byte = 0;
+        out[i] = static_cast<uint8_t>(byte);
+    }
+}'''
+
+# Replacement hex_to_bytes(): parses a two-character, NUL-terminated copy with
+# std::strtoul and writes 0 unless both characters were consumed.
+# NOTE(review): confirm the target file includes <cstdlib> for std::strtoul.
+HEX_TO_BYTES_STRTOUL_ETH = '''\
+static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
+    for (size_t i = 0; i < len; ++i) {
+        char pair[3] = { hex[i * 2], hex[i * 2 + 1], '\\0' };
+        char* endptr = nullptr;
+        const unsigned long val = std::strtoul(pair, &endptr, 16);
+        out[i] = (endptr == pair + 2) ? static_cast<uint8_t>(val) : 0;
+    }
+}'''
+
+# First line of the snprintf call flagged by cert-err33-c, before and after
+# the explicit (void) cast. Only this line is matched; the call's remaining
+# argument lines are left untouched.
+SNPRINTF_ETH_OLD = '        std::snprintf(buf, sizeof(buf), "Round-trip chain_id=%lu (%s)",'
+SNPRINTF_ETH_NEW = '        (void)std::snprintf(buf, sizeof(buf), "Round-trip chain_id=%lu (%s)",'
+
+
+def fix_test_ethereum():
+    """Patch cpu/tests/test_ethereum.cpp.
+
+    Phase 1 (whole-file text):
+      * cert-err34-c: hex_to_bytes sscanf -> strtoul
+      * cert-err33-c: snprintf -> (void)snprintf
+      * readability-simplify-boolean-expr: each 'sig.v == a || sig.v == b'
+        ASSERT_TRUE is wrapped in a block that first stores the condition
+        in a named const bool.
+    Phase 2 (line by line): add 'const' to declarations matched by pattern.
+    Patterns are used instead of line numbers because phase 1 inserts lines.
+    """
+    path = 'cpu/tests/test_ethereum.cpp'
+    print(f'\n=== {path} ===')
+    content = (BASE / path).read_text()
+
+    # cert-err34-c: sscanf -> strtoul
+    if HEX_TO_BYTES_SSCANF_ETH in content:
+        content = content.replace(HEX_TO_BYTES_SSCANF_ETH, HEX_TO_BYTES_STRTOUL_ETH)
+        print('  ERR34: hex_to_bytes sscanf -> strtoul')
+    else:
+        print('  ERR34_SKIP: hex_to_bytes pattern not found')
+
+    # cert-err33-c at L352: snprintf return unchecked
+    if SNPRINTF_ETH_OLD in content:
+        content = content.replace(SNPRINTF_ETH_OLD, SNPRINTF_ETH_NEW)
+        print('  ERR33: snprintf -> (void)snprintf')
+    else:
+        print('  ERR33_SKIP: snprintf pattern not found')
+
+    # readability-simplify-boolean-expr: extract conditions to named bools.
+    bool_rewrites = [
+        ('    ASSERT_TRUE(sig.v == 27 || sig.v == 28, "legacy v should be 27 or 28");',
+         '    {\n        const bool v_ok = (sig.v == 27 || sig.v == 28);\n        ASSERT_TRUE(v_ok, "legacy v should be 27 or 28");\n    }'),
+        ('    ASSERT_TRUE(sig2.v == 37 || sig2.v == 38, "EIP-155 v should be 37 or 38");',
+         '    {\n        const bool v2_ok = (sig2.v == 37 || sig2.v == 38);\n        ASSERT_TRUE(v2_ok, "EIP-155 v should be 37 or 38");\n    }'),
+        ('    ASSERT_TRUE(sig.v == 27 || sig.v == 28, "v should be 27 or 28");',
+         '    {\n        const bool v_ok2 = (sig.v == 27 || sig.v == 28);\n        ASSERT_TRUE(v_ok2, "v should be 27 or 28");\n    }'),
+    ]
+    matched = 0
+    for old, new in bool_rewrites:
+        if old in content:
+            content = content.replace(old, new)
+            matched += 1
+    # Report what actually happened, like the ERR33/ERR34 steps do, instead
+    # of announcing success unconditionally.
+    print(f'  SIMPLIFY-BOOL: test_ethereum sig.v checks '
+          f'({matched} of {len(bool_rewrites)} patterns replaced)')
+
+    (BASE / path).write_text(content)
+
+    # Add const to variable declarations (by pattern).
+    lines = read(path)
+
+    # (regex anchored at the first non-blank character, label for the log)
+    const_rules = [
+        (r'Point pk\d? = ', 'Point pk'),
+        (r'(bool (valid|wrong\d?|r_zero|s_zero|all_zero)) = ', 'bool'),
+        (r'std::array<uint8_t, 32> (hash|wrong_hash|zero)\{\}', 'array'),
+        # Both spellings were matched by the original script; keep both.
+        (r'auto expected_addr = ether(net|eum)_address_bytes', 'auto expected_addr'),
+    ]
+
+    for idx, line in enumerate(lines):
+        stripped = line.lstrip()
+        if stripped.startswith('const '):
+            continue
+        for pattern, label in const_rules:
+            if re.match(pattern, stripped):
+                leading = line[:len(line) - len(stripped)]
+                lines[idx] = leading + 'const ' + stripped
+                print(f'  CONST: L{idx+1} {label}')
+                break
+
+    save(path, lines)
+    print(f'  Saved {path} ({len(lines)} lines)')
+
+
+# ============================================================================
+# File: cpu/tests/test_wallet.cpp
+# ============================================================================
+
+# C++ source templates used by fix_test_wallet(). The *_SSCANF_* string must
+# match cpu/tests/test_wallet.cpp exactly for the replacement to fire.
+
+# Current hex_to_bytes(): std::sscanf per byte, falling back to 0 when the
+# conversion count is not 1.
+HEX_TO_BYTES_SSCANF_WALLET = '''\
+static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
+    for (size_t i = 0; i < len; ++i) {
+        unsigned int byte = 0;
+        if (std::sscanf(hex + i * 2, "%02x", &byte) != 1) byte = 0;
+        out[i] = static_cast<uint8_t>(byte);
+    }
+}'''
+
+# Replacement hex_to_bytes(): parses a two-character, NUL-terminated copy with
+# std::strtoul and writes 0 unless both characters were consumed.
+# NOTE(review): confirm the target file includes <cstdlib> for std::strtoul.
+HEX_TO_BYTES_STRTOUL_WALLET = '''\
+static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
+    for (size_t i = 0; i < len; ++i) {
+        char pair[3] = { hex[i * 2], hex[i * 2 + 1], '\\0' };
+        char* endptr = nullptr;
+        const unsigned long val = std::strtoul(pair, &endptr, 16);
+        out[i] = (endptr == pair + 2) ? static_cast<uint8_t>(val) : 0;
+    }
+}'''
+
+
+def fix_test_wallet():
+    """Patch cpu/tests/test_wallet.cpp.
+
+    Phase 1 (whole-file text):
+      * misc-unused-using-decls: drop every 'using fast::Point;' line
+      * cert-err34-c: hex_to_bytes sscanf -> strtoul
+      * readability-simplify-boolean-expr: wrap four compound ASSERT_TRUE
+        conditions in a block that names the condition first.
+    Phase 2 (line by line): add 'const' to declarations matched by pattern.
+    """
+    path = 'cpu/tests/test_wallet.cpp'
+    print(f'\n=== {path} ===')
+    content = (BASE / path).read_text()
+
+    # misc-unused-using-decls: L45 'using fast::Point;'
+    if 'using fast::Point;\n' in content:
+        content = content.replace('using fast::Point;\n', '')
+        print('  UNUSED-USING: removed using fast::Point')
+    else:
+        print('  UNUSED-USING SKIP: using fast::Point not found')
+
+    # cert-err34-c: sscanf -> strtoul
+    if HEX_TO_BYTES_SSCANF_WALLET in content:
+        content = content.replace(HEX_TO_BYTES_SSCANF_WALLET, HEX_TO_BYTES_STRTOUL_WALLET)
+        print('  ERR34: hex_to_bytes sscanf -> strtoul')
+    else:
+        print('  ERR34_SKIP: hex_to_bytes sscanf pattern not found')
+
+    # readability-simplify-boolean-expr: extract to named bools.
+    bool_rewrites = [
+        # L197
+        ('    ASSERT_TRUE(wif[0] == \'K\' || wif[0] == \'L\', "WIF starts with K or L");',
+         '    {\n        const bool wif_prefix_ok = (wif[0] == \'K\' || wif[0] == \'L\');\n        ASSERT_TRUE(wif_prefix_ok, "WIF starts with K or L");\n    }'),
+        # L397
+        ('    ASSERT_TRUE(sig.recid >= 0 && sig.recid <= 3, "valid recid");',
+         '    {\n        const bool recid_ok = (sig.recid >= 0 && sig.recid <= 3);\n        ASSERT_TRUE(recid_ok, "valid recid");\n    }'),
+        # L505
+        ('    ASSERT_TRUE(!btc.empty() && !ltc.empty() && !doge.empty(), "all non-empty");',
+         '    {\n        const bool coins_non_empty = !btc.empty() && !ltc.empty() && !doge.empty();\n        ASSERT_TRUE(coins_non_empty, "all non-empty");\n    }'),
+        # L602 (multi-line ASSERT_TRUE)
+        ('    ASSERT_TRUE(!p2pkh.empty() && !p2wpkh.empty() && !p2sh.empty() && !p2tr.empty(),\n                "all non-empty");',
+         '    {\n        const bool addrs_non_empty = !p2pkh.empty() && !p2wpkh.empty() && !p2sh.empty() && !p2tr.empty();\n        ASSERT_TRUE(addrs_non_empty, "all non-empty");\n    }'),
+    ]
+    matched = 0
+    for old, new in bool_rewrites:
+        if old in content:
+            content = content.replace(old, new)
+            matched += 1
+    # Report the real hit count rather than always claiming all four.
+    print(f'  SIMPLIFY-BOOL: {matched} of {len(bool_rewrites)} bool expressions extracted')
+
+    (BASE / path).write_text(content)
+
+    # Add const to variable declarations (by pattern matching).
+    # Original alert lines: msg_len at L290/L314/L336/L366/L418/L437,
+    # ok at L293, bad at L298, verified at L369.
+    const_rules = [
+        (r'size_t msg_len = sizeof\(msg\) - 1;', 'size_t msg_len'),
+        (r'bool ok = bitcoin_verify_message\(', 'bool ok'),
+        (r'bool bad = bitcoin_verify_message\(', 'bool bad'),
+        (r'bool verified = verify_message\(', 'bool verified'),
+    ]
+
+    lines = read(path)
+    for idx, line in enumerate(lines):
+        stripped = line.lstrip()
+        if stripped.startswith('const '):
+            continue
+        for pattern, label in const_rules:
+            if re.match(pattern, stripped):
+                leading = line[:len(line) - len(stripped)]
+                lines[idx] = leading + 'const ' + stripped
+                print(f'  CONST: L{idx+1} {label}')
+                break
+
+    save(path, lines)
+    print(f'  Saved {path} ({len(lines)} lines)')
+
+
+# ============================================================================
+# File: cpu/tests/test_zk.cpp
+# ============================================================================
+
+def fix_test_zk():
+    """Patch cpu/tests/test_zk.cpp: add const at ten fixed line numbers.
+
+    All ten alerts for this file are misc-const-correctness.
+    """
+    target = 'cpu/tests/test_zk.cpp'
+    print(f'\n=== {target} ===')
+
+    needs_const = [60, 95, 103, 117, 134, 267, 281, 295, 309, 325]
+    patched = add_const_to_lines(read(target), needs_const, 'test_zk/')
+
+    save(target, patched)
+    print(f'  Saved {target} ({len(patched)} lines)')
+
+
+# ============================================================================
+# File: audit/test_ffi_round_trip.cpp
+# ============================================================================
+
+def fix_test_ffi():
+    """Patch audit/test_ffi_round_trip.cpp.
+
+    Phase 1 (whole-file text): misc-redundant-expression at L1055. The
+    check 'validate(m) != OK || validate(m) == OK' is always true; it is
+    replaced by a check that validate(m) == OK. An exact-text match is
+    tried first, then a whitespace-tolerant regex. Every outcome of the
+    fallback is reported.
+    Phase 2 (line by line): add 'const' to two declarations by pattern,
+    since phase 1 changes the line count.
+    """
+    path = 'audit/test_ffi_round_trip.cpp'
+    print(f'\n=== {path} ===')
+    content = (BASE / path).read_text()
+
+    # L1055: misc-redundant-expression (tautological check)
+    # Fix: remove the first redundant half of the OR expression
+    old_check = (
+        'CHECK(ufsecp_bip39_validate(ctx, "abandon abandon abandon abandon abandon abandon '
+        'abandon abandon abandon abandon abandon abandon") != UFSECP_OK\n'
+        '          || ufsecp_bip39_validate(ctx, "abandon abandon abandon abandon '
+        'abandon abandon abandon abandon abandon abandon abandon abandon") == UFSECP_OK,\n'
+        '          "bip39_validate accepts or rejects known mnemonic");'
+    )
+    new_check = (
+        'CHECK(ufsecp_bip39_validate(ctx, "abandon abandon abandon abandon abandon abandon '
+        'abandon abandon abandon abandon abandon abandon") == UFSECP_OK,\n'
+        '          "bip39_validate accepts valid 12-word mnemonic");'
+    )
+    if old_check in content:
+        content = content.replace(old_check, new_check)
+        print('  REDUNDANT: L1055 tautological check fixed')
+    else:
+        print('  REDUNDANT_SKIP: L1055 exact pattern not found, trying partial match')
+        if 'bip39_validate accepts or rejects known mnemonic' in content:
+            # The regex stops at the closing ')' and leaves the trailing
+            # ';' in place, so the replacement is new_check without it.
+            pattern = re.compile(
+                r'CHECK\(ufsecp_bip39_validate\(ctx,\s*"abandon[^"]+"\)\s*!=\s*UFSECP_OK\s*\n'
+                r'\s*\|\|\s*ufsecp_bip39_validate\(ctx,\s*"abandon[^"]+"\)\s*==\s*UFSECP_OK,\s*\n'
+                r'\s*"bip39_validate accepts or rejects known mnemonic"\)'
+            )
+            content, n = pattern.subn(new_check[:-1], content)
+            if n:
+                print(f'  REDUNDANT: L1055 fixed via regex ({n} replacement)')
+            else:
+                print('  REDUNDANT_FAIL: could not fix L1055')
+        else:
+            # Previously this path ended silently after announcing a retry.
+            print('  REDUNDANT_FAIL: could not fix L1055 (check message text not found)')
+
+    (BASE / path).write_text(content)
+    lines = read(path)
+
+    # L1317: size_t msg_len = 15; -> const size_t msg_len = 15;
+    # L1538: bool match = ... -> const bool match = ...
+    for idx, line in enumerate(lines):
+        stripped = line.lstrip()
+        if stripped.startswith('const '):
+            continue
+
+        leading = line[:len(line) - len(stripped)]
+        if stripped == 'size_t msg_len = 15;\n':
+            lines[idx] = leading + 'const ' + stripped
+            print(f'  CONST: L{idx+1} size_t msg_len = 15')
+        elif stripped.startswith('bool match = (std::memcmp('):
+            lines[idx] = leading + 'const ' + stripped
+            print(f'  CONST: L{idx+1} bool match')
+
+    save(path, lines)
+    print(f'  Saved {path} ({len(lines)} lines)')
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
+if __name__ == '__main__':
+    # Script entry point: run every per-file fixer in a fixed order, then do
+    # a coarse sanity check on the touched files by comparing the number of
+    # '{' and '}' characters in each (a raw character count -- braces in
+    # strings and comments are included).
+    rule = '=' * 60
+    print('Fix Round 4: resolving 211 code-scanning alerts')
+    print(rule)
+
+    fixers = (
+        fix_ufsecp_impl,
+        fix_bip39,
+        fix_zk,
+        fix_message_signing,
+        fix_eth_signing,
+        fix_address,
+        fix_wallet,
+        fix_coin_address,
+        fix_test_bip39,
+        fix_test_ethereum,
+        fix_test_wallet,
+        fix_test_zk,
+        fix_test_ffi,
+    )
+    for run_fix in fixers:
+        run_fix()
+
+    print('\n' + rule)
+    print('Done. Check brace balance:')
+    touched = [
+        'include/ufsecp/ufsecp_impl.cpp',
+        'cpu/src/bip39.cpp',
+        'cpu/src/zk.cpp',
+        'cpu/src/message_signing.cpp',
+        'cpu/src/eth_signing.cpp',
+        'cpu/src/address.cpp',
+        'cpu/src/wallet.cpp',
+        'cpu/src/coin_address.cpp',
+        'cpu/tests/test_bip39.cpp',
+        'cpu/tests/test_ethereum.cpp',
+        'cpu/tests/test_wallet.cpp',
+        'cpu/tests/test_zk.cpp',
+        'audit/test_ffi_round_trip.cpp',
+    ]
+    problems = 0
+    for rel in touched:
+        try:
+            source = (BASE / rel).read_text()
+            n_open = source.count('{')
+            n_close = source.count('}')
+            if n_open == n_close:
+                print(f'  {rel}: OK')
+            else:
+                print(f'  {rel}: MISMATCH ({n_open} vs {n_close})')
+                problems += 1
+        except Exception as exc:
+            # Any failure to read a file counts as a problem.
+            print(f'  {rel}: ERROR {exc}')
+            problems += 1
+
+    if problems == 0:
+        print('\nAll brace counts balanced.')
+    else:
+        print('\nWARNING: Some files have mismatched braces!')
--- a/gpu/src/gpu_backend_opencl.cpp
+++ b/gpu/src/gpu_backend_opencl.cpp
@ -197,13 +197,11 @@ public:
        std::vector<secp256k1::opencl::AffinePoint> h_aff(count);
        ctx_->batch_jacobian_to_affine(h_jac.data(), h_aff.data(), count);

-        /* CPU: SHA-256(x_bytes) → 32-byte shared secret */
+        /* CPU: SHA-256(compressed shared point) to match ufsecp_ecdh/CUDA. */
        for (size_t i = 0; i < count; ++i) {
-            std::array<uint64_t, 4> xl;
-            std::memcpy(xl.data(), h_aff[i].x.limbs, 32);
-            auto fe = secp256k1::fast::FieldElement::from_limbs(xl);
-            auto xbytes = fe.to_bytes();
-            auto digest = secp256k1::SHA256::hash(xbytes.data(), 32);
+            uint8_t compressed[33];
+            affine_to_compressed(&h_aff[i], compressed);
+            auto digest = secp256k1::SHA256::hash(compressed, sizeof(compressed));
            std::memcpy(out_secrets32 + i * 32, digest.data(), 32);
        }

--- a/ocl_audit_report.json
+++ b/ocl_audit_report.json
@ -6,7 +6,7 @@
    "vendor": "NVIDIA Corporation",
    "version": "OpenCL 3.0 CUDA",
    "driver_version": "580.126.09",
-    "memory_mb": 15847,
+    "memory_mb": 15844,
    "compute_units": 36
  },
  "platform": {
@ -20,36 +20,36 @@
    "passed": 27,
    "failed": 0,
    "skipped": 0,
-    "total_seconds": 0.727543,
+    "total_seconds": 0.673606,
    "verdict": "AUDIT-READY"
  },
  "modules": [
-    { "id": "selftest_core", "name": "OpenCL Selftest (23+ kernel tests)", "section": "math_invariants", "result": "PASS", "time_ms": 152.799583, "error_code": 0 },
-    { "id": "field_add_sub", "name": "Field add/sub roundtrip", "section": "math_invariants", "result": "PASS", "time_ms": 0.307649, "error_code": 0 },
-    { "id": "field_mul_comm", "name": "Field mul commutativity", "section": "math_invariants", "result": "PASS", "time_ms": 0.140150, "error_code": 0 },
-    { "id": "field_inv", "name": "Field inverse roundtrip (a * a^-1 = 1)", "section": "math_invariants", "result": "PASS", "time_ms": 0.266819, "error_code": 0 },
-    { "id": "field_sqr", "name": "Field square == mul(a,a)", "section": "math_invariants", "result": "PASS", "time_ms": 0.120384, "error_code": 0 },
-    { "id": "field_negate", "name": "Field negate roundtrip (a + (-a) = 0)", "section": "math_invariants", "result": "PASS", "time_ms": 0.118151, "error_code": 0 },
-    { "id": "gen_mul_vec", "name": "Generator mul known vectors", "section": "math_invariants", "result": "PASS", "time_ms": 0.069495, "error_code": 0 },
-    { "id": "scalar_roundtrip", "name": "Scalar/Point consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.196458, "error_code": 0 },
-    { "id": "add_dbl_consist", "name": "Point add vs double consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.226176, "error_code": 0 },
-    { "id": "scalar_mul_lin", "name": "Scalar mul linearity (a+b)*G = aG+bG", "section": "math_invariants", "result": "PASS", "time_ms": 0.334149, "error_code": 0 },
-    { "id": "group_order", "name": "Group order basic checks", "section": "math_invariants", "result": "PASS", "time_ms": 0.224126, "error_code": 0 },
-    { "id": "batch_inv", "name": "Batch inversion (Montgomery trick)", "section": "math_invariants", "result": "PASS", "time_ms": 0.639383, "error_code": 0 },
-    { "id": "ecdsa_roundtrip", "name": "ECDSA sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 7.346328, "error_code": 0 },
-    { "id": "schnorr_roundtrip", "name": "Schnorr/BIP-340 sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 6.647268, "error_code": 0 },
-    { "id": "ecdsa_wrong_key", "name": "ECDSA verify rejects wrong pubkey", "section": "signatures", "result": "PASS", "time_ms": 6.214200, "error_code": 0 },
-    { "id": "batch_smul", "name": "Batch scalar mul generator", "section": "batch_advanced", "result": "PASS", "time_ms": 0.435053, "error_code": 0 },
-    { "id": "batch_j2a", "name": "Batch Jacobian to Affine", "section": "batch_advanced", "result": "PASS", "time_ms": 0.518009, "error_code": 0 },
-    { "id": "diff_smul", "name": "OpenCL-host differential scalar mul", "section": "differential", "result": "PASS", "time_ms": 0.070056, "error_code": 0 },
-    { "id": "rfc6979_determ", "name": "RFC-6979 ECDSA deterministic nonce", "section": "standard_vectors", "result": "PASS", "time_ms": 6.422079, "error_code": 0 },
-    { "id": "bip340_vectors", "name": "BIP-340 Schnorr known-key roundtrip", "section": "standard_vectors", "result": "PASS", "time_ms": 8.872533, "error_code": 0 },
-    { "id": "ecdsa_multi_key", "name": "ECDSA multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 60.908449, "error_code": 0 },
-    { "id": "schnorr_multi_key", "name": "Schnorr multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 55.286184, "error_code": 0 },
-    { "id": "fuzz_edge_scalar", "name": "Edge-case scalars (0*G, 1*G, G+G=2G)", "section": "fuzzing", "result": "PASS", "time_ms": 0.229781, "error_code": 0 },
-    { "id": "fuzz_ecdsa_zero", "name": "ECDSA rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.077824, "error_code": 0 },
-    { "id": "fuzz_schnorr_zero", "name": "Schnorr rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.082019, "error_code": 0 },
-    { "id": "perf_ecdsa_50", "name": "ECDSA 50-iteration stress", "section": "performance", "result": "PASS", "time_ms": 287.677880, "error_code": 0 },
-    { "id": "perf_schnorr_25", "name": "Schnorr 25-iteration stress", "section": "performance", "result": "PASS", "time_ms": 131.178937, "error_code": 0 }
+    { "id": "selftest_core", "name": "OpenCL Selftest (23+ kernel tests)", "section": "math_invariants", "result": "PASS", "time_ms": 137.571479, "error_code": 0 },
+    { "id": "field_add_sub", "name": "Field add/sub roundtrip", "section": "math_invariants", "result": "PASS", "time_ms": 0.335681, "error_code": 0 },
+    { "id": "field_mul_comm", "name": "Field mul commutativity", "section": "math_invariants", "result": "PASS", "time_ms": 0.215808, "error_code": 0 },
+    { "id": "field_inv", "name": "Field inverse roundtrip (a * a^-1 = 1)", "section": "math_invariants", "result": "PASS", "time_ms": 0.280040, "error_code": 0 },
+    { "id": "field_sqr", "name": "Field square == mul(a,a)", "section": "math_invariants", "result": "PASS", "time_ms": 0.129584, "error_code": 0 },
+    { "id": "field_negate", "name": "Field negate roundtrip (a + (-a) = 0)", "section": "math_invariants", "result": "PASS", "time_ms": 0.131630, "error_code": 0 },
+    { "id": "gen_mul_vec", "name": "Generator mul known vectors", "section": "math_invariants", "result": "PASS", "time_ms": 0.062463, "error_code": 0 },
+    { "id": "scalar_roundtrip", "name": "Scalar/Point consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.121435, "error_code": 0 },
+    { "id": "add_dbl_consist", "name": "Point add vs double consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.219232, "error_code": 0 },
+    { "id": "scalar_mul_lin", "name": "Scalar mul linearity (a+b)*G = aG+bG", "section": "math_invariants", "result": "PASS", "time_ms": 0.330590, "error_code": 0 },
+    { "id": "group_order", "name": "Group order basic checks", "section": "math_invariants", "result": "PASS", "time_ms": 0.199699, "error_code": 0 },
+    { "id": "batch_inv", "name": "Batch inversion (Montgomery trick)", "section": "math_invariants", "result": "PASS", "time_ms": 0.353371, "error_code": 0 },
+    { "id": "ecdsa_roundtrip", "name": "ECDSA sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 7.218292, "error_code": 0 },
+    { "id": "schnorr_roundtrip", "name": "Schnorr/BIP-340 sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 5.841064, "error_code": 0 },
+    { "id": "ecdsa_wrong_key", "name": "ECDSA verify rejects wrong pubkey", "section": "signatures", "result": "PASS", "time_ms": 6.111775, "error_code": 0 },
+    { "id": "batch_smul", "name": "Batch scalar mul generator", "section": "batch_advanced", "result": "PASS", "time_ms": 0.093349, "error_code": 0 },
+    { "id": "batch_j2a", "name": "Batch Jacobian to Affine", "section": "batch_advanced", "result": "PASS", "time_ms": 0.083400, "error_code": 0 },
+    { "id": "diff_smul", "name": "OpenCL-host differential scalar mul", "section": "differential", "result": "PASS", "time_ms": 0.020089, "error_code": 0 },
+    { "id": "rfc6979_determ", "name": "RFC-6979 ECDSA deterministic nonce", "section": "standard_vectors", "result": "PASS", "time_ms": 6.313681, "error_code": 0 },
+    { "id": "bip340_vectors", "name": "BIP-340 Schnorr known-key roundtrip", "section": "standard_vectors", "result": "PASS", "time_ms": 7.330723, "error_code": 0 },
+    { "id": "ecdsa_multi_key", "name": "ECDSA multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 59.571898, "error_code": 0 },
+    { "id": "schnorr_multi_key", "name": "Schnorr multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 47.122783, "error_code": 0 },
+    { "id": "fuzz_edge_scalar", "name": "Edge-case scalars (0*G, 1*G, G+G=2G)", "section": "fuzzing", "result": "PASS", "time_ms": 0.078238, "error_code": 0 },
+    { "id": "fuzz_ecdsa_zero", "name": "ECDSA rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.042447, "error_code": 0 },
+    { "id": "fuzz_schnorr_zero", "name": "Schnorr rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.045454, "error_code": 0 },
+    { "id": "perf_ecdsa_50", "name": "ECDSA 50-iteration stress", "section": "performance", "result": "PASS", "time_ms": 282.603579, "error_code": 0 },
+    { "id": "perf_schnorr_25", "name": "Schnorr 25-iteration stress", "section": "performance", "result": "PASS", "time_ms": 111.053794, "error_code": 0 }
  ]
 }
--- a/ocl_audit_report.txt
+++ b/ocl_audit_report.txt
@ -2,60 +2,60 @@
  UltrafastSecp256k1 -- OpenCL Unified Audit Report
  Framework v2.0.0
  Linux x86-64 | GCC 14.2.0 | Release
-  Device: NVIDIA GeForce RTX 5060 Ti (NVIDIA Corporation) | 36 CUs | 15847 MB
+  Device: NVIDIA GeForce RTX 5060 Ti (NVIDIA Corporation) | 36 CUs | 15844 MB
 ================================================================


  Section: math_invariants
  --------------------------------------------------
-  [PASS]  OpenCL Selftest (23+ kernel tests)  (152.8 ms)
-  [PASS]  Field add/sub roundtrip  (0.307649 ms)
-  [PASS]  Field mul commutativity  (0.14015 ms)
-  [PASS]  Field inverse roundtrip (a * a^-1 = 1)  (0.266819 ms)
-  [PASS]  Field square == mul(a,a)  (0.120384 ms)
-  [PASS]  Field negate roundtrip (a + (-a) = 0)  (0.118151 ms)
-  [PASS]  Generator mul known vectors  (0.069495 ms)
-  [PASS]  Scalar/Point consistency  (0.196458 ms)
-  [PASS]  Point add vs double consistency  (0.226176 ms)
-  [PASS]  Scalar mul linearity (a+b)*G = aG+bG  (0.334149 ms)
-  [PASS]  Group order basic checks  (0.224126 ms)
-  [PASS]  Batch inversion (Montgomery trick)  (0.639383 ms)
+  [PASS]  OpenCL Selftest (23+ kernel tests)  (137.571 ms)
+  [PASS]  Field add/sub roundtrip  (0.335681 ms)
+  [PASS]  Field mul commutativity  (0.215808 ms)
+  [PASS]  Field inverse roundtrip (a * a^-1 = 1)  (0.28004 ms)
+  [PASS]  Field square == mul(a,a)  (0.129584 ms)
+  [PASS]  Field negate roundtrip (a + (-a) = 0)  (0.13163 ms)
+  [PASS]  Generator mul known vectors  (0.062463 ms)
+  [PASS]  Scalar/Point consistency  (0.121435 ms)
+  [PASS]  Point add vs double consistency  (0.219232 ms)
+  [PASS]  Scalar mul linearity (a+b)*G = aG+bG  (0.33059 ms)
+  [PASS]  Group order basic checks  (0.199699 ms)
+  [PASS]  Batch inversion (Montgomery trick)  (0.353371 ms)

  Section: signatures
  --------------------------------------------------
-  [PASS]  ECDSA sign + verify roundtrip  (7.34633 ms)
-  [PASS]  Schnorr/BIP-340 sign + verify roundtrip  (6.64727 ms)
-  [PASS]  ECDSA verify rejects wrong pubkey  (6.2142 ms)
+  [PASS]  ECDSA sign + verify roundtrip  (7.21829 ms)
+  [PASS]  Schnorr/BIP-340 sign + verify roundtrip  (5.84106 ms)
+  [PASS]  ECDSA verify rejects wrong pubkey  (6.11177 ms)

  Section: batch_advanced
  --------------------------------------------------
-  [PASS]  Batch scalar mul generator  (0.435053 ms)
-  [PASS]  Batch Jacobian to Affine  (0.518009 ms)
+  [PASS]  Batch scalar mul generator  (0.093349 ms)
+  [PASS]  Batch Jacobian to Affine  (0.0834 ms)

  Section: differential
  --------------------------------------------------
-  [PASS]  OpenCL-host differential scalar mul  (0.070056 ms)
+  [PASS]  OpenCL-host differential scalar mul  (0.020089 ms)

  Section: standard_vectors
  --------------------------------------------------
-  [PASS]  RFC-6979 ECDSA deterministic nonce  (6.42208 ms)
-  [PASS]  BIP-340 Schnorr known-key roundtrip  (8.87253 ms)
+  [PASS]  RFC-6979 ECDSA deterministic nonce  (6.31368 ms)
+  [PASS]  BIP-340 Schnorr known-key roundtrip  (7.33072 ms)

  Section: protocol_security
  --------------------------------------------------
-  [PASS]  ECDSA multi-key (10 keys) sign+verify  (60.9084 ms)
-  [PASS]  Schnorr multi-key (10 keys) sign+verify  (55.2862 ms)
+  [PASS]  ECDSA multi-key (10 keys) sign+verify  (59.5719 ms)
+  [PASS]  Schnorr multi-key (10 keys) sign+verify  (47.1228 ms)

  Section: fuzzing
  --------------------------------------------------
-  [PASS]  Edge-case scalars (0*G, 1*G, G+G=2G)  (0.229781 ms)
-  [PASS]  ECDSA rejects zero private key  (0.077824 ms)
-  [PASS]  Schnorr rejects zero private key  (0.082019 ms)
+  [PASS]  Edge-case scalars (0*G, 1*G, G+G=2G)  (0.078238 ms)
+  [PASS]  ECDSA rejects zero private key  (0.042447 ms)
+  [PASS]  Schnorr rejects zero private key  (0.045454 ms)

  Section: performance
  --------------------------------------------------
-  [PASS]  ECDSA 50-iteration stress  (287.678 ms)
-  [PASS]  Schnorr 25-iteration stress  (131.179 ms)
+  [PASS]  ECDSA 50-iteration stress  (282.604 ms)
+  [PASS]  Schnorr 25-iteration stress  (111.054 ms)

 ================================================================
  VERDICT: AUDIT-READY
--- a/opencl/CMakeLists.txt
+++ b/opencl/CMakeLists.txt
@ -145,12 +145,20 @@ set(KERNEL_FILE_1 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_field.cl")
 set(KERNEL_FILE_2 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_point.cl")
 set(KERNEL_FILE_3 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_batch.cl")
 set(KERNEL_FILE_4 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_affine.cl")
+# Extended, HASH160, ECDH and BIP-352 kernel sources; each one is appended to
+# KERNEL_FILES_LIST further down in this file.
+set(KERNEL_FILE_5 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_extended.cl")
+set(KERNEL_FILE_6 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_hash160.cl")
+set(KERNEL_FILE_7 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_ecdh.cl")
+set(KERNEL_FILE_8 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_bip352.cl")

 set(KERNEL_FILES_LIST
    ${KERNEL_FILE_1}
    ${KERNEL_FILE_2}
    ${KERNEL_FILE_3}
    ${KERNEL_FILE_4}
+    ${KERNEL_FILE_5}
+    ${KERNEL_FILE_6}
+    ${KERNEL_FILE_7}
+    ${KERNEL_FILE_8}
 )

 set(KERNEL_HEADER "${CMAKE_CURRENT_BINARY_DIR}/include/secp256k1_kernels_embedded.hpp")
@ -215,6 +223,20 @@ else()
    )
 endif()

+# BIP-352 pipeline benchmark. The executable reads the .cl sources from disk at
+# run time, so the kernel directory is handed to it as a compile definition.
+add_executable(opencl_bip352_benchmark benchmarks/bench_bip352_opencl.cpp)
+
+target_compile_definitions(opencl_bip352_benchmark
+    PRIVATE
+        SECP256K1_OPENCL_KERNEL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/kernels"
+)
+
+target_link_libraries(opencl_bip352_benchmark
+    PRIVATE
+        secp256k1_opencl
+        $<TARGET_NAME_IF_EXISTS:fastsecp256k1>
+        ${FASTSECP256K1_LIB}
+)
+
+
 # =============================================================================
 # Test Executable
 # =============================================================================
@ -290,4 +312,3 @@ install(DIRECTORY kernels/
    DESTINATION share/secp256k1/opencl
    FILES_MATCHING PATTERN "*.cl"
 )
-
--- a/opencl/benchmarks/bench_bip352_opencl.cpp
+++ b/opencl/benchmarks/bench_bip352_opencl.cpp
@ -0,0 +1,641 @@
+#include "secp256k1_opencl.hpp"
+#include "secp256k1/batch_add_affine.hpp"
+#include "secp256k1/fast.hpp"
+#include "secp256k1/glv.hpp"
+#include "secp256k1/tagged_hash.hpp"
+
+#define CL_TARGET_OPENCL_VERSION 120
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <fstream>
+#include <functional>
+#include <initializer_list>
+#include <iostream>
+#include <memory>
+#include <optional>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+using CpuPoint = secp256k1::fast::Point;
+using CpuScalar = secp256k1::fast::Scalar;
+using CpuField = secp256k1::fast::FieldElement;
+using OclAffine = secp256k1::opencl::AffinePoint;
+using OclField = secp256k1::opencl::FieldElement;
+using OclScalar = secp256k1::opencl::Scalar;
+
+namespace {
+
+constexpr int BENCH_N = 10000;
+constexpr int BENCH_WARMUP = 3;
+constexpr int BENCH_PASSES = 11;
+// RTX 5060 Ti (and most NVIDIA): warp=32, SM occupancy peaks at 128-256 threads.
+// Previous defaults (64/32) left SMs underutilized.
+constexpr int DEFAULT_LOCAL_SIZE_FUSED = 128;
+constexpr int DEFAULT_LOCAL_SIZE_LUT   = 128;
+constexpr std::size_t LUT_WINDOWS = 16;
+constexpr std::size_t LUT_ENTRIES = 65536;
+
+constexpr uint8_t SCAN_KEY[32] = {
+    0xc4,0x23,0x9f,0xd6,0xfc,0x3d,0xb6,0xe2,
+    0x2b,0x8b,0xed,0x6a,0x49,0x21,0x9e,0x4e,
+    0x30,0xd7,0xd6,0xa3,0xb9,0x82,0x94,0xb1,
+    0x38,0xaf,0x4a,0xd3,0x00,0xda,0x1a,0x42
+};
+
+constexpr uint8_t SPEND_PUBKEY_COMPRESSED[33] = {
+    0x02,
+    0xe2,0xed,0x4b,0x9c,0xe9,0x14,0x5e,0x17,
+    0x21,0xf1,0x1f,0x99,0x5f,0x72,0x6e,0xf8,
+    0xcf,0x50,0xfc,0x85,0x92,0x89,0xac,0x94,
+    0x4b,0x2d,0xaf,0xe5,0x03,0xa3,0xc7,0x4c
+};
+
+// Host mirror of the BIP352ScanKeyGlv typedef in secp256k1_bip352.cl.
+// The struct is copied into an OpenCL buffer as raw bytes, so field order,
+// sizes and offsets must match the kernel-side definition exactly.
+struct BIP352ScanKeyGlv {
+    std::int8_t  wnaf1[130]{};  // +0:   wNAF digits for k1 half-scalar
+    std::int8_t  wnaf2[130]{};  // +130: wNAF digits for k2 half-scalar
+    std::uint8_t k1_neg{0};     // +260: 1 if k1 negative (negate base.y)
+    std::uint8_t flip_phi{0};   // +261: 1 if phi table y should be negated
+    std::uint8_t pad0{0};       // +262: padding
+    std::uint8_t pad1{0};       // +263: padding
+}; // Total: 264 bytes
+
+// Every member is one byte wide, so no implicit padding is expected; fail the
+// build rather than upload a mis-sized struct if the layout ever changes.
+static_assert(sizeof(BIP352ScanKeyGlv) == 264,
+              "BIP352ScanKeyGlv must stay 264 bytes to match secp256k1_bip352.cl");
+
+// Width-5 wNAF recoding of a scalar, always emitting exactly 130 digits.
+// scalar_bytes: 32-byte big-endian scalar (for GLV halves the upper 128 bits
+//               are expected to be zero).
+// wnaf:         receives 130 digits, least-significant first; each digit is
+//               either 0 or an odd value in [-15, 15].
+// Intended to mirror the fixed-130-iteration scalar_to_wnaf on the GPU side.
+static void host_compute_wnaf(const std::uint8_t* scalar_bytes, std::int8_t wnaf[130]) {
+    // Repack the big-endian bytes as four 64-bit limbs, limbs[0] least significant.
+    std::uint64_t limbs[4] = {};
+    for (int word = 0; word < 4; ++word) {
+        std::uint64_t acc = 0;
+        for (int byte = 0; byte < 8; ++byte) {
+            acc = (acc << 8) | scalar_bytes[word * 8 + byte];
+        }
+        limbs[3 - word] = acc;
+    }
+
+    for (int pos = 0; pos < 130; ++pos) {
+        std::int8_t digit = 0;
+        if ((limbs[0] & 1ULL) != 0) {
+            // Take the low 5 bits as a signed odd window and subtract it from
+            // the scalar so that the next four bits become zero.
+            int window = static_cast<int>(limbs[0] & 0x1FULL);
+            const std::uint64_t before = limbs[0];
+            if (window < 16) {
+                limbs[0] = before - static_cast<std::uint64_t>(window);
+                if (limbs[0] > before) {
+                    // Borrow: decrement upward until a limb was non-zero.
+                    for (int j = 1; j < 4; ++j) {
+                        const bool had_bits = limbs[j] != 0;
+                        --limbs[j];
+                        if (had_bits) break;
+                    }
+                }
+            } else {
+                window -= 32;  // negative digit; the scalar grows by -window
+                limbs[0] = before + static_cast<std::uint64_t>(-window);
+                if (limbs[0] < before) {
+                    // Carry: increment upward until a limb does not wrap to zero.
+                    for (int j = 1; j < 4; ++j) {
+                        ++limbs[j];
+                        if (limbs[j] != 0) break;
+                    }
+                }
+            }
+            digit = static_cast<std::int8_t>(window);
+        }
+        wnaf[pos] = digit;
+
+        // Shift the 256-bit value right by one bit.
+        for (int j = 0; j < 3; ++j) {
+            limbs[j] = (limbs[j] >> 1) | (limbs[j + 1] << 63);
+        }
+        limbs[3] >>= 1;
+    }
+}
+
+// SHA-256 round constants.
+static const uint32_t host_sha256_k[64] = {
+    0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
+    0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
+    0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
+    0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
+    0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
+    0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
+    0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
+    0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+};
+
+// Rotate a 32-bit word right by b bits. b must be in [1, 31]; every call in
+// this file passes a literal in that range.
+inline uint32_t rotr32(uint32_t a, uint32_t b) {
+    return (a >> b) | (a << (32 - b));
+}
+
+// One-shot SHA-256 of msg[0..len) written to out[0..32).
+// msg may be null when len is 0. The whole padded message is staged in a
+// temporary buffer, which is fine for the short inputs hashed here.
+void host_sha256(const uint8_t* msg, size_t len, uint8_t out[32]) {
+    uint32_t h0=0x6a09e667, h1=0xbb67ae85, h2=0x3c6ef372, h3=0xa54ff53a;
+    uint32_t h4=0x510e527f, h5=0x9b05688c, h6=0x1f83d9ab, h7=0x5be0cd19;
+
+    // The length field is 64 bits wide regardless of size_t; computing it in
+    // uint64_t keeps the shifts below (up to 56) defined on 32-bit targets.
+    const uint64_t bit_len = static_cast<uint64_t>(len) * 8;
+    // Round len + 1 (0x80 marker) + 8 (length field) up to a whole 64-byte block.
+    size_t padded = ((len + 9 + 63) / 64) * 64;
+    std::vector<uint8_t> buf(padded, 0);
+    if (len != 0) std::memcpy(buf.data(), msg, len);
+    buf[len] = 0x80;
+    // Big-endian bit length in the last 8 bytes.
+    for (int i = 7; i >= 0; --i) buf[padded - 1 - i] = static_cast<uint8_t>(bit_len >> (i * 8));
+
+    for (size_t off = 0; off < padded; off += 64) {
+        // Message schedule: 16 big-endian words, expanded to 64.
+        uint32_t w[64];
+        for (int i = 0; i < 16; i++) {
+            w[i] = (static_cast<uint32_t>(buf[off+i*4]) << 24) |
+                   (static_cast<uint32_t>(buf[off+i*4+1]) << 16) |
+                   (static_cast<uint32_t>(buf[off+i*4+2]) << 8) |
+                   buf[off+i*4+3];
+        }
+        for (int i = 16; i < 64; i++) {
+            uint32_t s0 = rotr32(w[i-15], 7) ^ rotr32(w[i-15], 18) ^ (w[i-15] >> 3);
+            uint32_t s1 = rotr32(w[i-2], 17) ^ rotr32(w[i-2], 19) ^ (w[i-2] >> 10);
+            w[i] = w[i-16] + s0 + w[i-7] + s1;
+        }
+        // Compression rounds.
+        uint32_t a=h0,b=h1,c=h2,d=h3,e=h4,f=h5,g=h6,hh=h7;
+        for (int i = 0; i < 64; i++) {
+            uint32_t S1 = rotr32(e,6)^rotr32(e,11)^rotr32(e,25);
+            uint32_t ch = (e&f)^(~e&g);
+            uint32_t t1 = hh+S1+ch+host_sha256_k[i]+w[i];
+            uint32_t S0 = rotr32(a,2)^rotr32(a,13)^rotr32(a,22);
+            uint32_t maj = (a&b)^(a&c)^(b&c);
+            uint32_t t2 = S0+maj;
+            hh=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
+        }
+        h0+=a; h1+=b; h2+=c; h3+=d; h4+=e; h5+=f; h6+=g; h7+=hh;
+    }
+
+    // Serialise the eight state words big-endian.
+    auto store = [&](uint32_t v, int i) {
+        out[i*4] = static_cast<uint8_t>(v >> 24);
+        out[i*4+1] = static_cast<uint8_t>(v >> 16);
+        out[i*4+2] = static_cast<uint8_t>(v >> 8);
+        out[i*4+3] = static_cast<uint8_t>(v);
+    };
+    store(h0,0); store(h1,1); store(h2,2); store(h3,3);
+    store(h4,4); store(h5,5); store(h6,6); store(h7,7);
+}
+
+// Decode a 33-byte compressed public key (prefix 0x02/0x03 followed by a
+// 32-byte x coordinate) into a CpuPoint.
+// Returns CpuPoint::infinity() when the prefix is not 0x02/0x03, when
+// parse_bytes_strict rejects x, or when the computed root does not square back
+// to x^3 + 7.
+CpuPoint point_from_compressed(const uint8_t* pub33) {
+    if (pub33[0] != 0x02 && pub33[0] != 0x03) return CpuPoint::infinity();
+    CpuField x;
+    if (!CpuField::parse_bytes_strict(pub33 + 1, x)) return CpuPoint::infinity();
+    // Right-hand side of the curve equation: y^2 = x^3 + 7.
+    auto x2 = x * x;
+    auto x3 = x2 * x;
+    auto y2 = x3 + CpuField::from_uint64(7);
+    // Fixed square-and-multiply chain that raises y2 to a constant exponent to
+    // obtain a square-root candidate.
+    // NOTE(review): the chain looks like the usual (p+1)/4 exponentiation for
+    // this field -- confirm against the library's own sqrt before relying on it.
+    auto t = y2;
+    auto a = t.square() * t;
+    auto b = a.square() * t;
+    auto c = b.square().square().square() * b;
+    auto d = c.square().square().square() * b;
+    auto e = d.square().square() * a;
+    auto f = e;
+    for (int i = 0; i < 11; ++i) f = f.square();
+    f = f * e;
+    auto g = f;
+    for (int i = 0; i < 22; ++i) g = g.square();
+    g = g * f;
+    auto h = g;
+    for (int i = 0; i < 44; ++i) h = h.square();
+    h = h * g;
+    auto j = h;
+    for (int i = 0; i < 88; ++i) j = j.square();
+    j = j * h;
+    auto k = j;
+    for (int i = 0; i < 44; ++i) k = k.square();
+    k = k * g;
+    auto m = k.square().square().square() * b;
+    auto y = m;
+    for (int i = 0; i < 23; ++i) y = y.square();
+    y = y * f;
+    for (int i = 0; i < 6; ++i) y = y.square();
+    y = y * a;
+    y = y.square().square();
+    // The candidate is only a real root if it squares back to y2; otherwise x
+    // is not the abscissa of a curve point.
+    if (!(y * y == y2)) return CpuPoint::infinity();
+    // Pick the root whose parity matches the prefix (0x03 = odd, 0x02 = even).
+    // Parity is read from the last serialized byte -- assumes to_bytes() is
+    // big-endian; TODO confirm.
+    auto y_bytes = y.to_bytes();
+    bool y_is_odd = (y_bytes[31] & 1) != 0;
+    bool want_odd = (pub33[0] == 0x03);
+    if (y_is_odd != want_odd) y = CpuField::from_uint64(0) - y;
+    return CpuPoint::from_affine(x, y);
+}
+
+// Convert 32 big-endian bytes into an OclField: each group of 8 bytes becomes
+// one 64-bit limb, with limbs[0] taken from the last 8 bytes.
+OclField bytes_to_ocl_field(const uint8_t* bytes32) {
+    OclField field{};
+    const uint8_t* cursor = bytes32;
+    for (int limb = 3; limb >= 0; --limb) {
+        uint64_t word = 0;
+        for (int n = 0; n < 8; ++n) {
+            word = (word << 8) | *cursor++;
+        }
+        field.limbs[limb] = word;
+    }
+    return field;
+}
+
+
+// Pack the x and y coordinates of a CPU point into the OpenCL affine layout.
+OclAffine to_ocl_affine(const CpuPoint& p) {
+    OclAffine affine{};
+    // The serialized temporaries live until the end of each full expression,
+    // which is long enough for bytes_to_ocl_field to read them.
+    affine.x = bytes_to_ocl_field(p.x().to_bytes().data());
+    affine.y = bytes_to_ocl_field(p.y().to_bytes().data());
+    return affine;
+}
+
+// Same conversion for the compact affine representation, whose coordinates
+// are plain members rather than accessors.
+OclAffine to_ocl_affine(const secp256k1::fast::AffinePointCompact& p) {
+    OclAffine affine{};
+    affine.x = bytes_to_ocl_field(p.x.to_bytes().data());
+    affine.y = bytes_to_ocl_field(p.y.to_bytes().data());
+    return affine;
+}
+
+// Read the first 8 bytes of x_bytes as one big-endian 64-bit integer.
+uint64_t extract_upper_64(const uint8_t* x_bytes) {
+    uint64_t prefix = 0;
+    for (const uint8_t* p = x_bytes; p != x_bytes + 8; ++p) {
+        prefix = (prefix << 8) | *p;
+    }
+    return prefix;
+}
+
+// Return the full contents of the file at `path`, read in binary mode.
+// Throws std::runtime_error("failed to open: <path>") if it cannot be opened.
+std::string read_text(const std::string& path) {
+    std::ifstream file(path, std::ios::binary);
+    if (file.fail()) {
+        throw std::runtime_error("failed to open: " + path);
+    }
+    std::ostringstream contents;
+    contents << file.rdbuf();
+    return contents.str();
+}
+
+// Directory part of `path`: everything before the last '/' or '\\'.
+// A path without any separator yields ".".
+std::string dirname_of(const std::string& path) {
+    const std::string::size_type sep = path.find_last_of("/\\");
+    if (sep == std::string::npos) {
+        return ".";
+    }
+    return path.substr(0, sep);
+}
+
+// Strip leading spaces and tabs (no other whitespace) from `s`.
+std::string trim_left(std::string s) {
+    // find_first_not_of returns npos for an all-blank string, and erase(0, npos)
+    // then clears it, matching the character-by-character behaviour.
+    s.erase(0, s.find_first_not_of(" \t"));
+    return s;
+}
+
+// Return the text of the kernel file at `path` with every line of the form
+// `#include "name"` (leading spaces/tabs allowed) replaced by the recursively
+// expanded contents of `name`, resolved relative to the including file.
+// include_stack holds the files currently being expanded; a file that is
+// already on the stack contributes nothing, which breaks include cycles.
+std::string expand_kernel_file(const std::string& path, std::set<std::string>& include_stack) {
+    // insert() reports whether the path was new to the set.
+    if (!include_stack.insert(path).second) return {};
+
+    std::istringstream lines(read_text(path));
+    const std::string base_dir = dirname_of(path);
+    std::string expanded;
+    for (std::string raw; std::getline(lines, raw); ) {
+        const std::string stripped = trim_left(raw);
+        if (stripped.compare(0, 10, "#include \"") != 0) {
+            // Ordinary line: copy through with a normalised '\n' terminator.
+            expanded += raw;
+            expanded += '\n';
+            continue;
+        }
+        const auto name_begin = stripped.find('"') + 1;
+        const auto name_end = stripped.find('"', name_begin);
+        const std::string child = base_dir + "/" + stripped.substr(name_begin, name_end - name_begin);
+        expanded += expand_kernel_file(child, include_stack);
+    }
+    // Done with this file: siblings may include it again.
+    include_stack.erase(path);
+    return expanded;
+}
+
+// Load secp256k1_bip352.cl from the build-time kernel directory with all of
+// its quoted includes expanded inline.
+std::string load_bip352_kernel_source() {
+    const std::string entry = std::string(SECP256K1_OPENCL_KERNEL_DIR) + "/secp256k1_bip352.cl";
+    std::set<std::string> active_includes;
+    return expand_kernel_file(entry, active_includes);
+}
+
+// Build the generator lookup table on the host: LUT_WINDOWS windows of
+// LUT_ENTRIES affine points each. Slot 0 of every window is a default
+// OclAffine; slots 1.. hold the precomputed multiples of that window's base
+// point. The base starts at the generator and is doubled 16 times between
+// windows.
+std::vector<OclAffine> build_generator_lut_host() {
+    std::vector<OclAffine> lut(LUT_WINDOWS * LUT_ENTRIES);
+    CpuPoint window_base = CpuPoint::generator();
+
+    std::size_t window = 0;
+    while (window < LUT_WINDOWS) {
+        std::cout << "  Building LUT window " << window + 1 << "/" << LUT_WINDOWS << "...\n";
+        auto bx = window_base.x();
+        auto by = window_base.y();
+        // Window 0 uses the dedicated generator routine; later windows use the
+        // generic one on the shifted base.
+        auto multiples = (window == 0)
+            ? secp256k1::fast::precompute_g_multiples(LUT_ENTRIES - 1)
+            : secp256k1::fast::precompute_point_multiples(bx, by, LUT_ENTRIES - 1);
+
+        const std::size_t first_slot = window * LUT_ENTRIES;
+        lut[first_slot] = OclAffine{};
+        for (std::size_t idx = 0; idx < multiples.size(); ++idx) {
+            lut[first_slot + idx + 1] = to_ocl_affine(multiples[idx]);
+        }
+
+        for (int doubling = 0; doubling < 16; ++doubling) {
+            window_base.dbl_inplace();
+        }
+        ++window;
+    }
+
+    return lut;
+}
+
+// Precompute the scan-key data sent to the kernel: GLV-decompose SCAN_KEY,
+// record the sign flags, and recode both half-scalars as wNAF digits.
+BIP352ScanKeyGlv build_scan_glv_plan() {
+    auto scalar = CpuScalar::from_bytes(SCAN_KEY);
+    auto split = secp256k1::fast::glv_decompose(scalar);
+    auto k1_bytes = split.k1.to_bytes();
+    auto k2_bytes = split.k2.to_bytes();
+
+    BIP352ScanKeyGlv plan{};
+    if (split.k1_neg) plan.k1_neg = 1;
+    // flip_phi is set exactly when the two halves have opposite signs.
+    if (split.k1_neg != split.k2_neg) plan.flip_phi = 1;
+    host_compute_wnaf(k1_bytes.data(), plan.wnaf1);
+    host_compute_wnaf(k2_bytes.data(), plan.wnaf2);
+    return plan;
+}
+
+// Median of `samples` after discarding outliers outside
+// [q1 - 1.5*IQR, q3 + 1.5*IQR]. Returns 0.0 for an empty input; with fewer
+// than four samples no filtering is done and the plain middle element of the
+// sorted data is returned. "Median" is always the element at index size/2.
+double median_iqr(std::vector<double> samples) {
+    if (samples.empty()) return 0.0;
+
+    std::sort(samples.begin(), samples.end());
+    const int n = static_cast<int>(samples.size());
+    if (n < 4) return samples[n / 2];
+
+    const double q1 = samples[n / 4];
+    const double q3 = samples[(3 * n) / 4];
+    const double iqr = q3 - q1;
+    const double lo = q1 - 1.5 * iqr;
+    const double hi = q3 + 1.5 * iqr;
+
+    std::vector<double> kept;
+    kept.reserve(samples.size());
+    for (std::size_t idx = 0; idx < samples.size(); ++idx) {
+        const double v = samples[idx];
+        if (!(v >= lo && v <= hi)) continue;
+        kept.push_back(v);
+    }
+
+    // Fall back to the unfiltered data if the fences rejected everything.
+    const std::vector<double>& pool = kept.empty() ? samples : kept;
+    return pool[pool.size() / 2];
+}
+
+void check_cl(cl_int err, const char* what) {
+    if (err != CL_SUCCESS) {
+        throw std::runtime_error(std::string(what) + " failed with OpenCL error " + std::to_string(err));
+    }
+}
+
+// Autotune the OpenCL local work size by timing the kernel at each candidate.
+// Mirrors CUDA's autotune_gpu_tpb.
+//
+// label:       name printed in the progress output.
+// cl_q/kernel: queue and kernel to launch; kernel arguments must already be set.
+// count:       number of work items; the global size is count rounded up to a
+//              multiple of the candidate local size.
+// max_wg_size: candidates above this (or <= 0) are ignored.
+// candidates:  local sizes to try.
+//
+// Returns the candidate with the lowest ns/op, or DEFAULT_LOCAL_SIZE_FUSED if
+// no candidate could be measured. A candidate is discarded when either an
+// enqueue or the following clFinish reports an error, so a size that only
+// fails at execution time can no longer win with a bogus near-zero timing.
+static int autotune_local_size(
+    const char* label,
+    cl_command_queue cl_q,
+    cl_kernel kernel,
+    size_t count,
+    size_t max_wg_size,
+    std::initializer_list<int> candidates)
+{
+    constexpr int WARMUP_LAUNCHES = 2;
+    constexpr int SAMPLE_PASSES = 5;
+    constexpr int SAMPLE_REPS   = 10;
+
+    std::printf("Autotuning %s local size...\n", label);
+    int best = 0;
+    double best_ns = 0.0;
+
+    for (int ls : candidates) {
+        if (ls <= 0 || static_cast<size_t>(ls) > max_wg_size) continue;
+
+        const size_t local  = static_cast<size_t>(ls);
+        const size_t global = ((count + local - 1) / local) * local;
+
+        // Enqueue `launches` kernel runs and wait for them. The queue is
+        // always drained, even after a failed enqueue, so leftover work from
+        // this candidate cannot leak into the next candidate's timing.
+        auto run_batch = [&](int launches) -> bool {
+            bool ok = true;
+            for (int r = 0; r < launches && ok; ++r) {
+                ok = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local,
+                                            0, nullptr, nullptr) == CL_SUCCESS;
+            }
+            if (clFinish(cl_q) != CL_SUCCESS) ok = false;
+            return ok;
+        };
+
+        bool usable = run_batch(WARMUP_LAUNCHES);
+
+        std::vector<double> samples;
+        samples.reserve(SAMPLE_PASSES);
+        for (int p = 0; usable && p < SAMPLE_PASSES; ++p) {
+            auto t0 = std::chrono::high_resolution_clock::now();
+            usable = run_batch(SAMPLE_REPS);
+            auto t1 = std::chrono::high_resolution_clock::now();
+            if (!usable) break;
+            double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
+            samples.push_back((ms * 1e6) / (static_cast<double>(count) * SAMPLE_REPS));
+        }
+
+        if (!usable) {
+            std::printf("  local=%3d -> skipped (launch failed)\n", ls);
+            continue;
+        }
+
+        double ns = median_iqr(samples);
+        std::printf("  local=%3d -> %8.1f ns/op\n", ls, ns);
+        if (best == 0 || ns < best_ns) { best = ls; best_ns = ns; }
+    }
+
+    if (best == 0) best = DEFAULT_LOCAL_SIZE_FUSED;
+    std::printf("  selected local=%d for %s\n\n", best, label);
+    return best;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+    bool prefer_intel = false;
+    bool use_lut = false;
+    int platform_id = -1;
+    int device_id = 0;
+    int batch_n = BENCH_N;
+    int local_size = 0;
+
+    for (int i = 1; i < argc; ++i) {
+        std::string arg = argv[i];
+        if (arg == "--intel") prefer_intel = true;
+        else if (arg == "--nvidia") prefer_intel = false;
+        else if (arg == "--lut") use_lut = true;
+        else if (arg == "--platform" && i + 1 < argc) platform_id = std::atoi(argv[++i]);
+        else if (arg == "--device" && i + 1 < argc) device_id = std::atoi(argv[++i]);
+        else if (arg == "--batch" && i + 1 < argc) batch_n = std::atoi(argv[++i]);
+        else if (arg == "--local" && i + 1 < argc) local_size = std::atoi(argv[++i]);
+    }
+    if (local_size == 0) {
+        local_size = use_lut ? DEFAULT_LOCAL_SIZE_LUT : DEFAULT_LOCAL_SIZE_FUSED;
+    }
+
+    secp256k1::opencl::DeviceConfig cfg;
+    cfg.prefer_intel = prefer_intel;
+    cfg.verbose = true;
+    cfg.platform_id = platform_id;
+    cfg.device_id = device_id;
+    auto ctx = secp256k1::opencl::Context::create(cfg);
+    if (!ctx || !ctx->is_valid()) {
+        std::cerr << "Failed to create OpenCL context\n";
+        return 1;
+    }
+
+    cl_context cl_ctx = static_cast<cl_context>(ctx->native_context());
+    cl_command_queue cl_q = static_cast<cl_command_queue>(ctx->native_queue());
+    cl_device_id cl_dev = nullptr;
+    check_cl(clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr),
+             "clGetCommandQueueInfo(CL_QUEUE_DEVICE)");
+
+    std::cout << "============================================================\n";
+    std::cout << "  BIP-352 Silent Payments Pipeline: CPU vs OpenCL\n";
+    std::cout << "============================================================\n";
+    std::cout << "  Device: " << ctx->device_info().name << " (" << ctx->device_info().vendor << ")\n";
+    std::cout << "  N = " << batch_n << " tweak points, " << BENCH_PASSES << " passes (median)\n\n";
+    std::cout << "  Local size = " << local_size << "\n\n";
+
+    std::cout << "Generating " << batch_n << " deterministic tweak points...\n";
+    std::vector<OclAffine> tweaks(static_cast<size_t>(batch_n));
+    CpuPoint last_tweak = CpuPoint::infinity();
+    uint8_t seed[32];
+    const char* tag = "bench_bip352_seed";
+    host_sha256(reinterpret_cast<const uint8_t*>(tag), std::strlen(tag), seed);
+    for (int i = 0; i < batch_n; ++i) {
+        uint8_t buf[36];
+        std::memcpy(buf, seed, 32);
+        buf[32] = static_cast<uint8_t>((i >> 24) & 0xff);
+        buf[33] = static_cast<uint8_t>((i >> 16) & 0xff);
+        buf[34] = static_cast<uint8_t>((i >> 8) & 0xff);
+        buf[35] = static_cast<uint8_t>(i & 0xff);
+        uint8_t scalar_bytes[32];
+        host_sha256(buf, 36, scalar_bytes);
+        CpuScalar s = CpuScalar::from_bytes(scalar_bytes);
+        CpuPoint p = CpuPoint::generator().scalar_mul(s);
+        if (i == batch_n - 1) last_tweak = p;
+        tweaks[static_cast<size_t>(i)] = to_ocl_affine(p);
+    }
+    std::cout << "Done.\n";
+
+    CpuPoint spend_cpu = point_from_compressed(SPEND_PUBKEY_COMPRESSED);
+    if (spend_cpu.is_infinity()) {
+        std::cerr << "Failed to decode spend pubkey\n";
+        return 1;
+    }
+    OclAffine spend = to_ocl_affine(spend_cpu);
+
+    std::cout << "Building OpenCL BIP352 pipeline kernel...\n";
+    std::string source = load_bip352_kernel_source();
+    const char* src_ptr = source.c_str();
+    size_t src_len = source.size();
+    cl_int err = CL_SUCCESS;
+    cl_program program = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
+    check_cl(err, "clCreateProgramWithSource");
+    std::string build_options = "-cl-std=CL1.2 -cl-fast-relaxed-math -cl-mad-enable"
+        " -cl-nv-opt-level=3";
+    err = clBuildProgram(program, 1, &cl_dev, build_options.c_str(), nullptr, nullptr);
+    if (err != CL_SUCCESS) {
+        size_t log_size = 0;
+        clGetProgramBuildInfo(program, cl_dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_size);
+        std::string log(log_size, '\0');
+        clGetProgramBuildInfo(program, cl_dev, CL_PROGRAM_BUILD_LOG, log_size, log.data(), nullptr);
+        std::cerr << "Build failed:\n" << log << "\n";
+        return 1;
+    }
+    const char* kernel_name = use_lut ? "bip352_pipeline_kernel_lut" : "bip352_pipeline_kernel";
+    cl_kernel kernel = clCreateKernel(program, kernel_name, &err);
+    check_cl(err, kernel_name);
+    std::cout << "Done.\n";
+
+    size_t count = static_cast<size_t>(batch_n);
+    size_t tweak_bytes = count * sizeof(OclAffine);
+    std::vector<uint64_t> prefixes(count);
+    std::vector<OclAffine> gen_lut;
+    BIP352ScanKeyGlv scan_plan{};
+
+    // Both paths now use BIP352ScanKeyGlv with precomputed wNAF digits.
+    scan_plan = build_scan_glv_plan();
+    cl_mem d_tweaks = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, tweak_bytes, tweaks.data(), &err);
+    check_cl(err, "clCreateBuffer(d_tweaks)");
+    cl_mem d_scan = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                   sizeof(BIP352ScanKeyGlv), &scan_plan, &err);
+    check_cl(err, "clCreateBuffer(d_scan)");
+    cl_mem d_spend = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(OclAffine), &spend, &err);
+    check_cl(err, "clCreateBuffer(d_spend)");
+    cl_mem d_prefixes = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY, count * sizeof(uint64_t), nullptr, &err);
+    check_cl(err, "clCreateBuffer(d_prefixes)");
+    cl_mem d_gen_lut = nullptr;
+    if (use_lut) {
+        std::cout << "Building CPU generator LUT (" << (LUT_WINDOWS * LUT_ENTRIES) << " affine points)...\n";
+        gen_lut = build_generator_lut_host();
+        std::cout << "Uploading generator LUT to OpenCL...\n";
+        d_gen_lut = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                   gen_lut.size() * sizeof(OclAffine), gen_lut.data(), &err);
+        check_cl(err, "clCreateBuffer(d_gen_lut)");
+    }
+
+    cl_uint cl_count = static_cast<cl_uint>(count);
+    check_cl(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_tweaks), "clSetKernelArg(0)");
+    check_cl(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_scan), "clSetKernelArg(1)");
+    check_cl(clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_spend), "clSetKernelArg(2)");
+    if (use_lut) {
+        check_cl(clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_gen_lut), "clSetKernelArg(3)");
+        check_cl(clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_prefixes), "clSetKernelArg(4)");
+        check_cl(clSetKernelArg(kernel, 5, sizeof(cl_uint), &cl_count), "clSetKernelArg(5)");
+    } else {
+        check_cl(clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_prefixes), "clSetKernelArg(3)");
+        check_cl(clSetKernelArg(kernel, 4, sizeof(cl_uint), &cl_count), "clSetKernelArg(4)");
+    }
+
+    if (local_size <= 0) {
+        throw std::runtime_error("local size must be positive");
+    }
+    if (static_cast<std::size_t>(local_size) > ctx->device_info().max_work_group_size) {
+        throw std::runtime_error("local size exceeds device max work group size");
+    }
+
+    // Autotune: find optimal local size among candidates (mirrors CUDA autotune_gpu_tpb).
+    // Only autotune when no explicit --local was given (i.e., we're still at the default).
+    {
+        int default_ls = use_lut ? DEFAULT_LOCAL_SIZE_LUT : DEFAULT_LOCAL_SIZE_FUSED;
+        if (local_size == default_ls) {
+            int tuned = autotune_local_size(
+                use_lut ? "LUT kernel" : "fused kernel",
+                cl_q, kernel, count,
+                ctx->device_info().max_work_group_size,
+                {64, 128, 256, 384});
+            local_size = tuned;
+        }
+    }
+
+    size_t global = ((count + static_cast<size_t>(local_size) - 1) / static_cast<size_t>(local_size)) * static_cast<size_t>(local_size);
+    size_t local = static_cast<size_t>(local_size);
+    std::cout << "  Running with local_size=" << local_size << "\n";
+
+    for (int i = 0; i < BENCH_WARMUP; ++i) {
+        check_cl(clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr),
+                 "clEnqueueNDRangeKernel(warmup)");
+    }
+    check_cl(clFinish(cl_q), "clFinish(warmup)");
+
+    std::vector<double> samples;
+    samples.reserve(BENCH_PASSES);
+    std::cout << "\n--- OpenCL (" << (use_lut ? "fused pipeline + LUT" : "fused pipeline") << ") ---\n";
+    for (int pass = 0; pass < BENCH_PASSES; ++pass) {
+        auto t0 = std::chrono::high_resolution_clock::now();
+        check_cl(clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr),
+                 "clEnqueueNDRangeKernel");
+        check_cl(clFinish(cl_q), "clFinish");
+        auto t1 = std::chrono::high_resolution_clock::now();
+        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
+        samples.push_back((ms * 1e6) / static_cast<double>(count));
+        std::printf("  pass %2d: %8.3f ms\n", pass + 1, ms);
+    }
+    double ns_per_op = median_iqr(samples);
+    double ops_per_sec = 1e9 / ns_per_op;
+
+    check_cl(clEnqueueReadBuffer(cl_q, d_prefixes, CL_TRUE, 0, count * sizeof(uint64_t), prefixes.data(), 0, nullptr, nullptr),
+             "clEnqueueReadBuffer");
+
+    CpuScalar scan_scalar = CpuScalar::from_bytes(SCAN_KEY);
+    CpuPoint shared = last_tweak.scalar_mul(scan_scalar);
+    auto shared_comp = shared.to_compressed();
+    uint8_t shared_ser[37];
+    std::memcpy(shared_ser, shared_comp.data(), 33);
+    shared_ser[33] = shared_ser[34] = shared_ser[35] = shared_ser[36] = 0;
+    auto tagged = secp256k1::detail::cached_tagged_hash(
+        secp256k1::detail::make_tag_midstate("BIP0352/SharedSecret"),
+        shared_ser,
+        sizeof(shared_ser));
+    CpuScalar hs = CpuScalar::from_bytes(tagged.data());
+    CpuPoint out = CpuPoint::generator().scalar_mul(hs);
+    CpuPoint cand = spend_cpu;
+    cand.add_inplace(out);
+    uint64_t cpu_validation = extract_upper_64(cand.x_only_bytes().data());
+    uint64_t ocl_validation = prefixes.back();
+
+    std::printf("\n  OpenCL%s: %.1f ns/op (%.2f M/s)\n", use_lut ? " LUT" : "", ns_per_op, ops_per_sec / 1e6);
+    std::printf("  validation prefix: 0x%016llx\n", static_cast<unsigned long long>(ocl_validation));
+    // CUDA reference: bench_bip352 on RTX 5060 Ti (SM 12.0, 36 SMs, 384 tpb).
+    // GLV (no LUT): 260.4 ns/op (3.84 M/s).  LUT: 127.2 ns/op (7.86 M/s).
+    constexpr double CUDA_GLV_NS = 260.4;
+    constexpr double CUDA_LUT_NS = 127.2;
+    double cuda_ref = use_lut ? CUDA_LUT_NS : CUDA_GLV_NS;
+    std::printf("  CUDA reference:    %.1f ns/op (%.2f M/s) [%s]\n",
+                cuda_ref, 1e9 / cuda_ref / 1e6, use_lut ? "LUT" : "GLV");
+    std::printf("  gap vs CUDA:       %.2fx\n", ns_per_op / cuda_ref);
+    std::printf("  Validation: %s\n", cpu_validation == ocl_validation ? "[OK] MATCH" : "[FAIL] MISMATCH");
+
+    clReleaseMemObject(d_tweaks);
+    clReleaseMemObject(d_scan);
+    clReleaseMemObject(d_spend);
+    clReleaseMemObject(d_prefixes);
+    if (d_gen_lut) clReleaseMemObject(d_gen_lut);
+    clReleaseKernel(kernel);
+    clReleaseProgram(program);
+    return cpu_validation == ocl_validation ? 0 : 2;
+}
--- a/opencl/benchmarks/bench_opencl.cpp
+++ b/opencl/benchmarks/bench_opencl.cpp
@ -195,6 +195,7 @@ int main(int argc, char* argv[]) {
    std::vector<Scalar> point_scalars(point_batch);
    std::vector<JacobianPoint> pd_in(point_batch), pd_out(point_batch);
    std::vector<JacobianPoint> pa_in1(point_batch), pa_in2(point_batch), pa_out(point_batch);
+    std::vector<AffinePoint> sm_points(point_batch);

    for (std::size_t i = 0; i < point_batch; ++i) {
        point_scalars[i] = {{rng(), rng(), rng(), rng()}};
@ -206,6 +207,7 @@ int main(int argc, char* argv[]) {
    }
    ctx->batch_scalar_mul_generator(point_scalars.data(), pa_in2.data(), point_batch);
    pa_in1 = pd_in;
+    ctx->batch_jacobian_to_affine(pd_in.data(), sm_points.data(), point_batch);

    {
        auto r = bench_batch("Point Double", [&]() {
@ -244,6 +246,13 @@ int main(int argc, char* argv[]) {
        }, bs, 1, 3);
        print_result(r);
        results.push_back(r);
+
+        std::string kp_name = "kP (batch=" + std::to_string(bs) + ")";
+        auto kp = bench_batch(kp_name, [&]() {
+            ctx->batch_scalar_mul(sm_scalars.data(), sm_points.data(), sm_results.data(), bs);
+        }, bs, 1, 3);
+        print_result(kp);
+        results.push_back(kp);
    }

    // ==========================================================================
@ -598,7 +607,8 @@ int main(int argc, char* argv[]) {
        {
            std::size_t smk_batch = std::min(point_batch, static_cast<std::size_t>(65536));
            cl_uint smk_cnt = static_cast<cl_uint>(smk_batch);
-            std::size_t smk_global = ((smk_batch + p_local_sz - 1) / p_local_sz) * p_local_sz;
+            std::size_t smk_local_sz = std::min<std::size_t>(128, p_local_sz);
+            std::size_t smk_global = ((smk_batch + smk_local_sz - 1) / smk_local_sz) * smk_local_sz;

            // Use existing point_scalars for scalar data
            cl_mem buf_sc = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
@ -616,12 +626,12 @@ int main(int argc, char* argv[]) {
            int smk_iters = 5;

            for (int i = 0; i < smk_warmup; ++i)
-                clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &p_local_sz, 0, nullptr, nullptr);
+                clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
            clFinish(cl_q);

            auto t0 = std::chrono::high_resolution_clock::now();
            for (int i = 0; i < smk_iters; ++i)
-                clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &p_local_sz, 0, nullptr, nullptr);
+                clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
            clFinish(cl_q);
            auto t1 = std::chrono::high_resolution_clock::now();

@ -634,6 +644,95 @@ int main(int argc, char* argv[]) {
            clReleaseMemObject(buf_smr);
        }

+        // Scalar Mul Arbitrary Point (kernel-only) -- same batch cap as kG.
+        // Emits three per-op results for the "scalar_mul" kernel:
+        //   "kP (kernel)"   -- kernel launches only, inputs already on the device
+        //   "kP (upload)"   -- host->device writes of the point and scalar buffers
+        //   "kP (readback)" -- device->host reads of the Jacobian result buffer
+        // Each phase runs smk_warmup untimed rounds, then smk_iters timed rounds
+        // bracketed by clFinish().
+        {
+            std::size_t smk_batch = std::min(point_batch, static_cast<std::size_t>(65536));
+            cl_uint smk_cnt = static_cast<cl_uint>(smk_batch);
+            // Work-group size is capped at 128. A zero p_local_sz would make the
+            // round-up below divide by zero, so fall back to the cap in that case.
+            std::size_t smk_local_sz = (p_local_sz != 0)
+                                           ? std::min<std::size_t>(128, p_local_sz)
+                                           : static_cast<std::size_t>(128);
+            // Round the global size up to a whole number of work-groups.
+            std::size_t smk_global = ((smk_batch + smk_local_sz - 1) / smk_local_sz) * smk_local_sz;
+
+            // Affine inputs for this block only. Named kp_points so it does not
+            // shadow the sm_points vector declared alongside pd_in earlier in main().
+            std::vector<AffinePoint> kp_points(smk_batch);
+            ctx->batch_jacobian_to_affine(pd_in.data(), kp_points.data(), smk_batch);
+
+            // NOTE(review): err is overwritten by each call and never inspected,
+            // matching the neighbouring kernel-only blocks.
+            cl_mem buf_sc = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                            smk_batch * sizeof(Scalar), (void*)point_scalars.data(), &err);
+            cl_mem buf_pts = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                             smk_batch * sizeof(AffinePoint), (void*)kp_points.data(), &err);
+            cl_mem buf_smr = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY,
+                                             smk_batch * sizeof(JacobianPoint), nullptr, &err);
+            clFinish(cl_q);
+
+            // Kernel arguments: scalars, affine points, Jacobian results, element count.
+            cl_kernel kern = (cl_kernel)ctx->native_kernel("scalar_mul");
+            clSetKernelArg(kern, 0, sizeof(cl_mem), &buf_sc);
+            clSetKernelArg(kern, 1, sizeof(cl_mem), &buf_pts);
+            clSetKernelArg(kern, 2, sizeof(cl_mem), &buf_smr);
+            clSetKernelArg(kern, 3, sizeof(cl_uint), &smk_cnt);
+
+            int smk_warmup = 2;
+            int smk_iters = 5;
+
+            // --- Phase 1: kernel execution only ---
+            for (int i = 0; i < smk_warmup; ++i)
+                clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
+            clFinish(cl_q);
+
+            auto t0 = std::chrono::high_resolution_clock::now();
+            for (int i = 0; i < smk_iters; ++i)
+                clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
+            clFinish(cl_q);
+            auto t1 = std::chrono::high_resolution_clock::now();
+
+            double ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
+            // The same op count is used as the divisor for all three phases.
+            double total_ops = static_cast<double>(smk_batch) * smk_iters;
+            BenchResult r = {"kP (kernel)", ns / total_ops, total_ops / (ns * 1e-9)};
+            print_result(r); results.push_back(r);
+
+            // --- Phase 2: host->device upload (non-blocking writes, drained by clFinish) ---
+            for (int i = 0; i < smk_warmup; ++i) {
+                clEnqueueWriteBuffer(cl_q, buf_pts, CL_FALSE, 0,
+                                     smk_batch * sizeof(AffinePoint), (void*)kp_points.data(), 0, nullptr, nullptr);
+                clEnqueueWriteBuffer(cl_q, buf_sc, CL_FALSE, 0,
+                                     smk_batch * sizeof(Scalar), (void*)point_scalars.data(), 0, nullptr, nullptr);
+            }
+            clFinish(cl_q);
+
+            t0 = std::chrono::high_resolution_clock::now();
+            for (int i = 0; i < smk_iters; ++i) {
+                clEnqueueWriteBuffer(cl_q, buf_pts, CL_FALSE, 0,
+                                     smk_batch * sizeof(AffinePoint), (void*)kp_points.data(), 0, nullptr, nullptr);
+                clEnqueueWriteBuffer(cl_q, buf_sc, CL_FALSE, 0,
+                                     smk_batch * sizeof(Scalar), (void*)point_scalars.data(), 0, nullptr, nullptr);
+            }
+            clFinish(cl_q);
+            t1 = std::chrono::high_resolution_clock::now();
+
+            ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
+            BenchResult upload = {"kP (upload)", ns / total_ops, total_ops / (ns * 1e-9)};
+            print_result(upload); results.push_back(upload);
+
+            // Run the kernel once more so buf_smr holds results for the freshly
+            // uploaded inputs before the readback phase.
+            clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
+            clFinish(cl_q);
+
+            // --- Phase 3: device->host readback (non-blocking reads, drained by clFinish) ---
+            std::vector<JacobianPoint> sm_readback(smk_batch);
+            for (int i = 0; i < smk_warmup; ++i)
+                clEnqueueReadBuffer(cl_q, buf_smr, CL_FALSE, 0,
+                                    smk_batch * sizeof(JacobianPoint), sm_readback.data(), 0, nullptr, nullptr);
+            clFinish(cl_q);
+
+            t0 = std::chrono::high_resolution_clock::now();
+            for (int i = 0; i < smk_iters; ++i)
+                clEnqueueReadBuffer(cl_q, buf_smr, CL_FALSE, 0,
+                                    smk_batch * sizeof(JacobianPoint), sm_readback.data(), 0, nullptr, nullptr);
+            clFinish(cl_q);
+            t1 = std::chrono::high_resolution_clock::now();
+
+            ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
+            BenchResult readback = {"kP (readback)", ns / total_ops, total_ops / (ns * 1e-9)};
+            print_result(readback); results.push_back(readback);
+
+            clReleaseMemObject(buf_sc);
+            clReleaseMemObject(buf_pts);
+            clReleaseMemObject(buf_smr);
+        }
+
        clReleaseMemObject(buf_jp1);
        clReleaseMemObject(buf_jp2);
        clReleaseMemObject(buf_jpr);
--- a/opencl/include/secp256k1_opencl.hpp
+++ b/opencl/include/secp256k1_opencl.hpp
@ -37,7 +37,7 @@ struct DeviceConfig {
    int device_id = 0;              // GPU device index
    int platform_id = 0;            // Platform index (e.g., Intel, AMD)
    std::size_t max_batch_size = 65536;  // Max points per batch
-    std::size_t local_work_size = 256;   // Work group size (auto if 0)
+    std::size_t local_work_size = 0;     // Work group size (auto if 0)
    bool prefer_intel = true;       // Prefer Intel GPU if available
    bool verbose = false;           // Print device info on init
 };
--- a/opencl/kernels/secp256k1_bip352.cl
+++ b/opencl/kernels/secp256k1_bip352.cl
@ -0,0 +1,203 @@
+#ifndef SECP256K1_BIP352_CL
+#define SECP256K1_BIP352_CL
+
+#include "secp256k1_extended.cl"
+
+// BIP352ScanKeyGlv: precomputed GLV scan-key plan uploaded to __constant memory.
+// wNAF digits are computed on the CPU host and read directly in the kernel,
+// avoiding the GPU call to scalar_to_wnaf and eliminating 1040 bytes of
+// private-stack pressure (int wnaf1[130] + int wnaf2[130]).
+// Layout must match the host-side BIP352ScanKeyGlv struct exactly.
+// Every member is a 1-byte type, so the struct is 130 + 130 + 4 = 264 bytes
+// with no implicit padding; the digit arrays are signed 8-bit (OpenCL `char`).
+typedef struct {
+    char wnaf1[130]; // wNAF digits for k1 half-scalar (precomputed, range [-15..15])
+    char wnaf2[130]; // wNAF digits for k2 half-scalar (precomputed, range [-15..15])
+    uchar k1_neg;    // 1 if k1 was negative: negate base.y before table build
+    uchar flip_phi;  // 1 if phi table y-coordinate should be negated
+    uchar pad0;      // explicit padding; not read by the kernels in this file
+    uchar pad1;      // explicit padding; not read by the kernels in this file
+} BIP352ScanKeyGlv;
+
+// Initial SHA-256 chaining state for the "BIP0352/SharedSecret" tagged hash.
+// It stands in for the 64-byte prefix
+//   SHA256("BIP0352/SharedSecret") || SHA256("BIP0352/SharedSecret")
+// bip352_tagged_sha256_impl loads these eight words into ctx.h and sets
+// total_len = 64 so that prefix is counted without being re-hashed.
+// (Word values are not re-derived in this file.)
+__constant uint BIP352_SHAREDSECRET_MIDSTATE[8] = {
+    0x88831537U, 0x5127079bU, 0x69c2137bU, 0xab0303e6U,
+    0x98fa21faU, 0x4a888523U, 0xbd99daabU, 0xf25e5e0aU
+};
+
+// Tagged SHA-256 with the "BIP0352/SharedSecret" tag.
+// Starts from BIP352_SHAREDSECRET_MIDSTATE instead of hashing the tag prefix,
+// then absorbs `data_len` bytes of `data` and writes the 32-byte digest to `out`.
+inline void bip352_tagged_sha256_impl(const uchar* data, uint data_len, uchar out[32]) {
+    SHA256Ctx state;
+    // The 64-byte tag prefix counts as already hashed: nothing buffered, 64 bytes seen.
+    state.total_len = 64;
+    state.buf_len = 0;
+    for (int word = 0; word < 8; ++word) {
+        state.h[word] = BIP352_SHAREDSECRET_MIDSTATE[word];
+    }
+    sha256_update(&state, data, data_len);
+    sha256_final(&state, out);
+}
+
+// Builds the 37-byte tagged-hash input for a shared point:
+//   ser[0]      = 0x02 / 0x03 parity prefix (taken from the low bit of y)
+//   ser[1..32]  = affine x as produced by field_to_bytes_impl
+//   ser[33..36] = four zero bytes
+// The Jacobian point is converted to affine with a single field inversion.
+inline void bip352_shared_secret_input_impl(const JacobianPoint* p, uchar ser[37]) {
+    // 1/z, 1/z^2 and 1/z^3 for the Jacobian -> affine conversion.
+    FieldElement zi, zi_sq, zi_cu;
+    field_inv_impl(&zi, &p->z);
+    field_sqr_impl(&zi_sq, &zi);
+    field_mul_impl(&zi_cu, &zi_sq, &zi);
+
+    FieldElement ax, ay;
+    field_mul_impl(&ax, &p->x, &zi_sq);
+    field_mul_impl(&ay, &p->y, &zi_cu);
+
+    uchar xb[32], yb[32];
+    field_to_bytes_impl(&ax, xb);
+    field_to_bytes_impl(&ay, yb);
+
+    // Compressed-point prefix: 0x03 when the last y byte is odd, else 0x02.
+    if (yb[31] & 1) {
+        ser[0] = 0x03;
+    } else {
+        ser[0] = 0x02;
+    }
+    for (int k = 1; k <= 32; ++k) {
+        ser[k] = xb[k - 1];
+    }
+    // Trailing four bytes are always zero.
+    for (int k = 33; k < 37; ++k) {
+        ser[k] = 0;
+    }
+}
+
+// Returns the first 8 bytes of the affine x coordinate of `p`, packed so that
+// x_bytes[0] lands in the most significant byte of the result.
+inline ulong point_prefix64_impl(const JacobianPoint* p) {
+    // Only x is needed, so only 1/z^2 is computed.
+    FieldElement zi, zi_sq, ax;
+    field_inv_impl(&zi, &p->z);
+    field_sqr_impl(&zi_sq, &zi);
+    field_mul_impl(&ax, &p->x, &zi_sq);
+
+    uchar xb[32];
+    field_to_bytes_impl(&ax, xb);
+
+    ulong packed = 0;
+    for (int byte = 0; byte < 8; ++byte) {
+        packed |= ((ulong)xb[byte]) << (8 * (7 - byte));
+    }
+    return packed;
+}
+
+// GLV scalar multiplication driven by a host-pre-decomposed scan key.
+// The odd-multiples table is built by build_wnaf_table_zr_impl (affine entries
+// sharing one Z factor) and the endomorphism table by derive_endo_table_impl,
+// so every addition in the main loop is a mixed (Jacobian + affine) add.
+//   r    : result in Jacobian form; stays at infinity if every digit is zero
+//   p    : affine base point
+//   scan : precomputed wNAF digits and sign flags in __constant memory
+inline void scalar_mul_glv_predecomp_impl(
+    JacobianPoint* r,
+    const AffinePoint* p,
+    __constant const BIP352ScanKeyGlv* scan
+) {
+    // Fold the sign of k1 into the base point before building the table.
+    AffinePoint signed_base = *p;
+    if (scan->k1_neg) field_negate_impl(&signed_base.y, &signed_base.y);
+
+    // odd_mult[0..7] = {P, 3P, 5P, 7P, 9P, 11P, 13P, 15P}; table_z is the
+    // shared Z factor that is multiplied back into the result at the end.
+    AffinePoint odd_mult[8];
+    FieldElement table_z;
+    build_wnaf_table_zr_impl(&signed_base, odd_mult, &table_z);
+
+    // phi_mult[i] = phi(odd_mult[i]), with y negated when flip_phi is set.
+    AffinePoint phi_mult[8];
+    derive_endo_table_impl(odd_mult, phi_mult, scan->flip_phi);
+
+    // Interleaved double-and-add over both digit streams, most significant
+    // position first. Digits come straight from __constant memory.
+    point_set_infinity(r);
+    int pos = 130;
+    while (pos-- > 0) {
+        if (!point_is_infinity(r)) point_double_impl(r, r);
+
+        const int digit_p = (int)scan->wnaf1[pos];
+        if (digit_p != 0) {
+            const int mag = (digit_p < 0) ? -digit_p : digit_p;
+            AffinePoint addend = odd_mult[(mag - 1) >> 1];
+            if (digit_p < 0) field_negate_impl(&addend.y, &addend.y);
+            if (point_is_infinity(r)) {
+                point_from_affine(r, &addend);
+            } else {
+                point_add_mixed_impl(r, r, &addend);
+            }
+        }
+
+        const int digit_phi = (int)scan->wnaf2[pos];
+        if (digit_phi != 0) {
+            const int mag = (digit_phi < 0) ? -digit_phi : digit_phi;
+            AffinePoint addend = phi_mult[(mag - 1) >> 1];
+            if (digit_phi < 0) field_negate_impl(&addend.y, &addend.y);
+            if (point_is_infinity(r)) {
+                point_from_affine(r, &addend);
+            } else {
+                point_add_mixed_impl(r, r, &addend);
+            }
+        }
+    }
+
+    // Nothing to correct for the point at infinity.
+    if (point_is_infinity(r)) return;
+
+    // Multiply the accumulated Z by the table's shared Z factor.
+    FieldElement z_fixed;
+    field_mul_impl(&z_fixed, &r->z, &table_z);
+    r->z = z_fixed;
+}
+
+// One work-item per tweak point. For item i:
+//   1. shared = scan_key * tweak_points[i]            (pre-decomposed GLV)
+//   2. hash   = tagged SHA-256 of the 37-byte serialization of shared
+//   3. out    = hash * G                              (windowed generator mul)
+//   4. prefixes[i] = first 8 x-bytes of (out + spend_point[0])
+// Writes 0 to prefixes[i] when the shared point is the point at infinity.
+// Work-items with id >= count do nothing.
+__kernel void bip352_pipeline_kernel(
+    __global const AffinePoint* tweak_points,
+    __constant const BIP352ScanKeyGlv* scan_key,
+    __global const AffinePoint* spend_point,
+    __global ulong* prefixes,
+    const uint count
+) {
+    const uint item = get_global_id(0);
+    if (item >= count) return;
+
+    // Private copies of this item's tweak point and the single spend point.
+    AffinePoint tweak_pt = tweak_points[item];
+    AffinePoint spend_pt = spend_point[0];
+
+    JacobianPoint ecdh;
+    scalar_mul_glv_predecomp_impl(&ecdh, &tweak_pt, scan_key);
+    if (point_is_infinity(&ecdh)) {
+        prefixes[item] = 0;
+        return;
+    }
+
+    uchar preimage[37];
+    bip352_shared_secret_input_impl(&ecdh, preimage);
+
+    uchar digest[32];
+    bip352_tagged_sha256_impl(preimage, 37, digest);
+
+    Scalar tweak_scalar;
+    scalar_from_bytes_impl(digest, &tweak_scalar);
+
+    JacobianPoint tweak_g;
+    scalar_mul_generator_windowed_impl(&tweak_g, &tweak_scalar);
+
+    JacobianPoint candidate;
+    point_add_mixed_impl(&candidate, &tweak_g, &spend_pt);
+    prefixes[item] = point_prefix64_impl(&candidate);
+}
+
+// LUT variant of bip352_pipeline_kernel: identical pipeline, except that the
+// generator multiplication goes through scalar_mul_generator_lut_impl using the
+// caller-supplied table `gen_lut`. For item i:
+//   1. shared = scan_key * tweak_points[i]            (pre-decomposed GLV)
+//   2. hash   = tagged SHA-256 of the 37-byte serialization of shared
+//   3. out    = hash * G                              (lookup-table generator mul)
+//   4. prefixes[i] = first 8 x-bytes of (out + spend_point[0])
+// Writes 0 to prefixes[i] when the shared point is the point at infinity.
+// Work-items with id >= count do nothing.
+__kernel void bip352_pipeline_kernel_lut(
+    __global const AffinePoint* tweak_points,
+    __constant const BIP352ScanKeyGlv* scan_key,
+    __global const AffinePoint* spend_point,
+    __global const AffinePoint* gen_lut,
+    __global ulong* prefixes,
+    const uint count
+) {
+    const uint item = get_global_id(0);
+    if (item >= count) return;
+
+    // Private copies of this item's tweak point and the single spend point.
+    AffinePoint tweak_pt = tweak_points[item];
+    AffinePoint spend_pt = spend_point[0];
+
+    JacobianPoint ecdh;
+    scalar_mul_glv_predecomp_impl(&ecdh, &tweak_pt, scan_key);
+    if (point_is_infinity(&ecdh)) {
+        prefixes[item] = 0;
+        return;
+    }
+
+    uchar preimage[37];
+    bip352_shared_secret_input_impl(&ecdh, preimage);
+
+    uchar digest[32];
+    bip352_tagged_sha256_impl(preimage, 37, digest);
+
+    Scalar tweak_scalar;
+    scalar_from_bytes_impl(digest, &tweak_scalar);
+
+    JacobianPoint tweak_g;
+    scalar_mul_generator_lut_impl(&tweak_g, &tweak_scalar, gen_lut);
+
+    JacobianPoint candidate;
+    point_add_mixed_impl(&candidate, &tweak_g, &spend_pt);
+    prefixes[item] = point_prefix64_impl(&candidate);
+}
+
+#endif
--- a/opencl/kernels/secp256k1_extended.cl
+++ b/opencl/kernels/secp256k1_extended.cl
@ -564,6 +564,84 @@ inline void glv_decompose_impl(const Scalar* k, Scalar* k1, Scalar* k2,
 // GLV-accelerated scalar multiplication: k*P using Shamir's trick
 // with endomorphism phi(P) = (beta*x, y) where phi corresponds to lambda.
 // Uses interleaved wNAF w=5 for both half-scalars k1, k2.
+// Helper for the wNAF scalar multipliers (the GLV comment directly above this
+// function describes scalar_mul_glv_impl further down, not this helper).
+//
+// Fills table[0..7] with x/y coordinates for the odd multiples of `base`
+// (callers index it as (|digit| - 1) >> 1) and writes a shared Z factor to
+// *globalz. Callers multiply their accumulated Z by *globalz after the main
+// loop. No field inversion is performed here.
+//   base    : affine input point
+//   table   : 8 output entries; only .x and .y are written
+//   globalz : output, product of the final accumulator Z and the Z of 2*base
+inline void build_wnaf_table_zr_impl(const AffinePoint* base, AffinePoint table[8],
+                                     FieldElement* globalz) {
+    JacobianPoint base_jac;
+    point_from_affine(&base_jac, base);
+
+    // doubled = 2*base in Jacobian form; c is its Z coordinate.
+    JacobianPoint doubled;
+    point_double_impl(&doubled, &base_jac);
+
+    FieldElement c = doubled.z;
+    FieldElement c2, c3;
+    field_sqr_impl(&c2, &c);
+    field_mul_impl(&c3, &c2, &c);
+
+    // The X/Y of `doubled` are reused as-is as an affine addend (no division
+    // by Z); the base is scaled by c^2 / c^3 below to match.
+    AffinePoint doubled_affine;
+    doubled_affine.x = doubled.x;
+    doubled_affine.y = doubled.y;
+
+    // Running accumulator starts at (base.x * c^2, base.y * c^3) with Z = 1.
+    JacobianPoint accum;
+    field_mul_impl(&accum.x, &base->x, &c2);
+    field_mul_impl(&accum.y, &base->y, &c3);
+    accum.z.limbs[0] = 1UL;
+    accum.z.limbs[1] = 0UL;
+    accum.z.limbs[2] = 0UL;
+    accum.z.limbs[3] = 0UL;
+    accum.infinity = 0;
+
+    table[0].x = accum.x;
+    table[0].y = accum.y;
+
+    // zr[i] records the value point_add_mixed_h_impl returns through its last
+    // argument for step i (presumably the Z ratio between consecutive
+    // accumulators -- confirm against that helper). zr[0] is set but not read
+    // by the back-substitution below.
+    FieldElement zr[8];
+    zr[0] = c;
+
+    // Seven additions of the doubled point: entries 1..7 take the raw X/Y of
+    // the accumulator after each step.
+    for (int i = 1; i < 8; ++i) {
+        FieldElement h;
+        point_add_mixed_h_impl(&accum, &accum, &doubled_affine, &h);
+        table[i].x = accum.x;
+        table[i].y = accum.y;
+        zr[i] = h;
+    }
+
+    field_mul_impl(globalz, &accum.z, &c);
+
+    // Back-substitution: walk from entry 6 down to 0, growing zs as
+    // zr[7], zr[7]*zr[6], ... and scaling each entry by zs^2 (x) and zs^3 (y).
+    // Entry 7 is left unscaled.
+    FieldElement zs = zr[7];
+    for (int idx = 6; idx >= 0; --idx) {
+        if (idx != 6) {
+            FieldElement tmp;
+            field_mul_impl(&tmp, &zs, &zr[idx + 1]);
+            zs = tmp;
+        }
+
+        FieldElement zs2, zs3;
+        field_sqr_impl(&zs2, &zs);
+        field_mul_impl(&zs3, &zs2, &zs);
+
+        FieldElement tx, ty;
+        field_mul_impl(&tx, &table[idx].x, &zs2);
+        field_mul_impl(&ty, &table[idx].y, &zs3);
+        table[idx].x = tx;
+        table[idx].y = ty;
+    }
+}
+
+// Derives the endomorphism table from an existing 8-entry affine table:
+//   endo_table[i].x = table[i].x * beta   (beta = GLV_BETA0..3)
+//   endo_table[i].y = -table[i].y if negate_y is non-zero, else table[i].y
+// Only the .x and .y members of each output entry are written.
+inline void derive_endo_table_impl(const AffinePoint table[8], AffinePoint endo_table[8],
+                                   int negate_y) {
+    FieldElement beta_fe;
+    beta_fe.limbs[0] = GLV_BETA0;
+    beta_fe.limbs[1] = GLV_BETA1;
+    beta_fe.limbs[2] = GLV_BETA2;
+    beta_fe.limbs[3] = GLV_BETA3;
+
+    // x coordinates: multiply by beta.
+    for (int slot = 0; slot < 8; ++slot) {
+        field_mul_impl(&endo_table[slot].x, &table[slot].x, &beta_fe);
+    }
+
+    // y coordinates: copy, or negate when requested.
+    if (negate_y) {
+        for (int slot = 0; slot < 8; ++slot) {
+            field_negate_impl(&endo_table[slot].y, &table[slot].y);
+        }
+    } else {
+        for (int slot = 0; slot < 8; ++slot) {
+            endo_table[slot].y = table[slot].y;
+        }
+    }
+}
+
 inline void scalar_mul_glv_impl(JacobianPoint* r, const Scalar* k, const AffinePoint* p) {
    Scalar k1, k2;
    int k1_neg, k2_neg;
@ -573,62 +651,48 @@ inline void scalar_mul_glv_impl(JacobianPoint* r, const Scalar* k, const AffineP
    AffinePoint base = *p;
    if (k1_neg) field_negate_impl(&base.y, &base.y);

-    // Build P precomp table: [P, 3P, 5P, ..., 15P] (8 entries, w=5)
-    JacobianPoint tbl_jac[8];
-    JacobianPoint dbl;
-    point_from_affine(&tbl_jac[0], &base);
-    point_double_impl(&dbl, &tbl_jac[0]);
-    for (int i = 1; i < 8; i++)
-        point_add_impl(&tbl_jac[i], &tbl_jac[i-1], &dbl);
+    AffinePoint table[8];
+    FieldElement globalz;
+    build_wnaf_table_zr_impl(&base, table, &globalz);

-    // Build phi(P) table: apply endomorphism, flip y if signs differ
-    AffinePoint endo_base;
-    FieldElement beta;
-    beta.limbs[0] = GLV_BETA0; beta.limbs[1] = GLV_BETA1;
-    beta.limbs[2] = GLV_BETA2; beta.limbs[3] = GLV_BETA3;
-    field_mul_impl(&endo_base.x, &base.x, &beta);
-    endo_base.y = base.y;
-    int flip_phi = (k1_neg != k2_neg);
-    if (flip_phi) field_negate_impl(&endo_base.y, &endo_base.y);
-
-    JacobianPoint tbl2_jac[8];
-    point_from_affine(&tbl2_jac[0], &endo_base);
-    JacobianPoint dbl2;
-    point_double_impl(&dbl2, &tbl2_jac[0]);
-    for (int i = 1; i < 8; i++)
-        point_add_impl(&tbl2_jac[i], &tbl2_jac[i-1], &dbl2);
+    AffinePoint endo_table[8];
+    derive_endo_table_impl(table, endo_table, (k1_neg != k2_neg));

    // wNAF encode both half-width scalars
-    int wnaf1[260], wnaf2[260];
-    int len1 = scalar_to_wnaf(&k1, wnaf1);
-    int len2 = scalar_to_wnaf(&k2, wnaf2);
-    int max_len = (len1 > len2) ? len1 : len2;
+    int wnaf1[130] = {0};
+    int wnaf2[130] = {0};
+    scalar_to_wnaf(&k1, wnaf1);
+    scalar_to_wnaf(&k2, wnaf2);

    // Shamir interleaved loop
    point_set_infinity(r);
-    for (int i = max_len - 1; i >= 0; --i) {
+    for (int i = 129; i >= 0; --i) {
        if (!point_is_infinity(r)) point_double_impl(r, r);

-        int d1 = (i < len1) ? wnaf1[i] : 0;
+        int d1 = wnaf1[i];
        if (d1 != 0) {
-            int idx = ((d1 > 0) ? d1 : -d1) >> 1;
-            if (idx >= 8) idx = 7;
-            JacobianPoint pt = tbl_jac[idx];
+            int idx = (((d1 > 0) ? d1 : -d1) - 1) >> 1;
+            AffinePoint pt = table[idx];
            if (d1 < 0) field_negate_impl(&pt.y, &pt.y);
-            if (point_is_infinity(r)) { *r = pt; }
-            else { JacobianPoint tmp; point_add_impl(&tmp, r, &pt); *r = tmp; }
+            if (point_is_infinity(r)) { point_from_affine(r, &pt); }
+            else { point_add_mixed_impl(r, r, &pt); }
        }

-        int d2 = (i < len2) ? wnaf2[i] : 0;
+        int d2 = wnaf2[i];
        if (d2 != 0) {
-            int idx = ((d2 > 0) ? d2 : -d2) >> 1;
-            if (idx >= 8) idx = 7;
-            JacobianPoint pt = tbl2_jac[idx];
+            int idx = (((d2 > 0) ? d2 : -d2) - 1) >> 1;
+            AffinePoint pt = endo_table[idx];
            if (d2 < 0) field_negate_impl(&pt.y, &pt.y);
-            if (point_is_infinity(r)) { *r = pt; }
-            else { JacobianPoint tmp; point_add_impl(&tmp, r, &pt); *r = tmp; }
+            if (point_is_infinity(r)) { point_from_affine(r, &pt); }
+            else { point_add_mixed_impl(r, r, &pt); }
        }
    }
+
+    if (!point_is_infinity(r)) {
+        FieldElement corrected_z;
+        field_mul_impl(&corrected_z, &r->z, &globalz);
+        r->z = corrected_z;
+    }
 }

 // Precomputed generator multiplication using fixed window w=4
--- a/opencl/kernels/secp256k1_field.cl
+++ b/opencl/kernels/secp256k1_field.cl
@ -59,6 +59,306 @@ typedef struct {
    ulong limbs[4];  // Little-endian: limbs[0] is LSB
 } FieldElement;

+// =============================================================================
+// NVIDIA OpenCL PTX Acceleration (Level 1+2+3)
+// =============================================================================
+// On consumer NVIDIA GPUs (Turing/Ampere/Ada/Blackwell), INT32 multiply
+// throughput is 32x higher than INT64. Inline PTX enables:
+//   Level 1+2: mad.lo.cc.u64/madc.hi.cc.u64 carry chains (no comparison-carry)
+//   Level 3:   mad.lo.cc.u32/madc.hi.cc.u32 32-bit Comba (INT32 throughput)
+// Fallback (AMD, Intel, portable): mul_hi + comparison-based carry unchanged.
+// Guard: __NV_CL_C_VERSION is defined only by NVIDIA's OpenCL compiler.
+// =============================================================================
+
+#ifdef __NV_CL_C_VERSION
+
+// 32-bit MAD accumulate: (r0:r1:r2) += a * b  [3-register 96-bit accumulator]
+// r0 is the low word, r2 the high word. The low product word is added to r0,
+// the high product word plus carry to r1, and the final carry to r2.
+// All five arguments must be 32-bit lvalues/values usable with the "r" constraint.
+#define OCL_MAD32(r0, r1, r2, a, b) \
+    __asm volatile( \
+        "mad.lo.cc.u32 %0, %3, %4, %0; \n\t" \
+        "madc.hi.cc.u32 %1, %3, %4, %1; \n\t" \
+        "addc.u32 %2, %2, 0; \n\t" \
+        : "+r"(r0), "+r"(r1), "+r"(r2) \
+        : "r"(a), "r"(b) \
+    )
+
+// 32-bit squaring diagonal: (r0:r1:r2) += a*a
+// Same carry chain as OCL_MAD32 with both multiplicands bound to `a`.
+#define OCL_SQR32_D(r0, r1, r2, a) \
+    __asm volatile( \
+        "mad.lo.cc.u32 %0, %3, %3, %0; \n\t" \
+        "madc.hi.cc.u32 %1, %3, %3, %1; \n\t" \
+        "addc.u32 %2, %2, 0; \n\t" \
+        : "+r"(r0), "+r"(r1), "+r"(r2) \
+        : "r"(a) \
+    )
+
+// 32-bit squaring off-diagonal: (r0:r1:r2) += 2 * a*b
+// The 64-bit product a*b is computed once into (_lo, _hi) and then added to
+// the accumulator twice, each time with its own three-step carry chain.
+// Wrapped in do { } while(0) so it behaves as a single statement; the
+// temporaries _lo/_hi are local to that scope.
+#define OCL_SQR32_M2(r0, r1, r2, a, b) \
+    do { \
+        uint _lo, _hi; \
+        __asm volatile( \
+            "mul.lo.u32 %0, %2, %3; \n\t" \
+            "mul.hi.u32 %1, %2, %3; \n\t" \
+            : "=r"(_lo), "=r"(_hi) : "r"(a), "r"(b) \
+        ); \
+        __asm volatile( \
+            "add.cc.u32 %0, %0, %3; \n\t" \
+            "addc.cc.u32 %1, %1, %4; \n\t" \
+            "addc.u32 %2, %2, 0; \n\t" \
+            "add.cc.u32 %0, %0, %3; \n\t" \
+            "addc.cc.u32 %1, %1, %4; \n\t" \
+            "addc.u32 %2, %2, 0; \n\t" \
+            : "+r"(r0), "+r"(r1), "+r"(r2) : "r"(_lo), "r"(_hi) \
+        ); \
+    } while(0)
+
+// ----------------------------------------------------------------------------
+// 32-bit Comba multiplication: 4x64 FieldElement reinterpreted as 8x32 limbs.
+// Produces uint[16] raw output (little-endian 32-bit limbs of 512-bit product).
+// Mirrors CUDA's mul_256_comba32 from secp256k1_32_hybrid_final.cuh.
+//
+//   a, b : inputs, read only
+//   t32  : output, 16 x 32-bit limbs of the unreduced product a*b
+//
+// Column-wise schoolbook multiply: output limb k accumulates every
+// a32[i]*b32[j] with i + j == k into the (r0:r1:r2) accumulator, stores r0,
+// then shifts the accumulator down by one word for the next column.
+// ----------------------------------------------------------------------------
+static inline void mul_256_comba32_ocl(
+    const FieldElement* a, const FieldElement* b, uint t32[16]
+) {
+    // Split each 64-bit limb into low (even index) and high (odd index) halves.
+    uint a32[8], b32[8];
+    for (int i = 0; i < 4; i++) {
+        a32[2*i]   = (uint)(a->limbs[i]);
+        a32[2*i+1] = (uint)(a->limbs[i] >> 32);
+        b32[2*i]   = (uint)(b->limbs[i]);
+        b32[2*i+1] = (uint)(b->limbs[i] >> 32);
+    }
+    uint r0 = 0, r1 = 0, r2 = 0;
+
+    // Columns 0..7: the number of partial products grows from 1 to 8.
+    OCL_MAD32(r0,r1,r2, a32[0],b32[0]);
+    t32[0]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[0],b32[1]); OCL_MAD32(r0,r1,r2, a32[1],b32[0]);
+    t32[1]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[0],b32[2]); OCL_MAD32(r0,r1,r2, a32[1],b32[1]); OCL_MAD32(r0,r1,r2, a32[2],b32[0]);
+    t32[2]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[0],b32[3]); OCL_MAD32(r0,r1,r2, a32[1],b32[2]); OCL_MAD32(r0,r1,r2, a32[2],b32[1]); OCL_MAD32(r0,r1,r2, a32[3],b32[0]);
+    t32[3]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[0],b32[4]); OCL_MAD32(r0,r1,r2, a32[1],b32[3]); OCL_MAD32(r0,r1,r2, a32[2],b32[2]); OCL_MAD32(r0,r1,r2, a32[3],b32[1]); OCL_MAD32(r0,r1,r2, a32[4],b32[0]);
+    t32[4]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[0],b32[5]); OCL_MAD32(r0,r1,r2, a32[1],b32[4]); OCL_MAD32(r0,r1,r2, a32[2],b32[3]); OCL_MAD32(r0,r1,r2, a32[3],b32[2]); OCL_MAD32(r0,r1,r2, a32[4],b32[1]); OCL_MAD32(r0,r1,r2, a32[5],b32[0]);
+    t32[5]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[0],b32[6]); OCL_MAD32(r0,r1,r2, a32[1],b32[5]); OCL_MAD32(r0,r1,r2, a32[2],b32[4]); OCL_MAD32(r0,r1,r2, a32[3],b32[3]); OCL_MAD32(r0,r1,r2, a32[4],b32[2]); OCL_MAD32(r0,r1,r2, a32[5],b32[1]); OCL_MAD32(r0,r1,r2, a32[6],b32[0]);
+    t32[6]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[0],b32[7]); OCL_MAD32(r0,r1,r2, a32[1],b32[6]); OCL_MAD32(r0,r1,r2, a32[2],b32[5]); OCL_MAD32(r0,r1,r2, a32[3],b32[4]); OCL_MAD32(r0,r1,r2, a32[4],b32[3]); OCL_MAD32(r0,r1,r2, a32[5],b32[2]); OCL_MAD32(r0,r1,r2, a32[6],b32[1]); OCL_MAD32(r0,r1,r2, a32[7],b32[0]);
+    t32[7]=r0; r0=r1; r1=r2; r2=0;
+
+    // Columns 8..14: the number of partial products shrinks from 7 to 1.
+    OCL_MAD32(r0,r1,r2, a32[1],b32[7]); OCL_MAD32(r0,r1,r2, a32[2],b32[6]); OCL_MAD32(r0,r1,r2, a32[3],b32[5]); OCL_MAD32(r0,r1,r2, a32[4],b32[4]); OCL_MAD32(r0,r1,r2, a32[5],b32[3]); OCL_MAD32(r0,r1,r2, a32[6],b32[2]); OCL_MAD32(r0,r1,r2, a32[7],b32[1]);
+    t32[8]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[2],b32[7]); OCL_MAD32(r0,r1,r2, a32[3],b32[6]); OCL_MAD32(r0,r1,r2, a32[4],b32[5]); OCL_MAD32(r0,r1,r2, a32[5],b32[4]); OCL_MAD32(r0,r1,r2, a32[6],b32[3]); OCL_MAD32(r0,r1,r2, a32[7],b32[2]);
+    t32[9]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[3],b32[7]); OCL_MAD32(r0,r1,r2, a32[4],b32[6]); OCL_MAD32(r0,r1,r2, a32[5],b32[5]); OCL_MAD32(r0,r1,r2, a32[6],b32[4]); OCL_MAD32(r0,r1,r2, a32[7],b32[3]);
+    t32[10]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[4],b32[7]); OCL_MAD32(r0,r1,r2, a32[5],b32[6]); OCL_MAD32(r0,r1,r2, a32[6],b32[5]); OCL_MAD32(r0,r1,r2, a32[7],b32[4]);
+    t32[11]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[5],b32[7]); OCL_MAD32(r0,r1,r2, a32[6],b32[6]); OCL_MAD32(r0,r1,r2, a32[7],b32[5]);
+    t32[12]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_MAD32(r0,r1,r2, a32[6],b32[7]); OCL_MAD32(r0,r1,r2, a32[7],b32[6]);
+    t32[13]=r0; r0=r1; r1=r2; r2=0;
+
+    // Last column: the remaining carry word r1 becomes the top limb.
+    OCL_MAD32(r0,r1,r2, a32[7],b32[7]);
+    t32[14]=r0; t32[15]=r1;
+}
+
+// 32-bit Comba squaring: ~40% fewer multiplications (symmetry exploitation).
+// Mirrors CUDA's sqr_256_comba32 from secp256k1_32_hybrid_final.cuh.
+//
+//   a   : input, read only
+//   t32 : output, 16 x 32-bit little-endian limbs of the unreduced square a*a
+//
+// Same column layout as mul_256_comba32_ocl, but each off-diagonal pair
+// a32[i]*a32[j] (i < j) is computed once and added twice (OCL_SQR32_M2),
+// while diagonal terms a32[i]^2 are added once (OCL_SQR32_D). Diagonal terms
+// only appear in even-numbered columns.
+static inline void sqr_256_comba32_ocl(const FieldElement* a, uint t32[16]) {
+    // Split each 64-bit limb into low (even index) and high (odd index) halves.
+    uint a32[8];
+    for (int i = 0; i < 4; i++) {
+        a32[2*i]   = (uint)(a->limbs[i]);
+        a32[2*i+1] = (uint)(a->limbs[i] >> 32);
+    }
+    uint r0 = 0, r1 = 0, r2 = 0;
+
+    // Each group below fills one output limb, then shifts the accumulator down.
+    OCL_SQR32_D(r0,r1,r2, a32[0]);
+    t32[0]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[0],a32[1]);
+    t32[1]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[0],a32[2]); OCL_SQR32_D(r0,r1,r2, a32[1]);
+    t32[2]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[0],a32[3]); OCL_SQR32_M2(r0,r1,r2, a32[1],a32[2]);
+    t32[3]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[0],a32[4]); OCL_SQR32_M2(r0,r1,r2, a32[1],a32[3]); OCL_SQR32_D(r0,r1,r2, a32[2]);
+    t32[4]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[0],a32[5]); OCL_SQR32_M2(r0,r1,r2, a32[1],a32[4]); OCL_SQR32_M2(r0,r1,r2, a32[2],a32[3]);
+    t32[5]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[0],a32[6]); OCL_SQR32_M2(r0,r1,r2, a32[1],a32[5]); OCL_SQR32_M2(r0,r1,r2, a32[2],a32[4]); OCL_SQR32_D(r0,r1,r2, a32[3]);
+    t32[6]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[0],a32[7]); OCL_SQR32_M2(r0,r1,r2, a32[1],a32[6]); OCL_SQR32_M2(r0,r1,r2, a32[2],a32[5]); OCL_SQR32_M2(r0,r1,r2, a32[3],a32[4]);
+    t32[7]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[1],a32[7]); OCL_SQR32_M2(r0,r1,r2, a32[2],a32[6]); OCL_SQR32_M2(r0,r1,r2, a32[3],a32[5]); OCL_SQR32_D(r0,r1,r2, a32[4]);
+    t32[8]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[2],a32[7]); OCL_SQR32_M2(r0,r1,r2, a32[3],a32[6]); OCL_SQR32_M2(r0,r1,r2, a32[4],a32[5]);
+    t32[9]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[3],a32[7]); OCL_SQR32_M2(r0,r1,r2, a32[4],a32[6]); OCL_SQR32_D(r0,r1,r2, a32[5]);
+    t32[10]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[4],a32[7]); OCL_SQR32_M2(r0,r1,r2, a32[5],a32[6]);
+    t32[11]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[5],a32[7]); OCL_SQR32_D(r0,r1,r2, a32[6]);
+    t32[12]=r0; r0=r1; r1=r2; r2=0;
+
+    OCL_SQR32_M2(r0,r1,r2, a32[6],a32[7]);
+    t32[13]=r0; r0=r1; r1=r2; r2=0;
+
+    // Last column: the remaining carry word r1 becomes the top limb.
+    OCL_SQR32_D(r0,r1,r2, a32[7]);
+    t32[14]=r0; t32[15]=r1;
+}
+
+// 32-bit reduction: T_hi x K_MOD (32-bit MAD chain) + conditional P-subtract.
+// Phase 1: T_hi[8..15] x 977 (scalar, 32-bit MAD chain)
+// Phase 1b: add T_hi << 32  (K_MOD = 2^32 + 977)
+// Phase 2: T_lo[0..7] += result (32-bit carry chain)
+// Phase 3+4: pack to 64-bit, fold overflow, conditional P-subtract (64-bit PTX)
+// Mirrors CUDA's reduce_512_to_256_32 from secp256k1_32_hybrid_final.cuh.
+//
+//   t32 : input, 16 x 32-bit little-endian limbs (only read, never written)
+//   r   : output field element, written as four 64-bit limbs
+//
+// The high half is folded into the low half by multiplying it by
+// 2^32 + 977 and adding; whatever spills past 256 bits is folded once more
+// by multiplying by SECP256K1_K, followed by a single conditional subtract of P.
+static inline void reduce_512_to_256_32_ocl(uint t32[16], FieldElement* r) {
+    // t0..t7 are the low half (updated in place by Phase 2); t8..t15 the high half.
+    uint t0=t32[0], t1=t32[1], t2=t32[2], t3=t32[3];
+    uint t4=t32[4], t5=t32[5], t6=t32[6], t7=t32[7];
+    const uint t8 =t32[8],  t9 =t32[9],  t10=t32[10], t11=t32[11];
+    const uint t12=t32[12], t13=t32[13], t14=t32[14], t15=t32[15];
+
+    // Phase 1: A = T_hi[8..15] x 977 (32-bit scalar MAD chain -> 9 limbs)
+    // Operands %0..%8 are a0..a8, %9..%16 are t8..t15. Each step adds the low
+    // product word into the current limb and seeds the next limb with the high
+    // product word plus carry.
+    uint a0, a1, a2, a3, a4, a5, a6, a7, a8;
+    __asm volatile(
+        "mul.lo.u32 %0, %9,  977;\n\t"
+        "mul.hi.u32 %1, %9,  977;\n\t"
+        "mad.lo.cc.u32 %1, %10, 977, %1;\n\t"
+        "madc.hi.u32 %2, %10, 977, 0;\n\t"
+        "mad.lo.cc.u32 %2, %11, 977, %2;\n\t"
+        "madc.hi.u32 %3, %11, 977, 0;\n\t"
+        "mad.lo.cc.u32 %3, %12, 977, %3;\n\t"
+        "madc.hi.u32 %4, %12, 977, 0;\n\t"
+        "mad.lo.cc.u32 %4, %13, 977, %4;\n\t"
+        "madc.hi.u32 %5, %13, 977, 0;\n\t"
+        "mad.lo.cc.u32 %5, %14, 977, %5;\n\t"
+        "madc.hi.u32 %6, %14, 977, 0;\n\t"
+        "mad.lo.cc.u32 %6, %15, 977, %6;\n\t"
+        "madc.hi.u32 %7, %15, 977, 0;\n\t"
+        "mad.lo.cc.u32 %7, %16, 977, %7;\n\t"
+        "madc.hi.u32 %8, %16, 977, 0;\n\t"
+        : "=r"(a0),"=r"(a1),"=r"(a2),"=r"(a3),"=r"(a4),
+          "=r"(a5),"=r"(a6),"=r"(a7),"=r"(a8)
+        : "r"(t8),"r"(t9),"r"(t10),"r"(t11),
+          "r"(t12),"r"(t13),"r"(t14),"r"(t15)
+    );
+
+    // Phase 1b: add T_hi << 32 (a[1..8] += T_hi[8..15], yielding a9 overflow)
+    // Here %0..%7 are a1..a8 (shifted by one limb) and %8 receives the carry-out.
+    uint a9;
+    __asm volatile(
+        "add.cc.u32  %0, %0, %9;\n\t"
+        "addc.cc.u32 %1, %1, %10;\n\t"
+        "addc.cc.u32 %2, %2, %11;\n\t"
+        "addc.cc.u32 %3, %3, %12;\n\t"
+        "addc.cc.u32 %4, %4, %13;\n\t"
+        "addc.cc.u32 %5, %5, %14;\n\t"
+        "addc.cc.u32 %6, %6, %15;\n\t"
+        "addc.cc.u32 %7, %7, %16;\n\t"
+        "addc.u32    %8, 0, 0;\n\t"
+        : "+r"(a1),"+r"(a2),"+r"(a3),"+r"(a4),
+          "+r"(a5),"+r"(a6),"+r"(a7),"+r"(a8),"=r"(a9)
+        : "r"(t8),"r"(t9),"r"(t10),"r"(t11),
+          "r"(t12),"r"(t13),"r"(t14),"r"(t15)
+    );
+
+    // Phase 2: T_lo[0..7] += A[0..7] (32-bit carry chain)
+    // `carry` receives the carry out of limb 7; a8/a9 are handled in Phase 3.
+    uint carry;
+    __asm volatile(
+        "add.cc.u32  %0, %0, %9;\n\t"
+        "addc.cc.u32 %1, %1, %10;\n\t"
+        "addc.cc.u32 %2, %2, %11;\n\t"
+        "addc.cc.u32 %3, %3, %12;\n\t"
+        "addc.cc.u32 %4, %4, %13;\n\t"
+        "addc.cc.u32 %5, %5, %14;\n\t"
+        "addc.cc.u32 %6, %6, %15;\n\t"
+        "addc.cc.u32 %7, %7, %16;\n\t"
+        "addc.u32    %8, 0, 0;\n\t"
+        : "+r"(t0),"+r"(t1),"+r"(t2),"+r"(t3),
+          "+r"(t4),"+r"(t5),"+r"(t6),"+r"(t7),"=r"(carry)
+        : "r"(a0),"r"(a1),"r"(a2),"r"(a3),
+          "r"(a4),"r"(a5),"r"(a6),"r"(a7)
+    );
+
+    // Phase 3: pack to 64-bit and fold overflow (extra * K)
+    // `extra` collects everything above bit 255: a8, the Phase 2 carry, and
+    // a9 shifted up one 32-bit limb. It is multiplied by SECP256K1_K and the
+    // 128-bit product (ek_hi:ek_lo) is added into the two low limbs.
+    ulong r0 = ((ulong)t1 << 32) | t0;
+    ulong r1 = ((ulong)t3 << 32) | t2;
+    ulong r2 = ((ulong)t5 << 32) | t4;
+    ulong r3 = ((ulong)t7 << 32) | t6;
+    ulong extra = (ulong)a8 + carry + ((ulong)a9 << 32);
+    ulong ek_lo, ek_hi;
+    __asm volatile(
+        "mul.lo.u64 %0, %2, %3;\n\t"
+        "mul.hi.u64 %1, %2, %3;\n\t"
+        : "=l"(ek_lo), "=l"(ek_hi)
+        : "l"(extra), "l"((ulong)SECP256K1_K)
+    );
+    ulong c;
+    __asm volatile(
+        "add.cc.u64  %0, %0, %5;\n\t"
+        "addc.cc.u64 %1, %1, %6;\n\t"
+        "addc.cc.u64 %2, %2, 0;\n\t"
+        "addc.cc.u64 %3, %3, 0;\n\t"
+        "addc.u64    %4, 0, 0;\n\t"
+        : "+l"(r0),"+l"(r1),"+l"(r2),"+l"(r3),"=l"(c)
+        : "l"(ek_lo),"l"(ek_hi)
+    );
+    // If that addition itself carried out of 256 bits, fold one more K in.
+    // Any carry out of this second chain is discarded.
+    if (c) {
+        __asm volatile(
+            "add.cc.u64  %0, %0, %4;\n\t"
+            "addc.cc.u64 %1, %1, 0;\n\t"
+            "addc.cc.u64 %2, %2, 0;\n\t"
+            "addc.u64    %3, %3, 0;\n\t"
+            : "+l"(r0),"+l"(r1),"+l"(r2),"+l"(r3)
+            : "l"((ulong)SECP256K1_K)
+        );
+    }
+
+    // Phase 4: conditional subtraction of P (64-bit PTX sub.cc chain)
+    // s = r - P; `borrow` is computed as 0 - 0 - borrow_flag, so it is zero
+    // exactly when the subtraction did not underflow (r >= P).
+    ulong s0, s1, s2, s3, borrow;
+    __asm volatile(
+        "sub.cc.u64  %0, %5, %9;\n\t"
+        "subc.cc.u64 %1, %6, %10;\n\t"
+        "subc.cc.u64 %2, %7, %11;\n\t"
+        "subc.cc.u64 %3, %8, %12;\n\t"
+        "subc.u64    %4, 0, 0;\n\t"
+        : "=l"(s0),"=l"(s1),"=l"(s2),"=l"(s3),"=l"(borrow)
+        : "l"(r0),"l"(r1),"l"(r2),"l"(r3),
+          "l"(SECP256K1_P0),"l"(SECP256K1_P1),"l"(SECP256K1_P2),"l"(SECP256K1_P3)
+    );
+    // Keep the subtracted value when r >= P, otherwise keep r unchanged.
+    if (borrow == 0) {
+        r->limbs[0]=s0; r->limbs[1]=s1; r->limbs[2]=s2; r->limbs[3]=s3;
+    } else {
+        r->limbs[0]=r0; r->limbs[1]=r1; r->limbs[2]=r2; r->limbs[3]=r3;
+    }
+}
+
+#endif // __NV_CL_C_VERSION
+
 // =============================================================================
 // Field Reduction: r = a mod p
 // Uses the fact that p = 2^256 - K where K = 0x1000003D1
@ -151,32 +451,56 @@ inline void field_reduce(FieldElement* r, const ulong* a8) {
 // =============================================================================

 inline void field_add_impl(FieldElement* r, const FieldElement* a, const FieldElement* b) {
+#ifdef __NV_CL_C_VERSION
+    // Level 2: native add.cc/addc carry chains (no comparison-based carry)
+    ulong s0, s1, s2, s3, carry;
+    // Operand map for the asm below: %0-%3 = sum limbs, %4 = carry-out of the
+    // top limb (0 or 1); %5-%8 = limbs of a, %9-%12 = limbs of b.
+    __asm volatile(
+        "add.cc.u64  %0, %5, %9;\n\t"
+        "addc.cc.u64 %1, %6, %10;\n\t"
+        "addc.cc.u64 %2, %7, %11;\n\t"
+        "addc.cc.u64 %3, %8, %12;\n\t"
+        "addc.u64    %4, 0, 0;\n\t"
+        : "=l"(s0),"=l"(s1),"=l"(s2),"=l"(s3),"=l"(carry)
+        : "l"(a->limbs[0]),"l"(a->limbs[1]),"l"(a->limbs[2]),"l"(a->limbs[3]),
+          "l"(b->limbs[0]),"l"(b->limbs[1]),"l"(b->limbs[2]),"l"(b->limbs[3])
+    );
+    ulong d0, d1, d2, d3, borrow;
+    // Trial subtraction d = s - P. %4 (borrow) comes from "subc.u64 %4, 0, 0":
+    // it is 0 when no borrow occurred and all-ones when s < P.
+    __asm volatile(
+        "sub.cc.u64  %0, %5, %9;\n\t"
+        "subc.cc.u64 %1, %6, %10;\n\t"
+        "subc.cc.u64 %2, %7, %11;\n\t"
+        "subc.cc.u64 %3, %8, %12;\n\t"
+        "subc.u64    %4, 0, 0;\n\t"
+        : "=l"(d0),"=l"(d1),"=l"(d2),"=l"(d3),"=l"(borrow)
+        : "l"(s0),"l"(s1),"l"(s2),"l"(s3),
+          "l"(SECP256K1_P0),"l"(SECP256K1_P1),"l"(SECP256K1_P2),"l"(SECP256K1_P3)
+    );
+    // use diff if: no borrow (s >= P) OR carry from add (sum overflowed 2^256)
+    // ~borrow is all-ones when there was no borrow; (0UL - carry) is all-ones
+    // when carry == 1, so mask is either 0 or all-ones.
+    ulong mask = ~borrow | (0UL - carry);
+    r->limbs[0] = (d0 & mask) | (s0 & ~mask);
+    r->limbs[1] = (d1 & mask) | (s1 & ~mask);
+    r->limbs[2] = (d2 & mask) | (s2 & ~mask);
+    r->limbs[3] = (d3 & mask) | (s3 & ~mask);
+#else
+    // Portable path: the same add / trial-subtract / masked-select sequence,
+    // built from the add_with_carry() and sub_with_borrow() helpers.
    ulong carry = 0;
    ulong sum[4];
-
-    // Add with carry chain
    sum[0] = add_with_carry(a->limbs[0], b->limbs[0], 0, &carry);
    sum[1] = add_with_carry(a->limbs[1], b->limbs[1], carry, &carry);
    sum[2] = add_with_carry(a->limbs[2], b->limbs[2], carry, &carry);
    sum[3] = add_with_carry(a->limbs[3], b->limbs[3], carry, &carry);
-
-    // Reduce: if carry or sum >= p, subtract p
    ulong borrow = 0;
    ulong diff[4];
-
    diff[0] = sub_with_borrow(sum[0], SECP256K1_P0, 0, &borrow);
    diff[1] = sub_with_borrow(sum[1], SECP256K1_P1, borrow, &borrow);
    diff[2] = sub_with_borrow(sum[2], SECP256K1_P2, borrow, &borrow);
    diff[3] = sub_with_borrow(sum[3], SECP256K1_P3, borrow, &borrow);
-
-    // If carry from addition or no borrow from subtraction, use diff
    ulong use_diff = (carry != 0) | (borrow == 0);
    ulong mask = use_diff ? ~0UL : 0UL;
-
    r->limbs[0] = (diff[0] & mask) | (sum[0] & ~mask);
    r->limbs[1] = (diff[1] & mask) | (sum[1] & ~mask);
    r->limbs[2] = (diff[2] & mask) | (sum[2] & ~mask);
    r->limbs[3] = (diff[3] & mask) | (sum[3] & ~mask);
+#endif
 }

 // =============================================================================
@ -184,29 +508,51 @@ inline void field_add_impl(FieldElement* r, const FieldElement* a, const FieldEl
 // =============================================================================

 inline void field_sub_impl(FieldElement* r, const FieldElement* a, const FieldElement* b) {
+#ifdef __NV_CL_C_VERSION
+    // Level 2: native sub.cc/subc + add.cc/addc carry chains
+    ulong d0, d1, d2, d3, borrow;
+    // Operand map for the asm below: %0-%3 = difference limbs, %4 = borrow;
+    // %5-%8 = limbs of a, %9-%12 = limbs of b.
+    __asm volatile(
+        "sub.cc.u64  %0, %5, %9;\n\t"
+        "subc.cc.u64 %1, %6, %10;\n\t"
+        "subc.cc.u64 %2, %7, %11;\n\t"
+        "subc.cc.u64 %3, %8, %12;\n\t"
+        "subc.u64    %4, 0, 0;\n\t"
+        : "=l"(d0),"=l"(d1),"=l"(d2),"=l"(d3),"=l"(borrow)
+        : "l"(a->limbs[0]),"l"(a->limbs[1]),"l"(a->limbs[2]),"l"(a->limbs[3]),
+          "l"(b->limbs[0]),"l"(b->limbs[1]),"l"(b->limbs[2]),"l"(b->limbs[3])
+    );
+    // borrow = 0xFFFF...FFFF if a < b (underflow), 0 otherwise
+    // p0..p3 are P masked by borrow, so they are all zero when no underflow occurred.
+    ulong p0 = SECP256K1_P0 & borrow;
+    ulong p1 = SECP256K1_P1 & borrow;
+    ulong p2 = SECP256K1_P2 & borrow;
+    ulong p3 = SECP256K1_P3 & borrow;
+    // Add the masked P back and write straight into r. The last instruction is
+    // addc without .cc, so the carry out of the top limb is discarded.
+    __asm volatile(
+        "add.cc.u64  %0, %4, %8;\n\t"
+        "addc.cc.u64 %1, %5, %9;\n\t"
+        "addc.cc.u64 %2, %6, %10;\n\t"
+        "addc.u64    %3, %7, %11;\n\t"
+        : "=l"(r->limbs[0]),"=l"(r->limbs[1]),"=l"(r->limbs[2]),"=l"(r->limbs[3])
+        : "l"(d0),"l"(d1),"l"(d2),"l"(d3), "l"(p0),"l"(p1),"l"(p2),"l"(p3)
+    );
+#else
+    // Portable path: subtract with sub_with_borrow(), then add P masked by the
+    // final borrow using add_with_carry(); the final carry is not used.
    ulong borrow = 0;
    ulong diff[4];
-
-    // Subtract with borrow chain
    diff[0] = sub_with_borrow(a->limbs[0], b->limbs[0], 0, &borrow);
    diff[1] = sub_with_borrow(a->limbs[1], b->limbs[1], borrow, &borrow);
    diff[2] = sub_with_borrow(a->limbs[2], b->limbs[2], borrow, &borrow);
    diff[3] = sub_with_borrow(a->limbs[3], b->limbs[3], borrow, &borrow);
-
-    // If borrow, add p (result was negative)
    ulong mask = borrow ? ~0UL : 0UL;
-
    ulong carry = 0;
    ulong adj[4];
    adj[0] = add_with_carry(diff[0], SECP256K1_P0 & mask, 0, &carry);
    adj[1] = add_with_carry(diff[1], SECP256K1_P1 & mask, carry, &carry);
    adj[2] = add_with_carry(diff[2], SECP256K1_P2 & mask, carry, &carry);
    adj[3] = add_with_carry(diff[3], SECP256K1_P3 & mask, carry, &carry);
-
    r->limbs[0] = adj[0];
    r->limbs[1] = adj[1];
    r->limbs[2] = adj[2];
    r->limbs[3] = adj[3];
+#endif
 }

 // =============================================================================
@ -228,6 +574,12 @@ inline void muladd2(ulong lo, ulong hi, ulong* c0, ulong* c1, ulong* c2) {
 }

 inline void field_mul_impl(FieldElement* r, const FieldElement* a, const FieldElement* b) {
+#ifdef __NV_CL_C_VERSION
+    // Level 3: 32-bit hybrid Comba + 32-bit reduction (INT32 throughput 32x > INT64)
+    uint t32[16];
+    mul_256_comba32_ocl(a, b, t32);
+    reduce_512_to_256_32_ocl(t32, r);
+#else
    ulong a0 = a->limbs[0], a1 = a->limbs[1], a2 = a->limbs[2], a3 = a->limbs[3];
    ulong b0 = b->limbs[0], b1 = b->limbs[1], b2 = b->limbs[2], b3 = b->limbs[3];
    ulong product[8];
@ -274,11 +626,11 @@ inline void field_mul_impl(FieldElement* r, const FieldElement* a, const FieldEl
    product[7] = c1;

    field_reduce(r, product);
+#endif // __NV_CL_C_VERSION
 }

 // =============================================================================
 // Field Squaring: r = a² mod p
-// Optimized: only need upper triangle of multiplication
 // =============================================================================

 // Forward declaration for field_sqr_n_impl
@ -293,6 +645,12 @@ inline void field_sqr_n_impl(FieldElement* r, int n) {
 }

 inline void field_sqr_impl(FieldElement* r, const FieldElement* a) {
+#ifdef __NV_CL_C_VERSION
+    // Level 3: 32-bit hybrid squaring (40% fewer multiplications + INT32 throughput)
+    uint t32[16];
+    sqr_256_comba32_ocl(a, t32);
+    reduce_512_to_256_32_ocl(t32, r);
+#else
    ulong a0 = a->limbs[0], a1 = a->limbs[1], a2 = a->limbs[2], a3 = a->limbs[3];
    ulong product[8];
    ulong c0, c1, c2;
@ -332,6 +690,7 @@ inline void field_sqr_impl(FieldElement* r, const FieldElement* a) {
    product[7] = c1;

    field_reduce(r, product);
+#endif // __NV_CL_C_VERSION
 }

 // =============================================================================
--- a/opencl/kernels/secp256k1_point.cl
+++ b/opencl/kernels/secp256k1_point.cl
@ -417,7 +417,7 @@ inline void scalar_add_u64(Scalar* a, ulong val, Scalar* r) {

 // Convert scalar to wNAF representation (window width 5)
 // Returns length of wNAF representation
-inline int scalar_to_wnaf(const Scalar* k, int wnaf[260]) {
+static inline int scalar_to_wnaf(const Scalar* k, int wnaf[260]) {
    Scalar temp = *k;
    int len = 0;
    const int window_size = 32;   // 2^5
--- a/opencl/src/opencl_audit_runner.cpp
+++ b/opencl/src/opencl_audit_runner.cpp
@ -768,6 +768,427 @@ static int audit_perf_schnorr_stress() {
    return 0;
 }

+// =============================================================================
+// Section 9: BIP-352 Silent Payments & GLV Correctness
+// =============================================================================
+
+// Helper: expand kernel file with #include directives resolved (like bench_bip352_opencl.cpp).
+//
+// Loads `path` with load_file() and returns its text, where every line of the form
+//     #include "name"
+// (leading spaces/tabs allowed) is replaced by the recursively expanded contents of
+// "<directory of path>/name". All other lines are copied through followed by '\n'.
+//
+// `seen` records every path already expanded; a path listed there expands to an
+// empty string, which gives include-once behavior and stops include cycles.
+// Returns an empty string when the file is already in `seen` or load_file()
+// yields no content.
+static std::string bip352_expand_kernel(const std::string& path,
+                                        std::vector<std::string>& seen) {
+    if (std::find(seen.begin(), seen.end(), path) != seen.end()) return {};
+    seen.push_back(path);
+    std::string src = load_file(path);
+    if (src.empty()) return {};
+
+    // Directory part of `path`. find_last_of() returns npos when the path has no
+    // separator; substr(0, npos) would then return the whole file name, so that
+    // case must map to the current directory explicitly.
+    const size_t slash = path.find_last_of("/\\");
+    std::string dir = (slash == std::string::npos) ? std::string(".")
+                                                   : path.substr(0, slash);
+    if (dir.empty()) dir = ".";
+
+    std::istringstream in(src);
+    std::ostringstream out;
+    std::string line;
+    while (std::getline(in, line)) {
+        size_t s = line.find_first_not_of(" \t");
+        std::string trimmed = (s != std::string::npos) ? line.substr(s) : line;
+        if (trimmed.rfind("#include \"", 0) == 0) {
+            const size_t q1 = trimmed.find('"') + 1;
+            const size_t q2 = trimmed.find('"', q1);
+            if (q2 == std::string::npos) {
+                // Unterminated include: keep the line as-is so the OpenCL compiler
+                // reports it, instead of silently dropping it.
+                out << line << '\n';
+                continue;
+            }
+            std::string child = dir + "/" + trimmed.substr(q1, q2 - q1);
+            out << bip352_expand_kernel(child, seen);
+        } else {
+            out << line << '\n';
+        }
+    }
+    return out.str();
+}
+
+// Host wNAF encoder: mirrors the GPU scalar_to_wnaf fixed-130-step version.
+// Input is a 128-bit scalar (s0 = low 64 bits, s1 = high 64 bits); output is
+// 130 signed digits, each either 0 or an odd value in [-15, 15].
+// The scalar is held in four 64-bit limbs so that the +/- digit adjustment can
+// carry or borrow past bit 127 before the per-step right shift.
+static void audit_host_wnaf(uint64_t s0, uint64_t s1, int8_t wnaf[130]) {
+    uint64_t limb[4] = {s0, s1, 0, 0};
+    for (int pos = 0; pos < 130; ++pos) {
+        int digit = 0;
+        if ((limb[0] & 1ULL) != 0) {
+            digit = (int)(limb[0] & 0x1FULL);
+            const uint64_t before = limb[0];
+            if (digit < 16) {
+                // Positive digit: subtract it, propagating a borrow upward if needed.
+                limb[0] = before - (uint64_t)digit;
+                if (limb[0] > before) {
+                    for (int j = 1; j < 4; ++j) {
+                        if (limb[j]-- != 0) break;
+                    }
+                }
+            } else {
+                // Upper half of the window: use the negative digit and add |digit|,
+                // propagating a carry upward if needed.
+                digit -= 32;
+                limb[0] = before + (uint64_t)(-digit);
+                if (limb[0] < before) {
+                    for (int j = 1; j < 4; ++j) {
+                        if (++limb[j] != 0) break;
+                    }
+                }
+            }
+        }
+        wnaf[pos] = (int8_t)digit;
+
+        // Shift the 256-bit value right by one bit.
+        for (int j = 0; j < 3; ++j) {
+            limb[j] = (limb[j] >> 1) | (limb[j + 1] << 63);
+        }
+        limb[3] >>= 1;
+    }
+}
+
+// Test 1: CPU wNAF round-trip — encode scalar, decode digits back, verify match.
+// Exercises audit_host_wnaf(); host-side wNAF precomputation was the key change
+// that fixed the -36 crash.
+// Returns 0 when every case reconstructs to its input, 1 on the first mismatch.
+static int audit_glv_wnaf_roundtrip() {
+    struct TC { uint64_t s0, s1; const char* label; };
+    static const TC cases[] = {
+        {1,  0, "k=1"},
+        {2,  0, "k=2"},
+        {15, 0, "k=15 (max single wNAF digit)"},
+        {16, 0, "k=16 (two-digit boundary)"},
+        {31, 0, "k=31 (wNAF carry: 32-1)"},
+        {0x5555555555555555ULL, 0x5555555555555555ULL, "k=0x5555... (alternating bits)"},
+        {0xFFFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL, "k near 2^127"},
+        // k1 half of SCAN_KEY GLV decomposition (lower 128 bits of full key)
+        {0x38af4ad300da1a42ULL, 0x30d7d6a3b98294b1ULL, "k1 from SCAN_KEY GLV half"},
+    };
+
+    for (auto& tc : cases) {
+        int8_t wnaf[130] = {};
+        audit_host_wnaf(tc.s0, tc.s1, wnaf);
+
+        // Reconstruct sum(wnaf[i] * 2^i) modulo 2^128 in two 64-bit limbs.
+        // Plain uint64_t arithmetic is used instead of __uint128_t so this file
+        // also builds with compilers that lack that extension (e.g. MSVC).
+        uint64_t r0 = 0, r1 = 0;
+        for (int i = 0; i < 128; i++) {   // 2^i is 0 modulo 2^128 for i >= 128
+            const int d = wnaf[i];
+            if (d == 0) continue;
+            const uint64_t mag = (uint64_t)(d < 0 ? -d : d);
+
+            // term = mag << i, split into low/high 64-bit halves.
+            uint64_t t_lo, t_hi;
+            if (i == 0)      { t_lo = mag;      t_hi = 0; }
+            else if (i < 64) { t_lo = mag << i; t_hi = mag >> (64 - i); }
+            else             { t_lo = 0;        t_hi = mag << (i - 64); }
+
+            const uint64_t prev = r0;
+            if (d > 0) {
+                r0 += t_lo;
+                r1 += t_hi + ((r0 < prev) ? 1ULL : 0ULL);   // carry out of low limb
+            } else {
+                r0 -= t_lo;
+                r1 -= t_hi + ((r0 > prev) ? 1ULL : 0ULL);   // borrow out of low limb
+            }
+        }
+
+        if (r0 != tc.s0 || r1 != tc.s1) {
+            std::fprintf(stderr, "  [FAIL] wNAF roundtrip for %s: "
+                "expected (%016llx,%016llx) got (%016llx,%016llx)\n",
+                tc.label,
+                (unsigned long long)tc.s0, (unsigned long long)tc.s1,
+                (unsigned long long)r0,    (unsigned long long)r1);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Test 2: GLV large scalar consistency via OpenCL library.
+// For each scalar k below, checks that k*G + G and (k+1)*G agree in affine
+// coordinates. The scalars are chosen to stress the GLV path:
+//   - SCAN_KEY (256-bit random key, both GLV halves active)
+//   - 2^128 (decomposition boundary)
+//   - 0x5555... (alternating bit pattern, maximally stresses wNAF carry logic)
+// Returns 0 when all cases agree, 1 on the first mismatch.
+static int audit_glv_large_scalar() {
+    // Big-endian hex string -> Scalar with little-endian 64-bit limbs.
+    // Short inputs are left-padded with '0'; non-hex characters count as 0.
+    auto from_hex = [](const char* hex) -> Scalar {
+        std::string digits(hex);
+        if (digits.size() < 64) digits.insert(0, 64 - digits.size(), '0');
+
+        auto nibble = [](char c) -> uint64_t {
+            if (c >= '0' && c <= '9') return (uint64_t)(c - '0');
+            if (c >= 'a' && c <= 'f') return (uint64_t)(c - 'a' + 10);
+            if (c >= 'A' && c <= 'F') return (uint64_t)(c - 'A' + 10);
+            return 0;
+        };
+
+        Scalar out{};
+        for (int limb = 0; limb < 4; ++limb) {
+            const size_t base = (size_t)(3 - limb) * 16;   // limb 0 = last 16 hex digits
+            uint64_t acc = 0;
+            for (size_t j = 0; j < 16; ++j) {
+                acc = (acc << 4) | nibble(digits[base + j]);
+            }
+            out.limbs[limb] = acc;
+        }
+        return out;
+    };
+
+    struct Pair { Scalar k, kp1; const char* label; };
+
+    Scalar scan_k    = from_hex("c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42");
+    Scalar scan_k1   = from_hex("c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a43");
+    Scalar pow128    = {{0UL, 0UL, 1UL, 0UL}};
+    Scalar pow128_1  = {{1UL, 0UL, 1UL, 0UL}};
+    Scalar altbits   = {{0x5555555555555555ULL, 0x5555555555555555ULL,
+                          0x5555555555555555ULL, 0x5555555555555555ULL}};
+    Scalar altbits_1 = {{0x5555555555555556ULL, 0x5555555555555555ULL,
+                          0x5555555555555555ULL, 0x5555555555555555ULL}};
+
+    Pair pairs[] = {
+        {scan_k,  scan_k1,   "SCAN_KEY (256-bit)"},
+        {pow128,  pow128_1,  "k = 2^128 (GLV boundary)"},
+        {altbits, altbits_1, "k = 0x5555... (alternating bits)"},
+    };
+    const size_t pair_count = sizeof(pairs) / sizeof(pairs[0]);
+
+    Scalar unit = sc_from_u64(1);
+    JacobianPoint gen = g_ctx->scalar_mul_generator(unit);
+
+    for (size_t idx = 0; idx < pair_count; ++idx) {
+        const Pair& p = pairs[idx];
+        JacobianPoint k_times_g = g_ctx->scalar_mul_generator(p.k);
+        JacobianPoint via_add   = g_ctx->point_add(k_times_g, gen);        // k*G + G
+        JacobianPoint via_mul   = g_ctx->scalar_mul_generator(p.kp1);      // (k+1)*G
+
+        AffinePoint lhs = jacobian_to_affine(via_add);
+        AffinePoint rhs = jacobian_to_affine(via_mul);
+        const bool same = fe_eq(lhs.x, rhs.x) && fe_eq(lhs.y, rhs.y);
+        if (!same) {
+            std::fprintf(stderr, "  [FAIL] GLV %s: k*G+G != (k+1)*G\n", p.label);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Struct layout matching BIP352ScanKeyGlv in secp256k1_bip352.cl.
+// Used by the BIP-352 kernel audit tests below.
+// Every member is one byte wide, so the offsets noted per field follow directly
+// from declaration order; the static_assert pins the total size at 264 bytes.
+struct alignas(1) AuditBIP352ScanKeyGlv {
+    int8_t  wnaf1[130]; // +0:   wNAF digits for k1 half-scalar
+    int8_t  wnaf2[130]; // +130: wNAF digits for k2 half-scalar
+    uint8_t k1_neg;     // +260: 1 if k1 negative
+    uint8_t flip_phi;   // +261: 1 if phi table y should be negated
+    uint8_t pad0, pad1; // +262-263: padding
+};
+static_assert(sizeof(AuditBIP352ScanKeyGlv) == 264, "BIP352ScanKeyGlv size mismatch");
+
+// Kernel-side AffinePoint and FieldElement layout (must match .cl struct).
+// Each field element is four 64-bit limbs; an affine point is the pair (x, y).
+struct AuditFieldElement { uint64_t limbs[4]; };
+struct AuditAffinePoint  { AuditFieldElement x, y; };
+
+// Returns the secp256k1 generator G as an AuditAffinePoint, with each coordinate
+// stored as little-endian 64-bit limbs (the kernel's field element representation).
+static AuditAffinePoint audit_generator_point() {
+    static const uint64_t kGx[4] = {
+        0x59F2815B16F81798ULL, 0x029BFCDB2DCE28D9ULL,
+        0x55A06295CE870B07ULL, 0x79BE667EF9DCBBACULL,
+    };
+    static const uint64_t kGy[4] = {
+        0x9C47D08FFB10D4B8ULL, 0xFD17B448A6855419ULL,
+        0x5DA4FBFC0E1108A8ULL, 0x483ADA7726A3C465ULL,
+    };
+
+    AuditAffinePoint gen;
+    for (int i = 0; i < 4; ++i) {
+        gen.x.limbs[i] = kGx[i];
+        gen.y.limbs[i] = kGy[i];
+    }
+    return gen;
+}
+
+// Test 3: BIP-352 kernel compiles without error.
+// Expands secp256k1_bip352.cl from g_kernel_dir, builds it for the device behind
+// the audit context's command queue, and checks that both kernel entry points
+// can be created.
+// Returns 0 on success, -1 if the kernel directory or source is unavailable,
+// 1 if program creation fails, 2 if the build fails (build log goes to stderr),
+// 3 / 4 if bip352_pipeline_kernel / bip352_pipeline_kernel_lut is missing.
+static int audit_bip352_kernel_build() {
+    if (g_kernel_dir.empty()) return -1;
+    std::vector<std::string> visited;
+    std::string source = bip352_expand_kernel(g_kernel_dir + "/secp256k1_bip352.cl", visited);
+    if (source.empty()) return -1;
+
+    cl_context context     = (cl_context)g_ctx->native_context();
+    cl_command_queue queue = (cl_command_queue)g_ctx->native_queue();
+    cl_device_id device    = nullptr;
+    clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(device), &device, nullptr);
+
+    cl_int status;
+    const char* text = source.c_str();
+    size_t text_len  = source.size();
+    cl_program program = clCreateProgramWithSource(context, 1, &text, &text_len, &status);
+    if (status != CL_SUCCESS) return 1;
+
+    status = clBuildProgram(program, 1, &device, "-cl-std=CL1.2", nullptr, nullptr);
+    if (status != CL_SUCCESS) {
+        // Two-step query: first the log size, then the log text itself.
+        size_t log_bytes = 0;
+        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_bytes);
+        std::string build_log(log_bytes, '\0');
+        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_bytes, build_log.data(), nullptr);
+        std::fprintf(stderr, "  BIP-352 build log:\n%s\n", build_log.c_str());
+        clReleaseProgram(program);
+        return 2;
+    }
+
+    // Both entry points must be present in the built program.
+    cl_kernel plain = clCreateKernel(program, "bip352_pipeline_kernel", &status);
+    if (status != CL_SUCCESS) {
+        clReleaseProgram(program);
+        return 3;
+    }
+    cl_kernel with_lut = clCreateKernel(program, "bip352_pipeline_kernel_lut", &status);
+    if (status != CL_SUCCESS) {
+        clReleaseKernel(plain);
+        clReleaseProgram(program);
+        return 4;
+    }
+
+    clReleaseKernel(plain);
+    clReleaseKernel(with_lut);
+    clReleaseProgram(program);
+    return 0;
+}
+
+// Test 4: Regression for CL_INVALID_COMMAND_QUEUE (-36) GPU fault.
+// Runs bip352_pipeline_kernel (no-LUT path) with 1 work item and verifies no crash.
+// The crash was caused by GPU private-memory overflow from int wnaf[130]×2 arrays.
+// Fix: precompute wNAF on CPU (BIP352ScanKeyGlv.wnaf1/wnaf2), read from __constant.
+// Three hand-built scan plans are exercised; only digit 0 of each wNAF is set:
+//   k1=1,k2=0 (minimal), k1=15,k2=1 (largest digit), k1=k2=1 with k1_neg/flip_phi set.
+//
+// Returns 0 on success and -1 when the kernel source is unavailable. Failures
+// are always positive so they cannot be confused with the -1 "unavailable" code:
+//   1 program creation, 2 build, 3 kernel creation, 4 buffer allocation,
+//   5 clSetKernelArg, 10 kernel enqueue, 11 upload, 12 readback,
+//   20+|err| clFinish failure (56 for the -36 regression).
+static int audit_bip352_no_crash() {
+    if (g_kernel_dir.empty()) return -1;
+    std::vector<std::string> seen;
+    std::string src = bip352_expand_kernel(g_kernel_dir + "/secp256k1_bip352.cl", seen);
+    if (src.empty()) return -1;
+
+    cl_context cl_ctx = (cl_context)g_ctx->native_context();
+    cl_command_queue cl_q = (cl_command_queue)g_ctx->native_queue();
+    cl_device_id cl_dev = nullptr;
+    clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr);
+
+    cl_int err;
+    const char* src_ptr = src.c_str();
+    size_t src_len = src.size();
+    cl_program prog = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
+    if (err != CL_SUCCESS) return 1;
+
+    err = clBuildProgram(prog, 1, &cl_dev, "-cl-std=CL1.2 -cl-fast-relaxed-math", nullptr, nullptr);
+    if (err != CL_SUCCESS) { clReleaseProgram(prog); return 2; }
+
+    cl_kernel kernel = clCreateKernel(prog, "bip352_pipeline_kernel", &err);
+    if (err != CL_SUCCESS) { clReleaseProgram(prog); return 3; }
+
+    // Edge case scan keys to test. k1_neg/flip_phi chosen to exercise both paths.
+    struct EdgeCase {
+        const char* label;
+        int8_t      wnaf1_0; // wnaf1[0] digit (rest 0)
+        int8_t      wnaf2_0; // wnaf2[0] digit (rest 0)
+        uint8_t     k1_neg, flip_phi;
+    };
+    static const EdgeCase edges[] = {
+        {"k=1 (minimal scalar)",    1, 0, 0, 0},
+        {"k1=15,k2=1 (max digit)",  15, 1, 0, 0},
+        {"k1_neg=1, flip_phi=1",    1, 1, 1, 1},  // negate path
+    };
+
+    AuditAffinePoint g_pt    = audit_generator_point();
+    AuditAffinePoint spend_pt = g_pt; // spend = G for simplicity
+
+    int result = 0;
+
+    // Pre-allocate buffers (reused across edge cases). Each creation gets its own
+    // status so that a failure is not overwritten by a later successful call.
+    cl_int e_tweaks = CL_SUCCESS, e_spend = CL_SUCCESS, e_scan = CL_SUCCESS, e_prefixes = CL_SUCCESS;
+    cl_mem d_tweaks  = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY, sizeof(AuditAffinePoint), nullptr, &e_tweaks);
+    cl_mem d_spend   = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                      sizeof(AuditAffinePoint), &spend_pt, &e_spend);
+    cl_mem d_scan    = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY, sizeof(AuditBIP352ScanKeyGlv), nullptr, &e_scan);
+    cl_mem d_prefixes = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY, sizeof(uint64_t), nullptr, &e_prefixes);
+    if (e_tweaks != CL_SUCCESS || e_spend != CL_SUCCESS ||
+        e_scan != CL_SUCCESS || e_prefixes != CL_SUCCESS) {
+        result = 4;
+    }
+
+    if (result == 0) {
+        cl_uint count = 1;
+        const bool args_ok =
+            clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_tweaks)   == CL_SUCCESS &&
+            clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_scan)     == CL_SUCCESS &&
+            clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_spend)    == CL_SUCCESS &&
+            clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_prefixes) == CL_SUCCESS &&
+            clSetKernelArg(kernel, 4, sizeof(cl_uint), &count)     == CL_SUCCESS;
+        if (!args_ok) result = 5;
+    }
+
+    const size_t edge_count = sizeof(edges) / sizeof(edges[0]);
+    for (size_t i = 0; result == 0 && i < edge_count; ++i) {
+        const EdgeCase& ec = edges[i];
+
+        // Build scan plan
+        AuditBIP352ScanKeyGlv plan{};
+        plan.wnaf1[0]  = ec.wnaf1_0;
+        plan.wnaf2[0]  = ec.wnaf2_0;
+        plan.k1_neg    = ec.k1_neg;
+        plan.flip_phi  = ec.flip_phi;
+
+        // Upload tweak=G and scan plan
+        err = clEnqueueWriteBuffer(cl_q, d_tweaks, CL_TRUE, 0, sizeof(AuditAffinePoint), &g_pt, 0, nullptr, nullptr);
+        if (err == CL_SUCCESS) {
+            err = clEnqueueWriteBuffer(cl_q, d_scan, CL_TRUE, 0, sizeof(AuditBIP352ScanKeyGlv), &plan, 0, nullptr, nullptr);
+        }
+        if (err != CL_SUCCESS) { result = 11; break; }
+
+        size_t global = 1, local = 1;
+        err = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr);
+        if (err != CL_SUCCESS) { result = 10; break; }
+        err = clFinish(cl_q);
+        if (err != CL_SUCCESS) {
+            // -36 = CL_INVALID_COMMAND_QUEUE = GPU fault (regression for the private-stack overflow crash)
+            std::fprintf(stderr, "  [FAIL] bip352_no_crash edge='%s' clFinish error=%d"
+                " (expected 0; -36 = GPU fault regression)\n", ec.label, err);
+            // OpenCL error codes are negative; fold in the magnitude so the
+            // result stays positive.
+            result = 20 + (err < 0 ? -err : err);
+            break;
+        }
+
+        uint64_t prefix = 0;
+        err = clEnqueueReadBuffer(cl_q, d_prefixes, CL_TRUE, 0, sizeof(uint64_t), &prefix, 0, nullptr, nullptr);
+        if (err != CL_SUCCESS) { result = 12; break; }
+        // prefix may be 0 if the point is infinity (edge case k1=0 path) — that's valid.
+        // What we really test is that we reach here without crashing.
+    }
+
+    // Single cleanup path; a buffer whose creation failed is null and is skipped.
+    if (d_tweaks)   clReleaseMemObject(d_tweaks);
+    if (d_scan)     clReleaseMemObject(d_scan);
+    if (d_spend)    clReleaseMemObject(d_spend);
+    if (d_prefixes) clReleaseMemObject(d_prefixes);
+    clReleaseKernel(kernel);
+    clReleaseProgram(prog);
+    return result;
+}
+
+// Test 5: BIP-352 pipeline determinism and non-zero output.
+// Runs bip352_pipeline_kernel on two identical work items (tweak = G, spend = G,
+// scan plan with wnaf1[0] = 1 and everything else zero) and checks that
+//   (a) both items produce the same prefix, and
+//   (b) the prefix is non-zero.
+// This is NOT a known-answer test. An independent correctness check lives in
+// bench_bip352_opencl (--batch 1 --local 1), whose validation prefix
+// 0xb63b4601066a6971 is the last-item prefix for batch=10000.
+//
+// Returns 0 on success and -1 when the kernel source is unavailable. Failures:
+//   1 program creation, 2 build, 3 kernel creation, 4 enqueue, 5 clFinish,
+//   6 non-deterministic output, 7 zero prefix, 8 buffer allocation,
+//   9 clSetKernelArg, 10 readback.
+static int audit_bip352_correct() {
+    if (g_kernel_dir.empty()) return -1;
+    std::vector<std::string> seen;
+    std::string src = bip352_expand_kernel(g_kernel_dir + "/secp256k1_bip352.cl", seen);
+    if (src.empty()) return -1;
+
+    cl_context cl_ctx = (cl_context)g_ctx->native_context();
+    cl_command_queue cl_q = (cl_command_queue)g_ctx->native_queue();
+    cl_device_id cl_dev = nullptr;
+    clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr);
+
+    cl_int err;
+    const char* src_ptr = src.c_str();
+    size_t src_len = src.size();
+    cl_program prog = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
+    if (err != CL_SUCCESS) return 1;
+    err = clBuildProgram(prog, 1, &cl_dev, "-cl-std=CL1.2 -cl-fast-relaxed-math", nullptr, nullptr);
+    if (err != CL_SUCCESS) { clReleaseProgram(prog); return 2; }
+    cl_kernel kernel = clCreateKernel(prog, "bip352_pipeline_kernel", &err);
+    if (err != CL_SUCCESS) { clReleaseProgram(prog); return 3; }
+
+    // Reference values kept for a future known-answer test (not used here):
+    // SCAN_KEY = c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42
+    // GLV decomposition (pre-computed, matches bench_bip352_opencl):
+    //   k1 (LE64): {0x5db6fc2bc78a0e07, 0x7fff7d82be8fb40f, 0, 0}  k1_neg=0
+    //   k2 (LE64): {0x62491d65b0efea74, 0x3ca3a038cb4bac36, 0, 0}  flip_phi=0
+    // Hard-coding that decomposition would require CPU GLV logic, so this audit
+    // uses the simplest plan instead: k1=1, k2=0.
+    AuditBIP352ScanKeyGlv plan{};
+    plan.wnaf1[0] = 1;
+    plan.k1_neg = 0;
+    plan.flip_phi = 0;
+
+    AuditAffinePoint g_pt     = audit_generator_point();
+    AuditAffinePoint spend_pt = g_pt;
+    AuditAffinePoint tweaks[2] = {g_pt, g_pt}; // same tweak twice
+
+    int result = 0;
+
+    // Each creation gets its own status so a failure is not overwritten by a
+    // later successful call.
+    cl_int e_tweaks = CL_SUCCESS, e_scan = CL_SUCCESS, e_spend = CL_SUCCESS, e_prefixes = CL_SUCCESS;
+    cl_mem d_tweaks   = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                       2 * sizeof(AuditAffinePoint), tweaks, &e_tweaks);
+    cl_mem d_scan     = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                       sizeof(AuditBIP352ScanKeyGlv), &plan, &e_scan);
+    cl_mem d_spend    = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                       sizeof(AuditAffinePoint), &spend_pt, &e_spend);
+    cl_mem d_prefixes = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY, 2 * sizeof(uint64_t), nullptr, &e_prefixes);
+    if (e_tweaks != CL_SUCCESS || e_scan != CL_SUCCESS ||
+        e_spend != CL_SUCCESS || e_prefixes != CL_SUCCESS) {
+        result = 8;
+    }
+
+    if (result == 0) {
+        cl_uint count = 2;
+        const bool args_ok =
+            clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_tweaks)   == CL_SUCCESS &&
+            clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_scan)     == CL_SUCCESS &&
+            clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_spend)    == CL_SUCCESS &&
+            clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_prefixes) == CL_SUCCESS &&
+            clSetKernelArg(kernel, 4, sizeof(cl_uint), &count)     == CL_SUCCESS;
+        if (!args_ok) result = 9;
+    }
+
+    if (result == 0) {
+        size_t global = 2, local = 1;
+        err = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr);
+        if (err != CL_SUCCESS) result = 4;
+    }
+
+    if (result == 0) {
+        err = clFinish(cl_q);
+        if (err != CL_SUCCESS) {
+            std::fprintf(stderr, "  [FAIL] bip352_correct: clFinish error=%d\n", err);
+            result = 5;
+        }
+    }
+
+    if (result == 0) {
+        uint64_t prefixes[2] = {};
+        err = clEnqueueReadBuffer(cl_q, d_prefixes, CL_TRUE, 0, 2 * sizeof(uint64_t), prefixes, 0, nullptr, nullptr);
+        if (err != CL_SUCCESS) {
+            // Without this check a failed read would leave prefixes zeroed and be
+            // misreported as "unexpected infinity".
+            result = 10;
+        } else {
+            // Both items have identical input so must produce identical prefix (determinism test)
+            if (prefixes[0] != prefixes[1]) {
+                std::fprintf(stderr, "  [FAIL] bip352_correct: non-deterministic output:"
+                    " item[0]=0x%016llx item[1]=0x%016llx\n",
+                    (unsigned long long)prefixes[0], (unsigned long long)prefixes[1]);
+                result = 6;
+            }
+            // Prefix must be non-zero (1*G = G is not the point at infinity)
+            if (prefixes[0] == 0) {
+                std::fprintf(stderr, "  [FAIL] bip352_correct: prefix=0 (unexpected infinity)\n");
+                result = 7;
+            }
+        }
+    }
+
+    // Single cleanup path for every outcome; a buffer whose creation failed is
+    // null and is skipped.
+    if (d_tweaks)   clReleaseMemObject(d_tweaks);
+    if (d_scan)     clReleaseMemObject(d_scan);
+    if (d_spend)    clReleaseMemObject(d_spend);
+    if (d_prefixes) clReleaseMemObject(d_prefixes);
+    clReleaseKernel(kernel);
+    clReleaseProgram(prog);
+    return result;
+}
+
 // =============================================================================
 // Module & Section Registry
 // =============================================================================
@ -781,6 +1202,7 @@ static const OclSectionInfo OCL_SECTIONS[] = {
    { "protocol_security", "Protocol Security (multi-key)" },
    { "fuzzing",           "Fuzzing & Adversarial Inputs" },
    { "performance",       "Performance Smoke Tests" },
+    { "bip352_glv",        "BIP-352 Silent Payments & GLV Correctness" },
 };
 static constexpr int NUM_OCL_SECTIONS = sizeof(OCL_SECTIONS) / sizeof(OCL_SECTIONS[0]);

@ -827,6 +1249,13 @@ static const OclAuditModule OCL_MODULES[] = {
    // Section 8: Performance Smoke
    { "perf_ecdsa_50",     "ECDSA 50-iteration stress",                   "performance", audit_perf_ecdsa_stress, false },
    { "perf_schnorr_25",   "Schnorr 25-iteration stress",                "performance", audit_perf_schnorr_stress, false },
+
+    // Section 9: BIP-352 Silent Payments & GLV Correctness
+    { "glv_wnaf_rt",       "CPU wNAF encode/decode roundtrip (8 scalars)",     "bip352_glv", audit_glv_wnaf_roundtrip,    false },
+    { "glv_large_k",       "GLV large scalar k*G+G=(k+1)*G (3 scalars)",       "bip352_glv", audit_glv_large_scalar,      false },
+    { "bip352_build",      "BIP-352 kernel compiles (both entry points)",       "bip352_glv", audit_bip352_kernel_build,   false },
+    { "bip352_nocrash",    "BIP-352 no GPU fault: -36 crash regression (3 edge cases)", "bip352_glv", audit_bip352_no_crash, false },
+    { "bip352_correct",    "BIP-352 pipeline determinism (2 identical tweaks)", "bip352_glv", audit_bip352_correct,        false },
 };
 static constexpr int NUM_OCL_MODULES = sizeof(OCL_MODULES) / sizeof(OCL_MODULES[0]);

@ -1080,6 +1509,7 @@ int main(int argc, char* argv[]) {
    auto dev = detect_ocl_device(*g_ctx);

    // Try to init extended kernels
+    g_kernel_dir = kernel_dir; // make available to audit modules
    if (!kernel_dir.empty()) {
        g_ext.init(*g_ctx, kernel_dir);
    }
--- a/opencl/src/opencl_context.cpp
+++ b/opencl/src/opencl_context.cpp
@ -383,9 +383,9 @@ bool Context::Impl::init(const DeviceConfig& cfg) {
    return true;
 }

-// Embedded kernel source (will be generated by CMake)
-// For now, include a minimal version
-static const char* kernel_source = R"KERNEL(
+// Embedded kernel source — split into separate array entries so that
+// no single string literal exceeds MSVC's 65535-byte C2026 limit.
+static const char* const kernel_parts[] = { R"KERNEL(
 // =============================================================================
 // Secp256k1 OpenCL Kernels - Embedded Version
 // =============================================================================
@ -635,6 +635,18 @@ inline void field_sqr_n_impl(FieldElement* r, int n) {
    for (int i = 0; i < n; i++) field_sqr_impl(r, r);
 }

+// Returns 1 when all four limbs of *a are zero, else 0.
+inline int field_is_zero_impl(const FieldElement* a) {
+    ulong any_bits = a->limbs[0] | a->limbs[1];
+    any_bits |= a->limbs[2] | a->limbs[3];
+    return any_bits == 0;
+}
+
+// Sets *a to 0 (all limbs cleared).
+inline void field_set_zero_impl(FieldElement* a) {
+    a->limbs[0] = a->limbs[1] = a->limbs[2] = a->limbs[3] = 0;
+}
+
+// Sets *a to 1 (limb 0 = 1, upper limbs cleared).
+inline void field_set_one_impl(FieldElement* a) {
+    a->limbs[3] = a->limbs[2] = a->limbs[1] = 0;
+    a->limbs[0] = 1;
+}
+
 inline void field_inv_impl(FieldElement* r, const FieldElement* a) {
    FieldElement x2,x3,x6,x12,x24,x48,x96,x192,x7,x31,x223,x5,x11,x22,t;
    field_sqr_impl(&x2, a); field_mul_impl(&x2, &x2, a);
@ -687,12 +699,61 @@ __kernel void field_sqr(__global const FieldElement* a, __global FieldElement* r
 }

 __kernel void field_inv(__global const FieldElement* a, __global FieldElement* r, uint count) {
-    uint gid = get_global_id(0); if (gid >= count) return;
-    FieldElement a_local = a[gid];
-    FieldElement res; field_inv_impl(&res, &a_local); r[gid] = res;
+    // Batched inversion: r[i] = a[i]^-1 for i < count, with zero inputs mapped to zero.
+    // Each work-group shares a single field_inv_impl via Montgomery's trick:
+    // work-item 0 builds the running products in __local memory, inverts the
+    // total once, and peels the individual inverses off in reverse order.
+    #define BATCH_INV_LOCAL_MAX 256
+    __local FieldElement local_vals[BATCH_INV_LOCAL_MAX];
+    __local FieldElement local_prefix[BATCH_INV_LOCAL_MAX];
+    __local FieldElement local_invs[BATCH_INV_LOCAL_MAX];
+    __local uint local_nonzero[BATCH_INV_LOCAL_MAX];
+
+    uint gid = get_global_id(0);
+    uint lid = get_local_id(0);
+    uint lsize = get_local_size(0);
+    uint group_start = get_group_id(0) * lsize;
+    uint active = (group_start < count) ? min(lsize, count - group_start) : 0;
+    const int in_range = gid < count;
+
+    // Only work-group-uniform conditions may return before the barriers below:
+    // barrier() must be reached by every work-item of the group, so the padding
+    // work-items of the last group (gid >= count) stay alive and merely skip
+    // their global loads and stores.
+    if (active == 0) return;
+
+    // Work-group too large for the __local scratch arrays: invert individually.
+    if (lsize > BATCH_INV_LOCAL_MAX) {
+        if (in_range) {
+            FieldElement a_local = a[gid];
+            FieldElement res; field_inv_impl(&res, &a_local); r[gid] = res;
+        }
+        return;
+    }
+
+    if (in_range) {
+        FieldElement v = a[gid];
+        uint nz = field_is_zero_impl(&v) ? 0U : 1U;
+        // Zero inputs are stored as 1 so they cannot poison the running product.
+        if (!nz) field_set_one_impl(&v);
+        local_nonzero[lid] = nz;
+        local_vals[lid] = v;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lid == 0) {
+        FieldElement acc;
+        field_set_one_impl(&acc);
+
+        // Forward pass: local_prefix[i] = product of the non-zero inputs before i.
+        for (uint i = 0; i < active; ++i) {
+            local_prefix[i] = acc;
+            if (local_nonzero[i]) { FieldElement _t = local_vals[i]; field_mul_impl(&acc, &acc, &_t); }
+        }
+
+        field_inv_impl(&acc, &acc);
+
+        // Backward pass: acc holds the inverse of the product of inputs 0..i.
+        for (int i = (int)active - 1; i >= 0; --i) {
+            if (local_nonzero[i]) {
+                FieldElement inv_i;
+                { FieldElement _t = local_prefix[i]; field_mul_impl(&inv_i, &acc, &_t); }
+                local_invs[i] = inv_i;
+                { FieldElement _t = local_vals[i]; field_mul_impl(&acc, &acc, &_t); }
+            } else {
+                FieldElement _t; field_set_zero_impl(&_t); local_invs[i] = _t;
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (in_range) r[gid] = local_invs[lid];
 }

-)KERNEL"
+)KERNEL",

 // ---- second segment (point operations + scalar mul + batch) ----
 R"KERNEL(
@ -961,6 +1022,11 @@ inline void scalar_mul_mod_n_cl(const Scalar* a, const Scalar* b, Scalar* r) {
    scalar_cond_sub_n_cl(r); scalar_cond_sub_n_cl(r); scalar_cond_sub_n_cl(r);
 }

+)KERNEL",
+
+// ---- third segment (scalar utilities + GLV + point operations) ----
+R"KERNEL(
+
 // Scalar bit length (uses clz intrinsic -- single instruction on GPU)
 inline int scalar_bitlen_cl(const Scalar* s) {
    for (int i = 3; i >= 0; i--) {
@ -1031,41 +1097,189 @@ inline void glv_decompose_cl(const Scalar* k, Scalar* k1, Scalar* k2, int* k1_ne
    *k1_neg = k1_is_neg; *k2_neg = k2_is_neg;
 }

-// GLV + interleaved binary scalar multiplication: k*P
-// GPU-optimized: NO tables, two affine bases, mixed additions, minimal registers (~92)
-// SIMT-aware: two independent if-blocks (not else-if) for optimal warp divergence
+// Lifts affine point *a into Jacobian form *j with Z = 1 and the infinity flag cleared.
+inline void point_from_affine(JacobianPoint* j, const AffinePoint* a) {
+    j->infinity = 0;
+    j->z.limbs[3] = 0UL;
+    j->z.limbs[2] = 0UL;
+    j->z.limbs[1] = 0UL;
+    j->z.limbs[0] = 1UL;
+    j->y = a->y;
+    j->x = a->x;
+}
+
+// Mixed Jacobian + affine addition r = p + q that also reports a Z ratio.
+//   r     - result; on the general path it is written only on the last line,
+//           and build_wnaf_table_zr_cl calls this with r == p.
+//   p     - Jacobian input, q - affine input.
+//   h_out - preset to 1; on the general path it receives 2*H, which equals
+//           Z3/Z1 because Z3 = (Z1+H)^2 - Z1^2 - H^2 = 2*Z1*H.
+// The general path follows the madd-2007-bl operation sequence.
+inline void point_add_mixed_h_impl(JacobianPoint* r, const JacobianPoint* p,
+                                   const AffinePoint* q, FieldElement* h_out) {
+    h_out->limbs[0] = 1UL; h_out->limbs[1] = 0UL; h_out->limbs[2] = 0UL; h_out->limbs[3] = 0UL;
+    // p at infinity: the sum is q itself (h_out stays 1).
+    if (point_is_infinity(p)) { point_from_affine(r, q); return; }
+
+    FieldElement Z1Z1, U2, S2, H, HH, I, J, rr, V, X3, Y3, Z3, t1, t2;
+    field_sqr_impl(&Z1Z1, &p->z);
+    field_mul_impl(&U2, &q->x, &Z1Z1);
+    field_mul_impl(&t1, &q->y, &p->z);
+    field_mul_impl(&S2, &t1, &Z1Z1);
+    field_sub_impl(&H, &U2, &p->x);
+
+    // H == 0 means both points share the same x: either p == q (double) or
+    // p == -q (result is infinity). The limb test assumes field_sub_impl
+    // yields an all-zero representation for zero -- TODO confirm.
+    if ((H.limbs[0] | H.limbs[1] | H.limbs[2] | H.limbs[3]) == 0) {
+        field_sub_impl(&t1, &S2, &p->y);
+        if ((t1.limbs[0] | t1.limbs[1] | t1.limbs[2] | t1.limbs[3]) == 0)
+            { point_double_impl(r, p); return; }
+        point_set_infinity(r); return;
+    }
+    // h_out = 2*H
+    field_add_impl(h_out, &H, &H);
+    field_sqr_impl(&HH, &H);
+    // I = 4*HH, J = H*I
+    field_add_impl(&I, &HH, &HH); field_add_impl(&I, &I, &I);
+    field_mul_impl(&J, &H, &I);
+    // rr = 2*(S2 - Y1), V = X1*I
+    field_sub_impl(&rr, &S2, &p->y); field_add_impl(&rr, &rr, &rr);
+    field_mul_impl(&V, &p->x, &I);
+    // X3 = rr^2 - J - 2*V
+    field_sqr_impl(&X3, &rr);
+    field_sub_impl(&X3, &X3, &J);
+    field_add_impl(&t1, &V, &V); field_sub_impl(&X3, &X3, &t1);
+    // Y3 = rr*(V - X3) - 2*Y1*J
+    field_sub_impl(&t1, &V, &X3); field_mul_impl(&Y3, &rr, &t1);
+    field_mul_impl(&t2, &p->y, &J); field_add_impl(&t2, &t2, &t2);
+    field_sub_impl(&Y3, &Y3, &t2);
+    // Z3 = (Z1 + H)^2 - Z1Z1 - HH
+    field_add_impl(&t1, &p->z, &H); field_sqr_impl(&Z3, &t1);
+    field_sub_impl(&Z3, &Z3, &Z1Z1); field_sub_impl(&Z3, &Z3, &HH);
+    r->x = X3; r->y = Y3; r->z = Z3; r->infinity = 0;
+}
+
+// Builds an 8-entry table from `base` by starting at base and repeatedly
+// adding 2*base (i.e. base, 3*base, ..., 15*base), without any field inversion.
+//   base    - affine input point.
+//   table   - output; x/y of each entry are Jacobian X/Y values that all share
+//             one implied Z after the rescaling pass below (not true affine).
+//   globalz - output; scalar_mul_glv_cl multiplies its result's z by this value.
+// NOTE(review): the structure appears to mirror libsecp256k1's odd-multiples
+// table with z-ratios and a global Z -- confirm against that reference.
+inline void build_wnaf_table_zr_cl(const AffinePoint* base, AffinePoint table[8], FieldElement* globalz) {
+    JacobianPoint base_jac;
+    point_from_affine(&base_jac, base);
+
+    JacobianPoint doubled;
+    point_double_impl(&doubled, &base_jac);
+
+    // c = Z of 2*base; c2 = c^2, c3 = c^3.
+    FieldElement c = doubled.z;
+    FieldElement c2, c3;
+    field_sqr_impl(&c2, &c);
+    field_mul_impl(&c3, &c2, &c);
+
+    // Use the Jacobian X/Y of 2*base directly as an "affine" addend (its Z is dropped).
+    AffinePoint doubled_affine;
+    doubled_affine.x = doubled.x;
+    doubled_affine.y = doubled.y;
+
+    // Rescale base by (c^2, c^3) so it is expressed consistently with
+    // doubled_affine; the omitted factor c is folded back in via globalz.
+    JacobianPoint accum;
+    field_mul_impl(&accum.x, &base->x, &c2);
+    field_mul_impl(&accum.y, &base->y, &c3);
+    accum.z.limbs[0] = 1UL; accum.z.limbs[1] = 0UL; accum.z.limbs[2] = 0UL; accum.z.limbs[3] = 0UL;
+    accum.infinity = 0;
+
+    table[0].x = accum.x;
+    table[0].y = accum.y;
+
+    // zr[i] is the h_out of the addition that produced entry i (the ratio
+    // Z_i / Z_{i-1} on the general path). zr[0] is stored but not read below.
+    FieldElement zr[8];
+    zr[0] = c;
+
+    for (int i = 1; i < 8; ++i) {
+        FieldElement h;
+        point_add_mixed_h_impl(&accum, &accum, &doubled_affine, &h);
+        table[i].x = accum.x;
+        table[i].y = accum.y;
+        zr[i] = h;
+    }
+
+    // Z of the last entry times the dropped factor c.
+    field_mul_impl(globalz, &accum.z, &c);
+
+    // Walk backwards from entry 6 to 0, accumulating zs = zr[7]*zr[6]*...*zr[idx+1]
+    // and scaling (x, y) by (zs^2, zs^3) so every entry shares table[7]'s Z.
+    // table[7] itself is left unchanged.
+    FieldElement zs = zr[7];
+    for (int idx = 6; idx >= 0; --idx) {
+        if (idx != 6) {
+            FieldElement tmp;
+            field_mul_impl(&tmp, &zs, &zr[idx + 1]);
+            zs = tmp;
+        }
+
+        FieldElement zs2, zs3;
+        field_sqr_impl(&zs2, &zs);
+        field_mul_impl(&zs3, &zs2, &zs);
+
+        FieldElement tx, ty;
+        field_mul_impl(&tx, &table[idx].x, &zs2);
+        field_mul_impl(&ty, &table[idx].y, &zs3);
+        table[idx].x = tx;
+        table[idx].y = ty;
+    }
+}
+
+// Fills endo_table from table: each x is multiplied by the GLV beta constant,
+// and each y is either copied (negate_y == 0) or negated (negate_y != 0).
+inline void derive_endo_table_cl(const AffinePoint table[8], AffinePoint endo_table[8], int negate_y) {
+    FieldElement beta;
+    beta.limbs[0] = GLV_BETA0;
+    beta.limbs[1] = GLV_BETA1;
+    beta.limbs[2] = GLV_BETA2;
+    beta.limbs[3] = GLV_BETA3;
+
+    int slot = 0;
+    while (slot < 8) {
+        field_mul_impl(&endo_table[slot].x, &table[slot].x, &beta);
+        if (!negate_y) {
+            endo_table[slot].y = table[slot].y;
+        } else {
+            field_neg_impl(&endo_table[slot].y, &table[slot].y);
+        }
+        ++slot;
+    }
+}
+
+// Recodes scalar k into 130 signed digits, least-significant position first.
+// Every non-zero digit is odd and lies in [-15, 15] (window width 5); after a
+// non-zero digit the low five bits of the working value are zero, so the next
+// four digits are 0.
+//   k    - input scalar (limbs[0] is treated as the least-significant limb).
+//   wnaf - output, all 130 entries are written.
+// NOTE(review): a carry out of the top limb is dropped; presumably callers only
+// pass GLV half-scalars that leave headroom -- confirm.
+static inline void scalar_to_wnaf(const Scalar* k, int wnaf[130]) {
+    ulong s[4];
+    for (int i = 0; i < 4; i++) s[i] = k->limbs[i];
+    for (int i = 0; i < 130; i++) {
+        if (s[0] & 1UL) {
+            int d = (int)(s[0] & 0x1FUL);
+            if (d >= 16) {
+                // Map 17..31 to -15..-1 and add |d| back into the working value.
+                d -= 32;
+                ulong add = (ulong)(-d);
+                ulong prev = s[0]; s[0] += add;
+                // Carry ripples upward until a limb does not wrap to zero.
+                if (s[0] < prev) { for (int j=1;j<4;j++) if (++s[j]) break; }
+            } else {
+                // d is the low five bits of s[0], so this subtraction cannot
+                // underflow; the borrow loop is a defensive path.
+                ulong prev = s[0]; s[0] -= (ulong)d;
+                if (s[0] > prev) { for (int j=1;j<4;j++) if (s[j]--) break; }
+            }
+            wnaf[i] = d;
+        } else { wnaf[i] = 0; }
+        // Shift the 256-bit working value right by one bit.
+        s[0] = (s[0] >> 1) | (s[1] << 63);
+        s[1] = (s[1] >> 1) | (s[2] << 63);
+        s[2] = (s[2] >> 1) | (s[3] << 63);
+        s[3] >>= 1;
+    }
+}
+
+// Variable-base scalar multiplication r = k * base.
+// Splits k with glv_decompose_cl, builds an 8-entry table for the base and a
+// matching endomorphism table, recodes both halves with scalar_to_wnaf, and
+// runs one shared double-and-add ladder over 130 digit positions.
 inline void scalar_mul_glv_cl(JacobianPoint* r, const Scalar* k, const AffinePoint* base) {
    if (scalar_is_zero_cl(k)) { point_set_infinity(r); return; }

    Scalar k1, k2; int k1_neg, k2_neg;
    glv_decompose_cl(k, &k1, &k2, &k1_neg, &k2_neg);

-    // Two affine bases: P and phi(P) = (beta*P.x, (+/-)P.y)
+    // The sign of k1 is folded into the base point before the table is built.
    AffinePoint P = *base;
    if (k1_neg) field_neg_impl(&P.y, &P.y);

-    FieldElement beta;
-    beta.limbs[0]=GLV_BETA0; beta.limbs[1]=GLV_BETA1;
-    beta.limbs[2]=GLV_BETA2; beta.limbs[3]=GLV_BETA3;
+    // Table entries share one implied Z; the ladder treats them as affine and
+    // the result's z is multiplied by globalz at the end.
+    AffinePoint table[8];
+    FieldElement globalz;
+    build_wnaf_table_zr_cl(&P, table, &globalz);

-    AffinePoint phi_P;
-    field_mul_impl(&phi_P.x, &P.x, &beta);
-    if (k1_neg != k2_neg) field_neg_impl(&phi_P.y, &P.y);
-    else phi_P.y = P.y;
+    // table already carries k1's sign, so y is flipped again only when the
+    // signs of k1 and k2 differ.
+    AffinePoint endo_table[8];
+    derive_endo_table_cl(table, endo_table, k1_neg != k2_neg);

-    // Find max bit length of k1, k2
-    int bl1 = scalar_bitlen_cl(&k1);
-    int bl2 = scalar_bitlen_cl(&k2);
-    int max_bit = (bl1 > bl2) ? bl1 : bl2;
+    int wnaf1[130] = {0};
+    int wnaf2[130] = {0};
+    scalar_to_wnaf(&k1, wnaf1);
+    scalar_to_wnaf(&k2, wnaf2);

-    // Interleaved binary double-and-add with mixed additions
    point_set_infinity(r);
-    for (int i = max_bit - 1; i >= 0; --i) {
+    for (int i = 129; i >= 0; --i) {
        if (!point_is_infinity(r)) point_double_impl(r, r);
-        int b1 = (int)((k1.limbs[i >> 6] >> (i & 63)) & 1UL);
-        int b2 = (int)((k2.limbs[i >> 6] >> (i & 63)) & 1UL);
-        if (b1) point_add_mixed_impl(r, r, &P);
-        if (b2) point_add_mixed_impl(r, r, &phi_P);
+
+        // Digit d selects table[(|d| - 1) / 2]; a negative digit negates y.
+        int d1 = wnaf1[i];
+        if (d1 != 0) {
+            int idx = (((d1 > 0) ? d1 : -d1) - 1) >> 1;
+            AffinePoint pt = table[idx];
+            if (d1 < 0) field_neg_impl(&pt.y, &pt.y);
+            if (point_is_infinity(r)) point_from_affine(r, &pt);
+            else point_add_mixed_impl(r, r, &pt);
+        }
+
+        int d2 = wnaf2[i];
+        if (d2 != 0) {
+            int idx = (((d2 > 0) ? d2 : -d2) - 1) >> 1;
+            AffinePoint pt = endo_table[idx];
+            if (d2 < 0) field_neg_impl(&pt.y, &pt.y);
+            if (point_is_infinity(r)) point_from_affine(r, &pt);
+            else point_add_mixed_impl(r, r, &pt);
+        }
+    }
+
+    // Undo the shared-Z shortcut taken by the table.
+    if (!point_is_infinity(r)) {
+        FieldElement corrected_z;
+        field_mul_impl(&corrected_z, &r->z, &globalz);
+        r->z = corrected_z;
    }
 }

@ -1125,40 +1339,41 @@ inline int get_window_4bit(const Scalar* s, int pos) {
    return (int)(v & 0xFUL);
 }

-__kernel void scalar_mul_generator(__global const Scalar* scalars, __global JacobianPoint* results, uint count) {
-    uint gid = get_global_id(0); if (gid >= count) return;
-    Scalar k = scalars[gid];
-    JacobianPoint R;
-    if ((k.limbs[0]|k.limbs[1]|k.limbs[2]|k.limbs[3]) == 0) { point_set_infinity(&R); results[gid] = R; return; }
+// Fixed-base scalar multiplication using the precomputed nibble tables.
+// Splits k with glv_decompose_cl and walks both halves in 4-bit windows,
+// adding entries from GENERATOR_TABLE_NIBBLE and GENERATOR_TABLE_NIBBLE_PHI.
+//   r - output point (set to infinity when k is zero).
+//   k - input scalar.
+inline void scalar_mul_generator_glv_impl(JacobianPoint* r, const Scalar* k) {
+    if ((k->limbs[0]|k->limbs[1]|k->limbs[2]|k->limbs[3]) == 0) {
+        point_set_infinity(r);
+        return;
+    }

    Scalar k1, k2; int k1_neg, k2_neg;
-    glv_decompose_cl(&k, &k1, &k2, &k1_neg, &k2_neg);
+    glv_decompose_cl(k, &k1, &k2, &k1_neg, &k2_neg);

-    // Compute actual number of 4-bit windows needed
-    int bl1 = scalar_bitlen_cl(&k1);
-    int bl2 = scalar_bitlen_cl(&k2);
-    int max_bits = (bl1 > bl2) ? bl1 : bl2;
-    int num_windows = (max_bits + 3) / 4;
-
-    point_set_infinity(&R);
-    for (int w = num_windows - 1; w >= 0; --w) {
-        if (!point_is_infinity(&R)) {
-            point_double_impl(&R, &R); point_double_impl(&R, &R);
-            point_double_impl(&R, &R); point_double_impl(&R, &R);
+    point_set_infinity(r);
+    // 32 windows of 4 bits cover the low 128 bits of k1 and k2.
+    // NOTE(review): the previous loop was sized from scalar_bitlen_cl; confirm
+    // glv_decompose_cl never returns a half wider than 128 bits.
+    for (int w = 31; w >= 0; --w) {
+        if (!point_is_infinity(r)) {
+            point_double_impl(r, r); point_double_impl(r, r);
+            point_double_impl(r, r); point_double_impl(r, r);
        }
        int w1 = get_window_4bit(&k1, w);
        if (w1) {
            AffinePoint pt = GENERATOR_TABLE_NIBBLE[w1];
            if (k1_neg) field_neg_impl(&pt.y, &pt.y);
-            point_add_mixed_impl(&R, &R, &pt);
+            point_add_mixed_impl(r, r, &pt);
        }
        int w2 = get_window_4bit(&k2, w);
        if (w2) {
            AffinePoint pt = GENERATOR_TABLE_NIBBLE_PHI[w2];
            if (k2_neg) field_neg_impl(&pt.y, &pt.y);
-            point_add_mixed_impl(&R, &R, &pt);
+            point_add_mixed_impl(r, r, &pt);
        }
    }
+}
+
+// One work-item per scalar: results[gid] receives the point computed by
+// scalar_mul_generator_glv_impl for scalars[gid]; work-items with
+// gid >= count do nothing.
+__kernel void scalar_mul_generator(__global const Scalar* scalars, __global JacobianPoint* results, uint count) {
+    uint gid = get_global_id(0); if (gid >= count) return;
+    Scalar k = scalars[gid];
+    JacobianPoint R;
+    scalar_mul_generator_glv_impl(&R, &k);
    results[gid] = R;
 }

@ -1278,12 +1493,62 @@ __kernel void affine_add(
    __global FieldElement* rx, __global FieldElement* ry,
    const uint count
 ) {
+    // Batched affine addition: element i combines (px[i], py[i]) and
+    // (qx[i], qy[i]) into (rx[i], ry[i]). The denominators h = qx - px of one
+    // work-group are inverted together with Montgomery's trick (one
+    // field_inv_impl per group, done by work-item 0) and handed to
+    // affine_add_lambda_impl; h == 0 yields an inverse of 0.
+    #define BATCH_INV_LOCAL_MAX 256
+    __local FieldElement local_h[BATCH_INV_LOCAL_MAX];
+    __local FieldElement local_prefix[BATCH_INV_LOCAL_MAX];
+    __local FieldElement local_h_inv[BATCH_INV_LOCAL_MAX];
+    __local uint local_nonzero[BATCH_INV_LOCAL_MAX];
+
    uint gid = get_global_id(0);
+    uint lid = get_local_id(0);
+    uint lsize = get_local_size(0);
+    uint group_start = get_group_id(0) * lsize;
+    uint active = (group_start < count) ? min(lsize, count - group_start) : 0;
-    if (gid >= count) return;
-    FieldElement lpx = px[gid], lpy = py[gid];
-    FieldElement lqx = qx[gid], lqy = qy[gid];
+    const int in_range = gid < count;
+
+    // Only work-group-uniform conditions may return before the barriers below:
+    // barrier() must be reached by every work-item of the group, so the padding
+    // work-items of the last group (gid >= count) stay alive and merely skip
+    // their global loads and stores.
+    if (active == 0) return;
+
+    FieldElement lpx, lpy, lqx, lqy;
+    if (in_range) {
+        lpx = px[gid]; lpy = py[gid];
+        lqx = qx[gid]; lqy = qy[gid];
+    }
+
+    // Work-group too large for the __local scratch arrays: add individually.
+    if (lsize > BATCH_INV_LOCAL_MAX) {
+        if (in_range) {
+            AffinePoint r;
+            affine_add_impl(&r, &lpx, &lpy, &lqx, &lqy);
+            rx[gid] = r.x;
+            ry[gid] = r.y;
+        }
+        return;
+    }
+
+    if (in_range) {
+        FieldElement h;
+        field_sub_impl(&h, &lqx, &lpx);
+        uint nz = field_is_zero_impl(&h) ? 0U : 1U;
+        // Zero denominators are stored as 1 so they cannot poison the running product.
+        if (!nz) field_set_one_impl(&h);
+        local_h[lid] = h;
+        local_nonzero[lid] = nz;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lid == 0) {
+        FieldElement acc;
+        field_set_one_impl(&acc);
+
+        // Forward pass: local_prefix[i] = product of the non-zero h before i.
+        for (uint i = 0; i < active; ++i) {
+            local_prefix[i] = acc;
+            if (local_nonzero[i]) { FieldElement _t = local_h[i]; field_mul_impl(&acc, &acc, &_t); }
+        }
+
+        field_inv_impl(&acc, &acc);
+
+        // Backward pass: acc holds the inverse of the product of h[0..i].
+        for (int i = (int)active - 1; i >= 0; --i) {
+            if (local_nonzero[i]) {
+                FieldElement inv_i;
+                { FieldElement _t = local_prefix[i]; field_mul_impl(&inv_i, &acc, &_t); }
+                local_h_inv[i] = inv_i;
+                { FieldElement _t = local_h[i]; field_mul_impl(&acc, &acc, &_t); }
+            } else {
+                FieldElement _t; field_set_zero_impl(&_t); local_h_inv[i] = _t;
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Past the last barrier a divergent return is safe.
+    if (!in_range) return;
+
    AffinePoint r;
-    affine_add_impl(&r, &lpx, &lpy, &lqx, &lqy);
+    { FieldElement _hinv = local_h_inv[lid]; affine_add_lambda_impl(&r, &lpx, &lpy, &lqx, &lqy, &_hinv); }
    rx[gid] = r.x;
    ry[gid] = r.y;
 }
@ -1330,24 +1595,80 @@ __kernel void jacobian_to_affine(
    __global FieldElement* ax, __global FieldElement* ay,
    const uint count
 ) {
+    // Batched Jacobian -> affine conversion: ax[i] = jx[i] / jz[i]^2 and
+    // ay[i] = jy[i] / jz[i]^3. The z values of one work-group are inverted
+    // together with Montgomery's trick (one field_inv_impl per group, done by
+    // work-item 0); z == 0 yields an inverse of 0 and therefore ax = ay = 0.
+    #define BATCH_INV_LOCAL_MAX 256
+    __local FieldElement local_z[BATCH_INV_LOCAL_MAX];
+    __local FieldElement local_prefix[BATCH_INV_LOCAL_MAX];
+    __local FieldElement local_z_inv[BATCH_INV_LOCAL_MAX];
+    __local uint local_nonzero[BATCH_INV_LOCAL_MAX];
+
    uint gid = get_global_id(0);
+    uint lid = get_local_id(0);
+    uint lsize = get_local_size(0);
+    uint group_start = get_group_id(0) * lsize;
+    uint active = (group_start < count) ? min(lsize, count - group_start) : 0;
-    if (gid >= count) return;
-    FieldElement lx = jx[gid], ly = jy[gid], lz = jz[gid];
-    AffinePoint r;
-    jacobian_to_affine_convert_impl(&r, &lx, &ly, &lz);
-    ax[gid] = r.x;
-    ay[gid] = r.y;
+    const int in_range = gid < count;
+
+    // Only work-group-uniform conditions may return before the barriers below:
+    // barrier() must be reached by every work-item of the group, so the padding
+    // work-items of the last group (gid >= count) stay alive and merely skip
+    // their global loads and stores.
+    if (active == 0) return;
+
+    FieldElement lx, ly, lz;
+    if (in_range) {
+        lx = jx[gid]; ly = jy[gid]; lz = jz[gid];
+    }
+
+    // Work-group too large for the __local scratch arrays: convert individually.
+    if (lsize > BATCH_INV_LOCAL_MAX) {
+        if (in_range) {
+            AffinePoint r;
+            jacobian_to_affine_convert_impl(&r, &lx, &ly, &lz);
+            ax[gid] = r.x;
+            ay[gid] = r.y;
+        }
+        return;
+    }
+
+    if (in_range) {
+        uint nz = field_is_zero_impl(&lz) ? 0U : 1U;
+        // Zero z values are stored as 1 so they cannot poison the running product.
+        if (!nz) field_set_one_impl(&lz);
+        local_z[lid] = lz;
+        local_nonzero[lid] = nz;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lid == 0) {
+        FieldElement acc;
+        field_set_one_impl(&acc);
+
+        // Forward pass: local_prefix[i] = product of the non-zero z before i.
+        for (uint i = 0; i < active; ++i) {
+            local_prefix[i] = acc;
+            if (local_nonzero[i]) { FieldElement _t = local_z[i]; field_mul_impl(&acc, &acc, &_t); }
+        }
+
+        field_inv_impl(&acc, &acc);
+
+        // Backward pass: acc holds the inverse of the product of z[0..i].
+        for (int i = (int)active - 1; i >= 0; --i) {
+            if (local_nonzero[i]) {
+                FieldElement inv_i;
+                { FieldElement _t = local_prefix[i]; field_mul_impl(&inv_i, &acc, &_t); }
+                local_z_inv[i] = inv_i;
+                { FieldElement _t = local_z[i]; field_mul_impl(&acc, &acc, &_t); }
+            } else {
+                FieldElement _t; field_set_zero_impl(&_t); local_z_inv[i] = _t;
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Past the last barrier a divergent return is safe.
+    if (!in_range) return;
+
+    FieldElement z_inv = local_z_inv[lid];
+    FieldElement z_inv2, z_inv3;
+    field_sqr_impl(&z_inv2, &z_inv);
+    field_mul_impl(&z_inv3, &z_inv2, &z_inv);
+    { FieldElement _ax; field_mul_impl(&_ax, &lx, &z_inv2); ax[gid] = _ax; }
+    { FieldElement _ay; field_mul_impl(&_ay, &ly, &z_inv3); ay[gid] = _ay; }
 }
-)KERNEL";
+)KERNEL" };

 bool Context::Impl::build_program() {
    cl_int err;

-    // Create program from source
-    const char* sources[] = {kernel_source};
-    std::size_t lengths[] = {std::strlen(kernel_source)};
+    // Create program from source (multiple parts avoid MSVC C2026 limit)
+    constexpr cl_uint num_parts = sizeof(kernel_parts) / sizeof(kernel_parts[0]);
+    const char* sources[num_parts];
+    std::size_t lengths[num_parts];
+    for (cl_uint i = 0; i < num_parts; ++i) {
+        sources[i] = kernel_parts[i];
+        lengths[i] = std::strlen(kernel_parts[i]);
+    }

-    program = clCreateProgramWithSource(context, 1, sources, lengths, &err);
+    program = clCreateProgramWithSource(context, num_parts, sources, lengths, &err);
    if (err != CL_SUCCESS) {
        last_error = std::string("Failed to create program: ") + cl_error_string(err);
        return false;
@ -1730,6 +2051,14 @@ static void compute_work_sizes(std::size_t count, std::size_t max_wg, std::size_
    global = ((count + local - 1) / local) * local;
 }

+// Chooses the 1-D NDRange sizes for the scalar-multiplication kernels.
+//   count           - number of work-items that carry real data.
+//   requested_local - caller-configured local size; 0 selects auto_local.
+//   auto_local      - default local size used when nothing was requested.
+//   max_wg          - device limit; the chosen local size never exceeds it.
+//   local  (out)    - selected local size, always >= 1.
+//   global (out)    - count rounded up to a multiple of local.
+static void compute_scalar_mul_work_sizes(std::size_t count, std::size_t requested_local,
+                                          std::size_t auto_local, std::size_t max_wg, std::size_t& local,
+                                          std::size_t& global) {
+    const std::size_t preferred = requested_local == 0 ? auto_local : requested_local;
+    local = std::min(preferred, max_wg);
+    // A zero max_wg (device info not populated) or zero auto_local would make
+    // the round-up below divide by zero; fall back to one work-item per group.
+    if (local == 0) local = 1;
+    global = ((count + local - 1) / local) * local;
+}
+
 void Context::batch_field_add(const FieldElement* a, const FieldElement* b,
                               FieldElement* results, std::size_t count) {
    if (count == 0) return;
@ -1882,7 +2211,6 @@ void Context::batch_scalar_mul_generator(const Scalar* scalars, JacobianPoint* r
        impl_->cache_smg_count = count;
    }

-    // Upload scalars (async)
    clEnqueueWriteBuffer(impl_->queue, impl_->cache_smg_scalars, CL_FALSE, 0,
                         count * sizeof(Scalar), scalars, 0, nullptr, nullptr);

@ -1891,13 +2219,11 @@ void Context::batch_scalar_mul_generator(const Scalar* scalars, JacobianPoint* r
    clSetKernelArg(impl_->kernel_scalar_mul_generator, 1, sizeof(cl_mem), &impl_->cache_smg_results);
    clSetKernelArg(impl_->kernel_scalar_mul_generator, 2, sizeof(cl_uint), &cnt);

-    // Calculate work group size
-    std::size_t local_size = impl_->config.local_work_size;
-    if (local_size == 0 || local_size > impl_->device_info.max_work_group_size) {
-        local_size = std::min(static_cast<std::size_t>(256), impl_->device_info.max_work_group_size);
-    }
-
-    std::size_t global_size = ((count + local_size - 1) / local_size) * local_size;
+    std::size_t local_size, global_size;
+    compute_scalar_mul_work_sizes(count, impl_->config.local_work_size,
+                                  128,
+                                  impl_->device_info.max_work_group_size,
+                                  local_size, global_size);

    clEnqueueNDRangeKernel(impl_->queue, impl_->kernel_scalar_mul_generator, 1, nullptr,
                           &global_size, &local_size, 0, nullptr, nullptr);
@ -1925,10 +2251,11 @@ void Context::batch_scalar_mul(const Scalar* scalars, const AffinePoint* points,
        impl_->cache_sm_count = count;
    }

+    clEnqueueWriteBuffer(impl_->queue, impl_->cache_sm_points, CL_TRUE, 0,
+                         count * sizeof(AffinePoint), points, 0, nullptr, nullptr);
    clEnqueueWriteBuffer(impl_->queue, impl_->cache_sm_scalars, CL_FALSE, 0,
                         count * sizeof(Scalar), scalars, 0, nullptr, nullptr);
-    clEnqueueWriteBuffer(impl_->queue, impl_->cache_sm_points, CL_FALSE, 0,
-                         count * sizeof(AffinePoint), points, 0, nullptr, nullptr);
+    clFlush(impl_->queue);

    cl_uint cnt = static_cast<cl_uint>(count);
    clSetKernelArg(impl_->kernel_scalar_mul, 0, sizeof(cl_mem), &impl_->cache_sm_scalars);
@ -1936,8 +2263,11 @@ void Context::batch_scalar_mul(const Scalar* scalars, const AffinePoint* points,
    clSetKernelArg(impl_->kernel_scalar_mul, 2, sizeof(cl_mem), &impl_->cache_sm_results);
    clSetKernelArg(impl_->kernel_scalar_mul, 3, sizeof(cl_uint), &cnt);

-    std::size_t local_size = std::min(static_cast<std::size_t>(256), impl_->device_info.max_work_group_size);
-    std::size_t global_size = ((count + local_size - 1) / local_size) * local_size;
+    std::size_t local_size, global_size;
+    compute_scalar_mul_work_sizes(count, impl_->config.local_work_size,
+                                  128,
+                                  impl_->device_info.max_work_group_size,
+                                  local_size, global_size);

    clEnqueueNDRangeKernel(impl_->queue, impl_->kernel_scalar_mul, 1, nullptr,
                           &global_size, &local_size, 0, nullptr, nullptr);
@ -2043,6 +2373,8 @@ void* Context::native_kernel(const char* name) const {
    if (n == "point_add") return impl_->kernel_point_add;
    if (n == "scalar_mul") return impl_->kernel_scalar_mul;
    if (n == "scalar_mul_generator") return impl_->kernel_scalar_mul_generator;
+    if (n == "batch_jacobian_to_affine") return impl_->kernel_batch_jacobian_to_affine;
+    if (n == "batch_jacobian_to_affine_kernel") return impl_->kernel_batch_jacobian_to_affine;
    if (n == "affine_add") return impl_->kernel_affine_add;
    if (n == "affine_add_lambda") return impl_->kernel_affine_add_lambda;
    if (n == "affine_add_x_only") return impl_->kernel_affine_add_x_only;
--- a/opencl/src/opencl_selftest.cpp
+++ b/opencl/src/opencl_selftest.cpp
@ -1128,6 +1128,133 @@ bool selftest(bool verbose, int platform_id, int device_id) {
        if (verbose) SELFTEST_PRINT(pass ? "    PASS\n" : "    FAIL\n");
    }

+    // ==========================================================================
+    // Test 41: BIP-352 SCAN_KEY smoke — large 256-bit scalar, must not be infinity
+    // Regression: verifies that scalar_mul_generator handles a real-world key
+    // that stresses the GLV decomposition path (both half-scalars non-trivial).
+    // ==========================================================================
+    {
+        total++;
+        if (verbose) SELFTEST_PRINT("\nBIP-352 SCAN_KEY k*G smoke (not infinity):\n");
+        bool pass = true;
+
+        // SCAN_KEY used in bench_bip352_opencl — 256-bit, both GLV halves non-zero
+        Scalar k_scan = scalar_from_hex(
+            "c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42");
+        JacobianPoint P = ctx->scalar_mul_generator(k_scan);
+        AffinePoint Pa = jacobian_to_affine(P);
+        // Sanity: x-coordinate must be non-zero (point at infinity has x=0)
+        if ((Pa.x.limbs[0] | Pa.x.limbs[1] | Pa.x.limbs[2] | Pa.x.limbs[3]) == 0) {
+            if (verbose) SELFTEST_PRINT("    FAIL: SCAN_KEY * G produced x=0 (infinity)\n");
+            pass = false;
+        }
+
+        if (pass) passed++;
+        if (verbose) SELFTEST_PRINT(pass ? "    PASS\n" : "    FAIL\n");
+    }
+
+    // ==========================================================================
+    // Test 42: GLV large scalar consistency — k*G + G = (k+1)*G for SCAN_KEY
+    // Checks that GLV decomposition is correct for a full 256-bit key by
+    // cross-checking with the additive property: (k+1)*G = k*G + 1*G.
+    // ==========================================================================
+    {
+        total++;
+        if (verbose) SELFTEST_PRINT("\nGLV large scalar consistency: k*G + G = (k+1)*G:\n");
+        bool pass = true;
+
+        Scalar k   = scalar_from_hex(
+            "c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42");
+        Scalar kp1 = scalar_from_hex(
+            "c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a43");
+        Scalar one = scalar_from_u64(1);
+
+        JacobianPoint kG     = ctx->scalar_mul_generator(k);
+        JacobianPoint oneG   = ctx->scalar_mul_generator(one);
+        JacobianPoint kp1_a  = ctx->point_add(kG, oneG);    // k*G + G
+        JacobianPoint kp1_b  = ctx->scalar_mul_generator(kp1); // (k+1)*G
+
+        AffinePoint a = jacobian_to_affine(kp1_a);
+        AffinePoint b = jacobian_to_affine(kp1_b);
+        if (field_to_hex(a.x) != field_to_hex(b.x) || field_to_hex(a.y) != field_to_hex(b.y)) {
+            if (verbose) {
+                SELFTEST_PRINT("    FAIL: k*G + G != (k+1)*G\n");
+                SELFTEST_PRINT("    k*G+G  x: %s\n", field_to_hex(a.x).c_str());
+                SELFTEST_PRINT("    (k+1)G x: %s\n", field_to_hex(b.x).c_str());
+            }
+            pass = false;
+        }
+
+        if (pass) passed++;
+        if (verbose) SELFTEST_PRINT(pass ? "    PASS\n" : "    FAIL\n");
+    }
+
+    // ==========================================================================
+    // Test 43: GLV 2^128 boundary — (2^128)*G + G = (2^128+1)*G
+    // The GLV decomposition boundary sits near 2^128; a scalar k = 2^128
+    // forces the high half of the GLV decomposition to be active. Regression
+    // for any off-by-one in the half-scalar split.
+    // ==========================================================================
+    {
+        total++;
+        if (verbose) SELFTEST_PRINT("\nGLV 2^128 boundary: 2^128*G + G = (2^128+1)*G:\n");
+        bool pass = true;
+
+        // k = 2^128: limbs[2]=1 (little-endian), others=0
+        Scalar k_128  = {{0UL, 0UL, 1UL, 0UL}};
+        Scalar k_128p = {{1UL, 0UL, 1UL, 0UL}}; // 2^128 + 1
+        Scalar one    = scalar_from_u64(1);
+
+        JacobianPoint kG    = ctx->scalar_mul_generator(k_128);
+        JacobianPoint oneG  = ctx->scalar_mul_generator(one);
+        JacobianPoint kp1_a = ctx->point_add(kG, oneG);
+        JacobianPoint kp1_b = ctx->scalar_mul_generator(k_128p);
+
+        AffinePoint a = jacobian_to_affine(kp1_a);
+        AffinePoint b = jacobian_to_affine(kp1_b);
+        if (field_to_hex(a.x) != field_to_hex(b.x) || field_to_hex(a.y) != field_to_hex(b.y)) {
+            if (verbose) SELFTEST_PRINT("    FAIL: 2^128*G + G != (2^128+1)*G\n");
+            pass = false;
+        }
+
+        if (pass) passed++;
+        if (verbose) SELFTEST_PRINT(pass ? "    PASS\n" : "    FAIL\n");
+    }
+
+    // ==========================================================================
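+    // Test 44: wNAF alternating-bit stress — 0x5555...*G + G = 0x5556...*G
+    // Alternating 0101... bits maximally stress wNAF digit selection:
+    // every bit triggers a non-adjacent form carry/borrow. Catches bugs in
+    // the w=5 wNAF encoder that surface only with specific bit patterns.
+    // NOTE(review): every call below goes through scalar_mul_generator, and the
+    // embedded scalar_mul_generator kernel uses fixed 4-bit windows; the w=5
+    // wNAF encoder (scalar_to_wnaf) is only reached from scalar_mul_glv_cl.
+    // Confirm this test reaches the wNAF path, or add a variable-base variant.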
+    // ==========================================================================
+    {
+        total++;
+        if (verbose) SELFTEST_PRINT("\nwNAF alternating-bit stress: 0x5555...*G + G:\n");
+        bool pass = true;
+
+        // k = 0x5555555555555555 * 4 limbs = repeating 01 bits in every position
+        Scalar k_alt  = {{0x5555555555555555ULL, 0x5555555555555555ULL,
+                           0x5555555555555555ULL, 0x5555555555555555ULL}};
+        Scalar k_altp = {{0x5555555555555556ULL, 0x5555555555555555ULL,
+                           0x5555555555555555ULL, 0x5555555555555555ULL}};
+        Scalar one = scalar_from_u64(1);
+
+        JacobianPoint kG    = ctx->scalar_mul_generator(k_alt);
+        JacobianPoint oneG  = ctx->scalar_mul_generator(one);
+        JacobianPoint kp1_a = ctx->point_add(kG, oneG);
+        JacobianPoint kp1_b = ctx->scalar_mul_generator(k_altp);
+
+        AffinePoint a = jacobian_to_affine(kp1_a);
+        AffinePoint b = jacobian_to_affine(kp1_b);
+        if (field_to_hex(a.x) != field_to_hex(b.x) || field_to_hex(a.y) != field_to_hex(b.y)) {
+            if (verbose) SELFTEST_PRINT("    FAIL: 0x5555...*G + G != 0x5556...*G\n");
+            pass = false;
+        }
+
+        if (pass) passed++;
+        if (verbose) SELFTEST_PRINT(pass ? "    PASS\n" : "    FAIL\n");
+    }
+
    // ==========================================================================
    // Test 40: Distributive k*(P+Q) = k*P + k*Q
    // ==========================================================================
--- a/tools/source_graph_kit/source_graph.db
+++ b/tools/source_graph_kit/source_graph.db
--- a/tools/source_graph_kit/source_graph.py
+++ b/tools/source_graph_kit/source_graph.py
--- a/tools/source_graph_kit/source_graph.toml
+++ b/tools/source_graph_kit/source_graph.toml
@ -0,0 +1,51 @@
+# Configuration for the source graph kit (source_graph.py).
+# [project] names the codebase and its primary language; each [[source_dirs]]
+# entry registers one directory under a short label.
+# NOTE(review): every path starts with a single "..". If paths are resolved
+# relative to this file (tools/source_graph_kit/), they point at tools/<dir>
+# rather than the repository root -- confirm the base directory used by
+# source_graph.py.
+[project]
+name = "UltrafastSecp256k1"
+language = "cpp"
+
+# "cpu" and "include" carry no `optional` flag.
+[[source_dirs]]
+label = "cpu"
+path = "../cpu"
+
+[[source_dirs]]
+label = "include"
+path = "../include"
+
+# The remaining directories are marked optional = true; presumably a missing
+# directory is skipped rather than treated as an error -- verify in source_graph.py.
+[[source_dirs]]
+label = "audit"
+path = "../audit"
+optional = true
+
+[[source_dirs]]
+label = "benchmarks"
+path = "../benchmarks"
+optional = true
+
+[[source_dirs]]
+label = "cuda"
+path = "../cuda"
+optional = true
+
+[[source_dirs]]
+label = "examples"
+path = "../examples"
+optional = true
+
+[[source_dirs]]
+label = "gpu"
+path = "../gpu"
+optional = true
+
+[[source_dirs]]
+label = "metal"
+path = "../metal"
+optional = true
+
+[[source_dirs]]
+label = "opencl"
+path = "../opencl"
+optional = true
+
+[[source_dirs]]
+label = "tests"
+path = "../tests"
+optional = true