fix: MSVC C2026 string limit (#173), OpenCL batch-inv kernels, source graph tooling

- Split embedded OpenCL kernel_source into kernel_parts[] array
  so no single string literal exceeds MSVC's 65535-byte limit.
  clCreateProgramWithSource now receives multiple source strings.
- Added batch-inversion kernels (field_inv, affine_add, jac_to_affine)
  using per-workgroup Montgomery's trick with __local memory.
- OpenCL BIP352 benchmark scaffold and kernel stubs.
- Source graph kit for indexed codebase exploration.
- Assorted doc, benchmark, and audit report updates.
This commit is contained in:
shrec 2026-03-19 16:43:55 +00:00
parent cfda151728
commit fea2420fe7
No known key found for this signature in database
33 changed files with 14406 additions and 245 deletions

View File

@ -78,6 +78,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `coin_address()` CASHADDR dispatch now correctly routes to `coin_address_cashaddr()` --
Bitcoin Cash addresses generate via CashAddr instead of falling through to Base58Check.
- All 28 coins now generate addresses correctly (was 27; BCH fixed, Tron added).
- **ARM64 Android hash dispatch** -- `hash_accel` now routes `sha256_33`, `sha256_32`,
`hash160_33`, and `sha256_compress_dispatch` through ARMv8 SHA-256 instructions when
building for AArch64 targets with SHA2 support. On RK3588 / Android NDK r27.2 this reduced
`ecdsa_sign` from 25.89 us to 22.22 us, `schnorr_sign` (precomputed) from 17.73 us to 16.67 us,
and `ct::ecdsa_sign` from 70.50 us to 67.11 us, with verify paths remaining effectively flat.
- **x86 Schnorr batch verify allocation path** -- `batch_verify.cpp` now reserves the
full batch size for the uncached x-only pubkey cache instead of capping capacity at 64.
Local i5-14400F reruns reduced uncached `schnorr_batch_verify` from 20.27 us/sig to about
19.94-20.06 us/sig at N=128 and from 18.56 us/sig to about 18.01-18.45 us/sig at N=192,
with `comprehensive` remaining green.
---

View File

@ -63,6 +63,9 @@ set(SECP256K1_MARCH "" CACHE STRING "x86-64 -march override (empty = auto-detect
# Warning policy: promote warnings to errors (recommended for CI)
option(SECP256K1_WERROR "Treat compiler warnings as errors (-Werror / /WX)" OFF)
# Source graph refresh: enabled by default. When ON, and both a Python3
# interpreter and tools/source_graph_kit/source_graph.py are available, the
# ufsecp_source_graph_refresh target is added to the default (ALL) build.
# Configure with -DUFSECP_REFRESH_SOURCE_GRAPH=OFF to skip it.
option(UFSECP_REFRESH_SOURCE_GRAPH "Refresh the repo source graph during builds" ON)
# Optional lookup (QUIET, not REQUIRED): a missing interpreter does not fail
# configuration; Python3_Interpreter_FOUND is checked where the refresh target
# is defined.
find_package(Python3 COMPONENTS Interpreter QUIET)
# Global compile definitions
if(SECP256K1_SPEED_FIRST)
@ -345,6 +348,21 @@ if(SECP256K1_INSTALL)
endif()
# -- Source graph refresh ----------------------------------------------------
# Optional tooling step controlled by UFSECP_REFRESH_SOURCE_GRAPH.
# When the option is ON and both a Python3 interpreter and the source graph
# script are present, register a target that re-runs `source_graph.py build -i`
# as part of the default build. Otherwise only a status message is printed and
# the rest of the build is unaffected.
set(UFSECP_SOURCE_GRAPH_TOOL "${CMAKE_CURRENT_SOURCE_DIR}/tools/source_graph_kit/source_graph.py")
if(UFSECP_REFRESH_SOURCE_GRAPH)
    if(Python3_Interpreter_FOUND AND EXISTS "${UFSECP_SOURCE_GRAPH_TOOL}")
        # ALL hooks the target into the default build; a custom target declares
        # no outputs, so it is considered out of date and runs on every build.
        # NOTE(review): the script runs with the source directory as its working
        # directory, so anything it writes presumably lands in the source tree --
        # confirm this is intended for out-of-tree and subproject builds.
        add_custom_target(ufsecp_source_graph_refresh ALL
            COMMAND "${Python3_EXECUTABLE}" "${UFSECP_SOURCE_GRAPH_TOOL}" build -i
            WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
            COMMENT "Refreshing UltrafastSecp256k1 source graph incrementally"
            USES_TERMINAL
            VERBATIM
        )
    else()
        # Best-effort: a missing interpreter or script is reported, not fatal.
        message(STATUS "secp256k1-fast: source graph refresh disabled (missing Python3 interpreter or source_graph.py)")
    endif()
endif()

# -- CPack packaging ---------------------------------------------------------
set(CPACK_PACKAGE_NAME "UltrafastSecp256k1")
set(CPACK_PACKAGE_VERSION "${PROJECT_VERSION}")
set(CPACK_PACKAGE_VENDOR "shrec")

View File

@ -0,0 +1,229 @@
{
"metadata": {
"cpu": "Intel(R) Core(TM) i5-14400F",
"compiler": "GCC 14.2.0",
"arch": "x86-64",
"timer": "RDTSCP",
"tsc_ghz": 2.496,
"passes": 11,
"warmup": 500,
"pool_size": 64
},
"results": [
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_mul", "ns": 10.78},
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_sqr", "ns": 10.06},
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_inv", "ns": 645.81},
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_add", "ns": 3.92},
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_sub", "ns": 4.18},
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_negate", "ns": 5.66},
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_from_bytes (32B)", "ns": 2.80},
{"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_mul", "ns": 19.96},
{"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_inv", "ns": 859.65},
{"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_add", "ns": 4.14},
{"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_negate", "ns": 2.35},
{"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_from_bytes (32B)", "ns": 2.56},
{"section": "POINT ARITHMETIC (Ultra)", "name": "pubkey_create (k*G)", "ns": 4750.01},
{"section": "POINT ARITHMETIC (Ultra)", "name": "scalar_mul (k*P)", "ns": 19404.73},
{"section": "POINT ARITHMETIC (Ultra)", "name": "scalar_mul_with_plan", "ns": 16596.88},
{"section": "POINT ARITHMETIC (Ultra)", "name": "dual_mul (a*G + b*P)", "ns": 18738.36},
{"section": "POINT ARITHMETIC (Ultra)", "name": "point_add (affine+affine)", "ns": 761.58},
{"section": "POINT ARITHMETIC (Ultra)", "name": "point_add (J+A mixed)", "ns": 118.54},
{"section": "POINT ARITHMETIC (Ultra)", "name": "point_dbl", "ns": 67.58},
{"section": "POINT ARITHMETIC (Ultra)", "name": "normalize (J->affine)", "ns": 2.63},
{"section": "POINT ARITHMETIC (Ultra)", "name": "batch_normalize /pt (N=64)", "ns": 8.15},
{"section": "POINT ARITHMETIC (Ultra)", "name": "next_inplace (+=G)", "ns": 132.52},
{"section": "POINT ARITHMETIC (Ultra)", "name": "KPlan::from_scalar(w=4)", "ns": 1103.67},
{"section": "POINT SERIALIZATION (Ultra)", "name": "to_compressed (33B)", "ns": 7.19},
{"section": "POINT SERIALIZATION (Ultra)", "name": "to_uncompressed (65B)", "ns": 6.98},
{"section": "POINT SERIALIZATION (Ultra)", "name": "x_only_bytes (32B)", "ns": 3.05},
{"section": "POINT SERIALIZATION (Ultra)", "name": "x_bytes_and_parity", "ns": 4.15},
{"section": "POINT SERIALIZATION (Ultra)", "name": "has_even_y", "ns": 1.74},
{"section": "POINT SERIALIZATION (Ultra)", "name": "batch_to_compressed /pt (N=64)", "ns": 2.03},
{"section": "POINT SERIALIZATION (Ultra)", "name": "batch_x_only_bytes /pt (N=64)", "ns": 1.71},
{"section": "POINT SERIALIZATION (Ultra)", "name": "msm /pt (N=128)", "ns": 6130.30},
{"section": "POINT SERIALIZATION (Ultra)", "name": "pippenger_msm /pt (N=128)", "ns": 6158.43},
{"section": "POINT SERIALIZATION (Ultra)", "name": "precompute_g_multiples /pt (N=64)", "ns": 248.30},
{"section": "POINT SERIALIZATION (Ultra)", "name": "precompute_point_multiples /pt (N=64)", "ns": 240.09},
{"section": "ECDSA -- Ultra FAST", "name": "ecdsa_sign", "ns": 6450.90},
{"section": "ECDSA -- Ultra FAST", "name": "ecdsa_sign_verified", "ns": 37580.34},
{"section": "ECDSA -- Ultra FAST", "name": "ecdsa_verify", "ns": 20846.59},
{"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_keypair_create", "ns": 5405.04},
{"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_sign", "ns": 5295.75},
{"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_sign_verified", "ns": 27132.24},
{"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_verify (cached xonly)", "ns": 20279.62},
{"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_verify (raw bytes)", "ns": 21640.76},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::from_bytes (32B->scalar)", "ns": 2.56},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::inverse (safegcd)", "ns": 849.50},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::mul", "ns": 19.74},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::negate", "ns": 2.40},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "glv_decompose", "ns": 74.70},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Point::dbl (jac52_double)", "ns": 57.55},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Point::add (J+A mixed)", "ns": 121.40},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "dual_scalar_mul_gen_point", "ns": 19001.51},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::from_4x64_limbs", "ns": 1.39},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::mul (52-bit)", "ns": 15.76},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::sqr (52-bit)", "ns": 13.50},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::inverse_safegcd", "ns": 725.37},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::inverse (Fermat)", "ns": 3828.50},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::add (52-bit)", "ns": 0.53},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::negate (52-bit)", "ns": 0.49},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::normalize", "ns": 3.50},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "SHA256 (BIP0340/challenge)", "ns": 107.37},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "tagged_hash (recompute tag)", "ns": 196.91},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "cached_tagged_hash (midstate)", "ns": 70.00},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "lift_x (4x64 sqrt)", "ns": 5094.45},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "lift_x (FE52 sqrt)", "ns": 3347.34},
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE::parse_bytes_strict", "ns": 3.41},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=4)", "ns": 78874.28},
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=4)", "ns": 19718.57},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=16)", "ns": 325401.49},
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=16)", "ns": 20337.59},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=64)", "ns": 1329107.06},
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=64)", "ns": 20767.30},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=192)", "ns": 3283487.41},
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=192)", "ns": 17101.50},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(repeated,N=192)", "ns": 2884848.94},
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig repeated (N=192)", "ns": 15025.25},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_seed_only(N=192)", "ns": 16218.78},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_weights_only(N=192)", "ns": 10063.15},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_R_lift_only(N=192)", "ns": 926909.98},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift_only(N=192)", "ns": 951004.08},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_challenge_only(N=192)", "ns": 16512.13},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_xonly_parse_only(N=192)", "ns": 659796.91},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift+challenge_only(N=192)", "ns": 937008.04},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_lift+challenge(N=192)", "ns": 1977220.42},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_setup_only(N=192)", "ns": 2008198.98},
{"section": "BATCH VERIFICATION (FAST)", "name": " -> setup per-sig (N=192)", "ns": 10459.37},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_seed_only(repeated,N=192)", "ns": 14453.32},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_weights_only(repeated,N=192)", "ns": 8768.24},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_R_lift_only(repeated,N=192)", "ns": 1004852.26},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift_only(repeated,N=192)", "ns": 945079.24},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_challenge_only(repeated,N=192)", "ns": 18516.63},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_xonly_parse_only(repeated,N=192)", "ns": 663956.75},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift+challenge_only(repeated,N=192)", "ns": 953751.95},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_lift+challenge(repeated,N=192)", "ns": 1912494.47},
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_setup_only(repeated,N=192)", "ns": 1908150.13},
{"section": "BATCH VERIFICATION (FAST)", "name": " -> setup repeated per-sig (N=192)", "ns": 9938.28},
{"section": "BATCH VERIFICATION (FAST)", "name": "ecdsa_batch_verify(N=4)", "ns": 76754.13},
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=4)", "ns": 19188.53},
{"section": "BATCH VERIFICATION (FAST)", "name": "ecdsa_batch_verify(N=16)", "ns": 304265.14},
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=16)", "ns": 19016.57},
{"section": "BATCH VERIFICATION (FAST)", "name": "ecdsa_batch_verify(N=64)", "ns": 1230289.00},
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=64)", "ns": 19223.27},
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::scalar_inverse (SafeGCD)", "ns": 1351.46},
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::generator_mul (k*G)", "ns": 9533.74},
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::scalar_mul (k*P)", "ns": 21251.51},
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_dbl", "ns": 70.57},
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_add_complete (11M+6S)", "ns": 203.36},
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_add_mixed_complete (7M+5S)", "ns": 135.48},
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_add_mixed_unified (7M+5S)", "ns": 131.32},
{"section": "CT SIGNING (Ultra CT)", "name": "ct::ecdsa_sign", "ns": 12761.02},
{"section": "CT SIGNING (Ultra CT)", "name": " CT overhead (ECDSA)", "ratio": 1.9782},
{"section": "CT SIGNING (Ultra CT)", "name": "ct::ecdsa_sign_verified", "ns": 43190.57},
{"section": "CT SIGNING (Ultra CT)", "name": "ct::schnorr_sign", "ns": 11070.78},
{"section": "CT SIGNING (Ultra CT)", "name": " CT overhead (Schnorr)", "ratio": 2.0905},
{"section": "CT SIGNING (Ultra CT)", "name": "ct::schnorr_sign_verified", "ns": 33161.61},
{"section": "CT SIGNING (Ultra CT)", "name": "ct::schnorr_keypair_create", "ns": 12088.90},
{"section": "CT SIGNING (Ultra CT)", "name": " CT overhead (keypair)", "ratio": 2.2366},
{"section": "ETHEREUM OPERATIONS", "name": "keccak256 (32B)", "ns": 254.19},
{"section": "ETHEREUM OPERATIONS", "name": "ethereum_address", "ns": 228.43},
{"section": "ETHEREUM OPERATIONS", "name": "eip191_hash", "ns": 225.03},
{"section": "ETHEREUM OPERATIONS", "name": "eth_sign_hash", "ns": 6525.31},
{"section": "ETHEREUM OPERATIONS", "name": "ecdsa_sign_recoverable", "ns": 6598.01},
{"section": "ETHEREUM OPERATIONS", "name": "ecrecover", "ns": 27095.12},
{"section": "ETHEREUM OPERATIONS", "name": "eth_personal_sign", "ns": 6787.62},
{"section": "ETHEREUM OPERATIONS", "name": "ethereum_address_eip55", "ns": 564.81},
{"section": "REAL-WORLD FLOWS", "name": "ecdh_compute (SHA256 shared secret)", "ns": 20215.17},
{"section": "REAL-WORLD FLOWS", "name": "ecdh_compute_raw (x-only shared)", "ns": 20134.59},
{"section": "REAL-WORLD FLOWS", "name": "taproot_output_key (BIP-341 key path)", "ns": 10438.59},
{"section": "REAL-WORLD FLOWS", "name": "taproot_tweak_privkey (BIP-341)", "ns": 11246.91},
{"section": "REAL-WORLD FLOWS", "name": "bip32_master_key (64B seed)", "ns": 933.32},
{"section": "REAL-WORLD FLOWS", "name": "bip32_coin_derive_key (BTC m/84'/0'/0'/0/0)", "ns": 77986.98},
{"section": "REAL-WORLD FLOWS", "name": "coin_address_from_seed (BTC end-to-end)", "ns": 91654.63},
{"section": "REAL-WORLD FLOWS", "name": "coin_address_from_seed (ETH end-to-end)", "ns": 91281.62},
{"section": "REAL-WORLD FLOWS", "name": "silent_payment_create_output", "ns": 24181.18},
{"section": "REAL-WORLD FLOWS", "name": "silent_payment_scan (single output set)", "ns": 34901.09},
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_mul", "ns": 11.61},
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_sqr", "ns": 10.51},
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_inv_var", "ns": 833.17},
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_add", "ns": 6.57},
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_negate", "ns": 6.32},
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_normalize", "ns": 7.41},
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_from_bytes (set_b32)", "ns": 6.97},
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_mul", "ns": 26.42},
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_inverse (CT)", "ns": 1421.11},
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_inverse_var", "ns": 856.24},
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_add", "ns": 5.23},
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_negate", "ns": 7.00},
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_from_bytes (set_b32)", "ns": 5.01},
{"section": "libsecp256k1 (bitcoin-core)", "name": "point_dbl (gej_double_var)", "ns": 78.64},
{"section": "libsecp256k1 (bitcoin-core)", "name": "point_add (gej_add_ge_var)", "ns": 141.13},
{"section": "libsecp256k1 (bitcoin-core)", "name": "ecmult (a*P + b*G, Strauss)", "ns": 21020.33},
{"section": "libsecp256k1 (bitcoin-core)", "name": "ecmult_gen (k*G, comb)", "ns": 9723.23},
{"section": "libsecp256k1 (bitcoin-core)", "name": "generator_mul (ec_pubkey_create)", "ns": 11384.81},
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_mul_P (k*P, tweak_mul)", "ns": 20135.59},
{"section": "libsecp256k1 (bitcoin-core)", "name": "serialize_compressed (33B)", "ns": 17.67},
{"section": "libsecp256k1 (bitcoin-core)", "name": "serialize_uncompressed (65B)", "ns": 22.52},
{"section": "libsecp256k1 (bitcoin-core)", "name": "point_add (pubkey_combine)", "ns": 1774.01},
{"section": "libsecp256k1 (bitcoin-core)", "name": "ecdsa_sign", "ns": 17203.14},
{"section": "libsecp256k1 (bitcoin-core)", "name": "ecdsa_verify", "ns": 22448.31},
{"section": "libsecp256k1 (bitcoin-core)", "name": "schnorr_keypair_create", "ns": 11751.95},
{"section": "libsecp256k1 (bitcoin-core)", "name": "schnorr_sign (BIP-340)", "ns": 13712.35},
{"section": "libsecp256k1 (bitcoin-core)", "name": "schnorr_verify (BIP-340)", "ns": 24529.62},
{"section": "OpenSSL (ECDSA, secp256k1)", "name": "generator_mul (EC_POINT_mul k*G)", "ns": 213014.57},
{"section": "OpenSSL (ECDSA, secp256k1)", "name": "ecdsa_sign (ECDSA_do_sign)", "ns": 222950.90},
{"section": "OpenSSL (ECDSA, secp256k1)", "name": "ecdsa_verify (ECDSA_do_verify)", "ns": 214672.40},
{"section": "FIELD ARITHMETIC", "name": "mul", "ns": 10.78},
{"section": "FIELD ARITHMETIC", "name": "sqr", "ns": 10.06},
{"section": "FIELD ARITHMETIC", "name": "inv", "ns": 645.81},
{"section": "FIELD ARITHMETIC", "name": "add", "ns": 3.92},
{"section": "FIELD ARITHMETIC", "name": "sub", "ns": 4.18},
{"section": "FIELD ARITHMETIC", "name": "negate", "ns": 5.66},
{"section": "FIELD ARITHMETIC", "name": "normalize (FE52)", "ns": 3.50},
{"section": "FIELD ARITHMETIC", "name": "from_bytes (32B)", "ns": 2.80},
{"section": "FIELD ARITHMETIC", "name": "FE52 add (hot path)", "ns": 0.53},
{"section": "FIELD ARITHMETIC", "name": "FE52 neg (hot path)", "ns": 0.49},
{"section": "SCALAR ARITHMETIC", "name": "mul", "ns": 19.96},
{"section": "SCALAR ARITHMETIC", "name": "inv (CT)", "ns": 849.50},
{"section": "SCALAR ARITHMETIC", "name": "inv (var-time)", "ns": 849.50},
{"section": "SCALAR ARITHMETIC", "name": "add", "ns": 4.14},
{"section": "SCALAR ARITHMETIC", "name": "negate", "ns": 2.35},
{"section": "SCALAR ARITHMETIC", "name": "from_bytes (32B)", "ns": 2.56},
{"section": "POINT ARITHMETIC", "name": "dbl (Jacobian)", "ns": 67.58},
{"section": "POINT ARITHMETIC", "name": "add (mixed J+A)", "ns": 118.54},
{"section": "POINT ARITHMETIC", "name": "ecmult (a*P+b*G)", "ns": 18738.36},
{"section": "POINT ARITHMETIC", "name": "ecmult_gen (k*G raw)", "ns": 4750.01},
{"section": "POINT ARITHMETIC", "name": "pubkey_create (API)", "ns": 4750.01},
{"section": "POINT ARITHMETIC", "name": "scalar_mul (k*P)", "ns": 19404.73},
{"section": "POINT ARITHMETIC", "name": "scalar_mul (KPlan)", "ns": 16596.88},
{"section": "POINT ARITHMETIC", "name": "point_add (combine)", "ns": 761.58},
{"section": "SERIALIZATION", "name": "compressed (33B)", "ns": 7.19},
{"section": "SERIALIZATION", "name": "uncompressed (65B)", "ns": 6.98},
{"section": "SIGNING (FAST vs libsecp CT)", "name": "ECDSA Sign", "ns": 6450.90},
{"section": "SIGNING (FAST vs libsecp CT)", "name": "Schnorr Sign", "ns": 5295.75},
{"section": "SIGNING (FAST vs libsecp CT)", "name": "Schnorr Keypair", "ns": 5405.04},
{"section": "VERIFICATION", "name": "ECDSA Verify", "ns": 20846.59},
{"section": "VERIFICATION", "name": "Schnorr Verify (cached)", "ns": 20279.62},
{"section": "VERIFICATION", "name": "Schnorr Verify (raw)", "ns": 21640.76},
{"section": "CT-vs-CT (fair signing)", "name": "ECDSA Sign", "ns": 12761.02},
{"section": "CT-vs-CT (fair signing)", "name": "Schnorr Sign", "ns": 11070.78},
{"section": "CT-vs-CT (fair signing)", "name": "ECDSA Verify", "ns": 20846.59},
{"section": "CT-vs-CT (fair signing)", "name": "Schnorr Verify", "ns": 21640.76},
{"section": "ETHEREUM / RECOVERY", "name": "sign_recoverable", "ns": 6598.01},
{"section": "ETHEREUM / RECOVERY", "name": "ecrecover", "ns": 27095.12},
{"section": "ETHEREUM / RECOVERY", "name": "eth_sign_hash", "ns": 6525.31},
{"section": "ETHEREUM / RECOVERY", "name": "eth_personal_sign", "ns": 6787.62},
{"section": "FAST path (Ultra FAST vs OpenSSL)", "name": "Generator * k", "ratio": 44.8451},
{"section": "FAST path (Ultra FAST vs OpenSSL)", "name": "ECDSA Sign", "ratio": 34.5612},
{"section": "FAST path (Ultra FAST vs OpenSSL)", "name": "ECDSA Verify", "ratio": 10.2977},
{"section": "CT path (Ultra CT vs OpenSSL)", "name": "ECDSA Sign (CT vs CT)", "ratio": 17.4712},
{"section": "CT path (Ultra CT vs OpenSSL)", "name": "ECDSA Verify", "ratio": 10.2977},
{"section": "ZK Proofs & Commitments", "name": "Pedersen commit", "ns": 30575.55},
{"section": "ZK Proofs & Commitments", "name": "Knowledge prove (sigma)", "ns": 20371.57},
{"section": "ZK Proofs & Commitments", "name": "Knowledge verify", "ns": 21392.29},
{"section": "ZK Proofs & Commitments", "name": "DLEQ prove", "ns": 44028.64},
{"section": "ZK Proofs & Commitments", "name": "DLEQ verify", "ns": 57020.44},
{"section": "ZK Proofs & Commitments", "name": "Bulletproof range_prove (64b)", "ns": 13055460.41},
{"section": "ZK Proofs & Commitments", "name": "Bulletproof range_verify (64b)", "ns": 1259727.10}
]
}

View File

@ -0,0 +1,538 @@
CPU frequency warmup (3000 ms heavy load)... stable at 2.496 GHz (569198 k*G ops)
Running integrity check... OK
======================================================================
UltrafastSecp256k1 -- Unified Apple-to-Apple Benchmark
======================================================================
CPU: Intel(R) Core(TM) i5-14400F
TSC freq: 2.496 GHz
Core: 1 (pinned to core 0, priority elevated)
Compiler: GCC 14.2.0
Arch: x86-64
Ultra: UltrafastSecp256k1
libsecp: bitcoin-core libsecp256k1 v0.7.x
Harness: 3s CPU ramp-up, 500 warmup/op, 11 passes, IQR outlier removal, median
Timer: RDTSCP
Pool: 64 independent key/msg/sig sets
NOTE: Both Ultra and libsecp use IDENTICAL harness
+----------------------------------------------+------------+
| FIELD ARITHMETIC (Ultra) | ns/op |
+----------------------------------------------+------------+
| field_mul | 10.8 |
| field_sqr | 10.1 |
| field_inv | 645.8 |
| field_add | 3.9 |
| field_sub | 4.2 |
| field_negate | 5.7 |
| field_from_bytes (32B) | 2.8 |
+----------------------------------------------+------------+
+----------------------------------------------+------------+
| SCALAR ARITHMETIC (Ultra) | ns/op |
+----------------------------------------------+------------+
| scalar_mul | 20.0 |
| scalar_inv | 859.7 |
| scalar_add | 4.1 |
| scalar_negate | 2.3 |
| scalar_from_bytes (32B) | 2.6 |
+----------------------------------------------+------------+
+----------------------------------------------+------------+
| POINT ARITHMETIC (Ultra) | ns/op |
+----------------------------------------------+------------+
| pubkey_create (k*G) | 4750.0 |
| scalar_mul (k*P) | 19404.7 |
| scalar_mul_with_plan | 16596.9 |
| dual_mul (a*G + b*P) | 18738.4 |
| point_add (affine+affine) | 761.6 |
| point_add (J+A mixed) | 118.5 |
| point_dbl | 67.6 |
| normalize (J->affine) | 2.6 |
| batch_normalize /pt (N=64) | 8.2 |
| next_inplace (+=G) | 132.5 |
| KPlan::from_scalar(w=4) | 1103.7 |
+----------------------------------------------+------------+
+----------------------------------------------+------------+
| POINT SERIALIZATION (Ultra) | ns/op |
+----------------------------------------------+------------+
| to_compressed (33B) | 7.2 |
| to_uncompressed (65B) | 7.0 |
| x_only_bytes (32B) | 3.1 |
| x_bytes_and_parity | 4.1 |
| has_even_y | 1.7 |
| batch_to_compressed /pt (N=64) | 2.0 |
| batch_x_only_bytes /pt (N=64) | 1.7 |
| msm /pt (N=128) | 6130.3 |
| pippenger_msm /pt (N=128) | 6158.4 |
| precompute_g_multiples /pt (N=64) | 248.3 |
| precompute_point_multiples /pt (N=64) | 240.1 |
+----------------------------------------------+------------+
+----------------------------------------------+------------+
| ECDSA -- Ultra FAST | ns/op |
+----------------------------------------------+------------+
| ecdsa_sign | 6450.9 |
| ecdsa_sign_verified | 37580.3 |
| ecdsa_verify | 20846.6 |
+----------------------------------------------+------------+
+----------------------------------------------+------------+
| SCHNORR / BIP-340 -- Ultra FAST | ns/op |
+----------------------------------------------+------------+
| schnorr_keypair_create | 5405.0 |
| schnorr_sign | 5295.8 |
| schnorr_sign_verified | 27132.2 |
| schnorr_verify (cached xonly) | 20279.6 |
| schnorr_verify (raw bytes) | 21640.8 |
+----------------------------------------------+------------+
+----------------------------------------------+------------+
| MICRO-DIAGNOSTICS (sub-ops) | ns/op |
+----------------------------------------------+------------+
| Scalar::from_bytes (32B->scalar) | 2.6 |
| Scalar::inverse (safegcd) | 849.5 |
| Scalar::mul | 19.7 |
| Scalar::negate | 2.4 |
| glv_decompose | 74.7 |
| Point::dbl (jac52_double) | 57.6 |
| Point::add (J+A mixed) | 121.4 |
| dual_scalar_mul_gen_point | 19001.5 |
| FE52::from_4x64_limbs | 1.4 |
| FE52::mul (52-bit) | 15.8 |
| FE52::sqr (52-bit) | 13.5 |
| FE52::inverse_safegcd | 725.4 |
| FE52::inverse (Fermat) | 3828.5 |
| -> SafeGCD/Fermat speedup | 5.28x |
| FE52::add (52-bit) | 0.5 |
| FE52::negate (52-bit) | 0.5 |
| FE52::normalize | 3.5 |
| SHA256 (BIP0340/challenge) | 107.4 |
| tagged_hash (recompute tag) | 196.9 |
| cached_tagged_hash (midstate) | 70.0 |
| -> midstate speedup | 2.81x |
| lift_x (4x64 sqrt) | 5094.4 |
| lift_x (FE52 sqrt) | 3347.3 |
| -> FE52/4x64 speedup | 1.52x |
| FE::parse_bytes_strict | 3.4 |
+----------------------------------------------+------------+
---- VERIFY COST DECOMPOSITION ----
ECDSA verify breakdown (estimated):
scalar_inv (1x): 849.5 ns
scalar_mul (2x): 39.5 ns
dual_scalar_mul: 19001.5 ns
from_bytes + overhead: 2.6 ns
--------------------------------
SUM (sub-ops): 19893.0 ns
MEASURED ecdsa_verify: 20846.6 ns
UNEXPLAINED gap: 953.6 ns (4.6%)
Schnorr verify breakdown (estimated):
SHA256 challenge: (included in total)
scalar_negate: 2.4 ns
dual_scalar_mul: 19001.5 ns
lift_x (sqrt): (included in total)
from_bytes: 2.6 ns
--------------------------------
SUM (sub-ops, partial): 19006.5 ns
MEASURED schnorr_verify: 20279.6 ns
UNEXPLAINED gap: 1273.2 ns (SHA256+lift_x+Z-check)
Verify vs libsecp breakdown:
Our dual_mul: 19001.5 ns
Our scalar_inv: 849.5 ns
Our dual+inv: 19851.0 ns
Total ECDSA verify: 20846.6 ns
Overhead (verify - d+i): 995.6 ns
---- SIGN COST DECOMPOSITION (FAST path) ----
ecdsa_sign = RFC6979 + k*G + field_inv + scalar_inv + scalar_muls
k*G (generator_mul): 4750.0 ns
field_inv (R.x): 645.8 ns
scalar_inv (k^-1): 849.5 ns
scalar_mul (2x): 39.5 ns
--------------------------------
Core signing (no RFC6979): 6284.8 ns
MEASURED ecdsa_sign: 6450.9 ns
RFC6979 overhead: 166.1 ns (2.6%)
MEASURED ecdsa_sign_verified:37580.3 ns
sign-then-verify overhead: 31129.4 ns (pubkey + verify)
+----------------------------------------------+------------+
| BATCH VERIFICATION (FAST) | ns/op |
+----------------------------------------------+------------+
| schnorr_batch_verify(N=4) | 78874.3 |
| -> per-sig amortized (N=4) | 19718.6 |
| -> speedup vs individual | 1.03x |
| schnorr_batch_verify(N=16) | 325401.5 |
| -> per-sig amortized (N=16) | 20337.6 |
| -> speedup vs individual | 1.00x |
| schnorr_batch_verify(N=64) | 1329107.1 |
| -> per-sig amortized (N=64) | 20767.3 |
| -> speedup vs individual | 0.98x |
| schnorr_batch_verify(N=192) | 3283487.4 |
| -> per-sig amortized (N=192) | 17101.5 |
| -> speedup vs individual | 1.19x |
| schnorr_batch_verify(repeated,N=192) | 2884848.9 |
| -> per-sig repeated (N=192) | 15025.3 |
| -> repeated speedup vs individual | 1.35x |
| schnorr_batch_seed_only(N=192) | 16218.8 |
| schnorr_batch_weights_only(N=192) | 10063.2 |
| schnorr_batch_R_lift_only(N=192) | 926910.0 |
| schnorr_batch_P_lift_only(N=192) | 951004.1 |
| schnorr_batch_challenge_only(N=192) | 16512.1 |
| schnorr_batch_xonly_parse_only(N=192) | 659796.9 |
| schnorr_batch_P_lift+challenge_only(N=192) | 937008.0 |
| schnorr_batch_lift+challenge(N=192) | 1977220.4 |
| schnorr_batch_setup_only(N=192) | 2008199.0 |
| -> setup per-sig (N=192) | 10459.4 |
| -> setup share of full (N=192) | 60.34% |
| schnorr_batch_seed_only(repeated,N=192) | 14453.3 |
| schnorr_batch_weights_only(repeated,N=192) | 8768.2 |
| schnorr_batch_R_lift_only(repeated,N=192) | 1004852.3 |
| schnorr_batch_P_lift_only(repeated,N=192) | 945079.2 |
| schnorr_batch_challenge_only(repeated,N=192) | 18516.6 |
| schnorr_batch_xonly_parse_only(repeated,N=192) | 663956.8 |
| schnorr_batch_P_lift+challenge_only(repeated,N=192) | 953751.9 |
| schnorr_batch_lift+challenge(repeated,N=192) | 1912494.5 |
| schnorr_batch_setup_only(repeated,N=192) | 1908150.1 |
| -> setup repeated per-sig (N=192) | 9938.3 |
| -> setup share repeated (N=192) | 65.68% |
| | |
| ecdsa_batch_verify(N=4) | 76754.1 |
| -> per-sig amortized (N=4) | 19188.5 |
| -> speedup vs individual | 1.09x |
| ecdsa_batch_verify(N=16) | 304265.1 |
| -> per-sig amortized (N=16) | 19016.6 |
| -> speedup vs individual | 1.10x |
| ecdsa_batch_verify(N=64) | 1230289.0 |
| -> per-sig amortized (N=64) | 19223.3 |
| -> speedup vs individual | 1.08x |
+----------------------------------------------+------------+
+----------------------------------------------+------------+
| CT POINT ARITHMETIC (sub-ops) | ns/op |
+----------------------------------------------+------------+
| ct::scalar_inverse (SafeGCD) | 1351.5 |
| ct::generator_mul (k*G) | 9533.7 |
| ct::scalar_mul (k*P) | 21251.5 |
| ct::point_dbl | 70.6 |
| ct::point_add_complete (11M+6S) | 203.4 |
| ct::point_add_mixed_complete (7M+5S) | 135.5 |
| ct::point_add_mixed_unified (7M+5S) | 131.3 |
+----------------------------------------------+------------+
---- CT vs FAST point ops ----
FAST Point::dbl 57.6 ns
FAST Point::add 121.4 ns
FAST pubkey_create (k*G) 4750.0 ns
FAST scalar_mul (k*P) 19404.7 ns
CT generator_mul (k*G) 9533.7 ns
CT scalar_mul (k*P) 21251.5 ns
CT/FAST ratio (k*G): 2.01x overhead
CT/FAST ratio (k*P): 1.10x overhead
+----------------------------------------------+------------+
| CT SIGNING (Ultra CT) | ns/op |
+----------------------------------------------+------------+
| ct::ecdsa_sign | 12761.0 |
| CT overhead (ECDSA) | 1.98x |
| ct::ecdsa_sign_verified | 43190.6 |
| ct::schnorr_sign | 11070.8 |
| CT overhead (Schnorr) | 2.09x |
| ct::schnorr_sign_verified | 33161.6 |
| ct::schnorr_keypair_create | 12088.9 |
| CT overhead (keypair) | 2.24x |
+----------------------------------------------+------------+
---- CT ECDSA SIGN DECOMPOSITION ----
ct::generator_mul (R=k*G): 9533.7 ns
ct::scalar_inverse (k^-1): 1351.5 ns
field_inv (R.x affine): 645.8 ns
scalar_mul (2x): 39.5 ns
--------------------------------
SUM (sub-ops): 11570.5 ns
MEASURED ct::ecdsa_sign: 12761.0 ns
UNEXPLAINED gap: 1190.5 ns (9.3%, RFC6979+checks)
---- CT SCHNORR SIGN DECOMPOSITION ----
ct::generator_mul (R=k*G): 9533.7 ns
SHA256 (tag+nonce+msg): (included in total)
scalar_mul + negate: 22.1 ns
--------------------------------
SUM (sub-ops, partial): 9555.9 ns
MEASURED ct::schnorr_sign: 11070.8 ns
UNEXPLAINED gap: 1514.9 ns (SHA256+aux+serialize)
---- CT vs libsecp (true apples-to-apples) ----
CT ecdsa_sign 12761.0 ns
lib ecdsa_sign (measured after libsecp section)
CT schnorr_sign 11070.8 ns
lib schnorr_sign (measured after libsecp section)
+----------------------------------------------+------------+
| ETHEREUM OPERATIONS | ns/op |
+----------------------------------------------+------------+
| keccak256 (32B) | 254.2 |
| ethereum_address | 228.4 |
| eip191_hash | 225.0 |
| eth_sign_hash | 6525.3 |
| ecdsa_sign_recoverable | 6598.0 |
| ecrecover | 27095.1 |
| eth_personal_sign | 6787.6 |
| ethereum_address_eip55 | 564.8 |
+----------------------------------------------+------------+
+----------------------------------------------+------------+
| REAL-WORLD FLOWS | ns/op |
+----------------------------------------------+------------+
| ecdh_compute (SHA256 shared secret) | 20215.2 |
| ecdh_compute_raw (x-only shared) | 20134.6 |
| taproot_output_key (BIP-341 key path) | 10438.6 |
| taproot_tweak_privkey (BIP-341) | 11246.9 |
| bip32_master_key (64B seed) | 933.3 |
| bip32_coin_derive_key (BTC m/84'/0'/0'/0/0) | 77987.0 |
| coin_address_from_seed (BTC end-to-end) | 91654.6 |
| coin_address_from_seed (ETH end-to-end) | 91281.6 |
| silent_payment_create_output | 24181.2 |
| silent_payment_scan (single output set) | 34901.1 |
+----------------------------------------------+------------+
Running libsecp256k1 benchmark (same harness: RDTSCP, 3s ramp-up, 500 warmup, 11 passes, IQR)...
+----------------------------------------------+------------+
| libsecp256k1 (bitcoin-core) | ns/op |
+----------------------------------------------+------------+
| field_mul | 11.6 |
| field_sqr | 10.5 |
| field_inv_var | 833.2 |
| field_add | 6.6 |
| field_negate | 6.3 |
| field_normalize | 7.4 |
| field_from_bytes (set_b32) | 7.0 |
| scalar_mul | 26.4 |
| scalar_inverse (CT) | 1421.1 |
| scalar_inverse_var | 856.2 |
| scalar_add | 5.2 |
| scalar_negate | 7.0 |
| scalar_from_bytes (set_b32) | 5.0 |
| point_dbl (gej_double_var) | 78.6 |
| point_add (gej_add_ge_var) | 141.1 |
| ecmult (a*P + b*G, Strauss) | 21020.3 |
| ecmult_gen (k*G, comb) | 9723.2 |
| generator_mul (ec_pubkey_create) | 11384.8 |
| scalar_mul_P (k*P, tweak_mul) | 20135.6 |
| serialize_compressed (33B) | 17.7 |
| serialize_uncompressed (65B) | 22.5 |
| point_add (pubkey_combine) | 1774.0 |
| ecdsa_sign | 17203.1 |
| ecdsa_verify | 22448.3 |
| schnorr_keypair_create | 11751.9 |
| schnorr_sign (BIP-340) | 13712.3 |
| schnorr_verify (BIP-340) | 24529.6 |
+----------------------------------------------+------------+
Running OpenSSL benchmark (OpenSSL 3.0.13 30 Jan 2024, same harness)...
+----------------------------------------------+------------+
| OpenSSL (ECDSA, secp256k1) | ns/op |
+----------------------------------------------+------------+
| generator_mul (EC_POINT_mul k*G) | 213014.6 |
| ecdsa_sign (ECDSA_do_sign) | 222950.9 |
| ecdsa_verify (ECDSA_do_verify) | 214672.4 |
+----------------------------------------------+------------+
(OpenSSL has no BIP-340 Schnorr -- ECDSA-only comparison)
======================================================================
HEAD-TO-HEAD: UltrafastSecp256k1 vs libsecp256k1
(ratio > 1.0 = Ultra wins, < 1.0 = libsecp wins)
======================================================================
+------------------------------------+----------+----------+-----------+
| FIELD ARITHMETIC | Ultra ns | libsecp | ratio |
+------------------------------------+----------+----------+-----------+
| mul | 10.8 | 11.6 | 1.08x |
| sqr | 10.1 | 10.5 | 1.04x |
| inv | 645.8 | 833.2 | 1.29x |
| add | 3.9 | 6.6 | 1.67x |
| sub | 4.2 | --- | --- |
| negate | 5.7 | 6.3 | 1.12x |
| normalize (FE52) | 3.5 | 7.4 | 2.12x |
| from_bytes (32B) | 2.8 | 7.0 | 2.49x |
| FE52 add (hot path) | 0.5 | 6.6 | 12.48x |
| FE52 neg (hot path) | 0.5 | 6.3 | 12.93x |
+------------------------------------+----------+----------+-----------+
+------------------------------------+----------+----------+-----------+
| SCALAR ARITHMETIC | Ultra ns | libsecp | ratio |
+------------------------------------+----------+----------+-----------+
| mul | 20.0 | 26.4 | 1.32x |
| inv (CT) | 849.5 | 1421.1 | 1.67x |
| inv (var-time) | 849.5 | 856.2 | 1.01x |
| add | 4.1 | 5.2 | 1.26x |
| negate | 2.3 | 7.0 | 2.98x |
| from_bytes (32B) | 2.6 | 5.0 | 1.96x |
+------------------------------------+----------+----------+-----------+
+------------------------------------+----------+----------+-----------+
| POINT ARITHMETIC | Ultra ns | libsecp | ratio |
+------------------------------------+----------+----------+-----------+
| dbl (Jacobian) | 67.6 | 78.6 | 1.16x |
| add (mixed J+A) | 118.5 | 141.1 | 1.19x |
| ecmult (a*P+b*G) | 18738.4 | 21020.3 | 1.12x |
| ecmult_gen (k*G raw) | 4750.0 | 9723.2 | 2.05x |
| pubkey_create (API) | 4750.0 | 11384.8 | 2.40x |
| scalar_mul (k*P) | 19404.7 | 20135.6 | 1.04x |
| scalar_mul (KPlan) | 16596.9 | 20135.6 | 1.21x |
| point_add (combine) | 761.6 | 1774.0 | 2.33x |
+------------------------------------+----------+----------+-----------+
+------------------------------------+----------+----------+-----------+
| SERIALIZATION | Ultra ns | libsecp | ratio |
+------------------------------------+----------+----------+-----------+
| compressed (33B) | 7.2 | 17.7 | 2.46x |
| uncompressed (65B) | 7.0 | 22.5 | 3.23x |
+------------------------------------+----------+----------+-----------+
+------------------------------------+----------+----------+-----------+
| SIGNING (FAST vs libsecp CT) | Ultra ns | libsecp | ratio |
+------------------------------------+----------+----------+-----------+
| ECDSA Sign | 6450.9 | 17203.1 | 2.67x |
| Schnorr Sign | 5295.8 | 13712.3 | 2.59x |
| Schnorr Keypair | 5405.0 | 11751.9 | 2.17x |
+------------------------------------+----------+----------+-----------+
+------------------------------------+----------+----------+-----------+
| VERIFICATION | Ultra ns | libsecp | ratio |
+------------------------------------+----------+----------+-----------+
| ECDSA Verify | 20846.6 | 22448.3 | 1.08x |
| Schnorr Verify (cached) | 20279.6 | 24529.6 | 1.21x |
| Schnorr Verify (raw) | 21640.8 | 24529.6 | 1.13x |
+------------------------------------+----------+----------+-----------+
+------------------------------------+----------+----------+-----------+
| CT-vs-CT (fair signing) | Ultra ns | libsecp | ratio |
+------------------------------------+----------+----------+-----------+
| ECDSA Sign | 12761.0 | 17203.1 | 1.35x |
| Schnorr Sign | 11070.8 | 13712.3 | 1.24x |
| ECDSA Verify | 20846.6 | 22448.3 | 1.08x |
| Schnorr Verify | 21640.8 | 24529.6 | 1.13x |
+------------------------------------+----------+----------+-----------+
+------------------------------------+----------+----------+-----------+
| ETHEREUM / RECOVERY | Ultra ns | libsecp | ratio |
+------------------------------------+----------+----------+-----------+
| sign_recoverable | 6598.0 | 15920.0 | 2.41x |
| ecrecover | 27095.1 | 26314.1 | 0.97x |
| eth_sign_hash | 6525.3 | 15920.0 | 2.44x |
| eth_personal_sign | 6787.6 | 15920.0 | 2.35x |
+------------------------------------+----------+----------+-----------+
======================================================================
APPLE-TO-APPLE: UltrafastSecp256k1 / OpenSSL
(ratio > 1.0 = Ultra wins, < 1.0 = OpenSSL wins)
======================================================================
+----------------------------------------------+------------+
| FAST path (Ultra FAST vs OpenSSL) | ratio |
+----------------------------------------------+------------+
| Generator * k | 44.85x |
| ECDSA Sign | 34.56x |
| ECDSA Verify | 10.30x |
+----------------------------------------------+------------+
+----------------------------------------------+------------+
| CT path (Ultra CT vs OpenSSL) | ratio |
+----------------------------------------------+------------+
| ECDSA Sign (CT vs CT) | 17.47x |
| ECDSA Verify | 10.30x |
+----------------------------------------------+------------+
+----------------------------------------------+------------+
| ZK Proofs & Commitments | ns/op |
+----------------------------------------------+------------+
| Pedersen commit | 30575.5 |
| Knowledge prove (sigma) | 20371.6 |
| Knowledge verify | 21392.3 |
| DLEQ prove | 44028.6 |
| DLEQ verify | 57020.4 |
| Bulletproof range_prove (64b) | 13055460.4 |
| Bulletproof range_verify (64b) | 1259727.1 |
+----------------------------------------------+------------+
======================================================================
THROUGHPUT SUMMARY (1 core, pinned)
======================================================================
--- Ultra FAST ---
ECDSA sign 6.45 us -> 155.0 k op/s
ECDSA verify 20.85 us -> 48.0 k op/s
Schnorr sign 5.30 us -> 188.8 k op/s
Schnorr verify (cached) 20.28 us -> 49.3 k op/s
Schnorr verify (raw) 21.64 us -> 46.2 k op/s
pubkey_create (k*G) 4.75 us -> 210.5 k op/s
ECDH 20.22 us -> 49.5 k op/s
Taproot output key 10.44 us -> 95.8 k op/s
BIP32 derive (BTC) 77.99 us -> 12.8 k op/s
Silent Payment sender 24.18 us -> 41.4 k op/s
Silent Payment scan 34.90 us -> 28.7 k op/s
--- Ultra CT ---
CT ECDSA sign 12.76 us -> 78.4 k op/s
CT Schnorr sign 11.07 us -> 90.3 k op/s
--- Ultra ZK ---
Pedersen commit 30.58 us -> 32.7 k op/s
Knowledge prove 20.37 us -> 49.1 k op/s
Knowledge verify 21.39 us -> 46.7 k op/s
DLEQ prove 44.03 us -> 22.7 k op/s
DLEQ verify 57.02 us -> 17.5 k op/s
Bulletproof range_prove 13055.46 us -> 77 op/s
Bulletproof range_verify 1259.73 us -> 794 op/s
--- libsecp256k1 ---
field_mul 0.01 us -> 86.16 M op/s
field_sqr 0.01 us -> 95.12 M op/s
field_inv_var 0.83 us -> 1.20 M op/s
scalar_mul 0.03 us -> 37.85 M op/s
scalar_inverse (CT) 1.42 us -> 703.7 k op/s
scalar_inverse_var 0.86 us -> 1.17 M op/s
point_dbl 0.08 us -> 12.72 M op/s
point_add (mixed) 0.14 us -> 7.09 M op/s
ecmult (a*P+b*G) 21.02 us -> 47.6 k op/s
ecmult_gen (k*G raw) 9.72 us -> 102.8 k op/s
generator_mul (API) 11.38 us -> 87.8 k op/s
scalar_mul_P (k*P) 20.14 us -> 49.7 k op/s
ECDSA sign 17.20 us -> 58.1 k op/s
ECDSA verify 22.45 us -> 44.5 k op/s
Schnorr sign 13.71 us -> 72.9 k op/s
Schnorr verify 24.53 us -> 40.8 k op/s
--- OpenSSL ---
ECDSA sign 222.95 us -> 4.5 k op/s
ECDSA verify 214.67 us -> 4.7 k op/s
generator_mul (k*G) 213.01 us -> 4.7 k op/s
======================================================================
BITCOIN BLOCK VALIDATION ESTIMATES (1 core)
======================================================================
Pre-Taproot block (~3000 ECDSA verify):
Wall time: 62.5 ms
Blocks/sec: 16.0
Taproot block (~2000 Schnorr + ~1000 ECDSA):
Wall time: 64.1 ms
Blocks/sec: 15.6
TX throughput (1 core):
ECDSA: 47969 tx/sec
Schnorr: 46209 tx/sec
======================================================================
Intel(R) Core(TM) i5-14400F | 1 core pinned | GCC 14.2.0
UltrafastSecp256k1 vs libsecp256k1 vs OpenSSL -- Unified Benchmark
======================================================================
JSON report written to: /tmp/bench_today.json

View File

@ -9,10 +9,11 @@
// ## Acceleration tiers (runtime-detected):
//
// Tier 0: SCALAR -- Portable C++ (baseline, always available)
// Tier 1: SHA-NI -- Intel SHA Extensions (single-message HW accel, ~3-5x)
// Tier 2: AVX2 -- 4-way multi-buffer SHA-256 (interleaved, ~8-12x)
// Tier 1: ARM SHA2 -- ARMv8 SHA-256 instructions (single-message HW accel)
// Tier 2: SHA-NI -- Intel SHA Extensions (single-message HW accel, ~3-5x)
// Tier 3: AVX2 -- 4-way multi-buffer SHA-256 (interleaved, ~8-12x)
// + optimized RIPEMD-160 with BMI/BMI2
// Tier 3: AVX-512 -- 8-way multi-buffer SHA-256 (if available, ~16x)
// Tier 4: AVX-512 -- 8-way multi-buffer SHA-256 (if available, ~16x)
//
// ## Hot-path API for search pipeline:
//
@ -48,9 +49,10 @@ namespace secp256k1::hash {
enum class HashTier : int {
SCALAR = 0,
SHA_NI = 1, // Intel SHA Extensions
AVX2 = 2, // 4-way multi-buffer
AVX512 = 3, // 8-way multi-buffer
ARM_SHA2 = 1, // ARMv8 SHA-256 instructions
SHA_NI = 2, // Intel SHA Extensions
AVX2 = 3, // 4-way multi-buffer
AVX512 = 4, // 8-way multi-buffer
};
/// Detect best available hashing tier at runtime.

View File

@ -227,7 +227,7 @@ std::vector<std::size_t> schnorr_batch_identify_invalid_impl(
bool schnorr_batch_verify(const SchnorrBatchEntry* entries, std::size_t n) {
std::vector<SchnorrXonlyPubkey> pubkey_cache;
pubkey_cache.reserve((n < 64) ? n : 64);
pubkey_cache.reserve(n);
auto verify_one = [](const SchnorrBatchEntry& entry) {
return schnorr_verify(entry.pubkey_x, entry.message, entry.signature);

View File

@ -1016,10 +1016,15 @@ limbs4 mul_impl(const limbs4& a, const limbs4& b) {
arm64::field_mul_arm64(out.data(), a.data(), b.data());
return out;
#elif defined(SECP256K1_HAS_ASM) && (defined(__x86_64__) || defined(_M_X64))
// x86-64: Direct assembly call -- zero-copy, no FieldElement wrapper overhead
limbs4 out;
field_mul_full_asm(a.data(), b.data(), out.data());
return out;
// x86-64: Runtime dispatch — assembly requires BMI2+ADX (mulx/adcx/adox).
// Fall back to portable path on CPUs that lack these extensions (e.g. Jasper Lake).
static bool const asm_available = has_bmi2_support() && has_adx_support();
if (asm_available) {
limbs4 out;
field_mul_full_asm(a.data(), b.data(), out.data());
return out;
}
return reduce(mul_wide(a, b));
#elif defined(SECP256K1_NO_ASM)
// Generic no-asm fallback
auto result = reduce(mul_wide(a, b));
@ -1055,10 +1060,15 @@ limbs4 square_impl(const limbs4& a) {
arm64::field_sqr_arm64(out.data(), a.data());
return out;
#elif defined(SECP256K1_HAS_ASM) && (defined(__x86_64__) || defined(_M_X64))
// x86-64: Direct assembly call -- zero-copy, no FieldElement wrapper overhead
limbs4 out;
field_sqr_full_asm(a.data(), out.data());
return out;
// x86-64: Runtime dispatch — assembly requires BMI2+ADX (mulx/adcx/adox).
// Fall back to portable path on CPUs that lack these extensions (e.g. Jasper Lake).
static bool const asm_available = has_bmi2_support() && has_adx_support();
if (asm_available) {
limbs4 out;
field_sqr_full_asm(a.data(), out.data());
return out;
}
return reduce(mul_wide(a, a));
#elif defined(SECP256K1_NO_ASM)
// Generic no-asm fallback
return reduce(mul_wide(a, a));

View File

@ -26,6 +26,11 @@
#include <cstring>
// Architecture detection
#if defined(__aarch64__) || defined(_M_ARM64)
#define SECP256K1_ARM64_TARGET 1
#include <arm_neon.h>
#endif
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#define SECP256K1_X86_TARGET 1
#ifdef _MSC_VER
@ -105,10 +110,19 @@ bool avx512_available() noexcept {
#endif
}
// Reports whether the ARMv8 SHA-256 code path is usable.
// The answer is fixed at build time: it is true only when both
// SECP256K1_ARM64_TARGET and __ARM_FEATURE_SHA2 are defined.
bool arm_sha2_available() noexcept {
#if !defined(SECP256K1_ARM64_TARGET) || !defined(__ARM_FEATURE_SHA2)
    return false;
#else
    return true;
#endif
}
HashTier detect_hash_tier() noexcept {
// SHA-NI usually coexists with AVX2 on modern CPUs (Zen, Ice Lake+)
// SHA-NI single-message is often faster than multi-buffer AVX2 for
// sequential work. For batch, AVX2 multi-buffer wins.
if (arm_sha2_available()) return HashTier::ARM_SHA2;
if (sha_ni_available()) return HashTier::SHA_NI;
if (avx2_available()) return HashTier::AVX2;
return HashTier::SCALAR;
@ -116,6 +130,7 @@ HashTier detect_hash_tier() noexcept {
const char* hash_tier_name(HashTier tier) noexcept {
switch (tier) {
case HashTier::ARM_SHA2: return "ARM SHA2";
case HashTier::SHA_NI: return "SHA-NI";
case HashTier::AVX2: return "AVX2";
case HashTier::AVX512: return "AVX-512";
@ -392,6 +407,90 @@ void hash160_33(const std::uint8_t* pubkey33, std::uint8_t* out20) noexcept {
} // namespace scalar
// ============================================================================
// ARMv8 SHA2 -- Hardware-accelerated SHA-256
// ============================================================================
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
namespace armsha {
void sha256_compress(const std::uint8_t block[64], std::uint32_t state[8]) noexcept {
std::uint32_t w[64];
for (int i = 0; i < 16; ++i) {
w[i] = load_be32(block + static_cast<std::size_t>(i) * 4);
}
for (int i = 16; i < 64; ++i) {
std::uint32_t const s0 = rotr32(w[i - 15], 7) ^ rotr32(w[i - 15], 18) ^ (w[i - 15] >> 3);
std::uint32_t const s1 = rotr32(w[i - 2], 17) ^ rotr32(w[i - 2], 19) ^ (w[i - 2] >> 10);
w[i] = w[i - 16] + s0 + w[i - 7] + s1;
}
uint32x4_t abcd = vld1q_u32(state + 0);
uint32x4_t efgh = vld1q_u32(state + 4);
uint32x4_t const abcd_save = abcd;
uint32x4_t const efgh_save = efgh;
for (int i = 0; i < 64; i += 4) {
uint32x4_t const msg = vld1q_u32(w + i);
uint32x4_t const k = vld1q_u32(SHA256_K + i);
uint32x4_t const wk = vaddq_u32(msg, k);
abcd = vsha256hq_u32(abcd, efgh, wk);
efgh = vsha256h2q_u32(efgh, abcd, wk);
}
abcd = vaddq_u32(abcd, abcd_save);
efgh = vaddq_u32(efgh, efgh_save);
vst1q_u32(state + 0, abcd);
vst1q_u32(state + 4, efgh);
}
// SHA-256 of exactly 33 input bytes, written to out32 as 32 big-endian bytes.
// The message plus padding fits in a single 64-byte block, so one call to
// sha256_compress on the initial state produces the digest.
void sha256_33(const std::uint8_t* pubkey33, std::uint8_t* out32) noexcept {
    // Zero-fill up front so the padding zeros and the high bytes of the
    // length field need no separate stores.
    alignas(16) std::uint8_t block[64] = {};
    std::memcpy(block, pubkey33, 33);
    block[33] = 0x80;  // terminator bit that follows the message
    block[62] = 0x01;  // bit length 33 * 8 = 264 = 0x0108, big-endian
    block[63] = 0x08;

    std::uint32_t state[8];
    std::memcpy(state, SHA256_IV, sizeof(state));
    sha256_compress(block, state);

    std::uint8_t* dst = out32;
    for (std::uint32_t const word : state) {
        store_be32(dst, word);
        dst += 4;
    }
}
// SHA-256 of exactly 32 input bytes, written to out32 as 32 big-endian bytes.
// The message plus padding fits in a single 64-byte block, so one call to
// sha256_compress on the initial state produces the digest.
void sha256_32(const std::uint8_t* in32, std::uint8_t* out32) noexcept {
    // Zero-fill up front so the padding zeros and the remaining bytes of the
    // length field need no separate stores.
    alignas(16) std::uint8_t block[64] = {};
    std::memcpy(block, in32, 32);
    block[32] = 0x80;  // terminator bit that follows the message
    block[62] = 0x01;  // bit length 32 * 8 = 256 = 0x0100, big-endian

    std::uint32_t state[8];
    std::memcpy(state, SHA256_IV, sizeof(state));
    sha256_compress(block, state);

    std::uint8_t* dst = out32;
    for (std::uint32_t const word : state) {
        store_be32(dst, word);
        dst += 4;
    }
}
// Two-stage hash of a 33-byte input: sha256_33 from this namespace produces
// a 32-byte digest, which scalar::ripemd160_32 then reduces into out20.
void hash160_33(const std::uint8_t* pubkey33, std::uint8_t* out20) noexcept {
    std::uint8_t digest[32];
    sha256_33(pubkey33, digest);
    scalar::ripemd160_32(digest, out20);
}
} // namespace armsha
#endif // SECP256K1_ARM64_TARGET && __ARM_FEATURE_SHA2
// ============================================================================
// SHA-NI (Intel SHA Extensions) -- Hardware-accelerated SHA-256
// ============================================================================
@ -616,6 +715,12 @@ std::array<std::uint8_t, 32> sha256(const void* data, std::size_t len) noexcept
}
void sha256_33(const std::uint8_t* pubkey33, std::uint8_t* out32) noexcept {
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
if (arm_sha2_available()) {
armsha::sha256_33(pubkey33, out32);
return;
}
#endif
#ifdef SECP256K1_X86_TARGET
if (sha_ni_available()) {
shani::sha256_33(pubkey33, out32);
@ -626,6 +731,12 @@ void sha256_33(const std::uint8_t* pubkey33, std::uint8_t* out32) noexcept {
}
void sha256_32(const std::uint8_t* in32, std::uint8_t* out32) noexcept {
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
if (arm_sha2_available()) {
armsha::sha256_32(in32, out32);
return;
}
#endif
#ifdef SECP256K1_X86_TARGET
if (sha_ni_available()) {
shani::sha256_32(in32, out32);
@ -714,6 +825,12 @@ std::array<std::uint8_t, 20> hash160(const void* data, std::size_t len) noexcept
}
void hash160_33(const std::uint8_t* pubkey33, std::uint8_t* out20) noexcept {
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
if (arm_sha2_available()) {
armsha::hash160_33(pubkey33, out20);
return;
}
#endif
#ifdef SECP256K1_X86_TARGET
if (sha_ni_available()) {
shani::hash160_33(pubkey33, out20);
@ -775,6 +892,12 @@ namespace secp256k1::detail {
void sha256_compress_dispatch(const std::uint8_t block[64],
std::uint32_t state[8]) noexcept {
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
if (secp256k1::hash::arm_sha2_available()) {
secp256k1::hash::armsha::sha256_compress(block, state);
return;
}
#endif
#ifdef SECP256K1_X86_TARGET
if (secp256k1::hash::sha_ni_available()) {
secp256k1::hash::shani::sha256_compress(block, state);

View File

@ -212,17 +212,25 @@ CPU-computed data transfers directly to GPU via `cudaMemcpy` (little-endian, sam
| Operation | Time |
|-----------|------|
| field_mul (a*b mod p) | 85 ns |
| field_sqr (a^2 mod p) | 66 ns |
| field_add (a+b mod p) | 18 ns |
| field_sub (a-b mod p) | 16 ns |
| field_inverse | 2,621 ns |
| **fast scalar_mul (k*G)** | **7.6 us** |
| fast scalar_mul (k*P) | 77.6 us |
| CT scalar_mul (k*G) | 545 us |
| ECDH (full CT) | 545 us |
| field_mul (a*b mod p) | 68.3 ns |
| field_sqr (a^2 mod p) | 50 ns |
| field_add (a+b mod p) | 8 ns |
| field_inverse | 2 us |
| **fast scalar_mul (k*G)** | **15.27 us** |
| fast scalar_mul (k*P) | 130.33 us |
| ECDSA sign | 22.22 us |
| Schnorr sign (precomputed) | 16.67 us |
| ECDSA verify | 150.13 us |
> Backend: ARM64 inline assembly (MUL/UMULH). ~5x faster than generic C++.
> Backend: ARM64 inline assembly (MUL/UMULH). Latest rerun kept the ARMv8 SHA2 dispatch win for signing-heavy paths on RK3588.
### Latest RTX 5060 Ti Refresh
- CUDA local rerun via `gpu_bench_unified`: `k*G = 129.5 ns` at TPB 256 on batch 65536.
- OpenCL retained revalidation: `kG (batch=65536) = 115.1 ns`, `kP (batch=65536) = 263.1 ns`, `kG (kernel) = 98.7 ns`.
- CUDA TPB 512 was not retained as a default because the same harness produced invalid CT timings while only marginally improving `k*G`.
See `../docs/BENCHMARKS.md` for the current cross-platform benchmark matrix and retained-vs-rejected rerun notes.
---

View File

@ -50,7 +50,7 @@ using CpuKPlan = secp256k1::fast::KPlan;
// ============================================================================
// Configuration
// ============================================================================
static constexpr int BENCH_N = 10000;
static constexpr int BENCH_N = 500000;
static constexpr int BENCH_WARMUP = 3;
static constexpr int BENCH_PASSES = 11;
static constexpr int DETAIL_N = 1000;

View File

@ -82,27 +82,45 @@ build-bench\cpu\bench_unified.exe
### 2. ARM64 Android (Cross-compile via NDK)
Requires:
- Android NDK (tested with r27, Clang 18.0.1)
- Android NDK (tested with r27.2.12479018, Clang 18.0.3)
- Android device/emulator (arm64-v8a)
- ADB
```bash
# Configure with NDK toolchain
cmake -S . -B build-android -G Ninja \
# Configure with the Android CMake entrypoint.
# Use a clean Android-only build dir to avoid root/android cache mismatches.
cmake -S android -B build-android-ndk-arm64 -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_HOME/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_STL=c++_static \
-DANDROID_PLATFORM=android-28
# Build
cmake --build build-android --target bench_hornet -j
cmake --build build-android-ndk-arm64 --target bench_hornet -j
# Deploy and run
adb push build-android/android/test/bench_hornet /data/local/tmp/
adb shell chmod +x /data/local/tmp/bench_hornet
adb shell /data/local/tmp/bench_hornet
adb shell 'mkdir -p /data/local/tmp/ufsecp'
adb push build-android-ndk-arm64/bench_hornet /data/local/tmp/ufsecp/bench_hornet
adb shell 'chmod 755 /data/local/tmp/ufsecp/bench_hornet && /data/local/tmp/ufsecp/bench_hornet'
```
Measured Android rerun retained the ARMv8 SHA2 dispatch path in `cpu/src/hash_accel.cpp`.
On RK3588 big cores this measurably sped up the signing-heavy hot paths while leaving verify
and point arithmetic essentially flat:
| Operation | Baseline | With ARM SHA2 dispatch | Delta |
|-----------|----------|------------------------|-------|
| ECDSA Sign | 25.89 us | 22.22 us | 1.17x faster |
| Schnorr Sign (precomputed) | 17.73 us | 16.67 us | 1.06x faster |
| Schnorr Sign (raw privkey) | 33.01 us | 31.99 us | 1.03x faster |
| CT ECDSA Sign | 70.50 us | 67.11 us | 1.05x faster |
| CT Schnorr Sign | 59.87 us | 59.10 us | 1.01x faster |
Rejected Android ARM64 experiments from the same campaign: forcing `SECP256K1_USE_4X64_POINT_OPS`,
changing `SECP256K1_GLV_WINDOW_WIDTH` to 4 or 6, and using default PGO as the shipped path.
Those variants did not beat the retained source-level SHA2 dispatch win on the connected RK3588 device.
### 3. RISC-V 64 (Cross-compile for Milk-V Mars / SiFive U74)
Requires:

View File

@ -11,14 +11,18 @@ Benchmark results for UltrafastSecp256k1 across all supported platforms.
| **x86-64 (i5-14400F, Clang 19)** | **12.8 ns** | **6.7 us** | **17.6 us** | **21.3 us** | **24.3 us** | **1.09x** |
| x86-64 (Clang 21, Win) | 17 ns (5x52) | 5 us | 25 us | -- | -- | -- |
| RISC-V 64 (SiFive U74, Clang 21) | 176 ns | 40.2 us | 150.5 us | **181.8 us** | -- | **1.13x** |
| ARM64 (RK3588, A76) | 74 ns | 14 us | 131 us | -- | -- | -- |
| ARM64 (RK3588, A76, Android NDK r27.2) | 68.3 ns | 15.27 us | 130.33 us | **150.13 us** | -- | -- |
| ESP32-S3 (LX7, 240 MHz) | 7,458 ns | 2,483 us | -- | -- | -- | -- |
| ESP32 (LX6, 240 MHz) | 6,993 ns | 6,203 us | -- | -- | -- | -- |
| STM32F103 (CM3, 72 MHz) | 15,331 ns | 37,982 us | -- | -- | -- | -- |
| CUDA (RTX 5060 Ti) | 0.2 ns | 217.7 ns | 225.8 ns | -- | **263.7 ns** | -- |
| OpenCL (RTX 5060 Ti) | 0.2 ns | 295.1 ns | -- | -- | -- | -- |
| CUDA (RTX 5060 Ti) | 0.2 ns | 129.5 ns | 225.8 ns | -- | **263.7 ns** | -- |
| OpenCL (RTX 5060 Ti) | 0.2 ns | 115.1 ns | 263.1 ns | -- | -- | -- |
| Metal (Apple M3 Pro) | 1.9 ns | 3.00 us | 2.94 us | -- | -- | -- |
GPU rows use the latest retained local rerun per backend. For OpenCL, the public
GPU C ABI still covers 4 of the 6 first-wave operations; the missing two are
batch ECDSA verify and batch Schnorr verify.
---
## Real-World Flow Coverage
@ -56,6 +60,46 @@ These values are mainly intended as workflow reference points. For publishable
cross-machine comparisons, use the full pinned benchmark methodology and JSON
artifacts from `bench_unified`.
### x86-64 Batch Verify Rerun (2026-03-17)
A retained low-risk x86 CPU improvement was keeping the Schnorr batch pubkey cache
capacity aligned with the full batch size in `cpu/src/batch_verify.cpp` instead of
clamping reserve capacity to 64 entries. This prevents unnecessary vector reallocations
when uncached batches grow beyond 64 signatures.
Quick reruns on the local i5-14400F validation machine showed the improvement on the
uncached Schnorr path while preserving correctness (`ctest -R 'comprehensive|multiscalar'` PASS):
| Operation | Before | After | Delta |
|-----------|--------|-------|-------|
| Schnorr batch verify N=128 | 20.27 us/sig | 19.94-20.06 us/sig | up to 1.6% faster |
| Schnorr batch verify N=192 | 18.56 us/sig | 18.01-18.45 us/sig | up to 3.0% faster |
This change does not materially affect the cached-path benchmark; the measured win is specifically
the uncached parse-and-resolve flow for larger Schnorr batches.
### Cross-Platform Refresh Status (2026-03-18)
Recent retained reruns and validation passes across the active optimization campaign:
| Platform | Latest validated result | Status |
|----------|-------------------------|--------|
| x86-64 / Linux | Schnorr batch verify `N=128`: 19.94-20.06 us/sig, `N=192`: 18.01-18.45 us/sig | Retained low-risk pubkey-cache reserve improvement |
| Android ARM64 / RK3588 | ECDSA Sign 22.22 us, Schnorr Sign (precomputed) 16.67 us, CT ECDSA Sign 67.11 us | Retained ARMv8 SHA2 dispatch win |
| OpenCL / RTX 5060 Ti | `kG (batch=65536)` 115.1 ns, `kP (batch=65536)` 263.1 ns, `kG (kernel)` 98.7 ns | Revalidated retained tuning; `opencl_test` and `opencl_audit_runner` passed |
| CUDA / RTX 5060 Ti | `k*G` 129.5 ns at TPB 256; TPB 512 reached 128.5 ns but CT rows became invalid in the same harness | No safe global retune retained yet |
| RISC-V / Milk-V Mars | Latest native rerun remains the 2026-03-07 Mars baseline below | Current local environment has toolchain but no runnable board/emulator path |
This page keeps the last trustworthy result per platform. When a rerun only proves that an
experiment is unstable or not worth shipping, it is recorded here but not promoted as a retained
default.
OpenCL's current 4/6 C ABI status refers specifically to the generic GPU host ABI in
`ufsecp_gpu.h`: `generator_mul_batch`, `ecdh_batch`, `hash160_pubkey_batch`, and
`msm` are implemented on the OpenCL backend, while `ecdsa_verify_batch` and
`schnorr_verify_batch` currently return `UFSECP_ERR_GPU_UNSUPPORTED` until the
extended verify kernels are promoted into the backend bridge.
---
## x86-64 Benchmarks
@ -229,6 +273,20 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
| Range Prove (64-bit) | 3,711,570 ns | 0.27 k/s | Bulletproof, CT path, batch 256 |
| Range Verify (64-bit) | 764,649 ns | 1.3 k/s | Full IPA verification, batch 256 |
### CUDA Launch-Width Triage (2026-03-18)
The latest local rerun on the RTX 5060 Ti used `gpu_bench_unified` to check whether a global block-size
retune should replace the current default. The answer was no: there is not yet a safe retained win.
| TPB | k*G (generator) | CT k*G | CT k*P | Verdict |
|-----|-----------------|--------|--------|---------|
| 256 | 129.5 ns | 98.7 ns | 162.8 ns | Stable reference rerun |
| 512 | 128.5 ns | invalid (`0.0 ns`) | invalid (`0.1 ns`) | Rejected; CT timing became unstable |
The `512`-thread launch showed only a marginal `k*G` gain, while the same harness produced invalid
constant-time timings. Until the CT timing methodology is tightened, no global CUDA TPB default change
is retained from this sweep.
**GPU vs CPU ZK Speedup (single-core throughput):**
| Operation | CPU (i5-14400F) | GPU (RTX 5060 Ti) | GPU/CPU Speedup |
@ -249,6 +307,20 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
**OpenCL:** 3.0 CUDA, Driver 580.126.09
**Build:** Clang 19, Release, -O3, PTX inline assembly
### OpenCL GPU C ABI Coverage (2026-03-18)
| C ABI operation | OpenCL status | Notes |
|-----------------|---------------|-------|
| `ufsecp_gpu_generator_mul_batch` | Implemented | Uses `batch_scalar_mul_generator` + `batch_jacobian_to_affine` |
| `ufsecp_gpu_ecdsa_verify_batch` | Missing | Returns `UFSECP_ERR_GPU_UNSUPPORTED` |
| `ufsecp_gpu_schnorr_verify_batch` | Missing | Returns `UFSECP_ERR_GPU_UNSUPPORTED` |
| `ufsecp_gpu_ecdh_batch` | Implemented | GPU scalar mul, CPU SHA-256 finalization |
| `ufsecp_gpu_hash160_pubkey_batch` | Implemented | Public-data batch hashing |
| `ufsecp_gpu_msm` | Implemented | GPU scalar mul + CPU-side affine reduction |
The missing OpenCL pieces are therefore the two batch verify paths. Core ECC,
ECDH, Hash160, and MSM are already wired through the backend-neutral C ABI.
### Kernel-Only Timing (no buffer alloc/copy overhead)
| Operation | Time/Op | Throughput | Notes |
@ -260,7 +332,8 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
| Field Inv | 14.3 ns | 69.97 M/s | batch 1M |
| Point Double | 0.9 ns | 1,139 M/s | batch 256K |
| Point Add | 1.6 ns | 630.6 M/s | batch 256K |
| kG (kernel) | 295.1 ns | 3.39 M/s | batch 256K |
| kG (kernel) | 98.7 ns | 10.13 M/s | batch 65K |
| kP (kernel) | 238.1 ns | 4.20 M/s | batch 65K |
### End-to-End Timing (including buffer transfers)
@ -271,8 +344,10 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
| Field Inv | 29.0 ns | 34.43 M/s | batch 1M |
| Point Double | 58.4 ns | 17.11 M/s | batch 1M |
| Point Add | 111.9 ns | 8.94 M/s | batch 1M |
| kG (batch=65K) | 307.7 ns | 3.25 M/s | |
| kG (batch=16K) | 311.6 ns | 3.21 M/s | |
| kG (batch=65536) | 115.1 ns | 8.69 M/s | retained 2026-03-17 revalidation |
| kP (batch=65536) | 263.1 ns | 3.80 M/s | retained 2026-03-17 revalidation |
| kP upload | 6.7 ns | 149.25 M/s | host-to-device transfer slice |
| kP readback | 12.4 ns | 80.65 M/s | device-to-host transfer slice |
### CUDA / OpenCL Configuration
@ -291,7 +366,7 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
| Field Inv | 10.2 ns | 14.3 ns | **CUDA 1.40x** |
| Point Double | 0.8 ns | 0.9 ns | CUDA 1.13x |
| Point Add | 1.6 ns | 1.6 ns | Tie |
| Scalar Mul (kG) | 217.7 ns | 295.1 ns | **CUDA 1.36x** |
| Scalar Mul (kG) | 129.5 ns | 98.7 ns | **OpenCL 1.31x** |
| ECDSA Sign | 204.8 ns | -- | CUDA only |
| ECDSA Verify | 410.1 ns | -- | CUDA only |
| Schnorr Sign | 273.4 ns | -- | CUDA only |
@ -301,6 +376,11 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
| DLEQ Prove | 675.4 ns | -- | CUDA only |
| DLEQ Verify | 1,912.0 ns | -- | CUDA only |
`kG` above uses the latest retained local reruns on the same RTX 5060 Ti host:
CUDA `gpu_bench_unified` at TPB 256 (`129.5 ns`) and OpenCL `opencl_benchmark`
kernel timing (`98.7 ns`). Verify and ZK operations remain CUDA-only because those
paths are not yet exposed on OpenCL.
---
## Apple Metal Benchmarks
@ -353,30 +433,49 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
**Hardware:** RK3588 (Cortex-A76 @ 2.256 GHz, pinned to big cores)
**OS:** Android
**Compiler:** NDK r26, Clang 17.0.2
**Compiler:** NDK r27.2.12479018, Clang 18.0.3
**Assembly:** ARM64 inline (MUL/UMULH)
**Field:** 10x26 (optimal for ARM64)
| Operation | Time | Notes |
|-----------|------|-------|
| Field Mul | 74 ns | ARM64 MUL/UMULH, 10x26 |
| Field Mul | 68.3 ns | ARM64 MUL/UMULH, 10x26 |
| Field Square | 50 ns | |
| Field Add | 8 ns | |
| Field Negate | 18 ns | |
| Field Inverse | 2 us | Fermat's theorem |
| Point Add | 992 ns | Jacobian coordinates |
| Point Double | 548 ns | |
| Generator Mul (kxG) | 14 us | Precomputed tables |
| Scalar Mul (kxP) | 131 us | GLV + wNAF |
| ECDSA Sign | 30 us | RFC 6979 |
| ECDSA Verify | 153 us | Shamir + GLV |
| Schnorr Sign (BIP-340) | 38 us | |
| Schnorr Verify (BIP-340) | 173 us | |
| Generator Mul (kxG) | 15.27 us | Precomputed tables |
| Scalar Mul (kxP) | 130.33 us | GLV + wNAF |
| ECDSA Sign | 22.22 us | ARMv8 SHA2 dispatch retained |
| ECDSA Verify | 150.13 us | Shamir + GLV |
| Schnorr Sign (BIP-340) | 16.67 us | Precomputed keypair path |
| Schnorr Verify (BIP-340) | 153.63 us | Raw pubkey path is similar |
| Batch Inverse (n=100) | 265 ns/elem | Montgomery's trick |
| Batch Inverse (n=1000) | 240 ns/elem | |
ARM64 10x26 representation with MUL/UMULH assembly provides optimal field arithmetic performance.
### Android ARM64 Optimization Rerun (2026-03-17)
This rerun used the connected RK3588 Android device and `android/test/bench_hornet_android.cpp`
as the benchmark truth source. The retained code change was enabling the existing ARMv8 SHA-256
instruction path in `hash_accel.cpp` for `sha256_33`, `sha256_32`, `hash160_33`, and
`sha256_compress_dispatch`.
| Operation | Baseline | Retained result | Delta |
|-----------|----------|-----------------|-------|
| ECDSA Sign | 25.89 us | 22.22 us | 14.2% faster |
| Schnorr Sign (precomputed) | 17.73 us | 16.67 us | 6.0% faster |
| Schnorr Sign (raw privkey) | 33.01 us | 31.99 us | 3.1% faster |
| CT ECDSA Sign | 70.50 us | 67.11 us | 4.8% faster |
| CT Schnorr Sign | 59.87 us | 59.10 us | 1.3% faster |
No meaningful win was found from forcing `SECP256K1_USE_4X64_POINT_OPS`, from changing
`SECP256K1_GLV_WINDOW_WIDTH` to 4 or 6, or from keeping PGO as the default Android path.
Those variants were measured and rejected.
---
## ESP32-S3 Benchmarks (Embedded)

View File

@ -10,7 +10,7 @@
|----------|-------------|
| [API Reference](API_REFERENCE.md) | Complete CPU + CUDA + WASM function reference |
| [Building](BUILDING.md) | Build instructions for all 10+ platforms |
| [Benchmarks](BENCHMARKS.md) | Performance data: CPU, GPU, embedded, mobile |
| [Benchmarks](BENCHMARKS.md) | Performance data: CPU, GPU, embedded, mobile, including the 2026-03 x86, Android, CUDA, and OpenCL refresh |
| [ESP32 Setup](ESP32_SETUP.md) | ESP32-S3/PICO-D4 flashing & testing guide |
| [RISC-V Optimizations](../RISCV_OPTIMIZATIONS.md) | RISC-V assembly & RVV details |
| [Porting Guide](../PORTING.md) | Add new platforms, architectures, GPU backends |

View File

@ -58,17 +58,24 @@ cd libs\UltrafastSecp256k1\android\
### Build (Manual CMake)
```bash
cmake -S android -B android/build-android-arm64 \
cmake -S android -B build-android-ndk-arm64 \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_HOME/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-24 \
-DANDROID_PLATFORM=android-28 \
-DANDROID_STL=c++_static \
-DCMAKE_BUILD_TYPE=Release \
-G Ninja
cmake --build android/build-android-arm64 -j
cmake --build build-android-ndk-arm64 --target bench_hornet -j
adb shell 'mkdir -p /data/local/tmp/ufsecp'
adb push build-android-ndk-arm64/bench_hornet /data/local/tmp/ufsecp/bench_hornet
adb shell 'chmod 755 /data/local/tmp/ufsecp/bench_hornet && /data/local/tmp/ufsecp/bench_hornet'
```
Use a clean Android-only build directory. Reusing a build directory first configured from the
repository root can trigger a CMake source/cache mismatch when switching to `android/` as the source tree.
### Output
```
@ -217,6 +224,21 @@ NDK Clang additionally uses:
\* CT mode uses generic C++ (for constant-time guarantees)
### Android ARM64 rerun: retained on-device SHA2 dispatch
Measured on the connected RK3588 Android device with `bench_hornet` after wiring the ARMv8 SHA2
path into `hash_accel.cpp` hot wrappers:
| Operation | Baseline | Retained result | Delta |
|-----------|----------|-----------------|-------|
| ECDSA sign | 25.89 us | 22.22 us | 14.2% faster |
| Schnorr sign (precomputed) | 17.73 us | 16.67 us | 6.0% faster |
| Schnorr sign (raw privkey) | 33.01 us | 31.99 us | 3.1% faster |
| CT ECDSA sign | 70.50 us | 67.11 us | 4.8% faster |
The same rerun rejected forced 4x64 point ops, GLV window retuning, and keeping Android PGO as the
default path because they did not outperform the retained SHA2 dispatch result on this device.
### ARMv7 (32-bit) Limitations
- No `__int128` -> `SECP256K1_NO_INT128` fallback (portable 64x64->128)

649
fix_alerts.py Normal file
View File

@ -0,0 +1,649 @@
#!/usr/bin/env python3
"""Apply all readability-braces and misc-const-correctness fixes to ufsecp_impl.cpp"""
import sys
PATH = "include/ufsecp/ufsecp_impl.cpp"
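# Each entry is an (old_string, new_string) pair. old_string must match the current
# contents of PATH exactly; its first occurrence is replaced with new_string.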
REPLACEMENTS = [
# L1380: if (!ok)
(
" if (!ok)\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid mnemonic\");",
" if (!ok) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid mnemonic\");\n"
" }",
),
# L1382: if (*entropy_len < ent.length)
(
" if (*entropy_len < ent.length)\n"
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"entropy buffer too small\");",
" if (*entropy_len < ent.length) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"entropy buffer too small\");\n"
" }",
),
# L1404: if (!FE::parse_bytes_strict(e, pk_fe)) in ufsecp_schnorr_batch_verify
(
" for (size_t i = 0; i < n; ++i) {\n"
" const uint8_t* e = entries + i * 128;\n"
" // Strict: reject x-only pubkey >= p at ABI gate\n"
" FE pk_fe;\n"
" if (!FE::parse_bytes_strict(e, pk_fe))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
" std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
" std::memcpy(batch[i].message.data(), e + 32, 32);\n"
" if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");",
" for (size_t i = 0; i < n; ++i) {\n"
" const uint8_t* e = entries + i * 128;\n"
" // Strict: reject x-only pubkey >= p at ABI gate\n"
" FE pk_fe;\n"
" if (!FE::parse_bytes_strict(e, pk_fe)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
" }\n"
" std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
" std::memcpy(batch[i].message.data(), e + 32, 32);\n"
" if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");\n"
" }",
),
# L1448: if (!FE::parse_bytes_strict(e, pk_fe)) in ufsecp_schnorr_batch_identify_invalid
(
" for (size_t i = 0; i < n; ++i) {\n"
" const uint8_t* e = entries + i * 128;\n"
" FE pk_fe;\n"
" if (!FE::parse_bytes_strict(e, pk_fe))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
" std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
" std::memcpy(batch[i].message.data(), e + 32, 32);\n"
" if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");",
" for (size_t i = 0; i < n; ++i) {\n"
" const uint8_t* e = entries + i * 128;\n"
" FE pk_fe;\n"
" if (!FE::parse_bytes_strict(e, pk_fe)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
" }\n"
" std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
" std::memcpy(batch[i].message.data(), e + 32, 32);\n"
" if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");\n"
" }",
),
# L1661: widening + braces — in ufsecp_musig2_start_sign_session
(
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
" Scalar s;\n"
" if (!scalar_parse_strict(keyagg + 38 + i * 32, s))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient in keyagg\");\n"
" kagg.key_coefficients.push_back(s);\n"
" }",
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
" Scalar s;\n"
" if (!scalar_parse_strict(keyagg + 38 + static_cast<size_t>(i) * 32, s)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient in keyagg\");\n"
" }\n"
" kagg.key_coefficients.push_back(s);\n"
" }",
),
# L1707: widening + braces — inside { } block in ufsecp_musig2_partial_sign
(
" auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
" Scalar s; if (!scalar_parse_strict(keyagg + 38 + i * 32, s))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
" kagg.key_coefficients.push_back(s); } }\n"
" secp256k1::MuSig2Session sess;",
" auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
" Scalar s;\n"
" if (!scalar_parse_strict(keyagg + 38 + static_cast<size_t>(i) * 32, s)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
" }\n"
" kagg.key_coefficients.push_back(s);\n"
" }\n"
" }\n"
" secp256k1::MuSig2Session sess;",
),
# L1715: if (!scalar_parse_strict(session + 33, sess.b)) — in ufsecp_musig2_partial_sign
(
" if (!scalar_parse_strict(session + 33, sess.b))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
" if (!scalar_parse_strict(session + 65, sess.e))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
" sess.R_negated = (session[97] != 0);\n"
" auto psig = secp256k1::musig2_partial_sign",
" if (!scalar_parse_strict(session + 33, sess.b)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
" }\n"
" if (!scalar_parse_strict(session + 65, sess.e)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
" }\n"
" sess.R_negated = (session[97] != 0);\n"
" auto psig = secp256k1::musig2_partial_sign",
),
# L1756: widening + braces — inside { } block in ufsecp_musig2_partial_verify
(
" auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
" Scalar s; if (!scalar_parse_strict(keyagg + 38 + i * 32, s))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
" kagg.key_coefficients.push_back(s); } }\n"
" secp256k1::MuSig2Session sess;\n"
" sess.R = point_from_compressed(session);\n"
" if (sess.R.is_infinity()) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session R point\");\n"
" }\n"
" if (!scalar_parse_strict(session + 33, sess.b))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
" if (!scalar_parse_strict(session + 65, sess.e))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
" sess.R_negated = (session[97] != 0);\n"
" if (!secp256k1::musig2_partial_verify",
" auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
" Scalar s;\n"
" if (!scalar_parse_strict(keyagg + 38 + static_cast<size_t>(i) * 32, s)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
" }\n"
" kagg.key_coefficients.push_back(s);\n"
" }\n"
" }\n"
" secp256k1::MuSig2Session sess;\n"
" sess.R = point_from_compressed(session);\n"
" if (sess.R.is_infinity()) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session R point\");\n"
" }\n"
" if (!scalar_parse_strict(session + 33, sess.b)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
" }\n"
" if (!scalar_parse_strict(session + 65, sess.e)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
" }\n"
" sess.R_negated = (session[97] != 0);\n"
" if (!secp256k1::musig2_partial_verify",
),
# L1791+L1793: in ufsecp_musig2_partial_sig_agg
(
" if (!scalar_parse_strict(session + 33, sess.b))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
" if (!scalar_parse_strict(session + 65, sess.e))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
" sess.R_negated = (session[97] != 0);\n"
" auto final_sig",
" if (!scalar_parse_strict(session + 33, sess.b)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
" }\n"
" if (!scalar_parse_strict(session + 65, sess.e)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
" }\n"
" sess.R_negated = (session[97] != 0);\n"
" auto final_sig",
),
# L1822+L1823: const for coeff_count and needed_commits
(
" size_t coeff_count = commit.coeffs.size();\n"
" size_t needed_commits = 8 + coeff_count * 33;",
" const size_t coeff_count = commit.coeffs.size();\n"
" const size_t needed_commits = 8 + coeff_count * 33;",
),
# L1845: for (auto& s : shares) — erase in ufsecp_frost_keygen_begin
(
" // Erase secret shares from memory\n"
" for (auto& s : shares)\n"
" secp256k1::detail::secure_erase(&s.value, sizeof(s.value));\n"
" return UFSECP_OK;\n"
"}\n"
"\n"
"ufsecp_error_t ufsecp_frost_keygen_finalize(",
" // Erase secret shares from memory\n"
" for (auto& s : shares) {\n"
" secp256k1::detail::secure_erase(&s.value, sizeof(s.value));\n"
" }\n"
" return UFSECP_OK;\n"
"}\n"
"\n"
"ufsecp_error_t ufsecp_frost_keygen_finalize(",
),
# L1864: uint32_t cc; — init-variables
(
" secp256k1::FrostCommitment fc;\n"
" uint32_t cc;\n"
" if (pos + 8 > commits_len)\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit header\");\n"
" std::memcpy(&cc, all_commits + pos, 4); pos += 4;\n"
" std::memcpy(&fc.from, all_commits + pos, 4); pos += 4;\n"
" if (pos + static_cast<size_t>(cc) * 33 > commits_len)\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit coefficients\");\n"
" for (uint32_t j = 0; j < cc; ++j) {\n"
" auto pt = point_from_compressed(all_commits + pos);",
" secp256k1::FrostCommitment fc;\n"
" uint32_t cc = 0;\n"
" if (pos + 8 > commits_len) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit header\");\n"
" }\n"
" std::memcpy(&cc, all_commits + pos, 4); pos += 4;\n"
" std::memcpy(&fc.from, all_commits + pos, 4); pos += 4;\n"
" if (pos + static_cast<size_t>(cc) * 33 > commits_len) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit coefficients\");\n"
" }\n"
" for (uint32_t j = 0; j < cc; ++j) {\n"
" auto pt = point_from_compressed(all_commits + pos);",
),
# L1889: if (!scalar_parse_strict(s + 4, v)) in ufsecp_frost_keygen_finalize
(
" if (!scalar_parse_strict(s + 4, v))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid share scalar\");",
" if (!scalar_parse_strict(s + 4, v)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid share scalar\");\n"
" }",
),
# L1895+L1898: if (!ok) + for (auto& s : shares) — erase in ufsecp_frost_keygen_finalize
(
" if (!ok)\n"
" return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"FROST keygen finalize failed\");\n"
" // Erase secret shares\n"
" for (auto& s : shares)\n"
" secp256k1::detail::secure_erase(&s.value, sizeof(s.value));",
" if (!ok) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"FROST keygen finalize failed\");\n"
" }\n"
" // Erase secret shares\n"
" for (auto& s : shares) {\n"
" secp256k1::detail::secure_erase(&s.value, sizeof(s.value));\n"
" }",
),
# L1955: if (!scalar_parse_strict(keypkg + 12, kp.signing_share))
(
" if (!scalar_parse_strict(keypkg + 12, kp.signing_share))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"invalid signing share in keypkg\");",
" if (!scalar_parse_strict(keypkg + 12, kp.signing_share)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"invalid signing share in keypkg\");\n"
" }",
),
# L1967+L1969: if (!scalar_parse_strict(nonce, h)) + if (!scalar_parse_strict(nonce + 32, b))
(
" if (!scalar_parse_strict(nonce, h))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid hiding nonce\");\n"
" if (!scalar_parse_strict(nonce + 32, b))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid binding nonce\");",
" if (!scalar_parse_strict(nonce, h)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid hiding nonce\");\n"
" }\n"
" if (!scalar_parse_strict(nonce + 32, b)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid binding nonce\");\n"
" }",
),
# L2006+L2012: multi-line if null check + scalar parse in ufsecp_frost_verify_partial
(
" if (!ctx || !partial_sig || !verification_share33 || !nonce_commits || !msg32 || !group_pubkey33)\n"
" return UFSECP_ERR_NULL_ARG;\n"
" ctx_clear_err(ctx);\n"
" secp256k1::FrostPartialSig psig;\n"
" std::memcpy(&psig.id, partial_sig, 4);\n"
" Scalar z;\n"
" if (!scalar_parse_strict(partial_sig + 4, z))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");",
" if (!ctx || !partial_sig || !verification_share33 || !nonce_commits || !msg32 || !group_pubkey33) {\n"
" return UFSECP_ERR_NULL_ARG;\n"
" }\n"
" ctx_clear_err(ctx);\n"
" secp256k1::FrostPartialSig psig;\n"
" std::memcpy(&psig.id, partial_sig, 4);\n"
" Scalar z;\n"
" if (!scalar_parse_strict(partial_sig + 4, z)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");\n"
" }",
),
# L2057+L2065: multi-line if null check + scalar parse in ufsecp_frost_aggregate
(
" if (!ctx || !partial_sigs || !nonce_commits || !group_pubkey33 || !msg32 || !sig64_out)\n"
" return UFSECP_ERR_NULL_ARG;\n"
" ctx_clear_err(ctx);\n"
" std::vector<secp256k1::FrostPartialSig> psigs(n);\n"
" for (size_t i = 0; i < n; ++i) {\n"
" const uint8_t* ps = partial_sigs + i * 36;\n"
" std::memcpy(&psigs[i].id, ps, 4);\n"
" Scalar z;\n"
" if (!scalar_parse_strict(ps + 4, z))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");",
" if (!ctx || !partial_sigs || !nonce_commits || !group_pubkey33 || !msg32 || !sig64_out) {\n"
" return UFSECP_ERR_NULL_ARG;\n"
" }\n"
" ctx_clear_err(ctx);\n"
" std::vector<secp256k1::FrostPartialSig> psigs(n);\n"
" for (size_t i = 0; i < n; ++i) {\n"
" const uint8_t* ps = partial_sigs + i * 36;\n"
" std::memcpy(&psigs[i].id, ps, 4);\n"
" Scalar z;\n"
" if (!scalar_parse_strict(ps + 4, z)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");\n"
" }",
),
# L2144+L2150: in ufsecp_schnorr_adaptor_verify
(
" if (!scalar_parse_strict(pre_sig + 33, shat))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
" as.s_hat = shat;\n"
" as.needs_negation = (pre_sig[65] != 0);\n"
" // Strict: reject x-only pubkey >= p at ABI gate\n"
" FE pk_fe;\n"
" if (!FE::parse_bytes_strict(pubkey_x, pk_fe))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p)\");",
" if (!scalar_parse_strict(pre_sig + 33, shat)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
" }\n"
" as.s_hat = shat;\n"
" as.needs_negation = (pre_sig[65] != 0);\n"
" // Strict: reject x-only pubkey >= p at ABI gate\n"
" FE pk_fe;\n"
" if (!FE::parse_bytes_strict(pubkey_x, pk_fe)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p)\");\n"
" }",
),
# L2176: in ufsecp_schnorr_adaptor_adapt
(
" if (!scalar_parse_strict(pre_sig + 33, shat))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
" as.s_hat = shat;\n"
" as.needs_negation = (pre_sig[65] != 0);\n"
" Scalar secret;\n"
" if (!scalar_parse_strict_nonzero(adaptor_secret, secret))",
" if (!scalar_parse_strict(pre_sig + 33, shat)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
" }\n"
" as.s_hat = shat;\n"
" as.needs_negation = (pre_sig[65] != 0);\n"
" Scalar secret;\n"
" if (!scalar_parse_strict_nonzero(adaptor_secret, secret))",
),
# L2203: in ufsecp_schnorr_adaptor_extract
(
" if (!scalar_parse_strict(pre_sig + 33, shat))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
" as.s_hat = shat;\n"
" as.needs_negation = (pre_sig[65] != 0);\n"
" secp256k1::SchnorrSignature sig;",
" if (!scalar_parse_strict(pre_sig + 33, shat)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
" }\n"
" as.s_hat = shat;\n"
" as.needs_negation = (pre_sig[65] != 0);\n"
" secp256k1::SchnorrSignature sig;",
),
# L2810+L2815+L2817: in ufsecp_silent_payment_address_create
(
" if (!ctx || !scan_privkey || !spend_privkey || !scan_pubkey33_out ||\n"
" !spend_pubkey33_out || !addr_out || !addr_len)\n"
" return UFSECP_ERR_NULL_ARG;\n"
" ctx_clear_err(ctx);\n"
"\n"
" Scalar scan_sk, spend_sk;\n"
" if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
" if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");",
" if (!ctx || !scan_privkey || !spend_privkey || !scan_pubkey33_out ||\n"
" !spend_pubkey33_out || !addr_out || !addr_len) {\n"
" return UFSECP_ERR_NULL_ARG;\n"
" }\n"
" ctx_clear_err(ctx);\n"
"\n"
" Scalar scan_sk, spend_sk;\n"
" if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
" }\n"
" if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");\n"
" }",
),
# L2827: if (addr_str.size() >= *addr_len)
(
" if (addr_str.size() >= *addr_len)\n"
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"address buffer too small\");",
" if (addr_str.size() >= *addr_len) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"address buffer too small\");\n"
" }",
),
# L2846+L2855+L2864+L2868: in ufsecp_silent_payment_create_output
(
" if (!ctx || !input_privkeys || n_inputs == 0 || !scan_pubkey33 ||\n"
" !spend_pubkey33 || !output_pubkey33_out)\n"
" return UFSECP_ERR_NULL_ARG;\n"
" ctx_clear_err(ctx);\n"
"\n"
" // Parse input private keys\n"
" std::vector<Scalar> privkeys;\n"
" privkeys.reserve(n_inputs);\n"
" for (size_t i = 0; i < n_inputs; ++i) {\n"
" Scalar sk;\n"
" if (!scalar_parse_strict_nonzero(input_privkeys + i * 32, sk))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"input privkey is zero or >= n\");\n"
" privkeys.push_back(sk);\n"
" }\n"
"\n"
" // Parse recipient address\n"
" secp256k1::SilentPaymentAddress recipient;\n"
" recipient.scan_pubkey = point_from_compressed(scan_pubkey33);\n"
" recipient.spend_pubkey = point_from_compressed(spend_pubkey33);\n"
" if (recipient.scan_pubkey.is_infinity() || recipient.spend_pubkey.is_infinity())\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
"\n"
" auto [output_point, tweak] = secp256k1::silent_payment_create_output(privkeys, recipient, k);\n"
" if (output_point.is_infinity())\n"
" return ctx_set_err(ctx, UFSECP_ERR_ARITH, \"output point is infinity\");",
" if (!ctx || !input_privkeys || n_inputs == 0 || !scan_pubkey33 ||\n"
" !spend_pubkey33 || !output_pubkey33_out) {\n"
" return UFSECP_ERR_NULL_ARG;\n"
" }\n"
" ctx_clear_err(ctx);\n"
"\n"
" // Parse input private keys\n"
" std::vector<Scalar> privkeys;\n"
" privkeys.reserve(n_inputs);\n"
" for (size_t i = 0; i < n_inputs; ++i) {\n"
" Scalar sk;\n"
" if (!scalar_parse_strict_nonzero(input_privkeys + i * 32, sk)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"input privkey is zero or >= n\");\n"
" }\n"
" privkeys.push_back(sk);\n"
" }\n"
"\n"
" // Parse recipient address\n"
" secp256k1::SilentPaymentAddress recipient;\n"
" recipient.scan_pubkey = point_from_compressed(scan_pubkey33);\n"
" recipient.spend_pubkey = point_from_compressed(spend_pubkey33);\n"
" if (recipient.scan_pubkey.is_infinity() || recipient.spend_pubkey.is_infinity()) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
" }\n"
"\n"
" auto [output_point, tweak] = secp256k1::silent_payment_create_output(privkeys, recipient, k);\n"
" if (output_point.is_infinity()) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_ARITH, \"output point is infinity\");\n"
" }",
),
# L2879: for (auto& sk : privkeys) — erase in ufsecp_silent_payment_create_output
(
" for (auto& sk : privkeys)\n"
" secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
" return UFSECP_OK;\n"
"}\n"
"\n"
"ufsecp_error_t ufsecp_silent_payment_scan(",
" for (auto& sk : privkeys) {\n"
" secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
" }\n"
" return UFSECP_OK;\n"
"}\n"
"\n"
"ufsecp_error_t ufsecp_silent_payment_scan(",
),
# L2894+L2896+L2901+L2903+L2911: in ufsecp_silent_payment_scan
(
" if (!ctx || !scan_privkey || !spend_privkey || !input_pubkeys33 ||\n"
" !output_xonly32 || !n_found)\n"
" return UFSECP_ERR_NULL_ARG;\n"
" if (n_input_pubkeys == 0 || n_outputs == 0)\n"
" return UFSECP_ERR_BAD_INPUT;\n"
" ctx_clear_err(ctx);\n"
"\n"
" Scalar scan_sk, spend_sk;\n"
" if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
" if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");\n"
"\n"
" // Parse input pubkeys\n"
" std::vector<Point> input_pks;\n"
" input_pks.reserve(n_input_pubkeys);\n"
" for (size_t i = 0; i < n_input_pubkeys; ++i) {\n"
" auto pk = point_from_compressed(input_pubkeys33 + i * 33);\n"
" if (pk.is_infinity())\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid input pubkey\");",
" if (!ctx || !scan_privkey || !spend_privkey || !input_pubkeys33 ||\n"
" !output_xonly32 || !n_found) {\n"
" return UFSECP_ERR_NULL_ARG;\n"
" }\n"
" if (n_input_pubkeys == 0 || n_outputs == 0) {\n"
" return UFSECP_ERR_BAD_INPUT;\n"
" }\n"
" ctx_clear_err(ctx);\n"
"\n"
" Scalar scan_sk, spend_sk;\n"
" if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
" }\n"
" if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");\n"
" }\n"
"\n"
" // Parse input pubkeys\n"
" std::vector<Point> input_pks;\n"
" input_pks.reserve(n_input_pubkeys);\n"
" for (size_t i = 0; i < n_input_pubkeys; ++i) {\n"
" auto pk = point_from_compressed(input_pubkeys33 + i * 33);\n"
" if (pk.is_infinity()) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid input pubkey\");\n"
" }",
),
# L2953+L2964+L2968+L2972: in ufsecp_ecies_encrypt
(
" if (!ctx || !recipient_pubkey33 || !plaintext || !envelope_out || !envelope_len)\n"
" return UFSECP_ERR_NULL_ARG;\n"
" if (plaintext_len == 0) {\n"
" return UFSECP_ERR_BAD_INPUT;\n"
" }\n"
" ctx_clear_err(ctx);\n"
"\n"
" if (plaintext_len > SIZE_MAX - UFSECP_ECIES_OVERHEAD) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"plaintext_len too large\");\n"
" }\n"
" size_t const needed = plaintext_len + UFSECP_ECIES_OVERHEAD;\n"
" if (*envelope_len < needed)\n"
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"envelope buffer too small\");\n"
"\n"
" auto pk = point_from_compressed(recipient_pubkey33);\n"
" if (pk.is_infinity())\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
"\n"
" auto envelope = secp256k1::ecies_encrypt(pk, plaintext, plaintext_len);\n"
" if (envelope.empty())\n"
" return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"ECIES encryption failed\");",
" if (!ctx || !recipient_pubkey33 || !plaintext || !envelope_out || !envelope_len) {\n"
" return UFSECP_ERR_NULL_ARG;\n"
" }\n"
" if (plaintext_len == 0) {\n"
" return UFSECP_ERR_BAD_INPUT;\n"
" }\n"
" ctx_clear_err(ctx);\n"
"\n"
" if (plaintext_len > SIZE_MAX - UFSECP_ECIES_OVERHEAD) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"plaintext_len too large\");\n"
" }\n"
" size_t const needed = plaintext_len + UFSECP_ECIES_OVERHEAD;\n"
" if (*envelope_len < needed) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"envelope buffer too small\");\n"
" }\n"
"\n"
" auto pk = point_from_compressed(recipient_pubkey33);\n"
" if (pk.is_infinity()) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
" }\n"
"\n"
" auto envelope = secp256k1::ecies_encrypt(pk, plaintext, plaintext_len);\n"
" if (envelope.empty()) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"ECIES encryption failed\");\n"
" }",
),
# L2985+L2987+L2992+L2996+L3002: in ufsecp_ecies_decrypt
(
" if (!ctx || !privkey || !envelope || !plaintext_out || !plaintext_len)\n"
" return UFSECP_ERR_NULL_ARG;\n"
" if (envelope_len < 82) // min: 33 + 16 + 1 + 32\n"
" return UFSECP_ERR_BAD_INPUT;\n"
" ctx_clear_err(ctx);\n"
"\n"
" size_t const expected_pt_len = envelope_len - UFSECP_ECIES_OVERHEAD;\n"
" if (*plaintext_len < expected_pt_len)\n"
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"plaintext buffer too small\");\n"
"\n"
" Scalar sk;\n"
" if (!scalar_parse_strict_nonzero(privkey, sk))\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"privkey is zero or >= n\");\n"
"\n"
" auto pt = secp256k1::ecies_decrypt(sk, envelope, envelope_len);\n"
" secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
"\n"
" if (pt.empty())\n"
" return ctx_set_err(ctx, UFSECP_ERR_VERIFY_FAIL, \"ECIES decryption failed (bad key or tampered)\");",
" if (!ctx || !privkey || !envelope || !plaintext_out || !plaintext_len) {\n"
" return UFSECP_ERR_NULL_ARG;\n"
" }\n"
" if (envelope_len < 82) { // min: 33 + 16 + 1 + 32\n"
" return UFSECP_ERR_BAD_INPUT;\n"
" }\n"
" ctx_clear_err(ctx);\n"
"\n"
" size_t const expected_pt_len = envelope_len - UFSECP_ECIES_OVERHEAD;\n"
" if (*plaintext_len < expected_pt_len) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"plaintext buffer too small\");\n"
" }\n"
"\n"
" Scalar sk;\n"
" if (!scalar_parse_strict_nonzero(privkey, sk)) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"privkey is zero or >= n\");\n"
" }\n"
"\n"
" auto pt = secp256k1::ecies_decrypt(sk, envelope, envelope_len);\n"
" secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
"\n"
" if (pt.empty()) {\n"
" return ctx_set_err(ctx, UFSECP_ERR_VERIFY_FAIL, \"ECIES decryption failed (bad key or tampered)\");\n"
" }",
),
]
def main():
    """Apply every (old, new) pair from REPLACEMENTS to the file at PATH.

    Pairs are applied in list order, each replacing only the first
    occurrence of its search text.  If any search text is missing the
    script exits with status 1 before anything is written back, so the
    target file is either fully patched or left untouched.
    """
    with open(PATH, "r") as src:
        text = src.read()

    for number, (needle, substitute) in enumerate(REPLACEMENTS, start=1):
        hits = text.count(needle)
        if not hits:
            print(f"[FAIL] Replacement {number}: NOT FOUND")
            print(f" Looking for: {repr(needle[:80])}")
            sys.exit(1)
        if hits > 1:
            # Ambiguous match: only the first occurrence is rewritten.
            print(f"[WARN] Replacement {number}: found {hits} occurrences, replacing first")
        text = text.replace(needle, substitute, 1)
        print(f"[OK] Replacement {number} applied")

    with open(PATH, "w") as dst:
        dst.write(text)
    print(f"\nAll {len(REPLACEMENTS)} replacements applied to {PATH}")


if __name__ == "__main__":
    main()

893
fix_round4.py Normal file
View File

@ -0,0 +1,893 @@
#!/usr/bin/env python3
"""Fix all 211 code-scanning alerts across 13 files."""
import re
from pathlib import Path
# Root directory that every relative path handed to read()/save() is joined to.
# NOTE(review): this is a hard-coded absolute path into one user's home
# directory -- it must be edited before the script can run anywhere else.
BASE = Path('/home/shrek/Secp256K1/Secp256K1fast/libs/UltrafastSecp256k1')
def read(path):
    """Load ``BASE / path`` and return its contents as a list of lines.

    Line terminators are kept on each element, so ``''.join(result)``
    reproduces the text that was read.
    """
    source_file = BASE / path
    text = source_file.read_text()
    return text.splitlines(keepends=True)
def save(path, lines):
    """Write ``lines`` (strings that already carry their terminators) to ``BASE / path``."""
    target_file = BASE / path
    joined = ''.join(lines)
    target_file.write_text(joined)
# ============================================================================
# Algorithmic helpers
# ============================================================================
def _scan_cpp_line(text):
    """Lexically scan one C/C++ source line.

    Returns ``(comment_pos, paren_delta, brace_delta)`` where ``comment_pos``
    is the index at which a ``//`` comment starts (``None`` if there is none)
    and the deltas are the net number of ``(``/``{`` opened on the line.
    Characters inside string and character literals are ignored.  Block
    comments and raw string literals are NOT recognised.
    """
    quote = None
    parens = 0
    braces = 0
    pos = 0
    while pos < len(text):
        ch = text[pos]
        if quote is not None:
            if ch == '\\':
                pos += 2  # skip the escaped character
                continue
            if ch == quote:
                quote = None
        elif ch == '"' or ch == "'":
            quote = ch
        elif text.startswith('//', pos):
            return pos, parens, braces
        elif ch == '(':
            parens += 1
        elif ch == ')':
            parens -= 1
        elif ch == '{':
            braces += 1
        elif ch == '}':
            braces -= 1
        pos += 1
    return None, parens, braces


def add_braces(lines, alert_lines_1based, tag=''):
    """Add { } around single-statement bodies. Process bottom-to-top.

    Args:
        lines: list of source lines (each with its terminator); modified in
            place and also returned.
        alert_lines_1based: 1-based line numbers of the controlling
            statements (``if``/``for``/``while``/``else`` ...) to fix.
        tag: prefix used in the progress messages.

    For every alert line an opening brace is appended to the controlling
    statement and a closing brace, indented like that statement, is inserted
    after its body.  Compared with a naive "brace after the next line"
    approach this handles three extra situations:

    * a trailing ``//`` comment on the controlling line -- the brace is put
      before the comment instead of inside it;
    * a body statement that continues over several lines (unbalanced
      parentheses) -- the closing brace goes after the line that balances it;
    * a body that is itself a braced statement (e.g. a nested ``if (...) {``
      that was fixed in an earlier iteration) -- the closing brace goes after
      the matching ``}``.

    Limitations: the controlling statement must fit on the alert line, and a
    nested un-braced control statement as body is still treated as a
    one-line body.

    Returns:
        The same ``lines`` list.
    """
    fixed = 0
    # Bottom-to-top so that inserted lines never shift a pending alert line.
    for lnum in sorted(alert_lines_1based, reverse=True):
        idx = lnum - 1
        if idx < 0 or idx >= len(lines):
            print(f' SKIP {tag}L{lnum}: out of range ({len(lines)} lines)')
            continue
        line = lines[idx]
        # Indentation of the controlling statement, reused for the closing brace.
        indent_str = line[:len(line) - len(line.lstrip())]
        text = line.rstrip('\n\r')
        eol = line[len(text):] or '\n'

        # Separate the code from an optional trailing // comment.
        comment_pos, _, _ = _scan_cpp_line(text)
        if comment_pos is None:
            code, comment = text.rstrip(), ''
        else:
            code = text[:comment_pos].rstrip()
            comment = text[comment_pos:].rstrip()

        if code.endswith('{'):
            print(f' SKIP {tag}L{lnum}: already has {{')
            continue
        if not code.strip():
            print(f' SKIP {tag}L{lnum}: no statement on line')
            continue

        # Find next non-empty line (the body).
        body_idx = idx + 1
        while body_idx < len(lines) and lines[body_idx].strip() == '':
            body_idx += 1
        if body_idx >= len(lines):
            print(f' SKIP {tag}L{lnum}: no body line found')
            continue
        if lines[body_idx].lstrip().startswith('{'):
            print(f' SKIP {tag}L{lnum}: body already has {{')
            continue

        # Extend the body until its parentheses and braces are balanced.
        end_idx = body_idx
        _, parens, braces = _scan_cpp_line(lines[end_idx])
        while (parens > 0 or braces > 0) and end_idx + 1 < len(lines):
            end_idx += 1
            _, d_parens, d_braces = _scan_cpp_line(lines[end_idx])
            parens += d_parens
            braces += d_braces
        if parens > 0 or braces > 0:
            # Never balanced before EOF: fall back to a one-line body.
            end_idx = body_idx

        opened = code + ' {'
        if comment:
            opened += ' ' + comment
        lines[idx] = opened + eol
        lines.insert(end_idx + 1, indent_str + '}' + eol)
        fixed += 1
    print(f' -> {tag}braces fixed: {fixed}')
    return lines
def add_const_to_lines(lines, alert_lines_1based, tag=''):
    """Prepend const to variable declarations, handling range-for loops.

    Args:
        lines: list of source lines (each with its terminator); modified in
            place and also returned.
        alert_lines_1based: 1-based line numbers of the declarations to fix.
            Out-of-range numbers are ignored.
        tag: prefix used in the progress messages.

    A plain declaration gets ``const `` inserted after its indentation.  On
    a range-based ``for`` the qualifier is inserted directly after the
    opening parenthesis, so ``for (std::uint8_t b : v)`` becomes
    ``for (const std::uint8_t b : v)``.  A classic three-clause ``for`` is
    skipped, because no valid single-token insertion exists for it.  The
    line terminator is always preserved.

    Returns:
        The same ``lines`` list.
    """
    fixed = 0
    for lnum in sorted(alert_lines_1based, reverse=True):
        idx = lnum - 1
        if idx < 0 or idx >= len(lines):
            continue
        line = lines[idx]
        stripped = line.lstrip()
        leading = line[:len(line) - len(stripped)]
        if stripped.startswith('const '):
            print(f' SKIP {tag}L{lnum}: already const')
            continue

        for_head = re.match(r'for\s*\(\s*', stripped)
        if for_head:
            decl = stripped[for_head.end():]
            # A range-for has a lone ':' (not part of '::') before any ';'.
            if not re.match(r'[^;]*?(?<!:):(?!:)', decl):
                print(f' SKIP {tag}L{lnum}: not a range-based for')
                continue
            if decl.startswith('const '):
                print(f' SKIP {tag}L{lnum}: already const')
                continue
            # Slice-based splice keeps the rest of the line, newline included.
            lines[idx] = leading + stripped[:for_head.end()] + 'const ' + decl
            fixed += 1
            continue

        # Regular declaration
        lines[idx] = leading + 'const ' + stripped
        fixed += 1
    print(f' -> {tag}const fixed: {fixed}')
    return lines
# ============================================================================
# File: include/ufsecp/ufsecp_impl.cpp
# ============================================================================
def fix_ufsecp_impl():
    """Apply the code-scanning fixes for include/ufsecp/ufsecp_impl.cpp.

    All line numbers below refer to the file as it was scanned.  Edits that
    rewrite a line in place (const, auto, initialisers, widening cast) are
    therefore applied first; brace insertion, which adds one line per fix
    and would shift every later line number, is applied last.
    """
    path = 'include/ufsecp/ufsecp_impl.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    # --- misc-const-correctness ---
    const_lines = [1366, 1855, 1905, 2075, 3147, 3167, 3172]
    lines = add_const_to_lines(lines, const_lines, 'ufsecp_impl/')

    # --- modernize-use-auto ---
    # L1573: uint32_t nk = static_cast<uint32_t>(...) -> auto nk = ...
    # L1846: uint32_t cc32 = static_cast<uint32_t>(...) -> auto cc32 = ...
    for lnum in (1573, 1846):
        idx = lnum - 1
        if idx >= len(lines):
            continue
        # subn only touches the matched prefix, so the rest of the line
        # (including its newline) is kept intact.
        new_line, hits = re.subn(
            r'^(\s*)uint32_t (\w+ = static_cast<uint32_t>\(.)',
            r'\1auto \2',
            lines[idx],
            count=1,
        )
        if hits:
            lines[idx] = new_line
            print(f' AUTO: L{lnum}')

    # --- cppcoreguidelines-init-variables ---
    # uint32_t nk; -> uint32_t nk = 0;  (L1706/L1761 sit inside "{ ... }")
    for lnum, marker in ((1655, ' uint32_t nk;'),
                         (1706, 'uint32_t nk;'),
                         (1761, 'uint32_t nk;')):
        idx = lnum - 1
        if idx < len(lines) and marker in lines[idx]:
            lines[idx] = lines[idx].replace('uint32_t nk;', 'uint32_t nk = 0;')
            print(f' INIT: L{lnum}')

    # --- bugprone-implicit-widening-of-multiplication-result ---
    # L1578: keyagg_out + 38 + i * 32 -> keyagg_out + 38 + static_cast<size_t>(i) * 32
    idx = 1578 - 1
    if (idx < len(lines)
            and 'i * 32' in lines[idx]
            and 'static_cast<size_t>(i)' not in lines[idx]):
        lines[idx] = lines[idx].replace(
            'keyagg_out + 38 + i * 32',
            'keyagg_out + 38 + static_cast<size_t>(i) * 32'
        )
        print(' WIDENING: L1578')

    # --- readability-braces-around-statements (59 alerts) ---
    # Must run last: every fix inserts a closing-brace line.
    brace_lines = [
        1242, 1245, 1248, 1260, 1274, 1277, 1281, 1294, 1297,
        1300, 1314, 1318, 1322, 1340, 1343, 1345, 1355, 1368,
        1412, 1415, 1431, 1435, 1438, 1457, 1462, 1477, 1481,
        1486, 1514, 1516, 1519, 1522, 1525, 1542, 1545, 1549,
        1567, 1577, 1594, 1691, 1695, 1699, 1701, 1749, 1753,
        1787, 1801, 1831, 1834, 1844, 1856, 1974, 2047, 2068,
        2071, 2076, 2138, 2832, 2834,
    ]
    lines = add_braces(lines, brace_lines, 'ufsecp_impl/')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# File: cpu/src/bip39.cpp
# ============================================================================
def fix_bip39():
    """Apply the code-scanning fixes for cpu/src/bip39.cpp.

    All line numbers refer to the file as it was scanned, so the in-place
    single-line edits run first and brace insertion (which adds lines and
    shifts everything below) runs last.
    """
    path = 'cpu/src/bip39.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    # --- misc-const-correctness ---
    const_lines = [33, 46, 47, 97, 126, 127, 128, 129, 136, 145,
                   182, 183, 184, 185, 191, 193, 194, 199,
                   255, 256, 257, 258, 264, 266, 267, 272]
    lines = add_const_to_lines(lines, const_lines, 'bip39/')

    # --- cppcoreguidelines-init-variables ---
    # L137: an uninitialised scalar declaration "TYPE var;" gets a default.
    idx = 137 - 1
    if idx < len(lines):
        line = lines[idx]
        m = re.match(r'(\s*)((?:int|uint\w*|size_t|bool|char|float|double)\s+\w+);(\s*(?://.*)?)\n', line)
        if m:
            type_and_var = m.group(2).rstrip()
            # The pattern cannot match pointer declarations, so only scalar
            # defaults are needed.
            if 'bool' in type_and_var:
                default = 'false'
            elif 'float' in type_and_var or 'double' in type_and_var:
                default = '0.0'
            else:
                default = '0'
            lines[idx] = f'{m.group(1)}{type_and_var} = {default};{m.group(3)}\n'
            print(f' INIT: L137 -> added = {default}')
        else:
            print(f' INIT_SKIP: L137 pattern not matched: {repr(line[:60])}')

    # --- modernize-use-auto ---
    # L191 and L264: replace the spelled-out type with auto.  A leading
    # "const " (added above) is kept, and because only the matched span is
    # substituted the line terminator survives.
    iterator_decl = re.compile(
        r'^(\s*)(const\s+)?\w[\w:<>, *]+::iterator(\s+\w+\s*=.+)')
    begin_decl = re.compile(
        r'^(\s*)(const\s+)?\w[\w:<>, *]+\*?(\s+\w+\s*=\s*\w.+\.begin\(\).+)')

    def _to_auto(match):
        return f"{match.group(1)}{match.group(2) or ''}auto{match.group(3)}"

    for lnum in (191, 264):
        idx = lnum - 1
        if idx >= len(lines):
            continue
        line = lines[idx]
        for pattern in (iterator_decl, begin_decl):
            new_line, hits = pattern.subn(_to_auto, line, count=1)
            if hits:
                lines[idx] = new_line
                print(f' AUTO: L{lnum}')
                break
        else:
            print(f' AUTO_SKIP: L{lnum}: {repr(line[:60])}')

    # --- cert-err33-c (unchecked fclose return) ---
    # L34: std::fclose(f); -> (void)std::fclose(f);
    idx = 34 - 1
    if idx < len(lines):
        line = lines[idx]
        if 'std::fclose' in line and '(void)' not in line:
            lines[idx] = line.replace('std::fclose', '(void)std::fclose')
            print(' ERR33: L34 fclose')

    # --- readability-braces-around-statements ---
    # Must run last: every fix inserts a closing-brace line.
    brace_lines = [49, 50, 93, 110, 117, 138, 140, 150, 171, 196, 200, 223,
                   246, 269, 273]
    lines = add_braces(lines, brace_lines, 'bip39/')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# File: cpu/src/zk.cpp
# ============================================================================
def fix_zk():
    """Apply the code-scanning fixes for cpu/src/zk.cpp.

    Line numbers refer to the file as scanned.  The const edits rewrite
    lines in place, so they run before brace insertion, which adds a line
    per fix and would otherwise shift the const targets.
    """
    path = 'cpu/src/zk.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    const_lines = [359, 363, 446, 448, 500, 642, 661]
    lines = add_const_to_lines(lines, const_lines, 'zk/')

    # Must run last: every fix inserts a closing-brace line.
    brace_lines = [45, 68, 381, 415, 423, 481, 503, 610, 615, 619, 623,
                   664, 668, 675, 686, 688, 720, 785]
    lines = add_braces(lines, brace_lines, 'zk/')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# File: cpu/src/message_signing.cpp
# ============================================================================
def fix_message_signing():
    """Apply the code-scanning fixes for cpu/src/message_signing.cpp.

    Line numbers refer to the file as scanned.  The const edits rewrite
    lines in place, so they run before brace insertion, which adds a line
    per fix and would otherwise shift the const targets.
    """
    path = 'cpu/src/message_signing.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    const_lines = [65, 152, 153, 154, 155, 159, 193, 196]
    lines = add_const_to_lines(lines, const_lines, 'msg_signing/')

    # Must run last: every fix inserts a closing-brace line.
    brace_lines = [30, 35]
    lines = add_braces(lines, brace_lines, 'msg_signing/')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# File: cpu/src/eth_signing.cpp
# ============================================================================
def fix_eth_signing():
    """Apply the code-scanning fixes for cpu/src/eth_signing.cpp.

    Drops the unused ``using fast::Point`` declaration on line 16 (when it is
    there) and then adds ``const`` to the two flagged declarations, whose
    line numbers move up by one if the using-line was removed.
    """
    path = 'cpu/src/eth_signing.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    # --- misc-unused-using-decls: L16 'using fast::Point;' ---
    using_idx = 16 - 1
    if 'using fast::Point' in lines[using_idx]:
        lines.pop(using_idx)
        print(' UNUSED-USING: L16 removed')
        # Scanned positions were 96 and 97; one line above them is gone now.
        const_lines = [95, 96]
    else:
        print(f' UNUSED-USING SKIP: L16: {repr(lines[using_idx][:50])}')
        const_lines = [96, 97]

    lines = add_const_to_lines(lines, const_lines, 'eth_signing/')
    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# File: cpu/src/address.cpp
# ============================================================================
def fix_address():
    """Apply the code-scanning fixes for cpu/src/address.cpp.

    L516 (``for (char c : prefix)``) and L527 (``std::uint8_t version_byte =
    static_cast<...>``) both receive ``const``.  L527 is additionally flagged
    by modernize-use-auto, so after the const pass its explicit type is
    swapped for ``auto``, giving ``const auto version_byte = ...``.
    """
    path = 'cpu/src/address.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    lines = add_const_to_lines(lines, [516, 527], 'address/')

    # modernize-use-auto on L527, applied on top of the const just added.
    target = 527 - 1
    if target < len(lines):
        current = lines[target]
        explicit = 'const std::uint8_t version_byte'
        deduced = 'const auto version_byte'
        if explicit in current:
            lines[target] = current.replace(explicit, deduced)
            print(' AUTO: L527')
        elif deduced in current:
            print(' AUTO: L527 already auto')
        else:
            print(f' AUTO_SKIP: L527: {repr(current[:60])}')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# File: cpu/src/wallet.cpp
# ============================================================================
def fix_wallet():
    """Apply the code-scanning fixes for cpu/src/wallet.cpp.

    bugprone-misplaced-widening-cast at L150 and L171: the sum
    ``27 + rsig.recid`` was widened only after being computed, so each
    operand is now cast to ``std::uint64_t`` before the addition.
    """
    path = 'cpu/src/wallet.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    narrow = 'static_cast<std::uint64_t>(27 + rsig.recid)'
    widened = 'static_cast<std::uint64_t>(27) + static_cast<std::uint64_t>(rsig.recid)'
    for lnum in (150, 171):
        pos = lnum - 1
        if pos >= len(lines):
            continue
        current = lines[pos]
        if narrow not in current:
            print(f' WIDEN_SKIP: L{lnum}: {repr(current[:60])}')
            continue
        lines[pos] = current.replace(narrow, widened)
        print(f' WIDEN: L{lnum}')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# File: cpu/src/coin_address.cpp
# ============================================================================
def fix_coin_address():
    """Apply the single code-scanning fix for cpu/src/coin_address.cpp.

    L170 (``std::string prefix = testnet ? ...``) is flagged by
    misc-const-correctness and gets a ``const`` qualifier.
    """
    path = 'cpu/src/coin_address.cpp'
    print(f'\n=== {path} ===')
    source_lines = read(path)
    source_lines = add_const_to_lines(source_lines, [170], 'coin_address/')
    save(path, source_lines)
    print(f' Saved {path} ({len(source_lines)} lines)')
# ============================================================================
# File: cpu/tests/test_bip39.cpp
# ============================================================================
# C++ snippets used by fix_test_bip39() for whole-block search/replace.
# Each *_SSCANF_* / *_OLD constant is the text expected in the test file and
# the matching *_STRTOUL_* / *_NEW constant is what it is replaced with.
# The match is an exact substring test, so every character of these literals
# (including whitespace) has to agree with the target file.

# hex_to_bytes as currently written: parses each byte with std::sscanf inside
# a clang deprecation-warning suppression block.
HEX_TO_BYTES_SSCANF_BIP39 = '''\
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
for (size_t i = 0; i < len; ++i) {
unsigned int byte = 0;
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
std::sscanf(hex + 2 * i, "%02x", &byte);
#ifdef __clang__
#pragma clang diagnostic pop
#endif
out[i] = static_cast<uint8_t>(byte);
}
}'''
# Replacement hex_to_bytes: copies two hex digits into a NUL-terminated buffer,
# parses them with std::strtoul and stores 0 unless both digits were consumed.
HEX_TO_BYTES_STRTOUL_BIP39 = '''\
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
for (size_t i = 0; i < len; ++i) {
char pair[3] = { hex[2 * i], hex[2 * i + 1], '\\0' };
char* endptr = nullptr;
const unsigned long val = std::strtoul(pair, &endptr, 16);
out[i] = (endptr == pair + 2) ? static_cast<uint8_t>(val) : 0;
}
}'''
# bytes_to_hex as currently written: the std::snprintf result is discarded
# implicitly (cert-err33-c).
BYTES_TO_HEX_OLD = '''\
static std::string bytes_to_hex(const uint8_t* data, size_t len) {
std::string result;
result.reserve(len * 2);
for (size_t i = 0; i < len; ++i) {
char buf[3];
std::snprintf(buf, sizeof(buf), "%02x", data[i]);
result += buf;
}
return result;
}'''
# Replacement bytes_to_hex: identical except that the std::snprintf result is
# discarded explicitly with a (void) cast.
BYTES_TO_HEX_NEW = '''\
static std::string bytes_to_hex(const uint8_t* data, size_t len) {
std::string result;
result.reserve(len * 2);
for (size_t i = 0; i < len; ++i) {
char buf[3];
(void)std::snprintf(buf, sizeof(buf), "%02x", data[i]);
result += buf;
}
return result;
}'''
def fix_test_bip39():
    """Apply the code-scanning fixes for cpu/tests/test_bip39.cpp.

    Works in two passes.  The first pass does whole-text substitutions
    (sscanf -> strtoul, explicit ``(void)`` on snprintf, a null guard after
    the wordlist CHECK) and writes the file back.  Because those
    substitutions change the line count, the second pass locates the
    const-correctness targets by their text rather than by line number.
    """
    path = 'cpu/tests/test_bip39.cpp'
    print(f'\n=== {path} ===')
    target = BASE / path
    content = target.read_text()

    # cert-err33-c + cert-err34-c: replace sscanf with strtoul
    if HEX_TO_BYTES_SSCANF_BIP39 not in content:
        print(' ERR34_SKIP: hex_to_bytes sscanf pattern not found')
    else:
        content = content.replace(HEX_TO_BYTES_SSCANF_BIP39, HEX_TO_BYTES_STRTOUL_BIP39)
        print(' ERR34: hex_to_bytes sscanf -> strtoul')

    # cert-err33-c: snprintf return unchecked
    if BYTES_TO_HEX_OLD not in content:
        print(' ERR33_SKIP: bytes_to_hex pattern not found')
    else:
        content = content.replace(BYTES_TO_HEX_OLD, BYTES_TO_HEX_NEW)
        print(' ERR33: bytes_to_hex snprintf -> (void)snprintf')

    # clang-analyzer-core.NullDereference at L99: wl is indexed right after
    # CHECK(wl != nullptr, ...), so an explicit early return is inserted.
    old_null = ' CHECK(wl != nullptr, "wordlist not null");\n CHECK(std::strcmp(wl[0]'
    new_null = ' CHECK(wl != nullptr, "wordlist not null");\n if (!wl) { return; }\n CHECK(std::strcmp(wl[0]'
    if old_null not in content:
        print(' NULL_DEREF_SKIP: pattern not found')
    else:
        content = content.replace(old_null, new_null)
        print(' NULL_DEREF: L99 added null guard')

    target.write_text(content)

    # Second pass: const-correctness, matched by content.
    lines = read(path)

    # std::string hex = bytes_to_hex(...) -> const std::string hex = ...
    for pos, text in enumerate(lines):
        body = text.lstrip()
        if body.startswith('std::string hex = bytes_to_hex('):
            indent = text[:len(text) - len(body)]
            lines[pos] = indent + 'const ' + body
            print(f' CONST: L{pos+1} std::string hex')

    # for (char c : mnemonic) -> for (const char c : mnemonic)
    for pos, text in enumerate(lines):
        if 'for (char c : mnemonic)' in text:
            lines[pos] = text.replace('for (char c : mnemonic)',
                                      'for (const char c : mnemonic)')
            print(f' CONST: L{pos+1} for (char c : mnemonic)')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# File: cpu/tests/test_ethereum.cpp
# ============================================================================
# C++ snippets used by fix_test_ethereum() for exact-substring search/replace;
# every character of the "old" literals has to agree with the target file.

# hex_to_bytes as currently written in test_ethereum.cpp (std::sscanf based).
HEX_TO_BYTES_SSCANF_ETH = '''\
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
for (size_t i = 0; i < len; ++i) {
unsigned int byte = 0;
if (std::sscanf(hex + i * 2, "%02x", &byte) != 1) byte = 0;
out[i] = static_cast<uint8_t>(byte);
}
}'''
# Replacement hex_to_bytes: parses each two-digit pair with std::strtoul and
# stores 0 unless both digits were consumed.
HEX_TO_BYTES_STRTOUL_ETH = '''\
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
for (size_t i = 0; i < len; ++i) {
char pair[3] = { hex[i * 2], hex[i * 2 + 1], '\\0' };
char* endptr = nullptr;
const unsigned long val = std::strtoul(pair, &endptr, 16);
out[i] = (endptr == pair + 2) ? static_cast<uint8_t>(val) : 0;
}
}'''
# Start of the snprintf call whose result is ignored (cert-err33-c), and the
# same text with an explicit (void) cast.
SNPRINTF_ETH_OLD = ' std::snprintf(buf, sizeof(buf), "Round-trip chain_id=%lu (%s)",'
SNPRINTF_ETH_NEW = ' (void)std::snprintf(buf, sizeof(buf), "Round-trip chain_id=%lu (%s)",'
def fix_test_ethereum():
    """Apply the code-scanning fixes for cpu/tests/test_ethereum.cpp.

    Pass 1 (whole text): sscanf -> strtoul in hex_to_bytes, explicit
    ``(void)`` on one snprintf call, and extraction of three compound
    ``ASSERT_TRUE`` conditions into named ``const bool`` locals.
    Pass 2 (line by line): adds ``const`` to declarations recognised by
    their text, since pass 1 changes the line count.

    Every step reports whether it actually changed something.
    NOTE(review): pass 2 qualifies *every* line matching a pattern, not only
    the lines that were flagged -- confirm none of the matched arrays or
    bools is written to later.
    """
    path = 'cpu/tests/test_ethereum.cpp'
    print(f'\n=== {path} ===')
    content = (BASE / path).read_text()

    # cert-err34-c: sscanf -> strtoul
    if HEX_TO_BYTES_SSCANF_ETH in content:
        content = content.replace(HEX_TO_BYTES_SSCANF_ETH, HEX_TO_BYTES_STRTOUL_ETH)
        print(' ERR34: hex_to_bytes sscanf -> strtoul')
    else:
        print(' ERR34_SKIP: hex_to_bytes pattern not found')

    # cert-err33-c at L352: snprintf return unchecked
    if SNPRINTF_ETH_OLD in content:
        content = content.replace(SNPRINTF_ETH_OLD, SNPRINTF_ETH_NEW)
        print(' ERR33: snprintf -> (void)snprintf')
    else:
        print(' ERR33_SKIP: snprintf pattern not found')

    # readability-simplify-boolean-expr: extract conditions to named bools.
    bool_rewrites = (
        (' ASSERT_TRUE(sig.v == 27 || sig.v == 28, "legacy v should be 27 or 28");',
         ' {\n const bool v_ok = (sig.v == 27 || sig.v == 28);\n ASSERT_TRUE(v_ok, "legacy v should be 27 or 28");\n }'),
        (' ASSERT_TRUE(sig2.v == 37 || sig2.v == 38, "EIP-155 v should be 37 or 38");',
         ' {\n const bool v2_ok = (sig2.v == 37 || sig2.v == 38);\n ASSERT_TRUE(v2_ok, "EIP-155 v should be 37 or 38");\n }'),
        (' ASSERT_TRUE(sig.v == 27 || sig.v == 28, "v should be 27 or 28");',
         ' {\n const bool v_ok2 = (sig.v == 27 || sig.v == 28);\n ASSERT_TRUE(v_ok2, "v should be 27 or 28");\n }'),
    )
    extracted = 0
    for old_assert, new_block in bool_rewrites:
        hits = content.count(old_assert)
        if hits:
            content = content.replace(old_assert, new_block)
            extracted += hits
    if extracted:
        print(f' SIMPLIFY-BOOL: test_ethereum sig.v checks ({extracted} rewritten)')
    else:
        print(' SIMPLIFY-BOOL_SKIP: no sig.v assertion pattern found')

    (BASE / path).write_text(content)

    # Add const to variable declarations (by pattern, first match wins).
    const_rules = (
        (r'Point pk\d? = ', 'Point pk'),
        (r'(bool (valid|wrong\d?|r_zero|s_zero|all_zero)) = ', 'bool'),
        (r'std::array<uint8_t, 32> (hash|wrong_hash|zero)\{\}', 'array'),
        (r'auto expected_addr = ethernet_address_bytes', 'auto expected_addr'),
        (r'auto expected_addr = ethereum_address_bytes', 'auto expected_addr'),
    )
    lines = read(path)
    for idx in range(len(lines)):
        line = lines[idx]
        stripped = line.lstrip()
        if stripped.startswith('const '):
            continue
        leading = line[:len(line) - len(stripped)]
        for pattern, label in const_rules:
            if re.match(pattern, stripped):
                lines[idx] = leading + 'const ' + stripped
                print(f' CONST: L{idx+1} {label}')
                break

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# File: cpu/tests/test_wallet.cpp
# ============================================================================
# C++ snippets used by fix_test_wallet() for exact-substring search/replace;
# every character of the "old" literal has to agree with the target file.

# hex_to_bytes as currently written in test_wallet.cpp (std::sscanf based).
HEX_TO_BYTES_SSCANF_WALLET = '''\
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
for (size_t i = 0; i < len; ++i) {
unsigned int byte = 0;
if (std::sscanf(hex + i * 2, "%02x", &byte) != 1) byte = 0;
out[i] = static_cast<uint8_t>(byte);
}
}'''
# Replacement hex_to_bytes: parses each two-digit pair with std::strtoul and
# stores 0 unless both digits were consumed.
HEX_TO_BYTES_STRTOUL_WALLET = '''\
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
for (size_t i = 0; i < len; ++i) {
char pair[3] = { hex[i * 2], hex[i * 2 + 1], '\\0' };
char* endptr = nullptr;
const unsigned long val = std::strtoul(pair, &endptr, 16);
out[i] = (endptr == pair + 2) ? static_cast<uint8_t>(val) : 0;
}
}'''
def fix_test_wallet():
    """Apply the code-scanning fixes for cpu/tests/test_wallet.cpp.

    Pass 1 (whole text): removes the unused ``using fast::Point;``,
    switches hex_to_bytes from sscanf to strtoul and extracts four compound
    ``ASSERT_TRUE`` conditions into named ``const bool`` locals.
    Pass 2 (line by line): adds ``const`` to declarations recognised by
    their text, since pass 1 changes the line count.

    Every step reports how much it actually changed.
    """
    path = 'cpu/tests/test_wallet.cpp'
    print(f'\n=== {path} ===')
    content = (BASE / path).read_text()

    # misc-unused-using-decls: L45 'using fast::Point;'
    if 'using fast::Point;\n' in content:
        content = content.replace('using fast::Point;\n', '')
        print(' UNUSED-USING: removed using fast::Point')
    else:
        print(' UNUSED-USING SKIP: using fast::Point not found')

    # cert-err34-c: sscanf -> strtoul
    if HEX_TO_BYTES_SSCANF_WALLET in content:
        content = content.replace(HEX_TO_BYTES_SSCANF_WALLET, HEX_TO_BYTES_STRTOUL_WALLET)
        print(' ERR34: hex_to_bytes sscanf -> strtoul')
    else:
        print(' ERR34_SKIP: hex_to_bytes sscanf pattern not found')

    # readability-simplify-boolean-expr: extract to named bools
    # (scanned at L197, L397, L505 and the multi-line assertion at L602).
    bool_rewrites = (
        (' ASSERT_TRUE(wif[0] == \'K\' || wif[0] == \'L\', "WIF starts with K or L");',
         ' {\n const bool wif_prefix_ok = (wif[0] == \'K\' || wif[0] == \'L\');\n ASSERT_TRUE(wif_prefix_ok, "WIF starts with K or L");\n }'),
        (' ASSERT_TRUE(sig.recid >= 0 && sig.recid <= 3, "valid recid");',
         ' {\n const bool recid_ok = (sig.recid >= 0 && sig.recid <= 3);\n ASSERT_TRUE(recid_ok, "valid recid");\n }'),
        (' ASSERT_TRUE(!btc.empty() && !ltc.empty() && !doge.empty(), "all non-empty");',
         ' {\n const bool coins_non_empty = !btc.empty() && !ltc.empty() && !doge.empty();\n ASSERT_TRUE(coins_non_empty, "all non-empty");\n }'),
        (' ASSERT_TRUE(!p2pkh.empty() && !p2wpkh.empty() && !p2sh.empty() && !p2tr.empty(),\n "all non-empty");',
         ' {\n const bool addrs_non_empty = !p2pkh.empty() && !p2wpkh.empty() && !p2sh.empty() && !p2tr.empty();\n ASSERT_TRUE(addrs_non_empty, "all non-empty");\n }'),
    )
    extracted = 0
    for old_assert, new_block in bool_rewrites:
        hits = content.count(old_assert)
        if hits:
            content = content.replace(old_assert, new_block)
            extracted += hits
    if extracted:
        print(f' SIMPLIFY-BOOL: {extracted} bool expressions extracted')
    else:
        print(' SIMPLIFY-BOOL_SKIP: no assertion pattern found')

    (BASE / path).write_text(content)

    # Add const to variable declarations (by pattern, first match wins).
    const_rules = (
        (r'size_t msg_len = sizeof\(msg\) - 1;', 'size_t msg_len'),
        (r'bool ok = bitcoin_verify_message\(', 'bool ok'),
        (r'bool bad = bitcoin_verify_message\(', 'bool bad'),
        (r'bool verified = verify_message\(', 'bool verified'),
    )
    lines = read(path)
    for idx in range(len(lines)):
        line = lines[idx]
        stripped = line.lstrip()
        if stripped.startswith('const '):
            continue
        leading = line[:len(line) - len(stripped)]
        for pattern, label in const_rules:
            if re.match(pattern, stripped):
                lines[idx] = leading + 'const ' + stripped
                print(f' CONST: L{idx+1} {label}')
                break

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# File: cpu/tests/test_zk.cpp
# ============================================================================
def fix_test_zk():
    """Apply the code-scanning fixes for cpu/tests/test_zk.cpp.

    All ten alerts are misc-const-correctness, so each listed line simply
    receives a ``const`` qualifier.
    """
    path = 'cpu/tests/test_zk.cpp'
    print(f'\n=== {path} ===')
    flagged = [60, 95, 103, 117, 134, 267, 281, 295, 309, 325]
    source_lines = add_const_to_lines(read(path), flagged, 'test_zk/')
    save(path, source_lines)
    print(f' Saved {path} ({len(source_lines)} lines)')
# ============================================================================
# File: audit/test_ffi_round_trip.cpp
# ============================================================================
def fix_test_ffi():
    """Apply the code-scanning fixes for audit/test_ffi_round_trip.cpp.

    First replaces the tautological ``validate(...) != OK || validate(...)
    == OK`` CHECK (misc-redundant-expression, L1055) with a plain
    ``== UFSECP_OK`` assertion -- by exact text if possible, otherwise via a
    whitespace-tolerant regex.  Then, matching by content because line
    numbers may have moved, marks two declarations ``const``.
    """
    path = 'audit/test_ffi_round_trip.cpp'
    print(f'\n=== {path} ===')
    target = BASE / path
    content = target.read_text()

    mnemonic_12 = ('abandon abandon abandon abandon abandon abandon '
                   'abandon abandon abandon abandon abandon abandon')
    old_check = (
        'CHECK(ufsecp_bip39_validate(ctx, "' + mnemonic_12 + '") != UFSECP_OK\n'
        ' || ufsecp_bip39_validate(ctx, "' + mnemonic_12 + '") == UFSECP_OK,\n'
        ' "bip39_validate accepts or rejects known mnemonic");'
    )
    new_check = (
        'CHECK(ufsecp_bip39_validate(ctx, "' + mnemonic_12 + '") == UFSECP_OK,\n'
        ' "bip39_validate accepts valid 12-word mnemonic");'
    )

    if old_check in content:
        content = content.replace(old_check, new_check)
        print(' REDUNDANT: L1055 tautological check fixed')
    else:
        print(' REDUNDANT_SKIP: L1055 exact pattern not found, trying partial match')
        # Fallback: the message text is still there, so locate the whole
        # CHECK(...) with a regex that tolerates different whitespace.
        if 'bip39_validate accepts or rejects known mnemonic' in content:
            loose = re.compile(
                r'CHECK\(ufsecp_bip39_validate\(ctx,\s*"abandon[^"]+"\)\s*!=\s*UFSECP_OK\s*\n'
                r'\s*\|\|\s*ufsecp_bip39_validate\(ctx,\s*"abandon[^"]+"\)\s*==\s*UFSECP_OK,\s*\n'
                r'\s*"bip39_validate accepts or rejects known mnemonic"\)',
                re.MULTILINE
            )
            substitute = (
                'CHECK(ufsecp_bip39_validate(ctx, "' + mnemonic_12 + '") == UFSECP_OK,\n'
                ' "bip39_validate accepts valid 12-word mnemonic")'
            )
            content, n = loose.subn(substitute, content)
            if n:
                print(f' REDUNDANT: L1055 fixed via regex ({n} replacement)')
            else:
                print(' REDUNDANT_FAIL: could not fix L1055')

    target.write_text(content)

    # misc-const-correctness (scanned at L1317 and L1538), matched by text.
    lines = read(path)
    for pos, text in enumerate(lines):
        body = text.lstrip()
        if body.startswith('const '):
            continue
        indent = text[:len(text) - len(body)]
        if body == 'size_t msg_len = 15;\n':
            lines[pos] = indent + 'const ' + body
            print(f' CONST: L{pos+1} size_t msg_len = 15')
        elif body.startswith('bool match = (std::memcmp('):
            lines[pos] = indent + 'const ' + body
            print(f' CONST: L{pos+1} bool match')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
# ============================================================================
# Main
# ============================================================================
if __name__ == '__main__':
    # Run every per-file fixer in a fixed order, then do a coarse sanity
    # check: the number of '{' and '}' characters in each touched file
    # should agree.  The check counts raw characters, so braces inside
    # strings or comments are included.
    print('Fix Round 4: resolving 211 code-scanning alerts')
    print('=' * 60)
    for fixer in (
        fix_ufsecp_impl,
        fix_bip39,
        fix_zk,
        fix_message_signing,
        fix_eth_signing,
        fix_address,
        fix_wallet,
        fix_coin_address,
        fix_test_bip39,
        fix_test_ethereum,
        fix_test_wallet,
        fix_test_zk,
        fix_test_ffi,
    ):
        fixer()

    print('\n' + '=' * 60)
    print('Done. Check brace balance:')
    touched = (
        'include/ufsecp/ufsecp_impl.cpp',
        'cpu/src/bip39.cpp',
        'cpu/src/zk.cpp',
        'cpu/src/message_signing.cpp',
        'cpu/src/eth_signing.cpp',
        'cpu/src/address.cpp',
        'cpu/src/wallet.cpp',
        'cpu/src/coin_address.cpp',
        'cpu/tests/test_bip39.cpp',
        'cpu/tests/test_ethereum.cpp',
        'cpu/tests/test_wallet.cpp',
        'cpu/tests/test_zk.cpp',
        'audit/test_ffi_round_trip.cpp',
    )
    balanced = True
    for rel_path in touched:
        try:
            source = (BASE / rel_path).read_text()
            n_open, n_close = source.count('{'), source.count('}')
            if n_open == n_close:
                print(f' {rel_path}: OK')
            else:
                print(f' {rel_path}: MISMATCH ({n_open} vs {n_close})')
                balanced = False
        except Exception as exc:
            print(f' {rel_path}: ERROR {exc}')
            balanced = False

    if balanced:
        print('\nAll brace counts balanced.')
    else:
        print('\nWARNING: Some files have mismatched braces!')

View File

@ -197,13 +197,11 @@ public:
std::vector<secp256k1::opencl::AffinePoint> h_aff(count);
ctx_->batch_jacobian_to_affine(h_jac.data(), h_aff.data(), count);
/* CPU: SHA-256(x_bytes) → 32-byte shared secret */
/* CPU: SHA-256(compressed shared point) to match ufsecp_ecdh/CUDA. */
for (size_t i = 0; i < count; ++i) {
std::array<uint64_t, 4> xl;
std::memcpy(xl.data(), h_aff[i].x.limbs, 32);
auto fe = secp256k1::fast::FieldElement::from_limbs(xl);
auto xbytes = fe.to_bytes();
auto digest = secp256k1::SHA256::hash(xbytes.data(), 32);
uint8_t compressed[33];
affine_to_compressed(&h_aff[i], compressed);
auto digest = secp256k1::SHA256::hash(compressed, sizeof(compressed));
std::memcpy(out_secrets32 + i * 32, digest.data(), 32);
}

View File

@ -6,7 +6,7 @@
"vendor": "NVIDIA Corporation",
"version": "OpenCL 3.0 CUDA",
"driver_version": "580.126.09",
"memory_mb": 15847,
"memory_mb": 15844,
"compute_units": 36
},
"platform": {
@ -20,36 +20,36 @@
"passed": 27,
"failed": 0,
"skipped": 0,
"total_seconds": 0.727543,
"total_seconds": 0.673606,
"verdict": "AUDIT-READY"
},
"modules": [
{ "id": "selftest_core", "name": "OpenCL Selftest (23+ kernel tests)", "section": "math_invariants", "result": "PASS", "time_ms": 152.799583, "error_code": 0 },
{ "id": "field_add_sub", "name": "Field add/sub roundtrip", "section": "math_invariants", "result": "PASS", "time_ms": 0.307649, "error_code": 0 },
{ "id": "field_mul_comm", "name": "Field mul commutativity", "section": "math_invariants", "result": "PASS", "time_ms": 0.140150, "error_code": 0 },
{ "id": "field_inv", "name": "Field inverse roundtrip (a * a^-1 = 1)", "section": "math_invariants", "result": "PASS", "time_ms": 0.266819, "error_code": 0 },
{ "id": "field_sqr", "name": "Field square == mul(a,a)", "section": "math_invariants", "result": "PASS", "time_ms": 0.120384, "error_code": 0 },
{ "id": "field_negate", "name": "Field negate roundtrip (a + (-a) = 0)", "section": "math_invariants", "result": "PASS", "time_ms": 0.118151, "error_code": 0 },
{ "id": "gen_mul_vec", "name": "Generator mul known vectors", "section": "math_invariants", "result": "PASS", "time_ms": 0.069495, "error_code": 0 },
{ "id": "scalar_roundtrip", "name": "Scalar/Point consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.196458, "error_code": 0 },
{ "id": "add_dbl_consist", "name": "Point add vs double consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.226176, "error_code": 0 },
{ "id": "scalar_mul_lin", "name": "Scalar mul linearity (a+b)*G = aG+bG", "section": "math_invariants", "result": "PASS", "time_ms": 0.334149, "error_code": 0 },
{ "id": "group_order", "name": "Group order basic checks", "section": "math_invariants", "result": "PASS", "time_ms": 0.224126, "error_code": 0 },
{ "id": "batch_inv", "name": "Batch inversion (Montgomery trick)", "section": "math_invariants", "result": "PASS", "time_ms": 0.639383, "error_code": 0 },
{ "id": "ecdsa_roundtrip", "name": "ECDSA sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 7.346328, "error_code": 0 },
{ "id": "schnorr_roundtrip", "name": "Schnorr/BIP-340 sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 6.647268, "error_code": 0 },
{ "id": "ecdsa_wrong_key", "name": "ECDSA verify rejects wrong pubkey", "section": "signatures", "result": "PASS", "time_ms": 6.214200, "error_code": 0 },
{ "id": "batch_smul", "name": "Batch scalar mul generator", "section": "batch_advanced", "result": "PASS", "time_ms": 0.435053, "error_code": 0 },
{ "id": "batch_j2a", "name": "Batch Jacobian to Affine", "section": "batch_advanced", "result": "PASS", "time_ms": 0.518009, "error_code": 0 },
{ "id": "diff_smul", "name": "OpenCL-host differential scalar mul", "section": "differential", "result": "PASS", "time_ms": 0.070056, "error_code": 0 },
{ "id": "rfc6979_determ", "name": "RFC-6979 ECDSA deterministic nonce", "section": "standard_vectors", "result": "PASS", "time_ms": 6.422079, "error_code": 0 },
{ "id": "bip340_vectors", "name": "BIP-340 Schnorr known-key roundtrip", "section": "standard_vectors", "result": "PASS", "time_ms": 8.872533, "error_code": 0 },
{ "id": "ecdsa_multi_key", "name": "ECDSA multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 60.908449, "error_code": 0 },
{ "id": "schnorr_multi_key", "name": "Schnorr multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 55.286184, "error_code": 0 },
{ "id": "fuzz_edge_scalar", "name": "Edge-case scalars (0*G, 1*G, G+G=2G)", "section": "fuzzing", "result": "PASS", "time_ms": 0.229781, "error_code": 0 },
{ "id": "fuzz_ecdsa_zero", "name": "ECDSA rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.077824, "error_code": 0 },
{ "id": "fuzz_schnorr_zero", "name": "Schnorr rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.082019, "error_code": 0 },
{ "id": "perf_ecdsa_50", "name": "ECDSA 50-iteration stress", "section": "performance", "result": "PASS", "time_ms": 287.677880, "error_code": 0 },
{ "id": "perf_schnorr_25", "name": "Schnorr 25-iteration stress", "section": "performance", "result": "PASS", "time_ms": 131.178937, "error_code": 0 }
{ "id": "selftest_core", "name": "OpenCL Selftest (23+ kernel tests)", "section": "math_invariants", "result": "PASS", "time_ms": 137.571479, "error_code": 0 },
{ "id": "field_add_sub", "name": "Field add/sub roundtrip", "section": "math_invariants", "result": "PASS", "time_ms": 0.335681, "error_code": 0 },
{ "id": "field_mul_comm", "name": "Field mul commutativity", "section": "math_invariants", "result": "PASS", "time_ms": 0.215808, "error_code": 0 },
{ "id": "field_inv", "name": "Field inverse roundtrip (a * a^-1 = 1)", "section": "math_invariants", "result": "PASS", "time_ms": 0.280040, "error_code": 0 },
{ "id": "field_sqr", "name": "Field square == mul(a,a)", "section": "math_invariants", "result": "PASS", "time_ms": 0.129584, "error_code": 0 },
{ "id": "field_negate", "name": "Field negate roundtrip (a + (-a) = 0)", "section": "math_invariants", "result": "PASS", "time_ms": 0.131630, "error_code": 0 },
{ "id": "gen_mul_vec", "name": "Generator mul known vectors", "section": "math_invariants", "result": "PASS", "time_ms": 0.062463, "error_code": 0 },
{ "id": "scalar_roundtrip", "name": "Scalar/Point consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.121435, "error_code": 0 },
{ "id": "add_dbl_consist", "name": "Point add vs double consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.219232, "error_code": 0 },
{ "id": "scalar_mul_lin", "name": "Scalar mul linearity (a+b)*G = aG+bG", "section": "math_invariants", "result": "PASS", "time_ms": 0.330590, "error_code": 0 },
{ "id": "group_order", "name": "Group order basic checks", "section": "math_invariants", "result": "PASS", "time_ms": 0.199699, "error_code": 0 },
{ "id": "batch_inv", "name": "Batch inversion (Montgomery trick)", "section": "math_invariants", "result": "PASS", "time_ms": 0.353371, "error_code": 0 },
{ "id": "ecdsa_roundtrip", "name": "ECDSA sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 7.218292, "error_code": 0 },
{ "id": "schnorr_roundtrip", "name": "Schnorr/BIP-340 sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 5.841064, "error_code": 0 },
{ "id": "ecdsa_wrong_key", "name": "ECDSA verify rejects wrong pubkey", "section": "signatures", "result": "PASS", "time_ms": 6.111775, "error_code": 0 },
{ "id": "batch_smul", "name": "Batch scalar mul generator", "section": "batch_advanced", "result": "PASS", "time_ms": 0.093349, "error_code": 0 },
{ "id": "batch_j2a", "name": "Batch Jacobian to Affine", "section": "batch_advanced", "result": "PASS", "time_ms": 0.083400, "error_code": 0 },
{ "id": "diff_smul", "name": "OpenCL-host differential scalar mul", "section": "differential", "result": "PASS", "time_ms": 0.020089, "error_code": 0 },
{ "id": "rfc6979_determ", "name": "RFC-6979 ECDSA deterministic nonce", "section": "standard_vectors", "result": "PASS", "time_ms": 6.313681, "error_code": 0 },
{ "id": "bip340_vectors", "name": "BIP-340 Schnorr known-key roundtrip", "section": "standard_vectors", "result": "PASS", "time_ms": 7.330723, "error_code": 0 },
{ "id": "ecdsa_multi_key", "name": "ECDSA multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 59.571898, "error_code": 0 },
{ "id": "schnorr_multi_key", "name": "Schnorr multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 47.122783, "error_code": 0 },
{ "id": "fuzz_edge_scalar", "name": "Edge-case scalars (0*G, 1*G, G+G=2G)", "section": "fuzzing", "result": "PASS", "time_ms": 0.078238, "error_code": 0 },
{ "id": "fuzz_ecdsa_zero", "name": "ECDSA rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.042447, "error_code": 0 },
{ "id": "fuzz_schnorr_zero", "name": "Schnorr rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.045454, "error_code": 0 },
{ "id": "perf_ecdsa_50", "name": "ECDSA 50-iteration stress", "section": "performance", "result": "PASS", "time_ms": 282.603579, "error_code": 0 },
{ "id": "perf_schnorr_25", "name": "Schnorr 25-iteration stress", "section": "performance", "result": "PASS", "time_ms": 111.053794, "error_code": 0 }
]
}

View File

@ -2,60 +2,60 @@
UltrafastSecp256k1 -- OpenCL Unified Audit Report
Framework v2.0.0
Linux x86-64 | GCC 14.2.0 | Release
Device: NVIDIA GeForce RTX 5060 Ti (NVIDIA Corporation) | 36 CUs | 15847 MB
Device: NVIDIA GeForce RTX 5060 Ti (NVIDIA Corporation) | 36 CUs | 15844 MB
================================================================
Section: math_invariants
--------------------------------------------------
[PASS] OpenCL Selftest (23+ kernel tests) (152.8 ms)
[PASS] Field add/sub roundtrip (0.307649 ms)
[PASS] Field mul commutativity (0.14015 ms)
[PASS] Field inverse roundtrip (a * a^-1 = 1) (0.266819 ms)
[PASS] Field square == mul(a,a) (0.120384 ms)
[PASS] Field negate roundtrip (a + (-a) = 0) (0.118151 ms)
[PASS] Generator mul known vectors (0.069495 ms)
[PASS] Scalar/Point consistency (0.196458 ms)
[PASS] Point add vs double consistency (0.226176 ms)
[PASS] Scalar mul linearity (a+b)*G = aG+bG (0.334149 ms)
[PASS] Group order basic checks (0.224126 ms)
[PASS] Batch inversion (Montgomery trick) (0.639383 ms)
[PASS] OpenCL Selftest (23+ kernel tests) (137.571 ms)
[PASS] Field add/sub roundtrip (0.335681 ms)
[PASS] Field mul commutativity (0.215808 ms)
[PASS] Field inverse roundtrip (a * a^-1 = 1) (0.28004 ms)
[PASS] Field square == mul(a,a) (0.129584 ms)
[PASS] Field negate roundtrip (a + (-a) = 0) (0.13163 ms)
[PASS] Generator mul known vectors (0.062463 ms)
[PASS] Scalar/Point consistency (0.121435 ms)
[PASS] Point add vs double consistency (0.219232 ms)
[PASS] Scalar mul linearity (a+b)*G = aG+bG (0.33059 ms)
[PASS] Group order basic checks (0.199699 ms)
[PASS] Batch inversion (Montgomery trick) (0.353371 ms)
Section: signatures
--------------------------------------------------
[PASS] ECDSA sign + verify roundtrip (7.34633 ms)
[PASS] Schnorr/BIP-340 sign + verify roundtrip (6.64727 ms)
[PASS] ECDSA verify rejects wrong pubkey (6.2142 ms)
[PASS] ECDSA sign + verify roundtrip (7.21829 ms)
[PASS] Schnorr/BIP-340 sign + verify roundtrip (5.84106 ms)
[PASS] ECDSA verify rejects wrong pubkey (6.11177 ms)
Section: batch_advanced
--------------------------------------------------
[PASS] Batch scalar mul generator (0.435053 ms)
[PASS] Batch Jacobian to Affine (0.518009 ms)
[PASS] Batch scalar mul generator (0.093349 ms)
[PASS] Batch Jacobian to Affine (0.0834 ms)
Section: differential
--------------------------------------------------
[PASS] OpenCL-host differential scalar mul (0.070056 ms)
[PASS] OpenCL-host differential scalar mul (0.020089 ms)
Section: standard_vectors
--------------------------------------------------
[PASS] RFC-6979 ECDSA deterministic nonce (6.42208 ms)
[PASS] BIP-340 Schnorr known-key roundtrip (8.87253 ms)
[PASS] RFC-6979 ECDSA deterministic nonce (6.31368 ms)
[PASS] BIP-340 Schnorr known-key roundtrip (7.33072 ms)
Section: protocol_security
--------------------------------------------------
[PASS] ECDSA multi-key (10 keys) sign+verify (60.9084 ms)
[PASS] Schnorr multi-key (10 keys) sign+verify (55.2862 ms)
[PASS] ECDSA multi-key (10 keys) sign+verify (59.5719 ms)
[PASS] Schnorr multi-key (10 keys) sign+verify (47.1228 ms)
Section: fuzzing
--------------------------------------------------
[PASS] Edge-case scalars (0*G, 1*G, G+G=2G) (0.229781 ms)
[PASS] ECDSA rejects zero private key (0.077824 ms)
[PASS] Schnorr rejects zero private key (0.082019 ms)
[PASS] Edge-case scalars (0*G, 1*G, G+G=2G) (0.078238 ms)
[PASS] ECDSA rejects zero private key (0.042447 ms)
[PASS] Schnorr rejects zero private key (0.045454 ms)
Section: performance
--------------------------------------------------
[PASS] ECDSA 50-iteration stress (287.678 ms)
[PASS] Schnorr 25-iteration stress (131.179 ms)
[PASS] ECDSA 50-iteration stress (282.604 ms)
[PASS] Schnorr 25-iteration stress (111.054 ms)
================================================================
VERDICT: AUDIT-READY

View File

@ -145,12 +145,20 @@ set(KERNEL_FILE_1 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_field.cl")
set(KERNEL_FILE_2 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_point.cl")
set(KERNEL_FILE_3 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_batch.cl")
set(KERNEL_FILE_4 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_affine.cl")
set(KERNEL_FILE_5 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_extended.cl")
set(KERNEL_FILE_6 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_hash160.cl")
set(KERNEL_FILE_7 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_ecdh.cl")
set(KERNEL_FILE_8 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_bip352.cl")
set(KERNEL_FILES_LIST
${KERNEL_FILE_1}
${KERNEL_FILE_2}
${KERNEL_FILE_3}
${KERNEL_FILE_4}
${KERNEL_FILE_5}
${KERNEL_FILE_6}
${KERNEL_FILE_7}
${KERNEL_FILE_8}
)
set(KERNEL_HEADER "${CMAKE_CURRENT_BINARY_DIR}/include/secp256k1_kernels_embedded.hpp")
@ -215,6 +223,20 @@ else()
)
endif()
# BIP-352 OpenCL benchmark: a standalone executable built from a single
# benchmark source file.
add_executable(opencl_bip352_benchmark
benchmarks/bench_bip352_opencl.cpp
)
# Link against the OpenCL backend library. The generator expression expands
# to the `fastsecp256k1` target name only when that target exists (empty
# otherwise). NOTE(review): ${FASTSECP256K1_LIB} is set outside this hunk --
# presumably the fallback library path when the target is not available;
# confirm against the surrounding file.
target_link_libraries(opencl_bip352_benchmark PRIVATE
secp256k1_opencl
$<TARGET_NAME_IF_EXISTS:fastsecp256k1>
${FASTSECP256K1_LIB}
)
# Expose the kernel source directory to the benchmark as a string macro.
# This is an absolute path into the source tree, so the built binary expects
# the .cl files to stay at that location.
target_compile_definitions(opencl_bip352_benchmark PRIVATE
SECP256K1_OPENCL_KERNEL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/kernels"
)
# =============================================================================
# Test Executable
# =============================================================================
@ -290,4 +312,3 @@ install(DIRECTORY kernels/
DESTINATION share/secp256k1/opencl
FILES_MATCHING PATTERN "*.cl"
)

View File

@ -0,0 +1,641 @@
#include "secp256k1_opencl.hpp"
#include "secp256k1/batch_add_affine.hpp"
#include "secp256k1/fast.hpp"
#include "secp256k1/glv.hpp"
#include "secp256k1/tagged_hash.hpp"
#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include <algorithm>
#include <array>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <memory>
#include <optional>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
using CpuPoint = secp256k1::fast::Point;
using CpuScalar = secp256k1::fast::Scalar;
using CpuField = secp256k1::fast::FieldElement;
using OclAffine = secp256k1::opencl::AffinePoint;
using OclField = secp256k1::opencl::FieldElement;
using OclScalar = secp256k1::opencl::Scalar;
namespace {
// Default number of tweak points per batch (main() starts batch_n from this
// value and lets --batch override it).
constexpr int BENCH_N = 10000;
// NOTE(review): used outside the visible part of this file -- presumably the
// number of untimed warmup passes; confirm at the use site.
constexpr int BENCH_WARMUP = 3;
// Number of benchmark passes; main() reports it as "passes (median)".
constexpr int BENCH_PASSES = 11;
// RTX 5060 Ti (and most NVIDIA): warp=32, SM occupancy peaks at 128-256 threads.
// Previous defaults (64/32) left SMs underutilized.
constexpr int DEFAULT_LOCAL_SIZE_FUSED = 128;
constexpr int DEFAULT_LOCAL_SIZE_LUT = 128;
// Generator lookup table geometry: LUT_WINDOWS windows of LUT_ENTRIES affine
// points each (the host builder doubles the window base 16 times between
// windows, and 65536 == 2^16).
constexpr std::size_t LUT_WINDOWS = 16;
constexpr std::size_t LUT_ENTRIES = 65536;
// Fixed 32-byte scan key used as benchmark input; it is parsed with
// CpuScalar::from_bytes when the GLV/wNAF scan plan is built.
constexpr uint8_t SCAN_KEY[32] = {
0xc4,0x23,0x9f,0xd6,0xfc,0x3d,0xb6,0xe2,
0x2b,0x8b,0xed,0x6a,0x49,0x21,0x9e,0x4e,
0x30,0xd7,0xd6,0xa3,0xb9,0x82,0x94,0xb1,
0x38,0xaf,0x4a,0xd3,0x00,0xda,0x1a,0x42
};
// Fixed spend public key in 33-byte compressed form (prefix byte followed by
// the 32-byte x coordinate); decoded on the host by point_from_compressed().
constexpr uint8_t SPEND_PUBKEY_COMPRESSED[33] = {
0x02,
0xe2,0xed,0x4b,0x9c,0xe9,0x14,0x5e,0x17,
0x21,0xf1,0x1f,0x99,0x5f,0x72,0x6e,0xf8,
0xcf,0x50,0xfc,0x85,0x92,0x89,0xac,0x94,
0x4b,0x2d,0xaf,0xe5,0x03,0xa3,0xc7,0x4c
};
// Host-side mirror of the BIP352ScanKeyGlv typedef in secp256k1_bip352.cl.
// The struct is uploaded to the device as raw bytes, so the field order and
// sizes below must match the kernel-side definition exactly.
struct BIP352ScanKeyGlv {
    std::int8_t  wnaf1[130]{};  // +0:   wNAF digits for k1 half-scalar
    std::int8_t  wnaf2[130]{};  // +130: wNAF digits for k2 half-scalar
    std::uint8_t k1_neg{0};     // +260: 1 if k1 negative (negate base.y)
    std::uint8_t flip_phi{0};   // +261: 1 if phi table y should be negated
    std::uint8_t pad0{0};       // +262: padding
    std::uint8_t pad1{0};       // +263: padding
};  // Total: 264 bytes

// Every member is a 1-byte type, so a total size of 264 also pins each field
// offset listed above. Fail the build instead of silently uploading a
// mis-sized struct if the definition ever drifts.
static_assert(sizeof(BIP352ScanKeyGlv) == 264,
              "BIP352ScanKeyGlv must stay 264 bytes to match secp256k1_bip352.cl");
// Produce the width-5 wNAF digit string of a scalar, least significant digit
// first, using a fixed 130 iterations (the same iteration count the GPU-side
// scalar_to_wnaf uses, per the original note on this function).
//
//   scalar_bytes: 32-byte big-endian scalar (for GLV halves the upper
//                 128 bits are expected to be zero).
//   wnaf:         receives 130 digits; each is 0 or an odd value in [-15, 15].
static void host_compute_wnaf(const std::uint8_t* scalar_bytes, std::int8_t wnaf[130]) {
    // Repack the big-endian bytes into four 64-bit limbs, limbs[0] being the
    // least significant word.
    std::uint64_t limbs[4] = {0, 0, 0, 0};
    for (int byte = 0; byte < 32; ++byte) {
        const int word = 3 - byte / 8;
        limbs[word] = (limbs[word] << 8) | scalar_bytes[byte];
    }

    for (int pos = 0; pos < 130; ++pos) {
        std::int8_t digit = 0;

        if ((limbs[0] & 1ULL) != 0) {
            int window = static_cast<int>(limbs[0] & 0x1FULL);
            const std::uint64_t before = limbs[0];

            if (window < 16) {
                // Positive digit: remove it from the scalar, borrowing from
                // higher limbs if the low limb wrapped.
                limbs[0] -= static_cast<std::uint64_t>(window);
                if (limbs[0] > before) {
                    for (int j = 1; j < 4; ++j) {
                        if (limbs[j]-- != 0) break;
                    }
                }
            } else {
                // Negative digit: window - 32, so add its magnitude back and
                // ripple the carry upward if the low limb wrapped.
                window -= 32;
                limbs[0] += static_cast<std::uint64_t>(-window);
                if (limbs[0] < before) {
                    for (int j = 1; j < 4; ++j) {
                        if (++limbs[j] != 0) break;
                    }
                }
            }
            digit = static_cast<std::int8_t>(window);
        }
        wnaf[pos] = digit;

        // Shift the 256-bit value right by one bit.
        limbs[0] = (limbs[0] >> 1) | (limbs[1] << 63);
        limbs[1] = (limbs[1] >> 1) | (limbs[2] << 63);
        limbs[2] = (limbs[2] >> 1) | (limbs[3] << 63);
        limbs[3] >>= 1;
    }
}
// SHA-256 round constants.
static const uint32_t host_sha256_k[64] = {
    0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
    0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
    0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
    0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
    0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
    0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
    0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
    0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};

// Rotate a 32-bit word right by b bits. The left-shift amount is masked so
// that b == 0 (or any multiple of 32) is well defined instead of shifting by
// the full word width.
inline uint32_t rotr32(uint32_t a, uint32_t b) {
    return (a >> (b & 31u)) | (a << ((32u - b) & 31u));
}

// One-shot SHA-256 of `len` bytes at `msg`; the 32-byte digest is written to
// `out` in big-endian word order. `msg` may be null when `len` is 0.
void host_sha256(const uint8_t* msg, size_t len, uint8_t out[32]) {
    uint32_t h0=0x6a09e667, h1=0xbb67ae85, h2=0x3c6ef372, h3=0xa54ff53a;
    uint32_t h4=0x510e527f, h5=0x9b05688c, h6=0x1f83d9ab, h7=0x5be0cd19;

    // The length field is 64 bits wide. Holding it in a 64-bit integer keeps
    // the shifts below (up to 56 bits) defined even where size_t is 32 bits.
    const uint64_t bit_len = static_cast<uint64_t>(len) * 8u;

    // Message + 0x80 marker + 8 length bytes, rounded up to whole 64-byte blocks.
    const size_t padded = ((len + 9 + 63) / 64) * 64;
    std::vector<uint8_t> buf(padded, 0);
    if (len != 0) std::memcpy(buf.data(), msg, len);  // never memcpy from a null source
    buf[len] = 0x80;
    for (int i = 7; i >= 0; --i) {
        buf[padded - 1 - i] = static_cast<uint8_t>(bit_len >> (i * 8));
    }

    for (size_t off = 0; off < padded; off += 64) {
        // Message schedule.
        uint32_t w[64];
        for (int i = 0; i < 16; i++) {
            w[i] = (static_cast<uint32_t>(buf[off+i*4]) << 24) |
                   (static_cast<uint32_t>(buf[off+i*4+1]) << 16) |
                   (static_cast<uint32_t>(buf[off+i*4+2]) << 8) |
                   buf[off+i*4+3];
        }
        for (int i = 16; i < 64; i++) {
            uint32_t s0 = rotr32(w[i-15], 7) ^ rotr32(w[i-15], 18) ^ (w[i-15] >> 3);
            uint32_t s1 = rotr32(w[i-2], 17) ^ rotr32(w[i-2], 19) ^ (w[i-2] >> 10);
            w[i] = w[i-16] + s0 + w[i-7] + s1;
        }

        // Compression rounds.
        uint32_t a=h0,b=h1,c=h2,d=h3,e=h4,f=h5,g=h6,hh=h7;
        for (int i = 0; i < 64; i++) {
            uint32_t S1 = rotr32(e,6)^rotr32(e,11)^rotr32(e,25);
            uint32_t ch = (e&f)^(~e&g);
            uint32_t t1 = hh+S1+ch+host_sha256_k[i]+w[i];
            uint32_t S0 = rotr32(a,2)^rotr32(a,13)^rotr32(a,22);
            uint32_t maj = (a&b)^(a&c)^(b&c);
            uint32_t t2 = S0+maj;
            hh=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
        }
        h0+=a; h1+=b; h2+=c; h3+=d; h4+=e; h5+=f; h6+=g; h7+=hh;
    }

    // Serialize the eight state words big-endian.
    auto store = [&](uint32_t v, int i) {
        out[i*4]   = static_cast<uint8_t>(v >> 24);
        out[i*4+1] = static_cast<uint8_t>(v >> 16);
        out[i*4+2] = static_cast<uint8_t>(v >> 8);
        out[i*4+3] = static_cast<uint8_t>(v);
    };
    store(h0,0); store(h1,1); store(h2,2); store(h3,3);
    store(h4,4); store(h5,5); store(h6,6); store(h7,7);
}
// Decode a 33-byte compressed public key (0x02/0x03 prefix + 32-byte x) into
// a CPU point. Returns CpuPoint::infinity() when the prefix is invalid, the x
// bytes are rejected by parse_bytes_strict, or x^3 + 7 has no square root.
CpuPoint point_from_compressed(const uint8_t* pub33) {
    const uint8_t prefix = pub33[0];
    if (prefix != 0x02 && prefix != 0x03) return CpuPoint::infinity();

    CpuField x;
    if (!CpuField::parse_bytes_strict(pub33 + 1, x)) return CpuPoint::infinity();

    // Right-hand side of the curve equation: y^2 = x^3 + 7.
    auto rhs = (x * x) * x + CpuField::from_uint64(7);

    // Square `v` n times in a row.
    auto sqr_n = [](CpuField v, int n) {
        for (int i = 0; i < n; ++i) v = v.square();
        return v;
    };

    // Fixed square-and-multiply chain producing the candidate root. The
    // intermediate names follow the run length of the chain so far.
    CpuField r2   = sqr_n(rhs, 1) * rhs;
    CpuField r3   = sqr_n(r2, 1) * rhs;
    CpuField r6   = sqr_n(r3, 3) * r3;
    CpuField r9   = sqr_n(r6, 3) * r3;
    CpuField r11  = sqr_n(r9, 2) * r2;
    CpuField r22  = sqr_n(r11, 11) * r11;
    CpuField r44  = sqr_n(r22, 22) * r22;
    CpuField r88  = sqr_n(r44, 44) * r44;
    CpuField r176 = sqr_n(r88, 88) * r88;
    CpuField r220 = sqr_n(r176, 44) * r44;
    CpuField r223 = sqr_n(r220, 3) * r3;

    CpuField y = sqr_n(r223, 23) * r22;
    y = sqr_n(y, 6) * r2;
    y = sqr_n(y, 2);

    // Reject x values that are not on the curve.
    if (!(y * y == rhs)) return CpuPoint::infinity();

    // Pick the root whose parity matches the prefix byte.
    auto encoded = y.to_bytes();
    const bool root_is_odd = (encoded[31] & 1) != 0;
    const bool prefix_is_odd = (prefix == 0x03);
    if (root_is_odd != prefix_is_odd) y = CpuField::from_uint64(0) - y;

    return CpuPoint::from_affine(x, y);
}
// Convert 32 big-endian bytes into an OpenCL field element made of four
// 64-bit limbs, with limbs[0] holding the least significant word.
OclField bytes_to_ocl_field(const uint8_t* bytes32) {
    OclField result{};
    // Walk the limbs from most to least significant; the most significant
    // limb comes from the first 8 input bytes.
    for (int word = 3; word >= 0; --word) {
        const uint8_t* chunk = bytes32 + (3 - word) * 8;
        uint64_t acc = 0;
        for (int k = 0; k < 8; ++k) {
            acc = (acc << 8) | chunk[k];
        }
        result.limbs[word] = acc;
    }
    return result;
}
// Convert a CPU point to the OpenCL affine representation by serializing
// its x and y coordinates to bytes and repacking them into limbs.
OclAffine to_ocl_affine(const CpuPoint& p) {
    auto x_bytes = p.x().to_bytes();
    auto y_bytes = p.y().to_bytes();

    OclAffine converted{};
    converted.x = bytes_to_ocl_field(x_bytes.data());
    converted.y = bytes_to_ocl_field(y_bytes.data());
    return converted;
}
// Overload for the compact affine points returned by the CPU precompute
// helpers: serialize each coordinate to bytes and repack it into limbs.
OclAffine to_ocl_affine(const secp256k1::fast::AffinePointCompact& p) {
    auto x_bytes = p.x.to_bytes();
    auto y_bytes = p.y.to_bytes();

    OclAffine converted{};
    converted.x = bytes_to_ocl_field(x_bytes.data());
    converted.y = bytes_to_ocl_field(y_bytes.data());
    return converted;
}
// Read the first 8 bytes at x_bytes as one big-endian 64-bit integer.
uint64_t extract_upper_64(const uint8_t* x_bytes) {
    uint64_t prefix = 0;
    int idx = 0;
    for (int shift = 56; shift >= 0; shift -= 8) {
        prefix |= static_cast<uint64_t>(x_bytes[idx++]) << shift;
    }
    return prefix;
}
// Return the full contents of the file at `path`, read in binary mode.
// Throws std::runtime_error if the file cannot be opened.
std::string read_text(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    if (file.fail()) {
        throw std::runtime_error("failed to open: " + path);
    }
    std::ostringstream contents;
    contents << file.rdbuf();
    return contents.str();
}
// Return everything before the last '/' or '\\' in `path`, or "." when the
// path contains no separator at all.
std::string dirname_of(const std::string& path) {
    const std::string::size_type last_sep = path.find_last_of("/\\");
    if (last_sep == std::string::npos) {
        return ".";
    }
    return path.substr(0, last_sep);
}
// Return `s` with leading spaces and tabs removed. Other whitespace (such as
// '\r' or '\n') is left untouched.
std::string trim_left(std::string s) {
    // Find the first kept character once and erase the prefix in a single
    // step, instead of erasing one character at a time from the front.
    const std::string::size_type first_kept = s.find_first_not_of(" \t");
    if (first_kept == std::string::npos) {
        s.clear();  // empty, or nothing but spaces/tabs
    } else {
        s.erase(0, first_kept);
    }
    return s;
}
// Return the text of the kernel file at `path` with every line of the form
//   #include "relative/file"
// (optionally preceded by spaces/tabs) replaced by the recursively expanded
// contents of that file, resolved relative to the including file's directory.
// All other lines are copied through, each terminated with '\n'.
//
// `include_stack` holds the files currently being expanded. A file that is
// already on the stack expands to an empty string, which breaks include
// cycles. The entry is removed again on normal return, so the same file may
// still be expanded more than once from different places.
//
// Throws std::runtime_error if a file cannot be opened (via read_text) or if
// an #include line has no closing quote. When an exception propagates, the
// entries pushed so far remain in `include_stack`.
std::string expand_kernel_file(const std::string& path, std::set<std::string>& include_stack) {
    // insert() reports whether `path` was already on the stack.
    if (!include_stack.insert(path).second) return {};

    std::istringstream in(read_text(path));
    std::ostringstream out;
    const std::string dir = dirname_of(path);

    std::string line;
    while (std::getline(in, line)) {
        const std::string trimmed = trim_left(line);
        if (trimmed.rfind("#include \"", 0) != 0) {
            out << line << '\n';
            continue;
        }

        const auto start = trimmed.find('"') + 1;
        const auto end = trimmed.find('"', start);
        if (end == std::string::npos) {
            // Without this check the rest of the line would silently be
            // treated as the file name.
            throw std::runtime_error("unterminated #include in " + path + ": " + line);
        }
        const std::string child = dir + "/" + trimmed.substr(start, end - start);
        out << expand_kernel_file(child, include_stack);
    }

    include_stack.erase(path);
    return out.str();
}
// Load secp256k1_bip352.cl from the compile-time kernel directory and return
// its source with all quoted #include lines expanded inline.
std::string load_bip352_kernel_source() {
    const std::string entry_file =
        std::string(SECP256K1_OPENCL_KERNEL_DIR) + "/secp256k1_bip352.cl";
    std::set<std::string> active_includes;
    return expand_kernel_file(entry_file, active_includes);
}
std::vector<OclAffine> build_generator_lut_host() {
std::vector<OclAffine> lut(LUT_WINDOWS * LUT_ENTRIES);
CpuPoint base = CpuPoint::generator();
for (std::size_t win = 0; win < LUT_WINDOWS; ++win) {
std::cout << " Building LUT window " << win + 1 << "/" << LUT_WINDOWS << "...\n";
auto base_x = base.x();
auto base_y = base.y();
auto table = (win == 0)
? secp256k1::fast::precompute_g_multiples(LUT_ENTRIES - 1)
: secp256k1::fast::precompute_point_multiples(base_x, base_y, LUT_ENTRIES - 1);
lut[win * LUT_ENTRIES] = OclAffine{};
for (std::size_t i = 0; i < table.size(); ++i) {
lut[win * LUT_ENTRIES + i + 1] = to_ocl_affine(table[i]);
}
for (int i = 0; i < 16; ++i) base.dbl_inplace();
}
return lut;
}
// Precompute the device-side scan-key plan: GLV-decompose SCAN_KEY into two
// half-scalars, record the sign flags, and expand both halves into wNAF
// digit strings.
BIP352ScanKeyGlv build_scan_glv_plan() {
    auto split = secp256k1::fast::glv_decompose(CpuScalar::from_bytes(SCAN_KEY));

    BIP352ScanKeyGlv plan{};

    // Sign flags: flip_phi is set when the two halves have opposite signs.
    plan.k1_neg = split.k1_neg ? 1 : 0;
    plan.flip_phi = (split.k1_neg != split.k2_neg) ? 1 : 0;

    // wNAF digits for each half-scalar.
    auto k1_bytes = split.k1.to_bytes();
    host_compute_wnaf(k1_bytes.data(), plan.wnaf1);
    auto k2_bytes = split.k2.to_bytes();
    host_compute_wnaf(k2_bytes.data(), plan.wnaf2);

    return plan;
}
// Robust central value of a set of timing samples.
//
// Sorts the samples, drops everything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
// and returns the middle element of what is left (the upper middle for an
// even count). With fewer than 4 samples no filtering is done; an empty
// input yields 0.0.
double median_iqr(std::vector<double> samples) {
    if (samples.empty()) {
        return 0.0;
    }
    std::sort(samples.begin(), samples.end());

    const int total = static_cast<int>(samples.size());
    if (total < 4) {
        return samples[total / 2];
    }

    // Quartiles by index, then the outlier fences.
    const double lower_quartile = samples[total / 4];
    const double upper_quartile = samples[(3 * total) / 4];
    const double spread = upper_quartile - lower_quartile;
    const double fence_lo = lower_quartile - 1.5 * spread;
    const double fence_hi = upper_quartile + 1.5 * spread;

    std::vector<double> kept;
    kept.reserve(samples.size());
    for (std::size_t i = 0; i < samples.size(); ++i) {
        const double value = samples[i];
        const bool inside = (value >= fence_lo) && (value <= fence_hi);
        if (inside) {
            kept.push_back(value);
        }
    }

    // Fall back to the unfiltered samples if nothing survived the fences.
    if (kept.empty()) {
        kept = std::move(samples);
    }
    return kept[kept.size() / 2];
}
// Throw std::runtime_error naming the failed operation and the numeric
// OpenCL error code unless `err` is CL_SUCCESS.
void check_cl(cl_int err, const char* what) {
    if (err == CL_SUCCESS) {
        return;
    }
    std::string message(what);
    message += " failed with OpenCL error ";
    message += std::to_string(err);
    throw std::runtime_error(message);
}
// Autotune OpenCL local_size by running a few passes at candidate sizes.
// Mirrors CUDA's autotune_gpu_tpb. Returns best local size found.
static int autotune_local_size(
const char* label,
cl_command_queue cl_q,
cl_kernel kernel,
size_t count,
size_t max_wg_size,
std::initializer_list<int> candidates)
{
std::printf("Autotuning %s local size...\n", label);
int best = 0;
double best_ns = 0.0;
for (int ls : candidates) {
if (ls <= 0 || static_cast<size_t>(ls) > max_wg_size) continue;
size_t local = static_cast<size_t>(ls);
size_t global = ((count + local - 1) / local) * local;
// warmup
for (int w = 0; w < 2; ++w) {
cl_int err2 = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr);
if (err2 != CL_SUCCESS) goto next;
}
clFinish(cl_q);
{
constexpr int SAMPLE_PASSES = 5;
constexpr int SAMPLE_REPS = 10;
std::vector<double> samples;
samples.reserve(SAMPLE_PASSES);
for (int p = 0; p < SAMPLE_PASSES; ++p) {
auto t0 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < SAMPLE_REPS; ++r) {
cl_int err2 = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr);
if (err2 != CL_SUCCESS) goto next;
}
clFinish(cl_q);
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
samples.push_back((ms * 1e6) / (static_cast<double>(count) * SAMPLE_REPS));
}
double ns = median_iqr(samples);
std::printf(" local=%3d -> %8.1f ns/op\n", ls, ns);
if (best == 0 || ns < best_ns) { best = ls; best_ns = ns; }
}
next:;
}
if (best == 0) best = DEFAULT_LOCAL_SIZE_FUSED;
std::printf(" selected local=%d for %s\n\n", best, label);
return best;
}
} // namespace
int main(int argc, char** argv) {
bool prefer_intel = false;
bool use_lut = false;
int platform_id = -1;
int device_id = 0;
int batch_n = BENCH_N;
int local_size = 0;
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];
if (arg == "--intel") prefer_intel = true;
else if (arg == "--nvidia") prefer_intel = false;
else if (arg == "--lut") use_lut = true;
else if (arg == "--platform" && i + 1 < argc) platform_id = std::atoi(argv[++i]);
else if (arg == "--device" && i + 1 < argc) device_id = std::atoi(argv[++i]);
else if (arg == "--batch" && i + 1 < argc) batch_n = std::atoi(argv[++i]);
else if (arg == "--local" && i + 1 < argc) local_size = std::atoi(argv[++i]);
}
if (local_size == 0) {
local_size = use_lut ? DEFAULT_LOCAL_SIZE_LUT : DEFAULT_LOCAL_SIZE_FUSED;
}
secp256k1::opencl::DeviceConfig cfg;
cfg.prefer_intel = prefer_intel;
cfg.verbose = true;
cfg.platform_id = platform_id;
cfg.device_id = device_id;
auto ctx = secp256k1::opencl::Context::create(cfg);
if (!ctx || !ctx->is_valid()) {
std::cerr << "Failed to create OpenCL context\n";
return 1;
}
cl_context cl_ctx = static_cast<cl_context>(ctx->native_context());
cl_command_queue cl_q = static_cast<cl_command_queue>(ctx->native_queue());
cl_device_id cl_dev = nullptr;
check_cl(clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr),
"clGetCommandQueueInfo(CL_QUEUE_DEVICE)");
std::cout << "============================================================\n";
std::cout << " BIP-352 Silent Payments Pipeline: CPU vs OpenCL\n";
std::cout << "============================================================\n";
std::cout << " Device: " << ctx->device_info().name << " (" << ctx->device_info().vendor << ")\n";
std::cout << " N = " << batch_n << " tweak points, " << BENCH_PASSES << " passes (median)\n\n";
std::cout << " Local size = " << local_size << "\n\n";
std::cout << "Generating " << batch_n << " deterministic tweak points...\n";
std::vector<OclAffine> tweaks(static_cast<size_t>(batch_n));
CpuPoint last_tweak = CpuPoint::infinity();
uint8_t seed[32];
const char* tag = "bench_bip352_seed";
host_sha256(reinterpret_cast<const uint8_t*>(tag), std::strlen(tag), seed);
for (int i = 0; i < batch_n; ++i) {
uint8_t buf[36];
std::memcpy(buf, seed, 32);
buf[32] = static_cast<uint8_t>((i >> 24) & 0xff);
buf[33] = static_cast<uint8_t>((i >> 16) & 0xff);
buf[34] = static_cast<uint8_t>((i >> 8) & 0xff);
buf[35] = static_cast<uint8_t>(i & 0xff);
uint8_t scalar_bytes[32];
host_sha256(buf, 36, scalar_bytes);
CpuScalar s = CpuScalar::from_bytes(scalar_bytes);
CpuPoint p = CpuPoint::generator().scalar_mul(s);
if (i == batch_n - 1) last_tweak = p;
tweaks[static_cast<size_t>(i)] = to_ocl_affine(p);
}
std::cout << "Done.\n";
CpuPoint spend_cpu = point_from_compressed(SPEND_PUBKEY_COMPRESSED);
if (spend_cpu.is_infinity()) {
std::cerr << "Failed to decode spend pubkey\n";
return 1;
}
OclAffine spend = to_ocl_affine(spend_cpu);
std::cout << "Building OpenCL BIP352 pipeline kernel...\n";
std::string source = load_bip352_kernel_source();
const char* src_ptr = source.c_str();
size_t src_len = source.size();
cl_int err = CL_SUCCESS;
cl_program program = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
check_cl(err, "clCreateProgramWithSource");
std::string build_options = "-cl-std=CL1.2 -cl-fast-relaxed-math -cl-mad-enable"
" -cl-nv-opt-level=3";
err = clBuildProgram(program, 1, &cl_dev, build_options.c_str(), nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t log_size = 0;
clGetProgramBuildInfo(program, cl_dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_size);
std::string log(log_size, '\0');
clGetProgramBuildInfo(program, cl_dev, CL_PROGRAM_BUILD_LOG, log_size, log.data(), nullptr);
std::cerr << "Build failed:\n" << log << "\n";
return 1;
}
const char* kernel_name = use_lut ? "bip352_pipeline_kernel_lut" : "bip352_pipeline_kernel";
cl_kernel kernel = clCreateKernel(program, kernel_name, &err);
check_cl(err, kernel_name);
std::cout << "Done.\n";
size_t count = static_cast<size_t>(batch_n);
size_t tweak_bytes = count * sizeof(OclAffine);
std::vector<uint64_t> prefixes(count);
std::vector<OclAffine> gen_lut;
BIP352ScanKeyGlv scan_plan{};
// Both paths now use BIP352ScanKeyGlv with precomputed wNAF digits.
scan_plan = build_scan_glv_plan();
cl_mem d_tweaks = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, tweak_bytes, tweaks.data(), &err);
check_cl(err, "clCreateBuffer(d_tweaks)");
cl_mem d_scan = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(BIP352ScanKeyGlv), &scan_plan, &err);
check_cl(err, "clCreateBuffer(d_scan)");
cl_mem d_spend = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(OclAffine), &spend, &err);
check_cl(err, "clCreateBuffer(d_spend)");
cl_mem d_prefixes = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY, count * sizeof(uint64_t), nullptr, &err);
check_cl(err, "clCreateBuffer(d_prefixes)");
cl_mem d_gen_lut = nullptr;
if (use_lut) {
std::cout << "Building CPU generator LUT (" << (LUT_WINDOWS * LUT_ENTRIES) << " affine points)...\n";
gen_lut = build_generator_lut_host();
std::cout << "Uploading generator LUT to OpenCL...\n";
d_gen_lut = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
gen_lut.size() * sizeof(OclAffine), gen_lut.data(), &err);
check_cl(err, "clCreateBuffer(d_gen_lut)");
}
cl_uint cl_count = static_cast<cl_uint>(count);
check_cl(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_tweaks), "clSetKernelArg(0)");
check_cl(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_scan), "clSetKernelArg(1)");
check_cl(clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_spend), "clSetKernelArg(2)");
if (use_lut) {
check_cl(clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_gen_lut), "clSetKernelArg(3)");
check_cl(clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_prefixes), "clSetKernelArg(4)");
check_cl(clSetKernelArg(kernel, 5, sizeof(cl_uint), &cl_count), "clSetKernelArg(5)");
} else {
check_cl(clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_prefixes), "clSetKernelArg(3)");
check_cl(clSetKernelArg(kernel, 4, sizeof(cl_uint), &cl_count), "clSetKernelArg(4)");
}
if (local_size <= 0) {
throw std::runtime_error("local size must be positive");
}
if (static_cast<std::size_t>(local_size) > ctx->device_info().max_work_group_size) {
throw std::runtime_error("local size exceeds device max work group size");
}
// Autotune: find optimal local size among candidates (mirrors CUDA autotune_gpu_tpb).
// Only autotune when no explicit --local was given (i.e., we're still at the default).
{
int default_ls = use_lut ? DEFAULT_LOCAL_SIZE_LUT : DEFAULT_LOCAL_SIZE_FUSED;
if (local_size == default_ls) {
int tuned = autotune_local_size(
use_lut ? "LUT kernel" : "fused kernel",
cl_q, kernel, count,
ctx->device_info().max_work_group_size,
{64, 128, 256, 384});
local_size = tuned;
}
}
size_t global = ((count + static_cast<size_t>(local_size) - 1) / static_cast<size_t>(local_size)) * static_cast<size_t>(local_size);
size_t local = static_cast<size_t>(local_size);
std::cout << " Running with local_size=" << local_size << "\n";
for (int i = 0; i < BENCH_WARMUP; ++i) {
check_cl(clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr),
"clEnqueueNDRangeKernel(warmup)");
}
check_cl(clFinish(cl_q), "clFinish(warmup)");
std::vector<double> samples;
samples.reserve(BENCH_PASSES);
std::cout << "\n--- OpenCL (" << (use_lut ? "fused pipeline + LUT" : "fused pipeline") << ") ---\n";
for (int pass = 0; pass < BENCH_PASSES; ++pass) {
auto t0 = std::chrono::high_resolution_clock::now();
check_cl(clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr),
"clEnqueueNDRangeKernel");
check_cl(clFinish(cl_q), "clFinish");
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
samples.push_back((ms * 1e6) / static_cast<double>(count));
std::printf(" pass %2d: %8.3f ms\n", pass + 1, ms);
}
double ns_per_op = median_iqr(samples);
double ops_per_sec = 1e9 / ns_per_op;
check_cl(clEnqueueReadBuffer(cl_q, d_prefixes, CL_TRUE, 0, count * sizeof(uint64_t), prefixes.data(), 0, nullptr, nullptr),
"clEnqueueReadBuffer");
CpuScalar scan_scalar = CpuScalar::from_bytes(SCAN_KEY);
CpuPoint shared = last_tweak.scalar_mul(scan_scalar);
auto shared_comp = shared.to_compressed();
uint8_t shared_ser[37];
std::memcpy(shared_ser, shared_comp.data(), 33);
shared_ser[33] = shared_ser[34] = shared_ser[35] = shared_ser[36] = 0;
auto tagged = secp256k1::detail::cached_tagged_hash(
secp256k1::detail::make_tag_midstate("BIP0352/SharedSecret"),
shared_ser,
sizeof(shared_ser));
CpuScalar hs = CpuScalar::from_bytes(tagged.data());
CpuPoint out = CpuPoint::generator().scalar_mul(hs);
CpuPoint cand = spend_cpu;
cand.add_inplace(out);
uint64_t cpu_validation = extract_upper_64(cand.x_only_bytes().data());
uint64_t ocl_validation = prefixes.back();
std::printf("\n OpenCL%s: %.1f ns/op (%.2f M/s)\n", use_lut ? " LUT" : "", ns_per_op, ops_per_sec / 1e6);
std::printf(" validation prefix: 0x%016llx\n", static_cast<unsigned long long>(ocl_validation));
// CUDA reference: bench_bip352 on RTX 5060 Ti (SM 12.0, 36 SMs, 384 tpb).
// GLV (no LUT): 260.4 ns/op (3.84 M/s). LUT: 127.2 ns/op (7.86 M/s).
constexpr double CUDA_GLV_NS = 260.4;
constexpr double CUDA_LUT_NS = 127.2;
double cuda_ref = use_lut ? CUDA_LUT_NS : CUDA_GLV_NS;
std::printf(" CUDA reference: %.1f ns/op (%.2f M/s) [%s]\n",
cuda_ref, 1e9 / cuda_ref / 1e6, use_lut ? "LUT" : "GLV");
std::printf(" gap vs CUDA: %.2fx\n", ns_per_op / cuda_ref);
std::printf(" Validation: %s\n", cpu_validation == ocl_validation ? "[OK] MATCH" : "[FAIL] MISMATCH");
clReleaseMemObject(d_tweaks);
clReleaseMemObject(d_scan);
clReleaseMemObject(d_spend);
clReleaseMemObject(d_prefixes);
if (d_gen_lut) clReleaseMemObject(d_gen_lut);
clReleaseKernel(kernel);
clReleaseProgram(program);
return cpu_validation == ocl_validation ? 0 : 2;
}

View File

@ -195,6 +195,7 @@ int main(int argc, char* argv[]) {
std::vector<Scalar> point_scalars(point_batch);
std::vector<JacobianPoint> pd_in(point_batch), pd_out(point_batch);
std::vector<JacobianPoint> pa_in1(point_batch), pa_in2(point_batch), pa_out(point_batch);
std::vector<AffinePoint> sm_points(point_batch);
for (std::size_t i = 0; i < point_batch; ++i) {
point_scalars[i] = {{rng(), rng(), rng(), rng()}};
@ -206,6 +207,7 @@ int main(int argc, char* argv[]) {
}
ctx->batch_scalar_mul_generator(point_scalars.data(), pa_in2.data(), point_batch);
pa_in1 = pd_in;
ctx->batch_jacobian_to_affine(pd_in.data(), sm_points.data(), point_batch);
{
auto r = bench_batch("Point Double", [&]() {
@ -244,6 +246,13 @@ int main(int argc, char* argv[]) {
}, bs, 1, 3);
print_result(r);
results.push_back(r);
std::string kp_name = "kP (batch=" + std::to_string(bs) + ")";
auto kp = bench_batch(kp_name, [&]() {
ctx->batch_scalar_mul(sm_scalars.data(), sm_points.data(), sm_results.data(), bs);
}, bs, 1, 3);
print_result(kp);
results.push_back(kp);
}
// ==========================================================================
@ -598,7 +607,8 @@ int main(int argc, char* argv[]) {
{
std::size_t smk_batch = std::min(point_batch, static_cast<std::size_t>(65536));
cl_uint smk_cnt = static_cast<cl_uint>(smk_batch);
std::size_t smk_global = ((smk_batch + p_local_sz - 1) / p_local_sz) * p_local_sz;
std::size_t smk_local_sz = std::min<std::size_t>(128, p_local_sz);
std::size_t smk_global = ((smk_batch + smk_local_sz - 1) / smk_local_sz) * smk_local_sz;
// Use existing point_scalars for scalar data
cl_mem buf_sc = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
@ -616,12 +626,12 @@ int main(int argc, char* argv[]) {
int smk_iters = 5;
for (int i = 0; i < smk_warmup; ++i)
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &p_local_sz, 0, nullptr, nullptr);
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
clFinish(cl_q);
auto t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < smk_iters; ++i)
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &p_local_sz, 0, nullptr, nullptr);
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
clFinish(cl_q);
auto t1 = std::chrono::high_resolution_clock::now();
@ -634,6 +644,95 @@ int main(int argc, char* argv[]) {
clReleaseMemObject(buf_smr);
}
// Scalar Mul Arbitrary Point (kernel-only) -- same batch cap as kG
{
std::size_t smk_batch = std::min(point_batch, static_cast<std::size_t>(65536));
cl_uint smk_cnt = static_cast<cl_uint>(smk_batch);
std::size_t smk_local_sz = std::min<std::size_t>(128, p_local_sz);
std::size_t smk_global = ((smk_batch + smk_local_sz - 1) / smk_local_sz) * smk_local_sz;
std::vector<AffinePoint> sm_points(smk_batch);
ctx->batch_jacobian_to_affine(pd_in.data(), sm_points.data(), smk_batch);
cl_mem buf_sc = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
smk_batch * sizeof(Scalar), (void*)point_scalars.data(), &err);
cl_mem buf_pts = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
smk_batch * sizeof(AffinePoint), (void*)sm_points.data(), &err);
cl_mem buf_smr = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY,
smk_batch * sizeof(JacobianPoint), nullptr, &err);
clFinish(cl_q);
cl_kernel kern = (cl_kernel)ctx->native_kernel("scalar_mul");
clSetKernelArg(kern, 0, sizeof(cl_mem), &buf_sc);
clSetKernelArg(kern, 1, sizeof(cl_mem), &buf_pts);
clSetKernelArg(kern, 2, sizeof(cl_mem), &buf_smr);
clSetKernelArg(kern, 3, sizeof(cl_uint), &smk_cnt);
int smk_warmup = 2;
int smk_iters = 5;
for (int i = 0; i < smk_warmup; ++i)
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
clFinish(cl_q);
auto t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < smk_iters; ++i)
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
clFinish(cl_q);
auto t1 = std::chrono::high_resolution_clock::now();
double ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
double total_ops = static_cast<double>(smk_batch) * smk_iters;
BenchResult r = {"kP (kernel)", ns / total_ops, total_ops / (ns * 1e-9)};
print_result(r); results.push_back(r);
for (int i = 0; i < smk_warmup; ++i) {
clEnqueueWriteBuffer(cl_q, buf_pts, CL_FALSE, 0,
smk_batch * sizeof(AffinePoint), (void*)sm_points.data(), 0, nullptr, nullptr);
clEnqueueWriteBuffer(cl_q, buf_sc, CL_FALSE, 0,
smk_batch * sizeof(Scalar), (void*)point_scalars.data(), 0, nullptr, nullptr);
}
clFinish(cl_q);
t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < smk_iters; ++i) {
clEnqueueWriteBuffer(cl_q, buf_pts, CL_FALSE, 0,
smk_batch * sizeof(AffinePoint), (void*)sm_points.data(), 0, nullptr, nullptr);
clEnqueueWriteBuffer(cl_q, buf_sc, CL_FALSE, 0,
smk_batch * sizeof(Scalar), (void*)point_scalars.data(), 0, nullptr, nullptr);
}
clFinish(cl_q);
t1 = std::chrono::high_resolution_clock::now();
ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
BenchResult upload = {"kP (upload)", ns / total_ops, total_ops / (ns * 1e-9)};
print_result(upload); results.push_back(upload);
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
clFinish(cl_q);
std::vector<JacobianPoint> sm_readback(smk_batch);
for (int i = 0; i < smk_warmup; ++i)
clEnqueueReadBuffer(cl_q, buf_smr, CL_FALSE, 0,
smk_batch * sizeof(JacobianPoint), sm_readback.data(), 0, nullptr, nullptr);
clFinish(cl_q);
t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < smk_iters; ++i)
clEnqueueReadBuffer(cl_q, buf_smr, CL_FALSE, 0,
smk_batch * sizeof(JacobianPoint), sm_readback.data(), 0, nullptr, nullptr);
clFinish(cl_q);
t1 = std::chrono::high_resolution_clock::now();
ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
BenchResult readback = {"kP (readback)", ns / total_ops, total_ops / (ns * 1e-9)};
print_result(readback); results.push_back(readback);
clReleaseMemObject(buf_sc);
clReleaseMemObject(buf_pts);
clReleaseMemObject(buf_smr);
}
clReleaseMemObject(buf_jp1);
clReleaseMemObject(buf_jp2);
clReleaseMemObject(buf_jpr);

View File

@ -37,7 +37,7 @@ struct DeviceConfig {
int device_id = 0; // GPU device index
int platform_id = 0; // Platform index (e.g., Intel, AMD)
std::size_t max_batch_size = 65536; // Max points per batch
std::size_t local_work_size = 256; // Work group size (auto if 0)
std::size_t local_work_size = 0; // Work group size (auto if 0)
bool prefer_intel = true; // Prefer Intel GPU if available
bool verbose = false; // Print device info on init
};

View File

@ -0,0 +1,203 @@
#ifndef SECP256K1_BIP352_CL
#define SECP256K1_BIP352_CL
#include "secp256k1_extended.cl"
// BIP352ScanKeyGlv: precomputed GLV scan-key plan uploaded to __constant memory.
// wNAF digits are computed on the CPU host and read directly in the kernel,
// avoiding the GPU call to scalar_to_wnaf and eliminating 1040 bytes of
// private-stack pressure (int wnaf1[130] + int wnaf2[130]).
// Layout must match the host-side BIP352ScanKeyGlv struct exactly.
// Every member is one byte wide, so the struct occupies 130 + 130 + 4 = 264 bytes.
// The consumer (scalar_mul_glv_predecomp_impl) walks both digit arrays from
// index 129 down to index 0, doubling its accumulator before each position,
// so index 0 holds the least significant digit.
typedef struct {
char wnaf1[130]; // wNAF digits for k1 half-scalar (precomputed, range [-15..15])
char wnaf2[130]; // wNAF digits for k2 half-scalar (precomputed, range [-15..15])
uchar k1_neg; // 1 if k1 was negative: negate base.y before table build
uchar flip_phi; // 1 if phi table y-coordinate should be negated
uchar pad0; // not read anywhere in this file
uchar pad1; // not read anywhere in this file
} BIP352ScanKeyGlv;
// SHA-256 chaining state used as the starting point for "BIP0352/SharedSecret"
// tagged hashes: the state after absorbing the single 64-byte block
// SHA256("BIP0352/SharedSecret") || SHA256("BIP0352/SharedSecret").
// bip352_tagged_sha256_impl copies these eight words into the context and sets
// total_len = 64 so that the tag prefix never has to be hashed on the device.
__constant uint BIP352_SHAREDSECRET_MIDSTATE[8] = {
0x88831537U, 0x5127079bU, 0x69c2137bU, 0xab0303e6U,
0x98fa21faU, 0x4a888523U, 0xbd99daabU, 0xf25e5e0aU
};
// Tagged SHA-256 specialised for the "BIP0352/SharedSecret" tag.
// Resumes from BIP352_SHAREDSECRET_MIDSTATE instead of hashing the tag prefix,
// absorbs `data_len` bytes from `data`, and writes the digest to `out`.
inline void bip352_tagged_sha256_impl(const uchar* data, uint data_len, uchar out[32]) {
    SHA256Ctx hasher;
    // One 64-byte block counts as already processed; nothing is buffered yet.
    hasher.total_len = 64;
    hasher.buf_len = 0;
    for (int word = 0; word != 8; ++word) {
        hasher.h[word] = BIP352_SHAREDSECRET_MIDSTATE[word];
    }
    sha256_update(&hasher, data, data_len);
    sha256_final(&hasher, out);
}
// Serialises Jacobian point `p` into the 37-byte shared-secret hash input:
//   ser[0]      0x02 or 0x03, chosen from bit 0 of the last serialised y byte
//   ser[1..32]  the 32 serialised bytes of affine x
//   ser[33..36] four zero bytes (presumably the 32-bit output index k = 0 -- confirm)
// z is inverted without an infinity check; both kernels in this file test
// point_is_infinity before calling.
inline void bip352_shared_secret_input_impl(const JacobianPoint* p, uchar ser[37]) {
    // Jacobian -> affine: x = X * z^-2, y = Y * z^-3.
    FieldElement zi, zi_sq, zi_cu;
    field_inv_impl(&zi, &p->z);
    field_sqr_impl(&zi_sq, &zi);
    field_mul_impl(&zi_cu, &zi_sq, &zi);

    FieldElement ax, ay;
    field_mul_impl(&ax, &p->x, &zi_sq);
    field_mul_impl(&ay, &p->y, &zi_cu);

    uchar xb[32], yb[32];
    field_to_bytes_impl(&ax, xb);
    field_to_bytes_impl(&ay, yb);

    if (yb[31] & 1) {
        ser[0] = 0x03;
    } else {
        ser[0] = 0x02;
    }
    for (int k = 0; k < 32; k++) {
        ser[k + 1] = xb[k];
    }
    for (int k = 33; k < 37; k++) {
        ser[k] = 0;
    }
}
// Returns the first eight serialised bytes of the affine x-coordinate of `p`,
// packed into a ulong with byte 0 in the most significant position.
// z is inverted without an infinity check.
inline ulong point_prefix64_impl(const JacobianPoint* p) {
    // Only x is needed, so compute just z^-2 and skip z^-3.
    FieldElement zi, zi_sq, ax;
    field_inv_impl(&zi, &p->z);
    field_sqr_impl(&zi_sq, &zi);
    field_mul_impl(&ax, &p->x, &zi_sq);

    uchar xb[32];
    field_to_bytes_impl(&ax, xb);

    ulong packed = 0;
    int pos = 0;
    while (pos < 8) {
        packed <<= 8;
        packed |= (ulong)xb[pos];
        ++pos;
    }
    return packed;
}
// Folds one signed, odd wNAF digit into the accumulator.
// A zero digit is a no-op. Otherwise entry (|digit| - 1) >> 1 of
// `odd_multiples` is copied, its y is negated for a negative digit, and the
// entry either seeds an accumulator that is still at infinity or is combined
// with it through a mixed (Jacobian + affine) addition.
inline void bip352_apply_wnaf_digit_impl(
    JacobianPoint* acc,
    const AffinePoint* odd_multiples,
    int digit
) {
    if (digit == 0) return;
    const int magnitude = (digit < 0) ? -digit : digit;
    AffinePoint term = odd_multiples[(magnitude - 1) >> 1];
    if (digit < 0) field_negate_impl(&term.y, &term.y);
    if (point_is_infinity(acc)) {
        point_from_affine(acc, &term);
    } else {
        point_add_mixed_impl(acc, acc, &term);
    }
}

// GLV scalar multiplication r = k * p where k is described entirely by the
// precomputed plan in `scan` (two 130-digit wNAF arrays plus two sign flags).
// The odd-multiples table comes from build_wnaf_table_zr_impl, which hands back
// a shared Z factor instead of normalising each entry; that factor is folded
// into r->z once, after the main loop. The endomorphism table is derived from
// the first table by derive_endo_table_impl rather than built independently.
// Digits are read straight from __constant memory, so no scalar_to_wnaf call
// and no private digit arrays are needed here.
inline void scalar_mul_glv_predecomp_impl(
    JacobianPoint* r,
    const AffinePoint* p,
    __constant const BIP352ScanKeyGlv* scan
) {
    // Apply the sign of the k1 half-scalar to the base point up front.
    AffinePoint start = *p;
    if (scan->k1_neg) field_negate_impl(&start.y, &start.y);

    // odd_tbl[i] stands for (2i + 1) * start, relative to shared_z.
    AffinePoint odd_tbl[8];
    FieldElement shared_z;
    build_wnaf_table_zr_impl(&start, odd_tbl, &shared_z);

    // phi_tbl[i] is the endomorphism image of odd_tbl[i], y negated if flip_phi.
    AffinePoint phi_tbl[8];
    derive_endo_table_impl(odd_tbl, phi_tbl, scan->flip_phi);

    // Interleaved double-and-add over both digit streams, most significant first.
    point_set_infinity(r);
    int pos = 130;
    while (pos-- > 0) {
        if (!point_is_infinity(r)) point_double_impl(r, r);
        bip352_apply_wnaf_digit_impl(r, odd_tbl, (int)scan->wnaf1[pos]);
        bip352_apply_wnaf_digit_impl(r, phi_tbl, (int)scan->wnaf2[pos]);
    }

    // Undo the table's shared Z scaling on the final result.
    if (point_is_infinity(r)) return;
    FieldElement fixed_z;
    field_mul_impl(&fixed_z, &r->z, &shared_z);
    r->z = fixed_z;
}
// Fused BIP352 scan pipeline, one tweak point per work-item.
// For work-item g < count:
//   1. shared = scalar_mul_glv_predecomp_impl(tweak_points[g], scan_key)
//   2. hs     = tagged hash of the 37-byte serialisation of shared
//   3. cand   = scalar_mul_generator_windowed_impl(hs) + spend_point[0]
//   4. prefixes[g] = 64-bit prefix of cand's affine x
// If step 1 yields the point at infinity, prefixes[g] is set to 0.
// Work-items with g >= count (grid padding) do nothing.
__kernel void bip352_pipeline_kernel(
    __global const AffinePoint* tweak_points,
    __constant const BIP352ScanKeyGlv* scan_key,
    __global const AffinePoint* spend_point,
    __global ulong* prefixes,
    const uint count
) {
    const uint item = get_global_id(0);
    if (item >= count) return;

    // Step 1: shared point from the precomputed scan-key plan.
    AffinePoint tweak_pt = tweak_points[item];
    JacobianPoint shared_pt;
    scalar_mul_glv_predecomp_impl(&shared_pt, &tweak_pt, scan_key);
    if (point_is_infinity(&shared_pt)) {
        prefixes[item] = 0;
        return;
    }

    // Step 2: serialise and hash down to a scalar.
    uchar secret_input[37];
    bip352_shared_secret_input_impl(&shared_pt, secret_input);
    uchar digest[32];
    bip352_tagged_sha256_impl(secret_input, 37, digest);
    Scalar tweak_scalar;
    scalar_from_bytes_impl(digest, &tweak_scalar);

    // Step 3: generator multiple plus the spend key.
    JacobianPoint gen_mul;
    scalar_mul_generator_windowed_impl(&gen_mul, &tweak_scalar);
    AffinePoint spend_pt = spend_point[0];
    JacobianPoint candidate;
    point_add_mixed_impl(&candidate, &gen_mul, &spend_pt);

    // Step 4: emit the x prefix.
    prefixes[item] = point_prefix64_impl(&candidate);
}
// LUT variant of the fused BIP352 scan pipeline, one tweak point per work-item.
// Same steps as bip352_pipeline_kernel, except that the generator multiple is
// computed by scalar_mul_generator_lut_impl using the `gen_lut` table supplied
// by the host:
//   1. shared = scalar_mul_glv_predecomp_impl(tweak_points[g], scan_key)
//   2. hs     = tagged hash of the 37-byte serialisation of shared
//   3. cand   = scalar_mul_generator_lut_impl(hs, gen_lut) + spend_point[0]
//   4. prefixes[g] = 64-bit prefix of cand's affine x
// If step 1 yields the point at infinity, prefixes[g] is set to 0.
// Work-items with g >= count (grid padding) do nothing.
__kernel void bip352_pipeline_kernel_lut(
    __global const AffinePoint* tweak_points,
    __constant const BIP352ScanKeyGlv* scan_key,
    __global const AffinePoint* spend_point,
    __global const AffinePoint* gen_lut,
    __global ulong* prefixes,
    const uint count
) {
    const uint item = get_global_id(0);
    if (item >= count) return;

    // Step 1: shared point from the precomputed scan-key plan.
    AffinePoint tweak_pt = tweak_points[item];
    JacobianPoint shared_pt;
    scalar_mul_glv_predecomp_impl(&shared_pt, &tweak_pt, scan_key);
    if (point_is_infinity(&shared_pt)) {
        prefixes[item] = 0;
        return;
    }

    // Step 2: serialise and hash down to a scalar.
    uchar secret_input[37];
    bip352_shared_secret_input_impl(&shared_pt, secret_input);
    uchar digest[32];
    bip352_tagged_sha256_impl(secret_input, 37, digest);
    Scalar tweak_scalar;
    scalar_from_bytes_impl(digest, &tweak_scalar);

    // Step 3: table-driven generator multiple plus the spend key.
    JacobianPoint gen_mul;
    scalar_mul_generator_lut_impl(&gen_mul, &tweak_scalar, gen_lut);
    AffinePoint spend_pt = spend_point[0];
    JacobianPoint candidate;
    point_add_mixed_impl(&candidate, &gen_mul, &spend_pt);

    // Step 4: emit the x prefix.
    prefixes[item] = point_prefix64_impl(&candidate);
}
#endif

View File

@ -564,6 +564,84 @@ inline void glv_decompose_impl(const Scalar* k, Scalar* k1, Scalar* k2,
// GLV-accelerated scalar multiplication: k*P using Shamir's trick
// with endomorphism phi(P) = (beta*x, y) where phi corresponds to lambda.
// Uses interleaved wNAF w=5 for both half-scalars k1, k2.
// Builds the 8-entry odd-multiples table (1, 3, 5, ..., 15 times `base`) used by
// the w=5 wNAF loops, with all entries expressed relative to one shared Z factor.
// This function itself calls no field inversion. The entries are NOT the plain
// affine coordinates of the multiples: callers add them as-is and then multiply
// the Z of their final Jacobian result by *globalz to compensate.
//   base    - affine input point
//   table   - out: 8 entries, table[i] standing for (2i + 1) * base
//   globalz - out: Z factor the caller must fold into its result
inline void build_wnaf_table_zr_impl(const AffinePoint* base, AffinePoint table[8],
FieldElement* globalz) {
JacobianPoint base_jac;
point_from_affine(&base_jac, base);
// doubled = 2 * base in Jacobian form; c is its Z coordinate.
JacobianPoint doubled;
point_double_impl(&doubled, &base_jac);
FieldElement c = doubled.z;
FieldElement c2, c3;
field_sqr_impl(&c2, &c);
field_mul_impl(&c3, &c2, &c);
// Use (X, Y) of 2 * base as if it were affine, i.e. drop its Z = c.
AffinePoint doubled_affine;
doubled_affine.x = doubled.x;
doubled_affine.y = doubled.y;
// Rescale base by (c^2, c^3) with Z = 1 so it matches the frame in which
// doubled_affine has Z = 1.
JacobianPoint accum;
field_mul_impl(&accum.x, &base->x, &c2);
field_mul_impl(&accum.y, &base->y, &c3);
accum.z.limbs[0] = 1UL;
accum.z.limbs[1] = 0UL;
accum.z.limbs[2] = 0UL;
accum.z.limbs[3] = 0UL;
accum.infinity = 0;
table[0].x = accum.x;
table[0].y = accum.y;
FieldElement zr[8];
zr[0] = c; // stored for completeness; zr[0] is not read below
// Each step adds 2 * base and records the X/Y of the running sum together
// with h. NOTE(review): h is presumably the Z ratio of the step
// (new accum.z = old accum.z * h) -- confirm in point_add_mixed_h_impl.
for (int i = 1; i < 8; ++i) {
FieldElement h;
point_add_mixed_h_impl(&accum, &accum, &doubled_affine, &h);
table[i].x = accum.x;
table[i].y = accum.y;
zr[i] = h;
}
// Shared factor = Z of the last entry times the Z dropped from 2 * base.
field_mul_impl(globalz, &accum.z, &c);
// Backward pass: rescale entries 6..0 so they share table[7]'s Z.
// zs runs through zr[7], zr[7]*zr[6], ..., and each entry is multiplied by
// (zs^2, zs^3). table[7] is left unchanged.
FieldElement zs = zr[7];
for (int idx = 6; idx >= 0; --idx) {
if (idx != 6) {
FieldElement tmp;
field_mul_impl(&tmp, &zs, &zr[idx + 1]);
zs = tmp;
}
FieldElement zs2, zs3;
field_sqr_impl(&zs2, &zs);
field_mul_impl(&zs3, &zs2, &zs);
FieldElement tx, ty;
field_mul_impl(&tx, &table[idx].x, &zs2);
field_mul_impl(&ty, &table[idx].y, &zs3);
table[idx].x = tx;
table[idx].y = ty;
}
}
// Derives the endomorphism table from an existing odd-multiples table:
// endo_table[i].x = table[i].x * beta (beta taken from GLV_BETA0..3), and
// endo_table[i].y = table[i].y, negated when `negate_y` is non-zero.
inline void derive_endo_table_impl(const AffinePoint table[8], AffinePoint endo_table[8],
                                   int negate_y) {
    FieldElement endo_beta;
    endo_beta.limbs[0] = GLV_BETA0;
    endo_beta.limbs[1] = GLV_BETA1;
    endo_beta.limbs[2] = GLV_BETA2;
    endo_beta.limbs[3] = GLV_BETA3;

    for (int slot = 0; slot < 8; ++slot) {
        field_mul_impl(&endo_table[slot].x, &table[slot].x, &endo_beta);
        if (!negate_y) {
            endo_table[slot].y = table[slot].y;
            continue;
        }
        field_negate_impl(&endo_table[slot].y, &table[slot].y);
    }
}
inline void scalar_mul_glv_impl(JacobianPoint* r, const Scalar* k, const AffinePoint* p) {
Scalar k1, k2;
int k1_neg, k2_neg;
@ -573,62 +651,48 @@ inline void scalar_mul_glv_impl(JacobianPoint* r, const Scalar* k, const AffineP
AffinePoint base = *p;
if (k1_neg) field_negate_impl(&base.y, &base.y);
// Build P precomp table: [P, 3P, 5P, ..., 15P] (8 entries, w=5)
JacobianPoint tbl_jac[8];
JacobianPoint dbl;
point_from_affine(&tbl_jac[0], &base);
point_double_impl(&dbl, &tbl_jac[0]);
for (int i = 1; i < 8; i++)
point_add_impl(&tbl_jac[i], &tbl_jac[i-1], &dbl);
AffinePoint table[8];
FieldElement globalz;
build_wnaf_table_zr_impl(&base, table, &globalz);
// Build phi(P) table: apply endomorphism, flip y if signs differ
AffinePoint endo_base;
FieldElement beta;
beta.limbs[0] = GLV_BETA0; beta.limbs[1] = GLV_BETA1;
beta.limbs[2] = GLV_BETA2; beta.limbs[3] = GLV_BETA3;
field_mul_impl(&endo_base.x, &base.x, &beta);
endo_base.y = base.y;
int flip_phi = (k1_neg != k2_neg);
if (flip_phi) field_negate_impl(&endo_base.y, &endo_base.y);
JacobianPoint tbl2_jac[8];
point_from_affine(&tbl2_jac[0], &endo_base);
JacobianPoint dbl2;
point_double_impl(&dbl2, &tbl2_jac[0]);
for (int i = 1; i < 8; i++)
point_add_impl(&tbl2_jac[i], &tbl2_jac[i-1], &dbl2);
AffinePoint endo_table[8];
derive_endo_table_impl(table, endo_table, (k1_neg != k2_neg));
// wNAF encode both half-width scalars
int wnaf1[260], wnaf2[260];
int len1 = scalar_to_wnaf(&k1, wnaf1);
int len2 = scalar_to_wnaf(&k2, wnaf2);
int max_len = (len1 > len2) ? len1 : len2;
int wnaf1[130] = {0};
int wnaf2[130] = {0};
scalar_to_wnaf(&k1, wnaf1);
scalar_to_wnaf(&k2, wnaf2);
// Shamir interleaved loop
point_set_infinity(r);
for (int i = max_len - 1; i >= 0; --i) {
for (int i = 129; i >= 0; --i) {
if (!point_is_infinity(r)) point_double_impl(r, r);
int d1 = (i < len1) ? wnaf1[i] : 0;
int d1 = wnaf1[i];
if (d1 != 0) {
int idx = ((d1 > 0) ? d1 : -d1) >> 1;
if (idx >= 8) idx = 7;
JacobianPoint pt = tbl_jac[idx];
int idx = (((d1 > 0) ? d1 : -d1) - 1) >> 1;
AffinePoint pt = table[idx];
if (d1 < 0) field_negate_impl(&pt.y, &pt.y);
if (point_is_infinity(r)) { *r = pt; }
else { JacobianPoint tmp; point_add_impl(&tmp, r, &pt); *r = tmp; }
if (point_is_infinity(r)) { point_from_affine(r, &pt); }
else { point_add_mixed_impl(r, r, &pt); }
}
int d2 = (i < len2) ? wnaf2[i] : 0;
int d2 = wnaf2[i];
if (d2 != 0) {
int idx = ((d2 > 0) ? d2 : -d2) >> 1;
if (idx >= 8) idx = 7;
JacobianPoint pt = tbl2_jac[idx];
int idx = (((d2 > 0) ? d2 : -d2) - 1) >> 1;
AffinePoint pt = endo_table[idx];
if (d2 < 0) field_negate_impl(&pt.y, &pt.y);
if (point_is_infinity(r)) { *r = pt; }
else { JacobianPoint tmp; point_add_impl(&tmp, r, &pt); *r = tmp; }
if (point_is_infinity(r)) { point_from_affine(r, &pt); }
else { point_add_mixed_impl(r, r, &pt); }
}
}
if (!point_is_infinity(r)) {
FieldElement corrected_z;
field_mul_impl(&corrected_z, &r->z, &globalz);
r->z = corrected_z;
}
}
// Precomputed generator multiplication using fixed window w=4

View File

@ -59,6 +59,306 @@ typedef struct {
ulong limbs[4]; // Little-endian: limbs[0] is LSB
} FieldElement;
// =============================================================================
// NVIDIA OpenCL PTX Acceleration (Level 1+2+3)
// =============================================================================
// On consumer NVIDIA GPUs (Turing/Ampere/Ada/Blackwell), INT32 multiply
// throughput is 32x higher than INT64. Inline PTX enables:
// Level 1+2: mad.lo.cc.u64/madc.hi.cc.u64 carry chains (no comparison-carry)
// Level 3: mad.lo.cc.u32/madc.hi.cc.u32 32-bit Comba (INT32 throughput)
// Fallback (AMD, Intel, portable): mul_hi + comparison-based carry unchanged.
// Guard: __NV_CL_C_VERSION is defined only by NVIDIA's OpenCL compiler.
// =============================================================================
#ifdef __NV_CL_C_VERSION
// 32-bit MAD accumulate: (r0:r1:r2) += a * b [3-register 96-bit accumulator]
// r0, r1, r2 must be modifiable 32-bit lvalues (bound with "+r"); a and b are
// 32-bit inputs. The low half of a*b is added to r0, the high half plus the
// carry to r1, and the remaining carry to r2, all through the PTX carry flag
// within a single asm statement.
#define OCL_MAD32(r0, r1, r2, a, b) \
__asm volatile( \
"mad.lo.cc.u32 %0, %3, %4, %0; \n\t" \
"madc.hi.cc.u32 %1, %3, %4, %1; \n\t" \
"addc.u32 %2, %2, 0; \n\t" \
: "+r"(r0), "+r"(r1), "+r"(r2) \
: "r"(a), "r"(b) \
)
// 32-bit squaring diagonal: (r0:r1:r2) += a*a
// Same carry chain as OCL_MAD32 with both multiplicands bound to the single
// input operand `a`. r0, r1, r2 must be modifiable 32-bit lvalues.
#define OCL_SQR32_D(r0, r1, r2, a) \
__asm volatile( \
"mad.lo.cc.u32 %0, %3, %3, %0; \n\t" \
"madc.hi.cc.u32 %1, %3, %3, %1; \n\t" \
"addc.u32 %2, %2, 0; \n\t" \
: "+r"(r0), "+r"(r1), "+r"(r2) \
: "r"(a) \
)
// 32-bit squaring off-diagonal: (r0:r1:r2) += 2 * a*b
// The 64-bit product a*b is formed once (_lo, _hi) and then added to the
// accumulator twice, each time with full carry propagation into r2. Adding
// twice instead of shifting the product left keeps the bit that a shift of the
// 64-bit product would push out of _hi.
// The macro declares locals named _lo and _hi, so arguments must not use those
// names. r0, r1, r2 must be modifiable 32-bit lvalues.
#define OCL_SQR32_M2(r0, r1, r2, a, b) \
do { \
uint _lo, _hi; \
__asm volatile( \
"mul.lo.u32 %0, %2, %3; \n\t" \
"mul.hi.u32 %1, %2, %3; \n\t" \
: "=r"(_lo), "=r"(_hi) : "r"(a), "r"(b) \
); \
__asm volatile( \
"add.cc.u32 %0, %0, %3; \n\t" \
"addc.cc.u32 %1, %1, %4; \n\t" \
"addc.u32 %2, %2, 0; \n\t" \
"add.cc.u32 %0, %0, %3; \n\t" \
"addc.cc.u32 %1, %1, %4; \n\t" \
"addc.u32 %2, %2, 0; \n\t" \
: "+r"(r0), "+r"(r1), "+r"(r2) : "r"(_lo), "r"(_hi) \
); \
} while(0)
// ----------------------------------------------------------------------------
// 32-bit Comba multiplication: 4x64 FieldElement reinterpreted as 8x32 limbs.
// Produces uint[16] raw output (little-endian 32-bit limbs of 512-bit product).
// Mirrors CUDA's mul_256_comba32 from secp256k1_32_hybrid_final.cuh.
// ----------------------------------------------------------------------------
// Parameters:
//   a, b - operands; each 64-bit limb is split into a low and a high 32-bit word
//   t32  - out: 16 words of the unreduced product, t32[0] least significant
// Structure: output column k (k = 0..14) accumulates every a32[i] * b32[j] with
// i + j == k into the three-word accumulator (r0:r1:r2). After each column the
// low word r0 is stored to t32[k] and the accumulator is shifted down one word
// (r0 = r1, r1 = r2, r2 = 0). The final column also stores r1 as t32[15].
// No modular reduction is performed here.
static inline void mul_256_comba32_ocl(
const FieldElement* a, const FieldElement* b, uint t32[16]
) {
uint a32[8], b32[8];
// Split each 64-bit limb into (low, high) 32-bit words.
for (int i = 0; i < 4; i++) {
a32[2*i] = (uint)(a->limbs[i]);
a32[2*i+1] = (uint)(a->limbs[i] >> 32);
b32[2*i] = (uint)(b->limbs[i]);
b32[2*i+1] = (uint)(b->limbs[i] >> 32);
}
uint r0 = 0, r1 = 0, r2 = 0;
// Columns 0..7: the number of partial products grows from 1 to 8.
OCL_MAD32(r0,r1,r2, a32[0],b32[0]);
t32[0]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[0],b32[1]); OCL_MAD32(r0,r1,r2, a32[1],b32[0]);
t32[1]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[0],b32[2]); OCL_MAD32(r0,r1,r2, a32[1],b32[1]); OCL_MAD32(r0,r1,r2, a32[2],b32[0]);
t32[2]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[0],b32[3]); OCL_MAD32(r0,r1,r2, a32[1],b32[2]); OCL_MAD32(r0,r1,r2, a32[2],b32[1]); OCL_MAD32(r0,r1,r2, a32[3],b32[0]);
t32[3]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[0],b32[4]); OCL_MAD32(r0,r1,r2, a32[1],b32[3]); OCL_MAD32(r0,r1,r2, a32[2],b32[2]); OCL_MAD32(r0,r1,r2, a32[3],b32[1]); OCL_MAD32(r0,r1,r2, a32[4],b32[0]);
t32[4]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[0],b32[5]); OCL_MAD32(r0,r1,r2, a32[1],b32[4]); OCL_MAD32(r0,r1,r2, a32[2],b32[3]); OCL_MAD32(r0,r1,r2, a32[3],b32[2]); OCL_MAD32(r0,r1,r2, a32[4],b32[1]); OCL_MAD32(r0,r1,r2, a32[5],b32[0]);
t32[5]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[0],b32[6]); OCL_MAD32(r0,r1,r2, a32[1],b32[5]); OCL_MAD32(r0,r1,r2, a32[2],b32[4]); OCL_MAD32(r0,r1,r2, a32[3],b32[3]); OCL_MAD32(r0,r1,r2, a32[4],b32[2]); OCL_MAD32(r0,r1,r2, a32[5],b32[1]); OCL_MAD32(r0,r1,r2, a32[6],b32[0]);
t32[6]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[0],b32[7]); OCL_MAD32(r0,r1,r2, a32[1],b32[6]); OCL_MAD32(r0,r1,r2, a32[2],b32[5]); OCL_MAD32(r0,r1,r2, a32[3],b32[4]); OCL_MAD32(r0,r1,r2, a32[4],b32[3]); OCL_MAD32(r0,r1,r2, a32[5],b32[2]); OCL_MAD32(r0,r1,r2, a32[6],b32[1]); OCL_MAD32(r0,r1,r2, a32[7],b32[0]);
t32[7]=r0; r0=r1; r1=r2; r2=0;
// Columns 8..14: the number of partial products shrinks from 7 to 1.
OCL_MAD32(r0,r1,r2, a32[1],b32[7]); OCL_MAD32(r0,r1,r2, a32[2],b32[6]); OCL_MAD32(r0,r1,r2, a32[3],b32[5]); OCL_MAD32(r0,r1,r2, a32[4],b32[4]); OCL_MAD32(r0,r1,r2, a32[5],b32[3]); OCL_MAD32(r0,r1,r2, a32[6],b32[2]); OCL_MAD32(r0,r1,r2, a32[7],b32[1]);
t32[8]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[2],b32[7]); OCL_MAD32(r0,r1,r2, a32[3],b32[6]); OCL_MAD32(r0,r1,r2, a32[4],b32[5]); OCL_MAD32(r0,r1,r2, a32[5],b32[4]); OCL_MAD32(r0,r1,r2, a32[6],b32[3]); OCL_MAD32(r0,r1,r2, a32[7],b32[2]);
t32[9]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[3],b32[7]); OCL_MAD32(r0,r1,r2, a32[4],b32[6]); OCL_MAD32(r0,r1,r2, a32[5],b32[5]); OCL_MAD32(r0,r1,r2, a32[6],b32[4]); OCL_MAD32(r0,r1,r2, a32[7],b32[3]);
t32[10]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[4],b32[7]); OCL_MAD32(r0,r1,r2, a32[5],b32[6]); OCL_MAD32(r0,r1,r2, a32[6],b32[5]); OCL_MAD32(r0,r1,r2, a32[7],b32[4]);
t32[11]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[5],b32[7]); OCL_MAD32(r0,r1,r2, a32[6],b32[6]); OCL_MAD32(r0,r1,r2, a32[7],b32[5]);
t32[12]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[6],b32[7]); OCL_MAD32(r0,r1,r2, a32[7],b32[6]);
t32[13]=r0; r0=r1; r1=r2; r2=0;
OCL_MAD32(r0,r1,r2, a32[7],b32[7]);
// Last column: its low word is t32[14] and its high word is t32[15].
t32[14]=r0; t32[15]=r1;
}
// 32-bit Comba squaring: ~40% fewer multiplications (symmetry exploitation).
// Mirrors CUDA's sqr_256_comba32 from secp256k1_32_hybrid_final.cuh.
// Parameters:
//   a   - operand; each 64-bit limb is split into a low and a high 32-bit word
//   t32 - out: 16 words of the unreduced square, t32[0] least significant
// Structure: output column k collects each pair a32[i] * a32[j] with i < j and
// i + j == k once via OCL_SQR32_M2 (which adds the product twice), plus the
// diagonal term a32[k/2]^2 via OCL_SQR32_D when k is even. After each column
// the low word is stored and the accumulator shifts down one word, exactly as
// in mul_256_comba32_ocl. No modular reduction is performed here.
static inline void sqr_256_comba32_ocl(const FieldElement* a, uint t32[16]) {
uint a32[8];
// Split each 64-bit limb into (low, high) 32-bit words.
for (int i = 0; i < 4; i++) {
a32[2*i] = (uint)(a->limbs[i]);
a32[2*i+1] = (uint)(a->limbs[i] >> 32);
}
uint r0 = 0, r1 = 0, r2 = 0;
OCL_SQR32_D(r0,r1,r2, a32[0]);
t32[0]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[0],a32[1]);
t32[1]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[0],a32[2]); OCL_SQR32_D(r0,r1,r2, a32[1]);
t32[2]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[0],a32[3]); OCL_SQR32_M2(r0,r1,r2, a32[1],a32[2]);
t32[3]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[0],a32[4]); OCL_SQR32_M2(r0,r1,r2, a32[1],a32[3]); OCL_SQR32_D(r0,r1,r2, a32[2]);
t32[4]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[0],a32[5]); OCL_SQR32_M2(r0,r1,r2, a32[1],a32[4]); OCL_SQR32_M2(r0,r1,r2, a32[2],a32[3]);
t32[5]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[0],a32[6]); OCL_SQR32_M2(r0,r1,r2, a32[1],a32[5]); OCL_SQR32_M2(r0,r1,r2, a32[2],a32[4]); OCL_SQR32_D(r0,r1,r2, a32[3]);
t32[6]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[0],a32[7]); OCL_SQR32_M2(r0,r1,r2, a32[1],a32[6]); OCL_SQR32_M2(r0,r1,r2, a32[2],a32[5]); OCL_SQR32_M2(r0,r1,r2, a32[3],a32[4]);
t32[7]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[1],a32[7]); OCL_SQR32_M2(r0,r1,r2, a32[2],a32[6]); OCL_SQR32_M2(r0,r1,r2, a32[3],a32[5]); OCL_SQR32_D(r0,r1,r2, a32[4]);
t32[8]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[2],a32[7]); OCL_SQR32_M2(r0,r1,r2, a32[3],a32[6]); OCL_SQR32_M2(r0,r1,r2, a32[4],a32[5]);
t32[9]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[3],a32[7]); OCL_SQR32_M2(r0,r1,r2, a32[4],a32[6]); OCL_SQR32_D(r0,r1,r2, a32[5]);
t32[10]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[4],a32[7]); OCL_SQR32_M2(r0,r1,r2, a32[5],a32[6]);
t32[11]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[5],a32[7]); OCL_SQR32_D(r0,r1,r2, a32[6]);
t32[12]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_M2(r0,r1,r2, a32[6],a32[7]);
t32[13]=r0; r0=r1; r1=r2; r2=0;
OCL_SQR32_D(r0,r1,r2, a32[7]);
// Last column: its low word is t32[14] and its high word is t32[15].
t32[14]=r0; t32[15]=r1;
}
// 32-bit reduction: T_hi x K_MOD (32-bit MAD chain) + conditional P-subtract.
// Phase 1: T_hi[8..15] x 977 (scalar, 32-bit MAD chain)
// Phase 1b: add T_hi << 32 (K_MOD = 2^32 + 977)
// Phase 2: T_lo[0..7] += result (32-bit carry chain)
// Phase 3+4: pack to 64-bit, fold overflow, conditional P-subtract (64-bit PTX)
// Mirrors CUDA's reduce_512_to_256_32 from secp256k1_32_hybrid_final.cuh.
// Reduces the 512-bit value held in t32[0..15] (sixteen 32-bit limbs, t32[0]
// lowest) to four 64-bit limbs and stores them in r->limbs[0..3].
// Compiled only inside the __NV_CL_C_VERSION guard: every carry chain below
// is inline PTX.
static inline void reduce_512_to_256_32_ocl(uint t32[16], FieldElement* r) {
// Low half t0..t7 is updated in place by Phase 2; high half t8..t15 is read-only.
uint t0=t32[0], t1=t32[1], t2=t32[2], t3=t32[3];
uint t4=t32[4], t5=t32[5], t6=t32[6], t7=t32[7];
const uint t8 =t32[8], t9 =t32[9], t10=t32[10], t11=t32[11];
const uint t12=t32[12], t13=t32[13], t14=t32[14], t15=t32[15];
// Phase 1: A = T_hi[8..15] x 977 (32-bit scalar MAD chain -> 9 limbs)
uint a0, a1, a2, a3, a4, a5, a6, a7, a8;
// Operands: %0..%8 = a0..a8 (outputs), %9..%16 = t8..t15 (inputs).
// Each mad.lo.cc / madc.hi pair adds the low word of t[k]*977 into the current
// limb and seeds the next limb with the high word plus the carry flag.
__asm volatile(
"mul.lo.u32 %0, %9, 977;\n\t"
"mul.hi.u32 %1, %9, 977;\n\t"
"mad.lo.cc.u32 %1, %10, 977, %1;\n\t"
"madc.hi.u32 %2, %10, 977, 0;\n\t"
"mad.lo.cc.u32 %2, %11, 977, %2;\n\t"
"madc.hi.u32 %3, %11, 977, 0;\n\t"
"mad.lo.cc.u32 %3, %12, 977, %3;\n\t"
"madc.hi.u32 %4, %12, 977, 0;\n\t"
"mad.lo.cc.u32 %4, %13, 977, %4;\n\t"
"madc.hi.u32 %5, %13, 977, 0;\n\t"
"mad.lo.cc.u32 %5, %14, 977, %5;\n\t"
"madc.hi.u32 %6, %14, 977, 0;\n\t"
"mad.lo.cc.u32 %6, %15, 977, %6;\n\t"
"madc.hi.u32 %7, %15, 977, 0;\n\t"
"mad.lo.cc.u32 %7, %16, 977, %7;\n\t"
"madc.hi.u32 %8, %16, 977, 0;\n\t"
: "=r"(a0),"=r"(a1),"=r"(a2),"=r"(a3),"=r"(a4),
"=r"(a5),"=r"(a6),"=r"(a7),"=r"(a8)
: "r"(t8),"r"(t9),"r"(t10),"r"(t11),
"r"(t12),"r"(t13),"r"(t14),"r"(t15)
);
// Phase 1b: add T_hi << 32 (a[1..8] += T_hi[8..15], yielding a9 overflow)
uint a9;
// Operands: %0..%7 = a1..a8 (read/write), %8 = a9 (carry out of the chain),
// %9..%16 = t8..t15. a0 is untouched because the addend is shifted up one limb.
__asm volatile(
"add.cc.u32 %0, %0, %9;\n\t"
"addc.cc.u32 %1, %1, %10;\n\t"
"addc.cc.u32 %2, %2, %11;\n\t"
"addc.cc.u32 %3, %3, %12;\n\t"
"addc.cc.u32 %4, %4, %13;\n\t"
"addc.cc.u32 %5, %5, %14;\n\t"
"addc.cc.u32 %6, %6, %15;\n\t"
"addc.cc.u32 %7, %7, %16;\n\t"
"addc.u32 %8, 0, 0;\n\t"
: "+r"(a1),"+r"(a2),"+r"(a3),"+r"(a4),
"+r"(a5),"+r"(a6),"+r"(a7),"+r"(a8),"=r"(a9)
: "r"(t8),"r"(t9),"r"(t10),"r"(t11),
"r"(t12),"r"(t13),"r"(t14),"r"(t15)
);
// Phase 2: T_lo[0..7] += A[0..7] (32-bit carry chain)
// `carry` receives the carry out of limb 7 (0 or 1).
uint carry;
__asm volatile(
"add.cc.u32 %0, %0, %9;\n\t"
"addc.cc.u32 %1, %1, %10;\n\t"
"addc.cc.u32 %2, %2, %11;\n\t"
"addc.cc.u32 %3, %3, %12;\n\t"
"addc.cc.u32 %4, %4, %13;\n\t"
"addc.cc.u32 %5, %5, %14;\n\t"
"addc.cc.u32 %6, %6, %15;\n\t"
"addc.cc.u32 %7, %7, %16;\n\t"
"addc.u32 %8, 0, 0;\n\t"
: "+r"(t0),"+r"(t1),"+r"(t2),"+r"(t3),
"+r"(t4),"+r"(t5),"+r"(t6),"+r"(t7),"=r"(carry)
: "r"(a0),"r"(a1),"r"(a2),"r"(a3),
"r"(a4),"r"(a5),"r"(a6),"r"(a7)
);
// Phase 3: pack to 64-bit and fold overflow (extra * K)
ulong r0 = ((ulong)t1 << 32) | t0;
ulong r1 = ((ulong)t3 << 32) | t2;
ulong r2 = ((ulong)t5 << 32) | t4;
ulong r3 = ((ulong)t7 << 32) | t6;
// Everything that landed above the low eight limbs: a8 and the Phase 2 carry
// in the low word, a9 one 32-bit limb higher.
ulong extra = (ulong)a8 + carry + ((ulong)a9 << 32);
// 128-bit product extra * SECP256K1_K, split into low/high 64-bit halves.
ulong ek_lo, ek_hi;
__asm volatile(
"mul.lo.u64 %0, %2, %3;\n\t"
"mul.hi.u64 %1, %2, %3;\n\t"
: "=l"(ek_lo), "=l"(ek_hi)
: "l"(extra), "l"((ulong)SECP256K1_K)
);
// r += (ek_hi:ek_lo); c is the carry out of the top limb (0 or 1).
ulong c;
__asm volatile(
"add.cc.u64 %0, %0, %5;\n\t"
"addc.cc.u64 %1, %1, %6;\n\t"
"addc.cc.u64 %2, %2, 0;\n\t"
"addc.cc.u64 %3, %3, 0;\n\t"
"addc.u64 %4, 0, 0;\n\t"
: "+l"(r0),"+l"(r1),"+l"(r2),"+l"(r3),"=l"(c)
: "l"(ek_lo),"l"(ek_hi)
);
// The add wrapped past the top limb: fold that carry back in by adding
// SECP256K1_K once more. Any carry out of this second add is discarded.
if (c) {
__asm volatile(
"add.cc.u64 %0, %0, %4;\n\t"
"addc.cc.u64 %1, %1, 0;\n\t"
"addc.cc.u64 %2, %2, 0;\n\t"
"addc.u64 %3, %3, 0;\n\t"
: "+l"(r0),"+l"(r1),"+l"(r2),"+l"(r3)
: "l"((ulong)SECP256K1_K)
);
}
// Phase 4: conditional subtraction of P (64-bit PTX sub.cc chain)
// s = r - P; the final `subc 0, 0` leaves borrow == 0 when no borrow occurred
// (r >= P) and all-ones otherwise.
ulong s0, s1, s2, s3, borrow;
__asm volatile(
"sub.cc.u64 %0, %5, %9;\n\t"
"subc.cc.u64 %1, %6, %10;\n\t"
"subc.cc.u64 %2, %7, %11;\n\t"
"subc.cc.u64 %3, %8, %12;\n\t"
"subc.u64 %4, 0, 0;\n\t"
: "=l"(s0),"=l"(s1),"=l"(s2),"=l"(s3),"=l"(borrow)
: "l"(r0),"l"(r1),"l"(r2),"l"(r3),
"l"(SECP256K1_P0),"l"(SECP256K1_P1),"l"(SECP256K1_P2),"l"(SECP256K1_P3)
);
// Keep the subtracted value only when the subtraction did not underflow.
if (borrow == 0) {
r->limbs[0]=s0; r->limbs[1]=s1; r->limbs[2]=s2; r->limbs[3]=s3;
} else {
r->limbs[0]=r0; r->limbs[1]=r1; r->limbs[2]=r2; r->limbs[3]=r3;
}
}
#endif // __NV_CL_C_VERSION
// =============================================================================
// Field Reduction: r = a mod p
// Uses the fact that p = 2^256 - K where K = 0x1000003D1
@ -151,32 +451,56 @@ inline void field_reduce(FieldElement* r, const ulong* a8) {
// =============================================================================
// r = a + b with a single conditional subtraction of P.
// Both paths compute the 256-bit sum and (sum - P), then select between them
// with a bit mask instead of a branch. r may alias a or b: every input limb is
// consumed before r is written.
inline void field_add_impl(FieldElement* r, const FieldElement* a, const FieldElement* b) {
#ifdef __NV_CL_C_VERSION
// Level 2: native add.cc/addc carry chains (no comparison-based carry)
// s = a + b; carry is the carry out of the top limb (0 or 1).
ulong s0, s1, s2, s3, carry;
__asm volatile(
"add.cc.u64 %0, %5, %9;\n\t"
"addc.cc.u64 %1, %6, %10;\n\t"
"addc.cc.u64 %2, %7, %11;\n\t"
"addc.cc.u64 %3, %8, %12;\n\t"
"addc.u64 %4, 0, 0;\n\t"
: "=l"(s0),"=l"(s1),"=l"(s2),"=l"(s3),"=l"(carry)
: "l"(a->limbs[0]),"l"(a->limbs[1]),"l"(a->limbs[2]),"l"(a->limbs[3]),
"l"(b->limbs[0]),"l"(b->limbs[1]),"l"(b->limbs[2]),"l"(b->limbs[3])
);
// d = s - P; the final `subc 0, 0` makes borrow 0 (no borrow) or all-ones.
ulong d0, d1, d2, d3, borrow;
__asm volatile(
"sub.cc.u64 %0, %5, %9;\n\t"
"subc.cc.u64 %1, %6, %10;\n\t"
"subc.cc.u64 %2, %7, %11;\n\t"
"subc.cc.u64 %3, %8, %12;\n\t"
"subc.u64 %4, 0, 0;\n\t"
: "=l"(d0),"=l"(d1),"=l"(d2),"=l"(d3),"=l"(borrow)
: "l"(s0),"l"(s1),"l"(s2),"l"(s3),
"l"(SECP256K1_P0),"l"(SECP256K1_P1),"l"(SECP256K1_P2),"l"(SECP256K1_P3)
);
// use diff if: no borrow (s >= P) OR carry from add (sum overflowed 2^256)
// ~borrow is all-ones when there was no borrow; (0 - carry) is all-ones when carry == 1.
ulong mask = ~borrow | (0UL - carry);
r->limbs[0] = (d0 & mask) | (s0 & ~mask);
r->limbs[1] = (d1 & mask) | (s1 & ~mask);
r->limbs[2] = (d2 & mask) | (s2 & ~mask);
r->limbs[3] = (d3 & mask) | (s3 & ~mask);
#else
// Portable path. add_with_carry / sub_with_borrow are defined elsewhere;
// presumably they return the low 64 bits and write the carry/borrow out
// through the last argument -- confirm against their definitions.
ulong carry = 0;
ulong sum[4];
// Add with carry chain
sum[0] = add_with_carry(a->limbs[0], b->limbs[0], 0, &carry);
sum[1] = add_with_carry(a->limbs[1], b->limbs[1], carry, &carry);
sum[2] = add_with_carry(a->limbs[2], b->limbs[2], carry, &carry);
sum[3] = add_with_carry(a->limbs[3], b->limbs[3], carry, &carry);
// Reduce: if carry or sum >= p, subtract p
ulong borrow = 0;
ulong diff[4];
diff[0] = sub_with_borrow(sum[0], SECP256K1_P0, 0, &borrow);
diff[1] = sub_with_borrow(sum[1], SECP256K1_P1, borrow, &borrow);
diff[2] = sub_with_borrow(sum[2], SECP256K1_P2, borrow, &borrow);
diff[3] = sub_with_borrow(sum[3], SECP256K1_P3, borrow, &borrow);
// If carry from addition or no borrow from subtraction, use diff
ulong use_diff = (carry != 0) | (borrow == 0);
ulong mask = use_diff ? ~0UL : 0UL;
r->limbs[0] = (diff[0] & mask) | (sum[0] & ~mask);
r->limbs[1] = (diff[1] & mask) | (sum[1] & ~mask);
r->limbs[2] = (diff[2] & mask) | (sum[2] & ~mask);
r->limbs[3] = (diff[3] & mask) | (sum[3] & ~mask);
#endif
}
// =============================================================================
@ -184,29 +508,51 @@ inline void field_add_impl(FieldElement* r, const FieldElement* a, const FieldEl
// =============================================================================
// r = a - b, adding P back once when the 256-bit subtraction underflows.
// The correction is applied branch-free by masking P with the borrow.
inline void field_sub_impl(FieldElement* r, const FieldElement* a, const FieldElement* b) {
#ifdef __NV_CL_C_VERSION
// Level 2: native sub.cc/subc + add.cc/addc carry chains
// d = a - b; the final `subc 0, 0` turns the borrow flag into a full-width mask.
ulong d0, d1, d2, d3, borrow;
__asm volatile(
"sub.cc.u64 %0, %5, %9;\n\t"
"subc.cc.u64 %1, %6, %10;\n\t"
"subc.cc.u64 %2, %7, %11;\n\t"
"subc.cc.u64 %3, %8, %12;\n\t"
"subc.u64 %4, 0, 0;\n\t"
: "=l"(d0),"=l"(d1),"=l"(d2),"=l"(d3),"=l"(borrow)
: "l"(a->limbs[0]),"l"(a->limbs[1]),"l"(a->limbs[2]),"l"(a->limbs[3]),
"l"(b->limbs[0]),"l"(b->limbs[1]),"l"(b->limbs[2]),"l"(b->limbs[3])
);
// borrow = 0xFFFF...FFFF if a < b (underflow), 0 otherwise
// Masked copy of P: equals P on underflow, 0 otherwise.
ulong p0 = SECP256K1_P0 & borrow;
ulong p1 = SECP256K1_P1 & borrow;
ulong p2 = SECP256K1_P2 & borrow;
ulong p3 = SECP256K1_P3 & borrow;
// r = d + (P or 0). The carry out of the top limb is intentionally not captured.
__asm volatile(
"add.cc.u64 %0, %4, %8;\n\t"
"addc.cc.u64 %1, %5, %9;\n\t"
"addc.cc.u64 %2, %6, %10;\n\t"
"addc.u64 %3, %7, %11;\n\t"
: "=l"(r->limbs[0]),"=l"(r->limbs[1]),"=l"(r->limbs[2]),"=l"(r->limbs[3])
: "l"(d0),"l"(d1),"l"(d2),"l"(d3), "l"(p0),"l"(p1),"l"(p2),"l"(p3)
);
#else
// Portable path. sub_with_borrow / add_with_carry are defined elsewhere;
// presumably they return the low 64 bits and write the borrow/carry out
// through the last argument -- confirm against their definitions.
ulong borrow = 0;
ulong diff[4];
// Subtract with borrow chain
diff[0] = sub_with_borrow(a->limbs[0], b->limbs[0], 0, &borrow);
diff[1] = sub_with_borrow(a->limbs[1], b->limbs[1], borrow, &borrow);
diff[2] = sub_with_borrow(a->limbs[2], b->limbs[2], borrow, &borrow);
diff[3] = sub_with_borrow(a->limbs[3], b->limbs[3], borrow, &borrow);
// If borrow, add p (result was negative)
ulong mask = borrow ? ~0UL : 0UL;
ulong carry = 0;
ulong adj[4];
adj[0] = add_with_carry(diff[0], SECP256K1_P0 & mask, 0, &carry);
adj[1] = add_with_carry(diff[1], SECP256K1_P1 & mask, carry, &carry);
adj[2] = add_with_carry(diff[2], SECP256K1_P2 & mask, carry, &carry);
adj[3] = add_with_carry(diff[3], SECP256K1_P3 & mask, carry, &carry);
// The final carry is dropped; only the low 256 bits are stored.
r->limbs[0] = adj[0];
r->limbs[1] = adj[1];
r->limbs[2] = adj[2];
r->limbs[3] = adj[3];
#endif
}
// =============================================================================
@ -228,6 +574,12 @@ inline void muladd2(ulong lo, ulong hi, ulong* c0, ulong* c1, ulong* c2) {
}
inline void field_mul_impl(FieldElement* r, const FieldElement* a, const FieldElement* b) {
#ifdef __NV_CL_C_VERSION
// Level 3: 32-bit hybrid Comba + 32-bit reduction (INT32 throughput 32x > INT64)
uint t32[16];
mul_256_comba32_ocl(a, b, t32);
reduce_512_to_256_32_ocl(t32, r);
#else
ulong a0 = a->limbs[0], a1 = a->limbs[1], a2 = a->limbs[2], a3 = a->limbs[3];
ulong b0 = b->limbs[0], b1 = b->limbs[1], b2 = b->limbs[2], b3 = b->limbs[3];
ulong product[8];
@ -274,11 +626,11 @@ inline void field_mul_impl(FieldElement* r, const FieldElement* a, const FieldEl
product[7] = c1;
field_reduce(r, product);
#endif // __NV_CL_C_VERSION
}
// =============================================================================
// Field Squaring: r = a² mod p
// Optimized: only need upper triangle of multiplication
// =============================================================================
// Forward declaration for field_sqr_n_impl
@ -293,6 +645,12 @@ inline void field_sqr_n_impl(FieldElement* r, int n) {
}
inline void field_sqr_impl(FieldElement* r, const FieldElement* a) {
#ifdef __NV_CL_C_VERSION
// Level 3: 32-bit hybrid squaring (40% fewer multiplications + INT32 throughput)
uint t32[16];
sqr_256_comba32_ocl(a, t32);
reduce_512_to_256_32_ocl(t32, r);
#else
ulong a0 = a->limbs[0], a1 = a->limbs[1], a2 = a->limbs[2], a3 = a->limbs[3];
ulong product[8];
ulong c0, c1, c2;
@ -332,6 +690,7 @@ inline void field_sqr_impl(FieldElement* r, const FieldElement* a) {
product[7] = c1;
field_reduce(r, product);
#endif // __NV_CL_C_VERSION
}
// =============================================================================

View File

@ -417,7 +417,7 @@ inline void scalar_add_u64(Scalar* a, ulong val, Scalar* r) {
// Convert scalar to wNAF representation (window width 5)
// Returns length of wNAF representation
inline int scalar_to_wnaf(const Scalar* k, int wnaf[260]) {
static inline int scalar_to_wnaf(const Scalar* k, int wnaf[260]) {
Scalar temp = *k;
int len = 0;
const int window_size = 32; // 2^5

View File

@ -768,6 +768,427 @@ static int audit_perf_schnorr_stress() {
return 0;
}
// =============================================================================
// Section 9: BIP-352 Silent Payments & GLV Correctness
// =============================================================================
// Helper: expand kernel file with #include directives resolved (like bench_bip352_opencl.cpp).
// Recursively inlines `#include "..."` directives of an OpenCL source file.
//
// path: file to expand; quoted includes are resolved relative to its directory.
// seen: paths already expanded during this traversal. A file is emitted at most
//       once, so repeated or cyclic includes expand to nothing.
// Returns the expanded text, or an empty string when the file was already seen
// or load_file() yields no content.
static std::string bip352_expand_kernel(const std::string& path,
                                        std::vector<std::string>& seen) {
    if (std::find(seen.begin(), seen.end(), path) != seen.end()) return {};
    seen.push_back(path);

    std::string src = load_file(path);
    if (src.empty()) return {};

    // Directory part of `path`. A bare file name has no separator at all
    // (find_last_of returns npos), in which case includes resolve against the
    // current directory. A file directly under the root yields "" so that
    // dir + "/" + name stays an absolute path.
    const size_t sep = path.find_last_of("/\\");
    const std::string dir =
        (sep == std::string::npos) ? std::string(".") : path.substr(0, sep);

    std::istringstream in(src);
    std::ostringstream out;
    std::string line;
    while (std::getline(in, line)) {
        const size_t first = line.find_first_not_of(" \t");
        const std::string trimmed =
            (first != std::string::npos) ? line.substr(first) : line;

        bool expanded = false;
        if (trimmed.rfind("#include \"", 0) == 0) {
            const size_t q1 = trimmed.find('"') + 1;
            const size_t q2 = trimmed.find('"', q1);
            // Only expand a properly terminated directive; a malformed one is
            // passed through so the OpenCL compiler reports it.
            if (q2 != std::string::npos) {
                const std::string child = dir + "/" + trimmed.substr(q1, q2 - q1);
                out << bip352_expand_kernel(child, seen);
                expanded = true;
            }
        }
        if (!expanded) {
            out << line << '\n';
        }
    }
    return out.str();
}
// Host wNAF encoder: mirrors the GPU scalar_to_wnaf fixed-130-step version.
// Encodes 128-bit scalar (s0=LSW, s1=MSW) into 5-bit signed wNAF digits.
// Encodes the 128-bit scalar (s1:s0) as 130 signed window-5 digits.
// wnaf[i] is the digit of weight 2^i: zero, or an odd value in [-15, 15].
// Exactly 130 positions are always written.
static void audit_host_wnaf(uint64_t s0, uint64_t s1, int8_t wnaf[130]) {
    // 256-bit working copy so that rounding up to the next multiple of 32
    // can carry past bit 127.
    uint64_t acc[4] = {s0, s1, 0, 0};

    for (int pos = 0; pos < 130; ++pos) {
        int digit = 0;

        if ((acc[0] & 1ULL) != 0) {
            digit = (int)(acc[0] & 0x1FULL);
            const uint64_t before = acc[0];

            if (digit < 16) {
                // Positive digit: clear the low window by subtracting it.
                acc[0] = before - (uint64_t)digit;
                if (acc[0] > before) {
                    for (int limb = 1; limb < 4; ++limb) {
                        const uint64_t old = acc[limb];
                        acc[limb] = old - 1;
                        if (old != 0) break;
                    }
                }
            } else {
                // Window >= 16: use the negative digit (window - 32) and add
                // its magnitude back, propagating the carry upwards.
                digit -= 32;
                acc[0] = before + (uint64_t)(-digit);
                if (acc[0] < before) {
                    for (int limb = 1; limb < 4; ++limb) {
                        acc[limb] += 1;
                        if (acc[limb] != 0) break;
                    }
                }
            }
        }

        wnaf[pos] = (int8_t)digit;

        // Shift the 256-bit accumulator right by one bit.
        for (int limb = 0; limb < 3; ++limb) {
            acc[limb] = (acc[limb] >> 1) | (acc[limb + 1] << 63);
        }
        acc[3] >>= 1;
    }
}
// Test 1: CPU wNAF round-trip — encode scalar, decode digits back, verify match.
// Tests host_compute_wnaf correctness: this was the key change that fixed the -36 crash.
// Encodes each test scalar with audit_host_wnaf, rebuilds the value as
// sum(wnaf[i] * 2^i) modulo 2^128, and compares it with the input.
// Returns 0 when every case round-trips, 1 on the first mismatch (details are
// printed to stderr).
static int audit_glv_wnaf_roundtrip() {
    struct TC { uint64_t s0, s1; const char* label; };
    static const TC cases[] = {
        {1, 0, "k=1"},
        {2, 0, "k=2"},
        {15, 0, "k=15 (max single wNAF digit)"},
        {16, 0, "k=16 (two-digit boundary)"},
        {31, 0, "k=31 (wNAF carry: 32-1)"},
        {0x5555555555555555ULL, 0x5555555555555555ULL, "k=0x5555... (alternating bits)"},
        {0xFFFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL, "k near 2^127"},
        // k1 half of SCAN_KEY GLV decomposition (lower 128 bits of full key)
        {0x38af4ad300da1a42ULL, 0x30d7d6a3b98294b1ULL, "k1 from SCAN_KEY GLV half"},
    };
    for (auto& tc : cases) {
        int8_t wnaf[130] = {};
        audit_host_wnaf(tc.s0, tc.s1, wnaf);

        // Reconstruct in a 128-bit accumulator held as two 64-bit limbs.
        // This avoids __uint128_t, which is a GCC/Clang extension and is not
        // available with MSVC. All arithmetic wraps modulo 2^128, so digits of
        // weight 2^128 and above contribute nothing.
        uint64_t r0 = 0, r1 = 0;
        for (int i = 0; i < 130; i++) {
            const int d = wnaf[i];
            if (d == 0) continue;

            // term = |d| << i, split across the two limbs.
            const uint64_t mag = (uint64_t)(d < 0 ? -d : d);
            uint64_t t_lo = 0, t_hi = 0;
            if (i == 0) {
                t_lo = mag;
            } else if (i < 64) {
                t_lo = mag << i;
                t_hi = mag >> (64 - i);
            } else if (i < 128) {
                t_hi = mag << (i - 64);
            }

            if (d > 0) {
                const uint64_t lo = r0 + t_lo;
                r1 += t_hi + ((lo < r0) ? 1u : 0u);  // carry out of the low limb
                r0 = lo;
            } else {
                const uint64_t borrow = (r0 < t_lo) ? 1u : 0u;
                r0 -= t_lo;
                r1 -= t_hi + borrow;
            }
        }

        if (r0 != tc.s0 || r1 != tc.s1) {
            std::fprintf(stderr, " [FAIL] wNAF roundtrip for %s: "
                         "expected (%016llx,%016llx) got (%016llx,%016llx)\n",
                         tc.label,
                         (unsigned long long)tc.s0, (unsigned long long)tc.s1,
                         (unsigned long long)r0, (unsigned long long)r1);
            return 1;
        }
    }
    return 0;
}
// Test 2: GLV large scalar consistency via OpenCL library.
// Verifies k*G + G = (k+1)*G for three large scalars that stress the GLV path:
// - SCAN_KEY (256-bit random key, both GLV halves active)
// - 2^128 (decomposition boundary)
// - 0x5555... (alternating bit pattern, maximally stresses wNAF carry logic)
static int audit_glv_large_scalar() {
// Helper: hex string (big-endian) -> little-endian Scalar limbs
auto from_hex = [](const char* hex) -> Scalar {
Scalar s{};
std::string h(hex);
while (h.size() < 64) h = "0" + h;
for (int i = 0; i < 4; i++) {
uint64_t v = 0;
for (int j = 0; j < 16; j++) {
char c = h[(3 - i) * 16 + j];
int d = (c >= '0' && c <= '9') ? c - '0'
: (c >= 'a' && c <= 'f') ? c - 'a' + 10
: (c >= 'A' && c <= 'F') ? c - 'A' + 10 : 0;
v = (v << 4) | (uint64_t)d;
}
s.limbs[i] = v;
}
return s;
};
struct TC { Scalar k, kp1; const char* label; };
Scalar s_scan = from_hex("c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42");
Scalar s_scanp = from_hex("c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a43");
Scalar s_2_128 = {{0UL, 0UL, 1UL, 0UL}};
Scalar s_2_128p = {{1UL, 0UL, 1UL, 0UL}};
Scalar s_alt = {{0x5555555555555555ULL, 0x5555555555555555ULL,
0x5555555555555555ULL, 0x5555555555555555ULL}};
Scalar s_altp = {{0x5555555555555556ULL, 0x5555555555555555ULL,
0x5555555555555555ULL, 0x5555555555555555ULL}};
TC cases[] = {
{s_scan, s_scanp, "SCAN_KEY (256-bit)"},
{s_2_128, s_2_128p, "k = 2^128 (GLV boundary)"},
{s_alt, s_altp, "k = 0x5555... (alternating bits)"},
};
Scalar one = sc_from_u64(1);
JacobianPoint oneG = g_ctx->scalar_mul_generator(one);
for (auto& tc : cases) {
JacobianPoint kG = g_ctx->scalar_mul_generator(tc.k);
JacobianPoint kp1_a = g_ctx->point_add(kG, oneG); // k*G + G
JacobianPoint kp1_b = g_ctx->scalar_mul_generator(tc.kp1); // (k+1)*G
AffinePoint a = jacobian_to_affine(kp1_a);
AffinePoint b = jacobian_to_affine(kp1_b);
if (!fe_eq(a.x, b.x) || !fe_eq(a.y, b.y)) {
std::fprintf(stderr, " [FAIL] GLV %s: k*G+G != (k+1)*G\n", tc.label);
return 1;
}
}
return 0;
}
// Struct layout matching BIP352ScanKeyGlv in secp256k1_bip352.cl.
// Used by the BIP-352 kernel audit tests below.
// Host-side image of the scan plan consumed by the BIP-352 kernels.
// All members are single bytes, so the layout is fully determined by the
// member order; the byte offsets are noted next to each field.
struct alignas(1) AuditBIP352ScanKeyGlv {
    int8_t  wnaf1[130];  // offset   0: wNAF digits for the k1 half-scalar
    int8_t  wnaf2[130];  // offset 130: wNAF digits for the k2 half-scalar
    uint8_t k1_neg;      // offset 260: 1 if k1 is negative
    uint8_t flip_phi;    // offset 261: 1 if the phi table y should be negated
    uint8_t pad0;        // offset 262: padding
    uint8_t pad1;        // offset 263: padding
};
static_assert(sizeof(AuditBIP352ScanKeyGlv) == 264, "BIP352ScanKeyGlv size mismatch");
// Kernel-side AffinePoint and FieldElement layout (must match .cl struct).
// Four 64-bit limbs, limbs[0] least significant.
struct AuditFieldElement {
    uint64_t limbs[4];
};

// Affine point as a pair of field elements.
struct AuditAffinePoint {
    AuditFieldElement x;
    AuditFieldElement y;
};

// Returns the secp256k1 generator G with each coordinate stored as
// little-endian 64-bit limbs.
static AuditAffinePoint audit_generator_point() {
    static const uint64_t kGx[4] = {
        0x59F2815B16F81798ULL, 0x029BFCDB2DCE28D9ULL,
        0x55A06295CE870B07ULL, 0x79BE667EF9DCBBACULL,
    };
    static const uint64_t kGy[4] = {
        0x9C47D08FFB10D4B8ULL, 0xFD17B448A6855419ULL,
        0x5DA4FBFC0E1108A8ULL, 0x483ADA7726A3C465ULL,
    };

    AuditAffinePoint gen;
    for (int i = 0; i < 4; ++i) {
        gen.x.limbs[i] = kGx[i];
        gen.y.limbs[i] = kGy[i];
    }
    return gen;
}
// Test 3: BIP-352 kernel compiles without error.
static int audit_bip352_kernel_build() {
if (g_kernel_dir.empty()) return -1;
std::vector<std::string> seen;
std::string src = bip352_expand_kernel(g_kernel_dir + "/secp256k1_bip352.cl", seen);
if (src.empty()) return -1;
cl_context cl_ctx = (cl_context)g_ctx->native_context();
cl_command_queue cl_q = (cl_command_queue)g_ctx->native_queue();
cl_device_id cl_dev = nullptr;
clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr);
cl_int err;
const char* src_ptr = src.c_str();
size_t src_len = src.size();
cl_program prog = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
if (err != CL_SUCCESS) return 1;
err = clBuildProgram(prog, 1, &cl_dev, "-cl-std=CL1.2", nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t log_size = 0;
clGetProgramBuildInfo(prog, cl_dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_size);
std::string log(log_size, '\0');
clGetProgramBuildInfo(prog, cl_dev, CL_PROGRAM_BUILD_LOG, log_size, log.data(), nullptr);
std::fprintf(stderr, " BIP-352 build log:\n%s\n", log.c_str());
clReleaseProgram(prog);
return 2;
}
// Verify both kernel entry points exist
cl_kernel k_nolut = clCreateKernel(prog, "bip352_pipeline_kernel", &err);
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 3; }
cl_kernel k_lut = clCreateKernel(prog, "bip352_pipeline_kernel_lut", &err);
if (err != CL_SUCCESS) { clReleaseKernel(k_nolut); clReleaseProgram(prog); return 4; }
clReleaseKernel(k_nolut);
clReleaseKernel(k_lut);
clReleaseProgram(prog);
return 0;
}
// Test 4: Regression for CL_INVALID_COMMAND_QUEUE (-36) GPU fault.
// Runs bip352_pipeline_kernel (no-LUT path) with 1 work item and verifies no crash.
// The crash was caused by GPU private-memory overflow from int wnaf[130]×2 arrays.
// Fix: precompute wNAF on CPU (BIP352ScanKeyGlv.wnaf1/wnaf2), read from __constant.
// Three scan-key edge cases: k=1 (minimal), k from SCAN_KEY, k with all-15 wNAF digits.
static int audit_bip352_no_crash() {
if (g_kernel_dir.empty()) return -1;
std::vector<std::string> seen;
std::string src = bip352_expand_kernel(g_kernel_dir + "/secp256k1_bip352.cl", seen);
if (src.empty()) return -1;
cl_context cl_ctx = (cl_context)g_ctx->native_context();
cl_command_queue cl_q = (cl_command_queue)g_ctx->native_queue();
cl_device_id cl_dev = nullptr;
clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr);
cl_int err;
const char* src_ptr = src.c_str();
size_t src_len = src.size();
cl_program prog = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
if (err != CL_SUCCESS) return 1;
err = clBuildProgram(prog, 1, &cl_dev, "-cl-std=CL1.2 -cl-fast-relaxed-math", nullptr, nullptr);
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 2; }
cl_kernel kernel = clCreateKernel(prog, "bip352_pipeline_kernel", &err);
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 3; }
// Edge case scan keys to test. k1_neg/flip_phi chosen to exercise both paths.
struct EdgeCase {
const char* label;
int8_t wnaf1_0; // wnaf1[0] digit (rest 0)
int8_t wnaf2_0; // wnaf2[0] digit (rest 0)
uint8_t k1_neg, flip_phi;
};
static const EdgeCase edges[] = {
{"k=1 (minimal scalar)", 1, 0, 0, 0},
{"k1=15,k2=1 (max digit)", 15, 1, 0, 0},
{"k1_neg=1, flip_phi=1", 1, 1, 1, 1}, // negate path
};
AuditAffinePoint g_pt = audit_generator_point();
AuditAffinePoint spend_pt = g_pt; // spend = G for simplicity
// Pre-allocate buffers (reused across edge cases)
cl_mem d_tweaks = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY, sizeof(AuditAffinePoint), nullptr, &err);
cl_mem d_spend = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(AuditAffinePoint), &spend_pt, &err);
cl_mem d_scan = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY, sizeof(AuditBIP352ScanKeyGlv), nullptr, &err);
cl_mem d_prefixes = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY, sizeof(uint64_t), nullptr, &err);
cl_uint count = 1;
clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_tweaks);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_scan);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_spend);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_prefixes);
clSetKernelArg(kernel, 4, sizeof(cl_uint), &count);
int result = 0;
for (auto& ec : edges) {
// Build scan plan
AuditBIP352ScanKeyGlv plan{};
plan.wnaf1[0] = ec.wnaf1_0;
plan.wnaf2[0] = ec.wnaf2_0;
plan.k1_neg = ec.k1_neg;
plan.flip_phi = ec.flip_phi;
// Upload tweak=G and scan plan
clEnqueueWriteBuffer(cl_q, d_tweaks, CL_TRUE, 0, sizeof(AuditAffinePoint), &g_pt, 0, nullptr, nullptr);
clEnqueueWriteBuffer(cl_q, d_scan, CL_TRUE, 0, sizeof(AuditBIP352ScanKeyGlv), &plan, 0, nullptr, nullptr);
size_t global = 1, local = 1;
err = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr);
if (err != CL_SUCCESS) { result = 10; break; }
err = clFinish(cl_q);
if (err != CL_SUCCESS) {
// -36 = CL_INVALID_COMMAND_QUEUE = GPU fault (regression for the private-stack overflow crash)
std::fprintf(stderr, " [FAIL] bip352_no_crash edge='%s' clFinish error=%d"
" (expected 0; -36 = GPU fault regression)\n", ec.label, err);
result = 20 + err; // encode the OCL error
break;
}
uint64_t prefix = 0;
clEnqueueReadBuffer(cl_q, d_prefixes, CL_TRUE, 0, sizeof(uint64_t), &prefix, 0, nullptr, nullptr);
// prefix may be 0 if the point is infinity (edge case k1=0 path) — that's valid.
// What we really test is that we reach here without crashing.
}
clReleaseMemObject(d_tweaks);
clReleaseMemObject(d_scan);
clReleaseMemObject(d_spend);
clReleaseMemObject(d_prefixes);
clReleaseKernel(kernel);
clReleaseProgram(prog);
return result;
}
// Test 5: BIP-352 pipeline output matches expected prefix for known input.
// Uses tweak=G, scan_key=SCAN_KEY. Expected prefix pre-computed by the CPU
// validation path in bench_bip352_opencl (validation: 0xb63b4601066a6971
// is the last-item prefix when batch=10000; for single item with tweak=G
// and k=SCAN_KEY this is independently computed below).
static int audit_bip352_correct() {
if (g_kernel_dir.empty()) return -1;
std::vector<std::string> seen;
std::string src = bip352_expand_kernel(g_kernel_dir + "/secp256k1_bip352.cl", seen);
if (src.empty()) return -1;
cl_context cl_ctx = (cl_context)g_ctx->native_context();
cl_command_queue cl_q = (cl_command_queue)g_ctx->native_queue();
cl_device_id cl_dev = nullptr;
clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr);
cl_int err;
const char* src_ptr = src.c_str();
size_t src_len = src.size();
cl_program prog = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
if (err != CL_SUCCESS) return 1;
err = clBuildProgram(prog, 1, &cl_dev, "-cl-std=CL1.2 -cl-fast-relaxed-math", nullptr, nullptr);
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 2; }
cl_kernel kernel = clCreateKernel(prog, "bip352_pipeline_kernel", &err);
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 3; }
// Build BIP352ScanKeyGlv for SCAN_KEY using the host wNAF encoder.
// SCAN_KEY = c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42
// GLV decomposition (pre-computed, matches bench_bip352_opencl):
// k1 (LE64): {0x5db6fc2bc78a0e07, 0x7fff7d82be8fb40f, 0, 0} k1_neg=0
// k2 (LE64): {0x62491d65b0efea74, 0x3ca3a038cb4bac36, 0, 0} flip_phi=0
// (These are the GLV halves as output by secp256k1::fast::glv_decompose)
// We use the benchmark's own scan_key encoding to stay in sync; here we use
// the actual k1/k2 from a one-time CPU run of build_scan_glv_plan().
// Instead of hard-coding the decomposition (which requires CPU GLV logic),
// we test consistency: run 2 items (tweak=G), compare both give the same prefix.
// A truly independent correctness check is in bench_bip352_opencl --batch 1 --local 1.
// For this audit: run 2 identical tweaks, check both prefixes are equal (determinism).
AuditBIP352ScanKeyGlv plan{};
// k1=1, k2=0 (simplest: scan*tweak = 1*G = G for any decomposition where k1=1, k2=0)
plan.wnaf1[0] = 1;
plan.k1_neg = 0;
plan.flip_phi = 0;
AuditAffinePoint g_pt = audit_generator_point();
AuditAffinePoint spend_pt = g_pt;
AuditAffinePoint tweaks[2] = {g_pt, g_pt}; // same tweak twice
cl_mem d_tweaks = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
2 * sizeof(AuditAffinePoint), tweaks, &err);
cl_mem d_scan = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(AuditBIP352ScanKeyGlv), &plan, &err);
cl_mem d_spend = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(AuditAffinePoint), &spend_pt, &err);
cl_mem d_prefixes = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY, 2 * sizeof(uint64_t), nullptr, &err);
cl_uint count = 2;
clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_tweaks);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_scan);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_spend);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_prefixes);
clSetKernelArg(kernel, 4, sizeof(cl_uint), &count);
size_t global = 2, local = 1;
err = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr);
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 4; }
err = clFinish(cl_q);
if (err != CL_SUCCESS) {
std::fprintf(stderr, " [FAIL] bip352_correct: clFinish error=%d\n", err);
clReleaseProgram(prog); return 5;
}
uint64_t prefixes[2] = {};
clEnqueueReadBuffer(cl_q, d_prefixes, CL_TRUE, 0, 2 * sizeof(uint64_t), prefixes, 0, nullptr, nullptr);
int result = 0;
// Both items have identical input so must produce identical prefix (determinism test)
if (prefixes[0] != prefixes[1]) {
std::fprintf(stderr, " [FAIL] bip352_correct: non-deterministic output:"
" item[0]=0x%016llx item[1]=0x%016llx\n",
(unsigned long long)prefixes[0], (unsigned long long)prefixes[1]);
result = 6;
}
// Prefix must be non-zero (1*G = G is not the point at infinity)
if (prefixes[0] == 0) {
std::fprintf(stderr, " [FAIL] bip352_correct: prefix=0 (unexpected infinity)\n");
result = 7;
}
clReleaseMemObject(d_tweaks);
clReleaseMemObject(d_scan);
clReleaseMemObject(d_spend);
clReleaseMemObject(d_prefixes);
clReleaseKernel(kernel);
clReleaseProgram(prog);
return result;
}
// =============================================================================
// Module & Section Registry
// =============================================================================
@ -781,6 +1202,7 @@ static const OclSectionInfo OCL_SECTIONS[] = {
{ "protocol_security", "Protocol Security (multi-key)" },
{ "fuzzing", "Fuzzing & Adversarial Inputs" },
{ "performance", "Performance Smoke Tests" },
{ "bip352_glv", "BIP-352 Silent Payments & GLV Correctness" },
};
static constexpr int NUM_OCL_SECTIONS = sizeof(OCL_SECTIONS) / sizeof(OCL_SECTIONS[0]);
@ -827,6 +1249,13 @@ static const OclAuditModule OCL_MODULES[] = {
// Section 8: Performance Smoke
{ "perf_ecdsa_50", "ECDSA 50-iteration stress", "performance", audit_perf_ecdsa_stress, false },
{ "perf_schnorr_25", "Schnorr 25-iteration stress", "performance", audit_perf_schnorr_stress, false },
// Section 9: BIP-352 Silent Payments & GLV Correctness
{ "glv_wnaf_rt", "CPU wNAF encode/decode roundtrip (8 scalars)", "bip352_glv", audit_glv_wnaf_roundtrip, false },
{ "glv_large_k", "GLV large scalar k*G+G=(k+1)*G (3 scalars)", "bip352_glv", audit_glv_large_scalar, false },
{ "bip352_build", "BIP-352 kernel compiles (both entry points)", "bip352_glv", audit_bip352_kernel_build, false },
{ "bip352_nocrash", "BIP-352 no GPU fault: -36 crash regression (3 edge cases)", "bip352_glv", audit_bip352_no_crash, false },
{ "bip352_correct", "BIP-352 pipeline determinism (2 identical tweaks)", "bip352_glv", audit_bip352_correct, false },
};
static constexpr int NUM_OCL_MODULES = sizeof(OCL_MODULES) / sizeof(OCL_MODULES[0]);
@ -1080,6 +1509,7 @@ int main(int argc, char* argv[]) {
auto dev = detect_ocl_device(*g_ctx);
// Try to init extended kernels
g_kernel_dir = kernel_dir; // make available to audit modules
if (!kernel_dir.empty()) {
g_ext.init(*g_ctx, kernel_dir);
}

View File

@ -383,9 +383,9 @@ bool Context::Impl::init(const DeviceConfig& cfg) {
return true;
}
// Embedded kernel source (will be generated by CMake)
// For now, include a minimal version
static const char* kernel_source = R"KERNEL(
// Embedded kernel source — split into separate array entries so that
// no single string literal exceeds MSVC's 65535-byte C2026 limit.
static const char* const kernel_parts[] = { R"KERNEL(
// =============================================================================
// Secp256k1 OpenCL Kernels - Embedded Version
// =============================================================================
@ -635,6 +635,18 @@ inline void field_sqr_n_impl(FieldElement* r, int n) {
for (int i = 0; i < n; i++) field_sqr_impl(r, r);
}
// Returns 1 when all four limbs of *a are zero, 0 otherwise.
inline int field_is_zero_impl(const FieldElement* a) {
    // OR the limbs together: the result is zero only if no bit is set anywhere.
    ulong bits = a->limbs[0];
    bits |= a->limbs[1];
    bits |= a->limbs[2];
    bits |= a->limbs[3];
    return bits == 0;
}
// Sets *a to 0 by clearing all four limbs.
inline void field_set_zero_impl(FieldElement* a) {
    for (int i = 0; i < 4; ++i) {
        a->limbs[i] = 0;
    }
}
// Sets *a to 1: lowest limb is 1, the remaining limbs are 0.
inline void field_set_one_impl(FieldElement* a) {
    for (int i = 3; i > 0; --i) {
        a->limbs[i] = 0;
    }
    a->limbs[0] = 1;
}
inline void field_inv_impl(FieldElement* r, const FieldElement* a) {
FieldElement x2,x3,x6,x12,x24,x48,x96,x192,x7,x31,x223,x5,x11,x22,t;
field_sqr_impl(&x2, a); field_mul_impl(&x2, &x2, a);
@ -687,12 +699,61 @@ __kernel void field_sqr(__global const FieldElement* a, __global FieldElement* r
}
// Batch field inversion: r[i] = a[i]^-1 for i < count, with r[i] = 0 where a[i] == 0.
// Each work-group stages its inputs in __local memory, then work-item 0 performs
// one field_inv_impl for the whole group (prefix products, a single inversion,
// then a backward sweep). Groups larger than BATCH_INV_LOCAL_MAX fall back to
// one inversion per work-item.
// Every work-item of a group reaches both barriers, including those with
// gid >= count in a partially filled last group; such items only skip the
// global loads and stores.
// NOTE(review): group_start is derived from get_group_id(), so this assumes the
// kernel is enqueued without a global work offset -- confirm against the host code.
__kernel void field_inv(__global const FieldElement* a, __global FieldElement* r, uint count) {
    #define BATCH_INV_LOCAL_MAX 256
    __local FieldElement local_vals[BATCH_INV_LOCAL_MAX];
    __local FieldElement local_prefix[BATCH_INV_LOCAL_MAX];
    __local FieldElement local_invs[BATCH_INV_LOCAL_MAX];
    __local uint local_nonzero[BATCH_INV_LOCAL_MAX];

    uint gid = get_global_id(0);
    uint lid = get_local_id(0);
    uint lsize = get_local_size(0);
    uint group_start = get_group_id(0) * lsize;
    // Number of in-range elements handled by this group.
    uint active = (group_start < count) ? min(lsize, count - group_start) : 0;
    const int in_range = (gid < count);

    // Uniform across the group (depends only on lsize), so returning here
    // cannot split the group around a barrier.
    if (lsize > BATCH_INV_LOCAL_MAX) {
        if (in_range) {
            FieldElement a_local = a[gid];
            FieldElement res; field_inv_impl(&res, &a_local); r[gid] = res;
        }
        return;
    }

    if (in_range) {
        FieldElement v = a[gid];
        uint nz = field_is_zero_impl(&v) ? 0U : 1U;
        // Zero inputs are staged as 1 so they never enter the running product.
        if (!nz) field_set_one_impl(&v);
        local_nonzero[lid] = nz;
        local_vals[lid] = v;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid == 0 && active != 0) {
        // Forward pass: local_prefix[i] = product of the non-zero inputs before i.
        FieldElement acc;
        field_set_one_impl(&acc);
        for (uint i = 0; i < active; ++i) {
            local_prefix[i] = acc;
            if (local_nonzero[i]) { FieldElement _t = local_vals[i]; field_mul_impl(&acc, &acc, &_t); }
        }
        // Single inversion of the product of all non-zero inputs.
        field_inv_impl(&acc, &acc);
        // Backward pass: peel one element off per step.
        for (int i = (int)active - 1; i >= 0; --i) {
            if (local_nonzero[i]) {
                FieldElement inv_i;
                { FieldElement _t = local_prefix[i]; field_mul_impl(&inv_i, &acc, &_t); }
                local_invs[i] = inv_i;
                { FieldElement _t = local_vals[i]; field_mul_impl(&acc, &acc, &_t); }
            } else {
                FieldElement _t; field_set_zero_impl(&_t); local_invs[i] = _t;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (in_range) {
        r[gid] = local_invs[lid];
    }
}
)KERNEL"
)KERNEL",
// ---- second segment (point operations + scalar mul + batch) ----
R"KERNEL(
@ -961,6 +1022,11 @@ inline void scalar_mul_mod_n_cl(const Scalar* a, const Scalar* b, Scalar* r) {
scalar_cond_sub_n_cl(r); scalar_cond_sub_n_cl(r); scalar_cond_sub_n_cl(r);
}
)KERNEL",
// ---- third segment (scalar utilities + GLV + point operations) ----
R"KERNEL(
// Scalar bit length (uses clz intrinsic -- single instruction on GPU)
inline int scalar_bitlen_cl(const Scalar* s) {
for (int i = 3; i >= 0; i--) {
@ -1031,41 +1097,189 @@ inline void glv_decompose_cl(const Scalar* k, Scalar* k1, Scalar* k2, int* k1_ne
*k1_neg = k1_is_neg; *k2_neg = k2_is_neg;
}
// GLV + interleaved binary scalar multiplication: k*P
// GPU-optimized: NO tables, two affine bases, mixed additions, minimal registers (~92)
// SIMT-aware: two independent if-blocks (not else-if) for optimal warp divergence
// Lifts affine point `a` into Jacobian form in `j`: coordinates are copied, Z is set to the
// limb pattern {1, 0, 0, 0} and the infinity flag is cleared.
inline void point_from_affine(JacobianPoint* j, const AffinePoint* a) {
    j->infinity = 0;
    j->z.limbs[0] = 1UL;
    for (int limb = 1; limb < 4; ++limb) {
        j->z.limbs[limb] = 0UL;
    }
    j->y = a->y;
    j->x = a->x;
}
// Mixed addition r = p + q (p Jacobian, q affine) that also reports a Z-ratio through h_out.
//
// General path: *h_out = 2*H with H = q.x*Z1^2 - X1. Z3 is computed below as
// (Z1 + H)^2 - Z1^2 - H^2, which expands to 2*Z1*H, so *h_out equals Z3 / Z1.
// Early-return paths (p at infinity, p == q, p == -q) leave *h_out at the value 1 that is
// written on entry.
// On the general path r is only written in the final statement, after every read of p,
// so passing r == p is fine there.
inline void point_add_mixed_h_impl(JacobianPoint* r, const JacobianPoint* p,
const AffinePoint* q, FieldElement* h_out) {
// Default ratio = 1 (limbs {1,0,0,0}); overwritten only on the general path.
h_out->limbs[0] = 1UL; h_out->limbs[1] = 0UL; h_out->limbs[2] = 0UL; h_out->limbs[3] = 0UL;
// infinity + q = q
if (point_is_infinity(p)) { point_from_affine(r, q); return; }
FieldElement Z1Z1, U2, S2, H, HH, I, J, rr, V, X3, Y3, Z3, t1, t2;
// Z1Z1 = Z1^2, U2 = q.x * Z1^2, S2 = q.y * Z1^3
field_sqr_impl(&Z1Z1, &p->z);
field_mul_impl(&U2, &q->x, &Z1Z1);
field_mul_impl(&t1, &q->y, &p->z);
field_mul_impl(&S2, &t1, &Z1Z1);
// H = U2 - X1
field_sub_impl(&H, &U2, &p->x);
// H == 0 means both points share an x-coordinate. The test ORs the raw limbs together.
// NOTE(review): assumes field_sub_impl returns a fully reduced value so that zero has a
// unique limb pattern -- confirm.
if ((H.limbs[0] | H.limbs[1] | H.limbs[2] | H.limbs[3]) == 0) {
field_sub_impl(&t1, &S2, &p->y);
// Same y as well: p == q, fall back to doubling.
if ((t1.limbs[0] | t1.limbs[1] | t1.limbs[2] | t1.limbs[3]) == 0)
{ point_double_impl(r, p); return; }
// Same x, different y: p == -q, the sum is the point at infinity.
point_set_infinity(r); return;
}
// Report 2*H (== Z3 / Z1, see header).
field_add_impl(h_out, &H, &H);
// HH = H^2, I = 4*HH, J = H*I
field_sqr_impl(&HH, &H);
field_add_impl(&I, &HH, &HH); field_add_impl(&I, &I, &I);
field_mul_impl(&J, &H, &I);
// rr = 2*(S2 - Y1), V = X1*I
field_sub_impl(&rr, &S2, &p->y); field_add_impl(&rr, &rr, &rr);
field_mul_impl(&V, &p->x, &I);
// X3 = rr^2 - J - 2*V
field_sqr_impl(&X3, &rr);
field_sub_impl(&X3, &X3, &J);
field_add_impl(&t1, &V, &V); field_sub_impl(&X3, &X3, &t1);
// Y3 = rr*(V - X3) - 2*Y1*J
field_sub_impl(&t1, &V, &X3); field_mul_impl(&Y3, &rr, &t1);
field_mul_impl(&t2, &p->y, &J); field_add_impl(&t2, &t2, &t2);
field_sub_impl(&Y3, &Y3, &t2);
// Z3 = (Z1 + H)^2 - Z1Z1 - HH
field_add_impl(&t1, &p->z, &H); field_sqr_impl(&Z3, &t1);
field_sub_impl(&Z3, &Z3, &Z1Z1); field_sub_impl(&Z3, &Z3, &HH);
r->x = X3; r->y = Y3; r->z = Z3; r->infinity = 0;
}
// Fills table[0..7] starting from `base` and repeatedly adding 2*base, and returns an extra
// Z factor in *globalz.
//
// The entries are stored as bare (x, y) pairs that have all been rescaled to one shared Z;
// they are NOT plain affine coordinates of the original curve points. A result accumulated
// from these entries must have its Z multiplied by *globalz afterwards (scalar_mul_glv_cl
// does exactly that).
// NOTE(review): entry i presumably represents (2*i + 1)*base, i.e. the odd multiples
// needed by a width-5 wNAF -- confirm against the digit-to-index mapping in the caller.
inline void build_wnaf_table_zr_cl(const AffinePoint* base, AffinePoint table[8], FieldElement* globalz) {
JacobianPoint base_jac;
point_from_affine(&base_jac, base);
// doubled = 2*base in Jacobian form; c is its Z coordinate.
JacobianPoint doubled;
point_double_impl(&doubled, &base_jac);
FieldElement c = doubled.z;
FieldElement c2, c3;
field_sqr_impl(&c2, &c);
field_mul_impl(&c3, &c2, &c);
// Use the X/Y of `doubled` as if they were affine coordinates (its Z is dropped here and
// accounted for through c / globalz).
AffinePoint doubled_affine;
doubled_affine.x = doubled.x;
doubled_affine.y = doubled.y;
// Starting point: base rescaled by c (x * c^2, y * c^3) with Z set to 1.
JacobianPoint accum;
field_mul_impl(&accum.x, &base->x, &c2);
field_mul_impl(&accum.y, &base->y, &c3);
accum.z.limbs[0] = 1UL; accum.z.limbs[1] = 0UL; accum.z.limbs[2] = 0UL; accum.z.limbs[3] = 0UL;
accum.infinity = 0;
table[0].x = accum.x;
table[0].y = accum.y;
// zr[i] receives the value reported by point_add_mixed_h_impl for step i (Z_i / Z_{i-1}
// on its general path). zr[0] is stored but never read below.
FieldElement zr[8];
zr[0] = c;
for (int i = 1; i < 8; ++i) {
FieldElement h;
point_add_mixed_h_impl(&accum, &accum, &doubled_affine, &h);
// Only X and Y are kept; each entry's own Z is folded in by the backward pass below.
table[i].x = accum.x;
table[i].y = accum.y;
zr[i] = h;
}
// Shared Z of the finished table: final accumulator Z times c.
field_mul_impl(globalz, &accum.z, &c);
// Backward pass: zs accumulates zr[7] * zr[6] * ... * zr[idx + 1], and entry idx is
// rescaled by (zs^2, zs^3). table[7] is left untouched.
FieldElement zs = zr[7];
for (int idx = 6; idx >= 0; --idx) {
if (idx != 6) {
FieldElement tmp;
field_mul_impl(&tmp, &zs, &zr[idx + 1]);
zs = tmp;
}
FieldElement zs2, zs3;
field_sqr_impl(&zs2, &zs);
field_mul_impl(&zs3, &zs2, &zs);
// Results go through temporaries before being written back into the table.
FieldElement tx, ty;
field_mul_impl(&tx, &table[idx].x, &zs2);
field_mul_impl(&ty, &table[idx].y, &zs3);
table[idx].x = tx;
table[idx].y = ty;
}
}
// Derives the endomorphism companion of an 8-entry point table: every x is multiplied by
// the GLV beta constant; y is copied unchanged, or negated when `negate_y` is non-zero.
inline void derive_endo_table_cl(const AffinePoint table[8], AffinePoint endo_table[8], int negate_y) {
    FieldElement beta;
    beta.limbs[0] = GLV_BETA0;
    beta.limbs[1] = GLV_BETA1;
    beta.limbs[2] = GLV_BETA2;
    beta.limbs[3] = GLV_BETA3;

    for (int slot = 0; slot < 8; ++slot) {
        field_mul_impl(&endo_table[slot].x, &table[slot].x, &beta);
        if (!negate_y) {
            endo_table[slot].y = table[slot].y;
            continue;
        }
        field_neg_impl(&endo_table[slot].y, &table[slot].y);
    }
}
// Recodes scalar `k` into 130 signed digits, least significant first.
// Whenever the running value is odd, its low 5 bits are taken as the digit; values >= 16
// are mapped to the negative digit (value - 32). The digit is then subtracted from the
// running value (a negative digit therefore adds, with carry) before shifting right by one.
// Even positions get digit 0.
static inline void scalar_to_wnaf(const Scalar* k, int wnaf[130]) {
    ulong w[4];
    for (int limb = 0; limb < 4; limb++) {
        w[limb] = k->limbs[limb];
    }

    for (int pos = 0; pos < 130; pos++) {
        int digit = 0;
        if ((w[0] & 1UL) != 0UL) {
            const ulong before = w[0];
            digit = (int)(before & 0x1FUL);
            if (digit < 16) {
                // Positive digit: subtract it, propagating a borrow if limb 0 wrapped.
                w[0] = before - (ulong)digit;
                if (w[0] > before) {
                    for (int limb = 1; limb < 4; limb++) {
                        const ulong old = w[limb];
                        w[limb] = old - 1UL;
                        if (old != 0UL) break;
                    }
                }
            } else {
                // Negative digit: add its magnitude, propagating a carry if limb 0 wrapped.
                digit -= 32;
                w[0] = before + (ulong)(-digit);
                if (w[0] < before) {
                    for (int limb = 1; limb < 4; limb++) {
                        w[limb] += 1UL;
                        if (w[limb] != 0UL) break;
                    }
                }
            }
        }
        wnaf[pos] = digit;

        // 256-bit logical shift right by one bit.
        w[0] = (w[0] >> 1) | (w[1] << 63);
        w[1] = (w[1] >> 1) | (w[2] << 63);
        w[2] = (w[2] >> 1) | (w[3] << 63);
        w[3] >>= 1;
    }
}
// GLV + interleaved wNAF scalar multiplication: r = k * base.
//
// Steps:
//   1. Split k into two half-scalars (k1, k2) with sign flags via glv_decompose_cl.
//   2. Build an 8-entry table from (+/-)base with build_wnaf_table_zr_cl, and its
//      endomorphism companion with derive_endo_table_cl. The table entries share one
//      implicit Z (globalz), so they are used like affine points during accumulation.
//   3. Recode both half-scalars with scalar_to_wnaf and run one shared double-and-add loop
//      over all 130 digit positions.
//   4. Multiply the accumulator's Z by globalz to map the result back to the real curve.
//
// Only the table-based wNAF path is used; the earlier bit-by-bit additions of P / phi(P)
// must not be mixed in, since those points are not rescaled to the table's shared Z and
// would add every scalar contribution a second time.
inline void scalar_mul_glv_cl(JacobianPoint* r, const Scalar* k, const AffinePoint* base) {
    if (scalar_is_zero_cl(k)) { point_set_infinity(r); return; }

    Scalar k1, k2;
    int k1_neg, k2_neg;
    glv_decompose_cl(k, &k1, &k2, &k1_neg, &k2_neg);

    // Fold the sign of k1 into the base point so table digits can be used as-is.
    AffinePoint P = *base;
    if (k1_neg) field_neg_impl(&P.y, &P.y);

    AffinePoint table[8];
    FieldElement globalz;
    build_wnaf_table_zr_cl(&P, table, &globalz);

    // The companion table needs an extra y negation only when the two signs differ,
    // because the sign of k1 is already baked into `table`.
    AffinePoint endo_table[8];
    derive_endo_table_cl(table, endo_table, k1_neg != k2_neg);

    int wnaf1[130] = {0};
    int wnaf2[130] = {0};
    scalar_to_wnaf(&k1, wnaf1);
    scalar_to_wnaf(&k2, wnaf2);

    point_set_infinity(r);
    // Fixed trip count: leading zero digits cost nothing because doubling is skipped while
    // the accumulator is still the point at infinity.
    for (int i = 129; i >= 0; --i) {
        if (!point_is_infinity(r)) point_double_impl(r, r);

        int d1 = wnaf1[i];
        if (d1 != 0) {
            // Odd digit d maps to table slot (|d| - 1) / 2; negative digits negate y.
            int idx = (((d1 > 0) ? d1 : -d1) - 1) >> 1;
            AffinePoint pt = table[idx];
            if (d1 < 0) field_neg_impl(&pt.y, &pt.y);
            if (point_is_infinity(r)) point_from_affine(r, &pt);
            else point_add_mixed_impl(r, r, &pt);
        }

        int d2 = wnaf2[i];
        if (d2 != 0) {
            int idx = (((d2 > 0) ? d2 : -d2) - 1) >> 1;
            AffinePoint pt = endo_table[idx];
            if (d2 < 0) field_neg_impl(&pt.y, &pt.y);
            if (point_is_infinity(r)) point_from_affine(r, &pt);
            else point_add_mixed_impl(r, r, &pt);
        }
    }

    // Undo the shared-Z trick: the true Z is the accumulated Z times globalz.
    if (!point_is_infinity(r)) {
        FieldElement corrected_z;
        field_mul_impl(&corrected_z, &r->z, &globalz);
        r->z = corrected_z;
    }
}
@ -1125,40 +1339,41 @@ inline int get_window_4bit(const Scalar* s, int pos) {
return (int)(v & 0xFUL);
}
// Fixed-base multiplication r = k * G using the GLV split and two 4-bit window tables.
//
// k is decomposed into (k1, k2) with sign flags. The loop walks 32 windows of 4 bits from
// the most significant one down, performing four doublings per window (skipped while the
// accumulator is still infinity) followed by at most one table addition per half-scalar.
// GENERATOR_TABLE_NIBBLE / GENERATOR_TABLE_NIBBLE_PHI are indexed by the window value; a set
// sign flag negates the y of the selected entry.
//
// NOTE(review): 32 windows cover bits 0..127 only. This assumes glv_decompose_cl always
// returns |k1|, |k2| < 2^128 -- confirm, otherwise bit 128 would be dropped silently.
inline void scalar_mul_generator_glv_impl(JacobianPoint* r, const Scalar* k) {
    if ((k->limbs[0]|k->limbs[1]|k->limbs[2]|k->limbs[3]) == 0) {
        point_set_infinity(r);
        return;
    }

    Scalar k1, k2;
    int k1_neg, k2_neg;
    glv_decompose_cl(k, &k1, &k2, &k1_neg, &k2_neg);

    point_set_infinity(r);
    for (int w = 31; w >= 0; --w) {
        // Shift the accumulator left by one window (x16).
        if (!point_is_infinity(r)) {
            point_double_impl(r, r); point_double_impl(r, r);
            point_double_impl(r, r); point_double_impl(r, r);
        }

        int w1 = get_window_4bit(&k1, w);
        if (w1) {
            AffinePoint pt = GENERATOR_TABLE_NIBBLE[w1];
            if (k1_neg) field_neg_impl(&pt.y, &pt.y);
            point_add_mixed_impl(r, r, &pt);
        }

        int w2 = get_window_4bit(&k2, w);
        if (w2) {
            AffinePoint pt = GENERATOR_TABLE_NIBBLE_PHI[w2];
            if (k2_neg) field_neg_impl(&pt.y, &pt.y);
            point_add_mixed_impl(r, r, &pt);
        }
    }
}

// Kernel entry point: results[gid] = scalars[gid] * G for every gid < count.
// Operands are copied to private memory first because the helper takes private pointers.
__kernel void scalar_mul_generator(__global const Scalar* scalars, __global JacobianPoint* results, uint count) {
    uint gid = get_global_id(0);
    if (gid >= count) return;

    Scalar k = scalars[gid];
    JacobianPoint R;
    scalar_mul_generator_glv_impl(&R, &k);
    results[gid] = R;
}
@ -1278,12 +1493,62 @@ __kernel void affine_add(
__global FieldElement* rx, __global FieldElement* ry,
const uint count
) {
#define BATCH_INV_LOCAL_MAX 256
__local FieldElement local_h[BATCH_INV_LOCAL_MAX];
__local FieldElement local_prefix[BATCH_INV_LOCAL_MAX];
__local FieldElement local_h_inv[BATCH_INV_LOCAL_MAX];
__local uint local_nonzero[BATCH_INV_LOCAL_MAX];
uint gid = get_global_id(0);
uint lid = get_local_id(0);
uint lsize = get_local_size(0);
uint group_start = get_group_id(0) * lsize;
uint active = (group_start < count) ? min(lsize, count - group_start) : 0;
if (gid >= count) return;
FieldElement lpx = px[gid], lpy = py[gid];
FieldElement lqx = qx[gid], lqy = qy[gid];
if (lsize > BATCH_INV_LOCAL_MAX) {
AffinePoint r;
affine_add_impl(&r, &lpx, &lpy, &lqx, &lqy);
rx[gid] = r.x;
ry[gid] = r.y;
return;
}
{ FieldElement _t; field_sub_impl(&_t, &lqx, &lpx); local_h[lid] = _t; }
{ FieldElement _t = local_h[lid]; local_nonzero[lid] = field_is_zero_impl(&_t) ? 0U : 1U; }
if (!local_nonzero[lid]) { FieldElement _t; field_set_one_impl(&_t); local_h[lid] = _t; }
barrier(CLK_LOCAL_MEM_FENCE);
if (lid == 0) {
FieldElement acc;
field_set_one_impl(&acc);
for (uint i = 0; i < active; ++i) {
local_prefix[i] = acc;
if (local_nonzero[i]) { FieldElement _t = local_h[i]; field_mul_impl(&acc, &acc, &_t); }
}
field_inv_impl(&acc, &acc);
for (int i = (int)active - 1; i >= 0; --i) {
if (local_nonzero[i]) {
FieldElement inv_i;
{ FieldElement _t = local_prefix[i]; field_mul_impl(&inv_i, &acc, &_t); }
local_h_inv[i] = inv_i;
{ FieldElement _t = local_h[i]; field_mul_impl(&acc, &acc, &_t); }
} else {
FieldElement _t; field_set_zero_impl(&_t); local_h_inv[i] = _t;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
AffinePoint r;
affine_add_impl(&r, &lpx, &lpy, &lqx, &lqy);
{ FieldElement _hinv = local_h_inv[lid]; affine_add_lambda_impl(&r, &lpx, &lpy, &lqx, &lqy, &_hinv); }
rx[gid] = r.x;
ry[gid] = r.y;
}
@ -1330,24 +1595,80 @@ __kernel void jacobian_to_affine(
__global FieldElement* ax, __global FieldElement* ay,
const uint count
) {
#define BATCH_INV_LOCAL_MAX 256
__local FieldElement local_z[BATCH_INV_LOCAL_MAX];
__local FieldElement local_prefix[BATCH_INV_LOCAL_MAX];
__local FieldElement local_z_inv[BATCH_INV_LOCAL_MAX];
__local uint local_nonzero[BATCH_INV_LOCAL_MAX];
uint gid = get_global_id(0);
uint lid = get_local_id(0);
uint lsize = get_local_size(0);
uint group_start = get_group_id(0) * lsize;
uint active = (group_start < count) ? min(lsize, count - group_start) : 0;
if (gid >= count) return;
FieldElement lx = jx[gid], ly = jy[gid], lz = jz[gid];
AffinePoint r;
jacobian_to_affine_convert_impl(&r, &lx, &ly, &lz);
ax[gid] = r.x;
ay[gid] = r.y;
if (lsize > BATCH_INV_LOCAL_MAX) {
AffinePoint r;
jacobian_to_affine_convert_impl(&r, &lx, &ly, &lz);
ax[gid] = r.x;
ay[gid] = r.y;
return;
}
local_z[lid] = lz;
{ FieldElement _t = local_z[lid]; local_nonzero[lid] = field_is_zero_impl(&_t) ? 0U : 1U; }
if (!local_nonzero[lid]) { FieldElement _t; field_set_one_impl(&_t); local_z[lid] = _t; }
barrier(CLK_LOCAL_MEM_FENCE);
if (lid == 0) {
FieldElement acc;
field_set_one_impl(&acc);
for (uint i = 0; i < active; ++i) {
local_prefix[i] = acc;
if (local_nonzero[i]) { FieldElement _t = local_z[i]; field_mul_impl(&acc, &acc, &_t); }
}
field_inv_impl(&acc, &acc);
for (int i = (int)active - 1; i >= 0; --i) {
if (local_nonzero[i]) {
FieldElement inv_i;
{ FieldElement _t = local_prefix[i]; field_mul_impl(&inv_i, &acc, &_t); }
local_z_inv[i] = inv_i;
{ FieldElement _t = local_z[i]; field_mul_impl(&acc, &acc, &_t); }
} else {
FieldElement _t; field_set_zero_impl(&_t); local_z_inv[i] = _t;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
FieldElement z_inv2, z_inv3;
{ FieldElement _t = local_z_inv[lid]; field_sqr_impl(&z_inv2, &_t); }
{ FieldElement _t = local_z_inv[lid]; field_mul_impl(&z_inv3, &z_inv2, &_t); }
{ FieldElement _ax; field_mul_impl(&_ax, &lx, &z_inv2); ax[gid] = _ax; }
{ FieldElement _ay; field_mul_impl(&_ay, &ly, &z_inv3); ay[gid] = _ay; }
}
)KERNEL";
)KERNEL" };
bool Context::Impl::build_program() {
cl_int err;
// Create program from source
const char* sources[] = {kernel_source};
std::size_t lengths[] = {std::strlen(kernel_source)};
// Create program from source (multiple parts avoid MSVC C2026 limit)
constexpr cl_uint num_parts = sizeof(kernel_parts) / sizeof(kernel_parts[0]);
const char* sources[num_parts];
std::size_t lengths[num_parts];
for (cl_uint i = 0; i < num_parts; ++i) {
sources[i] = kernel_parts[i];
lengths[i] = std::strlen(kernel_parts[i]);
}
program = clCreateProgramWithSource(context, 1, sources, lengths, &err);
program = clCreateProgramWithSource(context, num_parts, sources, lengths, &err);
if (err != CL_SUCCESS) {
last_error = std::string("Failed to create program: ") + cl_error_string(err);
return false;
@ -1730,6 +2051,14 @@ static void compute_work_sizes(std::size_t count, std::size_t max_wg, std::size_
global = ((count + local - 1) / local) * local;
}
// Chooses the NDRange sizes for the scalar-multiplication kernels.
//
//   count            number of elements to process
//   requested_local  user-configured work-group size; 0 means "pick automatically"
//   auto_local       work-group size used for the automatic choice
//   max_wg           device limit for the work-group size
//   local  [out]     selected work-group size: min(requested or auto, max_wg), never 0
//   global [out]     count rounded up to the next multiple of `local` (0 when count == 0)
//
// `local` is clamped to at least 1 so a zero device limit or zero auto size (e.g. a failed
// device query) cannot cause a division by zero below.
static void compute_scalar_mul_work_sizes(std::size_t count, std::size_t requested_local,
                                          std::size_t auto_local, std::size_t max_wg, std::size_t& local,
                                          std::size_t& global) {
    const std::size_t wanted = (requested_local == 0) ? auto_local : requested_local;
    local = std::max<std::size_t>(1, std::min(wanted, max_wg));

    // Ceil-divide without forming `count + local - 1`, which could wrap for huge counts.
    const std::size_t groups = count / local + ((count % local != 0) ? 1 : 0);
    global = groups * local;
}
void Context::batch_field_add(const FieldElement* a, const FieldElement* b,
FieldElement* results, std::size_t count) {
if (count == 0) return;
@ -1882,7 +2211,6 @@ void Context::batch_scalar_mul_generator(const Scalar* scalars, JacobianPoint* r
impl_->cache_smg_count = count;
}
// Upload scalars (async)
clEnqueueWriteBuffer(impl_->queue, impl_->cache_smg_scalars, CL_FALSE, 0,
count * sizeof(Scalar), scalars, 0, nullptr, nullptr);
@ -1891,13 +2219,11 @@ void Context::batch_scalar_mul_generator(const Scalar* scalars, JacobianPoint* r
clSetKernelArg(impl_->kernel_scalar_mul_generator, 1, sizeof(cl_mem), &impl_->cache_smg_results);
clSetKernelArg(impl_->kernel_scalar_mul_generator, 2, sizeof(cl_uint), &cnt);
// Calculate work group size
std::size_t local_size = impl_->config.local_work_size;
if (local_size == 0 || local_size > impl_->device_info.max_work_group_size) {
local_size = std::min(static_cast<std::size_t>(256), impl_->device_info.max_work_group_size);
}
std::size_t global_size = ((count + local_size - 1) / local_size) * local_size;
std::size_t local_size, global_size;
compute_scalar_mul_work_sizes(count, impl_->config.local_work_size,
128,
impl_->device_info.max_work_group_size,
local_size, global_size);
clEnqueueNDRangeKernel(impl_->queue, impl_->kernel_scalar_mul_generator, 1, nullptr,
&global_size, &local_size, 0, nullptr, nullptr);
@ -1925,10 +2251,11 @@ void Context::batch_scalar_mul(const Scalar* scalars, const AffinePoint* points,
impl_->cache_sm_count = count;
}
clEnqueueWriteBuffer(impl_->queue, impl_->cache_sm_points, CL_TRUE, 0,
count * sizeof(AffinePoint), points, 0, nullptr, nullptr);
clEnqueueWriteBuffer(impl_->queue, impl_->cache_sm_scalars, CL_FALSE, 0,
count * sizeof(Scalar), scalars, 0, nullptr, nullptr);
clEnqueueWriteBuffer(impl_->queue, impl_->cache_sm_points, CL_FALSE, 0,
count * sizeof(AffinePoint), points, 0, nullptr, nullptr);
clFlush(impl_->queue);
cl_uint cnt = static_cast<cl_uint>(count);
clSetKernelArg(impl_->kernel_scalar_mul, 0, sizeof(cl_mem), &impl_->cache_sm_scalars);
@ -1936,8 +2263,11 @@ void Context::batch_scalar_mul(const Scalar* scalars, const AffinePoint* points,
clSetKernelArg(impl_->kernel_scalar_mul, 2, sizeof(cl_mem), &impl_->cache_sm_results);
clSetKernelArg(impl_->kernel_scalar_mul, 3, sizeof(cl_uint), &cnt);
std::size_t local_size = std::min(static_cast<std::size_t>(256), impl_->device_info.max_work_group_size);
std::size_t global_size = ((count + local_size - 1) / local_size) * local_size;
std::size_t local_size, global_size;
compute_scalar_mul_work_sizes(count, impl_->config.local_work_size,
128,
impl_->device_info.max_work_group_size,
local_size, global_size);
clEnqueueNDRangeKernel(impl_->queue, impl_->kernel_scalar_mul, 1, nullptr,
&global_size, &local_size, 0, nullptr, nullptr);
@ -2043,6 +2373,8 @@ void* Context::native_kernel(const char* name) const {
if (n == "point_add") return impl_->kernel_point_add;
if (n == "scalar_mul") return impl_->kernel_scalar_mul;
if (n == "scalar_mul_generator") return impl_->kernel_scalar_mul_generator;
if (n == "batch_jacobian_to_affine") return impl_->kernel_batch_jacobian_to_affine;
if (n == "batch_jacobian_to_affine_kernel") return impl_->kernel_batch_jacobian_to_affine;
if (n == "affine_add") return impl_->kernel_affine_add;
if (n == "affine_add_lambda") return impl_->kernel_affine_add_lambda;
if (n == "affine_add_x_only") return impl_->kernel_affine_add_x_only;

View File

@ -1128,6 +1128,133 @@ bool selftest(bool verbose, int platform_id, int device_id) {
if (verbose) SELFTEST_PRINT(pass ? " PASS\n" : " FAIL\n");
}
// ==========================================================================
// Test 41: BIP-352 SCAN_KEY smoke — large 256-bit scalar, must not be infinity
// Regression: verifies that scalar_mul_generator handles a real-world key
// that stresses the GLV decomposition path (both half-scalars non-trivial).
// ==========================================================================
{
total++;
if (verbose) SELFTEST_PRINT("\nBIP-352 SCAN_KEY k*G smoke (not infinity):\n");
bool pass = true;
// SCAN_KEY used in bench_bip352_opencl — 256-bit, both GLV halves non-zero
Scalar k_scan = scalar_from_hex(
"c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42");
JacobianPoint P = ctx->scalar_mul_generator(k_scan);
AffinePoint Pa = jacobian_to_affine(P);
// Sanity: x-coordinate must be non-zero (point at infinity has x=0)
if ((Pa.x.limbs[0] | Pa.x.limbs[1] | Pa.x.limbs[2] | Pa.x.limbs[3]) == 0) {
if (verbose) SELFTEST_PRINT(" FAIL: SCAN_KEY * G produced x=0 (infinity)\n");
pass = false;
}
if (pass) passed++;
if (verbose) SELFTEST_PRINT(pass ? " PASS\n" : " FAIL\n");
}
// ==========================================================================
// Test 42: GLV large scalar consistency — k*G + G = (k+1)*G for SCAN_KEY
// Checks that GLV decomposition is correct for a full 256-bit key by
// cross-checking with the additive property: (k+1)*G = k*G + 1*G.
// ==========================================================================
{
total++;
if (verbose) SELFTEST_PRINT("\nGLV large scalar consistency: k*G + G = (k+1)*G:\n");
bool pass = true;
Scalar k = scalar_from_hex(
"c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42");
Scalar kp1 = scalar_from_hex(
"c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a43");
Scalar one = scalar_from_u64(1);
JacobianPoint kG = ctx->scalar_mul_generator(k);
JacobianPoint oneG = ctx->scalar_mul_generator(one);
JacobianPoint kp1_a = ctx->point_add(kG, oneG); // k*G + G
JacobianPoint kp1_b = ctx->scalar_mul_generator(kp1); // (k+1)*G
AffinePoint a = jacobian_to_affine(kp1_a);
AffinePoint b = jacobian_to_affine(kp1_b);
if (field_to_hex(a.x) != field_to_hex(b.x) || field_to_hex(a.y) != field_to_hex(b.y)) {
if (verbose) {
SELFTEST_PRINT(" FAIL: k*G + G != (k+1)*G\n");
SELFTEST_PRINT(" k*G+G x: %s\n", field_to_hex(a.x).c_str());
SELFTEST_PRINT(" (k+1)G x: %s\n", field_to_hex(b.x).c_str());
}
pass = false;
}
if (pass) passed++;
if (verbose) SELFTEST_PRINT(pass ? " PASS\n" : " FAIL\n");
}
// ==========================================================================
// Test 43: GLV 2^128 boundary — (2^128)*G + G = (2^128+1)*G
// The GLV decomposition boundary sits near 2^128; a scalar k = 2^128
// forces the high half of the GLV decomposition to be active. Regression
// for any off-by-one in the half-scalar split.
// ==========================================================================
{
total++;
if (verbose) SELFTEST_PRINT("\nGLV 2^128 boundary: 2^128*G + G = (2^128+1)*G:\n");
bool pass = true;
// k = 2^128: limbs[2]=1 (little-endian), others=0
Scalar k_128 = {{0UL, 0UL, 1UL, 0UL}};
Scalar k_128p = {{1UL, 0UL, 1UL, 0UL}}; // 2^128 + 1
Scalar one = scalar_from_u64(1);
JacobianPoint kG = ctx->scalar_mul_generator(k_128);
JacobianPoint oneG = ctx->scalar_mul_generator(one);
JacobianPoint kp1_a = ctx->point_add(kG, oneG);
JacobianPoint kp1_b = ctx->scalar_mul_generator(k_128p);
AffinePoint a = jacobian_to_affine(kp1_a);
AffinePoint b = jacobian_to_affine(kp1_b);
if (field_to_hex(a.x) != field_to_hex(b.x) || field_to_hex(a.y) != field_to_hex(b.y)) {
if (verbose) SELFTEST_PRINT(" FAIL: 2^128*G + G != (2^128+1)*G\n");
pass = false;
}
if (pass) passed++;
if (verbose) SELFTEST_PRINT(pass ? " PASS\n" : " FAIL\n");
}
// ==========================================================================
// Test 44: wNAF alternating-bit stress — 0x5555...*G + G = 0x5556...*G
// Alternating 0101... bits maximally stress wNAF digit selection:
// every bit triggers a non-adjacent form carry/borrow. Catches bugs in
// the w=5 wNAF encoder that surface only with specific bit patterns.
// ==========================================================================
{
total++;
if (verbose) SELFTEST_PRINT("\nwNAF alternating-bit stress: 0x5555...*G + G:\n");
bool pass = true;
// k = 0x5555555555555555 * 4 limbs = repeating 01 bits in every position
Scalar k_alt = {{0x5555555555555555ULL, 0x5555555555555555ULL,
0x5555555555555555ULL, 0x5555555555555555ULL}};
Scalar k_altp = {{0x5555555555555556ULL, 0x5555555555555555ULL,
0x5555555555555555ULL, 0x5555555555555555ULL}};
Scalar one = scalar_from_u64(1);
JacobianPoint kG = ctx->scalar_mul_generator(k_alt);
JacobianPoint oneG = ctx->scalar_mul_generator(one);
JacobianPoint kp1_a = ctx->point_add(kG, oneG);
JacobianPoint kp1_b = ctx->scalar_mul_generator(k_altp);
AffinePoint a = jacobian_to_affine(kp1_a);
AffinePoint b = jacobian_to_affine(kp1_b);
if (field_to_hex(a.x) != field_to_hex(b.x) || field_to_hex(a.y) != field_to_hex(b.y)) {
if (verbose) SELFTEST_PRINT(" FAIL: 0x5555...*G + G != 0x5556...*G\n");
pass = false;
}
if (pass) passed++;
if (verbose) SELFTEST_PRINT(pass ? " PASS\n" : " FAIL\n");
}
// ==========================================================================
// Test 40: Distributive k*(P+Q) = k*P + k*Q
// ==========================================================================

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,51 @@
[project]
name = "UltrafastSecp256k1"
language = "cpp"
[[source_dirs]]
label = "cpu"
path = "../cpu"
[[source_dirs]]
label = "include"
path = "../include"
[[source_dirs]]
label = "audit"
path = "../audit"
optional = true
[[source_dirs]]
label = "benchmarks"
path = "../benchmarks"
optional = true
[[source_dirs]]
label = "cuda"
path = "../cuda"
optional = true
[[source_dirs]]
label = "examples"
path = "../examples"
optional = true
[[source_dirs]]
label = "gpu"
path = "../gpu"
optional = true
[[source_dirs]]
label = "metal"
path = "../metal"
optional = true
[[source_dirs]]
label = "opencl"
path = "../opencl"
optional = true
[[source_dirs]]
label = "tests"
path = "../tests"
optional = true