fix: MSVC C2026 string limit (#173), OpenCL batch-inv kernels, source graph tooling
- Split embedded OpenCL kernel_source into kernel_parts[] array so no single string literal exceeds MSVC's 65535-byte limit. clCreateProgramWithSource now receives multiple source strings. - Added batch-inversion kernels (field_inv, affine_add, jac_to_affine) using per-workgroup Montgomery's trick with __local memory. - OpenCL BIP352 benchmark scaffold and kernel stubs. - Source graph kit for indexed codebase exploration. - Assorted doc, benchmark, and audit report updates.
This commit is contained in:
parent
cfda151728
commit
fea2420fe7
10
CHANGELOG.md
10
CHANGELOG.md
@ -78,6 +78,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- `coin_address()` CASHADDR dispatch now correctly routes to `coin_address_cashaddr()` --
|
||||
Bitcoin Cash addresses generate via CashAddr instead of falling through to Base58Check.
|
||||
- All 28 coins now generate addresses correctly (was 27; BCH fixed, Tron added).
|
||||
- **ARM64 Android hash dispatch** -- `hash_accel` now routes `sha256_33`, `sha256_32`,
|
||||
`hash160_33`, and `sha256_compress_dispatch` through ARMv8 SHA-256 instructions when
|
||||
building for AArch64 targets with SHA2 support. On RK3588 / Android NDK r27.2 this reduced
|
||||
`ecdsa_sign` from 25.89 us to 22.22 us, `schnorr_sign` (precomputed) from 17.73 us to 16.67 us,
|
||||
and `ct::ecdsa_sign` from 70.50 us to 67.11 us, with verify paths remaining effectively flat.
|
||||
- **x86 Schnorr batch verify allocation path** -- `batch_verify.cpp` now reserves the
|
||||
full batch size for the uncached x-only pubkey cache instead of capping capacity at 64.
|
||||
Local i5-14400F reruns reduced uncached `schnorr_batch_verify` from 20.27 us/sig to about
|
||||
19.94-20.06 us/sig at N=128 and from 18.56 us/sig to about 18.01-18.45 us/sig at N=192,
|
||||
with `comprehensive` remaining green.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@ -63,6 +63,9 @@ set(SECP256K1_MARCH "" CACHE STRING "x86-64 -march override (empty = auto-detect
|
||||
|
||||
# Warning policy: promote warnings to errors (recommended for CI)
|
||||
option(SECP256K1_WERROR "Treat compiler warnings as errors (-Werror / /WX)" OFF)
|
||||
# Source graph: switch for refreshing the repo source graph during builds.
# Defaults to ON, so pass -DUFSECP_REFRESH_SOURCE_GRAPH=OFF to opt out.
option(UFSECP_REFRESH_SOURCE_GRAPH "Refresh the repo source graph during builds" ON)
|
||||
|
||||
# Optional Python 3 interpreter lookup: QUIET and not REQUIRED, so configuration
# continues when Python is absent. Check Python3_Interpreter_FOUND before using
# Python3_EXECUTABLE.
find_package(Python3 COMPONENTS Interpreter QUIET)
|
||||
|
||||
# Global compile definitions
|
||||
if(SECP256K1_SPEED_FIRST)
|
||||
@ -345,6 +348,21 @@ if(SECP256K1_INSTALL)
|
||||
endif()
|
||||
|
||||
# -- Source graph refresh ----------------------------------------------------
# Path of the source-graph script inside this source tree.
|
||||
set(UFSECP_SOURCE_GRAPH_TOOL "${CMAKE_CURRENT_SOURCE_DIR}/tools/source_graph_kit/source_graph.py")
|
||||
# When UFSECP_REFRESH_SOURCE_GRAPH is enabled and both a Python 3 interpreter and
# the script are present, register ufsecp_source_graph_refresh. The target is part
# of ALL and runs `source_graph.py build -i` from the source directory. Custom
# targets are always considered out of date, so this runs on every default build.
# If the interpreter or the script is missing, only a STATUS message is printed
# and configuration continues.
# NOTE(review): the command runs with the source directory as its working
# directory; presumably the tool writes its index there -- confirm, and consider
# whether the option should default to OFF for packagers and read-only checkouts.
if(UFSECP_REFRESH_SOURCE_GRAPH)
|
||||
if(Python3_Interpreter_FOUND AND EXISTS "${UFSECP_SOURCE_GRAPH_TOOL}")
|
||||
add_custom_target(ufsecp_source_graph_refresh ALL
|
||||
COMMAND "${Python3_EXECUTABLE}" "${UFSECP_SOURCE_GRAPH_TOOL}" build -i
|
||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
|
||||
COMMENT "Refreshing UltrafastSecp256k1 source graph incrementally"
|
||||
USES_TERMINAL
|
||||
VERBATIM
|
||||
)
|
||||
else()
|
||||
message(STATUS "secp256k1-fast: source graph refresh disabled (missing Python3 interpreter or source_graph.py)")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# -- CPack packaging ---------------------------------------------------------
# Package identity consumed by CPack; the version follows the project() version.
set(CPACK_PACKAGE_NAME "UltrafastSecp256k1")
|
||||
set(CPACK_PACKAGE_VERSION "${PROJECT_VERSION}")
|
||||
set(CPACK_PACKAGE_VENDOR "shrec")
|
||||
|
||||
229
benchmarks/comparison/bench_unified_full_local_20260317.json
Normal file
229
benchmarks/comparison/bench_unified_full_local_20260317.json
Normal file
@ -0,0 +1,229 @@
|
||||
{
|
||||
"metadata": {
|
||||
"cpu": "Intel(R) Core(TM) i5-14400F",
|
||||
"compiler": "GCC 14.2.0",
|
||||
"arch": "x86-64",
|
||||
"timer": "RDTSCP",
|
||||
"tsc_ghz": 2.496,
|
||||
"passes": 11,
|
||||
"warmup": 500,
|
||||
"pool_size": 64
|
||||
},
|
||||
"results": [
|
||||
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_mul", "ns": 10.78},
|
||||
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_sqr", "ns": 10.06},
|
||||
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_inv", "ns": 645.81},
|
||||
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_add", "ns": 3.92},
|
||||
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_sub", "ns": 4.18},
|
||||
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_negate", "ns": 5.66},
|
||||
{"section": "FIELD ARITHMETIC (Ultra)", "name": "field_from_bytes (32B)", "ns": 2.80},
|
||||
{"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_mul", "ns": 19.96},
|
||||
{"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_inv", "ns": 859.65},
|
||||
{"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_add", "ns": 4.14},
|
||||
{"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_negate", "ns": 2.35},
|
||||
{"section": "SCALAR ARITHMETIC (Ultra)", "name": "scalar_from_bytes (32B)", "ns": 2.56},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "pubkey_create (k*G)", "ns": 4750.01},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "scalar_mul (k*P)", "ns": 19404.73},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "scalar_mul_with_plan", "ns": 16596.88},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "dual_mul (a*G + b*P)", "ns": 18738.36},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "point_add (affine+affine)", "ns": 761.58},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "point_add (J+A mixed)", "ns": 118.54},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "point_dbl", "ns": 67.58},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "normalize (J->affine)", "ns": 2.63},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "batch_normalize /pt (N=64)", "ns": 8.15},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "next_inplace (+=G)", "ns": 132.52},
|
||||
{"section": "POINT ARITHMETIC (Ultra)", "name": "KPlan::from_scalar(w=4)", "ns": 1103.67},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "to_compressed (33B)", "ns": 7.19},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "to_uncompressed (65B)", "ns": 6.98},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "x_only_bytes (32B)", "ns": 3.05},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "x_bytes_and_parity", "ns": 4.15},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "has_even_y", "ns": 1.74},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "batch_to_compressed /pt (N=64)", "ns": 2.03},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "batch_x_only_bytes /pt (N=64)", "ns": 1.71},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "msm /pt (N=128)", "ns": 6130.30},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "pippenger_msm /pt (N=128)", "ns": 6158.43},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "precompute_g_multiples /pt (N=64)", "ns": 248.30},
|
||||
{"section": "POINT SERIALIZATION (Ultra)", "name": "precompute_point_multiples /pt (N=64)", "ns": 240.09},
|
||||
{"section": "ECDSA -- Ultra FAST", "name": "ecdsa_sign", "ns": 6450.90},
|
||||
{"section": "ECDSA -- Ultra FAST", "name": "ecdsa_sign_verified", "ns": 37580.34},
|
||||
{"section": "ECDSA -- Ultra FAST", "name": "ecdsa_verify", "ns": 20846.59},
|
||||
{"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_keypair_create", "ns": 5405.04},
|
||||
{"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_sign", "ns": 5295.75},
|
||||
{"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_sign_verified", "ns": 27132.24},
|
||||
{"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_verify (cached xonly)", "ns": 20279.62},
|
||||
{"section": "SCHNORR / BIP-340 -- Ultra FAST", "name": "schnorr_verify (raw bytes)", "ns": 21640.76},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::from_bytes (32B->scalar)", "ns": 2.56},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::inverse (safegcd)", "ns": 849.50},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::mul", "ns": 19.74},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Scalar::negate", "ns": 2.40},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "glv_decompose", "ns": 74.70},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Point::dbl (jac52_double)", "ns": 57.55},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "Point::add (J+A mixed)", "ns": 121.40},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "dual_scalar_mul_gen_point", "ns": 19001.51},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::from_4x64_limbs", "ns": 1.39},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::mul (52-bit)", "ns": 15.76},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::sqr (52-bit)", "ns": 13.50},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::inverse_safegcd", "ns": 725.37},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::inverse (Fermat)", "ns": 3828.50},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::add (52-bit)", "ns": 0.53},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::negate (52-bit)", "ns": 0.49},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE52::normalize", "ns": 3.50},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "SHA256 (BIP0340/challenge)", "ns": 107.37},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "tagged_hash (recompute tag)", "ns": 196.91},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "cached_tagged_hash (midstate)", "ns": 70.00},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "lift_x (4x64 sqrt)", "ns": 5094.45},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "lift_x (FE52 sqrt)", "ns": 3347.34},
|
||||
{"section": "MICRO-DIAGNOSTICS (sub-ops)", "name": "FE::parse_bytes_strict", "ns": 3.41},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=4)", "ns": 78874.28},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=4)", "ns": 19718.57},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=16)", "ns": 325401.49},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=16)", "ns": 20337.59},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=64)", "ns": 1329107.06},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=64)", "ns": 20767.30},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(N=192)", "ns": 3283487.41},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=192)", "ns": 17101.50},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_verify(repeated,N=192)", "ns": 2884848.94},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig repeated (N=192)", "ns": 15025.25},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_seed_only(N=192)", "ns": 16218.78},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_weights_only(N=192)", "ns": 10063.15},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_R_lift_only(N=192)", "ns": 926909.98},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift_only(N=192)", "ns": 951004.08},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_challenge_only(N=192)", "ns": 16512.13},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_xonly_parse_only(N=192)", "ns": 659796.91},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift+challenge_only(N=192)", "ns": 937008.04},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_lift+challenge(N=192)", "ns": 1977220.42},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_setup_only(N=192)", "ns": 2008198.98},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": " -> setup per-sig (N=192)", "ns": 10459.37},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_seed_only(repeated,N=192)", "ns": 14453.32},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_weights_only(repeated,N=192)", "ns": 8768.24},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_R_lift_only(repeated,N=192)", "ns": 1004852.26},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift_only(repeated,N=192)", "ns": 945079.24},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_challenge_only(repeated,N=192)", "ns": 18516.63},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_xonly_parse_only(repeated,N=192)", "ns": 663956.75},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_P_lift+challenge_only(repeated,N=192)", "ns": 953751.95},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_lift+challenge(repeated,N=192)", "ns": 1912494.47},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "schnorr_batch_setup_only(repeated,N=192)", "ns": 1908150.13},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": " -> setup repeated per-sig (N=192)", "ns": 9938.28},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "ecdsa_batch_verify(N=4)", "ns": 76754.13},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=4)", "ns": 19188.53},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "ecdsa_batch_verify(N=16)", "ns": 304265.14},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=16)", "ns": 19016.57},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": "ecdsa_batch_verify(N=64)", "ns": 1230289.00},
|
||||
{"section": "BATCH VERIFICATION (FAST)", "name": " -> per-sig amortized (N=64)", "ns": 19223.27},
|
||||
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::scalar_inverse (SafeGCD)", "ns": 1351.46},
|
||||
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::generator_mul (k*G)", "ns": 9533.74},
|
||||
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::scalar_mul (k*P)", "ns": 21251.51},
|
||||
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_dbl", "ns": 70.57},
|
||||
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_add_complete (11M+6S)", "ns": 203.36},
|
||||
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_add_mixed_complete (7M+5S)", "ns": 135.48},
|
||||
{"section": "CT POINT ARITHMETIC (sub-ops)", "name": "ct::point_add_mixed_unified (7M+5S)", "ns": 131.32},
|
||||
{"section": "CT SIGNING (Ultra CT)", "name": "ct::ecdsa_sign", "ns": 12761.02},
|
||||
{"section": "CT SIGNING (Ultra CT)", "name": " CT overhead (ECDSA)", "ratio": 1.9782},
|
||||
{"section": "CT SIGNING (Ultra CT)", "name": "ct::ecdsa_sign_verified", "ns": 43190.57},
|
||||
{"section": "CT SIGNING (Ultra CT)", "name": "ct::schnorr_sign", "ns": 11070.78},
|
||||
{"section": "CT SIGNING (Ultra CT)", "name": " CT overhead (Schnorr)", "ratio": 2.0905},
|
||||
{"section": "CT SIGNING (Ultra CT)", "name": "ct::schnorr_sign_verified", "ns": 33161.61},
|
||||
{"section": "CT SIGNING (Ultra CT)", "name": "ct::schnorr_keypair_create", "ns": 12088.90},
|
||||
{"section": "CT SIGNING (Ultra CT)", "name": " CT overhead (keypair)", "ratio": 2.2366},
|
||||
{"section": "ETHEREUM OPERATIONS", "name": "keccak256 (32B)", "ns": 254.19},
|
||||
{"section": "ETHEREUM OPERATIONS", "name": "ethereum_address", "ns": 228.43},
|
||||
{"section": "ETHEREUM OPERATIONS", "name": "eip191_hash", "ns": 225.03},
|
||||
{"section": "ETHEREUM OPERATIONS", "name": "eth_sign_hash", "ns": 6525.31},
|
||||
{"section": "ETHEREUM OPERATIONS", "name": "ecdsa_sign_recoverable", "ns": 6598.01},
|
||||
{"section": "ETHEREUM OPERATIONS", "name": "ecrecover", "ns": 27095.12},
|
||||
{"section": "ETHEREUM OPERATIONS", "name": "eth_personal_sign", "ns": 6787.62},
|
||||
{"section": "ETHEREUM OPERATIONS", "name": "ethereum_address_eip55", "ns": 564.81},
|
||||
{"section": "REAL-WORLD FLOWS", "name": "ecdh_compute (SHA256 shared secret)", "ns": 20215.17},
|
||||
{"section": "REAL-WORLD FLOWS", "name": "ecdh_compute_raw (x-only shared)", "ns": 20134.59},
|
||||
{"section": "REAL-WORLD FLOWS", "name": "taproot_output_key (BIP-341 key path)", "ns": 10438.59},
|
||||
{"section": "REAL-WORLD FLOWS", "name": "taproot_tweak_privkey (BIP-341)", "ns": 11246.91},
|
||||
{"section": "REAL-WORLD FLOWS", "name": "bip32_master_key (64B seed)", "ns": 933.32},
|
||||
{"section": "REAL-WORLD FLOWS", "name": "bip32_coin_derive_key (BTC m/84'/0'/0'/0/0)", "ns": 77986.98},
|
||||
{"section": "REAL-WORLD FLOWS", "name": "coin_address_from_seed (BTC end-to-end)", "ns": 91654.63},
|
||||
{"section": "REAL-WORLD FLOWS", "name": "coin_address_from_seed (ETH end-to-end)", "ns": 91281.62},
|
||||
{"section": "REAL-WORLD FLOWS", "name": "silent_payment_create_output", "ns": 24181.18},
|
||||
{"section": "REAL-WORLD FLOWS", "name": "silent_payment_scan (single output set)", "ns": 34901.09},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_mul", "ns": 11.61},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_sqr", "ns": 10.51},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_inv_var", "ns": 833.17},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_add", "ns": 6.57},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_negate", "ns": 6.32},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_normalize", "ns": 7.41},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "field_from_bytes (set_b32)", "ns": 6.97},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_mul", "ns": 26.42},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_inverse (CT)", "ns": 1421.11},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_inverse_var", "ns": 856.24},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_add", "ns": 5.23},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_negate", "ns": 7.00},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_from_bytes (set_b32)", "ns": 5.01},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "point_dbl (gej_double_var)", "ns": 78.64},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "point_add (gej_add_ge_var)", "ns": 141.13},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "ecmult (a*P + b*G, Strauss)", "ns": 21020.33},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "ecmult_gen (k*G, comb)", "ns": 9723.23},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "generator_mul (ec_pubkey_create)", "ns": 11384.81},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "scalar_mul_P (k*P, tweak_mul)", "ns": 20135.59},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "serialize_compressed (33B)", "ns": 17.67},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "serialize_uncompressed (65B)", "ns": 22.52},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "point_add (pubkey_combine)", "ns": 1774.01},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "ecdsa_sign", "ns": 17203.14},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "ecdsa_verify", "ns": 22448.31},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "schnorr_keypair_create", "ns": 11751.95},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "schnorr_sign (BIP-340)", "ns": 13712.35},
|
||||
{"section": "libsecp256k1 (bitcoin-core)", "name": "schnorr_verify (BIP-340)", "ns": 24529.62},
|
||||
{"section": "OpenSSL (ECDSA, secp256k1)", "name": "generator_mul (EC_POINT_mul k*G)", "ns": 213014.57},
|
||||
{"section": "OpenSSL (ECDSA, secp256k1)", "name": "ecdsa_sign (ECDSA_do_sign)", "ns": 222950.90},
|
||||
{"section": "OpenSSL (ECDSA, secp256k1)", "name": "ecdsa_verify (ECDSA_do_verify)", "ns": 214672.40},
|
||||
{"section": "FIELD ARITHMETIC", "name": "mul", "ns": 10.78},
|
||||
{"section": "FIELD ARITHMETIC", "name": "sqr", "ns": 10.06},
|
||||
{"section": "FIELD ARITHMETIC", "name": "inv", "ns": 645.81},
|
||||
{"section": "FIELD ARITHMETIC", "name": "add", "ns": 3.92},
|
||||
{"section": "FIELD ARITHMETIC", "name": "sub", "ns": 4.18},
|
||||
{"section": "FIELD ARITHMETIC", "name": "negate", "ns": 5.66},
|
||||
{"section": "FIELD ARITHMETIC", "name": "normalize (FE52)", "ns": 3.50},
|
||||
{"section": "FIELD ARITHMETIC", "name": "from_bytes (32B)", "ns": 2.80},
|
||||
{"section": "FIELD ARITHMETIC", "name": "FE52 add (hot path)", "ns": 0.53},
|
||||
{"section": "FIELD ARITHMETIC", "name": "FE52 neg (hot path)", "ns": 0.49},
|
||||
{"section": "SCALAR ARITHMETIC", "name": "mul", "ns": 19.96},
|
||||
{"section": "SCALAR ARITHMETIC", "name": "inv (CT)", "ns": 849.50},
|
||||
{"section": "SCALAR ARITHMETIC", "name": "inv (var-time)", "ns": 849.50},
|
||||
{"section": "SCALAR ARITHMETIC", "name": "add", "ns": 4.14},
|
||||
{"section": "SCALAR ARITHMETIC", "name": "negate", "ns": 2.35},
|
||||
{"section": "SCALAR ARITHMETIC", "name": "from_bytes (32B)", "ns": 2.56},
|
||||
{"section": "POINT ARITHMETIC", "name": "dbl (Jacobian)", "ns": 67.58},
|
||||
{"section": "POINT ARITHMETIC", "name": "add (mixed J+A)", "ns": 118.54},
|
||||
{"section": "POINT ARITHMETIC", "name": "ecmult (a*P+b*G)", "ns": 18738.36},
|
||||
{"section": "POINT ARITHMETIC", "name": "ecmult_gen (k*G raw)", "ns": 4750.01},
|
||||
{"section": "POINT ARITHMETIC", "name": "pubkey_create (API)", "ns": 4750.01},
|
||||
{"section": "POINT ARITHMETIC", "name": "scalar_mul (k*P)", "ns": 19404.73},
|
||||
{"section": "POINT ARITHMETIC", "name": "scalar_mul (KPlan)", "ns": 16596.88},
|
||||
{"section": "POINT ARITHMETIC", "name": "point_add (combine)", "ns": 761.58},
|
||||
{"section": "SERIALIZATION", "name": "compressed (33B)", "ns": 7.19},
|
||||
{"section": "SERIALIZATION", "name": "uncompressed (65B)", "ns": 6.98},
|
||||
{"section": "SIGNING (FAST vs libsecp CT)", "name": "ECDSA Sign", "ns": 6450.90},
|
||||
{"section": "SIGNING (FAST vs libsecp CT)", "name": "Schnorr Sign", "ns": 5295.75},
|
||||
{"section": "SIGNING (FAST vs libsecp CT)", "name": "Schnorr Keypair", "ns": 5405.04},
|
||||
{"section": "VERIFICATION", "name": "ECDSA Verify", "ns": 20846.59},
|
||||
{"section": "VERIFICATION", "name": "Schnorr Verify (cached)", "ns": 20279.62},
|
||||
{"section": "VERIFICATION", "name": "Schnorr Verify (raw)", "ns": 21640.76},
|
||||
{"section": "CT-vs-CT (fair signing)", "name": "ECDSA Sign", "ns": 12761.02},
|
||||
{"section": "CT-vs-CT (fair signing)", "name": "Schnorr Sign", "ns": 11070.78},
|
||||
{"section": "CT-vs-CT (fair signing)", "name": "ECDSA Verify", "ns": 20846.59},
|
||||
{"section": "CT-vs-CT (fair signing)", "name": "Schnorr Verify", "ns": 21640.76},
|
||||
{"section": "ETHEREUM / RECOVERY", "name": "sign_recoverable", "ns": 6598.01},
|
||||
{"section": "ETHEREUM / RECOVERY", "name": "ecrecover", "ns": 27095.12},
|
||||
{"section": "ETHEREUM / RECOVERY", "name": "eth_sign_hash", "ns": 6525.31},
|
||||
{"section": "ETHEREUM / RECOVERY", "name": "eth_personal_sign", "ns": 6787.62},
|
||||
{"section": "FAST path (Ultra FAST vs OpenSSL)", "name": "Generator * k", "ratio": 44.8451},
|
||||
{"section": "FAST path (Ultra FAST vs OpenSSL)", "name": "ECDSA Sign", "ratio": 34.5612},
|
||||
{"section": "FAST path (Ultra FAST vs OpenSSL)", "name": "ECDSA Verify", "ratio": 10.2977},
|
||||
{"section": "CT path (Ultra CT vs OpenSSL)", "name": "ECDSA Sign (CT vs CT)", "ratio": 17.4712},
|
||||
{"section": "CT path (Ultra CT vs OpenSSL)", "name": "ECDSA Verify", "ratio": 10.2977},
|
||||
{"section": "ZK Proofs & Commitments", "name": "Pedersen commit", "ns": 30575.55},
|
||||
{"section": "ZK Proofs & Commitments", "name": "Knowledge prove (sigma)", "ns": 20371.57},
|
||||
{"section": "ZK Proofs & Commitments", "name": "Knowledge verify", "ns": 21392.29},
|
||||
{"section": "ZK Proofs & Commitments", "name": "DLEQ prove", "ns": 44028.64},
|
||||
{"section": "ZK Proofs & Commitments", "name": "DLEQ verify", "ns": 57020.44},
|
||||
{"section": "ZK Proofs & Commitments", "name": "Bulletproof range_prove (64b)", "ns": 13055460.41},
|
||||
{"section": "ZK Proofs & Commitments", "name": "Bulletproof range_verify (64b)", "ns": 1259727.10}
|
||||
]
|
||||
}
|
||||
538
benchmarks/comparison/bench_unified_full_local_20260317.txt
Normal file
538
benchmarks/comparison/bench_unified_full_local_20260317.txt
Normal file
@ -0,0 +1,538 @@
|
||||
CPU frequency warmup (3000 ms heavy load)... stable at 2.496 GHz (569198 k*G ops)
|
||||
Running integrity check... OK
|
||||
|
||||
======================================================================
|
||||
UltrafastSecp256k1 -- Unified Apple-to-Apple Benchmark
|
||||
======================================================================
|
||||
|
||||
CPU: Intel(R) Core(TM) i5-14400F
|
||||
TSC freq: 2.496 GHz
|
||||
Core: 1 (pinned to core 0, priority elevated)
|
||||
Compiler: GCC 14.2.0
|
||||
Arch: x86-64
|
||||
Ultra: UltrafastSecp256k1
|
||||
libsecp: bitcoin-core libsecp256k1 v0.7.x
|
||||
Harness: 3s CPU ramp-up, 500 warmup/op, 11 passes, IQR outlier removal, median
|
||||
Timer: RDTSCP
|
||||
Pool: 64 independent key/msg/sig sets
|
||||
NOTE: Both Ultra and libsecp use IDENTICAL harness
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| FIELD ARITHMETIC (Ultra) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| field_mul | 10.8 |
|
||||
| field_sqr | 10.1 |
|
||||
| field_inv | 645.8 |
|
||||
| field_add | 3.9 |
|
||||
| field_sub | 4.2 |
|
||||
| field_negate | 5.7 |
|
||||
| field_from_bytes (32B) | 2.8 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| SCALAR ARITHMETIC (Ultra) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| scalar_mul | 20.0 |
|
||||
| scalar_inv | 859.7 |
|
||||
| scalar_add | 4.1 |
|
||||
| scalar_negate | 2.3 |
|
||||
| scalar_from_bytes (32B) | 2.6 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| POINT ARITHMETIC (Ultra) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| pubkey_create (k*G) | 4750.0 |
|
||||
| scalar_mul (k*P) | 19404.7 |
|
||||
| scalar_mul_with_plan | 16596.9 |
|
||||
| dual_mul (a*G + b*P) | 18738.4 |
|
||||
| point_add (affine+affine) | 761.6 |
|
||||
| point_add (J+A mixed) | 118.5 |
|
||||
| point_dbl | 67.6 |
|
||||
| normalize (J->affine) | 2.6 |
|
||||
| batch_normalize /pt (N=64) | 8.2 |
|
||||
| next_inplace (+=G) | 132.5 |
|
||||
| KPlan::from_scalar(w=4) | 1103.7 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| POINT SERIALIZATION (Ultra) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| to_compressed (33B) | 7.2 |
|
||||
| to_uncompressed (65B) | 7.0 |
|
||||
| x_only_bytes (32B) | 3.1 |
|
||||
| x_bytes_and_parity | 4.1 |
|
||||
| has_even_y | 1.7 |
|
||||
| batch_to_compressed /pt (N=64) | 2.0 |
|
||||
| batch_x_only_bytes /pt (N=64) | 1.7 |
|
||||
| msm /pt (N=128) | 6130.3 |
|
||||
| pippenger_msm /pt (N=128) | 6158.4 |
|
||||
| precompute_g_multiples /pt (N=64) | 248.3 |
|
||||
| precompute_point_multiples /pt (N=64) | 240.1 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| ECDSA -- Ultra FAST | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| ecdsa_sign | 6450.9 |
|
||||
| ecdsa_sign_verified | 37580.3 |
|
||||
| ecdsa_verify | 20846.6 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| SCHNORR / BIP-340 -- Ultra FAST | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| schnorr_keypair_create | 5405.0 |
|
||||
| schnorr_sign | 5295.8 |
|
||||
| schnorr_sign_verified | 27132.2 |
|
||||
| schnorr_verify (cached xonly) | 20279.6 |
|
||||
| schnorr_verify (raw bytes) | 21640.8 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| MICRO-DIAGNOSTICS (sub-ops) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| Scalar::from_bytes (32B->scalar) | 2.6 |
|
||||
| Scalar::inverse (safegcd) | 849.5 |
|
||||
| Scalar::mul | 19.7 |
|
||||
| Scalar::negate | 2.4 |
|
||||
| glv_decompose | 74.7 |
|
||||
| Point::dbl (jac52_double) | 57.6 |
|
||||
| Point::add (J+A mixed) | 121.4 |
|
||||
| dual_scalar_mul_gen_point | 19001.5 |
|
||||
| FE52::from_4x64_limbs | 1.4 |
|
||||
| FE52::mul (52-bit) | 15.8 |
|
||||
| FE52::sqr (52-bit) | 13.5 |
|
||||
| FE52::inverse_safegcd | 725.4 |
|
||||
| FE52::inverse (Fermat) | 3828.5 |
|
||||
| -> SafeGCD/Fermat speedup | 5.28x |
|
||||
| FE52::add (52-bit) | 0.5 |
|
||||
| FE52::negate (52-bit) | 0.5 |
|
||||
| FE52::normalize | 3.5 |
|
||||
| SHA256 (BIP0340/challenge) | 107.4 |
|
||||
| tagged_hash (recompute tag) | 196.9 |
|
||||
| cached_tagged_hash (midstate) | 70.0 |
|
||||
| -> midstate speedup | 2.81x |
|
||||
| lift_x (4x64 sqrt) | 5094.4 |
|
||||
| lift_x (FE52 sqrt) | 3347.3 |
|
||||
| -> FE52/4x64 speedup | 1.52x |
|
||||
| FE::parse_bytes_strict | 3.4 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
---- VERIFY COST DECOMPOSITION ----
|
||||
ECDSA verify breakdown (estimated):
|
||||
scalar_inv (1x): 849.5 ns
|
||||
scalar_mul (2x): 39.5 ns
|
||||
dual_scalar_mul: 19001.5 ns
|
||||
from_bytes + overhead: 2.6 ns
|
||||
--------------------------------
|
||||
SUM (sub-ops): 19893.0 ns
|
||||
MEASURED ecdsa_verify: 20846.6 ns
|
||||
UNEXPLAINED gap: 953.6 ns (4.6%)
|
||||
|
||||
Schnorr verify breakdown (estimated):
|
||||
SHA256 challenge: (included in total)
|
||||
scalar_negate: 2.4 ns
|
||||
dual_scalar_mul: 19001.5 ns
|
||||
lift_x (sqrt): (included in total)
|
||||
from_bytes: 2.6 ns
|
||||
--------------------------------
|
||||
SUM (sub-ops, partial): 19006.5 ns
|
||||
MEASURED schnorr_verify: 20279.6 ns
|
||||
UNEXPLAINED gap: 1273.2 ns (SHA256+lift_x+Z-check)
|
||||
|
||||
Verify vs libsecp breakdown:
|
||||
Our dual_mul: 19001.5 ns
|
||||
Our scalar_inv: 849.5 ns
|
||||
Our dual+inv: 19851.0 ns
|
||||
Total ECDSA verify: 20846.6 ns
|
||||
Overhead (verify - d+i): 995.6 ns
|
||||
|
||||
---- SIGN COST DECOMPOSITION (FAST path) ----
|
||||
ecdsa_sign = RFC6979 + k*G + field_inv + scalar_inv + scalar_muls
|
||||
k*G (generator_mul): 4750.0 ns
|
||||
field_inv (R.x): 645.8 ns
|
||||
scalar_inv (k^-1): 849.5 ns
|
||||
scalar_mul (2x): 39.5 ns
|
||||
--------------------------------
|
||||
Core signing (no RFC6979): 6284.8 ns
|
||||
MEASURED ecdsa_sign: 6450.9 ns
|
||||
RFC6979 overhead: 166.1 ns (2.6%)
|
||||
MEASURED ecdsa_sign_verified:37580.3 ns
|
||||
sign-then-verify overhead: 31129.4 ns (pubkey + verify)
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| BATCH VERIFICATION (FAST) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| schnorr_batch_verify(N=4) | 78874.3 |
|
||||
| -> per-sig amortized (N=4) | 19718.6 |
|
||||
| -> speedup vs individual | 1.03x |
|
||||
| schnorr_batch_verify(N=16) | 325401.5 |
|
||||
| -> per-sig amortized (N=16) | 20337.6 |
|
||||
| -> speedup vs individual | 1.00x |
|
||||
| schnorr_batch_verify(N=64) | 1329107.1 |
|
||||
| -> per-sig amortized (N=64) | 20767.3 |
|
||||
| -> speedup vs individual | 0.98x |
|
||||
| schnorr_batch_verify(N=192) | 3283487.4 |
|
||||
| -> per-sig amortized (N=192) | 17101.5 |
|
||||
| -> speedup vs individual | 1.19x |
|
||||
| schnorr_batch_verify(repeated,N=192) | 2884848.9 |
|
||||
| -> per-sig repeated (N=192) | 15025.3 |
|
||||
| -> repeated speedup vs individual | 1.35x |
|
||||
| schnorr_batch_seed_only(N=192) | 16218.8 |
|
||||
| schnorr_batch_weights_only(N=192) | 10063.2 |
|
||||
| schnorr_batch_R_lift_only(N=192) | 926910.0 |
|
||||
| schnorr_batch_P_lift_only(N=192) | 951004.1 |
|
||||
| schnorr_batch_challenge_only(N=192) | 16512.1 |
|
||||
| schnorr_batch_xonly_parse_only(N=192) | 659796.9 |
|
||||
| schnorr_batch_P_lift+challenge_only(N=192) | 937008.0 |
|
||||
| schnorr_batch_lift+challenge(N=192) | 1977220.4 |
|
||||
| schnorr_batch_setup_only(N=192) | 2008199.0 |
|
||||
| -> setup per-sig (N=192) | 10459.4 |
|
||||
| -> setup share of full (N=192) | 60.34% |
|
||||
| schnorr_batch_seed_only(repeated,N=192) | 14453.3 |
|
||||
| schnorr_batch_weights_only(repeated,N=192) | 8768.2 |
|
||||
| schnorr_batch_R_lift_only(repeated,N=192) | 1004852.3 |
|
||||
| schnorr_batch_P_lift_only(repeated,N=192) | 945079.2 |
|
||||
| schnorr_batch_challenge_only(repeated,N=192) | 18516.6 |
|
||||
| schnorr_batch_xonly_parse_only(repeated,N=192) | 663956.8 |
|
||||
| schnorr_batch_P_lift+challenge_only(repeated,N=192) | 953751.9 |
|
||||
| schnorr_batch_lift+challenge(repeated,N=192) | 1912494.5 |
|
||||
| schnorr_batch_setup_only(repeated,N=192) | 1908150.1 |
|
||||
| -> setup repeated per-sig (N=192) | 9938.3 |
|
||||
| -> setup share repeated (N=192) | 65.68% |
|
||||
| | |
|
||||
| ecdsa_batch_verify(N=4) | 76754.1 |
|
||||
| -> per-sig amortized (N=4) | 19188.5 |
|
||||
| -> speedup vs individual | 1.09x |
|
||||
| ecdsa_batch_verify(N=16) | 304265.1 |
|
||||
| -> per-sig amortized (N=16) | 19016.6 |
|
||||
| -> speedup vs individual | 1.10x |
|
||||
| ecdsa_batch_verify(N=64) | 1230289.0 |
|
||||
| -> per-sig amortized (N=64) | 19223.3 |
|
||||
| -> speedup vs individual | 1.08x |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| CT POINT ARITHMETIC (sub-ops) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| ct::scalar_inverse (SafeGCD) | 1351.5 |
|
||||
| ct::generator_mul (k*G) | 9533.7 |
|
||||
| ct::scalar_mul (k*P) | 21251.5 |
|
||||
| ct::point_dbl | 70.6 |
|
||||
| ct::point_add_complete (11M+6S) | 203.4 |
|
||||
| ct::point_add_mixed_complete (7M+5S) | 135.5 |
|
||||
| ct::point_add_mixed_unified (7M+5S) | 131.3 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
---- CT vs FAST point ops ----
|
||||
FAST Point::dbl 57.6 ns
|
||||
FAST Point::add 121.4 ns
|
||||
FAST pubkey_create (k*G) 4750.0 ns
|
||||
FAST scalar_mul (k*P) 19404.7 ns
|
||||
CT generator_mul (k*G) 9533.7 ns
|
||||
CT scalar_mul (k*P) 21251.5 ns
|
||||
CT/FAST ratio (k*G): 2.01x overhead
|
||||
CT/FAST ratio (k*P): 1.10x overhead
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| CT SIGNING (Ultra CT) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| ct::ecdsa_sign | 12761.0 |
|
||||
| CT overhead (ECDSA) | 1.98x |
|
||||
| ct::ecdsa_sign_verified | 43190.6 |
|
||||
| ct::schnorr_sign | 11070.8 |
|
||||
| CT overhead (Schnorr) | 2.09x |
|
||||
| ct::schnorr_sign_verified | 33161.6 |
|
||||
| ct::schnorr_keypair_create | 12088.9 |
|
||||
| CT overhead (keypair) | 2.24x |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
---- CT ECDSA SIGN DECOMPOSITION ----
|
||||
ct::generator_mul (R=k*G): 9533.7 ns
|
||||
ct::scalar_inverse (k^-1): 1351.5 ns
|
||||
field_inv (R.x affine): 645.8 ns
|
||||
scalar_mul (2x): 39.5 ns
|
||||
--------------------------------
|
||||
SUM (sub-ops): 11570.5 ns
|
||||
MEASURED ct::ecdsa_sign: 12761.0 ns
|
||||
UNEXPLAINED gap: 1190.5 ns (9.3%, RFC6979+checks)
|
||||
|
||||
---- CT SCHNORR SIGN DECOMPOSITION ----
|
||||
ct::generator_mul (R=k*G): 9533.7 ns
|
||||
SHA256 (tag+nonce+msg): (included in total)
|
||||
scalar_mul + negate: 22.1 ns
|
||||
--------------------------------
|
||||
SUM (sub-ops, partial): 9555.9 ns
|
||||
MEASURED ct::schnorr_sign: 11070.8 ns
|
||||
UNEXPLAINED gap: 1514.9 ns (SHA256+aux+serialize)
|
||||
|
||||
---- CT vs libsecp (true apples-to-apples) ----
|
||||
CT ecdsa_sign 12761.0 ns
|
||||
lib ecdsa_sign (measured after libsecp section)
|
||||
CT schnorr_sign 11070.8 ns
|
||||
lib schnorr_sign (measured after libsecp section)
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| ETHEREUM OPERATIONS | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| keccak256 (32B) | 254.2 |
|
||||
| ethereum_address | 228.4 |
|
||||
| eip191_hash | 225.0 |
|
||||
| eth_sign_hash | 6525.3 |
|
||||
| ecdsa_sign_recoverable | 6598.0 |
|
||||
| ecrecover | 27095.1 |
|
||||
| eth_personal_sign | 6787.6 |
|
||||
| ethereum_address_eip55 | 564.8 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| REAL-WORLD FLOWS | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| ecdh_compute (SHA256 shared secret) | 20215.2 |
|
||||
| ecdh_compute_raw (x-only shared) | 20134.6 |
|
||||
| taproot_output_key (BIP-341 key path) | 10438.6 |
|
||||
| taproot_tweak_privkey (BIP-341) | 11246.9 |
|
||||
| bip32_master_key (64B seed) | 933.3 |
|
||||
| bip32_coin_derive_key (BTC m/84'/0'/0'/0/0) | 77987.0 |
|
||||
| coin_address_from_seed (BTC end-to-end) | 91654.6 |
|
||||
| coin_address_from_seed (ETH end-to-end) | 91281.6 |
|
||||
| silent_payment_create_output | 24181.2 |
|
||||
| silent_payment_scan (single output set) | 34901.1 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
Running libsecp256k1 benchmark (same harness: RDTSCP, 3s ramp-up, 500 warmup, 11 passes, IQR)...
|
||||
+----------------------------------------------+------------+
|
||||
| libsecp256k1 (bitcoin-core) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| field_mul | 11.6 |
|
||||
| field_sqr | 10.5 |
|
||||
| field_inv_var | 833.2 |
|
||||
| field_add | 6.6 |
|
||||
| field_negate | 6.3 |
|
||||
| field_normalize | 7.4 |
|
||||
| field_from_bytes (set_b32) | 7.0 |
|
||||
| scalar_mul | 26.4 |
|
||||
| scalar_inverse (CT) | 1421.1 |
|
||||
| scalar_inverse_var | 856.2 |
|
||||
| scalar_add | 5.2 |
|
||||
| scalar_negate | 7.0 |
|
||||
| scalar_from_bytes (set_b32) | 5.0 |
|
||||
| point_dbl (gej_double_var) | 78.6 |
|
||||
| point_add (gej_add_ge_var) | 141.1 |
|
||||
| ecmult (a*P + b*G, Strauss) | 21020.3 |
|
||||
| ecmult_gen (k*G, comb) | 9723.2 |
|
||||
| generator_mul (ec_pubkey_create) | 11384.8 |
|
||||
| scalar_mul_P (k*P, tweak_mul) | 20135.6 |
|
||||
| serialize_compressed (33B) | 17.7 |
|
||||
| serialize_uncompressed (65B) | 22.5 |
|
||||
| point_add (pubkey_combine) | 1774.0 |
|
||||
| ecdsa_sign | 17203.1 |
|
||||
| ecdsa_verify | 22448.3 |
|
||||
| schnorr_keypair_create | 11751.9 |
|
||||
| schnorr_sign (BIP-340) | 13712.3 |
|
||||
| schnorr_verify (BIP-340) | 24529.6 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
Running OpenSSL benchmark (OpenSSL 3.0.13 30 Jan 2024, same harness)...
|
||||
+----------------------------------------------+------------+
|
||||
| OpenSSL (ECDSA, secp256k1) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| generator_mul (EC_POINT_mul k*G) | 213014.6 |
|
||||
| ecdsa_sign (ECDSA_do_sign) | 222950.9 |
|
||||
| ecdsa_verify (ECDSA_do_verify) | 214672.4 |
|
||||
+----------------------------------------------+------------+
|
||||
(OpenSSL has no BIP-340 Schnorr -- ECDSA-only comparison)
|
||||
|
||||
======================================================================
|
||||
HEAD-TO-HEAD: UltrafastSecp256k1 vs libsecp256k1
|
||||
(ratio > 1.0 = Ultra wins, < 1.0 = libsecp wins)
|
||||
======================================================================
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| FIELD ARITHMETIC | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| mul | 10.8 | 11.6 | 1.08x |
|
||||
| sqr | 10.1 | 10.5 | 1.04x |
|
||||
| inv | 645.8 | 833.2 | 1.29x |
|
||||
| add | 3.9 | 6.6 | 1.67x |
|
||||
| sub | 4.2 | --- | --- |
|
||||
| negate | 5.7 | 6.3 | 1.12x |
|
||||
| normalize (FE52) | 3.5 | 7.4 | 2.12x |
|
||||
| from_bytes (32B) | 2.8 | 7.0 | 2.49x |
|
||||
| FE52 add (hot path) | 0.5 | 6.6 | 12.48x |
|
||||
| FE52 neg (hot path) | 0.5 | 6.3 | 12.93x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| SCALAR ARITHMETIC | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| mul | 20.0 | 26.4 | 1.32x |
|
||||
| inv (CT) | 849.5 | 1421.1 | 1.67x |
|
||||
| inv (var-time) | 849.5 | 856.2 | 1.01x |
|
||||
| add | 4.1 | 5.2 | 1.26x |
|
||||
| negate | 2.3 | 7.0 | 2.98x |
|
||||
| from_bytes (32B) | 2.6 | 5.0 | 1.96x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| POINT ARITHMETIC | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| dbl (Jacobian) | 67.6 | 78.6 | 1.16x |
|
||||
| add (mixed J+A) | 118.5 | 141.1 | 1.19x |
|
||||
| ecmult (a*P+b*G) | 18738.4 | 21020.3 | 1.12x |
|
||||
| ecmult_gen (k*G raw) | 4750.0 | 9723.2 | 2.05x |
|
||||
| pubkey_create (API) | 4750.0 | 11384.8 | 2.40x |
|
||||
| scalar_mul (k*P) | 19404.7 | 20135.6 | 1.04x |
|
||||
| scalar_mul (KPlan) | 16596.9 | 20135.6 | 1.21x |
|
||||
| point_add (combine) | 761.6 | 1774.0 | 2.33x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| SERIALIZATION | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| compressed (33B) | 7.2 | 17.7 | 2.46x |
|
||||
| uncompressed (65B) | 7.0 | 22.5 | 3.23x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| SIGNING (FAST vs libsecp CT) | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| ECDSA Sign | 6450.9 | 17203.1 | 2.67x |
|
||||
| Schnorr Sign | 5295.8 | 13712.3 | 2.59x |
|
||||
| Schnorr Keypair | 5405.0 | 11751.9 | 2.17x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| VERIFICATION | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| ECDSA Verify | 20846.6 | 22448.3 | 1.08x |
|
||||
| Schnorr Verify (cached) | 20279.6 | 24529.6 | 1.21x |
|
||||
| Schnorr Verify (raw) | 21640.8 | 24529.6 | 1.13x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| CT-vs-CT (fair signing) | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| ECDSA Sign | 12761.0 | 17203.1 | 1.35x |
|
||||
| Schnorr Sign | 11070.8 | 13712.3 | 1.24x |
|
||||
| ECDSA Verify | 20846.6 | 22448.3 | 1.08x |
|
||||
| Schnorr Verify | 21640.8 | 24529.6 | 1.13x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| ETHEREUM / RECOVERY | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| sign_recoverable | 6598.0 | 15920.0 | 2.41x |
|
||||
| ecrecover | 27095.1 | 26314.1 | 0.97x |
|
||||
| eth_sign_hash | 6525.3 | 15920.0 | 2.44x |
|
||||
| eth_personal_sign | 6787.6 | 15920.0 | 2.35x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
======================================================================
|
||||
APPLE-TO-APPLE: UltrafastSecp256k1 / OpenSSL
|
||||
(ratio > 1.0 = Ultra wins, < 1.0 = OpenSSL wins)
|
||||
======================================================================
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| FAST path (Ultra FAST vs OpenSSL) | ratio |
|
||||
+----------------------------------------------+------------+
|
||||
| Generator * k | 44.85x |
|
||||
| ECDSA Sign | 34.56x |
|
||||
| ECDSA Verify | 10.30x |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| CT path (Ultra CT vs OpenSSL) | ratio |
|
||||
+----------------------------------------------+------------+
|
||||
| ECDSA Sign (CT vs CT) | 17.47x |
|
||||
| ECDSA Verify | 10.30x |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| ZK Proofs & Commitments | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| Pedersen commit | 30575.5 |
|
||||
| Knowledge prove (sigma) | 20371.6 |
|
||||
| Knowledge verify | 21392.3 |
|
||||
| DLEQ prove | 44028.6 |
|
||||
| DLEQ verify | 57020.4 |
|
||||
| Bulletproof range_prove (64b) | 13055460.4 |
|
||||
| Bulletproof range_verify (64b) | 1259727.1 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
======================================================================
|
||||
THROUGHPUT SUMMARY (1 core, pinned)
|
||||
======================================================================
|
||||
|
||||
--- Ultra FAST ---
|
||||
ECDSA sign 6.45 us -> 155.0 k op/s
|
||||
ECDSA verify 20.85 us -> 48.0 k op/s
|
||||
Schnorr sign 5.30 us -> 188.8 k op/s
|
||||
Schnorr verify (cached) 20.28 us -> 49.3 k op/s
|
||||
Schnorr verify (raw) 21.64 us -> 46.2 k op/s
|
||||
pubkey_create (k*G) 4.75 us -> 210.5 k op/s
|
||||
ECDH 20.22 us -> 49.5 k op/s
|
||||
Taproot output key 10.44 us -> 95.8 k op/s
|
||||
BIP32 derive (BTC) 77.99 us -> 12.8 k op/s
|
||||
Silent Payment sender 24.18 us -> 41.4 k op/s
|
||||
Silent Payment scan 34.90 us -> 28.7 k op/s
|
||||
|
||||
--- Ultra CT ---
|
||||
CT ECDSA sign 12.76 us -> 78.4 k op/s
|
||||
CT Schnorr sign 11.07 us -> 90.3 k op/s
|
||||
|
||||
--- Ultra ZK ---
|
||||
Pedersen commit 30.58 us -> 32.7 k op/s
|
||||
Knowledge prove 20.37 us -> 49.1 k op/s
|
||||
Knowledge verify 21.39 us -> 46.7 k op/s
|
||||
DLEQ prove 44.03 us -> 22.7 k op/s
|
||||
DLEQ verify 57.02 us -> 17.5 k op/s
|
||||
Bulletproof range_prove 13055.46 us -> 77 op/s
|
||||
Bulletproof range_verify 1259.73 us -> 794 op/s
|
||||
|
||||
--- libsecp256k1 ---
|
||||
field_mul 0.01 us -> 86.16 M op/s
|
||||
field_sqr 0.01 us -> 95.12 M op/s
|
||||
field_inv_var 0.83 us -> 1.20 M op/s
|
||||
scalar_mul 0.03 us -> 37.85 M op/s
|
||||
scalar_inverse (CT) 1.42 us -> 703.7 k op/s
|
||||
scalar_inverse_var 0.86 us -> 1.17 M op/s
|
||||
point_dbl 0.08 us -> 12.72 M op/s
|
||||
point_add (mixed) 0.14 us -> 7.09 M op/s
|
||||
ecmult (a*P+b*G) 21.02 us -> 47.6 k op/s
|
||||
ecmult_gen (k*G raw) 9.72 us -> 102.8 k op/s
|
||||
generator_mul (API) 11.38 us -> 87.8 k op/s
|
||||
scalar_mul_P (k*P) 20.14 us -> 49.7 k op/s
|
||||
ECDSA sign 17.20 us -> 58.1 k op/s
|
||||
ECDSA verify 22.45 us -> 44.5 k op/s
|
||||
Schnorr sign 13.71 us -> 72.9 k op/s
|
||||
Schnorr verify 24.53 us -> 40.8 k op/s
|
||||
|
||||
--- OpenSSL ---
|
||||
ECDSA sign 222.95 us -> 4.5 k op/s
|
||||
ECDSA verify 214.67 us -> 4.7 k op/s
|
||||
generator_mul (k*G) 213.01 us -> 4.7 k op/s
|
||||
|
||||
======================================================================
|
||||
BITCOIN BLOCK VALIDATION ESTIMATES (1 core)
|
||||
======================================================================
|
||||
|
||||
Pre-Taproot block (~3000 ECDSA verify):
|
||||
Wall time: 62.5 ms
|
||||
Blocks/sec: 16.0
|
||||
|
||||
Taproot block (~2000 Schnorr + ~1000 ECDSA):
|
||||
Wall time: 64.1 ms
|
||||
Blocks/sec: 15.6
|
||||
|
||||
TX throughput (1 core):
|
||||
ECDSA: 47969 tx/sec
|
||||
Schnorr: 46209 tx/sec
|
||||
|
||||
======================================================================
|
||||
Intel(R) Core(TM) i5-14400F | 1 core pinned | GCC 14.2.0
|
||||
UltrafastSecp256k1 vs libsecp256k1 vs OpenSSL -- Unified Benchmark
|
||||
======================================================================
|
||||
|
||||
JSON report written to: /tmp/bench_today.json
|
||||
@ -9,10 +9,11 @@
|
||||
// ## Four tiers of acceleration (runtime-detected; ARM SHA2 is selected at compile time):
|
||||
//
|
||||
// Tier 0: SCALAR -- Portable C++ (baseline, always available)
|
||||
// Tier 1: SHA-NI -- Intel SHA Extensions (single-message HW accel, ~3-5x)
|
||||
// Tier 2: AVX2 -- 4-way multi-buffer SHA-256 (interleaved, ~8-12x)
|
||||
// Tier 1: ARM SHA2 -- ARMv8 SHA-256 instructions (single-message HW accel)
|
||||
// Tier 2: SHA-NI -- Intel SHA Extensions (single-message HW accel, ~3-5x)
|
||||
// Tier 3: AVX2 -- 4-way multi-buffer SHA-256 (interleaved, ~8-12x)
|
||||
// + optimized RIPEMD-160 with BMI/BMI2
|
||||
// Tier 3: AVX-512 -- 8-way multi-buffer SHA-256 (if available, ~16x)
|
||||
// Tier 4: AVX-512 -- 8-way multi-buffer SHA-256 (if available, ~16x)
|
||||
//
|
||||
// ## Hot-path API for search pipeline:
|
||||
//
|
||||
@ -48,9 +49,10 @@ namespace secp256k1::hash {
|
||||
|
||||
enum class HashTier : int {
|
||||
SCALAR = 0,
|
||||
SHA_NI = 1, // Intel SHA Extensions
|
||||
AVX2 = 2, // 4-way multi-buffer
|
||||
AVX512 = 3, // 8-way multi-buffer
|
||||
ARM_SHA2 = 1, // ARMv8 SHA-256 instructions
|
||||
SHA_NI = 2, // Intel SHA Extensions
|
||||
AVX2 = 3, // 4-way multi-buffer
|
||||
AVX512 = 4, // 8-way multi-buffer
|
||||
};
|
||||
|
||||
/// Detect best available hashing tier at runtime.
|
||||
|
||||
@ -227,7 +227,7 @@ std::vector<std::size_t> schnorr_batch_identify_invalid_impl(
|
||||
|
||||
bool schnorr_batch_verify(const SchnorrBatchEntry* entries, std::size_t n) {
|
||||
std::vector<SchnorrXonlyPubkey> pubkey_cache;
|
||||
pubkey_cache.reserve((n < 64) ? n : 64);
|
||||
pubkey_cache.reserve(n);
|
||||
|
||||
auto verify_one = [](const SchnorrBatchEntry& entry) {
|
||||
return schnorr_verify(entry.pubkey_x, entry.message, entry.signature);
|
||||
|
||||
@ -1016,10 +1016,15 @@ limbs4 mul_impl(const limbs4& a, const limbs4& b) {
|
||||
arm64::field_mul_arm64(out.data(), a.data(), b.data());
|
||||
return out;
|
||||
#elif defined(SECP256K1_HAS_ASM) && (defined(__x86_64__) || defined(_M_X64))
|
||||
// x86-64: Direct assembly call -- zero-copy, no FieldElement wrapper overhead
|
||||
limbs4 out;
|
||||
field_mul_full_asm(a.data(), b.data(), out.data());
|
||||
return out;
|
||||
// x86-64: Runtime dispatch — assembly requires BMI2+ADX (mulx/adcx/adox).
|
||||
// Fall back to portable path on CPUs that lack these extensions (e.g. Jasper Lake).
|
||||
static bool const asm_available = has_bmi2_support() && has_adx_support();
|
||||
if (asm_available) {
|
||||
limbs4 out;
|
||||
field_mul_full_asm(a.data(), b.data(), out.data());
|
||||
return out;
|
||||
}
|
||||
return reduce(mul_wide(a, b));
|
||||
#elif defined(SECP256K1_NO_ASM)
|
||||
// Generic no-asm fallback
|
||||
auto result = reduce(mul_wide(a, b));
|
||||
@ -1055,10 +1060,15 @@ limbs4 square_impl(const limbs4& a) {
|
||||
arm64::field_sqr_arm64(out.data(), a.data());
|
||||
return out;
|
||||
#elif defined(SECP256K1_HAS_ASM) && (defined(__x86_64__) || defined(_M_X64))
|
||||
// x86-64: Direct assembly call -- zero-copy, no FieldElement wrapper overhead
|
||||
limbs4 out;
|
||||
field_sqr_full_asm(a.data(), out.data());
|
||||
return out;
|
||||
// x86-64: Runtime dispatch — assembly requires BMI2+ADX (mulx/adcx/adox).
|
||||
// Fall back to portable path on CPUs that lack these extensions (e.g. Jasper Lake).
|
||||
static bool const asm_available = has_bmi2_support() && has_adx_support();
|
||||
if (asm_available) {
|
||||
limbs4 out;
|
||||
field_sqr_full_asm(a.data(), out.data());
|
||||
return out;
|
||||
}
|
||||
return reduce(mul_wide(a, a));
|
||||
#elif defined(SECP256K1_NO_ASM)
|
||||
// Generic no-asm fallback
|
||||
return reduce(mul_wide(a, a));
|
||||
|
||||
@ -26,6 +26,11 @@
|
||||
#include <cstring>
|
||||
|
||||
// Architecture detection
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#define SECP256K1_ARM64_TARGET 1
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
|
||||
#define SECP256K1_X86_TARGET 1
|
||||
#ifdef _MSC_VER
|
||||
@ -105,10 +110,19 @@ bool avx512_available() noexcept {
|
||||
#endif
|
||||
}
|
||||
|
||||
// Reports whether the ARMv8 SHA-256 code path was compiled into this build.
// The answer is fixed at compile time: it is true only when the translation
// unit targets AArch64 (SECP256K1_ARM64_TARGET) and the compiler advertises
// the SHA2 extension (__ARM_FEATURE_SHA2). No CPU probing happens here.
bool arm_sha2_available() noexcept {
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
    constexpr bool compiled_with_arm_sha2 = true;
#else
    constexpr bool compiled_with_arm_sha2 = false;
#endif
    return compiled_with_arm_sha2;
}
|
||||
|
||||
HashTier detect_hash_tier() noexcept {
|
||||
// SHA-NI usually coexists with AVX2 on modern CPUs (Zen, Ice Lake+)
|
||||
// SHA-NI single-message is often faster than multi-buffer AVX2 for
|
||||
// sequential work. For batch, AVX2 multi-buffer wins.
|
||||
if (arm_sha2_available()) return HashTier::ARM_SHA2;
|
||||
if (sha_ni_available()) return HashTier::SHA_NI;
|
||||
if (avx2_available()) return HashTier::AVX2;
|
||||
return HashTier::SCALAR;
|
||||
@ -116,6 +130,7 @@ HashTier detect_hash_tier() noexcept {
|
||||
|
||||
const char* hash_tier_name(HashTier tier) noexcept {
|
||||
switch (tier) {
|
||||
case HashTier::ARM_SHA2: return "ARM SHA2";
|
||||
case HashTier::SHA_NI: return "SHA-NI";
|
||||
case HashTier::AVX2: return "AVX2";
|
||||
case HashTier::AVX512: return "AVX-512";
|
||||
@ -392,6 +407,90 @@ void hash160_33(const std::uint8_t* pubkey33, std::uint8_t* out20) noexcept {
|
||||
|
||||
} // namespace scalar
|
||||
|
||||
// ============================================================================
|
||||
// ARMv8 SHA2 -- Hardware-accelerated SHA-256
|
||||
// ============================================================================
|
||||
|
||||
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
|
||||
|
||||
namespace armsha {
|
||||
|
||||
void sha256_compress(const std::uint8_t block[64], std::uint32_t state[8]) noexcept {
|
||||
std::uint32_t w[64];
|
||||
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
w[i] = load_be32(block + static_cast<std::size_t>(i) * 4);
|
||||
}
|
||||
for (int i = 16; i < 64; ++i) {
|
||||
std::uint32_t const s0 = rotr32(w[i - 15], 7) ^ rotr32(w[i - 15], 18) ^ (w[i - 15] >> 3);
|
||||
std::uint32_t const s1 = rotr32(w[i - 2], 17) ^ rotr32(w[i - 2], 19) ^ (w[i - 2] >> 10);
|
||||
w[i] = w[i - 16] + s0 + w[i - 7] + s1;
|
||||
}
|
||||
|
||||
uint32x4_t abcd = vld1q_u32(state + 0);
|
||||
uint32x4_t efgh = vld1q_u32(state + 4);
|
||||
uint32x4_t const abcd_save = abcd;
|
||||
uint32x4_t const efgh_save = efgh;
|
||||
|
||||
for (int i = 0; i < 64; i += 4) {
|
||||
uint32x4_t const msg = vld1q_u32(w + i);
|
||||
uint32x4_t const k = vld1q_u32(SHA256_K + i);
|
||||
uint32x4_t const wk = vaddq_u32(msg, k);
|
||||
abcd = vsha256hq_u32(abcd, efgh, wk);
|
||||
efgh = vsha256h2q_u32(efgh, abcd, wk);
|
||||
}
|
||||
|
||||
abcd = vaddq_u32(abcd, abcd_save);
|
||||
efgh = vaddq_u32(efgh, efgh_save);
|
||||
|
||||
vst1q_u32(state + 0, abcd);
|
||||
vst1q_u32(state + 4, efgh);
|
||||
}
|
||||
|
||||
void sha256_33(const std::uint8_t* pubkey33, std::uint8_t* out32) noexcept {
|
||||
alignas(16) std::uint8_t block[64];
|
||||
std::memcpy(block, pubkey33, 33);
|
||||
block[33] = 0x80;
|
||||
std::memset(block + 34, 0, 22);
|
||||
block[56] = 0; block[57] = 0; block[58] = 0; block[59] = 0;
|
||||
block[60] = 0; block[61] = 0; block[62] = 0x01; block[63] = 0x08;
|
||||
|
||||
std::uint32_t state[8];
|
||||
std::memcpy(state, SHA256_IV, sizeof(state));
|
||||
sha256_compress(block, state);
|
||||
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
store_be32(out32 + static_cast<std::size_t>(i) * 4, state[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void sha256_32(const std::uint8_t* in32, std::uint8_t* out32) noexcept {
|
||||
alignas(16) std::uint8_t block[64];
|
||||
std::memcpy(block, in32, 32);
|
||||
block[32] = 0x80;
|
||||
std::memset(block + 33, 0, 23);
|
||||
block[56] = 0; block[57] = 0; block[58] = 0; block[59] = 0;
|
||||
block[60] = 0; block[61] = 0; block[62] = 0x01; block[63] = 0x00;
|
||||
|
||||
std::uint32_t state[8];
|
||||
std::memcpy(state, SHA256_IV, sizeof(state));
|
||||
sha256_compress(block, state);
|
||||
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
store_be32(out32 + static_cast<std::size_t>(i) * 4, state[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// HASH160 of a 33-byte input: the ARM SHA2 sha256_33() digest is passed
// through scalar::ripemd160_32(), which writes the result to out20.
void hash160_33(const std::uint8_t* pubkey33, std::uint8_t* out20) noexcept {
    std::uint8_t inner_digest[32];
    sha256_33(pubkey33, inner_digest);
    scalar::ripemd160_32(inner_digest, out20);
}
|
||||
|
||||
} // namespace armsha
|
||||
|
||||
#endif // SECP256K1_ARM64_TARGET && __ARM_FEATURE_SHA2
|
||||
|
||||
// ============================================================================
|
||||
// SHA-NI (Intel SHA Extensions) -- Hardware-accelerated SHA-256
|
||||
// ============================================================================
|
||||
@ -616,6 +715,12 @@ std::array<std::uint8_t, 32> sha256(const void* data, std::size_t len) noexcept
|
||||
}
|
||||
|
||||
void sha256_33(const std::uint8_t* pubkey33, std::uint8_t* out32) noexcept {
|
||||
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
|
||||
if (arm_sha2_available()) {
|
||||
armsha::sha256_33(pubkey33, out32);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#ifdef SECP256K1_X86_TARGET
|
||||
if (sha_ni_available()) {
|
||||
shani::sha256_33(pubkey33, out32);
|
||||
@ -626,6 +731,12 @@ void sha256_33(const std::uint8_t* pubkey33, std::uint8_t* out32) noexcept {
|
||||
}
|
||||
|
||||
void sha256_32(const std::uint8_t* in32, std::uint8_t* out32) noexcept {
|
||||
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
|
||||
if (arm_sha2_available()) {
|
||||
armsha::sha256_32(in32, out32);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#ifdef SECP256K1_X86_TARGET
|
||||
if (sha_ni_available()) {
|
||||
shani::sha256_32(in32, out32);
|
||||
@ -714,6 +825,12 @@ std::array<std::uint8_t, 20> hash160(const void* data, std::size_t len) noexcept
|
||||
}
|
||||
|
||||
void hash160_33(const std::uint8_t* pubkey33, std::uint8_t* out20) noexcept {
|
||||
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
|
||||
if (arm_sha2_available()) {
|
||||
armsha::hash160_33(pubkey33, out20);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#ifdef SECP256K1_X86_TARGET
|
||||
if (sha_ni_available()) {
|
||||
shani::hash160_33(pubkey33, out20);
|
||||
@ -775,6 +892,12 @@ namespace secp256k1::detail {
|
||||
|
||||
void sha256_compress_dispatch(const std::uint8_t block[64],
|
||||
std::uint32_t state[8]) noexcept {
|
||||
#if defined(SECP256K1_ARM64_TARGET) && defined(__ARM_FEATURE_SHA2)
|
||||
if (secp256k1::hash::arm_sha2_available()) {
|
||||
secp256k1::hash::armsha::sha256_compress(block, state);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#ifdef SECP256K1_X86_TARGET
|
||||
if (secp256k1::hash::sha_ni_available()) {
|
||||
secp256k1::hash::shani::sha256_compress(block, state);
|
||||
|
||||
@ -212,17 +212,25 @@ CPU-computed data transfers directly to GPU via `cudaMemcpy` (little-endian, sam
|
||||
|
||||
| Operation | Time |
|
||||
|-----------|------|
|
||||
| field_mul (a*b mod p) | 85 ns |
|
||||
| field_sqr (a^2 mod p) | 66 ns |
|
||||
| field_add (a+b mod p) | 18 ns |
|
||||
| field_sub (a-b mod p) | 16 ns |
|
||||
| field_inverse | 2,621 ns |
|
||||
| **fast scalar_mul (k*G)** | **7.6 us** |
|
||||
| fast scalar_mul (k*P) | 77.6 us |
|
||||
| CT scalar_mul (k*G) | 545 us |
|
||||
| ECDH (full CT) | 545 us |
|
||||
| field_mul (a*b mod p) | 68.3 ns |
|
||||
| field_sqr (a^2 mod p) | 50 ns |
|
||||
| field_add (a+b mod p) | 8 ns |
|
||||
| field_inverse | 2 us |
|
||||
| **fast scalar_mul (k*G)** | **15.27 us** |
|
||||
| fast scalar_mul (k*P) | 130.33 us |
|
||||
| ECDSA sign | 22.22 us |
|
||||
| Schnorr sign (precomputed) | 16.67 us |
|
||||
| ECDSA verify | 150.13 us |
|
||||
|
||||
> Backend: ARM64 inline assembly (MUL/UMULH). ~5x faster than generic C++.
|
||||
> Backend: ARM64 inline assembly (MUL/UMULH). Latest rerun kept the ARMv8 SHA2 dispatch win for signing-heavy paths on RK3588.
|
||||
|
||||
### Latest RTX 5060 Ti Refresh
|
||||
|
||||
- CUDA local rerun via `gpu_bench_unified`: `k*G = 129.5 ns` at TPB 256 on batch 65536.
|
||||
- OpenCL retained revalidation: `kG (batch=65536) = 115.1 ns`, `kP (batch=65536) = 263.1 ns`, `kG (kernel) = 98.7 ns`.
|
||||
- CUDA TPB 512 was not retained as a default because the same harness produced invalid CT timings while only marginally improving `k*G`.
|
||||
|
||||
See `../docs/BENCHMARKS.md` for the current cross-platform benchmark matrix and retained-vs-rejected rerun notes.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ using CpuKPlan = secp256k1::fast::KPlan;
|
||||
// ============================================================================
|
||||
// Configuration
|
||||
// ============================================================================
|
||||
static constexpr int BENCH_N = 10000;
|
||||
static constexpr int BENCH_N = 500000;
|
||||
static constexpr int BENCH_WARMUP = 3;
|
||||
static constexpr int BENCH_PASSES = 11;
|
||||
static constexpr int DETAIL_N = 1000;
|
||||
|
||||
@ -82,27 +82,45 @@ build-bench\cpu\bench_unified.exe
|
||||
### 2. ARM64 Android (Cross-compile via NDK)
|
||||
|
||||
Requires:
|
||||
- Android NDK (tested with r27, Clang 18.0.1)
|
||||
- Android NDK (tested with r27.2.12479018, Clang 18.0.3)
|
||||
- Android device/emulator (arm64-v8a)
|
||||
- ADB
|
||||
|
||||
```bash
|
||||
# Configure with NDK toolchain
|
||||
cmake -S . -B build-android -G Ninja \
|
||||
# Configure with the Android CMake entrypoint.
|
||||
# Use a clean Android-only build dir to avoid root/android cache mismatches.
|
||||
cmake -S android -B build-android-ndk-arm64 -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_HOME/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_STL=c++_static \
|
||||
-DANDROID_PLATFORM=android-28
|
||||
|
||||
# Build
|
||||
cmake --build build-android --target bench_hornet -j
|
||||
cmake --build build-android-ndk-arm64 --target bench_hornet -j
|
||||
|
||||
# Deploy and run
|
||||
adb push build-android/android/test/bench_hornet /data/local/tmp/
|
||||
adb shell chmod +x /data/local/tmp/bench_hornet
|
||||
adb shell /data/local/tmp/bench_hornet
|
||||
adb shell 'mkdir -p /data/local/tmp/ufsecp'
|
||||
adb push build-android-ndk-arm64/bench_hornet /data/local/tmp/ufsecp/bench_hornet
|
||||
adb shell 'chmod 755 /data/local/tmp/ufsecp/bench_hornet && /data/local/tmp/ufsecp/bench_hornet'
|
||||
```
|
||||
|
||||
Measured Android rerun retained the ARMv8 SHA2 dispatch path in `cpu/src/hash_accel.cpp`.
|
||||
On RK3588 big cores this moved the signing-heavy hot path materially while leaving verify
|
||||
and point arithmetic essentially flat:
|
||||
|
||||
| Operation | Baseline | With ARM SHA2 dispatch | Delta |
|
||||
|-----------|----------|------------------------|-------|
|
||||
| ECDSA Sign | 25.89 us | 22.22 us | 1.17x faster |
|
||||
| Schnorr Sign (precomputed) | 17.73 us | 16.67 us | 1.06x faster |
|
||||
| Schnorr Sign (raw privkey) | 33.01 us | 31.99 us | 1.03x faster |
|
||||
| CT ECDSA Sign | 70.50 us | 67.11 us | 1.05x faster |
|
||||
| CT Schnorr Sign | 59.87 us | 59.10 us | 1.01x faster |
|
||||
|
||||
Rejected Android ARM64 experiments from the same campaign: forcing `SECP256K1_USE_4X64_POINT_OPS`,
|
||||
changing `SECP256K1_GLV_WINDOW_WIDTH` to 4 or 6, and using default PGO as the shipped path.
|
||||
Those variants did not beat the retained source-level SHA2 dispatch win on the connected RK3588 device.
|
||||
|
||||
### 3. RISC-V 64 (Cross-compile for Milk-V Mars / SiFive U74)
|
||||
|
||||
Requires:
|
||||
|
||||
@ -11,14 +11,18 @@ Benchmark results for UltrafastSecp256k1 across all supported platforms.
|
||||
| **x86-64 (i5-14400F, Clang 19)** | **12.8 ns** | **6.7 us** | **17.6 us** | **21.3 us** | **24.3 us** | **1.09x** |
|
||||
| x86-64 (Clang 21, Win) | 17 ns (5x52) | 5 us | 25 us | -- | -- | -- |
|
||||
| RISC-V 64 (SiFive U74, Clang 21) | 176 ns | 40.2 us | 150.5 us | **181.8 us** | -- | **1.13x** |
|
||||
| ARM64 (RK3588, A76) | 74 ns | 14 us | 131 us | -- | -- | -- |
|
||||
| ARM64 (RK3588, A76, Android NDK r27.2) | 68.3 ns | 15.27 us | 130.33 us | **150.13 us** | -- | -- |
|
||||
| ESP32-S3 (LX7, 240 MHz) | 7,458 ns | 2,483 us | -- | -- | -- | -- |
|
||||
| ESP32 (LX6, 240 MHz) | 6,993 ns | 6,203 us | -- | -- | -- | -- |
|
||||
| STM32F103 (CM3, 72 MHz) | 15,331 ns | 37,982 us | -- | -- | -- | -- |
|
||||
| CUDA (RTX 5060 Ti) | 0.2 ns | 217.7 ns | 225.8 ns | -- | **263.7 ns** | -- |
|
||||
| OpenCL (RTX 5060 Ti) | 0.2 ns | 295.1 ns | -- | -- | -- | -- |
|
||||
| CUDA (RTX 5060 Ti) | 0.2 ns | 129.5 ns | 225.8 ns | -- | **263.7 ns** | -- |
|
||||
| OpenCL (RTX 5060 Ti) | 0.2 ns | 115.1 ns | 263.1 ns | -- | -- | -- |
|
||||
| Metal (Apple M3 Pro) | 1.9 ns | 3.00 us | 2.94 us | -- | -- | -- |
|
||||
|
||||
GPU rows use the latest retained local rerun per backend. For OpenCL, the public
|
||||
GPU C ABI still covers 4 of the 6 first-wave operations; the missing two are
|
||||
batch ECDSA verify and batch Schnorr verify.
|
||||
|
||||
---
|
||||
|
||||
## Real-World Flow Coverage
|
||||
@ -56,6 +60,46 @@ These values are mainly intended as workflow reference points. For publishable
|
||||
cross-machine comparisons, use the full pinned benchmark methodology and JSON
|
||||
artifacts from `bench_unified`.
|
||||
|
||||
### x86-64 Batch Verify Rerun (2026-03-17)
|
||||
|
||||
A retained low-risk x86 CPU improvement was keeping the Schnorr batch pubkey cache
|
||||
capacity aligned with the full batch size in `cpu/src/batch_verify.cpp` instead of
|
||||
clamping reserve capacity to 64 entries. This eliminates unnecessary vector reallocations
|
||||
when uncached batches grow beyond 64 signatures.
|
||||
|
||||
Quick reruns on the local i5-14400F validation machine showed the improvement on the
|
||||
uncached Schnorr path while preserving correctness (`ctest -R 'comprehensive|multiscalar'` PASS):
|
||||
|
||||
| Operation | Before | After | Delta |
|
||||
|-----------|--------|-------|-------|
|
||||
| Schnorr batch verify N=128 | 20.27 us/sig | 19.94-20.06 us/sig | up to 1.6% faster |
|
||||
| Schnorr batch verify N=192 | 18.56 us/sig | 18.01-18.45 us/sig | up to 3.0% faster |
|
||||
|
||||
This change does not materially affect the cached-path benchmark; the measured win is specific to
|
||||
the uncached parse-and-resolve flow for larger Schnorr batches.
|
||||
|
||||
### Cross-Platform Refresh Status (2026-03-18)
|
||||
|
||||
Recent retained reruns and validation passes across the active optimization campaign:
|
||||
|
||||
| Platform | Latest validated result | Status |
|
||||
|----------|-------------------------|--------|
|
||||
| x86-64 / Linux | Schnorr batch verify `N=128`: 19.94-20.06 us/sig, `N=192`: 18.01-18.45 us/sig | Retained low-risk pubkey-cache reserve improvement |
|
||||
| Android ARM64 / RK3588 | ECDSA Sign 22.22 us, Schnorr Sign (precomputed) 16.67 us, CT ECDSA Sign 67.11 us | Retained ARMv8 SHA2 dispatch win |
|
||||
| OpenCL / RTX 5060 Ti | `kG (batch=65536)` 115.1 ns, `kP (batch=65536)` 263.1 ns, `kG (kernel)` 98.7 ns | Revalidated retained tuning; `opencl_test` and `opencl_audit_runner` passed |
|
||||
| CUDA / RTX 5060 Ti | `k*G` 129.5 ns at TPB 256; TPB 512 reached 128.5 ns but CT rows became invalid in the same harness | No safe global retune retained yet |
|
||||
| RISC-V / Milk-V Mars | Latest native rerun remains the 2026-03-07 Mars baseline below | Current local environment has a toolchain but no runnable board or emulator path |
|
||||
|
||||
This page keeps the last trustworthy result per platform. When a rerun only proves that an
|
||||
experiment is unstable or not worth shipping, it is recorded here but not promoted as a retained
|
||||
default.
|
||||
|
||||
OpenCL's current 4/6 C ABI status refers specifically to the generic GPU host ABI in
|
||||
`ufsecp_gpu.h`: `generator_mul_batch`, `ecdh_batch`, `hash160_pubkey_batch`, and
|
||||
`msm` are implemented on the OpenCL backend, while `ecdsa_verify_batch` and
|
||||
`schnorr_verify_batch` currently return `UFSECP_ERR_GPU_UNSUPPORTED` until the
|
||||
extended verify kernels are promoted into the backend bridge.
|
||||
|
||||
---
|
||||
|
||||
## x86-64 Benchmarks
|
||||
@ -229,6 +273,20 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
|
||||
| Range Prove (64-bit) | 3,711,570 ns | 0.27 k/s | Bulletproof, CT path, batch 256 |
|
||||
| Range Verify (64-bit) | 764,649 ns | 1.3 k/s | Full IPA verification, batch 256 |
|
||||
|
||||
### CUDA Launch-Width Triage (2026-03-18)
|
||||
|
||||
The latest local rerun on the RTX 5060 Ti used `gpu_bench_unified` to check whether a global block-size
|
||||
retune should replace the current default. The answer was no: there is not yet a safe retained win.
|
||||
|
||||
| TPB | k*G (generator) | CT k*G | CT k*P | Verdict |
|
||||
|-----|-----------------|--------|--------|---------|
|
||||
| 256 | 129.5 ns | 98.7 ns | 162.8 ns | Stable reference rerun |
|
||||
| 512 | 128.5 ns | invalid (`0.0 ns`) | invalid (`0.1 ns`) | Rejected; CT timing became unstable |
|
||||
|
||||
The `512`-thread launch showed only a marginal `k*G` gain, while the same harness produced invalid
|
||||
constant-time timings. Until the CT timing methodology is tightened, no global CUDA TPB default change
|
||||
is retained from this sweep.
|
||||
|
||||
**GPU vs CPU ZK Speedup (single-core throughput):**
|
||||
|
||||
| Operation | CPU (i5-14400F) | GPU (RTX 5060 Ti) | GPU/CPU Speedup |
|
||||
@ -249,6 +307,20 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
|
||||
**OpenCL:** 3.0 CUDA, Driver 580.126.09
|
||||
**Build:** Clang 19, Release, -O3, PTX inline assembly
|
||||
|
||||
### OpenCL GPU C ABI Coverage (2026-03-18)
|
||||
|
||||
| C ABI operation | OpenCL status | Notes |
|
||||
|-----------------|---------------|-------|
|
||||
| `ufsecp_gpu_generator_mul_batch` | Implemented | Uses `batch_scalar_mul_generator` + `batch_jacobian_to_affine` |
|
||||
| `ufsecp_gpu_ecdsa_verify_batch` | Missing | Returns `UFSECP_ERR_GPU_UNSUPPORTED` |
|
||||
| `ufsecp_gpu_schnorr_verify_batch` | Missing | Returns `UFSECP_ERR_GPU_UNSUPPORTED` |
|
||||
| `ufsecp_gpu_ecdh_batch` | Implemented | GPU scalar mul, CPU SHA-256 finalization |
|
||||
| `ufsecp_gpu_hash160_pubkey_batch` | Implemented | Public-data batch hashing |
|
||||
| `ufsecp_gpu_msm` | Implemented | GPU scalar mul + CPU-side affine reduction |
|
||||
|
||||
The missing OpenCL pieces are therefore the two batch verify paths. Core ECC,
|
||||
ECDH, Hash160, and MSM are already wired through the backend-neutral C ABI.
|
||||
|
||||
### Kernel-Only Timing (no buffer alloc/copy overhead)
|
||||
|
||||
| Operation | Time/Op | Throughput | Notes |
|
||||
@ -260,7 +332,8 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
|
||||
| Field Inv | 14.3 ns | 69.97 M/s | batch 1M |
|
||||
| Point Double | 0.9 ns | 1,139 M/s | batch 256K |
|
||||
| Point Add | 1.6 ns | 630.6 M/s | batch 256K |
|
||||
| kG (kernel) | 295.1 ns | 3.39 M/s | batch 256K |
|
||||
| kG (kernel) | 98.7 ns | 10.13 M/s | batch 65K |
|
||||
| kP (kernel) | 238.1 ns | 4.20 M/s | batch 65K |
|
||||
|
||||
### End-to-End Timing (including buffer transfers)
|
||||
|
||||
@ -271,8 +344,10 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
|
||||
| Field Inv | 29.0 ns | 34.43 M/s | batch 1M |
|
||||
| Point Double | 58.4 ns | 17.11 M/s | batch 1M |
|
||||
| Point Add | 111.9 ns | 8.94 M/s | batch 1M |
|
||||
| kG (batch=65K) | 307.7 ns | 3.25 M/s | |
|
||||
| kG (batch=16K) | 311.6 ns | 3.21 M/s | |
|
||||
| kG (batch=65536) | 115.1 ns | 8.69 M/s | retained 2026-03-17 revalidation |
|
||||
| kP (batch=65536) | 263.1 ns | 3.80 M/s | retained 2026-03-17 revalidation |
|
||||
| kP upload | 6.7 ns | 149.25 M/s | host-to-device transfer slice |
|
||||
| kP readback | 12.4 ns | 80.65 M/s | device-to-host transfer slice |
|
||||
|
||||
### CUDA / OpenCL Configuration
|
||||
|
||||
@ -291,7 +366,7 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
|
||||
| Field Inv | 10.2 ns | 14.3 ns | **CUDA 1.40x** |
|
||||
| Point Double | 0.8 ns | 0.9 ns | CUDA 1.13x |
|
||||
| Point Add | 1.6 ns | 1.6 ns | Tie |
|
||||
| Scalar Mul (kG) | 217.7 ns | 295.1 ns | **CUDA 1.36x** |
|
||||
| Scalar Mul (kG) | 129.5 ns | 98.7 ns | **OpenCL 1.31x** |
|
||||
| ECDSA Sign | 204.8 ns | -- | CUDA only |
|
||||
| ECDSA Verify | 410.1 ns | -- | CUDA only |
|
||||
| Schnorr Sign | 273.4 ns | -- | CUDA only |
|
||||
@ -301,6 +376,11 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
|
||||
| DLEQ Prove | 675.4 ns | -- | CUDA only |
|
||||
| DLEQ Verify | 1,912.0 ns | -- | CUDA only |
|
||||
|
||||
`kG` above uses the latest retained local reruns on the same RTX 5060 Ti host:
|
||||
CUDA `gpu_bench_unified` at TPB 256 (`129.5 ns`) and OpenCL `opencl_benchmark`
|
||||
kernel timing (`98.7 ns`). CUDA still leads on verify and ZK because those paths
|
||||
are not yet exposed on OpenCL.
|
||||
|
||||
---
|
||||
|
||||
## Apple Metal Benchmarks
|
||||
@ -353,30 +433,49 @@ Summary: `53/54 modules passed -- ALL PASSED (1 advisory warnings)`.
|
||||
|
||||
**Hardware:** RK3588 (Cortex-A76 @ 2.256 GHz, pinned to big cores)
|
||||
**OS:** Android
|
||||
**Compiler:** NDK r26, Clang 17.0.2
|
||||
**Compiler:** NDK r27.2.12479018, Clang 18.0.3
|
||||
**Assembly:** ARM64 inline (MUL/UMULH)
|
||||
**Field:** 10x26 (optimal for ARM64)
|
||||
|
||||
| Operation | Time | Notes |
|
||||
|-----------|------|-------|
|
||||
| Field Mul | 74 ns | ARM64 MUL/UMULH, 10x26 |
|
||||
| Field Mul | 68.3 ns | ARM64 MUL/UMULH, 10x26 |
|
||||
| Field Square | 50 ns | |
|
||||
| Field Add | 8 ns | |
|
||||
| Field Negate | 18 ns | |
|
||||
| Field Inverse | 2 us | Fermat's little theorem |
|
||||
| Point Add | 992 ns | Jacobian coordinates |
|
||||
| Point Double | 548 ns | |
|
||||
| Generator Mul (kxG) | 14 us | Precomputed tables |
|
||||
| Scalar Mul (kxP) | 131 us | GLV + wNAF |
|
||||
| ECDSA Sign | 30 us | RFC 6979 |
|
||||
| ECDSA Verify | 153 us | Shamir + GLV |
|
||||
| Schnorr Sign (BIP-340) | 38 us | |
|
||||
| Schnorr Verify (BIP-340) | 173 us | |
|
||||
| Generator Mul (kxG) | 15.27 us | Precomputed tables |
|
||||
| Scalar Mul (kxP) | 130.33 us | GLV + wNAF |
|
||||
| ECDSA Sign | 22.22 us | ARMv8 SHA2 dispatch retained |
|
||||
| ECDSA Verify | 150.13 us | Shamir + GLV |
|
||||
| Schnorr Sign (BIP-340) | 16.67 us | Precomputed keypair path |
|
||||
| Schnorr Verify (BIP-340) | 153.63 us | Raw pubkey path is similar |
|
||||
| Batch Inverse (n=100) | 265 ns/elem | Montgomery's trick |
|
||||
| Batch Inverse (n=1000) | 240 ns/elem | |
|
||||
|
||||
ARM64 10x26 representation with MUL/UMULH assembly provides optimal field arithmetic performance.
|
||||
|
||||
### Android ARM64 Optimization Rerun (2026-03-17)
|
||||
|
||||
This rerun used the connected RK3588 Android device and `android/test/bench_hornet_android.cpp`
|
||||
as the benchmark truth source. The retained code change was enabling the existing ARMv8 SHA-256
|
||||
instruction path in `hash_accel.cpp` for `sha256_33`, `sha256_32`, `hash160_33`, and
|
||||
`sha256_compress_dispatch`.
|
||||
|
||||
| Operation | Baseline | Retained result | Delta |
|
||||
|-----------|----------|-----------------|-------|
|
||||
| ECDSA Sign | 25.89 us | 22.22 us | 14.2% faster |
|
||||
| Schnorr Sign (precomputed) | 17.73 us | 16.67 us | 6.0% faster |
|
||||
| Schnorr Sign (raw privkey) | 33.01 us | 31.99 us | 3.1% faster |
|
||||
| CT ECDSA Sign | 70.50 us | 67.11 us | 4.8% faster |
|
||||
| CT Schnorr Sign | 59.87 us | 59.10 us | 1.3% faster |
|
||||
|
||||
No meaningful win was found from forcing `SECP256K1_USE_4X64_POINT_OPS`, from changing
|
||||
`SECP256K1_GLV_WINDOW_WIDTH` to 4 or 6, or from keeping PGO as the default Android path.
|
||||
Those variants were measured and rejected.
|
||||
|
||||
---
|
||||
|
||||
## ESP32-S3 Benchmarks (Embedded)
|
||||
|
||||
@ -10,7 +10,7 @@
|
||||
|----------|-------------|
|
||||
| [API Reference](API_REFERENCE.md) | Complete CPU + CUDA + WASM function reference |
|
||||
| [Building](BUILDING.md) | Build instructions for all 10+ platforms |
|
||||
| [Benchmarks](BENCHMARKS.md) | Performance data: CPU, GPU, embedded, mobile |
|
||||
| [Benchmarks](BENCHMARKS.md) | Performance data: CPU, GPU, embedded, mobile, including the 2026-03 x86, Android, CUDA, and OpenCL refresh |
|
||||
| [ESP32 Setup](ESP32_SETUP.md) | ESP32-S3/PICO-D4 flashing & testing guide |
|
||||
| [RISC-V Optimizations](../RISCV_OPTIMIZATIONS.md) | RISC-V assembly & RVV details |
|
||||
| [Porting Guide](../PORTING.md) | Add new platforms, architectures, GPU backends |
|
||||
|
||||
@ -58,17 +58,24 @@ cd libs\UltrafastSecp256k1\android\
|
||||
### Build (Manual CMake)
|
||||
|
||||
```bash
|
||||
cmake -S android -B android/build-android-arm64 \
|
||||
cmake -S android -B build-android-ndk-arm64 \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_HOME/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-24 \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DANDROID_STL=c++_static \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-G Ninja
|
||||
|
||||
cmake --build android/build-android-arm64 -j
|
||||
cmake --build build-android-ndk-arm64 --target bench_hornet -j
|
||||
|
||||
adb shell 'mkdir -p /data/local/tmp/ufsecp'
|
||||
adb push build-android-ndk-arm64/bench_hornet /data/local/tmp/ufsecp/bench_hornet
|
||||
adb shell 'chmod 755 /data/local/tmp/ufsecp/bench_hornet && /data/local/tmp/ufsecp/bench_hornet'
|
||||
```
|
||||
|
||||
Use a clean Android-only build directory. Reusing a build directory first configured from the
|
||||
repository root can trigger a CMake source/cache mismatch when switching to `android/` as the source tree.
|
||||
|
||||
### Output
|
||||
|
||||
```
|
||||
@ -217,6 +224,21 @@ NDK Clang additionally uses:
|
||||
|
||||
\* CT mode uses generic C++ (for constant-time guarantees)
|
||||
|
||||
### Android ARM64 rerun retained on-device SHA2 dispatch
|
||||
|
||||
Measured on the connected RK3588 Android device with `bench_hornet` after wiring the ARMv8 SHA2
|
||||
path into `hash_accel.cpp` hot wrappers:
|
||||
|
||||
| Operation | Baseline | Retained result | Delta |
|
||||
|-----------|----------|-----------------|-------|
|
||||
| ECDSA sign | 25.89 us | 22.22 us | 14.2% faster |
|
||||
| Schnorr sign (precomputed) | 17.73 us | 16.67 us | 6.0% faster |
|
||||
| Schnorr sign (raw privkey) | 33.01 us | 31.99 us | 3.1% faster |
|
||||
| CT ECDSA sign | 70.50 us | 67.11 us | 4.8% faster |
|
||||
|
||||
The same rerun rejected forced 4x64 point ops, GLV window retuning, and keeping Android PGO as the
|
||||
default path because they did not outperform the retained SHA2 dispatch result on this device.
|
||||
|
||||
### ARMv7 (32-bit) Limitations
|
||||
|
||||
- No `__int128` -> `SECP256K1_NO_INT128` fallback (portable 64x64->128)
|
||||
|
||||
649
fix_alerts.py
Normal file
649
fix_alerts.py
Normal file
@ -0,0 +1,649 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Apply all readability-braces and misc-const-correctness fixes to ufsecp_impl.cpp"""
|
||||
|
||||
import sys
|
||||
|
||||
PATH = "include/ufsecp/ufsecp_impl.cpp"
|
||||
|
||||
# Each entry: (old_string, new_string)
|
||||
REPLACEMENTS = [
|
||||
# L1380: if (!ok)
|
||||
(
|
||||
" if (!ok)\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid mnemonic\");",
|
||||
" if (!ok) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid mnemonic\");\n"
|
||||
" }",
|
||||
),
|
||||
# L1382: if (*entropy_len < ent.length)
|
||||
(
|
||||
" if (*entropy_len < ent.length)\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"entropy buffer too small\");",
|
||||
" if (*entropy_len < ent.length) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"entropy buffer too small\");\n"
|
||||
" }",
|
||||
),
|
||||
# L1404: if (!FE::parse_bytes_strict(e, pk_fe)) in ufsecp_schnorr_batch_verify
|
||||
(
|
||||
" for (size_t i = 0; i < n; ++i) {\n"
|
||||
" const uint8_t* e = entries + i * 128;\n"
|
||||
" // Strict: reject x-only pubkey >= p at ABI gate\n"
|
||||
" FE pk_fe;\n"
|
||||
" if (!FE::parse_bytes_strict(e, pk_fe))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
|
||||
" std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
|
||||
" std::memcpy(batch[i].message.data(), e + 32, 32);\n"
|
||||
" if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");",
|
||||
" for (size_t i = 0; i < n; ++i) {\n"
|
||||
" const uint8_t* e = entries + i * 128;\n"
|
||||
" // Strict: reject x-only pubkey >= p at ABI gate\n"
|
||||
" FE pk_fe;\n"
|
||||
" if (!FE::parse_bytes_strict(e, pk_fe)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
|
||||
" }\n"
|
||||
" std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
|
||||
" std::memcpy(batch[i].message.data(), e + 32, 32);\n"
|
||||
" if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");\n"
|
||||
" }",
|
||||
),
|
||||
# L1448: if (!FE::parse_bytes_strict(e, pk_fe)) in ufsecp_schnorr_batch_identify_invalid
|
||||
(
|
||||
" for (size_t i = 0; i < n; ++i) {\n"
|
||||
" const uint8_t* e = entries + i * 128;\n"
|
||||
" FE pk_fe;\n"
|
||||
" if (!FE::parse_bytes_strict(e, pk_fe))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
|
||||
" std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
|
||||
" std::memcpy(batch[i].message.data(), e + 32, 32);\n"
|
||||
" if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");",
|
||||
" for (size_t i = 0; i < n; ++i) {\n"
|
||||
" const uint8_t* e = entries + i * 128;\n"
|
||||
" FE pk_fe;\n"
|
||||
" if (!FE::parse_bytes_strict(e, pk_fe)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p) in batch\");\n"
|
||||
" }\n"
|
||||
" std::memcpy(batch[i].pubkey_x.data(), e, 32);\n"
|
||||
" std::memcpy(batch[i].message.data(), e + 32, 32);\n"
|
||||
" if (!secp256k1::SchnorrSignature::parse_strict(e + 64, batch[i].signature)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid Schnorr sig in batch\");\n"
|
||||
" }",
|
||||
),
|
||||
# L1661: widening + braces — in ufsecp_musig2_start_sign_session
|
||||
(
|
||||
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
|
||||
" Scalar s;\n"
|
||||
" if (!scalar_parse_strict(keyagg + 38 + i * 32, s))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient in keyagg\");\n"
|
||||
" kagg.key_coefficients.push_back(s);\n"
|
||||
" }",
|
||||
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
|
||||
" Scalar s;\n"
|
||||
" if (!scalar_parse_strict(keyagg + 38 + static_cast<size_t>(i) * 32, s)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient in keyagg\");\n"
|
||||
" }\n"
|
||||
" kagg.key_coefficients.push_back(s);\n"
|
||||
" }",
|
||||
),
|
||||
# L1707: widening + braces — inside { } block in ufsecp_musig2_partial_sign
|
||||
(
|
||||
" auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
|
||||
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
|
||||
" Scalar s; if (!scalar_parse_strict(keyagg + 38 + i * 32, s))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
|
||||
" kagg.key_coefficients.push_back(s); } }\n"
|
||||
" secp256k1::MuSig2Session sess;",
|
||||
" auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
|
||||
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
|
||||
" Scalar s;\n"
|
||||
" if (!scalar_parse_strict(keyagg + 38 + static_cast<size_t>(i) * 32, s)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
|
||||
" }\n"
|
||||
" kagg.key_coefficients.push_back(s);\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" secp256k1::MuSig2Session sess;",
|
||||
),
|
||||
# L1715: if (!scalar_parse_strict(session + 33, sess.b)) — in ufsecp_musig2_partial_sign
|
||||
(
|
||||
" if (!scalar_parse_strict(session + 33, sess.b))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
|
||||
" if (!scalar_parse_strict(session + 65, sess.e))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
|
||||
" sess.R_negated = (session[97] != 0);\n"
|
||||
" auto psig = secp256k1::musig2_partial_sign",
|
||||
" if (!scalar_parse_strict(session + 33, sess.b)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
|
||||
" }\n"
|
||||
" if (!scalar_parse_strict(session + 65, sess.e)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
|
||||
" }\n"
|
||||
" sess.R_negated = (session[97] != 0);\n"
|
||||
" auto psig = secp256k1::musig2_partial_sign",
|
||||
),
|
||||
# L1756: widening + braces — inside { } block in ufsecp_musig2_partial_verify
|
||||
(
|
||||
" auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
|
||||
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
|
||||
" Scalar s; if (!scalar_parse_strict(keyagg + 38 + i * 32, s))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
|
||||
" kagg.key_coefficients.push_back(s); } }\n"
|
||||
" secp256k1::MuSig2Session sess;\n"
|
||||
" sess.R = point_from_compressed(session);\n"
|
||||
" if (sess.R.is_infinity()) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session R point\");\n"
|
||||
" }\n"
|
||||
" if (!scalar_parse_strict(session + 33, sess.b))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
|
||||
" if (!scalar_parse_strict(session + 65, sess.e))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
|
||||
" sess.R_negated = (session[97] != 0);\n"
|
||||
" if (!secp256k1::musig2_partial_verify",
|
||||
" auto qc = kagg.Q.to_compressed(); std::memcpy(kagg.Q_x.data(), qc.data() + 1, 32);\n"
|
||||
" for (uint32_t i = 0; i < nk && (38u + (i+1)*32u <= UFSECP_MUSIG2_KEYAGG_LEN); ++i) {\n"
|
||||
" Scalar s;\n"
|
||||
" if (!scalar_parse_strict(keyagg + 38 + static_cast<size_t>(i) * 32, s)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid key coefficient\");\n"
|
||||
" }\n"
|
||||
" kagg.key_coefficients.push_back(s);\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" secp256k1::MuSig2Session sess;\n"
|
||||
" sess.R = point_from_compressed(session);\n"
|
||||
" if (sess.R.is_infinity()) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session R point\");\n"
|
||||
" }\n"
|
||||
" if (!scalar_parse_strict(session + 33, sess.b)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
|
||||
" }\n"
|
||||
" if (!scalar_parse_strict(session + 65, sess.e)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
|
||||
" }\n"
|
||||
" sess.R_negated = (session[97] != 0);\n"
|
||||
" if (!secp256k1::musig2_partial_verify",
|
||||
),
|
||||
# L1791+L1793: in ufsecp_musig2_partial_sig_agg
|
||||
(
|
||||
" if (!scalar_parse_strict(session + 33, sess.b))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
|
||||
" if (!scalar_parse_strict(session + 65, sess.e))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
|
||||
" sess.R_negated = (session[97] != 0);\n"
|
||||
" auto final_sig",
|
||||
" if (!scalar_parse_strict(session + 33, sess.b)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar b\");\n"
|
||||
" }\n"
|
||||
" if (!scalar_parse_strict(session + 65, sess.e)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid session scalar e\");\n"
|
||||
" }\n"
|
||||
" sess.R_negated = (session[97] != 0);\n"
|
||||
" auto final_sig",
|
||||
),
|
||||
# L1822+L1823: const for coeff_count and needed_commits
|
||||
(
|
||||
" size_t coeff_count = commit.coeffs.size();\n"
|
||||
" size_t needed_commits = 8 + coeff_count * 33;",
|
||||
" const size_t coeff_count = commit.coeffs.size();\n"
|
||||
" const size_t needed_commits = 8 + coeff_count * 33;",
|
||||
),
|
||||
# L1845: for (auto& s : shares) — erase in ufsecp_frost_keygen_begin
|
||||
(
|
||||
" // Erase secret shares from memory\n"
|
||||
" for (auto& s : shares)\n"
|
||||
" secp256k1::detail::secure_erase(&s.value, sizeof(s.value));\n"
|
||||
" return UFSECP_OK;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"ufsecp_error_t ufsecp_frost_keygen_finalize(",
|
||||
" // Erase secret shares from memory\n"
|
||||
" for (auto& s : shares) {\n"
|
||||
" secp256k1::detail::secure_erase(&s.value, sizeof(s.value));\n"
|
||||
" }\n"
|
||||
" return UFSECP_OK;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"ufsecp_error_t ufsecp_frost_keygen_finalize(",
|
||||
),
|
||||
# L1864: uint32_t cc; — init-variables
|
||||
(
|
||||
" secp256k1::FrostCommitment fc;\n"
|
||||
" uint32_t cc;\n"
|
||||
" if (pos + 8 > commits_len)\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit header\");\n"
|
||||
" std::memcpy(&cc, all_commits + pos, 4); pos += 4;\n"
|
||||
" std::memcpy(&fc.from, all_commits + pos, 4); pos += 4;\n"
|
||||
" if (pos + static_cast<size_t>(cc) * 33 > commits_len)\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit coefficients\");\n"
|
||||
" for (uint32_t j = 0; j < cc; ++j) {\n"
|
||||
" auto pt = point_from_compressed(all_commits + pos);",
|
||||
" secp256k1::FrostCommitment fc;\n"
|
||||
" uint32_t cc = 0;\n"
|
||||
" if (pos + 8 > commits_len) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit header\");\n"
|
||||
" }\n"
|
||||
" std::memcpy(&cc, all_commits + pos, 4); pos += 4;\n"
|
||||
" std::memcpy(&fc.from, all_commits + pos, 4); pos += 4;\n"
|
||||
" if (pos + static_cast<size_t>(cc) * 33 > commits_len) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"truncated commit coefficients\");\n"
|
||||
" }\n"
|
||||
" for (uint32_t j = 0; j < cc; ++j) {\n"
|
||||
" auto pt = point_from_compressed(all_commits + pos);",
|
||||
),
|
||||
# L1889: if (!scalar_parse_strict(s + 4, v)) in ufsecp_frost_keygen_finalize
|
||||
(
|
||||
" if (!scalar_parse_strict(s + 4, v))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid share scalar\");",
|
||||
" if (!scalar_parse_strict(s + 4, v)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid share scalar\");\n"
|
||||
" }",
|
||||
),
|
||||
# L1895+L1898: if (!ok) + for (auto& s : shares) — erase in ufsecp_frost_keygen_finalize
|
||||
(
|
||||
" if (!ok)\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"FROST keygen finalize failed\");\n"
|
||||
" // Erase secret shares\n"
|
||||
" for (auto& s : shares)\n"
|
||||
" secp256k1::detail::secure_erase(&s.value, sizeof(s.value));",
|
||||
" if (!ok) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"FROST keygen finalize failed\");\n"
|
||||
" }\n"
|
||||
" // Erase secret shares\n"
|
||||
" for (auto& s : shares) {\n"
|
||||
" secp256k1::detail::secure_erase(&s.value, sizeof(s.value));\n"
|
||||
" }",
|
||||
),
|
||||
# L1955: if (!scalar_parse_strict(keypkg + 12, kp.signing_share))
|
||||
(
|
||||
" if (!scalar_parse_strict(keypkg + 12, kp.signing_share))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"invalid signing share in keypkg\");",
|
||||
" if (!scalar_parse_strict(keypkg + 12, kp.signing_share)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"invalid signing share in keypkg\");\n"
|
||||
" }",
|
||||
),
|
||||
# L1967+L1969: if (!scalar_parse_strict(nonce, h)) + if (!scalar_parse_strict(nonce + 32, b))
|
||||
(
|
||||
" if (!scalar_parse_strict(nonce, h))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid hiding nonce\");\n"
|
||||
" if (!scalar_parse_strict(nonce + 32, b))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid binding nonce\");",
|
||||
" if (!scalar_parse_strict(nonce, h)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid hiding nonce\");\n"
|
||||
" }\n"
|
||||
" if (!scalar_parse_strict(nonce + 32, b)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"invalid binding nonce\");\n"
|
||||
" }",
|
||||
),
|
||||
# L2006+L2012: multi-line if null check + scalar parse in ufsecp_frost_verify_partial
|
||||
(
|
||||
" if (!ctx || !partial_sig || !verification_share33 || !nonce_commits || !msg32 || !group_pubkey33)\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
" secp256k1::FrostPartialSig psig;\n"
|
||||
" std::memcpy(&psig.id, partial_sig, 4);\n"
|
||||
" Scalar z;\n"
|
||||
" if (!scalar_parse_strict(partial_sig + 4, z))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");",
|
||||
" if (!ctx || !partial_sig || !verification_share33 || !nonce_commits || !msg32 || !group_pubkey33) {\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" }\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
" secp256k1::FrostPartialSig psig;\n"
|
||||
" std::memcpy(&psig.id, partial_sig, 4);\n"
|
||||
" Scalar z;\n"
|
||||
" if (!scalar_parse_strict(partial_sig + 4, z)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");\n"
|
||||
" }",
|
||||
),
|
||||
# L2057+L2065: multi-line if null check + scalar parse in ufsecp_frost_aggregate
|
||||
(
|
||||
" if (!ctx || !partial_sigs || !nonce_commits || !group_pubkey33 || !msg32 || !sig64_out)\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
" std::vector<secp256k1::FrostPartialSig> psigs(n);\n"
|
||||
" for (size_t i = 0; i < n; ++i) {\n"
|
||||
" const uint8_t* ps = partial_sigs + i * 36;\n"
|
||||
" std::memcpy(&psigs[i].id, ps, 4);\n"
|
||||
" Scalar z;\n"
|
||||
" if (!scalar_parse_strict(ps + 4, z))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");",
|
||||
" if (!ctx || !partial_sigs || !nonce_commits || !group_pubkey33 || !msg32 || !sig64_out) {\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" }\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
" std::vector<secp256k1::FrostPartialSig> psigs(n);\n"
|
||||
" for (size_t i = 0; i < n; ++i) {\n"
|
||||
" const uint8_t* ps = partial_sigs + i * 36;\n"
|
||||
" std::memcpy(&psigs[i].id, ps, 4);\n"
|
||||
" Scalar z;\n"
|
||||
" if (!scalar_parse_strict(ps + 4, z)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid partial sig scalar\");\n"
|
||||
" }",
|
||||
),
|
||||
# L2144+L2150: in ufsecp_schnorr_adaptor_verify
|
||||
(
|
||||
" if (!scalar_parse_strict(pre_sig + 33, shat))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
|
||||
" as.s_hat = shat;\n"
|
||||
" as.needs_negation = (pre_sig[65] != 0);\n"
|
||||
" // Strict: reject x-only pubkey >= p at ABI gate\n"
|
||||
" FE pk_fe;\n"
|
||||
" if (!FE::parse_bytes_strict(pubkey_x, pk_fe))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p)\");",
|
||||
" if (!scalar_parse_strict(pre_sig + 33, shat)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
|
||||
" }\n"
|
||||
" as.s_hat = shat;\n"
|
||||
" as.needs_negation = (pre_sig[65] != 0);\n"
|
||||
" // Strict: reject x-only pubkey >= p at ABI gate\n"
|
||||
" FE pk_fe;\n"
|
||||
" if (!FE::parse_bytes_strict(pubkey_x, pk_fe)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"non-canonical pubkey (x>=p)\");\n"
|
||||
" }",
|
||||
),
|
||||
# L2176: in ufsecp_schnorr_adaptor_adapt
|
||||
(
|
||||
" if (!scalar_parse_strict(pre_sig + 33, shat))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
|
||||
" as.s_hat = shat;\n"
|
||||
" as.needs_negation = (pre_sig[65] != 0);\n"
|
||||
" Scalar secret;\n"
|
||||
" if (!scalar_parse_strict_nonzero(adaptor_secret, secret))",
|
||||
" if (!scalar_parse_strict(pre_sig + 33, shat)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
|
||||
" }\n"
|
||||
" as.s_hat = shat;\n"
|
||||
" as.needs_negation = (pre_sig[65] != 0);\n"
|
||||
" Scalar secret;\n"
|
||||
" if (!scalar_parse_strict_nonzero(adaptor_secret, secret))",
|
||||
),
|
||||
# L2203: in ufsecp_schnorr_adaptor_extract
|
||||
(
|
||||
" if (!scalar_parse_strict(pre_sig + 33, shat))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
|
||||
" as.s_hat = shat;\n"
|
||||
" as.needs_negation = (pre_sig[65] != 0);\n"
|
||||
" secp256k1::SchnorrSignature sig;",
|
||||
" if (!scalar_parse_strict(pre_sig + 33, shat)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_SIG, \"invalid adaptor sig scalar\");\n"
|
||||
" }\n"
|
||||
" as.s_hat = shat;\n"
|
||||
" as.needs_negation = (pre_sig[65] != 0);\n"
|
||||
" secp256k1::SchnorrSignature sig;",
|
||||
),
|
||||
# L2810+L2815+L2817: in ufsecp_silent_payment_address_create
|
||||
(
|
||||
" if (!ctx || !scan_privkey || !spend_privkey || !scan_pubkey33_out ||\n"
|
||||
" !spend_pubkey33_out || !addr_out || !addr_len)\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
"\n"
|
||||
" Scalar scan_sk, spend_sk;\n"
|
||||
" if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
|
||||
" if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");",
|
||||
" if (!ctx || !scan_privkey || !spend_privkey || !scan_pubkey33_out ||\n"
|
||||
" !spend_pubkey33_out || !addr_out || !addr_len) {\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" }\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
"\n"
|
||||
" Scalar scan_sk, spend_sk;\n"
|
||||
" if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
|
||||
" }\n"
|
||||
" if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");\n"
|
||||
" }",
|
||||
),
|
||||
# L2827: if (addr_str.size() >= *addr_len)
|
||||
(
|
||||
" if (addr_str.size() >= *addr_len)\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"address buffer too small\");",
|
||||
" if (addr_str.size() >= *addr_len) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"address buffer too small\");\n"
|
||||
" }",
|
||||
),
|
||||
# L2846+L2855+L2864+L2868: in ufsecp_silent_payment_create_output
|
||||
(
|
||||
" if (!ctx || !input_privkeys || n_inputs == 0 || !scan_pubkey33 ||\n"
|
||||
" !spend_pubkey33 || !output_pubkey33_out)\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
"\n"
|
||||
" // Parse input private keys\n"
|
||||
" std::vector<Scalar> privkeys;\n"
|
||||
" privkeys.reserve(n_inputs);\n"
|
||||
" for (size_t i = 0; i < n_inputs; ++i) {\n"
|
||||
" Scalar sk;\n"
|
||||
" if (!scalar_parse_strict_nonzero(input_privkeys + i * 32, sk))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"input privkey is zero or >= n\");\n"
|
||||
" privkeys.push_back(sk);\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" // Parse recipient address\n"
|
||||
" secp256k1::SilentPaymentAddress recipient;\n"
|
||||
" recipient.scan_pubkey = point_from_compressed(scan_pubkey33);\n"
|
||||
" recipient.spend_pubkey = point_from_compressed(spend_pubkey33);\n"
|
||||
" if (recipient.scan_pubkey.is_infinity() || recipient.spend_pubkey.is_infinity())\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
|
||||
"\n"
|
||||
" auto [output_point, tweak] = secp256k1::silent_payment_create_output(privkeys, recipient, k);\n"
|
||||
" if (output_point.is_infinity())\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_ARITH, \"output point is infinity\");",
|
||||
" if (!ctx || !input_privkeys || n_inputs == 0 || !scan_pubkey33 ||\n"
|
||||
" !spend_pubkey33 || !output_pubkey33_out) {\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" }\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
"\n"
|
||||
" // Parse input private keys\n"
|
||||
" std::vector<Scalar> privkeys;\n"
|
||||
" privkeys.reserve(n_inputs);\n"
|
||||
" for (size_t i = 0; i < n_inputs; ++i) {\n"
|
||||
" Scalar sk;\n"
|
||||
" if (!scalar_parse_strict_nonzero(input_privkeys + i * 32, sk)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"input privkey is zero or >= n\");\n"
|
||||
" }\n"
|
||||
" privkeys.push_back(sk);\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" // Parse recipient address\n"
|
||||
" secp256k1::SilentPaymentAddress recipient;\n"
|
||||
" recipient.scan_pubkey = point_from_compressed(scan_pubkey33);\n"
|
||||
" recipient.spend_pubkey = point_from_compressed(spend_pubkey33);\n"
|
||||
" if (recipient.scan_pubkey.is_infinity() || recipient.spend_pubkey.is_infinity()) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" auto [output_point, tweak] = secp256k1::silent_payment_create_output(privkeys, recipient, k);\n"
|
||||
" if (output_point.is_infinity()) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_ARITH, \"output point is infinity\");\n"
|
||||
" }",
|
||||
),
|
||||
# L2879: for (auto& sk : privkeys) — erase in ufsecp_silent_payment_create_output
|
||||
(
|
||||
" for (auto& sk : privkeys)\n"
|
||||
" secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
|
||||
" return UFSECP_OK;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"ufsecp_error_t ufsecp_silent_payment_scan(",
|
||||
" for (auto& sk : privkeys) {\n"
|
||||
" secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
|
||||
" }\n"
|
||||
" return UFSECP_OK;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"ufsecp_error_t ufsecp_silent_payment_scan(",
|
||||
),
|
||||
# L2894+L2896+L2901+L2903+L2911: in ufsecp_silent_payment_scan
|
||||
(
|
||||
" if (!ctx || !scan_privkey || !spend_privkey || !input_pubkeys33 ||\n"
|
||||
" !output_xonly32 || !n_found)\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" if (n_input_pubkeys == 0 || n_outputs == 0)\n"
|
||||
" return UFSECP_ERR_BAD_INPUT;\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
"\n"
|
||||
" Scalar scan_sk, spend_sk;\n"
|
||||
" if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
|
||||
" if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");\n"
|
||||
"\n"
|
||||
" // Parse input pubkeys\n"
|
||||
" std::vector<Point> input_pks;\n"
|
||||
" input_pks.reserve(n_input_pubkeys);\n"
|
||||
" for (size_t i = 0; i < n_input_pubkeys; ++i) {\n"
|
||||
" auto pk = point_from_compressed(input_pubkeys33 + i * 33);\n"
|
||||
" if (pk.is_infinity())\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid input pubkey\");",
|
||||
" if (!ctx || !scan_privkey || !spend_privkey || !input_pubkeys33 ||\n"
|
||||
" !output_xonly32 || !n_found) {\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" }\n"
|
||||
" if (n_input_pubkeys == 0 || n_outputs == 0) {\n"
|
||||
" return UFSECP_ERR_BAD_INPUT;\n"
|
||||
" }\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
"\n"
|
||||
" Scalar scan_sk, spend_sk;\n"
|
||||
" if (!scalar_parse_strict_nonzero(scan_privkey, scan_sk)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"scan privkey is zero or >= n\");\n"
|
||||
" }\n"
|
||||
" if (!scalar_parse_strict_nonzero(spend_privkey, spend_sk)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"spend privkey is zero or >= n\");\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" // Parse input pubkeys\n"
|
||||
" std::vector<Point> input_pks;\n"
|
||||
" input_pks.reserve(n_input_pubkeys);\n"
|
||||
" for (size_t i = 0; i < n_input_pubkeys; ++i) {\n"
|
||||
" auto pk = point_from_compressed(input_pubkeys33 + i * 33);\n"
|
||||
" if (pk.is_infinity()) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid input pubkey\");\n"
|
||||
" }",
|
||||
),
|
||||
# L2953+L2964+L2968+L2972: in ufsecp_ecies_encrypt
|
||||
(
|
||||
" if (!ctx || !recipient_pubkey33 || !plaintext || !envelope_out || !envelope_len)\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" if (plaintext_len == 0) {\n"
|
||||
" return UFSECP_ERR_BAD_INPUT;\n"
|
||||
" }\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
"\n"
|
||||
" if (plaintext_len > SIZE_MAX - UFSECP_ECIES_OVERHEAD) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"plaintext_len too large\");\n"
|
||||
" }\n"
|
||||
" size_t const needed = plaintext_len + UFSECP_ECIES_OVERHEAD;\n"
|
||||
" if (*envelope_len < needed)\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"envelope buffer too small\");\n"
|
||||
"\n"
|
||||
" auto pk = point_from_compressed(recipient_pubkey33);\n"
|
||||
" if (pk.is_infinity())\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
|
||||
"\n"
|
||||
" auto envelope = secp256k1::ecies_encrypt(pk, plaintext, plaintext_len);\n"
|
||||
" if (envelope.empty())\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"ECIES encryption failed\");",
|
||||
" if (!ctx || !recipient_pubkey33 || !plaintext || !envelope_out || !envelope_len) {\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" }\n"
|
||||
" if (plaintext_len == 0) {\n"
|
||||
" return UFSECP_ERR_BAD_INPUT;\n"
|
||||
" }\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
"\n"
|
||||
" if (plaintext_len > SIZE_MAX - UFSECP_ECIES_OVERHEAD) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_INPUT, \"plaintext_len too large\");\n"
|
||||
" }\n"
|
||||
" size_t const needed = plaintext_len + UFSECP_ECIES_OVERHEAD;\n"
|
||||
" if (*envelope_len < needed) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"envelope buffer too small\");\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" auto pk = point_from_compressed(recipient_pubkey33);\n"
|
||||
" if (pk.is_infinity()) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_PUBKEY, \"invalid recipient pubkey\");\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" auto envelope = secp256k1::ecies_encrypt(pk, plaintext, plaintext_len);\n"
|
||||
" if (envelope.empty()) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_INTERNAL, \"ECIES encryption failed\");\n"
|
||||
" }",
|
||||
),
|
||||
# L2985+L2987+L2992+L2996+L3002: in ufsecp_ecies_decrypt
|
||||
(
|
||||
" if (!ctx || !privkey || !envelope || !plaintext_out || !plaintext_len)\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" if (envelope_len < 82) // min: 33 + 16 + 1 + 32\n"
|
||||
" return UFSECP_ERR_BAD_INPUT;\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
"\n"
|
||||
" size_t const expected_pt_len = envelope_len - UFSECP_ECIES_OVERHEAD;\n"
|
||||
" if (*plaintext_len < expected_pt_len)\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"plaintext buffer too small\");\n"
|
||||
"\n"
|
||||
" Scalar sk;\n"
|
||||
" if (!scalar_parse_strict_nonzero(privkey, sk))\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"privkey is zero or >= n\");\n"
|
||||
"\n"
|
||||
" auto pt = secp256k1::ecies_decrypt(sk, envelope, envelope_len);\n"
|
||||
" secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
|
||||
"\n"
|
||||
" if (pt.empty())\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_VERIFY_FAIL, \"ECIES decryption failed (bad key or tampered)\");",
|
||||
" if (!ctx || !privkey || !envelope || !plaintext_out || !plaintext_len) {\n"
|
||||
" return UFSECP_ERR_NULL_ARG;\n"
|
||||
" }\n"
|
||||
" if (envelope_len < 82) { // min: 33 + 16 + 1 + 32\n"
|
||||
" return UFSECP_ERR_BAD_INPUT;\n"
|
||||
" }\n"
|
||||
" ctx_clear_err(ctx);\n"
|
||||
"\n"
|
||||
" size_t const expected_pt_len = envelope_len - UFSECP_ECIES_OVERHEAD;\n"
|
||||
" if (*plaintext_len < expected_pt_len) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BUF_TOO_SMALL, \"plaintext buffer too small\");\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" Scalar sk;\n"
|
||||
" if (!scalar_parse_strict_nonzero(privkey, sk)) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_BAD_KEY, \"privkey is zero or >= n\");\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" auto pt = secp256k1::ecies_decrypt(sk, envelope, envelope_len);\n"
|
||||
" secp256k1::detail::secure_erase(&sk, sizeof(sk));\n"
|
||||
"\n"
|
||||
" if (pt.empty()) {\n"
|
||||
" return ctx_set_err(ctx, UFSECP_ERR_VERIFY_FAIL, \"ECIES decryption failed (bad key or tampered)\");\n"
|
||||
" }",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def main():
    """Apply every (old, new) pair in REPLACEMENTS to the file at PATH.

    The pairs are applied in list order against the running text, and only
    the first occurrence of each ``old`` is replaced. A pair whose ``old``
    text is absent aborts the run with exit status 1 before anything is
    written back; a pair that matches more than once only produces a warning.
    """
    with open(PATH, "r") as src:
        text = src.read()

    for number, (needle, substitute) in enumerate(REPLACEMENTS, start=1):
        hits = text.count(needle)
        if hits == 0:
            print(f"[FAIL] Replacement {number}: NOT FOUND")
            print(f" Looking for: {repr(needle[:80])}")
            sys.exit(1)
        if hits > 1:
            print(f"[WARN] Replacement {number}: found {hits} occurrences, replacing first")
        text = text.replace(needle, substitute, 1)
        print(f"[OK] Replacement {number} applied")

    # Reached only when every replacement matched, so the file is never
    # left half-patched by this function.
    with open(PATH, "w") as dst:
        dst.write(text)
    print(f"\nAll {len(REPLACEMENTS)} replacements applied to {PATH}")


if __name__ == "__main__":
    main()
|
||||
893
fix_round4.py
Normal file
893
fix_round4.py
Normal file
@ -0,0 +1,893 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fix all 211 code-scanning alerts across 13 files."""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
BASE = Path('/home/shrek/Secp256K1/Secp256K1fast/libs/UltrafastSecp256k1')
|
||||
|
||||
|
||||
def read(path):
    """Return the text of BASE/path as a list of lines, terminators kept.

    Lines are split on '\\n' only. ``str.splitlines()`` additionally breaks
    on form feed, vertical tab, the \\x1c-\\x1e separators, \\x85, \\u2028 and
    \\u2029; a source file containing any of those would yield extra list
    entries, and every fix in this script addresses lines by number.
    ``read_text()`` has already translated '\\r\\n' and lone '\\r' to '\\n'.

    The final element has no trailing '\\n' when the file does not end with
    one; an empty file yields an empty list (same as ``splitlines``).
    """
    pieces = (BASE / path).read_text().split('\n')
    lines = [piece + '\n' for piece in pieces[:-1]]
    if pieces[-1]:
        # Unterminated last line: keep it exactly as it is on disk.
        lines.append(pieces[-1])
    return lines
|
||||
|
||||
|
||||
def save(path, lines):
    """Overwrite BASE/path with the concatenation of ``lines``.

    Each element is written verbatim, so the elements are expected to carry
    their own line terminators.
    """
    target = BASE / path
    target.write_text(''.join(lines))
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Algorithmic helpers
|
||||
# ============================================================================
|
||||
|
||||
def _split_line_comment(text):
    """Split one source line into (code, comment) at the first ``//``.

    A ``//`` inside a double- or single-quoted literal is ignored, and a
    backslash inside a literal skips the following character. When no
    comment is found the whole text is returned as code and the comment is
    the empty string.
    """
    quote = None
    pos = 0
    while pos < len(text):
        ch = text[pos]
        if quote is not None:
            if ch == '\\':
                pos += 2  # step over the escaped character
                continue
            if ch == quote:
                quote = None
        elif ch in ('"', "'"):
            quote = ch
        elif text.startswith('//', pos):
            return text[:pos], text[pos:]
        pos += 1
    return text, ''


def add_braces(lines, alert_lines_1based, tag=''):
    """Add { } around single-statement bodies. Process bottom-to-top.

    Args:
        lines: list of source lines (with terminators); modified in place.
        alert_lines_1based: 1-based numbers of the lines that end the
            controlling ``if``/``for``/``while``/``else`` header.
        tag: prefix used in the progress messages.

    Returns:
        The same ``lines`` list.

    For each alert the opening brace is appended to the header line (before
    a trailing ``//`` comment, so the brace is not commented out) and a
    closing brace, indented like the header line, is inserted after the body
    line. Alerts are handled from the highest line number down so that the
    inserted lines never shift an alert that is still pending.

    An alert is skipped, with a message, instead of producing broken code
    when:
      * the line number is outside the file,
      * the header line has no code, already ends in ``{``, or already
        contains its body (code ends in ``;``),
      * no body line follows, or the body already starts with ``{``,
      * the body line is not a complete statement on one line.
    """
    fixed = 0
    for lnum in sorted(alert_lines_1based, reverse=True):
        idx = lnum - 1
        # A non-positive number would otherwise index from the end of the list.
        if idx < 0 or idx >= len(lines):
            print(f' SKIP {tag}L{lnum}: out of range ({len(lines)} lines)')
            continue
        line = lines[idx]
        # Indentation of the controlling statement, reused for the closing brace
        indent_str = line[:len(line) - len(line.lstrip())]
        stripped = line.rstrip('\n\r').rstrip()
        head_code, head_comment = _split_line_comment(stripped)
        head_code = head_code.rstrip()

        if not head_code.strip():
            print(f' SKIP {tag}L{lnum}: no statement on this line')
            continue
        if head_code.endswith('{'):
            print(f' SKIP {tag}L{lnum}: already has {{')
            continue
        if head_code.endswith(';'):
            # e.g. "if (x) return;" - the body cannot be wrapped line-wise.
            print(f' SKIP {tag}L{lnum}: body is on the same line')
            continue

        # The body is the next line that carries code; blank lines and
        # comment-only lines between header and body are stepped over.
        body_idx = idx + 1
        while (body_idx < len(lines)
               and not _split_line_comment(lines[body_idx])[0].strip()):
            body_idx += 1

        if body_idx >= len(lines):
            print(f' SKIP {tag}L{lnum}: no body line found')
            continue

        body_code = _split_line_comment(lines[body_idx])[0].strip()
        if body_code.startswith('{'):
            print(f' SKIP {tag}L{lnum}: body already has {{')
            continue

        # Only wrap a body that is finished on its own line: either a simple
        # statement ending in ';' or a one-line braced statement.
        open_count = body_code.count('{')
        one_line_block = (body_code.endswith('}')
                          and open_count > 0
                          and open_count == body_code.count('}'))
        if not (body_code.endswith(';') or one_line_block):
            print(f' SKIP {tag}L{lnum}: body is not a single-line statement')
            continue

        # Apply fix
        if head_comment:
            lines[idx] = head_code + ' { ' + head_comment + '\n'
        else:
            lines[idx] = head_code + ' {\n'
        if not lines[body_idx].endswith('\n'):
            # Unterminated last line of the file: keep the brace on its own line.
            lines[body_idx] += '\n'
        lines.insert(body_idx + 1, indent_str + '}\n')
        fixed += 1

    print(f' -> {tag}braces fixed: {fixed}')
    return lines
|
||||
|
||||
|
||||
def add_const_to_lines(lines, alert_lines_1based, tag=''):
    """Prepend const to variable declarations, handling range-for loops.

    Args:
        lines: list of source lines (with terminators); modified in place.
        alert_lines_1based: 1-based numbers of the declaration lines.
        tag: prefix used in the progress messages.

    Returns:
        The same ``lines`` list.

    A plain declaration gets ``const `` inserted after its indentation. A
    range-based ``for (TYPE var : container)`` gets it inserted after the
    opening parenthesis. The rest of the line, including its terminator, is
    kept unchanged in both cases.

    Lines that are out of range, empty, already const, or a ``for`` that is
    not a range-for are skipped with a message rather than rewritten.
    """
    fixed = 0
    for lnum in sorted(alert_lines_1based, reverse=True):
        idx = lnum - 1
        if idx < 0 or idx >= len(lines):
            print(f' SKIP {tag}L{lnum}: out of range ({len(lines)} lines)')
            continue
        line = lines[idx]
        stripped = line.lstrip()
        leading = line[:len(line) - len(stripped)]

        if not stripped:
            print(f' SKIP {tag}L{lnum}: empty line')
            continue

        if stripped.startswith('const '):
            print(f' SKIP {tag}L{lnum}: already const')
            continue

        if stripped.startswith('for ('):
            header = stripped[len('for ('):].lstrip()
            # The range colon is a ':' that is not part of a '::' qualifier.
            colon = re.search(r'(?<!:):(?!:)', header)
            decl = header[:colon.start()] if colon else ''
            # ';' or '=' before the colon means a classic for (possibly with a
            # ternary), which has no single declaration to qualify.
            if colon is None or ';' in decl or '=' in decl:
                print(f' SKIP {tag}L{lnum}: not a range-based for')
                continue
            if decl.startswith('const '):
                print(f' SKIP {tag}L{lnum}: already const')
                continue
            # header still ends with the original line terminator.
            lines[idx] = leading + 'for (const ' + header
            fixed += 1
            continue

        # Regular declaration
        lines[idx] = leading + 'const ' + stripped
        fixed += 1

    print(f' -> {tag}const fixed: {fixed}')
    return lines
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: include/ufsecp/ufsecp_impl.cpp
|
||||
# ============================================================================
|
||||
|
||||
def fix_ufsecp_impl():
    """Apply the recorded clang-tidy fixes to include/ufsecp/ufsecp_impl.cpp.

    All line numbers below are 1-based positions in the file as read from
    disk. The edits that rewrite a line in place (const, auto, initialiser,
    widening cast) therefore run first; ``add_braces`` runs last because it
    inserts a closing-brace line per fix and would otherwise shift every
    later line number (for example the widening target L1578 is the body of
    the ``for`` reported at L1577).
    """
    path = 'include/ufsecp/ufsecp_impl.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    # --- misc-const-correctness ---
    const_lines = [1366, 1855, 1905, 2075, 3147, 3167, 3172]
    lines = add_const_to_lines(lines, const_lines, 'ufsecp_impl/')

    # --- modernize-use-auto ---
    # L1573: uint32_t nk = static_cast<uint32_t>(...) -> auto nk = ...
    # L1846: uint32_t cc32 = static_cast<uint32_t>(...) -> auto cc32 = ...
    for lnum in [1573, 1846]:
        idx = lnum - 1
        line = lines[idx]
        m = re.match(r'(\s*)uint32_t (\w+) = (static_cast<uint32_t>\(.+)', line)
        if m:
            # '.+' stops before the newline; re-attach whatever follows the
            # match so the line keeps its terminator.
            lines[idx] = f'{m.group(1)}auto {m.group(2)} = {m.group(3)}{line[m.end():]}'
            print(f' AUTO: L{lnum}')

    # --- cppcoreguidelines-init-variables ---
    # L1655: uint32_t nk; -> uint32_t nk = 0;
    # L1706: { uint32_t nk; -> { uint32_t nk = 0;
    # L1761: same pattern
    for lnum, needle in [(1655, ' uint32_t nk;'),
                         (1706, 'uint32_t nk;'),
                         (1761, 'uint32_t nk;')]:
        idx = lnum - 1
        if needle in lines[idx]:
            lines[idx] = lines[idx].replace('uint32_t nk;', 'uint32_t nk = 0;')
            print(f' INIT: L{lnum}')

    # --- bugprone-implicit-widening-of-multiplication-result ---
    # L1578: keyagg_out + 38 + i * 32 -> keyagg_out + 38 + static_cast<size_t>(i) * 32
    idx = 1578 - 1
    if 'i * 32' in lines[idx] and 'static_cast<size_t>(i)' not in lines[idx]:
        lines[idx] = lines[idx].replace(
            'keyagg_out + 38 + i * 32',
            'keyagg_out + 38 + static_cast<size_t>(i) * 32'
        )
        print(' WIDENING: L1578')

    # --- readability-braces-around-statements (59 alerts) ---
    # Last on purpose: this is the only step that changes the line count.
    brace_lines = [
        1242, 1245, 1248, 1260, 1274, 1277, 1281, 1294, 1297,
        1300, 1314, 1318, 1322, 1340, 1343, 1345, 1355, 1368,
        1412, 1415, 1431, 1435, 1438, 1457, 1462, 1477, 1481,
        1486, 1514, 1516, 1519, 1522, 1525, 1542, 1545, 1549,
        1567, 1577, 1594, 1691, 1695, 1699, 1701, 1749, 1753,
        1787, 1801, 1831, 1834, 1844, 1856, 1974, 2047, 2068,
        2071, 2076, 2138, 2832, 2834,
    ]
    lines = add_braces(lines, brace_lines, 'ufsecp_impl/')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/src/bip39.cpp
|
||||
# ============================================================================
|
||||
|
||||
def fix_bip39():
    """Apply the recorded clang-tidy fixes to cpu/src/bip39.cpp.

    All line numbers are 1-based positions in the file as read from disk, so
    the steps are ordered to keep them valid:

      1. in-place rewrites (initialiser, auto, fclose),
      2. ``add_const_to_lines`` - after the auto rewrite, so that lines 191
         and 264 end up as ``const auto ...`` instead of having the new
         ``const`` swallowed by the type pattern,
      3. ``add_braces`` - last, because it inserts lines.
    """
    path = 'cpu/src/bip39.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    # --- cppcoreguidelines-init-variables ---
    # L137: "TYPE var;" (uninitialised scalar) -> "TYPE var = <default>;"
    idx = 137 - 1
    line = lines[idx]
    m = re.match(r'(\s*)((?:int|uint\w*|size_t|bool|char|float|double)\s+\w+);(\s*(?://.*)?)\n', line)
    if m:
        type_and_var = m.group(2).rstrip()
        # Determine default value. The pattern cannot match a pointer
        # declarator, so only scalar defaults are needed.
        if 'bool' in type_and_var:
            default = 'false'
        elif 'float' in type_and_var or 'double' in type_and_var:
            default = '0.0'
        else:
            default = '0'
        lines[idx] = f'{m.group(1)}{type_and_var} = {default};{m.group(3)}\n'
        print(f' INIT: L137 -> added = {default}')
    else:
        print(f' INIT_SKIP: L137 pattern not matched: {repr(line[:60])}')

    # --- modernize-use-auto ---
    # L191 and L264: iterator/auto type replacement
    for lnum in [191, 264]:
        idx = lnum - 1
        line = lines[idx]
        # Pattern: SomeType::iterator it = or std::vector<...>::iterator it =
        m = re.match(r'(\s*)(\w[\w:<>, *]+::iterator)(\s+\w+\s*=.+)', line)
        if m is None:
            # Try: SomeType it = container.begin()
            m = re.match(r'(\s*)(\w[\w:<>, *]+\*?)(\s+\w+\s*=\s*\w.+\.begin\(\).+)', line)
        if m:
            # '.+' stops before the newline; re-attach the rest of the line
            # so the terminator is not lost.
            lines[idx] = f'{m.group(1)}auto{m.group(3)}{line[m.end():]}'
            print(f' AUTO: L{lnum}')
        else:
            print(f' AUTO_SKIP: L{lnum}: {repr(line[:60])}')

    # --- cert-err33-c (unchecked fclose return) ---
    # L34: std::fclose(f); -> (void)std::fclose(f);
    idx = 34 - 1
    line = lines[idx]
    if 'std::fclose' in line and '(void)' not in line:
        lines[idx] = line.replace('std::fclose', '(void)std::fclose')
        print(' ERR33: L34 fclose')

    # --- misc-const-correctness ---
    const_lines = [33, 46, 47, 97, 126, 127, 128, 129, 136, 145,
                   182, 183, 184, 185, 191, 193, 194, 199,
                   255, 256, 257, 258, 264, 266, 267, 272]
    lines = add_const_to_lines(lines, const_lines, 'bip39/')

    # --- readability-braces-around-statements ---
    # Last on purpose: this is the only step that changes the line count.
    brace_lines = [49, 50, 93, 110, 117, 138, 140, 150, 171, 196, 200, 223,
                   246, 269, 273]
    lines = add_braces(lines, brace_lines, 'bip39/')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/src/zk.cpp
|
||||
# ============================================================================
|
||||
|
||||
def fix_zk():
    """Apply the recorded clang-tidy fixes to cpu/src/zk.cpp.

    Both lists hold 1-based line numbers of the file as read from disk.
    ``add_const_to_lines`` rewrites lines in place and runs first;
    ``add_braces`` inserts a line per fix and therefore runs last, so that
    it cannot shift the const targets.
    """
    path = 'cpu/src/zk.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    const_lines = [359, 363, 446, 448, 500, 642, 661]
    lines = add_const_to_lines(lines, const_lines, 'zk/')

    brace_lines = [45, 68, 381, 415, 423, 481, 503, 610, 615, 619, 623,
                   664, 668, 675, 686, 688, 720, 785]
    lines = add_braces(lines, brace_lines, 'zk/')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/src/message_signing.cpp
|
||||
# ============================================================================
|
||||
|
||||
def fix_message_signing():
    """Apply the recorded clang-tidy fixes to cpu/src/message_signing.cpp.

    Both lists hold 1-based line numbers of the file as read from disk.
    The const pass rewrites lines in place and runs first; the brace pass
    inserts lines and runs last so it cannot shift the const targets.
    """
    path = 'cpu/src/message_signing.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    const_lines = [65, 152, 153, 154, 155, 159, 193, 196]
    lines = add_const_to_lines(lines, const_lines, 'msg_signing/')

    brace_lines = [30, 35]
    lines = add_braces(lines, brace_lines, 'msg_signing/')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/src/eth_signing.cpp
|
||||
# ============================================================================
|
||||
|
||||
def fix_eth_signing():
    """Apply the recorded clang-tidy fixes to cpu/src/eth_signing.cpp.

    misc-unused-using-decls: line 16 is deleted when it contains
    'using fast::Point'. The const targets are recorded as lines 96 and 97
    of the untouched file, so they move up by one when the deletion happens.
    """
    path = 'cpu/src/eth_signing.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    using_idx = 16 - 1
    removed = 'using fast::Point' in lines[using_idx]
    if removed:
        del lines[using_idx]
        print(' UNUSED-USING: L16 removed')
    else:
        print(f' UNUSED-USING SKIP: L16: {repr(lines[using_idx][:50])}')

    # Every line below the deleted one is now one position higher.
    shift = 1 if removed else 0
    const_lines = [96 - shift, 97 - shift]

    lines = add_const_to_lines(lines, const_lines, 'eth_signing/')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/src/address.cpp
|
||||
# ============================================================================
|
||||
|
||||
def fix_address():
    """Apply the recorded clang-tidy fixes to cpu/src/address.cpp.

    L516: for (char c : prefix)            -> const added (range-for)
    L527: std::uint8_t version_byte = ...  -> const added, then the type is
          replaced by auto (misc-const-correctness + modernize-use-auto),
          giving 'const auto version_byte = ...'.
    """
    path = 'cpu/src/address.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    lines = add_const_to_lines(lines, [516, 527], 'address/')

    # The const pass leaves L527 as 'const std::uint8_t version_byte = ...';
    # swap the spelled-out type for auto.
    typed_decl = 'const std::uint8_t version_byte'
    auto_decl = 'const auto version_byte'
    target = 527 - 1
    if target < len(lines):
        current = lines[target]
        if typed_decl in current:
            lines[target] = current.replace(typed_decl, auto_decl)
            print(' AUTO: L527')
        elif auto_decl in current:
            print(' AUTO: L527 already auto')
        else:
            print(f' AUTO_SKIP: L527: {repr(current[:60])}')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/src/wallet.cpp
|
||||
# ============================================================================
|
||||
|
||||
def fix_wallet():
    """Apply the recorded clang-tidy fixes to cpu/src/wallet.cpp.

    bugprone-misplaced-widening-cast on L150 and L171: the sum
    '27 + rsig.recid' is cast as a whole; the rewrite casts each operand
    before adding. Lines that do not contain the expected text are reported
    and left alone; line numbers past the end of the file are ignored.
    """
    path = 'cpu/src/wallet.cpp'
    print(f'\n=== {path} ===')
    lines = read(path)

    narrow_sum = 'static_cast<std::uint64_t>(27 + rsig.recid)'
    widened_sum = 'static_cast<std::uint64_t>(27) + static_cast<std::uint64_t>(rsig.recid)'

    for lnum in (150, 171):
        pos = lnum - 1
        if pos >= len(lines):
            continue
        text = lines[pos]
        if narrow_sum not in text:
            print(f' WIDEN_SKIP: L{lnum}: {repr(text[:60])}')
            continue
        lines[pos] = text.replace(narrow_sum, widened_sum)
        print(f' WIDEN: L{lnum}')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/src/coin_address.cpp
|
||||
# ============================================================================
|
||||
|
||||
def fix_coin_address():
    """Apply the recorded clang-tidy fix to cpu/src/coin_address.cpp.

    misc-const-correctness on L170:
    std::string prefix = testnet ? ...  ->  const std::string prefix = ...
    """
    path = 'cpu/src/coin_address.cpp'
    print(f'\n=== {path} ===')

    source_lines = add_const_to_lines(read(path), [170], 'coin_address/')

    save(path, source_lines)
    print(f' Saved {path} ({len(source_lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/tests/test_bip39.cpp
|
||||
# ============================================================================
|
||||
|
||||
# Before/after C++ source snippets used to swap the sscanf-based hex_to_bytes for a strtoul-based one
|
||||
HEX_TO_BYTES_SSCANF_BIP39 = '''\
|
||||
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
unsigned int byte = 0;
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
std::sscanf(hex + 2 * i, "%02x", &byte);
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif
|
||||
out[i] = static_cast<uint8_t>(byte);
|
||||
}
|
||||
}'''
|
||||
|
||||
HEX_TO_BYTES_STRTOUL_BIP39 = '''\
|
||||
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
char pair[3] = { hex[2 * i], hex[2 * i + 1], '\\0' };
|
||||
char* endptr = nullptr;
|
||||
const unsigned long val = std::strtoul(pair, &endptr, 16);
|
||||
out[i] = (endptr == pair + 2) ? static_cast<uint8_t>(val) : 0;
|
||||
}
|
||||
}'''
|
||||
|
||||
BYTES_TO_HEX_OLD = '''\
|
||||
static std::string bytes_to_hex(const uint8_t* data, size_t len) {
|
||||
std::string result;
|
||||
result.reserve(len * 2);
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
char buf[3];
|
||||
std::snprintf(buf, sizeof(buf), "%02x", data[i]);
|
||||
result += buf;
|
||||
}
|
||||
return result;
|
||||
}'''
|
||||
|
||||
BYTES_TO_HEX_NEW = '''\
|
||||
static std::string bytes_to_hex(const uint8_t* data, size_t len) {
|
||||
std::string result;
|
||||
result.reserve(len * 2);
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
char buf[3];
|
||||
(void)std::snprintf(buf, sizeof(buf), "%02x", data[i]);
|
||||
result += buf;
|
||||
}
|
||||
return result;
|
||||
}'''
|
||||
|
||||
|
||||
def fix_test_bip39():
    """Apply the scripted code-scanning fixes to cpu/tests/test_bip39.cpp.

    Pass 1 works on the whole file text and performs three literal
    substitutions (sscanf -> strtoul, unchecked snprintf -> (void)snprintf,
    explicit null guard after the wordlist CHECK).  Pass 2 re-reads the file
    as a list of lines and prefixes selected declarations with ``const``.
    Each step prints what it did or that its pattern was not found.
    """
    path = 'cpu/tests/test_bip39.cpp'
    print(f'\n=== {path} ===')
    target = BASE / path
    text = target.read_text()

    # clang-analyzer-core.NullDereference: CHECK(wl != nullptr, ...) is
    # followed by wl[0], so an explicit early return is inserted between them.
    null_check = ' CHECK(wl != nullptr, "wordlist not null");\n CHECK(std::strcmp(wl[0]'
    null_check_guarded = ' CHECK(wl != nullptr, "wordlist not null");\n if (!wl) { return; }\n CHECK(std::strcmp(wl[0]'

    # (text to find, replacement, message when applied, message when absent)
    substitutions = (
        # cert-err33-c + cert-err34-c: replace sscanf with strtoul
        (HEX_TO_BYTES_SSCANF_BIP39, HEX_TO_BYTES_STRTOUL_BIP39,
         ' ERR34: hex_to_bytes sscanf -> strtoul',
         ' ERR34_SKIP: hex_to_bytes sscanf pattern not found'),
        # cert-err33-c: snprintf return value unchecked
        (BYTES_TO_HEX_OLD, BYTES_TO_HEX_NEW,
         ' ERR33: bytes_to_hex snprintf -> (void)snprintf',
         ' ERR33_SKIP: bytes_to_hex pattern not found'),
        (null_check, null_check_guarded,
         ' NULL_DEREF: L99 added null guard',
         ' NULL_DEREF_SKIP: pattern not found'),
    )
    for before, after, applied_msg, skipped_msg in substitutions:
        if before in text:
            text = text.replace(before, after)
            print(applied_msg)
        else:
            print(skipped_msg)

    target.write_text(text)

    # The substitutions above change the line count, so the const fixes are
    # located by text pattern rather than by the originally reported line
    # numbers.
    lines = read(path)

    # std::string hex = bytes_to_hex(...) -> const std::string hex = ...
    hex_decl = 'std::string hex = bytes_to_hex('
    for pos, current in enumerate(lines):
        body = current.lstrip()
        if not body.startswith(hex_decl):
            continue
        indent = current[:len(current) - len(body)]
        lines[pos] = indent + 'const ' + body
        print(f' CONST: L{pos+1} std::string hex')

    # for (char c : mnemonic) -> for (const char c : mnemonic)
    loop_old = 'for (char c : mnemonic)'
    loop_new = 'for (const char c : mnemonic)'
    for pos, current in enumerate(lines):
        if loop_old not in current:
            continue
        lines[pos] = current.replace(loop_old, loop_new)
        print(f' CONST: L{pos+1} for (char c : mnemonic)')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/tests/test_ethereum.cpp
|
||||
# ============================================================================
|
||||
|
||||
HEX_TO_BYTES_SSCANF_ETH = '''\
|
||||
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
unsigned int byte = 0;
|
||||
if (std::sscanf(hex + i * 2, "%02x", &byte) != 1) byte = 0;
|
||||
out[i] = static_cast<uint8_t>(byte);
|
||||
}
|
||||
}'''
|
||||
|
||||
HEX_TO_BYTES_STRTOUL_ETH = '''\
|
||||
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
char pair[3] = { hex[i * 2], hex[i * 2 + 1], '\\0' };
|
||||
char* endptr = nullptr;
|
||||
const unsigned long val = std::strtoul(pair, &endptr, 16);
|
||||
out[i] = (endptr == pair + 2) ? static_cast<uint8_t>(val) : 0;
|
||||
}
|
||||
}'''
|
||||
|
||||
SNPRINTF_ETH_OLD = ' std::snprintf(buf, sizeof(buf), "Round-trip chain_id=%lu (%s)",'
|
||||
SNPRINTF_ETH_NEW = ' (void)std::snprintf(buf, sizeof(buf), "Round-trip chain_id=%lu (%s)",'
|
||||
|
||||
|
||||
def fix_test_ethereum():
    """Apply the scripted code-scanning fixes to cpu/tests/test_ethereum.cpp.

    Pass 1 edits the file text: sscanf -> strtoul in hex_to_bytes, a
    ``(void)`` cast on the unchecked snprintf, and extraction of three
    compound ASSERT_TRUE conditions into named ``const bool`` locals.
    Pass 2 re-reads the file line by line and prefixes matching variable
    declarations with ``const``.  Lines that already start with ``const``
    are left untouched.
    """
    path = 'cpu/tests/test_ethereum.cpp'
    print(f'\n=== {path} ===')
    content = (BASE / path).read_text()

    # cert-err34-c: sscanf -> strtoul
    if HEX_TO_BYTES_SSCANF_ETH in content:
        content = content.replace(HEX_TO_BYTES_SSCANF_ETH, HEX_TO_BYTES_STRTOUL_ETH)
        print(' ERR34: hex_to_bytes sscanf -> strtoul')
    else:
        print(' ERR34_SKIP: hex_to_bytes pattern not found')

    # cert-err33-c: snprintf return unchecked
    if SNPRINTF_ETH_OLD in content:
        content = content.replace(SNPRINTF_ETH_OLD, SNPRINTF_ETH_NEW)
        print(' ERR33: snprintf -> (void)snprintf')
    else:
        print(' ERR33_SKIP: snprintf pattern not found')

    # readability-simplify-boolean-expr: move each compound condition into a
    # named bool inside its own scope so the new local cannot clash with
    # other names in the enclosing test.
    content = content.replace(
        ' ASSERT_TRUE(sig.v == 27 || sig.v == 28, "legacy v should be 27 or 28");',
        ' {\n const bool v_ok = (sig.v == 27 || sig.v == 28);\n ASSERT_TRUE(v_ok, "legacy v should be 27 or 28");\n }'
    )
    content = content.replace(
        ' ASSERT_TRUE(sig2.v == 37 || sig2.v == 38, "EIP-155 v should be 37 or 38");',
        ' {\n const bool v2_ok = (sig2.v == 37 || sig2.v == 38);\n ASSERT_TRUE(v2_ok, "EIP-155 v should be 37 or 38");\n }'
    )
    content = content.replace(
        ' ASSERT_TRUE(sig.v == 27 || sig.v == 28, "v should be 27 or 28");',
        ' {\n const bool v_ok2 = (sig.v == 27 || sig.v == 28);\n ASSERT_TRUE(v_ok2, "v should be 27 or 28");\n }'
    )
    print(' SIMPLIFY-BOOL: test_ethereum sig.v checks')

    (BASE / path).write_text(content)

    # misc-const-correctness: the replacements above shift line numbers, so
    # the flagged declarations are located by pattern.  Rules are tried in
    # order and the first match wins.
    lines = read(path)
    const_rules = (
        (r'Point pk\d? = ', 'Point pk'),
        (r'(bool (valid|wrong\d?|r_zero|s_zero|all_zero)) = ', 'bool'),
        (r'std::array<uint8_t, 32> (hash|wrong_hash|zero)\{\}', 'array'),
        # NOTE(review): 'ethernet_address_bytes' looks like a typo for
        # 'ethereum_address_bytes'; both spellings are still accepted so the
        # set of matched lines is unchanged -- confirm and drop the former.
        (r'auto expected_addr = eth(?:ernet|ereum)_address_bytes', 'auto expected_addr'),
    )

    for idx in range(len(lines)):
        line = lines[idx]
        stripped = line.lstrip()
        leading = line[:len(line) - len(stripped)]

        if stripped.startswith('const '):
            continue

        for pattern, label in const_rules:
            if re.match(pattern, stripped):
                lines[idx] = leading + 'const ' + stripped
                print(f' CONST: L{idx+1} {label}')
                break

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/tests/test_wallet.cpp
|
||||
# ============================================================================
|
||||
|
||||
HEX_TO_BYTES_SSCANF_WALLET = '''\
|
||||
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
unsigned int byte = 0;
|
||||
if (std::sscanf(hex + i * 2, "%02x", &byte) != 1) byte = 0;
|
||||
out[i] = static_cast<uint8_t>(byte);
|
||||
}
|
||||
}'''
|
||||
|
||||
HEX_TO_BYTES_STRTOUL_WALLET = '''\
|
||||
static void hex_to_bytes(const char* hex, uint8_t* out, size_t len) {
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
char pair[3] = { hex[i * 2], hex[i * 2 + 1], '\\0' };
|
||||
char* endptr = nullptr;
|
||||
const unsigned long val = std::strtoul(pair, &endptr, 16);
|
||||
out[i] = (endptr == pair + 2) ? static_cast<uint8_t>(val) : 0;
|
||||
}
|
||||
}'''
|
||||
|
||||
|
||||
def fix_test_wallet():
    """Apply the scripted code-scanning fixes to cpu/tests/test_wallet.cpp.

    Text pass: drop the unused ``using fast::Point;`` declaration, swap the
    sscanf-based hex_to_bytes for the strtoul version, and pull four compound
    ASSERT_TRUE conditions out into named ``const bool`` locals.
    Line pass: prefix the flagged ``size_t`` / ``bool`` declarations with
    ``const`` (lines already starting with ``const`` are skipped).
    """
    path = 'cpu/tests/test_wallet.cpp'
    print(f'\n=== {path} ===')
    source_file = BASE / path
    text = source_file.read_text()

    # Reported substitutions: (needle, replacement, applied msg, skipped msg)
    reported = (
        # misc-unused-using-decls
        ('using fast::Point;\n', '',
         ' UNUSED-USING: removed using fast::Point',
         ' UNUSED-USING SKIP: using fast::Point not found'),
        # cert-err34-c: sscanf -> strtoul
        (HEX_TO_BYTES_SSCANF_WALLET, HEX_TO_BYTES_STRTOUL_WALLET,
         ' ERR34: hex_to_bytes sscanf -> strtoul',
         ' ERR34_SKIP: hex_to_bytes sscanf pattern not found'),
    )
    for needle, substitute, done_msg, skip_msg in reported:
        if needle in text:
            text = text.replace(needle, substitute)
            print(done_msg)
        else:
            print(skip_msg)

    # readability-simplify-boolean-expr: each pair is applied unconditionally;
    # a missing needle simply leaves the text unchanged.
    bool_extractions = (
        (' ASSERT_TRUE(wif[0] == \'K\' || wif[0] == \'L\', "WIF starts with K or L");',
         ' {\n const bool wif_prefix_ok = (wif[0] == \'K\' || wif[0] == \'L\');\n ASSERT_TRUE(wif_prefix_ok, "WIF starts with K or L");\n }'),
        (' ASSERT_TRUE(sig.recid >= 0 && sig.recid <= 3, "valid recid");',
         ' {\n const bool recid_ok = (sig.recid >= 0 && sig.recid <= 3);\n ASSERT_TRUE(recid_ok, "valid recid");\n }'),
        (' ASSERT_TRUE(!btc.empty() && !ltc.empty() && !doge.empty(), "all non-empty");',
         ' {\n const bool coins_non_empty = !btc.empty() && !ltc.empty() && !doge.empty();\n ASSERT_TRUE(coins_non_empty, "all non-empty");\n }'),
        # multi-line ASSERT_TRUE
        (' ASSERT_TRUE(!p2pkh.empty() && !p2wpkh.empty() && !p2sh.empty() && !p2tr.empty(),\n "all non-empty");',
         ' {\n const bool addrs_non_empty = !p2pkh.empty() && !p2wpkh.empty() && !p2sh.empty() && !p2tr.empty();\n ASSERT_TRUE(addrs_non_empty, "all non-empty");\n }'),
    )
    for needle, substitute in bool_extractions:
        text = text.replace(needle, substitute)
    print(' SIMPLIFY-BOOL: 4 bool expressions extracted')

    source_file.write_text(text)

    # misc-const-correctness, matched by pattern; first matching rule wins.
    const_rules = (
        (r'size_t msg_len = sizeof\(msg\) - 1;', 'size_t msg_len'),
        (r'bool ok = bitcoin_verify_message\(', 'bool ok'),
        (r'bool bad = bitcoin_verify_message\(', 'bool bad'),
        (r'bool verified = verify_message\(', 'bool verified'),
    )
    lines = read(path)
    for pos, current in enumerate(lines):
        body = current.lstrip()
        if body.startswith('const '):
            continue
        indent = current[:len(current) - len(body)]
        for rule, label in const_rules:
            if re.match(rule, body):
                lines[pos] = indent + 'const ' + body
                print(f' CONST: L{pos+1} {label}')
                break

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: cpu/tests/test_zk.cpp
|
||||
# ============================================================================
|
||||
|
||||
def fix_test_zk():
    """Add ``const`` at the ten flagged lines of cpu/tests/test_zk.cpp.

    Every alert for this file is misc-const-correctness, so the whole fix is
    delegated to ``add_const_to_lines`` with the reported 1-based line
    numbers; the result is saved back to the same path.
    """
    path = 'cpu/tests/test_zk.cpp'
    print(f'\n=== {path} ===')

    # Line numbers as reported by the scanner for this file.
    flagged = [60, 95, 103, 117, 134, 267, 281, 295, 309, 325]
    lines = add_const_to_lines(read(path), flagged, 'test_zk/')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# File: audit/test_ffi_round_trip.cpp
|
||||
# ============================================================================
|
||||
|
||||
def fix_test_ffi():
    """Apply the scripted code-scanning fixes to audit/test_ffi_round_trip.cpp.

    Text pass: the tautological ``validate(...) != OK || validate(...) == OK``
    CHECK (misc-redundant-expression) is replaced by a single
    ``validate(...) == OK`` CHECK.  An exact literal match is tried first; if
    that fails, a whitespace-tolerant regex is tried, and every outcome is
    reported.
    Line pass: ``size_t msg_len = 15;`` and ``bool match = (std::memcmp(...``
    declarations are prefixed with ``const``.
    """
    path = 'audit/test_ffi_round_trip.cpp'
    print(f'\n=== {path} ===')
    content = (BASE / path).read_text()

    # Corrected call without the trailing ';' so it can serve both as the
    # literal replacement (with ';' appended) and as the regex replacement
    # (the regex stops before the ';').
    fixed_call = (
        'CHECK(ufsecp_bip39_validate(ctx, "abandon abandon abandon abandon abandon abandon '
        'abandon abandon abandon abandon abandon abandon") == UFSECP_OK,\n'
        ' "bip39_validate accepts valid 12-word mnemonic")'
    )

    old_check = (
        'CHECK(ufsecp_bip39_validate(ctx, "abandon abandon abandon abandon abandon abandon '
        'abandon abandon abandon abandon abandon abandon") != UFSECP_OK\n'
        ' || ufsecp_bip39_validate(ctx, "abandon abandon abandon abandon '
        'abandon abandon abandon abandon abandon abandon abandon abandon") == UFSECP_OK,\n'
        ' "bip39_validate accepts or rejects known mnemonic");'
    )
    new_check = fixed_call + ';'

    if old_check in content:
        content = content.replace(old_check, new_check)
        print(' REDUNDANT: L1055 tautological check fixed')
    else:
        print(' REDUNDANT_SKIP: L1055 exact pattern not found, trying partial match')
        # The assertion message identifies the check even when its
        # whitespace differs from the literal above.
        old_pattern = 'bip39_validate accepts or rejects known mnemonic'
        if old_pattern in content:
            pattern = re.compile(
                r'CHECK\(ufsecp_bip39_validate\(ctx,\s*"abandon[^"]+"\)\s*!=\s*UFSECP_OK\s*\n'
                r'\s*\|\|\s*ufsecp_bip39_validate\(ctx,\s*"abandon[^"]+"\)\s*==\s*UFSECP_OK,\s*\n'
                r'\s*"bip39_validate accepts or rejects known mnemonic"\)',
                re.MULTILINE
            )
            content, n = pattern.subn(fixed_call, content)
            if n:
                print(f' REDUNDANT: L1055 fixed via regex ({n} replacement)')
            else:
                print(' REDUNDANT_FAIL: could not fix L1055')
        else:
            # Previously this case ended silently right after announcing a
            # partial match attempt.
            print(' REDUNDANT_SKIP: L1055 check not present, nothing to fix')

    (BASE / path).write_text(content)
    lines = read(path)

    # misc-const-correctness, located by pattern because the edit above may
    # have shifted line numbers.
    for idx in range(len(lines)):
        line = lines[idx]
        stripped = line.lstrip()
        leading = line[:len(line) - len(stripped)]

        if stripped.startswith('const '):
            continue

        # Compare without the line terminator so the match does not depend on
        # whether read() keeps '\n', on CRLF endings, or on trailing blanks;
        # the original terminator is preserved in the rewritten line.
        if stripped.rstrip() == 'size_t msg_len = 15;':
            lines[idx] = leading + 'const ' + stripped
            print(f' CONST: L{idx+1} size_t msg_len = 15')
        elif stripped.startswith('bool match = (std::memcmp('):
            lines[idx] = leading + 'const ' + stripped
            print(f' CONST: L{idx+1} bool match')

    save(path, lines)
    print(f' Saved {path} ({len(lines)} lines)')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Main
|
||||
# ============================================================================
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: run every per-file fixer in a fixed order, then do
    # a coarse sanity check that each touched file still has as many '{' as
    # '}' characters.
    print('Fix Round 4: resolving 211 code-scanning alerts')
    print('=' * 60)

    # Library / implementation sources
    fix_ufsecp_impl()
    fix_bip39()
    fix_zk()
    fix_message_signing()
    fix_eth_signing()
    fix_address()
    fix_wallet()
    fix_coin_address()
    # Test sources
    fix_test_bip39()
    fix_test_ethereum()
    fix_test_wallet()
    fix_test_zk()
    fix_test_ffi()

    print('\n' + '=' * 60)
    print('Done. Check brace balance:')
    files = [
        'include/ufsecp/ufsecp_impl.cpp',
        'cpu/src/bip39.cpp',
        'cpu/src/zk.cpp',
        'cpu/src/message_signing.cpp',
        'cpu/src/eth_signing.cpp',
        'cpu/src/address.cpp',
        'cpu/src/wallet.cpp',
        'cpu/src/coin_address.cpp',
        'cpu/tests/test_bip39.cpp',
        'cpu/tests/test_ethereum.cpp',
        'cpu/tests/test_wallet.cpp',
        'cpu/tests/test_zk.cpp',
        'audit/test_ffi_round_trip.cpp',
    ]
    all_ok = True
    for f in files:
        try:
            text = (BASE / f).read_text()
            opens, closes = text.count('{'), text.count('}')
            if opens == closes:
                status = 'OK'
            else:
                status = f'MISMATCH ({opens} vs {closes})'
                all_ok = False
            print(f' {f}: {status}')
        except Exception as e:
            # An unreadable file counts as a failed check but must not stop
            # the remaining files from being reported.
            print(f' {f}: ERROR {e}')
            all_ok = False

    print('\nAll brace counts balanced.' if all_ok
          else '\nWARNING: Some files have mismatched braces!')
|
||||
@ -197,13 +197,11 @@ public:
|
||||
std::vector<secp256k1::opencl::AffinePoint> h_aff(count);
|
||||
ctx_->batch_jacobian_to_affine(h_jac.data(), h_aff.data(), count);
|
||||
|
||||
/* CPU: SHA-256(x_bytes) → 32-byte shared secret */
|
||||
/* CPU: SHA-256(compressed shared point) to match ufsecp_ecdh/CUDA. */
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
std::array<uint64_t, 4> xl;
|
||||
std::memcpy(xl.data(), h_aff[i].x.limbs, 32);
|
||||
auto fe = secp256k1::fast::FieldElement::from_limbs(xl);
|
||||
auto xbytes = fe.to_bytes();
|
||||
auto digest = secp256k1::SHA256::hash(xbytes.data(), 32);
|
||||
uint8_t compressed[33];
|
||||
affine_to_compressed(&h_aff[i], compressed);
|
||||
auto digest = secp256k1::SHA256::hash(compressed, sizeof(compressed));
|
||||
std::memcpy(out_secrets32 + i * 32, digest.data(), 32);
|
||||
}
|
||||
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
"vendor": "NVIDIA Corporation",
|
||||
"version": "OpenCL 3.0 CUDA",
|
||||
"driver_version": "580.126.09",
|
||||
"memory_mb": 15847,
|
||||
"memory_mb": 15844,
|
||||
"compute_units": 36
|
||||
},
|
||||
"platform": {
|
||||
@ -20,36 +20,36 @@
|
||||
"passed": 27,
|
||||
"failed": 0,
|
||||
"skipped": 0,
|
||||
"total_seconds": 0.727543,
|
||||
"total_seconds": 0.673606,
|
||||
"verdict": "AUDIT-READY"
|
||||
},
|
||||
"modules": [
|
||||
{ "id": "selftest_core", "name": "OpenCL Selftest (23+ kernel tests)", "section": "math_invariants", "result": "PASS", "time_ms": 152.799583, "error_code": 0 },
|
||||
{ "id": "field_add_sub", "name": "Field add/sub roundtrip", "section": "math_invariants", "result": "PASS", "time_ms": 0.307649, "error_code": 0 },
|
||||
{ "id": "field_mul_comm", "name": "Field mul commutativity", "section": "math_invariants", "result": "PASS", "time_ms": 0.140150, "error_code": 0 },
|
||||
{ "id": "field_inv", "name": "Field inverse roundtrip (a * a^-1 = 1)", "section": "math_invariants", "result": "PASS", "time_ms": 0.266819, "error_code": 0 },
|
||||
{ "id": "field_sqr", "name": "Field square == mul(a,a)", "section": "math_invariants", "result": "PASS", "time_ms": 0.120384, "error_code": 0 },
|
||||
{ "id": "field_negate", "name": "Field negate roundtrip (a + (-a) = 0)", "section": "math_invariants", "result": "PASS", "time_ms": 0.118151, "error_code": 0 },
|
||||
{ "id": "gen_mul_vec", "name": "Generator mul known vectors", "section": "math_invariants", "result": "PASS", "time_ms": 0.069495, "error_code": 0 },
|
||||
{ "id": "scalar_roundtrip", "name": "Scalar/Point consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.196458, "error_code": 0 },
|
||||
{ "id": "add_dbl_consist", "name": "Point add vs double consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.226176, "error_code": 0 },
|
||||
{ "id": "scalar_mul_lin", "name": "Scalar mul linearity (a+b)*G = aG+bG", "section": "math_invariants", "result": "PASS", "time_ms": 0.334149, "error_code": 0 },
|
||||
{ "id": "group_order", "name": "Group order basic checks", "section": "math_invariants", "result": "PASS", "time_ms": 0.224126, "error_code": 0 },
|
||||
{ "id": "batch_inv", "name": "Batch inversion (Montgomery trick)", "section": "math_invariants", "result": "PASS", "time_ms": 0.639383, "error_code": 0 },
|
||||
{ "id": "ecdsa_roundtrip", "name": "ECDSA sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 7.346328, "error_code": 0 },
|
||||
{ "id": "schnorr_roundtrip", "name": "Schnorr/BIP-340 sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 6.647268, "error_code": 0 },
|
||||
{ "id": "ecdsa_wrong_key", "name": "ECDSA verify rejects wrong pubkey", "section": "signatures", "result": "PASS", "time_ms": 6.214200, "error_code": 0 },
|
||||
{ "id": "batch_smul", "name": "Batch scalar mul generator", "section": "batch_advanced", "result": "PASS", "time_ms": 0.435053, "error_code": 0 },
|
||||
{ "id": "batch_j2a", "name": "Batch Jacobian to Affine", "section": "batch_advanced", "result": "PASS", "time_ms": 0.518009, "error_code": 0 },
|
||||
{ "id": "diff_smul", "name": "OpenCL-host differential scalar mul", "section": "differential", "result": "PASS", "time_ms": 0.070056, "error_code": 0 },
|
||||
{ "id": "rfc6979_determ", "name": "RFC-6979 ECDSA deterministic nonce", "section": "standard_vectors", "result": "PASS", "time_ms": 6.422079, "error_code": 0 },
|
||||
{ "id": "bip340_vectors", "name": "BIP-340 Schnorr known-key roundtrip", "section": "standard_vectors", "result": "PASS", "time_ms": 8.872533, "error_code": 0 },
|
||||
{ "id": "ecdsa_multi_key", "name": "ECDSA multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 60.908449, "error_code": 0 },
|
||||
{ "id": "schnorr_multi_key", "name": "Schnorr multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 55.286184, "error_code": 0 },
|
||||
{ "id": "fuzz_edge_scalar", "name": "Edge-case scalars (0*G, 1*G, G+G=2G)", "section": "fuzzing", "result": "PASS", "time_ms": 0.229781, "error_code": 0 },
|
||||
{ "id": "fuzz_ecdsa_zero", "name": "ECDSA rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.077824, "error_code": 0 },
|
||||
{ "id": "fuzz_schnorr_zero", "name": "Schnorr rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.082019, "error_code": 0 },
|
||||
{ "id": "perf_ecdsa_50", "name": "ECDSA 50-iteration stress", "section": "performance", "result": "PASS", "time_ms": 287.677880, "error_code": 0 },
|
||||
{ "id": "perf_schnorr_25", "name": "Schnorr 25-iteration stress", "section": "performance", "result": "PASS", "time_ms": 131.178937, "error_code": 0 }
|
||||
{ "id": "selftest_core", "name": "OpenCL Selftest (23+ kernel tests)", "section": "math_invariants", "result": "PASS", "time_ms": 137.571479, "error_code": 0 },
|
||||
{ "id": "field_add_sub", "name": "Field add/sub roundtrip", "section": "math_invariants", "result": "PASS", "time_ms": 0.335681, "error_code": 0 },
|
||||
{ "id": "field_mul_comm", "name": "Field mul commutativity", "section": "math_invariants", "result": "PASS", "time_ms": 0.215808, "error_code": 0 },
|
||||
{ "id": "field_inv", "name": "Field inverse roundtrip (a * a^-1 = 1)", "section": "math_invariants", "result": "PASS", "time_ms": 0.280040, "error_code": 0 },
|
||||
{ "id": "field_sqr", "name": "Field square == mul(a,a)", "section": "math_invariants", "result": "PASS", "time_ms": 0.129584, "error_code": 0 },
|
||||
{ "id": "field_negate", "name": "Field negate roundtrip (a + (-a) = 0)", "section": "math_invariants", "result": "PASS", "time_ms": 0.131630, "error_code": 0 },
|
||||
{ "id": "gen_mul_vec", "name": "Generator mul known vectors", "section": "math_invariants", "result": "PASS", "time_ms": 0.062463, "error_code": 0 },
|
||||
{ "id": "scalar_roundtrip", "name": "Scalar/Point consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.121435, "error_code": 0 },
|
||||
{ "id": "add_dbl_consist", "name": "Point add vs double consistency", "section": "math_invariants", "result": "PASS", "time_ms": 0.219232, "error_code": 0 },
|
||||
{ "id": "scalar_mul_lin", "name": "Scalar mul linearity (a+b)*G = aG+bG", "section": "math_invariants", "result": "PASS", "time_ms": 0.330590, "error_code": 0 },
|
||||
{ "id": "group_order", "name": "Group order basic checks", "section": "math_invariants", "result": "PASS", "time_ms": 0.199699, "error_code": 0 },
|
||||
{ "id": "batch_inv", "name": "Batch inversion (Montgomery trick)", "section": "math_invariants", "result": "PASS", "time_ms": 0.353371, "error_code": 0 },
|
||||
{ "id": "ecdsa_roundtrip", "name": "ECDSA sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 7.218292, "error_code": 0 },
|
||||
{ "id": "schnorr_roundtrip", "name": "Schnorr/BIP-340 sign + verify roundtrip", "section": "signatures", "result": "PASS", "time_ms": 5.841064, "error_code": 0 },
|
||||
{ "id": "ecdsa_wrong_key", "name": "ECDSA verify rejects wrong pubkey", "section": "signatures", "result": "PASS", "time_ms": 6.111775, "error_code": 0 },
|
||||
{ "id": "batch_smul", "name": "Batch scalar mul generator", "section": "batch_advanced", "result": "PASS", "time_ms": 0.093349, "error_code": 0 },
|
||||
{ "id": "batch_j2a", "name": "Batch Jacobian to Affine", "section": "batch_advanced", "result": "PASS", "time_ms": 0.083400, "error_code": 0 },
|
||||
{ "id": "diff_smul", "name": "OpenCL-host differential scalar mul", "section": "differential", "result": "PASS", "time_ms": 0.020089, "error_code": 0 },
|
||||
{ "id": "rfc6979_determ", "name": "RFC-6979 ECDSA deterministic nonce", "section": "standard_vectors", "result": "PASS", "time_ms": 6.313681, "error_code": 0 },
|
||||
{ "id": "bip340_vectors", "name": "BIP-340 Schnorr known-key roundtrip", "section": "standard_vectors", "result": "PASS", "time_ms": 7.330723, "error_code": 0 },
|
||||
{ "id": "ecdsa_multi_key", "name": "ECDSA multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 59.571898, "error_code": 0 },
|
||||
{ "id": "schnorr_multi_key", "name": "Schnorr multi-key (10 keys) sign+verify", "section": "protocol_security", "result": "PASS", "time_ms": 47.122783, "error_code": 0 },
|
||||
{ "id": "fuzz_edge_scalar", "name": "Edge-case scalars (0*G, 1*G, G+G=2G)", "section": "fuzzing", "result": "PASS", "time_ms": 0.078238, "error_code": 0 },
|
||||
{ "id": "fuzz_ecdsa_zero", "name": "ECDSA rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.042447, "error_code": 0 },
|
||||
{ "id": "fuzz_schnorr_zero", "name": "Schnorr rejects zero private key", "section": "fuzzing", "result": "PASS", "time_ms": 0.045454, "error_code": 0 },
|
||||
{ "id": "perf_ecdsa_50", "name": "ECDSA 50-iteration stress", "section": "performance", "result": "PASS", "time_ms": 282.603579, "error_code": 0 },
|
||||
{ "id": "perf_schnorr_25", "name": "Schnorr 25-iteration stress", "section": "performance", "result": "PASS", "time_ms": 111.053794, "error_code": 0 }
|
||||
]
|
||||
}
|
||||
|
||||
@ -2,60 +2,60 @@
|
||||
UltrafastSecp256k1 -- OpenCL Unified Audit Report
|
||||
Framework v2.0.0
|
||||
Linux x86-64 | GCC 14.2.0 | Release
|
||||
Device: NVIDIA GeForce RTX 5060 Ti (NVIDIA Corporation) | 36 CUs | 15847 MB
|
||||
Device: NVIDIA GeForce RTX 5060 Ti (NVIDIA Corporation) | 36 CUs | 15844 MB
|
||||
================================================================
|
||||
|
||||
|
||||
Section: math_invariants
|
||||
--------------------------------------------------
|
||||
[PASS] OpenCL Selftest (23+ kernel tests) (152.8 ms)
|
||||
[PASS] Field add/sub roundtrip (0.307649 ms)
|
||||
[PASS] Field mul commutativity (0.14015 ms)
|
||||
[PASS] Field inverse roundtrip (a * a^-1 = 1) (0.266819 ms)
|
||||
[PASS] Field square == mul(a,a) (0.120384 ms)
|
||||
[PASS] Field negate roundtrip (a + (-a) = 0) (0.118151 ms)
|
||||
[PASS] Generator mul known vectors (0.069495 ms)
|
||||
[PASS] Scalar/Point consistency (0.196458 ms)
|
||||
[PASS] Point add vs double consistency (0.226176 ms)
|
||||
[PASS] Scalar mul linearity (a+b)*G = aG+bG (0.334149 ms)
|
||||
[PASS] Group order basic checks (0.224126 ms)
|
||||
[PASS] Batch inversion (Montgomery trick) (0.639383 ms)
|
||||
[PASS] OpenCL Selftest (23+ kernel tests) (137.571 ms)
|
||||
[PASS] Field add/sub roundtrip (0.335681 ms)
|
||||
[PASS] Field mul commutativity (0.215808 ms)
|
||||
[PASS] Field inverse roundtrip (a * a^-1 = 1) (0.28004 ms)
|
||||
[PASS] Field square == mul(a,a) (0.129584 ms)
|
||||
[PASS] Field negate roundtrip (a + (-a) = 0) (0.13163 ms)
|
||||
[PASS] Generator mul known vectors (0.062463 ms)
|
||||
[PASS] Scalar/Point consistency (0.121435 ms)
|
||||
[PASS] Point add vs double consistency (0.219232 ms)
|
||||
[PASS] Scalar mul linearity (a+b)*G = aG+bG (0.33059 ms)
|
||||
[PASS] Group order basic checks (0.199699 ms)
|
||||
[PASS] Batch inversion (Montgomery trick) (0.353371 ms)
|
||||
|
||||
Section: signatures
|
||||
--------------------------------------------------
|
||||
[PASS] ECDSA sign + verify roundtrip (7.34633 ms)
|
||||
[PASS] Schnorr/BIP-340 sign + verify roundtrip (6.64727 ms)
|
||||
[PASS] ECDSA verify rejects wrong pubkey (6.2142 ms)
|
||||
[PASS] ECDSA sign + verify roundtrip (7.21829 ms)
|
||||
[PASS] Schnorr/BIP-340 sign + verify roundtrip (5.84106 ms)
|
||||
[PASS] ECDSA verify rejects wrong pubkey (6.11177 ms)
|
||||
|
||||
Section: batch_advanced
|
||||
--------------------------------------------------
|
||||
[PASS] Batch scalar mul generator (0.435053 ms)
|
||||
[PASS] Batch Jacobian to Affine (0.518009 ms)
|
||||
[PASS] Batch scalar mul generator (0.093349 ms)
|
||||
[PASS] Batch Jacobian to Affine (0.0834 ms)
|
||||
|
||||
Section: differential
|
||||
--------------------------------------------------
|
||||
[PASS] OpenCL-host differential scalar mul (0.070056 ms)
|
||||
[PASS] OpenCL-host differential scalar mul (0.020089 ms)
|
||||
|
||||
Section: standard_vectors
|
||||
--------------------------------------------------
|
||||
[PASS] RFC-6979 ECDSA deterministic nonce (6.42208 ms)
|
||||
[PASS] BIP-340 Schnorr known-key roundtrip (8.87253 ms)
|
||||
[PASS] RFC-6979 ECDSA deterministic nonce (6.31368 ms)
|
||||
[PASS] BIP-340 Schnorr known-key roundtrip (7.33072 ms)
|
||||
|
||||
Section: protocol_security
|
||||
--------------------------------------------------
|
||||
[PASS] ECDSA multi-key (10 keys) sign+verify (60.9084 ms)
|
||||
[PASS] Schnorr multi-key (10 keys) sign+verify (55.2862 ms)
|
||||
[PASS] ECDSA multi-key (10 keys) sign+verify (59.5719 ms)
|
||||
[PASS] Schnorr multi-key (10 keys) sign+verify (47.1228 ms)
|
||||
|
||||
Section: fuzzing
|
||||
--------------------------------------------------
|
||||
[PASS] Edge-case scalars (0*G, 1*G, G+G=2G) (0.229781 ms)
|
||||
[PASS] ECDSA rejects zero private key (0.077824 ms)
|
||||
[PASS] Schnorr rejects zero private key (0.082019 ms)
|
||||
[PASS] Edge-case scalars (0*G, 1*G, G+G=2G) (0.078238 ms)
|
||||
[PASS] ECDSA rejects zero private key (0.042447 ms)
|
||||
[PASS] Schnorr rejects zero private key (0.045454 ms)
|
||||
|
||||
Section: performance
|
||||
--------------------------------------------------
|
||||
[PASS] ECDSA 50-iteration stress (287.678 ms)
|
||||
[PASS] Schnorr 25-iteration stress (131.179 ms)
|
||||
[PASS] ECDSA 50-iteration stress (282.604 ms)
|
||||
[PASS] Schnorr 25-iteration stress (111.054 ms)
|
||||
|
||||
================================================================
|
||||
VERDICT: AUDIT-READY
|
||||
|
||||
@ -145,12 +145,20 @@ set(KERNEL_FILE_1 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_field.cl")
|
||||
set(KERNEL_FILE_2 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_point.cl")
|
||||
set(KERNEL_FILE_3 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_batch.cl")
|
||||
set(KERNEL_FILE_4 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_affine.cl")
|
||||
set(KERNEL_FILE_5 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_extended.cl")
|
||||
set(KERNEL_FILE_6 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_hash160.cl")
|
||||
set(KERNEL_FILE_7 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_ecdh.cl")
|
||||
set(KERNEL_FILE_8 "${CMAKE_CURRENT_SOURCE_DIR}/kernels/secp256k1_bip352.cl")
|
||||
|
||||
set(KERNEL_FILES_LIST
|
||||
${KERNEL_FILE_1}
|
||||
${KERNEL_FILE_2}
|
||||
${KERNEL_FILE_3}
|
||||
${KERNEL_FILE_4}
|
||||
${KERNEL_FILE_5}
|
||||
${KERNEL_FILE_6}
|
||||
${KERNEL_FILE_7}
|
||||
${KERNEL_FILE_8}
|
||||
)
|
||||
|
||||
set(KERNEL_HEADER "${CMAKE_CURRENT_BINARY_DIR}/include/secp256k1_kernels_embedded.hpp")
|
||||
@ -215,6 +223,20 @@ else()
|
||||
)
|
||||
endif()
|
||||
|
||||
# BIP-352 (Silent Payments) OpenCL pipeline benchmark.
add_executable(opencl_bip352_benchmark benchmarks/bench_bip352_opencl.cpp)

# Link the OpenCL backend plus the CPU library: the in-tree target when it
# exists, followed by whatever FASTSECP256K1_LIB expands to.
target_link_libraries(opencl_bip352_benchmark
    PRIVATE
        secp256k1_opencl
        $<TARGET_NAME_IF_EXISTS:fastsecp256k1>
        ${FASTSECP256K1_LIB}
)

# The benchmark loads the .cl sources from the source tree at run time.
target_compile_definitions(opencl_bip352_benchmark
    PRIVATE
        SECP256K1_OPENCL_KERNEL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/kernels"
)
|
||||
|
||||
# =============================================================================
|
||||
# Test Executable
|
||||
# =============================================================================
|
||||
@ -290,4 +312,3 @@ install(DIRECTORY kernels/
|
||||
DESTINATION share/secp256k1/opencl
|
||||
FILES_MATCHING PATTERN "*.cl"
|
||||
)
|
||||
|
||||
|
||||
641
opencl/benchmarks/bench_bip352_opencl.cpp
Normal file
641
opencl/benchmarks/bench_bip352_opencl.cpp
Normal file
@ -0,0 +1,641 @@
|
||||
#include "secp256k1_opencl.hpp"
|
||||
#include "secp256k1/batch_add_affine.hpp"
|
||||
#include "secp256k1/fast.hpp"
|
||||
#include "secp256k1/glv.hpp"
|
||||
#include "secp256k1/tagged_hash.hpp"
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 120
|
||||
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <chrono>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using CpuPoint = secp256k1::fast::Point;
|
||||
using CpuScalar = secp256k1::fast::Scalar;
|
||||
using CpuField = secp256k1::fast::FieldElement;
|
||||
using OclAffine = secp256k1::opencl::AffinePoint;
|
||||
using OclField = secp256k1::opencl::FieldElement;
|
||||
using OclScalar = secp256k1::opencl::Scalar;
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr int BENCH_N = 10000;
|
||||
constexpr int BENCH_WARMUP = 3;
|
||||
constexpr int BENCH_PASSES = 11;
|
||||
// RTX 5060 Ti (and most NVIDIA): warp=32, SM occupancy peaks at 128-256 threads.
|
||||
// Previous defaults (64/32) left SMs underutilized.
|
||||
constexpr int DEFAULT_LOCAL_SIZE_FUSED = 128;
|
||||
constexpr int DEFAULT_LOCAL_SIZE_LUT = 128;
|
||||
constexpr std::size_t LUT_WINDOWS = 16;
|
||||
constexpr std::size_t LUT_ENTRIES = 65536;
|
||||
|
||||
constexpr uint8_t SCAN_KEY[32] = {
|
||||
0xc4,0x23,0x9f,0xd6,0xfc,0x3d,0xb6,0xe2,
|
||||
0x2b,0x8b,0xed,0x6a,0x49,0x21,0x9e,0x4e,
|
||||
0x30,0xd7,0xd6,0xa3,0xb9,0x82,0x94,0xb1,
|
||||
0x38,0xaf,0x4a,0xd3,0x00,0xda,0x1a,0x42
|
||||
};
|
||||
|
||||
constexpr uint8_t SPEND_PUBKEY_COMPRESSED[33] = {
|
||||
0x02,
|
||||
0xe2,0xed,0x4b,0x9c,0xe9,0x14,0x5e,0x17,
|
||||
0x21,0xf1,0x1f,0x99,0x5f,0x72,0x6e,0xf8,
|
||||
0xcf,0x50,0xfc,0x85,0x92,0x89,0xac,0x94,
|
||||
0x4b,0x2d,0xaf,0xe5,0x03,0xa3,0xc7,0x4c
|
||||
};
|
||||
|
||||
// Host-side mirror of the BIP352ScanKeyGlv typedef in secp256k1_bip352.cl.
// The two definitions must match exactly: the struct is uploaded to the
// device as raw bytes. Every member is a single-byte type, so the layout is
// free of implicit padding; the static_assert below guards the total size.
struct BIP352ScanKeyGlv {
    std::int8_t  wnaf1[130]{};  // +0:   wNAF digits for k1 half-scalar
    std::int8_t  wnaf2[130]{};  // +130: wNAF digits for k2 half-scalar
    std::uint8_t k1_neg{0};     // +260: 1 if k1 negative (negate base.y)
    std::uint8_t flip_phi{0};   // +261: 1 if phi table y should be negated
    std::uint8_t pad0{0};       // +262: padding
    std::uint8_t pad1{0};       // +263: padding
};  // Total: 264 bytes

static_assert(sizeof(BIP352ScanKeyGlv) == 264,
              "BIP352ScanKeyGlv must stay 264 bytes to match secp256k1_bip352.cl");
|
||||
|
||||
// Compute the 5-bit windowed NAF digits of a scalar, always producing exactly
// 130 digits (least significant first).
// Mirrors the GPU's scalar_to_wnaf fixed-130-iteration version.
// scalar_bytes: big-endian 32-byte scalar (upper 128 bits should be zero for GLV halves).
// wnaf:         output; each digit is 0 or an odd value in [-15, 15].
static void host_compute_wnaf(const std::uint8_t* scalar_bytes, std::int8_t wnaf[130]) {
    // Load the 32 big-endian bytes into four 64-bit limbs, limbs[0] least significant.
    std::uint64_t limbs[4] = {};
    for (int word = 0; word < 4; ++word) {
        std::uint64_t acc = 0;
        for (int byte = 0; byte < 8; ++byte) {
            acc = (acc << 8) | scalar_bytes[word * 8 + byte];
        }
        limbs[3 - word] = acc;
    }

    for (int pos = 0; pos < 130; ++pos) {
        std::int8_t digit = 0;
        if ((limbs[0] & 1ULL) != 0) {
            // Odd: take the low five bits as a signed window.
            int window = static_cast<int>(limbs[0] & 0x1FULL);
            const std::uint64_t before = limbs[0];
            if (window >= 16) {
                // Negative digit: add |window| back and ripple the carry upwards.
                window -= 32;
                limbs[0] += static_cast<std::uint64_t>(-window);
                if (limbs[0] < before) {
                    for (int j = 1; j < 4; ++j) {
                        ++limbs[j];
                        if (limbs[j] != 0) break;
                    }
                }
            } else {
                // Positive digit: subtract it and ripple the borrow upwards.
                limbs[0] -= static_cast<std::uint64_t>(window);
                if (limbs[0] > before) {
                    for (int j = 1; j < 4; ++j) {
                        const std::uint64_t old = limbs[j];
                        --limbs[j];
                        if (old != 0) break;
                    }
                }
            }
            digit = static_cast<std::int8_t>(window);
        }
        wnaf[pos] = digit;

        // Shift the 256-bit value right by one bit.
        for (int j = 0; j < 3; ++j) {
            limbs[j] = (limbs[j] >> 1) | (limbs[j + 1] << 63);
        }
        limbs[3] >>= 1;
    }
}
|
||||
|
||||
// SHA-256 round constants K[0..63].
static const uint32_t host_sha256_k[64] = {
    0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
    0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
    0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
    0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
    0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
    0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
    0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
    0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};

// Rotate a 32-bit word right by b bits.
// The count is reduced modulo 32 so that b == 0 never turns into a shift by
// the full word width, which is undefined behaviour.
inline uint32_t rotr32(uint32_t a, uint32_t b) {
    b &= 31u;
    return (a >> b) | (a << ((32u - b) & 31u));
}

// Plain, unaccelerated SHA-256 of msg[0..len), written to out[0..32).
// msg may be null when len is 0.
void host_sha256(const uint8_t* msg, size_t len, uint8_t out[32]) {
    uint32_t h0=0x6a09e667, h1=0xbb67ae85, h2=0x3c6ef372, h3=0xa54ff53a;
    uint32_t h4=0x510e527f, h5=0x9b05688c, h6=0x1f83d9ab, h7=0x5be0cd19;

    // The length field is 64 bits wide; keep it in a 64-bit type so the byte
    // extraction below stays well-defined where size_t is only 32 bits.
    const uint64_t bit_len = static_cast<uint64_t>(len) * 8u;

    // Message + 0x80 marker + zero fill + 8-byte length, rounded up to 64 bytes.
    size_t padded = ((len + 9 + 63) / 64) * 64;
    std::vector<uint8_t> buf(padded, 0);
    if (len != 0) std::memcpy(buf.data(), msg, len);
    buf[len] = 0x80;
    for (int i = 7; i >= 0; --i) buf[padded - 1 - i] = static_cast<uint8_t>(bit_len >> (i * 8));

    for (size_t off = 0; off < padded; off += 64) {
        // Message schedule: 16 big-endian words, expanded to 64.
        uint32_t w[64];
        for (int i = 0; i < 16; i++) {
            w[i] = (static_cast<uint32_t>(buf[off+i*4]) << 24) |
                   (static_cast<uint32_t>(buf[off+i*4+1]) << 16) |
                   (static_cast<uint32_t>(buf[off+i*4+2]) << 8) |
                   buf[off+i*4+3];
        }
        for (int i = 16; i < 64; i++) {
            uint32_t s0 = rotr32(w[i-15], 7) ^ rotr32(w[i-15], 18) ^ (w[i-15] >> 3);
            uint32_t s1 = rotr32(w[i-2], 17) ^ rotr32(w[i-2], 19) ^ (w[i-2] >> 10);
            w[i] = w[i-16] + s0 + w[i-7] + s1;
        }

        // Compression rounds.
        uint32_t a=h0,b=h1,c=h2,d=h3,e=h4,f=h5,g=h6,hh=h7;
        for (int i = 0; i < 64; i++) {
            uint32_t S1 = rotr32(e,6)^rotr32(e,11)^rotr32(e,25);
            uint32_t ch = (e&f)^(~e&g);
            uint32_t t1 = hh+S1+ch+host_sha256_k[i]+w[i];
            uint32_t S0 = rotr32(a,2)^rotr32(a,13)^rotr32(a,22);
            uint32_t maj = (a&b)^(a&c)^(b&c);
            uint32_t t2 = S0+maj;
            hh=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
        }
        h0+=a; h1+=b; h2+=c; h3+=d; h4+=e; h5+=f; h6+=g; h7+=hh;
    }

    // Serialise the eight state words big-endian.
    auto store = [&](uint32_t v, int i) {
        out[i*4]   = static_cast<uint8_t>(v >> 24);
        out[i*4+1] = static_cast<uint8_t>(v >> 16);
        out[i*4+2] = static_cast<uint8_t>(v >> 8);
        out[i*4+3] = static_cast<uint8_t>(v);
    };
    store(h0,0); store(h1,1); store(h2,2); store(h3,3);
    store(h4,4); store(h5,5); store(h6,6); store(h7,7);
}
|
||||
|
||||
// Decode a 33-byte compressed public key (0x02/0x03 prefix followed by the
// x coordinate) into a CPU point. Returns CpuPoint::infinity() when the
// prefix is invalid, x fails strict parsing, or x^3 + 7 has no square root.
CpuPoint point_from_compressed(const uint8_t* pub33) {
    const uint8_t prefix = pub33[0];
    if (prefix != 0x02 && prefix != 0x03) return CpuPoint::infinity();

    CpuField x;
    if (!CpuField::parse_bytes_strict(pub33 + 1, x)) return CpuPoint::infinity();

    // Right-hand side of the curve equation: x^3 + 7.
    auto x_sq = x * x;
    auto x_cu = x_sq * x;
    auto rhs = x_cu + CpuField::from_uint64(7);

    // Square `v` `n` times in a row.
    auto sqr_n = [](auto v, int n) {
        for (int i = 0; i < n; ++i) v = v.square();
        return v;
    };

    // Fixed square-and-multiply chain computing the candidate root of rhs.
    auto t    = rhs;
    auto r2   = t.square() * t;
    auto r3   = r2.square() * t;
    auto r6   = sqr_n(r3, 3) * r3;
    auto r9   = sqr_n(r6, 3) * r3;
    auto r11  = sqr_n(r9, 2) * r2;
    auto r22  = sqr_n(r11, 11) * r11;
    auto r44  = sqr_n(r22, 22) * r22;
    auto r88  = sqr_n(r44, 44) * r44;
    auto r176 = sqr_n(r88, 88) * r88;
    auto r220 = sqr_n(r176, 44) * r44;
    auto r223 = sqr_n(r220, 3) * r3;

    auto y = sqr_n(r223, 23) * r22;
    y = sqr_n(y, 6) * r2;
    y = sqr_n(y, 2);

    // The chain only yields a root when rhs is a square; verify it.
    if (!(y * y == rhs)) return CpuPoint::infinity();

    // Select the root whose parity matches the prefix byte.
    auto y_be = y.to_bytes();
    const bool have_odd = (y_be[31] & 1) != 0;
    const bool need_odd = (prefix == 0x03);
    if (have_odd != need_odd) y = CpuField::from_uint64(0) - y;

    return CpuPoint::from_affine(x, y);
}
|
||||
|
||||
// Convert 32 big-endian bytes into an OpenCL field element whose four 64-bit
// limbs are stored least significant first (limbs[0] holds bytes 24..31).
OclField bytes_to_ocl_field(const uint8_t* bytes32) {
    OclField field{};
    const uint8_t* cursor = bytes32;
    // Walk the input front to back while filling limbs from the top down.
    for (int limb = 3; limb >= 0; --limb) {
        uint64_t word = 0;
        for (int k = 0; k < 8; ++k) {
            word = (word << 8) | *cursor++;
        }
        field.limbs[limb] = word;
    }
    return field;
}
|
||||
|
||||
|
||||
// Convert a CPU point into the OpenCL affine layout by serialising the
// values returned by x() and y() and re-packing them as limbs.
OclAffine to_ocl_affine(const CpuPoint& p) {
    auto x_be = p.x().to_bytes();
    auto y_be = p.y().to_bytes();

    OclAffine result{};
    result.x = bytes_to_ocl_field(x_be.data());
    result.y = bytes_to_ocl_field(y_be.data());
    return result;
}
|
||||
|
||||
// Convert a compact CPU affine point (public x / y members) into the OpenCL
// affine layout.
OclAffine to_ocl_affine(const secp256k1::fast::AffinePointCompact& p) {
    auto x_be = p.x.to_bytes();
    auto y_be = p.y.to_bytes();

    OclAffine result{};
    result.x = bytes_to_ocl_field(x_be.data());
    result.y = bytes_to_ocl_field(y_be.data());
    return result;
}
|
||||
|
||||
// Read the first eight bytes of x_bytes as one big-endian 64-bit integer.
uint64_t extract_upper_64(const uint8_t* x_bytes) {
    uint64_t prefix = 0;
    for (const uint8_t* p = x_bytes; p != x_bytes + 8; ++p) {
        prefix = (prefix << 8) | *p;
    }
    return prefix;
}
|
||||
|
||||
// Read a whole file into a string, byte for byte (binary mode).
// Throws std::runtime_error when the file cannot be opened.
std::string read_text(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    if (file) {
        std::ostringstream contents;
        contents << file.rdbuf();
        return contents.str();
    }
    throw std::runtime_error("failed to open: " + path);
}
|
||||
|
||||
// Directory part of a path: everything before the last '/' or '\\'.
// Returns "." when the path contains no separator.
std::string dirname_of(const std::string& path) {
    const std::string::size_type sep = path.find_last_of("/\\");
    if (sep == std::string::npos) return ".";
    return path.substr(0, sep);
}
|
||||
|
||||
// Return s with leading spaces and tabs removed.
// A single find + erase replaces the previous one-character-at-a-time
// erase(begin()) loop, which was quadratic in the amount of leading whitespace.
std::string trim_left(std::string s) {
    // find_first_not_of yields npos for an all-blank (or empty) string, and
    // erase(0, npos) then clears the whole string.
    s.erase(0, s.find_first_not_of(" \t"));
    return s;
}
|
||||
|
||||
std::string expand_kernel_file(const std::string& path, std::set<std::string>& include_stack) {
|
||||
if (include_stack.count(path)) return {};
|
||||
include_stack.insert(path);
|
||||
std::istringstream in(read_text(path));
|
||||
std::ostringstream out;
|
||||
std::string dir = dirname_of(path);
|
||||
std::string line;
|
||||
while (std::getline(in, line)) {
|
||||
std::string trimmed = trim_left(line);
|
||||
if (trimmed.rfind("#include \"", 0) == 0) {
|
||||
auto start = trimmed.find('"') + 1;
|
||||
auto end = trimmed.find('"', start);
|
||||
std::string child = dir + "/" + trimmed.substr(start, end - start);
|
||||
out << expand_kernel_file(child, include_stack);
|
||||
continue;
|
||||
}
|
||||
out << line << '\n';
|
||||
}
|
||||
include_stack.erase(path);
|
||||
return out.str();
|
||||
}
|
||||
|
||||
std::string load_bip352_kernel_source() {
|
||||
std::set<std::string> stack;
|
||||
return expand_kernel_file(std::string(SECP256K1_OPENCL_KERNEL_DIR) + "/secp256k1_bip352.cl", stack);
|
||||
}
|
||||
|
||||
std::vector<OclAffine> build_generator_lut_host() {
|
||||
std::vector<OclAffine> lut(LUT_WINDOWS * LUT_ENTRIES);
|
||||
CpuPoint base = CpuPoint::generator();
|
||||
|
||||
for (std::size_t win = 0; win < LUT_WINDOWS; ++win) {
|
||||
std::cout << " Building LUT window " << win + 1 << "/" << LUT_WINDOWS << "...\n";
|
||||
auto base_x = base.x();
|
||||
auto base_y = base.y();
|
||||
auto table = (win == 0)
|
||||
? secp256k1::fast::precompute_g_multiples(LUT_ENTRIES - 1)
|
||||
: secp256k1::fast::precompute_point_multiples(base_x, base_y, LUT_ENTRIES - 1);
|
||||
|
||||
lut[win * LUT_ENTRIES] = OclAffine{};
|
||||
for (std::size_t i = 0; i < table.size(); ++i) {
|
||||
lut[win * LUT_ENTRIES + i + 1] = to_ocl_affine(table[i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 16; ++i) base.dbl_inplace();
|
||||
}
|
||||
|
||||
return lut;
|
||||
}
|
||||
|
||||
// Precompute the device-side scan-key plan: GLV-decompose SCAN_KEY, record
// the sign flags, and convert both halves to wNAF digits.
BIP352ScanKeyGlv build_scan_glv_plan() {
    auto scan_scalar = CpuScalar::from_bytes(SCAN_KEY);
    auto halves = secp256k1::fast::glv_decompose(scan_scalar);

    auto k1_be = halves.k1.to_bytes();
    auto k2_be = halves.k2.to_bytes();

    BIP352ScanKeyGlv plan{};
    host_compute_wnaf(k1_be.data(), plan.wnaf1);
    host_compute_wnaf(k2_be.data(), plan.wnaf2);
    if (halves.k1_neg) plan.k1_neg = 1; else plan.k1_neg = 0;
    // The phi flag is set when the two halves carry different signs.
    if (halves.k1_neg != halves.k2_neg) plan.flip_phi = 1; else plan.flip_phi = 0;
    return plan;
}
|
||||
|
||||
// Outlier-resistant median. Samples outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are
// discarded before taking the middle element (the upper middle for even
// counts). With fewer than four samples no filtering is done. Returns 0.0 for
// an empty input.
double median_iqr(std::vector<double> samples) {
    if (samples.empty()) return 0.0;
    std::sort(samples.begin(), samples.end());

    const int total = static_cast<int>(samples.size());
    if (total < 4) return samples[total / 2];

    const double q1 = samples[total / 4];
    const double q3 = samples[(3 * total) / 4];
    const double iqr = q3 - q1;
    const double lower_fence = q1 - 1.5 * iqr;
    const double upper_fence = q3 + 1.5 * iqr;

    std::vector<double> kept;
    kept.reserve(samples.size());
    for (std::size_t i = 0; i < samples.size(); ++i) {
        const double v = samples[i];
        const bool outlier = (v < lower_fence) || (v > upper_fence);
        if (!outlier) kept.push_back(v);
    }

    // Fall back to the unfiltered samples if the fences rejected everything.
    const std::vector<double>& chosen = kept.empty() ? samples : kept;
    return chosen[chosen.size() / 2];
}
|
||||
|
||||
// Throw std::runtime_error naming the failed operation and the numeric
// OpenCL error code unless err is CL_SUCCESS.
void check_cl(cl_int err, const char* what) {
    if (err == CL_SUCCESS) return;

    std::string message(what);
    message += " failed with OpenCL error ";
    message += std::to_string(err);
    throw std::runtime_error(message);
}
|
||||
|
||||
// Autotune OpenCL local_size by running a few passes at candidate sizes.
|
||||
// Mirrors CUDA's autotune_gpu_tpb. Returns best local size found.
|
||||
static int autotune_local_size(
|
||||
const char* label,
|
||||
cl_command_queue cl_q,
|
||||
cl_kernel kernel,
|
||||
size_t count,
|
||||
size_t max_wg_size,
|
||||
std::initializer_list<int> candidates)
|
||||
{
|
||||
std::printf("Autotuning %s local size...\n", label);
|
||||
int best = 0;
|
||||
double best_ns = 0.0;
|
||||
|
||||
for (int ls : candidates) {
|
||||
if (ls <= 0 || static_cast<size_t>(ls) > max_wg_size) continue;
|
||||
|
||||
size_t local = static_cast<size_t>(ls);
|
||||
size_t global = ((count + local - 1) / local) * local;
|
||||
|
||||
// warmup
|
||||
for (int w = 0; w < 2; ++w) {
|
||||
cl_int err2 = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr);
|
||||
if (err2 != CL_SUCCESS) goto next;
|
||||
}
|
||||
clFinish(cl_q);
|
||||
|
||||
{
|
||||
constexpr int SAMPLE_PASSES = 5;
|
||||
constexpr int SAMPLE_REPS = 10;
|
||||
std::vector<double> samples;
|
||||
samples.reserve(SAMPLE_PASSES);
|
||||
for (int p = 0; p < SAMPLE_PASSES; ++p) {
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
for (int r = 0; r < SAMPLE_REPS; ++r) {
|
||||
cl_int err2 = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr);
|
||||
if (err2 != CL_SUCCESS) goto next;
|
||||
}
|
||||
clFinish(cl_q);
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
|
||||
samples.push_back((ms * 1e6) / (static_cast<double>(count) * SAMPLE_REPS));
|
||||
}
|
||||
double ns = median_iqr(samples);
|
||||
std::printf(" local=%3d -> %8.1f ns/op\n", ls, ns);
|
||||
if (best == 0 || ns < best_ns) { best = ls; best_ns = ns; }
|
||||
}
|
||||
next:;
|
||||
}
|
||||
|
||||
if (best == 0) best = DEFAULT_LOCAL_SIZE_FUSED;
|
||||
std::printf(" selected local=%d for %s\n\n", best, label);
|
||||
return best;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
bool prefer_intel = false;
|
||||
bool use_lut = false;
|
||||
int platform_id = -1;
|
||||
int device_id = 0;
|
||||
int batch_n = BENCH_N;
|
||||
int local_size = 0;
|
||||
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
std::string arg = argv[i];
|
||||
if (arg == "--intel") prefer_intel = true;
|
||||
else if (arg == "--nvidia") prefer_intel = false;
|
||||
else if (arg == "--lut") use_lut = true;
|
||||
else if (arg == "--platform" && i + 1 < argc) platform_id = std::atoi(argv[++i]);
|
||||
else if (arg == "--device" && i + 1 < argc) device_id = std::atoi(argv[++i]);
|
||||
else if (arg == "--batch" && i + 1 < argc) batch_n = std::atoi(argv[++i]);
|
||||
else if (arg == "--local" && i + 1 < argc) local_size = std::atoi(argv[++i]);
|
||||
}
|
||||
if (local_size == 0) {
|
||||
local_size = use_lut ? DEFAULT_LOCAL_SIZE_LUT : DEFAULT_LOCAL_SIZE_FUSED;
|
||||
}
|
||||
|
||||
secp256k1::opencl::DeviceConfig cfg;
|
||||
cfg.prefer_intel = prefer_intel;
|
||||
cfg.verbose = true;
|
||||
cfg.platform_id = platform_id;
|
||||
cfg.device_id = device_id;
|
||||
auto ctx = secp256k1::opencl::Context::create(cfg);
|
||||
if (!ctx || !ctx->is_valid()) {
|
||||
std::cerr << "Failed to create OpenCL context\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
cl_context cl_ctx = static_cast<cl_context>(ctx->native_context());
|
||||
cl_command_queue cl_q = static_cast<cl_command_queue>(ctx->native_queue());
|
||||
cl_device_id cl_dev = nullptr;
|
||||
check_cl(clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr),
|
||||
"clGetCommandQueueInfo(CL_QUEUE_DEVICE)");
|
||||
|
||||
std::cout << "============================================================\n";
|
||||
std::cout << " BIP-352 Silent Payments Pipeline: CPU vs OpenCL\n";
|
||||
std::cout << "============================================================\n";
|
||||
std::cout << " Device: " << ctx->device_info().name << " (" << ctx->device_info().vendor << ")\n";
|
||||
std::cout << " N = " << batch_n << " tweak points, " << BENCH_PASSES << " passes (median)\n\n";
|
||||
std::cout << " Local size = " << local_size << "\n\n";
|
||||
|
||||
std::cout << "Generating " << batch_n << " deterministic tweak points...\n";
|
||||
std::vector<OclAffine> tweaks(static_cast<size_t>(batch_n));
|
||||
CpuPoint last_tweak = CpuPoint::infinity();
|
||||
uint8_t seed[32];
|
||||
const char* tag = "bench_bip352_seed";
|
||||
host_sha256(reinterpret_cast<const uint8_t*>(tag), std::strlen(tag), seed);
|
||||
for (int i = 0; i < batch_n; ++i) {
|
||||
uint8_t buf[36];
|
||||
std::memcpy(buf, seed, 32);
|
||||
buf[32] = static_cast<uint8_t>((i >> 24) & 0xff);
|
||||
buf[33] = static_cast<uint8_t>((i >> 16) & 0xff);
|
||||
buf[34] = static_cast<uint8_t>((i >> 8) & 0xff);
|
||||
buf[35] = static_cast<uint8_t>(i & 0xff);
|
||||
uint8_t scalar_bytes[32];
|
||||
host_sha256(buf, 36, scalar_bytes);
|
||||
CpuScalar s = CpuScalar::from_bytes(scalar_bytes);
|
||||
CpuPoint p = CpuPoint::generator().scalar_mul(s);
|
||||
if (i == batch_n - 1) last_tweak = p;
|
||||
tweaks[static_cast<size_t>(i)] = to_ocl_affine(p);
|
||||
}
|
||||
std::cout << "Done.\n";
|
||||
|
||||
CpuPoint spend_cpu = point_from_compressed(SPEND_PUBKEY_COMPRESSED);
|
||||
if (spend_cpu.is_infinity()) {
|
||||
std::cerr << "Failed to decode spend pubkey\n";
|
||||
return 1;
|
||||
}
|
||||
OclAffine spend = to_ocl_affine(spend_cpu);
|
||||
|
||||
std::cout << "Building OpenCL BIP352 pipeline kernel...\n";
|
||||
std::string source = load_bip352_kernel_source();
|
||||
const char* src_ptr = source.c_str();
|
||||
size_t src_len = source.size();
|
||||
cl_int err = CL_SUCCESS;
|
||||
cl_program program = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
|
||||
check_cl(err, "clCreateProgramWithSource");
|
||||
std::string build_options = "-cl-std=CL1.2 -cl-fast-relaxed-math -cl-mad-enable"
|
||||
" -cl-nv-opt-level=3";
|
||||
err = clBuildProgram(program, 1, &cl_dev, build_options.c_str(), nullptr, nullptr);
|
||||
if (err != CL_SUCCESS) {
|
||||
size_t log_size = 0;
|
||||
clGetProgramBuildInfo(program, cl_dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_size);
|
||||
std::string log(log_size, '\0');
|
||||
clGetProgramBuildInfo(program, cl_dev, CL_PROGRAM_BUILD_LOG, log_size, log.data(), nullptr);
|
||||
std::cerr << "Build failed:\n" << log << "\n";
|
||||
return 1;
|
||||
}
|
||||
const char* kernel_name = use_lut ? "bip352_pipeline_kernel_lut" : "bip352_pipeline_kernel";
|
||||
cl_kernel kernel = clCreateKernel(program, kernel_name, &err);
|
||||
check_cl(err, kernel_name);
|
||||
std::cout << "Done.\n";
|
||||
|
||||
size_t count = static_cast<size_t>(batch_n);
|
||||
size_t tweak_bytes = count * sizeof(OclAffine);
|
||||
std::vector<uint64_t> prefixes(count);
|
||||
std::vector<OclAffine> gen_lut;
|
||||
BIP352ScanKeyGlv scan_plan{};
|
||||
|
||||
// Both paths now use BIP352ScanKeyGlv with precomputed wNAF digits.
|
||||
scan_plan = build_scan_glv_plan();
|
||||
cl_mem d_tweaks = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, tweak_bytes, tweaks.data(), &err);
|
||||
check_cl(err, "clCreateBuffer(d_tweaks)");
|
||||
cl_mem d_scan = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
sizeof(BIP352ScanKeyGlv), &scan_plan, &err);
|
||||
check_cl(err, "clCreateBuffer(d_scan)");
|
||||
cl_mem d_spend = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(OclAffine), &spend, &err);
|
||||
check_cl(err, "clCreateBuffer(d_spend)");
|
||||
cl_mem d_prefixes = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY, count * sizeof(uint64_t), nullptr, &err);
|
||||
check_cl(err, "clCreateBuffer(d_prefixes)");
|
||||
cl_mem d_gen_lut = nullptr;
|
||||
if (use_lut) {
|
||||
std::cout << "Building CPU generator LUT (" << (LUT_WINDOWS * LUT_ENTRIES) << " affine points)...\n";
|
||||
gen_lut = build_generator_lut_host();
|
||||
std::cout << "Uploading generator LUT to OpenCL...\n";
|
||||
d_gen_lut = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
gen_lut.size() * sizeof(OclAffine), gen_lut.data(), &err);
|
||||
check_cl(err, "clCreateBuffer(d_gen_lut)");
|
||||
}
|
||||
|
||||
cl_uint cl_count = static_cast<cl_uint>(count);
|
||||
check_cl(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_tweaks), "clSetKernelArg(0)");
|
||||
check_cl(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_scan), "clSetKernelArg(1)");
|
||||
check_cl(clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_spend), "clSetKernelArg(2)");
|
||||
if (use_lut) {
|
||||
check_cl(clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_gen_lut), "clSetKernelArg(3)");
|
||||
check_cl(clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_prefixes), "clSetKernelArg(4)");
|
||||
check_cl(clSetKernelArg(kernel, 5, sizeof(cl_uint), &cl_count), "clSetKernelArg(5)");
|
||||
} else {
|
||||
check_cl(clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_prefixes), "clSetKernelArg(3)");
|
||||
check_cl(clSetKernelArg(kernel, 4, sizeof(cl_uint), &cl_count), "clSetKernelArg(4)");
|
||||
}
|
||||
|
||||
if (local_size <= 0) {
|
||||
throw std::runtime_error("local size must be positive");
|
||||
}
|
||||
if (static_cast<std::size_t>(local_size) > ctx->device_info().max_work_group_size) {
|
||||
throw std::runtime_error("local size exceeds device max work group size");
|
||||
}
|
||||
|
||||
// Autotune: find optimal local size among candidates (mirrors CUDA autotune_gpu_tpb).
|
||||
// Only autotune when no explicit --local was given (i.e., we're still at the default).
|
||||
{
|
||||
int default_ls = use_lut ? DEFAULT_LOCAL_SIZE_LUT : DEFAULT_LOCAL_SIZE_FUSED;
|
||||
if (local_size == default_ls) {
|
||||
int tuned = autotune_local_size(
|
||||
use_lut ? "LUT kernel" : "fused kernel",
|
||||
cl_q, kernel, count,
|
||||
ctx->device_info().max_work_group_size,
|
||||
{64, 128, 256, 384});
|
||||
local_size = tuned;
|
||||
}
|
||||
}
|
||||
|
||||
size_t global = ((count + static_cast<size_t>(local_size) - 1) / static_cast<size_t>(local_size)) * static_cast<size_t>(local_size);
|
||||
size_t local = static_cast<size_t>(local_size);
|
||||
std::cout << " Running with local_size=" << local_size << "\n";
|
||||
|
||||
for (int i = 0; i < BENCH_WARMUP; ++i) {
|
||||
check_cl(clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr),
|
||||
"clEnqueueNDRangeKernel(warmup)");
|
||||
}
|
||||
check_cl(clFinish(cl_q), "clFinish(warmup)");
|
||||
|
||||
std::vector<double> samples;
|
||||
samples.reserve(BENCH_PASSES);
|
||||
std::cout << "\n--- OpenCL (" << (use_lut ? "fused pipeline + LUT" : "fused pipeline") << ") ---\n";
|
||||
for (int pass = 0; pass < BENCH_PASSES; ++pass) {
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
check_cl(clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr),
|
||||
"clEnqueueNDRangeKernel");
|
||||
check_cl(clFinish(cl_q), "clFinish");
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
|
||||
samples.push_back((ms * 1e6) / static_cast<double>(count));
|
||||
std::printf(" pass %2d: %8.3f ms\n", pass + 1, ms);
|
||||
}
|
||||
double ns_per_op = median_iqr(samples);
|
||||
double ops_per_sec = 1e9 / ns_per_op;
|
||||
|
||||
check_cl(clEnqueueReadBuffer(cl_q, d_prefixes, CL_TRUE, 0, count * sizeof(uint64_t), prefixes.data(), 0, nullptr, nullptr),
|
||||
"clEnqueueReadBuffer");
|
||||
|
||||
CpuScalar scan_scalar = CpuScalar::from_bytes(SCAN_KEY);
|
||||
CpuPoint shared = last_tweak.scalar_mul(scan_scalar);
|
||||
auto shared_comp = shared.to_compressed();
|
||||
uint8_t shared_ser[37];
|
||||
std::memcpy(shared_ser, shared_comp.data(), 33);
|
||||
shared_ser[33] = shared_ser[34] = shared_ser[35] = shared_ser[36] = 0;
|
||||
auto tagged = secp256k1::detail::cached_tagged_hash(
|
||||
secp256k1::detail::make_tag_midstate("BIP0352/SharedSecret"),
|
||||
shared_ser,
|
||||
sizeof(shared_ser));
|
||||
CpuScalar hs = CpuScalar::from_bytes(tagged.data());
|
||||
CpuPoint out = CpuPoint::generator().scalar_mul(hs);
|
||||
CpuPoint cand = spend_cpu;
|
||||
cand.add_inplace(out);
|
||||
uint64_t cpu_validation = extract_upper_64(cand.x_only_bytes().data());
|
||||
uint64_t ocl_validation = prefixes.back();
|
||||
|
||||
std::printf("\n OpenCL%s: %.1f ns/op (%.2f M/s)\n", use_lut ? " LUT" : "", ns_per_op, ops_per_sec / 1e6);
|
||||
std::printf(" validation prefix: 0x%016llx\n", static_cast<unsigned long long>(ocl_validation));
|
||||
// CUDA reference: bench_bip352 on RTX 5060 Ti (SM 12.0, 36 SMs, 384 tpb).
|
||||
// GLV (no LUT): 260.4 ns/op (3.84 M/s). LUT: 127.2 ns/op (7.86 M/s).
|
||||
constexpr double CUDA_GLV_NS = 260.4;
|
||||
constexpr double CUDA_LUT_NS = 127.2;
|
||||
double cuda_ref = use_lut ? CUDA_LUT_NS : CUDA_GLV_NS;
|
||||
std::printf(" CUDA reference: %.1f ns/op (%.2f M/s) [%s]\n",
|
||||
cuda_ref, 1e9 / cuda_ref / 1e6, use_lut ? "LUT" : "GLV");
|
||||
std::printf(" gap vs CUDA: %.2fx\n", ns_per_op / cuda_ref);
|
||||
std::printf(" Validation: %s\n", cpu_validation == ocl_validation ? "[OK] MATCH" : "[FAIL] MISMATCH");
|
||||
|
||||
clReleaseMemObject(d_tweaks);
|
||||
clReleaseMemObject(d_scan);
|
||||
clReleaseMemObject(d_spend);
|
||||
clReleaseMemObject(d_prefixes);
|
||||
if (d_gen_lut) clReleaseMemObject(d_gen_lut);
|
||||
clReleaseKernel(kernel);
|
||||
clReleaseProgram(program);
|
||||
return cpu_validation == ocl_validation ? 0 : 2;
|
||||
}
|
||||
@ -195,6 +195,7 @@ int main(int argc, char* argv[]) {
|
||||
std::vector<Scalar> point_scalars(point_batch);
|
||||
std::vector<JacobianPoint> pd_in(point_batch), pd_out(point_batch);
|
||||
std::vector<JacobianPoint> pa_in1(point_batch), pa_in2(point_batch), pa_out(point_batch);
|
||||
std::vector<AffinePoint> sm_points(point_batch);
|
||||
|
||||
for (std::size_t i = 0; i < point_batch; ++i) {
|
||||
point_scalars[i] = {{rng(), rng(), rng(), rng()}};
|
||||
@ -206,6 +207,7 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
ctx->batch_scalar_mul_generator(point_scalars.data(), pa_in2.data(), point_batch);
|
||||
pa_in1 = pd_in;
|
||||
ctx->batch_jacobian_to_affine(pd_in.data(), sm_points.data(), point_batch);
|
||||
|
||||
{
|
||||
auto r = bench_batch("Point Double", [&]() {
|
||||
@ -244,6 +246,13 @@ int main(int argc, char* argv[]) {
|
||||
}, bs, 1, 3);
|
||||
print_result(r);
|
||||
results.push_back(r);
|
||||
|
||||
std::string kp_name = "kP (batch=" + std::to_string(bs) + ")";
|
||||
auto kp = bench_batch(kp_name, [&]() {
|
||||
ctx->batch_scalar_mul(sm_scalars.data(), sm_points.data(), sm_results.data(), bs);
|
||||
}, bs, 1, 3);
|
||||
print_result(kp);
|
||||
results.push_back(kp);
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
@ -598,7 +607,8 @@ int main(int argc, char* argv[]) {
|
||||
{
|
||||
std::size_t smk_batch = std::min(point_batch, static_cast<std::size_t>(65536));
|
||||
cl_uint smk_cnt = static_cast<cl_uint>(smk_batch);
|
||||
std::size_t smk_global = ((smk_batch + p_local_sz - 1) / p_local_sz) * p_local_sz;
|
||||
std::size_t smk_local_sz = std::min<std::size_t>(128, p_local_sz);
|
||||
std::size_t smk_global = ((smk_batch + smk_local_sz - 1) / smk_local_sz) * smk_local_sz;
|
||||
|
||||
// Use existing point_scalars for scalar data
|
||||
cl_mem buf_sc = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
@ -616,12 +626,12 @@ int main(int argc, char* argv[]) {
|
||||
int smk_iters = 5;
|
||||
|
||||
for (int i = 0; i < smk_warmup; ++i)
|
||||
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &p_local_sz, 0, nullptr, nullptr);
|
||||
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
|
||||
clFinish(cl_q);
|
||||
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
for (int i = 0; i < smk_iters; ++i)
|
||||
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &p_local_sz, 0, nullptr, nullptr);
|
||||
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
|
||||
clFinish(cl_q);
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
@ -634,6 +644,95 @@ int main(int argc, char* argv[]) {
|
||||
clReleaseMemObject(buf_smr);
|
||||
}
|
||||
|
||||
// Scalar Mul Arbitrary Point: separate kernel-only, upload-only and readback-only timings -- same batch cap as kG
|
||||
{
|
||||
std::size_t smk_batch = std::min(point_batch, static_cast<std::size_t>(65536));
|
||||
cl_uint smk_cnt = static_cast<cl_uint>(smk_batch);
|
||||
std::size_t smk_local_sz = std::min<std::size_t>(128, p_local_sz);
|
||||
std::size_t smk_global = ((smk_batch + smk_local_sz - 1) / smk_local_sz) * smk_local_sz;
|
||||
|
||||
std::vector<AffinePoint> sm_points(smk_batch);
|
||||
ctx->batch_jacobian_to_affine(pd_in.data(), sm_points.data(), smk_batch);
|
||||
|
||||
cl_mem buf_sc = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
smk_batch * sizeof(Scalar), (void*)point_scalars.data(), &err);
|
||||
cl_mem buf_pts = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
smk_batch * sizeof(AffinePoint), (void*)sm_points.data(), &err);
|
||||
cl_mem buf_smr = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY,
|
||||
smk_batch * sizeof(JacobianPoint), nullptr, &err);
|
||||
clFinish(cl_q);
|
||||
|
||||
cl_kernel kern = (cl_kernel)ctx->native_kernel("scalar_mul");
|
||||
clSetKernelArg(kern, 0, sizeof(cl_mem), &buf_sc);
|
||||
clSetKernelArg(kern, 1, sizeof(cl_mem), &buf_pts);
|
||||
clSetKernelArg(kern, 2, sizeof(cl_mem), &buf_smr);
|
||||
clSetKernelArg(kern, 3, sizeof(cl_uint), &smk_cnt);
|
||||
|
||||
int smk_warmup = 2;
|
||||
int smk_iters = 5;
|
||||
|
||||
for (int i = 0; i < smk_warmup; ++i)
|
||||
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
|
||||
clFinish(cl_q);
|
||||
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
for (int i = 0; i < smk_iters; ++i)
|
||||
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
|
||||
clFinish(cl_q);
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
|
||||
double total_ops = static_cast<double>(smk_batch) * smk_iters;
|
||||
BenchResult r = {"kP (kernel)", ns / total_ops, total_ops / (ns * 1e-9)};
|
||||
print_result(r); results.push_back(r);
|
||||
|
||||
for (int i = 0; i < smk_warmup; ++i) {
|
||||
clEnqueueWriteBuffer(cl_q, buf_pts, CL_FALSE, 0,
|
||||
smk_batch * sizeof(AffinePoint), (void*)sm_points.data(), 0, nullptr, nullptr);
|
||||
clEnqueueWriteBuffer(cl_q, buf_sc, CL_FALSE, 0,
|
||||
smk_batch * sizeof(Scalar), (void*)point_scalars.data(), 0, nullptr, nullptr);
|
||||
}
|
||||
clFinish(cl_q);
|
||||
|
||||
t0 = std::chrono::high_resolution_clock::now();
|
||||
for (int i = 0; i < smk_iters; ++i) {
|
||||
clEnqueueWriteBuffer(cl_q, buf_pts, CL_FALSE, 0,
|
||||
smk_batch * sizeof(AffinePoint), (void*)sm_points.data(), 0, nullptr, nullptr);
|
||||
clEnqueueWriteBuffer(cl_q, buf_sc, CL_FALSE, 0,
|
||||
smk_batch * sizeof(Scalar), (void*)point_scalars.data(), 0, nullptr, nullptr);
|
||||
}
|
||||
clFinish(cl_q);
|
||||
t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
|
||||
BenchResult upload = {"kP (upload)", ns / total_ops, total_ops / (ns * 1e-9)};
|
||||
print_result(upload); results.push_back(upload);
|
||||
|
||||
clEnqueueNDRangeKernel(cl_q, kern, 1, nullptr, &smk_global, &smk_local_sz, 0, nullptr, nullptr);
|
||||
clFinish(cl_q);
|
||||
|
||||
std::vector<JacobianPoint> sm_readback(smk_batch);
|
||||
for (int i = 0; i < smk_warmup; ++i)
|
||||
clEnqueueReadBuffer(cl_q, buf_smr, CL_FALSE, 0,
|
||||
smk_batch * sizeof(JacobianPoint), sm_readback.data(), 0, nullptr, nullptr);
|
||||
clFinish(cl_q);
|
||||
|
||||
t0 = std::chrono::high_resolution_clock::now();
|
||||
for (int i = 0; i < smk_iters; ++i)
|
||||
clEnqueueReadBuffer(cl_q, buf_smr, CL_FALSE, 0,
|
||||
smk_batch * sizeof(JacobianPoint), sm_readback.data(), 0, nullptr, nullptr);
|
||||
clFinish(cl_q);
|
||||
t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
|
||||
BenchResult readback = {"kP (readback)", ns / total_ops, total_ops / (ns * 1e-9)};
|
||||
print_result(readback); results.push_back(readback);
|
||||
|
||||
clReleaseMemObject(buf_sc);
|
||||
clReleaseMemObject(buf_pts);
|
||||
clReleaseMemObject(buf_smr);
|
||||
}
|
||||
|
||||
clReleaseMemObject(buf_jp1);
|
||||
clReleaseMemObject(buf_jp2);
|
||||
clReleaseMemObject(buf_jpr);
|
||||
|
||||
@ -37,7 +37,7 @@ struct DeviceConfig {
|
||||
int device_id = 0; // GPU device index
|
||||
int platform_id = 0; // Platform index (e.g., Intel, AMD)
|
||||
std::size_t max_batch_size = 65536; // Max points per batch
|
||||
std::size_t local_work_size = 256; // Work group size (auto if 0)
|
||||
std::size_t local_work_size = 0; // Work group size (auto if 0)
|
||||
bool prefer_intel = true; // Prefer Intel GPU if available
|
||||
bool verbose = false; // Print device info on init
|
||||
};
|
||||
|
||||
203
opencl/kernels/secp256k1_bip352.cl
Normal file
203
opencl/kernels/secp256k1_bip352.cl
Normal file
@ -0,0 +1,203 @@
|
||||
#ifndef SECP256K1_BIP352_CL
|
||||
#define SECP256K1_BIP352_CL
|
||||
|
||||
#include "secp256k1_extended.cl"
|
||||
|
||||
// BIP352ScanKeyGlv: precomputed GLV scan-key plan uploaded to __constant memory.
|
||||
// wNAF digits are computed on the CPU host and read directly in the kernel,
|
||||
// avoiding the GPU call to scalar_to_wnaf and eliminating 1040 bytes of
|
||||
// private-stack pressure (int wnaf1[130] + int wnaf2[130]).
|
||||
// Layout must match the host-side BIP352ScanKeyGlv struct exactly.
|
||||
typedef struct {
    // Precomputed wNAF digit strings, one signed byte per position.
    // Both arrays are walked from index 129 down to 0 by the scan-key multiply.
    char wnaf1[130];          // digits for the k1 half-scalar, range [-15..15]
    char wnaf2[130];          // digits for the k2 half-scalar, range [-15..15]

    // Sign handling flags (0 or 1).
    uchar k1_neg, flip_phi;   // k1_neg: negate base.y before the table build;
                              // flip_phi: negate the y-coordinates of the phi table

    // Explicit padding bytes after the two flags.
    uchar pad0, pad1;
} BIP352ScanKeyGlv;
|
||||
|
||||
// SHA-256 midstate after absorbing the 64-byte tag prefix
// SHA256("BIP0352/SharedSecret") || SHA256("BIP0352/SharedSecret").
|
||||
__constant uint BIP352_SHAREDSECRET_MIDSTATE[8] = {
|
||||
0x88831537U, 0x5127079bU, 0x69c2137bU, 0xab0303e6U,
|
||||
0x98fa21faU, 0x4a888523U, 0xbd99daabU, 0xf25e5e0aU
|
||||
};
|
||||
|
||||
// Tagged SHA-256 used for the BIP352 shared secret.
//
// Instead of hashing the tag prefix, the context is seeded from
// BIP352_SHAREDSECRET_MIDSTATE with an empty buffer and total_len = 64
// (i.e. as if one 64-byte block had already been consumed). The data_len
// bytes at data are then absorbed and the 32-byte digest is written to out.
inline void bip352_tagged_sha256_impl(const uchar* data, uint data_len, uchar out[32]) {
    SHA256Ctx hasher;

    hasher.total_len = 64;
    hasher.buf_len = 0;

    int word = 0;
    while (word < 8) {
        hasher.h[word] = BIP352_SHAREDSECRET_MIDSTATE[word];
        ++word;
    }

    sha256_update(&hasher, data, data_len);
    sha256_final(&hasher, out);
}
|
||||
|
||||
// Builds the 37-byte hash input for a shared-secret point.
//
// p is converted from Jacobian to affine (x = X / Z^2, y = Y / Z^3), then:
//   ser[0]      = 0x03 if the lowest bit of the last y byte is set, else 0x02
//   ser[1..32]  = the 32 bytes of x as produced by field_to_bytes_impl
//   ser[33..36] = 0
// NOTE(review): the trailing four zero bytes are presumably the BIP352 output
// counter k = 0 -- confirm against the host-side reference.
// p must not be the point at infinity; callers check that before calling.
inline void bip352_shared_secret_input_impl(const JacobianPoint* p, uchar ser[37]) {
    // Powers of 1/Z needed for the affine conversion.
    FieldElement inv_z;
    field_inv_impl(&inv_z, &p->z);

    FieldElement inv_z_sq;
    field_sqr_impl(&inv_z_sq, &inv_z);

    FieldElement inv_z_cu;
    field_mul_impl(&inv_z_cu, &inv_z_sq, &inv_z);

    FieldElement ax, ay;
    field_mul_impl(&ax, &p->x, &inv_z_sq);
    field_mul_impl(&ay, &p->y, &inv_z_cu);

    uchar xb[32];
    uchar yb[32];
    field_to_bytes_impl(&ax, xb);
    field_to_bytes_impl(&ay, yb);

    // Parity prefix followed by the x bytes.
    uchar prefix = 0x02;
    if (yb[31] & 1) prefix = 0x03;
    ser[0] = prefix;

    for (int i = 0; i < 32; i++) {
        ser[1 + i] = xb[i];
    }

    // Four trailing zero bytes.
    for (int j = 33; j < 37; ++j) {
        ser[j] = 0;
    }
}
|
||||
|
||||
// Returns the first 8 bytes of the affine x-coordinate of p, packed so that
// x_bytes[0] ends up in the most significant byte of the result.
// Only x is recovered (x = X / Z^2); y is never computed.
inline ulong point_prefix64_impl(const JacobianPoint* p) {
    FieldElement inv_z;
    field_inv_impl(&inv_z, &p->z);

    FieldElement inv_z_sq;
    field_sqr_impl(&inv_z_sq, &inv_z);

    FieldElement ax;
    field_mul_impl(&ax, &p->x, &inv_z_sq);

    uchar xb[32];
    field_to_bytes_impl(&ax, xb);

    return ((ulong)xb[0] << 56) | ((ulong)xb[1] << 48) |
           ((ulong)xb[2] << 40) | ((ulong)xb[3] << 32) |
           ((ulong)xb[4] << 24) | ((ulong)xb[5] << 16) |
           ((ulong)xb[6] << 8)  |  (ulong)xb[7];
}
|
||||
|
||||
// Applies one wNAF digit to the running accumulator.
// A zero digit is a no-op. Otherwise the entry at index (|digit| - 1) / 2 of
// odd_multiples is copied, its y negated when the digit is negative, and the
// result is either loaded into acc (if acc is infinity) or mixed-added to it.
inline void bip352_apply_wnaf_digit_impl(JacobianPoint* acc,
                                         const AffinePoint* odd_multiples,
                                         int digit) {
    if (digit == 0) return;

    const int magnitude = (digit < 0) ? -digit : digit;
    AffinePoint addend = odd_multiples[(magnitude - 1) >> 1];
    if (digit < 0) field_negate_impl(&addend.y, &addend.y);

    if (point_is_infinity(acc)) {
        point_from_affine(acc, &addend);
    } else {
        point_add_mixed_impl(acc, acc, &addend);
    }
}

// GLV scalar multiply r = scan * p with a scan key that was decomposed and
// wNAF-encoded on the host.
//
// Steps:
//   1. Copy p and negate its y when scan->k1_neg is set.
//   2. build_wnaf_table_zr_impl produces the 8-entry odd-multiples table
//      {P, 3P, ..., 15P} plus a shared Z factor (one table-wide correction
//      instead of a per-entry inversion).
//   3. derive_endo_table_impl maps that table through the endomorphism,
//      negating y when scan->flip_phi is set.
//   4. Interleaved double-and-add over positions 129..0, reading the digits
//      straight from __constant memory; additions are mixed (Jacobian + affine).
//   5. If the result is not infinity, its Z is multiplied by the shared Z factor.
inline void scalar_mul_glv_predecomp_impl(
    JacobianPoint* r,
    const AffinePoint* p,
    __constant const BIP352ScanKeyGlv* scan
) {
    AffinePoint start = *p;
    if (scan->k1_neg) field_negate_impl(&start.y, &start.y);

    AffinePoint odd_p[8];
    FieldElement shared_z;
    build_wnaf_table_zr_impl(&start, odd_p, &shared_z);

    AffinePoint odd_phi[8];
    derive_endo_table_impl(odd_p, odd_phi, scan->flip_phi);

    point_set_infinity(r);

    int pos = 130;
    while (pos-- > 0) {
        // Doubling infinity is skipped; the accumulator stays at infinity
        // until the first non-zero digit loads a point.
        if (!point_is_infinity(r)) point_double_impl(r, r);

        bip352_apply_wnaf_digit_impl(r, odd_p, (int)scan->wnaf1[pos]);
        bip352_apply_wnaf_digit_impl(r, odd_phi, (int)scan->wnaf2[pos]);
    }

    if (point_is_infinity(r)) return;

    // Fold the table's shared Z factor back into the accumulated Z.
    FieldElement z_fixed;
    field_mul_impl(&z_fixed, &r->z, &shared_z);
    r->z = z_fixed;
}
|
||||
|
||||
// BIP352 scan pipeline, one work-item per tweak point.
//
// For work-item i < count:
//   shared    = scan_key * tweak_points[i]
//   hash      = tagged SHA-256 of the 37-byte serialization of shared
//   candidate = hash * G + spend_point[0]   (windowed generator multiply)
//   prefixes[i] = first 8 bytes of candidate's affine x, or 0 when shared is
//                 the point at infinity.
// Work-items with i >= count do nothing.
__kernel void bip352_pipeline_kernel(
    __global const AffinePoint* tweak_points,
    __constant const BIP352ScanKeyGlv* scan_key,
    __global const AffinePoint* spend_point,
    __global ulong* prefixes,
    const uint count
) {
    const uint item = get_global_id(0);
    if (item >= count) return;

    AffinePoint tweak_pt = tweak_points[item];
    AffinePoint spend_pt = spend_point[0];

    JacobianPoint ecdh;
    scalar_mul_glv_predecomp_impl(&ecdh, &tweak_pt, scan_key);

    ulong result = 0;
    if (!point_is_infinity(&ecdh)) {
        uchar preimage[37];
        bip352_shared_secret_input_impl(&ecdh, preimage);

        uchar digest[32];
        bip352_tagged_sha256_impl(preimage, 37, digest);

        Scalar tweak_scalar;
        scalar_from_bytes_impl(digest, &tweak_scalar);

        JacobianPoint tweak_times_g;
        scalar_mul_generator_windowed_impl(&tweak_times_g, &tweak_scalar);

        JacobianPoint candidate;
        point_add_mixed_impl(&candidate, &tweak_times_g, &spend_pt);

        result = point_prefix64_impl(&candidate);
    }
    prefixes[item] = result;
}
|
||||
|
||||
// BIP352 scan pipeline, one work-item per tweak point, using a caller-supplied
// generator lookup table (gen_lut) for the hash * G step.
//
// For work-item i < count:
//   shared    = scan_key * tweak_points[i]
//   hash      = tagged SHA-256 of the 37-byte serialization of shared
//   candidate = hash * G + spend_point[0]   (scalar_mul_generator_lut_impl)
//   prefixes[i] = first 8 bytes of candidate's affine x, or 0 when shared is
//                 the point at infinity.
// Work-items with i >= count do nothing.
__kernel void bip352_pipeline_kernel_lut(
    __global const AffinePoint* tweak_points,
    __constant const BIP352ScanKeyGlv* scan_key,
    __global const AffinePoint* spend_point,
    __global const AffinePoint* gen_lut,
    __global ulong* prefixes,
    const uint count
) {
    const uint item = get_global_id(0);
    if (item >= count) return;

    AffinePoint tweak_pt = tweak_points[item];
    AffinePoint spend_pt = spend_point[0];

    JacobianPoint ecdh;
    scalar_mul_glv_predecomp_impl(&ecdh, &tweak_pt, scan_key);

    ulong result = 0;
    if (!point_is_infinity(&ecdh)) {
        uchar preimage[37];
        bip352_shared_secret_input_impl(&ecdh, preimage);

        uchar digest[32];
        bip352_tagged_sha256_impl(preimage, 37, digest);

        Scalar tweak_scalar;
        scalar_from_bytes_impl(digest, &tweak_scalar);

        JacobianPoint tweak_times_g;
        scalar_mul_generator_lut_impl(&tweak_times_g, &tweak_scalar, gen_lut);

        JacobianPoint candidate;
        point_add_mixed_impl(&candidate, &tweak_times_g, &spend_pt);

        result = point_prefix64_impl(&candidate);
    }
    prefixes[item] = result;
}
|
||||
|
||||
#endif
|
||||
@ -564,6 +564,84 @@ inline void glv_decompose_impl(const Scalar* k, Scalar* k1, Scalar* k2,
|
||||
// GLV-accelerated scalar multiplication: k*P using Shamir's trick
|
||||
// with endomorphism phi(P) = (beta*x, y) where phi corresponds to lambda.
|
||||
// Uses interleaved wNAF w=5 for both half-scalars k1, k2.
|
||||
// Builds an 8-entry table of multiples of base (callers use it as
// {P, 3P, ..., 15P}) whose x/y coordinates all share one implicit Z, and
// returns that shared Z factor in *globalz.
//
// Outline:
//   1. d = 2 * base (Jacobian). Its (x, y) is used as an affine step point,
//      and base is rescaled by d.z^2 / d.z^3 to match.
//   2. Seven mixed additions of the step produce entries 1..7; each addition
//      also yields a factor h via point_add_mixed_h_impl, stored per entry.
//      NOTE(review): h is presumably the ratio by which Z grew in that
//      addition -- confirm against point_add_mixed_h_impl.
//   3. *globalz = (final accumulator Z) * d.z.
//   4. Walking backwards from entry 6 to entry 0, each entry is rescaled by
//      the running product of the later factors (s^2 on x, s^3 on y).
//      Entry 7 is left as produced.
// No field inversion is performed here; callers multiply their result's Z by
// *globalz afterwards.
inline void build_wnaf_table_zr_impl(const AffinePoint* base, AffinePoint table[8],
                                     FieldElement* globalz) {
    JacobianPoint base_j;
    point_from_affine(&base_j, base);

    JacobianPoint d;
    point_double_impl(&d, &base_j);

    // Powers of the doubled point's Z.
    FieldElement dz = d.z;
    FieldElement dz_sq, dz_cu;
    field_sqr_impl(&dz_sq, &dz);
    field_mul_impl(&dz_cu, &dz_sq, &dz);

    // The doubled point with its Z dropped: the affine step for the additions.
    AffinePoint step;
    step.x = d.x;
    step.y = d.y;

    // Running point: base rescaled by dz, with Z reset to 1.
    JacobianPoint run;
    field_mul_impl(&run.x, &base->x, &dz_sq);
    field_mul_impl(&run.y, &base->y, &dz_cu);
    run.z.limbs[0] = 1UL;
    for (int w = 1; w < 4; ++w) {
        run.z.limbs[w] = 0UL;
    }
    run.infinity = 0;

    table[0].x = run.x;
    table[0].y = run.y;

    FieldElement ratio[8];
    ratio[0] = dz;

    for (int i = 1; i < 8; ++i) {
        FieldElement step_h;
        point_add_mixed_h_impl(&run, &run, &step, &step_h);
        table[i].x = run.x;
        table[i].y = run.y;
        ratio[i] = step_h;
    }

    field_mul_impl(globalz, &run.z, &dz);

    // Backward pass: scale starts as ratio[7] for entry 6 and picks up
    // ratio[k] before moving on to entry k - 1.
    FieldElement scale = ratio[7];
    int k = 6;
    for (;;) {
        FieldElement scale_sq, scale_cu;
        field_sqr_impl(&scale_sq, &scale);
        field_mul_impl(&scale_cu, &scale_sq, &scale);

        FieldElement nx, ny;
        field_mul_impl(&nx, &table[k].x, &scale_sq);
        field_mul_impl(&ny, &table[k].y, &scale_cu);
        table[k].x = nx;
        table[k].y = ny;

        if (k == 0) break;

        FieldElement next_scale;
        field_mul_impl(&next_scale, &scale, &ratio[k]);
        scale = next_scale;
        --k;
    }
}
|
||||
|
||||
// Derives the endomorphism table from an existing 8-entry affine table:
//   endo_table[i].x = table[i].x * beta   (beta from GLV_BETA0..GLV_BETA3)
//   endo_table[i].y = -table[i].y if negate_y is non-zero, else table[i].y
inline void derive_endo_table_impl(const AffinePoint table[8], AffinePoint endo_table[8],
                                   int negate_y) {
    FieldElement beta;
    beta.limbs[0] = GLV_BETA0;
    beta.limbs[1] = GLV_BETA1;
    beta.limbs[2] = GLV_BETA2;
    beta.limbs[3] = GLV_BETA3;

    int i = 0;
    while (i < 8) {
        field_mul_impl(&endo_table[i].x, &table[i].x, &beta);

        if (!negate_y) {
            endo_table[i].y = table[i].y;
        } else {
            field_negate_impl(&endo_table[i].y, &table[i].y);
        }
        ++i;
    }
}
|
||||
|
||||
inline void scalar_mul_glv_impl(JacobianPoint* r, const Scalar* k, const AffinePoint* p) {
|
||||
Scalar k1, k2;
|
||||
int k1_neg, k2_neg;
|
||||
@ -573,62 +651,48 @@ inline void scalar_mul_glv_impl(JacobianPoint* r, const Scalar* k, const AffineP
|
||||
AffinePoint base = *p;
|
||||
if (k1_neg) field_negate_impl(&base.y, &base.y);
|
||||
|
||||
// Build P precomp table: [P, 3P, 5P, ..., 15P] (8 entries, w=5)
|
||||
JacobianPoint tbl_jac[8];
|
||||
JacobianPoint dbl;
|
||||
point_from_affine(&tbl_jac[0], &base);
|
||||
point_double_impl(&dbl, &tbl_jac[0]);
|
||||
for (int i = 1; i < 8; i++)
|
||||
point_add_impl(&tbl_jac[i], &tbl_jac[i-1], &dbl);
|
||||
AffinePoint table[8];
|
||||
FieldElement globalz;
|
||||
build_wnaf_table_zr_impl(&base, table, &globalz);
|
||||
|
||||
// Build phi(P) table: apply endomorphism, flip y if signs differ
|
||||
AffinePoint endo_base;
|
||||
FieldElement beta;
|
||||
beta.limbs[0] = GLV_BETA0; beta.limbs[1] = GLV_BETA1;
|
||||
beta.limbs[2] = GLV_BETA2; beta.limbs[3] = GLV_BETA3;
|
||||
field_mul_impl(&endo_base.x, &base.x, &beta);
|
||||
endo_base.y = base.y;
|
||||
int flip_phi = (k1_neg != k2_neg);
|
||||
if (flip_phi) field_negate_impl(&endo_base.y, &endo_base.y);
|
||||
|
||||
JacobianPoint tbl2_jac[8];
|
||||
point_from_affine(&tbl2_jac[0], &endo_base);
|
||||
JacobianPoint dbl2;
|
||||
point_double_impl(&dbl2, &tbl2_jac[0]);
|
||||
for (int i = 1; i < 8; i++)
|
||||
point_add_impl(&tbl2_jac[i], &tbl2_jac[i-1], &dbl2);
|
||||
AffinePoint endo_table[8];
|
||||
derive_endo_table_impl(table, endo_table, (k1_neg != k2_neg));
|
||||
|
||||
// wNAF encode both half-width scalars
|
||||
int wnaf1[260], wnaf2[260];
|
||||
int len1 = scalar_to_wnaf(&k1, wnaf1);
|
||||
int len2 = scalar_to_wnaf(&k2, wnaf2);
|
||||
int max_len = (len1 > len2) ? len1 : len2;
|
||||
int wnaf1[130] = {0};
|
||||
int wnaf2[130] = {0};
|
||||
scalar_to_wnaf(&k1, wnaf1);
|
||||
scalar_to_wnaf(&k2, wnaf2);
|
||||
|
||||
// Shamir interleaved loop
|
||||
point_set_infinity(r);
|
||||
for (int i = max_len - 1; i >= 0; --i) {
|
||||
for (int i = 129; i >= 0; --i) {
|
||||
if (!point_is_infinity(r)) point_double_impl(r, r);
|
||||
|
||||
int d1 = (i < len1) ? wnaf1[i] : 0;
|
||||
int d1 = wnaf1[i];
|
||||
if (d1 != 0) {
|
||||
int idx = ((d1 > 0) ? d1 : -d1) >> 1;
|
||||
if (idx >= 8) idx = 7;
|
||||
JacobianPoint pt = tbl_jac[idx];
|
||||
int idx = (((d1 > 0) ? d1 : -d1) - 1) >> 1;
|
||||
AffinePoint pt = table[idx];
|
||||
if (d1 < 0) field_negate_impl(&pt.y, &pt.y);
|
||||
if (point_is_infinity(r)) { *r = pt; }
|
||||
else { JacobianPoint tmp; point_add_impl(&tmp, r, &pt); *r = tmp; }
|
||||
if (point_is_infinity(r)) { point_from_affine(r, &pt); }
|
||||
else { point_add_mixed_impl(r, r, &pt); }
|
||||
}
|
||||
|
||||
int d2 = (i < len2) ? wnaf2[i] : 0;
|
||||
int d2 = wnaf2[i];
|
||||
if (d2 != 0) {
|
||||
int idx = ((d2 > 0) ? d2 : -d2) >> 1;
|
||||
if (idx >= 8) idx = 7;
|
||||
JacobianPoint pt = tbl2_jac[idx];
|
||||
int idx = (((d2 > 0) ? d2 : -d2) - 1) >> 1;
|
||||
AffinePoint pt = endo_table[idx];
|
||||
if (d2 < 0) field_negate_impl(&pt.y, &pt.y);
|
||||
if (point_is_infinity(r)) { *r = pt; }
|
||||
else { JacobianPoint tmp; point_add_impl(&tmp, r, &pt); *r = tmp; }
|
||||
if (point_is_infinity(r)) { point_from_affine(r, &pt); }
|
||||
else { point_add_mixed_impl(r, r, &pt); }
|
||||
}
|
||||
}
|
||||
|
||||
if (!point_is_infinity(r)) {
|
||||
FieldElement corrected_z;
|
||||
field_mul_impl(&corrected_z, &r->z, &globalz);
|
||||
r->z = corrected_z;
|
||||
}
|
||||
}
|
||||
|
||||
// Precomputed generator multiplication using fixed window w=4
|
||||
|
||||
@ -59,6 +59,306 @@ typedef struct {
|
||||
ulong limbs[4]; // Little-endian: limbs[0] is LSB
|
||||
} FieldElement;
|
||||
|
||||
// =============================================================================
|
||||
// NVIDIA OpenCL PTX Acceleration (Level 1+2+3)
|
||||
// =============================================================================
|
||||
// On consumer NVIDIA GPUs (Turing/Ampere/Ada/Blackwell), INT32 multiply
|
||||
// throughput is 32x higher than INT64. Inline PTX enables:
|
||||
// Level 1+2: mad.lo.cc.u64/madc.hi.cc.u64 carry chains (no comparison-carry)
|
||||
// Level 3: mad.lo.cc.u32/madc.hi.cc.u32 32-bit Comba (INT32 throughput)
|
||||
// Fallback (AMD, Intel, portable): mul_hi + comparison-based carry unchanged.
|
||||
// Guard: __NV_CL_C_VERSION is defined only by NVIDIA's OpenCL compiler.
|
||||
// =============================================================================
|
||||
|
||||
#ifdef __NV_CL_C_VERSION
|
||||
|
||||
// 32-bit multiply-accumulate into a three-register (96-bit) accumulator:
//   (acc0 : acc1 : acc2) += x * y
// acc0 takes the low product word, acc1 the high word plus carry, and acc2
// absorbs the final carry. All five operands are 32-bit registers.
#define OCL_MAD32(acc0, acc1, acc2, x, y)            \
    __asm volatile(                                  \
        "mad.lo.cc.u32 %0, %3, %4, %0; \n\t"         \
        "madc.hi.cc.u32 %1, %3, %4, %1; \n\t"        \
        "addc.u32 %2, %2, 0; \n\t"                   \
        : "+r"(acc0), "+r"(acc1), "+r"(acc2)         \
        : "r"(x), "r"(y)                             \
    )
|
||||
|
||||
// Diagonal term of a 32-bit squaring:
//   (acc0 : acc1 : acc2) += x * x
// Same carry chain as the general multiply-accumulate, with a single input.
#define OCL_SQR32_D(acc0, acc1, acc2, x)             \
    __asm volatile(                                  \
        "mad.lo.cc.u32 %0, %3, %3, %0; \n\t"         \
        "madc.hi.cc.u32 %1, %3, %3, %1; \n\t"        \
        "addc.u32 %2, %2, 0; \n\t"                   \
        : "+r"(acc0), "+r"(acc1), "+r"(acc2)         \
        : "r"(x)                                     \
    )
|
||||
|
||||
// Off-diagonal term of a 32-bit squaring:
//   (acc0 : acc1 : acc2) += 2 * x * y
// The 64-bit product x * y is computed once into two temporaries and then
// added to the accumulator twice, each time with a full carry chain.
#define OCL_SQR32_M2(acc0, acc1, acc2, x, y)                          \
    do {                                                              \
        uint _prod_lo, _prod_hi;                                      \
        __asm volatile(                                               \
            "mul.lo.u32 %0, %2, %3; \n\t"                             \
            "mul.hi.u32 %1, %2, %3; \n\t"                             \
            : "=r"(_prod_lo), "=r"(_prod_hi) : "r"(x), "r"(y)         \
        );                                                            \
        __asm volatile(                                               \
            "add.cc.u32 %0, %0, %3; \n\t"                             \
            "addc.cc.u32 %1, %1, %4; \n\t"                            \
            "addc.u32 %2, %2, 0; \n\t"                                \
            "add.cc.u32 %0, %0, %3; \n\t"                             \
            "addc.cc.u32 %1, %1, %4; \n\t"                            \
            "addc.u32 %2, %2, 0; \n\t"                                \
            : "+r"(acc0), "+r"(acc1), "+r"(acc2)                      \
            : "r"(_prod_lo), "r"(_prod_hi)                            \
        );                                                            \
    } while(0)
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// 32-bit Comba multiplication: 4x64 FieldElement reinterpreted as 8x32 limbs.
|
||||
// Produces uint[16] raw output (little-endian 32-bit limbs of 512-bit product).
|
||||
// Mirrors CUDA's mul_256_comba32 from secp256k1_32_hybrid_final.cuh.
|
||||
// ----------------------------------------------------------------------------
|
||||
// Column-wise (Comba) 256 x 256 -> 512-bit multiply on 32-bit limbs.
//   a, b : inputs, each read as eight little-endian 32-bit words
//   t32  : receives the sixteen little-endian 32-bit words of the product
// For output column col, every pair (i, col - i) with both indices in 0..7 is
// accumulated with OCL_MAD32 in ascending i; the low accumulator word is then
// stored and the accumulator shifted down by one word.
static inline void mul_256_comba32_ocl(
    const FieldElement* a, const FieldElement* b, uint t32[16]
) {
    // Split each 64-bit limb into its low and high 32-bit halves.
    uint a32[8], b32[8];
    #pragma unroll
    for (int w = 0; w < 8; ++w) {
        const int shift = (w & 1) * 32;
        a32[w] = (uint)(a->limbs[w >> 1] >> shift);
        b32[w] = (uint)(b->limbs[w >> 1] >> shift);
    }

    uint c0 = 0, c1 = 0, c2 = 0;

    // Loops have compile-time bounds; the unroll hints keep the generated
    // code equivalent to a fully written-out column schedule.
    #pragma unroll
    for (int col = 0; col < 15; ++col) {
        const int first = (col > 7) ? (col - 7) : 0;
        const int last  = (col < 7) ? col : 7;

        #pragma unroll
        for (int i = first; i <= last; ++i) {
            OCL_MAD32(c0, c1, c2, a32[i], b32[col - i]);
        }

        t32[col] = c0;
        c0 = c1;
        c1 = c2;
        c2 = 0;
    }

    // After the last column the shifted-down word is the top output word.
    t32[15] = c0;
}
|
||||
|
||||
// 32-bit Comba squaring: ~40% fewer multiplications (symmetry exploitation).
|
||||
// Mirrors CUDA's sqr_256_comba32 from secp256k1_32_hybrid_final.cuh.
|
||||
// Column-wise (Comba) 256-bit squaring on 32-bit limbs.
//   a   : input, read as eight little-endian 32-bit words
//   t32 : receives the sixteen little-endian 32-bit words of a * a
// For output column col, each unordered pair (i, col - i) with i < col - i is
// added twice via OCL_SQR32_M2 (ascending i); even columns then add the
// diagonal term a32[col / 2]^2 via OCL_SQR32_D.
static inline void sqr_256_comba32_ocl(const FieldElement* a, uint t32[16]) {
    // Split each 64-bit limb into its low and high 32-bit halves.
    uint a32[8];
    #pragma unroll
    for (int w = 0; w < 8; ++w) {
        a32[w] = (uint)(a->limbs[w >> 1] >> ((w & 1) * 32));
    }

    uint c0 = 0, c1 = 0, c2 = 0;

    // Loops have compile-time bounds; the unroll hints keep the generated
    // code equivalent to a fully written-out column schedule.
    #pragma unroll
    for (int col = 0; col < 15; ++col) {
        const int first = (col > 7) ? (col - 7) : 0;

        #pragma unroll
        for (int i = first; 2 * i < col; ++i) {
            OCL_SQR32_M2(c0, c1, c2, a32[i], a32[col - i]);
        }
        if ((col & 1) == 0) {
            OCL_SQR32_D(c0, c1, c2, a32[col >> 1]);
        }

        t32[col] = c0;
        c0 = c1;
        c1 = c2;
        c2 = 0;
    }

    // After the last column the shifted-down word is the top output word.
    t32[15] = c0;
}
|
||||
|
||||
// 32-bit reduction: T_hi x K_MOD (32-bit MAD chain) + conditional P-subtract.
|
||||
// Phase 1: T_hi[8..15] x 977 (scalar, 32-bit MAD chain)
|
||||
// Phase 1b: add T_hi << 32 (K_MOD = 2^32 + 977)
|
||||
// Phase 2: T_lo[0..7] += result (32-bit carry chain)
|
||||
// Phase 3+4: pack to 64-bit, fold overflow, conditional P-subtract (64-bit PTX)
|
||||
// Mirrors CUDA's reduce_512_to_256_32 from secp256k1_32_hybrid_final.cuh.
|
||||
// Reduces a 512-bit product (sixteen little-endian 32-bit words in t32) to a
// 256-bit field element in *r, using K_MOD = 2^32 + 977.
//
//   Phase 1  : fold = T_hi * 977                      (words fold0..fold8)
//   Phase 1b : fold += T_hi << 32                     (carry-out in fold9)
//   Phase 2  : T_lo += fold[0..7]                     (carry-out in carry_lo)
//   Phase 3  : pack T_lo into four 64-bit limbs and add overflow * SECP256K1_K,
//              where overflow = fold8 + carry_lo + (fold9 << 32); a carry out
//              of that addition is folded in once more as + SECP256K1_K
//   Phase 4  : subtract P; keep the difference when there is no borrow,
//              otherwise keep the unsubtracted value
static inline void reduce_512_to_256_32_ocl(uint t32[16], FieldElement* r) {
    // Low half (updated in place) and high half (read-only) of the product.
    uint lo0 = t32[0], lo1 = t32[1], lo2 = t32[2], lo3 = t32[3];
    uint lo4 = t32[4], lo5 = t32[5], lo6 = t32[6], lo7 = t32[7];
    const uint hi0 = t32[8],  hi1 = t32[9],  hi2 = t32[10], hi3 = t32[11];
    const uint hi4 = t32[12], hi5 = t32[13], hi6 = t32[14], hi7 = t32[15];

    // Phase 1: fold = T_hi * 977, nine 32-bit words.
    uint fold0, fold1, fold2, fold3, fold4, fold5, fold6, fold7, fold8;
    __asm volatile(
        "mul.lo.u32 %0, %9, 977;\n\t"
        "mul.hi.u32 %1, %9, 977;\n\t"
        "mad.lo.cc.u32 %1, %10, 977, %1;\n\t"
        "madc.hi.u32 %2, %10, 977, 0;\n\t"
        "mad.lo.cc.u32 %2, %11, 977, %2;\n\t"
        "madc.hi.u32 %3, %11, 977, 0;\n\t"
        "mad.lo.cc.u32 %3, %12, 977, %3;\n\t"
        "madc.hi.u32 %4, %12, 977, 0;\n\t"
        "mad.lo.cc.u32 %4, %13, 977, %4;\n\t"
        "madc.hi.u32 %5, %13, 977, 0;\n\t"
        "mad.lo.cc.u32 %5, %14, 977, %5;\n\t"
        "madc.hi.u32 %6, %14, 977, 0;\n\t"
        "mad.lo.cc.u32 %6, %15, 977, %6;\n\t"
        "madc.hi.u32 %7, %15, 977, 0;\n\t"
        "mad.lo.cc.u32 %7, %16, 977, %7;\n\t"
        "madc.hi.u32 %8, %16, 977, 0;\n\t"
        : "=r"(fold0), "=r"(fold1), "=r"(fold2), "=r"(fold3), "=r"(fold4),
          "=r"(fold5), "=r"(fold6), "=r"(fold7), "=r"(fold8)
        : "r"(hi0), "r"(hi1), "r"(hi2), "r"(hi3),
          "r"(hi4), "r"(hi5), "r"(hi6), "r"(hi7)
    );

    // Phase 1b: add T_hi shifted up by one word (the 2^32 part of K_MOD).
    uint fold9;
    __asm volatile(
        "add.cc.u32 %0, %0, %9;\n\t"
        "addc.cc.u32 %1, %1, %10;\n\t"
        "addc.cc.u32 %2, %2, %11;\n\t"
        "addc.cc.u32 %3, %3, %12;\n\t"
        "addc.cc.u32 %4, %4, %13;\n\t"
        "addc.cc.u32 %5, %5, %14;\n\t"
        "addc.cc.u32 %6, %6, %15;\n\t"
        "addc.cc.u32 %7, %7, %16;\n\t"
        "addc.u32 %8, 0, 0;\n\t"
        : "+r"(fold1), "+r"(fold2), "+r"(fold3), "+r"(fold4),
          "+r"(fold5), "+r"(fold6), "+r"(fold7), "+r"(fold8), "=r"(fold9)
        : "r"(hi0), "r"(hi1), "r"(hi2), "r"(hi3),
          "r"(hi4), "r"(hi5), "r"(hi6), "r"(hi7)
    );

    // Phase 2: T_lo += fold[0..7].
    uint carry_lo;
    __asm volatile(
        "add.cc.u32 %0, %0, %9;\n\t"
        "addc.cc.u32 %1, %1, %10;\n\t"
        "addc.cc.u32 %2, %2, %11;\n\t"
        "addc.cc.u32 %3, %3, %12;\n\t"
        "addc.cc.u32 %4, %4, %13;\n\t"
        "addc.cc.u32 %5, %5, %14;\n\t"
        "addc.cc.u32 %6, %6, %15;\n\t"
        "addc.cc.u32 %7, %7, %16;\n\t"
        "addc.u32 %8, 0, 0;\n\t"
        : "+r"(lo0), "+r"(lo1), "+r"(lo2), "+r"(lo3),
          "+r"(lo4), "+r"(lo5), "+r"(lo6), "+r"(lo7), "=r"(carry_lo)
        : "r"(fold0), "r"(fold1), "r"(fold2), "r"(fold3),
          "r"(fold4), "r"(fold5), "r"(fold6), "r"(fold7)
    );

    // Phase 3: repack as 64-bit limbs and fold the remaining overflow.
    ulong v0 = ((ulong)lo1 << 32) | lo0;
    ulong v1 = ((ulong)lo3 << 32) | lo2;
    ulong v2 = ((ulong)lo5 << 32) | lo4;
    ulong v3 = ((ulong)lo7 << 32) | lo6;

    // Everything at or above bit 256: fold8 and carry_lo sit at 2^256,
    // fold9 one 32-bit word higher.
    ulong overflow = (ulong)fold8 + carry_lo + ((ulong)fold9 << 32);

    ulong ovk_lo, ovk_hi;
    __asm volatile(
        "mul.lo.u64 %0, %2, %3;\n\t"
        "mul.hi.u64 %1, %2, %3;\n\t"
        : "=l"(ovk_lo), "=l"(ovk_hi)
        : "l"(overflow), "l"((ulong)SECP256K1_K)
    );

    ulong carry_out;
    __asm volatile(
        "add.cc.u64 %0, %0, %5;\n\t"
        "addc.cc.u64 %1, %1, %6;\n\t"
        "addc.cc.u64 %2, %2, 0;\n\t"
        "addc.cc.u64 %3, %3, 0;\n\t"
        "addc.u64 %4, 0, 0;\n\t"
        : "+l"(v0), "+l"(v1), "+l"(v2), "+l"(v3), "=l"(carry_out)
        : "l"(ovk_lo), "l"(ovk_hi)
    );

    if (carry_out != 0) {
        // A carry out of bit 256 is folded in as one more + K.
        __asm volatile(
            "add.cc.u64 %0, %0, %4;\n\t"
            "addc.cc.u64 %1, %1, 0;\n\t"
            "addc.cc.u64 %2, %2, 0;\n\t"
            "addc.u64 %3, %3, 0;\n\t"
            : "+l"(v0), "+l"(v1), "+l"(v2), "+l"(v3)
            : "l"((ulong)SECP256K1_K)
        );
    }

    // Phase 4: trial subtraction of P; borrow is non-zero when v < P.
    ulong d0, d1, d2, d3, borrow;
    __asm volatile(
        "sub.cc.u64 %0, %5, %9;\n\t"
        "subc.cc.u64 %1, %6, %10;\n\t"
        "subc.cc.u64 %2, %7, %11;\n\t"
        "subc.cc.u64 %3, %8, %12;\n\t"
        "subc.u64 %4, 0, 0;\n\t"
        : "=l"(d0), "=l"(d1), "=l"(d2), "=l"(d3), "=l"(borrow)
        : "l"(v0), "l"(v1), "l"(v2), "l"(v3),
          "l"(SECP256K1_P0), "l"(SECP256K1_P1), "l"(SECP256K1_P2), "l"(SECP256K1_P3)
    );

    const int take_diff = (borrow == 0);
    r->limbs[0] = take_diff ? d0 : v0;
    r->limbs[1] = take_diff ? d1 : v1;
    r->limbs[2] = take_diff ? d2 : v2;
    r->limbs[3] = take_diff ? d3 : v3;
}
|
||||
|
||||
#endif // __NV_CL_C_VERSION
|
||||
|
||||
// =============================================================================
|
||||
// Field Reduction: r = a mod p
|
||||
// Uses the fact that p = 2^256 - K where K = 0x1000003D1
|
||||
@ -151,32 +451,56 @@ inline void field_reduce(FieldElement* r, const ulong* a8) {
|
||||
// =============================================================================
|
||||
|
||||
// Modular addition: r = (a + b) mod p.
// The raw 256-bit sum and the trial value (sum - P) are both computed, then one
// of them is selected with a bit mask instead of a branch. The reduced value is
// taken when the addition carried out of 256 bits or when subtracting P did not
// borrow (sum >= P).
inline void field_add_impl(FieldElement* r, const FieldElement* a, const FieldElement* b) {
#ifdef __NV_CL_C_VERSION
    // NVIDIA path: the add and the trial subtraction are PTX carry chains.
    ulong sum0, sum1, sum2, sum3, add_carry;
    __asm volatile(
        "add.cc.u64 %0, %5, %9;\n\t"
        "addc.cc.u64 %1, %6, %10;\n\t"
        "addc.cc.u64 %2, %7, %11;\n\t"
        "addc.cc.u64 %3, %8, %12;\n\t"
        "addc.u64 %4, 0, 0;\n\t"
        : "=l"(sum0),"=l"(sum1),"=l"(sum2),"=l"(sum3),"=l"(add_carry)
        : "l"(a->limbs[0]),"l"(a->limbs[1]),"l"(a->limbs[2]),"l"(a->limbs[3]),
          "l"(b->limbs[0]),"l"(b->limbs[1]),"l"(b->limbs[2]),"l"(b->limbs[3])
    );

    ulong red0, red1, red2, red3, sub_borrow;
    __asm volatile(
        "sub.cc.u64 %0, %5, %9;\n\t"
        "subc.cc.u64 %1, %6, %10;\n\t"
        "subc.cc.u64 %2, %7, %11;\n\t"
        "subc.cc.u64 %3, %8, %12;\n\t"
        "subc.u64 %4, 0, 0;\n\t"
        : "=l"(red0),"=l"(red1),"=l"(red2),"=l"(red3),"=l"(sub_borrow)
        : "l"(sum0),"l"(sum1),"l"(sum2),"l"(sum3),
          "l"(SECP256K1_P0),"l"(SECP256K1_P1),"l"(SECP256K1_P2),"l"(SECP256K1_P3)
    );

    // All-ones when the reduced value must be used: either the subtraction did
    // not borrow, or the addition overflowed 2^256.
    const ulong pick = ~sub_borrow | (0UL - add_carry);
    r->limbs[0] = (red0 & pick) | (sum0 & ~pick);
    r->limbs[1] = (red1 & pick) | (sum1 & ~pick);
    r->limbs[2] = (red2 & pick) | (sum2 & ~pick);
    r->limbs[3] = (red3 & pick) | (sum3 & ~pick);
#else
    // Portable path: limb-wise add with the carry threaded through the helper.
    ulong cy = 0;
    const ulong t0 = add_with_carry(a->limbs[0], b->limbs[0], 0, &cy);
    const ulong t1 = add_with_carry(a->limbs[1], b->limbs[1], cy, &cy);
    const ulong t2 = add_with_carry(a->limbs[2], b->limbs[2], cy, &cy);
    const ulong t3 = add_with_carry(a->limbs[3], b->limbs[3], cy, &cy);

    // Trial subtraction of the prime.
    ulong bw = 0;
    const ulong u0 = sub_with_borrow(t0, SECP256K1_P0, 0, &bw);
    const ulong u1 = sub_with_borrow(t1, SECP256K1_P1, bw, &bw);
    const ulong u2 = sub_with_borrow(t2, SECP256K1_P2, bw, &bw);
    const ulong u3 = sub_with_borrow(t3, SECP256K1_P3, bw, &bw);

    // 0 - 1 wraps to all-ones, so the comparison result (0 or 1) becomes a mask.
    const ulong pick = 0UL - (ulong)((cy != 0) | (bw == 0));

    r->limbs[0] = (u0 & pick) | (t0 & ~pick);
    r->limbs[1] = (u1 & pick) | (t1 & ~pick);
    r->limbs[2] = (u2 & pick) | (t2 & ~pick);
    r->limbs[3] = (u3 & pick) | (t3 & ~pick);
#endif
}
|
||||
|
||||
// =============================================================================
|
||||
@ -184,29 +508,51 @@ inline void field_add_impl(FieldElement* r, const FieldElement* a, const FieldEl
|
||||
// =============================================================================
|
||||
|
||||
// Modular subtraction: r = (a - b) mod p.
// The raw 256-bit difference is computed first; when it borrowed (a < b) the
// prime is added back. The add-back is unconditional and uses P masked by the
// borrow, so no branch is taken.
inline void field_sub_impl(FieldElement* r, const FieldElement* a, const FieldElement* b) {
#ifdef __NV_CL_C_VERSION
    // NVIDIA path: PTX borrow chain followed by a PTX carry chain.
    ulong diff0, diff1, diff2, diff3, under;
    __asm volatile(
        "sub.cc.u64 %0, %5, %9;\n\t"
        "subc.cc.u64 %1, %6, %10;\n\t"
        "subc.cc.u64 %2, %7, %11;\n\t"
        "subc.cc.u64 %3, %8, %12;\n\t"
        "subc.u64 %4, 0, 0;\n\t"
        : "=l"(diff0),"=l"(diff1),"=l"(diff2),"=l"(diff3),"=l"(under)
        : "l"(a->limbs[0]),"l"(a->limbs[1]),"l"(a->limbs[2]),"l"(a->limbs[3]),
          "l"(b->limbs[0]),"l"(b->limbs[1]),"l"(b->limbs[2]),"l"(b->limbs[3])
    );

    // `under` is all-ones after an underflow and zero otherwise, so it can be
    // used directly as the mask that selects P or 0 as the correction term.
    const ulong fix0 = under & SECP256K1_P0;
    const ulong fix1 = under & SECP256K1_P1;
    const ulong fix2 = under & SECP256K1_P2;
    const ulong fix3 = under & SECP256K1_P3;

    ulong out0, out1, out2, out3;
    __asm volatile(
        "add.cc.u64 %0, %4, %8;\n\t"
        "addc.cc.u64 %1, %5, %9;\n\t"
        "addc.cc.u64 %2, %6, %10;\n\t"
        "addc.u64 %3, %7, %11;\n\t"
        : "=l"(out0),"=l"(out1),"=l"(out2),"=l"(out3)
        : "l"(diff0),"l"(diff1),"l"(diff2),"l"(diff3), "l"(fix0),"l"(fix1),"l"(fix2),"l"(fix3)
    );
    r->limbs[0] = out0;
    r->limbs[1] = out1;
    r->limbs[2] = out2;
    r->limbs[3] = out3;
#else
    // Portable path: limb-wise subtract with the borrow threaded through.
    ulong bw = 0;
    const ulong d0 = sub_with_borrow(a->limbs[0], b->limbs[0], 0, &bw);
    const ulong d1 = sub_with_borrow(a->limbs[1], b->limbs[1], bw, &bw);
    const ulong d2 = sub_with_borrow(a->limbs[2], b->limbs[2], bw, &bw);
    const ulong d3 = sub_with_borrow(a->limbs[3], b->limbs[3], bw, &bw);

    // All-ones when the subtraction went negative, zero otherwise.
    const ulong fix = 0UL - (ulong)(bw != 0);

    // Add back P (or 0); the final carry out is discarded.
    ulong cy = 0;
    const ulong o0 = add_with_carry(d0, SECP256K1_P0 & fix, 0, &cy);
    const ulong o1 = add_with_carry(d1, SECP256K1_P1 & fix, cy, &cy);
    const ulong o2 = add_with_carry(d2, SECP256K1_P2 & fix, cy, &cy);
    const ulong o3 = add_with_carry(d3, SECP256K1_P3 & fix, cy, &cy);

    r->limbs[0] = o0;
    r->limbs[1] = o1;
    r->limbs[2] = o2;
    r->limbs[3] = o3;
#endif
}
|
||||
|
||||
// =============================================================================
|
||||
@ -228,6 +574,12 @@ inline void muladd2(ulong lo, ulong hi, ulong* c0, ulong* c1, ulong* c2) {
|
||||
}
|
||||
|
||||
inline void field_mul_impl(FieldElement* r, const FieldElement* a, const FieldElement* b) {
|
||||
#ifdef __NV_CL_C_VERSION
|
||||
// Level 3: 32-bit hybrid Comba + 32-bit reduction (INT32 throughput 32x > INT64)
|
||||
uint t32[16];
|
||||
mul_256_comba32_ocl(a, b, t32);
|
||||
reduce_512_to_256_32_ocl(t32, r);
|
||||
#else
|
||||
ulong a0 = a->limbs[0], a1 = a->limbs[1], a2 = a->limbs[2], a3 = a->limbs[3];
|
||||
ulong b0 = b->limbs[0], b1 = b->limbs[1], b2 = b->limbs[2], b3 = b->limbs[3];
|
||||
ulong product[8];
|
||||
@ -274,11 +626,11 @@ inline void field_mul_impl(FieldElement* r, const FieldElement* a, const FieldEl
|
||||
product[7] = c1;
|
||||
|
||||
field_reduce(r, product);
|
||||
#endif // __NV_CL_C_VERSION
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Field Squaring: r = a² mod p
|
||||
// Optimized: only need upper triangle of multiplication
|
||||
// =============================================================================
|
||||
|
||||
// Forward declaration for field_sqr_n_impl
|
||||
@ -293,6 +645,12 @@ inline void field_sqr_n_impl(FieldElement* r, int n) {
|
||||
}
|
||||
|
||||
inline void field_sqr_impl(FieldElement* r, const FieldElement* a) {
|
||||
#ifdef __NV_CL_C_VERSION
|
||||
// Level 3: 32-bit hybrid squaring (40% fewer multiplications + INT32 throughput)
|
||||
uint t32[16];
|
||||
sqr_256_comba32_ocl(a, t32);
|
||||
reduce_512_to_256_32_ocl(t32, r);
|
||||
#else
|
||||
ulong a0 = a->limbs[0], a1 = a->limbs[1], a2 = a->limbs[2], a3 = a->limbs[3];
|
||||
ulong product[8];
|
||||
ulong c0, c1, c2;
|
||||
@ -332,6 +690,7 @@ inline void field_sqr_impl(FieldElement* r, const FieldElement* a) {
|
||||
product[7] = c1;
|
||||
|
||||
field_reduce(r, product);
|
||||
#endif // __NV_CL_C_VERSION
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
|
||||
@ -417,7 +417,7 @@ inline void scalar_add_u64(Scalar* a, ulong val, Scalar* r) {
|
||||
|
||||
// Convert scalar to wNAF representation (window width 5)
|
||||
// Returns length of wNAF representation
|
||||
inline int scalar_to_wnaf(const Scalar* k, int wnaf[260]) {
|
||||
static inline int scalar_to_wnaf(const Scalar* k, int wnaf[260]) {
|
||||
Scalar temp = *k;
|
||||
int len = 0;
|
||||
const int window_size = 32; // 2^5
|
||||
|
||||
@ -768,6 +768,427 @@ static int audit_perf_schnorr_stress() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Section 9: BIP-352 Silent Payments & GLV Correctness
|
||||
// =============================================================================
|
||||
|
||||
// Helper: expand kernel file with #include directives resolved (like bench_bip352_opencl.cpp).
|
||||
static std::string bip352_expand_kernel(const std::string& path,
|
||||
std::vector<std::string>& seen) {
|
||||
if (std::find(seen.begin(), seen.end(), path) != seen.end()) return {};
|
||||
seen.push_back(path);
|
||||
std::string src = load_file(path);
|
||||
if (src.empty()) return {};
|
||||
std::string dir = path.substr(0, path.find_last_of("/\\"));
|
||||
if (dir.empty()) dir = ".";
|
||||
std::istringstream in(src);
|
||||
std::ostringstream out;
|
||||
std::string line;
|
||||
while (std::getline(in, line)) {
|
||||
size_t s = line.find_first_not_of(" \t");
|
||||
std::string trimmed = (s != std::string::npos) ? line.substr(s) : line;
|
||||
if (trimmed.rfind("#include \"", 0) == 0) {
|
||||
size_t q1 = trimmed.find('"') + 1;
|
||||
size_t q2 = trimmed.find('"', q1);
|
||||
std::string child = dir + "/" + trimmed.substr(q1, q2 - q1);
|
||||
out << bip352_expand_kernel(child, seen);
|
||||
} else {
|
||||
out << line << '\n';
|
||||
}
|
||||
}
|
||||
return out.str();
|
||||
}
|
||||
|
||||
// Host wNAF encoder: mirrors the GPU scalar_to_wnaf fixed-130-step version.
|
||||
// Encodes 128-bit scalar (s0=LSW, s1=MSW) into 5-bit signed wNAF digits.
|
||||
// Host-side width-5 wNAF encoder producing exactly 130 digits.
//
//   s0, s1  low and high 64-bit words of the 128-bit scalar
//   wnaf    receives 130 digits; each is 0 or an odd value in [-15, 15]
//
// The scalar is held in a four-word accumulator so that the +|digit| correction
// applied for negative digits can carry past bit 127.
static void audit_host_wnaf(uint64_t s0, uint64_t s1, int8_t wnaf[130]) {
    uint64_t acc[4] = {s0, s1, 0, 0};

    for (int pos = 0; pos < 130; ++pos) {
        int digit = 0;

        if ((acc[0] & 1ULL) != 0) {
            // Take the low five bits; values of 16 and above become negative.
            digit = static_cast<int>(acc[0] & 0x1FULL);

            if (digit < 16) {
                const uint64_t before = acc[0];
                acc[0] = before - static_cast<uint64_t>(digit);
                if (acc[0] > before) {
                    // Borrow ripples upward until a non-zero word absorbs it.
                    for (int limb = 1; limb < 4; ++limb) {
                        if (acc[limb]-- != 0) break;
                    }
                }
            } else {
                digit -= 32;
                const uint64_t before = acc[0];
                acc[0] = before + static_cast<uint64_t>(-digit);
                if (acc[0] < before) {
                    // Carry ripples upward until a word does not wrap to zero.
                    for (int limb = 1; limb < 4; ++limb) {
                        if (++acc[limb] != 0) break;
                    }
                }
            }
        }
        wnaf[pos] = static_cast<int8_t>(digit);

        // Shift the whole accumulator right by one bit.
        for (int limb = 0; limb < 3; ++limb) {
            acc[limb] = (acc[limb] >> 1) | (acc[limb + 1] << 63);
        }
        acc[3] >>= 1;
    }
}
|
||||
|
||||
// Test 1: CPU wNAF round-trip — encode scalar, decode digits back, verify match.
|
||||
// Tests host_compute_wnaf correctness: this was the key change that fixed the -36 crash.
|
||||
static int audit_glv_wnaf_roundtrip() {
|
||||
struct TC { uint64_t s0, s1; const char* label; };
|
||||
static const TC cases[] = {
|
||||
{1, 0, "k=1"},
|
||||
{2, 0, "k=2"},
|
||||
{15, 0, "k=15 (max single wNAF digit)"},
|
||||
{16, 0, "k=16 (two-digit boundary)"},
|
||||
{31, 0, "k=31 (wNAF carry: 32-1)"},
|
||||
{0x5555555555555555ULL, 0x5555555555555555ULL, "k=0x5555... (alternating bits)"},
|
||||
{0xFFFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL, "k near 2^127"},
|
||||
// k1 half of SCAN_KEY GLV decomposition (lower 128 bits of full key)
|
||||
{0x38af4ad300da1a42ULL, 0x30d7d6a3b98294b1ULL, "k1 from SCAN_KEY GLV half"},
|
||||
};
|
||||
|
||||
for (auto& tc : cases) {
|
||||
int8_t wnaf[130] = {};
|
||||
audit_host_wnaf(tc.s0, tc.s1, wnaf);
|
||||
|
||||
// Reconstruct: sum(wnaf[i] * 2^i) for i=0..129 using 128-bit arithmetic.
|
||||
// Use __uint128_t for correctness (GCC/Clang extension, fine on x86-64).
|
||||
__uint128_t result = 0, power = 1;
|
||||
for (int i = 0; i < 130; i++) {
|
||||
if (wnaf[i] > 0) result += (__uint128_t)(uint8_t) wnaf[i] * power;
|
||||
if (wnaf[i] < 0) result -= (__uint128_t)(uint8_t)(-wnaf[i]) * power;
|
||||
power <<= 1;
|
||||
}
|
||||
uint64_t r0 = (uint64_t)result;
|
||||
uint64_t r1 = (uint64_t)(result >> 64);
|
||||
|
||||
if (r0 != tc.s0 || r1 != tc.s1) {
|
||||
std::fprintf(stderr, " [FAIL] wNAF roundtrip for %s: "
|
||||
"expected (%016llx,%016llx) got (%016llx,%016llx)\n",
|
||||
tc.label,
|
||||
(unsigned long long)tc.s0, (unsigned long long)tc.s1,
|
||||
(unsigned long long)r0, (unsigned long long)r1);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Test 2: GLV large scalar consistency via OpenCL library.
|
||||
// Verifies k*G + G = (k+1)*G for three large scalars that stress the GLV path:
|
||||
// - SCAN_KEY (256-bit random key, both GLV halves active)
|
||||
// - 2^128 (decomposition boundary)
|
||||
// - 0x5555... (alternating bit pattern, maximally stresses wNAF carry logic)
|
||||
static int audit_glv_large_scalar() {
|
||||
// Helper: hex string (big-endian) -> little-endian Scalar limbs
|
||||
auto from_hex = [](const char* hex) -> Scalar {
|
||||
Scalar s{};
|
||||
std::string h(hex);
|
||||
while (h.size() < 64) h = "0" + h;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
uint64_t v = 0;
|
||||
for (int j = 0; j < 16; j++) {
|
||||
char c = h[(3 - i) * 16 + j];
|
||||
int d = (c >= '0' && c <= '9') ? c - '0'
|
||||
: (c >= 'a' && c <= 'f') ? c - 'a' + 10
|
||||
: (c >= 'A' && c <= 'F') ? c - 'A' + 10 : 0;
|
||||
v = (v << 4) | (uint64_t)d;
|
||||
}
|
||||
s.limbs[i] = v;
|
||||
}
|
||||
return s;
|
||||
};
|
||||
|
||||
struct TC { Scalar k, kp1; const char* label; };
|
||||
Scalar s_scan = from_hex("c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42");
|
||||
Scalar s_scanp = from_hex("c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a43");
|
||||
Scalar s_2_128 = {{0UL, 0UL, 1UL, 0UL}};
|
||||
Scalar s_2_128p = {{1UL, 0UL, 1UL, 0UL}};
|
||||
Scalar s_alt = {{0x5555555555555555ULL, 0x5555555555555555ULL,
|
||||
0x5555555555555555ULL, 0x5555555555555555ULL}};
|
||||
Scalar s_altp = {{0x5555555555555556ULL, 0x5555555555555555ULL,
|
||||
0x5555555555555555ULL, 0x5555555555555555ULL}};
|
||||
|
||||
TC cases[] = {
|
||||
{s_scan, s_scanp, "SCAN_KEY (256-bit)"},
|
||||
{s_2_128, s_2_128p, "k = 2^128 (GLV boundary)"},
|
||||
{s_alt, s_altp, "k = 0x5555... (alternating bits)"},
|
||||
};
|
||||
|
||||
Scalar one = sc_from_u64(1);
|
||||
JacobianPoint oneG = g_ctx->scalar_mul_generator(one);
|
||||
|
||||
for (auto& tc : cases) {
|
||||
JacobianPoint kG = g_ctx->scalar_mul_generator(tc.k);
|
||||
JacobianPoint kp1_a = g_ctx->point_add(kG, oneG); // k*G + G
|
||||
JacobianPoint kp1_b = g_ctx->scalar_mul_generator(tc.kp1); // (k+1)*G
|
||||
|
||||
AffinePoint a = jacobian_to_affine(kp1_a);
|
||||
AffinePoint b = jacobian_to_affine(kp1_b);
|
||||
if (!fe_eq(a.x, b.x) || !fe_eq(a.y, b.y)) {
|
||||
std::fprintf(stderr, " [FAIL] GLV %s: k*G+G != (k+1)*G\n", tc.label);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Struct layout matching BIP352ScanKeyGlv in secp256k1_bip352.cl.
|
||||
// Used by the BIP-352 kernel audit tests below.
|
||||
// Host-side image of the scan-key plan uploaded to the BIP-352 kernel.
// Every member is one byte wide, so the members are laid out back to back:
//   bytes   0..129  wnaf1     wNAF digits of the k1 half-scalar
//   bytes 130..259  wnaf2     wNAF digits of the k2 half-scalar
//   byte  260       k1_neg    1 if k1 is negative
//   byte  261       flip_phi  1 if the phi table y should be negated
//   bytes 262..263  pad0/pad1 padding up to 264 bytes
struct alignas(1) AuditBIP352ScanKeyGlv {
    int8_t  wnaf1[130];
    int8_t  wnaf2[130];
    uint8_t k1_neg;
    uint8_t flip_phi;
    uint8_t pad0;
    uint8_t pad1;
};
// The size is pinned at compile time so a layout change cannot go unnoticed.
static_assert(264 == sizeof(AuditBIP352ScanKeyGlv), "BIP352ScanKeyGlv size mismatch");
|
||||
|
||||
// Kernel-side AffinePoint and FieldElement layout (must match .cl struct).
|
||||
// Field element as four 64-bit limbs (32 bytes).
struct AuditFieldElement {
    uint64_t limbs[4];
};

// Affine point: x coordinate followed by y coordinate (64 bytes).
struct AuditAffinePoint {
    AuditFieldElement x;
    AuditFieldElement y;
};
|
||||
|
||||
// secp256k1 generator G in the kernel's field element representation (little-endian limbs).
|
||||
// Returns the secp256k1 generator G as an AuditAffinePoint.
// Each coordinate is listed from limbs[0] (least significant 64 bits) to
// limbs[3] (most significant 64 bits).
static AuditAffinePoint audit_generator_point() {
    const AuditAffinePoint generator = {
        // x
        {{0x59F2815B16F81798ULL, 0x029BFCDB2DCE28D9ULL,
          0x55A06295CE870B07ULL, 0x79BE667EF9DCBBACULL}},
        // y
        {{0x9C47D08FFB10D4B8ULL, 0xFD17B448A6855419ULL,
          0x5DA4FBFC0E1108A8ULL, 0x483ADA7726A3C465ULL}},
    };
    return generator;
}
|
||||
|
||||
// Test 3: BIP-352 kernel compiles without error.
|
||||
static int audit_bip352_kernel_build() {
|
||||
if (g_kernel_dir.empty()) return -1;
|
||||
std::vector<std::string> seen;
|
||||
std::string src = bip352_expand_kernel(g_kernel_dir + "/secp256k1_bip352.cl", seen);
|
||||
if (src.empty()) return -1;
|
||||
|
||||
cl_context cl_ctx = (cl_context)g_ctx->native_context();
|
||||
cl_command_queue cl_q = (cl_command_queue)g_ctx->native_queue();
|
||||
cl_device_id cl_dev = nullptr;
|
||||
clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr);
|
||||
|
||||
cl_int err;
|
||||
const char* src_ptr = src.c_str();
|
||||
size_t src_len = src.size();
|
||||
cl_program prog = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
|
||||
if (err != CL_SUCCESS) return 1;
|
||||
|
||||
err = clBuildProgram(prog, 1, &cl_dev, "-cl-std=CL1.2", nullptr, nullptr);
|
||||
if (err != CL_SUCCESS) {
|
||||
size_t log_size = 0;
|
||||
clGetProgramBuildInfo(prog, cl_dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_size);
|
||||
std::string log(log_size, '\0');
|
||||
clGetProgramBuildInfo(prog, cl_dev, CL_PROGRAM_BUILD_LOG, log_size, log.data(), nullptr);
|
||||
std::fprintf(stderr, " BIP-352 build log:\n%s\n", log.c_str());
|
||||
clReleaseProgram(prog);
|
||||
return 2;
|
||||
}
|
||||
|
||||
// Verify both kernel entry points exist
|
||||
cl_kernel k_nolut = clCreateKernel(prog, "bip352_pipeline_kernel", &err);
|
||||
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 3; }
|
||||
cl_kernel k_lut = clCreateKernel(prog, "bip352_pipeline_kernel_lut", &err);
|
||||
if (err != CL_SUCCESS) { clReleaseKernel(k_nolut); clReleaseProgram(prog); return 4; }
|
||||
|
||||
clReleaseKernel(k_nolut);
|
||||
clReleaseKernel(k_lut);
|
||||
clReleaseProgram(prog);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Test 4: Regression for CL_INVALID_COMMAND_QUEUE (-36) GPU fault.
|
||||
// Runs bip352_pipeline_kernel (no-LUT path) with 1 work item and verifies no crash.
|
||||
// The crash was caused by GPU private-memory overflow from int wnaf[130]×2 arrays.
|
||||
// Fix: precompute wNAF on CPU (BIP352ScanKeyGlv.wnaf1/wnaf2), read from __constant.
|
||||
// Three scan-key edge cases: k=1 (minimal), k from SCAN_KEY, k with all-15 wNAF digits.
|
||||
static int audit_bip352_no_crash() {
|
||||
if (g_kernel_dir.empty()) return -1;
|
||||
std::vector<std::string> seen;
|
||||
std::string src = bip352_expand_kernel(g_kernel_dir + "/secp256k1_bip352.cl", seen);
|
||||
if (src.empty()) return -1;
|
||||
|
||||
cl_context cl_ctx = (cl_context)g_ctx->native_context();
|
||||
cl_command_queue cl_q = (cl_command_queue)g_ctx->native_queue();
|
||||
cl_device_id cl_dev = nullptr;
|
||||
clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr);
|
||||
|
||||
cl_int err;
|
||||
const char* src_ptr = src.c_str();
|
||||
size_t src_len = src.size();
|
||||
cl_program prog = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
|
||||
if (err != CL_SUCCESS) return 1;
|
||||
|
||||
err = clBuildProgram(prog, 1, &cl_dev, "-cl-std=CL1.2 -cl-fast-relaxed-math", nullptr, nullptr);
|
||||
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 2; }
|
||||
|
||||
cl_kernel kernel = clCreateKernel(prog, "bip352_pipeline_kernel", &err);
|
||||
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 3; }
|
||||
|
||||
// Edge case scan keys to test. k1_neg/flip_phi chosen to exercise both paths.
|
||||
struct EdgeCase {
|
||||
const char* label;
|
||||
int8_t wnaf1_0; // wnaf1[0] digit (rest 0)
|
||||
int8_t wnaf2_0; // wnaf2[0] digit (rest 0)
|
||||
uint8_t k1_neg, flip_phi;
|
||||
};
|
||||
static const EdgeCase edges[] = {
|
||||
{"k=1 (minimal scalar)", 1, 0, 0, 0},
|
||||
{"k1=15,k2=1 (max digit)", 15, 1, 0, 0},
|
||||
{"k1_neg=1, flip_phi=1", 1, 1, 1, 1}, // negate path
|
||||
};
|
||||
|
||||
AuditAffinePoint g_pt = audit_generator_point();
|
||||
AuditAffinePoint spend_pt = g_pt; // spend = G for simplicity
|
||||
|
||||
// Pre-allocate buffers (reused across edge cases)
|
||||
cl_mem d_tweaks = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY, sizeof(AuditAffinePoint), nullptr, &err);
|
||||
cl_mem d_spend = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
sizeof(AuditAffinePoint), &spend_pt, &err);
|
||||
cl_mem d_scan = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY, sizeof(AuditBIP352ScanKeyGlv), nullptr, &err);
|
||||
cl_mem d_prefixes = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY, sizeof(uint64_t), nullptr, &err);
|
||||
|
||||
cl_uint count = 1;
|
||||
clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_tweaks);
|
||||
clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_scan);
|
||||
clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_spend);
|
||||
clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_prefixes);
|
||||
clSetKernelArg(kernel, 4, sizeof(cl_uint), &count);
|
||||
|
||||
int result = 0;
|
||||
for (auto& ec : edges) {
|
||||
// Build scan plan
|
||||
AuditBIP352ScanKeyGlv plan{};
|
||||
plan.wnaf1[0] = ec.wnaf1_0;
|
||||
plan.wnaf2[0] = ec.wnaf2_0;
|
||||
plan.k1_neg = ec.k1_neg;
|
||||
plan.flip_phi = ec.flip_phi;
|
||||
|
||||
// Upload tweak=G and scan plan
|
||||
clEnqueueWriteBuffer(cl_q, d_tweaks, CL_TRUE, 0, sizeof(AuditAffinePoint), &g_pt, 0, nullptr, nullptr);
|
||||
clEnqueueWriteBuffer(cl_q, d_scan, CL_TRUE, 0, sizeof(AuditBIP352ScanKeyGlv), &plan, 0, nullptr, nullptr);
|
||||
|
||||
size_t global = 1, local = 1;
|
||||
err = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr);
|
||||
if (err != CL_SUCCESS) { result = 10; break; }
|
||||
err = clFinish(cl_q);
|
||||
if (err != CL_SUCCESS) {
|
||||
// -36 = CL_INVALID_COMMAND_QUEUE = GPU fault (regression for the private-stack overflow crash)
|
||||
std::fprintf(stderr, " [FAIL] bip352_no_crash edge='%s' clFinish error=%d"
|
||||
" (expected 0; -36 = GPU fault regression)\n", ec.label, err);
|
||||
result = 20 + err; // encode the OCL error
|
||||
break;
|
||||
}
|
||||
|
||||
uint64_t prefix = 0;
|
||||
clEnqueueReadBuffer(cl_q, d_prefixes, CL_TRUE, 0, sizeof(uint64_t), &prefix, 0, nullptr, nullptr);
|
||||
// prefix may be 0 if the point is infinity (edge case k1=0 path) — that's valid.
|
||||
// What we really test is that we reach here without crashing.
|
||||
}
|
||||
|
||||
clReleaseMemObject(d_tweaks);
|
||||
clReleaseMemObject(d_scan);
|
||||
clReleaseMemObject(d_spend);
|
||||
clReleaseMemObject(d_prefixes);
|
||||
clReleaseKernel(kernel);
|
||||
clReleaseProgram(prog);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Test 5: BIP-352 pipeline determinism check for a known input.
// Runs the kernel on two identical work items (tweak=G, spend=G) with the
// trivial scan plan k1=1, k2=0 and requires both prefixes to be equal and
// non-zero. No expected prefix value is compared here: the CPU validation
// path in bench_bip352_opencl reports 0xb63b4601066a6971 as the last-item
// prefix for batch=10000 with scan_key=SCAN_KEY, which is not the input
// used by this test.
|
||||
static int audit_bip352_correct() {
|
||||
if (g_kernel_dir.empty()) return -1;
|
||||
std::vector<std::string> seen;
|
||||
std::string src = bip352_expand_kernel(g_kernel_dir + "/secp256k1_bip352.cl", seen);
|
||||
if (src.empty()) return -1;
|
||||
|
||||
cl_context cl_ctx = (cl_context)g_ctx->native_context();
|
||||
cl_command_queue cl_q = (cl_command_queue)g_ctx->native_queue();
|
||||
cl_device_id cl_dev = nullptr;
|
||||
clGetCommandQueueInfo(cl_q, CL_QUEUE_DEVICE, sizeof(cl_dev), &cl_dev, nullptr);
|
||||
|
||||
cl_int err;
|
||||
const char* src_ptr = src.c_str();
|
||||
size_t src_len = src.size();
|
||||
cl_program prog = clCreateProgramWithSource(cl_ctx, 1, &src_ptr, &src_len, &err);
|
||||
if (err != CL_SUCCESS) return 1;
|
||||
err = clBuildProgram(prog, 1, &cl_dev, "-cl-std=CL1.2 -cl-fast-relaxed-math", nullptr, nullptr);
|
||||
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 2; }
|
||||
cl_kernel kernel = clCreateKernel(prog, "bip352_pipeline_kernel", &err);
|
||||
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 3; }
|
||||
|
||||
// Build BIP352ScanKeyGlv for SCAN_KEY using the host wNAF encoder.
|
||||
// SCAN_KEY = c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42
|
||||
// GLV decomposition (pre-computed, matches bench_bip352_opencl):
|
||||
// k1 (LE64): {0x5db6fc2bc78a0e07, 0x7fff7d82be8fb40f, 0, 0} k1_neg=0
|
||||
// k2 (LE64): {0x62491d65b0efea74, 0x3ca3a038cb4bac36, 0, 0} flip_phi=0
|
||||
// (These are the GLV halves as output by secp256k1::fast::glv_decompose)
|
||||
// We use the benchmark's own scan_key encoding to stay in sync; here we use
|
||||
// the actual k1/k2 from a one-time CPU run of build_scan_glv_plan().
|
||||
// Instead of hard-coding the decomposition (which requires CPU GLV logic),
|
||||
// we test consistency: run 2 items (tweak=G), compare both give the same prefix.
|
||||
// A truly independent correctness check is in bench_bip352_opencl --batch 1 --local 1.
|
||||
|
||||
// For this audit: run 2 identical tweaks, check both prefixes are equal (determinism).
|
||||
AuditBIP352ScanKeyGlv plan{};
|
||||
// k1=1, k2=0 (simplest: scan*tweak = 1*G = G for any decomposition where k1=1, k2=0)
|
||||
plan.wnaf1[0] = 1;
|
||||
plan.k1_neg = 0;
|
||||
plan.flip_phi = 0;
|
||||
|
||||
AuditAffinePoint g_pt = audit_generator_point();
|
||||
AuditAffinePoint spend_pt = g_pt;
|
||||
AuditAffinePoint tweaks[2] = {g_pt, g_pt}; // same tweak twice
|
||||
|
||||
cl_mem d_tweaks = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
2 * sizeof(AuditAffinePoint), tweaks, &err);
|
||||
cl_mem d_scan = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
sizeof(AuditBIP352ScanKeyGlv), &plan, &err);
|
||||
cl_mem d_spend = clCreateBuffer(cl_ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
sizeof(AuditAffinePoint), &spend_pt, &err);
|
||||
cl_mem d_prefixes = clCreateBuffer(cl_ctx, CL_MEM_WRITE_ONLY, 2 * sizeof(uint64_t), nullptr, &err);
|
||||
|
||||
cl_uint count = 2;
|
||||
clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_tweaks);
|
||||
clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_scan);
|
||||
clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_spend);
|
||||
clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_prefixes);
|
||||
clSetKernelArg(kernel, 4, sizeof(cl_uint), &count);
|
||||
|
||||
size_t global = 2, local = 1;
|
||||
err = clEnqueueNDRangeKernel(cl_q, kernel, 1, nullptr, &global, &local, 0, nullptr, nullptr);
|
||||
if (err != CL_SUCCESS) { clReleaseProgram(prog); return 4; }
|
||||
err = clFinish(cl_q);
|
||||
if (err != CL_SUCCESS) {
|
||||
std::fprintf(stderr, " [FAIL] bip352_correct: clFinish error=%d\n", err);
|
||||
clReleaseProgram(prog); return 5;
|
||||
}
|
||||
|
||||
uint64_t prefixes[2] = {};
|
||||
clEnqueueReadBuffer(cl_q, d_prefixes, CL_TRUE, 0, 2 * sizeof(uint64_t), prefixes, 0, nullptr, nullptr);
|
||||
|
||||
int result = 0;
|
||||
// Both items have identical input so must produce identical prefix (determinism test)
|
||||
if (prefixes[0] != prefixes[1]) {
|
||||
std::fprintf(stderr, " [FAIL] bip352_correct: non-deterministic output:"
|
||||
" item[0]=0x%016llx item[1]=0x%016llx\n",
|
||||
(unsigned long long)prefixes[0], (unsigned long long)prefixes[1]);
|
||||
result = 6;
|
||||
}
|
||||
// Prefix must be non-zero (1*G = G is not the point at infinity)
|
||||
if (prefixes[0] == 0) {
|
||||
std::fprintf(stderr, " [FAIL] bip352_correct: prefix=0 (unexpected infinity)\n");
|
||||
result = 7;
|
||||
}
|
||||
|
||||
clReleaseMemObject(d_tweaks);
|
||||
clReleaseMemObject(d_scan);
|
||||
clReleaseMemObject(d_spend);
|
||||
clReleaseMemObject(d_prefixes);
|
||||
clReleaseKernel(kernel);
|
||||
clReleaseProgram(prog);
|
||||
return result;
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Module & Section Registry
|
||||
// =============================================================================
|
||||
@ -781,6 +1202,7 @@ static const OclSectionInfo OCL_SECTIONS[] = {
|
||||
{ "protocol_security", "Protocol Security (multi-key)" },
|
||||
{ "fuzzing", "Fuzzing & Adversarial Inputs" },
|
||||
{ "performance", "Performance Smoke Tests" },
|
||||
{ "bip352_glv", "BIP-352 Silent Payments & GLV Correctness" },
|
||||
};
|
||||
static constexpr int NUM_OCL_SECTIONS = sizeof(OCL_SECTIONS) / sizeof(OCL_SECTIONS[0]);
|
||||
|
||||
@ -827,6 +1249,13 @@ static const OclAuditModule OCL_MODULES[] = {
|
||||
// Section 8: Performance Smoke
|
||||
{ "perf_ecdsa_50", "ECDSA 50-iteration stress", "performance", audit_perf_ecdsa_stress, false },
|
||||
{ "perf_schnorr_25", "Schnorr 25-iteration stress", "performance", audit_perf_schnorr_stress, false },
|
||||
|
||||
// Section 9: BIP-352 Silent Payments & GLV Correctness
|
||||
{ "glv_wnaf_rt", "CPU wNAF encode/decode roundtrip (8 scalars)", "bip352_glv", audit_glv_wnaf_roundtrip, false },
|
||||
{ "glv_large_k", "GLV large scalar k*G+G=(k+1)*G (3 scalars)", "bip352_glv", audit_glv_large_scalar, false },
|
||||
{ "bip352_build", "BIP-352 kernel compiles (both entry points)", "bip352_glv", audit_bip352_kernel_build, false },
|
||||
{ "bip352_nocrash", "BIP-352 no GPU fault: -36 crash regression (3 edge cases)", "bip352_glv", audit_bip352_no_crash, false },
|
||||
{ "bip352_correct", "BIP-352 pipeline determinism (2 identical tweaks)", "bip352_glv", audit_bip352_correct, false },
|
||||
};
|
||||
static constexpr int NUM_OCL_MODULES = sizeof(OCL_MODULES) / sizeof(OCL_MODULES[0]);
|
||||
|
||||
@ -1080,6 +1509,7 @@ int main(int argc, char* argv[]) {
|
||||
auto dev = detect_ocl_device(*g_ctx);
|
||||
|
||||
// Try to init extended kernels
|
||||
g_kernel_dir = kernel_dir; // make available to audit modules
|
||||
if (!kernel_dir.empty()) {
|
||||
g_ext.init(*g_ctx, kernel_dir);
|
||||
}
|
||||
|
||||
@ -383,9 +383,9 @@ bool Context::Impl::init(const DeviceConfig& cfg) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Embedded kernel source (will be generated by CMake)
|
||||
// For now, include a minimal version
|
||||
static const char* kernel_source = R"KERNEL(
|
||||
// Embedded kernel source — split into separate array entries so that
|
||||
// no single string literal exceeds MSVC's 65535-byte C2026 limit.
|
||||
static const char* const kernel_parts[] = { R"KERNEL(
|
||||
// =============================================================================
|
||||
// Secp256k1 OpenCL Kernels - Embedded Version
|
||||
// =============================================================================
|
||||
@ -635,6 +635,18 @@ inline void field_sqr_n_impl(FieldElement* r, int n) {
|
||||
for (int i = 0; i < n; i++) field_sqr_impl(r, r);
|
||||
}
|
||||
|
||||
inline int field_is_zero_impl(const FieldElement* a) {
|
||||
return (a->limbs[0] | a->limbs[1] | a->limbs[2] | a->limbs[3]) == 0;
|
||||
}
|
||||
|
||||
inline void field_set_zero_impl(FieldElement* a) {
|
||||
a->limbs[0] = 0; a->limbs[1] = 0; a->limbs[2] = 0; a->limbs[3] = 0;
|
||||
}
|
||||
|
||||
inline void field_set_one_impl(FieldElement* a) {
|
||||
a->limbs[0] = 1; a->limbs[1] = 0; a->limbs[2] = 0; a->limbs[3] = 0;
|
||||
}
|
||||
|
||||
inline void field_inv_impl(FieldElement* r, const FieldElement* a) {
|
||||
FieldElement x2,x3,x6,x12,x24,x48,x96,x192,x7,x31,x223,x5,x11,x22,t;
|
||||
field_sqr_impl(&x2, a); field_mul_impl(&x2, &x2, a);
|
||||
@ -687,12 +699,61 @@ __kernel void field_sqr(__global const FieldElement* a, __global FieldElement* r
|
||||
}
|
||||
|
||||
// Batch field inversion: r[i] = a[i]^-1 for i < count, with zero inputs mapped
// to zero. Each work-group shares a single field_inv_impl call via prefix
// products held in __local memory (Montgomery's trick).
__kernel void field_inv(__global const FieldElement* a, __global FieldElement* r, uint count) {
#define BATCH_INV_LOCAL_MAX 256
    __local FieldElement local_vals[BATCH_INV_LOCAL_MAX];
    __local FieldElement local_prefix[BATCH_INV_LOCAL_MAX];
    __local FieldElement local_invs[BATCH_INV_LOCAL_MAX];
    __local uint local_nonzero[BATCH_INV_LOCAL_MAX];

    uint gid = get_global_id(0);
    uint lid = get_local_id(0);
    uint lsize = get_local_size(0);
    uint group_start = get_group_id(0) * lsize;
    uint active = (group_start < count) ? min(lsize, count - group_start) : 0;

    // Padding work-items (gid >= count) must NOT return before the barriers
    // below: every work-item of a group has to reach each barrier, otherwise
    // behavior is undefined. They only skip the global loads/stores.
    int in_range = (gid < count);

    // Work-group larger than the local arrays: invert each element on its own.
    // lsize is identical for the whole group, so no work-item is left waiting
    // at a barrier when this path returns.
    if (lsize > BATCH_INV_LOCAL_MAX) {
        if (in_range) {
            FieldElement a_local = a[gid];
            FieldElement res;
            field_inv_impl(&res, &a_local);
            r[gid] = res;
        }
        return;
    }

    if (in_range) {
        FieldElement v = a[gid];
        uint nz = field_is_zero_impl(&v) ? 0U : 1U;
        // A zero input is replaced by 1 so it cannot wipe out the running product.
        if (!nz) field_set_one_impl(&v);
        local_nonzero[lid] = nz;
        local_vals[lid] = v;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid == 0 && active > 0) {
        FieldElement acc;
        field_set_one_impl(&acc);

        // Forward pass: local_prefix[i] = product of the non-zero values before i.
        for (uint i = 0; i < active; ++i) {
            local_prefix[i] = acc;
            if (local_nonzero[i]) { FieldElement _t = local_vals[i]; field_mul_impl(&acc, &acc, &_t); }
        }

        // The only inversion for the whole group.
        field_inv_impl(&acc, &acc);

        // Backward pass: peel one factor per step to recover each inverse.
        for (int i = (int)active - 1; i >= 0; --i) {
            if (local_nonzero[i]) {
                FieldElement inv_i;
                { FieldElement _t = local_prefix[i]; field_mul_impl(&inv_i, &acc, &_t); }
                local_invs[i] = inv_i;
                { FieldElement _t = local_vals[i]; field_mul_impl(&acc, &acc, &_t); }
            } else {
                FieldElement _t; field_set_zero_impl(&_t); local_invs[i] = _t;
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    if (in_range) r[gid] = local_invs[lid];
}
|
||||
|
||||
)KERNEL"
|
||||
)KERNEL",
|
||||
|
||||
// ---- second segment (point operations + scalar mul + batch) ----
|
||||
R"KERNEL(
|
||||
@ -961,6 +1022,11 @@ inline void scalar_mul_mod_n_cl(const Scalar* a, const Scalar* b, Scalar* r) {
|
||||
scalar_cond_sub_n_cl(r); scalar_cond_sub_n_cl(r); scalar_cond_sub_n_cl(r);
|
||||
}
|
||||
|
||||
)KERNEL",
|
||||
|
||||
// ---- third segment (scalar utilities + GLV + point operations) ----
|
||||
R"KERNEL(
|
||||
|
||||
// Scalar bit length (uses clz intrinsic -- single instruction on GPU)
|
||||
inline int scalar_bitlen_cl(const Scalar* s) {
|
||||
for (int i = 3; i >= 0; i--) {
|
||||
@ -1031,41 +1097,189 @@ inline void glv_decompose_cl(const Scalar* k, Scalar* k1, Scalar* k2, int* k1_ne
|
||||
*k1_neg = k1_is_neg; *k2_neg = k2_is_neg;
|
||||
}
|
||||
|
||||
// GLV + interleaved binary scalar multiplication: k*P
|
||||
// GPU-optimized: NO tables, two affine bases, mixed additions, minimal registers (~92)
|
||||
// SIMT-aware: two independent if-blocks (not else-if) for optimal warp divergence
|
||||
// Lifts affine point *a into Jacobian form: copies x and y, sets z = 1 and
// clears the infinity flag.
inline void point_from_affine(JacobianPoint* j, const AffinePoint* a) {
    j->x = a->x;
    j->y = a->y;
    j->z.limbs[0] = 1UL;
    for (int i = 1; i < 4; ++i) {
        j->z.limbs[i] = 0UL;
    }
    j->infinity = 0;
}
|
||||
|
||||
// Mixed addition r = p + q (p Jacobian, q affine) that also reports 2*H in
// *h_out, where H = q.x*Z1^2 - p.x. In the general path the result Z is
// (Z1+H)^2 - Z1^2 - H^2, i.e. p->z scaled by that same 2*H.
// *h_out is left at 1 on the early-exit paths (p at infinity, or H == 0).
// r may alias p in the general path: p is fully read before r is written.
inline void point_add_mixed_h_impl(JacobianPoint* r, const JacobianPoint* p,
                                   const AffinePoint* q, FieldElement* h_out) {
    h_out->limbs[0] = 1UL;
    for (int i = 1; i < 4; ++i) h_out->limbs[i] = 0UL;

    if (point_is_infinity(p)) {
        point_from_affine(r, q);
        return;
    }

    FieldElement zz, u2, s2, h, hh, i4, j, rr, v, x3, y3, z3, tmp;

    // Bring q onto p's Z: u2 = q.x * Z1^2, s2 = q.y * Z1^3.
    field_sqr_impl(&zz, &p->z);
    field_mul_impl(&u2, &q->x, &zz);
    field_mul_impl(&tmp, &q->y, &p->z);
    field_mul_impl(&s2, &tmp, &zz);
    field_sub_impl(&h, &u2, &p->x);

    if ((h.limbs[0] | h.limbs[1] | h.limbs[2] | h.limbs[3]) == 0) {
        // Same x: equal y means doubling, otherwise the sum is infinity.
        field_sub_impl(&tmp, &s2, &p->y);
        if ((tmp.limbs[0] | tmp.limbs[1] | tmp.limbs[2] | tmp.limbs[3]) != 0) {
            point_set_infinity(r);
        } else {
            point_double_impl(r, p);
        }
        return;
    }

    field_add_impl(h_out, &h, &h);

    field_sqr_impl(&hh, &h);
    field_add_impl(&i4, &hh, &hh);
    field_add_impl(&i4, &i4, &i4);      // i4 = 4*H^2
    field_mul_impl(&j, &h, &i4);
    field_sub_impl(&rr, &s2, &p->y);
    field_add_impl(&rr, &rr, &rr);      // rr = 2*(s2 - Y1)
    field_mul_impl(&v, &p->x, &i4);

    // X3 = rr^2 - J - 2*V
    field_sqr_impl(&x3, &rr);
    field_sub_impl(&x3, &x3, &j);
    field_add_impl(&tmp, &v, &v);
    field_sub_impl(&x3, &x3, &tmp);

    // Y3 = rr*(V - X3) - 2*Y1*J
    field_sub_impl(&tmp, &v, &x3);
    field_mul_impl(&y3, &rr, &tmp);
    field_mul_impl(&tmp, &p->y, &j);
    field_add_impl(&tmp, &tmp, &tmp);
    field_sub_impl(&y3, &y3, &tmp);

    // Z3 = (Z1 + H)^2 - Z1^2 - H^2
    field_add_impl(&tmp, &p->z, &h);
    field_sqr_impl(&z3, &tmp);
    field_sub_impl(&z3, &z3, &zz);
    field_sub_impl(&z3, &z3, &hh);

    r->x = x3;
    r->y = y3;
    r->z = z3;
    r->infinity = 0;
}
|
||||
|
||||
// Fills table[0..7] by starting from `base` and adding 2*base seven times,
// keeping only x/y of each step. The entries are then rescaled with the
// recorded per-step Z factors so all eight share one implied Z, which is
// written to *globalz.
// NOTE(review): looks like the global-Z odd-multiples table construction;
// assumes `base` is a valid non-infinity point -- confirm against callers.
inline void build_wnaf_table_zr_cl(const AffinePoint* base, AffinePoint table[8], FieldElement* globalz) {
    JacobianPoint p_jac, twice;
    point_from_affine(&p_jac, base);
    point_double_impl(&twice, &p_jac);

    // Z of 2*base, plus its square and cube.
    FieldElement dz = twice.z;
    FieldElement dz2, dz3;
    field_sqr_impl(&dz2, &dz);
    field_mul_impl(&dz3, &dz2, &dz);

    // x/y of 2*base, used as the affine addend.
    AffinePoint step;
    step.x = twice.x;
    step.y = twice.y;

    // base rescaled by dz^2 / dz^3 and given Z = 1.
    JacobianPoint run;
    field_mul_impl(&run.x, &base->x, &dz2);
    field_mul_impl(&run.y, &base->y, &dz3);
    run.z.limbs[0] = 1UL;
    for (int i = 1; i < 4; ++i) run.z.limbs[i] = 0UL;
    run.infinity = 0;

    table[0].x = run.x;
    table[0].y = run.y;

    FieldElement ratio[8];
    ratio[0] = dz;  // stored for symmetry; not read by the rescaling loop
    for (int n = 1; n < 8; ++n) {
        point_add_mixed_h_impl(&run, &run, &step, &ratio[n]);
        table[n].x = run.x;
        table[n].y = run.y;
    }

    field_mul_impl(globalz, &run.z, &dz);

    // Walk backwards: entry n is scaled by ratio[n+1] * ... * ratio[7];
    // entry 7 is left untouched.
    FieldElement scale = ratio[7];
    for (int n = 6; n >= 0; --n) {
        FieldElement s2, s3, nx, ny;
        field_sqr_impl(&s2, &scale);
        field_mul_impl(&s3, &s2, &scale);
        field_mul_impl(&nx, &table[n].x, &s2);
        field_mul_impl(&ny, &table[n].y, &s3);
        table[n].x = nx;
        table[n].y = ny;

        if (n > 0) {
            FieldElement next;
            field_mul_impl(&next, &scale, &ratio[n]);
            scale = next;
        }
    }
}
|
||||
|
||||
// Builds endo_table from table: x is multiplied by the GLV beta constant,
// y is copied, or negated when negate_y is non-zero.
inline void derive_endo_table_cl(const AffinePoint table[8], AffinePoint endo_table[8], int negate_y) {
    FieldElement beta;
    beta.limbs[0] = GLV_BETA0;
    beta.limbs[1] = GLV_BETA1;
    beta.limbs[2] = GLV_BETA2;
    beta.limbs[3] = GLV_BETA3;

    for (int n = 0; n < 8; ++n) {
        field_mul_impl(&endo_table[n].x, &table[n].x, &beta);
        if (!negate_y) {
            endo_table[n].y = table[n].y;
        } else {
            field_neg_impl(&endo_table[n].y, &table[n].y);
        }
    }
}
|
||||
|
||||
// Recodes *k into 130 signed digits, least significant first. Each digit is
// 0 or an odd value in [-15, 15], taken from the low five bits of the running
// value. Bits of *k not consumed after 130 steps are dropped.
static inline void scalar_to_wnaf(const Scalar* k, int wnaf[130]) {
    ulong s[4];
    for (int i = 0; i < 4; i++) s[i] = k->limbs[i];

    for (int pos = 0; pos < 130; pos++) {
        int digit = 0;
        if (s[0] & 1UL) {
            digit = (int)(s[0] & 0x1FUL);
            if (digit < 16) {
                // digit is the low five bits of s[0], so this never borrows.
                s[0] -= (ulong)digit;
            } else {
                // Use the negative representative and add its magnitude back,
                // carrying into the upper limbs on wrap-around.
                digit -= 32;
                const ulong before = s[0];
                s[0] += (ulong)(-digit);
                if (s[0] < before) {
                    for (int j = 1; j < 4; j++) {
                        if (++s[j]) break;
                    }
                }
            }
        }
        wnaf[pos] = digit;

        // 256-bit logical shift right by one.
        s[0] = (s[0] >> 1) | (s[1] << 63);
        s[1] = (s[1] >> 1) | (s[2] << 63);
        s[2] = (s[2] >> 1) | (s[3] << 63);
        s[3] >>= 1;
    }
}
|
||||
|
||||
inline void scalar_mul_glv_cl(JacobianPoint* r, const Scalar* k, const AffinePoint* base) {
|
||||
if (scalar_is_zero_cl(k)) { point_set_infinity(r); return; }
|
||||
|
||||
Scalar k1, k2; int k1_neg, k2_neg;
|
||||
glv_decompose_cl(k, &k1, &k2, &k1_neg, &k2_neg);
|
||||
|
||||
// Two affine bases: P and phi(P) = (beta*P.x, (+/-)P.y)
|
||||
AffinePoint P = *base;
|
||||
if (k1_neg) field_neg_impl(&P.y, &P.y);
|
||||
|
||||
FieldElement beta;
|
||||
beta.limbs[0]=GLV_BETA0; beta.limbs[1]=GLV_BETA1;
|
||||
beta.limbs[2]=GLV_BETA2; beta.limbs[3]=GLV_BETA3;
|
||||
AffinePoint table[8];
|
||||
FieldElement globalz;
|
||||
build_wnaf_table_zr_cl(&P, table, &globalz);
|
||||
|
||||
AffinePoint phi_P;
|
||||
field_mul_impl(&phi_P.x, &P.x, &beta);
|
||||
if (k1_neg != k2_neg) field_neg_impl(&phi_P.y, &P.y);
|
||||
else phi_P.y = P.y;
|
||||
AffinePoint endo_table[8];
|
||||
derive_endo_table_cl(table, endo_table, k1_neg != k2_neg);
|
||||
|
||||
// Find max bit length of k1, k2
|
||||
int bl1 = scalar_bitlen_cl(&k1);
|
||||
int bl2 = scalar_bitlen_cl(&k2);
|
||||
int max_bit = (bl1 > bl2) ? bl1 : bl2;
|
||||
int wnaf1[130] = {0};
|
||||
int wnaf2[130] = {0};
|
||||
scalar_to_wnaf(&k1, wnaf1);
|
||||
scalar_to_wnaf(&k2, wnaf2);
|
||||
|
||||
// Interleaved binary double-and-add with mixed additions
|
||||
point_set_infinity(r);
|
||||
for (int i = max_bit - 1; i >= 0; --i) {
|
||||
for (int i = 129; i >= 0; --i) {
|
||||
if (!point_is_infinity(r)) point_double_impl(r, r);
|
||||
int b1 = (int)((k1.limbs[i >> 6] >> (i & 63)) & 1UL);
|
||||
int b2 = (int)((k2.limbs[i >> 6] >> (i & 63)) & 1UL);
|
||||
if (b1) point_add_mixed_impl(r, r, &P);
|
||||
if (b2) point_add_mixed_impl(r, r, &phi_P);
|
||||
|
||||
int d1 = wnaf1[i];
|
||||
if (d1 != 0) {
|
||||
int idx = (((d1 > 0) ? d1 : -d1) - 1) >> 1;
|
||||
AffinePoint pt = table[idx];
|
||||
if (d1 < 0) field_neg_impl(&pt.y, &pt.y);
|
||||
if (point_is_infinity(r)) point_from_affine(r, &pt);
|
||||
else point_add_mixed_impl(r, r, &pt);
|
||||
}
|
||||
|
||||
int d2 = wnaf2[i];
|
||||
if (d2 != 0) {
|
||||
int idx = (((d2 > 0) ? d2 : -d2) - 1) >> 1;
|
||||
AffinePoint pt = endo_table[idx];
|
||||
if (d2 < 0) field_neg_impl(&pt.y, &pt.y);
|
||||
if (point_is_infinity(r)) point_from_affine(r, &pt);
|
||||
else point_add_mixed_impl(r, r, &pt);
|
||||
}
|
||||
}
|
||||
|
||||
if (!point_is_infinity(r)) {
|
||||
FieldElement corrected_z;
|
||||
field_mul_impl(&corrected_z, &r->z, &globalz);
|
||||
r->z = corrected_z;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1125,40 +1339,41 @@ inline int get_window_4bit(const Scalar* s, int pos) {
|
||||
return (int)(v & 0xFUL);
|
||||
}
|
||||
|
||||
__kernel void scalar_mul_generator(__global const Scalar* scalars, __global JacobianPoint* results, uint count) {
|
||||
uint gid = get_global_id(0); if (gid >= count) return;
|
||||
Scalar k = scalars[gid];
|
||||
JacobianPoint R;
|
||||
if ((k.limbs[0]|k.limbs[1]|k.limbs[2]|k.limbs[3]) == 0) { point_set_infinity(&R); results[gid] = R; return; }
|
||||
inline void scalar_mul_generator_glv_impl(JacobianPoint* r, const Scalar* k) {
|
||||
if ((k->limbs[0]|k->limbs[1]|k->limbs[2]|k->limbs[3]) == 0) {
|
||||
point_set_infinity(r);
|
||||
return;
|
||||
}
|
||||
|
||||
Scalar k1, k2; int k1_neg, k2_neg;
|
||||
glv_decompose_cl(&k, &k1, &k2, &k1_neg, &k2_neg);
|
||||
glv_decompose_cl(k, &k1, &k2, &k1_neg, &k2_neg);
|
||||
|
||||
// Compute actual number of 4-bit windows needed
|
||||
int bl1 = scalar_bitlen_cl(&k1);
|
||||
int bl2 = scalar_bitlen_cl(&k2);
|
||||
int max_bits = (bl1 > bl2) ? bl1 : bl2;
|
||||
int num_windows = (max_bits + 3) / 4;
|
||||
|
||||
point_set_infinity(&R);
|
||||
for (int w = num_windows - 1; w >= 0; --w) {
|
||||
if (!point_is_infinity(&R)) {
|
||||
point_double_impl(&R, &R); point_double_impl(&R, &R);
|
||||
point_double_impl(&R, &R); point_double_impl(&R, &R);
|
||||
point_set_infinity(r);
|
||||
for (int w = 31; w >= 0; --w) {
|
||||
if (!point_is_infinity(r)) {
|
||||
point_double_impl(r, r); point_double_impl(r, r);
|
||||
point_double_impl(r, r); point_double_impl(r, r);
|
||||
}
|
||||
int w1 = get_window_4bit(&k1, w);
|
||||
if (w1) {
|
||||
AffinePoint pt = GENERATOR_TABLE_NIBBLE[w1];
|
||||
if (k1_neg) field_neg_impl(&pt.y, &pt.y);
|
||||
point_add_mixed_impl(&R, &R, &pt);
|
||||
point_add_mixed_impl(r, r, &pt);
|
||||
}
|
||||
int w2 = get_window_4bit(&k2, w);
|
||||
if (w2) {
|
||||
AffinePoint pt = GENERATOR_TABLE_NIBBLE_PHI[w2];
|
||||
if (k2_neg) field_neg_impl(&pt.y, &pt.y);
|
||||
point_add_mixed_impl(&R, &R, &pt);
|
||||
point_add_mixed_impl(r, r, &pt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// One work-item per scalar: results[gid] = scalar_mul_generator_glv_impl(scalars[gid]).
// Work-items with gid >= count do nothing.
__kernel void scalar_mul_generator(__global const Scalar* scalars, __global JacobianPoint* results, uint count) {
    const uint gid = get_global_id(0);
    if (gid < count) {
        Scalar k = scalars[gid];
        JacobianPoint R;
        scalar_mul_generator_glv_impl(&R, &k);
        results[gid] = R;
    }
}
|
||||
|
||||
@ -1278,12 +1493,62 @@ __kernel void affine_add(
|
||||
__global FieldElement* rx, __global FieldElement* ry,
|
||||
const uint count
|
||||
) {
|
||||
#define BATCH_INV_LOCAL_MAX 256
|
||||
__local FieldElement local_h[BATCH_INV_LOCAL_MAX];
|
||||
__local FieldElement local_prefix[BATCH_INV_LOCAL_MAX];
|
||||
__local FieldElement local_h_inv[BATCH_INV_LOCAL_MAX];
|
||||
__local uint local_nonzero[BATCH_INV_LOCAL_MAX];
|
||||
|
||||
uint gid = get_global_id(0);
|
||||
uint lid = get_local_id(0);
|
||||
uint lsize = get_local_size(0);
|
||||
uint group_start = get_group_id(0) * lsize;
|
||||
uint active = (group_start < count) ? min(lsize, count - group_start) : 0;
|
||||
if (gid >= count) return;
|
||||
|
||||
FieldElement lpx = px[gid], lpy = py[gid];
|
||||
FieldElement lqx = qx[gid], lqy = qy[gid];
|
||||
|
||||
if (lsize > BATCH_INV_LOCAL_MAX) {
|
||||
AffinePoint r;
|
||||
affine_add_impl(&r, &lpx, &lpy, &lqx, &lqy);
|
||||
rx[gid] = r.x;
|
||||
ry[gid] = r.y;
|
||||
return;
|
||||
}
|
||||
|
||||
{ FieldElement _t; field_sub_impl(&_t, &lqx, &lpx); local_h[lid] = _t; }
|
||||
{ FieldElement _t = local_h[lid]; local_nonzero[lid] = field_is_zero_impl(&_t) ? 0U : 1U; }
|
||||
if (!local_nonzero[lid]) { FieldElement _t; field_set_one_impl(&_t); local_h[lid] = _t; }
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lid == 0) {
|
||||
FieldElement acc;
|
||||
field_set_one_impl(&acc);
|
||||
|
||||
for (uint i = 0; i < active; ++i) {
|
||||
local_prefix[i] = acc;
|
||||
if (local_nonzero[i]) { FieldElement _t = local_h[i]; field_mul_impl(&acc, &acc, &_t); }
|
||||
}
|
||||
|
||||
field_inv_impl(&acc, &acc);
|
||||
|
||||
for (int i = (int)active - 1; i >= 0; --i) {
|
||||
if (local_nonzero[i]) {
|
||||
FieldElement inv_i;
|
||||
{ FieldElement _t = local_prefix[i]; field_mul_impl(&inv_i, &acc, &_t); }
|
||||
local_h_inv[i] = inv_i;
|
||||
{ FieldElement _t = local_h[i]; field_mul_impl(&acc, &acc, &_t); }
|
||||
} else {
|
||||
FieldElement _t; field_set_zero_impl(&_t); local_h_inv[i] = _t;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
AffinePoint r;
|
||||
affine_add_impl(&r, &lpx, &lpy, &lqx, &lqy);
|
||||
{ FieldElement _hinv = local_h_inv[lid]; affine_add_lambda_impl(&r, &lpx, &lpy, &lqx, &lqy, &_hinv); }
|
||||
rx[gid] = r.x;
|
||||
ry[gid] = r.y;
|
||||
}
|
||||
@ -1330,24 +1595,80 @@ __kernel void jacobian_to_affine(
|
||||
__global FieldElement* ax, __global FieldElement* ay,
|
||||
const uint count
|
||||
) {
|
||||
#define BATCH_INV_LOCAL_MAX 256
|
||||
__local FieldElement local_z[BATCH_INV_LOCAL_MAX];
|
||||
__local FieldElement local_prefix[BATCH_INV_LOCAL_MAX];
|
||||
__local FieldElement local_z_inv[BATCH_INV_LOCAL_MAX];
|
||||
__local uint local_nonzero[BATCH_INV_LOCAL_MAX];
|
||||
|
||||
uint gid = get_global_id(0);
|
||||
uint lid = get_local_id(0);
|
||||
uint lsize = get_local_size(0);
|
||||
uint group_start = get_group_id(0) * lsize;
|
||||
uint active = (group_start < count) ? min(lsize, count - group_start) : 0;
|
||||
if (gid >= count) return;
|
||||
|
||||
FieldElement lx = jx[gid], ly = jy[gid], lz = jz[gid];
|
||||
AffinePoint r;
|
||||
jacobian_to_affine_convert_impl(&r, &lx, &ly, &lz);
|
||||
ax[gid] = r.x;
|
||||
ay[gid] = r.y;
|
||||
|
||||
if (lsize > BATCH_INV_LOCAL_MAX) {
|
||||
AffinePoint r;
|
||||
jacobian_to_affine_convert_impl(&r, &lx, &ly, &lz);
|
||||
ax[gid] = r.x;
|
||||
ay[gid] = r.y;
|
||||
return;
|
||||
}
|
||||
|
||||
local_z[lid] = lz;
|
||||
{ FieldElement _t = local_z[lid]; local_nonzero[lid] = field_is_zero_impl(&_t) ? 0U : 1U; }
|
||||
if (!local_nonzero[lid]) { FieldElement _t; field_set_one_impl(&_t); local_z[lid] = _t; }
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lid == 0) {
|
||||
FieldElement acc;
|
||||
field_set_one_impl(&acc);
|
||||
|
||||
for (uint i = 0; i < active; ++i) {
|
||||
local_prefix[i] = acc;
|
||||
if (local_nonzero[i]) { FieldElement _t = local_z[i]; field_mul_impl(&acc, &acc, &_t); }
|
||||
}
|
||||
|
||||
field_inv_impl(&acc, &acc);
|
||||
|
||||
for (int i = (int)active - 1; i >= 0; --i) {
|
||||
if (local_nonzero[i]) {
|
||||
FieldElement inv_i;
|
||||
{ FieldElement _t = local_prefix[i]; field_mul_impl(&inv_i, &acc, &_t); }
|
||||
local_z_inv[i] = inv_i;
|
||||
{ FieldElement _t = local_z[i]; field_mul_impl(&acc, &acc, &_t); }
|
||||
} else {
|
||||
FieldElement _t; field_set_zero_impl(&_t); local_z_inv[i] = _t;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
FieldElement z_inv2, z_inv3;
|
||||
{ FieldElement _t = local_z_inv[lid]; field_sqr_impl(&z_inv2, &_t); }
|
||||
{ FieldElement _t = local_z_inv[lid]; field_mul_impl(&z_inv3, &z_inv2, &_t); }
|
||||
{ FieldElement _ax; field_mul_impl(&_ax, &lx, &z_inv2); ax[gid] = _ax; }
|
||||
{ FieldElement _ay; field_mul_impl(&_ay, &ly, &z_inv3); ay[gid] = _ay; }
|
||||
}
|
||||
)KERNEL";
|
||||
)KERNEL" };
|
||||
|
||||
bool Context::Impl::build_program() {
|
||||
cl_int err;
|
||||
|
||||
// Create program from source
|
||||
const char* sources[] = {kernel_source};
|
||||
std::size_t lengths[] = {std::strlen(kernel_source)};
|
||||
// Create program from source (multiple parts avoid MSVC C2026 limit)
|
||||
constexpr cl_uint num_parts = sizeof(kernel_parts) / sizeof(kernel_parts[0]);
|
||||
const char* sources[num_parts];
|
||||
std::size_t lengths[num_parts];
|
||||
for (cl_uint i = 0; i < num_parts; ++i) {
|
||||
sources[i] = kernel_parts[i];
|
||||
lengths[i] = std::strlen(kernel_parts[i]);
|
||||
}
|
||||
|
||||
program = clCreateProgramWithSource(context, 1, sources, lengths, &err);
|
||||
program = clCreateProgramWithSource(context, num_parts, sources, lengths, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
last_error = std::string("Failed to create program: ") + cl_error_string(err);
|
||||
return false;
|
||||
@ -1730,6 +2051,14 @@ static void compute_work_sizes(std::size_t count, std::size_t max_wg, std::size_
|
||||
global = ((count + local - 1) / local) * local;
|
||||
}
|
||||
|
||||
// Picks the local and global work sizes for the scalar-mul kernels.
//
//   count            number of work-items that have real work
//   requested_local  caller-configured local size; 0 means "use auto_local"
//   auto_local       tuned default local size
//   max_wg           device maximum work-group size
//   local  (out)     chosen local size, clamped to [1, max_wg]
//   global (out)     count rounded up to a multiple of local
//
// local is never allowed to be 0: a zero max_wg (device info not populated)
// or a zero auto_local would otherwise divide by zero below.
static void compute_scalar_mul_work_sizes(std::size_t count, std::size_t requested_local,
                                          std::size_t auto_local, std::size_t max_wg,
                                          std::size_t& local, std::size_t& global) {
    const std::size_t preferred = (requested_local == 0) ? auto_local : requested_local;
    local = std::max<std::size_t>(1, std::min(preferred, max_wg));
    global = ((count + local - 1) / local) * local;
}
|
||||
|
||||
void Context::batch_field_add(const FieldElement* a, const FieldElement* b,
|
||||
FieldElement* results, std::size_t count) {
|
||||
if (count == 0) return;
|
||||
@ -1882,7 +2211,6 @@ void Context::batch_scalar_mul_generator(const Scalar* scalars, JacobianPoint* r
|
||||
impl_->cache_smg_count = count;
|
||||
}
|
||||
|
||||
// Upload scalars (async)
|
||||
clEnqueueWriteBuffer(impl_->queue, impl_->cache_smg_scalars, CL_FALSE, 0,
|
||||
count * sizeof(Scalar), scalars, 0, nullptr, nullptr);
|
||||
|
||||
@ -1891,13 +2219,11 @@ void Context::batch_scalar_mul_generator(const Scalar* scalars, JacobianPoint* r
|
||||
clSetKernelArg(impl_->kernel_scalar_mul_generator, 1, sizeof(cl_mem), &impl_->cache_smg_results);
|
||||
clSetKernelArg(impl_->kernel_scalar_mul_generator, 2, sizeof(cl_uint), &cnt);
|
||||
|
||||
// Calculate work group size
|
||||
std::size_t local_size = impl_->config.local_work_size;
|
||||
if (local_size == 0 || local_size > impl_->device_info.max_work_group_size) {
|
||||
local_size = std::min(static_cast<std::size_t>(256), impl_->device_info.max_work_group_size);
|
||||
}
|
||||
|
||||
std::size_t global_size = ((count + local_size - 1) / local_size) * local_size;
|
||||
std::size_t local_size, global_size;
|
||||
compute_scalar_mul_work_sizes(count, impl_->config.local_work_size,
|
||||
128,
|
||||
impl_->device_info.max_work_group_size,
|
||||
local_size, global_size);
|
||||
|
||||
clEnqueueNDRangeKernel(impl_->queue, impl_->kernel_scalar_mul_generator, 1, nullptr,
|
||||
&global_size, &local_size, 0, nullptr, nullptr);
|
||||
@ -1925,10 +2251,11 @@ void Context::batch_scalar_mul(const Scalar* scalars, const AffinePoint* points,
|
||||
impl_->cache_sm_count = count;
|
||||
}
|
||||
|
||||
clEnqueueWriteBuffer(impl_->queue, impl_->cache_sm_points, CL_TRUE, 0,
|
||||
count * sizeof(AffinePoint), points, 0, nullptr, nullptr);
|
||||
clEnqueueWriteBuffer(impl_->queue, impl_->cache_sm_scalars, CL_FALSE, 0,
|
||||
count * sizeof(Scalar), scalars, 0, nullptr, nullptr);
|
||||
clEnqueueWriteBuffer(impl_->queue, impl_->cache_sm_points, CL_FALSE, 0,
|
||||
count * sizeof(AffinePoint), points, 0, nullptr, nullptr);
|
||||
clFlush(impl_->queue);
|
||||
|
||||
cl_uint cnt = static_cast<cl_uint>(count);
|
||||
clSetKernelArg(impl_->kernel_scalar_mul, 0, sizeof(cl_mem), &impl_->cache_sm_scalars);
|
||||
@ -1936,8 +2263,11 @@ void Context::batch_scalar_mul(const Scalar* scalars, const AffinePoint* points,
|
||||
clSetKernelArg(impl_->kernel_scalar_mul, 2, sizeof(cl_mem), &impl_->cache_sm_results);
|
||||
clSetKernelArg(impl_->kernel_scalar_mul, 3, sizeof(cl_uint), &cnt);
|
||||
|
||||
std::size_t local_size = std::min(static_cast<std::size_t>(256), impl_->device_info.max_work_group_size);
|
||||
std::size_t global_size = ((count + local_size - 1) / local_size) * local_size;
|
||||
std::size_t local_size, global_size;
|
||||
compute_scalar_mul_work_sizes(count, impl_->config.local_work_size,
|
||||
128,
|
||||
impl_->device_info.max_work_group_size,
|
||||
local_size, global_size);
|
||||
|
||||
clEnqueueNDRangeKernel(impl_->queue, impl_->kernel_scalar_mul, 1, nullptr,
|
||||
&global_size, &local_size, 0, nullptr, nullptr);
|
||||
@ -2043,6 +2373,8 @@ void* Context::native_kernel(const char* name) const {
|
||||
if (n == "point_add") return impl_->kernel_point_add;
|
||||
if (n == "scalar_mul") return impl_->kernel_scalar_mul;
|
||||
if (n == "scalar_mul_generator") return impl_->kernel_scalar_mul_generator;
|
||||
if (n == "batch_jacobian_to_affine") return impl_->kernel_batch_jacobian_to_affine;
|
||||
if (n == "batch_jacobian_to_affine_kernel") return impl_->kernel_batch_jacobian_to_affine;
|
||||
if (n == "affine_add") return impl_->kernel_affine_add;
|
||||
if (n == "affine_add_lambda") return impl_->kernel_affine_add_lambda;
|
||||
if (n == "affine_add_x_only") return impl_->kernel_affine_add_x_only;
|
||||
|
||||
@ -1128,6 +1128,133 @@ bool selftest(bool verbose, int platform_id, int device_id) {
|
||||
if (verbose) SELFTEST_PRINT(pass ? " PASS\n" : " FAIL\n");
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// Test 41: BIP-352 SCAN_KEY smoke — large 256-bit scalar, must not be infinity
|
||||
// Regression: verifies that scalar_mul_generator handles a real-world key
|
||||
// that stresses the GLV decomposition path (both half-scalars non-trivial).
|
||||
// ==========================================================================
|
||||
{
|
||||
total++;
|
||||
if (verbose) SELFTEST_PRINT("\nBIP-352 SCAN_KEY k*G smoke (not infinity):\n");
|
||||
bool pass = true;
|
||||
|
||||
// SCAN_KEY used in bench_bip352_opencl — 256-bit, both GLV halves non-zero
|
||||
Scalar k_scan = scalar_from_hex(
|
||||
"c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42");
|
||||
JacobianPoint P = ctx->scalar_mul_generator(k_scan);
|
||||
AffinePoint Pa = jacobian_to_affine(P);
|
||||
// Sanity: x-coordinate must be non-zero (point at infinity has x=0)
|
||||
if ((Pa.x.limbs[0] | Pa.x.limbs[1] | Pa.x.limbs[2] | Pa.x.limbs[3]) == 0) {
|
||||
if (verbose) SELFTEST_PRINT(" FAIL: SCAN_KEY * G produced x=0 (infinity)\n");
|
||||
pass = false;
|
||||
}
|
||||
|
||||
if (pass) passed++;
|
||||
if (verbose) SELFTEST_PRINT(pass ? " PASS\n" : " FAIL\n");
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// Test 42: GLV large scalar consistency — k*G + G = (k+1)*G for SCAN_KEY
|
||||
// Checks that GLV decomposition is correct for a full 256-bit key by
|
||||
// cross-checking with the additive property: (k+1)*G = k*G + 1*G.
|
||||
// ==========================================================================
|
||||
{
|
||||
total++;
|
||||
if (verbose) SELFTEST_PRINT("\nGLV large scalar consistency: k*G + G = (k+1)*G:\n");
|
||||
bool pass = true;
|
||||
|
||||
Scalar k = scalar_from_hex(
|
||||
"c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a42");
|
||||
Scalar kp1 = scalar_from_hex(
|
||||
"c4239fd6fc3db6e22b8bed6a49219e4e30d7d6a3b98294b138af4ad300da1a43");
|
||||
Scalar one = scalar_from_u64(1);
|
||||
|
||||
JacobianPoint kG = ctx->scalar_mul_generator(k);
|
||||
JacobianPoint oneG = ctx->scalar_mul_generator(one);
|
||||
JacobianPoint kp1_a = ctx->point_add(kG, oneG); // k*G + G
|
||||
JacobianPoint kp1_b = ctx->scalar_mul_generator(kp1); // (k+1)*G
|
||||
|
||||
AffinePoint a = jacobian_to_affine(kp1_a);
|
||||
AffinePoint b = jacobian_to_affine(kp1_b);
|
||||
if (field_to_hex(a.x) != field_to_hex(b.x) || field_to_hex(a.y) != field_to_hex(b.y)) {
|
||||
if (verbose) {
|
||||
SELFTEST_PRINT(" FAIL: k*G + G != (k+1)*G\n");
|
||||
SELFTEST_PRINT(" k*G+G x: %s\n", field_to_hex(a.x).c_str());
|
||||
SELFTEST_PRINT(" (k+1)G x: %s\n", field_to_hex(b.x).c_str());
|
||||
}
|
||||
pass = false;
|
||||
}
|
||||
|
||||
if (pass) passed++;
|
||||
if (verbose) SELFTEST_PRINT(pass ? " PASS\n" : " FAIL\n");
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// Test 43: GLV 2^128 boundary — (2^128)*G + G = (2^128+1)*G
|
||||
// The GLV decomposition boundary sits near 2^128; a scalar k = 2^128
|
||||
// forces the high half of the GLV decomposition to be active. Regression
|
||||
// for any off-by-one in the half-scalar split.
|
||||
// ==========================================================================
|
||||
{
|
||||
total++;
|
||||
if (verbose) SELFTEST_PRINT("\nGLV 2^128 boundary: 2^128*G + G = (2^128+1)*G:\n");
|
||||
bool pass = true;
|
||||
|
||||
// k = 2^128: limbs[2]=1 (little-endian), others=0
|
||||
Scalar k_128 = {{0UL, 0UL, 1UL, 0UL}};
|
||||
Scalar k_128p = {{1UL, 0UL, 1UL, 0UL}}; // 2^128 + 1
|
||||
Scalar one = scalar_from_u64(1);
|
||||
|
||||
JacobianPoint kG = ctx->scalar_mul_generator(k_128);
|
||||
JacobianPoint oneG = ctx->scalar_mul_generator(one);
|
||||
JacobianPoint kp1_a = ctx->point_add(kG, oneG);
|
||||
JacobianPoint kp1_b = ctx->scalar_mul_generator(k_128p);
|
||||
|
||||
AffinePoint a = jacobian_to_affine(kp1_a);
|
||||
AffinePoint b = jacobian_to_affine(kp1_b);
|
||||
if (field_to_hex(a.x) != field_to_hex(b.x) || field_to_hex(a.y) != field_to_hex(b.y)) {
|
||||
if (verbose) SELFTEST_PRINT(" FAIL: 2^128*G + G != (2^128+1)*G\n");
|
||||
pass = false;
|
||||
}
|
||||
|
||||
if (pass) passed++;
|
||||
if (verbose) SELFTEST_PRINT(pass ? " PASS\n" : " FAIL\n");
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// Test 44: wNAF alternating-bit stress — 0x5555...*G + G = 0x5556...*G
|
||||
// Alternating 0101... bits maximally stress wNAF digit selection:
|
||||
// every bit triggers a non-adjacent form carry/borrow. Catches bugs in
|
||||
// the w=5 wNAF encoder that surface only with specific bit patterns.
|
||||
// ==========================================================================
|
||||
{
|
||||
total++;
|
||||
if (verbose) SELFTEST_PRINT("\nwNAF alternating-bit stress: 0x5555...*G + G:\n");
|
||||
bool pass = true;
|
||||
|
||||
// k = 0x5555555555555555 * 4 limbs = repeating 01 bits in every position
|
||||
Scalar k_alt = {{0x5555555555555555ULL, 0x5555555555555555ULL,
|
||||
0x5555555555555555ULL, 0x5555555555555555ULL}};
|
||||
Scalar k_altp = {{0x5555555555555556ULL, 0x5555555555555555ULL,
|
||||
0x5555555555555555ULL, 0x5555555555555555ULL}};
|
||||
Scalar one = scalar_from_u64(1);
|
||||
|
||||
JacobianPoint kG = ctx->scalar_mul_generator(k_alt);
|
||||
JacobianPoint oneG = ctx->scalar_mul_generator(one);
|
||||
JacobianPoint kp1_a = ctx->point_add(kG, oneG);
|
||||
JacobianPoint kp1_b = ctx->scalar_mul_generator(k_altp);
|
||||
|
||||
AffinePoint a = jacobian_to_affine(kp1_a);
|
||||
AffinePoint b = jacobian_to_affine(kp1_b);
|
||||
if (field_to_hex(a.x) != field_to_hex(b.x) || field_to_hex(a.y) != field_to_hex(b.y)) {
|
||||
if (verbose) SELFTEST_PRINT(" FAIL: 0x5555...*G + G != 0x5556...*G\n");
|
||||
pass = false;
|
||||
}
|
||||
|
||||
if (pass) passed++;
|
||||
if (verbose) SELFTEST_PRINT(pass ? " PASS\n" : " FAIL\n");
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// Test 40: Distributive k*(P+Q) = k*P + k*Q
|
||||
// ==========================================================================
|
||||
|
||||
BIN
tools/source_graph_kit/source_graph.db
Normal file
BIN
tools/source_graph_kit/source_graph.db
Normal file
Binary file not shown.
9217
tools/source_graph_kit/source_graph.py
Normal file
9217
tools/source_graph_kit/source_graph.py
Normal file
File diff suppressed because it is too large
Load Diff
51
tools/source_graph_kit/source_graph.toml
Normal file
51
tools/source_graph_kit/source_graph.toml
Normal file
@ -0,0 +1,51 @@
|
||||
[project]
|
||||
name = "UltrafastSecp256k1"
|
||||
language = "cpp"
|
||||
|
||||
[[source_dirs]]
|
||||
label = "cpu"
|
||||
path = "../cpu"
|
||||
|
||||
[[source_dirs]]
|
||||
label = "include"
|
||||
path = "../include"
|
||||
|
||||
[[source_dirs]]
|
||||
label = "audit"
|
||||
path = "../audit"
|
||||
optional = true
|
||||
|
||||
[[source_dirs]]
|
||||
label = "benchmarks"
|
||||
path = "../benchmarks"
|
||||
optional = true
|
||||
|
||||
[[source_dirs]]
|
||||
label = "cuda"
|
||||
path = "../cuda"
|
||||
optional = true
|
||||
|
||||
[[source_dirs]]
|
||||
label = "examples"
|
||||
path = "../examples"
|
||||
optional = true
|
||||
|
||||
[[source_dirs]]
|
||||
label = "gpu"
|
||||
path = "../gpu"
|
||||
optional = true
|
||||
|
||||
[[source_dirs]]
|
||||
label = "metal"
|
||||
path = "../metal"
|
||||
optional = true
|
||||
|
||||
[[source_dirs]]
|
||||
label = "opencl"
|
||||
path = "../opencl"
|
||||
optional = true
|
||||
|
||||
[[source_dirs]]
|
||||
label = "tests"
|
||||
path = "../tests"
|
||||
optional = true
|
||||
Loading…
Reference in New Issue
Block a user