fix: tech debt batch — zero-safe batch inverse, 4-stream WNAF sign, OpenMP, MuSig2 validation
1. fe_batch_inverse: handle zero inputs gracefully by substituting ones during forward accumulation and restoring zeros in output. Prevents undefined behavior when callers pass zero-valued field elements. Added test_batch_inverse_zero_safe covering mixed, all-zero, and single-zero cases. (CT paths in ct_point.cpp unchanged — documented preconditions only.) 2. 4-stream WNAF (ESP32/STM32): fixed phi(G) sign — use k2_neg directly instead of k1_neg XOR k2_neg. G tables are precomputed without any sign baked in, unlike P tables where k1_neg is absorbed into P_base. Re-enabled the previously disabled code path. 3. OpenMP: added conditional OpenMP support for fe_h_based_inversion_batched. find_package(OpenMP QUIET) in CMakeLists.txt with ESP32/WASM exclusion. Static libgomp.a resolution for ARM64 cross-compilation. 4. MuSig2 key aggregation: validate ALL pubkeys upfront before computing anything. Previously, invalid pubkeys were silently skipped via continue, enabling potential rogue key attacks. Now returns empty ctx (Q=infinity) if any pubkey is invalid. Tested on x86_64 (25/25), ARM64 RK3588 (25/25), RISC-V VisionFive2 (25/25). No benchmark regressions detected.
This commit is contained in:
parent
00fae17cb2
commit
2f3051282c
498
.ci-baseline/baseline_arm64_pre_techdebt_20260321.txt
Normal file
498
.ci-baseline/baseline_arm64_pre_techdebt_20260321.txt
Normal file
@ -0,0 +1,498 @@
|
||||
CPU frequency warmup (3000 ms heavy load)... done (43998 k*G ops)
|
||||
Running integrity check... OK
|
||||
|
||||
======================================================================
|
||||
UltrafastSecp256k1 -- Unified Apple-to-Apple Benchmark
|
||||
======================================================================
|
||||
|
||||
CPU: AArch64
|
||||
Core: 1 (pinned to core 0, priority elevated)
|
||||
Compiler: GCC 13.3.0
|
||||
Arch: ARM64 (AArch64)
|
||||
Ultra: UltrafastSecp256k1
|
||||
libsecp: bitcoin-core libsecp256k1 v0.7.x
|
||||
Harness: 3s CPU ramp-up, 500 warmup/op, 11 passes, IQR outlier removal, median
|
||||
Timer: chrono::high_resolution_clock
|
||||
Pool: 64 independent key/msg/sig sets
|
||||
NOTE: Both Ultra and libsecp use IDENTICAL harness
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| FIELD ARITHMETIC (Ultra) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| field_mul | 288.1 |
|
||||
| field_sqr | 259.2 |
|
||||
| field_inv | 4538.3 |
|
||||
| field_add | 24.0 |
|
||||
| field_sub | 21.7 |
|
||||
| field_negate | 15.6 |
|
||||
| field_from_bytes (32B) | 15.1 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| SCALAR ARITHMETIC (Ultra) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| scalar_mul | 295.1 |
|
||||
| scalar_inv | 4622.9 |
|
||||
| scalar_add | 29.6 |
|
||||
| scalar_negate | 16.7 |
|
||||
| scalar_from_bytes (32B) | 15.1 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| POINT ARITHMETIC (Ultra) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| pubkey_create (k*G) | 73616.7 |
|
||||
| scalar_mul (k*P) | 396233.9 |
|
||||
| scalar_mul_with_plan | 397301.4 |
|
||||
| dual_mul (a*G + b*P) | 433852.5 |
|
||||
| point_add (affine+affine) | 10631.5 |
|
||||
| point_add (J+A mixed) | 2849.3 |
|
||||
| point_dbl | 1529.6 |
|
||||
| normalize (J->affine) | 36.3 |
|
||||
| batch_normalize /pt (N=64) | 44.9 |
|
||||
| next_inplace (+=G) | 2907.6 |
|
||||
| KPlan::from_scalar(w=4) | 10820.3 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| POINT SERIALIZATION (Ultra) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| to_compressed (33B) | 36.3 |
|
||||
| to_uncompressed (65B) | 52.1 |
|
||||
| x_only_bytes (32B) | 30.7 |
|
||||
| x_bytes_and_parity | 44.7 |
|
||||
| has_even_y | 19.5 |
|
||||
| batch_to_compressed /pt (N=64) | 18.7 |
|
||||
| batch_x_only_bytes /pt (N=64) | 15.8 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| ECDSA -- Ultra FAST | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| ecdsa_sign | 99722.6 |
|
||||
| ecdsa_sign_verified | 773269.7 |
|
||||
| ecdsa_verify | 454391.6 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| SCHNORR / BIP-340 -- Ultra FAST | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| schnorr_keypair_create | 73654.0 |
|
||||
| schnorr_sign | 76502.4 |
|
||||
| schnorr_sign_verified | 555055.7 |
|
||||
| schnorr_verify (cached xonly) | 455459.7 |
|
||||
| schnorr_verify (raw bytes) | 477054.1 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| MICRO-DIAGNOSTICS (sub-ops) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| Scalar::from_bytes (32B->scalar) | 15.1 |
|
||||
| Scalar::inverse (safegcd) | 4625.8 |
|
||||
| Scalar::mul | 294.8 |
|
||||
| Scalar::negate | 16.7 |
|
||||
| glv_decompose | 1259.2 |
|
||||
| Point::dbl (jac52_double) | 1388.1 |
|
||||
| Point::add (J+A mixed) | 2849.8 |
|
||||
| dual_scalar_mul_gen_point | 446939.5 |
|
||||
| FE52::from_4x64_limbs | 7.8 |
|
||||
| FE52::mul (52-bit) | 288.9 |
|
||||
| FE52::sqr (52-bit) | 246.8 |
|
||||
| FE52::inverse_safegcd | 4507.7 |
|
||||
| FE52::inverse (Fermat) | 59663.3 |
|
||||
| -> SafeGCD/Fermat speedup | 13.24x |
|
||||
| FE52::add (52-bit) | 11.7 |
|
||||
| FE52::negate (52-bit) | 10.6 |
|
||||
| FE52::normalize | 29.0 |
|
||||
| SHA256 (BIP0340/challenge) | 1817.5 |
|
||||
| tagged_hash (recompute tag) | 3249.7 |
|
||||
| cached_tagged_hash (midstate) | 1634.2 |
|
||||
| -> midstate speedup | 1.99x |
|
||||
| lift_x (4x64 sqrt) | 62443.7 |
|
||||
| lift_x (FE52 sqrt) | 52185.6 |
|
||||
| -> FE52/4x64 speedup | 1.20x |
|
||||
| FE::parse_bytes_strict | 27.9 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
---- VERIFY COST DECOMPOSITION ----
|
||||
ECDSA verify breakdown (estimated):
|
||||
scalar_inv (1x): 4625.8 ns
|
||||
scalar_mul (2x): 589.6 ns
|
||||
dual_scalar_mul: 446939.5 ns
|
||||
from_bytes + overhead: 15.1 ns
|
||||
--------------------------------
|
||||
SUM (sub-ops): 452170.0 ns
|
||||
MEASURED ecdsa_verify: 454391.6 ns
|
||||
UNEXPLAINED gap: 2221.6 ns (0.5%)
|
||||
|
||||
Schnorr verify breakdown (estimated):
|
||||
SHA256 challenge: (included in total)
|
||||
scalar_negate: 16.7 ns
|
||||
dual_scalar_mul: 446939.5 ns
|
||||
lift_x (sqrt): (included in total)
|
||||
from_bytes: 15.1 ns
|
||||
--------------------------------
|
||||
SUM (sub-ops, partial): 446971.3 ns
|
||||
MEASURED schnorr_verify: 455459.7 ns
|
||||
UNEXPLAINED gap: 8488.4 ns (SHA256+lift_x+Z-check)
|
||||
|
||||
Verify vs libsecp breakdown:
|
||||
Our dual_mul: 446939.5 ns
|
||||
Our scalar_inv: 4625.8 ns
|
||||
Our dual+inv: 451565.4 ns
|
||||
Total ECDSA verify: 454391.6 ns
|
||||
Overhead (verify - d+i): 2826.3 ns
|
||||
|
||||
---- SIGN COST DECOMPOSITION (FAST path) ----
|
||||
ecdsa_sign = RFC6979 + k*G + field_inv + scalar_inv + scalar_muls
|
||||
k*G (generator_mul): 73616.7 ns
|
||||
field_inv (R.x): 4538.3 ns
|
||||
scalar_inv (k^-1): 4625.8 ns
|
||||
scalar_mul (2x): 589.6 ns
|
||||
--------------------------------
|
||||
Core signing (no RFC6979): 83370.5 ns
|
||||
MEASURED ecdsa_sign: 99722.6 ns
|
||||
RFC6979 overhead: 16352.1 ns (16.4%)
|
||||
MEASURED ecdsa_sign_verified:773269.7 ns
|
||||
sign-then-verify overhead: 673547.1 ns (pubkey + verify)
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| BATCH VERIFICATION (FAST) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| schnorr_batch_verify(N=4) | 1851052.5 |
|
||||
| -> per-sig amortized (N=4) | 462763.1 |
|
||||
| -> speedup vs individual | 0.98x |
|
||||
| schnorr_batch_verify(cached,N=4) | 1815740.4 |
|
||||
| -> per-sig cached (N=4) | 453935.1 |
|
||||
| -> cached speedup vs individual | 1.00x |
|
||||
| schnorr_batch_verify(N=16) | 7459975.1 |
|
||||
| -> per-sig amortized (N=16) | 466248.4 |
|
||||
| -> speedup vs individual | 0.98x |
|
||||
| schnorr_batch_verify(cached,N=16) | 7322709.5 |
|
||||
| -> per-sig cached (N=16) | 457669.3 |
|
||||
| -> cached speedup vs individual | 1.00x |
|
||||
| schnorr_batch_verify(N=64) | 30376681.0 |
|
||||
| -> per-sig amortized (N=64) | 474635.6 |
|
||||
| -> speedup vs individual | 0.96x |
|
||||
| schnorr_batch_verify(cached,N=64) | 29319508.8 |
|
||||
| -> per-sig cached (N=64) | 458117.3 |
|
||||
| -> cached speedup vs individual | 0.99x |
|
||||
| schnorr_batch_verify(N=128) | 57073083.9 |
|
||||
| -> per-sig amortized (N=128) | 445883.5 |
|
||||
| -> speedup vs individual | 1.02x |
|
||||
| schnorr_batch_verify(cached,N=128) | 56030535.9 |
|
||||
| -> per-sig cached (N=128) | 437738.6 |
|
||||
| -> cached speedup vs individual | 1.04x |
|
||||
| schnorr_batch_verify(N=192) | 76937171.1 |
|
||||
| -> per-sig amortized (N=192) | 400714.4 |
|
||||
| -> speedup vs individual | 1.14x |
|
||||
| schnorr_batch_verify(cached,N=192) | 75869566.0 |
|
||||
| -> per-sig cached (N=192) | 395154.0 |
|
||||
| -> cached speedup vs individual | 1.15x |
|
||||
| | |
|
||||
| ecdsa_batch_verify(N=4) | 1794253.3 |
|
||||
| -> per-sig amortized (N=4) | 448563.3 |
|
||||
| -> speedup vs individual | 1.01x |
|
||||
| ecdsa_batch_verify(N=16) | 7189903.4 |
|
||||
| -> per-sig amortized (N=16) | 449369.0 |
|
||||
| -> speedup vs individual | 1.01x |
|
||||
| ecdsa_batch_verify(N=64) | 28653619.1 |
|
||||
| -> per-sig amortized (N=64) | 447712.8 |
|
||||
| -> speedup vs individual | 1.01x |
|
||||
| ecdsa_batch_verify(N=128) | 57303537.0 |
|
||||
| -> per-sig amortized (N=128) | 447683.9 |
|
||||
| -> speedup vs individual | 1.01x |
|
||||
| ecdsa_batch_verify(N=192) | 85917087.0 |
|
||||
| -> per-sig amortized (N=192) | 447484.8 |
|
||||
| -> speedup vs individual | 1.02x |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| CT POINT ARITHMETIC (sub-ops) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| ct::scalar_inverse (SafeGCD) | 21112.0 |
|
||||
| ct::generator_mul (k*G) | 211478.8 |
|
||||
| ct::scalar_mul (k*P) | 477302.0 |
|
||||
| ct::point_dbl | 1520.1 |
|
||||
| ct::point_add_complete (11M+6S) | 4316.7 |
|
||||
| ct::point_add_mixed_complete (7M+5S) | 2903.5 |
|
||||
| ct::point_add_mixed_unified (7M+5S) | 2876.4 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
---- CT vs FAST point ops ----
|
||||
FAST Point::dbl 1388.1 ns
|
||||
FAST Point::add 2849.8 ns
|
||||
FAST pubkey_create (k*G) 73616.7 ns
|
||||
FAST scalar_mul (k*P) 396233.9 ns
|
||||
CT generator_mul (k*G) 211478.8 ns
|
||||
CT scalar_mul (k*P) 477302.0 ns
|
||||
CT/FAST ratio (k*G): 2.87x overhead
|
||||
CT/FAST ratio (k*P): 1.20x overhead
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| CT SIGNING (Ultra CT) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| ct::ecdsa_sign | 246601.9 |
|
||||
| CT overhead (ECDSA) | 2.47x |
|
||||
| ct::ecdsa_sign_verified | 931057.8 |
|
||||
| ct::schnorr_sign | 233444.2 |
|
||||
| CT overhead (Schnorr) | 3.05x |
|
||||
| ct::schnorr_sign_verified | 714612.6 |
|
||||
| ct::schnorr_keypair_create | 217165.7 |
|
||||
| CT overhead (keypair) | 2.95x |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
---- CT ECDSA SIGN DECOMPOSITION ----
|
||||
ct::generator_mul (R=k*G): 211478.8 ns
|
||||
ct::scalar_inverse (k^-1): 21112.0 ns
|
||||
field_inv (R.x affine): 4538.3 ns
|
||||
scalar_mul (2x): 589.6 ns
|
||||
--------------------------------
|
||||
SUM (sub-ops): 237718.7 ns
|
||||
MEASURED ct::ecdsa_sign: 246601.9 ns
|
||||
UNEXPLAINED gap: 8883.1 ns (3.6%, RFC6979+checks)
|
||||
|
||||
---- CT SCHNORR SIGN DECOMPOSITION ----
|
||||
ct::generator_mul (R=k*G): 211478.8 ns
|
||||
SHA256 (tag+nonce+msg): (included in total)
|
||||
scalar_mul + negate: 311.5 ns
|
||||
--------------------------------
|
||||
SUM (sub-ops, partial): 211790.3 ns
|
||||
MEASURED ct::schnorr_sign: 233444.2 ns
|
||||
UNEXPLAINED gap: 21653.9 ns (SHA256+aux+serialize)
|
||||
|
||||
---- CT vs libsecp (true apples-to-apples) ----
|
||||
CT ecdsa_sign 246601.9 ns
|
||||
lib ecdsa_sign (measured after libsecp section)
|
||||
CT schnorr_sign 233444.2 ns
|
||||
lib schnorr_sign (measured after libsecp section)
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| ETHEREUM OPERATIONS | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| keccak256 (32B) | 2624.8 |
|
||||
| ethereum_address | 2644.0 |
|
||||
| eip191_hash | 1323.0 |
|
||||
| eth_sign_hash | 99923.3 |
|
||||
| ecdsa_sign_recoverable | 86545.1 |
|
||||
| ecrecover | 533539.5 |
|
||||
| eth_personal_sign | 101517.5 |
|
||||
| ethereum_address_eip55 | 6688.3 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| REAL-WORLD FLOWS | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| ecdh_compute (SHA256 shared secret) | 483352.4 |
|
||||
| ecdh_compute_raw (x-only shared) | 482554.4 |
|
||||
| taproot_output_key (BIP-341 key path) | 71982.8 |
|
||||
| taproot_tweak_privkey (BIP-341) | 116835.8 |
|
||||
| bip32_master_key (64B seed) | 6024.7 |
|
||||
| bip32_coin_derive_key (BTC m/84'/0'/0'/0/0) | 820495.7 |
|
||||
| coin_address_from_seed (BTC end-to-end) | 945466.2 |
|
||||
| coin_address_from_seed (ETH end-to-end) | 942422.3 |
|
||||
| silent_payment_create_output | 280357.0 |
|
||||
| silent_payment_scan (single output set) | 398169.4 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
Running libsecp256k1 benchmark (same harness: RDTSCP, 3s ramp-up, 500 warmup, 11 passes, IQR)...
|
||||
+----------------------------------------------+------------+
|
||||
| libsecp256k1 (bitcoin-core) | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| field_mul | 190.8 |
|
||||
| field_sqr | 144.5 |
|
||||
| field_inv_var | 4065.8 |
|
||||
| field_add | 65.3 |
|
||||
| field_negate | 59.2 |
|
||||
| field_normalize | 75.9 |
|
||||
| field_from_bytes (set_b32) | 72.5 |
|
||||
| scalar_mul | 246.6 |
|
||||
| scalar_inverse (CT) | 8222.1 |
|
||||
| scalar_inverse_var | 4165.0 |
|
||||
| scalar_add | 72.0 |
|
||||
| scalar_negate | 61.9 |
|
||||
| scalar_from_bytes (set_b32) | 68.6 |
|
||||
| point_dbl (gej_double_var) | 923.4 |
|
||||
| point_add (gej_add_ge_var) | 1515.6 |
|
||||
| ecmult (a*P + b*G, Strauss) | 231686.6 |
|
||||
| ecmult_gen (k*G, comb) | 103213.8 |
|
||||
| generator_mul (ec_pubkey_create) | 111912.5 |
|
||||
| scalar_mul_P (k*P, tweak_mul) | 212644.3 |
|
||||
| serialize_compressed (33B) | 174.5 |
|
||||
| serialize_uncompressed (65B) | 245.1 |
|
||||
| point_add (pubkey_combine) | 12021.3 |
|
||||
| ecdsa_sign | 135044.6 |
|
||||
| ecdsa_verify | 238623.0 |
|
||||
| schnorr_keypair_create | 111976.7 |
|
||||
| schnorr_sign (BIP-340) | 115792.8 |
|
||||
| schnorr_verify (BIP-340) | 239774.5 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
OpenSSL: not linked (rebuild with -DBENCH_HAS_OPENSSL or install libssl-dev)
|
||||
|
||||
======================================================================
|
||||
HEAD-TO-HEAD: UltrafastSecp256k1 vs libsecp256k1
|
||||
(ratio > 1.0 = Ultra wins, < 1.0 = libsecp wins)
|
||||
======================================================================
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| FIELD ARITHMETIC | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| mul | 288.1 | 190.8 | 0.66x |
|
||||
| sqr | 259.2 | 144.5 | 0.56x |
|
||||
| inv | 4538.3 | 4065.8 | 0.90x |
|
||||
| add | 24.0 | 65.3 | 2.72x |
|
||||
| sub | 21.7 | --- | --- |
|
||||
| negate | 15.6 | 59.2 | 3.79x |
|
||||
| normalize (FE52) | 29.0 | 75.9 | 2.62x |
|
||||
| from_bytes (32B) | 15.1 | 72.5 | 4.82x |
|
||||
| FE52 add (hot path) | 11.7 | 65.3 | 5.58x |
|
||||
| FE52 neg (hot path) | 10.6 | 59.2 | 5.58x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| SCALAR ARITHMETIC | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| mul | 295.1 | 246.6 | 0.84x |
|
||||
| inv (CT) | 4625.8 | 8222.1 | 1.78x |
|
||||
| inv (var-time) | 4625.8 | 4165.0 | 0.90x |
|
||||
| add | 29.6 | 72.0 | 2.43x |
|
||||
| negate | 16.7 | 61.9 | 3.70x |
|
||||
| from_bytes (32B) | 15.1 | 68.6 | 4.56x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| POINT ARITHMETIC | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| dbl (Jacobian) | 1529.6 | 923.4 | 0.60x |
|
||||
| add (mixed J+A) | 2849.3 | 1515.6 | 0.53x |
|
||||
| ecmult (a*P+b*G) | 433852.5 | 231686.6 | 0.53x |
|
||||
| ecmult_gen (k*G raw) | 73616.7 | 103213.8 | 1.40x |
|
||||
| pubkey_create (API) | 73616.7 | 111912.5 | 1.52x |
|
||||
| scalar_mul (k*P) | 396233.9 | 212644.3 | 0.54x |
|
||||
| scalar_mul (KPlan) | 397301.4 | 212644.3 | 0.54x |
|
||||
| point_add (combine) | 10631.5 | 12021.3 | 1.13x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| SERIALIZATION | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| compressed (33B) | 36.3 | 174.5 | 4.81x |
|
||||
| uncompressed (65B) | 52.1 | 245.1 | 4.70x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| SIGNING (FAST vs libsecp CT) | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| ECDSA Sign | 99722.6 | 135044.6 | 1.35x |
|
||||
| Schnorr Sign | 76502.4 | 115792.8 | 1.51x |
|
||||
| Schnorr Keypair | 73654.0 | 111976.7 | 1.52x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| VERIFICATION | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| ECDSA Verify | 454391.6 | 238623.0 | 0.53x |
|
||||
| Schnorr Verify (cached) | 455459.7 | 239774.5 | 0.53x |
|
||||
| Schnorr Verify (raw) | 477054.1 | 239774.5 | 0.50x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| CT-vs-CT (fair signing) | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| ECDSA Sign | 246601.9 | 135044.6 | 0.55x |
|
||||
| Schnorr Sign | 233444.2 | 115792.8 | 0.50x |
|
||||
| ECDSA Verify | 454391.6 | 238623.0 | 0.53x |
|
||||
| Schnorr Verify | 477054.1 | 239774.5 | 0.50x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| ETHEREUM / RECOVERY | Ultra ns | libsecp | ratio |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
| sign_recoverable | 86545.1 | 134991.5 | 1.56x |
|
||||
| ecrecover | 533539.5 | 268722.4 | 0.50x |
|
||||
| eth_sign_hash | 99923.3 | 134991.5 | 1.35x |
|
||||
| eth_personal_sign | 101517.5 | 134991.5 | 1.33x |
|
||||
+------------------------------------+----------+----------+-----------+
|
||||
|
||||
+----------------------------------------------+------------+
|
||||
| ZK Proofs & Commitments | ns/op |
|
||||
+----------------------------------------------+------------+
|
||||
| Pedersen commit | 347414.7 |
|
||||
| Knowledge prove (sigma) | 250399.4 |
|
||||
| Knowledge verify | 235237.4 |
|
||||
| DLEQ prove | 487730.9 |
|
||||
| DLEQ verify | 642250.1 |
|
||||
| Bulletproof range_prove (64b) | 142647931.0 |
|
||||
| Bulletproof range_verify (64b) | 18191932.4 |
|
||||
+----------------------------------------------+------------+
|
||||
|
||||
======================================================================
|
||||
THROUGHPUT SUMMARY (1 core, pinned)
|
||||
======================================================================
|
||||
|
||||
--- Ultra FAST ---
|
||||
ECDSA sign 99.72 us -> 10.0 k op/s
|
||||
ECDSA verify 454.39 us -> 2.2 k op/s
|
||||
Schnorr sign 76.50 us -> 13.1 k op/s
|
||||
Schnorr verify (cached) 455.46 us -> 2.2 k op/s
|
||||
Schnorr verify (raw) 477.05 us -> 2.1 k op/s
|
||||
pubkey_create (k*G) 73.62 us -> 13.6 k op/s
|
||||
ECDH 483.35 us -> 2.1 k op/s
|
||||
Taproot output key 71.98 us -> 13.9 k op/s
|
||||
BIP32 derive (BTC) 820.50 us -> 1.2 k op/s
|
||||
Silent Payment sender 280.36 us -> 3.6 k op/s
|
||||
Silent Payment scan 398.17 us -> 2.5 k op/s
|
||||
|
||||
--- Ultra CT ---
|
||||
CT ECDSA sign 246.60 us -> 4.1 k op/s
|
||||
CT Schnorr sign 233.44 us -> 4.3 k op/s
|
||||
|
||||
--- Ultra ZK ---
|
||||
Pedersen commit 347.41 us -> 2.9 k op/s
|
||||
Knowledge prove 250.40 us -> 4.0 k op/s
|
||||
Knowledge verify 235.24 us -> 4.3 k op/s
|
||||
DLEQ prove 487.73 us -> 2.1 k op/s
|
||||
DLEQ verify 642.25 us -> 1.6 k op/s
|
||||
Bulletproof range_prove 142647.93 us -> 7 op/s
|
||||
Bulletproof range_verify 18191.93 us -> 55 op/s
|
||||
|
||||
--- libsecp256k1 ---
|
||||
field_mul 0.19 us -> 5.24 M op/s
|
||||
field_sqr 0.14 us -> 6.92 M op/s
|
||||
field_inv_var 4.07 us -> 246.0 k op/s
|
||||
scalar_mul 0.25 us -> 4.05 M op/s
|
||||
scalar_inverse (CT) 8.22 us -> 121.6 k op/s
|
||||
scalar_inverse_var 4.17 us -> 240.1 k op/s
|
||||
point_dbl 0.92 us -> 1.08 M op/s
|
||||
point_add (mixed) 1.52 us -> 659.8 k op/s
|
||||
ecmult (a*P+b*G) 231.69 us -> 4.3 k op/s
|
||||
ecmult_gen (k*G raw) 103.21 us -> 9.7 k op/s
|
||||
generator_mul (API) 111.91 us -> 8.9 k op/s
|
||||
scalar_mul_P (k*P) 212.64 us -> 4.7 k op/s
|
||||
ECDSA sign 135.04 us -> 7.4 k op/s
|
||||
ECDSA verify 238.62 us -> 4.2 k op/s
|
||||
Schnorr sign 115.79 us -> 8.6 k op/s
|
||||
Schnorr verify 239.77 us -> 4.2 k op/s
|
||||
|
||||
======================================================================
|
||||
BITCOIN BLOCK VALIDATION ESTIMATES (1 core)
|
||||
======================================================================
|
||||
|
||||
Pre-Taproot block (~3000 ECDSA verify):
|
||||
Wall time: 1363.2 ms
|
||||
Blocks/sec: 0.7
|
||||
|
||||
Taproot block (~2000 Schnorr + ~1000 ECDSA):
|
||||
Wall time: 1408.5 ms
|
||||
Blocks/sec: 0.7
|
||||
|
||||
TX throughput (1 core):
|
||||
ECDSA: 2201 tx/sec
|
||||
Schnorr: 2096 tx/sec
|
||||
|
||||
======================================================================
|
||||
AArch64 | 1 core pinned | GCC 13.3.0
|
||||
UltrafastSecp256k1 vs libsecp256k1 -- Unified Benchmark
|
||||
======================================================================
|
||||
|
||||
2318
.ci-baseline/baseline_pre_techdebt_20260321.txt
Normal file
2318
.ci-baseline/baseline_pre_techdebt_20260321.txt
Normal file
File diff suppressed because it is too large
Load Diff
387
.ci-baseline/bench_quick_baseline_pre_techdebt_20260321.json
Normal file
387
.ci-baseline/bench_quick_baseline_pre_techdebt_20260321.json
Normal file
@ -0,0 +1,387 @@
|
||||
[
|
||||
{
|
||||
"name": "field_mul",
|
||||
"unit": "ns",
|
||||
"value": 10.7
|
||||
},
|
||||
{
|
||||
"name": "field_sqr",
|
||||
"unit": "ns",
|
||||
"value": 10.0
|
||||
},
|
||||
{
|
||||
"name": "field_inv",
|
||||
"unit": "ns",
|
||||
"value": 662.3
|
||||
},
|
||||
{
|
||||
"name": "field_add",
|
||||
"unit": "ns",
|
||||
"value": 4.4
|
||||
},
|
||||
{
|
||||
"name": "field_sub",
|
||||
"unit": "ns",
|
||||
"value": 4.1
|
||||
},
|
||||
{
|
||||
"name": "field_negate",
|
||||
"unit": "ns",
|
||||
"value": 5.7
|
||||
},
|
||||
{
|
||||
"name": "field_from_bytes (32B)",
|
||||
"unit": "ns",
|
||||
"value": 2.7
|
||||
},
|
||||
{
|
||||
"name": "scalar_mul",
|
||||
"unit": "ns",
|
||||
"value": 19.7
|
||||
},
|
||||
{
|
||||
"name": "scalar_inv",
|
||||
"unit": "ns",
|
||||
"value": 838.6
|
||||
},
|
||||
{
|
||||
"name": "scalar_add",
|
||||
"unit": "ns",
|
||||
"value": 4.7
|
||||
},
|
||||
{
|
||||
"name": "scalar_negate",
|
||||
"unit": "ns",
|
||||
"value": 2.9
|
||||
},
|
||||
{
|
||||
"name": "scalar_from_bytes (32B)",
|
||||
"unit": "ns",
|
||||
"value": 2.9
|
||||
},
|
||||
{
|
||||
"name": "pubkey_create (k*G)",
|
||||
"unit": "ns",
|
||||
"value": 5607.4
|
||||
},
|
||||
{
|
||||
"name": "scalar_mul (k*P)",
|
||||
"unit": "ns",
|
||||
"value": 21169.2
|
||||
},
|
||||
{
|
||||
"name": "scalar_mul_with_plan",
|
||||
"unit": "ns",
|
||||
"value": 20677.5
|
||||
},
|
||||
{
|
||||
"name": "dual_mul (a*G + b*P)",
|
||||
"unit": "ns",
|
||||
"value": 23398.8
|
||||
},
|
||||
{
|
||||
"name": "point_add (affine+affine)",
|
||||
"unit": "ns",
|
||||
"value": 914.2
|
||||
},
|
||||
{
|
||||
"name": "point_add (J+A mixed)",
|
||||
"unit": "ns",
|
||||
"value": 149.3
|
||||
},
|
||||
{
|
||||
"name": "point_dbl",
|
||||
"unit": "ns",
|
||||
"value": 82.6
|
||||
},
|
||||
{
|
||||
"name": "normalize (J->affine)",
|
||||
"unit": "ns",
|
||||
"value": 2.8
|
||||
},
|
||||
{
|
||||
"name": "batch_normalize /pt (N=64)",
|
||||
"unit": "ns",
|
||||
"value": 140.5
|
||||
},
|
||||
{
|
||||
"name": "next_inplace (+=G)",
|
||||
"unit": "ns",
|
||||
"value": 147.1
|
||||
},
|
||||
{
|
||||
"name": "KPlan::from_scalar(w=4)",
|
||||
"unit": "ns",
|
||||
"value": 1284.8
|
||||
},
|
||||
{
|
||||
"name": "to_compressed (33B)",
|
||||
"unit": "ns",
|
||||
"value": 8.1
|
||||
},
|
||||
{
|
||||
"name": "to_uncompressed (65B)",
|
||||
"unit": "ns",
|
||||
"value": 8.3
|
||||
},
|
||||
{
|
||||
"name": "x_only_bytes (32B)",
|
||||
"unit": "ns",
|
||||
"value": 3.4
|
||||
},
|
||||
{
|
||||
"name": "x_bytes_and_parity",
|
||||
"unit": "ns",
|
||||
"value": 4.7
|
||||
},
|
||||
{
|
||||
"name": "has_even_y",
|
||||
"unit": "ns",
|
||||
"value": 2.0
|
||||
},
|
||||
{
|
||||
"name": "batch_to_compressed /pt (N=64)",
|
||||
"unit": "ns",
|
||||
"value": 148.1
|
||||
},
|
||||
{
|
||||
"name": "batch_x_only_bytes /pt (N=64)",
|
||||
"unit": "ns",
|
||||
"value": 109.9
|
||||
},
|
||||
{
|
||||
"name": "ecdsa_sign",
|
||||
"unit": "ns",
|
||||
"value": 7705.8
|
||||
},
|
||||
{
|
||||
"name": "ecdsa_sign_verified",
|
||||
"unit": "ns",
|
||||
"value": 38585.4
|
||||
},
|
||||
{
|
||||
"name": "ecdsa_verify",
|
||||
"unit": "ns",
|
||||
"value": 24704.3
|
||||
},
|
||||
{
|
||||
"name": "schnorr_keypair_create",
|
||||
"unit": "ns",
|
||||
"value": 5593.2
|
||||
},
|
||||
{
|
||||
"name": "schnorr_sign",
|
||||
"unit": "ns",
|
||||
"value": 6030.8
|
||||
},
|
||||
{
|
||||
"name": "schnorr_sign_verified",
|
||||
"unit": "ns",
|
||||
"value": 35556.5
|
||||
},
|
||||
{
|
||||
"name": "schnorr_verify (cached xonly)",
|
||||
"unit": "ns",
|
||||
"value": 22837.2
|
||||
},
|
||||
{
|
||||
"name": "schnorr_verify (raw bytes)",
|
||||
"unit": "ns",
|
||||
"value": 28809.0
|
||||
},
|
||||
{
|
||||
"name": "schnorr_batch_verify(N=4)",
|
||||
"unit": "ns",
|
||||
"value": 109819.1
|
||||
},
|
||||
{
|
||||
"name": "-> per-sig amortized (N=4)",
|
||||
"unit": "ns",
|
||||
"value": 27454.8
|
||||
},
|
||||
{
|
||||
"name": "schnorr_batch_verify(N=16)",
|
||||
"unit": "ns",
|
||||
"value": 396874.1
|
||||
},
|
||||
{
|
||||
"name": "-> per-sig amortized (N=16)",
|
||||
"unit": "ns",
|
||||
"value": 24804.6
|
||||
},
|
||||
{
|
||||
"name": "schnorr_batch_verify(N=64)",
|
||||
"unit": "ns",
|
||||
"value": 2397940.9
|
||||
},
|
||||
{
|
||||
"name": "-> per-sig amortized (N=64)",
|
||||
"unit": "ns",
|
||||
"value": 37467.8
|
||||
},
|
||||
{
|
||||
"name": "ecdsa_batch_verify(N=4)",
|
||||
"unit": "ns",
|
||||
"value": 82520.4
|
||||
},
|
||||
{
|
||||
"name": "ecdsa_batch_verify(N=16)",
|
||||
"unit": "ns",
|
||||
"value": 342148.9
|
||||
},
|
||||
{
|
||||
"name": "ecdsa_batch_verify(N=64)",
|
||||
"unit": "ns",
|
||||
"value": 1403852.4
|
||||
},
|
||||
{
|
||||
"name": "ct::scalar_inverse (SafeGCD)",
|
||||
"unit": "ns",
|
||||
"value": 1545.3
|
||||
},
|
||||
{
|
||||
"name": "ct::generator_mul (k*G)",
|
||||
"unit": "ns",
|
||||
"value": 11613.9
|
||||
},
|
||||
{
|
||||
"name": "ct::scalar_mul (k*P)",
|
||||
"unit": "ns",
|
||||
"value": 25233.8
|
||||
},
|
||||
{
|
||||
"name": "ct::point_dbl",
|
||||
"unit": "ns",
|
||||
"value": 81.2
|
||||
},
|
||||
{
|
||||
"name": "ct::point_add_complete (11M+6S)",
|
||||
"unit": "ns",
|
||||
"value": 263.1
|
||||
},
|
||||
{
|
||||
"name": "ct::point_add_mixed_complete (7M+5S)",
|
||||
"unit": "ns",
|
||||
"value": 173.9
|
||||
},
|
||||
{
|
||||
"name": "ct::point_add_mixed_unified (7M+5S)",
|
||||
"unit": "ns",
|
||||
"value": 163.1
|
||||
},
|
||||
{
|
||||
"name": "ct::ecdsa_sign",
|
||||
"unit": "ns",
|
||||
"value": 15667.7
|
||||
},
|
||||
{
|
||||
"name": "ct::ecdsa_sign_verified",
|
||||
"unit": "ns",
|
||||
"value": 52603.2
|
||||
},
|
||||
{
|
||||
"name": "ct::schnorr_sign",
|
||||
"unit": "ns",
|
||||
"value": 13500.2
|
||||
},
|
||||
{
|
||||
"name": "ct::schnorr_sign_verified",
|
||||
"unit": "ns",
|
||||
"value": 42850.8
|
||||
},
|
||||
{
|
||||
"name": "ct::schnorr_keypair_create",
|
||||
"unit": "ns",
|
||||
"value": 13145.8
|
||||
},
|
||||
{
|
||||
"name": "field_inv_var",
|
||||
"unit": "ns",
|
||||
"value": 937.0
|
||||
},
|
||||
{
|
||||
"name": "field_normalize",
|
||||
"unit": "ns",
|
||||
"value": 7.4
|
||||
},
|
||||
{
|
||||
"name": "field_from_bytes (set_b32)",
|
||||
"unit": "ns",
|
||||
"value": 7.2
|
||||
},
|
||||
{
|
||||
"name": "scalar_inverse (CT)",
|
||||
"unit": "ns",
|
||||
"value": 1421.9
|
||||
},
|
||||
{
|
||||
"name": "scalar_inverse_var",
|
||||
"unit": "ns",
|
||||
"value": 842.9
|
||||
},
|
||||
{
|
||||
"name": "scalar_from_bytes (set_b32)",
|
||||
"unit": "ns",
|
||||
"value": 5.0
|
||||
},
|
||||
{
|
||||
"name": "point_dbl (gej_double_var)",
|
||||
"unit": "ns",
|
||||
"value": 79.8
|
||||
},
|
||||
{
|
||||
"name": "point_add (gej_add_ge_var)",
|
||||
"unit": "ns",
|
||||
"value": 156.5
|
||||
},
|
||||
{
|
||||
"name": "ecmult (a*P + b*G, Strauss)",
|
||||
"unit": "ns",
|
||||
"value": 24125.0
|
||||
},
|
||||
{
|
||||
"name": "ecmult_gen (k*G, comb)",
|
||||
"unit": "ns",
|
||||
"value": 11303.3
|
||||
},
|
||||
{
|
||||
"name": "generator_mul (ec_pubkey_create)",
|
||||
"unit": "ns",
|
||||
"value": 12862.6
|
||||
},
|
||||
{
|
||||
"name": "scalar_mul_P (k*P, tweak_mul)",
|
||||
"unit": "ns",
|
||||
"value": 22261.2
|
||||
},
|
||||
{
|
||||
"name": "serialize_compressed (33B)",
|
||||
"unit": "ns",
|
||||
"value": 17.1
|
||||
},
|
||||
{
|
||||
"name": "serialize_uncompressed (65B)",
|
||||
"unit": "ns",
|
||||
"value": 21.7
|
||||
},
|
||||
{
|
||||
"name": "point_add (pubkey_combine)",
|
||||
"unit": "ns",
|
||||
"value": 1812.3
|
||||
},
|
||||
{
|
||||
"name": "schnorr_sign (BIP-340)",
|
||||
"unit": "ns",
|
||||
"value": 13788.7
|
||||
},
|
||||
{
|
||||
"name": "schnorr_verify (BIP-340)",
|
||||
"unit": "ns",
|
||||
"value": 25950.3
|
||||
},
|
||||
{
|
||||
"name": "Harness",
|
||||
"unit": "ns",
|
||||
"value": 3000000000.0
|
||||
}
|
||||
]
|
||||
@ -424,6 +424,29 @@ elseif(APPLE)
|
||||
target_link_libraries(${SECP256K1_LIB_NAME} PUBLIC ${SECURITY_FRAMEWORK})
|
||||
endif()
|
||||
|
||||
# Optional OpenMP support for batch parallelization (e.g. fe_h_based_inversion_batched).
# Skipped for ESP32 and Emscripten/WASM builds. Every other target probes for OpenMP
# and falls back to the serial code path when it is not available.
if(NOT DEFINED SECP256K1_PLATFORM_ESP32 AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
    # Prefer static OpenMP when doing static builds (cross-compile for Android etc.).
    # BUILD_SHARED_LIBS is tested for truthiness so OFF, FALSE, 0 and NO all behave
    # the same; an undefined BUILD_SHARED_LIBS keeps the default OpenMP lookup.
    if(CMAKE_EXE_LINKER_FLAGS MATCHES "-static"
       OR (DEFINED BUILD_SHARED_LIBS AND NOT BUILD_SHARED_LIBS))
        # Reset any previously cached libgomp location so the static archive found
        # below can replace it.
        # NOTE(review): when no static archive is found the cache entry stays empty;
        # confirm FindOpenMP still resolves libgomp in that situation.
        set(OpenMP_gomp_LIBRARY "" CACHE STRING "" FORCE)

        # GCC keeps libgomp.a in a directory named after its major version. Try the
        # active compiler's major version first, then the historical default (13).
        set(_secp_gomp_majors 13)
        if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^([0-9]+)")
            list(INSERT _secp_gomp_majors 0 "${CMAKE_MATCH_1}")
            list(REMOVE_DUPLICATES _secp_gomp_majors)
        endif()

        set(_secp_gomp_dirs "")
        foreach(_secp_gomp_major IN LISTS _secp_gomp_majors)
            list(APPEND _secp_gomp_dirs
                "/usr/lib/gcc-cross/${CMAKE_C_COMPILER_TARGET}/${_secp_gomp_major}"
                "/usr/lib/gcc/${CMAKE_C_COMPILER_TARGET}/${_secp_gomp_major}")
        endforeach()

        # NO_DEFAULT_PATH: only the toolchain directories above are acceptable, so a
        # host libgomp is never picked up for a cross build.
        find_library(_GOMP_STATIC libgomp.a PATHS
            ${_secp_gomp_dirs}
            NO_DEFAULT_PATH)
        if(_GOMP_STATIC)
            set(OpenMP_gomp_LIBRARY "${_GOMP_STATIC}" CACHE STRING "" FORCE)
        endif()
    endif()

    find_package(OpenMP QUIET)
    if(OpenMP_CXX_FOUND)
        # PUBLIC: the OpenMP pragma lives in an inline header function, so consumers
        # of the library need the OpenMP compile and link flags as well.
        target_link_libraries(${SECP256K1_LIB_NAME} PUBLIC OpenMP::OpenMP_CXX)
        message(STATUS "Secp256k1: OpenMP enabled (${OpenMP_CXX_VERSION})")
    else()
        message(STATUS "Secp256k1: OpenMP not found - batch parallelization disabled")
    endif()
endif()
|
||||
|
||||
# Optional inline assembly support (x64 only)
|
||||
if(SECP256K1_HAS_ASM)
|
||||
target_compile_definitions(${SECP256K1_LIB_NAME}
|
||||
|
||||
@ -147,7 +147,9 @@ inline void fe_h_based_inversion_batched(FieldElement* h_values,
|
||||
if (n_threads == 0 || batch_size == 0) return;
|
||||
|
||||
// Process each thread's sequence independently
|
||||
// (Iterations are independent; run in parallel via OpenMP when _OPENMP is defined, serially otherwise)
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for schedule(static)
|
||||
#endif
|
||||
for (std::size_t tid = 0; tid < n_threads; ++tid) {
|
||||
// Forward pass: Z_final = Z_0 * prodH_i
|
||||
FieldElement z_current = z0_values[tid];
|
||||
@ -172,9 +174,6 @@ inline void fe_h_based_inversion_batched(FieldElement* h_values,
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Add parallel version using OpenMP or TBB for multi-threaded CPU batch processing
|
||||
// std::execution is not well-supported by ClangCL on Windows yet (2026)
|
||||
|
||||
} // namespace secp256k1::fast
|
||||
|
||||
#endif // SECP256K1_FIELD_H_BASED_HPP
|
||||
|
||||
@ -42,6 +42,7 @@ struct MuSig2KeyAggCtx {
|
||||
// Aggregate public keys (KeyAgg from BIP-327).
|
||||
// pubkeys: array of X-only public keys (32 bytes each)
|
||||
// Returns aggregation context with combined key and coefficients.
|
||||
// If ANY pubkey is invalid (x >= p or not on curve), returns ctx with Q = infinity.
|
||||
MuSig2KeyAggCtx musig2_key_agg(const std::vector<std::array<std::uint8_t, 32>>& pubkeys);
|
||||
|
||||
// -- Nonce --------------------------------------------------------------------
|
||||
|
||||
@ -184,7 +184,10 @@ inline FE52 fe52_inverse(const FE52& a) noexcept {
|
||||
// --- FE52 Batch Inversion (Montgomery's trick) ------------------------------
|
||||
// Computes z_inv[i] = z[i]^{-1} mod p for all i in [0, n).
|
||||
// Cost: 1 inversion + 3(n-1) multiplications (vs n inversions).
|
||||
// All z[i] MUST be non-zero (caller ensures by excluding infinity entries).
|
||||
//
|
||||
// PRECONDITION: All z[i] MUST be non-zero. This is a CT function —
|
||||
// adding variable-time zero-checks would break constant-time guarantees.
|
||||
// Callers ensure this by excluding infinity entries before calling.
|
||||
inline void fe52_batch_inverse(FE52* z_inv, const FE52* z, std::size_t n) noexcept {
|
||||
if (n == 0) return;
|
||||
if (n == 1) {
|
||||
@ -1754,7 +1757,8 @@ inline Jac4x64 jac_add_ge_var_zr(const Jac4x64& a,
|
||||
return {x3, y3, z3};
|
||||
}
|
||||
|
||||
// --- Montgomery batch inversion ----------------------------------------------
|
||||
// --- Montgomery batch inversion (CT, 4×64 helpers) ---------------------------
|
||||
// PRECONDITION: All z[i] MUST be non-zero. CT function — no zero-checks.
|
||||
|
||||
inline void fe_batch_inverse(FE52* inv, const FE52* z, std::size_t n) noexcept {
|
||||
if (n == 0) return;
|
||||
|
||||
@ -3408,25 +3408,32 @@ FieldElement fe_inverse_strauss(const FieldElement& value) {
|
||||
return pow_p_minus_2_strauss(value);
|
||||
}
|
||||
|
||||
// Montgomery batch inversion algorithm
|
||||
// Montgomery batch inversion algorithm (zero-safe)
|
||||
// Input: array of N field elements [a_0, a_1, ..., a_n_1]
|
||||
// Output: modifies array in-place to [a_0^-^1, a_1^-^1, ..., a_n_1^-^1]
|
||||
// Output: modifies array in-place to [a_0^-1, a_1^-1, ..., a_n_1^-1]
|
||||
// Zero elements map to zero in the output.
|
||||
//
|
||||
// Algorithm:
|
||||
// 1. Compute products: p_0=a_0, p_1=a_0*a_1, p_2=a_0*a_1*a_2, ..., p_n_1=a_0*...*a_n_1
|
||||
// 2. Invert final product: inv = (a_0*...*a_n_1)^-^1
|
||||
// 3. Work backwards: a^-^1 = inv * p_1, then inv = inv * a
|
||||
// 1. Compute products: p_0=a_0, p_1=a_0*a_1, ..., p_n_1=a_0*...*a_n_1
|
||||
// (zeros are substituted with 1 to keep the product chain valid)
|
||||
// 2. Invert final product: inv = (a_0*...*a_n_1)^-1 (zeros excluded)
|
||||
// 3. Work backwards: a_i^-1 = inv * p_{i-1}, then inv = inv * a_i
|
||||
// (zero positions are set to zero and do not update inv)
|
||||
//
|
||||
// Cost: 3(N-1) multiplications + 1 inversion (vs N inversions)
|
||||
// For N=8: ~8 us vs ~28 us (3.5x faster!)
|
||||
static inline void fe_batch_inverse_with_scratch(FieldElement* elements,
|
||||
size_t count,
|
||||
FieldElement* scratch) {
|
||||
// Step 1: Compute cumulative products
|
||||
// products[i] = elements[0] * elements[1] * ... * elements[i]
|
||||
scratch[0] = elements[0];
|
||||
FieldElement const kOne = FieldElement::one();
|
||||
FieldElement const kZero = FieldElement::zero();
|
||||
|
||||
// Step 1: Compute cumulative products, substituting zeros with 1
|
||||
bool const first_zero = (elements[0] == kZero);
|
||||
scratch[0] = first_zero ? kOne : elements[0];
|
||||
for (size_t i = 1; i < count; ++i) {
|
||||
scratch[i] = scratch[i - 1] * elements[i];
|
||||
bool const is_z = (elements[i] == kZero);
|
||||
scratch[i] = scratch[i - 1] * (is_z ? kOne : elements[i]);
|
||||
}
|
||||
|
||||
// Step 2: Invert the final product (only 1 expensive inverse!)
|
||||
@ -3434,20 +3441,25 @@ static inline void fe_batch_inverse_with_scratch(FieldElement* elements,
|
||||
|
||||
// Step 3: Work backwards to compute individual inverses
|
||||
for (size_t i = count - 1; i > 0; --i) {
|
||||
if (elements[i] == kZero) {
|
||||
elements[i] = kZero; // zero maps to zero
|
||||
continue; // inv unchanged (we used 1 for this slot)
|
||||
}
|
||||
FieldElement const original = elements[i];
|
||||
elements[i] = inv * scratch[i - 1];
|
||||
inv = inv * original;
|
||||
}
|
||||
|
||||
// Handle first element separately (no products[i-1])
|
||||
elements[0] = inv;
|
||||
// Handle first element separately (no scratch[i-1])
|
||||
elements[0] = first_zero ? kZero : inv;
|
||||
}
|
||||
|
||||
SECP256K1_HOT_FUNCTION
|
||||
void fe_batch_inverse(FieldElement* elements, size_t count, std::vector<FieldElement>& scratch) {
|
||||
if (count == 0) return;
|
||||
if (count == 1) {
|
||||
elements[0] = elements[0].inverse();
|
||||
if (!(elements[0] == FieldElement::zero()))
|
||||
elements[0] = elements[0].inverse();
|
||||
return;
|
||||
}
|
||||
|
||||
@ -3462,7 +3474,7 @@ void fe_batch_inverse(FieldElement* elements, size_t count, std::vector<FieldEle
|
||||
SECP256K1_HOT_FUNCTION
|
||||
void fe_batch_inverse(FieldElement* elements, size_t count) {
|
||||
if (count <= 1) {
|
||||
if (count == 1) {
|
||||
if (count == 1 && !(elements[0] == FieldElement::zero())) {
|
||||
elements[0] = elements[0].inverse();
|
||||
}
|
||||
return;
|
||||
|
||||
@ -81,6 +81,18 @@ MuSig2KeyAggCtx musig2_key_agg(const std::vector<std::array<uint8_t, 32>>& pubke
|
||||
std::size_t const n = pubkeys.size();
|
||||
if (n == 0) return ctx;
|
||||
|
||||
// Validate ALL pubkeys upfront before computing anything.
|
||||
// If any key is invalid (x >= p or not on curve), reject the entire set.
|
||||
// Silently skipping invalid keys would allow rogue key attacks.
|
||||
for (std::size_t i = 0; i < n; ++i) {
|
||||
FieldElement px;
|
||||
if (!FieldElement::parse_bytes_strict(pubkeys[i], px)) return ctx;
|
||||
auto x3 = px.square() * px;
|
||||
auto y2 = x3 + FieldElement::from_uint64(7);
|
||||
auto y = y2.sqrt();
|
||||
if (y.square() != y2) return ctx; // x not on curve
|
||||
}
|
||||
|
||||
// L = tagged_hash("KeyAgg list", pk_1 || pk_2 || ... || pk_n)
|
||||
SHA256 l_ctx;
|
||||
// Use tagged hash prefix
|
||||
@ -119,18 +131,14 @@ MuSig2KeyAggCtx musig2_key_agg(const std::vector<std::array<uint8_t, 32>>& pubke
|
||||
}
|
||||
|
||||
// Q = sum(a_i * P_i)
|
||||
// First, lift all x-only pubkeys to points
|
||||
// All pubkeys validated upfront — lift to points unconditionally
|
||||
Point Q = Point::infinity();
|
||||
for (std::size_t i = 0; i < n; ++i) {
|
||||
// Lift x-only to point (even Y) -- strict: reject x >= p
|
||||
FieldElement px;
|
||||
if (!FieldElement::parse_bytes_strict(pubkeys[i], px)) continue;
|
||||
FieldElement::parse_bytes_strict(pubkeys[i], px); // validated above
|
||||
auto x3 = px.square() * px;
|
||||
auto y2 = x3 + FieldElement::from_uint64(7);
|
||||
|
||||
// sqrt via optimized addition chain (~253 sqr + 13 mul)
|
||||
auto y = y2.sqrt();
|
||||
if (y.square() != y2) continue; // invalid pubkey x-coord
|
||||
|
||||
// BIP-340: ensure even Y
|
||||
if (y.limbs()[0] & 1) {
|
||||
|
||||
@ -3362,13 +3362,11 @@ Point Point::dual_scalar_mul_gen_point(const Scalar& a, const Scalar& b, const P
|
||||
return from_jac52(result52);
|
||||
#endif // SECP256K1_USE_4X64_POINT_OPS
|
||||
}
|
||||
// DISABLED: ESP32/Embedded 4-stream GLV Strauss produces incorrect verify results.
|
||||
// Root cause: the 4-stream interleaved Shamir scan with GLV decomposition of BOTH
|
||||
// scalars (a and b) computes wrong R' point, causing 100% ECDSA/Schnorr verify failure.
|
||||
// The simple fallback (a*G + b*P via separate scalar_muls) uses proven code paths
|
||||
// (gen_fixed_mul for G, GLV+Shamir per-point for P) and is correct.
|
||||
// TODO: investigate and fix the 4-stream path, then re-enable.
|
||||
#elif 0 && (defined(SECP256K1_PLATFORM_ESP32) || defined(ESP_PLATFORM) || defined(SECP256K1_PLATFORM_STM32))
|
||||
// ESP32/Embedded: 4-stream GLV Strauss (4x64 field)
|
||||
// Fixed in v3.3.1: the phi(G) table sign was selected with k1_neg XOR k2_neg
// (flip_a) instead of k2_neg alone. The XOR form is only correct for the P
// tables, where k1_neg is already baked into P_base. The G tables are
// precomputed once with no sign baked in, so phi(G) must follow k2_neg directly.
|
||||
#elif (defined(SECP256K1_PLATFORM_ESP32) || defined(ESP_PLATFORM) || defined(SECP256K1_PLATFORM_STM32))
|
||||
// -- ESP32/Embedded: 4-stream GLV Strauss (4x64 field) --------------------
|
||||
// Combines a*G + b*P into a single doubling chain with 4 wNAF streams,
|
||||
// halving the doublings compared to two separate scalar_mul calls.
|
||||
@ -3496,10 +3494,10 @@ Point Point::dual_scalar_mul_gen_point(const Scalar& a, const Scalar& b, const P
|
||||
// -- Handle G sign: if a decomposed with k1_neg, use neg tables --
|
||||
const AffinePoint* g_pos = decomp_a.k1_neg ? gen4.neg_tbl_G : gen4.tbl_G;
|
||||
const AffinePoint* g_neg = decomp_a.k1_neg ? gen4.tbl_G : gen4.neg_tbl_G;
|
||||
// phi(G) sign: flip if k1_neg != k2_neg for a
|
||||
bool flip_a = (decomp_a.k1_neg != decomp_a.k2_neg);
|
||||
const AffinePoint* pg_pos = flip_a ? gen4.neg_tbl_phiG : gen4.tbl_phiG;
|
||||
const AffinePoint* pg_neg = flip_a ? gen4.tbl_phiG : gen4.neg_tbl_phiG;
|
||||
// phi(G) sign: use k2_neg directly (G tables have no sign baked in,
|
||||
// unlike the 2-stream P case where k1_neg is absorbed into P_base)
|
||||
const AffinePoint* pg_pos = decomp_a.k2_neg ? gen4.neg_tbl_phiG : gen4.tbl_phiG;
|
||||
const AffinePoint* pg_neg = decomp_a.k2_neg ? gen4.tbl_phiG : gen4.neg_tbl_phiG;
|
||||
|
||||
// -- 4-stream Shamir interleaved scan (JacobianPoint direct -- no Point wrapper) --
|
||||
std::size_t max_len = len_a1;
|
||||
|
||||
@ -459,6 +459,52 @@ static bool test_batch_inverse_expanded(bool verbose) {
|
||||
return ok;
|
||||
}
|
||||
|
||||
// Batch inversion with zero elements — zero-safety test
|
||||
static bool test_batch_inverse_zero_safe(bool verbose) {
|
||||
if (verbose) {
|
||||
SELFTEST_PRINT("\nBatch Inversion (zero-safe):\n");
|
||||
}
|
||||
// Mix of non-zero and zero elements: [3, 0, 7, 0, 11]
|
||||
FieldElement elems[5] = {
|
||||
FieldElement::from_uint64(3),
|
||||
FieldElement::zero(),
|
||||
FieldElement::from_uint64(7),
|
||||
FieldElement::zero(),
|
||||
FieldElement::from_uint64(11)
|
||||
};
|
||||
FieldElement const copy[5] = { elems[0], elems[1], elems[2], elems[3], elems[4] };
|
||||
fe_batch_inverse(elems, 5);
|
||||
bool ok = true;
|
||||
// Non-zero elements get correct inverses
|
||||
if (!(elems[0] == copy[0].inverse())) ok = false;
|
||||
if (!(elems[2] == copy[2].inverse())) ok = false;
|
||||
if (!(elems[4] == copy[4].inverse())) ok = false;
|
||||
// Zero elements stay zero
|
||||
if (!(elems[1] == FieldElement::zero())) ok = false;
|
||||
if (!(elems[3] == FieldElement::zero())) ok = false;
|
||||
|
||||
// Edge case: all zeros
|
||||
if (ok) {
|
||||
FieldElement all_zero[3] = { FieldElement::zero(), FieldElement::zero(), FieldElement::zero() };
|
||||
fe_batch_inverse(all_zero, 3);
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
if (!(all_zero[i] == FieldElement::zero())) { ok = false; break; }
|
||||
}
|
||||
}
|
||||
|
||||
// Edge case: single zero
|
||||
if (ok) {
|
||||
FieldElement single_zero[1] = { FieldElement::zero() };
|
||||
fe_batch_inverse(single_zero, 1);
|
||||
if (!(single_zero[0] == FieldElement::zero())) ok = false;
|
||||
}
|
||||
|
||||
if (verbose) {
|
||||
SELFTEST_PRINT(ok ? " PASS\n" : " FAIL\n");
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
// Bilinearity checks for K*Q with non-generator points
|
||||
// Tests: (Q+G)*K == Q*K + G*K, (Q-G)*K == Q*K - G*K
|
||||
static bool test_bilinearity_K_times_Q(bool verbose) {
|
||||
@ -858,11 +904,9 @@ static bool test_batch_inverse_sweep(bool verbose) {
|
||||
return ok;
|
||||
}
|
||||
|
||||
// NOTE: fe_batch_inverse() requires ALL inputs to be non-zero.
|
||||
// Zero inputs cause undefined cumulative product. This is the standard
|
||||
// Montgomery trick contract (same as libsecp256k1). Callers must skip
|
||||
// zero lanes before calling. No zero-input test is needed here;
|
||||
// the contract is enforced by documentation, not runtime checks.
|
||||
// NOTE: fe_batch_inverse() is zero-safe since v3.3.1.
|
||||
// Zero inputs produce zero outputs without corrupting non-zero inverses.
|
||||
// CT variants (fe52_batch_inverse) still require callers to exclude zeros.
|
||||
|
||||
// -- Repro bundle: prints environment info for reproducibility --
|
||||
static void print_repro_bundle(SelftestMode mode, uint64_t seed) {
|
||||
@ -1435,6 +1479,9 @@ bool Selftest(bool verbose, SelftestMode mode, uint64_t seed) {
|
||||
// Expanded batch inverse (32 elements)
|
||||
tally(total, passed, "batch_inverse_expanded", test_batch_inverse_expanded(verbose));
|
||||
|
||||
// Batch inverse zero-safety
|
||||
tally(total, passed, "batch_inverse_zero_safe", test_batch_inverse_zero_safe(verbose));
|
||||
|
||||
// Bilinearity for K*Q with +/-G
|
||||
tally(total, passed, "bilinearity_K_times_Q", test_bilinearity_K_times_Q(verbose));
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user