fix partial results issue

fix partial results issue 2 fix partial results issue 3 fix partial results issue 4 fix partial results issue 5 fix partial results issue 6 fix partial results issue 7 fix partial results issue 8 fix partial results issue 9 fix partial results issue 10 fix partial results issue 11 fix partial results issue 12 fix partial results issue 13 fix partial results issue 14 fix partial results issue 15 fix partial results issue 16 fix partial results issue 17 fix partial results issue 18 fix partial results issue 19 fix partial results issue 20 fix partial results issue 21 fix partial results issue 22 fix partial results issue 23 fix partial results issue 24 fix partial results issue 25 Revert "fix partial results issue 25" This reverts commit 0491717f9b5b5c3ff13a64dbb97aaf7d80dfdf8d. fix partial results issue 26 fix partial results issue 27 fix partial results issue 28 fix partial results issue 29 fix partial results issue 30 fix partial results issue 31 fix partial results issue 32 fix partial results issue 33 fix partial results issue 34 fix partial results issue 35 fix partial results issue 36 fix partial results issue 37 fix partial results issue 38 fix partial results issue 39 fix partial results issue 40 fix partial results issue 41 fix partial results issue 42 fix partial results issue 43 fix partial results issue 44 fix partial results issue 45 fix partial results issue 46 fix partial results issue 47 fix partial results issue 48 fix partial results issue 49 fix partial results issue 50 fix partial results issue 51 fix partial results issue 52 rename ec_pmul_random_init to ec_pmul_init fix partial results issue 53 fix partial results issue 54 fix partial results issue 55
2025-10-28 15:35:58 +02:00 · 2025-10-28 15:35:58 +02:00 · 56f5ec8872
commit 56f5ec8872
parent 4358453e6f
4 changed files with 319 additions and 194 deletions
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 926492cc45eb6cd4dfe90bfd37cff5d7e58ff014
+Subproject commit f3ab474f24d0e375bc2fa41dd480525506ceaa8a
--- a/src/cudasp_extension.cu
+++ b/src/cudasp_extension.cu
@ -13,6 +13,10 @@
 // CUDA runtime for GPU operations
 #include <cuda_runtime.h>

+// Standard library
+#include <map>
+#include <set>
+
 // Declare CUDA functions from cudasp_gpu.cu
 extern "C" {
 	void* LaunchBatchScan(
@ -81,7 +85,7 @@ static void ConvertScalarToU32(const uint8_t* blob_data, uint32_t* out_scalar) {
 }

 struct CudaspScanBindData : public TableFunctionData {
-	CudaspScanBindData() : batch_size(10000) {
+	CudaspScanBindData() : batch_size(10240) {
 	}
 	static constexpr idx_t TWEAK_KEY_SIZE = 64; // 64 bytes: uncompressed EC point (32-byte x || 32-byte y)
 	static constexpr idx_t SCALAR_SIZE = 32; // 32 bytes: scalar for EC multiplication
@ -105,17 +109,17 @@ struct CudaspScanLocalState : public LocalTableFunctionState {
 	bool finalized;

 	// Per-thread accumulated input data
-	vector<string_t> accumulated_txids;        // Transaction IDs (BLOB)
-	vector<int32_t> accumulated_heights;       // Block heights (INTEGER)
-	vector<string_t> accumulated_tweak_keys;   // 64-byte EC points (BLOB)
-	vector<int64_t> accumulated_outputs;       // Flattened output values (BIGINT)
-	vector<idx_t> accumulated_output_offsets;  // Offset into accumulated_outputs for each row
-	vector<idx_t> accumulated_output_lengths;  // Length of each outputs list
+	vector<std::string> accumulated_txids;        // Transaction IDs (BLOB) - owned copies
+	vector<int32_t> accumulated_heights;          // Block heights (INTEGER)
+	vector<std::string> accumulated_tweak_keys;   // 64-byte EC points (BLOB) - owned copies
+	vector<int64_t> accumulated_outputs;          // Flattened output values (BIGINT)
+	vector<idx_t> accumulated_output_offsets;     // Offset into accumulated_outputs for each row
+	vector<idx_t> accumulated_output_lengths;     // Length of each outputs list

 	// Per-thread processed output data (only rows with matches)
-	vector<string_t> output_txids;
+	vector<std::string> output_txids;       // Owned copies
 	vector<int32_t> output_heights;
-	vector<string_t> output_tweak_keys;
+	vector<std::string> output_tweak_keys;  // Owned copies
 	idx_t output_position;
 };

@ -124,6 +128,12 @@ struct CudaspScanState : public GlobalTableFunctionState {
 		finalize_lock = make_uniq<std::mutex>();
 	}

+	// Limit to single thread to prevent duplicate results from parallel execution
+	// GPU parallelism with thousands of CUDA threads provides sufficient performance
+	idx_t MaxThreads() const override {
+		return 1;
+	}
+
 	// Thread synchronization
 	std::atomic_uint64_t currently_adding;
 	unique_ptr<std::mutex> finalize_lock;
@ -131,6 +141,9 @@ struct CudaspScanState : public GlobalTableFunctionState {

 static void AccumulateInput(CudaspScanLocalState &local_state, DataChunk &input) {
 	idx_t count = input.size();
+
+	size_t before_size = local_state.accumulated_heights.size();
+
 	// Expected columns: txid (BLOB), height (INTEGER), tweak_key (BLOB), outputs (LIST[BIGINT])
 	auto &txid_column = input.data[0];
 	auto &height_column = input.data[1];
@ -171,9 +184,12 @@ static void AccumulateInput(CudaspScanLocalState &local_state, DataChunk &input)
 		    height_data.validity.RowIsValid(height_idx) &&
 		    tweak_key_data.validity.RowIsValid(tweak_key_idx)) {

-			local_state.accumulated_txids.push_back(txid_ptr[txid_idx]);
+			// Copy string data to owned storage to avoid dangling pointers when input is cleared
+			auto txid_str = txid_ptr[txid_idx];
+			auto tweak_key_str = tweak_key_ptr[tweak_key_idx];
+			local_state.accumulated_txids.push_back(std::string(txid_str.GetData(), txid_str.GetSize()));
 			local_state.accumulated_heights.push_back(height_ptr[height_idx]);
-			local_state.accumulated_tweak_keys.push_back(tweak_key_ptr[tweak_key_idx]);
+			local_state.accumulated_tweak_keys.push_back(std::string(tweak_key_str.GetData(), tweak_key_str.GetSize()));

 			// Store outputs list offset and length
 			idx_t outputs_offset = local_state.accumulated_outputs.size();
@ -221,7 +237,7 @@ static void ProcessBatch(CudaspScanLocalState &local_state, const CudaspScanBind

 	// Convert tweak_keys from BLOB to u32 format (row-major first)
 	for (idx_t i = 0; i < batch_size; i++) {
-		const uint8_t* tweak_data = reinterpret_cast<const uint8_t*>(local_state.accumulated_tweak_keys[i].GetData());
+		const uint8_t* tweak_data = reinterpret_cast<const uint8_t*>(local_state.accumulated_tweak_keys[i].data());
 		ConvertTweakKeyToU32(tweak_data,
 		                     &h_points_x[i * field_limbs],
 		                     &h_points_y[i * field_limbs]);
@ -293,12 +309,12 @@ static void ProcessBatch(CudaspScanLocalState &local_state, const CudaspScanBind
 	);

 	if (state_handle) {
-	    // Write points data directly to managed memory (exactly like gECC's ec_pmul_random_init)
+	    // Write points data directly to managed memory (exactly like gECC's ec_pmul_init)
 #ifdef GECC_QAPW_OPT_COLUMN_MAJORED_INPUTS
 	    // Write in column-major layout directly
 	    for (idx_t j = 0; j < field_limbs; j++) {
 	        for (idx_t i = 0; i < batch_size; i++) {
-	            const uint8_t* tweak_data = reinterpret_cast<const uint8_t*>(local_state.accumulated_tweak_keys[i].GetData());
+	            const uint8_t* tweak_data = reinterpret_cast<const uint8_t*>(local_state.accumulated_tweak_keys[i].data());
 	            uint32_t x_limbs[8], y_limbs[8];
 	            ConvertTweakKeyToU32(tweak_data, x_limbs, y_limbs);
 	            managed_points_x[j * batch_size + i] = x_limbs[j];
@ -346,12 +362,19 @@ static void ProcessBatch(CudaspScanLocalState &local_state, const CudaspScanBind
 	    );

 	    if (kernel_result == 0) {
+	        // Ensure all GPU writes to managed_match_flags are visible to CPU
+	        cudaDeviceSynchronize();
+
 	        // Build output for matching rows
+	        idx_t match_count = 0;
+	        std::vector<int32_t> matched_heights;
 	        for (idx_t i = 0; i < batch_size; i++) {
 	            if (managed_match_flags[i]) {
 	                local_state.output_txids.push_back(local_state.accumulated_txids[i]);
 	                local_state.output_heights.push_back(local_state.accumulated_heights[i]);
 	                local_state.output_tweak_keys.push_back(local_state.accumulated_tweak_keys[i]);
+	                matched_heights.push_back(local_state.accumulated_heights[i]);
+	                match_count++;
 	            }
 	        }
 	    }
@ -434,8 +457,8 @@ static unique_ptr<FunctionData> CudaspScanBind(ClientContext &context, TableFunc
 		label_keys.push_back(std::string(label_key.GetData(), label_key.GetSize()));
 	}

-	// Parse optional batch_size named parameter (default: 10000)
-	idx_t batch_size = 10000;
+	// Parse optional batch_size named parameter (default: 10240)
+	idx_t batch_size = 10240;
 	auto batch_size_entry = input.named_parameters.find("batch_size");
 	if (batch_size_entry != input.named_parameters.end()) {
 		auto &batch_size_value = batch_size_entry->second;
@ -503,9 +526,11 @@ static OperatorResultType CudaspScanFunction(ExecutionContext &context, TableFun
 		auto tweak_key_data = FlatVector::GetData<string_t>(tweak_key_result);

 		for (idx_t i = 0; i < output_count; i++) {
-			txid_data[i] = StringVector::AddStringOrBlob(txid_result, local_state.output_txids[local_state.output_position + i]);
+			auto &txid = local_state.output_txids[local_state.output_position + i];
+			auto &tweak_key = local_state.output_tweak_keys[local_state.output_position + i];
+			txid_data[i] = StringVector::AddStringOrBlob(txid_result, string_t(txid.data(), txid.size()));
 			height_data[i] = local_state.output_heights[local_state.output_position + i];
-			tweak_key_data[i] = StringVector::AddStringOrBlob(tweak_key_result, local_state.output_tweak_keys[local_state.output_position + i]);
+			tweak_key_data[i] = StringVector::AddStringOrBlob(tweak_key_result, string_t(tweak_key.data(), tweak_key.size()));
 		}

 		output.SetCardinality(output_count);
@ -530,6 +555,9 @@ static OperatorResultType CudaspScanFunction(ExecutionContext &context, TableFun
 	if (input.size() > 0) {
 		AccumulateInput(local_state, input);

+		// Signal that we've consumed the input
+		input.SetCardinality(0);
+
 		// Process batch if we've accumulated enough data
 		if (ShouldProcessBatch(local_state, bind_data)) {
 			ProcessBatch(local_state, bind_data);
@ -547,9 +575,11 @@ static OperatorResultType CudaspScanFunction(ExecutionContext &context, TableFun
 			auto tweak_key_data = FlatVector::GetData<string_t>(tweak_key_result);

 			for (idx_t i = 0; i < output_count; i++) {
-				txid_data[i] = StringVector::AddStringOrBlob(txid_result, local_state.output_txids[i]);
+				auto &txid = local_state.output_txids[i];
+				auto &tweak_key = local_state.output_tweak_keys[i];
+				txid_data[i] = StringVector::AddStringOrBlob(txid_result, string_t(txid.data(), txid.size()));
 				height_data[i] = local_state.output_heights[i];
-				tweak_key_data[i] = StringVector::AddStringOrBlob(tweak_key_result, local_state.output_tweak_keys[i]);
+				tweak_key_data[i] = StringVector::AddStringOrBlob(tweak_key_result, string_t(tweak_key.data(), tweak_key.size()));
 			}

 			output.SetCardinality(output_count);
@ -587,9 +617,11 @@ static OperatorFinalizeResultType CudaspScanFinalFunction(ExecutionContext &cont
 		auto tweak_key_data = FlatVector::GetData<string_t>(tweak_key_result);

 		for (idx_t i = 0; i < output_count; i++) {
-			txid_data[i] = StringVector::AddStringOrBlob(txid_result, local_state.output_txids[local_state.output_position + i]);
+			auto &txid = local_state.output_txids[local_state.output_position + i];
+			auto &tweak_key = local_state.output_tweak_keys[local_state.output_position + i];
+			txid_data[i] = StringVector::AddStringOrBlob(txid_result, string_t(txid.data(), txid.size()));
 			height_data[i] = local_state.output_heights[local_state.output_position + i];
-			tweak_key_data[i] = StringVector::AddStringOrBlob(tweak_key_result, local_state.output_tweak_keys[local_state.output_position + i]);
+			tweak_key_data[i] = StringVector::AddStringOrBlob(tweak_key_result, string_t(tweak_key.data(), tweak_key.size()));
 		}

 		output.SetCardinality(output_count);
--- a/src/cudasp_gpu.cu
+++ b/src/cudasp_gpu.cu
@ -80,73 +80,99 @@ __global__ void CheckMatchesWithLabelsKernel(
    const uint32_t *output_offsets,      // Offset into outputs for each row
    const uint32_t *output_lengths,      // Length of outputs list for each row
    uint8_t *match_flags,                // Output: 1 if match found, 0 otherwise
-    uint32_t count) {                    // Number of points to process
-
-    const u32 instance = blockIdx.x * blockDim.x + threadIdx.x;
-    if (instance >= count) return;
+    uint32_t count,                      // Number of points to process
+    uint64_t batch_id) {                 // Batch ID for debugging

    constexpr u32 field_limbs = 8;

-    // Load FPM result (output_point in Montgomery form, column-major)
-    Field output_x_mont, output_y_mont;
-    #ifdef GECC_QAPW_OPT_COLUMN_MAJORED_INPUTS
+    // Grid-stride loop: each thread processes multiple elements
+    for (u32 instance = blockIdx.x * blockDim.x + threadIdx.x;
+         instance < count;
+         instance += gridDim.x * blockDim.x) {
+
+        // Load FPM result (output_point in Montgomery form, column-major)
+        Field output_x_mont, output_y_mont;
+        #ifdef GECC_QAPW_OPT_COLUMN_MAJORED_INPUTS
+            for (u32 j = 0; j < field_limbs; j++) {
+                output_x_mont.digits[j] = fpm_results[j * count + instance];
+                output_y_mont.digits[j] = fpm_results[(field_limbs + j) * count + instance];
+            }
+        #else
+            for (u32 j = 0; j < field_limbs; j++) {
+                output_x_mont.digits[j] = fpm_results[instance * ECPoint::Affine::LIMBS + j];
+                output_y_mont.digits[j] = fpm_results[instance * ECPoint::Affine::LIMBS + field_limbs + j];
+            }
+        #endif
+
+        // Load spend_public_key and convert to Montgomery form
+        Field spend_x, spend_y;
        for (u32 j = 0; j < field_limbs; j++) {
-            output_x_mont.digits[j] = fpm_results[j * count + instance];
-            output_y_mont.digits[j] = fpm_results[(field_limbs + j) * count + instance];
+            spend_x.digits[j] = spend_pubkey_x[j];
+            spend_y.digits[j] = spend_pubkey_y[j];
        }
-    #else
-        for (u32 j = 0; j < field_limbs; j++) {
-            output_x_mont.digits[j] = fpm_results[instance * ECPoint::Affine::LIMBS + j];
-            output_y_mont.digits[j] = fpm_results[instance * ECPoint::Affine::LIMBS + field_limbs + j];
+        spend_x.inplace_to_montgomery();
+        spend_y.inplace_to_montgomery();
+
+        // Get output list metadata
+        uint32_t offset = output_offsets[instance];
+        uint32_t length = output_lengths[instance];
+
+        bool found_match = false;
+
+        // Base case: output_point + spend_public_key
+        Field final_x_normal = AddPointsAndGetX(output_x_mont, output_y_mont, spend_x, spend_y);
+
+        int64_t base_value = ExtractUpper64(final_x_normal);
+
+        if (CheckValueMatch(base_value, outputs, offset, length)) {
+            found_match = true;
        }
-    #endif

-    // Load spend_public_key and convert to Montgomery form
-    Field spend_x, spend_y;
-    for (u32 j = 0; j < field_limbs; j++) {
-        spend_x.digits[j] = spend_pubkey_x[j];
-        spend_y.digits[j] = spend_pubkey_y[j];
-    }
-    spend_x.inplace_to_montgomery();
-    spend_y.inplace_to_montgomery();
+        // Try each label key if no match yet
+        if (!found_match) {
+            // For label checking, we need final_point (output_point + spend_public_key) in Montgomery form
+            // Recompute the full point addition to get both X and Y coordinates
+            typename ECPoint::Affine output_affine;
+            output_affine.x = output_x_mont;
+            output_affine.y = output_y_mont;

-    // Get output list metadata
-    uint32_t offset = output_offsets[instance];
-    uint32_t length = output_lengths[instance];
+            typename ECPoint::Affine spend_affine;
+            spend_affine.x = spend_x;
+            spend_affine.y = spend_y;

-    // Base case: output_point + spend_public_key
-    Field final_x_normal = AddPointsAndGetX(output_x_mont, output_y_mont, spend_x, spend_y);
-    int64_t base_value = ExtractUpper64(final_x_normal);
+            ECPoint output_jac = output_affine.to_nonzero_jacobian();
+            ECPoint final_jac = output_jac + spend_affine;
+            typename ECPoint::Affine final_affine = final_jac.to_affine();

-    if (CheckValueMatch(base_value, outputs, offset, length)) {
-        match_flags[instance] = 1;
-        return;
-    }
+            // final_affine.x and final_affine.y are already in Montgomery form
+            Field final_x_mont = final_affine.x;
+            Field final_y_mont = final_affine.y;

-    // Try each label key
-    for (uint32_t label_idx = 0; label_idx < label_count; label_idx++) {
-        // Load label key and convert to Montgomery form
-        Field label_x, label_y;
-        for (u32 j = 0; j < field_limbs; j++) {
-            label_x.digits[j] = label_keys_x[label_idx * field_limbs + j];
-            label_y.digits[j] = label_keys_y[label_idx * field_limbs + j];
+            for (uint32_t label_idx = 0; label_idx < label_count; label_idx++) {
+                // Load label key and convert to Montgomery form
+                Field label_x, label_y;
+                for (u32 j = 0; j < field_limbs; j++) {
+                    label_x.digits[j] = label_keys_x[label_idx * field_limbs + j];
+                    label_y.digits[j] = label_keys_y[label_idx * field_limbs + j];
+                }
+                label_x.inplace_to_montgomery();
+                label_y.inplace_to_montgomery();
+
+                // Compute: final_point + label_key (NOT output_point + label_key!)
+                Field labeled_x_normal = AddPointsAndGetX(final_x_mont, final_y_mont, label_x, label_y);
+                int64_t labeled_value = ExtractUpper64(labeled_x_normal);
+
+                // Check value
+                if (CheckValueMatch(labeled_value, outputs, offset, length)) {
+                    found_match = true;
+                    break;  // Exit label loop
+                }
+            }
        }
-        label_x.inplace_to_montgomery();
-        label_y.inplace_to_montgomery();

-        // Compute: output_point + label_key
-        Field labeled_x_normal = AddPointsAndGetX(output_x_mont, output_y_mont, label_x, label_y);
-        int64_t labeled_value = ExtractUpper64(labeled_x_normal);
-
-        // Check value
-        if (CheckValueMatch(labeled_value, outputs, offset, length)) {
-            match_flags[instance] = 1;
-            return;
-        }
-    }
-
-    // No match found
-    match_flags[instance] = 0;
+        // Set match flag
+        match_flags[instance] = found_match ? 1 : 0;
+    }  // end grid-stride loop
 }

 // Kernel to check computed results against outputs and set match flags
@ -208,47 +234,51 @@ __global__ void CheckMatchesKernel(
 __global__ void SerializeToCompressedSEC1Kernel(
    const uint32_t *R0,             // Input: EC points in column-major format
    uint8_t *serialized,            // Output: serialized points (37 bytes each)
-    uint32_t count) {               // Number of points
-
-    const u32 instance = blockIdx.x * blockDim.x + threadIdx.x;
-    if (instance >= count) return;
+    uint32_t count,                 // Number of points
+    uint64_t batch_id) {            // Batch ID for debugging

    constexpr u32 field_limbs = 8;  // 8 u32 limbs for 256-bit field

-    // Load X and Y coordinates from column-major R0 (in Montgomery form)
-    Field x_mont, y_mont;
-    for (u32 j = 0; j < field_limbs; ++j) {
-        x_mont.digits[j] = R0[j * count + instance];                    // X coord
-        y_mont.digits[j] = R0[(field_limbs + j) * count + instance];    // Y coord
-    }
+    // Grid-stride loop: each thread processes multiple elements
+    for (u32 instance = blockIdx.x * blockDim.x + threadIdx.x;
+         instance < count;
+         instance += gridDim.x * blockDim.x) {

-    // Convert from Montgomery form to normal form
-    Field x_normal = x_mont.from_montgomery();
-    Field y_normal = y_mont.from_montgomery();
+        // Load X and Y coordinates from column-major R0 (in Montgomery form)
+        Field x_mont, y_mont;
+        for (u32 j = 0; j < field_limbs; ++j) {
+            x_mont.digits[j] = R0[j * count + instance];                    // X coord
+            y_mont.digits[j] = R0[(field_limbs + j) * count + instance];    // Y coord
+        }

-    // Compute Y parity (even = 0x02, odd = 0x03)
-    uint8_t prefix = 0x02 + (y_normal.digits[0] & 1);
+        // Convert from Montgomery form to normal form
+        Field x_normal = x_mont.from_montgomery();
+        Field y_normal = y_mont.from_montgomery();

-    // Output pointer for this point (37 bytes)
-    uint8_t *output = serialized + instance * 37;
+        // Compute Y parity (even = 0x02, odd = 0x03)
+        uint8_t prefix = 0x02 + (y_normal.digits[0] & 1);

-    // Write prefix
-    output[0] = prefix;
+        // Output pointer for this point (37 bytes)
+        uint8_t *output = serialized + instance * 37;

-    // Write X coordinate (32 bytes, little-endian limbs to big-endian bytes)
-    for (u32 i = 0; i < 8; ++i) {
-        uint32_t limb = x_normal.digits[7 - i];  // Reverse limb order for big-endian
-        output[1 + i * 4 + 0] = (limb >> 24) & 0xFF;
-        output[1 + i * 4 + 1] = (limb >> 16) & 0xFF;
-        output[1 + i * 4 + 2] = (limb >> 8) & 0xFF;
-        output[1 + i * 4 + 3] = (limb >> 0) & 0xFF;
-    }
+        // Write prefix
+        output[0] = prefix;

-    // Append 4 zero bytes
-    output[33] = 0x00;
-    output[34] = 0x00;
-    output[35] = 0x00;
-    output[36] = 0x00;
+        // Write X coordinate (32 bytes, little-endian limbs to big-endian bytes)
+        for (u32 i = 0; i < 8; ++i) {
+            uint32_t limb = x_normal.digits[7 - i];  // Reverse limb order for big-endian
+            output[1 + i * 4 + 0] = (limb >> 24) & 0xFF;
+            output[1 + i * 4 + 1] = (limb >> 16) & 0xFF;
+            output[1 + i * 4 + 2] = (limb >> 8) & 0xFF;
+            output[1 + i * 4 + 3] = (limb >> 0) & 0xFF;
+        }
+
+        // Append 4 zero bytes
+        output[33] = 0x00;
+        output[34] = 0x00;
+        output[35] = 0x00;
+        output[36] = 0x00;
+    }  // end grid-stride loop
 }

 // Kernel to compute BIP0352 tagged hashes on serialized EC points
@ -256,25 +286,29 @@ __global__ void SerializeToCompressedSEC1Kernel(
 __global__ void ComputeTaggedHashesKernel(
    const uint8_t *serialized,      // Input: serialized points (37 bytes each)
    uint8_t *hashes,                // Output: SHA256 hashes (32 bytes each)
-    uint32_t count) {               // Number of points
-
-    const u32 instance = blockIdx.x * blockDim.x + threadIdx.x;
-    if (instance >= count) return;
+    uint32_t count,                 // Number of points
+    uint64_t batch_id) {            // Batch ID for debugging

    // Tag for BIP-352 Silent Payments
    const char tag_str[] = "BIP0352/SharedSecret";
    const uint8_t *tag = reinterpret_cast<const uint8_t*>(tag_str);
-    const uint64_t tag_len = 20;  // Length of "BIP0352/SharedSecret" (B-I-P-0-3-5-2-/-S-h-a-r-e-d-S-e-c-r-e-t = 20 chars)
+    const uint64_t tag_len = 20;  // Length of "BIP0352/SharedSecret"

-    // Input message: 37 bytes (33-byte compressed point + 4 zero bytes)
-    const uint8_t *msg = serialized + instance * 37;
-    const uint64_t msg_len = 37;
+    // Grid-stride loop: each thread processes multiple elements
+    for (u32 instance = blockIdx.x * blockDim.x + threadIdx.x;
+         instance < count;
+         instance += gridDim.x * blockDim.x) {

-    // Output hash
-    uint8_t *hash = hashes + instance * 32;
+        // Input message: 37 bytes (33-byte compressed point + 4 zero bytes)
+        const uint8_t *msg = serialized + instance * 37;
+        const uint64_t msg_len = 37;

-    // Compute tagged hash
-    tagged_hash(tag, tag_len, msg, msg_len, hash);
+        // Output hash
+        uint8_t *hash = hashes + instance * 32;
+
+        // Compute tagged hash
+        tagged_hash(tag, tag_len, msg, msg_len, hash);
+    }
 }

 // Kernel for fixed-point multiplication: Computes hash × G for each hash
@ -282,35 +316,39 @@ __global__ void ComputeTaggedHashesKernel(
 __global__ void FixedPointMultiplyKernel(
    u32 count,
    Order::Base *scalars,           // Input: scalar values (hashes converted to scalars)
-    ECPoint::Base *results) {       // Output: EC points (affine coordinates)
+    ECPoint::Base *results,         // Output: EC points (affine coordinates)
+    uint64_t batch_id) {            // Batch ID for debugging

-    u32 instance = blockIdx.x * blockDim.x + threadIdx.x;
-    if (instance >= count) return;
+    // Grid-stride loop: each thread processes multiple elements
+    for (u32 instance = blockIdx.x * blockDim.x + threadIdx.x;
+         instance < count;
+         instance += gridDim.x * blockDim.x) {

-    // Load scalar
-    Order s;
-    #ifdef GECC_QAPW_OPT_COLUMN_MAJORED_INPUTS
-        s.load_arbitrary(scalars, count, instance, 0);
-    #else
-        s.load(scalars + instance * Order::LIMBS, 0, 0, 0);
-    #endif
+        // Load scalar
+        Order s;
+        #ifdef GECC_QAPW_OPT_COLUMN_MAJORED_INPUTS
+            s.load_arbitrary(scalars, count, instance, 0);
+        #else
+            s.load(scalars + instance * Order::LIMBS, 0, 0, 0);
+        #endif

-    // Compute s × G using fixed-point multiplication
-    // This reads from precomputed table in device constant memory
-    ECPoint p = ECPoint::zero();
-    Solver::fixed_point_mult(p, s, true);
+        // Compute s × G using fixed-point multiplication
+        // This reads from precomputed table in device constant memory
+        ECPoint p = ECPoint::zero();
+        Solver::fixed_point_mult(p, s, true);

-    // Convert Jacobian to affine coordinates
-    typename ECPoint::Affine result = p.to_affine();
+        // Convert Jacobian to affine coordinates
+        typename ECPoint::Affine result = p.to_affine();

-    // Store result
-    #ifdef GECC_QAPW_OPT_COLUMN_MAJORED_INPUTS
-        result.x.store_arbitrary(results, count, instance, 0);
-        result.y.store_arbitrary(results + count * Field::LIMBS, count, instance, 0);
-    #else
-        result.x.store(results + instance * ECPoint::Affine::LIMBS, 0, 0, 0);
-        result.y.store(results + instance * ECPoint::Affine::LIMBS + Field::LIMBS, 0, 0, 0);
-    #endif
+        // Store result
+        #ifdef GECC_QAPW_OPT_COLUMN_MAJORED_INPUTS
+            result.x.store_arbitrary(results, count, instance, 0);
+            result.y.store_arbitrary(results + count * Field::LIMBS, count, instance, 0);
+        #else
+            result.x.store(results + instance * ECPoint::Affine::LIMBS, 0, 0, 0);
+            result.y.store(results + instance * ECPoint::Affine::LIMBS + Field::LIMBS, 0, 0, 0);
+        #endif
+    }
 }

 // Kernel for EC point addition: Adds spend_public_key to each output point
@ -394,10 +432,11 @@ struct BatchScanState {
    uint32_t label_count;        // Number of label keys
    Solver *solver;              // ECDSA solver instance
    uint32_t count;
+    uint64_t batch_id;           // Unique batch identifier for debugging
 };

 // Host function to initialize solver and prepare for EC multiplication
-// This follows the pattern from gECC's ec_pmul_random_init, but with our specific data
+// This follows the pattern from gECC's ec_pmul_init, but with our specific data
 // Returns an opaque handle to BatchScanState (cast to void*) for thread-safe operation
 extern "C" void* LaunchBatchScan(
    uint32_t **managed_points_x,      // Will allocate managed memory for input points
@ -416,11 +455,11 @@ extern "C" void* LaunchBatchScan(
    size_t outputs_size) {

    // Initialize field and solver once per program (not per batch)
-    static bool initialized = false;
-    if (!initialized) {
+    // Use C++11 static initialization guarantee for thread safety
+    static bool initialized = []() {
        Solver::initialize();
-        initialized = true;
-    }
+        return true;
+    }();

    // Allocate per-batch state (thread-safe)
    BatchScanState *state = new BatchScanState();
@ -436,6 +475,9 @@ extern "C" void* LaunchBatchScan(
    state->solver = nullptr;
    state->count = count;

+    // Generate unique batch ID for debugging (use pointer address as unique ID)
+    state->batch_id = reinterpret_cast<uint64_t>(state);
+
    // Allocate managed memory for point coordinates (caller will fill these)
    cudaError_t err;
    err = cudaMallocManaged(managed_points_x, Field::SIZE * count);
@ -451,8 +493,10 @@ extern "C" void* LaunchBatchScan(
        return nullptr;
    }

-    // Allocate outputs metadata
-    err = cudaMallocManaged(&state->d_outputs, outputs_size * sizeof(int64_t));
+    // Allocate outputs metadata as DEVICE memory (not unified)
+    // CRITICAL: Use cudaMalloc instead of cudaMallocManaged to avoid coherency issues
+    // in concurrent batch processing when combined with cudaMemcpyHostToDevice
+    err = cudaMalloc(&state->d_outputs, outputs_size * sizeof(int64_t));
    if (err != cudaSuccess) {
        cudaFree(*managed_points_x);
        cudaFree(*managed_points_y);
@ -460,7 +504,7 @@ extern "C" void* LaunchBatchScan(
        return nullptr;
    }

-    err = cudaMallocManaged(&state->d_output_offsets, count * sizeof(uint32_t));
+    err = cudaMalloc(&state->d_output_offsets, count * sizeof(uint32_t));
    if (err != cudaSuccess) {
        cudaFree(*managed_points_x);
        cudaFree(*managed_points_y);
@ -468,8 +512,10 @@ extern "C" void* LaunchBatchScan(
        delete state;
        return nullptr;
    }
+    // Zero out the memory to ensure no stale data
+    cudaMemset(state->d_output_offsets, 0, count * sizeof(uint32_t));

-    err = cudaMallocManaged(&state->d_output_lengths, count * sizeof(uint32_t));
+    err = cudaMalloc(&state->d_output_lengths, count * sizeof(uint32_t));
    if (err != cudaSuccess) {
        cudaFree(*managed_points_x);
        cudaFree(*managed_points_y);
@ -478,6 +524,8 @@ extern "C" void* LaunchBatchScan(
        delete state;
        return nullptr;
    }
+    // Zero out the memory to ensure no stale data
+    cudaMemset(state->d_output_lengths, 0, count * sizeof(uint32_t));

    err = cudaMallocManaged(managed_match_flags, count * sizeof(uint8_t));
    if (err != cudaSuccess) {
@ -495,7 +543,7 @@ extern "C" void* LaunchBatchScan(

    // Allocate device memory for spend public key (8 u32 limbs each for x and y)
    constexpr u32 field_limbs = 8;
-    err = cudaMallocManaged(&state->d_spend_pubkey_x, field_limbs * sizeof(uint32_t));
+    err = cudaMalloc(&state->d_spend_pubkey_x, field_limbs * sizeof(uint32_t));
    if (err != cudaSuccess) {
        cudaFree(*managed_points_x);
        cudaFree(*managed_points_y);
@ -507,7 +555,7 @@ extern "C" void* LaunchBatchScan(
        return nullptr;
    }

-    err = cudaMallocManaged(&state->d_spend_pubkey_y, field_limbs * sizeof(uint32_t));
+    err = cudaMalloc(&state->d_spend_pubkey_y, field_limbs * sizeof(uint32_t));
    if (err != cudaSuccess) {
        cudaFree(*managed_points_x);
        cudaFree(*managed_points_y);
@ -520,18 +568,35 @@ extern "C" void* LaunchBatchScan(
        return nullptr;
    }

-    // Copy outputs metadata
-    memcpy(state->d_outputs, h_outputs, outputs_size * sizeof(int64_t));
-    memcpy(state->d_output_offsets, h_output_offsets, count * sizeof(uint32_t));
-    memcpy(state->d_output_lengths, h_output_lengths, count * sizeof(uint32_t));
+    // Copy outputs metadata using cudaMemcpy to ensure proper coherency
+    // CRITICAL: Use cudaMemcpy instead of memcpy for unified memory to ensure
+    // data is properly synchronized between host and device in concurrent execution
+    err = cudaMemcpy(state->d_outputs, h_outputs, outputs_size * sizeof(int64_t), cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) {
+        printf("cudaMemcpy d_outputs error: %s\n", cudaGetErrorString(err));
+    }
+    err = cudaMemcpy(state->d_output_offsets, h_output_offsets, count * sizeof(uint32_t), cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) {
+        printf("cudaMemcpy d_output_offsets error: %s\n", cudaGetErrorString(err));
+    }
+    err = cudaMemcpy(state->d_output_lengths, h_output_lengths, count * sizeof(uint32_t), cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) {
+        printf("cudaMemcpy d_output_lengths error: %s\n", cudaGetErrorString(err));
+    }
+
+    // Synchronize to ensure all data is copied to device before returning
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        printf("cudaDeviceSynchronize after memcpy error: %s\n", cudaGetErrorString(err));
+    }

    // Copy spend public key
-    memcpy(state->d_spend_pubkey_x, h_spend_pubkey_x, field_limbs * sizeof(uint32_t));
-    memcpy(state->d_spend_pubkey_y, h_spend_pubkey_y, field_limbs * sizeof(uint32_t));
+    cudaMemcpy(state->d_spend_pubkey_x, h_spend_pubkey_x, field_limbs * sizeof(uint32_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(state->d_spend_pubkey_y, h_spend_pubkey_y, field_limbs * sizeof(uint32_t), cudaMemcpyHostToDevice);

    // Allocate and copy label keys (if any)
    if (label_count > 0) {
-        err = cudaMallocManaged(&state->d_label_keys_x, label_count * field_limbs * sizeof(uint32_t));
+        err = cudaMalloc(&state->d_label_keys_x, label_count * field_limbs * sizeof(uint32_t));
        if (err != cudaSuccess) {
            cudaFree(*managed_points_x);
            cudaFree(*managed_points_y);
@ -545,7 +610,7 @@ extern "C" void* LaunchBatchScan(
            return nullptr;
        }

-        err = cudaMallocManaged(&state->d_label_keys_y, label_count * field_limbs * sizeof(uint32_t));
+        err = cudaMalloc(&state->d_label_keys_y, label_count * field_limbs * sizeof(uint32_t));
        if (err != cudaSuccess) {
            cudaFree(*managed_points_x);
            cudaFree(*managed_points_y);
@ -560,8 +625,8 @@ extern "C" void* LaunchBatchScan(
            return nullptr;
        }

-        memcpy(state->d_label_keys_x, h_label_keys_x, label_count * field_limbs * sizeof(uint32_t));
-        memcpy(state->d_label_keys_y, h_label_keys_y, label_count * field_limbs * sizeof(uint32_t));
+        cudaMemcpy(state->d_label_keys_x, h_label_keys_x, label_count * field_limbs * sizeof(uint32_t), cudaMemcpyHostToDevice);
+        cudaMemcpy(state->d_label_keys_y, h_label_keys_y, label_count * field_limbs * sizeof(uint32_t), cudaMemcpyHostToDevice);
    }

    // Create fresh solver for this batch
@ -595,13 +660,13 @@ extern "C" int RunBatchScanKernels(

    cudaDeviceSynchronize();

-    // Prepare data in the format expected by ec_pmul_random_init
+    // Prepare data in the format expected by ec_pmul_init
    // MAX_LIMBS is defined in gECC as 64 (maximum array size)
    // For secp256k1 (256-bit), we use 4 u64 limbs, but arrays must be size MAX_LIMBS
    constexpr u32 MAX_LIMBS = 64;
    constexpr u32 USED_LIMBS = 4;  // 256 bits = 4 u64 limbs

-    // Allocate host arrays in the format ec_pmul_random_init expects
+    // Allocate host arrays in the format ec_pmul_init expects
    u64 (*h_scalars)[MAX_LIMBS] = new u64[count][MAX_LIMBS];
    u64 (*h_keys_x)[MAX_LIMBS] = new u64[count][MAX_LIMBS];
    u64 (*h_keys_y)[MAX_LIMBS] = new u64[count][MAX_LIMBS];
@ -617,7 +682,7 @@ extern "C" int RunBatchScanKernels(
        // h_scalar array is in little-endian order: h_scalar[0] is LEAST significant u32
        // We need to pack sequentially: h_scalars[0] should be least significant u64
        //
-        // IMPORTANT: gECC's ec_pmul_random_init uses reinterpret_cast<Base*>(u64_array)
+        // IMPORTANT: gECC's ec_pmul_init uses reinterpret_cast<Base*>(u64_array)
        // which interprets each u64 as two u32s. We pack two sequential u32s into each u64.
        for (u32 j = 0; j < USED_LIMBS; j++) {
            // Pack two sequential u32s: low_u32 | (high_u32 << 32)
@ -656,8 +721,8 @@ extern "C" int RunBatchScanKernels(
        }
    #endif

-    // Call ec_pmul_random_init with our specific data
-    solver->ec_pmul_random_init(h_scalars, h_keys_x, h_keys_y, count);
+    // Call ec_pmul_init with our specific data
+    solver->ec_pmul_init(h_scalars, h_keys_x, h_keys_y, count);

    // Free host arrays
    delete[] h_scalars;
@ -667,19 +732,15 @@ extern "C" int RunBatchScanKernels(
    // Check for initialization errors
    cudaError_t err = cudaPeekAtLastError();
    if (err != cudaSuccess) {
-        printf("ec_pmul_random_init error: %s\n", cudaGetErrorString(err));
+        printf("ec_pmul_init error: %s\n", cudaGetErrorString(err));
        return -1;
    }

    // Run EC multiplication (matching ecdsa_ec_pmul call)
-    // Calculate block_num to ensure block_num * threads_per_block >= count
+    // Use MAX_SM_NUMS blocks (like gECC tests) to ensure proper work distribution
+    // The kernels use grid-stride loops, so they can handle any count with any block_num
    u32 max_thread_per_block = 256;
-    u32 block_num = (count + max_thread_per_block - 1) / max_thread_per_block;
-
-    // Ensure we use at least MAX_SM_NUMS blocks for efficiency (if count allows)
-    if (block_num < MAX_SM_NUMS && count >= MAX_SM_NUMS * max_thread_per_block) {
-        block_num = MAX_SM_NUMS;
-    }
+    u32 block_num = MAX_SM_NUMS;  // Use SM count like gECC test for optimal occupancy

    solver->ecdsa_ec_pmul(block_num, max_thread_per_block, true);  // true = unknown points

@ -690,6 +751,8 @@ extern "C" int RunBatchScanKernels(
        return -1;
    }

+    cudaDeviceSynchronize();
+
    // === BIP-352 Silent Payment Pipeline ===
    // Step 1: Serialize shared secrets to compressed SEC1 format + 4 zero bytes (37 bytes each)
    uint8_t *d_serialized;
@ -700,10 +763,12 @@ extern "C" int RunBatchScanKernels(
    }

    int threads_per_block = 256;
-    int num_blocks = (count + threads_per_block - 1) / threads_per_block;
+    // Use fixed block count with grid-stride loops to handle any batch size
+    // Kernels will process multiple elements per thread when count > blocks * threads
+    int num_blocks = MAX_SM_NUMS;  // Use SM count for good occupancy

    SerializeToCompressedSEC1Kernel<<<num_blocks, threads_per_block>>>(
-        solver->R0, d_serialized, count
+        solver->R0, d_serialized, count, state->batch_id
    );

    err = cudaDeviceSynchronize();
@ -723,7 +788,7 @@ extern "C" int RunBatchScanKernels(
    }

    ComputeTaggedHashesKernel<<<num_blocks, threads_per_block>>>(
-        d_serialized, d_hashes, count
+        d_serialized, d_hashes, count, state->batch_id
    );

    err = cudaDeviceSynchronize();
@ -734,22 +799,29 @@ extern "C" int RunBatchScanKernels(
        return -1;
    }

+    // Debug: Log hash result
    cudaFree(d_serialized);  // No longer needed

    // Step 3: Convert hashes to Order scalars for fixed-point multiplication
-    Order::Base *d_hash_scalars;
-    err = cudaMallocManaged(&d_hash_scalars, Order::SIZE * count);
+    // Copy hashes to host memory first to avoid coherency issues
+    uint8_t *h_hashes = new uint8_t[count * 32];
+    err = cudaMemcpy(h_hashes, d_hashes, count * 32, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
-        printf("cudaMallocManaged for d_hash_scalars error: %s\n", cudaGetErrorString(err));
+        printf("cudaMemcpy for h_hashes error: %s\n", cudaGetErrorString(err));
        cudaFree(d_hashes);
        return -1;
    }

+    cudaFree(d_hashes);  // No longer needed
+
+    // Allocate host buffer for conversion
+    Order::Base *h_hash_scalars = new Order::Base[Order::SIZE * count];
+
    // Convert 32-byte hashes (big-endian) to Order::Base (u32) limbs in column-major format
    #ifdef GECC_QAPW_OPT_COLUMN_MAJORED_INPUTS
        // Column-major: limb j of scalar i is at [j * count + i]
        for (u32 i = 0; i < count; i++) {
-            uint8_t *hash = d_hashes + i * 32;
+            uint8_t *hash = h_hashes + i * 32;
            // Hash is big-endian: hash[0] is most significant byte
            // Convert to u32 limbs in little-endian order
            for (u32 j = 0; j < Order::LIMBS; j++) {
@ -758,24 +830,43 @@ extern "C" int RunBatchScanKernels(
                                (static_cast<uint32_t>(hash[31 - j * 4 - 1]) << 8) |
                                (static_cast<uint32_t>(hash[31 - j * 4 - 2]) << 16) |
                                (static_cast<uint32_t>(hash[31 - j * 4 - 3]) << 24);
-                d_hash_scalars[j * count + i] = limb;
+                h_hash_scalars[j * count + i] = limb;
            }
        }
    #else
        // Row-major: limb j of scalar i is at [i * Order::LIMBS + j]
        for (u32 i = 0; i < count; i++) {
-            uint8_t *hash = d_hashes + i * 32;
+            uint8_t *hash = h_hashes + i * 32;
            for (u32 j = 0; j < Order::LIMBS; j++) {
                uint32_t limb = (static_cast<uint32_t>(hash[31 - j * 4 - 0]) << 0) |
                                (static_cast<uint32_t>(hash[31 - j * 4 - 1]) << 8) |
                                (static_cast<uint32_t>(hash[31 - j * 4 - 2]) << 16) |
                                (static_cast<uint32_t>(hash[31 - j * 4 - 3]) << 24);
-                d_hash_scalars[i * Order::LIMBS + j] = limb;
+                h_hash_scalars[i * Order::LIMBS + j] = limb;
            }
        }
    #endif

-    cudaFree(d_hashes);  // No longer needed
+    delete[] h_hashes;  // No longer needed
+
+    // Allocate device memory and copy
+    Order::Base *d_hash_scalars;
+    err = cudaMalloc(&d_hash_scalars, Order::SIZE * count);
+    if (err != cudaSuccess) {
+        printf("cudaMalloc for d_hash_scalars error: %s\n", cudaGetErrorString(err));
+        delete[] h_hash_scalars;
+        return -1;
+    }
+
+    err = cudaMemcpy(d_hash_scalars, h_hash_scalars, Order::SIZE * count, cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) {
+        printf("cudaMemcpy for d_hash_scalars error: %s\n", cudaGetErrorString(err));
+        delete[] h_hash_scalars;
+        cudaFree(d_hash_scalars);
+        return -1;
+    }
+
+    delete[] h_hash_scalars;  // No longer needed

    // Step 4: Allocate output buffer for fixed-point multiply results
    ECPoint::Base *d_fpm_results;
@ -788,7 +879,7 @@ extern "C" int RunBatchScanKernels(

    // Step 5: Fixed-point multiply: hash × G using precomputed table
    FixedPointMultiplyKernel<<<num_blocks, threads_per_block>>>(
-        count, d_hash_scalars, d_fpm_results
+        count, d_hash_scalars, d_fpm_results, state->batch_id
    );

    err = cudaDeviceSynchronize();
@ -804,6 +895,7 @@ extern "C" int RunBatchScanKernels(
    // Step 6: Check matches with label support
    // This will: (1) try base case: output_point + spend_pubkey
    //           (2) for each label: try output_point + label_key (and negated)
+
    CheckMatchesWithLabelsKernel<<<num_blocks, threads_per_block>>>(
        d_fpm_results,
        state->d_spend_pubkey_x,
@ -815,7 +907,8 @@ extern "C" int RunBatchScanKernels(
        state->d_output_offsets,
        state->d_output_lengths,
        managed_match_flags,
-        count
+        count,
+        state->batch_id
    );

    err = cudaDeviceSynchronize();
--- a/test/sql/cudasp.test
+++ b/test/sql/cudasp.test
@ -94,10 +94,10 @@ SELECT height FROM cudasp_scan((SELECT txid, height, tweak_key, outputs FROM tes
 # spend_public_key: 025cc9856d6f8375350e123978daac200c260cb5b5ae83106cab90484dcd8fcf36
 # tweak_key: 0314bec14463d6c0181083d607fecfba67bb83f95915f6f247975ec566d5642ee8
 # label_key: 034e52d154b56ffe17964bd72e1dc4478c956f3fa29e1ea7e8bdee2d2a21f963cd
-# output: -3096348491593986964 (from label case, NOT base case which produces -1006811617310360495)
+# output: -1265772155233867786 (from label case: final_point + label_key, NOT base case which produces -1006811617310360495)
 statement ok
 INSERT INTO test_data VALUES
-    (BLOB '\x00\x01\x02\x06', 400, BLOB '\xe8\x2e\x64\xd5\x66\xc5\x5e\x97\x47\xf2\xf6\x15\x59\xf9\x83\xbb\x67\xba\xcf\xfe\x07\xd6\x83\x10\x18\xc0\xd6\x63\x44\xc1\xbe\x14\xc3\x80\x32\xa4\x8f\x5b\x3c\x56\xb5\xb6\x28\x6a\x06\xc0\x27\x08\x46\xb7\xb8\x52\xcd\x31\x8d\x9a\x13\x71\x73\xa5\xb4\x1c\x2f\x84', [-3096348491593986964]);
+    (BLOB '\x00\x01\x02\x06', 400, BLOB '\xe8\x2e\x64\xd5\x66\xc5\x5e\x97\x47\xf2\xf6\x15\x59\xf9\x83\xbb\x67\xba\xcf\xfe\x07\xd6\x83\x10\x18\xc0\xd6\x63\x44\xc1\xbe\x14\xc3\x80\x32\xa4\x8f\x5b\x3c\x56\xb5\xb6\x28\x6a\x06\xc0\x27\x08\x46\xb7\xb8\x52\xcd\x31\x8d\x9a\x13\x71\x73\xa5\xb4\x1c\x2f\x84', [-1265772155233867786]);

 # Should match using the label_key
 query I