From fc6826d55dc047aa1c56ba9901f605e5d080ef26 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 9 Jul 2025 18:04:48 +0200 Subject: [PATCH 01/48] Rework --- crates/batcher/Cargo.toml | 1 + crates/batcher/src/lib.rs | 240 +++++++++++++++++------- crates/batcher/src/types/batch_state.rs | 185 ------------------ 3 files changed, 174 insertions(+), 252 deletions(-) diff --git a/crates/batcher/Cargo.toml b/crates/batcher/Cargo.toml index adca6b494c..8ecf527f74 100644 --- a/crates/batcher/Cargo.toml +++ b/crates/batcher/Cargo.toml @@ -29,6 +29,7 @@ aligned-sdk = { path = "../sdk" } ciborium = "=0.2.2" priority-queue = "2.1.0" reqwest = { version = "0.12", features = ["json"] } +dashmap = "6.0.1" once_cell = "1.20.2" warp = "0.3.7" diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index a3d0a670e3..648bdaf050 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -22,6 +22,7 @@ use std::env; use std::net::SocketAddr; use std::sync::Arc; use std::time::Duration; +use dashmap::DashMap; use aligned_sdk::common::constants::{ ADDITIONAL_SUBMISSION_GAS_COST_PER_PROOF, BATCHER_SUBMISSION_BASE_GAS_COST, @@ -85,6 +86,7 @@ pub struct Batcher { service_manager: ServiceManager, service_manager_fallback: ServiceManager, batch_state: Mutex, + user_states: DashMap>, min_block_interval: u64, transaction_wait_timeout: u64, max_proof_size: usize, @@ -211,8 +213,8 @@ impl Batcher { .await .expect("Failed to get fallback Service Manager contract"); - let mut user_states = HashMap::new(); - let mut batch_state = BatchState::new(config.batcher.max_queue_size); + let user_states = DashMap::new(); + let batch_state = BatchState::new(config.batcher.max_queue_size); let non_paying_config = if let Some(non_paying_config) = config.batcher.non_paying { warn!("Non-paying address configuration detected. Will replace non-paying address {} with configured address.", non_paying_config.address); @@ -227,11 +229,9 @@ impl Batcher { let non_paying_user_state = UserState::new(nonpaying_nonce); user_states.insert( non_paying_config.replacement.address(), - non_paying_user_state, + Mutex::new(non_paying_user_state), ); - batch_state = - BatchState::new_with_user_states(user_states, config.batcher.max_queue_size); Some(non_paying_config) } else { None @@ -275,12 +275,149 @@ impl Batcher { aggregator_gas_cost: config.batcher.aggregator_gas_cost, posting_batch: Mutex::new(false), batch_state: Mutex::new(batch_state), + user_states, disabled_verifiers: Mutex::new(disabled_verifiers), metrics, telemetry, } } + // Helper methods for user_states operations with per-address locking + + async fn get_user_nonce(&self, addr: &Address) -> Option { + let user_state = self.user_states.get(addr)?; + let user_state_guard = user_state.lock().await; + Some(user_state_guard.nonce) + } + + async fn get_user_last_max_fee_limit(&self, addr: &Address) -> Option { + let user_state = self.user_states.get(addr)?; + let user_state_guard = user_state.lock().await; + Some(user_state_guard.last_max_fee_limit) + } + + async fn get_user_total_fees_in_queue(&self, addr: &Address) -> Option { + let user_state = self.user_states.get(addr)?; + let user_state_guard = user_state.lock().await; + Some(user_state_guard.total_fees_in_queue) + } + + async fn get_user_proof_count(&self, addr: &Address) -> Option { + let user_state = self.user_states.get(addr)?; + let user_state_guard = user_state.lock().await; + Some(user_state_guard.proofs_in_batch) + } + + async fn update_user_nonce(&self, addr: &Address, new_nonce: U256) -> Option { + let user_state = self.user_states.get(addr)?; + let mut user_state_guard = user_state.lock().await; + user_state_guard.nonce = new_nonce; + Some(new_nonce) + } + + async fn update_user_max_fee_limit(&self, addr: &Address, new_max_fee_limit: U256) -> Option { + let user_state = self.user_states.get(addr)?; + let mut user_state_guard = user_state.lock().await; + user_state_guard.last_max_fee_limit = new_max_fee_limit; + Some(new_max_fee_limit) + } + + async fn update_user_proof_count(&self, addr: &Address, new_proof_count: usize) -> Option { + let user_state = self.user_states.get(addr)?; + let mut user_state_guard = user_state.lock().await; + user_state_guard.proofs_in_batch = new_proof_count; + Some(new_proof_count) + } + + async fn update_user_total_fees_in_queue(&self, addr: &Address, new_total_fees_in_queue: U256) -> Option { + let user_state = self.user_states.get(addr)?; + let mut user_state_guard = user_state.lock().await; + user_state_guard.total_fees_in_queue = new_total_fees_in_queue; + Some(new_total_fees_in_queue) + } + + async fn update_user_total_fees_in_queue_of_replacement_message( + &self, + addr: &Address, + original_max_fee: U256, + new_max_fee: U256, + ) -> Option { + let user_state = self.user_states.get(addr)?; + let mut user_state_guard = user_state.lock().await; + let fee_difference = new_max_fee - original_max_fee; + user_state_guard.total_fees_in_queue += fee_difference; + Some(user_state_guard.total_fees_in_queue) + } + + async fn update_user_state( + &self, + addr: &Address, + new_nonce: U256, + new_max_fee_limit: U256, + new_proof_count: usize, + new_total_fees_in_queue: U256, + ) -> Option<(U256, U256, usize, U256)> { + let user_state = self.user_states.get(addr)?; + let mut user_state_guard = user_state.lock().await; + user_state_guard.nonce = new_nonce; + user_state_guard.last_max_fee_limit = new_max_fee_limit; + user_state_guard.proofs_in_batch = new_proof_count; + user_state_guard.total_fees_in_queue = new_total_fees_in_queue; + Some((new_nonce, new_max_fee_limit, new_proof_count, new_total_fees_in_queue)) + } + + fn get_user_min_fee_in_batch(&self, addr: &Address, batch_queue: &types::batch_queue::BatchQueue) -> U256 { + batch_queue + .iter() + .filter(|(e, _)| &e.sender == addr) + .map(|(e, _)| e.nonced_verification_data.max_fee) + .min() + .unwrap_or(U256::max_value()) + } + + async fn update_user_state_on_entry_removal(&self, removed_entry: &types::batch_queue::BatchQueueEntry, batch_queue: &types::batch_queue::BatchQueue) -> Option<()> { + let addr = removed_entry.sender; + + let new_last_max_fee_limit = match batch_queue + .iter() + .filter(|(e, _)| e.sender == addr) + .next_back() + { + Some((last_entry, _)) => last_entry.nonced_verification_data.max_fee, + None => { + self.user_states.remove(&addr); + return Some(()); + } + }; + + let user_state = self.user_states.get(&addr)?; + let mut user_state_guard = user_state.lock().await; + user_state_guard.proofs_in_batch -= 1; + user_state_guard.nonce -= U256::one(); + user_state_guard.total_fees_in_queue -= removed_entry.nonced_verification_data.max_fee; + user_state_guard.last_max_fee_limit = new_last_max_fee_limit; + Some(()) + } + + fn calculate_new_user_states_data(&self, batch_queue: &types::batch_queue::BatchQueue) -> HashMap { + let mut updated_user_states = HashMap::new(); + for (entry, _) in batch_queue.iter() { + let addr = entry.sender; + let max_fee = entry.nonced_verification_data.max_fee; + + let (proof_count, max_fee_limit, total_fees_in_queue) = updated_user_states + .entry(addr) + .or_insert((0, max_fee, U256::zero())); + + *proof_count += 1; + *total_fees_in_queue += max_fee; + if max_fee < *max_fee_limit { + *max_fee_limit = max_fee; + } + } + updated_user_states + } + pub async fn listen_connections(self: Arc, address: &str) -> Result<(), BatcherError> { // Create the event loop and TCP listener we'll accept connections on. let listener = TcpListener::bind(address) @@ -562,10 +699,7 @@ impl Batcher { address = replacement_addr; } - let cached_user_nonce = { - let batch_state_lock = self.batch_state.lock().await; - batch_state_lock.get_user_nonce(&address).await - }; + let cached_user_nonce = self.get_user_nonce(&address).await; let user_nonce = if let Some(user_nonce) = cached_user_nonce { user_nonce @@ -715,11 +849,7 @@ impl Batcher { // If it was not present, then the user nonce is queried to the Aligned contract. // Lastly, we get a lock of the batch state again and insert the user state if it was still missing. - let is_user_in_state: bool; - { - let batch_state_lock = self.batch_state.lock().await; - is_user_in_state = batch_state_lock.user_states.contains_key(&addr); - } + let is_user_in_state = self.user_states.contains_key(&addr); if !is_user_in_state { let ethereum_user_nonce = match self.get_user_nonce_from_ethereum(addr).await { @@ -738,11 +868,7 @@ impl Batcher { } }; let user_state = UserState::new(ethereum_user_nonce); - let mut batch_state_lock = self.batch_state.lock().await; - batch_state_lock - .user_states - .entry(addr) - .or_insert(user_state); + self.user_states.entry(addr).or_insert(Mutex::new(user_state)); } // * ---------------------------------------------------* @@ -760,17 +886,8 @@ impl Batcher { return Ok(()); }; - // For now on until the message is fully processed, the batch state is locked - // This is needed because we need to query the user state to make validations and - // finally add the proof to the batch queue. - - let mut batch_state_lock = self.batch_state.lock().await; - let msg_max_fee = nonced_verification_data.max_fee; - let Some(user_last_max_fee_limit) = - batch_state_lock.get_user_last_max_fee_limit(&addr).await - else { - std::mem::drop(batch_state_lock); + let Some(user_last_max_fee_limit) = self.get_user_last_max_fee_limit(&addr).await else { send_message( ws_conn_sink.clone(), SubmitProofResponseMessage::AddToBatchError, @@ -780,9 +897,7 @@ impl Batcher { return Ok(()); }; - let Some(user_accumulated_fee) = batch_state_lock.get_user_total_fees_in_queue(&addr).await - else { - std::mem::drop(batch_state_lock); + let Some(user_accumulated_fee) = self.get_user_total_fees_in_queue(&addr).await else { send_message( ws_conn_sink.clone(), SubmitProofResponseMessage::AddToBatchError, @@ -793,7 +908,6 @@ impl Batcher { }; if !self.verify_user_has_enough_balance(user_balance, user_accumulated_fee, msg_max_fee) { - std::mem::drop(batch_state_lock); send_message( ws_conn_sink.clone(), SubmitProofResponseMessage::InsufficientBalance(addr), @@ -803,11 +917,10 @@ impl Batcher { return Ok(()); } - let cached_user_nonce = batch_state_lock.get_user_nonce(&addr).await; + let cached_user_nonce = self.get_user_nonce(&addr).await; let Some(expected_nonce) = cached_user_nonce else { error!("Failed to get cached user nonce: User not found in user states, but it should have been already inserted"); - std::mem::drop(batch_state_lock); send_message( ws_conn_sink.clone(), SubmitProofResponseMessage::AddToBatchError, @@ -818,7 +931,6 @@ impl Batcher { }; if expected_nonce < msg_nonce { - std::mem::drop(batch_state_lock); warn!("Invalid nonce for address {addr}, expected nonce: {expected_nonce:?}, received nonce: {msg_nonce:?}"); send_message( ws_conn_sink.clone(), @@ -834,7 +946,6 @@ impl Batcher { if expected_nonce > msg_nonce { info!("Possible replacement message received: Expected nonce {expected_nonce:?} - message nonce: {msg_nonce:?}"); self.handle_replacement_message( - batch_state_lock, nonced_verification_data, ws_conn_sink.clone(), client_msg.signature, @@ -848,7 +959,6 @@ impl Batcher { // We check this after replacement logic because if user wants to replace a proof, their // new_max_fee must be greater or equal than old_max_fee if msg_max_fee > user_last_max_fee_limit { - std::mem::drop(batch_state_lock); warn!("Invalid max fee for address {addr}, had fee limit of {user_last_max_fee_limit:?}, sent {msg_max_fee:?}"); send_message( ws_conn_sink.clone(), @@ -863,6 +973,7 @@ impl Batcher { // * Perform validation over batcher queue * // * ---------------------------------------------------------------------* + let mut batch_state_lock = self.batch_state.lock().await; if batch_state_lock.is_queue_full() { debug!("Batch queue is full. Evaluating if the incoming proof can replace a lower-priority entry."); @@ -895,7 +1006,7 @@ impl Batcher { removed_entry.nonced_verification_data.nonce ); - batch_state_lock.update_user_state_on_entry_removal(&removed_entry); + self.update_user_state_on_entry_removal(&removed_entry, &batch_state_lock.batch_queue).await; if let Some(removed_entry_ws) = removed_entry.messaging_sink { send_message( @@ -969,7 +1080,6 @@ impl Batcher { /// Returns true if the message was replaced in the batch, false otherwise async fn handle_replacement_message( &self, - mut batch_state_lock: MutexGuard<'_, BatchState>, nonced_verification_data: NoncedVerificationData, ws_conn_sink: WsMessageSink, signature: Signature, @@ -977,6 +1087,7 @@ impl Batcher { ) { let replacement_max_fee = nonced_verification_data.max_fee; let nonce = nonced_verification_data.nonce; + let mut batch_state_lock = self.batch_state.lock().await; let Some(entry) = batch_state_lock.get_entry(addr, nonce) else { std::mem::drop(batch_state_lock); warn!("Invalid nonce for address {addr}. Queue entry with nonce {nonce} not found"); @@ -1059,9 +1170,10 @@ impl Batcher { ); // update max_fee_limit - let updated_max_fee_limit_in_batch = batch_state_lock.get_user_min_fee_in_batch(&addr); - if batch_state_lock + let updated_max_fee_limit_in_batch = self.get_user_min_fee_in_batch(&addr, &batch_state_lock.batch_queue); + if self .update_user_max_fee_limit(&addr, updated_max_fee_limit_in_batch) + .await .is_none() { std::mem::drop(batch_state_lock); @@ -1075,12 +1187,13 @@ impl Batcher { }; // update total_fees_in_queue - if batch_state_lock + if self .update_user_total_fees_in_queue_of_replacement_message( &addr, original_max_fee, replacement_max_fee, ) + .await .is_none() { std::mem::drop(batch_state_lock); @@ -1162,10 +1275,7 @@ impl Batcher { info!("Current batch queue length: {}", queue_len); - let Some(user_proof_count) = batch_state_lock - .get_user_proof_count(&proof_submitter_addr) - .await - else { + let Some(user_proof_count) = self.get_user_proof_count(&proof_submitter_addr).await else { error!("User state of address {proof_submitter_addr} was not found when trying to update user state. This user state should have been present"); std::mem::drop(batch_state_lock); return Err(BatcherError::AddressNotFoundInUserStates( @@ -1173,10 +1283,7 @@ impl Batcher { )); }; - let Some(current_total_fees_in_queue) = batch_state_lock - .get_user_total_fees_in_queue(&proof_submitter_addr) - .await - else { + let Some(current_total_fees_in_queue) = self.get_user_total_fees_in_queue(&proof_submitter_addr).await else { error!("User state of address {proof_submitter_addr} was not found when trying to update user state. This user state should have been present"); std::mem::drop(batch_state_lock); return Err(BatcherError::AddressNotFoundInUserStates( @@ -1185,7 +1292,7 @@ impl Batcher { }; // User state is updated - if batch_state_lock + if self .update_user_state( &proof_submitter_addr, nonce + U256::one(), @@ -1193,6 +1300,7 @@ impl Batcher { user_proof_count + 1, current_total_fees_in_queue + max_fee, ) + .await .is_none() { error!("User state of address {proof_submitter_addr} was not found when trying to update user state. This user state should have been present"); @@ -1298,10 +1406,9 @@ impl Batcher { }); // now we calculate the new user_states - let new_user_states = // proofs, max_fee_limit, total_fees_in_queue - batch_state_lock.calculate_new_user_states_data(); + let new_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); - let user_addresses: Vec
= batch_state_lock.user_states.keys().cloned().collect(); + let user_addresses: Vec
= self.user_states.iter().map(|entry| *entry.key()).collect(); let default_value = (0, U256::MAX, U256::zero()); for addr in user_addresses.iter() { let (proof_count, max_fee_limit, total_fees_in_queue) = @@ -1313,18 +1420,18 @@ impl Batcher { // informative error. // Now we update the user states related to the batch (proof count in batch and min fee in batch) - batch_state_lock - .update_user_proof_count(addr, *proof_count) + self.update_user_proof_count(addr, *proof_count) + .await .ok_or(BatcherError::QueueRemoveError( "Could not update_user_proof_count".into(), ))?; - batch_state_lock - .update_user_max_fee_limit(addr, *max_fee_limit) + self.update_user_max_fee_limit(addr, *max_fee_limit) + .await .ok_or(BatcherError::QueueRemoveError( "Could not update_user_max_fee_limit".into(), ))?; - batch_state_lock - .update_user_total_fees_in_queue(addr, *total_fees_in_queue) + self.update_user_total_fees_in_queue(addr, *total_fees_in_queue) + .await .ok_or(BatcherError::QueueRemoveError( "Could not update_user_total_fees_in_queue".into(), ))?; @@ -1463,7 +1570,7 @@ impl Batcher { let Some(nonpaying_replacement_addr) = self.get_nonpaying_replacement_addr() else { batch_state_lock.batch_queue.clear(); - batch_state_lock.user_states.clear(); + self.user_states.clear(); return; }; @@ -1475,15 +1582,14 @@ impl Batcher { .await else { batch_state_lock.batch_queue.clear(); - batch_state_lock.user_states.clear(); + self.user_states.clear(); return; }; batch_state_lock.batch_queue.clear(); - batch_state_lock.user_states.clear(); + self.user_states.clear(); let nonpaying_user_state = UserState::new(nonpaying_replacement_addr_nonce); - batch_state_lock - .user_states - .insert(nonpaying_replacement_addr, nonpaying_user_state); + self.user_states + .insert(nonpaying_replacement_addr, Mutex::new(nonpaying_user_state)); self.metrics.update_queue_metrics(0, 0); } diff --git a/crates/batcher/src/types/batch_state.rs b/crates/batcher/src/types/batch_state.rs index 481ca44f74..d28985ef7d 100644 --- a/crates/batcher/src/types/batch_state.rs +++ b/crates/batcher/src/types/batch_state.rs @@ -1,15 +1,11 @@ -use std::collections::{hash_map::Entry, HashMap}; - use super::{ batch_queue::{BatchQueue, BatchQueueEntry}, - user_state::UserState, }; use ethers::types::{Address, U256}; use log::debug; pub(crate) struct BatchState { pub(crate) batch_queue: BatchQueue, - pub(crate) user_states: HashMap, pub(crate) max_size: usize, } @@ -19,21 +15,10 @@ impl BatchState { pub(crate) fn new(max_size: usize) -> Self { Self { batch_queue: BatchQueue::new(), - user_states: HashMap::new(), max_size, } } - pub(crate) fn new_with_user_states( - user_states: HashMap, - max_size: usize, - ) -> Self { - Self { - batch_queue: BatchQueue::new(), - user_states, - max_size, - } - } // GETTERS: @@ -44,29 +29,6 @@ impl BatchState { .find(|entry| entry.sender == sender && entry.nonced_verification_data.nonce == nonce) } - pub(crate) fn get_user_state(&self, addr: &Address) -> Option<&UserState> { - self.user_states.get(addr) - } - - pub(crate) async fn get_user_nonce(&self, addr: &Address) -> Option { - let user_state = self.get_user_state(addr)?; - Some(user_state.nonce) - } - - pub(crate) async fn get_user_last_max_fee_limit(&self, addr: &Address) -> Option { - let user_state = self.get_user_state(addr)?; - Some(user_state.last_max_fee_limit) - } - - pub(crate) async fn get_user_total_fees_in_queue(&self, addr: &Address) -> Option { - let user_state = self.get_user_state(addr)?; - Some(user_state.total_fees_in_queue) - } - - pub(crate) async fn get_user_proof_count(&self, addr: &Address) -> Option { - let user_state = self.get_user_state(addr)?; - Some(user_state.proofs_in_batch) - } pub(crate) fn get_user_min_fee_in_batch(&self, addr: &Address) -> U256 { self.batch_queue @@ -77,126 +39,9 @@ impl BatchState { .unwrap_or(U256::max_value()) } - // SETTERS: - - pub(crate) fn update_user_max_fee_limit( - &mut self, - addr: &Address, - new_max_fee_limit: U256, - ) -> Option { - // TODO refactor to return Result, or something less redundant - if let Entry::Occupied(mut user_state) = self.user_states.entry(*addr) { - user_state.get_mut().last_max_fee_limit = new_max_fee_limit; - return Some(new_max_fee_limit); - } - None - } - - pub(crate) fn update_user_proof_count( - &mut self, - addr: &Address, - new_proof_count: usize, - ) -> Option { - // TODO refactor to return Result, or something less redundant - if let Entry::Occupied(mut user_state) = self.user_states.entry(*addr) { - user_state.get_mut().proofs_in_batch = new_proof_count; - return Some(new_proof_count); - } - None - } - - pub(crate) fn update_user_nonce(&mut self, addr: &Address, new_nonce: U256) -> Option { - // TODO refactor to return Result, or something less redundant - if let Entry::Occupied(mut user_state) = self.user_states.entry(*addr) { - user_state.get_mut().nonce = new_nonce; - return Some(new_nonce); - } - None - } - - pub(crate) fn update_user_total_fees_in_queue( - &mut self, - addr: &Address, - new_total_fees_in_queue: U256, - ) -> Option { - // TODO refactor to return Result, or something less redundant - if let Entry::Occupied(mut user_state) = self.user_states.entry(*addr) { - user_state.get_mut().total_fees_in_queue = new_total_fees_in_queue; - return Some(new_total_fees_in_queue); - } - None - } - - pub(crate) fn update_user_total_fees_in_queue_of_replacement_message( - &mut self, - addr: &Address, - original_max_fee: U256, - new_max_fee: U256, - ) -> Option { - // TODO refactor to return Result, or something less redundant - let fee_difference = new_max_fee - original_max_fee; //here we already know new_max_fee > original_max_fee - if let Entry::Occupied(mut user_state) = self.user_states.entry(*addr) { - user_state.get_mut().total_fees_in_queue += fee_difference; - return Some(user_state.get().total_fees_in_queue); - } - None - } - - /// Updates the user with address `addr` with the provided values of - /// `new_nonce`, `new_max_fee_limit`, `new_proof_count` and `new_total_fees_in_queue` - /// If state is updated successfully, returns the updated values inside a `Some()` - /// If the address was not found in the user states, returns `None` - pub(crate) fn update_user_state( - &mut self, - addr: &Address, - new_nonce: U256, - new_max_fee_limit: U256, - new_proof_count: usize, - new_total_fees_in_queue: U256, - ) -> Option<(U256, U256, usize, U256)> { - // TODO refactor to return Result, or something less redundant - let updated_nonce = self.update_user_nonce(addr, new_nonce); - let updated_max_fee_limit = self.update_user_max_fee_limit(addr, new_max_fee_limit); - let updated_proof_count = self.update_user_proof_count(addr, new_proof_count); - let updated_total_fees_in_queue = - self.update_user_total_fees_in_queue(addr, new_total_fees_in_queue); - - if updated_nonce.is_some() - && updated_max_fee_limit.is_some() - && updated_proof_count.is_some() - && updated_total_fees_in_queue.is_some() - { - return Some(( - new_nonce, - new_max_fee_limit, - new_proof_count, - new_total_fees_in_queue, - )); - } - None - } // LOGIC: - pub(crate) fn calculate_new_user_states_data(&self) -> HashMap { - let mut updated_user_states = HashMap::new(); // address -> (proof_count, max_fee_limit, total_fees_in_queue) - for (entry, _) in self.batch_queue.iter() { - let addr = entry.sender; - let max_fee = entry.nonced_verification_data.max_fee; - - let (proof_count, max_fee_limit, total_fees_in_queue) = updated_user_states - .entry(addr) - .or_insert((0, max_fee, U256::zero())); - - *proof_count += 1; - *total_fees_in_queue += max_fee; - if max_fee < *max_fee_limit { - *max_fee_limit = max_fee; - } - } - - updated_user_states - } /// Checks if the entry is valid /// An entry is valid if there is no entry with the same sender, lower nonce and a lower fee @@ -221,36 +66,6 @@ impl BatchState { }) } - /// Updates or removes a user's state when their latest proof entry is removed from the batch queue. - /// - /// If the user has no other proofs remaining in the queue, their state is removed entirely. - /// Otherwise, the user's state is updated to reflect the next most recent entry in the queue. - /// - /// Note: The given `removed_entry` must be the most recent (latest or highest nonce) entry for the user in the queue. - pub(crate) fn update_user_state_on_entry_removal(&mut self, removed_entry: &BatchQueueEntry) { - let addr = removed_entry.sender; - - let new_last_max_fee_limit = match self - .batch_queue - .iter() - .filter(|(e, _)| e.sender == addr) - .next_back() - { - Some((last_entry, _)) => last_entry.nonced_verification_data.max_fee, - None => { - self.user_states.remove(&addr); - return; - } - }; - - if let Entry::Occupied(mut user_state) = self.user_states.entry(addr) { - user_state.get_mut().proofs_in_batch -= 1; - user_state.get_mut().nonce -= U256::one(); - user_state.get_mut().total_fees_in_queue -= - removed_entry.nonced_verification_data.max_fee; - user_state.get_mut().last_max_fee_limit = new_last_max_fee_limit; - } - } pub(crate) fn is_queue_full(&self) -> bool { self.batch_queue.len() >= self.max_size From 173d4c8ad8ee2ca2af4ce0daeb8b909b2df5c0a6 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Fri, 11 Jul 2025 12:46:48 +0200 Subject: [PATCH 02/48] Remove all functions that takes and drops lock --- crates/batcher/src/lib.rs | 275 ++++++++++++++++---------------------- 1 file changed, 112 insertions(+), 163 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 648bdaf050..de20846f17 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -282,89 +282,6 @@ impl Batcher { } } - // Helper methods for user_states operations with per-address locking - - async fn get_user_nonce(&self, addr: &Address) -> Option { - let user_state = self.user_states.get(addr)?; - let user_state_guard = user_state.lock().await; - Some(user_state_guard.nonce) - } - - async fn get_user_last_max_fee_limit(&self, addr: &Address) -> Option { - let user_state = self.user_states.get(addr)?; - let user_state_guard = user_state.lock().await; - Some(user_state_guard.last_max_fee_limit) - } - - async fn get_user_total_fees_in_queue(&self, addr: &Address) -> Option { - let user_state = self.user_states.get(addr)?; - let user_state_guard = user_state.lock().await; - Some(user_state_guard.total_fees_in_queue) - } - - async fn get_user_proof_count(&self, addr: &Address) -> Option { - let user_state = self.user_states.get(addr)?; - let user_state_guard = user_state.lock().await; - Some(user_state_guard.proofs_in_batch) - } - - async fn update_user_nonce(&self, addr: &Address, new_nonce: U256) -> Option { - let user_state = self.user_states.get(addr)?; - let mut user_state_guard = user_state.lock().await; - user_state_guard.nonce = new_nonce; - Some(new_nonce) - } - - async fn update_user_max_fee_limit(&self, addr: &Address, new_max_fee_limit: U256) -> Option { - let user_state = self.user_states.get(addr)?; - let mut user_state_guard = user_state.lock().await; - user_state_guard.last_max_fee_limit = new_max_fee_limit; - Some(new_max_fee_limit) - } - - async fn update_user_proof_count(&self, addr: &Address, new_proof_count: usize) -> Option { - let user_state = self.user_states.get(addr)?; - let mut user_state_guard = user_state.lock().await; - user_state_guard.proofs_in_batch = new_proof_count; - Some(new_proof_count) - } - - async fn update_user_total_fees_in_queue(&self, addr: &Address, new_total_fees_in_queue: U256) -> Option { - let user_state = self.user_states.get(addr)?; - let mut user_state_guard = user_state.lock().await; - user_state_guard.total_fees_in_queue = new_total_fees_in_queue; - Some(new_total_fees_in_queue) - } - - async fn update_user_total_fees_in_queue_of_replacement_message( - &self, - addr: &Address, - original_max_fee: U256, - new_max_fee: U256, - ) -> Option { - let user_state = self.user_states.get(addr)?; - let mut user_state_guard = user_state.lock().await; - let fee_difference = new_max_fee - original_max_fee; - user_state_guard.total_fees_in_queue += fee_difference; - Some(user_state_guard.total_fees_in_queue) - } - - async fn update_user_state( - &self, - addr: &Address, - new_nonce: U256, - new_max_fee_limit: U256, - new_proof_count: usize, - new_total_fees_in_queue: U256, - ) -> Option<(U256, U256, usize, U256)> { - let user_state = self.user_states.get(addr)?; - let mut user_state_guard = user_state.lock().await; - user_state_guard.nonce = new_nonce; - user_state_guard.last_max_fee_limit = new_max_fee_limit; - user_state_guard.proofs_in_batch = new_proof_count; - user_state_guard.total_fees_in_queue = new_total_fees_in_queue; - Some((new_nonce, new_max_fee_limit, new_proof_count, new_total_fees_in_queue)) - } fn get_user_min_fee_in_batch(&self, addr: &Address, batch_queue: &types::batch_queue::BatchQueue) -> U256 { batch_queue @@ -699,7 +616,16 @@ impl Batcher { address = replacement_addr; } - let cached_user_nonce = self.get_user_nonce(&address).await; + let cached_user_nonce = { + let user_state = self.user_states.get(&address); + match user_state { + Some(user_state) => { + let user_state_guard = user_state.lock().await; + Some(user_state_guard.nonce) + } + None => None, + } + }; let user_nonce = if let Some(user_nonce) = cached_user_nonce { user_nonce @@ -887,7 +813,17 @@ impl Batcher { }; let msg_max_fee = nonced_verification_data.max_fee; - let Some(user_last_max_fee_limit) = self.get_user_last_max_fee_limit(&addr).await else { + let user_last_max_fee_limit = { + let user_state = self.user_states.get(&addr); + match user_state { + Some(user_state) => { + let user_state_guard = user_state.lock().await; + Some(user_state_guard.last_max_fee_limit) + } + None => None, + } + }; + let Some(user_last_max_fee_limit) = user_last_max_fee_limit else { send_message( ws_conn_sink.clone(), SubmitProofResponseMessage::AddToBatchError, @@ -897,7 +833,17 @@ impl Batcher { return Ok(()); }; - let Some(user_accumulated_fee) = self.get_user_total_fees_in_queue(&addr).await else { + let user_accumulated_fee = { + let user_state = self.user_states.get(&addr); + match user_state { + Some(user_state) => { + let user_state_guard = user_state.lock().await; + Some(user_state_guard.total_fees_in_queue) + } + None => None, + } + }; + let Some(user_accumulated_fee) = user_accumulated_fee else { send_message( ws_conn_sink.clone(), SubmitProofResponseMessage::AddToBatchError, @@ -917,7 +863,16 @@ impl Batcher { return Ok(()); } - let cached_user_nonce = self.get_user_nonce(&addr).await; + let cached_user_nonce = { + let user_state = self.user_states.get(&addr); + match user_state { + Some(user_state) => { + let user_state_guard = user_state.lock().await; + Some(user_state_guard.nonce) + } + None => None, + } + }; let Some(expected_nonce) = cached_user_nonce else { error!("Failed to get cached user nonce: User not found in user states, but it should have been already inserted"); @@ -1171,39 +1126,46 @@ impl Batcher { // update max_fee_limit let updated_max_fee_limit_in_batch = self.get_user_min_fee_in_batch(&addr, &batch_state_lock.batch_queue); - if self - .update_user_max_fee_limit(&addr, updated_max_fee_limit_in_batch) - .await - .is_none() { - std::mem::drop(batch_state_lock); - warn!("User state for address {addr:?} was not present in batcher user states, but it should be"); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::AddToBatchError, - ) - .await; - return; - }; + let user_state = self.user_states.get(&addr); + match user_state { + Some(user_state) => { + let mut user_state_guard = user_state.lock().await; + user_state_guard.last_max_fee_limit = updated_max_fee_limit_in_batch; + } + None => { + std::mem::drop(batch_state_lock); + warn!("User state for address {addr:?} was not present in batcher user states, but it should be"); + send_message( + ws_conn_sink.clone(), + SubmitProofResponseMessage::AddToBatchError, + ) + .await; + return; + } + } + } // update total_fees_in_queue - if self - .update_user_total_fees_in_queue_of_replacement_message( - &addr, - original_max_fee, - replacement_max_fee, - ) - .await - .is_none() { - std::mem::drop(batch_state_lock); - warn!("User state for address {addr:?} was not present in batcher user states, but it should be"); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::AddToBatchError, - ) - .await; - }; + let user_state = self.user_states.get(&addr); + match user_state { + Some(user_state) => { + let mut user_state_guard = user_state.lock().await; + let fee_difference = replacement_max_fee - original_max_fee; + user_state_guard.total_fees_in_queue += fee_difference; + } + None => { + std::mem::drop(batch_state_lock); + warn!("User state for address {addr:?} was not present in batcher user states, but it should be"); + send_message( + ws_conn_sink.clone(), + SubmitProofResponseMessage::AddToBatchError, + ) + .await; + } + } + } } async fn disabled_verifiers(&self) -> Result> { @@ -1275,40 +1237,26 @@ impl Batcher { info!("Current batch queue length: {}", queue_len); - let Some(user_proof_count) = self.get_user_proof_count(&proof_submitter_addr).await else { - error!("User state of address {proof_submitter_addr} was not found when trying to update user state. This user state should have been present"); - std::mem::drop(batch_state_lock); - return Err(BatcherError::AddressNotFoundInUserStates( - proof_submitter_addr, - )); - }; - - let Some(current_total_fees_in_queue) = self.get_user_total_fees_in_queue(&proof_submitter_addr).await else { - error!("User state of address {proof_submitter_addr} was not found when trying to update user state. This user state should have been present"); - std::mem::drop(batch_state_lock); - return Err(BatcherError::AddressNotFoundInUserStates( - proof_submitter_addr, - )); - }; - // User state is updated - if self - .update_user_state( - &proof_submitter_addr, - nonce + U256::one(), - max_fee, - user_proof_count + 1, - current_total_fees_in_queue + max_fee, - ) - .await - .is_none() { - error!("User state of address {proof_submitter_addr} was not found when trying to update user state. This user state should have been present"); - std::mem::drop(batch_state_lock); - return Err(BatcherError::AddressNotFoundInUserStates( - proof_submitter_addr, - )); - }; + let user_state = self.user_states.get(&proof_submitter_addr); + match user_state { + Some(user_state) => { + let mut user_state_guard = user_state.lock().await; + user_state_guard.nonce = nonce + U256::one(); + user_state_guard.last_max_fee_limit = max_fee; + user_state_guard.proofs_in_batch += 1; + user_state_guard.total_fees_in_queue += max_fee; + } + None => { + error!("User state of address {proof_submitter_addr} was not found when trying to update user state. This user state should have been present"); + std::mem::drop(batch_state_lock); + return Err(BatcherError::AddressNotFoundInUserStates( + proof_submitter_addr, + )); + } + } + } Ok(()) } @@ -1420,21 +1368,22 @@ impl Batcher { // informative error. // Now we update the user states related to the batch (proof count in batch and min fee in batch) - self.update_user_proof_count(addr, *proof_count) - .await - .ok_or(BatcherError::QueueRemoveError( - "Could not update_user_proof_count".into(), - ))?; - self.update_user_max_fee_limit(addr, *max_fee_limit) - .await - .ok_or(BatcherError::QueueRemoveError( - "Could not update_user_max_fee_limit".into(), - ))?; - self.update_user_total_fees_in_queue(addr, *total_fees_in_queue) - .await - .ok_or(BatcherError::QueueRemoveError( - "Could not update_user_total_fees_in_queue".into(), - ))?; + { + let user_state = self.user_states.get(addr); + match user_state { + Some(user_state) => { + let mut user_state_guard = user_state.lock().await; + user_state_guard.proofs_in_batch = *proof_count; + user_state_guard.last_max_fee_limit = *max_fee_limit; + user_state_guard.total_fees_in_queue = *total_fees_in_queue; + } + None => { + return Err(BatcherError::QueueRemoveError( + "Could not update user state".into(), + )); + } + } + } } // Update metrics From db618777bfa51c8e99ba5ada12180dd34d88f52e Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 16 Jul 2025 17:29:33 -0300 Subject: [PATCH 03/48] Add lock per user state --- crates/Cargo.lock | 15 +++++++++++++++ crates/batcher/src/lib.rs | 35 ++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/crates/Cargo.lock b/crates/Cargo.lock index a58e1adf86..2c9f47c26c 100644 --- a/crates/Cargo.lock +++ b/crates/Cargo.lock @@ -103,6 +103,7 @@ dependencies = [ "bytes", "ciborium", "clap", + "dashmap", "dotenvy", "env_logger", "ethers", @@ -2039,6 +2040,20 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "dashu" version = "0.4.2" diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index de20846f17..8af88ce02d 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -86,7 +86,7 @@ pub struct Batcher { service_manager: ServiceManager, service_manager_fallback: ServiceManager, batch_state: Mutex, - user_states: DashMap>, + user_states: DashMap>>, min_block_interval: u64, transaction_wait_timeout: u64, max_proof_size: usize, @@ -229,7 +229,7 @@ impl Batcher { let non_paying_user_state = UserState::new(nonpaying_nonce); user_states.insert( non_paying_config.replacement.address(), - Mutex::new(non_paying_user_state), + Arc::new(Mutex::new(non_paying_user_state)), ); Some(non_paying_config) @@ -762,7 +762,6 @@ impl Batcher { } } - info!("Handling message"); // We don't need a batch state lock here, since if the user locks its funds // after the check, some blocks should pass until he can withdraw. @@ -771,12 +770,37 @@ impl Batcher { return Ok(()); } + info!("Handling message, locking user state"); + + + // We acquire the lock first only to query if the user is already present and the lock is dropped. // If it was not present, then the user nonce is queried to the Aligned contract. // Lastly, we get a lock of the batch state again and insert the user state if it was still missing. let is_user_in_state = self.user_states.contains_key(&addr); + + if !is_user_in_state { + // We add a dummy user state to grab a lock on the user state + let dummy_user_state = UserState::new(U256::zero()); + self.user_states.insert(addr, Arc::new(Mutex::new(dummy_user_state))); + } + + let Some(user_state_ref) = self.user_states.get(&addr) else { + error!("This should never happen, user state has previously been inserted if it didn't exist"); + send_message( + ws_conn_sink.clone(), + SubmitProofResponseMessage::AddToBatchError, + ) + .await; + self.metrics.user_error(&["batcher_state_error", ""]); + return Ok(()); + }; + + // We acquire the lock on the user state, now everything will be processed sequentially + let _user_state_guard = user_state_ref.lock().await; + // If the user state was not present, we need to get the nonce from the Ethereum contract and update the dummy user state if !is_user_in_state { let ethereum_user_nonce = match self.get_user_nonce_from_ethereum(addr).await { Ok(ethereum_user_nonce) => ethereum_user_nonce, @@ -794,7 +818,7 @@ impl Batcher { } }; let user_state = UserState::new(ethereum_user_nonce); - self.user_states.entry(addr).or_insert(Mutex::new(user_state)); + self.user_states.entry(addr).or_insert(Arc::new(Mutex::new(user_state))); } // * ---------------------------------------------------* @@ -823,6 +847,7 @@ impl Batcher { None => None, } }; + let Some(user_last_max_fee_limit) = user_last_max_fee_limit else { send_message( ws_conn_sink.clone(), @@ -1538,7 +1563,7 @@ impl Batcher { self.user_states.clear(); let nonpaying_user_state = UserState::new(nonpaying_replacement_addr_nonce); self.user_states - .insert(nonpaying_replacement_addr, Mutex::new(nonpaying_user_state)); + .insert(nonpaying_replacement_addr, Arc::new(Mutex::new(nonpaying_user_state))); self.metrics.update_queue_metrics(0, 0); } From ade564834a31fdf1c2a9c2e141184063bf24ddea Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 16 Jul 2025 17:37:16 -0300 Subject: [PATCH 04/48] Fmt --- crates/batcher/src/lib.rs | 52 +++++++++++++++++-------- crates/batcher/src/types/batch_state.rs | 9 +---- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 8af88ce02d..3e2b7cba66 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -17,12 +17,12 @@ use types::batch_state::BatchState; use types::user_state::UserState; use batch_queue::calculate_batch_size; +use dashmap::DashMap; use std::collections::HashMap; use std::env; use std::net::SocketAddr; use std::sync::Arc; use std::time::Duration; -use dashmap::DashMap; use aligned_sdk::common::constants::{ ADDITIONAL_SUBMISSION_GAS_COST_PER_PROOF, BATCHER_SUBMISSION_BASE_GAS_COST, @@ -282,8 +282,11 @@ impl Batcher { } } - - fn get_user_min_fee_in_batch(&self, addr: &Address, batch_queue: &types::batch_queue::BatchQueue) -> U256 { + fn get_user_min_fee_in_batch( + &self, + addr: &Address, + batch_queue: &types::batch_queue::BatchQueue, + ) -> U256 { batch_queue .iter() .filter(|(e, _)| &e.sender == addr) @@ -292,7 +295,11 @@ impl Batcher { .unwrap_or(U256::max_value()) } - async fn update_user_state_on_entry_removal(&self, removed_entry: &types::batch_queue::BatchQueueEntry, batch_queue: &types::batch_queue::BatchQueue) -> Option<()> { + async fn update_user_state_on_entry_removal( + &self, + removed_entry: &types::batch_queue::BatchQueueEntry, + batch_queue: &types::batch_queue::BatchQueue, + ) -> Option<()> { let addr = removed_entry.sender; let new_last_max_fee_limit = match batch_queue @@ -316,7 +323,10 @@ impl Batcher { Some(()) } - fn calculate_new_user_states_data(&self, batch_queue: &types::batch_queue::BatchQueue) -> HashMap { + fn calculate_new_user_states_data( + &self, + batch_queue: &types::batch_queue::BatchQueue, + ) -> HashMap { let mut updated_user_states = HashMap::new(); for (entry, _) in batch_queue.iter() { let addr = entry.sender; @@ -762,7 +772,6 @@ impl Batcher { } } - // We don't need a batch state lock here, since if the user locks its funds // after the check, some blocks should pass until he can withdraw. // It is safe to do just do this here. @@ -772,20 +781,19 @@ impl Batcher { info!("Handling message, locking user state"); - - // We acquire the lock first only to query if the user is already present and the lock is dropped. // If it was not present, then the user nonce is queried to the Aligned contract. // Lastly, we get a lock of the batch state again and insert the user state if it was still missing. let is_user_in_state = self.user_states.contains_key(&addr); - + if !is_user_in_state { // We add a dummy user state to grab a lock on the user state let dummy_user_state = UserState::new(U256::zero()); - self.user_states.insert(addr, Arc::new(Mutex::new(dummy_user_state))); + self.user_states + .insert(addr, Arc::new(Mutex::new(dummy_user_state))); } - + let Some(user_state_ref) = self.user_states.get(&addr) else { error!("This should never happen, user state has previously been inserted if it didn't exist"); send_message( @@ -818,7 +826,9 @@ impl Batcher { } }; let user_state = UserState::new(ethereum_user_nonce); - self.user_states.entry(addr).or_insert(Arc::new(Mutex::new(user_state))); + self.user_states + .entry(addr) + .or_insert(Arc::new(Mutex::new(user_state))); } // * ---------------------------------------------------* @@ -986,7 +996,11 @@ impl Batcher { removed_entry.nonced_verification_data.nonce ); - self.update_user_state_on_entry_removal(&removed_entry, &batch_state_lock.batch_queue).await; + self.update_user_state_on_entry_removal( + &removed_entry, + &batch_state_lock.batch_queue, + ) + .await; if let Some(removed_entry_ws) = removed_entry.messaging_sink { send_message( @@ -1150,7 +1164,8 @@ impl Batcher { ); // update max_fee_limit - let updated_max_fee_limit_in_batch = self.get_user_min_fee_in_batch(&addr, &batch_state_lock.batch_queue); + let updated_max_fee_limit_in_batch = + self.get_user_min_fee_in_batch(&addr, &batch_state_lock.batch_queue); { let user_state = self.user_states.get(&addr); match user_state { @@ -1381,7 +1396,8 @@ impl Batcher { // now we calculate the new user_states let new_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); - let user_addresses: Vec
= self.user_states.iter().map(|entry| *entry.key()).collect(); + let user_addresses: Vec
= + self.user_states.iter().map(|entry| *entry.key()).collect(); let default_value = (0, U256::MAX, U256::zero()); for addr in user_addresses.iter() { let (proof_count, max_fee_limit, total_fees_in_queue) = @@ -1562,8 +1578,10 @@ impl Batcher { batch_state_lock.batch_queue.clear(); self.user_states.clear(); let nonpaying_user_state = UserState::new(nonpaying_replacement_addr_nonce); - self.user_states - .insert(nonpaying_replacement_addr, Arc::new(Mutex::new(nonpaying_user_state))); + self.user_states.insert( + nonpaying_replacement_addr, + Arc::new(Mutex::new(nonpaying_user_state)), + ); self.metrics.update_queue_metrics(0, 0); } diff --git a/crates/batcher/src/types/batch_state.rs b/crates/batcher/src/types/batch_state.rs index d28985ef7d..1530195192 100644 --- a/crates/batcher/src/types/batch_state.rs +++ b/crates/batcher/src/types/batch_state.rs @@ -1,6 +1,4 @@ -use super::{ - batch_queue::{BatchQueue, BatchQueueEntry}, -}; +use super::batch_queue::{BatchQueue, BatchQueueEntry}; use ethers::types::{Address, U256}; use log::debug; @@ -19,7 +17,6 @@ impl BatchState { } } - // GETTERS: pub(crate) fn get_entry(&self, sender: Address, nonce: U256) -> Option<&BatchQueueEntry> { @@ -29,7 +26,6 @@ impl BatchState { .find(|entry| entry.sender == sender && entry.nonced_verification_data.nonce == nonce) } - pub(crate) fn get_user_min_fee_in_batch(&self, addr: &Address) -> U256 { self.batch_queue .iter() @@ -39,10 +35,8 @@ impl BatchState { .unwrap_or(U256::max_value()) } - // LOGIC: - /// Checks if the entry is valid /// An entry is valid if there is no entry with the same sender, lower nonce and a lower fee pub(crate) fn replacement_entry_is_valid( @@ -66,7 +60,6 @@ impl BatchState { }) } - pub(crate) fn is_queue_full(&self) -> bool { self.batch_queue.len() >= self.max_size } From 4afe0f07d64ea7b5adb6f32eb156452174321f3f Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 16 Jul 2025 17:37:52 -0300 Subject: [PATCH 05/48] Update cargo lock --- .../aggregation_programs/sp1/Cargo.lock | 734 ++++++++++++++++++ 1 file changed, 734 insertions(+) create mode 100644 aggregation_mode/aggregation_programs/sp1/Cargo.lock diff --git a/aggregation_mode/aggregation_programs/sp1/Cargo.lock b/aggregation_mode/aggregation_programs/sp1/Cargo.lock new file mode 100644 index 0000000000..20a817514c --- /dev/null +++ b/aggregation_mode/aggregation_programs/sp1/Cargo.lock @@ -0,0 +1,734 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + +[[package]] +name = "blake3" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1b094a32014c3d1f3944e4808e0e7c70e97dae0660886a8eb6dbc52d745badc" + +[[package]] +name = "cc" +version = "1.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0fc897dc1e865cc67c0e05a836d9d3f1df3cbe442aa4a9473b18e12624a4951" +dependencies = [ + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "gcd" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d758ba1b47b00caf47f24925c0074ecb20d6dfcffe7f6d53395c0465674841a" + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "keccak" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc2af9a1119c51f12a14607e783cb977bde58bc069ff0c3da1095e635d70654" +dependencies = [ + "cpufeatures", +] + +[[package]] +name = "lambdaworks-crypto" +version = "0.12.0" +source = "git+https://github.com/lambdaclass/lambdaworks.git?rev=5f8f2cfcc8a1a22f77e8dff2d581f1166eefb80b#5f8f2cfcc8a1a22f77e8dff2d581f1166eefb80b" +dependencies = [ + "lambdaworks-math", + "rand", + "rand_chacha", + "serde", + "sha2 0.10.9", + "sha3 0.10.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lambdaworks-math" +version = "0.12.0" +source = "git+https://github.com/lambdaclass/lambdaworks.git?rev=5f8f2cfcc8a1a22f77e8dff2d581f1166eefb80b#5f8f2cfcc8a1a22f77e8dff2d581f1166eefb80b" +dependencies = [ + "getrandom", + "rand", + "serde", + "serde_json", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "p3-baby-bear" +version = "0.2.3-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7521838ecab2ddf4f7bc4ceebad06ec02414729598485c1ada516c39900820e8" +dependencies = [ + "num-bigint", + "p3-field", + "p3-mds", + "p3-poseidon2", + "p3-symmetric", + "rand", + "serde", +] + +[[package]] +name = "p3-dft" +version = "0.2.3-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46414daedd796f1eefcdc1811c0484e4bced5729486b6eaba9521c572c76761a" +dependencies = [ + "p3-field", + "p3-matrix", + "p3-maybe-rayon", + "p3-util", + "tracing", +] + +[[package]] +name = "p3-field" +version = "0.2.3-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48948a0516b349e9d1cdb95e7236a6ee010c44e68c5cc78b4b92bf1c4022a0d9" +dependencies = [ + "itertools", + "num-bigint", + "num-traits", + "p3-util", + "rand", + "serde", +] + +[[package]] +name = "p3-matrix" +version = "0.2.3-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e4de3f373589477cb735ea58e125898ed20935e03664b4614c7fac258b3c42f" +dependencies = [ + "itertools", + "p3-field", + "p3-maybe-rayon", + "p3-util", + "rand", + "serde", + "tracing", +] + +[[package]] +name = "p3-maybe-rayon" +version = "0.2.3-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3968ad1160310296eb04f91a5f4edfa38fe1d6b2b8cd6b5c64e6f9b7370979e" + +[[package]] +name = "p3-mds" +version = "0.2.3-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2356b1ed0add6d5dfbf7a338ce534a6fde827374394a52cec16a0840af6e97c9" +dependencies = [ + "itertools", + "p3-dft", + "p3-field", + "p3-matrix", + "p3-symmetric", + "p3-util", + "rand", +] + +[[package]] +name = "p3-poseidon2" +version = "0.2.3-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da1eec7e1b6900581bedd95e76e1ef4975608dd55be9872c9d257a8a9651c3a" +dependencies = [ + "gcd", + "p3-field", + "p3-mds", + "p3-symmetric", + "rand", + "serde", +] + +[[package]] +name = "p3-symmetric" +version = "0.2.3-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edb439bea1d822623b41ff4b51e3309e80d13cadf8b86d16ffd5e6efb9fdc360" +dependencies = [ + "itertools", + "p3-field", + "serde", +] + +[[package]] +name = "p3-util" +version = "0.2.3-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c2c2010678b9332b563eaa38364915b585c1a94b5ca61e2c7541c087ddda5c" +dependencies = [ + "serde", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "git+https://github.com/sp1-patches/RustCrypto-hashes?tag=sha2-v0.10.8-patch-v1#1f224388fdede7cef649bce0d63876d1a9e3f515" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha3" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60" +dependencies = [ + "digest", + "keccak", +] + +[[package]] +name = "sha3" +version = "0.10.8" +source = "git+https://github.com/sp1-patches/RustCrypto-hashes?tag=sha3-v0.10.8-patch-v1#8f6d303c0861ba7e5adcc36207c0f41fe5edaabc" +dependencies = [ + "digest", + "keccak", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "sp1-lib" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03046db52868c1b60e8acffa0777ef6dc11ec1bbbb10b9eb612a871f69c8d3f6" +dependencies = [ + "bincode", + "serde", + "sp1-primitives", +] + +[[package]] +name = "sp1-primitives" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6939d6b2f63e54e5fbd208a0293027608f22511741b62fe32b6f67f6c144e0c0" +dependencies = [ + "bincode", + "blake3", + "cfg-if", + "hex", + "lazy_static", + "num-bigint", + "p3-baby-bear", + "p3-field", + "p3-poseidon2", + "p3-symmetric", + "serde", + "sha2 0.10.9", +] + +[[package]] +name = "sp1-zkvm" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e69fef4d915b10072461e52fd616ca2625409ede7b37a36ec910e1a52bd860" +dependencies = [ + "cfg-if", + "getrandom", + "lazy_static", + "libm", + "p3-baby-bear", + "p3-field", + "rand", + "sha2 0.10.9", + "sp1-lib", + "sp1-primitives", +] + +[[package]] +name = "sp1_aggregation_program" +version = "0.1.0" +dependencies = [ + "lambdaworks-crypto", + "serde", + "serde_json", + "sha2 0.10.8", + "sha3 0.10.8 (git+https://github.com/sp1-patches/RustCrypto-hashes?tag=sha3-v0.10.8-patch-v1)", + "sp1-zkvm", +] + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" +dependencies = [ + "once_cell", +] + +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "zerocopy" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] From 2cfa935840b810b47a777a10b68410625f7a6622 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 16 Jul 2025 17:51:55 -0300 Subject: [PATCH 06/48] Handle everything with the state locked --- crates/batcher/src/lib.rs | 106 +++++++------------------------------- 1 file changed, 18 insertions(+), 88 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 3e2b7cba66..3b43a2b71f 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -806,7 +806,7 @@ impl Batcher { }; // We acquire the lock on the user state, now everything will be processed sequentially - let _user_state_guard = user_state_ref.lock().await; + let mut user_state_guard = user_state_ref.lock().await; // If the user state was not present, we need to get the nonce from the Ethereum contract and update the dummy user state if !is_user_in_state { @@ -825,10 +825,8 @@ impl Batcher { return Ok(()); } }; - let user_state = UserState::new(ethereum_user_nonce); - self.user_states - .entry(addr) - .or_insert(Arc::new(Mutex::new(user_state))); + // Update the dummy user state with the correct nonce + user_state_guard.nonce = ethereum_user_nonce; } // * ---------------------------------------------------* @@ -847,46 +845,9 @@ impl Batcher { }; let msg_max_fee = nonced_verification_data.max_fee; - let user_last_max_fee_limit = { - let user_state = self.user_states.get(&addr); - match user_state { - Some(user_state) => { - let user_state_guard = user_state.lock().await; - Some(user_state_guard.last_max_fee_limit) - } - None => None, - } - }; + let user_last_max_fee_limit = user_state_guard.last_max_fee_limit; - let Some(user_last_max_fee_limit) = user_last_max_fee_limit else { - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::AddToBatchError, - ) - .await; - self.metrics.user_error(&["batcher_state_error", ""]); - return Ok(()); - }; - - let user_accumulated_fee = { - let user_state = self.user_states.get(&addr); - match user_state { - Some(user_state) => { - let user_state_guard = user_state.lock().await; - Some(user_state_guard.total_fees_in_queue) - } - None => None, - } - }; - let Some(user_accumulated_fee) = user_accumulated_fee else { - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::AddToBatchError, - ) - .await; - self.metrics.user_error(&["batcher_state_error", ""]); - return Ok(()); - }; + let user_accumulated_fee = user_state_guard.total_fees_in_queue; if !self.verify_user_has_enough_balance(user_balance, user_accumulated_fee, msg_max_fee) { send_message( @@ -898,27 +859,7 @@ impl Batcher { return Ok(()); } - let cached_user_nonce = { - let user_state = self.user_states.get(&addr); - match user_state { - Some(user_state) => { - let user_state_guard = user_state.lock().await; - Some(user_state_guard.nonce) - } - None => None, - } - }; - - let Some(expected_nonce) = cached_user_nonce else { - error!("Failed to get cached user nonce: User not found in user states, but it should have been already inserted"); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::AddToBatchError, - ) - .await; - self.metrics.user_error(&["batcher_state_error", ""]); - return Ok(()); - }; + let expected_nonce = user_state_guard.nonce; if expected_nonce < msg_nonce { warn!("Invalid nonce for address {addr}, expected nonce: {expected_nonce:?}, received nonce: {msg_nonce:?}"); @@ -1032,7 +973,7 @@ impl Batcher { if let Err(e) = self .add_to_batch( batch_state_lock, - nonced_verification_data, + &nonced_verification_data, ws_conn_sink.clone(), signature, addr, @@ -1045,6 +986,14 @@ impl Batcher { return Ok(()); }; + // Update user state now that entry has been successfully added to batch + let max_fee = nonced_verification_data.max_fee; + let nonce = nonced_verification_data.nonce; + user_state_guard.nonce = nonce + U256::one(); + user_state_guard.last_max_fee_limit = max_fee; + user_state_guard.proofs_in_batch += 1; + user_state_guard.total_fees_in_queue += max_fee; + info!("Verification data message handled"); Ok(()) } @@ -1247,7 +1196,7 @@ impl Batcher { async fn add_to_batch( &self, mut batch_state_lock: MutexGuard<'_, BatchState>, - verification_data: NoncedVerificationData, + verification_data: &NoncedVerificationData, ws_conn_sink: WsMessageSink, proof_submitter_sig: Signature, proof_submitter_addr: Address, @@ -1260,7 +1209,7 @@ impl Batcher { let nonce = verification_data.nonce; batch_state_lock.batch_queue.push( BatchQueueEntry::new( - verification_data, + verification_data.clone(), verification_data_comm, ws_conn_sink, proof_submitter_sig, @@ -1277,26 +1226,7 @@ impl Batcher { info!("Current batch queue length: {}", queue_len); - // User state is updated - { - let user_state = self.user_states.get(&proof_submitter_addr); - match user_state { - Some(user_state) => { - let mut user_state_guard = user_state.lock().await; - user_state_guard.nonce = nonce + U256::one(); - user_state_guard.last_max_fee_limit = max_fee; - user_state_guard.proofs_in_batch += 1; - user_state_guard.total_fees_in_queue += max_fee; - } - None => { - error!("User state of address {proof_submitter_addr} was not found when trying to update user state. This user state should have been present"); - std::mem::drop(batch_state_lock); - return Err(BatcherError::AddressNotFoundInUserStates( - proof_submitter_addr, - )); - } - } - } + // User state will be updated by the caller who already has the lock Ok(()) } From dc40c0a4ad13b8228dd69ad254c5a62520154601 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 16 Jul 2025 18:02:57 -0300 Subject: [PATCH 07/48] Rename function --- crates/batcher/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 3b43a2b71f..b814462ced 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -295,7 +295,7 @@ impl Batcher { .unwrap_or(U256::max_value()) } - async fn update_user_state_on_entry_removal( + async fn update_evicted_user_state( &self, removed_entry: &types::batch_queue::BatchQueueEntry, batch_queue: &types::batch_queue::BatchQueue, @@ -937,7 +937,7 @@ impl Batcher { removed_entry.nonced_verification_data.nonce ); - self.update_user_state_on_entry_removal( + self.update_evicted_user_state( &removed_entry, &batch_state_lock.batch_queue, ) From 60b79b9e05eca130934cb9106403953a2b8e79aa Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 16 Jul 2025 18:15:35 -0300 Subject: [PATCH 08/48] Rename variables --- crates/batcher/src/lib.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index b814462ced..7c15203bee 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -627,10 +627,10 @@ impl Batcher { } let cached_user_nonce = { - let user_state = self.user_states.get(&address); - match user_state { - Some(user_state) => { - let user_state_guard = user_state.lock().await; + let user_state_ref = self.user_states.get(&address); + match user_state_ref { + Some(user_state_ref) => { + let user_state_guard = user_state_ref.lock().await; Some(user_state_guard.nonce) } None => None, From fc18d6c77a24536e8aa432d18fc2ce95bd24302c Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 16 Jul 2025 18:15:47 -0300 Subject: [PATCH 09/48] fmt --- crates/batcher/src/lib.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 7c15203bee..d189f470ba 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -937,11 +937,8 @@ impl Batcher { removed_entry.nonced_verification_data.nonce ); - self.update_evicted_user_state( - &removed_entry, - &batch_state_lock.batch_queue, - ) - .await; + self.update_evicted_user_state(&removed_entry, &batch_state_lock.batch_queue) + .await; if let Some(removed_entry_ws) = removed_entry.messaging_sink { send_message( From 60164312c957ad6f6ad382aa9a0afa4af1c2aef2 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 16 Jul 2025 18:35:37 -0300 Subject: [PATCH 10/48] Fix lib --- crates/batcher/src/lib.rs | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index d189f470ba..0659310bfa 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -282,18 +282,6 @@ impl Batcher { } } - fn get_user_min_fee_in_batch( - &self, - addr: &Address, - batch_queue: &types::batch_queue::BatchQueue, - ) -> U256 { - batch_queue - .iter() - .filter(|(e, _)| &e.sender == addr) - .map(|(e, _)| e.nonced_verification_data.max_fee) - .min() - .unwrap_or(U256::max_value()) - } async fn update_evicted_user_state( &self, @@ -1111,7 +1099,7 @@ impl Batcher { // update max_fee_limit let updated_max_fee_limit_in_batch = - self.get_user_min_fee_in_batch(&addr, &batch_state_lock.batch_queue); + batch_state_lock.get_user_min_fee_in_batch(&addr); { let user_state = self.user_states.get(&addr); match user_state { From fb381033308f71107c33db2dbf8f6647cd973d50 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Fri, 18 Jul 2025 11:23:09 -0300 Subject: [PATCH 11/48] Re organize data --- crates/batcher/src/lib.rs | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 0659310bfa..1ad37ca30f 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -73,6 +73,7 @@ mod zk_utils; pub const LISTEN_NEW_BLOCKS_MAX_TIMES: usize = usize::MAX; pub struct Batcher { + // Configuration parameters s3_client: S3Client, s3_bucket_name: String, download_endpoint: String, @@ -85,20 +86,39 @@ pub struct Batcher { payment_service_fallback: BatcherPaymentService, service_manager: ServiceManager, service_manager_fallback: ServiceManager, - batch_state: Mutex, - user_states: DashMap>>, min_block_interval: u64, transaction_wait_timeout: u64, max_proof_size: usize, max_batch_byte_size: usize, max_batch_proof_qty: usize, - last_uploaded_batch_block: Mutex, pre_verification_is_enabled: bool, non_paying_config: Option, - posting_batch: Mutex, - disabled_verifiers: Mutex, aggregator_fee_percentage_multiplier: u128, aggregator_gas_cost: u128, + + // Shared state (Mutex) + /// The general business rule is: + /// - User processing can be done in parallel unless a batch creation is happening + /// - Batch creation needs to be able to change all the states, so all processing + /// needs to be stopped, and all user_states locks need to be taken + batch_state: Mutex, + user_states: DashMap>>, + /// When posting a task, this is taken as a write to stop new threads to update + /// user_states, ideally we would want a bigger mutex on the whole user_states, but this can't be done + batch_processing_lock: RwLock<()>, + + last_uploaded_batch_block: Mutex, + + /// This is used to avoid multiple batches being submitted at the same time + /// It could be removed in the future by changing how we spawn + /// the batch creation task + posting_batch: Mutex, + + + disabled_verifiers: Mutex, + + + // Observability and monitoring pub metrics: metrics::BatcherMetrics, pub telemetry: TelemetrySender, } @@ -274,6 +294,7 @@ impl Batcher { .aggregator_fee_percentage_multiplier, aggregator_gas_cost: config.batcher.aggregator_gas_cost, posting_batch: Mutex::new(false), + batch_processing_lock: RwLock::new(()), batch_state: Mutex::new(batch_state), user_states, disabled_verifiers: Mutex::new(disabled_verifiers), @@ -667,6 +688,9 @@ impl Batcher { client_msg: Box, ws_conn_sink: WsMessageSink, ) -> Result<(), Error> { + // Acquire read lock to allow concurrent user processing but block during batch creation + let _batch_processing_guard = self.batch_processing_lock.read().await; + let msg_nonce = client_msg.verification_data.nonce; debug!("Received message with nonce: {msg_nonce:?}"); self.metrics.received_proofs.inc(); @@ -1365,6 +1389,9 @@ impl Batcher { finalized_batch: Vec, gas_price: U256, ) -> Result<(), BatcherError> { + // Acquire write lock to ensure exclusive access during batch creation (blocks all user processing) + let _batch_processing_guard = self.batch_processing_lock.write().await; + let nonced_batch_verifcation_data: Vec = finalized_batch .clone() .into_iter() @@ -1528,6 +1555,8 @@ impl Batcher { / U256::from(PERCENTAGE_DIVIDER); if let Some(finalized_batch) = self.is_batch_ready(block_number, modified_gas_price).await { + // TODO (Mauro): There is a race condition here, + let batch_finalization_result = self .finalize_batch(block_number, finalized_batch, modified_gas_price) .await; From 1c4a59e41de2147ecd0d4fd336488d6685163521 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Fri, 18 Jul 2025 15:55:11 -0300 Subject: [PATCH 12/48] Checkpoint: Happy path --- crates/batcher/src/lib.rs | 188 +++++++++++++++++++++++++++++--------- 1 file changed, 147 insertions(+), 41 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 1ad37ca30f..026ca6959b 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -95,7 +95,7 @@ pub struct Batcher { non_paying_config: Option, aggregator_fee_percentage_multiplier: u128, aggregator_gas_cost: u128, - + // Shared state (Mutex) /// The general business rule is: /// - User processing can be done in parallel unless a batch creation is happening @@ -103,8 +103,8 @@ pub struct Batcher { /// needs to be stopped, and all user_states locks need to be taken batch_state: Mutex, user_states: DashMap>>, - /// When posting a task, this is taken as a write to stop new threads to update - /// user_states, ideally we would want a bigger mutex on the whole user_states, but this can't be done + /// When posting a task, this is taken as a write to stop new threads to update + /// user_states, ideally we would want a bigger mutex on the whole user_states, but this can't be done batch_processing_lock: RwLock<()>, last_uploaded_batch_block: Mutex, @@ -114,10 +114,8 @@ pub struct Batcher { /// the batch creation task posting_batch: Mutex, - disabled_verifiers: Mutex, - - + // Observability and monitoring pub metrics: metrics::BatcherMetrics, pub telemetry: TelemetrySender, @@ -303,7 +301,6 @@ impl Batcher { } } - async fn update_evicted_user_state( &self, removed_entry: &types::batch_queue::BatchQueueEntry, @@ -690,7 +687,7 @@ impl Batcher { ) -> Result<(), Error> { // Acquire read lock to allow concurrent user processing but block during batch creation let _batch_processing_guard = self.batch_processing_lock.read().await; - + let msg_nonce = client_msg.verification_data.nonce; debug!("Received message with nonce: {msg_nonce:?}"); self.metrics.received_proofs.inc(); @@ -1122,8 +1119,7 @@ impl Batcher { ); // update max_fee_limit - let updated_max_fee_limit_in_batch = - batch_state_lock.get_user_min_fee_in_batch(&addr); + let updated_max_fee_limit_in_batch = batch_state_lock.get_user_min_fee_in_batch(&addr); { let user_state = self.user_states.get(&addr); match user_state { @@ -1254,8 +1250,8 @@ impl Batcher { /// an empty batch, even if the block interval has been reached. /// Once the batch meets the conditions for submission, the finalized batch is then passed to the /// `finalize_batch` function. - /// This function doesn't remove the proofs from the queue. - async fn is_batch_ready( + /// This function removes the proofs from the queue immediately to avoid race conditions. + async fn extract_batch_if_ready( &self, block_number: u64, gas_price: U256, @@ -1291,9 +1287,12 @@ impl Batcher { // Set the batch posting flag to true *batch_posting = true; - let batch_queue_copy = batch_state_lock.batch_queue.clone(); - let finalized_batch = batch_queue::try_build_batch( - batch_queue_copy, + + // PHASE 1: Extract the batch directly from the queue to avoid race conditions + let mut batch_state_lock = batch_state_lock; // Make mutable + + let finalized_batch = batch_queue::extract_batch_directly( + &mut batch_state_lock.batch_queue, gas_price, self.max_batch_byte_size, self.max_batch_proof_qty, @@ -1313,26 +1312,31 @@ impl Batcher { }) .ok()?; + info!( + "Extracted {} proofs from queue for batch processing", + finalized_batch.len() + ); + + // PHASE 1.5: Update user states immediately after batch extraction to make the operation atomic + // We assume the batch posting will be successful, so we update user states now + if let Err(e) = self.update_user_states_after_batch_extraction(&batch_state_lock).await { + error!("Failed to update user states after batch extraction: {:?}", e); + // We could potentially put the batch back in the queue here if needed + *batch_posting = false; + return None; + } + Some(finalized_batch) } - /// Takes the submitted proofs and removes them from the queue. - /// This function should be called only AFTER the submission was confirmed onchain - async fn remove_proofs_from_queue( - &self, - finalized_batch: Vec, - ) -> Result<(), BatcherError> { - info!("Removing proofs from queue..."); - let mut batch_state_lock = self.batch_state.lock().await; - - finalized_batch.iter().for_each(|entry| { - if batch_state_lock.batch_queue.remove(entry).is_none() { - // If this happens, we have a bug in our code - error!("Some proofs were not found in the queue. This should not happen."); - } - }); + /// Updates user states after successful batch submission. + /// This function should be called only AFTER the submission was confirmed onchain. + /// Note: Proofs were already removed from the queue during the extraction phase. + async fn update_user_states_after_batch_submission(&self) -> Result<(), BatcherError> { + info!("Updating user states after batch submission..."); + let batch_state_lock = self.batch_state.lock().await; - // now we calculate the new user_states + // Calculate the new user_states based on the current queue (proofs already removed) let new_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); let user_addresses: Vec
= @@ -1376,6 +1380,105 @@ impl Batcher { Ok(()) } + /// Updates user states immediately after batch extraction to make the operation atomic. + /// This function should be called right after extracting proofs from the queue. + /// We assume the batch posting will be successful and update user states optimistically. + /// IMPORTANT: Preserves last_max_fee_limit when users have no proofs left in queue. + async fn update_user_states_after_batch_extraction( + &self, + batch_state_lock: &tokio::sync::MutexGuard<'_, crate::types::batch_state::BatchState>, + ) -> Result<(), BatcherError> { + info!("Updating user states after batch extraction..."); + + // Calculate the new user_states based on the current queue (proofs already removed) + let new_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); + + let user_addresses: Vec
= + self.user_states.iter().map(|entry| *entry.key()).collect(); + + for addr in user_addresses.iter() { + // FIXME: The case where a the update functions return `None` can only happen when the user was not found + // in the `user_states` map should not really happen here, but doing this check so that we don't unwrap. + // Once https://github.com/yetanotherco/aligned_layer/issues/1046 is done we could return a more + // informative error. + + // Now we update the user states related to the batch (proof count in batch and min fee in batch) + { + let user_state = self.user_states.get(addr); + match user_state { + Some(user_state) => { + let mut user_state_guard = user_state.lock().await; + + if let Some((proof_count, max_fee_limit, total_fees_in_queue)) = new_user_states.get(addr) { + // User still has proofs in the queue + user_state_guard.proofs_in_batch = *proof_count; + user_state_guard.last_max_fee_limit = *max_fee_limit; + user_state_guard.total_fees_in_queue = *total_fees_in_queue; + } else { + // User has no more proofs in the queue - only update count and total fees + // but preserve the last_max_fee_limit to avoid setting it to U256::MAX + // This is important for rollback scenarios where we need to restore proofs + user_state_guard.proofs_in_batch = 0; + user_state_guard.total_fees_in_queue = U256::zero(); + // Keep user_state_guard.last_max_fee_limit unchanged + } + } + None => { + return Err(BatcherError::QueueRemoveError( + "Could not update user state".into(), + )); + } + } + } + } + + // Update metrics + let queue_len = batch_state_lock.batch_queue.len(); + let queue_size_bytes = calculate_batch_size(&batch_state_lock.batch_queue)?; + + self.metrics + .update_queue_metrics(queue_len as i64, queue_size_bytes as i64); + + Ok(()) + } + + /// Cleans up user states after successful batch submission. + /// Resets last_max_fee_limit to U256::MAX for users who had proofs in the submitted batch + /// but now have no proofs left in the queue. + fn cleanup_user_states_after_successful_submission(&self, finalized_batch: &[BatchQueueEntry]) { + use std::collections::HashSet; + + // Get unique users from the submitted batch + let users_in_batch: HashSet
= finalized_batch.iter() + .map(|entry| entry.sender) + .collect(); + + // Check current queue state to see which users still have proofs + let batch_state_lock = match self.batch_state.try_lock() { + Ok(lock) => lock, + Err(_) => { + // If we can't get the lock, skip cleanup - it's not critical + warn!("Could not acquire batch state lock for user state cleanup"); + return; + } + }; + + let current_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); + + // For each user in the batch, check if they now have no proofs left + for user_addr in users_in_batch { + if !current_user_states.contains_key(&user_addr) { + // User has no proofs left in queue - reset their max_fee_limit + if let Some(user_state_ref) = self.user_states.get(&user_addr) { + if let Ok(mut user_state_guard) = user_state_ref.try_lock() { + user_state_guard.last_max_fee_limit = U256::max_value(); + } + // If we can't get the lock, skip this user - not critical + } + } + } + } + /// Takes the finalized batch as input and: /// builds the merkle tree /// posts verification data batch to s3 @@ -1391,7 +1494,7 @@ impl Batcher { ) -> Result<(), BatcherError> { // Acquire write lock to ensure exclusive access during batch creation (blocks all user processing) let _batch_processing_guard = self.batch_processing_lock.write().await; - + let nonced_batch_verifcation_data: Vec = finalized_batch .clone() .into_iter() @@ -1468,8 +1571,8 @@ impl Batcher { BatcherError::TransactionSendError( TransactionSendError::SubmissionInsufficientBalance, ) => { - // TODO calling remove_proofs_from_queue here is a better solution, flushing only the failed batch - // this would also need a message sent to the clients + // TODO: In the future, we should re-add the failed batch back to the queue + // For now, we flush everything as a safety measure self.flush_queue_and_clear_nonce_cache().await; } _ => { @@ -1480,11 +1583,11 @@ impl Batcher { return Err(e); }; - // Once the submit is succesfull, we remove the submitted proofs from the queue - // TODO handle error case: - if let Err(e) = self.remove_proofs_from_queue(finalized_batch.clone()).await { - error!("Unexpected error while updating queue: {:?}", e); - } + // Note: Proofs were already removed from the queue during extraction phase + // User states were also already updated atomically during extraction + + // Clean up user states for users who had proofs in this batch but now have no proofs left + self.cleanup_user_states_after_successful_submission(&finalized_batch); connection::send_batch_inclusion_data_responses(finalized_batch, &batch_merkle_tree).await } @@ -1554,8 +1657,11 @@ impl Batcher { let modified_gas_price = gas_price * U256::from(GAS_PRICE_PERCENTAGE_MULTIPLIER) / U256::from(PERCENTAGE_DIVIDER); - if let Some(finalized_batch) = self.is_batch_ready(block_number, modified_gas_price).await { - // TODO (Mauro): There is a race condition here, + // TODO (Mauro): Take all the user locks here + if let Some(finalized_batch) = self + .extract_batch_if_ready(block_number, modified_gas_price) + .await + { let batch_finalization_result = self .finalize_batch(block_number, finalized_batch, modified_gas_price) From e60ad68f2a23fe82c1a8c24009bafe860ade8359 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Fri, 18 Jul 2025 16:30:07 -0300 Subject: [PATCH 13/48] Restore proofs --- crates/batcher/src/lib.rs | 122 +++++++++++++++++++++++++++++++++++--- 1 file changed, 113 insertions(+), 9 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 026ca6959b..45e24f920b 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -1479,6 +1479,107 @@ impl Batcher { } } + /// Restores proofs to the queue after batch submission failure. + /// Uses similar logic to user proof submission, including handling queue capacity. + /// NOTE: Nonce ordering is preserved by the priority queue's eviction order: + /// - Lower fees get evicted first + /// - For same fees, higher nonces get evicted first + /// This ensures we never have nonce N+1 without nonce N in the queue. + async fn restore_proofs_after_batch_failure(&self, failed_batch: &[BatchQueueEntry]) { + info!("Restoring {} proofs to queue after batch failure", failed_batch.len()); + + let mut batch_state_lock = self.batch_state.lock().await; + let mut restored_entries = Vec::new(); + + for entry in failed_batch { + let priority = BatchQueueEntryPriority::new( + entry.nonced_verification_data.max_fee, + entry.nonced_verification_data.nonce, + ); + + // Check if queue is full + if batch_state_lock.is_queue_full() { + // Use same logic as user submission - evict lowest priority if this one is higher + if let Some((lowest_entry, _)) = batch_state_lock.batch_queue.peek() { + let lowest_fee = lowest_entry.nonced_verification_data.max_fee; + let restore_fee = entry.nonced_verification_data.max_fee; + + if restore_fee > lowest_fee { + // Evict the lowest priority entry (preserves nonce ordering) + if let Some((evicted_entry, _)) = batch_state_lock.batch_queue.pop() { + warn!("Queue full during restoration, evicting proof from sender {} with nonce {} (fee: {})", + evicted_entry.sender, evicted_entry.nonced_verification_data.nonce, evicted_entry.nonced_verification_data.max_fee); + + // Update user state for evicted entry + self.update_evicted_user_state(&evicted_entry, &batch_state_lock.batch_queue).await; + + // Notify the evicted user via websocket + if let Some(evicted_ws_sink) = evicted_entry.messaging_sink { + connection::send_message( + evicted_ws_sink, + aligned_sdk::common::types::SubmitProofResponseMessage::UnderpricedProof, + ) + .await; + } + } + } else { + warn!("Queue full and restored proof has lower priority, dropping proof from sender {} with nonce {} (fee: {})", + entry.sender, entry.nonced_verification_data.nonce, entry.nonced_verification_data.max_fee); + continue; + } + } + } + + // Add the proof back to the queue + batch_state_lock.batch_queue.push(entry.clone(), priority); + restored_entries.push(entry); + } + + info!("Restored {} proofs to queue, new queue length: {}", restored_entries.len(), batch_state_lock.batch_queue.len()); + + // Update user states for successfully restored proofs + self.update_user_states_for_restored_proofs(&restored_entries, &batch_state_lock).await; + } + + /// Updates user states for proofs that were successfully restored to the queue. + /// This essentially undoes the optimistic user state updates that were made during extraction. + async fn update_user_states_for_restored_proofs( + &self, + restored_entries: &[&BatchQueueEntry], + batch_state_lock: &tokio::sync::MutexGuard<'_, crate::types::batch_state::BatchState>, + ) { + use std::collections::HashMap; + + // Group restored entries by user address + let mut users_restored_proofs: HashMap> = HashMap::new(); + for entry in restored_entries { + users_restored_proofs.entry(entry.sender).or_default().push(entry); + } + + // Calculate new user states based on current queue (including restored proofs) + let new_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); + + // Update user states for each user who had proofs restored + for (user_addr, _user_entries) in users_restored_proofs { + if let Some(user_state_ref) = self.user_states.get(&user_addr) { + let mut user_state_guard = user_state_ref.lock().await; + + // Update based on current queue state (which includes restored proofs) + if let Some((proof_count, max_fee_limit, total_fees_in_queue)) = new_user_states.get(&user_addr) { + user_state_guard.proofs_in_batch = *proof_count; + user_state_guard.last_max_fee_limit = *max_fee_limit; + user_state_guard.total_fees_in_queue = *total_fees_in_queue; + + info!("Restored user state for {}: {} proofs, total fees: {}", + user_addr, proof_count, total_fees_in_queue); + } else { + // This shouldn't happen since we just added proofs for this user + warn!("User {} had proofs restored but not found in queue calculation", user_addr); + } + } + } + } + /// Takes the finalized batch as input and: /// builds the merkle tree /// posts verification data batch to s3 @@ -1489,16 +1590,15 @@ impl Batcher { async fn finalize_batch( &self, block_number: u64, - finalized_batch: Vec, + finalized_batch: &[BatchQueueEntry], gas_price: U256, ) -> Result<(), BatcherError> { // Acquire write lock to ensure exclusive access during batch creation (blocks all user processing) let _batch_processing_guard = self.batch_processing_lock.write().await; let nonced_batch_verifcation_data: Vec = finalized_batch - .clone() - .into_iter() - .map(|entry| entry.nonced_verification_data) + .iter() + .map(|entry| entry.nonced_verification_data.clone()) .collect(); let batch_verification_data: Vec = nonced_batch_verifcation_data @@ -1511,9 +1611,8 @@ impl Batcher { info!("Finalizing batch. Length: {}", finalized_batch.len()); let batch_data_comm: Vec = finalized_batch - .clone() - .into_iter() - .map(|entry| entry.verification_data_commitment) + .iter() + .map(|entry| entry.verification_data_commitment.clone()) .collect(); let batch_merkle_tree: MerkleTree = @@ -1664,14 +1763,19 @@ impl Batcher { { let batch_finalization_result = self - .finalize_batch(block_number, finalized_batch, modified_gas_price) + .finalize_batch(block_number, &finalized_batch, modified_gas_price) .await; // Resetting this here to avoid doing it on every return path of `finalize_batch` function let mut batch_posting = self.posting_batch.lock().await; *batch_posting = false; - batch_finalization_result?; + // If batch finalization failed, restore the proofs to the queue + if let Err(e) = batch_finalization_result { + error!("Batch finalization failed, restoring proofs to queue: {:?}", e); + self.restore_proofs_after_batch_failure(&finalized_batch).await; + return Err(e); + } } Ok(()) From 9493a0461a34dbf2cf607370d72a673fed9ec624 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Fri, 18 Jul 2025 17:57:28 -0300 Subject: [PATCH 14/48] Split locks --- contracts/lib/risc0-ethereum | 2 +- contracts/lib/sp1-contracts | 2 +- crates/CLAUDE.md | 123 ++++++++++++++++++++++++ crates/batcher/src/connection.rs | 2 +- crates/batcher/src/lib.rs | 67 +------------ crates/batcher/src/types/batch_queue.rs | 115 ++++++++++++---------- 6 files changed, 195 insertions(+), 116 deletions(-) create mode 100644 crates/CLAUDE.md diff --git a/contracts/lib/risc0-ethereum b/contracts/lib/risc0-ethereum index 382d76a804..728ec4b043 160000 --- a/contracts/lib/risc0-ethereum +++ b/contracts/lib/risc0-ethereum @@ -1 +1 @@ -Subproject commit 382d76a8040068243a5d13e3de50fcca4224b0d6 +Subproject commit 728ec4b0432ddb49d8416b7a3c3cfdfe052356f9 diff --git a/contracts/lib/sp1-contracts b/contracts/lib/sp1-contracts index 512b5e029a..26651fdb1c 160000 --- a/contracts/lib/sp1-contracts +++ b/contracts/lib/sp1-contracts @@ -1 +1 @@ -Subproject commit 512b5e029abc27f6e46a3c7eba220dac83ecc306 +Subproject commit 26651fdb1c1d6513443560d42888dedc3f68bde6 diff --git a/crates/CLAUDE.md b/crates/CLAUDE.md new file mode 100644 index 0000000000..f36eed2431 --- /dev/null +++ b/crates/CLAUDE.md @@ -0,0 +1,123 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Build Commands + +### Rust Workspace +- **Build all crates**: `cargo build` (from `/crates` directory) +- **Build specific crate**: `cargo build --manifest-path ./crates/[crate-name]/Cargo.toml` +- **Build with release optimization**: `cargo build --release` + +### Batcher +- **Build**: `cargo build --manifest-path ./crates/batcher/Cargo.toml --release` +- **Run**: `cargo run --manifest-path ./crates/batcher/Cargo.toml --release -- --config ./config-files/config-batcher.yaml --env-file ./crates/batcher/.env` +- **Start locally**: `make batcher_start_local` + +### CLI +- **Build**: `cd crates/cli && cargo build --release` +- **Install**: `cargo install --path crates/cli` +- **Install script**: `./crates/cli/install_aligned.sh` + +### SDK +- **Build**: `cargo build --manifest-path ./crates/sdk/Cargo.toml` +- **Test**: `cargo test --manifest-path ./crates/sdk/Cargo.toml` + +## Testing Commands + +### Rust Tests +- **Run all tests**: `cargo test` (from `/crates` directory) +- **Run specific crate tests**: `cargo test --manifest-path ./crates/[crate-name]/Cargo.toml` +- **Run with release mode**: `cargo test --release` + +### Go Tests +- **Run all Go tests**: `go test ./... -timeout 15m` +- **Run retry tests**: `cd core/ && go test -v -timeout 15m` + +### FFI Tests +- **SP1 Rust FFI**: `cd operator/sp1/lib && RUST_MIN_STACK=83886080 cargo test --release` +- **RISC Zero Rust FFI**: `cd operator/risc_zero/lib && cargo test --release` +- **Merkle Tree FFI**: `cd operator/merkle_tree/lib && RUST_MIN_STACK=83886080 cargo test --release` + +## Linting Commands + +### Solidity Contracts +- **Lint contracts**: `cd contracts && npm run lint:sol` + +### Rust (via Makefile targets) +- Check individual crate formatting: `cargo fmt --check --manifest-path ./crates/[crate-name]/Cargo.toml` +- Check individual crate linting: `cargo clippy --manifest-path ./crates/[crate-name]/Cargo.toml` + +## Common Development Commands + +### Dependencies +- **Install all dependencies**: `make deps` +- **Install Go dependencies**: `make go_deps` +- **Initialize submodules**: `make submodules` + +### Development Environment +- **Start Anvil**: `make anvil_start` +- **Start full local environment**: `make setup_local_aligned_all` +- **Build all FFIs**: `make build_all_ffi` + +### Proof Submission +- **Send SP1 proof**: `make batcher_send_sp1_task RPC_URL=http://localhost:8545 NETWORK=devnet` +- **Send RISC0 proof**: `make batcher_send_risc0_task RPC_URL=http://localhost:8545 NETWORK=devnet` +- **Send Gnark proofs**: `make batcher_send_gnark_plonk_bn254_task RPC_URL=http://localhost:8545 NETWORK=devnet` + +## Architecture Overview + +### Core Components + +**Aligned Layer** is a verification layer for zero-knowledge proofs built on EigenLayer. The system consists of several key components: + +1. **Batcher** (`crates/batcher/`): Aggregates multiple proofs into batches for efficient verification + - Listens for WebSocket connections from clients + - Collects verification data and batches them based on time/size thresholds + - Submits batches to the verification layer + +2. **SDK** (`crates/sdk/`): Provides client libraries for interacting with Aligned Layer + - **Verification Layer**: Core verification functionality + - **Aggregation Layer**: Handles proof aggregation modes + - **Communication**: Protocol implementations for client-server communication + - **Ethereum Integration**: Smart contract interfaces and utilities + +3. **CLI** (`crates/cli/`): Command-line interface for submitting proofs and interacting with the system + - Proof submission with various proving systems (SP1, RISC0, Gnark, Circom) + - Balance queries and verification status checks + - Batch verification data handling + +4. **Task Sender** (`crates/task-sender/`): Utility for load testing and automated proof submission + - Wallet generation and funding + - Infinite proof submission with configurable parameters + - Connection testing utilities + +### Supported Proving Systems + +The system supports multiple zero-knowledge proving systems: +- **SP1**: Succinct's zkVM proving system +- **RISC Zero**: General-purpose zkVM +- **Gnark**: Groth16 and PLONK protocols (BN254, BLS12-381) +- **Circom**: Circuit compiler with Groth16 backend + +### Key Architectural Patterns + +1. **Modular Design**: Each component (batcher, SDK, CLI) is a separate crate with clear boundaries +2. **Async/Await**: Heavy use of Tokio for asynchronous operations +3. **FFI Integration**: Foreign function interfaces for integrating with Go-based verifiers +4. **EigenLayer Integration**: Built as an AVS (Actively Validated Service) on EigenLayer +5. **Multi-Network Support**: Configurable for different networks (devnet, testnet, mainnet) + +### Development Workflow + +1. **Local Development**: Use `make anvil_start` to start local blockchain +2. **Component Testing**: Each crate can be built and tested independently +3. **Integration Testing**: Full system testing using Docker compose or Makefile targets +4. **Proof Generation**: Scripts in `scripts/test_files/` for generating test proofs + +### Configuration Management + +- **YAML Configuration**: Primary configuration files in `config-files/` +- **Environment Variables**: `.env` files for sensitive configuration +- **Network-Specific Config**: Separate configurations for different networks +- **Makefile Parameters**: Extensive use of Make variables for configuration \ No newline at end of file diff --git a/crates/batcher/src/connection.rs b/crates/batcher/src/connection.rs index 946922db30..b32e511c52 100644 --- a/crates/batcher/src/connection.rs +++ b/crates/batcher/src/connection.rs @@ -18,7 +18,7 @@ use tokio_tungstenite::{ pub(crate) type WsMessageSink = Arc, Message>>>; pub(crate) async fn send_batch_inclusion_data_responses( - finalized_batch: Vec, + finalized_batch: &[BatchQueueEntry], batch_merkle_tree: &MerkleTree, ) -> Result<(), BatcherError> { // Finalized_batch is ordered as the PriorityQueue, ordered by: ascending max_fee && if max_fee is equal, by descending nonce. diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 45e24f920b..7548522565 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -100,7 +100,7 @@ pub struct Batcher { /// The general business rule is: /// - User processing can be done in parallel unless a batch creation is happening /// - Batch creation needs to be able to change all the states, so all processing - /// needs to be stopped, and all user_states locks need to be taken + /// needs to be stopped, and all user_states locks need to be taken batch_state: Mutex, user_states: DashMap>>, /// When posting a task, this is taken as a write to stop new threads to update @@ -1317,68 +1317,9 @@ impl Batcher { finalized_batch.len() ); - // PHASE 1.5: Update user states immediately after batch extraction to make the operation atomic - // We assume the batch posting will be successful, so we update user states now - if let Err(e) = self.update_user_states_after_batch_extraction(&batch_state_lock).await { - error!("Failed to update user states after batch extraction: {:?}", e); - // We could potentially put the batch back in the queue here if needed - *batch_posting = false; - return None; - } - Some(finalized_batch) } - /// Updates user states after successful batch submission. - /// This function should be called only AFTER the submission was confirmed onchain. - /// Note: Proofs were already removed from the queue during the extraction phase. - async fn update_user_states_after_batch_submission(&self) -> Result<(), BatcherError> { - info!("Updating user states after batch submission..."); - let batch_state_lock = self.batch_state.lock().await; - - // Calculate the new user_states based on the current queue (proofs already removed) - let new_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); - - let user_addresses: Vec
= - self.user_states.iter().map(|entry| *entry.key()).collect(); - let default_value = (0, U256::MAX, U256::zero()); - for addr in user_addresses.iter() { - let (proof_count, max_fee_limit, total_fees_in_queue) = - new_user_states.get(addr).unwrap_or(&default_value); - - // FIXME: The case where a the update functions return `None` can only happen when the user was not found - // in the `user_states` map should not really happen here, but doing this check so that we don't unwrap. - // Once https://github.com/yetanotherco/aligned_layer/issues/1046 is done we could return a more - // informative error. - - // Now we update the user states related to the batch (proof count in batch and min fee in batch) - { - let user_state = self.user_states.get(addr); - match user_state { - Some(user_state) => { - let mut user_state_guard = user_state.lock().await; - user_state_guard.proofs_in_batch = *proof_count; - user_state_guard.last_max_fee_limit = *max_fee_limit; - user_state_guard.total_fees_in_queue = *total_fees_in_queue; - } - None => { - return Err(BatcherError::QueueRemoveError( - "Could not update user state".into(), - )); - } - } - } - } - - // Update metrics - let queue_len = batch_state_lock.batch_queue.len(); - let queue_size_bytes = calculate_batch_size(&batch_state_lock.batch_queue)?; - - self.metrics - .update_queue_metrics(queue_len as i64, queue_size_bytes as i64); - - Ok(()) - } /// Updates user states immediately after batch extraction to make the operation atomic. /// This function should be called right after extracting proofs from the queue. @@ -1484,7 +1425,7 @@ impl Batcher { /// NOTE: Nonce ordering is preserved by the priority queue's eviction order: /// - Lower fees get evicted first /// - For same fees, higher nonces get evicted first - /// This ensures we never have nonce N+1 without nonce N in the queue. + /// This ensures we never have nonce N+1 without nonce N in the queue. async fn restore_proofs_after_batch_failure(&self, failed_batch: &[BatchQueueEntry]) { info!("Restoring {} proofs to queue after batch failure", failed_batch.len()); @@ -1651,7 +1592,7 @@ impl Batcher { &batch_bytes, &batch_merkle_tree.root, leaves, - &finalized_batch, + finalized_batch, gas_price, ) .await @@ -1686,7 +1627,7 @@ impl Batcher { // User states were also already updated atomically during extraction // Clean up user states for users who had proofs in this batch but now have no proofs left - self.cleanup_user_states_after_successful_submission(&finalized_batch); + self.cleanup_user_states_after_successful_submission(finalized_batch); connection::send_batch_inclusion_data_responses(finalized_batch, &batch_merkle_tree).await } diff --git a/crates/batcher/src/types/batch_queue.rs b/crates/batcher/src/types/batch_queue.rs index 74fea2615c..27aef35371 100644 --- a/crates/batcher/src/types/batch_queue.rs +++ b/crates/batcher/src/types/batch_queue.rs @@ -146,62 +146,71 @@ pub(crate) fn calculate_batch_size(batch_queue: &BatchQueue) -> Result Result, BatcherError> { - let mut finalized_batch = batch_queue; - let mut batch_size = calculate_batch_size(&finalized_batch)?; - - while let Some((entry, _)) = finalized_batch.peek() { - let batch_len = finalized_batch.len(); - let fee_per_proof = calculate_fee_per_proof(batch_len, gas_price, constant_gas_cost); + let mut batch_size = calculate_batch_size(batch_queue)?; + let mut rejected_entries = Vec::new(); + + // Remove entries that won't pay enough (same logic as try_build_batch) + loop { + let should_remove = if let Some((entry, _)) = batch_queue.peek() { + let batch_len = batch_queue.len(); + let fee_per_proof = calculate_fee_per_proof(batch_len, gas_price, constant_gas_cost); + + // if batch is not acceptable: + batch_size > max_batch_byte_size + || fee_per_proof > entry.nonced_verification_data.max_fee + || batch_len > max_batch_proof_qty + } else { + false + }; - // if batch is not acceptable: - if batch_size > max_batch_byte_size - || fee_per_proof > entry.nonced_verification_data.max_fee - || batch_len > max_batch_proof_qty - { - // Update the state for the next iteration: - // * Subtract this entry size to the size of the batch size. - // * Push the current entry to the resulting batch queue. + if should_remove { + // Remove this entry (it won't pay enough) and save it + let (rejected_entry, rejected_priority) = batch_queue.pop().unwrap(); - // It is safe to call `.unwrap()` here since any serialization error should have been caught - // when calculating the total size of the batch with the `calculate_batch_size` function + // Update batch size let verification_data_size = - cbor_serialize(&entry.nonced_verification_data.verification_data) + cbor_serialize(&rejected_entry.nonced_verification_data.verification_data) .unwrap() .len(); batch_size -= verification_data_size; - finalized_batch.pop(); - - continue; + rejected_entries.push((rejected_entry, rejected_priority)); + } else { + // At this point, we found a viable batch - break + break; } - - // At this point, we break since we found a batch that can be submitted - break; } - // If `finalized_batch` is empty, this means that all the batch queue was traversed and we didn't find - // any user willing to pay fot the fee per proof. - if finalized_batch.is_empty() { + // Check if we have a viable batch + if batch_queue.is_empty() { + // No viable batch found - put back the rejected entries + for (entry, priority) in rejected_entries { + batch_queue.push(entry, priority); + } return Err(BatcherError::BatchCostTooHigh); } - Ok(finalized_batch.clone().into_sorted_vec()) + // Extract the batch entries (the ones that will pay enough) + let batch_entries = batch_queue.clone().into_sorted_vec(); + batch_queue.clear(); + + // Put back the rejected entries (they stay in the queue for later) + for (entry, priority) in rejected_entries { + batch_queue.push(entry, priority); + } + + Ok(batch_entries) } fn calculate_fee_per_proof(batch_len: usize, gas_price: U256, constant_gas_cost: u128) -> U256 { @@ -311,8 +320,9 @@ mod test { batch_queue.push(entry_3, batch_priority_3); let gas_price = U256::from(1); - let finalized_batch = try_build_batch( - batch_queue.clone(), + let mut test_queue = batch_queue.clone(); + let finalized_batch = extract_batch_directly( + &mut test_queue, gas_price, 5000000, 50, @@ -423,8 +433,9 @@ mod test { batch_queue.push(entry_3, batch_priority_3); let gas_price = U256::from(1); - let finalized_batch = try_build_batch( - batch_queue.clone(), + let mut test_queue = batch_queue.clone(); + let finalized_batch = extract_batch_directly( + &mut test_queue, gas_price, 5000000, 50, @@ -533,8 +544,9 @@ mod test { batch_queue.push(entry_3.clone(), batch_priority_3.clone()); let gas_price = U256::from(1); - let finalized_batch = try_build_batch( - batch_queue.clone(), + let mut test_queue = batch_queue.clone(); + let finalized_batch = extract_batch_directly( + &mut test_queue, gas_price, 5000000, 2, @@ -643,8 +655,9 @@ mod test { batch_queue.push(entry_3, batch_priority_3); let gas_price = U256::from(1); - let finalized_batch = try_build_batch( - batch_queue.clone(), + let mut test_queue = batch_queue.clone(); + let finalized_batch = extract_batch_directly( + &mut test_queue, gas_price, 5000000, 50, @@ -759,8 +772,9 @@ mod test { batch_queue.push(entry_3, batch_priority_3); let gas_price = U256::from(1); - let finalized_batch = try_build_batch( - batch_queue.clone(), + let mut test_queue = batch_queue.clone(); + let finalized_batch = extract_batch_directly( + &mut test_queue, gas_price, 5000000, 50, @@ -875,8 +889,9 @@ mod test { // The max batch len is 2, so the algorithm should stop at the second entry. let max_batch_proof_qty = 2; - let finalized_batch = try_build_batch( - batch_queue.clone(), + let mut test_queue = batch_queue.clone(); + let finalized_batch = extract_batch_directly( + &mut test_queue, gas_price, 5000000, max_batch_proof_qty, From 859590afa753a1787963fd70615412a2bb7304c0 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 21 Jul 2025 17:23:52 -0300 Subject: [PATCH 15/48] Add replacement and parallel eviction logic --- crates/batcher/src/lib.rs | 359 +++++++++++++++++++------------------- 1 file changed, 175 insertions(+), 184 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 7548522565..b21bf292bd 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -301,7 +301,34 @@ impl Batcher { } } - async fn update_evicted_user_state( + fn update_evicted_user_state_with_lock( + &self, + removed_entry: &types::batch_queue::BatchQueueEntry, + batch_queue: &types::batch_queue::BatchQueue, + user_state_guard: &mut tokio::sync::MutexGuard<'_, crate::types::user_state::UserState>, + ) { + let addr = removed_entry.sender; + + let new_last_max_fee_limit = match batch_queue + .iter() + .filter(|(e, _)| e.sender == addr) + .next_back() + { + Some((last_entry, _)) => last_entry.nonced_verification_data.max_fee, + None => { + self.user_states.remove(&addr); + return; + } + }; + + user_state_guard.proofs_in_batch -= 1; + user_state_guard.nonce -= U256::one(); + user_state_guard.total_fees_in_queue -= removed_entry.nonced_verification_data.max_fee; + user_state_guard.last_max_fee_limit = new_last_max_fee_limit; + } + + // Fallback async version for restoration path where we don't have pre-held locks + async fn update_evicted_user_state_async( &self, removed_entry: &types::batch_queue::BatchQueueEntry, batch_queue: &types::batch_queue::BatchQueue, @@ -912,55 +939,80 @@ impl Batcher { // * ---------------------------------------------------------------------* // * Perform validation over batcher queue * // * ---------------------------------------------------------------------* + let mut batch_state_lock = self.batch_state.lock().await; if batch_state_lock.is_queue_full() { debug!("Batch queue is full. Evaluating if the incoming proof can replace a lower-priority entry."); - // This cannot panic, if the batch queue is full it has at least one item - let (lowest_priority_entry, _) = batch_state_lock - .batch_queue - .peek() - .expect("Batch queue was expected to be full, but somehow no item was inside"); - - let lowest_fee_in_queue = lowest_priority_entry.nonced_verification_data.max_fee; - let new_proof_fee = nonced_verification_data.max_fee; + let mut evicted_entry = None; - // We will keep the proof with the highest fee - // Note: we previously checked that if it's a new proof from the same user the fee is the same or lower - // So this will never eject a proof of the same user with a lower nonce - // which is the expected behaviour - if new_proof_fee > lowest_fee_in_queue { - // This cannot panic, if the batch queue is full it has at least one item - let (removed_entry, _) = batch_state_lock - .batch_queue - .pop() - .expect("Batch queue was expected to be full, but somehow no item was inside"); - - info!( - "Incoming proof (nonce: {}, fee: {}) has higher fee. Replacing lowest fee proof from sender {} with nonce {}.", - nonced_verification_data.nonce, - nonced_verification_data.max_fee, - removed_entry.sender, - removed_entry.nonced_verification_data.nonce - ); - - self.update_evicted_user_state(&removed_entry, &batch_state_lock.batch_queue) - .await; + // Collect addresses of potential candidates (lightweight) + let eviction_candidates: Vec
= batch_state_lock + .batch_queue + .iter() + .filter_map(|(entry, _)| { + if new_proof_fee > entry.nonced_verification_data.max_fee { + Some(entry.sender) + } else { + None + } + }) + .collect(); + + // Try to find any candidate whose lock we can acquire and immediately process them + for candidate_addr in eviction_candidates { + if let Some(user_state_arc) = self.user_states.get(&candidate_addr) { + if let Ok(mut user_guard) = user_state_arc.try_lock() { + // Found someone whose lock we can get - now find and remove their entry + let entries_to_check: Vec<_> = batch_state_lock + .batch_queue + .iter() + .filter(|(entry, _)| entry.sender == candidate_addr && new_proof_fee > entry.nonced_verification_data.max_fee) + .map(|(entry, _)| entry.clone()) + .collect(); + + if let Some(target_entry) = entries_to_check.into_iter().next() { + let removed_entry = batch_state_lock.batch_queue.remove(&target_entry).map(|(e, _)| e); + + if let Some(removed) = removed_entry { + info!( + "Incoming proof (nonce: {}, fee: {}) replacing proof from sender {} with nonce {} (fee: {})", + nonced_verification_data.nonce, + new_proof_fee, + removed.sender, + removed.nonced_verification_data.nonce, + removed.nonced_verification_data.max_fee + ); + + // Update the evicted user's state immediately + self.update_evicted_user_state_with_lock(&removed, &batch_state_lock.batch_queue, &mut user_guard); + + // Notify the evicted user + if let Some(ref removed_entry_ws) = removed.messaging_sink { + send_message( + removed_entry_ws.clone(), + SubmitProofResponseMessage::UnderpricedProof, + ) + .await; + } + + evicted_entry = Some(removed); + break; + } + } + } + } + } - if let Some(removed_entry_ws) = removed_entry.messaging_sink { - send_message( - removed_entry_ws, - SubmitProofResponseMessage::UnderpricedProof, - ) - .await; - }; - } else { + // Check if we successfully evicted someone + if evicted_entry.is_none() { + // No lock could be acquired or no evictable entry found - reject this proof info!( - "Incoming proof (nonce: {}, fee: {}) has lower priority than all entries in the full queue. Rejecting submission.", + "Incoming proof (nonce: {}, fee: {}) rejected - queue is full and no evictable entries found.", nonced_verification_data.nonce, - nonced_verification_data.max_fee + new_proof_fee ); std::mem::drop(batch_state_lock); send_message( @@ -1036,9 +1088,26 @@ impl Batcher { ) { let replacement_max_fee = nonced_verification_data.max_fee; let nonce = nonced_verification_data.nonce; - let mut batch_state_lock = self.batch_state.lock().await; - let Some(entry) = batch_state_lock.get_entry(addr, nonce) else { - std::mem::drop(batch_state_lock); + + // Take user state lock first to maintain proper lock ordering + let user_state = match self.user_states.get(&addr) { + Some(user_state) => user_state, + None => { + warn!("User state not found for address {addr} during replacement message"); + send_message( + ws_conn_sink.clone(), + SubmitProofResponseMessage::InvalidNonce, + ) + .await; + self.metrics.user_error(&["invalid_nonce", ""]); + return; + } + }; + let mut user_state_guard = user_state.lock().await; // First: user lock + let mut batch_state_guard = self.batch_state.lock().await; // Second: batch lock + let Some(entry) = batch_state_guard.get_entry(addr, nonce) else { + drop(batch_state_guard); + drop(user_state_guard); warn!("Invalid nonce for address {addr}. Queue entry with nonce {nonce} not found"); send_message( ws_conn_sink.clone(), @@ -1051,7 +1120,8 @@ impl Batcher { let original_max_fee = entry.nonced_verification_data.max_fee; if original_max_fee > replacement_max_fee { - std::mem::drop(batch_state_lock); + drop(batch_state_guard); + drop(user_state_guard); warn!("Invalid replacement message for address {addr}, had max fee: {original_max_fee:?}, received fee: {replacement_max_fee:?}"); send_message( ws_conn_sink.clone(), @@ -1090,8 +1160,8 @@ impl Batcher { } replacement_entry.messaging_sink = Some(ws_conn_sink.clone()); - if !batch_state_lock.replacement_entry_is_valid(&replacement_entry) { - std::mem::drop(batch_state_lock); + if !batch_state_guard.replacement_entry_is_valid(&replacement_entry) { + std::mem::drop(batch_state_guard); warn!("Invalid replacement message"); send_message( ws_conn_sink.clone(), @@ -1112,54 +1182,18 @@ impl Batcher { // note that the entries are considered equal for the priority queue // if they have the same nonce and sender, so we can remove the old entry // by calling remove with the new entry - batch_state_lock.batch_queue.remove(&replacement_entry); - batch_state_lock.batch_queue.push( + batch_state_guard.batch_queue.remove(&replacement_entry); + batch_state_guard.batch_queue.push( replacement_entry.clone(), BatchQueueEntryPriority::new(replacement_max_fee, nonce), ); - // update max_fee_limit - let updated_max_fee_limit_in_batch = batch_state_lock.get_user_min_fee_in_batch(&addr); - { - let user_state = self.user_states.get(&addr); - match user_state { - Some(user_state) => { - let mut user_state_guard = user_state.lock().await; - user_state_guard.last_max_fee_limit = updated_max_fee_limit_in_batch; - } - None => { - std::mem::drop(batch_state_lock); - warn!("User state for address {addr:?} was not present in batcher user states, but it should be"); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::AddToBatchError, - ) - .await; - return; - } - } - } - - // update total_fees_in_queue - { - let user_state = self.user_states.get(&addr); - match user_state { - Some(user_state) => { - let mut user_state_guard = user_state.lock().await; - let fee_difference = replacement_max_fee - original_max_fee; - user_state_guard.total_fees_in_queue += fee_difference; - } - None => { - std::mem::drop(batch_state_lock); - warn!("User state for address {addr:?} was not present in batcher user states, but it should be"); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::AddToBatchError, - ) - .await; - } - } - } + // update max_fee_limit and total_fees_in_queue using already held user_state_guard + let updated_max_fee_limit_in_batch = batch_state_guard.get_user_min_fee_in_batch(&addr); + user_state_guard.last_max_fee_limit = updated_max_fee_limit_in_batch; + + let fee_difference = replacement_max_fee - original_max_fee; + user_state_guard.total_fees_in_queue += fee_difference; } async fn disabled_verifiers(&self) -> Result> { @@ -1321,65 +1355,42 @@ impl Batcher { } - /// Updates user states immediately after batch extraction to make the operation atomic. - /// This function should be called right after extracting proofs from the queue. - /// We assume the batch posting will be successful and update user states optimistically. - /// IMPORTANT: Preserves last_max_fee_limit when users have no proofs left in queue. - async fn update_user_states_after_batch_extraction( + /// Updates user states based on current queue state after batch operations. + /// Used for both successful batch confirmation and failed batch restoration. + /// Updates proofs_in_batch, total_fees_in_queue, and last_max_fee_limit based on current queue state. + /// Uses proper lock ordering: user_state -> batch_state to avoid deadlocks. + async fn update_user_states_from_queue_state( &self, - batch_state_lock: &tokio::sync::MutexGuard<'_, crate::types::batch_state::BatchState>, + affected_users: std::collections::HashSet
, ) -> Result<(), BatcherError> { - info!("Updating user states after batch extraction..."); - - // Calculate the new user_states based on the current queue (proofs already removed) - let new_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); - - let user_addresses: Vec
= - self.user_states.iter().map(|entry| *entry.key()).collect(); - - for addr in user_addresses.iter() { - // FIXME: The case where a the update functions return `None` can only happen when the user was not found - // in the `user_states` map should not really happen here, but doing this check so that we don't unwrap. - // Once https://github.com/yetanotherco/aligned_layer/issues/1046 is done we could return a more - // informative error. - - // Now we update the user states related to the batch (proof count in batch and min fee in batch) - { - let user_state = self.user_states.get(addr); - match user_state { - Some(user_state) => { - let mut user_state_guard = user_state.lock().await; - - if let Some((proof_count, max_fee_limit, total_fees_in_queue)) = new_user_states.get(addr) { - // User still has proofs in the queue - user_state_guard.proofs_in_batch = *proof_count; - user_state_guard.last_max_fee_limit = *max_fee_limit; - user_state_guard.total_fees_in_queue = *total_fees_in_queue; - } else { - // User has no more proofs in the queue - only update count and total fees - // but preserve the last_max_fee_limit to avoid setting it to U256::MAX - // This is important for rollback scenarios where we need to restore proofs - user_state_guard.proofs_in_batch = 0; - user_state_guard.total_fees_in_queue = U256::zero(); - // Keep user_state_guard.last_max_fee_limit unchanged - } - } - None => { - return Err(BatcherError::QueueRemoveError( - "Could not update user state".into(), - )); - } + // Update each user's state with proper lock ordering + for addr in affected_users { + if let Some(user_state) = self.user_states.get(&addr) { + let mut user_state_guard = user_state.lock().await; // First: user lock + let batch_state_lock = self.batch_state.lock().await; // Second: batch lock + + // Calculate what each user's state should be based on current queue contents + let current_queue_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); + + if let Some((proof_count, min_max_fee_in_queue, total_fees_in_queue)) = current_queue_user_states.get(&addr) { + // User has proofs in queue - use calculated values + user_state_guard.proofs_in_batch = *proof_count; + user_state_guard.total_fees_in_queue = *total_fees_in_queue; + user_state_guard.last_max_fee_limit = *min_max_fee_in_queue; + } else { + // User not found in queue - reset to defaults + user_state_guard.proofs_in_batch = 0; + user_state_guard.total_fees_in_queue = U256::zero(); + user_state_guard.last_max_fee_limit = U256::MAX; } + + drop(batch_state_lock); // Release batch lock + drop(user_state_guard); // Release user lock + } else { + warn!("User state not found for address {}", addr); } } - // Update metrics - let queue_len = batch_state_lock.batch_queue.len(); - let queue_size_bytes = calculate_batch_size(&batch_state_lock.batch_queue)?; - - self.metrics - .update_queue_metrics(queue_len as i64, queue_size_bytes as i64); - Ok(()) } @@ -1452,7 +1463,7 @@ impl Batcher { evicted_entry.sender, evicted_entry.nonced_verification_data.nonce, evicted_entry.nonced_verification_data.max_fee); // Update user state for evicted entry - self.update_evicted_user_state(&evicted_entry, &batch_state_lock.batch_queue).await; + self.update_evicted_user_state_async(&evicted_entry, &batch_state_lock.batch_queue).await; // Notify the evicted user via websocket if let Some(evicted_ws_sink) = evicted_entry.messaging_sink { @@ -1478,49 +1489,21 @@ impl Batcher { info!("Restored {} proofs to queue, new queue length: {}", restored_entries.len(), batch_state_lock.batch_queue.len()); + // Get unique users from restored entries + let users_with_restored_proofs: std::collections::HashSet
= restored_entries.iter() + .map(|entry| entry.sender) + .collect(); + + drop(batch_state_lock); // Release batch lock before user state updates + // Update user states for successfully restored proofs - self.update_user_states_for_restored_proofs(&restored_entries, &batch_state_lock).await; - } - - /// Updates user states for proofs that were successfully restored to the queue. - /// This essentially undoes the optimistic user state updates that were made during extraction. - async fn update_user_states_for_restored_proofs( - &self, - restored_entries: &[&BatchQueueEntry], - batch_state_lock: &tokio::sync::MutexGuard<'_, crate::types::batch_state::BatchState>, - ) { - use std::collections::HashMap; - - // Group restored entries by user address - let mut users_restored_proofs: HashMap> = HashMap::new(); - for entry in restored_entries { - users_restored_proofs.entry(entry.sender).or_default().push(entry); - } - - // Calculate new user states based on current queue (including restored proofs) - let new_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); - - // Update user states for each user who had proofs restored - for (user_addr, _user_entries) in users_restored_proofs { - if let Some(user_state_ref) = self.user_states.get(&user_addr) { - let mut user_state_guard = user_state_ref.lock().await; - - // Update based on current queue state (which includes restored proofs) - if let Some((proof_count, max_fee_limit, total_fees_in_queue)) = new_user_states.get(&user_addr) { - user_state_guard.proofs_in_batch = *proof_count; - user_state_guard.last_max_fee_limit = *max_fee_limit; - user_state_guard.total_fees_in_queue = *total_fees_in_queue; - - info!("Restored user state for {}: {} proofs, total fees: {}", - user_addr, proof_count, total_fees_in_queue); - } else { - // This shouldn't happen since we just added proofs for this user - warn!("User {} had proofs restored but not found in queue calculation", user_addr); - } - } + info!("Updating user states after proof restoration..."); + if let Err(e) = self.update_user_states_from_queue_state(users_with_restored_proofs).await { + error!("Failed to update user states after proof restoration: {:?}", e); } } + /// Takes the finalized batch as input and: /// builds the merkle tree /// posts verification data batch to s3 @@ -1624,7 +1607,15 @@ impl Batcher { }; // Note: Proofs were already removed from the queue during extraction phase - // User states were also already updated atomically during extraction + // Now update user states based on current queue state after successful submission + info!("Updating user states after batch confirmation..."); + let users_in_batch: std::collections::HashSet
= finalized_batch.iter() + .map(|entry| entry.sender) + .collect(); + if let Err(e) = self.update_user_states_from_queue_state(users_in_batch).await { + error!("Failed to update user states after batch confirmation: {:?}", e); + // Continue with the rest of the process since batch was already submitted successfully + } // Clean up user states for users who had proofs in this batch but now have no proofs left self.cleanup_user_states_after_successful_submission(finalized_batch); From fcac3867ddc78135fc39892c1b4edfff21eeb47d Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 21 Jul 2025 17:31:36 -0300 Subject: [PATCH 16/48] Move proof verification before batch verifications --- crates/batcher/src/lib.rs | 81 ++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index b21bf292bd..4a99376c08 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -768,46 +768,6 @@ impl Batcher { nonced_verification_data = aux_verification_data } - // When pre-verification is enabled, batcher will verify proofs for faster feedback with clients - if self.pre_verification_is_enabled { - let verification_data = &nonced_verification_data.verification_data; - if self - .is_verifier_disabled(verification_data.proving_system) - .await - { - warn!( - "Verifier for proving system {} is disabled, skipping verification", - verification_data.proving_system - ); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::DisabledVerifier( - verification_data.proving_system, - )), - ) - .await; - self.metrics.user_error(&[ - "disabled_verifier", - &format!("{}", verification_data.proving_system), - ]); - return Ok(()); - } - - if !zk_utils::verify(verification_data).await { - error!("Invalid proof detected. Verification failed"); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::RejectedProof), - ) - .await; - self.metrics.user_error(&[ - "rejected_proof", - &format!("{}", verification_data.proving_system), - ]); - return Ok(()); - } - } - // We don't need a batch state lock here, since if the user locks its funds // after the check, some blocks should pass until he can withdraw. // It is safe to do just do this here. @@ -936,6 +896,47 @@ impl Batcher { return Ok(()); } + + // When pre-verification is enabled, batcher will verify proofs for faster feedback with clients + if self.pre_verification_is_enabled { + let verification_data = &nonced_verification_data.verification_data; + if self + .is_verifier_disabled(verification_data.proving_system) + .await + { + warn!( + "Verifier for proving system {} is disabled, skipping verification", + verification_data.proving_system + ); + send_message( + ws_conn_sink.clone(), + SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::DisabledVerifier( + verification_data.proving_system, + )), + ) + .await; + self.metrics.user_error(&[ + "disabled_verifier", + &format!("{}", verification_data.proving_system), + ]); + return Ok(()); + } + + if !zk_utils::verify(verification_data).await { + error!("Invalid proof detected. Verification failed"); + send_message( + ws_conn_sink.clone(), + SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::RejectedProof), + ) + .await; + self.metrics.user_error(&[ + "rejected_proof", + &format!("{}", verification_data.proving_system), + ]); + return Ok(()); + } + } + // * ---------------------------------------------------------------------* // * Perform validation over batcher queue * // * ---------------------------------------------------------------------* From 9dfd4a73dc6bde9699352a442d032c969f5cc755 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 22 Jul 2025 13:48:49 -0300 Subject: [PATCH 17/48] Add claude code to gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 2fc6232a04..5c01213b9c 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,6 @@ witness.wtns *.zkey circuit_cpp/ circuit_js + +# Claude Code files +CLAUDE.md From f90c394c0cda165b473ab964ec8d9d8be5113663 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 22 Jul 2025 13:50:55 -0300 Subject: [PATCH 18/48] Remove claude code files --- crates/CLAUDE.md | 123 ----------------------------------------------- 1 file changed, 123 deletions(-) delete mode 100644 crates/CLAUDE.md diff --git a/crates/CLAUDE.md b/crates/CLAUDE.md deleted file mode 100644 index f36eed2431..0000000000 --- a/crates/CLAUDE.md +++ /dev/null @@ -1,123 +0,0 @@ -# CLAUDE.md - -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. - -## Build Commands - -### Rust Workspace -- **Build all crates**: `cargo build` (from `/crates` directory) -- **Build specific crate**: `cargo build --manifest-path ./crates/[crate-name]/Cargo.toml` -- **Build with release optimization**: `cargo build --release` - -### Batcher -- **Build**: `cargo build --manifest-path ./crates/batcher/Cargo.toml --release` -- **Run**: `cargo run --manifest-path ./crates/batcher/Cargo.toml --release -- --config ./config-files/config-batcher.yaml --env-file ./crates/batcher/.env` -- **Start locally**: `make batcher_start_local` - -### CLI -- **Build**: `cd crates/cli && cargo build --release` -- **Install**: `cargo install --path crates/cli` -- **Install script**: `./crates/cli/install_aligned.sh` - -### SDK -- **Build**: `cargo build --manifest-path ./crates/sdk/Cargo.toml` -- **Test**: `cargo test --manifest-path ./crates/sdk/Cargo.toml` - -## Testing Commands - -### Rust Tests -- **Run all tests**: `cargo test` (from `/crates` directory) -- **Run specific crate tests**: `cargo test --manifest-path ./crates/[crate-name]/Cargo.toml` -- **Run with release mode**: `cargo test --release` - -### Go Tests -- **Run all Go tests**: `go test ./... -timeout 15m` -- **Run retry tests**: `cd core/ && go test -v -timeout 15m` - -### FFI Tests -- **SP1 Rust FFI**: `cd operator/sp1/lib && RUST_MIN_STACK=83886080 cargo test --release` -- **RISC Zero Rust FFI**: `cd operator/risc_zero/lib && cargo test --release` -- **Merkle Tree FFI**: `cd operator/merkle_tree/lib && RUST_MIN_STACK=83886080 cargo test --release` - -## Linting Commands - -### Solidity Contracts -- **Lint contracts**: `cd contracts && npm run lint:sol` - -### Rust (via Makefile targets) -- Check individual crate formatting: `cargo fmt --check --manifest-path ./crates/[crate-name]/Cargo.toml` -- Check individual crate linting: `cargo clippy --manifest-path ./crates/[crate-name]/Cargo.toml` - -## Common Development Commands - -### Dependencies -- **Install all dependencies**: `make deps` -- **Install Go dependencies**: `make go_deps` -- **Initialize submodules**: `make submodules` - -### Development Environment -- **Start Anvil**: `make anvil_start` -- **Start full local environment**: `make setup_local_aligned_all` -- **Build all FFIs**: `make build_all_ffi` - -### Proof Submission -- **Send SP1 proof**: `make batcher_send_sp1_task RPC_URL=http://localhost:8545 NETWORK=devnet` -- **Send RISC0 proof**: `make batcher_send_risc0_task RPC_URL=http://localhost:8545 NETWORK=devnet` -- **Send Gnark proofs**: `make batcher_send_gnark_plonk_bn254_task RPC_URL=http://localhost:8545 NETWORK=devnet` - -## Architecture Overview - -### Core Components - -**Aligned Layer** is a verification layer for zero-knowledge proofs built on EigenLayer. The system consists of several key components: - -1. **Batcher** (`crates/batcher/`): Aggregates multiple proofs into batches for efficient verification - - Listens for WebSocket connections from clients - - Collects verification data and batches them based on time/size thresholds - - Submits batches to the verification layer - -2. **SDK** (`crates/sdk/`): Provides client libraries for interacting with Aligned Layer - - **Verification Layer**: Core verification functionality - - **Aggregation Layer**: Handles proof aggregation modes - - **Communication**: Protocol implementations for client-server communication - - **Ethereum Integration**: Smart contract interfaces and utilities - -3. **CLI** (`crates/cli/`): Command-line interface for submitting proofs and interacting with the system - - Proof submission with various proving systems (SP1, RISC0, Gnark, Circom) - - Balance queries and verification status checks - - Batch verification data handling - -4. **Task Sender** (`crates/task-sender/`): Utility for load testing and automated proof submission - - Wallet generation and funding - - Infinite proof submission with configurable parameters - - Connection testing utilities - -### Supported Proving Systems - -The system supports multiple zero-knowledge proving systems: -- **SP1**: Succinct's zkVM proving system -- **RISC Zero**: General-purpose zkVM -- **Gnark**: Groth16 and PLONK protocols (BN254, BLS12-381) -- **Circom**: Circuit compiler with Groth16 backend - -### Key Architectural Patterns - -1. **Modular Design**: Each component (batcher, SDK, CLI) is a separate crate with clear boundaries -2. **Async/Await**: Heavy use of Tokio for asynchronous operations -3. **FFI Integration**: Foreign function interfaces for integrating with Go-based verifiers -4. **EigenLayer Integration**: Built as an AVS (Actively Validated Service) on EigenLayer -5. **Multi-Network Support**: Configurable for different networks (devnet, testnet, mainnet) - -### Development Workflow - -1. **Local Development**: Use `make anvil_start` to start local blockchain -2. **Component Testing**: Each crate can be built and tested independently -3. **Integration Testing**: Full system testing using Docker compose or Makefile targets -4. **Proof Generation**: Scripts in `scripts/test_files/` for generating test proofs - -### Configuration Management - -- **YAML Configuration**: Primary configuration files in `config-files/` -- **Environment Variables**: `.env` files for sensitive configuration -- **Network-Specific Config**: Separate configurations for different networks -- **Makefile Parameters**: Extensive use of Make variables for configuration \ No newline at end of file From 28b95d33229c71609fb0f08b34f257a30b92c8fb Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 22 Jul 2025 14:20:14 -0300 Subject: [PATCH 19/48] Reset submodules to match staging --- contracts/lib/risc0-ethereum | 2 +- contracts/lib/sp1-contracts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contracts/lib/risc0-ethereum b/contracts/lib/risc0-ethereum index 728ec4b043..15db3fe560 160000 --- a/contracts/lib/risc0-ethereum +++ b/contracts/lib/risc0-ethereum @@ -1 +1 @@ -Subproject commit 728ec4b0432ddb49d8416b7a3c3cfdfe052356f9 +Subproject commit 15db3fe560f6174839d676a9b5e732a81131347c diff --git a/contracts/lib/sp1-contracts b/contracts/lib/sp1-contracts index 26651fdb1c..512b5e029a 160000 --- a/contracts/lib/sp1-contracts +++ b/contracts/lib/sp1-contracts @@ -1 +1 @@ -Subproject commit 26651fdb1c1d6513443560d42888dedc3f68bde6 +Subproject commit 512b5e029abc27f6e46a3c7eba220dac83ecc306 From 1545dcb330a8cb268b77b285b3382943af981900 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 22 Jul 2025 14:44:35 -0300 Subject: [PATCH 20/48] Simplify extract_batch --- crates/batcher/src/types/batch_queue.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/crates/batcher/src/types/batch_queue.rs b/crates/batcher/src/types/batch_queue.rs index 27aef35371..5efabfaf6d 100644 --- a/crates/batcher/src/types/batch_queue.rs +++ b/crates/batcher/src/types/batch_queue.rs @@ -201,16 +201,19 @@ pub(crate) fn extract_batch_directly( return Err(BatcherError::BatchCostTooHigh); } - // Extract the batch entries (the ones that will pay enough) - let batch_entries = batch_queue.clone().into_sorted_vec(); - batch_queue.clear(); + // Extract remaining entries in sorted order + // Since pop() gives highest priority first, we collect them directly + let mut batch_for_posting = Vec::new(); + while let Some((entry, _)) = batch_queue.pop() { + batch_for_posting.push(entry); + } // Put back the rejected entries (they stay in the queue for later) for (entry, priority) in rejected_entries { batch_queue.push(entry, priority); } - Ok(batch_entries) + Ok(batch_for_posting) } fn calculate_fee_per_proof(batch_len: usize, gas_price: U256, constant_gas_cost: u128) -> U256 { From 9f0fd1557b597bbed968337dab63c595f3e59d75 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 22 Jul 2025 14:44:50 -0300 Subject: [PATCH 21/48] Simplify extract_batch --- crates/batcher/src/types/batch_queue.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/batcher/src/types/batch_queue.rs b/crates/batcher/src/types/batch_queue.rs index 5efabfaf6d..10edcc2f95 100644 --- a/crates/batcher/src/types/batch_queue.rs +++ b/crates/batcher/src/types/batch_queue.rs @@ -160,7 +160,7 @@ pub(crate) fn extract_batch_directly( let mut batch_size = calculate_batch_size(batch_queue)?; let mut rejected_entries = Vec::new(); - // Remove entries that won't pay enough (same logic as try_build_batch) + // Remove entries that won't pay enough, or that makes a queue that is too big loop { let should_remove = if let Some((entry, _)) = batch_queue.peek() { let batch_len = batch_queue.len(); From 26328ec9b9a68050d9a5db21effd2f704d2000fc Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 22 Jul 2025 14:46:39 -0300 Subject: [PATCH 22/48] Simplify batch queue creation algorithm --- crates/batcher/src/lib.rs | 133 +++++++++++++++--------- crates/batcher/src/types/batch_queue.rs | 1 - 2 files changed, 83 insertions(+), 51 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 4a99376c08..e0fad1a160 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -896,7 +896,6 @@ impl Batcher { return Ok(()); } - // When pre-verification is enabled, batcher will verify proofs for faster feedback with clients if self.pre_verification_is_enabled { let verification_data = &nonced_verification_data.verification_data; @@ -936,11 +935,10 @@ impl Batcher { return Ok(()); } } - + // * ---------------------------------------------------------------------* // * Perform validation over batcher queue * // * ---------------------------------------------------------------------* - let mut batch_state_lock = self.batch_state.lock().await; if batch_state_lock.is_queue_full() { @@ -970,13 +968,19 @@ impl Batcher { let entries_to_check: Vec<_> = batch_state_lock .batch_queue .iter() - .filter(|(entry, _)| entry.sender == candidate_addr && new_proof_fee > entry.nonced_verification_data.max_fee) + .filter(|(entry, _)| { + entry.sender == candidate_addr + && new_proof_fee > entry.nonced_verification_data.max_fee + }) .map(|(entry, _)| entry.clone()) .collect(); - + if let Some(target_entry) = entries_to_check.into_iter().next() { - let removed_entry = batch_state_lock.batch_queue.remove(&target_entry).map(|(e, _)| e); - + let removed_entry = batch_state_lock + .batch_queue + .remove(&target_entry) + .map(|(e, _)| e); + if let Some(removed) = removed_entry { info!( "Incoming proof (nonce: {}, fee: {}) replacing proof from sender {} with nonce {} (fee: {})", @@ -988,8 +992,12 @@ impl Batcher { ); // Update the evicted user's state immediately - self.update_evicted_user_state_with_lock(&removed, &batch_state_lock.batch_queue, &mut user_guard); - + self.update_evicted_user_state_with_lock( + &removed, + &batch_state_lock.batch_queue, + &mut user_guard, + ); + // Notify the evicted user if let Some(ref removed_entry_ws) = removed.messaging_sink { send_message( @@ -1089,7 +1097,7 @@ impl Batcher { ) { let replacement_max_fee = nonced_verification_data.max_fee; let nonce = nonced_verification_data.nonce; - + // Take user state lock first to maintain proper lock ordering let user_state = match self.user_states.get(&addr) { Some(user_state) => user_state, @@ -1192,7 +1200,7 @@ impl Batcher { // update max_fee_limit and total_fees_in_queue using already held user_state_guard let updated_max_fee_limit_in_batch = batch_state_guard.get_user_min_fee_in_batch(&addr); user_state_guard.last_max_fee_limit = updated_max_fee_limit_in_batch; - + let fee_difference = replacement_max_fee - original_max_fee; user_state_guard.total_fees_in_queue += fee_difference; } @@ -1325,7 +1333,7 @@ impl Batcher { // PHASE 1: Extract the batch directly from the queue to avoid race conditions let mut batch_state_lock = batch_state_lock; // Make mutable - + let finalized_batch = batch_queue::extract_batch_directly( &mut batch_state_lock.batch_queue, gas_price, @@ -1355,7 +1363,6 @@ impl Batcher { Some(finalized_batch) } - /// Updates user states based on current queue state after batch operations. /// Used for both successful batch confirmation and failed batch restoration. /// Updates proofs_in_batch, total_fees_in_queue, and last_max_fee_limit based on current queue state. @@ -1369,11 +1376,14 @@ impl Batcher { if let Some(user_state) = self.user_states.get(&addr) { let mut user_state_guard = user_state.lock().await; // First: user lock let batch_state_lock = self.batch_state.lock().await; // Second: batch lock - + // Calculate what each user's state should be based on current queue contents - let current_queue_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); - - if let Some((proof_count, min_max_fee_in_queue, total_fees_in_queue)) = current_queue_user_states.get(&addr) { + let current_queue_user_states = + self.calculate_new_user_states_data(&batch_state_lock.batch_queue); + + if let Some((proof_count, min_max_fee_in_queue, total_fees_in_queue)) = + current_queue_user_states.get(&addr) + { // User has proofs in queue - use calculated values user_state_guard.proofs_in_batch = *proof_count; user_state_guard.total_fees_in_queue = *total_fees_in_queue; @@ -1384,7 +1394,7 @@ impl Batcher { user_state_guard.total_fees_in_queue = U256::zero(); user_state_guard.last_max_fee_limit = U256::MAX; } - + drop(batch_state_lock); // Release batch lock drop(user_state_guard); // Release user lock } else { @@ -1400,12 +1410,11 @@ impl Batcher { /// but now have no proofs left in the queue. fn cleanup_user_states_after_successful_submission(&self, finalized_batch: &[BatchQueueEntry]) { use std::collections::HashSet; - + // Get unique users from the submitted batch - let users_in_batch: HashSet
= finalized_batch.iter() - .map(|entry| entry.sender) - .collect(); - + let users_in_batch: HashSet
= + finalized_batch.iter().map(|entry| entry.sender).collect(); + // Check current queue state to see which users still have proofs let batch_state_lock = match self.batch_state.try_lock() { Ok(lock) => lock, @@ -1415,9 +1424,10 @@ impl Batcher { return; } }; - - let current_user_states = self.calculate_new_user_states_data(&batch_state_lock.batch_queue); - + + let current_user_states = + self.calculate_new_user_states_data(&batch_state_lock.batch_queue); + // For each user in the batch, check if they now have no proofs left for user_addr in users_in_batch { if !current_user_states.contains_key(&user_addr) { @@ -1439,8 +1449,11 @@ impl Batcher { /// - For same fees, higher nonces get evicted first /// This ensures we never have nonce N+1 without nonce N in the queue. async fn restore_proofs_after_batch_failure(&self, failed_batch: &[BatchQueueEntry]) { - info!("Restoring {} proofs to queue after batch failure", failed_batch.len()); - + info!( + "Restoring {} proofs to queue after batch failure", + failed_batch.len() + ); + let mut batch_state_lock = self.batch_state.lock().await; let mut restored_entries = Vec::new(); @@ -1462,10 +1475,14 @@ impl Batcher { if let Some((evicted_entry, _)) = batch_state_lock.batch_queue.pop() { warn!("Queue full during restoration, evicting proof from sender {} with nonce {} (fee: {})", evicted_entry.sender, evicted_entry.nonced_verification_data.nonce, evicted_entry.nonced_verification_data.max_fee); - + // Update user state for evicted entry - self.update_evicted_user_state_async(&evicted_entry, &batch_state_lock.batch_queue).await; - + self.update_evicted_user_state_async( + &evicted_entry, + &batch_state_lock.batch_queue, + ) + .await; + // Notify the evicted user via websocket if let Some(evicted_ws_sink) = evicted_entry.messaging_sink { connection::send_message( @@ -1488,23 +1505,31 @@ impl Batcher { restored_entries.push(entry); } - info!("Restored {} proofs to queue, new queue length: {}", restored_entries.len(), batch_state_lock.batch_queue.len()); - + info!( + "Restored {} proofs to queue, new queue length: {}", + restored_entries.len(), + batch_state_lock.batch_queue.len() + ); + // Get unique users from restored entries - let users_with_restored_proofs: std::collections::HashSet
= restored_entries.iter() - .map(|entry| entry.sender) - .collect(); - + let users_with_restored_proofs: std::collections::HashSet
= + restored_entries.iter().map(|entry| entry.sender).collect(); + drop(batch_state_lock); // Release batch lock before user state updates - + // Update user states for successfully restored proofs info!("Updating user states after proof restoration..."); - if let Err(e) = self.update_user_states_from_queue_state(users_with_restored_proofs).await { - error!("Failed to update user states after proof restoration: {:?}", e); + if let Err(e) = self + .update_user_states_from_queue_state(users_with_restored_proofs) + .await + { + error!( + "Failed to update user states after proof restoration: {:?}", + e + ); } } - /// Takes the finalized batch as input and: /// builds the merkle tree /// posts verification data batch to s3 @@ -1610,14 +1635,19 @@ impl Batcher { // Note: Proofs were already removed from the queue during extraction phase // Now update user states based on current queue state after successful submission info!("Updating user states after batch confirmation..."); - let users_in_batch: std::collections::HashSet
= finalized_batch.iter() - .map(|entry| entry.sender) - .collect(); - if let Err(e) = self.update_user_states_from_queue_state(users_in_batch).await { - error!("Failed to update user states after batch confirmation: {:?}", e); + let users_in_batch: std::collections::HashSet
= + finalized_batch.iter().map(|entry| entry.sender).collect(); + if let Err(e) = self + .update_user_states_from_queue_state(users_in_batch) + .await + { + error!( + "Failed to update user states after batch confirmation: {:?}", + e + ); // Continue with the rest of the process since batch was already submitted successfully } - + // Clean up user states for users who had proofs in this batch but now have no proofs left self.cleanup_user_states_after_successful_submission(finalized_batch); @@ -1694,7 +1724,6 @@ impl Batcher { .extract_batch_if_ready(block_number, modified_gas_price) .await { - let batch_finalization_result = self .finalize_batch(block_number, &finalized_batch, modified_gas_price) .await; @@ -1705,8 +1734,12 @@ impl Batcher { // If batch finalization failed, restore the proofs to the queue if let Err(e) = batch_finalization_result { - error!("Batch finalization failed, restoring proofs to queue: {:?}", e); - self.restore_proofs_after_batch_failure(&finalized_batch).await; + error!( + "Batch finalization failed, restoring proofs to queue: {:?}", + e + ); + self.restore_proofs_after_batch_failure(&finalized_batch) + .await; return Err(e); } } diff --git a/crates/batcher/src/types/batch_queue.rs b/crates/batcher/src/types/batch_queue.rs index 10edcc2f95..bfd378ab09 100644 --- a/crates/batcher/src/types/batch_queue.rs +++ b/crates/batcher/src/types/batch_queue.rs @@ -146,7 +146,6 @@ pub(crate) fn calculate_batch_size(batch_queue: &BatchQueue) -> Result Date: Tue, 22 Jul 2025 14:54:54 -0300 Subject: [PATCH 23/48] Add test for only one proof in queue --- crates/batcher/src/types/batch_queue.rs | 64 +++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/crates/batcher/src/types/batch_queue.rs b/crates/batcher/src/types/batch_queue.rs index bfd378ab09..0b83bc7e20 100644 --- a/crates/batcher/src/types/batch_queue.rs +++ b/crates/batcher/src/types/batch_queue.rs @@ -797,6 +797,70 @@ mod test { ); } + #[test] + fn batch_finalization_algorithm_works_single_high_fee_proof() { + // Test the scenario: 1 proof with high fee that should be viable + let proof_generator_addr = Address::random(); + let payment_service_addr = Address::random(); + let sender_addr = Address::random(); + let bytes_for_verification_data = vec![42_u8; 10]; + let dummy_signature = Signature { + r: U256::from(1), + s: U256::from(2), + v: 3, + }; + let verification_data = VerificationData { + proving_system: ProvingSystemId::Risc0, + proof: bytes_for_verification_data.clone(), + pub_input: Some(bytes_for_verification_data.clone()), + verification_key: Some(bytes_for_verification_data.clone()), + vm_program_code: Some(bytes_for_verification_data), + proof_generator_addr, + }; + let chain_id = U256::from(42); + + // Single entry with very high fee - should definitely be viable + let nonce = U256::from(1); + let high_max_fee = U256::from(1_000_000_000_000_000_000u128); // Very high fee - 1 ETH + let nonced_verification_data = NoncedVerificationData::new( + verification_data, + nonce, + high_max_fee, + chain_id, + payment_service_addr, + ); + let vd_commitment: VerificationDataCommitment = nonced_verification_data.clone().into(); + let entry = BatchQueueEntry::new_for_testing( + nonced_verification_data, + vd_commitment, + dummy_signature, + sender_addr, + ); + let batch_priority = BatchQueueEntryPriority::new(high_max_fee, nonce); + + let mut batch_queue = BatchQueue::new(); + batch_queue.push(entry, batch_priority); + + let gas_price = U256::from(10_000_000_000u64); // 10 gwei gas price + let mut test_queue = batch_queue.clone(); + let finalized_batch = extract_batch_directly( + &mut test_queue, + gas_price, + 5000000, // Large byte size limit + 50, // Large proof quantity limit + DEFAULT_CONSTANT_GAS_COST, + ); + + // This should succeed and return the single proof + assert!(finalized_batch.is_ok(), "Should successfully extract batch with single high-fee proof"); + let batch = finalized_batch.unwrap(); + assert_eq!(batch.len(), 1, "Batch should contain exactly 1 proof"); + assert_eq!(batch[0].nonced_verification_data.max_fee, high_max_fee); + + // The queue should now be empty (no rejected entries to put back) + assert_eq!(test_queue.len(), 0, "Queue should be empty after extracting the single viable proof"); + } + #[test] fn batch_finalization_algorithm_works_not_bigger_than_max_batch_proof_qty() { // The following information will be the same for each entry, it is just some dummy data to see From 0c6159a805a5302d876da402549ea0643c4e1d7a Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 22 Jul 2025 15:37:56 -0300 Subject: [PATCH 24/48] Remove unused lock --- crates/batcher/src/lib.rs | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index e0fad1a160..63582f9daa 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -103,10 +103,7 @@ pub struct Batcher { /// needs to be stopped, and all user_states locks need to be taken batch_state: Mutex, user_states: DashMap>>, - /// When posting a task, this is taken as a write to stop new threads to update - /// user_states, ideally we would want a bigger mutex on the whole user_states, but this can't be done - batch_processing_lock: RwLock<()>, - + last_uploaded_batch_block: Mutex, /// This is used to avoid multiple batches being submitted at the same time @@ -712,8 +709,6 @@ impl Batcher { client_msg: Box, ws_conn_sink: WsMessageSink, ) -> Result<(), Error> { - // Acquire read lock to allow concurrent user processing but block during batch creation - let _batch_processing_guard = self.batch_processing_lock.read().await; let msg_nonce = client_msg.verification_data.nonce; debug!("Received message with nonce: {msg_nonce:?}"); @@ -870,6 +865,8 @@ impl Batcher { // In this case, the message might be a replacement one. If it is valid, // we replace the old entry with the new from the replacement message. + // Notice this stops the normal flow of the handle_submit_proof. + // locks will be taken inside this function if expected_nonce > msg_nonce { info!("Possible replacement message received: Expected nonce {expected_nonce:?} - message nonce: {msg_nonce:?}"); self.handle_replacement_message( @@ -1543,8 +1540,6 @@ impl Batcher { finalized_batch: &[BatchQueueEntry], gas_price: U256, ) -> Result<(), BatcherError> { - // Acquire write lock to ensure exclusive access during batch creation (blocks all user processing) - let _batch_processing_guard = self.batch_processing_lock.write().await; let nonced_batch_verifcation_data: Vec = finalized_batch .iter() From b88aa2f8132b57eab968a6a9802ab6c5b0ac01c7 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 22 Jul 2025 15:54:40 -0300 Subject: [PATCH 25/48] Fix deadlock in replacement message --- crates/batcher/src/lib.rs | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 63582f9daa..c1e6f13d09 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -289,7 +289,6 @@ impl Batcher { .aggregator_fee_percentage_multiplier, aggregator_gas_cost: config.batcher.aggregator_gas_cost, posting_batch: Mutex::new(false), - batch_processing_lock: RwLock::new(()), batch_state: Mutex::new(batch_state), user_states, disabled_verifiers: Mutex::new(disabled_verifiers), @@ -866,7 +865,8 @@ impl Batcher { // In this case, the message might be a replacement one. If it is valid, // we replace the old entry with the new from the replacement message. // Notice this stops the normal flow of the handle_submit_proof. - // locks will be taken inside this function + // We pass the already-held user_state_guard to avoid double-locking + // This will take the batch lock internally if expected_nonce > msg_nonce { info!("Possible replacement message received: Expected nonce {expected_nonce:?} - message nonce: {msg_nonce:?}"); self.handle_replacement_message( @@ -874,6 +874,7 @@ impl Batcher { ws_conn_sink.clone(), client_msg.signature, addr, + user_state_guard, ) .await; @@ -1091,25 +1092,10 @@ impl Batcher { ws_conn_sink: WsMessageSink, signature: Signature, addr: Address, + mut user_state_guard: tokio::sync::MutexGuard<'_, UserState>, ) { let replacement_max_fee = nonced_verification_data.max_fee; let nonce = nonced_verification_data.nonce; - - // Take user state lock first to maintain proper lock ordering - let user_state = match self.user_states.get(&addr) { - Some(user_state) => user_state, - None => { - warn!("User state not found for address {addr} during replacement message"); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::InvalidNonce, - ) - .await; - self.metrics.user_error(&["invalid_nonce", ""]); - return; - } - }; - let mut user_state_guard = user_state.lock().await; // First: user lock let mut batch_state_guard = self.batch_state.lock().await; // Second: batch lock let Some(entry) = batch_state_guard.get_entry(addr, nonce) else { drop(batch_state_guard); From 613e14f0c17323fe07332d021946f419fe033124 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 29 Jul 2025 11:18:48 -0300 Subject: [PATCH 26/48] Fmt --- crates/batcher/src/lib.rs | 4 +--- crates/batcher/src/types/batch_queue.rs | 15 +++++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index c1e6f13d09..8cda4a8a74 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -103,7 +103,7 @@ pub struct Batcher { /// needs to be stopped, and all user_states locks need to be taken batch_state: Mutex, user_states: DashMap>>, - + last_uploaded_batch_block: Mutex, /// This is used to avoid multiple batches being submitted at the same time @@ -708,7 +708,6 @@ impl Batcher { client_msg: Box, ws_conn_sink: WsMessageSink, ) -> Result<(), Error> { - let msg_nonce = client_msg.verification_data.nonce; debug!("Received message with nonce: {msg_nonce:?}"); self.metrics.received_proofs.inc(); @@ -1526,7 +1525,6 @@ impl Batcher { finalized_batch: &[BatchQueueEntry], gas_price: U256, ) -> Result<(), BatcherError> { - let nonced_batch_verifcation_data: Vec = finalized_batch .iter() .map(|entry| entry.nonced_verification_data.clone()) diff --git a/crates/batcher/src/types/batch_queue.rs b/crates/batcher/src/types/batch_queue.rs index 0b83bc7e20..f8de63c7ec 100644 --- a/crates/batcher/src/types/batch_queue.rs +++ b/crates/batcher/src/types/batch_queue.rs @@ -847,18 +847,25 @@ mod test { &mut test_queue, gas_price, 5000000, // Large byte size limit - 50, // Large proof quantity limit + 50, // Large proof quantity limit DEFAULT_CONSTANT_GAS_COST, ); // This should succeed and return the single proof - assert!(finalized_batch.is_ok(), "Should successfully extract batch with single high-fee proof"); + assert!( + finalized_batch.is_ok(), + "Should successfully extract batch with single high-fee proof" + ); let batch = finalized_batch.unwrap(); assert_eq!(batch.len(), 1, "Batch should contain exactly 1 proof"); assert_eq!(batch[0].nonced_verification_data.max_fee, high_max_fee); - + // The queue should now be empty (no rejected entries to put back) - assert_eq!(test_queue.len(), 0, "Queue should be empty after extracting the single viable proof"); + assert_eq!( + test_queue.len(), + 0, + "Queue should be empty after extracting the single viable proof" + ); } #[test] From 52781d6a623b781aa995fee159b75077123ff25e Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 4 Aug 2025 18:17:39 -0300 Subject: [PATCH 27/48] Make async message for ejection --- crates/batcher/src/lib.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 8cda4a8a74..b2827df46c 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -995,13 +995,18 @@ impl Batcher { &mut user_guard, ); - // Notify the evicted user if let Some(ref removed_entry_ws) = removed.messaging_sink { - send_message( - removed_entry_ws.clone(), - SubmitProofResponseMessage::UnderpricedProof, - ) - .await; + let ws_sink = removed_entry_ws.clone(); + // Usually we just drop the locks, but this time + // We still need to keep them since we are doing more work + // So we send the message in an async manner + tokio::spawn(async move { + send_message( + ws_sink, + SubmitProofResponseMessage::UnderpricedProof, + ) + .await; + }); } evicted_entry = Some(removed); From c63a41f1e335e4330f6037bb521284951c887a89 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 4 Aug 2025 18:48:04 -0300 Subject: [PATCH 28/48] Add verification to replacement messages --- crates/batcher/src/lib.rs | 56 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index b2827df46c..c023ac11d6 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -1115,22 +1115,69 @@ impl Batcher { }; let original_max_fee = entry.nonced_verification_data.max_fee; - if original_max_fee > replacement_max_fee { + // Require 10% fee increase to prevent DoS attacks. While this could theoretically overflow, + // it would require an attacker to have an impractical amount of Ethereum to reach U256::MAX + let min_required_fee = original_max_fee + (original_max_fee / U256::from(10)); // 10% increase (1.1x) + if replacement_max_fee < min_required_fee { drop(batch_state_guard); drop(user_state_guard); - warn!("Invalid replacement message for address {addr}, had max fee: {original_max_fee:?}, received fee: {replacement_max_fee:?}"); + info!("Replacement message fee increase too small for address {addr}. Original: {original_max_fee:?}, received: {replacement_max_fee:?}, minimum required: {min_required_fee:?}"); send_message( ws_conn_sink.clone(), SubmitProofResponseMessage::InvalidReplacementMessage, ) .await; self.metrics - .user_error(&["invalid_replacement_message", ""]); + .user_error(&["insufficient_fee_increase", ""]); return; } info!("Replacing message for address {addr} with nonce {nonce} and max fee {replacement_max_fee}"); + // When pre-verification is enabled, verify the replacement proof + if self.pre_verification_is_enabled { + let verification_data = &nonced_verification_data.verification_data; + if self + .is_verifier_disabled(verification_data.proving_system) + .await + { + drop(batch_state_guard); + drop(user_state_guard); + warn!( + "Verifier for proving system {} is disabled for replacement message", + verification_data.proving_system + ); + send_message( + ws_conn_sink.clone(), + SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::DisabledVerifier( + verification_data.proving_system, + )), + ) + .await; + self.metrics.user_error(&[ + "disabled_verifier", + &format!("{}", verification_data.proving_system), + ]); + return; + } + + if !zk_utils::verify(verification_data).await { + drop(batch_state_guard); + drop(user_state_guard); + error!("Invalid replacement proof detected. Verification failed"); + send_message( + ws_conn_sink.clone(), + SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::RejectedProof), + ) + .await; + self.metrics.user_error(&[ + "rejected_proof", + &format!("{}", verification_data.proving_system), + ]); + return; + } + } + // The replacement entry is built from the old entry and validated for then to be replaced let mut replacement_entry = entry.clone(); replacement_entry.signature = signature; @@ -1157,7 +1204,8 @@ impl Batcher { replacement_entry.messaging_sink = Some(ws_conn_sink.clone()); if !batch_state_guard.replacement_entry_is_valid(&replacement_entry) { - std::mem::drop(batch_state_guard); + drop(batch_state_guard); + drop(user_state_guard); warn!("Invalid replacement message"); send_message( ws_conn_sink.clone(), From 91ef45f2a0626a6cfe0df24400d65a2946f3e025 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 4 Aug 2025 18:54:58 -0300 Subject: [PATCH 29/48] Refactor verify function --- crates/batcher/src/lib.rs | 137 ++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 80 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index c023ac11d6..71e94816c7 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -893,44 +893,11 @@ impl Batcher { return Ok(()); } - // When pre-verification is enabled, batcher will verify proofs for faster feedback with clients - if self.pre_verification_is_enabled { - let verification_data = &nonced_verification_data.verification_data; - if self - .is_verifier_disabled(verification_data.proving_system) - .await - { - warn!( - "Verifier for proving system {} is disabled, skipping verification", - verification_data.proving_system - ); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::DisabledVerifier( - verification_data.proving_system, - )), - ) - .await; - self.metrics.user_error(&[ - "disabled_verifier", - &format!("{}", verification_data.proving_system), - ]); - return Ok(()); - } - - if !zk_utils::verify(verification_data).await { - error!("Invalid proof detected. Verification failed"); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::RejectedProof), - ) - .await; - self.metrics.user_error(&[ - "rejected_proof", - &format!("{}", verification_data.proving_system), - ]); - return Ok(()); - } + if !self.verify_proof_if_enabled( + &nonced_verification_data.verification_data, + ws_conn_sink.clone(), + ).await { + return Ok(()); } // * ---------------------------------------------------------------------* @@ -1134,48 +1101,13 @@ impl Batcher { info!("Replacing message for address {addr} with nonce {nonce} and max fee {replacement_max_fee}"); - // When pre-verification is enabled, verify the replacement proof - if self.pre_verification_is_enabled { - let verification_data = &nonced_verification_data.verification_data; - if self - .is_verifier_disabled(verification_data.proving_system) - .await - { - drop(batch_state_guard); - drop(user_state_guard); - warn!( - "Verifier for proving system {} is disabled for replacement message", - verification_data.proving_system - ); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::DisabledVerifier( - verification_data.proving_system, - )), - ) - .await; - self.metrics.user_error(&[ - "disabled_verifier", - &format!("{}", verification_data.proving_system), - ]); - return; - } - - if !zk_utils::verify(verification_data).await { - drop(batch_state_guard); - drop(user_state_guard); - error!("Invalid replacement proof detected. Verification failed"); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::RejectedProof), - ) - .await; - self.metrics.user_error(&[ - "rejected_proof", - &format!("{}", verification_data.proving_system), - ]); - return; - } + if !self.verify_proof_if_enabled( + &nonced_verification_data.verification_data, + ws_conn_sink.clone(), + ).await { + drop(batch_state_guard); + drop(user_state_guard); + return; } // The replacement entry is built from the old entry and validated for then to be replaced @@ -1240,6 +1172,51 @@ impl Batcher { user_state_guard.total_fees_in_queue += fee_difference; } + async fn verify_proof_if_enabled( + &self, + verification_data: &aligned_sdk::common::types::VerificationData, + ws_conn_sink: WsMessageSink, + ) -> bool { + if !self.pre_verification_is_enabled { + return true; + } + + if self + .is_verifier_disabled(verification_data.proving_system) + .await + { + warn!("Verifier for proving system {} is disabled", verification_data.proving_system); + send_message( + ws_conn_sink, + SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::DisabledVerifier( + verification_data.proving_system, + )), + ) + .await; + self.metrics.user_error(&[ + "disabled_verifier", + &format!("{}", verification_data.proving_system), + ]); + return false; + } + + if !zk_utils::verify(verification_data).await { + error!("Invalid proof detected. Verification failed"); + send_message( + ws_conn_sink, + SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::RejectedProof), + ) + .await; + self.metrics.user_error(&[ + "rejected_proof", + &format!("{}", verification_data.proving_system), + ]); + return false; + } + + true + } + async fn disabled_verifiers(&self) -> Result> { match self.service_manager.disabled_verifiers().call().await { Ok(disabled_verifiers) => Ok(disabled_verifiers), From 7f2d1ce0aba01bd95297bee05190637447c4cd59 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 5 Aug 2025 15:03:56 -0300 Subject: [PATCH 30/48] Fmt --- crates/batcher/src/lib.rs | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 71e94816c7..d52547ffd6 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -893,10 +893,13 @@ impl Batcher { return Ok(()); } - if !self.verify_proof_if_enabled( - &nonced_verification_data.verification_data, - ws_conn_sink.clone(), - ).await { + if !self + .verify_proof_if_enabled( + &nonced_verification_data.verification_data, + ws_conn_sink.clone(), + ) + .await + { return Ok(()); } @@ -1094,17 +1097,19 @@ impl Batcher { SubmitProofResponseMessage::InvalidReplacementMessage, ) .await; - self.metrics - .user_error(&["insufficient_fee_increase", ""]); + self.metrics.user_error(&["insufficient_fee_increase", ""]); return; } info!("Replacing message for address {addr} with nonce {nonce} and max fee {replacement_max_fee}"); - if !self.verify_proof_if_enabled( - &nonced_verification_data.verification_data, - ws_conn_sink.clone(), - ).await { + if !self + .verify_proof_if_enabled( + &nonced_verification_data.verification_data, + ws_conn_sink.clone(), + ) + .await + { drop(batch_state_guard); drop(user_state_guard); return; @@ -1185,7 +1190,10 @@ impl Batcher { .is_verifier_disabled(verification_data.proving_system) .await { - warn!("Verifier for proving system {} is disabled", verification_data.proving_system); + warn!( + "Verifier for proving system {} is disabled", + verification_data.proving_system + ); send_message( ws_conn_sink, SubmitProofResponseMessage::InvalidProof(ProofInvalidReason::DisabledVerifier( From 763e72591fb7b87eaa134ab676806b140b1d14c8 Mon Sep 17 00:00:00 2001 From: JuArce <52429267+JuArce@users.noreply.github.com> Date: Tue, 5 Aug 2025 16:33:40 -0300 Subject: [PATCH 31/48] fix: remove wrong comment --- crates/batcher/src/types/batch_queue.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/batcher/src/types/batch_queue.rs b/crates/batcher/src/types/batch_queue.rs index f8de63c7ec..3e749c0d68 100644 --- a/crates/batcher/src/types/batch_queue.rs +++ b/crates/batcher/src/types/batch_queue.rs @@ -106,7 +106,7 @@ impl Ord for BatchQueueEntryPriority { // Implementation of lowest-first: let ord: std::cmp::Ordering = other.max_fee.cmp(&self.max_fee); // This means, less max_fee will go first - // We want this because we will .pop() to remove unwanted elements, low fee submitions. + // We want this because we will .pop() to remove unwanted elements, low fee submissions. if ord == std::cmp::Ordering::Equal { // Case of same max_fee: @@ -200,8 +200,7 @@ pub(crate) fn extract_batch_directly( return Err(BatcherError::BatchCostTooHigh); } - // Extract remaining entries in sorted order - // Since pop() gives highest priority first, we collect them directly + // Extract remaining entries let mut batch_for_posting = Vec::new(); while let Some((entry, _)) = batch_queue.pop() { batch_for_posting.push(entry); From ac8dc95bbe3c132be240c856b27da8333df1da16 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 6 Aug 2025 16:56:20 -0300 Subject: [PATCH 32/48] Add timeouts to lock in handle_messages --- crates/batcher/src/lib.rs | 64 +++++++++++++++++++++-- crates/batcher/src/metrics.rs | 24 +++++++++ crates/cli/src/main.rs | 3 ++ crates/sdk/src/common/errors.rs | 1 + crates/sdk/src/common/types.rs | 2 + crates/sdk/src/communication/messaging.rs | 4 ++ crates/sdk/src/verification_layer/mod.rs | 3 ++ 7 files changed, 97 insertions(+), 4 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index d52547ffd6..9ecab5e47a 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -374,6 +374,36 @@ impl Batcher { updated_user_states } + /// Helper to apply 15-second timeout to user lock acquisition with consistent logging and metrics + async fn try_user_lock_with_timeout(&self, addr: Address, lock_future: F) -> Option + where + F: std::future::Future, + { + match timeout(Duration::from_secs(15), lock_future).await { + Ok(result) => Some(result), + Err(_) => { + warn!("User lock acquisition timed out for address {}", addr); + self.metrics.inc_message_handler_user_lock_timeout(); + None + } + } + } + + /// Helper to apply 15-second timeout to batch lock acquisition with consistent logging and metrics + async fn try_batch_lock_with_timeout(&self, lock_future: F) -> Option + where + F: std::future::Future, + { + match timeout(Duration::from_secs(15), lock_future).await { + Ok(result) => Some(result), + Err(_) => { + warn!("Batch lock acquisition timed out"); + self.metrics.inc_message_handler_batch_lock_timeout(); + None + } + } + } + pub async fn listen_connections(self: Arc, address: &str) -> Result<(), BatcherError> { // Create the event loop and TCP listener we'll accept connections on. let listener = TcpListener::bind(address) @@ -659,7 +689,14 @@ impl Batcher { let user_state_ref = self.user_states.get(&address); match user_state_ref { Some(user_state_ref) => { - let user_state_guard = user_state_ref.lock().await; + let Some(user_state_guard) = self + .try_user_lock_with_timeout(address, user_state_ref.lock()) + .await + else { + send_message(ws_conn_sink.clone(), GetNonceResponseMessage::ServerBusy) + .await; + return Ok(()); + }; Some(user_state_guard.nonce) } None => None, @@ -795,7 +832,13 @@ impl Batcher { }; // We acquire the lock on the user state, now everything will be processed sequentially - let mut user_state_guard = user_state_ref.lock().await; + let Some(mut user_state_guard) = self + .try_user_lock_with_timeout(addr, user_state_ref.lock()) + .await + else { + send_message(ws_conn_sink.clone(), SubmitProofResponseMessage::ServerBusy).await; + return Ok(()); + }; // If the user state was not present, we need to get the nonce from the Ethereum contract and update the dummy user state if !is_user_in_state { @@ -907,7 +950,13 @@ impl Batcher { // * Perform validation over batcher queue * // * ---------------------------------------------------------------------* - let mut batch_state_lock = self.batch_state.lock().await; + let Some(mut batch_state_lock) = self + .try_batch_lock_with_timeout(self.batch_state.lock()) + .await + else { + send_message(ws_conn_sink.clone(), SubmitProofResponseMessage::ServerBusy).await; + return Ok(()); + }; if batch_state_lock.is_queue_full() { debug!("Batch queue is full. Evaluating if the incoming proof can replace a lower-priority entry."); @@ -1070,7 +1119,14 @@ impl Batcher { ) { let replacement_max_fee = nonced_verification_data.max_fee; let nonce = nonced_verification_data.nonce; - let mut batch_state_guard = self.batch_state.lock().await; // Second: batch lock + let Some(mut batch_state_guard) = self + .try_batch_lock_with_timeout(self.batch_state.lock()) + .await + else { + drop(user_state_guard); + send_message(ws_conn_sink.clone(), SubmitProofResponseMessage::ServerBusy).await; + return; + }; let Some(entry) = batch_state_guard.get_entry(addr, nonce) else { drop(batch_state_guard); drop(user_state_guard); diff --git a/crates/batcher/src/metrics.rs b/crates/batcher/src/metrics.rs index 0ae7117c46..1b72f72de2 100644 --- a/crates/batcher/src/metrics.rs +++ b/crates/batcher/src/metrics.rs @@ -27,6 +27,8 @@ pub struct BatcherMetrics { pub cancel_create_new_task_duration: IntGauge, pub batcher_gas_cost_create_task_total: GenericCounter, pub batcher_gas_cost_cancel_task_total: GenericCounter, + pub message_handler_user_lock_timeouts: IntCounter, + pub message_handler_batch_lock_timeouts: IntCounter, } impl BatcherMetrics { @@ -80,6 +82,16 @@ impl BatcherMetrics { "Batcher Gas Cost Cancel Task Total" ))?; + let message_handler_user_lock_timeouts = register_int_counter!(opts!( + "message_handler_user_lock_timeouts_count", + "Message Handler User Lock Timeouts" + ))?; + + let message_handler_batch_lock_timeouts = register_int_counter!(opts!( + "message_handler_batch_lock_timeouts_count", + "Message Handler Batch Lock Timeouts" + ))?; + registry.register(Box::new(open_connections.clone()))?; registry.register(Box::new(received_proofs.clone()))?; registry.register(Box::new(sent_batches.clone()))?; @@ -96,6 +108,8 @@ impl BatcherMetrics { registry.register(Box::new(cancel_create_new_task_duration.clone()))?; registry.register(Box::new(batcher_gas_cost_create_task_total.clone()))?; registry.register(Box::new(batcher_gas_cost_cancel_task_total.clone()))?; + registry.register(Box::new(message_handler_user_lock_timeouts.clone()))?; + registry.register(Box::new(message_handler_batch_lock_timeouts.clone()))?; let metrics_route = warp::path!("metrics") .and(warp::any().map(move || registry.clone())) @@ -124,6 +138,8 @@ impl BatcherMetrics { cancel_create_new_task_duration, batcher_gas_cost_create_task_total, batcher_gas_cost_cancel_task_total, + message_handler_user_lock_timeouts, + message_handler_batch_lock_timeouts, }) } @@ -158,4 +174,12 @@ impl BatcherMetrics { self.queue_len.set(queue_len); self.queue_size_bytes.set(queue_size); } + + pub fn inc_message_handler_user_lock_timeout(&self) { + self.message_handler_user_lock_timeouts.inc(); + } + + pub fn inc_message_handler_batch_lock_timeout(&self) { + self.message_handler_batch_lock_timeouts.inc(); + } } diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index ebada4d68e..a9778cbda2 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -567,6 +567,9 @@ async fn main() -> Result<(), AlignedError> { aligned_sdk::common::errors::GetNonceError::UnexpectedResponse(e) => { SubmitError::UnexpectedBatcherResponse(e) } + aligned_sdk::common::errors::GetNonceError::GenericError(e) => { + SubmitError::GenericError(e) + } })?, }; diff --git a/crates/sdk/src/common/errors.rs b/crates/sdk/src/common/errors.rs index 30d1147242..4be1dfa451 100644 --- a/crates/sdk/src/common/errors.rs +++ b/crates/sdk/src/common/errors.rs @@ -251,6 +251,7 @@ pub enum GetNonceError { UnexpectedResponse(String), InvalidRequest(String), ProtocolMismatch { current: u16, expected: u16 }, + GenericError(String), } #[derive(Debug)] diff --git a/crates/sdk/src/common/types.rs b/crates/sdk/src/common/types.rs index f9850910f4..5f1a9931fc 100644 --- a/crates/sdk/src/common/types.rs +++ b/crates/sdk/src/common/types.rs @@ -401,6 +401,7 @@ pub enum SubmitProofResponseMessage { EthRpcError, InvalidPaymentServiceAddress(Address, Address), UnderpricedProof, + ServerBusy, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -408,6 +409,7 @@ pub enum GetNonceResponseMessage { Nonce(U256), EthRpcError(String), InvalidRequest(String), + ServerBusy, } #[derive(Debug, Clone)] diff --git a/crates/sdk/src/communication/messaging.rs b/crates/sdk/src/communication/messaging.rs index 2a1c100e7d..a4c746f2cb 100644 --- a/crates/sdk/src/communication/messaging.rs +++ b/crates/sdk/src/communication/messaging.rs @@ -269,6 +269,10 @@ async fn handle_batcher_response(msg: Message) -> Result { + error!("Server is busy processing requests, please retry. Funds have not been spent."); + Err(SubmitError::GenericError("Server is busy processing requests, please retry".to_string())) + } Err(e) => { error!( "Error while deserializing batch inclusion data: {}. Funds have not been spent.", diff --git a/crates/sdk/src/verification_layer/mod.rs b/crates/sdk/src/verification_layer/mod.rs index 82d4604da7..9ddadba5b8 100644 --- a/crates/sdk/src/verification_layer/mod.rs +++ b/crates/sdk/src/verification_layer/mod.rs @@ -584,6 +584,9 @@ pub async fn get_nonce_from_batcher( Ok(GetNonceResponseMessage::Nonce(nonce)) => Ok(nonce), Ok(GetNonceResponseMessage::EthRpcError(e)) => Err(GetNonceError::EthRpcError(e)), Ok(GetNonceResponseMessage::InvalidRequest(e)) => Err(GetNonceError::InvalidRequest(e)), + Ok(GetNonceResponseMessage::ServerBusy) => Err(GetNonceError::GenericError( + "Server is busy processing requests, please retry".to_string(), + )), Err(_) => Err(GetNonceError::SerializationError( "Failed to deserialize batcher message".to_string(), )), From 6c4689a14af88b7321b522579fdb76e6b5294672 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 6 Aug 2025 17:37:46 -0300 Subject: [PATCH 33/48] Stop processing users while restoring the queue --- crates/batcher/src/lib.rs | 41 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 9ecab5e47a..e9d47b93c0 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -102,6 +102,11 @@ pub struct Batcher { /// - Batch creation needs to be able to change all the states, so all processing /// needs to be stopped, and all user_states locks need to be taken batch_state: Mutex, + /// Flag to indicate when restoration is in progress + /// When true, message handlers will return ServerBusy responses + /// It's used a way to "lock" all the user_states at the same time + /// If one needed is taken in the handle message it will timeout + is_restoration_in_progress: RwLock, user_states: DashMap>>, last_uploaded_batch_block: Mutex, @@ -113,6 +118,8 @@ pub struct Batcher { disabled_verifiers: Mutex, + + // Observability and monitoring pub metrics: metrics::BatcherMetrics, pub telemetry: TelemetrySender, @@ -292,6 +299,7 @@ impl Batcher { batch_state: Mutex::new(batch_state), user_states, disabled_verifiers: Mutex::new(disabled_verifiers), + is_restoration_in_progress: RwLock::new(false), metrics, telemetry, } @@ -665,6 +673,17 @@ impl Batcher { mut address: Address, ws_conn_sink: WsMessageSink, ) -> Result<(), Error> { + // Check if restoration is in progress + if *self.is_restoration_in_progress.read().await { + warn!( + "Rejecting nonce request from {} during restoration", + address + ); + let response = GetNonceResponseMessage::ServerBusy; + send_message(ws_conn_sink, response).await; + return Ok(()); + } + // If the address is not paying, we will return the nonce of the aligned_payment_address if !self.has_to_pay(&address) { info!("Handling nonpaying message"); @@ -749,6 +768,17 @@ impl Batcher { debug!("Received message with nonce: {msg_nonce:?}"); self.metrics.received_proofs.inc(); + // Check if restoration is in progress + if *self.is_restoration_in_progress.read().await { + warn!( + "Rejecting proof submission from {} during restoration (nonce: {})", + client_msg.verification_data.verification_data.proof_generator_addr, msg_nonce + ); + let response = SubmitProofResponseMessage::ServerBusy; + send_message(ws_conn_sink, response).await; + return Ok(()); + } + // * ---------------------------------------------------* // * Perform validations over the message * // * ---------------------------------------------------* @@ -1530,6 +1560,9 @@ impl Batcher { failed_batch.len() ); + // Set restoration flag to stop handling new user messages + *self.is_restoration_in_progress.write().await = true; + let mut batch_state_lock = self.batch_state.lock().await; let mut restored_entries = Vec::new(); @@ -1591,10 +1624,10 @@ impl Batcher { let users_with_restored_proofs: std::collections::HashSet
= restored_entries.iter().map(|entry| entry.sender).collect(); - drop(batch_state_lock); // Release batch lock before user state updates - // Update user states for successfully restored proofs info!("Updating user states after proof restoration..."); + // TODO: We may have ejected some users that didn't have restored proofs, + // we should include in this list the ejected users if let Err(e) = self .update_user_states_from_queue_state(users_with_restored_proofs) .await @@ -1604,6 +1637,10 @@ impl Batcher { e ); } + + // Clear restoration flag to allow normal user message handling + *self.is_restoration_in_progress.write().await = false; + info!("Proof restoration completed, resuming normal operations"); } /// Takes the finalized batch as input and: From 7d86f474a97b64a25f0fa78c316bade39db68ecb Mon Sep 17 00:00:00 2001 From: MauroFab Date: Thu, 7 Aug 2025 13:02:11 -0300 Subject: [PATCH 34/48] Add improved failure handling --- crates/batcher/src/lib.rs | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index e9d47b93c0..c5da84257b 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -102,11 +102,12 @@ pub struct Batcher { /// - Batch creation needs to be able to change all the states, so all processing /// needs to be stopped, and all user_states locks need to be taken batch_state: Mutex, - /// Flag to indicate when restoration is in progress + + /// Flag to indicate when recovery is in progress /// When true, message handlers will return ServerBusy responses /// It's used a way to "lock" all the user_states at the same time /// If one needed is taken in the handle message it will timeout - is_restoration_in_progress: RwLock, + is_recovering_from_submission_failure: RwLock, user_states: DashMap>>, last_uploaded_batch_block: Mutex, @@ -118,8 +119,6 @@ pub struct Batcher { disabled_verifiers: Mutex, - - // Observability and monitoring pub metrics: metrics::BatcherMetrics, pub telemetry: TelemetrySender, @@ -299,7 +298,7 @@ impl Batcher { batch_state: Mutex::new(batch_state), user_states, disabled_verifiers: Mutex::new(disabled_verifiers), - is_restoration_in_progress: RwLock::new(false), + is_recovering_from_submission_failure: RwLock::new(false), metrics, telemetry, } @@ -674,7 +673,7 @@ impl Batcher { ws_conn_sink: WsMessageSink, ) -> Result<(), Error> { // Check if restoration is in progress - if *self.is_restoration_in_progress.read().await { + if *self.is_recovering_from_submission_failure.read().await { warn!( "Rejecting nonce request from {} during restoration", address @@ -769,7 +768,7 @@ impl Batcher { self.metrics.received_proofs.inc(); // Check if restoration is in progress - if *self.is_restoration_in_progress.read().await { + if *self.is_recovering_from_submission_failure.read().await { warn!( "Rejecting proof submission from {} during restoration (nonce: {})", client_msg.verification_data.verification_data.proof_generator_addr, msg_nonce @@ -1561,7 +1560,7 @@ impl Batcher { ); // Set restoration flag to stop handling new user messages - *self.is_restoration_in_progress.write().await = true; + *self.is_recovering_from_submission_failure.write().await = true; let mut batch_state_lock = self.batch_state.lock().await; let mut restored_entries = Vec::new(); @@ -1624,10 +1623,15 @@ impl Batcher { let users_with_restored_proofs: std::collections::HashSet
= restored_entries.iter().map(|entry| entry.sender).collect(); - // Update user states for successfully restored proofs - info!("Updating user states after proof restoration..."); - // TODO: We may have ejected some users that didn't have restored proofs, - // we should include in this list the ejected users + // At this point we have a valid queue with updated evicted users states + // Only auxiliary user data (max_min_fee) can be "inconsistent" + // but we can keep updating it without locking the queue + info!("Queue recovered from submission failure, resuming user processing and updating user states metadata"); + std::mem::drop(batch_state_lock); + *self.is_recovering_from_submission_failure.write().await = false; + + + info!("Updating user states after proof restoration..."); if let Err(e) = self .update_user_states_from_queue_state(users_with_restored_proofs) .await @@ -1638,9 +1642,6 @@ impl Batcher { ); } - // Clear restoration flag to allow normal user message handling - *self.is_restoration_in_progress.write().await = false; - info!("Proof restoration completed, resuming normal operations"); } /// Takes the finalized batch as input and: From fc93d6094889baf465902c43e73e744cd3a4a25b Mon Sep 17 00:00:00 2001 From: MauroFab Date: Thu, 7 Aug 2025 13:09:43 -0300 Subject: [PATCH 35/48] Fmt --- crates/batcher/src/lib.rs | 12 +++++++----- crates/sdk/src/communication/messaging.rs | 4 +++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index c5da84257b..311d46719e 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -102,7 +102,7 @@ pub struct Batcher { /// - Batch creation needs to be able to change all the states, so all processing /// needs to be stopped, and all user_states locks need to be taken batch_state: Mutex, - + /// Flag to indicate when recovery is in progress /// When true, message handlers will return ServerBusy responses /// It's used a way to "lock" all the user_states at the same time @@ -771,7 +771,11 @@ impl Batcher { if *self.is_recovering_from_submission_failure.read().await { warn!( "Rejecting proof submission from {} during restoration (nonce: {})", - client_msg.verification_data.verification_data.proof_generator_addr, msg_nonce + client_msg + .verification_data + .verification_data + .proof_generator_addr, + msg_nonce ); let response = SubmitProofResponseMessage::ServerBusy; send_message(ws_conn_sink, response).await; @@ -1629,9 +1633,8 @@ impl Batcher { info!("Queue recovered from submission failure, resuming user processing and updating user states metadata"); std::mem::drop(batch_state_lock); *self.is_recovering_from_submission_failure.write().await = false; - - info!("Updating user states after proof restoration..."); + info!("Updating user states after proof restoration..."); if let Err(e) = self .update_user_states_from_queue_state(users_with_restored_proofs) .await @@ -1641,7 +1644,6 @@ impl Batcher { e ); } - } /// Takes the finalized batch as input and: diff --git a/crates/sdk/src/communication/messaging.rs b/crates/sdk/src/communication/messaging.rs index a4c746f2cb..19d593c938 100644 --- a/crates/sdk/src/communication/messaging.rs +++ b/crates/sdk/src/communication/messaging.rs @@ -271,7 +271,9 @@ async fn handle_batcher_response(msg: Message) -> Result { error!("Server is busy processing requests, please retry. Funds have not been spent."); - Err(SubmitError::GenericError("Server is busy processing requests, please retry".to_string())) + Err(SubmitError::GenericError( + "Server is busy processing requests, please retry".to_string(), + )) } Err(e) => { error!( From 648761522cee7bf00d03c38fc55dc46a1e7a8523 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Thu, 7 Aug 2025 14:49:20 -0300 Subject: [PATCH 36/48] Add grafana --- .../aligned/aggregator_batcher.json | 124 ++++++------------ 1 file changed, 40 insertions(+), 84 deletions(-) diff --git a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json index 28bac958c4..77650ce447 100644 --- a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json +++ b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 4, + "id": 2, "links": [], "liveNow": false, "panels": [ @@ -2120,6 +2120,7 @@ "type": "prometheus", "uid": "prometheus" }, + "description": "", "fieldConfig": { "defaults": { "color": { @@ -3808,41 +3809,11 @@ "type": "prometheus", "uid": "prometheus" }, + "description": "", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "mode": "thresholds" }, "mappings": [], "thresholds": { @@ -3867,19 +3838,22 @@ "x": 0, "y": 76 }, - "id": 24, + "id": 67, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "textMode": "auto" }, + "pluginVersion": "10.1.10", "targets": [ { "datasource": { @@ -3887,54 +3861,36 @@ "uid": "prometheus" }, "disableTextWrap": false, - "editorMode": "code", - "expr": "floor(increase(user_errors_count{job=\"aligned-batcher\"}[10y]))", + "editorMode": "builder", + "expr": "message_handler_batch_lock_timeouts_count", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "{{error_type}}", + "legendFormat": "__auto", "range": true, - "refId": "A", + "refId": "Batch lock timeout", "useBackend": false - } - ], - "title": "# User Errors", - "transformations": [ - { - "id": "calculateField", - "options": { - "alias": "proof_rejected", - "mode": "reduceRow", - "reduce": { - "include": [ - "rejected_proof" - ], - "reducer": "sum" - } - } }, { - "id": "organize", - "options": { - "excludeByName": { - "rejected_proof": true - }, - "indexByName": {}, - "renameByName": {} - } - }, - { - "id": "calculateField", - "options": { - "alias": "total", - "mode": "reduceRow", - "reduce": { - "reducer": "sum" - } - } + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "message_handler_user_lock_timeouts_count", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "User lock timeout", + "useBackend": false } ], - "type": "timeseries" + "title": "Message Handler - (Batch | User) Lock Timeout", + "type": "stat" }, { "datasource": { @@ -4398,6 +4354,6 @@ "timezone": "browser", "title": "System Data", "uid": "aggregator", - "version": 18, + "version": 3, "weekStart": "" -} \ No newline at end of file +} From e18554bcb32a3ec0c3976085a717a1ad9f7f6626 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Thu, 7 Aug 2025 15:55:29 -0300 Subject: [PATCH 37/48] Update batch metrics after posting --- crates/batcher/src/lib.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 234ca91dad..d41b7a97a4 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -1500,6 +1500,20 @@ impl Batcher { finalized_batch.len() ); + // Update queue metrics after successful batch extraction + let queue_len = batch_state_lock.batch_queue.len(); + match calculate_batch_size(&batch_state_lock.batch_queue) { + Ok(queue_size_bytes) => { + self.metrics + .update_queue_metrics(queue_len as i64, queue_size_bytes as i64); + } + Err(e) => { + error!("Failed to calculate batch size for queue metrics update: {:?}", e); + // Still update queue length metric, set size to 0 due to calculation error + self.metrics.update_queue_metrics(queue_len as i64, 0); + } + } + Some(finalized_batch) } From 7805ae29f1cb1c55b6d2c17f63f52f35e6eaa822 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Thu, 7 Aug 2025 15:56:20 -0300 Subject: [PATCH 38/48] Improve comment --- crates/batcher/src/types/batch_queue.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/batcher/src/types/batch_queue.rs b/crates/batcher/src/types/batch_queue.rs index 3e749c0d68..af67715d77 100644 --- a/crates/batcher/src/types/batch_queue.rs +++ b/crates/batcher/src/types/batch_queue.rs @@ -148,7 +148,7 @@ pub(crate) fn calculate_batch_size(batch_queue: &BatchQueue) -> Result Date: Thu, 7 Aug 2025 16:10:36 -0300 Subject: [PATCH 39/48] Re add missing grafana error items --- .../aligned/aggregator_batcher.json | 241 +++++++++++++----- 1 file changed, 171 insertions(+), 70 deletions(-) diff --git a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json index 77650ce447..7f12299a30 100644 --- a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json +++ b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json @@ -1524,8 +1524,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1595,8 +1594,7 @@ "mode": "absolute", "steps": [ { - "color": "dark-red", - "value": null + "color": "dark-red" }, { "color": "green", @@ -1728,8 +1726,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1804,8 +1801,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1910,8 +1906,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2053,8 +2048,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "dark-red", @@ -2120,7 +2114,6 @@ "type": "prometheus", "uid": "prometheus" }, - "description": "", "fieldConfig": { "defaults": { "color": { @@ -2162,8 +2155,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2230,8 +2222,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -2330,8 +2321,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2398,8 +2388,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "#FF9830", @@ -2506,8 +2495,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2573,8 +2561,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -2674,8 +2661,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2742,8 +2728,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "#EAB839", @@ -2850,8 +2835,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2949,8 +2933,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3017,8 +3000,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "#EAB839", @@ -3125,8 +3107,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -3220,8 +3201,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -3308,8 +3288,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -3409,8 +3388,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -3481,8 +3459,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3622,8 +3599,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3699,8 +3675,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -3809,19 +3784,48 @@ "type": "prometheus", "uid": "prometheus" }, - "description": "", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3838,7 +3842,108 @@ "x": 0, "y": 76 }, - "id": 67, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "floor(increase(user_errors_count{job=\"aligned-batcher\"}[10y]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{error_type}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "# User Errors", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "proof_rejected", + "mode": "reduceRow", + "reduce": { + "include": [ + "rejected_proof" + ], + "reducer": "sum" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "rejected_proof": true + }, + "indexByName": {}, + "renameByName": {} + } + }, + { + "id": "calculateField", + "options": { + "alias": "total", + "mode": "reduceRow", + "reduce": { + "reducer": "sum" + } + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 84 + }, + "id": 65, "options": { "colorMode": "value", "graphMode": "area", @@ -3868,7 +3973,7 @@ "instant": false, "legendFormat": "__auto", "range": true, - "refId": "Batch lock timeout", + "refId": "A", "useBackend": false }, { @@ -3885,7 +3990,7 @@ "instant": false, "legendFormat": "__auto", "range": true, - "refId": "User lock timeout", + "refId": "B", "useBackend": false } ], @@ -3901,7 +4006,7 @@ "h": 2, "w": 24, "x": 0, - "y": 84 + "y": 92 }, "id": 46, "options": { @@ -3963,8 +4068,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -3976,7 +4080,7 @@ "h": 8, "w": 12, "x": 0, - "y": 86 + "y": 94 }, "id": 47, "options": { @@ -4059,8 +4163,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -4076,7 +4179,7 @@ "h": 8, "w": 12, "x": 12, - "y": 86 + "y": 94 }, "id": 43, "interval": "1s", @@ -4156,8 +4259,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -4173,7 +4275,7 @@ "h": 8, "w": 12, "x": 0, - "y": 94 + "y": 102 }, "id": 45, "options": { @@ -4288,8 +4390,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -4304,7 +4405,7 @@ "h": 8, "w": 12, "x": 12, - "y": 94 + "y": 102 }, "id": 44, "interval": "1s", @@ -4354,6 +4455,6 @@ "timezone": "browser", "title": "System Data", "uid": "aggregator", - "version": 3, + "version": 7, "weekStart": "" } From 558fea8b3f29b01d6798479112b9478f4dfc6506 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Thu, 7 Aug 2025 16:11:27 -0300 Subject: [PATCH 40/48] Fix fmt --- crates/batcher/src/lib.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index d41b7a97a4..a3d154987f 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -1508,7 +1508,10 @@ impl Batcher { .update_queue_metrics(queue_len as i64, queue_size_bytes as i64); } Err(e) => { - error!("Failed to calculate batch size for queue metrics update: {:?}", e); + error!( + "Failed to calculate batch size for queue metrics update: {:?}", + e + ); // Still update queue length metric, set size to 0 due to calculation error self.metrics.update_queue_metrics(queue_len as i64, 0); } From 92c03f0ae2483f05c0090c94e30c803b81396d0c Mon Sep 17 00:00:00 2001 From: MauroFab Date: Thu, 7 Aug 2025 16:18:07 -0300 Subject: [PATCH 41/48] Fix path in examples l2 --- examples/l2/cmd/Cargo.toml | 2 +- examples/l2/crates/l2/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/l2/cmd/Cargo.toml b/examples/l2/cmd/Cargo.toml index dee029aa66..6bfd129415 100644 --- a/examples/l2/cmd/Cargo.toml +++ b/examples/l2/cmd/Cargo.toml @@ -12,7 +12,7 @@ bincode = "1.3.3" tokio = "1.44" dotenv = "0.15" l2 = { path = "../crates/l2/" } -aligned-sdk = { path = "../../../batcher/aligned-sdk" } +aligned-sdk = { path = "../../../crates/sdk" } [lib] name = "l2_cmd" diff --git a/examples/l2/crates/l2/Cargo.toml b/examples/l2/crates/l2/Cargo.toml index 3eba964f15..2a52cbe0ce 100644 --- a/examples/l2/crates/l2/Cargo.toml +++ b/examples/l2/crates/l2/Cargo.toml @@ -14,7 +14,7 @@ tracing-subscriber = { version = "0.3.0", features = ["env-filter"] } rand = "0.8" sp1-sdk = "5.0.0" sp1_state_transition_program = { path = "./zkvm_programs/sp1" } -aligned-sdk = { path = "../../../../batcher/aligned-sdk" } +aligned-sdk = { path = "../../../../crates/sdk" } bincode = "1.3.3" futures-util = "0.3" tokio = "1.44" From 95e99c2dc29843139f30e6ce3b9fdc94581cd828 Mon Sep 17 00:00:00 2001 From: JuArce <52429267+JuArce@users.noreply.github.com> Date: Fri, 8 Aug 2025 16:16:04 -0300 Subject: [PATCH 42/48] fix: show locks metrics correctly --- .../aligned/aggregator_batcher.json | 98 ++++++++++++------- 1 file changed, 63 insertions(+), 35 deletions(-) diff --git a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json index 7f12299a30..57193acb68 100644 --- a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json +++ b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 2, + "id": 4, "links": [], "liveNow": false, "panels": [ @@ -1524,7 +1524,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1594,7 +1595,8 @@ "mode": "absolute", "steps": [ { - "color": "dark-red" + "color": "dark-red", + "value": null }, { "color": "green", @@ -1726,7 +1728,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1801,7 +1804,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1906,7 +1910,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2048,7 +2053,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "dark-red", @@ -2155,7 +2161,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2222,7 +2229,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2321,7 +2329,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2388,7 +2397,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "#FF9830", @@ -2495,7 +2505,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2561,7 +2572,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2661,7 +2673,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2728,7 +2741,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "#EAB839", @@ -2835,7 +2849,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2933,7 +2948,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3000,7 +3016,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "#EAB839", @@ -3107,7 +3124,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3201,7 +3219,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3288,7 +3307,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -3388,7 +3408,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -3459,7 +3480,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3599,7 +3621,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3675,7 +3698,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3825,7 +3849,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3926,7 +3951,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3966,8 +3992,8 @@ "uid": "prometheus" }, "disableTextWrap": false, - "editorMode": "builder", - "expr": "message_handler_batch_lock_timeouts_count", + "editorMode": "code", + "expr": "floor(increase(message_handler_batch_lock_timeouts_count{job=\"aligned-batcher\"}[$__range]))", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -3982,8 +4008,8 @@ "uid": "prometheus" }, "disableTextWrap": false, - "editorMode": "builder", - "expr": "message_handler_user_lock_timeouts_count", + "editorMode": "code", + "expr": "floor(increase(message_handler_user_lock_timeouts_count{job=\"aligned-batcher\"}[$__range]))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -4068,7 +4094,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -4163,7 +4190,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -4455,6 +4483,6 @@ "timezone": "browser", "title": "System Data", "uid": "aggregator", - "version": 7, + "version": 20, "weekStart": "" -} +} \ No newline at end of file From f846b695986d25b0959216a6dbad422f1f2428c0 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Mon, 18 Aug 2025 16:06:22 -0300 Subject: [PATCH 43/48] refactor: use Hashmap instead of Dashmap (#2051) --- crates/batcher/src/lib.rs | 61 ++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index a3d154987f..2eec09004d 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -17,7 +17,6 @@ use types::batch_state::BatchState; use types::user_state::UserState; use batch_queue::calculate_batch_size; -use dashmap::DashMap; use std::collections::HashMap; use std::env; use std::net::SocketAddr; @@ -109,9 +108,9 @@ pub struct Batcher { /// Flag to indicate when recovery is in progress /// When true, message handlers will return ServerBusy responses /// It's used a way to "lock" all the user_states at the same time - /// If one needed is taken in the handle message it will timeout + /// If one needed is taken in the handle message it will time out is_recovering_from_submission_failure: RwLock, - user_states: DashMap>>, + user_states: Arc>>>>, last_uploaded_batch_block: Mutex, @@ -181,7 +180,7 @@ impl Batcher { let deployment_output = ContractDeploymentOutput::new(config.aligned_layer_deployment_config_file_path); - log::info!( + info!( "Starting metrics server on port {}", config.batcher.metrics_port ); @@ -262,7 +261,7 @@ impl Batcher { .await .expect("Failed to get fallback Service Manager contract"); - let user_states = DashMap::new(); + let user_states = Arc::new(RwLock::new(HashMap::new())); let batch_state = BatchState::new(config.batcher.max_queue_size); let non_paying_config = if let Some(non_paying_config) = config.batcher.non_paying { warn!("Non-paying address configuration detected. Will replace non-paying address {} with configured address.", @@ -276,7 +275,7 @@ impl Batcher { .expect("Could not get non-paying nonce from Ethereum"); let non_paying_user_state = UserState::new(nonpaying_nonce); - user_states.insert( + user_states.write().await.insert( non_paying_config.replacement.address(), Arc::new(Mutex::new(non_paying_user_state)), ); @@ -335,7 +334,7 @@ impl Batcher { } } - fn update_evicted_user_state_with_lock( + async fn update_evicted_user_state_with_lock( &self, removed_entry: &types::batch_queue::BatchQueueEntry, batch_queue: &types::batch_queue::BatchQueue, @@ -350,7 +349,7 @@ impl Batcher { { Some((last_entry, _)) => last_entry.nonced_verification_data.max_fee, None => { - self.user_states.remove(&addr); + self.user_states.write().await.remove(&addr); return; } }; @@ -376,12 +375,12 @@ impl Batcher { { Some((last_entry, _)) => last_entry.nonced_verification_data.max_fee, None => { - self.user_states.remove(&addr); + self.user_states.write().await.remove(&addr); return Some(()); } }; - let user_state = self.user_states.get(&addr)?; + let user_state = self.user_states.read().await.get(&addr)?.clone(); let mut user_state_guard = user_state.lock().await; user_state_guard.proofs_in_batch -= 1; user_state_guard.nonce -= U256::one(); @@ -392,7 +391,7 @@ impl Batcher { fn calculate_new_user_states_data( &self, - batch_queue: &types::batch_queue::BatchQueue, + batch_queue: &batch_queue::BatchQueue, ) -> HashMap { let mut updated_user_states = HashMap::new(); for (entry, _) in batch_queue.iter() { @@ -735,7 +734,7 @@ impl Batcher { } let cached_user_nonce = { - let user_state_ref = self.user_states.get(&address); + let user_state_ref = self.user_states.read().await.get(&address).cloned(); match user_state_ref { Some(user_state_ref) => { let Some(user_state_guard) = self @@ -875,16 +874,20 @@ impl Batcher { // If it was not present, then the user nonce is queried to the Aligned contract. // Lastly, we get a lock of the batch state again and insert the user state if it was still missing. - let is_user_in_state = self.user_states.contains_key(&addr); + let is_user_in_state = self.user_states.read().await.contains_key(&addr); if !is_user_in_state { + debug!("User state for address {addr:?} not found, creating a new one"); // We add a dummy user state to grab a lock on the user state let dummy_user_state = UserState::new(U256::zero()); self.user_states + .write() + .await .insert(addr, Arc::new(Mutex::new(dummy_user_state))); + debug!("Dummy user state for address {addr:?} created"); } - let Some(user_state_ref) = self.user_states.get(&addr) else { + let Some(user_state_ref) = self.user_states.read().await.get(&addr).cloned() else { error!("This should never happen, user state has previously been inserted if it didn't exist"); send_message( ws_conn_sink.clone(), @@ -1042,7 +1045,9 @@ impl Batcher { // Try to find any candidate whose lock we can acquire and immediately process them for candidate_addr in eviction_candidates { - if let Some(user_state_arc) = self.user_states.get(&candidate_addr) { + if let Some(user_state_arc) = + self.user_states.read().await.get(&candidate_addr).cloned() + { if let Ok(mut user_guard) = user_state_arc.try_lock() { // Found someone whose lock we can get - now find and remove their entry let entries_to_check: Vec<_> = batch_state_lock @@ -1076,7 +1081,8 @@ impl Batcher { &removed, &batch_state_lock.batch_queue, &mut user_guard, - ); + ) + .await; if let Some(ref removed_entry_ws) = removed.messaging_sink { let ws_sink = removed_entry_ws.clone(); @@ -1145,8 +1151,6 @@ impl Batcher { user_state_guard.last_max_fee_limit = max_fee; user_state_guard.proofs_in_batch += 1; user_state_guard.total_fees_in_queue += max_fee; - - info!("Verification data message handled"); Ok(()) } @@ -1530,7 +1534,7 @@ impl Batcher { ) -> Result<(), BatcherError> { // Update each user's state with proper lock ordering for addr in affected_users { - if let Some(user_state) = self.user_states.get(&addr) { + if let Some(user_state) = self.user_states.read().await.get(&addr).cloned() { let mut user_state_guard = user_state.lock().await; // First: user lock let batch_state_lock = self.batch_state.lock().await; // Second: batch lock @@ -1565,7 +1569,10 @@ impl Batcher { /// Cleans up user states after successful batch submission. /// Resets last_max_fee_limit to U256::MAX for users who had proofs in the submitted batch /// but now have no proofs left in the queue. - fn cleanup_user_states_after_successful_submission(&self, finalized_batch: &[BatchQueueEntry]) { + async fn cleanup_user_states_after_successful_submission( + &self, + finalized_batch: &[BatchQueueEntry], + ) { use std::collections::HashSet; // Get unique users from the submitted batch @@ -1589,7 +1596,8 @@ impl Batcher { for user_addr in users_in_batch { if !current_user_states.contains_key(&user_addr) { // User has no proofs left in queue - reset their max_fee_limit - if let Some(user_state_ref) = self.user_states.get(&user_addr) { + if let Some(user_state_ref) = self.user_states.read().await.get(&user_addr).cloned() + { if let Ok(mut user_state_guard) = user_state_ref.try_lock() { user_state_guard.last_max_fee_limit = U256::max_value(); } @@ -1810,7 +1818,8 @@ impl Batcher { } // Clean up user states for users who had proofs in this batch but now have no proofs left - self.cleanup_user_states_after_successful_submission(finalized_batch); + self.cleanup_user_states_after_successful_submission(finalized_batch) + .await; connection::send_batch_inclusion_data_responses(finalized_batch, &batch_merkle_tree).await } @@ -1828,7 +1837,7 @@ impl Batcher { let Some(nonpaying_replacement_addr) = self.get_nonpaying_replacement_addr() else { batch_state_lock.batch_queue.clear(); - self.user_states.clear(); + self.user_states.write().await.clear(); return; }; @@ -1840,13 +1849,13 @@ impl Batcher { .await else { batch_state_lock.batch_queue.clear(); - self.user_states.clear(); + self.user_states.write().await.clear(); return; }; batch_state_lock.batch_queue.clear(); - self.user_states.clear(); + self.user_states.write().await.clear(); let nonpaying_user_state = UserState::new(nonpaying_replacement_addr_nonce); - self.user_states.insert( + self.user_states.write().await.insert( nonpaying_replacement_addr, Arc::new(Mutex::new(nonpaying_user_state)), ); From cb5c84e497819c699e44eee3d63bf10dd41ffa73 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 19 Aug 2025 13:55:14 -0300 Subject: [PATCH 44/48] fix(batcher): initialize dummy state with correct nonce (#2057) --- crates/batcher/src/lib.rs | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 2eec09004d..61afabaa60 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -877,9 +877,25 @@ impl Batcher { let is_user_in_state = self.user_states.read().await.contains_key(&addr); if !is_user_in_state { + // If the user state was not present, we need to get the nonce from the Ethereum contract + let ethereum_user_nonce = match self.get_user_nonce_from_ethereum(addr).await { + Ok(ethereum_user_nonce) => ethereum_user_nonce, + Err(e) => { + error!( + "Failed to get user nonce from Ethereum for address {addr:?}. Error: {e:?}" + ); + send_message( + ws_conn_sink.clone(), + SubmitProofResponseMessage::EthRpcError, + ) + .await; + self.metrics.user_error(&["eth_rpc_error", ""]); + return Ok(()); + } + }; debug!("User state for address {addr:?} not found, creating a new one"); // We add a dummy user state to grab a lock on the user state - let dummy_user_state = UserState::new(U256::zero()); + let dummy_user_state = UserState::new(ethereum_user_nonce); self.user_states .write() .await @@ -907,27 +923,6 @@ impl Batcher { return Ok(()); }; - // If the user state was not present, we need to get the nonce from the Ethereum contract and update the dummy user state - if !is_user_in_state { - let ethereum_user_nonce = match self.get_user_nonce_from_ethereum(addr).await { - Ok(ethereum_user_nonce) => ethereum_user_nonce, - Err(e) => { - error!( - "Failed to get user nonce from Ethereum for address {addr:?}. Error: {e:?}" - ); - send_message( - ws_conn_sink.clone(), - SubmitProofResponseMessage::EthRpcError, - ) - .await; - self.metrics.user_error(&["eth_rpc_error", ""]); - return Ok(()); - } - }; - // Update the dummy user state with the correct nonce - user_state_guard.nonce = ethereum_user_nonce; - } - // * ---------------------------------------------------* // * Perform validations over user state * // * ---------------------------------------------------* From 662bdd7d26cd1e2ad93afb3708c7fc82d614f1ab Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 19 Aug 2025 13:55:52 -0300 Subject: [PATCH 45/48] docs(batcher): explain locking logic (#2058) --- crates/batcher/src/lib.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 61afabaa60..f719c01158 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -98,11 +98,18 @@ pub struct Batcher { aggregator_fee_percentage_multiplier: u128, aggregator_gas_cost: u128, - // Shared state (Mutex) - /// The general business rule is: - /// - User processing can be done in parallel unless a batch creation is happening - /// - Batch creation needs to be able to change all the states, so all processing - /// needs to be stopped, and all user_states locks need to be taken + // Shared state access: + // Two kinds of threads interact with the shared state: + // 1. User message processing threads (run in parallel) + // 2. Batch creation thread (runs sequentially, includes failure recovery) + // + // Locking rules: + // - To avoid deadlocks, always acquire `user_states` before `batch_state`. + // - During failure recovery, restoring a valid state may require breaking this rule: + // additional user locks might be acquired *after* the batch lock. + // (See the `restore` algorithm in the `batch_queue` module.) + // + // Because of this exception, user message handling uses lock acquisition with timeouts. batch_state: Mutex, /// Flag to indicate when recovery is in progress From 3492e7e1e3d1e1698615437aa39cb6c3bf95cd37 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:43:01 -0300 Subject: [PATCH 46/48] fix(batcher): remove is_recovering_from_submission_failure (#2056) Co-authored-by: MauroFab --- crates/batcher/src/lib.rs | 101 ++++++++++-------- crates/batcher/src/metrics.rs | 12 +++ crates/batcher/src/types/batch_queue.rs | 4 +- .../aligned/aggregator_batcher.json | 31 ++++-- 4 files changed, 96 insertions(+), 52 deletions(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index 3bc79db9de..adadbf1a05 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -47,6 +47,9 @@ use lambdaworks_crypto::merkle_tree::traits::IsMerkleTreeBackend; use log::{debug, error, info, warn}; use tokio::net::{TcpListener, TcpStream}; use tokio::sync::{Mutex, MutexGuard, RwLock}; + +// Message handler lock timeout +const MESSAGE_HANDLER_LOCK_TIMEOUT: Duration = Duration::from_secs(10); use tokio_tungstenite::tungstenite::{Error, Message}; use types::batch_queue::{self, BatchQueueEntry, BatchQueueEntryPriority}; use types::errors::{BatcherError, TransactionSendError}; @@ -112,11 +115,6 @@ pub struct Batcher { // Because of this exception, user message handling uses lock acquisition with timeouts. batch_state: Mutex, - /// Flag to indicate when recovery is in progress - /// When true, message handlers will return ServerBusy responses - /// It's used a way to "lock" all the user_states at the same time - /// If one needed is taken in the handle message it will time out - is_recovering_from_submission_failure: RwLock, user_states: Arc>>>>, last_uploaded_batch_block: Mutex, @@ -335,7 +333,6 @@ impl Batcher { batch_state: Mutex::new(batch_state), user_states, disabled_verifiers: Mutex::new(disabled_verifiers), - is_recovering_from_submission_failure: RwLock::new(false), metrics, telemetry, } @@ -423,7 +420,7 @@ impl Batcher { where F: std::future::Future, { - match timeout(Duration::from_secs(15), lock_future).await { + match timeout(MESSAGE_HANDLER_LOCK_TIMEOUT, lock_future).await { Ok(result) => Some(result), Err(_) => { warn!("User lock acquisition timed out for address {}", addr); @@ -438,7 +435,7 @@ impl Batcher { where F: std::future::Future, { - match timeout(Duration::from_secs(15), lock_future).await { + match timeout(MESSAGE_HANDLER_LOCK_TIMEOUT, lock_future).await { Ok(result) => Some(result), Err(_) => { warn!("Batch lock acquisition timed out"); @@ -709,17 +706,6 @@ impl Batcher { mut address: Address, ws_conn_sink: WsMessageSink, ) -> Result<(), Error> { - // Check if restoration is in progress - if *self.is_recovering_from_submission_failure.read().await { - warn!( - "Rejecting nonce request from {} during restoration", - address - ); - let response = GetNonceResponseMessage::ServerBusy; - send_message(ws_conn_sink, response).await; - return Ok(()); - } - // If the address is not paying, we will return the nonce of the aligned_payment_address if !self.has_to_pay(&address) { info!("Handling nonpaying message"); @@ -741,7 +727,21 @@ impl Batcher { } let cached_user_nonce = { - let user_state_ref = self.user_states.read().await.get(&address).cloned(); + let user_states_guard = match timeout( + MESSAGE_HANDLER_LOCK_TIMEOUT, + self.user_states.read(), + ) + .await + { + Ok(guard) => guard, + Err(_) => { + warn!("User states read lock acquisition timed out in handle_get_nonce_for_address_msg"); + self.metrics.inc_message_handler_user_states_lock_timeouts(); + send_message(ws_conn_sink, GetNonceResponseMessage::ServerBusy).await; + return Ok(()); + } + }; + let user_state_ref = user_states_guard.get(&address).cloned(); match user_state_ref { Some(user_state_ref) => { let Some(user_state_guard) = self @@ -804,21 +804,6 @@ impl Batcher { debug!("Received message with nonce: {msg_nonce:?}"); self.metrics.received_proofs.inc(); - // Check if restoration is in progress - if *self.is_recovering_from_submission_failure.read().await { - warn!( - "Rejecting proof submission from {} during restoration (nonce: {})", - client_msg - .verification_data - .verification_data - .proof_generator_addr, - msg_nonce - ); - let response = SubmitProofResponseMessage::ServerBusy; - send_message(ws_conn_sink, response).await; - return Ok(()); - } - // * ---------------------------------------------------* // * Perform validations over the message * // * ---------------------------------------------------* @@ -881,7 +866,17 @@ impl Batcher { // If it was not present, then the user nonce is queried to the Aligned contract. // Lastly, we get a lock of the batch state again and insert the user state if it was still missing. - let is_user_in_state = self.user_states.read().await.contains_key(&addr); + let is_user_in_state = match timeout(MESSAGE_HANDLER_LOCK_TIMEOUT, self.user_states.read()) + .await + { + Ok(user_states_guard) => user_states_guard.contains_key(&addr), + Err(_) => { + warn!("User states read lock acquisition timed out in handle_submit_proof_msg (user check)"); + self.metrics.inc_message_handler_user_states_lock_timeouts(); + send_message(ws_conn_sink, SubmitProofResponseMessage::ServerBusy).await; + return Ok(()); + } + }; if !is_user_in_state { // If the user state was not present, we need to get the nonce from the Ethereum contract @@ -903,14 +898,32 @@ impl Batcher { debug!("User state for address {addr:?} not found, creating a new one"); // We add a dummy user state to grab a lock on the user state let dummy_user_state = UserState::new(ethereum_user_nonce); - self.user_states - .write() - .await - .insert(addr, Arc::new(Mutex::new(dummy_user_state))); + match timeout(MESSAGE_HANDLER_LOCK_TIMEOUT, self.user_states.write()).await { + Ok(mut user_states_guard) => { + user_states_guard.insert(addr, Arc::new(Mutex::new(dummy_user_state))); + } + Err(_) => { + warn!("User states write lock acquisition timed out in handle_submit_proof_msg (user creation)"); + self.metrics.inc_message_handler_user_states_lock_timeouts(); + send_message(ws_conn_sink, SubmitProofResponseMessage::ServerBusy).await; + return Ok(()); + } + }; debug!("Dummy user state for address {addr:?} created"); } - let Some(user_state_ref) = self.user_states.read().await.get(&addr).cloned() else { + let user_state_ref = match timeout(MESSAGE_HANDLER_LOCK_TIMEOUT, self.user_states.read()) + .await + { + Ok(user_states_guard) => user_states_guard.get(&addr).cloned(), + Err(_) => { + warn!("User states read lock acquisition timed out in handle_submit_proof_msg (user retrieval)"); + self.metrics.inc_message_handler_user_states_lock_timeouts(); + send_message(ws_conn_sink, SubmitProofResponseMessage::ServerBusy).await; + return Ok(()); + } + }; + let Some(user_state_ref) = user_state_ref else { error!("This should never happen, user state has previously been inserted if it didn't exist"); send_message( ws_conn_sink.clone(), @@ -1621,9 +1634,7 @@ impl Batcher { failed_batch.len() ); - // Set restoration flag to stop handling new user messages - *self.is_recovering_from_submission_failure.write().await = true; - + let user_states_lock = self.user_states.write().await; let mut batch_state_lock = self.batch_state.lock().await; let mut restored_entries = Vec::new(); @@ -1689,8 +1700,8 @@ impl Batcher { // Only auxiliary user data (max_min_fee) can be "inconsistent" // but we can keep updating it without locking the queue info!("Queue recovered from submission failure, resuming user processing and updating user states metadata"); + std::mem::drop(user_states_lock); std::mem::drop(batch_state_lock); - *self.is_recovering_from_submission_failure.write().await = false; info!("Updating user states after proof restoration..."); if let Err(e) = self diff --git a/crates/batcher/src/metrics.rs b/crates/batcher/src/metrics.rs index a28361ac58..b68a5dc16b 100644 --- a/crates/batcher/src/metrics.rs +++ b/crates/batcher/src/metrics.rs @@ -29,6 +29,7 @@ pub struct BatcherMetrics { pub batcher_gas_cost_cancel_task_total: GenericCounter, pub message_handler_user_lock_timeouts: IntCounter, pub message_handler_batch_lock_timeouts: IntCounter, + pub message_handler_user_states_lock_timeouts: IntCounter, pub available_data_services: IntGauge, } @@ -97,6 +98,11 @@ impl BatcherMetrics { "Message Handler Batch Lock Timeouts" ))?; + let message_handler_user_states_lock_timeouts = register_int_counter!(opts!( + "message_handler_user_states_lock_timeouts_count", + "Message Handler User States Lock Timeouts" + ))?; + registry.register(Box::new(open_connections.clone()))?; registry.register(Box::new(received_proofs.clone()))?; registry.register(Box::new(sent_batches.clone()))?; @@ -115,6 +121,7 @@ impl BatcherMetrics { registry.register(Box::new(batcher_gas_cost_cancel_task_total.clone()))?; registry.register(Box::new(message_handler_user_lock_timeouts.clone()))?; registry.register(Box::new(message_handler_batch_lock_timeouts.clone()))?; + registry.register(Box::new(message_handler_user_states_lock_timeouts.clone()))?; registry.register(Box::new(available_data_services.clone()))?; let metrics_route = warp::path!("metrics") @@ -146,6 +153,7 @@ impl BatcherMetrics { batcher_gas_cost_cancel_task_total, message_handler_user_lock_timeouts, message_handler_batch_lock_timeouts, + message_handler_user_states_lock_timeouts, available_data_services, }) } @@ -188,4 +196,8 @@ impl BatcherMetrics { pub fn inc_message_handler_batch_lock_timeout(&self) { self.message_handler_batch_lock_timeouts.inc(); } + + pub fn inc_message_handler_user_states_lock_timeouts(&self) { + self.message_handler_user_states_lock_timeouts.inc(); + } } diff --git a/crates/batcher/src/types/batch_queue.rs b/crates/batcher/src/types/batch_queue.rs index ed904294b1..7461b4ac3d 100644 --- a/crates/batcher/src/types/batch_queue.rs +++ b/crates/batcher/src/types/batch_queue.rs @@ -170,7 +170,9 @@ pub(crate) fn extract_batch_directly( let (rejected_entry, rejected_priority) = batch_queue.pop().unwrap(); // Update batch size - let verification_data_size = rejected_entry.nonced_verification_data.cbor_size_upper_bound(); + let verification_data_size = rejected_entry + .nonced_verification_data + .cbor_size_upper_bound(); batch_size -= verification_data_size; rejected_entries.push((rejected_entry, rejected_priority)); diff --git a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json index 57193acb68..d85bb82d75 100644 --- a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json +++ b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 4, + "id": 2, "links": [], "liveNow": false, "panels": [ @@ -4018,9 +4018,26 @@ "range": true, "refId": "B", "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": " floor(increase(message_handler_user_states_lock_timeouts_count{job=\"aligned-batcher\"}[$__range]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "C", + "useBackend": false } ], - "title": "Message Handler - (Batch | User) Lock Timeout", + "title": "Message Handler - (Batch | User | User State Map) Lock Timeout", "type": "stat" }, { @@ -4287,7 +4304,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -4418,7 +4436,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -4483,6 +4502,6 @@ "timezone": "browser", "title": "System Data", "uid": "aggregator", - "version": 20, + "version": 9, "weekStart": "" -} \ No newline at end of file +} From 370e440129985546a7246523461137e7c53f0ece Mon Sep 17 00:00:00 2001 From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com> Date: Wed, 20 Aug 2025 15:18:13 -0300 Subject: [PATCH 47/48] Remove dashmap as dependency --- crates/batcher/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/batcher/Cargo.toml b/crates/batcher/Cargo.toml index 0b8662dd4c..af79092851 100644 --- a/crates/batcher/Cargo.toml +++ b/crates/batcher/Cargo.toml @@ -29,7 +29,6 @@ aligned-sdk = { path = "../sdk" } ciborium = "=0.2.2" priority-queue = "2.1.0" reqwest = { version = "0.12", features = ["json"] } -dashmap = "6.0.1" once_cell = "1.20.2" warp = "0.3.7" From 447efd4c62f78222e745789af08a85f0de3b44d8 Mon Sep 17 00:00:00 2001 From: JuArce <52429267+JuArce@users.noreply.github.com> Date: Fri, 22 Aug 2025 17:21:59 -0300 Subject: [PATCH 48/48] remove comment --- crates/batcher/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/batcher/src/lib.rs b/crates/batcher/src/lib.rs index adadbf1a05..7a1bf78718 100644 --- a/crates/batcher/src/lib.rs +++ b/crates/batcher/src/lib.rs @@ -1902,7 +1902,6 @@ impl Batcher { let modified_gas_price = gas_price * U256::from(GAS_PRICE_PERCENTAGE_MULTIPLIER) / U256::from(PERCENTAGE_DIVIDER); - // TODO (Mauro): Take all the user locks here if let Some(finalized_batch) = self .extract_batch_if_ready(block_number, modified_gas_price) .await