Fix up chain resync deadlock during cascade failure

This commit is contained in:
Matt Hess
2026-01-12 16:19:52 +00:00
parent aa8075ee57
commit 5e5fef537e
2 changed files with 102 additions and 73 deletions

View File

@@ -754,6 +754,8 @@ bool SideChain::add_external_block(PoolBlock& block, std::vector<hash>& missing_
if (added && block.m_verified) {
if (block.m_invalid) {
on_block_rejected(&block, "external block validation failed");
// Check for deferred recovery after on_block_rejected (we're outside the lock here)
check_and_run_deferred_recovery();
} else {
on_block_accepted();
}
@@ -791,47 +793,54 @@ bool SideChain::add_block(const PoolBlock& block)
prune_seen_data();
}
WriteLock lock(m_sidechainLock);
// Scope the lock so we can check for deferred recovery after releasing it
{
WriteLock lock(m_sidechainLock);
auto result = m_blocksById.insert({ new_block->m_sidechainId, new_block });
if (!result.second) {
const PoolBlock* old_block = result.first->second;
auto result = m_blocksById.insert({ new_block->m_sidechainId, new_block });
if (!result.second) {
const PoolBlock* old_block = result.first->second;
LOGWARN(3, "add_block: trying to add the same block twice:"
<< "\nnew block id = " << new_block->m_sidechainId
<< ", sidechain height = " << new_block->m_sidechainHeight
<< ", height = " << new_block->m_txinGenHeight
<< ", nonce = " << new_block->m_nonce
<< ", extra_nonce = " << new_block->m_extraNonce
<< "\nold block id = " << old_block->m_sidechainId
<< ", sidechain height = " << old_block->m_sidechainHeight
<< ", height = " << old_block->m_txinGenHeight
<< ", nonce = " << old_block->m_nonce
<< ", extra_nonce = " << old_block->m_extraNonce
);
LOGWARN(3, "add_block: trying to add the same block twice:"
<< "\nnew block id = " << new_block->m_sidechainId
<< ", sidechain height = " << new_block->m_sidechainHeight
<< ", height = " << new_block->m_txinGenHeight
<< ", nonce = " << new_block->m_nonce
<< ", extra_nonce = " << new_block->m_extraNonce
<< "\nold block id = " << old_block->m_sidechainId
<< ", sidechain height = " << old_block->m_sidechainHeight
<< ", height = " << old_block->m_txinGenHeight
<< ", nonce = " << old_block->m_nonce
<< ", extra_nonce = " << old_block->m_extraNonce
);
delete new_block;
return false;
}
delete new_block;
return false;
}
m_blocksByHeight[new_block->m_sidechainHeight].push_back(new_block);
m_blocksByMerkleRoot.insert({ new_block->m_merkleRoot, new_block });
m_blocksByHeight[new_block->m_sidechainHeight].push_back(new_block);
m_blocksByMerkleRoot.insert({ new_block->m_merkleRoot, new_block });
update_depths(new_block);
update_depths(new_block);
if (new_block->m_verified) {
if (!new_block->m_invalid) {
update_chain_tip(new_block);
if (new_block->m_verified) {
if (!new_block->m_invalid) {
update_chain_tip(new_block);
// Save it for faster syncing on the next p2pool start
if (P2PServer* server = p2pServer()) {
server->store_in_cache(*new_block);
// Save it for faster syncing on the next p2pool start
if (P2PServer* server = p2pServer()) {
server->store_in_cache(*new_block);
}
}
}
}
else {
verify_loop(new_block);
}
else {
verify_loop(new_block);
}
} // Lock released here
// Check if recovery was triggered during verify_loop and execute it now
// (must be done outside the lock to avoid deadlock)
check_and_run_deferred_recovery();
return true;
}
@@ -2716,46 +2725,52 @@ void SideChain::get_missing_blocks(unordered_set<hash>& missing_blocks) const
void SideChain::retry_unverified_blocks()
{
WriteLock lock(m_sidechainLock);
// Scope the lock so we can check for deferred recovery after releasing it
{
WriteLock lock(m_sidechainLock);
// Scan for unverified blocks and retry them
// This is called when new mainchain data arrives that might allow verification
std::vector<PoolBlock*> blocks_to_retry;
// Scan for unverified blocks and retry them
// This is called when new mainchain data arrives that might allow verification
std::vector<PoolBlock*> blocks_to_retry;
for (auto& pair : m_blocksById) {
PoolBlock* block = pair.second;
if (!block->m_verified && !block->m_invalid) {
blocks_to_retry.push_back(block);
for (auto& pair : m_blocksById) {
PoolBlock* block = pair.second;
if (!block->m_verified && !block->m_invalid) {
blocks_to_retry.push_back(block);
}
}
}
if (blocks_to_retry.empty()) {
return;
}
LOGINFO(4, "Retrying verification of " << blocks_to_retry.size() << " unverified blocks after mainchain update");
// Sort by height to process in order
std::sort(blocks_to_retry.begin(), blocks_to_retry.end(),
[](const PoolBlock* a, const PoolBlock* b) {
return a->m_sidechainHeight < b->m_sidechainHeight;
});
// Try to verify each block
uint32_t verified_count = 0;
for (PoolBlock* block : blocks_to_retry) {
if (block->m_verified) {
continue; // Already verified by earlier iteration
if (blocks_to_retry.empty()) {
return;
}
verify_loop(block);
if (block->m_verified) {
++verified_count;
}
}
if (verified_count > 0) {
LOGINFO(3, "Verified " << verified_count << " blocks after mainchain update");
}
LOGINFO(4, "Retrying verification of " << blocks_to_retry.size() << " unverified blocks after mainchain update");
// Sort by height to process in order
std::sort(blocks_to_retry.begin(), blocks_to_retry.end(),
[](const PoolBlock* a, const PoolBlock* b) {
return a->m_sidechainHeight < b->m_sidechainHeight;
});
// Try to verify each block
uint32_t verified_count = 0;
for (PoolBlock* block : blocks_to_retry) {
if (block->m_verified) {
continue; // Already verified by earlier iteration
}
verify_loop(block);
if (block->m_verified) {
++verified_count;
}
}
if (verified_count > 0) {
LOGINFO(3, "Verified " << verified_count << " blocks after mainchain update");
}
} // Lock released here
// Check if recovery was triggered during verify_loop and execute it now
check_and_run_deferred_recovery();
}
bool SideChain::consider_peer_genesis(const hash& genesis_id, uint64_t timestamp, uint64_t height)
@@ -3241,22 +3256,35 @@ void SideChain::trigger_recovery(uint64_t failure_height)
// Already in recovery mode
return;
}
// Disable mining immediately
m_readyToMine = false;
// Find the checkpoint before the failure
uint64_t recovery_checkpoint = (failure_height / CHECKPOINT_INTERVAL) * CHECKPOINT_INTERVAL;
if (recovery_checkpoint >= failure_height && recovery_checkpoint >= CHECKPOINT_INTERVAL) {
recovery_checkpoint -= CHECKPOINT_INTERVAL;
}
LOGINFO(0, "Recovery target checkpoint: " << recovery_checkpoint);
m_pendingRecoveryHeight = recovery_checkpoint;
// Request checkpoint validation from peers via P2P server
request_checkpoint_validation();
// NOTE: Don't call request_checkpoint_validation() here!
// We may be called while holding m_sidechainLock (from verify_loop -> on_block_rejected).
// The actual recovery will be executed by check_and_run_deferred_recovery() after
// the lock is released.
}
void SideChain::check_and_run_deferred_recovery()
{
// This function should be called AFTER releasing m_sidechainLock
// It checks if recovery was triggered and executes it
uint64_t checkpoint_height = m_pendingRecoveryHeight.load();
if (checkpoint_height > 0 && m_recoveryMode.load()) {
LOGINFO(0, "Executing deferred recovery to checkpoint " << checkpoint_height);
request_checkpoint_validation();
}
}
void SideChain::request_checkpoint_validation()

View File

@@ -110,6 +110,7 @@ public:
// Recovery
void trigger_recovery(uint64_t failure_height);
void reset_to_checkpoint(uint64_t checkpoint_height);
// Runs the recovery that trigger_recovery() deferred, if one is pending.
// Call only after m_sidechainLock has been released.
void check_and_run_deferred_recovery();
bool is_in_recovery() const { return m_recoveryMode.load(); }
[[nodiscard]] FORCEINLINE difficulty_type difficulty() const { ReadLock lock(m_curDifficultyLock); return m_curDifficulty; }