Fix up chain resync deadlock during cascade failure
This commit is contained in:
@@ -754,6 +754,8 @@ bool SideChain::add_external_block(PoolBlock& block, std::vector<hash>& missing_
|
||||
if (added && block.m_verified) {
|
||||
if (block.m_invalid) {
|
||||
on_block_rejected(&block, "external block validation failed");
|
||||
// Check for deferred recovery after on_block_rejected (we're outside the lock here)
|
||||
check_and_run_deferred_recovery();
|
||||
} else {
|
||||
on_block_accepted();
|
||||
}
|
||||
@@ -791,47 +793,54 @@ bool SideChain::add_block(const PoolBlock& block)
|
||||
prune_seen_data();
|
||||
}
|
||||
|
||||
WriteLock lock(m_sidechainLock);
|
||||
// Scope the lock so we can check for deferred recovery after releasing it
|
||||
{
|
||||
WriteLock lock(m_sidechainLock);
|
||||
|
||||
auto result = m_blocksById.insert({ new_block->m_sidechainId, new_block });
|
||||
if (!result.second) {
|
||||
const PoolBlock* old_block = result.first->second;
|
||||
auto result = m_blocksById.insert({ new_block->m_sidechainId, new_block });
|
||||
if (!result.second) {
|
||||
const PoolBlock* old_block = result.first->second;
|
||||
|
||||
LOGWARN(3, "add_block: trying to add the same block twice:"
|
||||
<< "\nnew block id = " << new_block->m_sidechainId
|
||||
<< ", sidechain height = " << new_block->m_sidechainHeight
|
||||
<< ", height = " << new_block->m_txinGenHeight
|
||||
<< ", nonce = " << new_block->m_nonce
|
||||
<< ", extra_nonce = " << new_block->m_extraNonce
|
||||
<< "\nold block id = " << old_block->m_sidechainId
|
||||
<< ", sidechain height = " << old_block->m_sidechainHeight
|
||||
<< ", height = " << old_block->m_txinGenHeight
|
||||
<< ", nonce = " << old_block->m_nonce
|
||||
<< ", extra_nonce = " << old_block->m_extraNonce
|
||||
);
|
||||
LOGWARN(3, "add_block: trying to add the same block twice:"
|
||||
<< "\nnew block id = " << new_block->m_sidechainId
|
||||
<< ", sidechain height = " << new_block->m_sidechainHeight
|
||||
<< ", height = " << new_block->m_txinGenHeight
|
||||
<< ", nonce = " << new_block->m_nonce
|
||||
<< ", extra_nonce = " << new_block->m_extraNonce
|
||||
<< "\nold block id = " << old_block->m_sidechainId
|
||||
<< ", sidechain height = " << old_block->m_sidechainHeight
|
||||
<< ", height = " << old_block->m_txinGenHeight
|
||||
<< ", nonce = " << old_block->m_nonce
|
||||
<< ", extra_nonce = " << old_block->m_extraNonce
|
||||
);
|
||||
|
||||
delete new_block;
|
||||
return false;
|
||||
}
|
||||
delete new_block;
|
||||
return false;
|
||||
}
|
||||
|
||||
m_blocksByHeight[new_block->m_sidechainHeight].push_back(new_block);
|
||||
m_blocksByMerkleRoot.insert({ new_block->m_merkleRoot, new_block });
|
||||
m_blocksByHeight[new_block->m_sidechainHeight].push_back(new_block);
|
||||
m_blocksByMerkleRoot.insert({ new_block->m_merkleRoot, new_block });
|
||||
|
||||
update_depths(new_block);
|
||||
update_depths(new_block);
|
||||
|
||||
if (new_block->m_verified) {
|
||||
if (!new_block->m_invalid) {
|
||||
update_chain_tip(new_block);
|
||||
if (new_block->m_verified) {
|
||||
if (!new_block->m_invalid) {
|
||||
update_chain_tip(new_block);
|
||||
|
||||
// Save it for faster syncing on the next p2pool start
|
||||
if (P2PServer* server = p2pServer()) {
|
||||
server->store_in_cache(*new_block);
|
||||
// Save it for faster syncing on the next p2pool start
|
||||
if (P2PServer* server = p2pServer()) {
|
||||
server->store_in_cache(*new_block);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
verify_loop(new_block);
|
||||
}
|
||||
else {
|
||||
verify_loop(new_block);
|
||||
}
|
||||
} // Lock released here
|
||||
|
||||
// Check if recovery was triggered during verify_loop and execute it now
|
||||
// (must be done outside the lock to avoid deadlock)
|
||||
check_and_run_deferred_recovery();
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -2716,46 +2725,52 @@ void SideChain::get_missing_blocks(unordered_set<hash>& missing_blocks) const
|
||||
|
||||
void SideChain::retry_unverified_blocks()
|
||||
{
|
||||
WriteLock lock(m_sidechainLock);
|
||||
// Scope the lock so we can check for deferred recovery after releasing it
|
||||
{
|
||||
WriteLock lock(m_sidechainLock);
|
||||
|
||||
// Scan for unverified blocks and retry them
|
||||
// This is called when new mainchain data arrives that might allow verification
|
||||
std::vector<PoolBlock*> blocks_to_retry;
|
||||
// Scan for unverified blocks and retry them
|
||||
// This is called when new mainchain data arrives that might allow verification
|
||||
std::vector<PoolBlock*> blocks_to_retry;
|
||||
|
||||
for (auto& pair : m_blocksById) {
|
||||
PoolBlock* block = pair.second;
|
||||
if (!block->m_verified && !block->m_invalid) {
|
||||
blocks_to_retry.push_back(block);
|
||||
for (auto& pair : m_blocksById) {
|
||||
PoolBlock* block = pair.second;
|
||||
if (!block->m_verified && !block->m_invalid) {
|
||||
blocks_to_retry.push_back(block);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (blocks_to_retry.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
LOGINFO(4, "Retrying verification of " << blocks_to_retry.size() << " unverified blocks after mainchain update");
|
||||
|
||||
// Sort by height to process in order
|
||||
std::sort(blocks_to_retry.begin(), blocks_to_retry.end(),
|
||||
[](const PoolBlock* a, const PoolBlock* b) {
|
||||
return a->m_sidechainHeight < b->m_sidechainHeight;
|
||||
});
|
||||
|
||||
// Try to verify each block
|
||||
uint32_t verified_count = 0;
|
||||
for (PoolBlock* block : blocks_to_retry) {
|
||||
if (block->m_verified) {
|
||||
continue; // Already verified by earlier iteration
|
||||
if (blocks_to_retry.empty()) {
|
||||
return;
|
||||
}
|
||||
verify_loop(block);
|
||||
if (block->m_verified) {
|
||||
++verified_count;
|
||||
}
|
||||
}
|
||||
|
||||
if (verified_count > 0) {
|
||||
LOGINFO(3, "Verified " << verified_count << " blocks after mainchain update");
|
||||
}
|
||||
LOGINFO(4, "Retrying verification of " << blocks_to_retry.size() << " unverified blocks after mainchain update");
|
||||
|
||||
// Sort by height to process in order
|
||||
std::sort(blocks_to_retry.begin(), blocks_to_retry.end(),
|
||||
[](const PoolBlock* a, const PoolBlock* b) {
|
||||
return a->m_sidechainHeight < b->m_sidechainHeight;
|
||||
});
|
||||
|
||||
// Try to verify each block
|
||||
uint32_t verified_count = 0;
|
||||
for (PoolBlock* block : blocks_to_retry) {
|
||||
if (block->m_verified) {
|
||||
continue; // Already verified by earlier iteration
|
||||
}
|
||||
verify_loop(block);
|
||||
if (block->m_verified) {
|
||||
++verified_count;
|
||||
}
|
||||
}
|
||||
|
||||
if (verified_count > 0) {
|
||||
LOGINFO(3, "Verified " << verified_count << " blocks after mainchain update");
|
||||
}
|
||||
} // Lock released here
|
||||
|
||||
// Check if recovery was triggered during verify_loop and execute it now
|
||||
check_and_run_deferred_recovery();
|
||||
}
|
||||
|
||||
bool SideChain::consider_peer_genesis(const hash& genesis_id, uint64_t timestamp, uint64_t height)
|
||||
@@ -3241,22 +3256,35 @@ void SideChain::trigger_recovery(uint64_t failure_height)
|
||||
// Already in recovery mode
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Disable mining immediately
|
||||
m_readyToMine = false;
|
||||
|
||||
|
||||
// Find the checkpoint before the failure
|
||||
uint64_t recovery_checkpoint = (failure_height / CHECKPOINT_INTERVAL) * CHECKPOINT_INTERVAL;
|
||||
if (recovery_checkpoint >= failure_height && recovery_checkpoint >= CHECKPOINT_INTERVAL) {
|
||||
recovery_checkpoint -= CHECKPOINT_INTERVAL;
|
||||
}
|
||||
|
||||
|
||||
LOGINFO(0, "Recovery target checkpoint: " << recovery_checkpoint);
|
||||
|
||||
|
||||
m_pendingRecoveryHeight = recovery_checkpoint;
|
||||
|
||||
// Request checkpoint validation from peers via P2P server
|
||||
request_checkpoint_validation();
|
||||
|
||||
// NOTE: Don't call request_checkpoint_validation() here!
|
||||
// We may be called while holding m_sidechainLock (from verify_loop -> on_block_rejected).
|
||||
// The actual recovery will be executed by check_and_run_deferred_recovery() after
|
||||
// the lock is released.
|
||||
}
|
||||
|
||||
// Runs the recovery step that trigger_recovery() leaves pending.
//
// trigger_recovery() only records the target checkpoint in m_pendingRecoveryHeight
// and does not call request_checkpoint_validation() itself, because it may run while
// m_sidechainLock is held (verify_loop -> on_block_rejected). This function performs
// that call later, from a point where the lock is no longer held.
void SideChain::check_and_run_deferred_recovery()
|
||||
{
|
||||
// Precondition: the caller must NOT hold m_sidechainLock when calling this function.
|
||||
// If a recovery is pending, start it now; otherwise this is a no-op.
|
||||
uint64_t checkpoint_height = m_pendingRecoveryHeight.load();
|
||||
// NOTE(review): a pending height of 0 is treated as "nothing pending". trigger_recovery()
// appears able to compute a target of 0 (failure_height below CHECKPOINT_INTERVAL), in which
// case recovery mode is set but this branch is never taken - confirm that is intended.
if (checkpoint_height > 0 && m_recoveryMode.load()) {
|
||||
LOGINFO(0, "Executing deferred recovery to checkpoint " << checkpoint_height);
|
||||
// NOTE(review): m_pendingRecoveryHeight is not cleared here, so every later call made while
// recovery mode is active re-enters this branch - presumably request_checkpoint_validation()
// or the recovery completion path handles repeats; verify.
request_checkpoint_validation();
|
||||
}
|
||||
}
|
||||
|
||||
void SideChain::request_checkpoint_validation()
|
||||
|
||||
@@ -110,6 +110,7 @@ public:
|
||||
// Recovery
|
||||
void trigger_recovery(uint64_t failure_height);
|
||||
void reset_to_checkpoint(uint64_t checkpoint_height);
|
||||
// Executes a recovery previously requested via trigger_recovery(), if one is pending.
// Must be called only after m_sidechainLock has been released.
void check_and_run_deferred_recovery();
|
||||
// Returns the current value of the m_recoveryMode flag (read with load()).
bool is_in_recovery() const { return m_recoveryMode.load(); }
|
||||
|
||||
[[nodiscard]] FORCEINLINE difficulty_type difficulty() const { ReadLock lock(m_curDifficultyLock); return m_curDifficulty; }
|
||||
|
||||
Reference in New Issue
Block a user