From e73036c6845fd3cc16479a1b497db82a5ebb3897 Mon Sep 17 00:00:00 2001
From: Antoine Riard
Date: Thu, 27 Aug 2020 20:47:02 -0400
Subject: [PATCH 1/3] Implement concurrent broadcast tolerance for distributed
 watchtowers

With a distributed watchtower deployment, where each monitor is plugged
into its own chain view, there is no guarantee that blocks are going to
be seen in the same order. Watchtowers may diverge in their acceptance
of a submitted `commitment_signed` update when a block timing out an
HTLC has been seen by one subset of watchtowers, provoking a broadcast,
but not yet by the other subset.

Any update rejected by one of the watchtowers must prevent the offchain
coordinator from moving the channel state forward and from releasing the
revocation secret for the previous state. In this case, we want any
watchtower from the rejecting subset to still be able to claim outputs
if the concurrent state, as accepted by the other subset, confirms. This
improves the overall fault tolerance of the watchtower system.

This change stores the holder commitment transaction unconditionally and
fails the update if a holder commitment transaction has already been
signed (ChannelMonitor::holder_tx_signed is true).
---
 lightning/src/ln/channelmonitor.rs | 31 ++++++++++++++----------------
 lightning/src/ln/onchaintx.rs      | 11 +----------
 2 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/lightning/src/ln/channelmonitor.rs b/lightning/src/ln/channelmonitor.rs
index d17078554b4..55943487019 100644
--- a/lightning/src/ln/channelmonitor.rs
+++ b/lightning/src/ln/channelmonitor.rs
@@ -824,6 +824,10 @@ pub struct ChannelMonitor {
 	// Set once we've signed a holder commitment transaction and handed it over to our
 	// OnchainTxHandler. After this is set, no future updates to our holder commitment transactions
 	// may occur, and we fail any such monitor updates.
+	//
+	// If an update is rejected because a holder commitment transaction has already been signed
+	// locally, we nevertheless store the update's content so we can react if another remote
+	// monitor, out-of-order with regard to our block view, broadcasts the concurrent state.
 	holder_tx_signed: bool,
 
 	// We simply modify last_block_hash in Channel's block_connected so that serialization is
@@ -888,6 +892,11 @@ pub trait ManyChannelMonitor: Send + Sync {
 	///
 	/// Any spends of outputs which should have been registered which aren't passed to
 	/// ChannelMonitors via block_connected may result in FUNDS LOSS.
+	///
+	/// In a distributed watchtower deployment, the new version must be written to disk even if
+	/// an Err is returned, as the update may have been rejected only because a block forced a
+	/// commitment broadcast. The stored state is used to claim outputs of a rejected state which
+	/// is confirmed onchain by another watchtower lagging behind on block processing.
 	fn update_monitor(&self, funding_txo: OutPoint, monitor: ChannelMonitorUpdate) -> Result<(), ChannelMonitorUpdateErr>;
 
 	/// Used by ChannelManager to get list of HTLC resolved onchain and which needed to be updated
@@ -1167,12 +1176,7 @@ impl ChannelMonitor {
 			feerate_per_kw: initial_holder_commitment_tx.feerate_per_kw,
 			htlc_outputs: Vec::new(), // There are never any HTLCs in the initial commitment transactions
 		};
-		// Returning a monitor error before updating tracking points means in case of using
-		// a concurrent watchtower implementation for same channel, if this one doesn't
-		// reject update as we do, you MAY have the latest holder valid commitment tx onchain
-		// for which you want to spend outputs. We're NOT robust again this scenario right
-		// now but we should consider it later.
-		onchain_tx_handler.provide_latest_holder_tx(initial_holder_commitment_tx).unwrap();
+		onchain_tx_handler.provide_latest_holder_tx(initial_holder_commitment_tx);
 
 		ChannelMonitor {
 			latest_update_id: 0,
@@ -1327,9 +1331,6 @@ impl ChannelMonitor {
 	/// up-to-date as our holder commitment transaction is updated.
 	/// Panics if set_on_holder_tx_csv has never been called.
 	pub(super) fn provide_latest_holder_commitment_tx_info(&mut self, commitment_tx: HolderCommitmentTransaction, htlc_outputs: Vec<(HTLCOutputInCommitment, Option<Signature>, Option<HTLCSource>)>) -> Result<(), MonitorUpdateError> {
-		if self.holder_tx_signed {
-			return Err(MonitorUpdateError("A holder commitment tx has already been signed, no new holder commitment txn can be sent to our counterparty"));
-		}
 		let txid = commitment_tx.txid();
 		let sequence = commitment_tx.unsigned_tx.input[0].sequence as u64;
 		let locktime = commitment_tx.unsigned_tx.lock_time as u64;
@@ -1343,17 +1344,13 @@ impl ChannelMonitor {
 			feerate_per_kw: commitment_tx.feerate_per_kw,
 			htlc_outputs: htlc_outputs,
 		};
-		// Returning a monitor error before updating tracking points means in case of using
-		// a concurrent watchtower implementation for same channel, if this one doesn't
-		// reject update as we do, you MAY have the latest holder valid commitment tx onchain
-		// for which you want to spend outputs. We're NOT robust again this scenario right
-		// now but we should consider it later.
-		if let Err(_) = self.onchain_tx_handler.provide_latest_holder_tx(commitment_tx) {
-			return Err(MonitorUpdateError("Holder commitment signed has already been signed, no further update of LOCAL commitment transaction is allowed"));
-		}
+		self.onchain_tx_handler.provide_latest_holder_tx(commitment_tx);
 		self.current_holder_commitment_number = 0xffff_ffff_ffff - ((((sequence & 0xffffff) << 3*8) | (locktime as u64 & 0xffffff)) ^ self.commitment_transaction_number_obscure_factor);
 		mem::swap(&mut new_holder_commitment_tx, &mut self.current_holder_commitment_tx);
 		self.prev_holder_signed_commitment_tx = Some(new_holder_commitment_tx);
+		if self.holder_tx_signed {
+			return Err(MonitorUpdateError("A holder commitment tx has already been signed, update is rejected"));
+		}
 		Ok(())
 	}
 
diff --git a/lightning/src/ln/onchaintx.rs b/lightning/src/ln/onchaintx.rs
index 281bc919060..cad5cc1bb2e 100644
--- a/lightning/src/ln/onchaintx.rs
+++ b/lightning/src/ln/onchaintx.rs
@@ -877,18 +877,9 @@ impl OnchainTxHandler {
 		}
 	}
 
-	pub(super) fn provide_latest_holder_tx(&mut self, tx: HolderCommitmentTransaction) -> Result<(), ()> {
-		// To prevent any unsafe state discrepancy between offchain and onchain, once holder
-		// commitment transaction has been signed due to an event (either block height for
-		// HTLC-timeout or channel force-closure), don't allow any further update of holder
-		// commitment transaction view to avoid delivery of revocation secret to counterparty
-		// for the aformentionned signed transaction.
-		if self.holder_htlc_sigs.is_some() || self.prev_holder_htlc_sigs.is_some() {
-			return Err(());
-		}
+	pub(super) fn provide_latest_holder_tx(&mut self, tx: HolderCommitmentTransaction) {
 		self.prev_holder_commitment = self.holder_commitment.take();
 		self.holder_commitment = Some(tx);
-		Ok(())
 	}
 
 	fn sign_latest_holder_htlcs(&mut self) {

From e706c67bdb508247f54bb11c698646db5e5839a2 Mon Sep 17 00:00:00 2001
From: Antoine Riard
Date: Thu, 27 Aug 2020 19:48:35 -0400
Subject: [PATCH 2/3] Add test_concurrent_monitor_claim

Watchtower Alice receives block 134, broadcasts state X, rejects state Y.
Watchtower Bob accepts state Y, receives block 135, broadcasts state Y.
State Y confirms onchain. Alice must be able to claim the outputs.
---
 lightning/src/ln/functional_tests.rs | 112 +++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/lightning/src/ln/functional_tests.rs b/lightning/src/ln/functional_tests.rs
index b11b57d34fa..6bc8efec2d5 100644
--- a/lightning/src/ln/functional_tests.rs
+++ b/lightning/src/ln/functional_tests.rs
@@ -8735,3 +8735,115 @@ fn test_update_err_monitor_lockdown() {
 	let events = nodes[0].node.get_and_clear_pending_events();
 	assert_eq!(events.len(), 1);
 }
+
+#[test]
+fn test_concurrent_monitor_claim() {
+	// Watchtower Alice receives a block, broadcasts state N, then the channel receives new state
+	// N+1 and sends it to both watchtowers. Bob accepts N+1, then receives a block and broadcasts
+	// the latest state N+1. Alice rejects state N+1, but Bob has already broadcast it and
+	// state N+1 confirms. Alice claims an output from state N+1.
+
+	let chanmon_cfgs = create_chanmon_cfgs(2);
+	let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
+	let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
+	let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+
+	// Create some initial channel
+	let chan_1 = create_announced_chan_between_nodes(&nodes, 0, 1, InitFeatures::known(), InitFeatures::known());
+	let outpoint = OutPoint { txid: chan_1.3.txid(), index: 0 };
+
+	// Rebalance the network to generate an HTLC in the two directions
+	send_payment(&nodes[0], &vec!(&nodes[1])[..], 10_000_000, 10_000_000);
+
+	// Route an HTLC from node 0 to node 1 (but don't settle)
+	route_payment(&nodes[0], &vec!(&nodes[1])[..], 9_000_000).0;
+
+	// Copy SimpleManyChannelMonitor to simulate watchtower Alice and raise the block height so her ChannelMonitor times out the HTLC onchain
+	let logger = test_utils::TestLogger::with_id(format!("node {}", "Alice"));
+	let chain_monitor = chaininterface::ChainWatchInterfaceUtil::new(Network::Testnet);
+	let watchtower_alice = {
+		let monitors = nodes[0].chan_monitor.simple_monitor.monitors.lock().unwrap();
+		let monitor = monitors.get(&outpoint).unwrap();
+		let mut w = test_utils::TestVecWriter(Vec::new());
+		monitor.write_for_disk(&mut w).unwrap();
+		let new_monitor = <(BlockHash, channelmonitor::ChannelMonitor<EnforcingChannelKeys>)>::read(
+				&mut ::std::io::Cursor::new(&w.0)).unwrap().1;
+		assert!(new_monitor == *monitor);
+		let watchtower = test_utils::TestChannelMonitor::new(&chain_monitor, &chanmon_cfgs[0].tx_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator);
+		assert!(watchtower.add_monitor(outpoint, new_monitor).is_ok());
+		watchtower
+	};
+	let header = BlockHeader { version: 0x20000000, prev_blockhash: Default::default(), merkle_root: Default::default(), time: 42, bits: 42, nonce: 42 };
+	watchtower_alice.simple_monitor.block_connected(&header, 135, &vec![], &vec![]);
+
+	// Watchtower Alice should have broadcast a commitment/HTLC-timeout
+	{
+		let mut txn = chanmon_cfgs[0].tx_broadcaster.txn_broadcasted.lock().unwrap();
+		assert_eq!(txn.len(), 2);
+		txn.clear();
+	}
+
+	// Copy SimpleManyChannelMonitor to simulate watchtower Bob and make it receive the commitment update first.
+	let logger = test_utils::TestLogger::with_id(format!("node {}", "Bob"));
+	let chain_monitor = chaininterface::ChainWatchInterfaceUtil::new(Network::Testnet);
+	let watchtower_bob = {
+		let monitors = nodes[0].chan_monitor.simple_monitor.monitors.lock().unwrap();
+		let monitor = monitors.get(&outpoint).unwrap();
+		let mut w = test_utils::TestVecWriter(Vec::new());
+		monitor.write_for_disk(&mut w).unwrap();
+		let new_monitor = <(BlockHash, channelmonitor::ChannelMonitor<EnforcingChannelKeys>)>::read(
+				&mut ::std::io::Cursor::new(&w.0)).unwrap().1;
+		assert!(new_monitor == *monitor);
+		let watchtower = test_utils::TestChannelMonitor::new(&chain_monitor, &chanmon_cfgs[0].tx_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator);
+		assert!(watchtower.add_monitor(outpoint, new_monitor).is_ok());
+		watchtower
+	};
+	let header = BlockHeader { version: 0x20000000, prev_blockhash: Default::default(), merkle_root: Default::default(), time: 42, bits: 42, nonce: 42 };
+	watchtower_bob.simple_monitor.block_connected(&header, 134, &vec![], &vec![]);
+
+	// Route another payment to generate another update while the previous HTLC is still pending
+	let (_, payment_hash) = get_payment_preimage_hash!(nodes[0]);
+	{
+		let net_graph_msg_handler = &nodes[1].net_graph_msg_handler;
+		let route = get_route(&nodes[1].node.get_our_node_id(), &net_graph_msg_handler.network_graph.read().unwrap(), &nodes[0].node.get_our_node_id(), None, &Vec::new(), 3000000, TEST_FINAL_CLTV, &logger).unwrap();
+		nodes[1].node.send_payment(&route, payment_hash, &None).unwrap();
+	}
+	check_added_monitors!(nodes[1], 1);
+
+	let updates = get_htlc_update_msgs!(nodes[1], nodes[0].node.get_our_node_id());
+	assert_eq!(updates.update_add_htlcs.len(), 1);
+	nodes[0].node.handle_update_add_htlc(&nodes[1].node.get_our_node_id(), &updates.update_add_htlcs[0]);
+	if let Some(ref mut channel) = nodes[0].node.channel_state.lock().unwrap().by_id.get_mut(&chan_1.2) {
+		if let Ok((_, _, _, update)) = channel.commitment_signed(&updates.commitment_signed, &node_cfgs[0].fee_estimator, &node_cfgs[0].logger) {
+			// Watchtower Alice should already have seen the block and reject the update
+			if let Err(_) = watchtower_alice.simple_monitor.update_monitor(outpoint, update.clone()) {} else { assert!(false); }
+			if let Ok(_) = watchtower_bob.simple_monitor.update_monitor(outpoint, update.clone()) {} else { assert!(false); }
+			if let Ok(_) = nodes[0].chan_monitor.update_monitor(outpoint, update) {} else { assert!(false); }
+		} else { assert!(false); }
+	} else { assert!(false); };
+	// Our local monitor is in sync and hasn't yet processed the timeout
+	check_added_monitors!(nodes[0], 1);
+
+	// Provide one more block to watchtower Bob, expect broadcast of commitment and HTLC-Timeout
+	watchtower_bob.simple_monitor.block_connected(&header, 135, &vec![], &vec![]);
+
+	// Watchtower Bob should have broadcast a commitment/HTLC-timeout
+	let bob_state_y;
+	{
+		let mut txn = chanmon_cfgs[0].tx_broadcaster.txn_broadcasted.lock().unwrap();
+		assert_eq!(txn.len(), 2);
+		bob_state_y = txn[0].clone();
+		txn.clear();
+	};
+
+	// We confirm Bob's state Y on Alice, she should broadcast a HTLC-timeout
+	watchtower_alice.simple_monitor.block_connected(&header, 136, &vec![&bob_state_y][..], &vec![]);
+	{
+		let htlc_txn = chanmon_cfgs[0].tx_broadcaster.txn_broadcasted.lock().unwrap();
+		// The transaction is broadcast twice: once due to the HTLC-timeout, once due to the
+		// onchain detection of the HTLC output
+		assert_eq!(htlc_txn.len(), 2);
+		check_spends!(htlc_txn[0], bob_state_y);
+		check_spends!(htlc_txn[1], bob_state_y);
+	}
+}

From 6622ea724f9947a52600d5743cbd8d8f45d1e6e4 Mon Sep 17 00:00:00 2001
From: Antoine Riard
Date: Fri, 28 Aug 2020 16:31:31 -0400
Subject: [PATCH 3/3] Improve PermanentFailure requirements documentation

In a distributed watchtower deployment, the failure may come from
multiple sources. In either case, the channel manager must return a
final update asking its channel monitor(s) to broadcast the latest
available state. The revocation secret must not be released for the
faulty channel.

In the future, we may return a wider range of failure types to allow
more fine-grained processing decisions (e.g. if a local disk fails but
a redundant remote channel copy is available, the channel may still be
moved forward).
---
 lightning/src/ln/channelmonitor.rs | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/lightning/src/ln/channelmonitor.rs b/lightning/src/ln/channelmonitor.rs
index 55943487019..91f81fa5736 100644
--- a/lightning/src/ln/channelmonitor.rs
+++ b/lightning/src/ln/channelmonitor.rs
@@ -133,11 +133,19 @@ pub enum ChannelMonitorUpdateErr {
 	TemporaryFailure,
 	/// Used to indicate no further channel monitor updates will be allowed (eg we've moved on to a
 	/// different watchtower and cannot update with all watchtowers that were previously informed
-	/// of this channel). This will force-close the channel in question (which will generate one
-	/// final ChannelMonitorUpdate which must be delivered to at least one ChannelMonitor copy).
+	/// of this channel).
 	///
-	/// Should also be used to indicate a failure to update the local persisted copy of the channel
-	/// monitor.
+	/// On reception of this error, ChannelManager will force-close the channel and return at
+	/// least a final ChannelMonitorUpdate::ChannelForceClosed which must be delivered to at
+	/// least one ChannelMonitor copy. The revocation secret MUST NOT be released and further
+	/// offchain channel updates must be rejected.
+	///
+	/// This failure may also signal a failure to update the local persisted copy of one of the
+	/// channel monitor instances.
+	///
+	/// Note that even when you fail a holder commitment transaction update, you must store the
+	/// update to ensure you can still claim from it if a duplicate copy of this ChannelMonitor
+	/// broadcasts it (e.g. in a distributed channel-monitor deployment).
 	PermanentFailure,
 }
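
Note (illustrative only, not part of the patch series): the sketch below shows one way a caller in
a distributed watchtower deployment might follow the update_monitor contract documented in
PATCH 1/3, i.e. persist every ChannelMonitorUpdate before consulting the monitor's verdict, so a
state rejected locally can still be claimed if a concurrent monitor broadcasts it. The types are
simplified stand-ins and the names (UpdateStore, WatchtowerClient, apply_update, VecStore) are
hypothetical, not LDK's actual API.

// Minimal sketch, assuming simplified stand-in types rather than the real LDK ones.
#[derive(Clone)]
struct ChannelMonitorUpdate { update_id: u64, data: Vec<u8> }

#[derive(Debug)]
enum ChannelMonitorUpdateErr { TemporaryFailure, PermanentFailure }

// Hypothetical durable storage interface for monitor updates.
trait UpdateStore {
    fn write_update(&mut self, channel_id: u64, update: &ChannelMonitorUpdate) -> std::io::Result<()>;
}

struct WatchtowerClient<S: UpdateStore> { store: S }

impl<S: UpdateStore> WatchtowerClient<S> {
    // Persist the update unconditionally, then hand it to the monitor. An Err from the monitor
    // must still block the offchain coordinator from advancing the channel state or releasing
    // the previous revocation secret, but the stored update lets this watchtower claim outputs
    // if another copy broadcasts the rejected state.
    fn apply_update<F>(&mut self, channel_id: u64, update: ChannelMonitorUpdate, update_monitor: F)
        -> Result<(), ChannelMonitorUpdateErr>
    where F: FnOnce(&ChannelMonitorUpdate) -> Result<(), ChannelMonitorUpdateErr> {
        self.store.write_update(channel_id, &update)
            .map_err(|_| ChannelMonitorUpdateErr::PermanentFailure)?;
        update_monitor(&update)
    }
}

// Toy in-memory store and usage, simulating a monitor that rejects the update because a holder
// commitment transaction was already signed (holder_tx_signed).
struct VecStore(Vec<(u64, ChannelMonitorUpdate)>);

impl UpdateStore for VecStore {
    fn write_update(&mut self, channel_id: u64, update: &ChannelMonitorUpdate) -> std::io::Result<()> {
        self.0.push((channel_id, update.clone()));
        Ok(())
    }
}

fn main() {
    let mut client = WatchtowerClient { store: VecStore(Vec::new()) };
    let update = ChannelMonitorUpdate { update_id: 1, data: Vec::new() };
    let res = client.apply_update(7, update, |_| Err(ChannelMonitorUpdateErr::PermanentFailure));
    assert!(res.is_err());
    // The rejected update was written to storage before the failure was surfaced.
    assert_eq!(client.store.0.len(), 1);
}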