1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
|
/*
* SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd <hello@stalw.art>
*
* SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL
*/
use trc::ClusterEvent;
use super::{Peer, State, HEARTBEAT_WINDOW, HEARTBEAT_WINDOW_MASK};
use std::time::Instant;
// Phi Accrual Failure Detector defaults
// Extra pause, in milliseconds, added to the mean heartbeat interval in `check_heartbeat`.
const HB_MAX_PAUSE_MS: f64 = 0.0;
// Lower bound applied to the standard deviation of the heartbeat intervals (recorded in milliseconds).
const HB_MIN_STD_DEV: f64 = 300.0;
// A phi value above this marks the peer as `State::Suspected`.
const HB_PHI_SUSPECT_THRESHOLD: f64 = 5.0;
// A phi value above this marks the peer as `State::Offline`.
const HB_PHI_CONVICT_THRESHOLD: f64 = 9.0;
impl Peer {
    /// Records a heartbeat received from this peer and updates its state.
    ///
    /// `is_direct_ping` is only consulted for peers in `State::Left`: such a
    /// peer is brought back to `State::Alive` only when it is `true`.
    ///
    /// Returns `true` when the peer was revived from `Seed`, `Offline` or
    /// (on a direct ping) `Left`; in those cases the elapsed interval is
    /// considered stale and is not recorded. Returns `false` otherwise, after
    /// the interval has been pushed into the sliding heartbeat window.
    pub fn update_heartbeat(&mut self, is_direct_ping: bool) -> bool {
        // Cap the interval at one hour so it always fits in a `u32` window slot.
        let hb_diff =
            std::cmp::min(self.last_heartbeat.elapsed().as_millis(), 60 * 60 * 1000) as u64;
        self.last_heartbeat = Instant::now();

        match self.state {
            State::Seed | State::Offline => {
                trc::event!(Cluster(ClusterEvent::PeerAlive), RemoteIp = self.addr);
                self.state = State::Alive;
                // Do not count stale heartbeats.
                return true;
            }
            State::Suspected => {
                trc::event!(
                    Cluster(ClusterEvent::PeerSuspectedIsAlive),
                    RemoteIp = self.addr
                );
                // A suspected peer keeps its history: fall through and record the interval.
                self.state = State::Alive;
            }
            State::Left if is_direct_ping => {
                trc::event!(Cluster(ClusterEvent::PeerBackOnline), RemoteIp = self.addr);
                self.state = State::Alive;
                // Do not count stale heartbeats.
                return true;
            }
            _ => (),
        }

        // Advance the circular window cursor before writing.
        // NOTE(review): because the cursor moves first, `hb_window_pos + 1` in
        // `check_heartbeat` may over-count the samples by one until the window
        // fills, if `hb_window_pos` starts at 0 -- confirm against `Peer` initialisation.
        self.hb_window_pos = (self.hb_window_pos + 1) & HEARTBEAT_WINDOW_MASK;

        // The cursor wrapping around with recorded data means every slot is in use.
        if !self.hb_is_full && self.hb_window_pos == 0 && self.hb_sum > 0 {
            self.hb_is_full = true;
        }

        // Once full, evict the sample being overwritten from the running sums.
        if self.hb_is_full {
            let hb_window = self.hb_window[self.hb_window_pos] as u64;
            self.hb_sum -= hb_window;
            self.hb_sq_sum -= hb_window.saturating_mul(hb_window);
        }

        self.hb_window[self.hb_window_pos] = hb_diff as u32;
        self.hb_sum += hb_diff;
        self.hb_sq_sum += hb_diff.saturating_mul(hb_diff);

        false
    }

    /// Evaluates the Phi Accrual failure detector for this peer.
    ///
    /// Ported from
    /// https://github.com/akka/akka/blob/main/akka-remote/src/main/scala/akka/remote/PhiAccrualFailureDetector.scala
    ///
    /// Phi is computed from the time elapsed since the last heartbeat and the
    /// mean / standard deviation of the recorded heartbeat intervals.
    ///
    /// Returns `false` when no interval has been recorded yet (`hb_sum == 0`)
    /// or when phi exceeds `HB_PHI_CONVICT_THRESHOLD`, in which case the peer
    /// is marked `State::Offline`. Returns `true` otherwise; if phi exceeds
    /// `HB_PHI_SUSPECT_THRESHOLD` the peer is marked `State::Suspected` first.
    pub fn check_heartbeat(&mut self) -> bool {
        if self.hb_sum == 0 {
            return false;
        }

        let hb_diff = self.last_heartbeat.elapsed().as_millis() as f64;
        let sample_count = if self.hb_is_full {
            HEARTBEAT_WINDOW
        } else {
            self.hb_window_pos + 1
        };
        let sample_size = sample_count as f64;

        // Derive the variance from the raw sample mean. The acceptable pause
        // only shifts the mean used for phi (as in the Akka reference); folding
        // it in earlier would corrupt the variance once the pause is non-zero.
        let sample_mean = self.hb_sum as f64 / sample_size;
        // Rounding can push E[x^2] - mean^2 slightly below zero; clamp so that
        // `sqrt` never yields NaN.
        let hb_variance =
            ((self.hb_sq_sum as f64 / sample_size) - sample_mean * sample_mean).max(0.0);
        let hb_std_dev = hb_variance.sqrt().max(HB_MIN_STD_DEV);
        let hb_mean = sample_mean + HB_MAX_PAUSE_MS;

        // Logistic approximation of the normal CDF, as used by the Akka reference.
        let y = (hb_diff - hb_mean) / hb_std_dev;
        let e = (-y * (1.5976 + 0.070566 * y * y)).exp();
        let phi = if hb_diff > hb_mean {
            -(e / (1.0 + e)).log10()
        } else {
            -(1.0 - 1.0 / (1.0 + e)).log10()
        };

        if phi > HB_PHI_CONVICT_THRESHOLD {
            trc::event!(Cluster(ClusterEvent::PeerOffline), RemoteIp = self.addr);
            self.state = State::Offline;
            false
        } else if phi > HB_PHI_SUSPECT_THRESHOLD {
            trc::event!(Cluster(ClusterEvent::PeerSuspected), RemoteIp = self.addr);
            self.state = State::Suspected;
            true
        } else {
            true
        }
    }
}
|