summaryrefslogtreecommitdiff
path: root/crates/jmap/src/services/gossip/heartbeat.rs
blob: 10132e1f0235a48ddbb39ed85dfe8f89a388e011 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/*
 * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd <hello@stalw.art>
 *
 * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL
 */

use trc::ClusterEvent;

use super::{Peer, State, HEARTBEAT_WINDOW, HEARTBEAT_WINDOW_MASK};
use std::time::Instant;

// Phi Accrual Failure Detector defaults

// Added to the mean heartbeat interval (milliseconds) in `check_heartbeat`
// before the elapsed time is compared against it.
const HB_MAX_PAUSE_MS: f64 = 0.0;
// Lower bound applied to the standard deviation of the recorded heartbeat
// intervals (milliseconds) in `check_heartbeat`.
const HB_MIN_STD_DEV: f64 = 300.0;
// Phi value above which `check_heartbeat` marks the peer as `State::Suspected`.
const HB_PHI_SUSPECT_THRESHOLD: f64 = 5.0;
// Phi value above which `check_heartbeat` marks the peer as `State::Offline`.
const HB_PHI_CONVICT_THRESHOLD: f64 = 9.0;

impl Peer {
    /// Records a heartbeat received from this peer.
    ///
    /// The time elapsed since the previous heartbeat (capped at one hour) is
    /// pushed into the sliding window of intervals that `check_heartbeat`
    /// evaluates. The peer state is updated as follows:
    ///
    /// * `Seed` / `Offline`: becomes `Alive`; the interval is discarded as
    ///   stale and `true` is returned.
    /// * `Left`, when `is_direct_ping` is `true`: becomes `Alive`; the
    ///   interval is discarded as stale and `true` is returned.
    /// * `Suspected`: becomes `Alive`; the interval is recorded.
    /// * Anything else (including `Left` without a direct ping): the state is
    ///   left untouched; the interval is recorded.
    ///
    /// Returns `true` only when the peer was revived and its interval was
    /// discarded, `false` whenever the interval was recorded.
    pub fn update_heartbeat(&mut self, is_direct_ping: bool) -> bool {
        // Read the clock once so the measured interval ends exactly where the
        // next one begins; no time is lost between two separate clock reads.
        let now = Instant::now();
        let hb_diff = std::cmp::min(
            now.duration_since(self.last_heartbeat).as_millis(),
            60 * 60 * 1000,
        ) as u64;
        self.last_heartbeat = now;

        match self.state {
            State::Seed | State::Offline => {
                trc::event!(Cluster(ClusterEvent::PeerAlive), RemoteIp = self.addr);

                self.state = State::Alive;

                // Do not count stale heartbeats.
                return true;
            }
            State::Suspected => {
                trc::event!(
                    Cluster(ClusterEvent::PeerSuspectedIsAlive),
                    RemoteIp = self.addr
                );

                self.state = State::Alive;
            }
            State::Left if is_direct_ping => {
                trc::event!(Cluster(ClusterEvent::PeerBackOnline), RemoteIp = self.addr);

                self.state = State::Alive;

                // Do not count stale heartbeats.
                return true;
            }
            _ => (),
        }

        // Advance the ring-buffer cursor *before* writing the new sample.
        self.hb_window_pos = (self.hb_window_pos + 1) & HEARTBEAT_WINDOW_MASK;

        // The cursor wrapped around to the start: from now on every write
        // replaces an older sample.
        if !self.hb_is_full && self.hb_window_pos == 0 && self.hb_sum > 0 {
            self.hb_is_full = true;
        }

        // Remove the sample about to be overwritten from the running sums.
        if self.hb_is_full {
            let hb_window = self.hb_window[self.hb_window_pos] as u64;
            self.hb_sum -= hb_window;
            self.hb_sq_sum -= hb_window.saturating_mul(hb_window);
        }

        // `hb_diff` is capped at one hour in milliseconds, so it fits in `u32`.
        self.hb_window[self.hb_window_pos] = hb_diff as u32;
        self.hb_sum += hb_diff;
        self.hb_sq_sum += hb_diff.saturating_mul(hb_diff);

        false
    }

    /*
       Phi Accrual Failure Detection
       Ported from https://github.com/akka/akka/blob/main/akka-remote/src/main/scala/akka/remote/PhiAccrualFailureDetector.scala
    */
    /// Evaluates the peer's liveness from the time elapsed since its last
    /// heartbeat and the statistics of the recorded heartbeat intervals.
    ///
    /// * phi above `HB_PHI_CONVICT_THRESHOLD`: the peer is marked `Offline`
    ///   and `false` is returned.
    /// * phi above `HB_PHI_SUSPECT_THRESHOLD`: the peer is marked `Suspected`
    ///   and `true` is returned.
    /// * otherwise the state is left untouched and `true` is returned.
    ///
    /// Returns `false` without touching the state when the running sum of
    /// intervals is zero (nothing to compute statistics from).
    pub fn check_heartbeat(&mut self) -> bool {
        if self.hb_sum == 0 {
            return false;
        }

        let hb_diff = self.last_heartbeat.elapsed().as_millis() as f64;
        // NOTE(review): `update_heartbeat` advances `hb_window_pos` before
        // writing, so if the position is initialised to 0 the number of
        // recorded samples equals `hb_window_pos`, not `hb_window_pos + 1`.
        // Confirm against the `Peer` initialiser before changing.
        let sample_size = if self.hb_is_full {
            HEARTBEAT_WINDOW
        } else {
            self.hb_window_pos + 1
        } as f64;

        // Mean and variance describe the recorded intervals only; the
        // acceptable pause is added afterwards, as in the Akka detector, so a
        // non-zero `HB_MAX_PAUSE_MS` cannot distort the variance.
        let sample_mean = self.hb_sum as f64 / sample_size;
        // Rounding can push the difference slightly below zero; clamp so the
        // square root is always defined.
        let hb_variance =
            ((self.hb_sq_sum as f64 / sample_size) - (sample_mean * sample_mean)).max(0.0);
        let hb_std_dev = hb_variance.sqrt().max(HB_MIN_STD_DEV);
        let hb_mean = sample_mean + HB_MAX_PAUSE_MS;

        // Logistic approximation of the normal CDF, evaluated on whichever
        // side of the mean keeps the logarithm's argument away from
        // cancellation.
        let y = (hb_diff - hb_mean) / hb_std_dev;
        let e = (-y * (1.5976 + 0.070566 * y * y)).exp();
        let phi = if hb_diff > hb_mean {
            -(e / (1.0 + e)).log10()
        } else {
            -(1.0 - 1.0 / (1.0 + e)).log10()
        };

        if phi > HB_PHI_CONVICT_THRESHOLD {
            trc::event!(Cluster(ClusterEvent::PeerOffline), RemoteIp = self.addr);

            self.state = State::Offline;
            false
        } else if phi > HB_PHI_SUSPECT_THRESHOLD {
            trc::event!(Cluster(ClusterEvent::PeerSuspected), RemoteIp = self.addr);

            self.state = State::Suspected;
            true
        } else {
            true
        }
    }
}