chj_unix_util/
daemon.rs

1//! Infrastructure to run / start / stop a service as a daemon process
2//! (or process group).
3
4//! See [daemon](../docs/daemon.md) for more info.
5
6pub mod warrants_restart;
7
8use std::{
9    borrow::Cow,
10    fmt::Debug,
11    io::{stderr, stdout, Write},
12    num::{NonZeroU32, ParseIntError},
13    ops::Deref,
14    os::unix::ffi::OsStrExt,
15    path::Path,
16    str::FromStr,
17    sync::{atomic::Ordering, Arc},
18    thread::sleep,
19    time::Duration,
20};
21
22use anyhow::{anyhow, bail, Context, Result};
23use cj_path_util::path_util::AppendToPath;
24use nix::{
25    libc::getsid,
26    sys::signal::Signal::{self, SIGCONT, SIGKILL, SIGSTOP},
27    unistd::{execvp, setsid, Pid},
28};
29
30use crate::{
31    backoff::LoopWithBackoff,
32    daemon::warrants_restart::WarrantsRestart,
33    eval_with_default::EvalWithDefault,
34    file_lock::{file_lock_nonblocking, FileLockError},
35    file_util::{create_dir_if_not_exists, PathIOError},
36    forking_loop::forking_loop,
37    logging::{Logger, LoggingOpts, TimestampOpts},
38    polling_signals::{IPCAtomicError, IPCAtomicU64},
39    re_exec::re_exec,
40    retry::{retry, retry_n},
41    signal::send_signal_to_all_processes_of_session,
42    unix::easy_fork,
43    util::cstring,
44};
45
/// You may want to use this as a normal argument, i.e. via FromStr,
/// instead, as then single strings can be passed through. There is
/// more flexibility with the options here, though. XX currently this
/// is also conflating STOP and Stop etc., and does not have `up`
/// etc. aliases, thus pretty unusable.
// NOTE: because `clap::Subcommand` is derived, the `///` comments on
// the variants double as the subcommand help text; editing them
// changes user-visible output. The string-based alternative is the
// `FromStr` impl, which is driven by the `FROM_STR_CASES` table. That
// table (and its exhaustiveness reminder) has to be extended whenever
// a variant is added here.
#[derive(Debug, Clone, Copy, clap::Subcommand)]
pub enum DaemonMode {
    /// Do not put into background, just run forever in the foreground.
    Run,

    /// Start daemon into background.
    Start,

    /// Stop daemon running in background (does not stop daemons in
    /// `run` mode, only those in `start` mode). This subcommand has
    /// options.
    Stop(StopOpts),

    /// `stop` (ignoring errors) then `start`. This subcommand has
    /// options.
    Restart(StopOpts),

    /// Report if there is a daemon in `start` or `run` mode, together
    /// with pid if running and what the desired status is.
    Status,

    /// Report if there is a daemon in `start` or `run` mode (without
    /// additional information).
    ShortStatus,

    // The following three variants are spelled in upper case, like
    // the `"STOP"`, `"CONT"` and `"KILL"` keys in `FROM_STR_CASES`;
    // `STOP` (signal) is a different action than `Stop` (shutdown).
    /// Send a STOP signal to (i.e. suspend) the daemon.
    STOP,

    /// Send a CONT signal to (i.e. continue) the daemon.
    CONT,

    /// Send a KILL signal to (i.e. terminate right away) the daemon.
    KILL,

    /// Open the current log file in the pager ($PAGER or 'less')
    Log,

    /// Run `tail -f` on the current log file
    Logf,
}
91
92const FROM_STR_CASES: &[(&str, DaemonMode, &str)] = {
93    const fn opts(hard: bool, soft: bool) -> StopOpts {
94        StopOpts {
95            hard,
96            soft,
97            wait: false,
98            timeout_before_sigkill: 30,
99        }
100    }
101    {
102        use DaemonMode::*;
103        // reminder to adapt the code below when the enum changes
104        match DaemonMode::Run {
105            Run => (),
106            Start => (),
107            Stop(_) => (),
108            Restart(_) => (),
109            Status => (),
110            ShortStatus => (),
111            STOP => (),
112            CONT => (),
113            KILL => (),
114            Log => (),
115            Logf => (),
116        }
117    }
118
119    &[
120        (
121            "run",
122            DaemonMode::Run,
123            "Do not put into background, just run forever in the foreground.",
124        ),
125        ("start", DaemonMode::Start, "Start daemon into background."),
126        ("up", DaemonMode::Start, "Alias for `start`."),
127        (
128            "stop",
129            DaemonMode::Stop(opts(false, false)),
130            "Stop a daemon running in background (does not stop daemons running\n\
131             via `run`). Alias for hard-stop or soft-stop, depending on the application.",
132        ),
133        (
134            "hard-stop",
135            DaemonMode::Stop(opts(true, false)),
136            "Stop daemon by sending it and its children signals (SIGINT then SIGKILL).\n\
137             Returns only after the daemon has ended.",
138        ),
139        (
140            "soft-stop",
141            DaemonMode::Stop(opts(false, true)),
142            "Stop daemon gracefully by sending the daemon a plea to exit. Returns\n\
143             immediately, the daemon will stop at its own leisure.",
144        ),
145        (
146            "down",
147            DaemonMode::Stop(opts(false, false)),
148            "Alias for `stop`.",
149        ),
150        (
151            "hard-down",
152            DaemonMode::Stop(opts(true, false)),
153            "Alias for `hard-stop`.",
154        ),
155        (
156            "soft-down",
157            DaemonMode::Stop(opts(false, true)),
158            "Alias for `soft-stop`.",
159        ),
160        (
161            "restart",
162            DaemonMode::Restart(opts(false, false)),
163            "Alias for `hard-restart` or `soft-restart`, depending on the application.",
164        ),
165        (
166            "hard-restart",
167            DaemonMode::Restart(opts(true, false)),
168            "`hard-stop` then `start` the daemon; picks up new command line flags and\n\
169             environment changes.",
170        ),
171        (
172            "soft-restart",
173            DaemonMode::Restart(opts(false, true)),
174            "Sends the daemon a plea to re-execute itself, with its original command line\n\
175             flags and environment.",
176        ),
177        (
178            "status",
179            DaemonMode::Status,
180            "Show if a (start/stop based) daemon is running, with pid (if running) and the\n\
181             desired status.",
182        ),
183        (
184            "short-status",
185            DaemonMode::ShortStatus,
186            "Show if a (start/stop based) daemon is running in one word.",
187        ),
188        (
189            "STOP",
190            DaemonMode::STOP,
191            "Send a STOP signal to the daemon and its children.",
192        ),
193        (
194            "CONT",
195            DaemonMode::CONT,
196            "Send a CONT signal to the daemon and its children.",
197        ),
198        (
199            "KILL",
200            DaemonMode::KILL,
201            "Send a KILL signal to the daemon and its children.",
202        ),
203        (
204            "log",
205            DaemonMode::Log,
206            "Open the current log file in the pager ($PAGER or 'less')",
207        ),
208        (
209            "logf",
210            DaemonMode::Logf,
211            "Run `tail -f` on the current log file",
212        ),
213    ]
214};
215
216fn errmsg() -> String {
217    // Cannot do join() since not using itertools in this crate.
218    let mut s = String::from("please give one of the following arguments:\n\n");
219    for (k, _m, doc) in FROM_STR_CASES {
220        use std::fmt::Write;
221        _ = writeln!(&mut s, "    `{k}`:");
222        for line in doc.split('\n') {
223            _ = writeln!(&mut s, "        {line}");
224        }
225        s.push('\n');
226    }
227
228    s.truncate(s.len() - 2);
229    s
230}
231
/// Error returned by `DaemonMode::from_str` when the given string
/// matches none of the keys in `FROM_STR_CASES`. Carries no data;
/// its `Display` output is the complete usage listing generated by
/// `errmsg()` (rebuilt each time the error is formatted).
#[derive(thiserror::Error, Debug)]
#[error("{}", errmsg())]
pub struct DaemonModeError;
235
236impl FromStr for DaemonMode {
237    type Err = DaemonModeError;
238
239    fn from_str(s: &str) -> Result<Self, Self::Err> {
240        for (k, v, _doc) in FROM_STR_CASES {
241            if s == *k {
242                return Ok(v.clone());
243            }
244        }
245        Err(DaemonModeError)
246    }
247}
248
// A pair of opposing command line flags. How they combine with the
// application's default is decided by the `EvalWithDefault` impl,
// which reports them as the explicit (yes, no) pair.
//
// Only `//` comments are added here: `///` comments on a clap `Args`
// struct are turned into help text, and the single description below
// is deliberately attached to the second flag only.
#[derive(Debug, Clone, Copy, Default, clap::Args)]
pub struct RestartOnFailures {
    // Adding `, help = None` does not help to avoid the empty paragraph
    #[clap(long)]
    pub restart_on_failures: bool,

    /// Whether to restart the daemon or not when it crashes (in
    /// start/up mode).  Restarting works by forking before running
    /// the work, then re-forking when the child ends in a non-normal
    /// way (exit with an error or by signal). Between restarts, the
    /// parent sleeps a bit, with exponential back-off within a time
    /// range configured by the application. The default restart
    /// behaviour is set by the application; the two options cancel
    /// each other out. Note: debugging crashes will be easiest using
    /// the `run` mode, which ignores the restart setting (never
    /// restarts).
    #[clap(long)]
    pub no_restart_on_failures: bool,
}
268
269impl EvalWithDefault for RestartOnFailures {
270    fn explicit_yes_and_no(&self) -> (bool, bool) {
271        let Self {
272            restart_on_failures,
273            no_restart_on_failures,
274        } = self;
275        (*restart_on_failures, *no_restart_on_failures)
276    }
277}
278
/// These settings may be useful to expose to the user.
// Both groups are flattened, i.e. their flags appear directly among
// the options of whatever clap command embeds `DaemonOpts`.
#[derive(Debug, Clone, Default, clap::Args)]
pub struct DaemonOpts {
    // Logging flags; copied into the `Logger` built by
    // `Daemon::to_logger`.
    #[clap(flatten)]
    pub logging_opts: LoggingOpts,

    // The `--restart-on-failures` / `--no-restart-on-failures` pair;
    // evaluated in `Daemon::start` against
    // `Daemon::restart_on_failures_default`.
    #[clap(flatten)]
    pub restart_on_failures: RestartOnFailures,
}
288
/// The two directories a `Daemon` works with. Both are created by
/// `Daemon::create_dirs` if they do not exist yet.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DaemonPaths {
    /// Where the lock/pid files should be written to (is created if missing).
    /// The daemon state file `daemon_state.mmap` is placed in here.
    pub state_dir: Arc<Path>,
    /// Where the log files should be written to (is created if missing).
    pub log_dir: Arc<Path>,
}
296
/// Everything needed to run, start, stop or inspect one daemonized
/// service: user-facing options, application defaults, the
/// directories to use, and the code to run.
///
/// `Other` is a (cloneable) pointer-like value whose target
/// implements `WarrantsRestart`; `F` is the service code itself.
pub struct Daemon<
    Other: Deref<Target: WarrantsRestart> + Clone,
    F: FnOnce(DaemonCheckExit<Other>) -> Result<()>,
> {
    /// The user-facing settings (logging and restart-on-failures
    /// flags).
    pub opts: DaemonOpts,
    /// The default value for
    /// opts.restart_on_failures.eval_with_default()
    pub restart_on_failures_default: bool,
    /// The default value for
    /// opts.logging_opts.local_time_default.eval_with_default()
    pub local_time_default: bool,
    /// The settings for the restarting; if not provided, uses its
    /// Default values. The `prefix` field is overwritten with the
    /// string "daemon service process restart ".
    pub restart_opts: Option<LoopWithBackoff>,
    /// Timestamp settings; copied into the `Logger` built by
    /// `to_logger`.
    pub timestamp_opts: TimestampOpts,
    /// The state and log directories to use, see `DaemonPaths`.
    pub paths: DaemonPaths,
    /// A value that implements `WarrantsRestart`, checking *other*
    /// conditions warranting restart than the daemon state indicating
    /// it. Used as part of the argument to `run`. See
    /// `chj_unix_util::daemon::warrants_restart` for reusable
    /// implementations.
    pub other_restart_checks: Other,
    /// The code to run in the daemon; the daemon ends/stops when
    /// this function returns. It should periodically call
    /// `want_exit()` on its argument and return once that yields
    /// true.
    pub run: F,
}
328
/// Error type bundling `std::io::Error`, `nix` errno values and
/// integer parse failures, each convertible via `?` thanks to
/// `#[from]`.
// NOTE(review): not used within the part of the file visible here;
// check its users before changing it.
#[derive(thiserror::Error, Debug)]
pub enum InOutError {
    /// A standard library I/O error.
    #[error("IO error: {0}")]
    IOError(#[from] std::io::Error),
    /// An errno value from a `nix` call. Displayed with the same
    /// "IO error: " prefix as `IOError`.
    #[error("IO error: {0}")]
    Errno(#[from] nix::errno::Errno),
    /// A string could not be parsed as an integer.
    #[error("integer parsing error: {0}")]
    ParseIntError(#[from] ParseIntError),
}
338
/// The error type of `Daemon::execute` and the start path.
#[derive(thiserror::Error, Debug)]
pub enum DaemonError {
    /// Taking the lock on `lock_path` failed for a reason other than
    /// it being already locked (see `Daemon::_start`). `error` holds
    /// the already stringified underlying lock error.
    #[error("can't lock file {lock_path:?}: {error}")]
    LockError { lock_path: Arc<Path>, error: String },
    /// An I/O error; `context` names the failed operation.
    #[error("{context}: IO error: {error}")]
    IoError {
        context: &'static str,
        error: std::io::Error,
    },
    /// A failed system call; `context` names it (`"fork"` and
    /// `"setsid"` are the values used by `Daemon::_start`).
    #[error("{context}: {error}")]
    ErrnoError {
        context: &'static str,
        error: nix::errno::Errno,
    },
    /// Error from the `IPCAtomicU64` layer (`polling_signals`).
    #[error("{0}")]
    IPCAtomicError(#[from] IPCAtomicError),
    /// Error with path information, e.g. from creating the state or
    /// log directory.
    #[error("{0}")]
    PathIOError(#[from] PathIOError),
    /// Any other error, including those returned by the user
    /// supplied `run` function.
    #[error("{0}")]
    Anyhow(#[from] anyhow::Error),
}
360
/// What the daemon child hands back after its `run` function has
/// returned: the handle on the daemon state, kept so that
/// `daemon_cleanup` can read the current want and store the final
/// state.
pub struct DaemonResult {
    // Accessor for the shared daemon state (want + pid).
    daemon_state: DaemonStateAccessor,
}
364
365impl DaemonResult {
366    pub fn daemon_cleanup(self) {
367        let DaemonResult { daemon_state } = self;
368        let (want, old_pid) = daemon_state.read();
369        // Should not need to change the pid, right?
370        let current_sid = unsafe {
371            // There's actually no safety issue with getside?
372            getsid(0)
373        };
374        if Some(current_sid) != old_pid {
375            eprintln!(
376                "warning on stop or restart: our session-id is {current_sid}, but \
377                 daemon state has {old_pid:?}. Overwriting it."
378            );
379        }
380        match want {
381            DaemonWant::Down => {
382                daemon_state.store(DaemonWant::Down, None);
383            }
384            DaemonWant::Up | DaemonWant::Restart => {
385                daemon_state.store(DaemonWant::Up, Some(current_sid));
386                // (Ah, the new instance will overwrite daemon_state
387                // again, with a new sid.)
388                let e = re_exec();
389                eprintln!("{e:#}");
390                std::process::exit(1);
391            }
392        }
393    }
394}
395
/// The private payload of `ExecutionResult`: which kind of process
/// is returning from `Daemon::execute`.
enum _ExecutionResult {
    /// In the parent process that started the daemon: no value
    Initiator,
    /// In the daemon child: context for handling exiting/restarts
    /// during shutdown. Pass up to the main function, call
    /// `daemon_cleanup`.
    Daemon(DaemonResult),
    /// Daemon-less `Run` result. No need to execute any restarts or
    /// change any daemon state.
    Run,
}
407
/// Drop guard used by `ExecutionResult` to enforce that
/// `daemon_cleanup` is called: while the flag is `true` ("armed"),
/// dropping the value panics. `ExecutionResult::daemon_cleanup`
/// disarms it by setting the flag to `false`.
struct Bomb(bool);

impl Drop for Bomb {
    /// Panics if still armed, unless the thread is already
    /// unwinding from another panic: a second panic raised from a
    /// destructor during unwinding aborts the whole process, which
    /// would cut the handling of the original panic short.
    fn drop(&mut self) {
        if self.0 && !std::thread::panicking() {
            panic!("`ExecutionResult`s need to be passed to their `daemon_cleanup` method");
        }
    }
}
416
/// Result of `Daemon::execute`. Must be consumed via its
/// `daemon_cleanup` method: the second field is a drop guard that
/// panics if the value is dropped without that call.
#[must_use]
pub struct ExecutionResult(_ExecutionResult, Bomb);
419
420impl ExecutionResult {
421    fn initiator() -> Self {
422        Self(_ExecutionResult::Initiator, Bomb(true))
423    }
424
425    fn run() -> Self {
426        Self(_ExecutionResult::Run, Bomb(true))
427    }
428
429    fn daemon(r: DaemonResult) -> Self {
430        Self(_ExecutionResult::Daemon(r), Bomb(true))
431    }
432
433    /// If need to know if in the daemon, e.g. to only conditionally return to `main`
434    pub fn is_daemon(&self) -> bool {
435        match &self.0 {
436            _ExecutionResult::Initiator => false,
437            _ExecutionResult::Daemon(_) => true,
438            _ExecutionResult::Run => false,
439        }
440    }
441
442    /// Call this in the `main` function, after everything in the app
443    /// has been cleaned up. If this is in the daemon child, it will
444    /// re-exec the daemon binary if this was a restart
445    /// action. Otherwise exits, indicating whether this is a daemon
446    /// context (same as `is_daemon`).
447    pub fn daemon_cleanup(self) -> bool {
448        let Self(er, mut bomb) = self;
449        bomb.0 = false;
450        match er {
451            _ExecutionResult::Initiator => false,
452            _ExecutionResult::Daemon(daemon_result) => {
453                daemon_result.daemon_cleanup();
454                true
455            }
456            _ExecutionResult::Run => false,
457        }
458    }
459}
460
461#[derive(Debug, Clone, Copy, clap::Args)]
462pub struct StopOpts {
463    /// This stops the daemon via signals, first SIGINT, then
464    /// SIGKILL. Restarting in this mode takes the new enviroment from
465    /// the issuer since it works by forking a new daemon.  (--hard
466    /// and --soft are opposites; the default depends on the
467    /// application.)
468    // `short` -h conflicts with --help
469    #[clap(long)]
470    pub hard: bool,
471
472    /// This stops the daemon by communicating a wish for termination
473    /// via shared memory. The daemon may delay the reaction for a
474    /// long time. Restarting in this mode works by the daemon
475    /// re-executing itself, meaning it will not pick up environment
476    /// or command line argument changes. This action returns
477    /// immediately as it only stores the wish. (--hard and --soft are
478    /// opposites; the default depends on the application.)
479    #[clap(short, long)]
480    pub soft: bool,
481
482    /// When doing graceful termination, by default the stop/restart
483    /// actions do not wait for the daemon to carry it out. This
484    /// changes the behaviour to wait in that case, too.
485    #[clap(short, long)]
486    pub wait: bool,
487
488    /// The time in seconds after sending SIGINT before sending
489    /// SIGKILL
490    // Default: keep in sync with const fn opts
491    #[clap(short, long, default_value = "30")]
492    pub timeout_before_sigkill: u32,
493}
494
495impl StopOpts {
496    pub fn hard(&self, default_is_hard: bool) -> bool {
497        let Self {
498            hard,
499            soft,
500            wait: _,
501            timeout_before_sigkill: _,
502        } = self;
503        match (hard, soft) {
504            (false, false) | (true, true) => default_is_hard,
505            (true, false) => true,
506            (false, true) => false,
507        }
508    }
509}
510
/// What happened during a stop (or the stop part of a restart), as
/// returned by `Daemon::stop_or_restartstop`.
#[derive(Debug, Clone)]
pub struct StopReport {
    /// The pid (session leader) found in the daemon state before
    /// acting, if any.
    pub was_pid: Option<i32>,
    /// Whether the daemon held its lock, i.e. was running, when the
    /// action began.
    pub was_running: bool,
    /// Whether SIGINT was sent to the daemon's session.
    pub sent_sigint: bool,
    /// Whether SIGKILL was sent after the timeout expired.
    pub sent_sigkill: bool,
    /// Set while waiting for a graceful stop/restart: the daemon was
    /// found not running although the daemon state had not changed.
    pub crashed: bool,
}
519
520impl<
521        Other: Deref<Target: WarrantsRestart> + Clone,
522        F: FnOnce(DaemonCheckExit<Other>) -> Result<()>,
523    > Daemon<Other, F>
524{
525    pub fn create_dirs(&self) -> Result<(), PathIOError> {
526        create_dir_if_not_exists(&self.state_dir())?;
527        create_dir_if_not_exists(&self.log_dir())?;
528        Ok(())
529    }
530
531    pub fn state_dir(&self) -> Arc<Path> {
532        self.paths.state_dir.clone()
533    }
534
535    pub fn log_dir(&self) -> Arc<Path> {
536        self.paths.log_dir.clone()
537    }
538
539    pub fn to_logger(&self) -> Logger {
540        Logger {
541            logging_opts: self.opts.logging_opts.clone(),
542            local_time_default: self.local_time_default,
543            timestamp_opts: self.timestamp_opts.clone(),
544            dir_path: self.log_dir(),
545        }
546    }
547
548    /// Path to a file that is used as a 8-byte mmap file, and for
549    /// flock. Protect this file from modification by other
550    /// parties--doing so can segfault the app!
551    pub fn daemon_state_path(&self) -> Arc<Path> {
552        self.state_dir().append("daemon_state.mmap").into()
553    }
554
    /// The same as `daemon_state_path`
    // One file serves both purposes: `is_running` and `_start` take
    // their file lock on it, and `daemon_state` opens it as the
    // daemon state.
    pub fn lock_path(&self) -> Arc<Path> {
        self.daemon_state_path()
    }
559
560    fn daemon_state(&self) -> anyhow::Result<DaemonStateAccessor> {
561        let daemon_state_path = self.daemon_state_path();
562        DaemonStateAccessor::open(daemon_state_path.clone())
563            .with_context(|| anyhow!("opening {daemon_state_path:?}"))
564    }
565
566    /// Check via flock (sufficient, although the `DaemonState` should
567    /// also provide a pid in this case). Slightly costly as it
568    /// involves memory allocations and multiple syscalls.
569    pub fn is_running(&self) -> Result<bool, anyhow::Error> {
570        let lock_path = self.lock_path();
571        // The daemon takes an exclusive lock; it's enough and
572        // necessary to take a non-exclusive one here, so that
573        // multiple testers don't find a lock by accident. XX test
574        match file_lock_nonblocking(&lock_path, false) {
575            Ok(lock) => {
576                // We only get the (non-exclusive) lock as side effect
577                // of our approach of testing for it being
578                // locked. Drop it right away to minimize the risk for
579                // a `start` action failing to get the exclusive lock.
580                drop(lock);
581                Ok(false)
582            }
583            Err(e) => match e {
584                FileLockError::AlreadyLocked => Ok(true),
585                _ => bail!("lock error on {lock_path:?}: {e:#}"),
586            },
587        }
588    }
589
590    /// Send the signal once or twice (once via the process group,
591    /// then individually if still around) to all processes belonging
592    /// to the session that the daemon is running in.
593    pub fn send_signal(&self, signal: Option<Signal>) -> anyhow::Result<bool> {
594        let daemon_state = self.daemon_state()?;
595        let (_old_want, was_pid) = daemon_state.read();
596        if let Some(session_pid) = was_pid {
597            retry(|| daemon_state.store_want(DaemonWant::Down));
598            let session_pid = Pid::from_raw(session_pid);
599            send_signal_to_all_processes_of_session(session_pid, signal)
600        } else {
601            Ok(false)
602        }
603    }
604
    /// Shared implementation of the stop action and of the stopping
    /// half of the restart action.
    ///
    /// Stores `want` (`DaemonWant::Down` for stop,
    /// `DaemonWant::Restart` for restart, going by the calls in
    /// `execute`) in the daemon state, even if no daemon is running.
    /// If one is running, then either
    ///
    /// * hard mode (as resolved by `opts.hard(default_is_hard)`):
    ///   sends SIGINT to the daemon's session, polls once per second
    ///   for up to `timeout_before_sigkill` seconds, sends SIGKILL if
    ///   the timeout expires, and finally clears the pid in the
    ///   daemon state; or
    /// * soft mode: does nothing more, unless `opts.wait` is set, in
    ///   which case it polls every 500 ms until the daemon state
    ///   changes or the daemon is found not running.
    ///
    /// The returned `StopReport` says which of these steps happened.
    // (Giving up and just using anyhow here)
    fn stop_or_restartstop(
        &self,
        want: DaemonWant,
        opts: StopOpts,
        default_is_hard: bool,
    ) -> Result<StopReport, anyhow::Error> {
        // `hard`/`soft` are evaluated via `opts.hard()` further down.
        let StopOpts {
            hard: _,
            soft: _,
            wait,
            timeout_before_sigkill,
        } = opts;

        let daemon_state = self.daemon_state()?;
        let (_old_want, was_pid) = daemon_state.read();

        let was_running = self.is_running()?;

        // Set the want even if not running: the want might have been
        // `Up` from before a reboot. The second element of the
        // returned pair is kept as the reference state for the
        // graceful wait loop below.
        let (_, old_state) = retry(|| daemon_state.store_want(want));

        let mut sent_sigint = false;
        let mut sent_sigkill = false;
        let mut crashed = false;

        if was_running {
            if opts.hard(default_is_hard) {
                if let Some(session_pid) = was_pid {
                    let session_pid = Pid::from_raw(session_pid);
                    // The remaining steps only run if this call
                    // returns true.
                    if send_signal_to_all_processes_of_session(session_pid, Some(Signal::SIGINT))? {
                        sent_sigint = true;
                        let sleep_duration_ms: u64 = 1000;
                        // Number of polling rounds fitting into the
                        // timeout (which is given in seconds).
                        let num_sleeps =
                            u64::from(timeout_before_sigkill) * 1000 / sleep_duration_ms;
                        // Labeled block: breaking out of it skips the
                        // SIGKILL below.
                        'outer: {
                            for _ in 0..num_sleeps {
                                sleep(Duration::from_millis(sleep_duration_ms));
                                // Presumably a pure probe (no signal
                                // given) returning whether any process
                                // of the session is left -- confirm
                                // against the helper.
                                if !send_signal_to_all_processes_of_session(session_pid, None)? {
                                    break 'outer;
                                }
                            }
                            // Timeout expired with the probe still
                            // returning true.
                            send_signal_to_all_processes_of_session(
                                session_pid,
                                Some(Signal::SIGKILL),
                            )?;
                            sent_sigkill = true;
                        }
                        // Remove the pid
                        daemon_state.store(want, None);
                    }
                } else {
                    // DaemonIsRunningButHaveNoPid -- can reconstruct from report
                }
                // XX todo: write a "daemon stopped" message to log? From
                // here?  Or ignore signals in logging child and log it on
                // pipe close?
            } else {
                // Graceful stop or restart.
                if wait {
                    let mut i = 0;
                    loop {
                        sleep(Duration::from_millis(500));
                        // Do not just check if pid is none (daemon
                        // should be deleting pid as it goes down):
                        // restart action just sets another pid. Wait,
                        // actually does not change the pid if
                        // implemented by daemon re-exec'ing itself!
                        // But it will change a DaemonWant::Restart
                        // into a DaemonWant::Up. Also if any other
                        // actor changes the want we should stop,
                        // too. Thus, stop on *any* change of daemon
                        // state.
                        if daemon_state.access.load() != old_state {
                            break;
                        }
                        // Don't fully trust pid state changes
                        // (e.g. daemon crashing instead of shutting
                        // down cleanly), thus: check the lock on
                        // every 20th round, i.e. every 10 seconds,
                        // starting with the first round.
                        if i % 20 == 0 {
                            if !self.is_running()? {
                                // Should actually never happen for
                                // DaemonWant::Restart, right? Would
                                // indicate a crash, hence:
                                crashed = true;
                                break;
                            }
                        }
                        i += 1;
                    }
                }
            }
        }
        Ok(StopReport {
            was_pid,
            was_running,
            sent_sigint,
            sent_sigkill,
            crashed,
        })
    }
707
708    /// Note: must be run while there are no running threads,
709    /// otherwise panics! Returns the result of the `run` procedure in
710    /// the child, but nothing in the parent.
711    fn start(self) -> Result<ExecutionResult, DaemonError> {
712        if self
713            .opts
714            .restart_on_failures
715            .eval_with_default(self.restart_on_failures_default)
716        {
717            // Wrap `run`
718            let Daemon {
719                opts,
720                restart_on_failures_default: _,
721                restart_opts,
722                timestamp_opts,
723                paths,
724                other_restart_checks,
725                run,
726                local_time_default,
727            } = self;
728
729            let run = |daemon_check_exit: DaemonCheckExit<Other>| -> Result<()> {
730                let mut opts = restart_opts.unwrap_or_else(Default::default);
731                opts.prefix = "daemon service process restart ".into();
732                forking_loop(
733                    opts,
734                    || -> Result<()> { run(daemon_check_exit.clone()) },
735                    || daemon_check_exit.want_exit(),
736                );
737                Ok(())
738            };
739
740            // The wrapper does not need yet another layer for
741            // restarting (although, the `_start` method ignores that
742            // anyway)
743            let opts = DaemonOpts {
744                restart_on_failures: RestartOnFailures {
745                    restart_on_failures: false,
746                    no_restart_on_failures: true,
747                },
748                ..opts
749            };
750
751            Daemon {
752                opts,
753                restart_on_failures_default: false,
754                restart_opts: None,
755                timestamp_opts,
756                paths,
757                other_restart_checks,
758                run,
759                local_time_default,
760            }
761            ._start()
762        } else {
763            self._start()
764        }
765    }
766
    /// Starts the daemon without any restart supervision: takes the
    /// exclusive lock, forks, and runs `self.run` in the child.
    ///
    /// Returns
    /// * `ExecutionResult::initiator()` in the parent after a
    ///   successful fork, and also when the lock is already held
    ///   (then only the wanted status may be changed, see below);
    /// * `ExecutionResult::daemon(..)` in the child, once `run` has
    ///   returned `Ok`;
    /// * an error if creating the directories, opening the daemon
    ///   state, locking (other than "already locked"), `fork`,
    ///   `setsid`, redirecting to the logger, or `run` fails.
    fn _start(self) -> Result<ExecutionResult, DaemonError> {
        self.create_dirs()?;

        let daemon_state = self.daemon_state()?;
        let (current_want, current_pid) = daemon_state.read();

        // Try to get exclusive `is_running` lock. This can fail if
        // unlucky and a concurrent process tests with the shared
        // lock, thus retry.
        let lock_path = self.lock_path();
        let mut is_running_lock = {
            // Retry less often if there is indication that the daemon
            // is running as then failures are expected.
            let attempts =
                NonZeroU32::try_from(if current_want == DaemonWant::Up && current_pid.is_some() {
                    3
                } else {
                    30
                })
                .expect("nonzero");

            // NOTE(review): the `10` is presumably the pause between
            // attempts -- confirm unit and meaning against `retry_n`.
            match retry_n(attempts, 10, || file_lock_nonblocking(&lock_path, true)) {
                Ok(lock) => lock,
                Err(e) => match e {
                    // Still locked after all attempts.
                    FileLockError::AlreadyLocked => {
                        match current_want {
                            DaemonWant::Down => {
                                // Signal that we want it to again be
                                // up; still have it effect the
                                // restart that would have happened
                                // anyway given more time.
                                retry(|| daemon_state.store_want(DaemonWant::Restart));
                            }
                            DaemonWant::Up => (),
                            DaemonWant::Restart => (),
                        }
                        // XX have a report as with stop?
                        return Ok(ExecutionResult::initiator());
                    }
                    _ => {
                        return Err(DaemonError::LockError {
                            lock_path: lock_path.into(),
                            error: e.to_string(),
                        })
                    }
                },
            }
        };

        daemon_state.want_starting();

        if let Some(_pid) = easy_fork().map_err(|error| DaemonError::ErrnoError {
            context: "fork",
            error,
        })? {
            // In the parent (the initiator).
            //
            // The child is holding onto the locks; apparently flock
            // acts globally when on the same filehandle, so we have
            // to disable the locks here in the parent.
            is_running_lock.leak();

            Ok(ExecutionResult::initiator())
        } else {
            // In the child (the daemon).
            //
            // Start a new session, so that signals can be sent to
            // the whole group and will kill child processes, too.
            let session_pid = setsid().map_err(|error| DaemonError::ErrnoError {
                context: "setsid",
                error,
            })?;

            // Now write the new pid / session group leader to the
            // state file
            daemon_state.store(DaemonWant::Up, Some(session_pid.into()));

            let logger = self.to_logger();
            logger.redirect_to_logger(session_pid)?;

            // Printed after the redirection above, so presumably
            // this ends up in the log -- confirm against `Logger`.
            eprintln!("daemon {session_pid} started");

            // Run the service code; it gets read access to the
            // daemon state plus the additional restart checks.
            (self.run)(DaemonCheckExit(Some((
                DaemonStateReader(&daemon_state),
                self.other_restart_checks,
            ))))?;

            Ok(ExecutionResult::daemon(DaemonResult { daemon_state }))
        }
    }
853
854    pub fn status_string(&self, additional_info: bool) -> anyhow::Result<Cow<'static, str>> {
855        let daemon_state = self.daemon_state()?;
856        let is_running = self.is_running()?;
857        let (want, pid) = daemon_state.read();
858        let is = if is_running { "running" } else { "stopped" };
859        if additional_info {
860            let pid_string = match pid {
861                Some(pid) => {
862                    format!("pid: {pid}, ")
863                }
864                None => "".into(),
865            };
866            Ok(format!("{is} ({pid_string}want: {want:?})").into())
867        } else {
868            Ok(is.into())
869        }
870    }
871
872    pub fn print_status(&self, additional_info: bool) -> anyhow::Result<()> {
873        let s = self.status_string(additional_info)?;
874        (|| -> Result<()> {
875            let mut out = stdout().lock();
876            out.write_all(s.as_bytes())?;
877            out.write_all(b"\n")?;
878            out.flush()?;
879            Ok(())
880        })()
881        .context("printing to stdout")
882    }
883
884    /// Note: actions involving forking a new instance must be run
885    /// while there are no running threads--they panic otherwise!
886    pub fn execute(
887        self,
888        mode: DaemonMode,
889        default_is_hard: bool,
890    ) -> Result<ExecutionResult, DaemonError> {
891        match mode {
892            DaemonMode::Run => {
893                (self.run)(DaemonCheckExit(None))?;
894                Ok(ExecutionResult::run())
895            }
896            DaemonMode::Start => Ok(self.start()?),
897            DaemonMode::Stop(opts) => {
898                let _report = self.stop_or_restartstop(DaemonWant::Down, opts, default_is_hard)?;
899                Ok(ExecutionResult::initiator())
900            }
901            DaemonMode::Restart(opts) => {
902                let StopReport {
903                    was_pid: _,
904                    was_running,
905                    sent_sigint,
906                    sent_sigkill,
907                    crashed,
908                } = self.stop_or_restartstop(DaemonWant::Restart, opts, default_is_hard)?;
909
910                if !was_running || sent_sigint || sent_sigkill || crashed {
911                    self.start()
912                } else {
913                    Ok(ExecutionResult::initiator())
914                }
915            }
916            DaemonMode::Status => {
917                self.print_status(true)?;
918                Ok(ExecutionResult::initiator())
919            }
920            DaemonMode::ShortStatus => {
921                self.print_status(false)?;
922                Ok(ExecutionResult::initiator())
923            }
924            DaemonMode::STOP => {
925                self.send_signal(Some(SIGSTOP))?;
926                Ok(ExecutionResult::initiator())
927            }
928            DaemonMode::CONT => {
929                self.send_signal(Some(SIGCONT))?;
930                Ok(ExecutionResult::initiator())
931            }
932            DaemonMode::KILL => {
933                self.send_signal(Some(SIGKILL))?;
934                Ok(ExecutionResult::initiator())
935            }
936            DaemonMode::Log => {
937                // Once again.
938                let cmd = match std::env::var_os("PAGER") {
939                    Some(path) => cstring(path.as_bytes())?,
940                    None => cstring("less")?,
941                };
942                execvp(
943                    &cmd,
944                    &[
945                        &cmd,
946                        &cstring(
947                            self.to_logger()
948                                .current_log_path()
949                                .into_os_string()
950                                .as_bytes(),
951                        )?,
952                    ],
953                )
954                .with_context(|| anyhow!("exec'ing {cmd:?} command"))?;
955                unreachable!("execv never returns Ok")
956            }
957            DaemonMode::Logf => {
958                let cmd = cstring("tail")?;
959                execvp(
960                    &cmd,
961                    &[
962                        &cmd,
963                        &cstring("-f")?,
964                        &cstring(
965                            self.to_logger()
966                                .current_log_path()
967                                .into_os_string()
968                                .as_bytes(),
969                        )?,
970                    ],
971                )
972                .context("exec'ing `tail` command")?;
973                unreachable!("execv never returns Ok")
974            }
975        }
976    }
977}
978
979#[derive(Debug)]
980pub struct DaemonStateAccessor {
981    path: Arc<Path>,
982    access: IPCAtomicU64,
983}
984
/// What state we want the daemon to be in
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DaemonWant {
    /// The daemon should not be running
    Down,
    /// The daemon should be running
    Up,
    /// Signals to the daemon that we want it to re-execute itself
    Restart,
}

// Operations for daemon state, keep private!
impl DaemonWant {
    /// Tag stored in the upper half of the state value for `Down`
    const TAG_DOWN: u32 = b'd' as u32;
    /// Tag stored in the upper half of the state value for `Up`
    const TAG_UP: u32 = b'u' as u32;
    /// Tag stored in the upper half of the state value for `Restart`
    const TAG_RESTART: u32 = b'r' as u32;

    /// Decode the want from a DaemonState AtomicU64 value. Only the
    /// upper 32 bits are looked at; the lower half is ignored.
    ///
    /// # Panics
    ///
    /// Panics, mentioning `path`, if the upper half is not one of the
    /// known tags.
    fn from_u64(want: u64, path: &Arc<Path>) -> Self {
        let wantu32 = (want >> 32) as u32;
        match wantu32 {
            Self::TAG_DOWN => DaemonWant::Down,
            Self::TAG_UP => DaemonWant::Up,
            Self::TAG_RESTART => DaemonWant::Restart,
            _ => panic!(
                "got invalid upper value {wantu32} as DaemonWant value \
                 from DaemonState file {path:?}"
            ),
        }
    }

    /// Encode the want for a DaemonState AtomicU64: the tag sits in
    /// the upper 32 bits, the lower 32 bits are zero.
    fn to_u64(self) -> u64 {
        let tag = match self {
            DaemonWant::Down => Self::TAG_DOWN,
            DaemonWant::Up => Self::TAG_UP,
            DaemonWant::Restart => Self::TAG_RESTART,
        };
        u64::from(tag) << 32
    }

    /// Whether the value warrants exiting from a daemon: true for
    /// `Down` and `Restart`, false for `Up`.
    pub fn wants_exit(self) -> bool {
        match self {
            DaemonWant::Down | DaemonWant::Restart => true,
            DaemonWant::Up => false,
        }
    }
}
1033
1034impl DaemonStateAccessor {
1035    pub fn open(path: Arc<Path>) -> Result<Self, IPCAtomicError> {
1036        let access = IPCAtomicU64::open(&path, (b'd' as u64) << 32)?;
1037        Ok(Self { path, access })
1038    }
1039
1040    /// The second result is the pid if set. A pid present does not
1041    /// imply that the daemon is up--have to also check flock.
1042    pub fn read(&self) -> (DaemonWant, Option<i32>) {
1043        let v: u64 = self.access.load();
1044        let lower: u32 = v as u32;
1045        let pid = lower as i32;
1046        let pid = if pid == 0 { None } else { Some(pid) };
1047
1048        let want = DaemonWant::from_u64(v, &self.path);
1049        (want, pid)
1050    }
1051
1052    pub fn want(&self) -> DaemonWant {
1053        self.read().0
1054    }
1055
1056    fn store(&self, want: DaemonWant, pid: Option<i32>) {
1057        let pid: u32 = pid.unwrap_or(0) as u32;
1058        let want = match want {
1059            DaemonWant::Down => b'd',
1060            DaemonWant::Up => b'u',
1061            DaemonWant::Restart => b'r',
1062        } as u32;
1063        let val = ((want as u64) << 32) + (pid as u64);
1064        self.access.store(val);
1065    }
1066
1067    /// Change want while keeping pid field value. Returns the (old,
1068    /// new) value on success, or the newly attempted store in the
1069    /// error case, which means some change happened in the mean time,
1070    /// you could retry but may want to retry on a higher level
1071    /// instead.
1072    fn store_want(&self, want: DaemonWant) -> Result<(u64, u64), u64> {
1073        let wantu64 = want.to_u64();
1074        let atomic = self.access.atomic();
1075        let ordering = Ordering::SeqCst;
1076
1077        let old = atomic.load(ordering);
1078        let new = (old & (u32::MAX as u64)) | wantu64;
1079        let got = atomic.compare_exchange(old, new, ordering, ordering)?;
1080        // just testing my understanding--always guaranteed, right?
1081        if !(got == old) {
1082            _ = writeln!(
1083                &mut stderr(),
1084                "got != old, {} vs. {} at {}:{}",
1085                got,
1086                old,
1087                file!(),
1088                line!()
1089            );
1090        }
1091        Ok((old, new))
1092    }
1093
1094    pub fn want_starting(&self) {
1095        self.store(DaemonWant::Up, None);
1096    }
1097}
1098
1099#[derive(Debug, Clone)]
1100pub struct DaemonStateReader<'t>(&'t DaemonStateAccessor);
1101
1102impl<'t> DaemonStateReader<'t> {
1103    pub fn want(&self) -> DaemonWant {
1104        self.0.want()
1105    }
1106
1107    /// Whether the daemon should exit due to wanted Stop or Restart.
1108    pub fn want_exit(&self) -> bool {
1109        self.want().wants_exit()
1110    }
1111}
1112
1113#[derive(Debug, Clone)]
1114pub struct DaemonCheckExit<'t, Other: Deref<Target: WarrantsRestart> + Clone>(
1115    Option<(DaemonStateReader<'t>, Other)>,
1116);
1117
1118impl<'t, Other: Deref<Target: WarrantsRestart> + Clone> DaemonCheckExit<'t, Other> {
1119    pub fn want_exit(&self) -> bool {
1120        if let Some((daemon_state_reader, other)) = &self.0 {
1121            daemon_state_reader.want_exit() || {
1122                if other.warrants_restart() {
1123                    // Already store the change in want, so that
1124                    // forking_loop or whichever upper levels don't
1125                    // have to re-evaluate secondary checks again (and
1126                    // trigger duplicate notifications). (Also, maybe
1127                    // this is better in case the app crashes while
1128                    // restarting?)
1129                    retry(|| daemon_state_reader.0.store_want(DaemonWant::Restart));
1130                    true
1131                } else {
1132                    false
1133                }
1134            }
1135        } else {
1136            false
1137        }
1138    }
1139}