Skip to content

Commit 6deb2e9

Browse files
Merge pull request #3731 from balena-os/soft-watchdog
add initial disk watchdog recipes and sources
2 parents a8a5037 + fa725ce commit 6deb2e9

File tree

13 files changed

+647
-1
lines changed

13 files changed

+647
-1
lines changed

meta-balena-common/recipes-core/packagegroups/packagegroup-resin.bb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,5 @@ RDEPENDS:${PN} += " \
3333
systemd-zram-swap \
3434
${@bb.utils.contains('BALENA_STORAGE', 'aufs', 'aufs-util-auplink', '', d)} \
3535
${BALENA_SUPERVISOR} \
36+
disk-watchdog \
3637
"
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
SUMMARY = "Disk watchdog service for monitoring disk health"
2+
DESCRIPTION = "A watchdog service that monitors disk health"
3+
LICENSE = "Apache-2.0"
4+
LIC_FILES_CHKSUM = "file://${BALENA_COREBASE}/COPYING.Apache-2.0;md5=89aea4e17d99a7cacdbeed46a0096b10"
5+
6+
SRC_URI = "git://github.com/balena-os/disk-watchdogd.git;branch=master;protocol=https"
7+
SRCREV = "1060e49da45d9bfda3047856d66bdc7c6b8c1911"
8+
9+
SRC_URI += "file://disk-watchdogd.service \
10+
file://disk-watchdog-boot-history.service \
11+
file://disk-watchdog-boot-history"
12+
13+
S = "${WORKDIR}/git"
14+
15+
WD_TEST_FILE ?= "${bindir}/disk-watchdogd"
16+
DISK_WD_BOOT_DIR ?= "/mnt/state/disk-watchdog"
17+
18+
DEPENDS += "systemd"
19+
RDEPENDS:${PN} += "systemd"
20+
RDEPENDS:${PN} += "os-helpers-fs bash"
21+
22+
# Build the daemon by invoking the "all" target through oe_runmake
# (the OpenEmbedded wrapper around make).
do_compile() {
23+
oe_runmake all
24+
}
25+
26+
inherit systemd
27+
28+
SYSTEMD_SERVICE:${PN} = "disk-watchdogd.service disk-watchdog-boot-history.service"
29+
SYSTEMD_AUTO_ENABLE = "enable"
30+
31+
# Install the daemon, the boot-history helper script and both systemd units,
# then resolve the @PLACEHOLDER@ tokens in the installed copies.
#
# The substitutions are applied to the files under ${D}, not to the unpacked
# sources in ${WORKDIR}: editing the sources in place would strip the
# placeholders on the first run, so a later do_install re-run (e.g. after
# changing DISK_WD_BOOT_DIR or WD_TEST_FILE) would silently keep stale values.
do_install() {
    install -d ${D}${bindir}
    install -m 0755 disk-watchdogd ${D}${bindir}/
    install -m 0755 ${WORKDIR}/disk-watchdog-boot-history ${D}${bindir}/

    install -d ${D}${systemd_unitdir}/system
    install -m 0644 ${WORKDIR}/disk-watchdogd.service ${D}${systemd_unitdir}/system/
    install -m 0644 ${WORKDIR}/disk-watchdog-boot-history.service ${D}${systemd_unitdir}/system/

    # Substitute paths in the installed service file. The daemon path follows
    # ${bindir} so it always matches where the binary was installed above.
    sed -i -e 's|@DAEMON_PATH@|${bindir}/disk-watchdogd|g' \
        -e 's|@WD_TEST_FILE@|${WD_TEST_FILE}|g' \
        -e 's|@OS_HELPERS_FS@|${libexecdir}/os-helpers-fs|g' \
        -e 's|@DISK_WD_BOOT_DIR@|${DISK_WD_BOOT_DIR}|g' \
        ${D}${systemd_unitdir}/system/disk-watchdogd.service

    # Substitute paths in the installed boot history script
    sed -i -e 's|@DISK_WD_BOOT_DIR@|${DISK_WD_BOOT_DIR}|g' \
        ${D}${bindir}/disk-watchdog-boot-history
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/bin/bash
2+
3+
# Disk watchdog boot history updater
4+
# This script runs once per boot to update the boot timestamp history
5+
6+
TIME_WINDOW=600 # 10 minutes
7+
MAX_BOOTS=3
8+
9+
# Read boot history, one timestamp per line
10+
# Print the stored boot history (one timestamp per line) on stdout.
# Prints nothing when the history file does not exist yet.
read_boot_history() {
    local history_path="@DISK_WD_BOOT_DIR@/boot-history"

    # Nothing recorded yet: emit no output.
    [[ -f "$history_path" ]] || return 0

    cat "$history_path"
}
16+
17+
# Write boot history, one timestamp per line
18+
# Replace the boot history file with the given content.
# Arguments:
#   1 - newline-separated list of boot timestamps
# Best effort: failures to create the directory or write the file are ignored
# so the calling unit never fails because of storage problems.
write_boot_history() {
    local history_path="@DISK_WD_BOOT_DIR@/boot-history"
    local new_content="$1"
    local history_dir

    history_dir="$(dirname "$history_path")"
    mkdir -p "$history_dir" 2>/dev/null || true
    echo "$new_content" > "$history_path" 2>/dev/null || true
}
24+
25+
# Disable the watchdog service by creating the disable file
26+
# Disable the watchdog service by creating the "disabled" marker file in the
# boot-history directory.
# Best effort: errors are ignored. The paths are quoted (as in the sibling
# helpers) so a substituted directory containing whitespace is not word-split.
disable_watchdog_service() {
    mkdir -p "@DISK_WD_BOOT_DIR@" 2>/dev/null || true
    touch "@DISK_WD_BOOT_DIR@/disabled" 2>/dev/null || true
}
30+
31+
# Update boot history with current boot timestamp
32+
# Record the current boot in the boot history and disable the watchdog when
# the device has booted too often within the time window.
# Globals:
#   TIME_WINDOW (read) - window in seconds within which boots are counted
#   MAX_BOOTS   (read) - boot count at which the watchdog gets disabled
# Uses read_boot_history, write_boot_history and disable_watchdog_service.
# Outputs: progress messages on stdout.
update_boot_history() {
    local current_time
    current_time=$(date +%s)
    local -a recent_boots=()
    local timestamp

    echo "Boot history updater: Processing boot at timestamp $current_time"

    # Process existing boots, keeping only recent ones.
    # "|| [[ -n ... ]]" also picks up a final line that lacks a newline.
    while IFS= read -r timestamp || [[ -n "$timestamp" ]]; do
        # The history lives on the very disk being monitored, so it may be
        # corrupt. Drop anything that is not a plain decimal number: feeding
        # it to arithmetic would raise a syntax error, abort this function
        # and leave the bad line in the file forever.
        [[ "$timestamp" =~ ^[0-9]+$ ]] || continue
        # 10# forces base 10 so a leading zero is not parsed as octal.
        if (( current_time - 10#$timestamp < TIME_WINDOW )); then
            recent_boots+=("$timestamp")
        fi
    done < <(read_boot_history)

    # Add current boot timestamp
    recent_boots+=("$current_time")

    # Write updated history, one timestamp per line
    local filtered_history
    filtered_history=$(printf '%s\n' "${recent_boots[@]}")
    write_boot_history "$filtered_history"

    # Count boots in the updated history
    local boot_count=${#recent_boots[@]}

    echo "Boot history updated successfully (boot #$boot_count in last $((TIME_WINDOW/60)) minutes)"

    # Check if we have too many boots
    if [[ $boot_count -ge $MAX_BOOTS ]]; then
        echo "Excessive boots detected ($boot_count >= $MAX_BOOTS) - disabling disk watchdog"
        disable_watchdog_service
    else
        echo "Boot count ($boot_count) below threshold ($MAX_BOOTS) - watchdog will be allowed to start"
    fi
}
64+
65+
# Main execution
66+
update_boot_history
67+
exit 0
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[Unit]
2+
Description=Disk watchdog boot history updater
3+
After=local-fs.target
4+
Before=disk-watchdogd.service
5+
6+
[Service]
7+
Type=oneshot
8+
ExecStart=/usr/bin/disk-watchdog-boot-history
9+
RemainAfterExit=yes
10+
11+
[Install]
12+
WantedBy=multi-user.target
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[Unit]
2+
Description=Disk watchdog service for monitoring disk health
3+
After=local-fs.target disk-watchdog-boot-history.service
4+
Requires=disk-watchdog-boot-history.service
5+
ConditionPathExists=!@DISK_WD_BOOT_DIR@/disabled
6+
7+
[Service]
8+
Type=notify
9+
ExecStart=/bin/sh -c '@DAEMON_PATH@ -v -f @WD_TEST_FILE@ -b $(source @OS_HELPERS_FS@ && get_sector_size /dev/disk/by-state/active)'
10+
WatchdogSec=30s
11+
StartLimitInterval=2min
12+
StartLimitBurst=3
13+
StartLimitAction=reboot-force
14+
Restart=on-failure
15+
RestartSec=5s
16+
17+
[Install]
18+
WantedBy=multi-user.target

meta-balena-common/recipes-support/os-helpers/os-helpers/os-helpers-fs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,16 @@ if [ -f "/usr/libexec/os-helpers-logging" ]; then
1616
. /usr/libexec/os-helpers-logging
1717
fi
1818

19+
# Get the physical sector size for a device
20+
# Arguments:
21+
# 1 - device path (e.g., /dev/sda1, /dev/mmcblk0p1)
22+
# Returns: sector size in bytes via stdout
23+
get_sector_size() {
	local _blkdev="$1"

	# Report the PHY-SEC column of the first row whose TYPE is "part" or
	# "crypt"; lsblk errors are discarded, leaving the output empty.
	lsblk -o NAME,PHY-SEC,TYPE "$_blkdev" 2>/dev/null \
		| awk '($3 == "part" || $3 == "crypt") { print $2; exit }'
}
28+
1929
# Wait for a file to appear with loop count limit.
2030
# Use-case example: wait for udev to create a filesystem symlink.
2131
# Arguments:
@@ -536,7 +546,7 @@ estimate_size_in_zram() {
536546
SAMPLE_SIZE_BYTES=$(expr 50 \* 1024 \* 1024)
537547

538548
_dev=$(df "$_file" | awk 'NR==2 {print $1}')
539-
_blk_sz=$(lsblk -o NAME,PHY-SEC,TYPE "$_dev" 2>/dev/null | awk '$3 == "part" {print $2}' | head -n1)
549+
_blk_sz=$(get_sector_size "$_dev")
540550
if [ -z "$_blk_sz" ] || [ "$_blk_sz" -eq 0 ]; then
541551
warn "Unable to determine block size, using 512 bytes"
542552
_blk_sz=512

tests/suites/os/suite.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,5 +477,6 @@ module.exports = {
477477
'./tests/swap',
478478
'./tests/internet-sharing',
479479
'./tests/safe-reboot',
480+
'./tests/disk-watchdog',
480481
],
481482
};
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
FROM balenalib/%%BALENA_ARCH%%-alpine:3.12-run
2+
3+
RUN apk add device-mapper e2fsprogs bash
4+
5+
COPY create-dm-flakey.sh /usr/bin/create-dm-flakey.sh
6+
RUN chmod +x /usr/bin/create-dm-flakey.sh
7+
8+
COPY entrypoint.sh /usr/bin/entrypoint.sh
9+
RUN chmod +x /usr/bin/entrypoint.sh
10+
11+
ENTRYPOINT ["/usr/bin/entrypoint.sh"]
12+
CMD ["sleep", "infinity"]
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
#!/usr/bin/env bash
2+
# Create a dm-flakey device on top of a loop-backed image and mount it.
3+
# Usage:
4+
# create-dm-flakey.sh <mount_point> <image_path> <dm_name>
5+
# create-dm-flakey.sh cleanup <mount_point> <image_path> <dm_name>
6+
# Exit codes:
7+
# 1: usage
8+
# 2: image creation failed
9+
# 3: loop setup failed
10+
# 4: mkfs failed
11+
# 5: dmsetup failed
12+
# 6: mount failed
13+
# 7: could not determine device size, or failed to create the test file
14+
15+
set -u
16+
17+
# Report a fatal error on stderr, tear down whatever was set up, and exit.
# Arguments:
#   1 - exit code
#   2 - error message
die() {
  local exit_code=$1
  local message=$2

  echo "ERROR: $message" >&2
  cleanup
  exit "$exit_code"
}
18+
19+
# Best-effort teardown of everything this script may have set up: the mount,
# the device-mapper target, the loop device and the backing image.
# Globals:
#   MOUNT_POINT, DM_NAME, IMG_PATH (read) - any of them may be unset
#   LOOP_DEV (read/written) - looked up from IMG_PATH when not already known
# Returns: always 0; individual failures are ignored.
cleanup() {
  # Remember the caller's errexit setting so it is restored afterwards rather
  # than unconditionally switched on.
  local restore_errexit=0
  if [[ $- == *e* ]]; then
    restore_errexit=1
  fi
  set +e

  if [[ -n ${MOUNT_POINT:-} ]] && mountpoint -q "$MOUNT_POINT"; then
    umount "$MOUNT_POINT" >/dev/null 2>&1
  fi
  if [[ -n ${DM_NAME:-} ]] && dmsetup info "$DM_NAME" >/dev/null 2>&1; then
    dmsetup remove "$DM_NAME" >/dev/null 2>&1
  fi

  # Determine loop device by variable or by image path.
  # "losetup -d" detaches and prints nothing, so it cannot be used for the
  # lookup. "losetup -a" lists "<loopdev>: ... <backing file>" with both
  # util-linux and BusyBox, so take the first entry that names the image.
  if [[ -z ${LOOP_DEV:-} && -n ${IMG_PATH:-} ]]; then
    LOOP_DEV=$(losetup -a 2>/dev/null \
      | awk -F: -v img="$IMG_PATH" 'index($0, img) { print $1; exit }')
  fi
  if [[ -n ${LOOP_DEV:-} ]] && losetup -a | grep -q "^$LOOP_DEV:"; then
    losetup -d "$LOOP_DEV" >/dev/null 2>&1
  fi

  # Remove backing image if provided
  if [[ -n ${IMG_PATH:-} && -f "$IMG_PATH" ]]; then
    rm -f "$IMG_PATH"
  fi

  if (( restore_errexit )); then
    set -e
  fi
  return 0
}
41+
42+
# Argument handling.
#   setup:    <mount_point> <image_path> <dm_name>           (exactly 3 args)
#   teardown: cleanup <mount_point> <image_path> <dm_name>   (at least 4 args)
# The cleanup form is validated on its own: with only three arguments the
# read of $4 below would abort with "unbound variable" under `set -u` instead
# of printing the usage text.
if [[ "${1:-}" == "cleanup" ]]; then
  if [[ $# -lt 4 ]]; then
    echo "Usage: $0 cleanup <mount_point> <image_path> <dm_name>" >&2
    exit 1
  fi
  MOUNT_POINT=$2
  IMG_PATH=$3
  DM_NAME=$4
  cleanup
  exit 0
fi

if [[ $# -ne 3 ]]; then
  echo "Usage: $0 <mount_point> <image_path> <dm_name> | $0 cleanup <mount_point> <image_path> <dm_name>" >&2
  exit 1
fi
59+
60+
MOUNT_POINT=$1
61+
IMG_PATH=$2
62+
DM_NAME=$3
63+
64+
mkdir -p "$MOUNT_POINT" || die 6 "Failed to create mount point $MOUNT_POINT"
65+
66+
# Create image if missing (100 MiB)
67+
if [[ ! -e "$IMG_PATH" ]]; then
68+
if ! dd if=/dev/zero of="$IMG_PATH" bs=1M count=10 status=none; then
69+
die 2 "Failed to create image at $IMG_PATH"
70+
fi
71+
fi
72+
73+
# Setup loop device
74+
LOOP_DEV=$(losetup -f) || die 3 "Failed to get free loop device"
75+
if ! losetup -P "$LOOP_DEV" "$IMG_PATH"; then
76+
die 3 "Failed to attach $IMG_PATH to $LOOP_DEV"
77+
fi
78+
79+
# Create filesystem on the loop device
80+
if ! mkfs.ext4 -F "$LOOP_DEV" >/dev/null; then
81+
die 4 "mkfs.ext4 failed on $LOOP_DEV"
82+
fi
83+
84+
if ! mount "$LOOP_DEV" "$MOUNT_POINT"; then
85+
die 6 "Failed to mount $LOOP_DEV on $MOUNT_POINT"
86+
fi
87+
dd if=/dev/urandom of="$MOUNT_POINT/test.bin" bs=1M count=5 status=none || die 7 "Failed to create test.bin on $MOUNT_POINT"
88+
if ! umount "$MOUNT_POINT"; then
89+
die 6 "Failed to unmount $MOUNT_POINT"
90+
fi
91+
92+
# Determine size in 512B sectors
93+
SECTORS=$(blockdev --getsz "$LOOP_DEV" 2>/dev/null) || true
94+
[[ -n "$SECTORS" && "$SECTORS" =~ ^[0-9]+$ ]] || die 7 "Could not determine size for $LOOP_DEV"
95+
96+
# Create dm-flakey mapping: 1s up / 2s down (table fields: <dev> <offset> <up interval> <down interval>)
97+
TABLE="0 $SECTORS flakey $LOOP_DEV 0 1 2"
98+
if ! echo "$TABLE" | dmsetup create "$DM_NAME"; then
99+
die 5 "dmsetup create failed for $DM_NAME"
100+
fi
101+
102+
# Mount the flakey device
103+
if ! mount "/dev/mapper/$DM_NAME" "$MOUNT_POINT"; then
104+
die 6 "Failed to mount /dev/mapper/$DM_NAME on $MOUNT_POINT"
105+
fi
106+
107+
# Leave devices mounted/active; caller is responsible for teardown.
108+
exit 0
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[Service]
2+
ExecStart=
3+
ExecStart=/bin/sh -c '/usr/bin/disk-watchdogd -v -f /mnt/state/flakey-mount/test.bin -b 512'
4+
WatchdogSec=
5+
WatchdogSec=2

0 commit comments

Comments
 (0)