diff --git a/020_sanoid.sh b/020_sanoid.sh index 7ad3dcd..a0ddb8a 100755 --- a/020_sanoid.sh +++ b/020_sanoid.sh @@ -13,22 +13,26 @@ echo "override them in /etc/sanoid/sanoid.conf" echo "Installed files:" cat <<-EOF - syncoid /usr/sbin/ - sanoid /usr/sbin/ - findoid /usr/sbin/ - sanoid.defaults.conf /usr/share/sanoid/ - debian/sanoid-prune.service /lib/systemd/system - CHANGELIST /usr/share/doc/sanoid/changelog +syncoid /usr/sbin/ +sanoid /usr/sbin/ +findoid /usr/sbin/ +sanoid.defaults.conf /usr/share/sanoid/ +debian/sanoid-prune.service /lib/systemd/system +CHANGELIST /usr/share/doc/sanoid/changelog EOF +# Both sanoid and syncoid are oneshot processes so it makes little sense to +# provide an init file, cron is just fine. In this case the systemd file is there +# because systemd decided to manage cron tasks. +# # Cronjob for non-systemd systems: every 15 minutes. # If you require a different interval, you will need to disable the # timer or the cron job according to your system configuration. conf_print_sanoid_cron() { cat <<-EOF - PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin +PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin - */15 * * * * root [ -f /etc/sanoid/sanoid.conf ] && if [ ! -d /run/systemd/system ]; then TZ=UTC /usr/sbin/sanoid --cron --quiet; fi +*/15 * * * * root [ -f /etc/sanoid/sanoid.conf ] && if [ ! -d /run/systemd/system ]; then TZ=UTC /usr/sbin/sanoid --cron --quiet; fi EOF } conf_print_sanoid_cron | sudo tee /etc/cron.d/sanoid @@ -36,9 +40,9 @@ conf_print_sanoid_cron | sudo tee /etc/cron.d/sanoid # Do the same for syncoid for backups. conf_print_syncoid_cron() { cat <<-EOF - PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin +PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin - */15 * * * * root [ -f /etc/sanoid/syncoid.conf ] && if [ ! -d /run/systemd/system ]; then TZ=UTC /usr/sbin/syncoid --cron --quiet; fi +*/15 * * * * root [ -f /etc/sanoid/syncoid.conf ] && if [ ! 
-d /run/systemd/system ]; then TZ=UTC /usr/sbin/syncoid --cron --quiet; fi EOF } # conf_print_syncoid_cron | sudo tee /etc/cron.d/syncoid @@ -48,283 +52,239 @@ sudo mkdir -p /etc/sanoid/ conf_print_sanoid() { cat <<-EOF - [rpool/docker] - use_template = ignore +#-- Organisational datasets +# DATA +[rpool/DATA] + use_template = ignore + recursive = yes + process_children_only = yes - [rpool/archive] - use_template = ignore +# ROOT +[rpool/ROOT] + use_template = ignore + recursive = yes + process_children_only = yes - [rpool/swap] - use_template = ignore +#-- END - [rpool/tmp] - use_template = ignore +# These datasets contain the docker zsh backing store "graph" (layers). +# Just restore broken container with docker-compose down && docker-compose up +[rpool/docker] + use_template = ignore + recursive = yes - [rpool/ROOT/devuan-1] - use_template = root - recursive = yes +# Docker persistent data +[rpool/DATA/docker-volumes] + use_template = docker-persistent + recursive = yes - # rpool/ROOT/devuan-1/opt - # rpool/ROOT/devuan-1/usr - # rpool/ROOT/devuan-1/usr_local - # rpool/ROOT/devuan-1/usr_share - # rpool/ROOT/devuan-1/var - # rpool/ROOT/devuan-1/var_lib +[rpool/archive] + use_template = ignore + recursive = yes - # Specific override for Virtual Machines to use scripts - [rpool/ROOT/devuan-1/var_lib_virt] - use_template = root - recursive = no - pre_snapshot_script = /usr/local/bin/virt-freeze-all.sh - post_snapshot_script = /usr/local/bin/virt-thaw-all.sh +[rpool/swap] + use_template = ignore + recursive = yes - # -- User Data -- - [rpool/home] - use_template = production - recursive = yes +[rpool/tmp] + use_template = ignore + recursive = yes - [rpool/space] - use_template = production +[rpool/ROOT/devuan-1] + use_template = root + recursive = yes - ############################# - # templates below this line # - ############################# +# rpool/ROOT/devuan-1/opt +# rpool/ROOT/devuan-1/usr +# rpool/ROOT/devuan-1/usr_local +# rpool/ROOT/devuan-1/usr_share +# 
rpool/ROOT/devuan-1/var +# rpool/ROOT/devuan-1/var_lib - [template_production] - frequently = 0 - hourly = 36 - daily = 30 - monthly = 3 - yearly = 0 - autosnap = yes - autoprune = yes +# Specific override for Virtual Machines to use scripts +[rpool/ROOT/devuan-1/var_lib_virt] + use_template = root + recursive = yes + pre_snapshot_script = /usr/local/bin/sanoid_virt-freeze-all.sh + post_snapshot_script = /usr/local/bin/sanoid_virt-thaw-all.sh - [template_root] - # Root changes fast; shorter history often suffices - hourly = 24 - daily = 7 - monthly = 1 - yearly = 0 - autosnap = yes - autoprune = yes +# -- User Data -- +[rpool/home] + use_template = production + recursive = yes - [template_ignore] - autoprune = no - autosnap = no - monitor = no +[rpool/space] + use_template = production - [template_backup] - autoprune = yes - frequently = 0 - hourly = 30 - daily = 90 - monthly = 12 - yearly = 0 +############################# +# templates below this line # +############################# - ### don't take new snapshots - snapshots on backup - ### datasets are replicated in from source, not - ### generated locally - autosnap = no +[template_production] + frequently = 0 + hourly = 36 + daily = 30 + monthly = 3 + yearly = 0 + autosnap = yes + autoprune = yes - ### monitor hourlies and dailies, but don't warn or - ### crit until they're over 48h old, since replication - ### is typically daily only - hourly_warn = 2880 - hourly_crit = 3600 - daily_warn = 48 - daily_crit = 60 +[template_root] + # Root changes fast; shorter history often suffices + hourly = 24 + daily = 7 + monthly = 1 + yearly = 0 + autosnap = yes + autoprune = yes - [template_hotspare] - autoprune = yes - frequently = 0 - hourly = 30 - daily = 90 - monthly = 3 - yearly = 0 +[template_ignore] + autoprune = no + autosnap = no + monitor = no - ### don't take new snapshots - snapshots on backup - ### datasets are replicated in from source, not - ### generated locally - autosnap = no 
+############################## +# Docker Persistent Template # +############################## +[template_docker-persistent] + # Frequent snapshots for active databases/configs + frequently = 0 + hourly = 24 + daily = 7 + monthly = 1 + yearly = 0 - ### monitor hourlies and dailies, but don't warn or - ### crit until they're over 4h old, since replication - ### is typically hourly only - hourly_warn = 4h - hourly_crit = 6h - daily_warn = 2d - daily_crit = 4d + # Safety checks + autosnap = yes + autoprune = yes - [template_scripts] - ### information about the snapshot will be supplied as environment variables, - ### see the README.md file for details about what is passed when. - ### run script before snapshot - pre_snapshot_script = /path/to/script.sh - ### run script after snapshot - post_snapshot_script = /path/to/script.sh - ### run script before pruning snapshot - pre_pruning_script = /path/to/script.sh - ### run script after pruning snapshot - pruning_script = /path/to/script.sh - ### don't take an inconsistent snapshot (skip if pre script fails) - #no_inconsistent_snapshot = yes - ### run post_snapshot_script when pre_snapshot_script is failing - #force_post_snapshot_script = yes - ### limit allowed execution time of scripts before continuing (<= 0: infinite) - script_timeout = 5 + # Don't take a snapshot if the dataset hasn't changed + # (Saves metadata overhead) + # skip_hourless = yes + pre_snapshot_script = /usr/local/bin/sanoid_zfs-skip-empty.sh + +#-- END + +[template_backup] + autoprune = yes + frequently = 0 + hourly = 30 + daily = 90 + monthly = 12 + yearly = 0 + + ### don't take new snapshots - snapshots on backup + ### datasets are replicated in from source, not + ### generated locally + autosnap = no + + ### monitor hourlies and dailies, but don't warn or + ### crit until they're over 48h old, since replication + ### is typically daily only + hourly_warn = 2880 + hourly_crit = 3600 + daily_warn = 48 + daily_crit = 60 + +#-- END + 
+[template_hotspare] + autoprune = yes + frequently = 0 + hourly = 30 + daily = 90 + weekly = 4 + monthly = 3 + yearly = 0 + + ### don't take new snapshots - snapshots on backup + ### datasets are replicated in from source, not + ### generated locally + autosnap = no + + ### monitor hourlies and dailies, but don't warn or + ### crit until they're over 4h old, since replication + ### is typically hourly only + hourly_warn = 4h + hourly_crit = 6h + daily_warn = 2d + daily_crit = 4d EOF } conf_print_sanoid | sudo tee /etc/sanoid/sanoid.conf -# Both sanoid and synmcoid are oneshot processes so it makes little sense to -# provide an init file, cron is just fine. In this case the systemd file is there -# because systemd decided to manage cron tasks. -# Generated using: -# https://raw.githubusercontent.com/akhilvij/systemd-to-sysvinit-converter/master/converter.py -# python2 converter /usr/src/sanoid-2.2.0/sanoid.service > sanoid -conf_print_sanoid_init() { - cat <<-'EOF' - #!/bin/sh - ### BEGIN INIT INFO - # Provides: sanoid - # Required-Start: $syslog $local_fs $remote_fs - # Required-Stop: $syslog $local_fs $remote_fs - # Default-Start: 2 3 4 5 - # Default-Stop: 0 1 6 - # Short-Description: Snapshot ZFS filesystems - ### END INIT INFO +######################## +# Pre-snapshot scripts # +######################## - . /lib/lsb/init-functions - prog=sanoid - PIDFILE=/var/run/$prog.pid - DESC="Snapshot ZFS filesystems" - start() { - log_daemon_msg "Starting $DESC" "$prog" - start_daemon -p $PIDFILE /usr/sbin/sanoid --take-snapshots --verbose - if [ $? -ne 0 ]; then - log_end_msg 1 - exit 1 - fi - if [ $? -eq 0 ]; then - log_end_msg 0 - fi - exit 0 - } +# In ZFS, even if no data has changed, creating a snapshot still consumes a +# small amount of space for metadata and adds an entry to the ZFS history. +# If you have hundreds of datasets being snapshotted every 15 minutes, this +# "metadata bloat" can make commands like zfs list -t snapshot feel sluggish +# over time. 
If you think this is an issue for you, use zfs-skip-empty.sh as +# a pre_snapshot_script +conf_print_skip_empty() { +    cat <<'EOF' +#!/bin/bash + +# Usage: ./sanoid_zfs-skip-empty.sh <dataset> <threshold> +# or +# Add this to your /etc/sanoid/sanoid.conf to fire this script. - stop() { - log_daemon_msg "Stopping $DESC" "$prog" - killproc -p $PIDFILE /usr/sbin/sanoid - if [ $? -ne 0 ]; then - log_end_msg 1 - exit 1 - fi - if [ $? -eq 0 ]; then - log_end_msg 0 - fi - } +# [tank/important_data] +# use_template = production +# # Only snapshot if more than 5MB changed +# pre_snapshot_script = /usr/local/bin/sanoid_zfs-skip-empty.sh 5M +DATASET=$1 +RAW_THRESHOLD=$2 - force_reload() { - stop - start - } +convert_to_bytes() { +    local number=$(echo "$1" | grep -oE '^[0-9.]+') +    local unit=$(echo "$1" | grep -oi '[KMGT]' | tr '[:lower:]' '[:upper:]') - case "$1" in - start) - start - ;; - stop) - stop - ;; - force-reload) - force_reload - ;; - restart) - stop - start - ;; +    case "$unit" in +        K) awk "BEGIN { printf \"%.0f\", $number * 1024 }" ;; +        M) awk "BEGIN { printf \"%.0f\", $number * 1024^2 }" ;; +        G) awk "BEGIN { printf \"%.0f\", $number * 1024^3 }" ;; +        T) awk "BEGIN { printf \"%.0f\", $number * 1024^4 }" ;; +        *) printf "%.0f" "$number" ;; esac +} + +if [[ -z "$DATASET" || -z "$RAW_THRESHOLD" ]]; then +    logger -t sanoid "Threshold Error: Usage: $0 <dataset> <threshold>" +    exit 1 +fi + +if ! zfs list -H "$DATASET" >/dev/null 2>&1; then +    logger -t sanoid "Threshold Error: Dataset $DATASET not found." +    exit 1 +fi + +THRESHOLD=$(convert_to_bytes "$RAW_THRESHOLD") +WRITTEN_BYTES=$(zfs get -Hp -o value written "$DATASET") + +if [[ "$WRITTEN_BYTES" -lt "$THRESHOLD" ]]; then +    WRITTEN_HUMAN=$(zfs get -H -o value written "$DATASET") +    # Optional: Comment out the logger below if your logs get too noisy +    logger -t sanoid "Skipping $DATASET: Written $WRITTEN_HUMAN < Threshold $RAW_THRESHOLD." 
+ exit 1 +fi + +exit 0 EOF } -# Sandoid doesn't ran as a daemon it runs vi cron -# conf_print_sanoid_init | sudo tee /etc/init.d/sanoid -# sudo chmod +x /etc/init.d/sanoid +conf_print_skip_empty | sudo tee /usr/local/bin/sanoid_zfs-skip-empty.sh -# Generated using: -# https://raw.githubusercontent.com/akhilvij/systemd-to-sysvinit-converter/master/converter.py -# python2 converter /usr/src/sanoid-2.2.0/sanoid-prune.service > sanoid-prune -conf_print_sanoid-prune_init() { - cat <<-'EOF' - #!/bin/sh - ### BEGIN INIT INFO - # Provides: sanoid-prune - # Required-Start: $syslog $local_fs $remote_fs - # Required-Stop: $syslog $local_fs $remote_fs - # Short-Description: Prune ZFS snapshots - ### END INIT INFO +sudo chmod +x /usr/local/bin/sanoid_zfs-skip-empty.sh +# VM Consistency (The "Freeze/Thaw" Logic) - . /lib/lsb/init-functions - prog=sanoid-prune - PIDFILE=/var/run/$prog.pid - DESC="Prune ZFS snapshots" - start() { - log_daemon_msg "Starting $DESC" "$prog" - start_daemon -p $PIDFILE /usr/sbin/sanoid --prune-snapshots --verbose - if [ $? -ne 0 ]; then - log_end_msg 1 - exit 1 - fi - if [ $? -eq 0 ]; then - log_end_msg 0 - fi - exit 0 - } - - stop() { - log_daemon_msg "Stopping $DESC" "$prog" - killproc -p $PIDFILE /usr/sbin/sanoid - if [ $? -ne 0 ]; then - log_end_msg 1 - exit 1 - fi - if [ $? -eq 0 ]; then - log_end_msg 0 - fi - } - - force_reload() { - stop - start - - } - - case "$1" in - start) - start - ;; - stop) - stop - ;; - force-reload) - force_reload - ;; - restart) - stop - start - ;; - - *) - echo "$Usage: $prog {start|stop|force-reload|restart}" - exit 2 - esac -EOF -} -# Sandoid doesn't ran as a daemon it runs vi cron -# conf_print_sanoid-prune_init | sudo tee /etc/init.d/sanoid-prune -# sudo chmod +x /etc/init.d/sanoid-prune +# The inclusion of virt-freeze-all.sh and virt-thaw-all.sh to ensure data integrity: +# * Pre-snapshot: virsh domfsfreeze tells the Guest OS (via qemu-guest-agent) to flush its write buffers and temporarily pause I/O. 
+# * Snapshot: Sanoid takes an atomic ZFS snapshot. +# * Post-snapshot: virsh domfsthaw resumes I/O. +# Result: You get an "application-consistent" backup rather than a "crash-consistent" one. # Give sudo access to virsh or is part of the libvirt group. # qemu-guest-agent must be running in the vm @@ -341,7 +301,7 @@ if [ -z "$VM_NAME" ]; then fi # Check if the VM is running -STATE=$(virsh domstate "$VM_NAME" 2>/dev/null) +STATE=$(/usr/bin/virsh domstate "$VM_NAME" 2>/dev/null) if [ "$STATE" != "running" ]; then echo "VM $VM_NAME is not running or does not exist. Skipping freeze." exit 0 @@ -349,7 +309,7 @@ fi echo "Freezing filesystems for $VM_NAME..." # domfsfreeze returns the number of frozen filesystems on success -if virsh domfsfreeze "$VM_NAME" > /dev/null; then +if /usr/bin/virsh domfsfreeze "$VM_NAME" > /dev/null; then echo "Successfully frozen $VM_NAME." else echo "Error: Failed to freeze $VM_NAME. Ensure qemu-guest-agent is active." @@ -357,8 +317,8 @@ else fi EOF } -conf_print_virt_freeze | sudo tee /usr/local/bin/virt-freeze.sh -sudo chmod +x /usr/local/bin/virt-freeze.sh +conf_print_virt_freeze | sudo tee /usr/local/bin/sanoid_virt-freeze.sh +sudo chmod +x /usr/local/bin/sanoid_virt-freeze.sh #--- @@ -375,7 +335,7 @@ if [ -z "$VM_NAME" ]; then fi # Check if the VM is running -STATE=$(virsh domstate "$VM_NAME" 2>/dev/null) +STATE=$(/usr/bin/virsh domstate "$VM_NAME" 2>/dev/null) if [ "$STATE" != "running" ]; then echo "VM $VM_NAME is not running. Skipping unfreeze." exit 0 @@ -383,7 +343,7 @@ fi echo "Thawing filesystems for $VM_NAME..." # domfsthaw returns the number of thawed filesystems on success -if virsh domfsthaw "$VM_NAME" > /dev/null; then +if /usr/bin/virsh domfsthaw "$VM_NAME" > /dev/null; then echo "Successfully thawed $VM_NAME." else echo "Error: Failed to thaw $VM_NAME." 
@@ -391,8 +351,8 @@ else fi EOF } -conf_print_virt_unfreeze | sudo tee /usr/local/bin/virt-unfreeze.sh -sudo chmod +x /usr/local/bin/virt-unfreeze.sh +conf_print_virt_unfreeze | sudo tee /usr/local/bin/sanoid_virt-unfreeze.sh +sudo chmod +x /usr/local/bin/sanoid_virt-unfreeze.sh #--- @@ -402,7 +362,7 @@ conf_print_virt_thaw_all() { # /usr/local/bin/virt-thaw-all.sh # 1. Get running VM names, filtering out empty lines with awk -RUNNING_VMS=$(virsh list --state-running --name | awk 'NF') +RUNNING_VMS=$(/usr/bin/virsh list --state-running --name | awk 'NF') if [ -z "$RUNNING_VMS" ]; then echo "No running VMs found." @@ -415,11 +375,11 @@ for VM_NAME in $RUNNING_VMS; do # Use the native thaw command. # It handles the guest agent communication for you. - if virsh domfsthaw "$VM_NAME" > /dev/null 2>&1; then + if /usr/bin/virsh domfsthaw "$VM_NAME" > /dev/null 2>&1; then echo "Successfully thawed $VM_NAME." else # If native fails, we capture the error for the user - ERROR=$(virsh domfsthaw "$VM_NAME" 2>&1) + ERROR=$(/usr/bin/virsh domfsthaw "$VM_NAME" 2>&1) echo "Error thawing $VM_NAME: $ERROR" fi done @@ -427,8 +387,8 @@ done echo "Finished processing all VMs." EOF } -conf_print_virt_thaw_all | sudo tee /usr/local/bin/virt-thaw-all.sh -sudo chmod +x /usr/local/bin/virt-thaw-all.sh +conf_print_virt_thaw_all | sudo tee /usr/local/bin/sanoid_virt-thaw-all.sh +sudo chmod +x /usr/local/bin/sanoid_virt-thaw-all.sh #--- @@ -437,7 +397,7 @@ conf_print_virt_freeze-all() { #!/bin/bash # 1. Get running VM names, filtering out empty lines -RUNNING_VMS=$(virsh list --state-running --name | awk 'NF') +RUNNING_VMS=$(/usr/bin/virsh list --state-running --name | awk 'NF') if [ -z "$RUNNING_VMS" ]; then echo "No running VMs found." 
@@ -450,7 +410,7 @@ for VM_NAME in $RUNNING_VMS; do # Using the native virsh command is generally cleaner # It returns the number of frozen filesystems on success - if virsh domfsfreeze "$VM_NAME" > /dev/null 2>&1; then + if /usr/bin/virsh domfsfreeze "$VM_NAME" > /dev/null 2>&1; then echo "Successfully frozen $VM_NAME." else echo "Error: Could not freeze $VM_NAME. Check if QEMU Guest Agent is running." @@ -460,8 +420,8 @@ done echo "Finished processing all VMs." EOF } -conf_print_virt_freeze-all | sudo tee /usr/local/bin/virt-freeze-all.sh -sudo chmod +x /usr/local/bin/virt-freeze-all.sh +conf_print_virt_freeze-all | sudo tee /usr/local/bin/sanoid_virt-freeze-all.sh +sudo chmod +x /usr/local/bin/sanoid_virt-freeze-all.sh #---