diff --git a/bootstrap.sh b/bootstrap.sh
index 7930699..1faad27 100644
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,358 +1,385 @@
 #!/bin/bash
 
 ###
 # Part of a stupid script to backup some stuff
 #
 # This bootstrap.sh file does nothing by itself but loads useful stuff.
 #
 # This file is loaded from 'backup-everything.sh' or 'rotate.sh'
 #
 # Author: 2020-2024 Valerio Bozzolan, contributors
 # License: MIT
 ##
 
 # current directory
 export DIR="${BASH_SOURCE%/*}"
 if [[ ! -d "$DIR" ]]; then DIR="$PWD"; fi
 
 # check if the standard input is not a terminal
 export INTERACTIVE=
 if [ -t 0 ]; then
 	INTERACTIVE=1
 fi
 
 #
 # Check if this is the quiet mode
 #
 # Default - not quite.
 #
 # Actually we are in quiet mode if it's not interactive.
 # This lazy behavior is to avoid stupid emails from the crontab
 # without the need to specify some --quiet etc.
 # Note that in quiet mode only WARN and ERROR messages are shown.
 # I've not created a --quiet flag because nobody is needing it.
 #
 # Edit your options - do not edit here.
 #
 #QUIET=
 
 #
 # Eventually write a log file
 #
 # Default - write a log file.
 #
 # Edit your options - do not edit here.
 #
 export WRITELOG=1
 
 # path to the instructions file
 export INSTRUCTIONS="$DIR/backup-instructions.conf"
 
 # path to the configuration file
 export CONFIG="$DIR/options.conf"
 
 # no config no party
 if [ ! -f "$CONFIG" ]; then
 	echo "missing options expected in $CONFIG"
 	exit 1
 fi
 
 # default mysql commands
 #   --batch: avoid fancy columns (auto-enabled, but better to specify it)
 #   --silent: avoid the column name to be included
 export MYSQL="mysql --batch --silent"
 export MYSQLDUMP="mysqldump --routines --triggers"
 
 # default rsync command
 #  --archive: Try to keep all the properties
 #  --fuzzy: Try to check if a file was renamed instead of delete and download a new one
 #           It's efficient for example with log rotated files.
 #  --delete: Delete the destination files if not present in the source
 #            NOTE: we want this behaviour but it's not a good idea toghether with --fuzzy
 #                  that's why we do not use --delete but we use the next flags
 #  --delay-updates Put all updated files into place at end (useful with fuzzy and delete modes)
 #  --delete-delay Delete after everything (useful with fuzzy and delete modes)
 #                 NOTE: sometime some data is kept in damn .~tmp~ directories
 #                       So we are deprecating --delete-delay, and going back to --delete
 #                       and so removing --fuzzy
 #  --hard-links Try to look for hard links during the transfer to do not copy separate files
 #RSYNC="rsync --archive --fuzzy --delay-updates --delete-delay --hard-links"
 
 # default rsync command
 #  --archive: Try to keep all the properties
 #  --delete: Delete the destination files if not present in the source
 #  --hard-links Try to look for hard links during the transfer to do not copy separate files
 #  --no-inc-recursive Disables the new incremental recursion algorithm of the --recursive option.
 #                     This makes rsync scan the full file list before it begins to transfer files.
 #                     This may improve how hard links are copied successfully.
 export RSYNC="rsync --archive --delete --hard-links --no-inc-recursive"
 
 # rsync used in remote transfers
 #  --compress Use more CPU to save network bandwidth
 export RSYNC_REMOTE="$RSYNC --compress"
 
 # default base backup directory for all backups
 export BASE="/home/backups"
 
 # default box name
 BOX="$(hostname)"
 export BOX
 
 # set to 1 to avoid any disservice (e.g. systemctl stop/start)
 export NO_DISSERVICE=
 
 # set to 1 to do nothing
 export PORCELAIN=
 
 # How many hours should pass between each execution.
 # This is just a sane default to avoid daylight saving issues.
 export HOURS_INTERVAL=12
 
 # include the configuration to eventually override some options
 # shellcheck source=config.sh
 . "$CONFIG"
 
 # as default, if not interactive, set quite mode
 if [ -z "$QUIET" ] && [ "$INTERACTIVE" != 1 ]; then
 	QUIET=1
 fi
 
 # full pathnames to the backup directories
 export BASEBOX="$BASE/$BOX"
 export DAILY="$BASEBOX/daily"
 export DAILY_FILES="$DAILY/files"
 export DAILY_DATABASES="$DAILY/databases"
 export DAILY_LASTLOG="$DAILY/last.log"
 export DAILY_LASTTIME="$DAILY/last.timestamp"
 export DAILY_STARTTIME="$DAILY/start.timestamp"
 
 # apply the porcelain to the rsync command
 if [ "$PORCELAIN" = 1 ]; then
 	RSYNC="$RSYNC --dry-run"
 	RSYNC_REMOTE="$RSYNC_REMOTE --dry-run"
 	NO_DISSERVICE=1
 fi
 
 # set default backup_last_log() lines
 if [ -z "$BACKUP_LAST_LOG_LINES" ]; then
 	BACKUP_LAST_LOG_LINES=8000
 fi
 
 ##
 # Receive in input a file path, and a number of hours, and check whenever
 # enough time (in hours) was passed or not.
 #
 # If the file was never created, we assume that enough time was passed.
 #
 # @param string timestamp_file
 # @param int    hours
 #
 function are_enough_hours_passed() {
 
 	# No args, no party.
 	# Note that the file argument will be checked later.
 	local timestamp_file="$1"
 	local expected_hours="$2"
 	if [ -z "$expected_hours" ]; then
 		echo "Error: Missing argument expected hours."
 		exit 2
 	fi
 
 	local expected_seconds=$((expected_hours * 3600))
 	are_enough_seconds_passed "$timestamp_file" "$expected_seconds"
 }
 
 ##
-# Receive in input a file path, and a number of hours, and check whenever
+# Receive in input a file path, and a number of seconds, and check whenever
 # enough time (in seconds) was passed or not.
 #
 # If the file was never created, we assume that enough time was passed.
 #
 # @param string timestamp_file
 # @param int    seconds
 #
 function are_enough_seconds_passed() {
 
 	# No args, no party.
 	local timestamp_file="$1"
-	local expected_hours="$2"
+	local expected_seconds="$2"
 	if [ -z "$timestamp_file" ]; then
 		echo "Error: Missing argument timestamp file."
 		exit 2
 	fi
 
-	if [ -z "$expected_hours" ]; then
-		echo "Error: Missing argument expected hours."
+	if [ -z "$expected_seconds" ]; then
+		echo "Error: Missing argument expected seconds."
 		exit 2
 	fi
 
 	if [ -f "$timestamp_file" ]; then
 
 		# Read the file, if it has sense.
 		local last_timestamp=$(<"$timestamp_file")
 		if [ "$last_timestamp" -lt 1000 ]; then
 			echo "Error: Bad format in file $timestamp_file"
 			exit 2
 		fi
 
 		local current_timestamp=$(date +%s)
 		local diff_seconds=$((current_timestamp - last_timestamp))
 
 		# If enough time is passed, return true.
 		[ "$diff_seconds" -ge "$expected_seconds" ];
 	fi
 
 	# The file doesn't exist. Return nothing special (0, that is True).
 }
 
 ##
 # Receive in input a file path, and write there the current timestamp.
 #
 # @param string timestamp_file
 #
 function write_timestamp() {
 
 	# No arg, no party.
 	local timestamp_file="$1"
 	if [ -z "$timestamp_file" ]; then
 		echo "Error: Missing timestamp file argument."
 		exit 1
 	fi
 
 	# Write the current Unix timestamp.
 	date +%s > "$timestamp_file"
 }
 
 ###
 # Print something
 #
 # It also put the message in the backup directory
 #
 # @param string severity
 # @param string message
 #
 function printthis() {
 
 	local msg
 
 	msg="[$(date)][$1] $2"
 
 	# print to standard output if it's not in quiet mode
 	if [ "$QUIET" != 1 ]; then
 		printf "%s\n" "$msg"
 	fi
 
 	# put in the log file if possible
 	if [ -f "$DAILY_LASTLOG" ] && [ "$WRITELOG" = 1 ]; then
 		printf "%s\n" "$msg" >> "$DAILY_LASTLOG"
 	fi
 }
 
 ###
 # Run an rsync archive copy
 # @param string source
 # @param string destionation
 #
 function copy() {
 
 	# show what we are doing
 	log "copy $*"
 
 	# run the rsync command
 	if [ "$PORCELAIN" != 1 ]; then
 		$RSYNC $@
 	fi
 }
 
 ###
 # Run an rsync archive copy,
 # creating hard-links into the destination,
 # instead of copying files.
-# This saves a lot of space but you MUST pay attention
+# This saves a lot of space and reduces I/O but you MUST pay attention
 # to never touch the source files, or you will overwrite
 # destination files.
 # @param string source
 # @param string destionation
 # @param string source_for_hardlinks
 #
 function copy_using_hard_links() {
 
 	local source="$1"
 	local dest="$2"
 	local source_hardlinks="$3"
 
 	if [ "$#" != 3 ]; then
 		echo "Bad usage of $0. Must have 3 arguments. Current arguments: $@"
 		exit 3
 	fi
 
 	# No source for hardlinks, no party
 	source_hardlinks_abs=$(realpath "$source_hardlinks")
 	if ! [ -d "$source_hardlinks_abs" ]; then
 		echo "Source for hardlinks not existing: $source_hardlinks_abs given from source_hardlinks from current path $(pwd)"
 		exit 4
 	fi
 
 	# Show what we are doing, without being verbose.
 	log "copy using hard links from $source to $dest (source for hard links: $source_hardlinks)"
 	log "$RSYNC --link-dest=$source_hardlinks_abs $source $dest"
 
 	# run the rsync command
 	if [ "$PORCELAIN" != 1 ]; then
 		$RSYNC --link-dest="$source_hardlinks_abs" "$source" "$dest"
 	fi
 }
 
+##
+# Check if a path is managed by a BTRFS filesystem.
+# This is useful when you want to de-duplicate filesystem data,
+# and so if you want to use specific features of BTRFS like 'duperemove'
+# (instead of 'rdfind').
+#
+# @param string Path. Example: '/srv/primary'
+# @return int 0 if the filesystem is BTRFS.
+#             1 if the filesystem is not.
+#             2 if the path is missing or 'stat' cannot inspect it (read the fantastic manual).
+#
+function is_btrfs() {
+	# No path, no party.
+	local place=$1
+	if [[ -z "$place" ]]; then
+		echo "$0: Missing path." >&2
+		return 2
+	fi
+
+	# Declare and assign separately, so that a failure of 'stat' is not masked by 'local'.
+	local filesystem
+	filesystem=$(stat -f --format=%T -- "$place") || return 2
+
+	# The status of this test (0 or 1) is the status returned to the caller.
+	[[ "$filesystem" == btrfs ]]
+}
+
 ###
 # Remove a pathname
 #
 function drop() {
 
 	# show what we are doing
 	log "drop $*"
 
 	# well, proceed... finger crossed... with some protections
 	if [ "$PORCELAIN" != 1 ]; then
 		rm --recursive --force --one-file-system --preserve-root -- $@
 	fi
 }
 
 ###
 # Move something somewhere
 #
 function move() {
 
 	# show what we are doing
 	log "move $*"
 
 	if [ "$PORCELAIN" != 1 ]; then
 		mv --force $@
 	fi
 }
 
 ###
 # Print a information message
 #
 # @param msg Message
 #
 function log() {
 	printthis INFO "$1"
 }
 
 ###
 # Print a warning message
 #
 # @param msg Message
 #
 function warn() {
 	printthis WARN "$1"
 }
 
 ###
 # Print an error message
 #
 # @param msg Message
 #
 function error() {
 	printthis ERROR "$1"
 }
diff --git a/rotate.sh b/rotate.sh
index 9bbee05..11551e2 100755
--- a/rotate.sh
+++ b/rotate.sh
@@ -1,235 +1,253 @@
 #!/bin/bash
 
 ###
 # Stupid script to rotate backup,
 # creating incremental backups thanks to
 # hard-links.
 #
 # Author: Valerio B.
 # Date: Wed 4 Ago 2020
 # License: CC 0 - public domain
 ##
 
 # do not proceed in case of errors
 set -e
 
 # current directory
 MYDIR="$(dirname "$(realpath "$0")")"
 
 # as default don't be quiet while rotating
 QUIET=0
 
 # as default don't write in the log while rotating
 WRITELOG=0
 
 #
 # Maximum time that your rotation could last
 #
 # Right now this should be a good default since it doesn't make much sense
 # for a rotation to take more than this number of hours.
 #
 # If the script takes longer than this, the next rotation may not run.
 #
 # Note: at the moment this must be shorter than a single day.
 #
 # Current default: 6 hours (6 * 60 * 60 = 21600 seconds)
 MAX_ROTATE_SECONDS=21600
 
 # include all the stuff and useful functions
 . "$MYDIR"/bootstrap.sh
 
 # arguments
 place="$1"
 days="$2"
 max="$3"
 
 # expected file containing last timestamp
 last_timestamp_file="$place.timestamp"
 
 # show usage
 function show_help_rotate() {
 	echo "USAGE"
 	echo " $0 PATH DAYS MAX_ROTATIONS"
 	echo "EXAMPLE"
 	echo " $0 /home/backups 1 30"
 }
 
 function harden() {
 
 	local harden_path="$1"
 
 	# no path no party
 	if [ -z "$harden_path" ]; then
 		echo "Wrong usage of harden"
 		exit 2
 	fi
 
 	# Harden rotations
 	#
 	# Note that non-privileged users should be able to push their last copy,
 	# but MUST not in any way be able to touch older copies
 	chown root:root "$harden_path"
 	chmod 600 "$harden_path"
 }
 
 # all the arguments must exist (just check the last one)
 if [ -z "$max" ]; then
 	echo "Bad usage"
 	show_help_rotate
 	exit 1
 fi
 
 # the place to be rotated must exist
 if [ ! -e "$place" ]; then
 	error "unexisting directory '$place'"
 	exit 2
 fi
 
 # validate max parameter
 if [ "$max" -lt 2 ]; then
 	echo "The MAX parameter must be greater than 1"
 	show_help_rotate
 	exit 3
 fi
 
 # expected seconds from the last rotation before continuing
 # NOTE: leave the star escaped to avoid syntax error in expr
 expected_seconds=$(expr "$days" "*" 86400)
 
 # check if the duration in seconds is a day or more
 if [ "$expected_seconds" -ge 86400 ]; then
 
 	# the expected time since the last execution is never exactly the number of days in seconds
 	# Solution: remove few hours from the expected (just to say, uhm, 5 hours)
 	expected_seconds=$(expr "$expected_seconds" - "$MAX_ROTATE_SECONDS")
 
 fi
 
 # do not proceed if not enough time passed since last execution on that directory
 # this avoids daylight saving time change problems
 # this also avoids race conditions when starting parallel executions by mistake
 if ! are_enough_seconds_passed "$last_timestamp_file" "$expected_seconds"; then
 
 	warn "doing nothing: last rotation was executed too recently on $place: now-last $(date +%s)-$(< "$last_timestamp_file") - expected at least $expected_seconds seconds"
 
 	exit 0
 fi
 
 # save the last timestamp before rotating everything
 # this will avoid even parallel rotations
 write_timestamp "$last_timestamp_file"
 
 # eventually drop the last backup step
 # if it does not exist, don't care
 max_path="$place.$max"
 drop "$max_path"
 
 # shift all the backups
 after="$max"
 while [[ "$after" -gt 1 ]]; do
 
 	before=$(expr "$after" - 1)
 
 	# do not process the root directory for no reason in the world if you type that by mistake
 	# the --preserve-root is already implicit but... let's be sure! asd
 	before_path="$place.$before"
 	after_path="$place.$after"
 
 	# the source must exist. asd
 	if [ -e "$before_path" ]; then
 
 		# the trailing slash means: copy files and not just the directory
 		move "$before_path/" "$after_path"
 
 		harden "$after_path"
 
 	fi
 
 	# next
 	after="$before"
 
 done
 
 # at the end, move the base forward
-# the trailing slash means: copy files and not just the directory
-# this should be able to create a copy that is lightweight than the original if few
-# things changed. So later "rdfind" has less work to do.
-# Micro-optimization: if we already have an older backup, use that as source for hard links.
-# This drammatically reduces I/O on disk and saves A LOT of disk space,
-# if you know what you are doing, and as long as you consider ".2" as read-only.
+# the trailing slash means: copy files and not just the directory.
+# We were trying to adopt "hard links" copy but it does not work very well.
 if [ -d "$place.2" ]; then
-	# Source Destin SourceForHardLinks
+	# Try to create the "$place.1" minimizing I/O and disk usage.
 	copy_using_hard_links "$place/" "$place.1" "$place.2"
 else
-	# Source Destin
-	copy "$place/" "$place.1"
+	# Create "$place.1" in the traditional way.
+	copy "$place/" "$place.1"
 fi
 
 # Make sure that other users cannot see this path.
 # This is usually since you may want to preserve original context in YOUR files,
 # but having a strict parent context in OUR generated files.
 harden "$place.1"
 
 #
 # De-duplicate the first rotations.
 # This saves ~1TB of data for 20 rotations of 100 computers in my use-case :)
 #
 # - Why not de-duplicating "$place"?
 #   We avoid to touch the source, since the source can be wrote and that's an additional risk.
 #   I'm 9999.99% sure that we could also write the original source "$place",
 #   but again, it's an unnecessary risk.
 #   So this is a security mitigation.
 # - Why de-duplicating only "$place.1" and "$place.2"?
 #   It's not useful to rotate ALL rotations since the 1+2 will be rotated.
 #   It's rare that you generate a duplicate file between rotation 1 and rotation 30,
 #   So this is a micro-optimization.
 #
 # Do these path exists?
-if [ -d "$place.1" ] && [ -d "$place.2" ]; then
-	# Check that rdfind is installed in your system.
-	if which rdfind > /dev/null; then
-		# rdfind arguments:
-		#   -makehardlinks: this does the de-duplication trick while keeping data consistency.
-		#                   Yuppie!
-		#   -removeidentinode: this sounds scary for backups, and was disabled.
-		#                      I don't even know why it is not auto-disabled with the next one.
-		#   -makeresultsfile For some reasons this script creates a report. We don't want it.
-		#                    We are already happy with its output.
-		#   -checksum: the default is sha1, but it's not a secure method nowadays,
-		#              so, adopting sha256 is better, slower but more safe.
-		#              The risk with sha1 is that an attacker can push exact copies
-		#              with same size and same hash and same initial and final bytes,
-		#              to try to compromise future rotations.
-		#              -> please avoid sha1
-		#              -> please adopt sha256 or bigger (but think about performance)
-		#   -minsize Under this size in bytes, files are skipped.
-		#            The default is just 1 bytes, to skip empty files.
-		#            It may be really inconvenient to process billions of tiny files,
-		#            so, I suggest to keep this at least at some megabytes,
-		#            To receive some visible benefits in terms of saved disk space,
-		#            and do not cause intensive read operations to run sha256 (or whatever)
-		#            over millions of files.
-		#   -sleep: This is just a sleep between each file, to do not over-heat your drives.
-		#           This is just to extend the life of your backup drives.
-		#           This is probably reasonable since this backup is supposed to be executed
-		#           on daily basis, so 1 millisecond at least between each file is probably
-		#           a good default.
-		log "de-duplication started on $place.1 and $place.2"
-		rdfind \
-			-makehardlinks true \
-			-removeidentinode false \
-			-makeresultsfile false \
-			-minsize 1000000 \
-			-checksum sha256 \
-			"$place.1" \
-			"$place.2"
-
-		log "de-duplication concluded"
+if [ -d "$place.1" ] && [ -d "$place.2" ] && [ -d "$place.3" ]; then
+
+	# If we have BTRFS on board
+	if is_btrfs "$place.1"; then
+		log "$place.1 is btrfs"
+
+		# Check that duperemove is installed in your system.
+		if command -v duperemove > /dev/null; then
+			log "de-duplication with duperemove started on $place.1 and $place.2 and $place.3 (and nothing more for performance reasons)"
+			duperemove -rdh \
+				"$place.1" \
+				"$place.2" \
+				"$place.3"
+			log "de-duplication concluded"
+		else
+			warn "duperemove is not installed in this system so we cannot de-duplicate your backup rotations"
+		fi
 	else
-		warn "rdfind is not installed in this system so we cannot de-duplicate your backup rotations"
+		log "$place.1 is not btrfs"
+
+		# Check that rdfind is installed in your system.
+		if command -v rdfind > /dev/null; then
+			# rdfind arguments:
+			#   -makehardlinks: this does the de-duplication trick while keeping data consistency.
+			#                   Yuppie!
+			#   -removeidentinode: this sounds scary for backups, and was disabled.
+			#                      I don't even know why it is not auto-disabled with the next one.
+			#   -makeresultsfile For some reasons this script creates a report. We don't want it.
+			#                    We are already happy with its output.
+			#   -checksum: the default is sha1, but it's not a secure method nowadays,
+			#              so, adopting sha256 is better, slower but more safe.
+			#              The risk with sha1 is that an attacker can push exact copies
+			#              with same size and same hash and same initial and final bytes,
+			#              to try to compromise future rotations.
+			#              -> please avoid sha1
+			#              -> please adopt sha256 or bigger (but think about performance)
+			#   -minsize Under this size in bytes, files are skipped.
+			#            The default is just 1 bytes, to skip empty files.
+			#            It may be really inconvenient to process billions of tiny files,
+			#            so, I suggest to keep this at least at some megabytes,
+			#            To receive some visible benefits in terms of saved disk space,
+			#            and do not cause intensive read operations to run sha256 (or whatever)
+			#            over millions of files.
+			#   -sleep: This is just a sleep between each file, to do not over-heat your drives.
+			#           This is just to extend the life of your backup drives.
+			#           This is probably reasonable since this backup is supposed to be executed
+			#           on daily basis, so 1 millisecond at least between each file is probably
+			#           a good value. Or maybe not lol.
+			#
+			log "de-duplication started with rdfind on $place.1 and $place.2 and $place.3 (and nothing more for performance reasons)"
+			rdfind \
+				-makehardlinks true \
+				-removeidentinode false \
+				-makeresultsfile false \
+				-minsize 1000000 \
+				-checksum sha256 \
+				"$place.1" \
+				"$place.2" \
+				"$place.3"
+
+			log "de-duplication concluded"
+		else
+			warn "rdfind is not installed in this system so we cannot de-duplicate your backup rotations"
+		fi
 	fi
 fi
 
 
 # yeah!
 log "rotation concluded"