#!/bin/bash
# do a parallel backup and restore of specified files
#
#  Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
#
#   This file is part of Lustre, http://www.lustre.org.
#
#   Lustre is free software; you can redistribute it and/or
#   modify it under the terms of version 2 of the GNU General Public
#   License as published by the Free Software Foundation.
#
#   Lustre is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with Lustre; if not, write to the Free Software
#   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# Author: Andreas Dilger <adilger@sun.com>
#

VERSION=1.1.0

export LC_ALL=C

RSH=${RSH:-"ssh"}			# use "bash -c" for local testing
SPLITMB=${SPLITMB:-8192}		# target chunk size (uncompressed)
SPLITCOUNT=${SPLITCOUNT:-200}		# number of files to give each client
TAR=${TAR:-"tar"}			# also htar in theory works (untested)

#PROGPATH=$(which $0 2> /dev/null) || PROGPATH=$PWD/$0
case $0 in
	.*) PROGPATH=$PWD/$0 ;;
	*)  PROGPATH=$0 ;;
esac

PROGNAME="$(basename $PROGPATH)"
LOGPREFIX="$PROGNAME"
log() {
	echo "$LOGPREFIX: $*" 1>&2
}

fatal() {
	log "ERROR: $*"
	exit 1
}

usage() {
	log "$*"
	echo "usage: $PROGNAME [-chjvxz] [-C directory] [-e rsh] [-i inputlist]"
	echo -e "\t\t[-l logdir] [-n nodes] [-s splitmb] [-T tar] -f ${FTYPE}base"
	echo -e "\t-c create archive"
	echo -e "\t-C directory: relative directory for filenames (default PWD)"
	echo -e "\t-e rsh: specify the passwordless remote shell (default $RSH)"
	if [ "$OP" = "backup" ]; then
		echo -e "\t-f outputfile: specify base output filename for backup"
	else
		echo -e "\t-f ${OP}filelist: specify list of files to $OP"
	fi
	echo -e "\t-h: print help message and exit (use -x -h for restore help)"
	if [ "$OP" = "backup" ]; then
		echo -e "\t-i inputfile: list of files to backup (default stdin)"
	fi
	echo -e "\t-j: use bzip2 compression on $FTYPE file(s)"
	echo -e "\t-l logdir: directory for output logs"
	echo -e "\t-n nodes: comma-separated list of client nodes to run ${OP}s"
	if [ "$OP" = "backup" ]; then
		echo -e "\t-s splitmb: target size for backup chunks " \
			"(default ${SPLITMB}MiB)"
		echo -e "\t-S splitcount: number of files sent to each client "\
			"(default ${SPLITCOUNT})"
	fi
	echo -e "\t-t: list table of contents of tarfile"
	echo -e "\t-T tar: specify the backup command (default $TAR)"
	echo -e "\t-v: be verbose - list all files being processed"
	echo -e "\t-V: print version number and exit"
	echo -e "\t-x: extract files instead of backing them up"
	echo -e "\t-z: use gzip compression on $FTYPE file(s)"
	exit 1
}

usage_inactive() {
	usage "inactive argument '$1 $2' in '$3' mode"
}

set_op_type() {
	case $1 in
	*backup*)	OP=backup;  FTYPE=output; TAROP="-c" ;;
	*list*)		OP=list;    FTYPE=input;  TAROP="-t"; SPLITCOUNT=1 ;;
	*restore*)	OP=restore; FTYPE=input;  TAROP="-x"; SPLITCOUNT=1 ;;
	*)		FTYPE="output"; usage "unknown archive operation '$1'";;
	esac
}

#echo ARGV: "$@"

# --fileonly, --remote are internal-use-only options
TEMPARGS=$(getopt -n $LOGPREFIX -o cC:e:f:hi:jl:n:ps:S:tT:vVxz --long create,extract,list,restore,directory:,rsh:,outputbase:,help,inputfile:,bzip2,logdir:,nodes:,permissions:splitmb,splitcount,tar:,verbose,version,gzip,fileonly,remote: -- "$@")

eval set -- "$TEMPARGS"

set_op_type $PROGNAME

# parse input arguments, and accumulate the client-specific args
while true; do
	case "$1" in
	-c|--create)	[ "$OP" != "backup" ] &&
				usage "can't use $1 $TAROP at the same time"
			OP="backup"; ARGS="$ARGS $1"; shift ;;
	-C|--directory)	GOTODIR="$2"; cd "$2" || usage "error cd to -C $2";
					ARGS="$ARGS $1 \"$2\""; shift 2 ;;
	-e|--rsh)	RSH="$2"; shift 2;;
	-f|--outputbase)OUTPUTBASE="$2";ARGS="$ARGS $1 \"$2\""; shift 2 ;;
	-h|--help)	ARGS=""; break;;
	-i|--inputfile) INPUT="$2"; shift 2;;
	-j|--bzip2)	TARCOMP="-j";	ARGS="$ARGS $1"; shift ;;
	-l|--logdir)	LOGDIR="$2";	ARGS="$ARGS $1 \"$2\""; shift 2 ;;
	-n|--nodes)	NODELIST="$NODELIST,$2";
					ARGS="$ARGS $1 \"$2\""; shift 2 ;;
	-p|--permissions) PERM="-p";	ARGS="$ARGS $1"; shift ;;
	-s|--splitmb)	[ "$OP" != "backup" ] && usage_inactive $1 $2 $OP
			SPLITMB=$2;	ARGS="$ARGS $1 \"$2\""; shift 2 ;;
	-S|--splitcount)[ "$OP" != "backup" ] && usage_inactive $1 $2 $OP
			SPLITCOUNT=$2;	ARGS="$ARGS $1 \"$2\""; shift 2 ;;
	-t|--list)	[ "$OP" != "backup" -a "$OP" != "list" ] &&
				usage "can't use $1 $TAROP at the same time"
			OP="list"; ARGS="$ARGS $1"; shift ;;
	-T|--tar)	TAR="$2";	ARGS="$ARGS $1 \"$2\""; shift 2 ;;
	-v|--vebose)	[ "$VERBOSE" = "-v" ] && set -vx # be extra verbose
			VERBOSE="-v";	ARGS="$ARGS $1"; shift ;;
	-V|--version)	echo "$LOGPREFIX: version $VERSION"; exit 0;;
	-x|--extract|--restore)
			[ "$OP" != "backup" -a "$OP" != "restore" ] &&
				usage "can't use $1 $TAROP at the same time"
			OP="restore"; ARGS="$ARGS $1"; shift ;;
	-z|--gzip)	TARCOMP="-z";	ARGS="$ARGS $1"; shift ;;
	# these commands are for internal use only
	--remote)	NODENUM="$2"; LOGPREFIX="$(hostname).$2"; shift 2;;
	--fileonly)	FILEONLY="yes"; shift;;
	--)		shift; break;;
	*) usage "unknown argument '$1'" 1>&2 ;;
	esac
done

set_op_type $OP

#log "ARGS: $ARGS"

[ -z "$ARGS" ] && usage "$OP a list of files, running on multiple nodes"

# we should be able to use any backup tool that can accept filenames
# from an input file instead of just pathnames on the command-line.
# Unset TARCOMP for htar, as it doesn't support on-the-fly compression.
TAREXT=
case "$(basename $TAR)" in
	htar*)		    TARARG="-L"; TAROUT="-f"; TARCOMP=""; MINKB=0 ;;
	tar*|gnutar*|gtar*) TARARG="-T"; TAROUT="-b 2048 -f"; TAREXT=.tar ;;
	*)		fatal "unknown archiver '$TAR'" ;;
esac

if [ "$OP" = "backup" ]; then
	[ -z "$OUTPUTBASE" ] && usage "'-f ${FTYPE}base' must be given for $OP"
	# Make sure we leave some margin free in the output filesystem for the
	# chunks.  If we are dumping to a network filesystem (denoted by having
	# a ':' in the name, not sure how else to check) then we assume this
	# filesystem is shared among all clients and expect the other nodes
	# to also consume space there.
	OUTPUTFS=$(dirname $OUTPUTBASE)
	NETFS=$(df -P $OUTPUTFS | awk '/^[[:alnum:]]*:/ { print $1 }')
	MINKB=${MINKB:-$((SPLITMB * 2 * 1024))}
	[ "$NETFS" ] && MINKB=$(($(echo $NODELIST | tr ',' ' ' | wc -w) * $MINKB))

	# Compress the output files as we go.
	case "$TARCOMP" in
		-z) TAREXT="$TAREXT.gz";;
		-j) TAREXT="$TAREXT.bz2";;
	esac
else
	[ -z "$OUTPUTBASE" ] &&
		usage "-f ${OP}filelist must be specified for $OP"
	# we want to be able to use this for a list of files to restore
	# but it is convenient to use $INPUT for reading the pathnames
	# of the tar files during restore/list operations to handle stdin
	[ "$INPUT" ] && usage "-i inputbase unsupported for $OP"
	INPUT=$OUTPUTBASE
	TARARG=""
fi

[ -z "$NODELIST" ] && NODELIST="localhost"

# If we are writing to a char or block device (e.g. tape) don't add any suffix
# We can't currently specify a different target device per client...
if [ -b "$OUTPUTBASE" -o -c "$OUTPUTBASE" ]; then
	MINKB=0
	[ -z "$LOGDIR" ] && LOGDIR="/var/log"
	LOGBASE="$LOGDIR/$PROGNAME"
elif [ -d "$OUTPUTBASE" ]; then
	usage "-f $OUTPUTBASE must be a pathname, not a directory"
else
	[ -z "$LOGDIR" ] && LOGBASE="$OUTPUTBASE" || LOGBASE="$LOGDIR/$PROGNAME"
fi
LOGBASE="$LOGBASE.$(date +%Y%m%d%H%M)"

# tar up a sinle list of files into a chunk.  We don't exit if there is an
# error returned, since that might happen frequently with e.g. files moving
# and no longer being available for backup.
# usage: run_one_tar {file_list} {chunk_nr} {chunkbytes}
DONE_MSG="FINISH_THIS_PROGRAM_NOW_I_TELL_YOU"
KILL_MSG="EXIT_THIS_PROGRAM_NOW_I_TELL_YOU"
run_one_backup() {
#set -vx
	TMPLIST="$1"
	CHUNK="$2"
	CHUNKMB="$(($3 / 1048576))"
	if [ -b "$OUTPUTBASE" -o -c "$OUTPUTBASE" ]; then
		OUTFILE="$OUTPUTBASE"
	else
		OUTFILE="$OUTPUTBASE.$NODENUM.$CHUNK$TAREXT"
	fi
	CHUNKBASE="$LOGBASE.$NODENUM.$CHUNK"
	LISTFILE="$CHUNKBASE.list"
	LOG="$CHUNKBASE.log"

	cp "$TMPLIST" "$LISTFILE"

	SLEPT=0
	FREEKB=$(df -P $OUTPUTFS 2> /dev/null | tail -n 1 | awk '{print $4}')
	while [ $FREEKB -lt $MINKB ]; do
		sleep 5
		SLEPT=$((SLEPT + 5))
		if [ $((SLEPT % 60)) -eq 10 ]; then
			log "waiting ${SLEPT}s for ${MINKB}kB free in $OUTPUTFS"
		fi
		FREEKB=$(df -P $OUTPUTFS | tail -n 1 | awk '{print $4}')
	done
	[ $SLEPT -gt 0 ] && log "waited ${SLEPT}s for space in ${OUTPUTFS}"
	log "$LISTFILE started - est. ${CHUNKMB}MB"
	START=$(date +%s)
	eval $TAR $TAROP $PERM $TARARG "$TMPLIST" -v $TARCOMP $TAROUT "$OUTFILE" \
		2>&1 >>"$LOG" | tee -a $LOG | grep -v "Removing leading"
	RC=${PIPESTATUS[0]}
	ELAPSE=$(($(date +%s) - START))
	if [ $RC -eq 0 ]; then
	        if [ -f "$OUTFILE" ]; then
			BYTES=$(stat -c '%s' "$OUTFILE")
			CHUNKMB=$((BYTES / 1048576))
			log "$LISTFILE finished - act. ${CHUNKMB}MB/${ELAPSE}s"
		else
			log "$LISTFILE finished OK - ${ELAPSE}s"
		fi
		echo "OK" > $CHUNKBASE.done
	else
		echo "ERROR=$RC" > $CHUNKBASE.done
		log "ERROR: $LISTFILE exited with rc=$RC"
	fi
	rm $TMPLIST
	return $RC
}

run_one_restore_or_list() {
#set -vx
	INPUTFILE="$1"
	LOG="$LOGBASE.$(basename $INPUTFILE).restore.log"

	SLEPT=0
	while [ $MINKB != 0 -a ! -r "$INPUTFILE" ]; do
		SLEPT=$((SLEPT + 5))
		if [ $((SLEPT % 60)) -eq 10 ]; then
			log "waiting ${SLEPT}s for $INPUTFILE staging"
		fi
		sleep 5
	done
	[ $SLEPT -gt 0 ] && log "waited ${SLEPT}s for $INPUTFILE staging"
	log "$OP of $INPUTFILE started"
	START=$(date +%s)
	eval $TAR $TAROP -v $TARCOMP $TAROUT "$INPUTFILE" 2>&1 >>"$LOG" |
		tee -a "$LOG" | grep -v "Removing leading"
	RC=${PIPESTATUS[0]}
	ELAPSE=$(($(date +%s) - START))
	[ "$OP" = "list" ] && cat $LOG
	if [ $RC -eq 0 ]; then
		log "$INPUTFILE finished OK - ${ELAPSE}s"
		echo "OK" > $INPUTFILE.restore.done
	else
		echo "ERROR=$RC" > $INPUTFILE.restore.done
		log "ERROR: $OP of $INPUTFILE exited with rc=$RC"
	fi
	return $RC
}

# Run as a remote command and read input filenames from stdin and create tar
# output files of the requested size.  The input filenames can either be:
# "bytes filename" or "filename" depending on whether FILEONLY is set.
#
# Read filenames until we have either a large enough list of small files,
# or we get a very large single file that is backed up by itself.
run_remotely() {
#set -vx
	log "started thread"
	RCMAX=0

	[ "$FILEONLY" ] && PARAMS="FILENAME" || PARAMS="BYTES FILENAME"

	if [ "$OP" = "backup" ]; then
		TMPBASE=$PROGNAME.$LOGPREFIX.temp
		TMPFILE="$(mktemp -t $TMPBASE.$(date +%s).XXXXXXX)"
		OUTPUTFILENUM=0
		SUMBYTES=0
	fi
	BYTES=""

	while read $PARAMS; do
		[ "$FILENAME" = "$DONE_MSG" -o "$BYTES" = "$DONE_MSG" ] && break
		if [ "$FILENAME" = "$KILL_MSG" -o "$BYTES" = "$KILL_MSG" ]; then
			log "exiting $OP on request"
			[ "$TARPID" ] && kill -9 $TARPID 2> /dev/null
			exit 9
		fi

		case "$OP" in
		list|restore)
			run_one_restore_or_list $FILENAME; RC=$?
			;;
		backup)	STAT=($(stat -c '%s %F' "$FILENAME"))
			[ "$FILEONLY" ] && BYTES=${STAT[0]}
			# if this is a directory that has files in it, it will
			# be backed up as part of this (or some other) backup.
			# Only include it in the backup if empty, otherwise
			# the files therein will be backed up multiple times
			if [ "${STAT[1]}" = "directory" ]; then
				NUM=`find "$FILENAME" -maxdepth 1|head -2|wc -l`
				[ "$NUM" -gt 1 ] && continue
			fi
			[ "$VERBOSE" ] && log "$FILENAME"

			# if a file is > 3/4 of chunk size, archive by itself
			# avoid shell math: 1024 * 1024 / (3/4) = 1398101
			if [ $((BYTES / 1398101)) -gt $SPLITMB ]; then
				# create a very temp list for just this file
				TARLIST=$(mktemp -t $TMPBASE.$(date +%s).XXXXXX)
				echo "$FILENAME" > "$TARLIST"
				TARBYTES=$BYTES
			else
				SUMBYTES=$((SUMBYTES + BYTES))
				echo "$FILENAME" >> $TMPFILE

				# not large enough input list, keep collecting
				[ $((SUMBYTES >> 20)) -lt $SPLITMB ] && continue

				TARBYTES=$SUMBYTES
				SUMBYTES=0
				TARLIST="$TMPFILE"
				TMPFILE=$(mktemp -t $TMPBASE.$(date +%s).XXXXXXX)
			fi

			wait $TARPID
			RC=$?
			run_one_backup "$TARLIST" "$OUTPUTFILENUM" $TARBYTES &
			TARPID=$!
			OUTPUTFILENUM=$((OUTPUTFILENUM + 1))
			;;
		esac

		[ $RC -gt $RCMAX ] && RCMAX=$RC
	done

	if [ "$TARPID" ]; then
		wait $TARPID
		RC=$?
		[ $RC -gt $RCMAX ] && RCMAX=$RC
	fi

	if [ -s "$TMPFILE" ]; then
		run_one_backup "$TMPFILE" "$OUTPUTFILENUM" $SUMBYTES
		RC=$?
		[ $RC -gt $RCMAX ] && RCMAX=$RC
	fi
	exit $RCMAX
}

# If we are a client then just run that subroutine and exit
[ "$NODENUM" ] && run_remotely && exit 0

# Tell the clients to exit.  Their input pipes might be busy so it may
# take a while for them to consume the files and finish.
CLEANING=no
cleanup() {
	log "cleaning up remote processes"
	for FD in $(seq $BASEFD $((BASEFD + NUMCLI - 1))); do
		echo "$DONE_MSG" >&$FD
	done
	CLEANING=yes

	SLEPT=0
	RUN=$(ps auxww | egrep -v "grep|bash" | grep -c "$PROGNAME.*remote")
	while [ $RUN -gt 0 ]; do
		set +vx
		#ps auxww | grep "$PROGNAME.*remote" | egrep -v "grep|bash"
		sleep 1
		SLEPT=$((SLEPT + 1))
		[ $((SLEPT % 30)) -eq 0 ] &&
			log "wait for $RUN processes to finish"
		[ $((SLEPT % 300)) -eq 0 ] &&
			ps auxww |grep "$PROGNAME.*remote" |egrep -v "grep|bash"
		RUN=$(ps auxww|egrep -v "grep|bash"|grep -c "$PROGNAME.*remote")
	done
	trap 0
}

do_cleanup() {
	if [ "$CLEANING" = "yes" ]; then
		log "killing all remote processes - may not stop immediately"
		for FD in $(seq $BASEFD $((BASEFD + NUMCLI - 1))); do
			echo "$KILL_MSG" >&$FD
		done
		sleep 1
		PROCS=$(ps auxww|awk '/$PROGNAME.*remote/ { print $2 }')
		[ "$PROCS" ] && kill -9 $PROCS
		trap 0
	fi

	cleanup
}

# values that only need to be determined on the master
# always read from stdin, even if it is a file, to be more consistent
case "$INPUT" in
	-|"")	INPUT="standard input";;
	*)	if [ ! -r "$INPUT" ]; then
			[ "$VERBOSE" ] && ls -l "$INPUT"
			usage "can't read input file '$INPUT'"
		fi
		exec <$INPUT ;;
esac

# if unspecified, run remote clients in the current PWD to get correct paths
[ -z "$GOTODIR" ] && ARGS="$ARGS -C \"$PWD\""

# main()
BASEFD=100
NUMCLI=0

# Check if the input list has the file size specified or not.  Input
# lines should be of the form "{bytes} {filename}" or "{filename}".
# If no size is given then the file sizes are determined by the clients
# to do the chunking (useful for making a full backup, but not as good
# at evenly distributing the data among clients).  In rare cases the first
# file specified may have a blank line and no size - check that as well.
if [ "$OP" = "backup" ]; then
	read BYTES FILENAME
	if [ -z "$FILENAME" -a -e "$BYTES" ]; then
		FILENAME="$BYTES"
		BYTES=""
		FILEONLY="yes" && ARGS="$ARGS --fileonly"
	elif [ -e "$BYTES $FILENAME" ]; then
		FILENAME="$BYTES $FILENAME"
		BYTES=""
		FILEONLY="yes" && ARGS="$ARGS --fileonly"
	elif [ ! -e "$FILENAME" ]; then
		log "input was '$BYTES $FILENAME'"
		fatal "first line of '$INPUT' is not a file"
	fi
else
	FILEONLY="yes" && ARGS="$ARGS --fileonly"
fi

# kill the $RSH processes if we get a signal
trap do_cleanup INT EXIT

# start up the remote processes, each one with its stdin attached to a
# different output file descriptor, so that we can communicate with them
# individually when sending files to back up.  We generate a remote log
# file and also return output to this process.
for CLIENT in $(echo $NODELIST | tr ',' ' '); do
	FD=$((BASEFD+NUMCLI))
	LOG=$OUTPUTBASE.$CLIENT.$FD.log
	eval "exec $FD> >($RSH $CLIENT '$PROGPATH --remote=$NUMCLI $ARGS')"
	RC=$?
	if [ $RC -eq 0 ]; then
		log "starting $0.$NUMCLI on $CLIENT"
		NUMCLI=$((NUMCLI + 1))
	else
		log "ERROR: failed '$RSH $CLIENT $PROGPATH': RC=$?"
	fi
done

if [ $NUMCLI -eq 0 ]; then
	fatal "unable to start any threads"
fi

CURRCLI=0
# We don't want to use "BYTES FILENAME" if the input doesn't include the
# size, as this might cause problems with files with whitespace in them.
# Instead we just have two different loops depending on whether the size
# is in the input file or not.  We dish out the files either by size
# (to fill a chunk), or just round-robin and hope for the best.
if [ "$FILEONLY" ]; then
	if [ "$FILENAME" ]; then
		[ "$VERBOSE" ] && log "$FILENAME"
		echo "$FILENAME" 1>&$BASEFD	# rewrite initial line
	fi
	# if we don't know the size, just round-robin among the clients
	while read FILENAME; do
		FD=$((BASEFD+CURRCLI))
		[ -n "$VERBOSE" -a "$OP" != "backup" ] && log "$OP $FILENAME"
		echo "$FILENAME" 1>&$FD

		COUNT=$((COUNT + 1))
		if [ $COUNT -ge $SPLITCOUNT ]; then
			CURRCLI=$(((CURRCLI + 1) % NUMCLI))
			COUNT=0
		fi
	done
else
	[ "$VERBOSE" ] && log "$FILENAME"
	echo $BYTES "$FILENAME" 1>&$BASEFD	# rewrite initial line
	# if we know the size, then give each client enough to start a chunk
	while read BYTES FILENAME; do
		FD=$((BASEFD+CURRCLI))
		[ "$VERBOSE" ] && log "$FILENAME"
		echo $BYTES "$FILENAME" >&$FD

		# take tar blocking factor into account
		[ $BYTES -lt 10240 ] && BYTES=10240
		SUMBYTES=$((SUMBYTES + BYTES))
		if [ $((SUMBYTES / 1048576)) -ge $SPLITMB ]; then
			CURRCLI=$(((CURRCLI + 1) % NUMCLI))
			SUMBYTES=0
		fi
	done
fi

# Once all of the files have been given out, wait for the remote processes
# to complete.  That might take a while depending on the size of the backup.
cleanup