scrub/e2scrub.in

   1 #!/bin/bash
   2
   3 #  Copyright (C) 2018 Oracle.  All Rights Reserved.
   4 #
   5 #  Author: Darrick J. Wong <darrick.wong@oracle.com>
   6 #
   7 #  This program is free software; you can redistribute it and/or
   8 #  modify it under the terms of the GNU General Public License
   9 #  as published by the Free Software Foundation; either version 2
  10 #  of the License, or (at your option) any later version.
  11 #
  12 #  This program is distributed in the hope that it would be useful,
  13 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #  GNU General Public License for more details.
  16 #
  17 #  You should have received a copy of the GNU General Public License
  18 #  along with this program; if not, write the Free Software Foundation,
  19 #  Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
  20
# Automatically check an LVM-managed filesystem online.
# We use LVM snapshots to do this, which means that we can only
# check filesystems in VGs that have at least 256MB (or so) of
# free space.
  25
  26 snap_size_mb=256
  27 fstrim=0
  28 reap=0
  29 e2fsck_opts=""
  30 conffile="@root_sysconfdir@/e2scrub.conf"
  31
  32 test -f "${conffile}" && . "${conffile}"
  33
  34 print_help() {
  35         echo "Usage: $0 [OPTIONS] mountpoint | device"
  36         echo
  37         echo "mountpoint must be on a LVM-managed block device"
  38         echo "-r: Remove e2scrub snapshot and exit, do not check anything."
  39         echo "-t: Run fstrim if successful."
  40         echo "-V: Print version information and exit."
  41 }
  42
  43 print_version() {
  44         echo "e2scrub @E2FSPROGS_VERSION@ (@E2FSPROGS_DATE@)"
  45 }
  46
  47 exitcode() {
  48         ret="$1"
  49
  50         # If we're being run as a service, the return code must fit the LSB
  51         # init script action error guidelines, which is to say that we
  52         # compress all errors to 1 ("generic or unspecified error", LSB 5.0
  53         # section 22.2) and hope the admin will scan the log for what
  54         # actually happened.
  55
  56         # We have to sleep 2 seconds here because journald uses the pid to
  57         # connect our log messages to the systemd service.  This is critical
  58         # for capturing all the log messages if the scrub fails, because the
  59         # fail service uses the service name to gather log messages for the
  60         # error report.
  61         if [ -n "${SERVICE_MODE}" ]; then
  62                 test "${ret}" -ne 0 && ret=1
  63                 sleep 2
  64         fi
  65
  66         exit "${ret}"
  67 }
  68
  69 while getopts "rtV" opt; do
  70         case "${opt}" in
  71         "r") reap=1;;
  72         "t") fstrim=1;;
  73         "V") print_version; exitcode 0;;
  74         *) print_help; exitcode 2;;
  75         esac
  76 done
  77 shift "$((OPTIND - 1))"
  78
  79 arg="$1"
  80 if [ -z "${arg}" ]; then
  81         print_help
  82         exitcode 1
  83 fi
  84
  85 # Find the device for a given mountpoint
  86 dev_from_mount() {
  87         local mountpt="$(realpath "$1")"
  88
  89         lsblk -o NAME,FSTYPE,MOUNTPOINT -p -P -n 2> /dev/null | while read vars; do
  90                 eval "${vars}"
  91                 if [ "${mountpt}" != "${MOUNTPOINT}" ]; then
  92                         continue
  93                 fi
  94                 case "${FSTYPE}" in
  95                 ext[234])
  96                         echo "${NAME}"
  97                         return 0
  98                         ;;
  99                 esac
 100         done
 101         return 1
 102 }
 103
 104 # Check a device argument
 105 dev_from_arg() {
 106         local dev="$1"
 107         local fstype="$(lsblk -o FSTYPE -n "${dev}" 2> /dev/null)"
 108
 109         case "${fstype}" in
 110         ext[234])
 111                 echo "${dev}"
 112                 return 0
 113                 ;;
 114         esac
 115         return 1
 116 }
 117
 118 mnt_from_dev() {
 119         local dev="$1"
 120
 121         if [ -n "${dev}" ]; then
 122                 lsblk -o MOUNTPOINT -n "${dev}"
 123         fi
 124 }
 125
 126 # Construct block device path and mountpoint from argument
 127 if [ -b "${arg}" ]; then
 128         dev="$(dev_from_arg "${arg}")"
 129         mnt="$(mnt_from_dev "${dev}")"
 130 else
 131         dev="$(dev_from_mount "${arg}")"
 132         mnt="${arg}"
 133 fi
 134 if [ ! -e "${dev}" ]; then
 135         echo "${arg}: Not an ext[234] filesystem."
 136         print_help
 137         exitcode 16
 138 fi
 139
 140 # Make sure this is an LVM device we can snapshot
 141 lvm_vars="$(lvs --nameprefixes -o name,vgname,lv_role --noheadings "${dev}" 2> /dev/null)"
 142 eval "${lvm_vars}"
 143 if [ -z "${LVM2_VG_NAME}" ] || [ -z "${LVM2_LV_NAME}" ] ||
 144    echo "${LVM2_LV_ROLE}" | grep -q "snapshot"; then
 145         echo "${arg}: Not connnected to a LVM logical volume."
 146         print_help
 147         exitcode 16
 148 fi
 149 start_time="$(date +'%Y%m%d%H%M%S')"
 150 snap="${LVM2_LV_NAME}.e2scrub"
 151 snap_dev="/dev/${LVM2_VG_NAME}/${snap}"
 152
 153 teardown() {
 154         # Remove and wait for removal to succeed.
 155         ${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 3>&-
 156         while [ -e "${snap_dev}" ] && [ "$?" -eq "5" ]; do
 157                 sleep 0.5
 158                 ${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 3>&-
 159         done
 160 }
 161
 162 check() {
 163         # First we recover the journal, then we see if e2fsck tries any
 164         # non-optimization repairs.  If either of these two returns a
 165         # non-zero status (errors fixed or remaining) then this fs is bad.
 166         E2FSCK_FIXES_ONLY=1
 167         export E2FSCK_FIXES_ONLY
 168         ${DBG} "@root_sbindir@/e2fsck" -E journal_only -p ${e2fsck_opts} "${snap_dev}" || return $?
 169         ${DBG} "@root_sbindir@/e2fsck" -f -y ${e2fsck_opts} "${snap_dev}"
 170 }
 171
 172 mark_clean() {
 173         ${DBG} "@root_sbindir@/tune2fs" -C 0 -T "${start_time}" "${dev}"
 174 }
 175
 176 mark_corrupt() {
 177         ${DBG} "@root_sbindir@/tune2fs" -E force_fsck "${dev}"
 178 }
 179
 180 setup() {
 181         # Try to remove snapshot for 30s, bail out if we can't remove it.
 182         lveremove_deadline="$(( $(date "+%s") + 30))"
 183         ${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 3>&- 2>/dev/null
 184         while [ -e "${snap_dev}" ] && [ "$?" -eq "5" ] &&
 185               [ "$(date "+%s")" -lt "${lvremove_deadline}" ]; do
 186                 sleep 0.5
 187                 ${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 3>&-
 188         done
 189         if [ -e "${snap_dev}" ]; then
 190                 echo "${arg}: e2scrub snapshot is in use, cannot check!"
 191                 return 1
 192         fi
 193         # Create the snapshot, wait for device to appear.
 194         ${DBG} lvcreate -s -L "${snap_size_mb}m" -n "${snap}" "${LVM2_VG_NAME}/${LVM2_LV_NAME}" 3>&-
 195         if [ $? -ne 0 ]; then
 196                 echo "${arg}: e2scrub snapshot FAILED, will not check!"
 197                 return 1
 198         fi
 199         ${DBG} udevadm settle 2> /dev/null
 200         return 0
 201 }
 202
 203 if [ "${reap}" -gt 0 ]; then
 204         if [ -e "${snap_dev}" ]; then
 205                 teardown 2> /dev/null
 206         fi
 207         exit 0
 208 fi
 209 if ! setup; then
 210         exitcode 8
 211 fi
 212 trap "teardown; exit 1" EXIT INT QUIT TERM
 213
 214 # Check and react
 215 check
 216 case "$?" in
 217 "0")
 218         # Clean check!
 219         echo "${arg}: Scrub succeeded."
 220         mark_clean
 221         teardown
 222         trap '' EXIT
 223
 224         # Trim the free space, which requires the snapshot be deleted.
 225         if [ "${fstrim}" -eq 1 ] && [ -d "${mnt}" ] && type fstrim > /dev/null 2>&1; then
 226                 echo "${arg}: Trimming free space."
 227                 fstrim -v "${mnt}"
 228         fi
 229
 230         ret=0
 231         ;;
 232 "8")
 233         # Operational error, what now?
 234         echo "${arg}: e2fsck operational error."
 235         teardown
 236         trap '' EXIT
 237         ret=8
 238         ;;
 239 *)
 240         # fsck failed.  Check if the snapshot is invalid; if so, make a
 241         # note of that at the end of the log.  This isn't necessarily a
 242         # failure because the mounted fs could have overflowed the
 243         # snapshot with regular disk writes /or/ our repair process
 244         # could have done it by repairing too much.
 245         #
 246         # If it's really corrupt we ought to fsck at next boot.
 247         is_invalid="$(lvs -o lv_snapshot_invalid --noheadings "${snap_dev}" | awk '{print $1}')"
 248         if [ -n "${is_invalid}" ]; then
 249                 echo "${arg}: Scrub FAILED due to invalid snapshot."
 250                 ret=8
 251         else
 252                 echo "${arg}: Scrub FAILED due to corruption!  Unmount and run e2fsck -y."
 253                 mark_corrupt
 254                 ret=6
 255         fi
 256         teardown
 257         trap '' EXIT
 258         ;;
 259 esac
 260
 261 exitcode "${ret}"