Whamcloud - gitweb
LU-16232 scripts: changelog/updatelog emergency cleanup 38/48838/4
authorMikhail Pershin <mpershin@whamcloud.com>
Wed, 12 Oct 2022 09:22:14 +0000 (12:22 +0300)
committerOleg Drokin <green@whamcloud.com>
Wed, 2 Nov 2022 07:11:37 +0000 (07:11 +0000)
Emergency cleanup scripts for situations when llogs are
corrupted and can't be cleaned up in a normal way. In such
cases the recommendation is to remove/truncate those llogs.

Scripts make all needed steps and have debugging option to
collect llogs for further analysis.

Scripts possible actions are:
 - dry-run mode to check all actions and files affected
 - create archive with all llogs for analysis
 - remove llogs including all plain llogs

Test-Parameters: trivial
Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: I3b197179bc54f451e3c5d7db36b6f1c56c076856
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48838
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Jian Yu <yujian@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre.spec.in
lustre/scripts/Makefile.am
lustre/scripts/remove_changelog [new file with mode: 0755]
lustre/scripts/remove_updatelog [new file with mode: 0755]

index f28007d..3c2dc2a 100644 (file)
@@ -734,6 +734,8 @@ echo '%{_libdir}/lustre/tests/lutf/*' >>lustre-tests.files
 %{_libexecdir}/lustre/lc_common
 %{_libexecdir}/lustre/haconfig
 %{_bindir}/lustre_req_history
+%{_bindir}/remove_changelog
+%{_bindir}/remove_updatelog
 %endif
 
 %{_bindir}/llobdstat
index cb30006..cdba9b2 100644 (file)
@@ -60,7 +60,7 @@ bin_SCRIPTS   = lfs_migrate
 
 if SERVER
 sbin_SCRIPTS += $(genscripts) lc_mon lhbadm lc_servip
-bin_SCRIPTS  += lustre_req_history
+bin_SCRIPTS  += lustre_req_history remove_changelog remove_updatelog
 
 hadir = $(sysconfdir)/ha.d/resource.d
 ha_SCRIPTS = Lustre.ha_v2
@@ -94,7 +94,8 @@ EXTRA_DIST = license-status lustre_rmmod ldev lc_mon lhbadm \
             lustre lsvcgss lc_common haconfig Lustre.ha_v2 dkms.mkconf \
             zfsobj2fid ko2iblnd-probe ksocklnd-config statechange-lustre.sh \
             vdev_attach-lustre.sh vdev_remove-lustre.sh vdev_clear-lustre.sh \
-            bash-completion/lustre bash-completion/lctl bash-completion/lfs
+            bash-completion/lustre bash-completion/lctl bash-completion/lfs \
+            remove_changelog remove_updatelog
 
 CLEANFILES = $(genscripts)
 
diff --git a/lustre/scripts/remove_changelog b/lustre/scripts/remove_changelog
new file mode 100755 (executable)
index 0000000..0bda063
--- /dev/null
@@ -0,0 +1,137 @@
+#!/bin/bash
+
+# remove_changelog: emergency remove changelog files from server.
+#
+# This is an emergency tool to clean up changelogs on a server if changelog
+# records cannot be removed by regular means, e.g. due to llog corruption.
+#
+# The tool goes through changelog_catalog and removes all plain llogs listed
+# there, then truncates changelog_catalog itself and changelog_users.
+# The script requires a single parameter, the mount point of the server FS
+# mounted locally; it also accepts a --dry-run option to emulate file removal.
+#
+# Steps to cleanup problematic llogs:
+#
+# 1. mount MDT filesystem locally on server as ldiskfs mount
+# 2. run script first in dry-run mode to make sure it parses llogs as needed:
+#    # bash remove_changelog -n <ldiskfs_mount>
+# 3. save all llogs for analysis:
+#    # bash remove_changelog -n -z /tmp/llogs_saved <ldiskfs_mount>
+# 4. check that /tmp/llogs_saved.tar.gz exists and has all llogs inside:
+#    # ls -ali /tmp/llogs_saved.tar.gz
+#    # tar -tf /tmp/llogs_saved.tar.gz
+# 5. finally run script to delete all llogs:
+#    # bash remove_changelog <ldiskfs_mount>
+#
+# For better llog compression xz can be used as well; pass it to the script
+# via the GZIP env variable:
+#    # GZIP=xz bash remove_changelog -n -z /tmp/llogs_saved <ldiskfs_mount>
+# The archive name will end with .xz in that case instead of .gz
+
+
+
+
+ECHO=echo
+PROG=$(basename $0)
+LLOG_READER=${LLOG_READER:-llog_reader}
+GZIP=${GZIP:-gzip}
+
+usage() {
+    cat -- <<USAGE 1>&2
+usage: remove_changelog [--dry-run|-n] [--help|-h] [--quiet|-q]
+                       [--zip}-z] <archive> <localmount>
+       --help|-h       show this usage message
+       --dry-run|-n    only print the names of files to be removed
+       --quiet|-q      run quietly (don't print filenames or status)
+       --zip|-z <name_prefix>
+                       save all llogs into compressed tar archive with given
+                       name prefix using gzip by default. Other compression
+                       tools can be used via GZIP env variable.
+
+The 'localmount' argument should be an ldiskfs mounted MDT device mountpoint.
+
+Examples:
+      remove_changelog /mnt/mdt0
+      remove_changelog --dry-run /mnt/mdt0
+      remove_changelog -z /tmp/llogs /mnt/mdt0
+USAGE
+    exit 1
+}
+
+OPT_DRYRUN=false
+OPT_ARCH=""
+OPT_MOUNT=""
+
+# Examine any long options and arguments
+while [ -n "$*" ]; do
+       arg="$1"
+       case "$arg" in
+       -h|--help) usage;;
+       -n|--dry-run) OPT_DRYRUN=true;;
+       -q|--quiet) ECHO=:;;
+       -z|--zip) OPT_ARCH="$2.tar"; shift;;
+       *)
+          [ -e "$arg" ] && OPT_MOUNT="$arg" && break
+       esac
+       shift
+done
+
+remove_changelog() {
+       local mntpoint=$OPT_MOUNT
+       local catalog=${mntpoint}/changelog_catalog
+       local users=${mntpoint}/changelog_users
+       local arch=$OPT_ARCH
+
+       if [[ -z $(df -t ldiskfs $mntpoint 2>/dev/null) ]] ; then
+               echo "$PROG: '$mntpoint' is not ldiskfs mount."
+               exit 1
+       fi
+
+       if $OPT_DRYRUN; then
+               $ECHO "Dry run was requested, no changes will be applied"
+       fi
+
+       $ECHO "Scan changelog_catalog at '$mntpoint':"
+       if [[ ! -f $catalog ]] ; then
+               echo "$PROG: $catalog doesn't exist already."
+       else
+               if [[ ! $(which $LLOG_READER 2>/dev/null) ]] ; then
+                       echo "$PROG: $LLOG_READER is missing."
+                       exit 1
+               fi
+               [[ -z $arch ]] || tar -cf $arch $catalog 2>/dev/null
+               if (( $(stat -c %s $catalog) >= 8192 )) ; then
+                       while read -r path ; do
+                               [[ -z $arch ]] ||
+                                       tar -rf $arch ${mntpoint}/$path 2>/dev/null
+                               $ECHO "rm ${mntpoint}/$path"
+                               $OPT_DRYRUN || rm -f ${mntpoint}/$path
+                       done < <($LLOG_READER $catalog |
+                                awk -F "path=" '/path=/ { print $2 }')
+               else
+                       echo "$PROG: $catalog is too small."
+               fi
+               $ECHO "> $catalog"
+               $OPT_DRYRUN || > $catalog
+       fi
+
+       if [[ -f $users ]] ; then
+               [[ -z $arch ]] || tar -rf $arch $users 2>/dev/null
+               $ECHO "> $users"
+               $OPT_DRYRUN || > $users
+       else
+               echo "$PROG: $user doesn't exist."
+       fi
+       if [[ "$arch" ]] ; then
+               $GZIP -3 $arch
+               $ECHO "llog archive was created by $GZIP"
+       fi
+}
+
+if [ -z $OPT_MOUNT ] ; then
+       echo "Mount is not specified, exiting"
+       exit 1
+fi
+remove_changelog
+
+
diff --git a/lustre/scripts/remove_updatelog b/lustre/scripts/remove_updatelog
new file mode 100755 (executable)
index 0000000..2fad732
--- /dev/null
@@ -0,0 +1,160 @@
+#!/bin/bash
+
+# remove_updatelog: emergency remove MDT updatelog files from server.
+#
+# This is an emergency tool to clean up updatelogs on a server if llog records
+# cannot be removed by regular means, e.g. due to llog corruption.
+#
+# The tool does the following:
+# - goes through the update_log catlist to find per-MDT update llog catalogs
+# - processes each llog catalog to delete all plain llogs in it
+# - truncates the related llog catalog afterwards
+# - truncates update_llogs itself if all catalogs were removed
+#
+# The only required parameter is the mount point of the server FS mounted
+# locally; the script also accepts the options described below in usage().
+#
+# Steps to cleanup problematic llogs:
+#
+# 1. mount MDT filesystem locally on server as ldiskfs mount
+# 2. run script first in dry-run mode to make sure it parses llogs as needed:
+#    # bash remove_updatelog -n <ldiskfs_mount>
+# 3. save all llogs for analysis:
+#    # bash remove_updatelog -n -z /tmp/llogs_saved <ldiskfs_mount>
+# 4. check that /tmp/llogs_saved.tar.gz exists and has all llogs inside:
+#    # ls -ali /tmp/llogs_saved.tar.gz
+#    # tar -tf /tmp/llogs_saved.tar.gz
+# 5. finally run script to delete all llogs:
+#    # bash remove_updatelog <ldiskfs_mount>
+#
+# For better llog compression xz can be used as well; pass it to the script
+# via the GZIP env variable:
+#    # GZIP=xz bash remove_updatelog -n -z /tmp/llogs_saved <ldiskfs_mount>
+# The archive name will end with .xz in that case instead of .gz
+
+ECHO=echo
+PROG=$(basename $0)
+LLOG_READER=${LLOG_READER:-llog_reader}
+GZIP=${GZIP:-gzip}
+
+usage() {
+    cat -- <<USAGE 1>&2
+usage: remove_updatelog [--dry-run|-n] [--help|-h] [--quiet|-q] <localmount>
+       --help|-h          show this usage message
+       --dry-run|-n       only print the names of files to be removed
+       --quiet|-q         run quietly (don't print filenames or status)
+       --zip|-z <name_prefix>
+                       save all llogs into compressed tar archive with given
+                       name prefix using gzip by default. Other compression
+                       tools can be used via GZIP env variable.
+
+The 'localmount' argument should be an ldiskfs mounted MDT device mountpoint.
+
+Examples:
+      remove_updatelog /mnt/mdt0
+      remove_updatelog --dry-run /mnt/mdt0
+      remove_changelog -z /tmp/llogs /mnt/mdt0
+USAGE
+    exit 1
+}
+
+# Settings filled in by the command-line parser below.
+OPT_DRYRUN=false       # true: only report what would be removed
+OPT_ARCH=""            # tar archive name (<name_prefix>.tar), empty if unset
+OPT_MOUNT=""           # first existing path given on the command line
+OPT_MDTS=()            # catalog names read from update_log by remove_updatelog
+
+# Examine any long options and arguments.  Unknown options are rejected
+# rather than skipped: a mistyped --dry-run must never turn into a real
+# removal.  Options placed after the mount point are honoured as well.
+while (( $# > 0 )); do
+       arg="$1"
+       case "$arg" in
+       -h|--help) usage;;
+       -n|--dry-run) OPT_DRYRUN=true;;
+       -q|--quiet) ECHO=:;;
+       -z|--zip)
+               if [[ -z "${2:-}" ]]; then
+                       echo "$PROG: option '$arg' requires <name_prefix>" 1>&2
+                       exit 1
+               fi
+               OPT_ARCH="$2.tar"
+               shift;;
+       -*)
+               echo "$PROG: unknown option '$arg'" 1>&2
+               usage;;
+       *)
+               # the first existing path is taken as the mount point
+               [[ -z "$OPT_MOUNT" && -e "$arg" ]] && OPT_MOUNT="$arg"
+       esac
+       shift
+done
+
+remove_updatelog() {
+       local mntpoint=$OPT_MOUNT
+       local catlist=${mntpoint}/update_log
+       local dir=${mntpoint}/update_log_dir
+       local arch=$OPT_ARCH
+       local length=0
+
+       if [[ -z $(df -t ldiskfs $mntpoint 2>/dev/null) ]] ; then
+               echo "$PROG: '$mntpoint' is not ldiskfs mount."
+               exit 1
+       fi
+
+       if $OPT_DRYRUN; then
+               $ECHO "Dry run was requested, no changes will be applied"
+       fi
+
+       $ECHO "Scan update_log at '$mntpoint':"
+       if [[ ! -f $catlist ]] ; then
+               echo "$PROG: $catlist doesn't exist already."
+       else
+               read -r -d '' -a OPT_MDTS < <(hexdump -v -e '2/8 " %16x" 2/8 "\n"' $catlist |
+                                             awk '{print "[0x"$2":0x"$1":0x0]"}')
+
+               if [[ ! $(which $LLOG_READER 2>/dev/null) ]] ; then
+                       echo "$PROG: $LLOG_READER is missing."
+                       exit 1
+               fi
+               [[ -z $arch ]] || tar -cf $arch $catlist 2>/dev/null
+               length=${#OPT_MDTS[@]}
+               for (( i = 0; i < ${length}; i++ )); do
+                       local catalog=$dir/${OPT_MDTS[$i]}
+
+                       $ECHO "Processing MDT$i llogs ..."
+                       if [[ ! -f $catalog ]] ; then
+                               echo "$PROG: $catalog doesn't exist already."
+                               continue
+                       fi
+                       [[ -z $arch ]] || tar -rf $arch $catalog 2>/dev/null
+                       if (( $(stat -c %s $catalog) >= 8192 )) ; then
+                               while read -r plain ; do
+                                       local path
+
+                                       # compatibility checks:
+                                       # old llog reader reports path in /O
+                                       # but correct path in update_log_dir
+                                       if [ ${plain:0:1} == 'O' ] ; then
+                                               local fid=${plain#"O/"*}
+
+                                               # old format: O/8589935617/d3/3
+                                               # get sequence and oid in hex:
+                                               fid=$(printf "[0x%x:0x%x:0x0]" ${fid%%/*} ${fid##*/})
+                                               path="$dir/$fid"
+                                       else
+                                               path=$mntpoint/$plain
+                                       fi
+                                       [[ -z $arch ]] ||
+                                               tar -rf $arch $path 2>/dev/null
+                                       $ECHO "rm -f $path"
+                                       $OPT_DRYRUN || rm -f $path
+                               done < <(llog_reader $catalog |
+                                        awk -F "path=" '/path=/ { print $2 }')
+                       else
+                               echo "$PROG: $catalog is too small."
+                       fi
+                       $ECHO "> $catalog"
+                       $OPT_DRYRUN || > $catalog
+               done
+       fi
+       if [[ "$arch" ]] ; then
+               $GZIP -3 $arch
+               $ECHO "llog archive was created by $GZIP"
+       fi
+}
+
+if [ -z $OPT_MOUNT ] ; then
+       echo "Mount is not specified, exiting"
+       exit 1
+fi
+remove_updatelog
+
+