Whamcloud - gitweb
EX-2067 lpurge: check layout before opening
authorJohn L. Hammond <jhammond@whamcloud.com>
Fri, 24 Sep 2021 17:59:31 +0000 (10:59 -0700)
committerAndreas Dilger <adilger@whamcloud.com>
Sat, 2 Oct 2021 18:44:49 +0000 (18:44 +0000)
In lpurge_mirror_delete(), fetch the layout by getxattr() and check if
we would be deleting the last non-stale mirror before we open the
file. This is to avoid breaking the lease held by a concurrent mirror
resync.

Lustre-change: https://review.whamcloud.com/45033
Lustre-commit: 912766d05efd95886597a46dcbaaf10237e56ebf

Test-Parameters: trivial testlist=hot-pools
Signed-off-by: John L. Hammond <jhammond@whamcloud.com>
Signed-off-by: Jian Yu <yujian@whamcloud.com>
Change-Id: I8b1addd14b290faa537fcc15514cae8b18c802f2
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/45111
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lipe/src/lpurge.c
lustre/tests/hot-pools.sh [changed mode: 0644->0755]

index 91b670f..4423b38 100644 (file)
  *  - take OST load into account
  *
  */
-#include <stdlib.h>
 #include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include <getopt.h>
-#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
 #include <dirent.h>
+#include <errno.h>
 #include <fcntl.h>
-#include <poll.h>
+#include <getopt.h>
 #include <glob.h>
-#include <assert.h>
 #include <mntent.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/ioctl.h>
+#include <poll.h>
+#include <string.h>
 #include <time.h>
-#include <linux/unistd.h>
+#include <unistd.h>
+#include <json-c/json.h>
 #include <linux/kernel.h>
-#include <sys/wait.h>
-#include <sys/sysinfo.h>
-#include <sys/statfs.h>
-#include <sys/file.h>
+#include <linux/lustre/lustre_cfg.h>
+#include <linux/lustre/lustre_fid.h>
+#include <linux/lustre/lustre_idl.h>
 #include <linux/lustre/lustre_user.h>
+#include <linux/unistd.h>
 #include <lustre/lustreapi.h>
-#include <linux/lustre/lustre_idl.h>
-#include <linux/lustre/lustre_fid.h>
-#include <linux/lustre/lustre_cfg.h>
-#include <json-c/json.h>
+#include <sys/file.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/sysinfo.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/xattr.h>
 #include "lipe_object_attrs.h"
 #include "lipe_version.h"
 #include "list.h"
@@ -869,6 +870,7 @@ lpurge_mirror_delete(const struct lu_fid *fid, unsigned int mirror_id)
 {
        char fid_buf[FID_LEN + 1];
        char vname_buf[PATH_MAX];
+       char lov_xattr_buf[XATTR_SIZE_MAX];
        struct ll_ioc_lease *lil = NULL;
        struct llapi_layout *layout = NULL;
        int mdt_index = -1;
@@ -876,6 +878,36 @@ lpurge_mirror_delete(const struct lu_fid *fid, unsigned int mirror_id)
        int vfd = -1;
        int rc;
 
+       /* Before we open the file and break any leases, fetch the
+        * layout and check to see if we would be trying to delete the
+        * last non stale mirror. To avoid races, we'll check again
+        * after we open the file. */
+
+       memset(lov_xattr_buf, 0, sizeof(lov_xattr_buf));
+       snprintf(lov_xattr_buf, sizeof(lov_xattr_buf), DFID, PFID(fid));
+
+       rc = ioctl(open_by_fid_fd, IOC_MDC_GETFILESTRIPE, lov_xattr_buf);
+       if (rc < 0) {
+               rc = -errno;
+               llapi_printf(LLAPI_MSG_DEBUG, "cannot IOC_MDC_GETFILESTRIPE "DFID", rc = %d\n",
+                            PFID(fid), rc);
+               goto out;
+       }
+
+       layout = llapi_layout_get_by_xattr(lov_xattr_buf, sizeof(lov_xattr_buf), 0);
+       if (layout == NULL) {
+               rc = -errno;
+               goto out;
+       }
+
+       if (last_non_stale_mirror(mirror_id, layout)) {
+               rc = -EUCLEAN;
+               goto out;
+       }
+
+       llapi_layout_free(layout);
+       layout = NULL;
+
        /* Inline replacement for
         * lfs mirror split -d --mirror-id mirror_id $MOUNTPOINT/.lustre/fid/FID
         */
old mode 100644 (file)
new mode 100755 (executable)
index 1e323cb..3d831b9
@@ -1899,6 +1899,97 @@ test_58() {
 }
 run_test 58 "replicaste DoM files"
 
+test_59() {
+       local td=$DIR/$tdir
+       local tf=$td/$tfile
+       local free_MB
+       local size_MB
+       local freehi=99
+       local freelo=96
+       local ids
+       local cmd
+       local pid
+
+       init_hot_pools_env
+
+       # start lamigo and register its shutdown via stack_trap
+       start_lamigo_service
+       check_lamigo_is_started || error "failed to start lamigo"
+       stack_trap stop_lamigo_service
+
+       # start lpurge with the freelo/freehi settings declared above
+       LPURGE_FREELO=$freelo LPURGE_FREEHI=$freehi start_lpurge_service
+       check_lpurge_is_started || error "failed to start lpurge"
+       stack_trap stop_lpurge_service
+
+       $LFS df -h
+       free_MB=$(($(lfs_df -p $LAMIGO_SRC $DIR |
+                       awk '/summary/{print $4}') / 1024))
+       size_MB=$((free_MB * (100 - freelo + 1) / 100))
+
+       # create a directory striped on the source pool and a regular file in it
+       mkdir $td || error "mkdir $td failed"
+       cmd="$LFS setstripe -c -1 -p $LAMIGO_SRC $td"
+       echo $cmd
+       $cmd || error "'$cmd' failed"
+
+       cmd="$MULTIOP $tf oO_CREAT:O_RDWR:eRE+eU"
+       echo $cmd
+       $cmd || error "'$cmd' failed"
+
+       cancel_lru_locks osc
+       cancel_lru_locks mdc
+       sleep $((LAMIGO_AGE * 2))
+
+       # verify the file was replicated to the target pool (2 mirrors expected)
+       $LFS getstripe $tf
+       verify_one_lamigo_param 0 replicated 1
+       verify_file_mirror $tf 2
+
+       # fill the file with size_MB MiB of data
+       yes "${size_MB}M file"|
+               dd bs=1M count=$size_MB iflag=fullblock of=$tf ||
+                       error "failed to write to $tf"
+
+       cmd="$MULTIOP $tf oO_RDWR:eR_E-eUc"
+       echo $cmd
+       $cmd &
+       pid=$!
+
+       echo "Before purging:"
+       $LFS df -h
+       $LFS getstripe $tf
+
+       sleep $((LPURGE_INTV * 2))
+
+       echo "After trying to purge:"
+       $LFS df -h
+       $LFS getstripe $tf
+
+       # since the mirror in the source pool is the last non-stale mirror, it
+       # cannot be purged because 'lfs mirror split' cannot get a WRITE lease.
+       ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+       verify_comp_attr lcme_flags $tf ${ids[0]} init,prefer
+       verify_comp_attr lcme_flags $tf ${ids[1]} init,stale
+
+       # signal multiop with USR1 to release the lease lock
+       kill -USR1 $pid && wait $pid || error "$MULTIOP failed"
+
+       wait_file_resync $tf
+
+       sleep $((LPURGE_INTV * 2))
+
+       # verify the source pool mirror was purged (one mirror left)
+       echo "After purging:"
+       wait_file_mirror $tf 1 900
+       $LFS df -h
+       $LFS getstripe $tf
+       verify_mirror_count $tf 1
+       ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+       verify_comp_attr pool $tf ${ids[${#ids[@]}-1]} $LAMIGO_TGT
+}
+run_test 59 "lpurge: check layout before opening"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status