Whamcloud - gitweb
LU-14111 obdclass: count eviction per obd_device
author: Aurelien Degremont <degremoa@amazon.com>
Tue, 13 Oct 2020 14:12:23 +0000 (14:12 +0000)
committer: Andreas Dilger <adilger@whamcloud.com>
Wed, 14 Feb 2024 19:19:00 +0000 (19:19 +0000)
Add a new 'obd_eviction_count' counter to obd_device that is
incremented every time a client is evicted, i.e. whenever an export
is failed in `class_fail_export()` or marked failed in
`class_disconnect_stale_exports()`.

Expose this counter through `lctl get_param *.*.eviction_count`
for every target.

Run recovery-small test 146 only when the MDS is version 2.14.0.133
or later; it is skipped on older servers, which lack eviction_count.

Lustre-change: https://review.whamcloud.com/40528
Lustre-commit: 3c69d46e1766480c0ffd1bef840b4e167b4cf88e

Lustre-change: https://review.whamcloud.com/52098
Lustre-commit: b034dd27dd39483e40f91ea82d3f5c62b514ec54

Signed-off-by: Aurelien Degremont <degremoa@amazon.com>
Change-Id: I83b691662285cf2cd937187bffa54de6bd1f694c
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Shaun Tancheff <shaun.tancheff@hpe.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53897
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Andreas Dilger <adilger@whamcloud.com>
lustre/include/lprocfs_status.h
lustre/include/obd.h
lustre/mdt/mdt_lproc.c
lustre/mgs/lproc_mgs.c
lustre/obdclass/genops.c
lustre/obdclass/lprocfs_status_server.c
lustre/ofd/lproc_ofd.c
lustre/tests/recovery-small.sh

index 4d7397f..c587b05 100644 (file)
@@ -599,6 +599,8 @@ extern int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data);
 #ifdef HAVE_SERVER_SUPPORT
 ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr,
                         char *buf);
+ssize_t eviction_count_show(struct kobject *kobj, struct attribute *attr,
+                           char *buf);
 #endif
 struct adaptive_timeout;
 extern int lprocfs_at_hist_helper(struct seq_file *m,
index d0b4a27..8031bf8 100644 (file)
@@ -769,6 +769,7 @@ struct obd_device {
        atomic_t                obd_evict_inprogress;
        wait_queue_head_t       obd_evict_inprogress_waitq;
        struct list_head        obd_evict_list; /* protected with pet_lock */
+       atomic_t                obd_eviction_count;
 
        /**
         * LDLM pool part. Save last calculated SLV and Limit.
index 675abd7..12f9b18 100644 (file)
@@ -1379,6 +1379,7 @@ LUSTRE_RW_ATTR(grant_compat_disable);
 LUSTRE_RO_ATTR(instance);
 
 LUSTRE_RO_ATTR(num_exports);
+LUSTRE_RO_ATTR(eviction_count);
 
 static struct attribute *mdt_attrs[] = {
        &lustre_attr_tot_dirty.attr,
@@ -1390,6 +1391,7 @@ static struct attribute *mdt_attrs[] = {
        &lustre_attr_recovery_time_soft.attr,
        &lustre_attr_ir_factor.attr,
        &lustre_attr_num_exports.attr,
+       &lustre_attr_eviction_count.attr,
        &lustre_attr_identity_expire.attr,
        &lustre_attr_identity_acquire_expire.attr,
        &lustre_attr_identity_upcall.attr,
index 4ea2973..d645ac4 100644 (file)
@@ -217,6 +217,7 @@ static struct lprocfs_vars lprocfs_mgs_obd_vars[] = {
 };
 
 LUSTRE_RO_ATTR(num_exports);
+LUSTRE_RO_ATTR(eviction_count);
 
 static ssize_t fstype_show(struct kobject *kobj, struct attribute *attr,
                           char *buf)
@@ -253,6 +254,7 @@ LUSTRE_RO_ATTR(mntdev);
 static struct attribute *mgs_attrs[] = {
        &lustre_attr_fstype.attr,
        &lustre_attr_mntdev.attr,
+       &lustre_attr_eviction_count.attr,
        &lustre_attr_num_exports.attr,
        NULL,
 };
index b6ebc82..245235b 100644 (file)
@@ -1566,6 +1566,7 @@ void class_disconnect_stale_exports(struct obd_device *obd,
                        continue;
                }
                exp->exp_failed = 1;
+               atomic_inc(&exp->exp_obd->obd_eviction_count);
                spin_unlock(&exp->exp_lock);
 
                list_move(&exp->exp_obd_chain, &work_list);
@@ -1602,6 +1603,8 @@ void class_fail_export(struct obd_export *exp)
                 return;
         }
 
+       atomic_inc(&exp->exp_obd->obd_eviction_count);
+
         CDEBUG(D_HA, "disconnecting export %p/%s\n",
                exp, exp->exp_client_uuid.uuid);
 
index 186e359..5cef66d 100644 (file)
@@ -144,6 +144,17 @@ EXPORT_SYMBOL(lprocfs_evict_client_seq_write);
 
 #undef BUFLEN
 
+ssize_t eviction_count_show(struct kobject *kobj, struct attribute *attr,
+                        char *buf)
+{
+       struct obd_device *obd = container_of(kobj, struct obd_device,
+                                             obd_kset.kobj);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n",
+                        atomic_read(&obd->obd_eviction_count));
+}
+EXPORT_SYMBOL(eviction_count_show);
+
 ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr,
                         char *buf)
 {
index f9fafc0..7dd9eb7 100644 (file)
@@ -1087,6 +1087,7 @@ LUSTRE_RW_ATTR(grant_compat_disable);
 LUSTRE_RO_ATTR(instance);
 
 LUSTRE_RO_ATTR(num_exports);
+LUSTRE_RO_ATTR(eviction_count);
 
 struct lprocfs_vars lprocfs_ofd_obd_vars[] = {
        { .name =       "last_id",
@@ -1176,6 +1177,7 @@ static struct attribute *ofd_attrs[] = {
        &lustre_attr_no_precreate.attr,
 #endif
        &lustre_attr_num_exports.attr,
+       &lustre_attr_eviction_count.attr,
        &lustre_attr_precreate_batch.attr,
        &lustre_attr_recovery_time_hard.attr,
        &lustre_attr_recovery_time_soft.attr,
index a858dd9..5ab0e26 100755 (executable)
@@ -3204,6 +3204,25 @@ $(do_facet mds1 $LCTL get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard)
 }
 run_test 145 "connect mdtlovs and process update logs after recovery expire"
 
+test_146() {
+       (( $MDS1_VERSION >= $(version_code 2.14.0.133) )) ||
+               skip "Need MDS >= v2.14.0.133 for eviction_count"
+
+       local prev_count=$(do_facet $SINGLEMDS \
+               $LCTL get_param -n "mdt.${mds1_svc}.eviction_count")
+
+       mds_evict_client
+
+       client_reconnect
+
+       local next_count=$(do_facet $SINGLEMDS \
+               $LCTL get_param -n "mdt.${mds1_svc}.eviction_count")
+
+       [ "$prev_count" -lt "$next_count" ] ||
+               error "wrong eviction count ($prev_count >= $next_count)"
+}
+run_test 146 "test eviction is counted properly"
+
 test_147() {
        local obd_timeout=200
        local old=$($LCTL get_param -n timeout)