Whamcloud - gitweb
LU-11227 lod: lod_sync: don't attempt sync to inactive targets 64/32964/5
author Robin Humble <plaguedbypenguins@gmail.com>
Thu, 9 Aug 2018 05:33:04 +0000 (15:33 +1000)
committer Oleg Drokin <green@whamcloud.com>
Thu, 23 Aug 2018 07:18:48 +0000 (07:18 +0000)
chgrp on a client triggers lod_sync(), which in turn loops over OST/MDT
targets with dt_sync(). dt_sync() fails with -ENOTCONN when targets
have been deactivated (i.e. set to active=0). The client retries
infinitely, causing the client process to hang and considerable MDS
network traffic, load, and disk I/O.

The fix is to not attempt dt_sync() on OST/MDT targets that have been
deactivated and also (because of possible races) to ignore connection
errors (-ENOTCONN).

Tested with Lustre 2.10.4.

Signed-off-by: Robin Humble <plaguedbypenguins@gmail.com>
Change-Id: I617509cf7944541489f4fd9762c233b771132165
Reviewed-on: https://review.whamcloud.com/32964
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: John L. Hammond <jhammond@whamcloud.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/lod/lod_dev.c

index be0d7bd..8d94449 100644 (file)
@@ -1409,11 +1409,16 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev)
        lod_foreach_ost(lod, i) {
                ost = OST_TGT(lod, i);
                LASSERT(ost && ost->ltd_ost);
+               if (!ost->ltd_active)
+                       continue;
                rc = dt_sync(env, ost->ltd_ost);
                if (rc) {
-                       CERROR("%s: can't sync ost %u: %d\n",
-                              lod2obd(lod)->obd_name, i, rc);
-                       break;
+                       if (rc != -ENOTCONN) {
+                               CERROR("%s: can't sync ost %u: %d\n",
+                                      lod2obd(lod)->obd_name, i, rc);
+                               break;
+                       }
+                       rc = 0;
                }
        }
        lod_putref(lod, &lod->lod_ost_descs);
@@ -1425,11 +1430,16 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev)
        lod_foreach_mdt(lod, i) {
                mdt = MDT_TGT(lod, i);
                LASSERT(mdt && mdt->ltd_mdt);
+               if (!mdt->ltd_active)
+                       continue;
                rc = dt_sync(env, mdt->ltd_mdt);
                if (rc) {
-                       CERROR("%s: can't sync mdt %u: %d\n",
-                              lod2obd(lod)->obd_name, i, rc);
-                       break;
+                       if (rc != -ENOTCONN) {
+                               CERROR("%s: can't sync mdt %u: %d\n",
+                                      lod2obd(lod)->obd_name, i, rc);
+                               break;
+                       }
+                       rc = 0;
                }
        }
        lod_putref(lod, &lod->lod_mdt_descs);