Whamcloud - gitweb
LU-17649 ptlrpc: fix -EACCES connection error handling 48/54448/13
authorMikhail Pershin <mpershin@whamcloud.com>
Mon, 18 Mar 2024 15:37:02 +0000 (18:37 +0300)
committerOleg Drokin <green@whamcloud.com>
Wed, 29 May 2024 04:45:57 +0000 (04:45 +0000)
Connection errors -EACCES and -EROFS leave the import in an
intermediate state. The import is still active, and so is the
pinger over it, but obd_no_recov is set. That allows the import
to recover after all if the server security is updated. But even
in the FULL state any RPC over the import gets -ESHUTDOWN,
because obd_no_recov is set.

Meanwhile, obd_no_recov is not supposed to be used in that
way: it reflects a particular mount option and should never
be recovered. So the patch sets the import to the deactivated
state (imp_deactive) instead, which also makes the import
non-operational but leaves the option to activate it manually
or to remount.

Server connections such as LWP, MDT-OST and MDT-MDT are
excluded and are never deactivated. For them such errors are
considered temporary, lasting until the remote target updates
its own security as required or until administrative
intervention restarts the target as needed.

In both cases a console message is issued.

Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: Ib83e1b0ac541823ec236591f08145340d6f6bf04
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54448
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Aurelien Degremont <adegremont@nvidia.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/include/obd_support.h
lustre/ptlrpc/import.c
lustre/target/tgt_handler.c
lustre/tests/recovery-small.sh

index 816c2f7..623215c 100644 (file)
@@ -774,6 +774,7 @@ extern bool obd_enable_health_write;
 
 /* continuation of MDS related constants */
 #define OBD_FAIL_MDS_PAUSE_CREATE_AFTER_LOOKUP 0x2401
+#define OBD_FAIL_MDS_CONNECT_ACCESS            0x2402
 
 /* PLEASE, KEEP NUMBERS UP TO 0x3000 RESERVED FOR OBD_FAIL_MDS_* */
 
index 496458c..02a8adc 100644 (file)
@@ -1350,13 +1350,29 @@ out:
                import_set_state_nolock(imp, LUSTRE_IMP_DISCON);
                if (rc == -EACCES || rc == -EROFS) {
                        /*
-                        * Give up trying to reconnect
-                        * EACCES means client has no permission for connection
-                        * EROFS means client must mount read-only
+                        * -EACCES means client has no permission for connection
+                        * -EROFS means client must mount read-only
+                        *  Client deactivates import, it can be activated back
+                        *  manually when issue is resolved
+                        *  Server keeps trying forever until reconfigured or
+                        *  unmounted
                         */
-                       imp->imp_obd->obd_no_recov = 1;
-                       ptlrpc_deactivate_import_nolock(imp);
-                       inact = true;
+                       LCONSOLE_WARN("%s: connection denied by %s: rc = %d\n",
+                                     imp->imp_obd->obd_name,
+                                     obd2cli_tgt(imp->imp_obd), rc);
+                       if (imp->imp_connect_flags_orig &
+                           (OBD_CONNECT_LIGHTWEIGHT |
+                            OBD_CONNECT_MDS_MDS | OBD_CONNECT_MDS)) {
+                               /* consider LWP, MDT-MDT and MDT-OST access
+                                * errors as temporary
+                                */
+                               rc = -EAGAIN;
+                               imp->imp_force_reconnect = 1;
+                       } else {
+                               imp->imp_deactive = 1;
+                               ptlrpc_deactivate_import_nolock(imp);
+                               inact = true;
+                       }
                } else if (rc == -EPROTO) {
                        struct obd_connect_data *ocd;
 
@@ -1383,6 +1399,7 @@ out:
                                                   OBD_OCD_VERSION_PATCH(ocd->ocd_version),
                                                   OBD_OCD_VERSION_FIX(ocd->ocd_version),
                                                   LUSTRE_VERSION_STRING);
+                               imp->imp_deactive = 1;
                                ptlrpc_deactivate_import_nolock(imp);
                                import_set_state_nolock(imp, LUSTRE_IMP_CLOSED);
                                inact = true;
@@ -1411,8 +1428,18 @@ out:
                               (request->rq_deadline - request->rq_sent);
                spin_unlock(&imp->imp_lock);
 
-               if (inact)
+               if (inact) {
+                       /* imp_deactive event */
+                       obd_import_event(imp->imp_obd, imp,
+                                        IMP_EVENT_DEACTIVATE);
+                       /* imp_invalid event */
                        obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
+                       /* evict and invalidate if reconnect */
+                       if (!aa->pcaa_initial_connect) {
+                               import_set_state(imp, LUSTRE_IMP_EVICTED);
+                               ptlrpc_import_recovery_state_machine(imp);
+                       }
+               }
 
                if (rc == -EPROTO)
                        RETURN(rc);
index a970d04..1d5cb42 100644 (file)
@@ -1061,6 +1061,9 @@ int tgt_connect(struct tgt_session_info *tsi)
                   LUSTRE_MDT_NAME) == 0) {
                struct lu_nodemap *nm = NULL;
 
+               if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CONNECT_ACCESS))
+                       GOTO(out, rc = -EACCES);
+
                rc = req_check_sepol(tsi->tsi_pill);
                if (rc)
                        GOTO(out, rc);
index 41859b9..37d70b1 100755 (executable)
@@ -3633,6 +3633,47 @@ test_157()
 }
 run_test 157 "eviction during mmaped i/o"
 
+# Cleanup handler for test_158a: clear fail_loc on mds2, unmount $MOUNT on
+# all $CLIENTS and call mountcli (presumably to mount the client again with
+# a fresh import - confirm against test-framework.sh).
+cleanup_158() {
+       do_facet mds2 $LCTL set_param fail_loc=0
+       zconf_umount_clients $CLIENTS $MOUNT
+       mountcli
+}
+
+# Check -EACCES handling on reconnect: with OBD_FAIL_MDS_CONNECT_ACCESS set on
+# mds2 the client's MDT0001 mdc import must end up inactive, and it must
+# become active and operational again after a manual 'lctl activate'.
+test_158a() {
+       (( $MDS1_VERSION >= $(version_code 2.15.62) )) ||
+               skip "Need MDS version at least 2.15.62 for -EACCES fix"
+       (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTS"
+
+       remount_client $MOUNT
+       # ensure there is no MOUNT2 on clients
+       zconf_umount_clients $CLIENTS $MOUNT2 -f
+
+       stack_trap cleanup_158 EXIT RETURN
+
+       # client import is evicted after failover followed by -EACCES
+       #define OBD_FAIL_MDS_CONNECT_ACCESS     0x2402
+       do_facet mds2 $LCTL set_param fail_loc=0x2402
+       fail_nodf mds2
+       sleep 5
+       $LFS df $MOUNT
+       # clear the fail_loc so that later connections are accepted again
+       do_facet mds2 $LCTL set_param fail_loc=0
+       wait_recovery_complete mds2 || error "MDS recovery not done"
+       sleep 5
+
+       # device name is the 4th column of the matching 'lctl dl' line
+       local mdc2=$($LCTL dl | awk '/mdc.*MDT0001-mdc*/ { print $4}')
+       local active=$($LCTL get_param -n mdc.$mdc2.active)
+       (( active == 0 )) || error "import status is 'active'"
+       $LCTL --device $mdc2 activate
+       active=$($LCTL get_param -n mdc.$mdc2.active)
+       (( active == 1 )) || error "import status is not 'active'"
+       # keep 'status' local like mdc2/active so it does not leak into
+       # the global scope shared by the other tests in this script
+       local status=$($LFS check mdts | grep $mdc2 | grep -c active)
+       (( status == 1 )) || {
+               $LFS check mdts
+               error "import is not operational"
+       }
+}
+run_test 158a "connect without access right"
+
 complete_test $SECONDS
 check_and_cleanup_lustre
 exit_status