Whamcloud - gitweb
LU-17744 ldiskfs: mballoc stats fixes
[fs/lustre-release.git] / lustre / ldlm / ldlm_lib.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2010, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  */
31
32 /**
33  * This file deals with various client/target related logic including recovery.
34  *
35  * TODO: This code more logically belongs in the ptlrpc module than in ldlm and
36  * should be moved.
37  */
38
39 #define DEBUG_SUBSYSTEM S_LDLM
40
41 #include <cl_object.h>
42 #include <linux/fs_struct.h>
43 #include <linux/jiffies.h>
44 #include <linux/kernel.h>
45 #include <linux/kthread.h>
46 #include <libcfs/libcfs.h>
47 #include <obd.h>
48 #include <obd_class.h>
49 #include <lustre_dlm.h>
50 #include <lustre_net.h>
51 #include <lustre_sec.h>
52 #include <uapi/linux/lustre/lustre_ioctl.h>
53 #include "ldlm_internal.h"
54
55 /*
56  * @priority: If non-zero, move the selected connection to the list head.
57  * @create: If zero, only search in existing connections.
58  */
59 static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid,
60                            int priority, int create)
61 {
62         struct ptlrpc_connection *ptlrpc_conn;
63         struct obd_import_conn *imp_conn = NULL, *item;
64         u32 refnet = imp->imp_conn_restricted_net;
65         int rc = 0;
66
67         ENTRY;
68
69         if (!create && !priority) {
70                 CDEBUG(D_HA, "Nothing to do\n");
71                 RETURN(-EINVAL);
72         }
73
74         /* refnet is used to restrict network connections */
75         if (refnet != LNET_NET_ANY)
76                 CDEBUG(D_HA, "imp %s: restrict %s to %s net\n",
77                        imp->imp_obd->obd_name, uuid->uuid,
78                        libcfs_net2str(refnet));
79
80         ptlrpc_conn = ptlrpc_uuid_to_connection(uuid, refnet);
81         if (!ptlrpc_conn) {
82                 CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid);
83                 RETURN(-ENOENT);
84         }
85
86         if (create) {
87                 OBD_ALLOC(imp_conn, sizeof(*imp_conn));
88                 if (!imp_conn)
89                         GOTO(out_put, rc = -ENOMEM);
90         }
91
92         spin_lock(&imp->imp_lock);
93         list_for_each_entry(item, &imp->imp_conn_list, oic_item) {
94                 if (obd_uuid_equals(uuid, &item->oic_uuid)) {
95                         if (priority) {
96                                 list_move(&item->oic_item,
97                                           &imp->imp_conn_list);
98                                 item->oic_last_attempt = 0;
99                         }
100                         CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n",
101                                imp, imp->imp_obd->obd_name, uuid->uuid,
102                                (priority ? ", moved to head" : ""));
103                         spin_unlock(&imp->imp_lock);
104                         GOTO(out_free, rc = 0);
105                 }
106         }
107         /* No existing import connection found for \a uuid. */
108         if (create) {
109                 imp_conn->oic_conn = ptlrpc_conn;
110                 imp_conn->oic_uuid = *uuid;
111                 imp_conn->oic_last_attempt = 0;
112                 if (priority)
113                         list_add(&imp_conn->oic_item, &imp->imp_conn_list);
114                 else
115                         list_add_tail(&imp_conn->oic_item,
116                                       &imp->imp_conn_list);
117                 CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n",
118                        imp, imp->imp_obd->obd_name, uuid->uuid,
119                        (priority ? "head" : "tail"));
120         } else {
121                 spin_unlock(&imp->imp_lock);
122                 GOTO(out_free, rc = -ENOENT);
123         }
124
125         spin_unlock(&imp->imp_lock);
126         RETURN(0);
127 out_free:
128         if (imp_conn)
129                 OBD_FREE(imp_conn, sizeof(*imp_conn));
130 out_put:
131         ptlrpc_connection_put(ptlrpc_conn);
132         RETURN(rc);
133 }
134
/*
 * Move the existing connection matching @uuid to the head of @imp's
 * connection list. No new entry is created; import_set_conn() reports
 * -ENOENT when the UUID is not already on the list.
 */
int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid)
{
        const int priority = 1;
        const int create = 0;

        return import_set_conn(imp, uuid, priority, create);
}
139
/*
 * Add a connection for @uuid to @imp, creating the list entry if it does
 * not exist yet. A non-zero @priority places (or moves) the entry at the
 * head of the connection list.
 */
int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
                           int priority)
{
        const int create = 1;

        return import_set_conn(imp, uuid, priority, create);
}
EXPORT_SYMBOL(client_import_add_conn);
146
147 int client_import_dyn_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
148                                struct lnet_nid *prim_nid, int priority)
149 {
150         struct ptlrpc_connection *ptlrpc_conn;
151         int rc;
152
153         ptlrpc_conn = ptlrpc_uuid_to_connection(uuid, LNET_NID_NET(prim_nid));
154         if (!ptlrpc_conn) {
155                 const char *str_uuid = obd_uuid2str(uuid);
156
157                 rc = class_add_uuid(str_uuid, prim_nid);
158                 if (rc) {
159                         CERROR("%s: failed to add UUID '%s': rc = %d\n",
160                                imp->imp_obd->obd_name, str_uuid, rc);
161                         return rc;
162                 }
163         }
164         return import_set_conn(imp, uuid, priority, 1);
165 }
166 EXPORT_SYMBOL(client_import_dyn_add_conn);
167
168 int client_import_add_nids_to_conn(struct obd_import *imp,
169                                    struct lnet_nid *nidlist,
170                                    int nid_count, int nid_size,
171                                    struct obd_uuid *uuid)
172 {
173         struct obd_import_conn *conn;
174         int rc = -ENOENT;
175
176         ENTRY;
177         if (nid_count <= 0 || !nidlist)
178                 return rc;
179
180         spin_lock(&imp->imp_lock);
181         list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
182                 if (class_check_uuid(&conn->oic_uuid, &nidlist[0])) {
183                         *uuid = conn->oic_uuid;
184                         spin_unlock(&imp->imp_lock);
185                         rc = class_add_nids_to_uuid(&conn->oic_uuid, nidlist,
186                                                     nid_count, nid_size);
187                         RETURN(rc);
188                 }
189         }
190         spin_unlock(&imp->imp_lock);
191         RETURN(rc);
192 }
193 EXPORT_SYMBOL(client_import_add_nids_to_conn);
194
195 int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
196 {
197         struct obd_import_conn *imp_conn;
198         struct obd_export *dlmexp;
199         int rc = -ENOENT;
200
201         ENTRY;
202
203         spin_lock(&imp->imp_lock);
204         if (list_empty(&imp->imp_conn_list)) {
205                 LASSERT(!imp->imp_connection);
206                 GOTO(out, rc);
207         }
208
209         list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) {
210                 if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid))
211                         continue;
212                 LASSERT(imp_conn->oic_conn);
213
214                 if (imp_conn == imp->imp_conn_current) {
215                         LASSERT(imp_conn->oic_conn == imp->imp_connection);
216
217                         if (imp->imp_state != LUSTRE_IMP_CLOSED &&
218                             imp->imp_state != LUSTRE_IMP_DISCON) {
219                                 CERROR("can't remove current connection\n");
220                                 GOTO(out, rc = -EBUSY);
221                         }
222
223                         ptlrpc_connection_put(imp->imp_connection);
224                         imp->imp_connection = NULL;
225
226                         dlmexp = class_conn2export(&imp->imp_dlm_handle);
227                         if (dlmexp && dlmexp->exp_connection) {
228                                 LASSERT(dlmexp->exp_connection ==
229                                         imp_conn->oic_conn);
230                                 ptlrpc_connection_put(dlmexp->exp_connection);
231                                 dlmexp->exp_connection = NULL;
232                         }
233
234                         if (dlmexp != NULL)
235                                 class_export_put(dlmexp);
236                 }
237
238                 list_del(&imp_conn->oic_item);
239                 ptlrpc_connection_put(imp_conn->oic_conn);
240                 OBD_FREE(imp_conn, sizeof(*imp_conn));
241                 CDEBUG(D_HA, "imp %p@%s: remove connection %s\n",
242                        imp, imp->imp_obd->obd_name, uuid->uuid);
243                 rc = 0;
244                 break;
245         }
246 out:
247         spin_unlock(&imp->imp_lock);
248         if (rc == -ENOENT)
249                 CERROR("connection %s not found\n", uuid->uuid);
250         RETURN(rc);
251 }
252 EXPORT_SYMBOL(client_import_del_conn);
253
/*
 * Destroy a client import and release its security policy instance.
 *
 * An extra reference is taken before class_destroy_import() and dropped
 * only after sptlrpc_import_sec_put(), so this function holds its own
 * reference on @imp across both calls.
 */
void client_destroy_import(struct obd_import *imp)
{
        /*
         * Drop security policy instance after all RPCs have finished/aborted
         * to let all busy contexts be released.
         */
        class_import_get(imp);
        class_destroy_import(imp);
        sptlrpc_import_sec_put(imp);
        class_import_put(imp);
}
EXPORT_SYMBOL(client_destroy_import);
266
/**
 * Tell whether an OSC device lives on an MDT, judging by its name.
 *
 * In the config log the setup records look like:
 * osc on MDT
 *      setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID
 * osc on client
 *      setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID
 *
 * Only the component after the last '-' is examined.
 *
 * \retval 1 if the text after the last '-' starts with "MDT"
 * \retval 0 otherwise, including when the name contains no '-'
 **/
static int osc_on_mdt(char *obdname)
{
        const char *last_dash = strrchr(obdname, '-');

        return last_dash != NULL && strncmp(last_dash + 1, "MDT", 3) == 0;
}
289
290 /*
291  * Configure an RPC client OBD device.
292  *
293  * lcfg parameters:
294  * 1 - client UUID
295  * 2 - server UUID
296  * 3 - inactive-on-startup
297  * 4 - restrictive net
298  */
299 int client_obd_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
300 {
301         struct client_obd *cli = &obd->u.cli;
302         struct obd_import *imp;
303         struct obd_uuid server_uuid;
304         int rq_portal, rp_portal, connect_op;
305         const char *name = obd->obd_type->typ_name;
306         enum ldlm_ns_type ns_type = LDLM_NS_TYPE_UNKNOWN;
307         char *cli_name = lustre_cfg_buf(lcfg, 0);
308         int rc;
309
310         ENTRY;
311
312         /*
313          * In a more perfect world, we would hang a ptlrpc_client off of
314          * obd_type and just use the values from there.
315          */
316         if (!strcmp(name, LUSTRE_OSC_NAME)) {
317                 rq_portal = OST_REQUEST_PORTAL;
318                 rp_portal = OSC_REPLY_PORTAL;
319                 connect_op = OST_CONNECT;
320                 cli->cl_sp_me = LUSTRE_SP_CLI;
321                 cli->cl_sp_to = LUSTRE_SP_OST;
322                 ns_type = LDLM_NS_TYPE_OSC;
323         } else if (!strcmp(name, LUSTRE_MDC_NAME) ||
324                    !strcmp(name, LUSTRE_LWP_NAME)) {
325                 rq_portal = MDS_REQUEST_PORTAL;
326                 rp_portal = MDC_REPLY_PORTAL;
327                 connect_op = MDS_CONNECT;
328                 if (is_lwp_on_ost(cli_name))
329                         cli->cl_sp_me = LUSTRE_SP_OST;
330                 else if (is_lwp_on_mdt(cli_name))
331                         cli->cl_sp_me = LUSTRE_SP_MDT;
332                 else
333                         cli->cl_sp_me = LUSTRE_SP_CLI;
334                 cli->cl_sp_to = LUSTRE_SP_MDT;
335                 ns_type = LDLM_NS_TYPE_MDC;
336         } else if (!strcmp(name, LUSTRE_OSP_NAME)) {
337                 if (strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL) {
338                         /* OSP_on_MDT for other MDTs */
339                         connect_op = MDS_CONNECT;
340                         cli->cl_sp_to = LUSTRE_SP_MDT;
341                         ns_type = LDLM_NS_TYPE_MDC;
342                         rq_portal = OUT_PORTAL;
343                 } else {
344                         /* OSP on MDT for OST */
345                         connect_op = OST_CONNECT;
346                         cli->cl_sp_to = LUSTRE_SP_OST;
347                         ns_type = LDLM_NS_TYPE_OSC;
348                         rq_portal = OST_REQUEST_PORTAL;
349                 }
350                 rp_portal = OSC_REPLY_PORTAL;
351                 cli->cl_sp_me = LUSTRE_SP_MDT;
352         } else if (!strcmp(name, LUSTRE_MGC_NAME)) {
353                 rq_portal = MGS_REQUEST_PORTAL;
354                 rp_portal = MGC_REPLY_PORTAL;
355                 connect_op = MGS_CONNECT;
356                 cli->cl_sp_me = LUSTRE_SP_MGC;
357                 cli->cl_sp_to = LUSTRE_SP_MGS;
358                 cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID;
359                 ns_type = LDLM_NS_TYPE_MGC;
360         } else {
361                 CERROR("unknown client OBD type \"%s\", can't setup\n",
362                        name);
363                 RETURN(-EINVAL);
364         }
365
366         if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
367                 CERROR("requires a TARGET UUID\n");
368                 RETURN(-EINVAL);
369         }
370
371         if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) {
372                 CERROR("client UUID must be less than 38 characters\n");
373                 RETURN(-EINVAL);
374         }
375
376         if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) {
377                 CERROR("setup requires a SERVER UUID\n");
378                 RETURN(-EINVAL);
379         }
380
381         if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) {
382                 CERROR("target UUID must be less than 38 characters\n");
383                 RETURN(-EINVAL);
384         }
385
386         init_rwsem(&cli->cl_sem);
387         mutex_init(&cli->cl_mgc_mutex);
388         cli->cl_seq = NULL;
389         init_rwsem(&cli->cl_seq_rwsem);
390         cli->cl_conn_count = 0;
391         memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2),
392                min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2),
393                      sizeof(server_uuid)));
394
395         cli->cl_dirty_pages = 0;
396         cli->cl_dirty_max_pages = 0;
397         cli->cl_avail_grant = 0;
398         /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */
399         /*
400          * cl_dirty_max_pages may be changed at connect time in
401          * ptlrpc_connect_interpret().
402          */
403         client_adjust_max_dirty(cli);
404         init_waitqueue_head(&cli->cl_cache_waiters);
405         INIT_LIST_HEAD(&cli->cl_loi_ready_list);
406         INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
407         INIT_LIST_HEAD(&cli->cl_loi_write_list);
408         INIT_LIST_HEAD(&cli->cl_loi_read_list);
409         spin_lock_init(&cli->cl_loi_list_lock);
410         atomic_set(&cli->cl_pending_w_pages, 0);
411         atomic_set(&cli->cl_pending_r_pages, 0);
412         cli->cl_r_in_flight = 0;
413         cli->cl_w_in_flight = 0;
414
415         cli->cl_stats_init = ktime_get_real();
416         spin_lock_init(&cli->cl_read_rpc_hist.oh_lock);
417         spin_lock_init(&cli->cl_write_rpc_hist.oh_lock);
418         spin_lock_init(&cli->cl_read_page_hist.oh_lock);
419         spin_lock_init(&cli->cl_write_page_hist.oh_lock);
420         spin_lock_init(&cli->cl_read_offset_hist.oh_lock);
421         spin_lock_init(&cli->cl_write_offset_hist.oh_lock);
422         spin_lock_init(&cli->cl_batch_rpc_hist.oh_lock);
423
424         /* lru for osc. */
425         INIT_LIST_HEAD(&cli->cl_lru_osc);
426         atomic_set(&cli->cl_lru_shrinkers, 0);
427         atomic_long_set(&cli->cl_lru_busy, 0);
428         atomic_long_set(&cli->cl_lru_in_list, 0);
429         INIT_LIST_HEAD(&cli->cl_lru_list);
430         spin_lock_init(&cli->cl_lru_list_lock);
431         atomic_long_set(&cli->cl_unstable_count, 0);
432         INIT_LIST_HEAD(&cli->cl_shrink_list);
433         INIT_LIST_HEAD(&cli->cl_grant_chain);
434
435         INIT_LIST_HEAD(&cli->cl_flight_waiters);
436         cli->cl_rpcs_in_flight = 0;
437
438         init_waitqueue_head(&cli->cl_destroy_waitq);
439         atomic_set(&cli->cl_destroy_in_flight, 0);
440
441
442         cli->cl_supp_cksum_types = OBD_CKSUM_CRC32;
443         cli->cl_preferred_cksum_type = 0;
444 #ifdef CONFIG_ENABLE_CHECKSUM
445         /* Turn on checksumming by default. */
446         cli->cl_checksum = 1;
447         /*
448          * The supported checksum types will be worked out at connect time
449          * Set cl_chksum* to CRC32 for now to avoid returning screwed info
450          * through procfs.
451          */
452         cli->cl_cksum_type = cli->cl_supp_cksum_types;
453 #endif
454         atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS);
455
456         /*
457          * Set it to possible maximum size. It may be reduced by ocd_brw_size
458          * from OFD after connecting.
459          */
460         cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES;
461
462         cli->cl_max_short_io_bytes = OBD_DEF_SHORT_IO_BYTES;
463
464         /*
465          * set cl_chunkbits default value to PAGE_SHIFT,
466          * it will be updated at OSC connection time.
467          */
468         cli->cl_chunkbits = PAGE_SHIFT;
469
470         if (!strcmp(name, LUSTRE_MDC_NAME)) {
471                 cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT;
472         } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 128 /* MB */) {
473                 cli->cl_max_rpcs_in_flight = 2;
474         } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 256 /* MB */) {
475                 cli->cl_max_rpcs_in_flight = 3;
476         } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 512 /* MB */) {
477                 cli->cl_max_rpcs_in_flight = 4;
478         } else {
479                 if (osc_on_mdt(obd->obd_name))
480                         cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX;
481                 else
482                         cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT;
483         }
484
485         spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock);
486         cli->cl_max_mod_rpcs_in_flight = 0;
487         cli->cl_mod_rpcs_in_flight = 0;
488         cli->cl_close_rpcs_in_flight = 0;
489         init_waitqueue_head(&cli->cl_mod_rpcs_waitq);
490         cli->cl_mod_rpcs_init = ktime_get_real();
491         cli->cl_mod_tag_bitmap = NULL;
492
493         INIT_LIST_HEAD(&cli->cl_chg_dev_linkage);
494
495         if (connect_op == MDS_CONNECT) {
496                 cli->cl_max_mod_rpcs_in_flight = cli->cl_max_rpcs_in_flight - 1;
497                 OBD_ALLOC(cli->cl_mod_tag_bitmap,
498                           BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long));
499                 if (cli->cl_mod_tag_bitmap == NULL)
500                         GOTO(err, rc = -ENOMEM);
501         }
502
503         rc = ldlm_get_ref();
504         if (rc) {
505                 CERROR("ldlm_get_ref failed: %d\n", rc);
506                 GOTO(err, rc);
507         }
508
509         ptlrpc_init_client(rq_portal, rp_portal, name,
510                            &obd->obd_ldlm_client);
511
512         imp = class_new_import(obd);
513         if (imp == NULL)
514                 GOTO(err_ldlm, rc = -ENOENT);
515         imp->imp_client = &obd->obd_ldlm_client;
516         imp->imp_connect_op = connect_op;
517         memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
518                LUSTRE_CFG_BUFLEN(lcfg, 1));
519         class_import_put(imp);
520
521         if (lustre_cfg_buf(lcfg, 4)) {
522                 __u32 refnet = libcfs_str2net(lustre_cfg_string(lcfg, 4));
523
524                 if (refnet == LNET_NET_ANY) {
525                         rc = -EINVAL;
526                         CERROR("%s: bad mount option 'network=%s': rc = %d\n",
527                                obd->obd_name, lustre_cfg_string(lcfg, 4),
528                                rc);
529                         GOTO(err_import, rc);
530                 }
531                 imp->imp_conn_restricted_net = refnet;
532         } else {
533                 imp->imp_conn_restricted_net = LNET_NET_ANY;
534         }
535
536         rc = client_import_add_conn(imp, &server_uuid, 1);
537         if (rc) {
538                 CERROR("can't add initial connection\n");
539                 GOTO(err_import, rc);
540         }
541         imp->imp_connection = NULL;
542
543         cli->cl_import = imp;
544         /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */
545         cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
546
547         if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
548                 if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
549                         CDEBUG(D_HA, "marking %s %s->%s as inactive\n",
550                                name, obd->obd_name,
551                                cli->cl_target_uuid.uuid);
552                         spin_lock(&imp->imp_lock);
553                         imp->imp_deactive = 1;
554                         spin_unlock(&imp->imp_lock);
555                 }
556         }
557
558         obd->obd_namespace = ldlm_namespace_new(obd, obd->obd_name,
559                                                 LDLM_NAMESPACE_CLIENT,
560                                                 LDLM_NAMESPACE_GREEDY,
561                                                 ns_type);
562         if (IS_ERR(obd->obd_namespace)) {
563                 rc = PTR_ERR(obd->obd_namespace);
564                 CERROR("%s: unable to create client namespace: rc = %d\n",
565                        obd->obd_name, rc);
566                 obd->obd_namespace = NULL;
567                 GOTO(err_import, rc);
568         }
569
570         RETURN(rc);
571
572 err_import:
573         class_destroy_import(imp);
574 err_ldlm:
575         ldlm_put_ref();
576 err:
577         if (cli->cl_mod_tag_bitmap != NULL)
578                 OBD_FREE(cli->cl_mod_tag_bitmap,
579                          BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long));
580         cli->cl_mod_tag_bitmap = NULL;
581
582         RETURN(rc);
583 }
584 EXPORT_SYMBOL(client_obd_setup);
585
586 int client_obd_cleanup(struct obd_device *obd)
587 {
588         struct client_obd *cli = &obd->u.cli;
589
590         ENTRY;
591
592         ldlm_namespace_free_post(obd->obd_namespace);
593         obd->obd_namespace = NULL;
594
595         obd_cleanup_client_import(obd);
596         LASSERT(obd->u.cli.cl_import == NULL);
597
598         ldlm_put_ref();
599
600         if (cli->cl_mod_tag_bitmap != NULL)
601                 OBD_FREE(cli->cl_mod_tag_bitmap,
602                          BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long));
603         cli->cl_mod_tag_bitmap = NULL;
604
605         RETURN(0);
606 }
607 EXPORT_SYMBOL(client_obd_cleanup);
608
609 /* ->o_connect() method for client side (OSC and MDC and MGC) */
610 int client_connect_import(const struct lu_env *env,
611                           struct obd_export **exp,
612                           struct obd_device *obd, struct obd_uuid *cluuid,
613                           struct obd_connect_data *data, void *localdata)
614 {
615         struct client_obd *cli = &obd->u.cli;
616         struct obd_import *imp = cli->cl_import;
617         struct obd_connect_data *ocd;
618         struct lustre_handle conn = { 0 };
619         int rc;
620
621         ENTRY;
622
623         *exp = NULL;
624         down_write(&cli->cl_sem);
625         if (cli->cl_conn_count > 0)
626                 GOTO(out_sem, rc = -EALREADY);
627
628         rc = class_connect(&conn, obd, cluuid);
629         if (rc)
630                 GOTO(out_sem, rc);
631
632         cli->cl_conn_count++;
633         *exp = class_conn2export(&conn);
634
635         LASSERT(obd->obd_namespace);
636
637         spin_lock(&imp->imp_lock);
638         if (imp->imp_state == LUSTRE_IMP_CLOSED && imp->imp_deactive) {
639                 /* need to reactivate import if trying to connect
640                  * to a previously disconnected
641                  */
642                 imp->imp_deactive = 0;
643                 imp->imp_invalid = 0;
644         }
645         spin_unlock(&imp->imp_lock);
646
647         imp->imp_dlm_handle = conn;
648         rc = ptlrpc_init_import(imp);
649         if (rc != 0)
650                 GOTO(out_ldlm, rc);
651
652         ocd = &imp->imp_connect_data;
653         if (data) {
654                 *ocd = *data;
655                 imp->imp_connect_flags_orig = data->ocd_connect_flags;
656                 imp->imp_connect_flags2_orig = data->ocd_connect_flags2;
657         }
658
659         rc = ptlrpc_connect_import(imp);
660         if (rc != 0) {
661                 LASSERT(imp->imp_state == LUSTRE_IMP_DISCON);
662                 GOTO(out_ldlm, rc);
663         }
664         LASSERT(*exp != NULL && (*exp)->exp_connection);
665
666         if (data) {
667                 LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) ==
668                          ocd->ocd_connect_flags, "old %#llx, new %#llx\n",
669                          data->ocd_connect_flags, ocd->ocd_connect_flags);
670                 data->ocd_connect_flags = ocd->ocd_connect_flags;
671                 data->ocd_connect_flags2 = ocd->ocd_connect_flags2;
672         }
673         /* ldiskfs servers do not actually need patching to support unaligned
674          * DIO, so we always set the flag in that case
675          */
676         if (data->ocd_connect_flags & OBD_CONNECT_MAXBYTES) {
677                 /* > 2ULL << 59 implies ZFS, so this is ldiskfs */
678                 if (data->ocd_maxbytes < (2ULL << 59))
679                         data->ocd_connect_flags2 |= OBD_CONNECT2_UNALIGNED_DIO;
680         }
681
682         ptlrpc_pinger_add_import(imp);
683
684         EXIT;
685
686         if (rc) {
687 out_ldlm:
688                 cli->cl_conn_count--;
689                 class_disconnect(*exp);
690                 *exp = NULL;
691         }
692 out_sem:
693         up_write(&cli->cl_sem);
694
695         if (!rc && localdata && !cli->cl_cache) {
696                 cli->cl_cache = (struct cl_client_cache *)localdata;
697                 cl_cache_incref(cli->cl_cache);
698                 cli->cl_lru_left = &cli->cl_cache->ccc_lru_left;
699
700                 /* add this osc into entity list */
701                 LASSERT(list_empty(&cli->cl_lru_osc));
702                 spin_lock(&cli->cl_cache->ccc_lru_lock);
703                 list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru);
704                 spin_unlock(&cli->cl_cache->ccc_lru_lock);
705         }
706
707         return rc;
708 }
709 EXPORT_SYMBOL(client_connect_import);
710
711 int client_disconnect_export(struct obd_export *exp)
712 {
713         struct obd_device *obd = class_exp2obd(exp);
714         struct client_obd *cli;
715         struct obd_import *imp;
716         int rc = 0, err;
717
718         ENTRY;
719
720         if (!obd) {
721                 CERROR("invalid export for disconnect: exp %p cookie %#llx\n",
722                        exp, exp ? exp->exp_handle.h_cookie : -1);
723                 RETURN(-EINVAL);
724         }
725
726         cli = &obd->u.cli;
727         imp = cli->cl_import;
728
729         down_write(&cli->cl_sem);
730         CDEBUG(D_INFO, "disconnect %s - %zu\n", obd->obd_name,
731                 cli->cl_conn_count);
732
733         if (cli->cl_conn_count == 0) {
734                 CERROR("disconnecting disconnected device (%s)\n",
735                        obd->obd_name);
736                 GOTO(out_disconnect, rc = -EINVAL);
737         }
738
739         cli->cl_conn_count--;
740         if (cli->cl_conn_count != 0)
741                 GOTO(out_disconnect, rc = 0);
742
743         /*
744          * Mark import deactivated now, so we don't try to reconnect if any
745          * of the cleanup RPCs fails (e.g. LDLM cancel, etc).  We don't
746          * fully deactivate the import, or that would drop all requests.
747          */
748         spin_lock(&imp->imp_lock);
749         imp->imp_deactive = 1;
750         spin_unlock(&imp->imp_lock);
751
752         /*
753          * Some non-replayable imports (MDS's OSCs) are pinged, so just
754          * delete it regardless.  (It's safe to delete an import that was
755          * never added.)
756          */
757         (void)ptlrpc_pinger_del_import(imp);
758
759         if (obd->obd_namespace != NULL) {
760                 /* obd_force == local only */
761                 ldlm_cli_cancel_unused(obd->obd_namespace, NULL,
762                                        obd->obd_force ? LCF_LOCAL : 0, NULL);
763                 ldlm_namespace_free_prior(obd->obd_namespace, imp,
764                                           obd->obd_force);
765         }
766
767         /*
768          * There's no need to hold sem while disconnecting an import,
769          * and it may actually cause deadlock in GSS.
770          */
771         up_write(&cli->cl_sem);
772         rc = ptlrpc_disconnect_import(imp, 0);
773         down_write(&cli->cl_sem);
774
775         ptlrpc_invalidate_import(imp);
776
777         EXIT;
778
779 out_disconnect:
780         /*
781          * Use server style - class_disconnect should be always called for
782          * o_disconnect.
783          */
784         err = class_disconnect(exp);
785         if (!rc && err)
786                 rc = err;
787
788         up_write(&cli->cl_sem);
789
790         RETURN(rc);
791 }
792 EXPORT_SYMBOL(client_disconnect_export);
793
794 #ifdef HAVE_SERVER_SUPPORT
795 int server_disconnect_export(struct obd_export *exp)
796 {
797         int rc;
798
799         ENTRY;
800
801         /* Disconnect early so that clients can't keep using export. */
802         rc = class_disconnect(exp);
803         /* Close import to avoid sending any requests. */
804         if (exp->exp_imp_reverse)
805                 ptlrpc_cleanup_imp(exp->exp_imp_reverse);
806
807         ldlm_bl_thread_wakeup();
808
809         /* complete all outstanding replies */
810         spin_lock(&exp->exp_lock);
811         while (!list_empty(&exp->exp_outstanding_replies)) {
812                 struct ptlrpc_reply_state *rs =
813                         list_first_entry(&exp->exp_outstanding_replies,
814                                          struct ptlrpc_reply_state,
815                                          rs_exp_list);
816                 struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
817
818                 spin_lock(&svcpt->scp_rep_lock);
819
820                 list_del_init(&rs->rs_exp_list);
821
822                 spin_lock(&rs->rs_lock);
823                 ptlrpc_schedule_difficult_reply(rs);
824                 spin_unlock(&rs->rs_lock);
825
826                 spin_unlock(&svcpt->scp_rep_lock);
827         }
828         spin_unlock(&exp->exp_lock);
829
830         RETURN(rc);
831 }
832 EXPORT_SYMBOL(server_disconnect_export);
833
834 static inline int target_check_recovery_timer(struct obd_device *target)
835 {
836         ktime_t remaining;
837         s64 timeout;
838
839         if (!target->obd_recovering || target->obd_recovery_start == 0)
840                 return 0;
841
842         remaining = hrtimer_get_remaining(&target->obd_recovery_timer);
843         timeout = ktime_divns(remaining, NSEC_PER_SEC);
844         if (timeout > -30)
845                 return 0;
846
847         /* the recovery timer should expire, but it isn't triggered,
848          * it's better to abort the recovery of this target to speed up
849          * the recovery of the whole cluster.
850          */
851         spin_lock(&target->obd_dev_lock);
852         if (target->obd_recovering) {
853                 CERROR("%s: Aborting recovery\n", target->obd_name);
854                 target->obd_abort_recovery = 1;
855                 wake_up(&target->obd_next_transno_waitq);
856         }
857         spin_unlock(&target->obd_dev_lock);
858         return 0;
859 }
860
861 /*
862  * --------------------------------------------------------------------------
863  * from old lib/target.c
864  * --------------------------------------------------------------------------
865  */
866 static int target_handle_reconnect(struct lustre_handle *conn,
867                                    struct obd_export *exp,
868                                    struct obd_uuid *cluuid)
869 {
870         struct obd_device *target;
871         struct lustre_handle *hdl;
872         ktime_t remaining;
873         s64 timeout;
874         int rc = 0;
875
876         ENTRY;
877         hdl = &exp->exp_imp_reverse->imp_remote_handle;
878         if (!exp->exp_connection || !lustre_handle_is_used(hdl)) {
879                 conn->cookie = exp->exp_handle.h_cookie;
880                 CDEBUG(D_HA,
881                        "connect export for UUID '%s' at %p, cookie %#llx\n",
882                        cluuid->uuid, exp, conn->cookie);
883                 RETURN(0);
884         }
885
886         target = exp->exp_obd;
887
888         /* Might be a re-connect after a partition. */
889         if (memcmp(&conn->cookie, &hdl->cookie, sizeof(conn->cookie))) {
890                 LCONSOLE_WARN("%s: already connected client %s (at %s) with handle %#llx. Rejecting client with the same UUID trying to reconnect with handle %#llx\n",
891                               target->obd_name,
892                               obd_uuid2str(&exp->exp_client_uuid),
893                               obd_export_nid2str(exp),
894                               hdl->cookie, conn->cookie);
895                 memset(conn, 0, sizeof(*conn));
896                 /*
897                  * target_handle_connect() treats EALREADY and
898                  * -EALREADY differently.  -EALREADY is an error
899                  * (same UUID, different handle).
900                  */
901                 RETURN(-EALREADY);
902         }
903
904         if (!target->obd_recovering) {
905                 LCONSOLE_WARN("%s: Client %s (at %s) reconnecting\n",
906                         target->obd_name, obd_uuid2str(&exp->exp_client_uuid),
907                         obd_export_nid2str(exp));
908                 GOTO(out_already, rc);
909         }
910
911         remaining = hrtimer_get_remaining(&target->obd_recovery_timer);
912         timeout = ktime_divns(remaining, NSEC_PER_SEC);
913         if (timeout > 0) {
914                 LCONSOLE_WARN("%s: Client %s (at %s) reconnected, waiting for %d clients in recovery for %lld:%.02lld\n",
915                               target->obd_name,
916                               obd_uuid2str(&exp->exp_client_uuid),
917                               obd_export_nid2str(exp),
918                               atomic_read(&target->obd_max_recoverable_clients),
919                               timeout / 60, timeout % 60);
920         } else {
921                 struct target_distribute_txn_data *tdtd;
922                 int size = 0;
923                 int count = 0;
924                 char *buf = NULL;
925
926                 target_check_recovery_timer(target);
927
928                 tdtd = class_exp2tgt(exp)->lut_tdtd;
929                 if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
930                         buf = tdtd->tdtd_show_update_logs_retrievers(
931                                 tdtd->tdtd_show_retrievers_cbdata,
932                                 &size, &count);
933
934                 if (count > 0)
935                         LCONSOLE_WARN("%s: Client %s (at %s) reconnecting, waiting for %d MDTs (%s) in recovery for %lld:%.02lld. Please wait until all MDTs recovered or you may force MDT evicition via 'lctl --device %s abort_recovery.\n",
936                                       target->obd_name,
937                                       obd_uuid2str(&exp->exp_client_uuid),
938                                       obd_export_nid2str(exp), count,
939                                       buf ? buf : "unknown (not enough RAM)",
940                                       (abs(timeout) + target->obd_recovery_timeout) / 60,
941                                       (abs(timeout) + target->obd_recovery_timeout) % 60,
942                                       target->obd_name);
943                 else
944                         LCONSOLE_WARN("%s: Recovery already passed deadline %lld:%.02lld. If you do not want to wait more, you may force taget eviction via 'lctl --device %s abort_recovery.\n",
945                                       target->obd_name, abs(timeout) / 60,
946                                       abs(timeout) % 60, target->obd_name);
947
948                 if (buf != NULL)
949                         OBD_FREE(buf, size);
950         }
951
952 out_already:
953         conn->cookie = exp->exp_handle.h_cookie;
954         /*
955          * target_handle_connect() treats EALREADY and
956          * -EALREADY differently.  EALREADY means we are
957          * doing a valid reconnect from the same client.
958          */
959         RETURN(EALREADY);
960 }
961
962 static void
963 check_and_start_recovery_timer(struct obd_device *obd,
964                                struct ptlrpc_request *req, int new_client);
965
966 /**
967  * update flags for import during reconnect process
968  */
969 static int rev_import_flags_update(struct obd_import *revimp,
970                                    struct ptlrpc_request *req)
971 {
972         int rc;
973         struct obd_connect_data *data;
974
975         data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA);
976
977         if (data->ocd_connect_flags & OBD_CONNECT_AT)
978                 revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
979         else
980                 revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
981
982         revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
983
984         revimp->imp_connect_data = *data;
985         rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr);
986         if (rc) {
987                 CERROR("%s: cannot get reverse import %s security: rc = %d\n",
988                         revimp->imp_client->cli_name,
989                         libcfs_idstr(&req->rq_peer), rc);
990                 return rc;
991         }
992
993         return 0;
994 }
995
996 /**
997  * Allocate a new reverse import for an export.
998  *
999  * \retval -errno in case error hit
1000  * \retval 0 if reverse import correctly init
1001  **/
1002 int rev_import_init(struct obd_export *export)
1003 {
1004         struct obd_device *obd = export->exp_obd;
1005         struct obd_import *revimp;
1006
1007         LASSERT(export->exp_imp_reverse == NULL);
1008
1009         revimp = class_new_import(obd);
1010         if (revimp == NULL)
1011                 return -ENOMEM;
1012
1013         revimp->imp_remote_handle.cookie = 0ULL;
1014         revimp->imp_client = &obd->obd_ldlm_client;
1015         revimp->imp_dlm_fake = 1;
1016
1017         /* it is safe to connect import in new state as no sends possible */
1018         spin_lock(&export->exp_lock);
1019         export->exp_imp_reverse = revimp;
1020         spin_unlock(&export->exp_lock);
1021         class_import_put(revimp);
1022
1023         return 0;
1024 }
1025 EXPORT_SYMBOL(rev_import_init);
1026
1027 /**
1028  * Handle reconnect for an export.
1029  *
1030  * \param exp export to handle reconnect process
1031  * \param req client reconnect request
1032  *
1033  * \retval -rc in case securitfy flavor can't be changed
1034  * \retval 0 in case none problems
1035  */
1036 static int rev_import_reconnect(struct obd_export *exp,
1037                                 struct ptlrpc_request *req)
1038 {
1039         struct obd_import *revimp = exp->exp_imp_reverse;
1040         struct lustre_handle *lh;
1041         int rc;
1042
1043         /* avoid sending a request until import flags are changed */
1044         ptlrpc_import_enter_resend(revimp);
1045
1046         ptlrpc_connection_put(revimp->imp_connection);
1047
1048         /*
1049          * client from recovery don't have a handle so we need to take from
1050          * request. it may produce situation when wrong client connected
1051          * to recovery as we trust a client uuid
1052          */
1053         lh = req_capsule_client_get(&req->rq_pill, &RMF_CONN);
1054         revimp->imp_remote_handle = *lh;
1055
1056         /*
1057          * unknown versions will be caught in
1058          * ptlrpc_handle_server_req_in->lustre_unpack_msg()
1059          */
1060         revimp->imp_msg_magic = req->rq_reqmsg->lm_magic;
1061
1062         revimp->imp_connection = ptlrpc_connection_addref(exp->exp_connection);
1063
1064         rc = rev_import_flags_update(revimp, req);
1065         if (rc != 0) {
1066                 /*
1067                  * it is safe to still be in RECOVERY phase as we are not able
1068                  * to setup correct security flavor so requests are not able to
1069                  * be delivered correctly
1070                  */
1071                 return rc;
1072         }
1073
1074         /* resend all rpc's via new connection */
1075         return ptlrpc_import_recovery_state_machine(revimp);
1076 }
1077
1078 int target_handle_connect(struct ptlrpc_request *req)
1079 {
1080         struct obd_device *target = NULL;
1081         struct obd_export *export = NULL;
1082         /*
1083          * connect handle - filled from target_handle_reconnect in
1084          * reconnect case
1085          */
1086         struct lustre_handle conn;
1087         struct lustre_handle *tmp;
1088         struct obd_uuid cluuid;
1089         char *str;
1090         int rc = 0;
1091         char *target_start;
1092         int target_len;
1093         bool mds_conn = false, lw_client = false, initial_conn = false;
1094         bool mds_mds_conn = false;
1095         bool new_mds_mds_conn = false;
1096         struct obd_connect_data *data, *tmpdata;
1097         int size, tmpsize;
1098 #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0)
1099         int tmp_exp_old_falloc;
1100 #endif
1101         struct ptlrpc_connection *pcon = NULL;
1102         bool reconnected = false;
1103
1104         ENTRY;
1105
1106         CFS_RACE(OBD_FAIL_TGT_CONN_RACE);
1107
1108         str = req_capsule_client_get(&req->rq_pill, &RMF_TGTUUID);
1109         if (str == NULL) {
1110                 DEBUG_REQ(D_ERROR, req, "bad target UUID for connect");
1111                 GOTO(out, rc = -EINVAL);
1112         }
1113
1114         target = class_str2obd(str);
1115         if (!target) {
1116                 deuuidify(str, NULL, &target_start, &target_len);
1117                 LCONSOLE_ERROR_MSG(0x137,
1118                                    "%.*s: not available for connect from %s (no target). If you are running an HA pair check that the target is mounted on the other server.\n",
1119                                    target_len, target_start,
1120                                    libcfs_nidstr(&req->rq_peer.nid));
1121                 GOTO(out, rc = -ENODEV);
1122         }
1123
1124         atomic_inc(&target->obd_conn_inprogress);
1125
1126         if (target->obd_stopping || !target->obd_set_up) {
1127                 deuuidify(str, NULL, &target_start, &target_len);
1128                 LCONSOLE_INFO("%.*s: Not available for connect from %s (%s)\n",
1129                               target_len, target_start,
1130                               libcfs_nidstr(&req->rq_peer.nid),
1131                               (target->obd_stopping ?
1132                                "stopping" : "not set up"));
1133                 GOTO(out, rc = -ENODEV);
1134         }
1135
1136         if (target->obd_no_conn) {
1137                 CDEBUG(D_INFO,
1138                        "%s: Temporarily refusing client connection from %s\n",
1139                        target->obd_name, libcfs_nidstr(&req->rq_peer.nid));
1140                 GOTO(out, rc = -EAGAIN);
1141         }
1142
1143         str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID);
1144         if (str == NULL) {
1145                 DEBUG_REQ(D_ERROR, req, "bad client UUID for connect");
1146                 GOTO(out, rc = -EINVAL);
1147         }
1148
1149         obd_str2uuid(&cluuid, str);
1150
1151         tmp = req_capsule_client_get(&req->rq_pill, &RMF_CONN);
1152         if (tmp == NULL)
1153                 GOTO(out, rc = -EPROTO);
1154
1155         conn = *tmp;
1156
1157         size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA,
1158                                     RCL_CLIENT);
1159         if (size < 0 || size > 8 * sizeof(struct obd_connect_data))
1160                 GOTO(out, rc = -EPROTO);
1161         data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA);
1162         if (!data)
1163                 GOTO(out, rc = -EPROTO);
1164
1165         rc = req_capsule_server_pack(&req->rq_pill);
1166         if (rc)
1167                 GOTO(out, rc);
1168
1169 #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0)
1170         /*
1171          * Don't allow clients to connect that are using old 1.8 format
1172          * protocol conventions (LUSTRE_MSG_MAGIC_v1, !MSGHDR_CKSUM_INCOMPAT18,
1173          * ldlm_flock_policy_wire format, MDT_ATTR_xTIME_SET, etc).  The
1174          * FULL20 flag should be set on all connections since 2.0, but no
1175          * longer affects behaviour.
1176          *
1177          * Later this check will be disabled and the flag can be retired
1178          * completely once interop with 3.0 is no longer needed.
1179          */
1180         if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20))
1181                 GOTO(out, rc = -EPROTO);
1182
1183         /* Old clients will have 'tmp_exp_old_falloc' as 1.
1184          * Newer clients (2.15) and beyond will have it set as 0
1185          */
1186         tmp_exp_old_falloc =
1187                 !!(data->ocd_connect_flags & OBD_CONNECT_OLD_FALLOC);
1188
1189         CDEBUG(D_INFO, "%s: ocd_connect_flags: %#llx tmp_exp_old_falloc: %d\n",
1190                target->obd_name, data->ocd_connect_flags, tmp_exp_old_falloc);
1191
1192         /*
1193          * Don't allow liblustre clients to connect.
1194          * - testing was disabled in v2_2_50_0-61-g6a75d65
1195          * - building was disabled in v2_5_58_0-28-g7277179
1196          * - client code was deleted in v2_6_50_0-101-gcdfbc72,
1197          * - clients were refused connect for version difference > 0.0.1.32
1198          */
1199         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
1200                 DEBUG_REQ(D_WARNING, req, "Refusing libclient connection");
1201                 GOTO(out, rc = -EPROTO);
1202         }
1203 #endif
1204
1205         /*
1206          * Note: lw_client is needed in MDS-MDS failover during update log
1207          * processing, so we needs to allow lw_client to be connected at
1208          * anytime, instead of only the initial connection
1209          */
1210         lw_client = OCD_HAS_FLAG(data, LIGHTWEIGHT);
1211
1212         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) {
1213                 initial_conn = true;
1214                 mds_conn = OCD_HAS_FLAG(data, MDS);
1215                 mds_mds_conn = OCD_HAS_FLAG(data, MDS_MDS);
1216
1217                 /*
1218                  * OBD_CONNECT_MNE_SWAB is removed at 2.14
1219                  * Checking OBD_CONNECT_FID can be removed in the future.
1220                  *
1221                  * Via check OBD_CONNECT_FID, we can distinguish whether
1222                  * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from
1223                  * MGC or MDT, since MGC does not use OBD_CONNECT_FID.
1224                  */
1225                 if (!lw_client &&
1226                     (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) &&
1227                     (data->ocd_connect_flags & OBD_CONNECT_FID) &&
1228                     (data->ocd_connect_flags & OBD_CONNECT_VERSION)) {
1229                         __u32 major = OBD_OCD_VERSION_MAJOR(data->ocd_version);
1230                         __u32 minor = OBD_OCD_VERSION_MINOR(data->ocd_version);
1231                         __u32 patch = OBD_OCD_VERSION_PATCH(data->ocd_version);
1232
1233                         /*
1234                          * We do not support the MDT-MDT interoperations with
1235                          * different version MDT because of protocol changes.
1236                          */
1237                         if (unlikely(major != LUSTRE_MAJOR ||
1238                                      minor != LUSTRE_MINOR ||
1239                                      abs(patch - LUSTRE_PATCH) > 3)) {
1240                                 LCONSOLE_WARN("%s (%u.%u.%u.%u) refused the connection from different version MDT (%d.%d.%d.%d) %s %s\n",
1241                                               target->obd_name, LUSTRE_MAJOR,
1242                                               LUSTRE_MINOR, LUSTRE_PATCH,
1243                                               LUSTRE_FIX, major, minor, patch,
1244                                               OBD_OCD_VERSION_FIX(data->ocd_version),
1245                                               libcfs_nidstr(&req->rq_peer.nid),
1246                                               str);
1247                                 GOTO(out, rc = -EPROTO);
1248                         }
1249                 }
1250         }
1251
1252         /* lctl gets a backstage, all-access pass. */
1253         if (obd_uuid_equals(&cluuid, &target->obd_uuid))
1254                 goto dont_check_exports;
1255
1256         export = obd_uuid_lookup(target, &cluuid);
1257         if (!export)
1258                 goto no_export;
1259
1260         /* We've found an export in the hash. */
1261
1262         spin_lock(&export->exp_lock);
1263
1264         if (export->exp_connecting) { /* b=9635, et. al. */
1265                 spin_unlock(&export->exp_lock);
1266                 LCONSOLE_WARN("%s: Export %p already connecting from %s\n",
1267                               export->exp_obd->obd_name, export,
1268                               libcfs_nidstr(&req->rq_peer.nid));
1269                 class_export_put(export);
1270                 export = NULL;
1271                 rc = -EALREADY;
1272         } else if ((mds_conn || (lw_client && initial_conn) ||
1273                    OCD_HAS_FLAG(data, MDS_MDS)) && export->exp_connection) {
1274                 spin_unlock(&export->exp_lock);
1275                 if (!nid_same(&req->rq_peer.nid,
1276                               &export->exp_connection->c_peer.nid)) {
1277                         /* MDS or LWP reconnected after failover. */
1278                         LCONSOLE_WARN("%s: Received %s connection from %s, removing former export from %s\n",
1279                                       target->obd_name,
1280                                       lw_client ? "LWP" : "MDS",
1281                                       libcfs_nidstr(&req->rq_peer.nid),
1282                                       libcfs_nidstr(&export->exp_connection->c_peer.nid));
1283                 } else {
1284                         /* New connection from the same NID. */
1285                         LCONSOLE_WARN("%s: Received new %s connection from %s, %s former export from same NID\n",
1286                                       target->obd_name,
1287                                       lw_client ? "LWP" : "MDS",
1288                                       libcfs_nidstr(&req->rq_peer.nid),
1289                                       OCD_HAS_FLAG(data, MDS_MDS) ?
1290                                       "keep" : "remove");
1291                 }
1292
1293                 if (nid_same(&req->rq_peer.nid,
1294                              &export->exp_connection->c_peer.nid) &&
1295                     OCD_HAS_FLAG(data, MDS_MDS)) {
1296                         /*
1297                          * Because exports between MDTs will always be
1298                          * kept, let's do not fail such export if they
1299                          * come from the same NID, otherwise it might
1300                          * cause eviction between MDTs, which might
1301                          * cause namespace inconsistency
1302                          */
1303                         spin_lock(&export->exp_lock);
1304                         export->exp_connecting = 1;
1305                         export->exp_conn_cnt = 0;
1306                         spin_unlock(&export->exp_lock);
1307                         conn.cookie = export->exp_handle.h_cookie;
1308                         rc = EALREADY;
1309                 } else {
1310                         class_fail_export(export);
1311                         class_export_put(export);
1312                         export = NULL;
1313                         rc = 0;
1314                 }
1315         } else if (export->exp_connection != NULL && initial_conn &&
1316                    !nid_same(&req->rq_peer.nid,
1317                              &export->exp_connection->c_peer.nid)) {
1318                 spin_unlock(&export->exp_lock);
1319                 /* In MDS failover we have static UUID but NID can change. */
1320                 LCONSOLE_WARN("%s: Client %s seen on new nid %s when existing nid %s is already connected\n",
1321                               target->obd_name, cluuid.uuid,
1322                               libcfs_nidstr(&req->rq_peer.nid),
1323                               libcfs_nidstr(
1324                                       &export->exp_connection->c_peer.nid));
1325                 rc = -EALREADY;
1326                 class_export_put(export);
1327                 export = NULL;
1328         } else if (CFS_FAIL_PRECHECK(OBD_FAIL_TGT_RECOVERY_CONNECT) &&
1329                    !lw_client) {
1330                 spin_unlock(&export->exp_lock);
1331                 rc = -EAGAIN;
1332         } else {
1333                 export->exp_connecting = 1;
1334                 spin_unlock(&export->exp_lock);
1335                 LASSERT(export->exp_obd == target);
1336
1337                 rc = target_handle_reconnect(&conn, export, &cluuid);
1338         }
1339
1340         /* If we found an export, we already unlocked. */
1341         if (!export) {
1342 no_export:
1343                 CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout);
1344         } else if (req->rq_export == NULL &&
1345                    atomic_read(&export->exp_rpc_count) > 0) {
1346                 LCONSOLE_WARN("%s: Client %s (at %s) refused connection, still busy with %d references\n",
1347                               target->obd_name, cluuid.uuid,
1348                               libcfs_nidstr(&req->rq_peer.nid),
1349                               refcount_read(&export->exp_handle.h_ref));
1350                         GOTO(out, rc = -EBUSY);
1351         } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 &&
1352                    rc != EALREADY) {
1353                 if (!strstr(cluuid.uuid, "mdt"))
1354                         LCONSOLE_WARN("%s: Rejecting reconnect from the known client %s (at %s) because it is indicating it is a new client\n",
1355                                       target->obd_name, cluuid.uuid,
1356                                       libcfs_nidstr(&req->rq_peer.nid));
1357                 GOTO(out, rc = -EALREADY);
1358         } else {
1359                 CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout);
1360         }
1361
1362         if (rc < 0)
1363                 GOTO(out, rc);
1364
1365         CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n",
1366                target->obd_name, cluuid.uuid, libcfs_nidstr(&req->rq_peer.nid),
1367                target->obd_recovering ? "recovering/" : "", data->ocd_transno,
1368                export, ktime_get_seconds(),
1369                export ? export->exp_last_request_time : 0);
1370
1371         /*
1372          * If this is the first time a client connects, reset the recovery
1373          * timer. Discard lightweight connections which might be local.
1374          */
1375         if (!lw_client && rc == 0 && target->obd_recovering)
1376                 check_and_start_recovery_timer(target, req, export == NULL);
1377
1378         /*
1379          * We want to handle EALREADY but *not* -EALREADY from
1380          * target_handle_reconnect(), return reconnection state in a flag.
1381          */
1382         if (rc == EALREADY) {
1383                 lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
1384                 rc = 0;
1385         } else {
1386                 LASSERT(rc == 0);
1387         }
1388
1389         /* Tell the client if we support replayable requests. */
1390         if (target->obd_replayable)
1391                 lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE);
1392
1393         if (export == NULL) {
1394                 /* allow lightweight connections during recovery */
1395                 /*
1396                  * allow "new" MDT to be connected during recovery, since we
1397                  * need retrieve recovery update records from it
1398                  */
1399                 if (target->obd_recovering && !lw_client && !mds_mds_conn) {
1400                         struct hrtimer *timer = &target->obd_recovery_timer;
1401                         ktime_t remaining;
1402                         s64 timeout, left;
1403                         int in_progress;
1404                         int connected;
1405                         int known;
1406                         int stale;
1407                         char *msg;
1408
1409                         connected = atomic_read(&target->obd_connected_clients);
1410                         in_progress = atomic_read(&target->obd_lock_replay_clients);
1411                         known =
1412                            atomic_read(&target->obd_max_recoverable_clients);
1413                         stale = target->obd_stale_clients;
1414                         remaining = hrtimer_get_remaining(timer);
1415                         left = ktime_divns(remaining, NSEC_PER_SEC);
1416
1417                         if (ktime_to_ns(remaining) > 0) {
1418                                 msg = "to recover in";
1419                                 timeout = left;
1420                         } else {
1421                                 msg = "already passed deadline";
1422                                 timeout = -left;
1423
1424                                 target_check_recovery_timer(target);
1425                         }
1426
1427                         LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n",
1428                                       target->obd_name, cluuid.uuid,
1429                                       libcfs_nidstr(&req->rq_peer.nid), known,
1430                                       connected - in_progress, in_progress,
1431                                       stale, msg, timeout / 60, timeout % 60);
1432                         rc = -EBUSY;
1433                 } else {
1434 dont_check_exports:
1435                         rc = obd_connect(req->rq_svc_thread->t_env,
1436                                          &export, target, &cluuid, data,
1437                                          &req->rq_peer.nid);
1438                         if (mds_conn && CFS_FAIL_CHECK(OBD_FAIL_TGT_RCVG_FLAG))
1439                                 lustre_msg_add_op_flags(req->rq_repmsg,
1440                                                         MSG_CONNECT_RECOVERING);
1441                         if (rc == 0) {
1442                                 conn.cookie = export->exp_handle.h_cookie;
1443                                 rc = rev_import_init(export);
1444                         }
1445
1446                         if (mds_mds_conn)
1447                                 new_mds_mds_conn = true;
1448                 }
1449         } else {
1450                 if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CONNECT_VS_EVICT)) {
1451                         class_export_get(export);
1452                         class_fail_export(export);
1453                         class_export_put(export);
1454                 }
1455                 rc = obd_reconnect(req->rq_svc_thread->t_env,
1456                                    export, target, &cluuid, data,
1457                                    &req->rq_peer.nid);
1458                 if (rc == 0)
1459                         reconnected = true;
1460         }
1461         if (rc)
1462                 GOTO(out, rc);
1463
1464         data->ocd_instance = obd2obt(target)->obt_instance;
1465
1466         /*
1467          * Return only the parts of obd_connect_data that we understand, so the
1468          * client knows that we don't understand the rest.
1469          */
1470         if (data) {
1471                 tmpsize = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA,
1472                                                RCL_SERVER);
1473                 tmpdata = req_capsule_server_get(&req->rq_pill,
1474                                                  &RMF_CONNECT_DATA);
1475                 /*
1476                  * Don't use struct assignment here, because the client reply
1477                  * buffer may be smaller/larger than the local struct
1478                  * obd_connect_data.
1479                  */
1480                 memcpy(tmpdata, data, min(tmpsize, size));
1481         }
1482
1483         /*
1484          * If the client and the server are the same node, we will already
1485          * have an export that really points to the client's DLM export,
1486          * because we have a shared handles table.
1487          *
1488          * XXX this will go away when shaver stops sending the "connect" handle
1489          * in the real "remote handle" field of the request --phik 24 Apr 2003
1490          */
1491         ptlrpc_request_change_export(req, export);
1492
1493         pcon = ptlrpc_connection_get(&req->rq_peer,
1494                                      &req->rq_self, &cluuid);
1495         if (pcon == NULL)
1496                 GOTO(out, rc = -ENOTCONN);
1497
1498         spin_lock(&export->exp_lock);
1499
1500         if (export->exp_disconnected) {
1501                 spin_unlock(&export->exp_lock);
1502                 if (reconnected) {
1503                         /*
1504                          * for each connect called disconnect
1505                          * should be called to cleanup stuff
1506                          */
1507                         class_export_get(export);
1508                         obd_disconnect(export);
1509                 }
1510
1511                 GOTO(out, rc = -ENODEV);
1512         }
1513         if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
1514                 spin_unlock(&export->exp_lock);
1515                 CDEBUG(D_RPCTRACE,
1516                        "%s: %s already connected at greater or equal conn_cnt: %d >= %d\n",
1517                        cluuid.uuid, libcfs_nidstr(&req->rq_peer.nid),
1518                        export->exp_conn_cnt,
1519                        lustre_msg_get_conn_cnt(req->rq_reqmsg));
1520
1521                 GOTO(out, rc = -EALREADY);
1522         }
1523         LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0);
1524         export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
1525
1526 #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0)
1527         /* make 'tmp_exp_old_falloc' persistent by saving it into
1528          * server side export object(obd_export)
1529          */
1530         export->exp_old_falloc = tmp_exp_old_falloc;
1531 #endif
1532
1533         /* Check to see if connection came from another NID. */
1534         if (export->exp_connection != NULL &&
1535             !nid_same(&export->exp_connection->c_peer.nid,
1536                       &req->rq_peer.nid)) {
1537                 obd_nid_del(export->exp_obd, export);
1538                 ptlrpc_connection_put(export->exp_connection);
1539                 export->exp_connection = NULL;
1540         }
1541
1542         if (export->exp_connection == NULL) {
1543                 export->exp_connection = pcon;
1544                 pcon = NULL;
1545         }
1546         obd_nid_add(export->exp_obd, export);
1547
1548         spin_unlock(&export->exp_lock);
1549
1550         lustre_msg_set_handle(req->rq_repmsg, &conn);
1551
1552         rc = rev_import_reconnect(export, req);
1553         if (rc != 0)
1554                 GOTO(out, rc);
1555
1556         if (target->obd_recovering && !export->exp_in_recovery && !lw_client) {
1557                 int has_transno;
1558                 __u64 transno = data->ocd_transno;
1559
1560                 spin_lock(&export->exp_lock);
1561                 /*
1562                  * possible race with class_disconnect_stale_exports,
1563                  * export may be already in the eviction process
1564                  */
1565                 if (export->exp_failed) {
1566                         spin_unlock(&export->exp_lock);
1567                         GOTO(out, rc = -ENODEV);
1568                 }
1569                 export->exp_in_recovery = 1;
1570                 export->exp_req_replay_needed = 1;
1571                 export->exp_lock_replay_needed = 1;
1572                 spin_unlock(&export->exp_lock);
1573
1574                 has_transno = !!(lustre_msg_get_op_flags(req->rq_reqmsg) &
1575                                  MSG_CONNECT_TRANSNO);
1576                 if (has_transno && transno == 0)
1577                         CWARN("Connect with zero transno!\n");
1578
1579                 if (has_transno && transno > 0 &&
1580                     transno < target->obd_next_recovery_transno &&
1581                     transno > target->obd_last_committed) {
1582                         /* Another way is to use cmpxchg() to be lock-free. */
1583                         spin_lock(&target->obd_recovery_task_lock);
1584                         if (transno < target->obd_next_recovery_transno)
1585                                 target->obd_next_recovery_transno = transno;
1586                         spin_unlock(&target->obd_recovery_task_lock);
1587                 }
1588
1589                 atomic_inc(&target->obd_req_replay_clients);
1590                 atomic_inc(&target->obd_lock_replay_clients);
1591                 /*
1592                  * Note: MDS-MDS connection is allowed to be connected during
1593                  * recovery, no matter if the exports needs to be recoveried.
1594                  * Because we need retrieve updates logs from all other MDTs.
1595                  * So if the MDS-MDS export is new, obd_max_recoverable_clients
1596                  * also needs to be increased to match other recovery checking
1597                  * condition.
1598                  */
1599                 if (new_mds_mds_conn)
1600                         atomic_inc(&target->obd_max_recoverable_clients);
1601
1602                 if (atomic_inc_return(&target->obd_connected_clients) ==
1603                     atomic_read(&target->obd_max_recoverable_clients))
1604                         wake_up(&target->obd_next_transno_waitq);
1605         }
1606
1607         /* Tell the client we're in recovery, when client is involved in it. */
1608         if (target->obd_recovering && !lw_client)
1609                 lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING);
1610
1611 out:
1612         if (export) {
1613                 spin_lock(&export->exp_lock);
1614                 export->exp_connecting = 0;
1615                 spin_unlock(&export->exp_lock);
1616
1617                 class_export_put(export);
1618         }
1619         if (target) {
1620                 if (atomic_dec_and_test(&target->obd_conn_inprogress))
1621                         wake_up_var(&target->obd_conn_inprogress);
1622                 class_decref(target, "find", current);
1623         }
1624         if (pcon)
1625                 ptlrpc_connection_put(pcon);
1626         req->rq_status = rc;
1627         RETURN(rc);
1628 }
1629
1630 int target_handle_disconnect(struct ptlrpc_request *req)
1631 {
1632         int rc;
1633
1634         ENTRY;
1635
1636         rc = req_capsule_server_pack(&req->rq_pill);
1637         if (rc)
1638                 RETURN(rc);
1639
1640         /* In case of target disconnect, updating sec ctx immediately is
1641          * required in order to record latest sequence number used.
1642          * Sequence is normally updated on export destroy, but this event
1643          * can occur too late, ie after a new target connect request has
1644          * been processed.
1645          * Maintaining correct sequence when client connection becomes idle
1646          * ensures that GSS does not erroneously consider requests as replays.
1647          */
1648         rc = sptlrpc_export_update_ctx(req->rq_export);
1649         if (rc)
1650                 RETURN(rc);
1651
1652         /* Keep the rq_export around so we can send the reply. */
1653         req->rq_status = obd_disconnect(class_export_get(req->rq_export));
1654
1655         RETURN(0);
1656 }
1657
1658 void target_destroy_export(struct obd_export *exp)
1659 {
1660         struct obd_import *imp = NULL;
1661         /*
1662          * exports created from last_rcvd data, and "fake"
1663          * exports created by lctl don't have an import
1664          */
1665         spin_lock(&exp->exp_lock);
1666         if (exp->exp_imp_reverse != NULL) {
1667                 imp = exp->exp_imp_reverse;
1668                 exp->exp_imp_reverse = NULL;
1669         }
1670         spin_unlock(&exp->exp_lock);
1671         if (imp != NULL)
1672                 client_destroy_import(imp);
1673
1674         LASSERT(atomic_read(&exp->exp_locks_count) == 0);
1675         LASSERT(atomic_read(&exp->exp_rpc_count) == 0);
1676         LASSERT(atomic_read(&exp->exp_cb_count) == 0);
1677         LASSERT(atomic_read(&exp->exp_replay_count) == 0);
1678 }
1679 EXPORT_SYMBOL(target_destroy_export);
1680
1681 /*
1682  * Recovery functions
1683  */
1684 static void target_request_copy_get(struct ptlrpc_request *req)
1685 {
1686         class_export_rpc_inc(req->rq_export);
1687         LASSERT(list_empty(&req->rq_list));
1688         INIT_LIST_HEAD(&req->rq_replay_list);
1689
1690         /* Increase refcount to keep request in queue. */
1691         atomic_inc(&req->rq_refcount);
1692         /* Let export know it has replays to be handled. */
1693         atomic_inc(&req->rq_export->exp_replay_count);
1694 }
1695
1696 static void target_request_copy_put(struct ptlrpc_request *req)
1697 {
1698         LASSERT(list_empty(&req->rq_replay_list));
1699         LASSERT(atomic_read(&(req)->rq_export->exp_replay_count) > 0);
1700
1701         atomic_dec(&req->rq_export->exp_replay_count);
1702         class_export_rpc_dec(req->rq_export);
1703         ptlrpc_server_drop_request(req);
1704 }
1705
1706 static int target_exp_enqueue_req_replay(struct ptlrpc_request *req)
1707 {
1708         __u64 transno = lustre_msg_get_transno(req->rq_reqmsg);
1709         struct obd_export *exp = req->rq_export;
1710         struct ptlrpc_request *reqiter;
1711         struct ptlrpc_request *dup_req = NULL;
1712         int dup = 0;
1713
1714         LASSERT(exp);
1715
1716         spin_lock(&exp->exp_lock);
1717         list_for_each_entry(reqiter, &exp->exp_req_replay_queue,
1718                             rq_replay_list) {
1719                 if (lustre_msg_get_transno(reqiter->rq_reqmsg) == transno) {
1720                         dup_req = reqiter;
1721                         dup = 1;
1722                         break;
1723                 }
1724         }
1725
1726         if (dup) {
1727                 /* We expect it with RESENT and REPLAY flags. */
1728                 if ((lustre_msg_get_flags(req->rq_reqmsg) &
1729                     (MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY))
1730                         CERROR("invalid flags %x of resent replay\n",
1731                                lustre_msg_get_flags(req->rq_reqmsg));
1732
1733                 if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
1734                         __u32 new_conn;
1735
1736                         new_conn = lustre_msg_get_conn_cnt(req->rq_reqmsg);
1737                         if (new_conn >
1738                             lustre_msg_get_conn_cnt(dup_req->rq_reqmsg))
1739                                 lustre_msg_set_conn_cnt(dup_req->rq_reqmsg,
1740                                                         new_conn);
1741                 }
1742         } else {
1743                 list_add_tail(&req->rq_replay_list,
1744                               &exp->exp_req_replay_queue);
1745         }
1746
1747         spin_unlock(&exp->exp_lock);
1748         return dup;
1749 }
1750
1751 static void target_exp_dequeue_req_replay(struct ptlrpc_request *req)
1752 {
1753         LASSERT(!list_empty(&req->rq_replay_list));
1754         LASSERT(req->rq_export);
1755
1756         spin_lock(&req->rq_export->exp_lock);
1757         list_del_init(&req->rq_replay_list);
1758         spin_unlock(&req->rq_export->exp_lock);
1759 }
1760
1761 static void target_finish_recovery(struct lu_target *lut)
1762 {
1763         struct obd_device *obd = lut->lut_obd;
1764
1765         ENTRY;
1766
1767         /* Only log a recovery message when recovery has occurred. */
1768         if (obd->obd_recovery_start) {
1769                 time64_t now = ktime_get_seconds();
1770                 time64_t elapsed_time;
1771
1772                 elapsed_time = max_t(time64_t, now - obd->obd_recovery_start,
1773                                      1);
1774                 LCONSOLE_INFO("%s: Recovery over after %lld:%.02lld, of %d clients %d recovered and %d %s evicted.\n",
1775                               obd->obd_name, elapsed_time / 60,
1776                               elapsed_time % 60,
1777                               atomic_read(&obd->obd_max_recoverable_clients),
1778                               atomic_read(&obd->obd_connected_clients),
1779                               obd->obd_stale_clients,
1780                               obd->obd_stale_clients == 1 ? "was" : "were");
1781                 if (obd->obd_stale_clients && do_dump_on_eviction(obd))
1782                         libcfs_debug_dumplog();
1783         }
1784
1785         ldlm_reprocess_recovery_done(obd->obd_namespace);
1786         spin_lock(&obd->obd_recovery_task_lock);
1787         if (!list_empty(&obd->obd_req_replay_queue) ||
1788             !list_empty(&obd->obd_lock_replay_queue) ||
1789             !list_empty(&obd->obd_final_req_queue)) {
1790                 CERROR("%s: Recovery queues ( %s%s%s) are not empty\n",
1791                        obd->obd_name,
1792                        list_empty(&obd->obd_req_replay_queue) ? "" : "req ",
1793                        list_empty(&obd->obd_lock_replay_queue) ?
1794                                   "" : "lock ",
1795                        list_empty(&obd->obd_final_req_queue) ?
1796                                   "" : "final ");
1797                 spin_unlock(&obd->obd_recovery_task_lock);
1798                 LBUG();
1799         }
1800         spin_unlock(&obd->obd_recovery_task_lock);
1801
1802         obd->obd_recovery_end = ktime_get_seconds();
1803
1804         /* When recovery finished, cleanup orphans on MDS and OST. */
1805         if (obd->obd_type && obd->obd_type->typ_dt_ops->o_postrecov) {
1806                 int rc = obd->obd_type->typ_dt_ops->o_postrecov(obd);
1807
1808                 if (rc < 0)
1809                         LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
1810                                       obd->obd_name, rc);
1811         }
1812         EXIT;
1813 }
1814
1815 static void abort_req_replay_queue(struct obd_device *obd)
1816 {
1817         struct ptlrpc_request *req, *n;
1818         LIST_HEAD(abort_list);
1819
1820         spin_lock(&obd->obd_recovery_task_lock);
1821         list_splice_init(&obd->obd_req_replay_queue, &abort_list);
1822         spin_unlock(&obd->obd_recovery_task_lock);
1823         list_for_each_entry_safe(req, n, &abort_list, rq_list) {
1824                 DEBUG_REQ(D_WARNING, req, "aborted:");
1825                 req->rq_status = -ENOTCONN;
1826                 if (ptlrpc_error(req)) {
1827                         DEBUG_REQ(D_ERROR, req,
1828                                   "failed abort_req_reply; skipping");
1829                 }
1830                 target_exp_dequeue_req_replay(req);
1831                 target_request_copy_put(req);
1832         }
1833 }
1834
1835 static void abort_lock_replay_queue(struct obd_device *obd)
1836 {
1837         struct ptlrpc_request *req, *n;
1838         LIST_HEAD(abort_list);
1839
1840         spin_lock(&obd->obd_recovery_task_lock);
1841         list_splice_init(&obd->obd_lock_replay_queue, &abort_list);
1842         spin_unlock(&obd->obd_recovery_task_lock);
1843         list_for_each_entry_safe(req, n, &abort_list, rq_list) {
1844                 DEBUG_REQ(D_ERROR, req, "aborted:");
1845                 req->rq_status = -ENOTCONN;
1846                 if (ptlrpc_error(req)) {
1847                         DEBUG_REQ(D_ERROR, req,
1848                                   "failed abort_lock_reply; skipping");
1849                 }
1850                 target_request_copy_put(req);
1851         }
1852 }
1853
1854 /*
1855  * Called from a cleanup function if the device is being cleaned up
1856  * forcefully.  The exports should all have been disconnected already,
1857  * the only thing left to do is
1858  * - clear the recovery flags
1859  * - cancel the timer
1860  * - free queued requests and replies, but don't send replies
1861  * Because the obd_stopping flag is set, no new requests should be received.
1862  */
1863 void target_cleanup_recovery(struct obd_device *obd)
1864 {
1865         struct ptlrpc_request *req, *n;
1866         LIST_HEAD(clean_list);
1867
1868         spin_lock(&obd->obd_dev_lock);
1869         if (!obd->obd_recovering) {
1870                 spin_unlock(&obd->obd_dev_lock);
1871                 EXIT;
1872                 return;
1873         }
1874         obd->obd_recovering = 0;
1875         obd->obd_abort_recovery = 0;
1876         obd->obd_abort_mdt_recovery = 0;
1877         spin_unlock(&obd->obd_dev_lock);
1878
1879         spin_lock(&obd->obd_recovery_task_lock);
1880         target_cancel_recovery_timer(obd);
1881         list_splice_init(&obd->obd_req_replay_queue, &clean_list);
1882         spin_unlock(&obd->obd_recovery_task_lock);
1883
1884         list_for_each_entry_safe(req, n, &clean_list, rq_list) {
1885                 LASSERT(req->rq_reply_state == NULL);
1886                 target_exp_dequeue_req_replay(req);
1887                 target_request_copy_put(req);
1888         }
1889
1890         spin_lock(&obd->obd_recovery_task_lock);
1891         list_splice_init(&obd->obd_lock_replay_queue, &clean_list);
1892         list_splice_init(&obd->obd_final_req_queue, &clean_list);
1893         spin_unlock(&obd->obd_recovery_task_lock);
1894
1895         list_for_each_entry_safe(req, n, &clean_list, rq_list) {
1896                 LASSERT(req->rq_reply_state == NULL);
1897                 target_request_copy_put(req);
1898         }
1899
1900         EXIT;
1901 }
1902 EXPORT_SYMBOL(target_cleanup_recovery);
1903
1904 /* obd_recovery_task_lock should be held */
1905 void target_cancel_recovery_timer(struct obd_device *obd)
1906 {
1907         CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name);
1908         hrtimer_cancel(&obd->obd_recovery_timer);
1909 }
1910
1911 static void target_start_recovery_timer(struct obd_device *obd)
1912 {
1913         ktime_t delay;
1914
1915         if (obd->obd_recovery_start != 0)
1916                 return;
1917
1918         spin_lock(&obd->obd_dev_lock);
1919         if (!obd->obd_recovering || obd->obd_abort_recovery) {
1920                 spin_unlock(&obd->obd_dev_lock);
1921                 return;
1922         }
1923
1924         LASSERT(obd->obd_recovery_timeout != 0);
1925
1926         if (obd->obd_recovery_start != 0) {
1927                 spin_unlock(&obd->obd_dev_lock);
1928                 return;
1929         }
1930
1931         obd->obd_recovery_start = ktime_get_seconds();
1932         delay = ktime_set(obd->obd_recovery_start +
1933                           obd->obd_recovery_timeout, 0);
1934         hrtimer_start(&obd->obd_recovery_timer, delay, HRTIMER_MODE_ABS);
1935         spin_unlock(&obd->obd_dev_lock);
1936
1937         LCONSOLE_WARN("%s: Will be in recovery for at least %u:%02u, or until %d client%s reconnect%s\n",
1938                       obd->obd_name,
1939                       obd->obd_recovery_timeout / 60,
1940                       obd->obd_recovery_timeout % 60,
1941                       atomic_read(&obd->obd_max_recoverable_clients),
1942                       (atomic_read(&obd->obd_max_recoverable_clients) == 1) ?
1943                       "" : "s",
1944                       (atomic_read(&obd->obd_max_recoverable_clients) == 1) ?
1945                       "s" : "");
1946 }
1947
1948 /**
1949  * extend recovery window.
1950  *
1951  * if @extend is true, extend recovery window to have @dr_timeout remaining
1952  * at least; otherwise, make sure the recovery timeout value is not less
1953  * than @dr_timeout.
1954  */
1955 static void extend_recovery_timer(struct obd_device *obd, timeout_t dr_timeout,
1956                                   bool extend)
1957 {
1958         ktime_t left_ns;
1959         timeout_t timeout;
1960         timeout_t left;
1961
1962         spin_lock(&obd->obd_dev_lock);
1963         if (!obd->obd_recovering || obd->obd_abort_recovery ||
1964             obd->obd_stopping) {
1965                 spin_unlock(&obd->obd_dev_lock);
1966                 return;
1967         }
1968         LASSERT(obd->obd_recovery_start != 0);
1969
1970         left_ns = hrtimer_get_remaining(&obd->obd_recovery_timer);
1971         left = ktime_divns(left_ns, NSEC_PER_SEC);
1972
1973         if (extend) {
1974                 timeout = obd->obd_recovery_timeout;
1975                 /* dr_timeout will happen after the hrtimer has expired.
1976                  * Add the excess time to the soft recovery timeout without
1977                  * exceeding the hard recovery timeout.
1978                  */
1979                 if (dr_timeout > left) {
1980                         timeout += dr_timeout - left;
1981                         timeout = min_t(timeout_t, obd->obd_recovery_time_hard,
1982                                         timeout);
1983                 }
1984         } else {
1985                 timeout = clamp_t(timeout_t, dr_timeout,
1986                                   obd->obd_recovery_timeout,
1987                                   obd->obd_recovery_time_hard);
1988         }
1989
1990         if (timeout == obd->obd_recovery_time_hard)
1991                 CWARN("%s: extended recovery timer reached hard limit: %d, extend: %d\n",
1992                       obd->obd_name, timeout, extend);
1993
1994         if (obd->obd_recovery_timeout < timeout) {
1995                 ktime_t end, now;
1996
1997                 obd->obd_recovery_timeout = timeout;
1998                 end = ktime_set(obd->obd_recovery_start + timeout, 0);
1999                 now = ktime_set(ktime_get_seconds(), 0);
2000                 left_ns = ktime_sub(end, now);
2001                 hrtimer_start(&obd->obd_recovery_timer, end, HRTIMER_MODE_ABS);
2002                 left = ktime_divns(left_ns, NSEC_PER_SEC);
2003         }
2004         spin_unlock(&obd->obd_dev_lock);
2005
2006         CDEBUG(D_HA, "%s: recovery timer will expire in %d seconds\n",
2007                 obd->obd_name, left);
2008 }
2009
2010 /* Reset the timer with each new client connection */
2011 /*
2012  * This timer is actually reconnect_timer, which is for making sure
2013  * the total recovery window is at least as big as my reconnect
2014  * attempt timing. So the initial recovery time_out will be set to
2015  * OBD_RECOVERY_FACTOR * obd_timeout. If the timeout coming
2016  * from client is bigger than this, then the recovery time_out will
2017  * be extended to make sure the client could be reconnected, in the
2018  * process, the timeout from the new client should be ignored.
2019  */
2020 static void
2021 check_and_start_recovery_timer(struct obd_device *obd,
2022                                struct ptlrpc_request *req,
2023                                int new_client)
2024 {
2025         timeout_t service_timeout = lustre_msg_get_service_timeout(req->rq_reqmsg);
2026         struct obd_device_target *obt = obd2obt(obd);
2027
2028         if (!new_client && service_timeout)
2029                 /*
2030                  * Teach server about old server's estimates, as first guess
2031                  * at how long new requests will take.
2032                  */
2033                 obd_at_measure(obd, &req->rq_rqbd->rqbd_svcpt->scp_at_estimate,
2034                                service_timeout);
2035
2036         target_start_recovery_timer(obd);
2037
2038         /*
2039          * Convert the service time to RPC timeout,
2040          * and reuse service_timeout to limit stack usage.
2041          */
2042         service_timeout = at_est2timeout(service_timeout);
2043
2044         if (CFS_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) &&
2045             service_timeout < at_extra)
2046                 service_timeout = at_extra;
2047
2048         /*
2049          * We expect other clients to timeout within service_timeout, then try
2050          * to reconnect, then try the failover server.  The max delay between
2051          * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL.
2052          */
2053         service_timeout += 2 * INITIAL_CONNECT_TIMEOUT;
2054
2055         LASSERT(obt->obt_magic == OBT_MAGIC);
2056         service_timeout += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC);
2057         if (service_timeout > obd->obd_recovery_timeout && !new_client)
2058                 extend_recovery_timer(obd, service_timeout, false);
2059 }
2060
2061 /** Health checking routines */
2062 static inline int exp_connect_healthy(struct obd_export *exp)
2063 {
2064         return exp->exp_in_recovery;
2065 }
2066
2067 /** if export done req_replay or has replay in queue */
2068 static inline int exp_req_replay_healthy(struct obd_export *exp)
2069 {
2070         return (!exp->exp_req_replay_needed ||
2071                 atomic_read(&exp->exp_replay_count) > 0);
2072 }
2073
2074
2075 static inline int exp_req_replay_healthy_or_from_mdt(struct obd_export *exp)
2076 {
2077         return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) ||
2078                exp_req_replay_healthy(exp);
2079 }
2080
2081 /** if export done lock_replay or has replay in queue */
2082 static inline int exp_lock_replay_healthy(struct obd_export *exp)
2083 {
2084         return (!exp->exp_lock_replay_needed ||
2085                 atomic_read(&exp->exp_replay_count) > 0);
2086 }
2087
2088 static inline int exp_vbr_healthy(struct obd_export *exp)
2089 {
2090         return !exp->exp_vbr_failed;
2091 }
2092
2093 static inline int exp_finished(struct obd_export *exp)
2094 {
2095         return exp->exp_in_recovery && !exp->exp_lock_replay_needed;
2096 }
2097
2098 static inline int exp_finished_or_from_mdt(struct obd_export *exp)
2099 {
2100         return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) ||
2101                 exp_finished(exp);
2102 }
2103
2104 static int check_for_next_transno(struct lu_target *lut)
2105 {
2106         struct ptlrpc_request *req = NULL;
2107         struct obd_device *obd = lut->lut_obd;
2108         struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
2109         int wake_up = 0, connected, completed, queue_len;
2110         __u64 req_transno = 0;
2111         __u64 update_transno = 0;
2112         __u64 next_transno = 0;
2113
2114         ENTRY;
2115
2116         spin_lock(&obd->obd_recovery_task_lock);
2117         if (!list_empty(&obd->obd_req_replay_queue)) {
2118                 req = list_first_entry(&obd->obd_req_replay_queue,
2119                                        struct ptlrpc_request, rq_list);
2120                 req_transno = lustre_msg_get_transno(req->rq_reqmsg);
2121         }
2122
2123         if (!obd_mdt_recovery_abort(obd) && tdtd)
2124                 update_transno = distribute_txn_get_next_transno(tdtd);
2125
2126         connected = atomic_read(&obd->obd_connected_clients);
2127         completed = connected - atomic_read(&obd->obd_req_replay_clients);
2128         queue_len = obd->obd_requests_queued_for_recovery;
2129         next_transno = obd->obd_next_recovery_transno;
2130
2131         CDEBUG(D_HA,
2132                "max: %d, connected: %d, completed: %d, queue_len: %d, req_transno: %llu, next_transno: %llu\n",
2133                atomic_read(&obd->obd_max_recoverable_clients),
2134                connected, completed,
2135                queue_len, req_transno, next_transno);
2136
2137         if (obd_recovery_abort(obd)) {
2138                 CDEBUG(D_HA, "waking for aborted recovery\n");
2139                 wake_up = 1;
2140         } else if (obd->obd_recovery_expired) {
2141                 CDEBUG(D_HA, "waking for expired recovery\n");
2142                 wake_up = 1;
2143         } else if (!obd_mdt_recovery_abort(obd) && tdtd && req &&
2144                    is_req_replayed_by_update(req)) {
2145                 LASSERTF(req_transno < next_transno,
2146                          "req_transno %llu next_transno%llu\n", req_transno,
2147                          next_transno);
2148                 CDEBUG(D_HA, "waking for duplicate req (%llu)\n",
2149                        req_transno);
2150                 wake_up = 1;
2151         } else if (req_transno == next_transno ||
2152                    (update_transno != 0 && update_transno <= next_transno)) {
2153                 CDEBUG(D_HA, "waking for next (%lld)\n", next_transno);
2154                 wake_up = 1;
2155         } else if (queue_len > 0 &&
2156                    queue_len == atomic_read(&obd->obd_req_replay_clients)) {
2157                 /** handle gaps occured due to lost reply or VBR */
2158                 LASSERTF(req_transno >= next_transno,
2159                          "req_transno: %llu, next_transno: %llu\n",
2160                          req_transno, next_transno);
2161                 CDEBUG(D_HA,
2162                        "%s: waking for gap in transno, VBR is %s (skip: %lld, ql: %d, comp: %d, conn: %d, next: %lld, next_update %lld last_committed: %lld)\n",
2163                        obd->obd_name, obd->obd_version_recov ? "ON" : "OFF",
2164                        next_transno, queue_len, completed, connected,
2165                        req_transno, update_transno, obd->obd_last_committed);
2166                 obd->obd_next_recovery_transno = req_transno;
2167                 wake_up = 1;
2168         } else if (atomic_read(&obd->obd_req_replay_clients) == 0) {
2169                 CDEBUG(D_HA, "waking for completed recovery\n");
2170                 wake_up = 1;
2171         } else if (CFS_FAIL_CHECK(OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS)) {
2172                 CDEBUG(D_HA,
2173                        "accepting transno gaps is explicitly allowed by fail_lock, waking up (%lld)\n",
2174                        next_transno);
2175                 obd->obd_next_recovery_transno = req_transno;
2176                 wake_up = 1;
2177         }
2178         spin_unlock(&obd->obd_recovery_task_lock);
2179         return wake_up;
2180 }
2181
2182 static int check_for_next_lock(struct lu_target *lut)
2183 {
2184         struct obd_device *obd = lut->lut_obd;
2185         int wake_up = 0;
2186
2187         spin_lock(&obd->obd_recovery_task_lock);
2188         if (!list_empty(&obd->obd_lock_replay_queue)) {
2189                 CDEBUG(D_HA, "waking for next lock\n");
2190                 wake_up = 1;
2191         } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
2192                 CDEBUG(D_HA, "waking for completed lock replay\n");
2193                 wake_up = 1;
2194         } else if (obd->obd_abort_recovery) {
2195                 CDEBUG(D_HA, "waking for aborted recovery\n");
2196                 wake_up = 1;
2197         } else if (obd->obd_recovery_expired) {
2198                 CDEBUG(D_HA, "waking for expired recovery\n");
2199                 wake_up = 1;
2200         }
2201         spin_unlock(&obd->obd_recovery_task_lock);
2202
2203         return wake_up;
2204 }
2205
2206 static int check_update_llog(struct lu_target *lut)
2207 {
2208         struct obd_device *obd = lut->lut_obd;
2209         struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
2210
2211         if (obd_mdt_recovery_abort(obd)) {
2212                 CDEBUG(D_HA, "waking for aborted recovery\n");
2213                 return 1;
2214         }
2215
2216         if (atomic_read(&tdtd->tdtd_recovery_threads_count) == 0) {
2217                 CDEBUG(D_HA, "waking for completion of reading update log\n");
2218                 return 1;
2219         }
2220
2221         return 0;
2222 }
2223
2224 /**
2225  * wait for recovery events,
2226  * check its status with help of check_routine
2227  * evict dead clients via health_check
2228  */
2229 static int target_recovery_overseer(struct lu_target *lut,
2230                                     int (*check_routine)(struct lu_target *),
2231                                     int (*health_check)(struct obd_export *))
2232 {
2233         struct obd_device *obd = lut->lut_obd;
2234         struct target_distribute_txn_data *tdtd;
2235         time64_t last = 0;
2236         time64_t now;
2237 repeat:
2238         if (obd->obd_recovering && obd->obd_recovery_start == 0) {
2239                 now = ktime_get_seconds();
2240                 if (now - last > 600) {
2241                         LCONSOLE_INFO("%s: in recovery but waiting for the first client to connect\n",
2242                                       obd->obd_name);
2243                         last = now;
2244                 }
2245         }
2246         if (obd->obd_recovery_start != 0 && ktime_get_seconds() >=
2247               (obd->obd_recovery_start + obd->obd_recovery_time_hard)) {
2248                 __u64 next_update_transno = 0;
2249
2250                 /*
2251                  * Only abort the recovery if there are no update recovery
2252                  * left in the queue
2253                  */
2254                 spin_lock(&obd->obd_recovery_task_lock);
2255                 if (!obd_mdt_recovery_abort(obd) && lut->lut_tdtd) {
2256                         next_update_transno =
2257                                 distribute_txn_get_next_transno(lut->lut_tdtd);
2258
2259                         tdtd = lut->lut_tdtd;
2260                         /*
2261                          * If next_update_transno == 0, it probably because
2262                          * updatelog retrieve threads did not get any records
2263                          * yet, let's wait those threads stopped
2264                          */
2265                         if (next_update_transno == 0) {
2266                                 spin_unlock(&obd->obd_recovery_task_lock);
2267
2268                                 while (wait_event_timeout(
2269                                         tdtd->tdtd_recovery_threads_waitq,
2270                                         check_update_llog(lut),
2271                                         cfs_time_seconds(60)) == 0);
2272
2273                                 spin_lock(&obd->obd_recovery_task_lock);
2274                                 next_update_transno =
2275                                         distribute_txn_get_next_transno(tdtd);
2276                         }
2277                 }
2278
2279                 if (next_update_transno != 0 && !obd_recovery_abort(obd)) {
2280                         obd->obd_next_recovery_transno = next_update_transno;
2281                         spin_unlock(&obd->obd_recovery_task_lock);
2282                         /*
2283                          * Disconnect unfinished exports from clients, and
2284                          * keep connection from MDT to make sure the update
2285                          * recovery will still keep trying until some one
2286                          * manually abort the recovery
2287                          */
2288                         class_disconnect_stale_exports(obd,
2289                                                 exp_finished_or_from_mdt);
2290                         /* Abort all of replay & replay lock req from clients */
2291                         abort_req_replay_queue(obd);
2292                         abort_lock_replay_queue(obd);
2293                         CDEBUG(D_HA,
2294                                "%s: there are still update replay (%#llx)in the queue.\n",
2295                                obd->obd_name, next_update_transno);
2296                 } else {
2297                         obd->obd_abort_recovery = 1;
2298                         spin_unlock(&obd->obd_recovery_task_lock);
2299                         CWARN("%s recovery is aborted by hard timeout\n",
2300                               obd->obd_name);
2301                 }
2302         }
2303
2304         while (wait_event_timeout(obd->obd_next_transno_waitq,
2305                                   check_routine(lut),
2306                                   cfs_time_seconds(60)) == 0)
2307                 ; /* wait indefinitely for event, but don't trigger watchdog */
2308
2309         if (obd_recovery_abort(obd)) {
2310                 CWARN("recovery is aborted, evict exports in recovery\n");
2311                 if (lut->lut_tdtd != NULL) {
2312                         tdtd = lut->lut_tdtd;
2313                         /*
2314                          * Let's wait all of the update log recovery thread
2315                          * finished
2316                          */
2317                         wait_event_idle(
2318                                 tdtd->tdtd_recovery_threads_waitq,
2319                                 atomic_read(&tdtd->tdtd_recovery_threads_count)
2320                                 == 0);
2321                         /* Then abort the update recovery list */
2322                         dtrq_list_destroy(lut->lut_tdtd);
2323                 }
2324
2325                 /** evict exports which didn't finish recovery yet */
2326                 class_disconnect_stale_exports(obd, exp_finished);
2327                 return 1;
2328         } else if (obd->obd_recovery_expired) {
2329                 obd->obd_recovery_expired = 0;
2330
2331                 /** If some clients died being recovered, evict them */
2332                 LCONSOLE_WARN("%s: recovery is timed out, evict stale exports\n",
2333                               obd->obd_name);
2334                 /** evict cexports with no replay in queue, they are stalled */
2335                 class_disconnect_stale_exports(obd, health_check);
2336
2337                 /** continue with VBR */
2338                 spin_lock(&obd->obd_dev_lock);
2339                 obd->obd_version_recov = 1;
2340                 spin_unlock(&obd->obd_dev_lock);
2341                 /**
2342                  * reset timer, recovery will proceed with versions now,
2343                  * timeout is set just to handle reconnection delays
2344                  */
2345                 extend_recovery_timer(obd, RECONNECT_DELAY_MAX, true);
2346                 /**
2347                  * Wait for recovery events again, after evicting bad clients
2348                  */
2349                 goto repeat;
2350         }
2351         return 0;
2352 }
2353
2354 static struct ptlrpc_request *target_next_replay_lock(struct lu_target *lut)
2355 {
2356         struct obd_device *obd = lut->lut_obd;
2357         struct ptlrpc_request *req = NULL;
2358
2359         CDEBUG(D_HA, "Waiting for lock\n");
2360         if (target_recovery_overseer(lut, check_for_next_lock,
2361                                      exp_lock_replay_healthy))
2362                 abort_lock_replay_queue(obd);
2363
2364         spin_lock(&obd->obd_recovery_task_lock);
2365         if (!list_empty(&obd->obd_lock_replay_queue)) {
2366                 req = list_first_entry(&obd->obd_lock_replay_queue,
2367                                        struct ptlrpc_request, rq_list);
2368                 list_del_init(&req->rq_list);
2369                 spin_unlock(&obd->obd_recovery_task_lock);
2370         } else {
2371                 spin_unlock(&obd->obd_recovery_task_lock);
2372                 LASSERT(list_empty(&obd->obd_lock_replay_queue));
2373                 LASSERT(atomic_read(&obd->obd_lock_replay_clients) == 0);
2374                 /** evict exports failed VBR */
2375                 class_disconnect_stale_exports(obd, exp_vbr_healthy);
2376         }
2377         return req;
2378 }
2379
2380 static struct ptlrpc_request *target_next_final_ping(struct obd_device *obd)
2381 {
2382         struct ptlrpc_request *req = NULL;
2383
2384         spin_lock(&obd->obd_recovery_task_lock);
2385         if (!list_empty(&obd->obd_final_req_queue)) {
2386                 req = list_first_entry(&obd->obd_final_req_queue,
2387                                        struct ptlrpc_request, rq_list);
2388                 list_del_init(&req->rq_list);
2389                 spin_unlock(&obd->obd_recovery_task_lock);
2390                 if (req->rq_export->exp_in_recovery) {
2391                         spin_lock(&req->rq_export->exp_lock);
2392                         req->rq_export->exp_in_recovery = 0;
2393                         spin_unlock(&req->rq_export->exp_lock);
2394                 }
2395         } else {
2396                 spin_unlock(&obd->obd_recovery_task_lock);
2397         }
2398         return req;
2399 }
2400
2401 static void handle_recovery_req(struct ptlrpc_thread *thread,
2402                                 struct ptlrpc_request *req,
2403                                 svc_handler_t handler)
2404 {
2405         ENTRY;
2406
2407         /**
2408          * export can be evicted during recovery, no need to handle replays for
2409          * it after that, discard such request silently
2410          */
2411         if (req->rq_export->exp_disconnected)
2412                 RETURN_EXIT;
2413
2414         req->rq_session.lc_thread = thread;
2415         req->rq_svc_thread = thread;
2416         req->rq_svc_thread->t_env->le_ses = &req->rq_session;
2417
2418         /* thread context */
2419         lu_context_enter(&thread->t_env->le_ctx);
2420         (void)handler(req);
2421         lu_context_exit(&thread->t_env->le_ctx);
2422
2423         req->rq_svc_thread->t_env->le_ses = NULL;
2424
2425         /* don't reset timer for final stage */
2426         if (!exp_finished(req->rq_export)) {
2427                 timeout_t timeout = obd_timeout;
2428                 struct obd_device *obd = req->rq_export->exp_obd;
2429
2430                 /**
2431                  * Add request @timeout to the recovery time so next request from
2432                  * this client may come in recovery time
2433                  */
2434                 if (!obd_at_off(obd)) {
2435                         struct ptlrpc_service_part *svcpt;
2436                         timeout_t est_timeout;
2437
2438                         svcpt = req->rq_rqbd->rqbd_svcpt;
2439                         /*
2440                          * If the server sent early reply for this request,
2441                          * the client will recalculate the timeout according to
2442                          * current server estimate service time, so we will
2443                          * use the maxium timeout here for waiting the client
2444                          * sending the next req
2445                          */
2446                         est_timeout = obd_at_get(obd, &svcpt->scp_at_estimate);
2447                         timeout = max_t(timeout_t, at_est2timeout(est_timeout),
2448                                         lustre_msg_get_timeout(req->rq_reqmsg));
2449                         /*
2450                          * Add 2 net_latency, one for balance rq_deadline
2451                          * (see ptl_send_rpc), one for resend the req to server,
2452                          * Note: client will pack net_latency in replay req
2453                          * (see ptlrpc_replay_req)
2454                          */
2455                         timeout += 2 * lustre_msg_get_service_timeout(req->rq_reqmsg);
2456                 }
2457                 extend_recovery_timer(class_exp2obd(req->rq_export), timeout,
2458                                       true);
2459         }
2460         EXIT;
2461 }
2462
2463 /** Checking routines for recovery */
2464 static int check_for_recovery_ready(struct lu_target *lut)
2465 {
2466         struct obd_device *obd = lut->lut_obd;
2467         unsigned int clnts = atomic_read(&obd->obd_connected_clients);
2468
2469         CDEBUG(D_HA,
2470                "connected %d stale %d max_recoverable_clients %d abort %d expired %d\n",
2471                clnts, obd->obd_stale_clients,
2472                atomic_read(&obd->obd_max_recoverable_clients),
2473                obd->obd_abort_recovery, obd->obd_recovery_expired);
2474
2475         if (!obd_recovery_abort(obd) && !obd->obd_recovery_expired) {
2476                 LASSERT(clnts <=
2477                         atomic_read(&obd->obd_max_recoverable_clients));
2478                 if (clnts + obd->obd_stale_clients <
2479                     atomic_read(&obd->obd_max_recoverable_clients))
2480                         return 0;
2481         }
2482
2483         if (!obd_mdt_recovery_abort(obd) && lut->lut_tdtd &&
2484             !lut->lut_tdtd->tdtd_replay_ready) {
2485                 /* Let's extend recovery timer, in case the recovery timer
2486                  * expired, and some clients got evicted
2487                  */
2488                 extend_recovery_timer(obd, obd->obd_recovery_timeout, true);
2489                 CDEBUG(D_HA,
2490                        "%s update recovery is not ready, extend recovery %d\n",
2491                        obd->obd_name, obd->obd_recovery_timeout);
2492                 return 0;
2493         }
2494
2495         return 1;
2496 }
2497
/* Kind of replay reported by get_next_transno() through its 'type' argument */
enum {
	/* next transno comes from obd_req_replay_queue */
	REQUEST_RECOVERY = 1,
	/* next transno comes from the distribute txn (update) replay list */
	UPDATE_RECOVERY = 2,
};
2502
2503 static __u64 get_next_replay_req_transno(struct obd_device *obd)
2504 {
2505         __u64 transno = 0;
2506
2507         if (!list_empty(&obd->obd_req_replay_queue)) {
2508                 struct ptlrpc_request *req;
2509
2510                 req = list_first_entry(&obd->obd_req_replay_queue,
2511                                        struct ptlrpc_request, rq_list);
2512                 transno = lustre_msg_get_transno(req->rq_reqmsg);
2513         }
2514
2515         return transno;
2516 }
2517
2518 static __u64 get_next_transno(struct lu_target *lut, int *type)
2519 {
2520         struct obd_device *obd = lut->lut_obd;
2521         struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
2522         __u64 transno = 0;
2523         __u64 update_transno;
2524
2525         ENTRY;
2526
2527         transno = get_next_replay_req_transno(obd);
2528         if (type != NULL)
2529                 *type = REQUEST_RECOVERY;
2530
2531         if (!tdtd || obd_mdt_recovery_abort(obd))
2532                 RETURN(transno);
2533
2534         update_transno = distribute_txn_get_next_transno(tdtd);
2535         if (transno == 0 || (transno >= update_transno &&
2536                              update_transno != 0)) {
2537                 transno = update_transno;
2538                 if (type != NULL)
2539                         *type = UPDATE_RECOVERY;
2540         }
2541
2542         RETURN(transno);
2543 }
2544
2545 /**
2546  * drop duplicate replay request
2547  *
2548  * Because the operation has been replayed by update recovery, the request
2549  * with the same transno will be dropped and also notify the client to send
2550  * next replay request.
2551  *
2552  * \param[in] env       execution environment
2553  * \param[in] obd       failover obd device
2554  * \param[in] req       request to be dropped
2555  *
2556  * \retval true         duplicate replay update is dropped
2557  * \retval false        duplicate replay update is not dropped
2558  */
2559 static bool drop_duplicate_replay_req(struct lu_env *env,
2560                                       struct obd_device *obd,
2561                                       struct ptlrpc_request *req)
2562 {
2563         __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
2564
2565         DEBUG_REQ(D_HA, req,
2566                   "remove t%lld from %s because duplicate update records found",
2567                   lustre_msg_get_transno(req->rq_reqmsg),
2568                   libcfs_nidstr(&req->rq_peer.nid));
2569
2570         /*
2571          * Right now, only for MDS reint operation update replay and
2572          * normal request replay can have the same transno
2573          */
2574         if (opc == MDS_REINT) {
2575                 req_capsule_set(&req->rq_pill, &RQF_MDS_REINT);
2576                 req->rq_status = req_capsule_server_pack(&req->rq_pill);
2577                 if (likely(req->rq_export))
2578                         target_committed_to_req(req);
2579                 lustre_msg_set_transno(req->rq_repmsg, req->rq_transno);
2580                 target_send_reply(req, req->rq_status, 0);
2581         } else if (opc == MDS_CLOSE) {
2582                 DEBUG_REQ(D_HA, req, "duplicate close replay from %s\n",
2583                                 libcfs_nidstr(&req->rq_peer.nid));
2584                 return false;
2585         } else {
2586                 DEBUG_REQ(D_ERROR, req, "wrong opc %d from %s\n", opc,
2587                           libcfs_nidstr(&req->rq_peer.nid));
2588         }
2589         target_exp_dequeue_req_replay(req);
2590         target_request_copy_put(req);
2591         obd->obd_replayed_requests++;
2592
2593         return true;
2594 }
2595
2596 #define WATCHDOG_TIMEOUT (obd_timeout * 10)
2597
2598 static void replay_request_or_update(struct lu_env *env,
2599                                      struct lu_target *lut,
2600                                      struct target_recovery_data *trd,
2601                                      struct ptlrpc_thread *thread)
2602 {
2603         struct obd_device *obd = lut->lut_obd;
2604         struct ptlrpc_request *req = NULL;
2605         int type;
2606         __u64 transno;
2607
2608         ENTRY;
2609
2610         CDEBUG(D_HA, "Waiting for transno %lld\n",
2611                obd->obd_next_recovery_transno);
2612
2613         /* Replay all of request and update by transno */
2614         do {
2615                 struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
2616
2617                 CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);
2618
2619                 /**
2620                  * It is needed to extend recovery window above
2621                  *  recovery_time_soft. Extending is possible only in the
2622                  *  end of recovery window (see more details in
2623                  *  handle_recovery_req()).
2624                  */
2625                 CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
2626
2627                 if (target_recovery_overseer(lut, check_for_next_transno,
2628                                         exp_req_replay_healthy_or_from_mdt)) {
2629                         abort_req_replay_queue(obd);
2630                         abort_lock_replay_queue(obd);
2631                         goto abort;
2632                 }
2633
2634                 spin_lock(&obd->obd_recovery_task_lock);
2635                 transno = get_next_transno(lut, &type);
2636                 if (type == REQUEST_RECOVERY && transno != 0) {
2637                         bool update = false;
2638                         /*
2639                          * Drop replay request from client side, if the
2640                          * replay has been executed by update with the
2641                          * same transno
2642                          */
2643                         req = list_first_entry(&obd->obd_req_replay_queue,
2644                                                struct ptlrpc_request, rq_list);
2645
2646                         list_del_init(&req->rq_list);
2647                         obd->obd_requests_queued_for_recovery--;
2648                         spin_unlock(&obd->obd_recovery_task_lock);
2649
2650                         /*
2651                          * Let's check if the request has been redone by
2652                          * update replay
2653                          */
2654                         if (is_req_replayed_by_update(req)) {
2655                                 struct distribute_txn_replay_req *dtrq;
2656
2657                                 dtrq = distribute_txn_lookup_finish_list(tdtd,
2658                                                                       transno);
2659                                 LASSERT(dtrq != NULL);
2660                                 spin_lock(&tdtd->tdtd_replay_list_lock);
2661                                 list_del_init(&dtrq->dtrq_list);
2662                                 spin_unlock(&tdtd->tdtd_replay_list_lock);
2663                                 dtrq_destroy(dtrq);
2664
2665                                 if (drop_duplicate_replay_req(env, obd, req))
2666                                         continue;
2667                                 /* not dropped yet */
2668                                 update = true;
2669                         }
2670
2671                         LASSERT(trd->trd_processing_task == current->pid);
2672                         DEBUG_REQ(D_HA, req, "processing x%llu t%lld from %s",
2673                                   req->rq_xid,
2674                                   lustre_msg_get_transno(req->rq_reqmsg),
2675                                   libcfs_nidstr(&req->rq_peer.nid));
2676
2677                         ptlrpc_watchdog_init(&thread->t_watchdog,
2678                                              WATCHDOG_TIMEOUT);
2679                         handle_recovery_req(thread, req,
2680                                             trd->trd_recovery_handler);
2681                         ptlrpc_watchdog_disable(&thread->t_watchdog);
2682
2683                         /**
2684                          * bz18031: increase next_recovery_transno before
2685                          * target_request_copy_put() will drop exp_rpc reference
2686                          */
2687                         if (!update) {
2688                                 spin_lock(&obd->obd_recovery_task_lock);
2689                                 obd->obd_next_recovery_transno++;
2690                                 spin_unlock(&obd->obd_recovery_task_lock);
2691                         }
2692                         target_exp_dequeue_req_replay(req);
2693                         target_request_copy_put(req);
2694                         obd->obd_replayed_requests++;
2695                 } else if (type == UPDATE_RECOVERY && transno != 0) {
2696                         struct distribute_txn_replay_req *dtrq;
2697                         int rc;
2698
2699                         spin_unlock(&obd->obd_recovery_task_lock);
2700
2701                         LASSERT(tdtd != NULL);
2702                         dtrq = distribute_txn_get_next_req(tdtd);
2703                         lu_context_enter(&thread->t_env->le_ctx);
2704                         ptlrpc_watchdog_init(&thread->t_watchdog,
2705                                              WATCHDOG_TIMEOUT);
2706                         rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq);
2707                         ptlrpc_watchdog_disable(&thread->t_watchdog);
2708                         lu_context_exit(&thread->t_env->le_ctx);
2709                         extend_recovery_timer(obd, obd_timeout, true);
2710
2711                         if (rc == 0 && dtrq->dtrq_xid != 0) {
2712                                 CDEBUG(D_HA,
2713                                        "Move x%llu t%llu to finish list\n",
2714                                        dtrq->dtrq_xid,
2715                                        dtrq->dtrq_master_transno);
2716
2717                                 /* Add it to the replay finish list */
2718                                 spin_lock(&tdtd->tdtd_replay_list_lock);
2719                                 list_add(&dtrq->dtrq_list,
2720                                          &tdtd->tdtd_replay_finish_list);
2721                                 spin_unlock(&tdtd->tdtd_replay_list_lock);
2722
2723                                 spin_lock(&obd->obd_recovery_task_lock);
2724                                 if (transno == obd->obd_next_recovery_transno)
2725                                         obd->obd_next_recovery_transno++;
2726                                 else if (transno >
2727                                          obd->obd_next_recovery_transno)
2728                                         obd->obd_next_recovery_transno =
2729                                                                 transno + 1;
2730                                 spin_unlock(&obd->obd_recovery_task_lock);
2731                         } else {
2732                                 dtrq_destroy(dtrq);
2733                         }
2734                 } else {
2735                         spin_unlock(&obd->obd_recovery_task_lock);
2736 abort:
2737                         LASSERT(list_empty(&obd->obd_req_replay_queue));
2738                         LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
2739                         /** evict exports failed VBR */
2740                         class_disconnect_stale_exports(obd, exp_vbr_healthy);
2741                         break;
2742                 }
2743         } while (1);
2744 }
2745
2746 static int target_recovery_thread(void *arg)
2747 {
2748         struct lu_target *lut = arg;
2749         struct obd_device *obd = lut->lut_obd;
2750         struct ptlrpc_request *req;
2751         struct target_recovery_data *trd = &obd->obd_recovery_data;
2752         unsigned long delta;
2753         struct lu_env *env;
2754         struct ptlrpc_thread *thread = NULL;
2755         int rc = 0;
2756
2757         ENTRY;
2758         unshare_fs_struct();
2759         OBD_ALLOC_PTR(thread);
2760         if (thread == NULL)
2761                 RETURN(-ENOMEM);
2762
2763         OBD_ALLOC_PTR(env);
2764         if (env == NULL)
2765                 GOTO(out_thread, rc = -ENOMEM);
2766         rc = lu_env_add(env);
2767         if (rc)
2768                 GOTO(out_env, rc);
2769
2770         rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD);
2771         if (rc)
2772                 GOTO(out_env_remove, rc);
2773
2774         thread->t_env = env;
2775         thread->t_id = -1; /* force filter_iobuf_get/put to use local buffers */
2776         thread->t_task = current;
2777         env->le_ctx.lc_thread = thread;
2778         tgt_io_thread_init(thread); /* init thread_big_cache for IO requests */
2779
2780         CDEBUG(D_HA, "%s: started recovery thread pid %d\n", obd->obd_name,
2781                current->pid);
2782         trd->trd_processing_task = current->pid;
2783
2784         spin_lock(&obd->obd_dev_lock);
2785         obd->obd_recovering = 1;
2786         spin_unlock(&obd->obd_dev_lock);
2787         complete(&trd->trd_starting);
2788
2789         /* first of all, we have to know the first transno to replay */
2790         if (target_recovery_overseer(lut, check_for_recovery_ready,
2791                                      exp_connect_healthy)) {
2792                 abort_req_replay_queue(obd);
2793                 abort_lock_replay_queue(obd);
2794                 if (lut->lut_tdtd != NULL)
2795                         dtrq_list_destroy(lut->lut_tdtd);
2796         }
2797
2798         /* next stage: replay requests or update */
2799         delta = jiffies;
2800         CDEBUG(D_INFO, "1: request replay stage - %d clients from t%llu\n",
2801                atomic_read(&obd->obd_req_replay_clients),
2802                obd->obd_next_recovery_transno);
2803         replay_request_or_update(env, lut, trd, thread);
2804
2805         /**
2806          * The second stage: replay locks
2807          */
2808         CDEBUG(D_INFO, "2: lock replay stage - %d clients\n",
2809                atomic_read(&obd->obd_lock_replay_clients));
2810         while ((req = target_next_replay_lock(lut))) {
2811                 LASSERT(trd->trd_processing_task == current->pid);
2812                 DEBUG_REQ(D_HA, req, "processing lock from %s:",
2813                           libcfs_nidstr(&req->rq_peer.nid));
2814                 if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_REPLAY)) {
2815                         req->rq_status = -ENODEV;
2816                         target_request_copy_put(req);
2817                         continue;
2818                 }
2819                 handle_recovery_req(thread, req,
2820                                     trd->trd_recovery_handler);
2821                 target_request_copy_put(req);
2822                 obd->obd_replayed_locks++;
2823         }
2824
2825         /**
2826          * The third stage: reply on final pings, at this moment all clients
2827          * must have request in final queue
2828          */
2829         CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_RECONNECT, cfs_fail_val);
2830         CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n");
2831         /** Update server last boot epoch */
2832         tgt_boot_epoch_update(lut);
2833
2834         /* cancel update llogs upon recovery abort */
2835         if (obd->obd_abort_recovery || obd->obd_abort_mdt_recovery)
2836                 obd->obd_type->typ_dt_ops->o_iocontrol(OBD_IOC_LLOG_CANCEL,
2837                                                        obd->obd_self_export,
2838                                                        0, trd, NULL);
2839
2840         list_for_each_entry(req, &obd->obd_final_req_queue, rq_list) {
2841                 /*
2842                  * Because the waiting client can not send ping to server,
2843                  * so we need refresh the last_request_time, to avoid the
2844                  * export is being evicted
2845                  */
2846                 ptlrpc_update_export_timer(req->rq_export, 0);
2847         }
2848
2849         /*
2850          * We drop recoverying flag to forward all new requests
2851          * to regular mds_handle() since now
2852          */
2853         spin_lock(&obd->obd_dev_lock);
2854         obd->obd_recovering = 0;
2855         obd->obd_abort_recovery = 0;
2856         obd->obd_abort_mdt_recovery = 0;
2857         spin_unlock(&obd->obd_dev_lock);
2858         spin_lock(&obd->obd_recovery_task_lock);
2859         target_cancel_recovery_timer(obd);
2860         spin_unlock(&obd->obd_recovery_task_lock);
2861         while ((req = target_next_final_ping(obd))) {
2862                 LASSERT(trd->trd_processing_task == current->pid);
2863                 DEBUG_REQ(D_HA, req, "processing final ping from %s:",
2864                           libcfs_nidstr(&req->rq_peer.nid));
2865                 handle_recovery_req(thread, req,
2866                                     trd->trd_recovery_handler);
2867                 target_request_copy_put(req);
2868         }
2869
2870         delta = jiffies_to_msecs(jiffies - delta) / MSEC_PER_SEC;
2871         CDEBUG(D_INFO, "4: recovery completed in %lus - %d/%d reqs/locks\n",
2872                delta, obd->obd_replayed_requests, obd->obd_replayed_locks);
2873         if (delta > OBD_RECOVERY_TIME_SOFT) {
2874                 CWARN("too long recovery - read logs\n");
2875                 libcfs_debug_dumplog();
2876         }
2877
2878         target_finish_recovery(lut);
2879         lu_context_fini(&env->le_ctx);
2880         trd->trd_processing_task = 0;
2881         complete_all(&trd->trd_finishing);
2882         tgt_io_thread_done(thread);
2883 out_env_remove:
2884         lu_env_remove(env);
2885 out_env:
2886         OBD_FREE_PTR(env);
2887 out_thread:
2888         OBD_FREE_PTR(thread);
2889         RETURN(rc);
2890 }
2891
2892 static int target_start_recovery_thread(struct lu_target *lut,
2893                                         svc_handler_t handler)
2894 {
2895         struct obd_device *obd = lut->lut_obd;
2896         int rc = 0;
2897         struct target_recovery_data *trd = &obd->obd_recovery_data;
2898         int index;
2899
2900         memset(trd, 0, sizeof(*trd));
2901         init_completion(&trd->trd_starting);
2902         init_completion(&trd->trd_finishing);
2903         trd->trd_recovery_handler = handler;
2904
2905         rc = server_name2index(obd->obd_name, &index, NULL);
2906         if (rc < 0)
2907                 return rc;
2908
2909         if (!IS_ERR(kthread_run(target_recovery_thread,
2910                                 lut, "tgt_recover_%d", index))) {
2911                 wait_for_completion(&trd->trd_starting);
2912                 LASSERT(obd->obd_recovering != 0);
2913         } else {
2914                 rc = -ECHILD;
2915         }
2916
2917         return rc;
2918 }
2919
2920 void target_stop_recovery_thread(struct obd_device *obd)
2921 {
2922         if (obd->obd_recovery_data.trd_processing_task > 0) {
2923                 struct target_recovery_data *trd = &obd->obd_recovery_data;
2924                 /** recovery can be done but postrecovery is not yet */
2925                 spin_lock(&obd->obd_dev_lock);
2926                 if (obd->obd_recovering) {
2927                         CERROR("%s: Aborting recovery\n", obd->obd_name);
2928                         obd->obd_abort_recovery = 1;
2929                         wake_up(&obd->obd_next_transno_waitq);
2930                 }
2931                 spin_unlock(&obd->obd_dev_lock);
2932                 wait_for_completion(&trd->trd_finishing);
2933         }
2934 }
2935 EXPORT_SYMBOL(target_stop_recovery_thread);
2936
/*
 * Tear down recovery for \a obd.
 *
 * The three steps run strictly in this order: disconnect the exports of
 * the device, stop the recovery thread (waiting for it if one is running),
 * then release whatever target_cleanup_recovery() cleans up.
 */
void target_recovery_fini(struct obd_device *obd)
{
        /* 1. drop the exports */
        class_disconnect_exports(obd);
        /* 2. abort and wait for the recovery thread */
        target_stop_recovery_thread(obd);
        /* 3. clean up the remaining recovery state */
        target_cleanup_recovery(obd);
}
EXPORT_SYMBOL(target_recovery_fini);
2944
2945 static enum hrtimer_restart target_recovery_expired(struct hrtimer *timer)
2946 {
2947         struct obd_device *obd = container_of(timer, struct obd_device,
2948                                               obd_recovery_timer);
2949
2950         CDEBUG(D_HA,
2951                "%s: recovery timed out; %d clients are still in recovery after %llu seconds (%d clients connected)\n",
2952                obd->obd_name, atomic_read(&obd->obd_lock_replay_clients),
2953                ktime_get_seconds() - obd->obd_recovery_start,
2954                atomic_read(&obd->obd_connected_clients));
2955
2956         obd->obd_recovery_expired = 1;
2957         wake_up(&obd->obd_next_transno_waitq);
2958         return HRTIMER_NORESTART;
2959 }
2960
2961 void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
2962 {
2963         struct obd_device *obd = lut->lut_obd;
2964
2965         if (lut->lut_bottom->dd_rdonly)
2966                 return;
2967
2968         if (atomic_read(&obd->obd_max_recoverable_clients) == 0) {
2969                 /** Update server last boot epoch */
2970                 tgt_boot_epoch_update(lut);
2971                 return;
2972         }
2973
2974         CDEBUG(D_HA, "RECOVERY: service %s, %d recoverable clients, "
2975                "last_transno %llu\n", obd->obd_name,
2976                atomic_read(&obd->obd_max_recoverable_clients),
2977                obd->obd_last_committed);
2978         LASSERT(obd->obd_stopping == 0);
2979         obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
2980         obd->obd_recovery_start = 0;
2981         obd->obd_recovery_end = 0;
2982
2983         hrtimer_init(&obd->obd_recovery_timer, CLOCK_MONOTONIC,
2984                      HRTIMER_MODE_ABS);
2985         obd->obd_recovery_timer.function = &target_recovery_expired;
2986         target_start_recovery_thread(lut, handler);
2987 }
2988 EXPORT_SYMBOL(target_recovery_init);
2989
2990 static int target_process_req_flags(struct obd_device *obd,
2991                                     struct ptlrpc_request *req)
2992 {
2993         struct obd_export *exp = req->rq_export;
2994
2995         LASSERT(exp != NULL);
2996         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) {
2997                 /* client declares he's ready to replay locks */
2998                 spin_lock(&exp->exp_lock);
2999                 if (exp->exp_req_replay_needed) {
3000                         exp->exp_req_replay_needed = 0;
3001                         spin_unlock(&exp->exp_lock);
3002
3003                         LASSERT(atomic_read(&(obd)->obd_req_replay_clients) >
3004                                 0);
3005                         atomic_dec(&obd->obd_req_replay_clients);
3006                 } else {
3007                         spin_unlock(&exp->exp_lock);
3008                 }
3009         }
3010         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) {
3011                 /*
3012                  * client declares he's ready to complete recovery
3013                  * so, we put the request on th final queue
3014                  */
3015                 spin_lock(&exp->exp_lock);
3016                 if (exp->exp_lock_replay_needed) {
3017                         exp->exp_lock_replay_needed = 0;
3018                         spin_unlock(&exp->exp_lock);
3019
3020                         LASSERT(atomic_read(&(obd)->obd_lock_replay_clients) >
3021                                 0);
3022                         atomic_dec(&obd->obd_lock_replay_clients);
3023                 } else {
3024                         spin_unlock(&exp->exp_lock);
3025                 }
3026         }
3027         return 0;
3028 }
3029
3030 int target_queue_recovery_request(struct ptlrpc_request *req,
3031                                   struct obd_device *obd)
3032 {
3033         __u64 transno = lustre_msg_get_transno(req->rq_reqmsg);
3034         struct ptlrpc_request *reqiter;
3035         int inserted = 0;
3036
3037         ENTRY;
3038
3039         if (obd->obd_recovery_data.trd_processing_task == current->pid) {
3040                 /* Processing the queue right now, don't re-add. */
3041                 RETURN(1);
3042         }
3043
3044         target_process_req_flags(obd, req);
3045
3046         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) {
3047                 if (unlikely(CFS_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) {
3048                         if (cfs_fail_val == 1) {
3049                                 cfs_race_state = 1;
3050                                 cfs_fail_val = 0;
3051                                 wake_up(&cfs_race_waitq);
3052
3053                                 schedule_timeout_interruptible(
3054                                         cfs_time_seconds(1));
3055                         }
3056                 }
3057
3058                 /*
3059                  * client declares he's ready to complete recovery
3060                  * so, we put the request on th final queue
3061                  */
3062                 target_request_copy_get(req);
3063                 DEBUG_REQ(D_HA, req, "queue final req");
3064                 wake_up(&obd->obd_next_transno_waitq);
3065                 spin_lock(&obd->obd_recovery_task_lock);
3066                 if (obd->obd_recovering) {
3067                         struct ptlrpc_request *tmp;
3068                         struct ptlrpc_request *duplicate = NULL;
3069
3070                         if (likely(!req->rq_export->exp_replay_done)) {
3071                                 req->rq_export->exp_replay_done = 1;
3072                                 list_add_tail(&req->rq_list,
3073                                               &obd->obd_final_req_queue);
3074                                 spin_unlock(&obd->obd_recovery_task_lock);
3075                                 RETURN(0);
3076                         }
3077
3078                         /*
3079                          * XXX O(n), but only happens if final ping is
3080                          * timed out, probably reorganize the list as
3081                          * a hash list later
3082                          */
3083                         list_for_each_entry_safe(reqiter, tmp,
3084                                                  &obd->obd_final_req_queue,
3085                                                  rq_list) {
3086                                 if (reqiter->rq_export == req->rq_export) {
3087                                         list_del_init(&reqiter->rq_list);
3088                                         duplicate = reqiter;
3089                                         break;
3090                                 }
3091                         }
3092
3093                         list_add_tail(&req->rq_list,
3094                                       &obd->obd_final_req_queue);
3095                         req->rq_export->exp_replay_done = 1;
3096                         spin_unlock(&obd->obd_recovery_task_lock);
3097
3098                         if (duplicate != NULL) {
3099                                 DEBUG_REQ(D_HA, duplicate,
3100                                           "put prev final req");
3101                                 target_request_copy_put(duplicate);
3102                         }
3103                         RETURN(0);
3104                 } else {
3105                         spin_unlock(&obd->obd_recovery_task_lock);
3106                         target_request_copy_put(req);
3107                         RETURN(obd->obd_stopping ? -ENOTCONN : 1);
3108                 }
3109         }
3110         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) {
3111                 /* client declares he's ready to replay locks */
3112                 target_request_copy_get(req);
3113                 DEBUG_REQ(D_HA, req, "queue lock replay req");
3114                 wake_up(&obd->obd_next_transno_waitq);
3115                 spin_lock(&obd->obd_recovery_task_lock);
3116                 LASSERT(obd->obd_recovering);
3117                 /* usually due to recovery abort */
3118                 if (!req->rq_export->exp_in_recovery) {
3119                         spin_unlock(&obd->obd_recovery_task_lock);
3120                         target_request_copy_put(req);
3121                         RETURN(-ENOTCONN);
3122                 }
3123                 LASSERT(req->rq_export->exp_lock_replay_needed);
3124                 list_add_tail(&req->rq_list, &obd->obd_lock_replay_queue);
3125                 spin_unlock(&obd->obd_recovery_task_lock);
3126                 RETURN(0);
3127         }
3128
3129         /*
3130          * CAVEAT EMPTOR: The incoming request message has been swabbed
3131          * (i.e. buflens etc are in my own byte order), but type-dependent
3132          * buffers (eg mdt_body, ost_body etc) have NOT been swabbed.
3133          */
3134
3135         if (!transno) {
3136                 INIT_LIST_HEAD(&req->rq_list);
3137                 DEBUG_REQ(D_HA, req, "not queueing");
3138                 RETURN(1);
3139         }
3140
3141         /*
3142          * If we're processing the queue, we want don't want to queue this
3143          * message.
3144          *
3145          * Also, if this request has a transno less than the one we're waiting
3146          * for, we should process it now.  It could (and currently always will)
3147          * be an open request for a descriptor that was opened some time ago.
3148          *
3149          * Also, a resent, replayed request that has already been
3150          * handled will pass through here and be processed immediately.
3151          */
3152         CDEBUG(D_HA,
3153                "Next recovery transno: %llu, current: %llu, replaying\n",
3154                obd->obd_next_recovery_transno, transno);
3155
3156         /*
3157          * If the request has been replayed by update replay, then sends this
3158          * request to the recovery thread (replay_request_or_update()), where
3159          * it will be handled
3160          */
3161         spin_lock(&obd->obd_recovery_task_lock);
3162         if (transno < obd->obd_next_recovery_transno &&
3163             !is_req_replayed_by_update(req)) {
3164                 /* Processing the queue right now, don't re-add. */
3165                 LASSERT(list_empty(&req->rq_list));
3166                 spin_unlock(&obd->obd_recovery_task_lock);
3167                 RETURN(1);
3168         }
3169         spin_unlock(&obd->obd_recovery_task_lock);
3170
3171         if (CFS_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_DROP))
3172                 RETURN(0);
3173
3174         target_request_copy_get(req);
3175         if (!req->rq_export->exp_in_recovery) {
3176                 target_request_copy_put(req);
3177                 RETURN(-ENOTCONN);
3178         }
3179         LASSERT(req->rq_export->exp_req_replay_needed);
3180
3181         if (target_exp_enqueue_req_replay(req)) {
3182                 DEBUG_REQ(D_ERROR, req, "dropping resent queued req");
3183                 target_request_copy_put(req);
3184                 RETURN(0);
3185         }
3186
3187         /* XXX O(n^2) */
3188         spin_lock(&obd->obd_recovery_task_lock);
3189         LASSERT(obd->obd_recovering);
3190         list_for_each_entry(reqiter, &obd->obd_req_replay_queue, rq_list) {
3191                 if (lustre_msg_get_transno(reqiter->rq_reqmsg) > transno) {
3192                         list_add_tail(&req->rq_list, &reqiter->rq_list);
3193                         inserted = 1;
3194                         goto added;
3195                 }
3196
3197                 if (unlikely(lustre_msg_get_transno(reqiter->rq_reqmsg) ==
3198                              transno)) {
3199                         DEBUG_REQ(D_ERROR, req,
3200                                   "dropping replay: transno has been claimed by another client");
3201                         spin_unlock(&obd->obd_recovery_task_lock);
3202                         target_exp_dequeue_req_replay(req);
3203                         target_request_copy_put(req);
3204                         RETURN(0);
3205                 }
3206         }
3207 added:
3208         if (!inserted)
3209                 list_add_tail(&req->rq_list, &obd->obd_req_replay_queue);
3210
3211         obd->obd_requests_queued_for_recovery++;
3212         spin_unlock(&obd->obd_recovery_task_lock);
3213         wake_up(&obd->obd_next_transno_waitq);
3214         RETURN(0);
3215 }
3216
3217 void target_committed_to_req(struct ptlrpc_request *req)
3218 {
3219         struct obd_export *exp = req->rq_export;
3220
3221         if (!exp->exp_obd->obd_no_transno && req->rq_repmsg != NULL)
3222                 lustre_msg_set_last_committed(req->rq_repmsg,
3223                                               exp->exp_last_committed);
3224         else
3225                 DEBUG_REQ(D_IOCTL, req,
3226                           "not sending last_committed update (%d/%d)",
3227                           exp->exp_obd->obd_no_transno,
3228                           req->rq_repmsg == NULL);
3229
3230         CDEBUG(D_INFO, "last_committed %llu, transno %llu, xid %llu\n",
3231                exp->exp_last_committed, req->rq_transno, req->rq_xid);
3232 }
3233
3234 #endif /* HAVE_SERVER_SUPPORT */
3235
3236 /**
3237  * Packs current SLV and Limit into \a req.
3238  */
3239 int target_pack_pool_reply(struct ptlrpc_request *req)
3240 {
3241         struct obd_device *obd;
3242
3243         ENTRY;
3244
3245         /*
3246          * Check that we still have all structures alive as this may
3247          * be some late RPC at shutdown time.
3248          */
3249         if (unlikely(!req->rq_export || !req->rq_export->exp_obd ||
3250                      !exp_connect_lru_resize(req->rq_export))) {
3251                 lustre_msg_set_slv(req->rq_repmsg, 0);
3252                 lustre_msg_set_limit(req->rq_repmsg, 0);
3253                 RETURN(0);
3254         }
3255
3256         /* OBD is alive here as export is alive, which we checked above. */
3257         obd = req->rq_export->exp_obd;
3258
3259         read_lock(&obd->obd_pool_lock);
3260         lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv);
3261         lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit);
3262         read_unlock(&obd->obd_pool_lock);
3263
3264         RETURN(0);
3265 }
3266
3267 static int target_send_reply_msg(struct ptlrpc_request *req,
3268                                  int rc, int fail_id)
3269 {
3270         if (CFS_FAIL_CHECK_ORSET(fail_id & ~CFS_FAIL_ONCE, CFS_FAIL_ONCE)) {
3271                 DEBUG_REQ(D_ERROR, req, "dropping reply");
3272                 return -ECOMM;
3273         }
3274         /*
3275          * We can have a null rq_reqmsg in the event of bad signature or
3276          * no context when unwrapping
3277          */
3278         if (req->rq_reqmsg &&
3279             unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT &&
3280             CFS_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET_REP)))
3281                 return -ECOMM;
3282
3283         if (unlikely(rc)) {
3284                 DEBUG_REQ(D_NET, req, "processing error (%d)", rc);
3285                 req->rq_status = rc;
3286                 return ptlrpc_send_error(req, 1);
3287         }
3288         DEBUG_REQ(D_NET, req, "sending reply");
3289
3290         return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT);
3291 }
3292
3293 void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
3294 {
3295         struct ptlrpc_service_part *svcpt;
3296         int netrc;
3297         struct ptlrpc_reply_state *rs;
3298         struct obd_export *exp;
3299
3300         ENTRY;
3301
3302         if (req->rq_no_reply) {
3303                 EXIT;
3304                 return;
3305         }
3306
3307         svcpt = req->rq_rqbd->rqbd_svcpt;
3308         rs = req->rq_reply_state;
3309         if (rs == NULL || !rs->rs_difficult) {
3310                 /* no notifiers */
3311                 target_send_reply_msg(req, rc, fail_id);
3312                 EXIT;
3313                 return;
3314         }
3315
3316         /* must be an export if locks saved */
3317         LASSERT(req->rq_export != NULL);
3318         /* req/reply consistent */
3319         LASSERT(rs->rs_svcpt == svcpt);
3320
3321         /* "fresh" reply */
3322         LASSERT(!rs->rs_scheduled);
3323         LASSERT(!rs->rs_scheduled_ever);
3324         LASSERT(!rs->rs_handled);
3325         LASSERT(!rs->rs_sent);
3326         LASSERT(!rs->rs_unlinked);
3327         LASSERT(rs->rs_export == NULL);
3328         LASSERT(list_empty(&rs->rs_obd_list));
3329         LASSERT(list_empty(&rs->rs_exp_list));
3330
3331         exp = class_export_get(req->rq_export);
3332
3333         /* disable reply scheduling while I'm setting up */
3334         rs->rs_scheduled = 1;
3335         rs->rs_sent      = 0;
3336         rs->rs_unlinked  = 0;
3337         rs->rs_xid       = req->rq_xid;
3338         rs->rs_transno   = req->rq_transno;
3339         rs->rs_export    = exp;
3340         rs->rs_opc       = lustre_msg_get_opc(req->rq_reqmsg);
3341
3342         spin_lock(&exp->exp_uncommitted_replies_lock);
3343         CDEBUG(D_NET, "rs transno = %llu, last committed = %llu\n",
3344                rs->rs_transno, exp->exp_last_committed);
3345         if (rs->rs_transno > exp->exp_last_committed) {
3346                 /* not committed already */
3347                 list_add_tail(&rs->rs_obd_list,
3348                                   &exp->exp_uncommitted_replies);
3349         }
3350         spin_unlock(&exp->exp_uncommitted_replies_lock);
3351
3352         spin_lock(&exp->exp_lock);
3353         list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies);
3354         spin_unlock(&exp->exp_lock);
3355
3356         netrc = target_send_reply_msg(req, rc, fail_id);
3357
3358         spin_lock(&svcpt->scp_rep_lock);
3359
3360         atomic_inc(&svcpt->scp_nreps_difficult);
3361
3362         if (netrc != 0) {
3363                 /*
3364                  * error sending: reply is off the net.  Also we need +1
3365                  * reply ref until ptlrpc_handle_rs() is done
3366                  * with the reply state (if the send was successful, there
3367                  * would have been +1 ref for the net, which
3368                  * reply_out_callback leaves alone)
3369                  */
3370                 rs->rs_sent = 1;
3371                 rs->rs_unlinked = 1;
3372                 ptlrpc_rs_addref(rs);
3373         }
3374
3375         spin_lock(&rs->rs_lock);
3376         if (rs->rs_transno <= exp->exp_last_committed ||
3377             (rs->rs_unlinked && !rs->rs_no_ack) ||
3378             list_empty(&rs->rs_exp_list) ||     /* completed already */
3379             list_empty(&rs->rs_obd_list)) {
3380                 CDEBUG(D_HA, "Schedule reply immediately\n");
3381                 ptlrpc_dispatch_difficult_reply(rs);
3382         } else {
3383                 list_add(&rs->rs_list, &svcpt->scp_rep_active);
3384                 rs->rs_scheduled = 0;   /* allow notifier to schedule */
3385         }
3386         spin_unlock(&rs->rs_lock);
3387         spin_unlock(&svcpt->scp_rep_lock);
3388         EXIT;
3389 }
3390
3391 enum ldlm_mode lck_compat_array[] = {
3392         [LCK_EX]    = LCK_COMPAT_EX,
3393         [LCK_PW]    = LCK_COMPAT_PW,
3394         [LCK_PR]    = LCK_COMPAT_PR,
3395         [LCK_CW]    = LCK_COMPAT_CW,
3396         [LCK_CR]    = LCK_COMPAT_CR,
3397         [LCK_NL]    = LCK_COMPAT_NL,
3398         [LCK_GROUP] = LCK_COMPAT_GROUP,
3399         [LCK_COS]   = LCK_COMPAT_COS,
3400         [LCK_TXN]   = LCK_COMPAT_TXN,
3401 };
3402
3403 /**
3404  * Rather arbitrary mapping from LDLM error codes to errno values. This should
3405  * not escape to the user level.
3406  */
3407 int ldlm_error2errno(enum ldlm_error error)
3408 {
3409         int result;
3410
3411         switch (error) {
3412         case ELDLM_OK:
3413         case ELDLM_LOCK_MATCHED:
3414                 result = 0;
3415                 break;
3416         case ELDLM_LOCK_CHANGED:
3417                 result = -ESTALE;
3418                 break;
3419         case ELDLM_LOCK_ABORTED:
3420                 result = -ENAVAIL;
3421                 break;
3422         case ELDLM_LOCK_REPLACED:
3423                 result = -ESRCH;
3424                 break;
3425         case ELDLM_NO_LOCK_DATA:
3426                 result = -ENOENT;
3427                 break;
3428         case ELDLM_NAMESPACE_EXISTS:
3429                 result = -EEXIST;
3430                 break;
3431         case ELDLM_BAD_NAMESPACE:
3432                 result = -EBADF;
3433                 break;
3434         default:
3435                 if (((int)error) < 0) { /* cast to signed type */
3436                         result = error; /* as ldlm_error can be unsigned */
3437                 } else {
3438                         CERROR("Invalid DLM result code: %d\n", error);
3439                         result = -EPROTO;
3440                 }
3441         }
3442         return result;
3443 }
3444 EXPORT_SYMBOL(ldlm_error2errno);
3445
3446 /**
3447  * Dual to ldlm_error2errno(): maps errno values back to enum ldlm_error.
3448  */
3449 enum ldlm_error ldlm_errno2error(int err_no)
3450 {
3451         int error;
3452
3453         switch (err_no) {
3454         case 0:
3455                 error = ELDLM_OK;
3456                 break;
3457         case -ESTALE:
3458                 error = ELDLM_LOCK_CHANGED;
3459                 break;
3460         case -ENAVAIL:
3461                 error = ELDLM_LOCK_ABORTED;
3462                 break;
3463         case -ESRCH:
3464                 error = ELDLM_LOCK_REPLACED;
3465                 break;
3466         case -ENOENT:
3467                 error = ELDLM_NO_LOCK_DATA;
3468                 break;
3469         case -EEXIST:
3470                 error = ELDLM_NAMESPACE_EXISTS;
3471                 break;
3472         case -EBADF:
3473                 error = ELDLM_BAD_NAMESPACE;
3474                 break;
3475         default:
3476                 error = err_no;
3477         }
3478         return error;
3479 }
3480
3481 #if LUSTRE_TRACKS_LOCK_EXP_REFS
3482 void ldlm_dump_export_locks(struct obd_export *exp)
3483 {
3484         spin_lock(&exp->exp_locks_list_guard);
3485         if (!list_empty(&exp->exp_locks_list)) {
3486                 struct ldlm_lock *lock;
3487
3488                 CERROR("dumping locks for export %p, ignore if the unmount doesn't hang\n",
3489                        exp);
3490                 list_for_each_entry(lock, &exp->exp_locks_list,
3491                                         l_exp_refs_link)
3492                         LDLM_ERROR(lock, "lock:");
3493         }
3494         spin_unlock(&exp->exp_locks_list_guard);
3495 }
3496 #endif
3497
3498 #ifdef HAVE_SERVER_SUPPORT
3499 static inline const char *bulk2type(struct ptlrpc_request *req)
3500 {
3501         if (req->rq_bulk_read)
3502                 return "READ";
3503         if (req->rq_bulk_write)
3504                 return "WRITE";
3505         return "UNKNOWN";
3506 }
3507
3508 int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc)
3509 {
3510         struct ptlrpc_request *req = desc->bd_req;
3511         time64_t start = ktime_get_seconds();
3512         time64_t deadline;
3513         int rc = 0;
3514
3515         ENTRY;
3516
3517         /* Check if client was evicted or reconnected already. */
3518         if (exp->exp_failed ||
3519             exp->exp_conn_cnt > lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
3520                 rc = -ENOTCONN;
3521         } else {
3522                 if (req->rq_bulk_read)
3523                         rc = sptlrpc_svc_wrap_bulk(req, desc);
3524
3525                 if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS))
3526                         req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg);
3527                 else /* old version, bulk matchbits is rq_xid */
3528                         req->rq_mbits = req->rq_xid;
3529
3530                 if (rc == 0)
3531                         rc = ptlrpc_start_bulk_transfer(desc);
3532         }
3533
3534         if (rc < 0) {
3535                 DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc = %d",
3536                           bulk2type(req), rc);
3537                 RETURN(rc);
3538         }
3539
3540         if (CFS_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) {
3541                 ptlrpc_abort_bulk(desc);
3542                 RETURN(0);
3543         }
3544
3545         /* limit actual bulk transfer to bulk_timeout seconds */
3546         deadline = start + bulk_timeout;
3547         if (deadline > req->rq_deadline)
3548                 deadline = req->rq_deadline;
3549
3550         do {
3551                 time64_t timeoutl = deadline - ktime_get_seconds();
3552                 time64_t rq_deadline;
3553
3554                 while (timeoutl >= 0 &&
3555                        wait_event_idle_timeout(
3556                                desc->bd_waitq,
3557                                !ptlrpc_server_bulk_active(desc) ||
3558                                exp->exp_failed ||
3559                                exp->exp_conn_cnt >
3560                                lustre_msg_get_conn_cnt(req->rq_reqmsg),
3561                                timeoutl ? cfs_time_seconds(1) : 1) == 0)
3562                         timeoutl -= 1;
3563                 rc = timeoutl < 0 ? -ETIMEDOUT : 0;
3564
3565                 /* Wait again if we changed rq_deadline. */
3566                 rq_deadline = READ_ONCE(req->rq_deadline);
3567                 deadline = start + bulk_timeout;
3568                 if (deadline > rq_deadline)
3569                         deadline = rq_deadline;
3570         } while (rc == -ETIMEDOUT &&
3571                  deadline > ktime_get_seconds());
3572
3573         if (rc == -ETIMEDOUT) {
3574                 DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds",
3575                           bulk2type(req), deadline - start,
3576                           ktime_get_real_seconds() - deadline);
3577                 ptlrpc_abort_bulk(desc);
3578         } else if (exp->exp_failed) {
3579                 DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",
3580                           bulk2type(req));
3581                 rc = -ENOTCONN;
3582                 ptlrpc_abort_bulk(desc);
3583         } else if (exp->exp_conn_cnt >
3584                    lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
3585                 DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s",
3586                           bulk2type(req));
3587                 /* We don't reply anyway. */
3588                 rc = -ETIMEDOUT;
3589                 ptlrpc_abort_bulk(desc);
3590         } else if (desc->bd_failure) {
3591                 DEBUG_REQ(D_ERROR, req, "network error on bulk %s",
3592                           bulk2type(req));
3593                 /* XXX should this be a different errno? */
3594                 rc = -ETIMEDOUT;
3595         } else {
3596                 if (req->rq_bulk_write)
3597                         rc = sptlrpc_svc_unwrap_bulk(req, desc);
3598                 if (rc == 0 && desc->bd_nob_transferred != desc->bd_nob) {
3599                         DEBUG_REQ(D_ERROR, req, "truncated bulk %s %d(%d)",
3600                                   bulk2type(req), desc->bd_nob_transferred,
3601                                   desc->bd_nob);
3602                         /* XXX should this be a different errno? */
3603                         rc = -ETIMEDOUT;
3604                 }
3605         }
3606
3607         RETURN(rc);
3608 }
3609 EXPORT_SYMBOL(target_bulk_io);
3610
3611 #endif /* HAVE_SERVER_SUPPORT */