Whamcloud - gitweb
03e61642146965a73feee17bea9a7e52364acda0
[fs/lustre-release.git] / lustre / ptlrpc / client.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2011, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  */
32
33 /** Implementation of client-side PortalRPC interfaces */
34
35 #define DEBUG_SUBSYSTEM S_RPC
36
37 #include <linux/delay.h>
38 #include <obd_support.h>
39 #include <obd_class.h>
40 #include <lustre_lib.h>
41 #include <lustre_ha.h>
42 #include <lustre_import.h>
43 #include <lustre_req_layout.h>
44
45 #include "ptlrpc_internal.h"
46
/*
 * Fragment operations for page (kiov) based bulk descriptors that use the
 * "_pin" add/release callbacks.
 */
const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops = {
        .add_kiov_frag  = ptlrpc_prep_bulk_page_pin,
        .release_frags  = ptlrpc_release_bulk_page_pin,
};
EXPORT_SYMBOL(ptlrpc_bulk_kiov_pin_ops);

/*
 * Fragment operations for page (kiov) based bulk descriptors that use the
 * "_nopin" add callback; the release callback is presumably a no-op, going
 * by its name.
 */
const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops = {
        .add_kiov_frag  = ptlrpc_prep_bulk_page_nopin,
        .release_frags  = ptlrpc_release_bulk_noop,
};
EXPORT_SYMBOL(ptlrpc_bulk_kiov_nopin_ops);

/*
 * Fragment operations for kvec based bulk descriptors. No release_frags
 * callback is provided; ptlrpc_free_bulk() skips the call when it is NULL.
 */
const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kvec_ops = {
        .add_iov_frag = ptlrpc_prep_bulk_frag,
};
EXPORT_SYMBOL(ptlrpc_bulk_kvec_ops);
63
/* Forward declarations of static helpers used before their definitions. */
static int ptlrpc_send_new_req(struct ptlrpc_request *req);
static int ptlrpcd_check_work(struct ptlrpc_request *req);
static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async);
67
68 /**
69  * Initialize passed in client structure \a cl.
70  */
71 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
72                         struct ptlrpc_client *cl)
73 {
74         cl->cli_request_portal = req_portal;
75         cl->cli_reply_portal   = rep_portal;
76         cl->cli_name           = name;
77 }
78 EXPORT_SYMBOL(ptlrpc_init_client);
79
80 /**
81  * Return PortalRPC connection for remore uud \a uuid
82  */
83 struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid,
84                                                     lnet_nid_t nid4refnet)
85 {
86         struct ptlrpc_connection *c;
87         lnet_nid_t self;
88         struct lnet_process_id peer;
89         int err;
90
91         /*
92          * ptlrpc_uuid_to_peer() initializes its 2nd parameter
93          * before accessing its values.
94          */
95         /* coverity[uninit_use_in_call] */
96         peer.nid = nid4refnet;
97         err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
98         if (err != 0) {
99                 CNETERR("cannot find peer %s!\n", uuid->uuid);
100                 return NULL;
101         }
102
103         c = ptlrpc_connection_get(peer, self, uuid);
104         if (c) {
105                 memcpy(c->c_remote_uuid.uuid,
106                        uuid->uuid, sizeof(c->c_remote_uuid.uuid));
107         }
108
109         CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);
110
111         return c;
112 }
113
114 /**
115  * Allocate and initialize new bulk descriptor on the sender.
116  * Returns pointer to the descriptor or NULL on error.
117  */
118 struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned int nfrags,
119                                          unsigned int max_brw,
120                                          enum ptlrpc_bulk_op_type type,
121                                          unsigned int portal,
122                                          const struct ptlrpc_bulk_frag_ops *ops)
123 {
124         struct ptlrpc_bulk_desc *desc;
125         int i;
126
127         /* ensure that only one of KIOV or IOVEC is set but not both */
128         LASSERT((ptlrpc_is_bulk_desc_kiov(type) &&
129                  ops->add_kiov_frag != NULL) ||
130                 (ptlrpc_is_bulk_desc_kvec(type) &&
131                  ops->add_iov_frag != NULL));
132
133         OBD_ALLOC_PTR(desc);
134         if (!desc)
135                 return NULL;
136         if (type & PTLRPC_BULK_BUF_KIOV) {
137                 OBD_ALLOC_LARGE(GET_KIOV(desc),
138                                 nfrags * sizeof(*GET_KIOV(desc)));
139                 if (!GET_KIOV(desc))
140                         goto out;
141         } else {
142                 OBD_ALLOC_LARGE(GET_KVEC(desc),
143                                 nfrags * sizeof(*GET_KVEC(desc)));
144                 if (!GET_KVEC(desc))
145                         goto out;
146         }
147
148         spin_lock_init(&desc->bd_lock);
149         init_waitqueue_head(&desc->bd_waitq);
150         desc->bd_max_iov = nfrags;
151         desc->bd_iov_count = 0;
152         desc->bd_portal = portal;
153         desc->bd_type = type;
154         desc->bd_md_count = 0;
155         desc->bd_frag_ops = (struct ptlrpc_bulk_frag_ops *)ops;
156         LASSERT(max_brw > 0);
157         desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
158         /*
159          * PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
160          * node. Negotiated ocd_brw_size will always be <= this number.
161          */
162         for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
163                 LNetInvalidateMDHandle(&desc->bd_mds[i]);
164
165         return desc;
166 out:
167         OBD_FREE_PTR(desc);
168         return NULL;
169 }
170
171 /**
172  * Prepare bulk descriptor for specified outgoing request \a req that
173  * can fit \a nfrags * pages. \a type is bulk type. \a portal is where
174  * the bulk to be sent. Used on client-side.
175  * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on
176  * error.
177  */
178 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
179                                               unsigned int nfrags,
180                                               unsigned int max_brw,
181                                               unsigned int type,
182                                               unsigned int portal,
183                                               const struct ptlrpc_bulk_frag_ops
184                                                 *ops)
185 {
186         struct obd_import *imp = req->rq_import;
187         struct ptlrpc_bulk_desc *desc;
188
189         ENTRY;
190         LASSERT(ptlrpc_is_bulk_op_passive(type));
191
192         desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops);
193         if (!desc)
194                 RETURN(NULL);
195
196         desc->bd_import_generation = req->rq_import_generation;
197         desc->bd_import = class_import_get(imp);
198         desc->bd_req = req;
199
200         desc->bd_cbid.cbid_fn  = client_bulk_callback;
201         desc->bd_cbid.cbid_arg = desc;
202
203         /* This makes req own desc, and free it when she frees herself */
204         req->rq_bulk = desc;
205
206         return desc;
207 }
208 EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
209
210 void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
211                              struct page *page, int pageoffset, int len,
212                              int pin)
213 {
214         lnet_kiov_t *kiov;
215
216         LASSERT(desc->bd_iov_count < desc->bd_max_iov);
217         LASSERT(page != NULL);
218         LASSERT(pageoffset >= 0);
219         LASSERT(len > 0);
220         LASSERT(pageoffset + len <= PAGE_SIZE);
221         LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
222
223         kiov = &BD_GET_KIOV(desc, desc->bd_iov_count);
224
225         desc->bd_nob += len;
226
227         if (pin)
228                 get_page(page);
229
230         kiov->kiov_page = page;
231         kiov->kiov_offset = pageoffset;
232         kiov->kiov_len = len;
233
234         desc->bd_iov_count++;
235 }
236 EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
237
238 int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc,
239                           void *frag, int len)
240 {
241         struct kvec *iovec;
242
243         ENTRY;
244
245         LASSERT(desc->bd_iov_count < desc->bd_max_iov);
246         LASSERT(frag != NULL);
247         LASSERT(len > 0);
248         LASSERT(ptlrpc_is_bulk_desc_kvec(desc->bd_type));
249
250         iovec = &BD_GET_KVEC(desc, desc->bd_iov_count);
251
252         desc->bd_nob += len;
253
254         iovec->iov_base = frag;
255         iovec->iov_len = len;
256
257         desc->bd_iov_count++;
258
259         RETURN(desc->bd_nob);
260 }
261 EXPORT_SYMBOL(ptlrpc_prep_bulk_frag);
262
263 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
264 {
265         ENTRY;
266
267         LASSERT(desc != NULL);
268         LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
269         LASSERT(desc->bd_md_count == 0);         /* network hands off */
270         LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
271         LASSERT(desc->bd_frag_ops != NULL);
272
273         if (ptlrpc_is_bulk_desc_kiov(desc->bd_type))
274                 sptlrpc_enc_pool_put_pages(desc);
275
276         if (desc->bd_export)
277                 class_export_put(desc->bd_export);
278         else
279                 class_import_put(desc->bd_import);
280
281         if (desc->bd_frag_ops->release_frags != NULL)
282                 desc->bd_frag_ops->release_frags(desc);
283
284         if (ptlrpc_is_bulk_desc_kiov(desc->bd_type))
285                 OBD_FREE_LARGE(GET_KIOV(desc),
286                                desc->bd_max_iov * sizeof(*GET_KIOV(desc)));
287         else
288                 OBD_FREE_LARGE(GET_KVEC(desc),
289                                desc->bd_max_iov * sizeof(*GET_KVEC(desc)));
290         OBD_FREE_PTR(desc);
291         EXIT;
292 }
293 EXPORT_SYMBOL(ptlrpc_free_bulk);
294
295 /**
296  * Set server timelimit for this req, i.e. how long are we willing to wait
297  * for reply before timing out this request.
298  */
299 void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
300 {
301         __u32 serv_est;
302         int idx;
303         struct imp_at *at;
304
305         LASSERT(req->rq_import);
306
307         if (AT_OFF) {
308                 /* non-AT settings */
309                 /**
310                  * \a imp_server_timeout means this is reverse import and
311                  * we send (currently only) ASTs to the client and cannot afford
312                  * to wait too long for the reply, otherwise the other client
313                  * (because of which we are sending this request) would
314                  * timeout waiting for us
315                  */
316                 req->rq_timeout = req->rq_import->imp_server_timeout ?
317                                   obd_timeout / 2 : obd_timeout;
318         } else {
319                 at = &req->rq_import->imp_at;
320                 idx = import_at_get_index(req->rq_import,
321                                           req->rq_request_portal);
322                 serv_est = at_get(&at->iat_service_estimate[idx]);
323                 req->rq_timeout = at_est2timeout(serv_est);
324         }
325         /*
326          * We could get even fancier here, using history to predict increased
327          * loading...
328          */
329
330         /*
331          * Let the server know what this RPC timeout is by putting it in the
332          * reqmsg
333          */
334         lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
335 }
336 EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
337
338 /* Adjust max service estimate based on server value */
339 static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
340                                   unsigned int serv_est)
341 {
342         int idx;
343         unsigned int oldse;
344         struct imp_at *at;
345
346         LASSERT(req->rq_import);
347         at = &req->rq_import->imp_at;
348
349         idx = import_at_get_index(req->rq_import, req->rq_request_portal);
350         /*
351          * max service estimates are tracked on the server side,
352          * so just keep minimal history here
353          */
354         oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
355         if (oldse != 0)
356                 CDEBUG(D_ADAPTTO,
357                        "The RPC service estimate for %s ptl %d has changed from %d to %d\n",
358                        req->rq_import->imp_obd->obd_name,
359                        req->rq_request_portal,
360                        oldse, at_get(&at->iat_service_estimate[idx]));
361 }
362
363 /* Expected network latency per remote node (secs) */
364 int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
365 {
366         return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
367 }
368
369 /* Adjust expected network latency */
370 void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
371                                unsigned int service_time)
372 {
373         unsigned int nl, oldnl;
374         struct imp_at *at;
375         time64_t now = ktime_get_real_seconds();
376
377         LASSERT(req->rq_import);
378
379         if (service_time > now - req->rq_sent + 3) {
380                 /*
381                  * b=16408, however, this can also happen if early reply
382                  * is lost and client RPC is expired and resent, early reply
383                  * or reply of original RPC can still be fit in reply buffer
384                  * of resent RPC, now client is measuring time from the
385                  * resent time, but server sent back service time of original
386                  * RPC.
387                  */
388                 CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ?
389                        D_ADAPTTO : D_WARNING,
390                        "Reported service time %u > total measured time %lld\n",
391                        service_time, now - req->rq_sent);
392                 return;
393         }
394
395         /* Network latency is total time less server processing time */
396         nl = max_t(int, now - req->rq_sent -
397                         service_time, 0) + 1; /* st rounding */
398         at = &req->rq_import->imp_at;
399
400         oldnl = at_measured(&at->iat_net_latency, nl);
401         if (oldnl != 0)
402                 CDEBUG(D_ADAPTTO,
403                        "The network latency for %s (nid %s) has changed from %d to %d\n",
404                        req->rq_import->imp_obd->obd_name,
405                        obd_uuid2str(&req->rq_import->imp_connection->c_remote_uuid),
406                        oldnl, at_get(&at->iat_net_latency));
407 }
408
409 static int unpack_reply(struct ptlrpc_request *req)
410 {
411         int rc;
412
413         if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
414                 rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
415                 if (rc) {
416                         DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
417                         return -EPROTO;
418                 }
419         }
420
421         rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
422         if (rc) {
423                 DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
424                 return -EPROTO;
425         }
426         return 0;
427 }
428
429 /**
430  * Handle an early reply message, called with the rq_lock held.
431  * If anything goes wrong just ignore it - same as if it never happened
432  */
433 static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
434 __must_hold(&req->rq_lock)
435 {
436         struct ptlrpc_request *early_req;
437         time64_t olddl;
438         int rc;
439
440         ENTRY;
441         req->rq_early = 0;
442         spin_unlock(&req->rq_lock);
443
444         rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
445         if (rc) {
446                 spin_lock(&req->rq_lock);
447                 RETURN(rc);
448         }
449
450         rc = unpack_reply(early_req);
451         if (rc != 0) {
452                 sptlrpc_cli_finish_early_reply(early_req);
453                 spin_lock(&req->rq_lock);
454                 RETURN(rc);
455         }
456
457         /*
458          * Use new timeout value just to adjust the local value for this
459          * request, don't include it into at_history. It is unclear yet why
460          * service time increased and should it be counted or skipped, e.g.
461          * that can be recovery case or some error or server, the real reply
462          * will add all new data if it is worth to add.
463          */
464         req->rq_timeout = lustre_msg_get_timeout(early_req->rq_repmsg);
465         lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
466
467         /* Network latency can be adjusted, it is pure network delays */
468         ptlrpc_at_adj_net_latency(req,
469                                   lustre_msg_get_service_time(early_req->rq_repmsg));
470
471         sptlrpc_cli_finish_early_reply(early_req);
472
473         spin_lock(&req->rq_lock);
474         olddl = req->rq_deadline;
475         /*
476          * server assumes it now has rq_timeout from when the request
477          * arrived, so the client should give it at least that long.
478          * since we don't know the arrival time we'll use the original
479          * sent time
480          */
481         req->rq_deadline = req->rq_sent + req->rq_timeout +
482                            ptlrpc_at_get_net_latency(req);
483
484         DEBUG_REQ(D_ADAPTTO, req,
485                   "Early reply #%d, new deadline in %llds (%llds)",
486                   req->rq_early_count,
487                   req->rq_deadline - ktime_get_real_seconds(),
488                   req->rq_deadline - olddl);
489
490         RETURN(rc);
491 }
492
493 static struct kmem_cache *request_cache;
494
495 int ptlrpc_request_cache_init(void)
496 {
497         request_cache = kmem_cache_create("ptlrpc_cache",
498                                           sizeof(struct ptlrpc_request),
499                                           0, SLAB_HWCACHE_ALIGN, NULL);
500         return request_cache ? 0 : -ENOMEM;
501 }
502
503 void ptlrpc_request_cache_fini(void)
504 {
505         kmem_cache_destroy(request_cache);
506 }
507
508 struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags)
509 {
510         struct ptlrpc_request *req;
511
512         OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags);
513         return req;
514 }
515
516 void ptlrpc_request_cache_free(struct ptlrpc_request *req)
517 {
518         OBD_SLAB_FREE_PTR(req, request_cache);
519 }
520
521 /**
522  * Wind down request pool \a pool.
523  * Frees all requests from the pool too
524  */
525 void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
526 {
527         struct list_head *l, *tmp;
528         struct ptlrpc_request *req;
529
530         LASSERT(pool != NULL);
531
532         spin_lock(&pool->prp_lock);
533         list_for_each_safe(l, tmp, &pool->prp_req_list) {
534                 req = list_entry(l, struct ptlrpc_request, rq_list);
535                 list_del(&req->rq_list);
536                 LASSERT(req->rq_reqbuf);
537                 LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
538                 OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size);
539                 ptlrpc_request_cache_free(req);
540         }
541         spin_unlock(&pool->prp_lock);
542         OBD_FREE(pool, sizeof(*pool));
543 }
544 EXPORT_SYMBOL(ptlrpc_free_rq_pool);
545
546 /**
547  * Allocates, initializes and adds \a num_rq requests to the pool \a pool
548  */
549 int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
550 {
551         int i;
552         int size = 1;
553
554         while (size < pool->prp_rq_size)
555                 size <<= 1;
556
557         LASSERTF(list_empty(&pool->prp_req_list) ||
558                  size == pool->prp_rq_size,
559                  "Trying to change pool size with nonempty pool from %d to %d bytes\n",
560                  pool->prp_rq_size, size);
561
562         spin_lock(&pool->prp_lock);
563         pool->prp_rq_size = size;
564         for (i = 0; i < num_rq; i++) {
565                 struct ptlrpc_request *req;
566                 struct lustre_msg *msg;
567
568                 spin_unlock(&pool->prp_lock);
569                 req = ptlrpc_request_cache_alloc(GFP_NOFS);
570                 if (!req)
571                         return i;
572                 OBD_ALLOC_LARGE(msg, size);
573                 if (!msg) {
574                         ptlrpc_request_cache_free(req);
575                         return i;
576                 }
577                 req->rq_reqbuf = msg;
578                 req->rq_reqbuf_len = size;
579                 req->rq_pool = pool;
580                 spin_lock(&pool->prp_lock);
581                 list_add_tail(&req->rq_list, &pool->prp_req_list);
582         }
583         spin_unlock(&pool->prp_lock);
584         return num_rq;
585 }
586 EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
587
588 /**
589  * Create and initialize new request pool with given attributes:
590  * \a num_rq - initial number of requests to create for the pool
591  * \a msgsize - maximum message size possible for requests in thid pool
592  * \a populate_pool - function to be called when more requests need to be added
593  *                    to the pool
594  * Returns pointer to newly created pool or NULL on error.
595  */
596 struct ptlrpc_request_pool *
597 ptlrpc_init_rq_pool(int num_rq, int msgsize,
598                     int (*populate_pool)(struct ptlrpc_request_pool *, int))
599 {
600         struct ptlrpc_request_pool *pool;
601
602         OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool));
603         if (!pool)
604                 return NULL;
605
606         /*
607          * Request next power of two for the allocation, because internally
608          * kernel would do exactly this
609          */
610         spin_lock_init(&pool->prp_lock);
611         INIT_LIST_HEAD(&pool->prp_req_list);
612         pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
613         pool->prp_populate = populate_pool;
614
615         populate_pool(pool, num_rq);
616
617         return pool;
618 }
619 EXPORT_SYMBOL(ptlrpc_init_rq_pool);
620
621 /**
622  * Fetches one request from pool \a pool
623  */
624 static struct ptlrpc_request *
625 ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
626 {
627         struct ptlrpc_request *request;
628         struct lustre_msg *reqbuf;
629
630         if (!pool)
631                 return NULL;
632
633         spin_lock(&pool->prp_lock);
634
635         /*
636          * See if we have anything in a pool, and bail out if nothing,
637          * in writeout path, where this matters, this is safe to do, because
638          * nothing is lost in this case, and when some in-flight requests
639          * complete, this code will be called again.
640          */
641         if (unlikely(list_empty(&pool->prp_req_list))) {
642                 spin_unlock(&pool->prp_lock);
643                 return NULL;
644         }
645
646         request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
647                              rq_list);
648         list_del_init(&request->rq_list);
649         spin_unlock(&pool->prp_lock);
650
651         LASSERT(request->rq_reqbuf);
652         LASSERT(request->rq_pool);
653
654         reqbuf = request->rq_reqbuf;
655         memset(request, 0, sizeof(*request));
656         request->rq_reqbuf = reqbuf;
657         request->rq_reqbuf_len = pool->prp_rq_size;
658         request->rq_pool = pool;
659
660         return request;
661 }
662
663 /**
664  * Returns freed \a request to pool.
665  */
666 static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
667 {
668         struct ptlrpc_request_pool *pool = request->rq_pool;
669
670         spin_lock(&pool->prp_lock);
671         LASSERT(list_empty(&request->rq_list));
672         LASSERT(!request->rq_receiving_reply);
673         list_add_tail(&request->rq_list, &pool->prp_req_list);
674         spin_unlock(&pool->prp_lock);
675 }
676
677 void ptlrpc_add_unreplied(struct ptlrpc_request *req)
678 {
679         struct obd_import *imp = req->rq_import;
680         struct list_head *tmp;
681         struct ptlrpc_request *iter;
682
683         assert_spin_locked(&imp->imp_lock);
684         LASSERT(list_empty(&req->rq_unreplied_list));
685
686         /* unreplied list is sorted by xid in ascending order */
687         list_for_each_prev(tmp, &imp->imp_unreplied_list) {
688                 iter = list_entry(tmp, struct ptlrpc_request,
689                                   rq_unreplied_list);
690
691                 LASSERT(req->rq_xid != iter->rq_xid);
692                 if (req->rq_xid < iter->rq_xid)
693                         continue;
694                 list_add(&req->rq_unreplied_list, &iter->rq_unreplied_list);
695                 return;
696         }
697         list_add(&req->rq_unreplied_list, &imp->imp_unreplied_list);
698 }
699
700 void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req)
701 {
702         req->rq_xid = ptlrpc_next_xid();
703         ptlrpc_add_unreplied(req);
704 }
705
706 static inline void ptlrpc_assign_next_xid(struct ptlrpc_request *req)
707 {
708         spin_lock(&req->rq_import->imp_lock);
709         ptlrpc_assign_next_xid_nolock(req);
710         spin_unlock(&req->rq_import->imp_lock);
711 }
712
/*
 * File-scope xid state and the spinlock taken around updates to it. In the
 * code visible here it is written only by the OBD_FAIL_PTLRPC_ROUND_XID
 * fault-injection path of ptlrpc_request_bufs_pack().
 * NOTE(review): initialization of the lock is not visible in this part of
 * the file - confirm it is set up before first use.
 */
static __u64 ptlrpc_last_xid;
static spinlock_t ptlrpc_last_xid_lock;
715
716 int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
717                              __u32 version, int opcode, char **bufs,
718                              struct ptlrpc_cli_ctx *ctx)
719 {
720         int count;
721         struct obd_import *imp;
722         __u32 *lengths;
723         int rc;
724
725         ENTRY;
726
727         count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
728         imp = request->rq_import;
729         lengths = request->rq_pill.rc_area[RCL_CLIENT];
730
731         if (ctx) {
732                 request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
733         } else {
734                 rc = sptlrpc_req_get_ctx(request);
735                 if (rc)
736                         GOTO(out_free, rc);
737         }
738         sptlrpc_req_set_flavor(request, opcode);
739
740         rc = lustre_pack_request(request, imp->imp_msg_magic, count,
741                                  lengths, bufs);
742         if (rc)
743                 GOTO(out_ctx, rc);
744
745         lustre_msg_add_version(request->rq_reqmsg, version);
746         request->rq_send_state = LUSTRE_IMP_FULL;
747         request->rq_type = PTL_RPC_MSG_REQUEST;
748
749         request->rq_req_cbid.cbid_fn  = request_out_callback;
750         request->rq_req_cbid.cbid_arg = request;
751
752         request->rq_reply_cbid.cbid_fn  = reply_in_callback;
753         request->rq_reply_cbid.cbid_arg = request;
754
755         request->rq_reply_deadline = 0;
756         request->rq_bulk_deadline = 0;
757         request->rq_req_deadline = 0;
758         request->rq_phase = RQ_PHASE_NEW;
759         request->rq_next_phase = RQ_PHASE_UNDEFINED;
760
761         request->rq_request_portal = imp->imp_client->cli_request_portal;
762         request->rq_reply_portal = imp->imp_client->cli_reply_portal;
763
764         ptlrpc_at_set_req_timeout(request);
765
766         lustre_msg_set_opc(request->rq_reqmsg, opcode);
767
768         /* Let's setup deadline for req/reply/bulk unlink for opcode. */
769         if (cfs_fail_val == opcode) {
770                 time64_t *fail_t = NULL, *fail2_t = NULL;
771
772                 if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
773                         fail_t = &request->rq_bulk_deadline;
774                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
775                         fail_t = &request->rq_reply_deadline;
776                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) {
777                         fail_t = &request->rq_req_deadline;
778                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK)) {
779                         fail_t = &request->rq_reply_deadline;
780                         fail2_t = &request->rq_bulk_deadline;
781                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_ROUND_XID)) {
782                         time64_t now = ktime_get_real_seconds();
783                         spin_lock(&ptlrpc_last_xid_lock);
784                         ptlrpc_last_xid = ((__u64)now >> 4) << 24;
785                         spin_unlock(&ptlrpc_last_xid_lock);
786                 }
787
788                 if (fail_t) {
789                         *fail_t = ktime_get_real_seconds() + LONG_UNLINK;
790
791                         if (fail2_t)
792                                 *fail2_t = ktime_get_real_seconds() +
793                                            LONG_UNLINK;
794
795                         /*
796                          * The RPC is infected, let the test to change the
797                          * fail_loc
798                          */
799                         msleep(4 * MSEC_PER_SEC);
800                 }
801         }
802         ptlrpc_assign_next_xid(request);
803
804         RETURN(0);
805
806 out_ctx:
807         LASSERT(!request->rq_pool);
808         sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
809 out_free:
810         class_import_put(imp);
811
812         return rc;
813 }
814 EXPORT_SYMBOL(ptlrpc_request_bufs_pack);
815
816 /**
817  * Pack request buffers for network transfer, performing necessary encryption
818  * steps if necessary.
819  */
820 int ptlrpc_request_pack(struct ptlrpc_request *request,
821                         __u32 version, int opcode)
822 {
823         int rc;
824
825         rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
826         if (rc)
827                 return rc;
828
829         /*
830          * For some old 1.8 clients (< 1.8.7), they will LASSERT the size of
831          * ptlrpc_body sent from server equal to local ptlrpc_body size, so we
832          * have to send old ptlrpc_body to keep interoprability with these
833          * clients.
834          *
835          * Only three kinds of server->client RPCs so far:
836          *  - LDLM_BL_CALLBACK
837          *  - LDLM_CP_CALLBACK
838          *  - LDLM_GL_CALLBACK
839          *
840          * XXX This should be removed whenever we drop the interoprability with
841          *     the these old clients.
842          */
843         if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK ||
844             opcode == LDLM_GL_CALLBACK)
845                 req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
846                                    sizeof(struct ptlrpc_body_v2), RCL_CLIENT);
847
848         return rc;
849 }
850 EXPORT_SYMBOL(ptlrpc_request_pack);
851
852 /**
853  * Helper function to allocate new request on import \a imp
854  * and possibly using existing request from pool \a pool if provided.
855  * Returns allocated request structure with import field filled or
856  * NULL on error.
857  */
858 static inline
859 struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
860                                               struct ptlrpc_request_pool *pool)
861 {
862         struct ptlrpc_request *request = NULL;
863
864         request = ptlrpc_request_cache_alloc(GFP_NOFS);
865
866         if (!request && pool)
867                 request = ptlrpc_prep_req_from_pool(pool);
868
869         if (request) {
870                 ptlrpc_cli_req_init(request);
871
872                 LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
873                 LASSERT(imp != LP_POISON);
874                 LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p\n",
875                          imp->imp_client);
876                 LASSERT(imp->imp_client != LP_POISON);
877
878                 request->rq_import = class_import_get(imp);
879         } else {
880                 CERROR("request allocation out of memory\n");
881         }
882
883         return request;
884 }
885
886 /**
887  * Helper function for creating a request.
888  * Calls __ptlrpc_request_alloc to allocate new request sturcture and inits
889  * buffer structures according to capsule template \a format.
890  * Returns allocated request structure pointer or NULL on error.
891  */
892 static struct ptlrpc_request *
893 ptlrpc_request_alloc_internal(struct obd_import *imp,
894                               struct ptlrpc_request_pool *pool,
895                               const struct req_format *format)
896 {
897         struct ptlrpc_request *request;
898         int connect = 0;
899
900         request = __ptlrpc_request_alloc(imp, pool);
901         if (!request)
902                 return NULL;
903
904         /*
905          * initiate connection if needed when the import has been
906          * referenced by the new request to avoid races with disconnect
907          */
908         if (unlikely(imp->imp_state == LUSTRE_IMP_IDLE)) {
909                 int rc;
910
911                 CDEBUG_LIMIT(imp->imp_idle_debug,
912                              "%s: reconnect after %llds idle\n",
913                              imp->imp_obd->obd_name, ktime_get_real_seconds() -
914                                                      imp->imp_last_reply_time);
915                 spin_lock(&imp->imp_lock);
916                 if (imp->imp_state == LUSTRE_IMP_IDLE) {
917                         imp->imp_generation++;
918                         imp->imp_initiated_at = imp->imp_generation;
919                         imp->imp_state =  LUSTRE_IMP_NEW;
920                         connect = 1;
921                 }
922                 spin_unlock(&imp->imp_lock);
923                 if (connect) {
924                         rc = ptlrpc_connect_import(imp);
925                         if (rc < 0) {
926                                 ptlrpc_request_free(request);
927                                 return NULL;
928                         }
929                         ptlrpc_pinger_add_import(imp);
930                 }
931         }
932
933         req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
934         req_capsule_set(&request->rq_pill, format);
935         return request;
936 }
937
938 /**
939  * Allocate new request structure for import \a imp and initialize its
940  * buffer structure according to capsule template \a format.
941  */
942 struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
943                                             const struct req_format *format)
944 {
945         return ptlrpc_request_alloc_internal(imp, NULL, format);
946 }
947 EXPORT_SYMBOL(ptlrpc_request_alloc);
948
/**
 * Pool-aware variant of ptlrpc_request_alloc(): allocate a new request
 * structure for import \a imp, passing \a pool down to the allocator, and
 * set up its buffer structure from the capsule template \a format.
 *
 * Returns the new request, or NULL on failure.
 */
struct ptlrpc_request *
ptlrpc_request_alloc_pool(struct obd_import *imp,
                          struct ptlrpc_request_pool *pool,
                          const struct req_format *format)
{
        struct ptlrpc_request *req;

        req = ptlrpc_request_alloc_internal(imp, pool, format);

        return req;
}
EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
961
962 /**
963  * For requests not from pool, free memory of the request structure.
964  * For requests obtained from a pool earlier, return request back to pool.
965  */
966 void ptlrpc_request_free(struct ptlrpc_request *request)
967 {
968         if (request->rq_pool)
969                 __ptlrpc_free_req_to_pool(request);
970         else
971                 ptlrpc_request_cache_free(request);
972 }
973 EXPORT_SYMBOL(ptlrpc_request_free);
974
975 /**
976  * Allocate new request for operatione \a opcode and immediatelly pack it for
977  * network transfer.
978  * Only used for simple requests like OBD_PING where the only important
979  * part of the request is operation itself.
980  * Returns allocated request or NULL on error.
981  */
982 struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
983                                                  const struct req_format *format,
984                                                  __u32 version, int opcode)
985 {
986         struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
987         int rc;
988
989         if (req) {
990                 rc = ptlrpc_request_pack(req, version, opcode);
991                 if (rc) {
992                         ptlrpc_request_free(req);
993                         req = NULL;
994                 }
995         }
996         return req;
997 }
998 EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
999
1000 /**
1001  * Allocate and initialize new request set structure on the current CPT.
1002  * Returns a pointer to the newly allocated set structure or NULL on error.
1003  */
1004 struct ptlrpc_request_set *ptlrpc_prep_set(void)
1005 {
1006         struct ptlrpc_request_set *set;
1007         int cpt;
1008
1009         ENTRY;
1010         cpt = cfs_cpt_current(cfs_cpt_table, 0);
1011         OBD_CPT_ALLOC(set, cfs_cpt_table, cpt, sizeof(*set));
1012         if (!set)
1013                 RETURN(NULL);
1014         atomic_set(&set->set_refcount, 1);
1015         INIT_LIST_HEAD(&set->set_requests);
1016         init_waitqueue_head(&set->set_waitq);
1017         atomic_set(&set->set_new_count, 0);
1018         atomic_set(&set->set_remaining, 0);
1019         spin_lock_init(&set->set_new_req_lock);
1020         INIT_LIST_HEAD(&set->set_new_requests);
1021         set->set_max_inflight = UINT_MAX;
1022         set->set_producer     = NULL;
1023         set->set_producer_arg = NULL;
1024         set->set_rc           = 0;
1025
1026         RETURN(set);
1027 }
1028 EXPORT_SYMBOL(ptlrpc_prep_set);
1029
1030 /**
1031  * Allocate and initialize new request set structure with flow control
1032  * extension. This extension allows to control the number of requests in-flight
1033  * for the whole set. A callback function to generate requests must be provided
1034  * and the request set will keep the number of requests sent over the wire to
1035  * @max_inflight.
1036  * Returns a pointer to the newly allocated set structure or NULL on error.
1037  */
1038 struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
1039                                              void *arg)
1040
1041 {
1042         struct ptlrpc_request_set *set;
1043
1044         set = ptlrpc_prep_set();
1045         if (!set)
1046                 RETURN(NULL);
1047
1048         set->set_max_inflight  = max;
1049         set->set_producer      = func;
1050         set->set_producer_arg  = arg;
1051
1052         RETURN(set);
1053 }
1054
1055 /**
1056  * Wind down and free request set structure previously allocated with
1057  * ptlrpc_prep_set.
1058  * Ensures that all requests on the set have completed and removes
1059  * all requests from the request list in a set.
1060  * If any unsent request happen to be on the list, pretends that they got
1061  * an error in flight and calls their completion handler.
1062  */
1063 void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
1064 {
1065         struct list_head *tmp;
1066         struct list_head *next;
1067         int expected_phase;
1068         int n = 0;
1069
1070         ENTRY;
1071
1072         /* Requests on the set should either all be completed, or all be new */
1073         expected_phase = (atomic_read(&set->set_remaining) == 0) ?
1074                          RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
1075         list_for_each(tmp, &set->set_requests) {
1076                 struct ptlrpc_request *req =
1077                         list_entry(tmp, struct ptlrpc_request,
1078                                    rq_set_chain);
1079
1080                 LASSERT(req->rq_phase == expected_phase);
1081                 n++;
1082         }
1083
1084         LASSERTF(atomic_read(&set->set_remaining) == 0 ||
1085                  atomic_read(&set->set_remaining) == n, "%d / %d\n",
1086                  atomic_read(&set->set_remaining), n);
1087
1088         list_for_each_safe(tmp, next, &set->set_requests) {
1089                 struct ptlrpc_request *req =
1090                         list_entry(tmp, struct ptlrpc_request,
1091                                    rq_set_chain);
1092                 list_del_init(&req->rq_set_chain);
1093
1094                 LASSERT(req->rq_phase == expected_phase);
1095
1096                 if (req->rq_phase == RQ_PHASE_NEW) {
1097                         ptlrpc_req_interpret(NULL, req, -EBADR);
1098                         atomic_dec(&set->set_remaining);
1099                 }
1100
1101                 spin_lock(&req->rq_lock);
1102                 req->rq_set = NULL;
1103                 req->rq_invalid_rqset = 0;
1104                 spin_unlock(&req->rq_lock);
1105
1106                 ptlrpc_req_finished(req);
1107         }
1108
1109         LASSERT(atomic_read(&set->set_remaining) == 0);
1110
1111         ptlrpc_reqset_put(set);
1112         EXIT;
1113 }
1114 EXPORT_SYMBOL(ptlrpc_set_destroy);
1115
1116 /**
1117  * Add a new request to the general purpose request set.
1118  * Assumes request reference from the caller.
1119  */
1120 void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
1121                         struct ptlrpc_request *req)
1122 {
1123         LASSERT(req->rq_import->imp_state != LUSTRE_IMP_IDLE);
1124         LASSERT(list_empty(&req->rq_set_chain));
1125
1126         if (req->rq_allow_intr)
1127                 set->set_allow_intr = 1;
1128
1129         /* The set takes over the caller's request reference */
1130         list_add_tail(&req->rq_set_chain, &set->set_requests);
1131         req->rq_set = set;
1132         atomic_inc(&set->set_remaining);
1133         req->rq_queued_time = ktime_get_seconds();
1134
1135         if (req->rq_reqmsg)
1136                 lustre_msg_set_jobid(req->rq_reqmsg, NULL);
1137
1138         if (set->set_producer)
1139                 /*
1140                  * If the request set has a producer callback, the RPC must be
1141                  * sent straight away
1142                  */
1143                 ptlrpc_send_new_req(req);
1144 }
1145 EXPORT_SYMBOL(ptlrpc_set_add_req);
1146
1147 /**
1148  * Add a request to a request with dedicated server thread
1149  * and wake the thread to make any necessary processing.
1150  * Currently only used for ptlrpcd.
1151  */
1152 void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
1153                             struct ptlrpc_request *req)
1154 {
1155         struct ptlrpc_request_set *set = pc->pc_set;
1156         int count, i;
1157
1158         LASSERT(req->rq_set == NULL);
1159         LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);
1160
1161         spin_lock(&set->set_new_req_lock);
1162         /*
1163          * The set takes over the caller's request reference.
1164          */
1165         req->rq_set = set;
1166         req->rq_queued_time = ktime_get_seconds();
1167         list_add_tail(&req->rq_set_chain, &set->set_new_requests);
1168         count = atomic_inc_return(&set->set_new_count);
1169         spin_unlock(&set->set_new_req_lock);
1170
1171         /* Only need to call wakeup once for the first entry. */
1172         if (count == 1) {
1173                 wake_up(&set->set_waitq);
1174
1175                 /*
1176                  * XXX: It maybe unnecessary to wakeup all the partners. But to
1177                  *      guarantee the async RPC can be processed ASAP, we have
1178                  *      no other better choice. It maybe fixed in future.
1179                  */
1180                 for (i = 0; i < pc->pc_npartners; i++)
1181                         wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
1182         }
1183 }
1184
1185 /**
1186  * Based on the current state of the import, determine if the request
1187  * can be sent, is an error, or should be delayed.
1188  *
1189  * Returns true if this request should be delayed. If false, and
1190  * *status is set, then the request can not be sent and *status is the
1191  * error code.  If false and status is 0, then request can be sent.
1192  *
1193  * The imp->imp_lock must be held.
1194  */
1195 static int ptlrpc_import_delay_req(struct obd_import *imp,
1196                                    struct ptlrpc_request *req, int *status)
1197 {
1198         int delay = 0;
1199
1200         ENTRY;
1201         LASSERT(status);
1202         *status = 0;
1203
1204         if (req->rq_ctx_init || req->rq_ctx_fini) {
1205                 /* always allow ctx init/fini rpc go through */
1206         } else if (imp->imp_state == LUSTRE_IMP_NEW) {
1207                 DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
1208                 *status = -EIO;
1209         } else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
1210                 unsigned int opc = lustre_msg_get_opc(req->rq_reqmsg);
1211
1212                 /*
1213                  * pings or MDS-equivalent STATFS may safely
1214                  * race with umount
1215                  */
1216                 DEBUG_REQ((opc == OBD_PING || opc == OST_STATFS) ?
1217                           D_HA : D_ERROR, req, "IMP_CLOSED ");
1218                 *status = -EIO;
1219         } else if (ptlrpc_send_limit_expired(req)) {
1220                 /* probably doesn't need to be a D_ERROR afterinitial testing */
1221                 DEBUG_REQ(D_HA, req, "send limit expired ");
1222                 *status = -ETIMEDOUT;
1223         } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
1224                    imp->imp_state == LUSTRE_IMP_CONNECTING) {
1225                 ;/* allow CONNECT even if import is invalid */
1226                 if (atomic_read(&imp->imp_inval_count) != 0) {
1227                         DEBUG_REQ(D_ERROR, req, "invalidate in flight");
1228                         *status = -EIO;
1229                 }
1230         } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
1231                 if (!imp->imp_deactive)
1232                         DEBUG_REQ(D_NET, req, "IMP_INVALID");
1233                 *status = -ESHUTDOWN; /* b=12940 */
1234         } else if (req->rq_import_generation != imp->imp_generation) {
1235                 DEBUG_REQ(D_ERROR, req, "req wrong generation:");
1236                 *status = -EIO;
1237         } else if (req->rq_send_state != imp->imp_state) {
1238                 /* invalidate in progress - any requests should be drop */
1239                 if (atomic_read(&imp->imp_inval_count) != 0) {
1240                         DEBUG_REQ(D_ERROR, req, "invalidate in flight");
1241                         *status = -EIO;
1242                 } else if (req->rq_no_delay &&
1243                            imp->imp_generation != imp->imp_initiated_at) {
1244                         /* ignore nodelay for requests initiating connections */
1245                         *status = -EWOULDBLOCK;
1246                 } else if (req->rq_allow_replay &&
1247                            (imp->imp_state == LUSTRE_IMP_REPLAY ||
1248                             imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
1249                             imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
1250                             imp->imp_state == LUSTRE_IMP_RECOVER)) {
1251                         DEBUG_REQ(D_HA, req, "allow during recovery.\n");
1252                 } else {
1253                         delay = 1;
1254                 }
1255         }
1256
1257         RETURN(delay);
1258 }
1259
1260 /**
1261  * Decide if the error message should be printed to the console or not.
1262  * Makes its decision based on request type, status, and failure frequency.
1263  *
1264  * \param[in] req  request that failed and may need a console message
1265  *
1266  * \retval false if no message should be printed
1267  * \retval true  if console message should be printed
1268  */
1269 static bool ptlrpc_console_allow(struct ptlrpc_request *req, __u32 opc, int err)
1270 {
1271         LASSERT(req->rq_reqmsg != NULL);
1272
1273         /* Suppress particular reconnect errors which are to be expected. */
1274         if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) {
1275                 /* Suppress timed out reconnect requests */
1276                 if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) ||
1277                     req->rq_timedout)
1278                         return false;
1279
1280                 /*
1281                  * Suppress most unavailable/again reconnect requests, but
1282                  * print occasionally so it is clear client is trying to
1283                  * connect to a server where no target is running.
1284                  */
1285                 if ((err == -ENODEV || err == -EAGAIN) &&
1286                     req->rq_import->imp_conn_cnt % 30 != 20)
1287                         return false;
1288         }
1289
1290         if (opc == LDLM_ENQUEUE && err == -EAGAIN)
1291                 /* -EAGAIN is normal when using POSIX flocks */
1292                 return false;
1293
1294         if (opc == OBD_PING && (err == -ENODEV || err == -ENOTCONN) &&
1295             (req->rq_xid & 0xf) != 10)
1296                 /* Suppress most ping requests, they may fail occasionally */
1297                 return false;
1298
1299         return true;
1300 }
1301
1302 /**
1303  * Check request processing status.
1304  * Returns the status.
1305  */
1306 static int ptlrpc_check_status(struct ptlrpc_request *req)
1307 {
1308         int err;
1309
1310         ENTRY;
1311         err = lustre_msg_get_status(req->rq_repmsg);
1312         if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
1313                 struct obd_import *imp = req->rq_import;
1314                 lnet_nid_t nid = imp->imp_connection->c_peer.nid;
1315                 __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
1316
1317                 if (ptlrpc_console_allow(req, opc, err))
1318                         LCONSOLE_ERROR_MSG(0x11,
1319                                            "%s: operation %s to node %s failed: rc = %d\n",
1320                                            imp->imp_obd->obd_name,
1321                                            ll_opcode2str(opc),
1322                                            libcfs_nid2str(nid), err);
1323                 RETURN(err < 0 ? err : -EINVAL);
1324         }
1325
1326         if (err < 0) {
1327                 DEBUG_REQ(D_INFO, req, "status is %d", err);
1328         } else if (err > 0) {
1329                 /* XXX: translate this error from net to host */
1330                 DEBUG_REQ(D_INFO, req, "status is %d", err);
1331         }
1332
1333         RETURN(err);
1334 }
1335
1336 /**
1337  * save pre-versions of objects into request for replay.
1338  * Versions are obtained from server reply.
1339  * used for VBR.
1340  */
1341 static void ptlrpc_save_versions(struct ptlrpc_request *req)
1342 {
1343         struct lustre_msg *repmsg = req->rq_repmsg;
1344         struct lustre_msg *reqmsg = req->rq_reqmsg;
1345         __u64 *versions = lustre_msg_get_versions(repmsg);
1346
1347         ENTRY;
1348         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
1349                 return;
1350
1351         LASSERT(versions);
1352         lustre_msg_set_versions(reqmsg, versions);
1353         CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n",
1354                versions[0], versions[1]);
1355
1356         EXIT;
1357 }
1358
1359 __u64 ptlrpc_known_replied_xid(struct obd_import *imp)
1360 {
1361         struct ptlrpc_request *req;
1362
1363         assert_spin_locked(&imp->imp_lock);
1364         if (list_empty(&imp->imp_unreplied_list))
1365                 return 0;
1366
1367         req = list_entry(imp->imp_unreplied_list.next, struct ptlrpc_request,
1368                          rq_unreplied_list);
1369         LASSERTF(req->rq_xid >= 1, "XID:%llu\n", req->rq_xid);
1370
1371         if (imp->imp_known_replied_xid < req->rq_xid - 1)
1372                 imp->imp_known_replied_xid = req->rq_xid - 1;
1373
1374         return req->rq_xid - 1;
1375 }
1376
1377 /**
1378  * Callback function called when client receives RPC reply for \a req.
1379  * Returns 0 on success or error code.
1380  * The return value would be assigned to req->rq_status by the caller
1381  * as request processing status.
1382  * This function also decides if the request needs to be saved for later replay.
      *
      * Note that 0 is also returned when the request has merely been marked
      * for resend (truncated reply, resend requested while unwrapping the
      * reply, or an -EINPROGRESS reply status).
1383  */
1384 static int after_reply(struct ptlrpc_request *req)
1385 {
1386         struct obd_import *imp = req->rq_import;
1387         struct obd_device *obd = req->rq_import->imp_obd;
1388         ktime_t work_start;
1389         u64 committed;
1390         s64 timediff;
1391         int rc;
1392
1393         ENTRY;
1394         LASSERT(obd != NULL);
1395         /* repbuf must be unlinked */
1396         LASSERT(!req->rq_receiving_reply && req->rq_reply_unlinked);
1397
             /*
              * A truncated reply is an error only when the request may not be
              * resent; otherwise the reply buffer is dropped and the request is
              * flagged for resend with the size that was actually received.
              */
1398         if (req->rq_reply_truncated) {
1399                 if (ptlrpc_no_resend(req)) {
1400                         DEBUG_REQ(D_ERROR, req,
1401                                   "reply buffer overflow, expected: %d, actual size: %d",
1402                                   req->rq_nob_received, req->rq_repbuf_len);
1403                         RETURN(-EOVERFLOW);
1404                 }
1405
1406                 sptlrpc_cli_free_repbuf(req);
1407                 /*
1408                  * Pass the required reply buffer size (include
1409                  * space for early reply).
1410                  * NB: no need to roundup because alloc_repbuf
1411                  * will roundup it
1412                  */
1413                 req->rq_replen = req->rq_nob_received;
1414                 req->rq_nob_received = 0;
1415                 spin_lock(&req->rq_lock);
1416                 req->rq_resend       = 1;
1417                 spin_unlock(&req->rq_lock);
1418                 RETURN(0);
1419         }
1420
             /* microseconds elapsed between rq_sent_ns and now */
1421         work_start = ktime_get_real();
1422         timediff = ktime_us_delta(work_start, req->rq_sent_ns);
1423
1424         /*
1425          * NB Until this point, the whole of the incoming message,
1426          * including buflens, status etc is in the sender's byte order.
1427          */
1428         rc = sptlrpc_cli_unwrap_reply(req);
1429         if (rc) {
1430                 DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
1431                 RETURN(rc);
1432         }
1433
1434         /*
1435          * Security layer unwrap might ask resend this request.
1436          */
1437         if (req->rq_resend)
1438                 RETURN(0);
1439
1440         rc = unpack_reply(req);
1441         if (rc)
1442                 RETURN(rc);
1443
1444         /* retry indefinitely on EINPROGRESS */
1445         if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
1446             ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
1447                 time64_t now = ktime_get_real_seconds();
1448
1449                 DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
1450                 spin_lock(&req->rq_lock);
1451                 req->rq_resend = 1;
1452                 spin_unlock(&req->rq_lock);
1453                 req->rq_nr_resend++;
1454
1455                 /* Readjust the timeout for current conditions */
1456                 ptlrpc_at_set_req_timeout(req);
1457                 /*
1458                  * delay resend to give a chance to the server to get ready.
1459                  * The delay is increased by 1s on every resend and is capped to
1460                  * the current request timeout (i.e. obd_timeout if AT is off,
1461                  * or AT service time x 125% + 5s, see at_est2timeout)
1462                  */
1463                 if (req->rq_nr_resend > req->rq_timeout)
1464                         req->rq_sent = now + req->rq_timeout;
1465                 else
1466                         req->rq_sent = now + req->rq_nr_resend;
1467
1468                 /* Resend for EINPROGRESS will use a new XID */
1469                 spin_lock(&imp->imp_lock);
1470                 list_del_init(&req->rq_unreplied_list);
1471                 spin_unlock(&imp->imp_lock);
1472
1473                 RETURN(0);
1474         }
1475
             /* account the round-trip time measured above, if stats exist */
1476         if (obd->obd_svc_stats) {
1477                 lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
1478                                     timediff);
1479                 ptlrpc_lprocfs_rpc_sent(req, timediff);
1480         }
1481
             /* only REPLY and ERR message types are acceptable here */
1482         if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
1483             lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
1484                 DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
1485                           lustre_msg_get_type(req->rq_repmsg));
1486                 RETURN(-EPROTO);
1487         }
1488
1489         if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
1490                 CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
1491         ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
1492         ptlrpc_at_adj_net_latency(req,
1493                                   lustre_msg_get_service_time(req->rq_repmsg));
1494
1495         rc = ptlrpc_check_status(req);
1496
1497         if (rc) {
1498                 /*
1499                  * Either we've been evicted, or the server has failed for
1500                  * some reason. Try to reconnect, and if that fails, punt to
1501                  * the upcall.
1502                  */
1503                 if (ptlrpc_recoverable_error(rc)) {
1504                         if (req->rq_send_state != LUSTRE_IMP_FULL ||
1505                             imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
1506                                 RETURN(rc);
1507                         }
1508                         ptlrpc_request_handle_notconn(req);
1509                         RETURN(rc);
1510                 }
1511         } else {
1512                 /*
1513                  * Let's look if server sent slv. Do it only for RPC with
1514                  * rc == 0.
1515                  */
1516                 ldlm_cli_update_pool(req);
1517         }
1518
1519         /*
1520          * Store transno in reqmsg for replay.
1521          */
1522         if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
1523                 req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
1524                 lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
1525         }
1526
1527         if (imp->imp_replayable) {
1528                 spin_lock(&imp->imp_lock);
1529                 /*
1530                  * No point in adding already-committed requests to the replay
1531                  * list, we will just remove them immediately. b=9829
1532                  */
1533                 if (req->rq_transno != 0 &&
1534                     (req->rq_transno >
1535                      lustre_msg_get_last_committed(req->rq_repmsg) ||
1536                      req->rq_replay)) {
1537                         /** version recovery */
1538                         ptlrpc_save_versions(req);
1539                         ptlrpc_retain_replayable_request(req, imp);
1540                 } else if (req->rq_commit_cb &&
1541                            list_empty(&req->rq_replay_list)) {
1542                         /*
1543                          * NB: don't call rq_commit_cb if it's already on
1544                          * rq_replay_list, ptlrpc_free_committed() will call
1545                          * it later, see LU-3618 for details
1546                          */
                             /* imp_lock is dropped around the callback */
1547                         spin_unlock(&imp->imp_lock);
1548                         req->rq_commit_cb(req);
1549                         spin_lock(&imp->imp_lock);
1550                 }
1551
1552                 /*
1553                  * Replay-enabled imports return commit-status information.
1554                  */
1555                 committed = lustre_msg_get_last_committed(req->rq_repmsg);
1556                 if (likely(committed > imp->imp_peer_committed_transno))
1557                         imp->imp_peer_committed_transno = committed;
1558
1559                 ptlrpc_free_committed(imp);
1560
1561                 if (!list_empty(&imp->imp_replay_list)) {
1562                         struct ptlrpc_request *last;
1563
                             /* last entry of the replay list */
1564                         last = list_entry(imp->imp_replay_list.prev,
1565                                           struct ptlrpc_request,
1566                                           rq_replay_list);
1567                         /*
1568                          * Requests with rq_replay stay on the list even if no
1569                          * commit is expected.
1570                          */
1571                         if (last->rq_transno > imp->imp_peer_committed_transno)
1572                                 ptlrpc_pinger_commit_expected(imp);
1573                 }
1574
1575                 spin_unlock(&imp->imp_lock);
1576         }
1577
1578         RETURN(rc);
1579 }
1580
1581 /**
1582  * Helper function to send request \a req over the network for the first time
1583  * Also adjusts request phase.
1584  * Returns 0 on success or error code.
1585  */
1586 static int ptlrpc_send_new_req(struct ptlrpc_request *req)
1587 {
1588         struct obd_import *imp = req->rq_import;
1589         __u64 min_xid = 0;
1590         int rc;
1591
1592         ENTRY;
1593         LASSERT(req->rq_phase == RQ_PHASE_NEW);
1594
1595         /* do not try to go further if there is not enough memory in enc_pool */
1596         if (req->rq_sent && req->rq_bulk)
1597                 if (req->rq_bulk->bd_iov_count > get_free_pages_in_pool() &&
1598                     pool_is_at_full_capacity())
1599                         RETURN(-ENOMEM);
1600
1601         if (req->rq_sent && (req->rq_sent > ktime_get_real_seconds()) &&
1602             (!req->rq_generation_set ||
1603              req->rq_import_generation == imp->imp_generation))
1604                 RETURN(0);
1605
1606         ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
1607
1608         spin_lock(&imp->imp_lock);
1609
1610         LASSERT(req->rq_xid != 0);
1611         LASSERT(!list_empty(&req->rq_unreplied_list));
1612
1613         if (!req->rq_generation_set)
1614                 req->rq_import_generation = imp->imp_generation;
1615
1616         if (ptlrpc_import_delay_req(imp, req, &rc)) {
1617                 spin_lock(&req->rq_lock);
1618                 req->rq_waiting = 1;
1619                 spin_unlock(&req->rq_lock);
1620
1621                 DEBUG_REQ(D_HA, req, "req waiting for recovery: (%s != %s)",
1622                           ptlrpc_import_state_name(req->rq_send_state),
1623                           ptlrpc_import_state_name(imp->imp_state));
1624                 LASSERT(list_empty(&req->rq_list));
1625                 list_add_tail(&req->rq_list, &imp->imp_delayed_list);
1626                 atomic_inc(&req->rq_import->imp_inflight);
1627                 spin_unlock(&imp->imp_lock);
1628                 RETURN(0);
1629         }
1630
1631         if (rc != 0) {
1632                 spin_unlock(&imp->imp_lock);
1633                 req->rq_status = rc;
1634                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1635                 RETURN(rc);
1636         }
1637
1638         LASSERT(list_empty(&req->rq_list));
1639         list_add_tail(&req->rq_list, &imp->imp_sending_list);
1640         atomic_inc(&req->rq_import->imp_inflight);
1641
1642         /*
1643          * find the known replied XID from the unreplied list, CONNECT
1644          * and DISCONNECT requests are skipped to make the sanity check
1645          * on server side happy. see process_req_last_xid().
1646          *
1647          * For CONNECT: Because replay requests have lower XID, it'll
1648          * break the sanity check if CONNECT bump the exp_last_xid on
1649          * server.
1650          *
1651          * For DISCONNECT: Since client will abort inflight RPC before
1652          * sending DISCONNECT, DISCONNECT may carry an XID which higher
1653          * than the inflight RPC.
1654          */
1655         if (!ptlrpc_req_is_connect(req) && !ptlrpc_req_is_disconnect(req))
1656                 min_xid = ptlrpc_known_replied_xid(imp);
1657         spin_unlock(&imp->imp_lock);
1658
1659         lustre_msg_set_last_xid(req->rq_reqmsg, min_xid);
1660
1661         lustre_msg_set_status(req->rq_reqmsg, current_pid());
1662
1663         rc = sptlrpc_req_refresh_ctx(req, -1);
1664         if (rc) {
1665                 if (req->rq_err) {
1666                         req->rq_status = rc;
1667                         RETURN(1);
1668                 } else {
1669                         spin_lock(&req->rq_lock);
1670                         req->rq_wait_ctx = 1;
1671                         spin_unlock(&req->rq_lock);
1672                         RETURN(0);
1673                 }
1674         }
1675
1676         CDEBUG(D_RPCTRACE,
1677                "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
1678                current_comm(),
1679                imp->imp_obd->obd_uuid.uuid,
1680                lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
1681                obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg));
1682
1683         rc = ptl_send_rpc(req, 0);
1684         if (rc == -ENOMEM) {
1685                 spin_lock(&imp->imp_lock);
1686                 if (!list_empty(&req->rq_list)) {
1687                         list_del_init(&req->rq_list);
1688                         atomic_dec(&req->rq_import->imp_inflight);
1689                 }
1690                 spin_unlock(&imp->imp_lock);
1691                 ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
1692                 RETURN(rc);
1693         }
1694         if (rc) {
1695                 DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
1696                 spin_lock(&req->rq_lock);
1697                 req->rq_net_err = 1;
1698                 spin_unlock(&req->rq_lock);
1699                 RETURN(rc);
1700         }
1701         RETURN(0);
1702 }
1703
1704 static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
1705 {
1706         int remaining, rc;
1707
1708         ENTRY;
1709         LASSERT(set->set_producer != NULL);
1710
1711         remaining = atomic_read(&set->set_remaining);
1712
1713         /*
1714          * populate the ->set_requests list with requests until we
1715          * reach the maximum number of RPCs in flight for this set
1716          */
1717         while (atomic_read(&set->set_remaining) < set->set_max_inflight) {
1718                 rc = set->set_producer(set, set->set_producer_arg);
1719                 if (rc == -ENOENT) {
1720                         /* no more RPC to produce */
1721                         set->set_producer     = NULL;
1722                         set->set_producer_arg = NULL;
1723                         RETURN(0);
1724                 }
1725         }
1726
1727         RETURN((atomic_read(&set->set_remaining) - remaining));
1728 }
1729
1730 /**
1731  * this sends any unsent RPCs in \a set and returns 1 if all are sent
1732  * and no more replies are expected.
1733  * (it is possible to get less replies than requests sent e.g. due to timed out
1734  * requests or requests that we had trouble to send out)
1735  *
1736  * NOTE: This function contains a potential schedule point (cond_resched()).
1737  */
1738 int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
1739 {
1740         struct list_head *tmp, *next;
1741         struct list_head  comp_reqs;
1742         int force_timer_recalc = 0;
1743
1744         ENTRY;
1745         if (atomic_read(&set->set_remaining) == 0)
1746                 RETURN(1);
1747
1748         INIT_LIST_HEAD(&comp_reqs);
1749         list_for_each_safe(tmp, next, &set->set_requests) {
1750                 struct ptlrpc_request *req =
1751                         list_entry(tmp, struct ptlrpc_request,
1752                                    rq_set_chain);
1753                 struct obd_import *imp = req->rq_import;
1754                 int unregistered = 0;
1755                 int async = 1;
1756                 int rc = 0;
1757
1758                 if (req->rq_phase == RQ_PHASE_COMPLETE) {
1759                         list_move_tail(&req->rq_set_chain, &comp_reqs);
1760                         continue;
1761                 }
1762
1763                 /*
1764                  * This schedule point is mainly for the ptlrpcd caller of this
1765                  * function.  Most ptlrpc sets are not long-lived and unbounded
1766                  * in length, but at the least the set used by the ptlrpcd is.
1767                  * Since the processing time is unbounded, we need to insert an
1768                  * explicit schedule point to make the thread well-behaved.
1769                  */
1770                 cond_resched();
1771
1772                 /*
1773                  * If the caller requires to allow to be interpreted by force
1774                  * and it has really been interpreted, then move the request
1775                  * to RQ_PHASE_INTERPRET phase in spite of what the current
1776                  * phase is.
1777                  */
1778                 if (unlikely(req->rq_allow_intr && req->rq_intr)) {
1779                         req->rq_status = -EINTR;
1780                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1781
1782                         /*
1783                          * Since it is interpreted and we have to wait for
1784                          * the reply to be unlinked, then use sync mode.
1785                          */
1786                         async = 0;
1787
1788                         GOTO(interpret, req->rq_status);
1789                 }
1790
1791                 if (req->rq_phase == RQ_PHASE_NEW && ptlrpc_send_new_req(req))
1792                         force_timer_recalc = 1;
1793
1794                 /* delayed send - skip */
1795                 if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
1796                         continue;
1797
1798                 /* delayed resend - skip */
1799                 if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
1800                     req->rq_sent > ktime_get_real_seconds())
1801                         continue;
1802
1803                 if (!(req->rq_phase == RQ_PHASE_RPC ||
1804                       req->rq_phase == RQ_PHASE_BULK ||
1805                       req->rq_phase == RQ_PHASE_INTERPRET ||
1806                       req->rq_phase == RQ_PHASE_UNREG_RPC ||
1807                       req->rq_phase == RQ_PHASE_UNREG_BULK)) {
1808                         DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
1809                         LBUG();
1810                 }
1811
1812                 if (req->rq_phase == RQ_PHASE_UNREG_RPC ||
1813                     req->rq_phase == RQ_PHASE_UNREG_BULK) {
1814                         LASSERT(req->rq_next_phase != req->rq_phase);
1815                         LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
1816
1817                         if (req->rq_req_deadline &&
1818                             !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK))
1819                                 req->rq_req_deadline = 0;
1820                         if (req->rq_reply_deadline &&
1821                             !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK))
1822                                 req->rq_reply_deadline = 0;
1823                         if (req->rq_bulk_deadline &&
1824                             !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK))
1825                                 req->rq_bulk_deadline = 0;
1826
1827                         /*
1828                          * Skip processing until reply is unlinked. We
1829                          * can't return to pool before that and we can't
1830                          * call interpret before that. We need to make
1831                          * sure that all rdma transfers finished and will
1832                          * not corrupt any data.
1833                          */
1834                         if (req->rq_phase == RQ_PHASE_UNREG_RPC &&
1835                             ptlrpc_client_recv_or_unlink(req))
1836                                 continue;
1837                         if (req->rq_phase == RQ_PHASE_UNREG_BULK &&
1838                             ptlrpc_client_bulk_active(req))
1839                                 continue;
1840
1841                         /*
1842                          * Turn fail_loc off to prevent it from looping
1843                          * forever.
1844                          */
1845                         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
1846                                 OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK,
1847                                                      OBD_FAIL_ONCE);
1848                         }
1849                         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
1850                                 OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK,
1851                                                      OBD_FAIL_ONCE);
1852                         }
1853
1854                         /*
1855                          * Move to next phase if reply was successfully
1856                          * unlinked.
1857                          */
1858                         ptlrpc_rqphase_move(req, req->rq_next_phase);
1859                 }
1860
1861                 if (req->rq_phase == RQ_PHASE_INTERPRET)
1862                         GOTO(interpret, req->rq_status);
1863
1864                 /*
1865                  * Note that this also will start async reply unlink.
1866                  */
1867                 if (req->rq_net_err && !req->rq_timedout) {
1868                         ptlrpc_expire_one_request(req, 1);
1869
1870                         /*
1871                          * Check if we still need to wait for unlink.
1872                          */
1873                         if (ptlrpc_client_recv_or_unlink(req) ||
1874                             ptlrpc_client_bulk_active(req))
1875                                 continue;
1876                         /* If there is no need to resend, fail it now. */
1877                         if (req->rq_no_resend) {
1878                                 if (req->rq_status == 0)
1879                                         req->rq_status = -EIO;
1880                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1881                                 GOTO(interpret, req->rq_status);
1882                         } else {
1883                                 continue;
1884                         }
1885                 }
1886
1887                 if (req->rq_err) {
1888                         spin_lock(&req->rq_lock);
1889                         req->rq_replied = 0;
1890                         spin_unlock(&req->rq_lock);
1891                         if (req->rq_status == 0)
1892                                 req->rq_status = -EIO;
1893                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1894                         GOTO(interpret, req->rq_status);
1895                 }
1896
1897                 /*
1898                  * ptlrpc_set_wait->l_wait_event sets lwi_allow_intr
1899                  * so it sets rq_intr regardless of individual rpc
1900                  * timeouts. The synchronous IO waiting path sets
1901                  * rq_intr irrespective of whether ptlrpcd
1902                  * has seen a timeout.  Our policy is to only interpret
1903                  * interrupted rpcs after they have timed out, so we
1904                  * need to enforce that here.
1905                  */
1906
1907                 if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
1908                                      req->rq_wait_ctx)) {
1909                         req->rq_status = -EINTR;
1910                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1911                         GOTO(interpret, req->rq_status);
1912                 }
1913
1914                 if (req->rq_phase == RQ_PHASE_RPC) {
1915                         if (req->rq_timedout || req->rq_resend ||
1916                             req->rq_waiting || req->rq_wait_ctx) {
1917                                 int status;
1918
1919                                 if (!ptlrpc_unregister_reply(req, 1)) {
1920                                         ptlrpc_unregister_bulk(req, 1);
1921                                         continue;
1922                                 }
1923
1924                                 spin_lock(&imp->imp_lock);
1925                                 if (ptlrpc_import_delay_req(imp, req,
1926                                                             &status)) {
1927                                         /*
1928                                          * put on delay list - only if we wait
1929                                          * recovery finished - before send
1930                                          */
1931                                         list_del_init(&req->rq_list);
1932                                         list_add_tail(&req->rq_list,
1933                                                       &imp->imp_delayed_list);
1934                                         spin_unlock(&imp->imp_lock);
1935                                         continue;
1936                                 }
1937
1938                                 if (status != 0)  {
1939                                         req->rq_status = status;
1940                                         ptlrpc_rqphase_move(req,
1941                                                             RQ_PHASE_INTERPRET);
1942                                         spin_unlock(&imp->imp_lock);
1943                                         GOTO(interpret, req->rq_status);
1944                                 }
1945                                 /* ignore on just initiated connections */
1946                                 if (ptlrpc_no_resend(req) &&
1947                                     !req->rq_wait_ctx &&
1948                                     imp->imp_generation !=
1949                                     imp->imp_initiated_at) {
1950                                         req->rq_status = -ENOTCONN;
1951                                         ptlrpc_rqphase_move(req,
1952                                                             RQ_PHASE_INTERPRET);
1953                                         spin_unlock(&imp->imp_lock);
1954                                         GOTO(interpret, req->rq_status);
1955                                 }
1956
1957                                 list_del_init(&req->rq_list);
1958                                 list_add_tail(&req->rq_list,
1959                                               &imp->imp_sending_list);
1960
1961                                 spin_unlock(&imp->imp_lock);
1962
1963                                 spin_lock(&req->rq_lock);
1964                                 req->rq_waiting = 0;
1965                                 spin_unlock(&req->rq_lock);
1966
1967                                 if (req->rq_timedout || req->rq_resend) {
1968                                         /*
1969                                          * This is re-sending anyways,
1970                                          * let's mark req as resend.
1971                                          */
1972                                         spin_lock(&req->rq_lock);
1973                                         req->rq_resend = 1;
1974                                         spin_unlock(&req->rq_lock);
1975                                 }
1976                                 /*
1977                                  * rq_wait_ctx is only touched by ptlrpcd,
1978                                  * so no lock is needed here.
1979                                  */
1980                                 status = sptlrpc_req_refresh_ctx(req, -1);
1981                                 if (status) {
1982                                         if (req->rq_err) {
1983                                                 req->rq_status = status;
1984                                                 spin_lock(&req->rq_lock);
1985                                                 req->rq_wait_ctx = 0;
1986                                                 spin_unlock(&req->rq_lock);
1987                                                 force_timer_recalc = 1;
1988                                         } else {
1989                                                 spin_lock(&req->rq_lock);
1990                                                 req->rq_wait_ctx = 1;
1991                                                 spin_unlock(&req->rq_lock);
1992                                         }
1993
1994                                         continue;
1995                                 } else {
1996                                         spin_lock(&req->rq_lock);
1997                                         req->rq_wait_ctx = 0;
1998                                         spin_unlock(&req->rq_lock);
1999                                 }
2000
2001                                 /*
2002                                  * In any case, the previous bulk should be
2003                                  * cleaned up to prepare for the new sending
2004                                  */
2005                                 if (req->rq_bulk &&
2006                                     !ptlrpc_unregister_bulk(req, 1))
2007                                         continue;
2008
2009                                 rc = ptl_send_rpc(req, 0);
2010                                 if (rc == -ENOMEM) {
2011                                         spin_lock(&imp->imp_lock);
2012                                         if (!list_empty(&req->rq_list))
2013                                                 list_del_init(&req->rq_list);
2014                                         spin_unlock(&imp->imp_lock);
2015                                         ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
2016                                         continue;
2017                                 }
2018                                 if (rc) {
2019                                         DEBUG_REQ(D_HA, req,
2020                                                   "send failed: rc = %d", rc);
2021                                         force_timer_recalc = 1;
2022                                         spin_lock(&req->rq_lock);
2023                                         req->rq_net_err = 1;
2024                                         spin_unlock(&req->rq_lock);
2025                                         continue;
2026                                 }
2027                                 /* need to reset the timeout */
2028                                 force_timer_recalc = 1;
2029                         }
2030
2031                         spin_lock(&req->rq_lock);
2032
2033                         if (ptlrpc_client_early(req)) {
2034                                 ptlrpc_at_recv_early_reply(req);
2035                                 spin_unlock(&req->rq_lock);
2036                                 continue;
2037                         }
2038
2039                         /* Still waiting for a reply? */
2040                         if (ptlrpc_client_recv(req)) {
2041                                 spin_unlock(&req->rq_lock);
2042                                 continue;
2043                         }
2044
2045                         /* Did we actually receive a reply? */
2046                         if (!ptlrpc_client_replied(req)) {
2047                                 spin_unlock(&req->rq_lock);
2048                                 continue;
2049                         }
2050
2051                         spin_unlock(&req->rq_lock);
2052
2053                         /*
2054                          * unlink from net because we are going to
2055                          * swab in-place of reply buffer
2056                          */
2057                         unregistered = ptlrpc_unregister_reply(req, 1);
2058                         if (!unregistered)
2059                                 continue;
2060
2061                         req->rq_status = after_reply(req);
2062                         if (req->rq_resend)
2063                                 continue;
2064
2065                         /*
2066                          * If there is no bulk associated with this request,
2067                          * then we're done and should let the interpreter
2068                          * process the reply. Similarly if the RPC returned
2069                          * an error, and therefore the bulk will never arrive.
2070                          */
2071                         if (!req->rq_bulk || req->rq_status < 0) {
2072                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
2073                                 GOTO(interpret, req->rq_status);
2074                         }
2075
2076                         ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
2077                 }
2078
2079                 LASSERT(req->rq_phase == RQ_PHASE_BULK);
2080                 if (ptlrpc_client_bulk_active(req))
2081                         continue;
2082
2083                 if (req->rq_bulk->bd_failure) {
2084                         /*
2085                          * The RPC reply arrived OK, but the bulk screwed
2086                          * up!  Dead weird since the server told us the RPC
2087                          * was good after getting the REPLY for her GET or
2088                          * the ACK for her PUT.
2089                          */
2090                         DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
2091                         req->rq_status = -EIO;
2092                 }
2093
2094                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
2095
2096 interpret:
2097                 LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
2098
2099                 /*
2100                  * This moves to "unregistering" phase we need to wait for
2101                  * reply unlink.
2102                  */
2103                 if (!unregistered && !ptlrpc_unregister_reply(req, async)) {
2104                         /* start async bulk unlink too */
2105                         ptlrpc_unregister_bulk(req, 1);
2106                         continue;
2107                 }
2108
2109                 if (!ptlrpc_unregister_bulk(req, async))
2110                         continue;
2111
2112                 /*
2113                  * When calling interpret receiving already should be
2114                  * finished.
2115                  */
2116                 LASSERT(!req->rq_receiving_reply);
2117
2118                 ptlrpc_req_interpret(env, req, req->rq_status);
2119
2120                 if (ptlrpcd_check_work(req)) {
2121                         atomic_dec(&set->set_remaining);
2122                         continue;
2123                 }
2124                 ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
2125
2126                 if (req->rq_reqmsg)
2127                         CDEBUG(D_RPCTRACE,
2128                                "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
2129                                current_comm(),
2130                                imp->imp_obd->obd_uuid.uuid,
2131                                lustre_msg_get_status(req->rq_reqmsg),
2132                                req->rq_xid,
2133                                obd_import_nid2str(imp),
2134                                lustre_msg_get_opc(req->rq_reqmsg));
2135
2136                 spin_lock(&imp->imp_lock);
2137                 /*
2138                  * Request already may be not on sending or delaying list. This
2139                  * may happen in the case of marking it erroneous for the case
2140                  * ptlrpc_import_delay_req(req, status) find it impossible to
2141                  * allow sending this rpc and returns *status != 0.
2142                  */
2143                 if (!list_empty(&req->rq_list)) {
2144                         list_del_init(&req->rq_list);
2145                         atomic_dec(&imp->imp_inflight);
2146                 }
2147                 list_del_init(&req->rq_unreplied_list);
2148                 spin_unlock(&imp->imp_lock);
2149
2150                 atomic_dec(&set->set_remaining);
2151                 wake_up_all(&imp->imp_recovery_waitq);
2152
2153                 if (set->set_producer) {
2154                         /* produce a new request if possible */
2155                         if (ptlrpc_set_producer(set) > 0)
2156                                 force_timer_recalc = 1;
2157
2158                         /*
2159                          * free the request that has just been completed
2160                          * in order not to pollute set->set_requests
2161                          */
2162                         list_del_init(&req->rq_set_chain);
2163                         spin_lock(&req->rq_lock);
2164                         req->rq_set = NULL;
2165                         req->rq_invalid_rqset = 0;
2166                         spin_unlock(&req->rq_lock);
2167
2168                         /* record rq_status to compute the final status later */
2169                         if (req->rq_status != 0)
2170                                 set->set_rc = req->rq_status;
2171                         ptlrpc_req_finished(req);
2172                 } else {
2173                         list_move_tail(&req->rq_set_chain, &comp_reqs);
2174                 }
2175         }
2176
2177         /*
2178          * move completed request at the head of list so it's easier for
2179          * caller to find them
2180          */
2181         list_splice(&comp_reqs, &set->set_requests);
2182
2183         /* If we hit an error, we want to recover promptly. */
2184         RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc);
2185 }
2186 EXPORT_SYMBOL(ptlrpc_check_set);
2187
2188 /**
2189  * Time out request \a req. is \a async_unlink is set, that means do not wait
2190  * until LNet actually confirms network buffer unlinking.
2191  * Return 1 if we should give up further retrying attempts or 0 otherwise.
2192  */
2193 int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
2194 {
2195         struct obd_import *imp = req->rq_import;
2196         unsigned int debug_mask = D_RPCTRACE;
2197         int rc = 0;
2198
2199         ENTRY;
2200         spin_lock(&req->rq_lock);
2201         req->rq_timedout = 1;
2202         spin_unlock(&req->rq_lock);
2203
2204         if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg),
2205                                  lustre_msg_get_status(req->rq_reqmsg)))
2206                 debug_mask = D_WARNING;
2207         DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]",
2208                   req->rq_net_err ? "failed due to network error" :
2209                      ((req->rq_real_sent == 0 ||
2210                        req->rq_real_sent < req->rq_sent ||
2211                        req->rq_real_sent >= req->rq_deadline) ?
2212                       "timed out for sent delay" : "timed out for slow reply"),
2213                   (s64)req->rq_sent, (s64)req->rq_real_sent);
2214
2215         if (imp && obd_debug_peer_on_timeout)
2216                 LNetDebugPeer(imp->imp_connection->c_peer);
2217
2218         ptlrpc_unregister_reply(req, async_unlink);
2219         ptlrpc_unregister_bulk(req, async_unlink);
2220
2221         if (obd_dump_on_timeout)
2222                 libcfs_debug_dumplog();
2223
2224         if (!imp) {
2225                 DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
2226                 RETURN(1);
2227         }
2228
2229         atomic_inc(&imp->imp_timeouts);
2230
2231         /* The DLM server doesn't want recovery run on its imports. */
2232         if (imp->imp_dlm_fake)
2233                 RETURN(1);
2234
2235         /*
2236          * If this request is for recovery or other primordial tasks,
2237          * then error it out here.
2238          */
2239         if (req->rq_ctx_init || req->rq_ctx_fini ||
2240             req->rq_send_state != LUSTRE_IMP_FULL ||
2241             imp->imp_obd->obd_no_recov) {
2242                 DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
2243                           ptlrpc_import_state_name(req->rq_send_state),
2244                           ptlrpc_import_state_name(imp->imp_state));
2245                 spin_lock(&req->rq_lock);
2246                 req->rq_status = -ETIMEDOUT;
2247                 req->rq_err = 1;
2248                 spin_unlock(&req->rq_lock);
2249                 RETURN(1);
2250         }
2251
2252         /*
2253          * if a request can't be resent we can't wait for an answer after
2254          * the timeout
2255          */
2256         if (ptlrpc_no_resend(req)) {
2257                 DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
2258                 rc = 1;
2259         }
2260
2261         ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
2262
2263         RETURN(rc);
2264 }
2265
2266 /**
2267  * Time out all uncompleted requests in request set pointed by \a data
2268  * Callback used when waiting on sets with l_wait_event.
2269  * Always returns 1.
2270  */
2271 int ptlrpc_expired_set(void *data)
2272 {
2273         struct ptlrpc_request_set *set = data;
2274         struct list_head *tmp;
2275         time64_t now = ktime_get_real_seconds();
2276
2277         ENTRY;
2278         LASSERT(set != NULL);
2279
2280         /*
2281          * A timeout expired. See which reqs it applies to...
2282          */
2283         list_for_each(tmp, &set->set_requests) {
2284                 struct ptlrpc_request *req =
2285                         list_entry(tmp, struct ptlrpc_request,
2286                                    rq_set_chain);
2287
2288                 /* don't expire request waiting for context */
2289                 if (req->rq_wait_ctx)
2290                         continue;
2291
2292                 /* Request in-flight? */
2293                 if (!((req->rq_phase == RQ_PHASE_RPC &&
2294                        !req->rq_waiting && !req->rq_resend) ||
2295                       (req->rq_phase == RQ_PHASE_BULK)))
2296                         continue;
2297
2298                 if (req->rq_timedout ||     /* already dealt with */
2299                     req->rq_deadline > now) /* not expired */
2300                         continue;
2301
2302                 /*
2303                  * Deal with this guy. Do it asynchronously to not block
2304                  * ptlrpcd thread.
2305                  */
2306                 ptlrpc_expire_one_request(req, 1);
2307         }
2308
2309         /*
2310          * When waiting for a whole set, we always break out of the
2311          * sleep so we can recalculate the timeout, or enable interrupts
2312          * if everyone's timed out.
2313          */
2314         RETURN(1);
2315 }
2316
2317 /**
2318  * Sets rq_intr flag in \a req under spinlock.
2319  */
2320 void ptlrpc_mark_interrupted(struct ptlrpc_request *req)
2321 {
2322         spin_lock(&req->rq_lock);
2323         req->rq_intr = 1;
2324         spin_unlock(&req->rq_lock);
2325 }
2326 EXPORT_SYMBOL(ptlrpc_mark_interrupted);
2327
2328 /**
2329  * Interrupts (sets interrupted flag) all uncompleted requests in
2330  * a set \a data. Callback for l_wait_event for interruptible waits.
2331  */
2332 static void ptlrpc_interrupted_set(void *data)
2333 {
2334         struct ptlrpc_request_set *set = data;
2335         struct list_head *tmp;
2336
2337         LASSERT(set != NULL);
2338         CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set);
2339
2340         list_for_each(tmp, &set->set_requests) {
2341                 struct ptlrpc_request *req =
2342                         list_entry(tmp, struct ptlrpc_request, rq_set_chain);
2343
2344                 if (req->rq_intr)
2345                         continue;
2346
2347                 if (req->rq_phase != RQ_PHASE_RPC &&
2348                     req->rq_phase != RQ_PHASE_UNREG_RPC &&
2349                     !req->rq_allow_intr)
2350                         continue;
2351
2352                 ptlrpc_mark_interrupted(req);
2353         }
2354 }
2355
2356 /**
2357  * Get the smallest timeout in the set; this does NOT set a timeout.
2358  */
2359 time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
2360 {
2361         struct list_head *tmp;
2362         time64_t now = ktime_get_real_seconds();
2363         int timeout = 0;
2364         struct ptlrpc_request *req;
2365         time64_t deadline;
2366
2367         ENTRY;
2368         list_for_each(tmp, &set->set_requests) {
2369                 req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
2370
2371                 /* Request in-flight? */
2372                 if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
2373                       (req->rq_phase == RQ_PHASE_BULK) ||
2374                       (req->rq_phase == RQ_PHASE_NEW)))
2375                         continue;
2376
2377                 /* Already timed out. */
2378                 if (req->rq_timedout)
2379                         continue;
2380
2381                 /* Waiting for ctx. */
2382                 if (req->rq_wait_ctx)
2383                         continue;
2384
2385                 if (req->rq_phase == RQ_PHASE_NEW)
2386                         deadline = req->rq_sent;
2387                 else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
2388                         deadline = req->rq_sent;
2389                 else
2390                         deadline = req->rq_sent + req->rq_timeout;
2391
2392                 if (deadline <= now)    /* actually expired already */
2393                         timeout = 1;    /* ASAP */
2394                 else if (timeout == 0 || timeout > deadline - now)
2395                         timeout = deadline - now;
2396         }
2397         RETURN(timeout);
2398 }
2399
2400 /**
2401  * Send all unset request from the set and then wait untill all
2402  * requests in the set complete (either get a reply, timeout, get an
2403  * error or otherwise be interrupted).
2404  * Returns 0 on success or error code otherwise.
2405  */
2406 int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set)
2407 {
2408         struct list_head *tmp;
2409         struct ptlrpc_request *req;
2410         struct l_wait_info lwi;
2411         struct lu_env _env;
2412         time64_t timeout;
2413         int rc;
2414
2415         ENTRY;
2416         if (set->set_producer)
2417                 (void)ptlrpc_set_producer(set);
2418         else
2419                 list_for_each(tmp, &set->set_requests) {
2420                         req = list_entry(tmp, struct ptlrpc_request,
2421                                          rq_set_chain);
2422                         if (req->rq_phase == RQ_PHASE_NEW)
2423                                 (void)ptlrpc_send_new_req(req);
2424                 }
2425
2426         if (list_empty(&set->set_requests))
2427                 RETURN(0);
2428
2429         /*
2430          * ideally we want env provide by the caller all the time,
2431          * but at the moment that would mean a massive change in
2432          * LDLM while benefits would be close to zero, so just
2433          * initialize env here for those rare cases
2434          */
2435         if (!env) {
2436                 /* XXX: skip on the client side? */
2437                 rc = lu_env_init(&_env, LCT_DT_THREAD);
2438                 if (rc)
2439                         RETURN(rc);
2440                 env = &_env;
2441         }
2442
2443         do {
2444                 timeout = ptlrpc_set_next_timeout(set);
2445
2446                 /*
2447                  * wait until all complete, interrupted, or an in-flight
2448                  * req times out
2449                  */
2450                 CDEBUG(D_RPCTRACE, "set %p going to sleep for %lld seconds\n",
2451                        set, timeout);
2452
2453                 if ((timeout == 0 && !signal_pending(current)) ||
2454                     set->set_allow_intr)
2455                         /*
2456                          * No requests are in-flight (ether timed out
2457                          * or delayed), so we can allow interrupts.
2458                          * We still want to block for a limited time,
2459                          * so we allow interrupts during the timeout.
2460                          */
2461                         lwi = LWI_TIMEOUT_INTR_ALL(
2462                                         cfs_time_seconds(timeout ? timeout : 1),
2463                                         ptlrpc_expired_set,
2464                                         ptlrpc_interrupted_set, set);
2465                 else
2466                         /*
2467                          * At least one request is in flight, so no
2468                          * interrupts are allowed. Wait until all
2469                          * complete, or an in-flight req times out.
2470                          */
2471                         lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
2472                                           ptlrpc_expired_set, set);
2473
2474                 rc = l_wait_event(set->set_waitq,
2475                                   ptlrpc_check_set(env, set), &lwi);
2476
2477                 /*
2478                  * LU-769 - if we ignored the signal because it was already
2479                  * pending when we started, we need to handle it now or we risk
2480                  * it being ignored forever
2481                  */
2482                 if (rc == -ETIMEDOUT &&
2483                     (!lwi.lwi_allow_intr || set->set_allow_intr) &&
2484                     signal_pending(current)) {
2485                         sigset_t blocked_sigs =
2486                                            cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
2487
2488                         /*
2489                          * In fact we only interrupt for the "fatal" signals
2490                          * like SIGINT or SIGKILL. We still ignore less
2491                          * important signals since ptlrpc set is not easily
2492                          * reentrant from userspace again
2493                          */
2494                         if (signal_pending(current))
2495                                 ptlrpc_interrupted_set(set);
2496                         cfs_restore_sigs(blocked_sigs);
2497                 }
2498
2499                 LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
2500
2501                 /*
2502                  * -EINTR => all requests have been flagged rq_intr so next
2503                  * check completes.
2504                  * -ETIMEDOUT => someone timed out.  When all reqs have
2505                  * timed out, signals are enabled allowing completion with
2506                  * EINTR.
2507                  * I don't really care if we go once more round the loop in
2508                  * the error cases -eeb.
2509                  */
2510                 if (rc == 0 && atomic_read(&set->set_remaining) == 0) {
2511                         list_for_each(tmp, &set->set_requests) {
2512                                 req = list_entry(tmp, struct ptlrpc_request,
2513                                                  rq_set_chain);
2514                                 spin_lock(&req->rq_lock);
2515                                 req->rq_invalid_rqset = 1;
2516                                 spin_unlock(&req->rq_lock);
2517                         }
2518                 }
2519         } while (rc != 0 || atomic_read(&set->set_remaining) != 0);
2520
2521         LASSERT(atomic_read(&set->set_remaining) == 0);
2522
2523         rc = set->set_rc; /* rq_status of already freed requests if any */
2524         list_for_each(tmp, &set->set_requests) {
2525                 req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
2526
2527                 LASSERT(req->rq_phase == RQ_PHASE_COMPLETE);
2528                 if (req->rq_status != 0)
2529                         rc = req->rq_status;
2530         }
2531
2532         if (env && env == &_env)
2533                 lu_env_fini(&_env);
2534
2535         RETURN(rc);
2536 }
2537 EXPORT_SYMBOL(ptlrpc_set_wait);
2538
2539 /**
2540  * Helper fuction for request freeing.
2541  * Called when request count reached zero and request needs to be freed.
2542  * Removes request from all sorts of sending/replay lists it might be on,
2543  * frees network buffers if any are present.
2544  * If \a locked is set, that means caller is already holding import imp_lock
2545  * and so we no longer need to reobtain it (for certain lists manipulations)
2546  */
2547 static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
2548 {
2549         ENTRY;
2550
2551         if (!request)
2552                 RETURN_EXIT;
2553
2554         LASSERT(!request->rq_srv_req);
2555         LASSERT(request->rq_export == NULL);
2556         LASSERTF(!request->rq_receiving_reply, "req %p\n", request);
2557         LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
2558         LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
2559         LASSERTF(!request->rq_replay, "req %p\n", request);
2560
2561         req_capsule_fini(&request->rq_pill);
2562
2563         /*
2564          * We must take it off the imp_replay_list first.  Otherwise, we'll set
2565          * request->rq_reqmsg to NULL while osc_close is dereferencing it.
2566          */
2567         if (request->rq_import) {
2568                 if (!locked)
2569                         spin_lock(&request->rq_import->imp_lock);
2570                 list_del_init(&request->rq_replay_list);
2571                 list_del_init(&request->rq_unreplied_list);
2572                 if (!locked)
2573                         spin_unlock(&request->rq_import->imp_lock);
2574         }
2575         LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request);
2576
2577         if (atomic_read(&request->rq_refcount) != 0) {
2578                 DEBUG_REQ(D_ERROR, request,
2579                           "freeing request with nonzero refcount");
2580                 LBUG();
2581         }
2582
2583         if (request->rq_repbuf)
2584                 sptlrpc_cli_free_repbuf(request);
2585
2586         if (request->rq_import) {
2587                 class_import_put(request->rq_import);
2588                 request->rq_import = NULL;
2589         }
2590         if (request->rq_bulk)
2591                 ptlrpc_free_bulk(request->rq_bulk);
2592
2593         if (request->rq_reqbuf || request->rq_clrbuf)
2594                 sptlrpc_cli_free_reqbuf(request);
2595
2596         if (request->rq_cli_ctx)
2597                 sptlrpc_req_put_ctx(request, !locked);
2598
2599         if (request->rq_pool)
2600                 __ptlrpc_free_req_to_pool(request);
2601         else
2602                 ptlrpc_request_cache_free(request);
2603         EXIT;
2604 }
2605
2606 static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
2607 /**
2608  * Drop one request reference. Must be called with import imp_lock held.
2609  * When reference count drops to zero, request is freed.
2610  */
2611 void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
2612 {
2613         assert_spin_locked(&request->rq_import->imp_lock);
2614         (void)__ptlrpc_req_finished(request, 1);
2615 }
2616
2617 /**
2618  * Helper function
2619  * Drops one reference count for request \a request.
2620  * \a locked set indicates that caller holds import imp_lock.
2621  * Frees the request whe reference count reaches zero.
2622  *
2623  * \retval 1    the request is freed
2624  * \retval 0    some others still hold references on the request
2625  */
2626 static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
2627 {
2628         int count;
2629
2630         ENTRY;
2631         if (!request)
2632                 RETURN(1);
2633
2634         LASSERT(request != LP_POISON);
2635         LASSERT(request->rq_reqmsg != LP_POISON);
2636
2637         DEBUG_REQ(D_INFO, request, "refcount now %u",
2638                   atomic_read(&request->rq_refcount) - 1);
2639
2640         spin_lock(&request->rq_lock);
2641         count = atomic_dec_return(&request->rq_refcount);
2642         LASSERTF(count >= 0, "Invalid ref count %d\n", count);
2643
2644         /*
2645          * For open RPC, the client does not know the EA size (LOV, ACL, and
2646          * so on) before replied, then the client has to reserve very large
2647          * reply buffer. Such buffer will not be released until the RPC freed.
2648          * Since The open RPC is replayable, we need to keep it in the replay
2649          * list until close. If there are a lot of files opened concurrently,
2650          * then the client may be OOM.
2651          *
2652          * If fact, it is unnecessary to keep reply buffer for open replay,
2653          * related EAs have already been saved via mdc_save_lovea() before
2654          * coming here. So it is safe to free the reply buffer some earlier
2655          * before releasing the RPC to avoid client OOM. LU-9514
2656          */
2657         if (count == 1 && request->rq_early_free_repbuf && request->rq_repbuf) {
2658                 spin_lock(&request->rq_early_free_lock);
2659                 sptlrpc_cli_free_repbuf(request);
2660                 request->rq_repbuf = NULL;
2661                 request->rq_repbuf_len = 0;
2662                 request->rq_repdata = NULL;
2663                 request->rq_reqdata_len = 0;
2664                 spin_unlock(&request->rq_early_free_lock);
2665         }
2666         spin_unlock(&request->rq_lock);
2667
2668         if (!count)
2669                 __ptlrpc_free_req(request, locked);
2670
2671         RETURN(!count);
2672 }
2673
/**
 * Drop one reference on \a request.
 *
 * Variant for callers that do not hold the import's imp_lock; whether
 * the request was actually freed is not reported.
 */
void ptlrpc_req_finished(struct ptlrpc_request *request)
{
        (void)__ptlrpc_req_finished(request, 0);
}
EXPORT_SYMBOL(ptlrpc_req_finished);
2682
2683 /**
2684  * Returns xid of a \a request
2685  */
2686 __u64 ptlrpc_req_xid(struct ptlrpc_request *request)
2687 {
2688         return request->rq_xid;
2689 }
2690 EXPORT_SYMBOL(ptlrpc_req_xid);
2691
2692 /**
2693  * Disengage the client's reply buffer from the network
2694  * NB does _NOT_ unregister any client-side bulk.
2695  * IDEMPOTENT, but _not_ safe against concurrent callers.
2696  * The request owner (i.e. the thread doing the I/O) must call...
2697  * Returns 0 on success or 1 if unregistering cannot be made.
2698  */
2699 static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
2700 {
2701         int rc;
2702         struct l_wait_info lwi;
2703
2704         /*
2705          * Might sleep.
2706          */
2707         LASSERT(!in_interrupt());
2708
2709         /* Let's setup deadline for reply unlink. */
2710         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
2711             async && request->rq_reply_deadline == 0 && cfs_fail_val == 0)
2712                 request->rq_reply_deadline = ktime_get_real_seconds() +
2713                                              LONG_UNLINK;
2714
2715         /*
2716          * Nothing left to do.
2717          */
2718         if (!ptlrpc_client_recv_or_unlink(request))
2719                 RETURN(1);
2720
2721         LNetMDUnlink(request->rq_reply_md_h);
2722
2723         /*
2724          * Let's check it once again.
2725          */
2726         if (!ptlrpc_client_recv_or_unlink(request))
2727                 RETURN(1);
2728
2729         /* Move to "Unregistering" phase as reply was not unlinked yet. */
2730         ptlrpc_rqphase_move(request, RQ_PHASE_UNREG_RPC);
2731
2732         /*
2733          * Do not wait for unlink to finish.
2734          */
2735         if (async)
2736                 RETURN(0);
2737
2738         /*
2739          * We have to l_wait_event() whatever the result, to give liblustre
2740          * a chance to run reply_in_callback(), and to make sure we've
2741          * unlinked before returning a req to the pool.
2742          */
2743         for (;;) {
2744                 /* The wq argument is ignored by user-space wait_event macros */
2745                 wait_queue_head_t *wq = (request->rq_set) ?
2746                                         &request->rq_set->set_waitq :
2747                                         &request->rq_reply_waitq;
2748                 /*
2749                  * Network access will complete in finite time but the HUGE
2750                  * timeout lets us CWARN for visibility of sluggish NALs
2751                  */
2752                 lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
2753                                            cfs_time_seconds(1), NULL, NULL);
2754                 rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
2755                                   &lwi);
2756                 if (rc == 0) {
2757                         ptlrpc_rqphase_move(request, request->rq_next_phase);
2758                         RETURN(1);
2759                 }
2760
2761                 LASSERT(rc == -ETIMEDOUT);
2762                 DEBUG_REQ(D_WARNING, request,
2763                           "Unexpectedly long timeout receiving_reply=%d req_ulinked=%d reply_unlinked=%d",
2764                           request->rq_receiving_reply,
2765                           request->rq_req_unlinked,
2766                           request->rq_reply_unlinked);
2767         }
2768         RETURN(0);
2769 }
2770
2771 static void ptlrpc_free_request(struct ptlrpc_request *req)
2772 {
2773         spin_lock(&req->rq_lock);
2774         req->rq_replay = 0;
2775         spin_unlock(&req->rq_lock);
2776
2777         if (req->rq_commit_cb)
2778                 req->rq_commit_cb(req);
2779         list_del_init(&req->rq_replay_list);
2780
2781         __ptlrpc_req_finished(req, 1);
2782 }
2783
2784 /**
2785  * the request is committed and dropped from the replay list of its import
2786  */
2787 void ptlrpc_request_committed(struct ptlrpc_request *req, int force)
2788 {
2789         struct obd_import *imp = req->rq_import;
2790
2791         spin_lock(&imp->imp_lock);
2792         if (list_empty(&req->rq_replay_list)) {
2793                 spin_unlock(&imp->imp_lock);
2794                 return;
2795         }
2796
2797         if (force || req->rq_transno <= imp->imp_peer_committed_transno) {
2798                 if (imp->imp_replay_cursor == &req->rq_replay_list)
2799                         imp->imp_replay_cursor = req->rq_replay_list.next;
2800                 ptlrpc_free_request(req);
2801         }
2802
2803         spin_unlock(&imp->imp_lock);
2804 }
2805 EXPORT_SYMBOL(ptlrpc_request_committed);
2806
2807 /**
2808  * Iterates through replay_list on import and prunes
2809  * all requests have transno smaller than last_committed for the
2810  * import and don't have rq_replay set.
2811  * Since requests are sorted in transno order, stops when meetign first
2812  * transno bigger than last_committed.
2813  * caller must hold imp->imp_lock
2814  */
2815 void ptlrpc_free_committed(struct obd_import *imp)
2816 {
2817         struct ptlrpc_request *req, *saved;
2818         struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
2819         bool skip_committed_list = true;
2820
2821         ENTRY;
2822         LASSERT(imp != NULL);
2823         assert_spin_locked(&imp->imp_lock);
2824
2825         if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
2826             imp->imp_generation == imp->imp_last_generation_checked) {
2827                 CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n",
2828                        imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
2829                 RETURN_EXIT;
2830         }
2831         CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n",
2832                imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
2833                imp->imp_generation);
2834
2835         if (imp->imp_generation != imp->imp_last_generation_checked ||
2836             imp->imp_last_transno_checked == 0)
2837                 skip_committed_list = false;
2838
2839         imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
2840         imp->imp_last_generation_checked = imp->imp_generation;
2841
2842         list_for_each_entry_safe(req, saved, &imp->imp_replay_list,
2843                                  rq_replay_list) {
2844                 /* XXX ok to remove when 1357 resolved - rread 05/29/03  */
2845                 LASSERT(req != last_req);
2846                 last_req = req;
2847
2848                 if (req->rq_transno == 0) {
2849                         DEBUG_REQ(D_EMERG, req, "zero transno during replay");
2850                         LBUG();
2851                 }
2852                 if (req->rq_import_generation < imp->imp_generation) {
2853                         DEBUG_REQ(D_RPCTRACE, req, "free request with old gen");
2854                         GOTO(free_req, 0);
2855                 }
2856
2857                 /* not yet committed */
2858                 if (req->rq_transno > imp->imp_peer_committed_transno) {
2859                         DEBUG_REQ(D_RPCTRACE, req, "stopping search");
2860                         break;
2861                 }
2862
2863                 if (req->rq_replay) {
2864                         DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
2865                         list_move_tail(&req->rq_replay_list,
2866                                        &imp->imp_committed_list);
2867                         continue;
2868                 }
2869
2870                 DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)",
2871                           imp->imp_peer_committed_transno);
2872 free_req:
2873                 ptlrpc_free_request(req);
2874         }
2875
2876         if (skip_committed_list)
2877                 GOTO(out, 0);
2878
2879         list_for_each_entry_safe(req, saved, &imp->imp_committed_list,
2880                                  rq_replay_list) {
2881                 LASSERT(req->rq_transno != 0);
2882                 if (req->rq_import_generation < imp->imp_generation ||
2883                     !req->rq_replay) {
2884                         DEBUG_REQ(D_RPCTRACE, req, "free %s open request",
2885                                   req->rq_import_generation <
2886                                   imp->imp_generation ? "stale" : "closed");
2887
2888                         if (imp->imp_replay_cursor == &req->rq_replay_list)
2889                                 imp->imp_replay_cursor =
2890                                         req->rq_replay_list.next;
2891
2892                         ptlrpc_free_request(req);
2893                 }
2894         }
2895 out:
2896         EXIT;
2897 }
2898
2899 void ptlrpc_cleanup_client(struct obd_import *imp)
2900 {
2901         ENTRY;
2902         EXIT;
2903 }
2904
2905 /**
2906  * Schedule previously sent request for resend.
2907  * For bulk requests we assign new xid (to avoid problems with
2908  * lost replies and therefore several transfers landing into same buffer
2909  * from different sending attempts).
2910  */
2911 void ptlrpc_resend_req(struct ptlrpc_request *req)
2912 {
2913         DEBUG_REQ(D_HA, req, "going to resend");
2914         spin_lock(&req->rq_lock);
2915
2916         /*
2917          * Request got reply but linked to the import list still.
2918          * Let ptlrpc_check_set() process it.
2919          */
2920         if (ptlrpc_client_replied(req)) {
2921                 spin_unlock(&req->rq_lock);
2922                 DEBUG_REQ(D_HA, req, "it has reply, so skip it");
2923                 return;
2924         }
2925
2926         req->rq_status = -EAGAIN;
2927
2928         req->rq_resend = 1;
2929         req->rq_net_err = 0;
2930         req->rq_timedout = 0;
2931
2932         ptlrpc_client_wake_req(req);
2933         spin_unlock(&req->rq_lock);
2934 }
2935
2936 /* XXX: this function and rq_status are currently unused */
2937 void ptlrpc_restart_req(struct ptlrpc_request *req)
2938 {
2939         DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
2940         req->rq_status = -ERESTARTSYS;
2941
2942         spin_lock(&req->rq_lock);
2943         req->rq_restart = 1;
2944         req->rq_timedout = 0;
2945         ptlrpc_client_wake_req(req);
2946         spin_unlock(&req->rq_lock);
2947 }
2948
2949 /**
2950  * Grab additional reference on a request \a req
2951  */
2952 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
2953 {
2954         ENTRY;
2955         atomic_inc(&req->rq_refcount);
2956         RETURN(req);
2957 }
2958 EXPORT_SYMBOL(ptlrpc_request_addref);
2959
2960 /**
2961  * Add a request to import replay_list.
2962  * Must be called under imp_lock
2963  */
2964 void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
2965                                       struct obd_import *imp)
2966 {
2967         struct list_head *tmp;
2968
2969         assert_spin_locked(&imp->imp_lock);
2970
2971         if (req->rq_transno == 0) {
2972                 DEBUG_REQ(D_EMERG, req, "saving request with zero transno");
2973                 LBUG();
2974         }
2975
2976         /*
2977          * clear this for new requests that were resent as well
2978          * as resent replayed requests.
2979          */
2980         lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
2981
2982         /* don't re-add requests that have been replayed */
2983         if (!list_empty(&req->rq_replay_list))
2984                 return;
2985
2986         lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
2987
2988         spin_lock(&req->rq_lock);
2989         req->rq_resend = 0;
2990         spin_unlock(&req->rq_lock);
2991
2992         LASSERT(imp->imp_replayable);
2993         /* Balanced in ptlrpc_free_committed, usually. */
2994         ptlrpc_request_addref(req);
2995         list_for_each_prev(tmp, &imp->imp_replay_list) {
2996                 struct ptlrpc_request *iter = list_entry(tmp,
2997                                                          struct ptlrpc_request,
2998                                                          rq_replay_list);
2999
3000                 /*
3001                  * We may have duplicate transnos if we create and then
3002                  * open a file, or for closes retained if to match creating
3003                  * opens, so use req->rq_xid as a secondary key.
3004                  * (See bugs 684, 685, and 428.)
3005                  * XXX no longer needed, but all opens need transnos!
3006                  */
3007                 if (iter->rq_transno > req->rq_transno)
3008                         continue;
3009
3010                 if (iter->rq_transno == req->rq_transno) {
3011                         LASSERT(iter->rq_xid != req->rq_xid);
3012                         if (iter->rq_xid > req->rq_xid)
3013                                 continue;
3014                 }
3015
3016                 list_add(&req->rq_replay_list, &iter->rq_replay_list);
3017                 return;
3018         }
3019
3020         list_add(&req->rq_replay_list, &imp->imp_replay_list);
3021 }
3022
3023 /**
3024  * Send request and wait until it completes.
3025  * Returns request processing status.
3026  */
3027 int ptlrpc_queue_wait(struct ptlrpc_request *req)
3028 {
3029         struct ptlrpc_request_set *set;
3030         int rc;
3031
3032         ENTRY;
3033         LASSERT(req->rq_set == NULL);
3034         LASSERT(!req->rq_receiving_reply);
3035
3036         set = ptlrpc_prep_set();
3037         if (!set) {
3038                 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
3039                 RETURN(-ENOMEM);
3040         }
3041
3042         /* for distributed debugging */
3043         lustre_msg_set_status(req->rq_reqmsg, current_pid());
3044
3045         /* add a ref for the set (see comment in ptlrpc_set_add_req) */
3046         ptlrpc_request_addref(req);
3047         ptlrpc_set_add_req(set, req);
3048         rc = ptlrpc_set_wait(NULL, set);
3049         ptlrpc_set_destroy(set);
3050
3051         RETURN(rc);
3052 }
3053 EXPORT_SYMBOL(ptlrpc_queue_wait);
3054
3055 /**
3056  * Callback used for replayed requests reply processing.
3057  * In case of successful reply calls registered request replay callback.
3058  * In case of error restart replay process.
3059  */
3060 static int ptlrpc_replay_interpret(const struct lu_env *env,
3061                                    struct ptlrpc_request *req,
3062                                    void *args, int rc)
3063 {
3064         struct ptlrpc_replay_async_args *aa = args;
3065         struct obd_import *imp = req->rq_import;
3066
3067         ENTRY;
3068         atomic_dec(&imp->imp_replay_inflight);
3069
3070         /*
3071          * Note: if it is bulk replay (MDS-MDS replay), then even if
3072          * server got the request, but bulk transfer timeout, let's
3073          * replay the bulk req again
3074          */
3075         if (!ptlrpc_client_replied(req) ||
3076             (req->rq_bulk &&
3077              lustre_msg_get_status(req->rq_repmsg) == -ETIMEDOUT)) {
3078                 DEBUG_REQ(D_ERROR, req, "request replay timed out.\n");
3079                 GOTO(out, rc = -ETIMEDOUT);
3080         }
3081
3082         if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
3083             (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
3084             lustre_msg_get_status(req->rq_repmsg) == -ENODEV))
3085                 GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg));
3086
3087         /** VBR: check version failure */
3088         if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
3089                 /** replay was failed due to version mismatch */
3090                 DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n");
3091                 spin_lock(&imp->imp_lock);
3092                 imp->imp_vbr_failed = 1;
3093                 spin_unlock(&imp->imp_lock);
3094                 lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
3095         } else {
3096                 /** The transno had better not change over replay. */
3097                 LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) ==
3098                          lustre_msg_get_transno(req->rq_repmsg) ||
3099                          lustre_msg_get_transno(req->rq_repmsg) == 0,
3100                          "%#llx/%#llx\n",
3101                          lustre_msg_get_transno(req->rq_reqmsg),
3102                          lustre_msg_get_transno(req->rq_repmsg));
3103         }
3104
3105         spin_lock(&imp->imp_lock);
3106         imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
3107         spin_unlock(&imp->imp_lock);
3108         LASSERT(imp->imp_last_replay_transno);
3109
3110         /* transaction number shouldn't be bigger than the latest replayed */
3111         if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) {
3112                 DEBUG_REQ(D_ERROR, req,
3113                           "Reported transno %llu is bigger than the replayed one: %llu",
3114                           req->rq_transno,
3115                           lustre_msg_get_transno(req->rq_reqmsg));
3116                 GOTO(out, rc = -EINVAL);
3117         }
3118
3119         DEBUG_REQ(D_HA, req, "got rep");
3120
3121         /* let the callback do fixups, possibly including in the request */
3122         if (req->rq_replay_cb)
3123                 req->rq_replay_cb(req);
3124
3125         if (ptlrpc_client_replied(req) &&
3126             lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) {
3127                 DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
3128                           lustre_msg_get_status(req->rq_repmsg),
3129                           aa->praa_old_status);
3130
3131                 /*
3132                  * Note: If the replay fails for MDT-MDT recovery, let's
3133                  * abort all of the following requests in the replay
3134                  * and sending list, because MDT-MDT update requests
3135                  * are dependent on each other, see LU-7039
3136                  */
3137                 if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) {
3138                         struct ptlrpc_request *free_req;
3139                         struct ptlrpc_request *tmp;
3140
3141                         spin_lock(&imp->imp_lock);
3142                         list_for_each_entry_safe(free_req, tmp,
3143                                                  &imp->imp_replay_list,
3144                                                  rq_replay_list) {
3145                                 ptlrpc_free_request(free_req);
3146                         }
3147
3148                         list_for_each_entry_safe(free_req, tmp,
3149                                                  &imp->imp_committed_list,
3150                                                  rq_replay_list) {
3151                                 ptlrpc_free_request(free_req);
3152                         }
3153
3154                         list_for_each_entry_safe(free_req, tmp,
3155                                                  &imp->imp_delayed_list,
3156                                                  rq_list) {
3157                                 spin_lock(&free_req->rq_lock);
3158                                 free_req->rq_err = 1;
3159                                 free_req->rq_status = -EIO;
3160                                 ptlrpc_client_wake_req(free_req);
3161                                 spin_unlock(&free_req->rq_lock);
3162                         }
3163
3164                         list_for_each_entry_safe(free_req, tmp,
3165                                                  &imp->imp_sending_list,
3166                                                  rq_list) {
3167                                 spin_lock(&free_req->rq_lock);
3168                                 free_req->rq_err = 1;
3169                                 free_req->rq_status = -EIO;
3170                                 ptlrpc_client_wake_req(free_req);
3171                                 spin_unlock(&free_req->rq_lock);
3172                         }
3173                         spin_unlock(&imp->imp_lock);
3174                 }
3175         } else {
3176                 /* Put it back for re-replay. */
3177                 lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
3178         }
3179
3180         /*
3181          * Errors while replay can set transno to 0, but
3182          * imp_last_replay_transno shouldn't be set to 0 anyway
3183          */
3184         if (req->rq_transno == 0)
3185                 CERROR("Transno is 0 during replay!\n");
3186
3187         /* continue with recovery */
3188         rc = ptlrpc_import_recovery_state_machine(imp);
3189  out:
3190         req->rq_send_state = aa->praa_old_state;
3191
3192         if (rc != 0)
3193                 /* this replay failed, so restart recovery */
3194                 ptlrpc_connect_import(imp);
3195
3196         RETURN(rc);
3197 }
3198
3199 /**
3200  * Prepares and queues request for replay.
3201  * Adds it to ptlrpcd queue for actual sending.
3202  * Returns 0 on success.
3203  */
3204 int ptlrpc_replay_req(struct ptlrpc_request *req)
3205 {
3206         struct ptlrpc_replay_async_args *aa;
3207
3208         ENTRY;
3209
3210         LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
3211
3212         CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
3213         aa = ptlrpc_req_async_args(req);
3214         memset(aa, 0, sizeof(*aa));
3215
3216         /* Prepare request to be resent with ptlrpcd */
3217         aa->praa_old_state = req->rq_send_state;
3218         req->rq_send_state = LUSTRE_IMP_REPLAY;
3219         req->rq_phase = RQ_PHASE_NEW;
3220         req->rq_next_phase = RQ_PHASE_UNDEFINED;
3221         if (req->rq_repmsg)
3222                 aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
3223         req->rq_status = 0;
3224         req->rq_interpret_reply = ptlrpc_replay_interpret;
3225         /* Readjust the timeout for current conditions */
3226         ptlrpc_at_set_req_timeout(req);
3227
3228         /* Tell server net_latency to calculate how long to wait for reply. */
3229         lustre_msg_set_service_time(req->rq_reqmsg,
3230                                     ptlrpc_at_get_net_latency(req));
3231         DEBUG_REQ(D_HA, req, "REPLAY");
3232
3233         atomic_inc(&req->rq_import->imp_replay_inflight);
3234         spin_lock(&req->rq_lock);
3235         req->rq_early_free_repbuf = 0;
3236         spin_unlock(&req->rq_lock);
3237         ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
3238
3239         ptlrpcd_add_req(req);
3240         RETURN(0);
3241 }
3242
3243 /**
3244  * Aborts all in-flight request on import \a imp sending and delayed lists
3245  */
3246 void ptlrpc_abort_inflight(struct obd_import *imp)
3247 {
3248         struct list_head *tmp, *n;
3249
3250         ENTRY;
3251         /*
3252          * Make sure that no new requests get processed for this import.
3253          * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
3254          * this flag and then putting requests on sending_list or delayed_list.
3255          */
3256         spin_lock(&imp->imp_lock);
3257
3258         /*
3259          * XXX locking?  Maybe we should remove each request with the list
3260          * locked?  Also, how do we know if the requests on the list are
3261          * being freed at this time?
3262          */
3263         list_for_each_safe(tmp, n, &imp->imp_sending_list) {
3264                 struct ptlrpc_request *req = list_entry(tmp,
3265                                                         struct ptlrpc_request,
3266                                                         rq_list);
3267
3268                 DEBUG_REQ(D_RPCTRACE, req, "inflight");
3269
3270                 spin_lock(&req->rq_lock);
3271                 if (req->rq_import_generation < imp->imp_generation) {
3272                         req->rq_err = 1;
3273                         req->rq_status = -EIO;
3274                         ptlrpc_client_wake_req(req);
3275                 }
3276                 spin_unlock(&req->rq_lock);
3277         }
3278
3279         list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
3280                 struct ptlrpc_request *req =
3281                         list_entry(tmp, struct ptlrpc_request, rq_list);
3282
3283                 DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req");
3284
3285                 spin_lock(&req->rq_lock);
3286                 if (req->rq_import_generation < imp->imp_generation) {
3287                         req->rq_err = 1;
3288                         req->rq_status = -EIO;
3289                         ptlrpc_client_wake_req(req);
3290                 }
3291                 spin_unlock(&req->rq_lock);
3292         }
3293
3294         /*
3295          * Last chance to free reqs left on the replay list, but we
3296          * will still leak reqs that haven't committed.
3297          */
3298         if (imp->imp_replayable)
3299                 ptlrpc_free_committed(imp);
3300
3301         spin_unlock(&imp->imp_lock);
3302
3303         EXIT;
3304 }
3305
3306 /**
3307  * Abort all uncompleted requests in request set \a set
3308  */
3309 void ptlrpc_abort_set(struct ptlrpc_request_set *set)
3310 {
3311         struct list_head *tmp, *pos;
3312
3313         LASSERT(set != NULL);
3314
3315         list_for_each_safe(pos, tmp, &set->set_requests) {
3316                 struct ptlrpc_request *req =
3317                         list_entry(pos, struct ptlrpc_request,
3318                                    rq_set_chain);
3319
3320                 spin_lock(&req->rq_lock);
3321                 if (req->rq_phase != RQ_PHASE_RPC) {
3322                         spin_unlock(&req->rq_lock);
3323                         continue;
3324                 }
3325
3326                 req->rq_err = 1;
3327                 req->rq_status = -EINTR;
3328                 ptlrpc_client_wake_req(req);
3329                 spin_unlock(&req->rq_lock);
3330         }
3331 }
3332
3333 /**
3334  * Initialize the XID for the node.  This is common among all requests on
3335  * this node, and only requires the property that it is monotonically
3336  * increasing.  It does not need to be sequential.  Since this is also used
3337  * as the RDMA match bits, it is important that a single client NOT have
3338  * the same match bits for two different in-flight requests, hence we do
3339  * NOT want to have an XID per target or similar.
3340  *
3341  * To avoid an unlikely collision between match bits after a client reboot
3342  * (which would deliver old data into the wrong RDMA buffer) initialize
3343  * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
3344  * If the time is clearly incorrect, we instead use a 62-bit random number.
3345  * In the worst case the random number will overflow 1M RPCs per second in
3346  * 9133 years, or permutations thereof.
3347  */
3348 #define YEAR_2004 (1ULL << 30)
3349 void ptlrpc_init_xid(void)
3350 {
3351         time64_t now = ktime_get_real_seconds();
3352
3353         spin_lock_init(&ptlrpc_last_xid_lock);
3354         if (now < YEAR_2004) {
3355                 cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
3356                 ptlrpc_last_xid >>= 2;
3357                 ptlrpc_last_xid |= (1ULL << 61);
3358         } else {
3359                 ptlrpc_last_xid = (__u64)now << 20;
3360         }
3361
3362         /* Need to always be aligned to a power-of-two for mutli-bulk BRW */
3363         CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0);
3364         ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK;
3365 }
3366
3367 /**
3368  * Increase xid and returns resulting new value to the caller.
3369  *
3370  * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
3371  * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
3372  * itself uses the last bulk xid needed, so the server can determine the
3373  * the number of bulk transfers from the RPC XID and a bitmask.  The starting
3374  * xid must align to a power-of-two value.
3375  *
3376  * This is assumed to be true due to the initial ptlrpc_last_xid
3377  * value also being initialized to a power-of-two value. LU-1431
3378  */
3379 __u64 ptlrpc_next_xid(void)
3380 {
3381         __u64 next;
3382
3383         spin_lock(&ptlrpc_last_xid_lock);
3384         next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
3385         ptlrpc_last_xid = next;
3386         spin_unlock(&ptlrpc_last_xid_lock);
3387
3388         return next;
3389 }
3390
3391 /**
3392  * If request has a new allocated XID (new request or EINPROGRESS resend),
3393  * use this XID as matchbits of bulk, otherwise allocate a new matchbits for
3394  * request to ensure previous bulk fails and avoid problems with lost replies
3395  * and therefore several transfers landing into the same buffer from different
3396  * sending attempts.
3397  */
3398 void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req)
3399 {
3400         struct ptlrpc_bulk_desc *bd = req->rq_bulk;
3401
3402         LASSERT(bd != NULL);
3403
3404         /*
3405          * Generate new matchbits for all resend requests, including
3406          * resend replay.
3407          */
3408         if (req->rq_resend) {
3409                 __u64 old_mbits = req->rq_mbits;
3410
3411                 /*
3412                  * First time resend on -EINPROGRESS will generate new xid,
3413                  * so we can actually use the rq_xid as rq_mbits in such case,
3414                  * however, it's bit hard to distinguish such resend with a
3415                  * 'resend for the -EINPROGRESS resend'. To make it simple,
3416                  * we opt to generate mbits for all resend cases.
3417                  */
3418                 if (OCD_HAS_FLAG(&bd->bd_import->imp_connect_data,
3419                                  BULK_MBITS)) {
3420                         req->rq_mbits = ptlrpc_next_xid();
3421                 } else {
3422                         /*
3423                          * Old version transfers rq_xid to peer as
3424                          * matchbits.
3425                          */
3426                         spin_lock(&req->rq_import->imp_lock);
3427                         list_del_init(&req->rq_unreplied_list);
3428                         ptlrpc_assign_next_xid_nolock(req);
3429                         spin_unlock(&req->rq_import->imp_lock);
3430                         req->rq_mbits = req->rq_xid;
3431                 }
3432                 CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
3433                        old_mbits, req->rq_mbits);
3434         } else if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
3435                 /* Request being sent first time, use xid as matchbits. */
3436                 if (OCD_HAS_FLAG(&bd->bd_import->imp_connect_data, BULK_MBITS)
3437                     || req->rq_mbits == 0) {
3438                         req->rq_mbits = req->rq_xid;
3439                 } else {
3440                         int total_md = (bd->bd_iov_count + LNET_MAX_IOV - 1) /
3441                                         LNET_MAX_IOV;
3442                         req->rq_mbits -= total_md - 1;
3443                 }
3444         } else {
3445                 /*
3446                  * Replay request, xid and matchbits have already been
3447                  * correctly assigned.
3448                  */
3449                 return;
3450         }
3451
3452         /*
3453          * For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so
3454          * that server can infer the number of bulks that were prepared,
3455          * see LU-1431
3456          */
3457         req->rq_mbits += ((bd->bd_iov_count + LNET_MAX_IOV - 1) /
3458                           LNET_MAX_IOV) - 1;
3459
3460         /*
3461          * Set rq_xid as rq_mbits to indicate the final bulk for the old
3462          * server which does not support OBD_CONNECT_BULK_MBITS. LU-6808.
3463          *
3464          * It's ok to directly set the rq_xid here, since this xid bump
3465          * won't affect the request position in unreplied list.
3466          */
3467         if (!OCD_HAS_FLAG(&bd->bd_import->imp_connect_data, BULK_MBITS))
3468                 req->rq_xid = req->rq_mbits;
3469 }
3470
3471 /**
3472  * Get a glimpse at what next xid value might have been.
3473  * Returns possible next xid.
3474  */
3475 __u64 ptlrpc_sample_next_xid(void)
3476 {
3477 #if BITS_PER_LONG == 32
3478         /* need to avoid possible word tearing on 32-bit systems */
3479         __u64 next;
3480
3481         spin_lock(&ptlrpc_last_xid_lock);
3482         next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
3483         spin_unlock(&ptlrpc_last_xid_lock);
3484
3485         return next;
3486 #else
3487         /* No need to lock, since returned value is racy anyways */
3488         return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
3489 #endif
3490 }
3491 EXPORT_SYMBOL(ptlrpc_sample_next_xid);
3492
/**
 * Functions for operating ptlrpc workers.
 *
 * A ptlrpc work is a function which runs inside ptlrpcd context.
 * The callback must not sleep, otherwise it will block that ptlrpcd thread.
 *
 * 1. After a work is created it can be queued many times, that is:
 *         handler = ptlrpcd_alloc_work();
 *         ptlrpcd_queue_work();
 *
 *    queue it again when necessary:
 *         ptlrpcd_queue_work();
 *
 *    and destroy it once it is no longer needed:
 *         ptlrpcd_destroy_work();
 * 2. ptlrpcd_queue_work() can be called by multiple processes concurrently,
 *    but the work will only be queued once at any time. Also, as its name
 *    implies, there may be a delay before it is really run by a ptlrpcd
 *    thread.
 */
struct ptlrpc_work_async_args {
        /* work callback, invoked from work_interpreter() */
        int (*cb)(const struct lu_env *env, void *cbdata);
        /* opaque pointer passed back to \a cb as its second argument */
        void *cbdata;
};
3514
3515 static void ptlrpcd_add_work_req(struct ptlrpc_request *req)
3516 {
3517         /* re-initialize the req */
3518         req->rq_timeout         = obd_timeout;
3519         req->rq_sent            = ktime_get_real_seconds();
3520         req->rq_deadline        = req->rq_sent + req->rq_timeout;
3521         req->rq_phase           = RQ_PHASE_INTERPRET;
3522         req->rq_next_phase      = RQ_PHASE_COMPLETE;
3523         req->rq_xid             = ptlrpc_next_xid();
3524         req->rq_import_generation = req->rq_import->imp_generation;
3525
3526         ptlrpcd_add_req(req);
3527 }
3528
3529 static int work_interpreter(const struct lu_env *env,
3530                             struct ptlrpc_request *req, void *args, int rc)
3531 {
3532         struct ptlrpc_work_async_args *arg = args;
3533
3534         LASSERT(ptlrpcd_check_work(req));
3535         LASSERT(arg->cb != NULL);
3536
3537         rc = arg->cb(env, arg->cbdata);
3538
3539         list_del_init(&req->rq_set_chain);
3540         req->rq_set = NULL;
3541
3542         if (atomic_dec_return(&req->rq_refcount) > 1) {
3543                 atomic_set(&req->rq_refcount, 2);
3544                 ptlrpcd_add_work_req(req);
3545         }
3546         return rc;
3547 }
3548
3549 static int worker_format;
3550
3551 static int ptlrpcd_check_work(struct ptlrpc_request *req)
3552 {
3553         return req->rq_pill.rc_fmt == (void *)&worker_format;
3554 }
3555
3556 /**
3557  * Create a work for ptlrpc.
3558  */
3559 void *ptlrpcd_alloc_work(struct obd_import *imp,
3560                          int (*cb)(const struct lu_env *, void *), void *cbdata)
3561 {
3562         struct ptlrpc_request *req = NULL;
3563         struct ptlrpc_work_async_args *args;
3564
3565         ENTRY;
3566         might_sleep();
3567
3568         if (!cb)
3569                 RETURN(ERR_PTR(-EINVAL));
3570
3571         /* copy some code from deprecated fakereq. */
3572         req = ptlrpc_request_cache_alloc(GFP_NOFS);
3573         if (!req) {
3574                 CERROR("ptlrpc: run out of memory!\n");
3575                 RETURN(ERR_PTR(-ENOMEM));
3576         }
3577
3578         ptlrpc_cli_req_init(req);
3579
3580         req->rq_send_state = LUSTRE_IMP_FULL;
3581         req->rq_type = PTL_RPC_MSG_REQUEST;
3582         req->rq_import = class_import_get(imp);
3583         req->rq_interpret_reply = work_interpreter;
3584         /* don't want reply */
3585         req->rq_no_delay = req->rq_no_resend = 1;
3586         req->rq_pill.rc_fmt = (void *)&worker_format;
3587
3588         CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
3589         args = ptlrpc_req_async_args(req);
3590         args->cb     = cb;
3591         args->cbdata = cbdata;
3592
3593         RETURN(req);
3594 }
3595 EXPORT_SYMBOL(ptlrpcd_alloc_work);
3596
3597 void ptlrpcd_destroy_work(void *handler)
3598 {
3599         struct ptlrpc_request *req = handler;
3600
3601         if (req)
3602                 ptlrpc_req_finished(req);
3603 }
3604 EXPORT_SYMBOL(ptlrpcd_destroy_work);
3605
3606 int ptlrpcd_queue_work(void *handler)
3607 {
3608         struct ptlrpc_request *req = handler;
3609
3610         /*
3611          * Check if the req is already being queued.
3612          *
3613          * Here comes a trick: it lacks a way of checking if a req is being
3614          * processed reliably in ptlrpc. Here I have to use refcount of req
3615          * for this purpose. This is okay because the caller should use this
3616          * req as opaque data. - Jinshan
3617          */
3618         LASSERT(atomic_read(&req->rq_refcount) > 0);
3619         if (atomic_inc_return(&req->rq_refcount) == 2)
3620                 ptlrpcd_add_work_req(req);
3621         return 0;
3622 }
3623 EXPORT_SYMBOL(ptlrpcd_queue_work);