Whamcloud - gitweb
Land from b_hd_pid to HEAD
[fs/lustre-release.git] / lustre / ptlrpc / niobuf.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5  *
6  *   This file is part of Lustre, http://www.lustre.org.
7  *
8  *   Lustre is free software; you can redistribute it and/or
9  *   modify it under the terms of version 2 of the GNU General Public
10  *   License as published by the Free Software Foundation.
11  *
12  *   Lustre is distributed in the hope that it will be useful,
13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *   GNU General Public License for more details.
16  *
17  *   You should have received a copy of the GNU General Public License
18  *   along with Lustre; if not, write to the Free Software
19  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  *
21  */
22
23 #define DEBUG_SUBSYSTEM S_RPC
24 #ifndef __KERNEL__
25 #include <liblustre.h>
26 #endif
27 #include <linux/obd_support.h>
28 #include <linux/lustre_net.h>
29 #include <linux/lustre_lib.h>
30 #include <linux/obd.h>
31 #include "ptlrpc_internal.h"
32
33 static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, 
34                          ptl_ack_req_t ack, struct ptlrpc_cb_id *cbid,
35                          struct ptlrpc_connection *conn, int portal, __u64 xid)
36 {
37         int              rc;
38         int              rc2;
39         ptl_md_t         md;
40         char str[PTL_NALFMT_SIZE];
41         ENTRY;
42
43         LASSERT (portal != 0);
44         LASSERT (conn != NULL);
45         CDEBUG (D_INFO, "conn=%p ni %s id %s on %s\n",
46                 conn, conn->c_peer.peer_ni->pni_name,
47                 ptlrpc_id2str(&conn->c_peer, str),
48                 conn->c_peer.peer_ni->pni_name);
49         md.start     = base;
50         md.length    = len;
51         md.threshold = (ack == PTL_ACK_REQ) ? 2 : 1;
52         md.options   = PTLRPC_MD_OPTIONS;
53         md.user_ptr  = cbid;
54         md.eventq    = conn->c_peer.peer_ni->pni_eq_h;
55
56         if (ack == PTL_ACK_REQ &&
57             OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_ACK | OBD_FAIL_ONCE)) {
58                 /* don't ask for the ack to simulate failing client */
59                 ack = PTL_NOACK_REQ;
60                 obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
61         }
62
63         rc = PtlMDBind (conn->c_peer.peer_ni->pni_ni_h, md, 
64                         PTL_UNLINK, mdh);
65         if (rc != PTL_OK) {
66                 CERROR ("PtlMDBind failed: %d\n", rc);
67                 LASSERT (rc == PTL_NO_SPACE);
68                 RETURN (-ENOMEM);
69         }
70
71         CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n",
72                len, portal, xid);
73
74         rc = PtlPut (*mdh, ack, conn->c_peer.peer_id, portal, 0, xid, 0, 0);
75         if (rc != PTL_OK) {
76                 /* We're going to get an UNLINK event when I unlink below,
77                  * which will complete just like any other failed send, so
78                  * I fall through and return success here! */
79                 CERROR("PtlPut(%s, %d, "LPD64") failed: %d\n",
80                        ptlrpc_id2str(&conn->c_peer, str),
81                        portal, xid, rc);
82                 rc2 = PtlMDUnlink(*mdh);
83                 LASSERT (rc2 == PTL_OK);
84         }
85
86         RETURN (0);
87 }
88
/* Server side: kick off the bulk transfer described by 'desc' — a PUT of
 * read data to the client (BULK_PUT_SOURCE) or a GET of write data from
 * the client (BULK_GET_SINK).  Returns 0 on success AND when the
 * PtlPut/PtlGet itself fails, since in the latter case unlinking the MD
 * signals completion-with-failure via the event queue; only PtlMDBind
 * failure returns -ENOMEM. */
int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc)
{
        int                 rc;
        int                 rc2;
        struct ptlrpc_peer *peer;
        ptl_md_t            md;
        __u64               xid;
        char                str[PTL_NALFMT_SIZE];
        ENTRY;

        /* fault injection: pretend the transfer started but never touch
         * the network */
        if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_PTLRPC_BULK_PUT_NET)) 
                RETURN(0);

        /* NB no locking required until desc is on the network */
        LASSERT (!desc->bd_network_rw);
        LASSERT (desc->bd_type == BULK_PUT_SOURCE ||
                 desc->bd_type == BULK_GET_SINK);
        desc->bd_success = 0;
        peer = &desc->bd_export->exp_connection->c_peer;

        md.user_ptr = &desc->bd_cbid;
        md.eventq = peer->peer_ni->pni_eq_h;
        md.threshold = 2; /* SENT and ACK/REPLY */
        md.options = PTLRPC_MD_OPTIONS;
        /* fills md.start/length (and iov flags) from the descriptor pages */
        ptlrpc_fill_bulk_md(&md, desc);

        LASSERT (desc->bd_cbid.cbid_fn == server_bulk_callback);
        LASSERT (desc->bd_cbid.cbid_arg == desc);

        /* NB total length may be 0 for a read past EOF, so we send a 0
         * length bulk, since the client expects a bulk event. */

        rc = PtlMDBind(peer->peer_ni->pni_ni_h, md,
                       PTL_UNLINK, &desc->bd_md_h);
        if (rc != PTL_OK) {
                CERROR("PtlMDBind failed: %d\n", rc);
                LASSERT (rc == PTL_NO_SPACE);
                RETURN(-ENOMEM);
        }

        /* Client's bulk and reply matchbits are the same */
        xid = desc->bd_req->rq_xid;
        CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d on %s "
               "nid %s pid %d xid "LPX64"\n", desc->bd_iov_count,
               desc->bd_nob, desc->bd_portal, peer->peer_ni->pni_name,
               ptlrpc_id2str(peer, str), peer->peer_id.pid, xid);

        /* Network is about to get at the memory: must be set BEFORE the
         * PtlPut/PtlGet below, since the completion callback clears it */
        desc->bd_network_rw = 1;

        if (desc->bd_type == BULK_PUT_SOURCE)
                rc = PtlPut (desc->bd_md_h, PTL_ACK_REQ, peer->peer_id,
                             desc->bd_portal, 0, xid, 0, 0);
        else
                rc = PtlGet (desc->bd_md_h, peer->peer_id,
                             desc->bd_portal, 0, xid, 0);
        
        if (rc != PTL_OK) {
                /* Can't send, so we unlink the MD bound above.  The UNLINK
                 * event this creates will signal completion with failure,
                 * so we return SUCCESS here! */
                CERROR("Transfer(%s, %d, "LPX64") failed: %d\n",
                       ptlrpc_id2str(peer, str),
                       desc->bd_portal, xid, rc);
                rc2 = PtlMDUnlink(desc->bd_md_h);
                LASSERT (rc2 == PTL_OK);
        }

        RETURN(0);
}
159
160 void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc)
161 {
162         /* Server side bulk abort. Idempotent. Not thread-safe (i.e. only
163          * serialises with completion callback) */
164         struct l_wait_info lwi;
165         int                rc;
166
167         LASSERT (!in_interrupt ());             /* might sleep */
168
169         if (!ptlrpc_bulk_active(desc))          /* completed or */
170                 return;                         /* never started */
171         
172         /* The unlink ensures the callback happens ASAP and is the last
173          * one.  If it fails, it must be because completion just happened,
174          * but we must still l_wait_event() in this case, to give liblustre
175          * a chance to run server_bulk_callback()*/
176
177         PtlMDUnlink (desc->bd_md_h);
178
179         for (;;) {
180                 /* Network access will complete in finite time but the HUGE
181                  * timeout lets us CWARN for visibility of sluggish NALs */
182                 lwi = LWI_TIMEOUT (300 * HZ, NULL, NULL);
183                 rc = l_wait_event(desc->bd_waitq, 
184                                   !ptlrpc_bulk_active(desc), &lwi);
185                 if (rc == 0)
186                         return;
187
188                 LASSERT(rc == -ETIMEDOUT);
189                 CWARN("Unexpectedly long timeout: desc %p\n", desc);
190         }
191 }
192
193 int ptlrpc_register_bulk (struct ptlrpc_request *req)
194 {
195         struct ptlrpc_bulk_desc *desc = req->rq_bulk;
196         struct ptlrpc_peer *peer;
197         int rc;
198         int rc2;
199         ptl_handle_me_t  me_h;
200         ptl_md_t         md;
201         ENTRY;
202
203         if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_PTLRPC_BULK_GET_NET)) 
204                 RETURN(0);
205
206         /* NB no locking required until desc is on the network */
207         LASSERT (desc->bd_nob > 0);
208         LASSERT (!desc->bd_network_rw);
209         LASSERT (desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
210         LASSERT (desc->bd_req != NULL);
211         LASSERT (desc->bd_type == BULK_PUT_SINK ||
212                  desc->bd_type == BULK_GET_SOURCE);
213
214         desc->bd_success = 0;
215
216         peer = &desc->bd_import->imp_connection->c_peer;
217
218         md.user_ptr = &desc->bd_cbid;
219         md.eventq = peer->peer_ni->pni_eq_h;
220         md.threshold = 1;                       /* PUT or GET */
221         md.options = PTLRPC_MD_OPTIONS | 
222                      ((desc->bd_type == BULK_GET_SOURCE) ? 
223                       PTL_MD_OP_GET : PTL_MD_OP_PUT);
224         ptlrpc_fill_bulk_md(&md, desc);
225
226         LASSERT (desc->bd_cbid.cbid_fn == client_bulk_callback);
227         LASSERT (desc->bd_cbid.cbid_arg == desc);
228
229         /* XXX Registering the same xid on retried bulk makes my head
230          * explode trying to understand how the original request's bulk
231          * might interfere with the retried request -eeb */
232         LASSERT (!desc->bd_registered || req->rq_xid != desc->bd_last_xid);
233         desc->bd_registered = 1;
234         desc->bd_last_xid = req->rq_xid;
235         
236         rc = PtlMEAttach(peer->peer_ni->pni_ni_h,
237                          desc->bd_portal, desc->bd_import->imp_connection->c_peer.peer_id, 
238                          req->rq_xid, 0, PTL_UNLINK, PTL_INS_AFTER, &me_h);
239         if (rc != PTL_OK) {
240                 CERROR("PtlMEAttach failed: %d\n", rc);
241                 LASSERT (rc == PTL_NO_SPACE);
242                 RETURN (-ENOMEM);
243         }
244
245         /* About to let the network at it... */
246         desc->bd_network_rw = 1;
247         rc = PtlMDAttach(me_h, md, PTL_UNLINK, &desc->bd_md_h);
248         if (rc != PTL_OK) {
249                 CERROR("PtlMDAttach failed: %d\n", rc);
250                 LASSERT (rc == PTL_NO_SPACE);
251                 desc->bd_network_rw = 0;
252                 rc2 = PtlMEUnlink (me_h);
253                 LASSERT (rc2 == PTL_OK);
254                 RETURN (-ENOMEM);
255         }
256
257         CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPX64", "
258                "portal %u on %s\n",
259                desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
260                desc->bd_iov_count, desc->bd_nob,
261                req->rq_xid, desc->bd_portal, peer->peer_ni->pni_name);
262         RETURN(0);
263 }
264
265 void ptlrpc_unregister_bulk (struct ptlrpc_request *req)
266 {
267         /* Disconnect a bulk desc from the network. Idempotent. Not
268          * thread-safe (i.e. only interlocks with completion callback). */
269         struct ptlrpc_bulk_desc *desc = req->rq_bulk;
270         wait_queue_head_t       *wq;
271         struct l_wait_info       lwi;
272         int                      rc;
273
274         LASSERT (!in_interrupt ());             /* might sleep */
275
276         if (!ptlrpc_bulk_active(desc))          /* completed or */
277                 return;                         /* never registered */
278         
279         LASSERT (desc->bd_req == req);          /* bd_req NULL until registered */
280
281         /* the unlink ensures the callback happens ASAP and is the last
282          * one.  If it fails, it must be because completion just happened,
283          * but we must still l_wait_event() in this case to give liblustre
284          * a chance to run client_bulk_callback() */
285
286         PtlMDUnlink (desc->bd_md_h);
287         
288         if (desc->bd_req->rq_set != NULL)
289                 wq = &req->rq_set->set_waitq;
290         else
291                 wq = &req->rq_reply_waitq;
292
293         for (;;) {
294                 /* Network access will complete in finite time but the HUGE
295                  * timeout lets us CWARN for visibility of sluggish NALs */
296                 lwi = LWI_TIMEOUT (300 * HZ, NULL, NULL);
297                 rc = l_wait_event(*wq, !ptlrpc_bulk_active(desc), &lwi);
298                 if (rc == 0)
299                         return;
300                 
301                 LASSERT (rc == -ETIMEDOUT);
302                 CWARN("Unexpectedly long timeout: desc %p\n", desc);
303         }
304 }
305
306 int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
307 {
308         struct ptlrpc_service     *svc = req->rq_rqbd->rqbd_srv_ni->sni_service;
309         struct ptlrpc_reply_state *rs = req->rq_reply_state;
310         struct ptlrpc_connection  *conn;
311         int                        rc;
312
313         /* We must already have a reply buffer (only ptlrpc_error() may be
314          * called without one).  We must also have a request buffer which
315          * is either the actual (swabbed) incoming request, or a saved copy
316          * if this is a req saved in target_queue_final_reply(). */
317         LASSERT (req->rq_reqmsg != NULL);
318         LASSERT (rs != NULL);
319         LASSERT (req->rq_repmsg != NULL);
320         LASSERT (may_be_difficult || !rs->rs_difficult);
321         LASSERT (req->rq_repmsg == &rs->rs_msg);
322         LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
323         LASSERT (rs->rs_cb_id.cbid_arg == rs);
324
325         LASSERT (req->rq_repmsg != NULL);
326         if (req->rq_type != PTL_RPC_MSG_ERR)
327                 req->rq_type = PTL_RPC_MSG_REPLY;
328
329         req->rq_repmsg->type   = req->rq_type;
330         req->rq_repmsg->status = req->rq_status;
331         req->rq_repmsg->opc    = req->rq_reqmsg->opc;
332
333         if (req->rq_export == NULL) 
334                 conn = ptlrpc_get_connection(&req->rq_peer, NULL);
335         else
336                 conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
337
338         atomic_inc (&svc->srv_outstanding_replies);
339
340         rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen,
341                            rs->rs_difficult ? PTL_ACK_REQ : PTL_NOACK_REQ,
342                            &rs->rs_cb_id, conn,
343                            svc->srv_rep_portal, req->rq_xid);
344         if (rc != 0) {
345                 atomic_dec (&svc->srv_outstanding_replies);
346
347                 if (!rs->rs_difficult) {
348                         /* Callers other than target_send_reply() expect me
349                          * to clean up on a comms error */
350                         lustre_free_reply_state (rs);
351                         req->rq_reply_state = NULL;
352                         req->rq_repmsg = NULL;
353                 }
354         }
355         ptlrpc_put_connection(conn);
356         return rc;
357 }
358
/* Convenience wrapper: send a normal (non-difficult) reply. */
int ptlrpc_reply (struct ptlrpc_request *req)
{
        return ptlrpc_send_reply(req, 0);
}
363
364 int ptlrpc_error(struct ptlrpc_request *req)
365 {
366         int rc;
367         ENTRY;
368
369         if (!req->rq_repmsg) {
370                 rc = lustre_pack_reply(req, 0, NULL, NULL);
371                 if (rc)
372                         RETURN(rc);
373         }
374
375         req->rq_type = PTL_RPC_MSG_ERR;
376
377         rc = ptlrpc_send_reply (req, 0);
378         RETURN(rc);
379 }
380
381 int ptl_send_rpc(struct ptlrpc_request *request)
382 {
383         int rc;
384         int rc2;
385         struct ptlrpc_connection *connection;
386         unsigned long flags;
387         ptl_handle_me_t  reply_me_h;
388         ptl_md_t         reply_md;
389         ENTRY;
390
391         LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST);
392
393         /* If this is a re-transmit, we're required to have disengaged
394          * cleanly from the previous attempt */
395         LASSERT (!request->rq_receiving_reply);
396
397         connection = request->rq_import->imp_connection;
398
399         if (request->rq_bulk != NULL) {
400                 rc = ptlrpc_register_bulk (request);
401                 if (rc != 0)
402                         RETURN(rc);
403         }
404
405         request->rq_reqmsg->handle = request->rq_import->imp_remote_handle;
406         request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST;
407         request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt;
408                 
409         LASSERT (request->rq_replen != 0);
410         if (request->rq_repmsg == NULL)
411                 OBD_ALLOC(request->rq_repmsg, request->rq_replen);
412         if (request->rq_repmsg == NULL)
413                 GOTO(cleanup_bulk, rc = -ENOMEM);
414
415         rc = PtlMEAttach(connection->c_peer.peer_ni->pni_ni_h,
416                          request->rq_reply_portal, /* XXX FIXME bug 249 */
417                          connection->c_peer.peer_id, request->rq_xid, 0, PTL_UNLINK,
418                          PTL_INS_AFTER, &reply_me_h);
419         if (rc != PTL_OK) {
420                 CERROR("PtlMEAttach failed: %d\n", rc);
421                 LASSERT (rc == PTL_NO_SPACE);
422                 GOTO(cleanup_repmsg, rc = -ENOMEM);
423         }
424
425         spin_lock_irqsave (&request->rq_lock, flags);
426         /* If the MD attach succeeds, there _will_ be a reply_in callback */
427         request->rq_receiving_reply = 1;
428         /* Clear any flags that may be present from previous sends. */
429         request->rq_replied = 0;
430         request->rq_err = 0;
431         request->rq_timedout = 0;
432         request->rq_net_err = 0;
433         request->rq_resend = 0;
434         request->rq_restart = 0;
435         spin_unlock_irqrestore (&request->rq_lock, flags);
436
437         reply_md.start     = request->rq_repmsg;
438         reply_md.length    = request->rq_replen;
439         reply_md.threshold = 1;
440         reply_md.options   = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT;
441         reply_md.user_ptr  = &request->rq_reply_cbid;
442         reply_md.eventq    = connection->c_peer.peer_ni->pni_eq_h;
443
444         rc = PtlMDAttach(reply_me_h, reply_md, PTL_UNLINK, 
445                          &request->rq_reply_md_h);
446         if (rc != PTL_OK) {
447                 CERROR("PtlMDAttach failed: %d\n", rc);
448                 LASSERT (rc == PTL_NO_SPACE);
449                 GOTO(cleanup_me, rc -ENOMEM);
450         }
451
452         CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
453                ", portal %u on %s\n",
454                request->rq_replen, request->rq_xid,
455                request->rq_reply_portal,
456                connection->c_peer.peer_ni->pni_name);
457
458         ptlrpc_request_addref(request);        /* +1 ref for the SENT callback */
459
460         request->rq_sent = LTIME_S(CURRENT_TIME);
461         ptlrpc_pinger_sending_on_import(request->rq_import);
462         rc = ptl_send_buf(&request->rq_req_md_h, 
463                           request->rq_reqmsg, request->rq_reqlen,
464                           PTL_NOACK_REQ, &request->rq_req_cbid, 
465                           connection,
466                           request->rq_request_portal,
467                           request->rq_xid);
468         if (rc == 0) {
469                 ptlrpc_lprocfs_rpc_sent(request);
470                 RETURN(rc);
471         }
472
473         ptlrpc_req_finished (request);          /* drop callback ref */
474
475  cleanup_me:
476         /* MEUnlink is safe; the PUT didn't even get off the ground, and
477          * nobody apart from the PUT's target has the right nid+XID to
478          * access the reply buffer. */
479         rc2 = PtlMEUnlink(reply_me_h);
480         LASSERT (rc2 == PTL_OK);
481         /* UNLINKED callback called synchronously */
482         LASSERT (!request->rq_receiving_reply);
483
484  cleanup_repmsg:
485         OBD_FREE(request->rq_repmsg, request->rq_replen);
486         request->rq_repmsg = NULL;
487
488  cleanup_bulk:
489         if (request->rq_bulk != NULL)
490                 ptlrpc_unregister_bulk(request);
491
492         return rc;
493 }
494
495 int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd)
496 {
497         struct ptlrpc_srv_ni    *srv_ni = rqbd->rqbd_srv_ni;
498         struct ptlrpc_service   *service = srv_ni->sni_service;
499         static ptl_process_id_t  match_id = {PTL_NID_ANY, PTL_PID_ANY};
500         int                      rc;
501         ptl_md_t                 md;
502         ptl_handle_me_t          me_h;
503
504         CDEBUG(D_NET, "PtlMEAttach: portal %d on %s\n",
505                service->srv_req_portal, srv_ni->sni_ni->pni_name);
506
507         if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_PTLRPC_RQBD))
508                 return (-ENOMEM);
509
510         rc = PtlMEAttach(srv_ni->sni_ni->pni_ni_h, service->srv_req_portal,
511                          match_id, 0, ~0, PTL_UNLINK, PTL_INS_AFTER, &me_h);
512         if (rc != PTL_OK) {
513                 CERROR("PtlMEAttach failed: %d\n", rc);
514                 return (-ENOMEM);
515         }
516
517         LASSERT(rqbd->rqbd_refcount == 0);
518         rqbd->rqbd_refcount = 1;
519
520         md.start     = rqbd->rqbd_buffer;
521         md.length    = service->srv_buf_size;
522         md.max_size  = service->srv_max_req_size;
523         md.threshold = PTL_MD_THRESH_INF;
524         md.options   = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT | PTL_MD_MAX_SIZE;
525         md.user_ptr  = &rqbd->rqbd_cbid;
526         md.eventq    = srv_ni->sni_ni->pni_eq_h;
527         
528         rc = PtlMDAttach(me_h, md, PTL_UNLINK, &rqbd->rqbd_md_h);
529         if (rc == PTL_OK)
530                 return (0);
531
532         CERROR("PtlMDAttach failed: %d; \n", rc);
533         LASSERT (rc == PTL_NO_SPACE);
534         rc = PtlMEUnlink (me_h);
535         LASSERT (rc == PTL_OK);
536         rqbd->rqbd_refcount = 0;
537         
538         return (-ENOMEM);
539 }