1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
6 * This file is part of Lustre, http://www.lustre.org.
8 * Lustre is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Lustre is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Lustre; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 #include <linux/version.h>
28 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
29 #include <linux/tqueue.h>
31 #include <linux/workqueue.h>
35 #include <linux/kp30.h>
36 // #include <linux/obd.h>
37 #include <portals/p30.h>
38 #include <linux/lustre_idl.h>
39 #include <linux/lustre_ha.h>
40 #include <linux/lustre_import.h>
41 #include <linux/lprocfs_status.h>
/* The following constants determine how much memory is devoted to
 * buffering in the lustre services.
 *
 * ?_NEVENTS            # event queue entries
 *
 * ?_NBUFS              # request buffers
 * ?_BUFSIZE            # bytes in a single request buffer
 * total memory = ?_NBUFS * ?_BUFSIZE
 *
 * ?_MAXREQSIZE         # maximum request size the service will receive;
 *                        larger messages will get dropped.
 * Request buffers are auto-unlinked when less than ?_MAXREQSIZE
 * is left in them.
 */
/* LDLM service sizing.
 *
 * The event-queue and request-buffer counts are derived from
 * num_physpages and clamped from above by the matching *_MAX constant;
 * the thread count, buffer size and maximum request size are fixed. */
#define LDLM_NUM_THREADS        4
#define LDLM_BUFSIZE            (8 * 1024)
#define LDLM_MAXREQSIZE         1024

#define LDLM_NEVENT_MAX         8192UL
#define LDLM_NBUF_MAX           256UL

/* one event per 64 physical pages, at most LDLM_NEVENT_MAX */
#define LDLM_NEVENTS            min(num_physpages / 64, LDLM_NEVENT_MAX)
/* one request buffer per 16 events, at most LDLM_NBUF_MAX */
#define LDLM_NBUFS              min(LDLM_NEVENTS / 16, LDLM_NBUF_MAX)
/* MDS/MDT service sizing.
 *
 * The thread, event and request-buffer counts are derived from
 * num_physpages and clamped from above by the matching *_MAX constant. */
#define MDT_MAX_THREADS         32UL
/* num_physpages / 8192 is 0 on a host with fewer than 8192 physical pages.
 * Never evaluate to zero threads there: floor the result at one.
 * For num_physpages >= 8192 the value is unchanged. */
#define MDT_NUM_THREADS                                                 \
        (num_physpages < 8192 ? 1UL :                                   \
         min(num_physpages / 8192, MDT_MAX_THREADS))
#define MDS_NEVENT_MAX          8192UL
#define MDS_NEVENTS             min(num_physpages / 64, MDS_NEVENT_MAX)
#define MDS_NBUF_MAX            512UL
#define MDS_NBUFS               min(MDS_NEVENTS / 16, MDS_NBUF_MAX)
#define MDS_BUFSIZE             (8 * 1024)
/* Assume file name length = FNAME_MAX = 256 (true for extN).
 *        path name length = PATH_MAX = 4096
 *        LOV MD size max  = EA_MAX = 4000
 * symlink:  FNAME_MAX + PATH_MAX  <- largest
 * link:     FNAME_MAX + PATH_MAX  (mds_rec_link < mds_rec_create)
 * rename:   FNAME_MAX + FNAME_MAX
 * open:     FNAME_MAX + EA_MAX
 *
 * MDS_MAXREQSIZE ~= 4736 bytes =
 * lustre_msg + ldlm_request + mds_body + mds_rec_create + FNAME_MAX + PATH_MAX
 *
 * Realistic size is about 512 bytes (20 character name + 128 char symlink),
 * except in the open case where there are a large number of OSTs in a LOV.
 */
#define MDS_MAXREQSIZE          (5 * 1024)
/* OST service sizing.
 *
 * The thread, event and request-buffer counts are derived from
 * num_physpages and clamped from above by the matching *_MAX constant. */
#define OST_MAX_THREADS         36UL
/* num_physpages / 8192 is 0 on a host with fewer than 8192 physical pages.
 * Never evaluate to zero threads there: floor the result at one.
 * For num_physpages >= 8192 the value is unchanged. */
#define OST_NUM_THREADS                                                 \
        (num_physpages < 8192 ? 1UL :                                   \
         min(num_physpages / 8192, OST_MAX_THREADS))
#define OST_NEVENT_MAX          32768UL
#define OST_NEVENTS             min(num_physpages / 16, OST_NEVENT_MAX)
#define OST_NBUF_MAX            1280UL
#define OST_NBUFS               min(OST_NEVENTS / 64, OST_NBUF_MAX)
#define OST_BUFSIZE             (8 * 1024)
/* OST_MAXREQSIZE ~= 1640 bytes =
 * lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote
 *
 * - single object with 16 pages is 512 bytes
 * - OST_MAXREQSIZE must be at least 1 page of cookies plus some spillover
 */
#define OST_MAXREQSIZE          (5 * 1024)
/* PTLBD service sizing: all values are fixed, none depend on
 * num_physpages.  Total buffering = PTLBD_NBUFS * PTLBD_BUFSIZE. */
#define PTLBD_NUM_THREADS       4
#define PTLBD_NEVENTS           1024
#define PTLBD_NBUFS             20
#define PTLBD_BUFSIZE           (32 * 1024)
#define PTLBD_MAXREQSIZE        1024
110 #define CONN_INVALID 1
114 struct ptlrpc_ni *peer_ni;
117 struct ptlrpc_connection {
118 struct list_head c_link;
119 struct ptlrpc_peer c_peer;
120 struct obd_uuid c_local_uuid; /* XXX do we need this? */
121 struct obd_uuid c_remote_uuid;
123 __u32 c_generation; /* changes upon new connection */
124 __u32 c_epoch; /* changes when peer changes */
125 __u32 c_bootcount; /* peer's boot count */
132 __u64 c_remote_token;
134 __u32 c_flags; // can we indicate INVALID elsewhere?
137 struct ptlrpc_client {
138 __u32 cli_request_portal;
139 __u32 cli_reply_portal;
141 __u32 cli_target_devno;
/* state flags of requests */
/* XXX only ones left are those used by the bulk descs as well! */
#define PTL_RPC_FL_INTR         0x01    /* bit 0: reply wait was interrupted by user */
#define PTL_RPC_FL_TIMEOUT      0x80    /* bit 7: request timed out waiting for reply */

/* Number of entries in the rq_ack_locks[] array of struct ptlrpc_request. */
#define REQ_MAX_ACK_LOCKS       4
/* Swab bookkeeping assertions.
 *
 * rq_req_swab_mask / rq_rep_swab_mask hold one bit per message buffer
 * index.  With SWAB_PARANOIA non-zero the macros below assert on, and
 * update, those bits; with SWAB_PARANOIA zero they expand to nothing.
 *
 * The bit is built with 1U rather than 1: the bounds assertion admits
 * idx == 31 for a 32-bit mask, and left-shifting a signed 1 by 31 is
 * undefined behaviour. */
#define SWAB_PARANOIA 1

#if SWAB_PARANOIA
/* unpacking: assert idx not unpacked already, then mark it unpacked */
#define LASSERT_REQSWAB(rq, idx)                                        \
do {                                                                    \
        LASSERT((idx) < sizeof((rq)->rq_req_swab_mask) * 8);            \
        LASSERT(((rq)->rq_req_swab_mask & (1U << (idx))) == 0);         \
        (rq)->rq_req_swab_mask |= (1U << (idx));                        \
} while (0)

#define LASSERT_REPSWAB(rq, idx)                                        \
do {                                                                    \
        LASSERT((idx) < sizeof((rq)->rq_rep_swab_mask) * 8);            \
        LASSERT(((rq)->rq_rep_swab_mask & (1U << (idx))) == 0);         \
        (rq)->rq_rep_swab_mask |= (1U << (idx));                        \
} while (0)

/* just looking: assert idx already unpacked */
#define LASSERT_REQSWABBED(rq, idx)                                     \
        LASSERT((idx) < sizeof((rq)->rq_req_swab_mask) * 8 &&           \
                ((rq)->rq_req_swab_mask & (1U << (idx))) != 0)

#define LASSERT_REPSWABBED(rq, idx)                                     \
        LASSERT((idx) < sizeof((rq)->rq_rep_swab_mask) * 8 &&           \
                ((rq)->rq_rep_swab_mask & (1U << (idx))) != 0)
#else
#define LASSERT_REQSWAB(rq, idx)
#define LASSERT_REPSWAB(rq, idx)
#define LASSERT_REQSWABBED(rq, idx)
#define LASSERT_REPSWABBED(rq, idx)
#endif
186 union ptlrpc_async_args {
187 /* Scratchpad for passing args to completion interpreter. Users
188 * cast to the struct of their choosing, and LASSERT that this is
189 * big enough. For _tons_ of context, OBD_ALLOC a struct and store
190 * a pointer to it here. The pointer_arg ensures this struct is at
191 * least big enough for that. */
192 void *pointer_arg[5];
/* Forward declaration: the callback type below only needs a pointer. */
struct ptlrpc_request_set;

/* Callback type stored in the set_interpret member of
 * struct ptlrpc_request_set.  It receives the set, an untyped pointer
 * and an int, and returns an int.
 * NOTE(review): the pointer is presumably the set's set_args and the int
 * a completion status -- confirm against the caller. */
typedef int (*set_interpreter_func)(struct ptlrpc_request_set *set,
                                    void *arg, int rc);
199 struct ptlrpc_request_set {
200 int set_remaining; /* # uncompleted requests */
201 wait_queue_head_t set_waitq;
202 wait_queue_head_t *set_wakeup_ptr;
203 struct list_head set_requests;
204 set_interpreter_func set_interpret; /* completion callback */
205 union ptlrpc_async_args set_args; /* completion context */
208 struct ptlrpc_bulk_desc;
210 struct ptlrpc_request {
211 int rq_type; /* one of PTL_RPC_MSG_* */
212 struct list_head rq_list;
213 struct obd_device *rq_obd;
216 unsigned int rq_intr:1, rq_replied:1, rq_want_ack:1, rq_err:1,
217 rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1,
218 rq_no_resend:1, rq_resent:1, rq_no_recov:1, rq_waiting:1,
219 rq_receiving_reply:1;
222 atomic_t rq_refcount;
224 int rq_request_portal; /* XXX FIXME bug 249 */
225 int rq_reply_portal; /* XXX FIXME bug 249 */
228 struct lustre_msg *rq_reqmsg;
232 struct lustre_msg *rq_repmsg;
237 __u32 rq_req_swab_mask;
238 __u32 rq_rep_swab_mask;
241 int rq_import_generation;
243 wait_queue_head_t rq_wait_for_rep; /* XXX also _for_ack */
246 ptl_md_t rq_reply_md;
247 ptl_handle_md_t rq_reply_md_h;
249 /* outgoing req/rep */
252 struct ptlrpc_peer rq_peer; /* XXX see service.c can this be factored away? */
253 struct obd_export *rq_export;
254 struct ptlrpc_connection *rq_connection;
255 struct obd_import *rq_import;
256 struct ptlrpc_service *rq_svc;
258 void (*rq_replay_cb)(struct ptlrpc_request *);
259 void *rq_replay_data;
261 struct ptlrpc_bulk_desc *rq_bulk; /* client side bulk */
262 time_t rq_sent; /* when the request was sent */
265 struct list_head rq_set_chain;
266 struct ptlrpc_request_set *rq_set;
267 void *rq_interpret_reply; /* Async completion handler */
268 union ptlrpc_async_args rq_async_args; /* Async completion context */
270 /* Only used on the server side for tracking acks. */
271 struct ptlrpc_req_ack_lock {
272 struct lustre_handle lock;
274 } rq_ack_locks[REQ_MAX_ACK_LOCKS];
/* Values compared against rq_phase (see DEBUG_REQ_FLAGS, which prints
 * them as "New", "RPC", "Bulk", "Interpret" and "Complete"). */
#define RQ_PHASE_NEW            0xEBC0DE00
#define RQ_PHASE_RPC            0xEBC0DE01
#define RQ_PHASE_BULK           0xEBC0DE02
#define RQ_PHASE_INTERPRET      0xEBC0DE03
#define RQ_PHASE_COMPLETE       0xEBC0DE04
/* Spare the preprocessor, spoil the bugs. */

/* Expands to str when field is non-zero and to "" otherwise.  Both
 * arguments are parenthesized so that an expression argument such as
 * `a || b` is tested as a whole. */
#define FLAG(field, str) ((field) ? (str) : "")

/* Expands to thirteen comma-separated string expressions for use as the
 * arguments of REQ_FLAGS_FMT: the name of (req)->rq_phase followed by
 * one FLAG() per request flag bit, in the order I R A E X S T P N s n W.
 * req is parenthesized so any pointer-valued expression may be passed. */
#define DEBUG_REQ_FLAGS(req)                                                   \
        (((req)->rq_phase == RQ_PHASE_NEW) ? "New" :                           \
         ((req)->rq_phase == RQ_PHASE_RPC) ? "RPC" :                           \
         ((req)->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" :               \
         ((req)->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" :                 \
         ((req)->rq_phase == RQ_PHASE_BULK) ? "Bulk" : "?phase?"),             \
        FLAG((req)->rq_intr, "I"), FLAG((req)->rq_replied, "R"),               \
        FLAG((req)->rq_want_ack, "A"), FLAG((req)->rq_err, "E"),               \
        FLAG((req)->rq_timedout, "X") /* eXpired */,                           \
        FLAG((req)->rq_resend, "S"),                                           \
        FLAG((req)->rq_restart, "T"), FLAG((req)->rq_replay, "P"),             \
        FLAG((req)->rq_no_resend, "N"), FLAG((req)->rq_resent, "s"),           \
        FLAG((req)->rq_no_recov, "n"), FLAG((req)->rq_waiting, "W")

/* printf format consuming the thirteen strings of DEBUG_REQ_FLAGS:
 * the phase name, a colon, then the twelve flag letters. */
#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s"
301 #define DEBUG_REQ(level, req, fmt, args...) \
303 CDEBUG(level, "@@@ " fmt \
304 " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d ref %d fl " \
305 REQ_FLAGS_FMT"/%x/%x rc %x\n" , ## args, req, req->rq_xid, \
306 req->rq_reqmsg ? req->rq_reqmsg->transno : -1, \
307 req->rq_reqmsg ? req->rq_reqmsg->opc : -1, \
308 req->rq_import ? (char *)req->rq_import->imp_target_uuid.uuid : "<?>", \
309 req->rq_connection ? \
310 (char *)req->rq_connection->c_remote_uuid.uuid : "<?>", \
311 (req->rq_import && req->rq_import->imp_client) ? \
312 req->rq_import->imp_client->cli_request_portal : -1, \
313 req->rq_reqlen, req->rq_replen, \
314 atomic_read(&req->rq_refcount), \
315 DEBUG_REQ_FLAGS(req), \
316 req->rq_reqmsg ? req->rq_reqmsg->flags : 0, \
317 req->rq_repmsg ? req->rq_repmsg->flags : 0, \
321 struct ptlrpc_bulk_page {
322 struct ptlrpc_bulk_desc *bp_desc;
323 struct list_head bp_link;
325 int bp_pageoffset; /* offset within a page */
326 struct page *bp_page;
/* Bulk descriptor kinds: {put,get} x {source,sink}.  The four values
 * 0..3 fit the two-bit bd_type field of struct ptlrpc_bulk_desc. */
#define BULK_GET_SOURCE         0
#define BULK_PUT_SINK           1
#define BULK_GET_SINK           2
#define BULK_PUT_SOURCE         3
334 struct ptlrpc_bulk_desc {
335 unsigned int bd_complete:1;
336 unsigned int bd_network_rw:1; /* accessible to the network */
337 unsigned int bd_type:2; /* {put,get}{source,sink} */
338 unsigned int bd_registered:1; /* client side */
339 spinlock_t bd_lock; /* serialise with callback */
340 int bd_import_generation;
341 struct obd_export *bd_export;
342 struct obd_import *bd_import;
344 struct ptlrpc_request *bd_req; /* associated request */
345 wait_queue_head_t bd_waitq; /* server side only WQ */
346 struct list_head bd_page_list;
351 ptl_handle_md_t bd_md_h;
352 ptl_handle_me_t bd_me_h;
354 int bd_callback_count; /* server side callbacks */
357 ptl_kiov_t bd_iov[16]; /* self-sized pre-allocated iov */
359 struct iovec bd_iov[16]; /* self-sized pre-allocated iov */
363 struct ptlrpc_thread {
364 struct list_head t_link;
367 wait_queue_head_t t_ctl_waitq;
370 struct ptlrpc_request_buffer_desc {
371 struct list_head rqbd_list;
372 struct ptlrpc_srv_ni *rqbd_srv_ni;
373 ptl_handle_me_t rqbd_me_h;
374 atomic_t rqbd_refcount;
379 /* Generic interface state */
382 ptl_handle_ni_t pni_ni_h;
383 ptl_handle_eq_t pni_request_out_eq_h;
384 ptl_handle_eq_t pni_reply_in_eq_h;
385 ptl_handle_eq_t pni_reply_out_eq_h;
386 ptl_handle_eq_t pni_bulk_put_source_eq_h;
387 ptl_handle_eq_t pni_bulk_put_sink_eq_h;
388 ptl_handle_eq_t pni_bulk_get_source_eq_h;
389 ptl_handle_eq_t pni_bulk_get_sink_eq_h;
392 struct ptlrpc_srv_ni {
393 /* Interface-specific service state */
394 struct ptlrpc_service *sni_service; /* owning service */
395 struct ptlrpc_ni *sni_ni; /* network interface */
396 ptl_handle_eq_t sni_eq_h; /* event queue handle */
397 struct list_head sni_rqbds; /* all the request buffer descriptors */
398 __u32 sni_nrqbds; /* # request buffers */
399 atomic_t sni_nrqbds_receiving; /* # request buffers posted */
402 struct ptlrpc_service {
406 struct list_head srv_ni_list; /* list of interfaces */
407 __u32 srv_max_req_size; /* biggest request to receive */
408 __u32 srv_buf_size; /* # bytes in a request buffer */
410 __u32 srv_req_portal;
411 __u32 srv_rep_portal;
415 wait_queue_head_t srv_waitq; /* all threads sleep on this */
418 struct list_head srv_threads;
419 int (*srv_handler)(struct ptlrpc_request *req);
420 char *srv_name; /* only statically allocated strings here; we don't clean them */
421 struct proc_dir_entry *svc_procroot;
422 struct lprocfs_stats *svc_stats;
424 int srv_interface_rover;
425 struct ptlrpc_srv_ni srv_interfaces[0];
/* Request handler callback type: takes the request, returns an int.
 * Same signature as the srv_handler member of struct ptlrpc_service. */
struct ptlrpc_request;
typedef int (*svc_handler_t)(struct ptlrpc_request *req);
430 /* ptlrpc/events.c */
431 extern struct ptlrpc_ni ptlrpc_interfaces[];
432 extern int ptlrpc_ninterfaces;
433 extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct ptlrpc_peer *peer);
/* ptlrpc/connection.c */
void ptlrpc_init_connection(void);
void ptlrpc_cleanup_connection(void);
void ptlrpc_dump_connections(void);

struct ptlrpc_connection *
ptlrpc_get_connection(struct ptlrpc_peer *peer, struct obd_uuid *uuid);
struct ptlrpc_connection *
ptlrpc_connection_addref(struct ptlrpc_connection *conn);
int ptlrpc_put_connection(struct ptlrpc_connection *c);
void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,
                                 struct obd_uuid *uuid);
/* ptlrpc/niobuf.c */

/* operations on a bulk descriptor */
int  ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc);
int  ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc);
void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk);

/* operations on a request */
int  ptlrpc_register_bulk(struct ptlrpc_request *req);
void ptlrpc_unregister_bulk(struct ptlrpc_request *req);
452 static inline int ptlrpc_bulk_complete (struct ptlrpc_bulk_desc *desc)
457 spin_lock_irqsave (&desc->bd_lock, flags);
458 rc = desc->bd_complete;
459 spin_unlock_irqrestore (&desc->bd_lock, flags);
/* Remaining declarations of the ptlrpc/niobuf.c section.
 * NOTE(review): the defining file is inferred from position only. */
int  ptlrpc_reply(struct ptlrpc_request *req);
int  ptlrpc_error(struct ptlrpc_request *req);
int  ptl_send_rpc(struct ptlrpc_request *request);
void ptlrpc_resend_req(struct ptlrpc_request *request);
void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd);
469 /* ptlrpc/client.c */
470 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
471 struct ptlrpc_client *);
472 void ptlrpc_cleanup_client(struct obd_import *imp);
473 struct obd_uuid *ptlrpc_req_to_uuid(struct ptlrpc_request *req);
474 struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
476 int ptlrpc_queue_wait(struct ptlrpc_request *req);
477 int ptlrpc_replay_req(struct ptlrpc_request *req);
478 void ptlrpc_unregister_reply(struct ptlrpc_request *req);
479 void ptlrpc_restart_req(struct ptlrpc_request *req);
480 void ptlrpc_abort_inflight(struct obd_import *imp);
482 struct ptlrpc_request_set *ptlrpc_prep_set(void);
483 int ptlrpc_set_wait(struct ptlrpc_request_set *);
484 void ptlrpc_set_destroy(struct ptlrpc_request_set *);
485 void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
487 struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
488 int count, int *lengths, char **bufs);
489 void ptlrpc_free_req(struct ptlrpc_request *request);
490 void ptlrpc_req_finished(struct ptlrpc_request *request);
491 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
492 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
493 int type, int portal);
494 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req,
495 int type, int portal);
496 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk);
497 int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
498 struct page *page, int pageoffset, int len);
499 void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *page);
500 void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
501 struct obd_import *imp);
502 __u64 ptlrpc_next_xid(void);
/* ptlrpc/ptlrpc_module.c */
int  ptlrpc_ldlm_hooks_referenced(void);
void ptlrpc_put_ldlm_hooks(void);
508 /* ptlrpc/service.c */
509 struct ptlrpc_service *
510 ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size,
511 int req_portal, int rep_portal, svc_handler_t, char *name,
512 struct obd_device *dev);
513 void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
514 int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
516 int ptlrpc_unregister_service(struct ptlrpc_service *service);
518 struct ptlrpc_svc_data {
520 struct ptlrpc_service *svc;
521 struct ptlrpc_thread *thread;
522 struct obd_device *dev;
/* ptlrpc/pack_generic.c */

/* building a message from count buffers */
int lustre_msg_size(int count, int *lengths);
int lustre_pack_msg(int count, int *lens, char **bufs, int *len,
                    struct lustre_msg **msg);

/* reading a received message; n selects a buffer within it */
int   lustre_unpack_msg(struct lustre_msg *m, int len);
void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
char *lustre_msg_string(struct lustre_msg *m, int n, int max_len);
532 void *lustre_swab_reqbuf (struct ptlrpc_request *req, int n, int minlen,
534 void *lustre_swab_repbuf (struct ptlrpc_request *req, int n, int minlen,
/* ldlm/ldlm_lib.c */
int client_import_connect(struct lustre_handle *conn,
                          struct obd_device *obd,
                          struct obd_uuid *cluuid);
int client_import_disconnect(struct lustre_handle *conn,
                             int failover);
/* ptlrpc/pinger.c -- add an import to, or remove it from, the pinger */
int ptlrpc_pinger_add_import(struct obd_import *imp);
int ptlrpc_pinger_del_import(struct obd_import *imp);