1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
6 * This file is part of Lustre, http://www.lustre.org.
8 * Lustre is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Lustre is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Lustre; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 #include <linux/version.h>
28 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
29 #include <linux/tqueue.h>
31 #include <linux/workqueue.h>
35 #include <linux/kp30.h>
36 // #include <linux/obd.h>
37 #include <portals/p30.h>
38 #include <portals/lib-types.h> /* FIXME (for PTL_MD_MAX_IOV) */
39 #include <linux/lustre_idl.h>
40 #include <linux/lustre_ha.h>
41 #include <linux/lustre_import.h>
42 #include <linux/lprocfs_status.h>
44 /* The following constants determine how much memory is devoted to
45 * buffering in the lustre services.
47 * ?_NEVENTS # event queue entries
49 * ?_NBUFS # request buffers
50 * ?_BUFSIZE # bytes in a single request buffer
51 * total memory = ?_NBUFS * ?_BUFSIZE
53 * ?_MAXREQSIZE # maximum request service will receive
54 * larger messages will get dropped.
55 * request buffers are auto-unlinked when less than ?_MAXREQSIZE
/* LDLM service sizing.  The thread count is 8 * smp_num_cpus^2, capped
 * at 64. */
59 #define LDLM_NUM_THREADS min(smp_num_cpus * smp_num_cpus * 8, 64)
60 #define LDLM_NEVENT_MAX 8192UL /* presumably the cap used by LDLM_NEVENTS; its continuation line is not visible here */
61 #define LDLM_NEVENTS min_t(unsigned long, num_physpages / 64, \
63 #define LDLM_NBUF_MAX 256UL /* upper bound on LDLM_NBUFS */
64 #define LDLM_NBUFS min(LDLM_NEVENTS / 16, LDLM_NBUF_MAX) /* one buffer per 16 events, capped */
65 #define LDLM_BUFSIZE (8 * 1024) /* bytes in a single request buffer */
66 #define LDLM_MAXREQSIZE (5 * 1024) /* maximum request size, in bytes */
/* MDS service sizing.  MDT_NUM_THREADS is num_physpages / 8192, clamped to
 * the range [2, MDT_MAX_THREADS]. */
68 #define MDT_MAX_THREADS 32UL
69 #define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
70 MDT_MAX_THREADS), 2UL)
71 #define MDS_NEVENT_MAX 8192UL /* presumably the cap used by MDS_NEVENTS; its continuation line is not visible here */
72 #define MDS_NEVENTS min_t(unsigned long, num_physpages / 64, \
74 #define MDS_NBUF_MAX 512UL /* upper bound on MDS_NBUFS */
75 #define MDS_NBUFS min(MDS_NEVENTS / 16, MDS_NBUF_MAX) /* one buffer per 16 events, capped */
76 #define MDS_BUFSIZE (8 * 1024) /* bytes in a single request buffer */
77 /* Assume file name length = FNAME_MAX = 256 (true for extN).
78 * path name length = PATH_MAX = 4096
79 * LOV MD size max = EA_MAX = 4000
80 * symlink: FNAME_MAX + PATH_MAX <- largest
81 * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create)
82 * rename: FNAME_MAX + FNAME_MAX
83 * open: FNAME_MAX + EA_MAX
85 * MDS_MAXREQSIZE ~= 4736 bytes =
86 * lustre_msg + ldlm_request + mds_body + mds_rec_create + FNAME_MAX + PATH_MAX
88 * Realistic size is about 512 bytes (20 character name + 128 char symlink),
89 * except in the open case where there are a large number of OSTs in a LOV.
91 #define MDS_MAXREQSIZE (5 * 1024) /* 5 KiB, above the ~4736-byte worst case estimated in the preceding comment */
/* OST service sizing.  OST_NUM_THREADS is num_physpages / 8192, clamped to
 * the range [2, OST_MAX_THREADS]. */
93 #define OST_MAX_THREADS 36UL
94 #define OST_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
95 OST_MAX_THREADS), 2UL)
96 #define OST_NEVENT_MAX 16384UL /* presumably the cap used by OST_NEVENTS; its continuation line is not visible here */
97 #define OST_NEVENTS min_t(unsigned long, num_physpages / 16, \
99 #define OST_NBUF_MAX 5000UL /* upper bound on OST_NBUFS */
100 #define OST_NBUFS min(OST_NEVENTS / 2, OST_NBUF_MAX) /* one buffer per 2 events, capped */
101 #define OST_BUFSIZE (8 * 1024) /* bytes in a single request buffer */
102 /* OST_MAXREQSIZE ~= 1640 bytes =
103 * lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote
105 * - single object with 16 pages is 512 bytes
106 * - OST_MAXREQSIZE must be at least 1 page of cookies plus some spillover
108 #define OST_MAXREQSIZE (5 * 1024) /* 5 KiB, above the ~1640-byte estimate in the preceding comment */
/* PTLBD service sizing.  Unlike the LDLM/MDS/OST values these are fixed
 * constants that do not scale with num_physpages.  By the
 * ?_NBUFS * ?_BUFSIZE rule, buffer memory is 20 * 32 KiB = 640 KiB. */
110 #define PTLBD_NUM_THREADS 4
111 #define PTLBD_NEVENTS 1024
112 #define PTLBD_NBUFS 20
113 #define PTLBD_BUFSIZE (32 * 1024)
114 #define PTLBD_MAXREQSIZE 1024
118 struct ptlrpc_ni *peer_ni;
121 struct ptlrpc_connection {
122 struct list_head c_link;
123 struct ptlrpc_peer c_peer;
124 struct obd_uuid c_remote_uuid;
128 struct ptlrpc_client {
129 __u32 cli_request_portal;
130 __u32 cli_reply_portal;
134 /* state flags of requests */
135 /* XXX only ones left are those used by the bulk descs as well! */
136 #define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */
137 #define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */
139 #define REQ_MAX_ACK_LOCKS 8 /* length of ptlrpc_request.rq_ack_locks[] */
/* NOTE(review): presumably enables the checking versions of the
 * LASSERT_*SWAB* macros below; the conditional that tests it is not visible
 * here -- confirm. */
141 #define SWAB_PARANOIA 1
143 /* unpacking: assert idx not unpacked already */
144 #define LASSERT_REQSWAB(rq, idx) \
146 LASSERT ((idx) < sizeof ((rq)->rq_req_swab_mask) * 8); \
147 LASSERT (((rq)->rq_req_swab_mask & (1 << (idx))) == 0); \
148 (rq)->rq_req_swab_mask |= (1 << (idx)); \
151 #define LASSERT_REPSWAB(rq, idx) \
153 LASSERT ((idx) < sizeof ((rq)->rq_rep_swab_mask) * 8); \
154 LASSERT (((rq)->rq_rep_swab_mask & (1 << (idx))) == 0); \
155 (rq)->rq_rep_swab_mask |= (1 << (idx)); \
158 /* just looking: assert idx already unpacked */
/* Each *SWABBED check is a single LASSERT combining two conditions: idx is
 * below the bit width of the swab mask, and bit idx of that mask is set.
 * REQ tests rq_req_swab_mask, REP tests rq_rep_swab_mask; neither modifies
 * the mask.
 * NOTE(review): the masks are __u32, so idx == 31 passes the range test and
 * 1 << 31 then shifts into the sign bit of a plain int; 1U << (idx) would
 * avoid that. */
159 #define LASSERT_REQSWABBED(rq, idx) \
160 LASSERT ((idx) < sizeof ((rq)->rq_req_swab_mask) * 8 && \
161 ((rq)->rq_req_swab_mask & (1 << (idx))) != 0)
163 #define LASSERT_REPSWABBED(rq, idx) \
164 LASSERT ((idx) < sizeof ((rq)->rq_rep_swab_mask) * 8 && \
165 ((rq)->rq_rep_swab_mask & (1 << (idx))) != 0)
/* Empty expansions of the four swab-tracking assertions: with these in
 * effect the checks compile away to nothing.
 * NOTE(review): the same four names are defined with real bodies earlier in
 * this header, so a preprocessor conditional (presumably on SWAB_PARANOIA)
 * must select between the two sets; it is not visible here -- confirm. */
167 #define LASSERT_REQSWAB(rq, idx)
168 #define LASSERT_REPSWAB(rq, idx)
169 #define LASSERT_REQSWABBED(rq, idx)
170 #define LASSERT_REPSWABBED(rq, idx)
173 union ptlrpc_async_args {
174 /* Scratchpad for passing args to completion interpreter. Users
175 * cast to the struct of their choosing, and LASSERT that this is
176 * big enough. For _tons_ of context, OBD_ALLOC a struct and store
177 * a pointer to it here. The pointer_arg ensures this struct is at
178 * least big enough for that. */
179 void *pointer_arg[9];
183 struct ptlrpc_request_set; /* forward declaration so the typedef below can name the type */
/* Callback type stored in ptlrpc_request_set.set_interpret.  It receives
 * the set, a void * and an int, and returns int.
 * NOTE(review): the void * is presumably the set's completion context and
 * the int a status code -- confirm against the code that invokes
 * set_interpret. */
184 typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int);
186 struct ptlrpc_request_set {
187 int set_remaining; /* # uncompleted requests */
188 wait_queue_head_t set_waitq;
189 wait_queue_head_t *set_wakeup_ptr;
190 struct list_head set_requests;
191 set_interpreter_func set_interpret; /* completion callback */
192 union ptlrpc_async_args set_args; /* completion context */
193 /* locked so that any old caller can communicate requests to
194 * the set holder who can then fold them into the lock-free set */
195 spinlock_t set_new_req_lock;
196 struct list_head set_new_requests;
199 struct ptlrpc_bulk_desc;
201 struct ptlrpc_request {
202 int rq_type; /* one of PTL_RPC_MSG_* */
203 struct list_head rq_list;
204 struct obd_device *rq_obd;
207 unsigned int rq_intr:1, rq_replied:1, rq_want_ack:1, rq_err:1,
208 rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1,
209 rq_no_resend:1, rq_resent:1, rq_waiting:1, rq_receiving_reply:1;
212 atomic_t rq_refcount;
214 int rq_request_portal; /* XXX FIXME bug 249 */
215 int rq_reply_portal; /* XXX FIXME bug 249 */
218 struct lustre_msg *rq_reqmsg;
222 struct lustre_msg *rq_repmsg;
227 __u32 rq_req_swab_mask;
228 __u32 rq_rep_swab_mask;
231 int rq_import_generation;
232 enum lustre_imp_state rq_send_state;
233 wait_queue_head_t rq_wait_for_rep; /* XXX also _for_ack */
236 ptl_md_t rq_reply_md;
237 ptl_handle_md_t rq_reply_md_h;
239 /* outgoing req/rep */
242 struct ptlrpc_peer rq_peer; /* XXX see service.c can this be factored away? */
243 struct obd_export *rq_export;
244 struct ptlrpc_connection *rq_connection;
245 struct obd_import *rq_import;
246 struct ptlrpc_service *rq_svc;
248 void (*rq_replay_cb)(struct ptlrpc_request *);
249 void (*rq_commit_cb)(struct ptlrpc_request *);
252 struct ptlrpc_bulk_desc *rq_bulk; /* client side bulk */
253 time_t rq_sent; /* when the request was sent */
256 struct list_head rq_set_chain;
257 struct ptlrpc_request_set *rq_set;
258 void *rq_interpret_reply; /* Async completion handler */
259 union ptlrpc_async_args rq_async_args; /* Async completion context */
261 /* Only used on the server side for tracking acks. */
262 struct ptlrpc_req_ack_lock {
263 struct lustre_handle lock;
265 } rq_ack_locks[REQ_MAX_ACK_LOCKS];
/* Request phases.  DEBUG_REQ_FLAGS compares req->rq_phase against these to
 * choose the phase name it prints.  All five share the 0xebc0de prefix and
 * differ only in the low byte.
 * NOTE(review): the distinctive prefix presumably makes a corrupt or
 * uninitialised phase easy to recognise -- confirm. */
269 #define RQ_PHASE_NEW 0xebc0de00
270 #define RQ_PHASE_RPC 0xebc0de01
271 #define RQ_PHASE_BULK 0xebc0de02
272 #define RQ_PHASE_INTERPRET 0xebc0de03
273 #define RQ_PHASE_COMPLETE 0xebc0de04
275 /* Spare the preprocessor, spoil the bugs. */
/* Evaluates to str when field is non-zero and to the empty string otherwise.
 * NOTE(review): neither argument is parenthesized in the expansion.  The
 * uses visible in this header pass only member accesses and string literals,
 * which is safe; ((field) ? (str) : "") would be robust for any argument. */
276 #define FLAG(field, str) (field ? str : "")
/* Expands to a comma-separated argument list for REQ_FLAGS_FMT: first the
 * phase name chosen from req->rq_phase ("?phase?" when it matches none of
 * the RQ_PHASE_* values), then eleven FLAG() entries, each yielding either
 * the flag's letter or "".  rq_receiving_reply is not among the bits
 * printed.
 * NOTE(review): req is used unparenthesized and is evaluated many times, so
 * callers should pass a plain pointer variable with no side effects. */
278 #define DEBUG_REQ_FLAGS(req) \
279 ((req->rq_phase == RQ_PHASE_NEW) ? "New" : \
280 (req->rq_phase == RQ_PHASE_RPC) ? "RPC" : \
281 (req->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" : \
282 (req->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" : \
283 (req->rq_phase == RQ_PHASE_BULK) ? "Bulk" : "?phase?"), \
284 FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \
285 FLAG(req->rq_want_ack, "A"), FLAG(req->rq_err, "E"), \
286 FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \
287 FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \
288 FLAG(req->rq_no_resend, "N"), FLAG(req->rq_resent, "s"), \
289 FLAG(req->rq_waiting, "W")
/* printf-style format for the arguments produced by DEBUG_REQ_FLAGS: the
 * phase name, a colon, then the flag strings run together with no
 * separators.  The number of %s after the colon has to stay equal to the
 * number of FLAG() entries in DEBUG_REQ_FLAGS. */
291 #define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s"
293 #define DEBUG_REQ(level, req, fmt, args...) \
295 CDEBUG(level, "@@@ " fmt \
296 " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d ref %d fl " \
297 REQ_FLAGS_FMT"/%x/%x rc %x\n" , ## args, req, req->rq_xid, \
299 req->rq_reqmsg ? req->rq_reqmsg->opc : -1, \
300 req->rq_import ? (char *)req->rq_import->imp_target_uuid.uuid : "<?>", \
301 req->rq_connection ? \
302 (char *)req->rq_connection->c_remote_uuid.uuid : "<?>", \
303 (req->rq_import && req->rq_import->imp_client) ? \
304 req->rq_import->imp_client->cli_request_portal : -1, \
305 req->rq_reqlen, req->rq_replen, \
306 atomic_read(&req->rq_refcount), \
307 DEBUG_REQ_FLAGS(req), \
308 req->rq_reqmsg ? req->rq_reqmsg->flags : 0, \
309 req->rq_repmsg ? req->rq_repmsg->flags : 0, \
313 struct ptlrpc_bulk_page {
314 struct ptlrpc_bulk_desc *bp_desc;
315 struct list_head bp_link;
317 int bp_pageoffset; /* offset within a page */
318 struct page *bp_page;
/* Bulk transfer type codes: {get,put} x {source,sink}.  The four values
 * 0..3 fit the two-bit ptlrpc_bulk_desc.bd_type field. */
321 #define BULK_GET_SOURCE 0
322 #define BULK_PUT_SINK 1
323 #define BULK_GET_SINK 2
324 #define BULK_PUT_SOURCE 3
326 struct ptlrpc_bulk_desc {
327 unsigned int bd_complete:1;
328 unsigned int bd_network_rw:1; /* accessible to the network */
329 unsigned int bd_type:2; /* {put,get}{source,sink} */
330 unsigned int bd_registered:1; /* client side */
331 spinlock_t bd_lock; /* serialise with callback */
332 int bd_import_generation;
333 struct obd_export *bd_export;
334 struct obd_import *bd_import;
336 struct ptlrpc_request *bd_req; /* associated request */
337 wait_queue_head_t bd_waitq; /* server side only WQ */
338 struct list_head bd_page_list;
343 ptl_handle_md_t bd_md_h;
344 ptl_handle_me_t bd_me_h;
346 int bd_callback_count; /* server side callbacks */
349 ptl_kiov_t bd_iov[PTL_MD_MAX_IOV];
351 struct iovec bd_iov[PTL_MD_MAX_IOV];
355 struct ptlrpc_thread {
356 struct list_head t_link;
359 wait_queue_head_t t_ctl_waitq;
362 struct ptlrpc_request_buffer_desc {
363 struct list_head rqbd_list;
364 struct ptlrpc_srv_ni *rqbd_srv_ni;
365 ptl_handle_me_t rqbd_me_h;
366 atomic_t rqbd_refcount;
370 /* event queues are per-ni, because one day we may get a hardware
371 * supported NAL that delivers events asynchronously wrt kernel portals
374 struct ptlrpc_ni { /* Generic interface state */
377 ptl_handle_ni_t pni_ni_h;
378 ptl_handle_eq_t pni_request_out_eq_h;
379 ptl_handle_eq_t pni_reply_in_eq_h;
380 ptl_handle_eq_t pni_reply_out_eq_h;
381 ptl_handle_eq_t pni_bulk_put_source_eq_h;
382 ptl_handle_eq_t pni_bulk_put_sink_eq_h;
383 ptl_handle_eq_t pni_bulk_get_source_eq_h;
384 ptl_handle_eq_t pni_bulk_get_sink_eq_h;
387 struct ptlrpc_srv_ni {
388 /* Interface-specific service state */
389 struct ptlrpc_service *sni_service; /* owning service */
390 struct ptlrpc_ni *sni_ni; /* network interface */
391 ptl_handle_eq_t sni_eq_h; /* event queue handle */
392 struct list_head sni_rqbds; /* all the request buffer descriptors */
393 __u32 sni_nrqbds; /* # request buffers */
394 atomic_t sni_nrqbds_receiving; /* # request buffers posted */
397 struct ptlrpc_service {
401 struct list_head srv_ni_list; /* list of interfaces */
402 __u32 srv_max_req_size; /* biggest request to receive */
403 __u32 srv_buf_size; /* # bytes in a request buffer */
405 __u32 srv_req_portal;
406 __u32 srv_rep_portal;
410 wait_queue_head_t srv_waitq; /* all threads sleep on this */
413 struct list_head srv_threads;
414 int (*srv_handler)(struct ptlrpc_request *req);
415 char *srv_name; /* only statically allocated strings here; we don't clean them */
416 struct proc_dir_entry *svc_procroot;
417 struct lprocfs_stats *svc_stats;
419 int srv_interface_rover;
420 struct ptlrpc_srv_ni srv_interfaces[0];
/* Service request handler type: takes the request and returns int.  This
 * is the same signature as the ptlrpc_service.srv_handler member, and it is
 * the handler parameter type of ptlrpc_init_svc(). */
423 typedef int (*svc_handler_t)(struct ptlrpc_request *req);
425 /* ptlrpc/events.c */
426 extern struct ptlrpc_ni ptlrpc_interfaces[];
427 extern int ptlrpc_ninterfaces;
428 extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct ptlrpc_peer *peer);
430 /* ptlrpc/connection.c */
431 void ptlrpc_dump_connections(void);
432 void ptlrpc_readdress_connection(struct ptlrpc_connection *, struct obd_uuid *);
433 struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
434 struct obd_uuid *uuid);
435 int ptlrpc_put_connection(struct ptlrpc_connection *c);
436 struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
437 void ptlrpc_init_connection(void);
438 void ptlrpc_cleanup_connection(void);
440 /* ptlrpc/niobuf.c */
441 int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *);
442 int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *);
443 void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk);
444 int ptlrpc_register_bulk(struct ptlrpc_request *req);
445 void ptlrpc_unregister_bulk (struct ptlrpc_request *req);
447 static inline int ptlrpc_bulk_complete (struct ptlrpc_bulk_desc *desc)
452 spin_lock_irqsave (&desc->bd_lock, flags);
453 rc = desc->bd_complete;
454 spin_unlock_irqrestore (&desc->bd_lock, flags);
458 int ptlrpc_reply(struct ptlrpc_request *req);
459 int ptlrpc_error(struct ptlrpc_request *req);
460 void ptlrpc_resend_req(struct ptlrpc_request *request);
461 int ptl_send_rpc(struct ptlrpc_request *request);
462 void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd);
464 /* ptlrpc/client.c */
465 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
466 struct ptlrpc_client *);
467 void ptlrpc_cleanup_client(struct obd_import *imp);
468 struct obd_uuid *ptlrpc_req_to_uuid(struct ptlrpc_request *req);
469 struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
471 int ptlrpc_queue_wait(struct ptlrpc_request *req);
472 int ptlrpc_replay_req(struct ptlrpc_request *req);
473 void ptlrpc_unregister_reply(struct ptlrpc_request *req);
474 void ptlrpc_restart_req(struct ptlrpc_request *req);
475 void ptlrpc_abort_inflight(struct obd_import *imp);
477 struct ptlrpc_request_set *ptlrpc_prep_set(void);
478 int ptlrpc_set_next_timeout(struct ptlrpc_request_set *);
479 int ptlrpc_check_set(struct ptlrpc_request_set *set);
480 int ptlrpc_set_wait(struct ptlrpc_request_set *);
481 int ptlrpc_expired_set(void *data);
482 void ptlrpc_interrupted_set(void *data);
483 void ptlrpc_set_destroy(struct ptlrpc_request_set *);
484 void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
485 void ptlrpc_set_add_new_req(struct ptlrpc_request_set *,
486 struct ptlrpc_request *);
488 struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
489 int count, int *lengths, char **bufs);
490 void ptlrpc_free_req(struct ptlrpc_request *request);
491 void ptlrpc_req_finished(struct ptlrpc_request *request);
492 void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
493 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
494 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
495 int type, int portal);
496 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req,
497 int type, int portal);
498 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk);
499 int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
500 struct page *page, int pageoffset, int len);
501 void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *page);
502 void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
503 struct obd_import *imp);
504 __u64 ptlrpc_next_xid(void);
506 /* ptlrpc/service.c */
507 struct ptlrpc_service *
508 ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size,
509 int req_portal, int rep_portal, svc_handler_t, char *name,
510 struct proc_dir_entry *proc_entry);
511 void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
512 int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc,
513 int cnt, char *base_name);
514 int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
516 int ptlrpc_unregister_service(struct ptlrpc_service *service);
518 struct ptlrpc_svc_data {
520 struct ptlrpc_service *svc;
521 struct ptlrpc_thread *thread;
522 struct obd_device *dev;
525 /* ptlrpc/import.c */
526 int ptlrpc_connect_import(struct obd_import *imp);
527 int ptlrpc_disconnect_import(struct obd_import *imp);
529 /* ptlrpc/pack_generic.c */
530 int lustre_msg_swabbed(struct lustre_msg *msg);
531 int lustre_pack_request(struct ptlrpc_request *, int count, int *lens,
533 int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens,
535 int lustre_msg_size(int count, int *lengths);
536 int lustre_unpack_msg(struct lustre_msg *m, int len);
537 void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
538 char *lustre_msg_string (struct lustre_msg *m, int n, int max_len);
539 void *lustre_swab_reqbuf (struct ptlrpc_request *req, int n, int minlen,
541 void *lustre_swab_repbuf (struct ptlrpc_request *req, int n, int minlen,
544 /* ldlm/ldlm_lib.c */
545 int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf);
546 int client_obd_cleanup(struct obd_device * obddev, int flags);
547 int client_connect_import(struct lustre_handle *conn, struct obd_device *obd,
548 struct obd_uuid *cluuid);
549 int client_disconnect_export(struct obd_export *exp, int failover);
551 /* ptlrpc/pinger.c */
552 int ptlrpc_pinger_add_import(struct obd_import *imp);
553 int ptlrpc_pinger_del_import(struct obd_import *imp);
555 /* ptlrpc/lproc_ptlrpc.c */
557 void ptlrpc_lprocfs_register_obd(struct obd_device *obddev);
558 void ptlrpc_lprocfs_unregister_obd(struct obd_device *obddev);
/* No-op replacements carrying the same names as the two prototypes just
 * above: each accepts any arguments and expands to an empty do/while
 * statement.
 * NOTE(review): a preprocessor conditional must choose between the
 * prototypes and these stubs; it is not visible here -- confirm which
 * symbol controls it. */
560 #define ptlrpc_lprocfs_register_obd(param...) do{}while(0)
561 #define ptlrpc_lprocfs_unregister_obd(param...) do{}while(0)
564 /* ptlrpc/llog_server.c */
565 struct llog_obd_ctxt;
566 int llog_origin_handle_create(struct ptlrpc_request *req);
567 int llog_origin_handle_next_block(struct ptlrpc_request *req);
568 int llog_origin_handle_read_header(struct ptlrpc_request *req);
569 int llog_origin_handle_close(struct ptlrpc_request *req);
570 int llog_origin_handle_cancel(struct ptlrpc_request *req);
572 /* ptlrpc/llog_client.c */
573 extern struct llog_operations llog_client_ops;