1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
6 * This file is part of Lustre, http://www.lustre.org.
8 * Lustre is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Lustre is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Lustre; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 #include <linux/version.h>
28 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
29 #include <linux/tqueue.h>
31 #include <linux/workqueue.h>
35 #include <linux/kp30.h>
36 // #include <linux/obd.h>
37 #include <portals/p30.h>
38 #include <linux/lustre_idl.h>
39 #include <linux/lustre_ha.h>
40 #include <linux/lustre_import.h>
41 #include <linux/lprocfs_status.h>
43 /* The following constants determine how much memory is devoted to
44 * buffering in the lustre services.
46 * ?_NEVENTS # event queue entries
48 * ?_NBUFS # request buffers
49 * ?_BUFSIZE # bytes in a single request buffer
50 * total memory = ?_NBUFS * ?_BUFSIZE
52 * ?_MAXREQSIZE # maximum request service will receive
53 * larger messages will get dropped.
54 * request buffers are auto-unlinked when less than ?_MAXREQSIZE is left in them.
/* LDLM service sizing.
 *   LDLM_NUM_THREADS : fixed at 4
 *   LDLM_NEVENTS     : num_physpages / 64, capped at LDLM_NEVENT_MAX (8192)
 *   LDLM_NBUFS       : LDLM_NEVENTS / 16, capped at LDLM_NBUF_MAX (256)
 *   LDLM_BUFSIZE     : 8 KiB per request buffer
 *   LDLM_MAXREQSIZE  : 1024 bytes
 * NOTE(review): the integer divisions round down, so LDLM_NEVENTS is 0 for
 * num_physpages < 64 and LDLM_NBUFS is 0 for num_physpages < 1024 --
 * confirm the consumers tolerate a zero count. */
58 #define LDLM_NUM_THREADS 4
59 #define LDLM_NEVENT_MAX 8192UL
60 #define LDLM_NEVENTS min(num_physpages / 64, LDLM_NEVENT_MAX)
61 #define LDLM_NBUF_MAX 256UL
62 #define LDLM_NBUFS min(LDLM_NEVENTS / 16, LDLM_NBUF_MAX)
63 #define LDLM_BUFSIZE (8 * 1024)
64 #define LDLM_MAXREQSIZE 1024
/* MDT/MDS service sizing.
 *   MDT_NUM_THREADS : num_physpages / 8192, capped at MDT_MAX_THREADS (32)
 *   MDS_NEVENTS     : num_physpages / 64, capped at MDS_NEVENT_MAX (8192)
 *   MDS_NBUFS       : MDS_NEVENTS / 16, capped at MDS_NBUF_MAX (512)
 *   MDS_BUFSIZE     : 8 KiB per request buffer
 * NOTE(review): MDT_NUM_THREADS evaluates to 0 when num_physpages < 8192
 * because there is no lower bound -- confirm callers handle a zero thread
 * count. */
66 #define MDT_MAX_THREADS 32UL
67 #define MDT_NUM_THREADS min(num_physpages / 8192, MDT_MAX_THREADS)
68 #define MDS_NEVENT_MAX 8192UL
69 #define MDS_NEVENTS min(num_physpages / 64, MDS_NEVENT_MAX)
70 #define MDS_NBUF_MAX 512UL
71 #define MDS_NBUFS min(MDS_NEVENTS / 16, MDS_NBUF_MAX)
72 #define MDS_BUFSIZE (8 * 1024)
73 /* Assume file name length = FNAME_MAX = 256 (true for extN).
74 * path name length = PATH_MAX = 4096
75 * LOV MD size max = EA_MAX = 4000
76 * symlink: FNAME_MAX + PATH_MAX <- largest
77 * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create)
78 * rename: FNAME_MAX + FNAME_MAX
79 * open: FNAME_MAX + EA_MAX
81 * MDS_MAXREQSIZE ~= 4736 bytes =
82 * lustre_msg + ldlm_request + mds_body + mds_rec_create + FNAME_MAX + PATH_MAX
84 * Realistic size is about 512 bytes (20 character name + 128 char symlink),
85 * except in the open case where there are a large number of OSTs in a LOV.
87 #define MDS_MAXREQSIZE (5 * 1024)
/* OST service sizing.
 *   OST_NUM_THREADS : num_physpages / 8192, capped at OST_MAX_THREADS (36)
 *   OST_NEVENTS     : num_physpages / 16, capped at OST_NEVENT_MAX (32768)
 *   OST_NBUFS       : OST_NEVENTS / 64, capped at OST_NBUF_MAX (1280)
 *   OST_BUFSIZE     : 8 KiB per request buffer
 * NOTE(review): OST_NUM_THREADS evaluates to 0 when num_physpages < 8192
 * because there is no lower bound -- confirm callers handle a zero thread
 * count. */
89 #define OST_MAX_THREADS 36UL
90 #define OST_NUM_THREADS min(num_physpages / 8192, OST_MAX_THREADS)
91 #define OST_NEVENT_MAX 32768UL
92 #define OST_NEVENTS min(num_physpages / 16, OST_NEVENT_MAX)
93 #define OST_NBUF_MAX 1280UL
94 #define OST_NBUFS min(OST_NEVENTS / 64, OST_NBUF_MAX)
95 #define OST_BUFSIZE (8 * 1024)
96 /* OST_MAXREQSIZE ~= 1640 bytes =
97 * lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote
99 * single object with 16 pages is 512 bytes
101 #define OST_MAXREQSIZE (2 * 1024)
/* PTLBD service sizing.  Unlike the LDLM/MDS/OST values these are fixed
 * constants and do not scale with num_physpages:
 *   4 threads, 1024 event queue entries, 20 request buffers of 32 KiB each,
 *   and a maximum request size of 1024 bytes. */
103 #define PTLBD_NUM_THREADS 4
104 #define PTLBD_NEVENTS 1024
105 #define PTLBD_NBUFS 20
106 #define PTLBD_BUFSIZE (32 * 1024)
107 #define PTLBD_MAXREQSIZE 1024
109 #define CONN_INVALID 1
113 struct ptlrpc_ni *peer_ni;
116 struct ptlrpc_connection {
117 struct list_head c_link;
118 struct ptlrpc_peer c_peer;
119 struct obd_uuid c_local_uuid; /* XXX do we need this? */
120 struct obd_uuid c_remote_uuid;
122 __u32 c_generation; /* changes upon new connection */
123 __u32 c_epoch; /* changes when peer changes */
124 __u32 c_bootcount; /* peer's boot count */
131 __u64 c_remote_token;
133 __u32 c_flags; // can we indicate INVALID elsewhere?
136 struct ptlrpc_client {
137 __u32 cli_request_portal;
138 __u32 cli_reply_portal;
140 __u32 cli_target_devno;
146 /* state flags of requests */
147 /* XXX only ones left are those used by the bulk descs as well! */
148 #define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */
149 #define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */
151 #define REQ_MAX_ACK_LOCKS 4
153 #define SWAB_PARANOIA 1
155 /* unpacking: assert idx not unpacked already */
156 #define LASSERT_REQSWAB(rq, idx) \
158 LASSERT ((idx) < sizeof ((rq)->rq_req_swab_mask) * 8); \
159 LASSERT (((rq)->rq_req_swab_mask & (1 << (idx))) == 0); \
160 (rq)->rq_req_swab_mask |= (1 << (idx)); \
163 #define LASSERT_REPSWAB(rq, idx) \
165 LASSERT ((idx) < sizeof ((rq)->rq_rep_swab_mask) * 8); \
166 LASSERT (((rq)->rq_rep_swab_mask & (1 << (idx))) == 0); \
167 (rq)->rq_rep_swab_mask |= (1 << (idx)); \
/*
 * Just looking: assert that buffer 'idx' has already been unpacked.
 *
 *   rq  - request whose rq_req_swab_mask (REQ variant) or rq_rep_swab_mask
 *         (REP variant) is consulted
 *   idx - buffer index; must be smaller than the bit width of the mask
 *
 * The bit is built from an unsigned one (1U): with a plain int, testing the
 * top bit of the 32-bit mask (idx == 31) would shift into the sign bit,
 * which is undefined behaviour.
 */
#define LASSERT_REQSWABBED(rq, idx)                                     \
        LASSERT ((idx) < sizeof ((rq)->rq_req_swab_mask) * 8 &&        \
                 ((rq)->rq_req_swab_mask & (1U << (idx))) != 0)

#define LASSERT_REPSWABBED(rq, idx)                                     \
        LASSERT ((idx) < sizeof ((rq)->rq_rep_swab_mask) * 8 &&        \
                 ((rq)->rq_rep_swab_mask & (1U << (idx))) != 0)
/* No-op variants: each macro expands to nothing, so its arguments are never
 * evaluated and no swab-mask bookkeeping is performed.
 * NOTE(review): presumably the branch used when SWAB_PARANOIA is disabled;
 * the surrounding conditional is not visible here -- confirm. */
179 #define LASSERT_REQSWAB(rq, idx)
180 #define LASSERT_REPSWAB(rq, idx)
181 #define LASSERT_REQSWABBED(rq, idx)
182 #define LASSERT_REPSWABBED(rq, idx)
185 union ptlrpc_async_args {
186 /* Scratchpad for passing args to completion interpreter. Users
187 * cast to the struct of their choosing, and LASSERT that this is
188 * big enough. For _tons_ of context, OBD_ALLOC a struct and store
189 * a pointer to it here. The pointer_arg ensures this struct is at
190 * least big enough for that. */
191 void *pointer_arg[4];
195 struct ptlrpc_request_set {
196 int set_remaining; /* # uncompleted requests */
197 wait_queue_head_t set_waitq;
198 struct list_head set_requests;
199 void *set_interpret; /* completion callback */
200 union ptlrpc_async_args set_args; /* completion context */
203 struct ptlrpc_bulk_desc;
205 struct ptlrpc_request {
206 int rq_type; /* one of PTL_RPC_MSG_* */
207 struct list_head rq_list;
208 struct obd_device *rq_obd;
211 unsigned int rq_intr:1, rq_replied:1, rq_want_ack:1, rq_err:1,
212 rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1,
213 rq_no_resend:1, rq_resent:1, rq_no_recov:1, rq_waiting:1,
214 rq_receiving_reply:1;
217 atomic_t rq_refcount;
219 int rq_request_portal; /* XXX FIXME bug 249 */
220 int rq_reply_portal; /* XXX FIXME bug 249 */
223 struct lustre_msg *rq_reqmsg;
227 struct lustre_msg *rq_repmsg;
232 __u32 rq_req_swab_mask;
233 __u32 rq_rep_swab_mask;
236 int rq_import_generation;
238 wait_queue_head_t rq_wait_for_rep; /* XXX also _for_ack */
241 ptl_md_t rq_reply_md;
242 ptl_handle_md_t rq_reply_md_h;
244 /* outgoing req/rep */
247 struct ptlrpc_peer rq_peer; /* XXX see service.c can this be factored away? */
248 struct obd_export *rq_export;
249 struct ptlrpc_connection *rq_connection;
250 struct obd_import *rq_import;
251 struct ptlrpc_service *rq_svc;
253 void (*rq_replay_cb)(struct ptlrpc_request *);
254 void *rq_replay_data;
256 struct ptlrpc_bulk_desc *rq_bulk; /* client side bulk */
257 time_t rq_sent; /* when the request was sent */
260 struct list_head rq_set_chain;
261 struct ptlrpc_request_set *rq_set;
262 void *rq_interpret_reply; /* Async completion handler */
263 union ptlrpc_async_args rq_async_args; /* Async completion context */
265 /* Only used on the server side for tracking acks. */
266 struct ptlrpc_req_ack_lock {
267 struct lustre_handle lock;
269 } rq_ack_locks[REQ_MAX_ACK_LOCKS];
/* Request phase values; DEBUG_REQ_FLAGS compares rq_phase against these. */
#define RQ_PHASE_NEW         0xebc0de00
#define RQ_PHASE_RPC         0xebc0de01
#define RQ_PHASE_BULK        0xebc0de02
#define RQ_PHASE_INTERPRET   0xebc0de03
#define RQ_PHASE_COMPLETE    0xebc0de04

/* Spare the preprocessor, spoil the bugs. */
/*
 * FLAG - yield 'str' when 'field' is non-zero, "" otherwise.
 * Both arguments are parenthesised so that an expression argument (for
 * example a conditional expression) cannot re-associate with the ?: here.
 */
#define FLAG(field, str) ((field) ? (str) : "")

/*
 * DEBUG_REQ_FLAGS - expand to the 13 comma-separated string arguments
 * consumed by REQ_FLAGS_FMT: the phase name followed by one string per
 * request flag (a single letter when the flag is set, "" when clear).
 *
 *   req - pointer to a request providing rq_phase and the rq_* flag bits
 *
 * Every RQ_PHASE_* value has a name, including RQ_PHASE_BULK ("Bulk");
 * any other rq_phase value is reported as "?phase?".
 */
#define DEBUG_REQ_FLAGS(req)                                                   \
        (((req)->rq_phase == RQ_PHASE_NEW) ? "New" :                           \
         ((req)->rq_phase == RQ_PHASE_RPC) ? "Rpc" :                           \
         ((req)->rq_phase == RQ_PHASE_BULK) ? "Bulk" :                         \
         ((req)->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" :               \
         ((req)->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" : "?phase?"),     \
        FLAG((req)->rq_intr, "I"), FLAG((req)->rq_replied, "R"),               \
        FLAG((req)->rq_want_ack, "A"), FLAG((req)->rq_err, "E"),               \
        FLAG((req)->rq_timedout, "X") /* eXpired */,                           \
        FLAG((req)->rq_resend, "S"),                                           \
        FLAG((req)->rq_restart, "T"), FLAG((req)->rq_replay, "P"),             \
        FLAG((req)->rq_no_resend, "N"), FLAG((req)->rq_resent, "s"),           \
        FLAG((req)->rq_no_recov, "n"), FLAG((req)->rq_waiting, "W")

/* One %s for the phase name plus one per flag emitted by DEBUG_REQ_FLAGS. */
#define REQ_FLAGS_FMT "%s%s%s%s%s%s%s%s%s%s%s%s%s"
295 #define DEBUG_REQ(level, req, fmt, args...) \
297 CDEBUG(level, "@@@ " fmt \
298 " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d ref %d fl " \
299 REQ_FLAGS_FMT"/%x/%x rc %x\n" , ## args, req, req->rq_xid, \
300 req->rq_reqmsg ? req->rq_reqmsg->transno : -1, \
301 req->rq_reqmsg ? req->rq_reqmsg->opc : -1, \
302 req->rq_import ? (char *)req->rq_import->imp_target_uuid.uuid : "<?>", \
303 req->rq_connection ? \
304 (char *)req->rq_connection->c_remote_uuid.uuid : "<?>", \
305 (req->rq_import && req->rq_import->imp_client) ? \
306 req->rq_import->imp_client->cli_request_portal : -1, \
307 req->rq_reqlen, req->rq_replen, \
308 atomic_read(&req->rq_refcount), \
309 DEBUG_REQ_FLAGS(req), \
310 req->rq_reqmsg ? req->rq_reqmsg->flags : 0, \
311 req->rq_repmsg ? req->rq_repmsg->flags : 0, \
315 struct ptlrpc_bulk_page {
316 struct ptlrpc_bulk_desc *bp_desc;
317 struct list_head bp_link;
319 int bp_pageoffset; /* offset within a page */
320 struct page *bp_page;
/* Bulk transfer type codes, one per {put,get} x {source,sink} combination.
 * The four values 0..3 fit the 2-bit bd_type field of
 * struct ptlrpc_bulk_desc. */
323 #define BULK_GET_SOURCE 0
324 #define BULK_PUT_SINK 1
325 #define BULK_GET_SINK 2
326 #define BULK_PUT_SOURCE 3
328 struct ptlrpc_bulk_desc {
329 unsigned int bd_complete:1;
330 unsigned int bd_network_rw:1; /* accessible to the network */
331 unsigned int bd_type:2; /* {put,get}{source,sink} */
332 unsigned int bd_registered:1; /* client side */
333 spinlock_t bd_lock; /* serialise with callback */
334 int bd_import_generation;
335 struct obd_export *bd_export;
336 struct obd_import *bd_import;
338 struct ptlrpc_request *bd_req; /* associated request */
339 wait_queue_head_t bd_waitq; /* server side only WQ */
340 struct list_head bd_page_list;
345 ptl_handle_md_t bd_md_h;
346 ptl_handle_me_t bd_me_h;
348 int bd_callback_count; /* server side callbacks */
351 ptl_kiov_t bd_iov[16]; /* self-sized pre-allocated iov */
353 struct iovec bd_iov[16]; /* self-sized pre-allocated iov */
357 struct ptlrpc_thread {
358 struct list_head t_link;
361 wait_queue_head_t t_ctl_waitq;
364 struct ptlrpc_request_buffer_desc {
365 struct list_head rqbd_list;
366 struct ptlrpc_srv_ni *rqbd_srv_ni;
367 ptl_handle_me_t rqbd_me_h;
368 atomic_t rqbd_refcount;
373 /* Generic interface state */
376 ptl_handle_ni_t pni_ni_h;
377 ptl_handle_eq_t pni_request_out_eq_h;
378 ptl_handle_eq_t pni_reply_in_eq_h;
379 ptl_handle_eq_t pni_reply_out_eq_h;
380 ptl_handle_eq_t pni_bulk_put_source_eq_h;
381 ptl_handle_eq_t pni_bulk_put_sink_eq_h;
382 ptl_handle_eq_t pni_bulk_get_source_eq_h;
383 ptl_handle_eq_t pni_bulk_get_sink_eq_h;
386 struct ptlrpc_srv_ni {
387 /* Interface-specific service state */
388 struct ptlrpc_service *sni_service; /* owning service */
389 struct ptlrpc_ni *sni_ni; /* network interface */
390 ptl_handle_eq_t sni_eq_h; /* event queue handle */
391 struct list_head sni_rqbds; /* all the request buffer descriptors */
392 __u32 sni_nrqbds; /* # request buffers */
393 atomic_t sni_nrqbds_receiving; /* # request buffers posted */
396 struct ptlrpc_service {
400 struct list_head srv_ni_list; /* list of interfaces */
401 __u32 srv_max_req_size; /* biggest request to receive */
402 __u32 srv_buf_size; /* # bytes in a request buffer */
404 __u32 srv_req_portal;
405 __u32 srv_rep_portal;
409 wait_queue_head_t srv_waitq; /* all threads sleep on this */
412 struct list_head srv_threads;
413 int (*srv_handler)(struct ptlrpc_request *req);
414 char *srv_name; /* only statically allocated strings here; we don't clean them */
415 struct proc_dir_entry *svc_procroot;
416 struct lprocfs_counters *svc_counters;
418 int srv_interface_rover;
419 struct ptlrpc_srv_ni srv_interfaces[0];
422 typedef int (*svc_handler_t)(struct ptlrpc_request *req);
424 /* ptlrpc/events.c */
425 extern struct ptlrpc_ni ptlrpc_interfaces[];
426 extern int ptlrpc_ninterfaces;
427 extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct ptlrpc_peer *peer);
429 /* ptlrpc/connection.c */
430 void ptlrpc_dump_connections(void);
431 void ptlrpc_readdress_connection(struct ptlrpc_connection *, struct obd_uuid *);
432 struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
433 struct obd_uuid *uuid);
434 int ptlrpc_put_connection(struct ptlrpc_connection *c);
435 struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
436 void ptlrpc_init_connection(void);
437 void ptlrpc_cleanup_connection(void);
439 /* ptlrpc/niobuf.c */
440 int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *);
441 int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *);
442 void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk);
443 int ptlrpc_register_bulk(struct ptlrpc_request *req);
444 void ptlrpc_unregister_bulk (struct ptlrpc_request *req);
446 static inline int ptlrpc_bulk_complete (struct ptlrpc_bulk_desc *desc)
451 spin_lock_irqsave (&desc->bd_lock, flags);
452 rc = desc->bd_complete;
453 spin_unlock_irqrestore (&desc->bd_lock, flags);
457 int ptlrpc_reply(struct ptlrpc_request *req);
458 int ptlrpc_error(struct ptlrpc_request *req);
459 void ptlrpc_resend_req(struct ptlrpc_request *request);
460 int ptl_send_rpc(struct ptlrpc_request *request);
461 void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd);
463 /* ptlrpc/client.c */
464 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
465 struct ptlrpc_client *);
466 void ptlrpc_cleanup_client(struct obd_import *imp);
467 struct obd_uuid *ptlrpc_req_to_uuid(struct ptlrpc_request *req);
468 struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
470 int ptlrpc_queue_wait(struct ptlrpc_request *req);
471 int ptlrpc_replay_req(struct ptlrpc_request *req);
472 void ptlrpc_unregister_reply(struct ptlrpc_request *req);
473 void ptlrpc_restart_req(struct ptlrpc_request *req);
474 void ptlrpc_abort_inflight(struct obd_import *imp);
476 struct ptlrpc_request_set *ptlrpc_prep_set(void);
477 int ptlrpc_set_wait(struct ptlrpc_request_set *);
478 void ptlrpc_set_destroy(struct ptlrpc_request_set *);
479 void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
481 struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
482 int count, int *lengths, char **bufs);
483 void ptlrpc_free_req(struct ptlrpc_request *request);
484 void ptlrpc_req_finished(struct ptlrpc_request *request);
485 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
486 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
487 int type, int portal);
488 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req,
489 int type, int portal);
490 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk);
491 int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
492 struct page *page, int pageoffset, int len);
493 void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *page);
494 void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
495 struct obd_import *imp);
496 __u64 ptlrpc_next_xid(void);
498 /* ptlrpc/ptlrpc_module.c */
499 void ptlrpc_put_ldlm_hooks(void);
500 int ptlrpc_ldlm_hooks_referenced(void);
502 /* ptlrpc/service.c */
503 struct ptlrpc_service *
504 ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size,
505 int req_portal, int rep_portal, svc_handler_t, char *name,
506 struct obd_device *dev);
507 void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
508 int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
510 int ptlrpc_unregister_service(struct ptlrpc_service *service);
512 struct ptlrpc_svc_data {
514 struct ptlrpc_service *svc;
515 struct ptlrpc_thread *thread;
516 struct obd_device *dev;
519 /* ptlrpc/pack_generic.c */
520 int lustre_pack_msg(int count, int *lens, char **bufs, int *len,
521 struct lustre_msg **msg);
522 int lustre_msg_size(int count, int *lengths);
523 int lustre_unpack_msg(struct lustre_msg *m, int len);
524 void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
525 char *lustre_msg_string (struct lustre_msg *m, int n, int max_len);
526 void *lustre_swab_reqbuf (struct ptlrpc_request *req, int n, int minlen,
528 void *lustre_swab_repbuf (struct ptlrpc_request *req, int n, int minlen,
531 /* ldlm/ldlm_lib.c */
532 int client_import_connect(struct lustre_handle *conn, struct obd_device *obd,
533 struct obd_uuid *cluuid);
534 int client_import_disconnect(struct lustre_handle *conn, int failover);