From ecec139c9d4e5ca37937fefee256f00dd84d53df Mon Sep 17 00:00:00 2001 From: eeb Date: Thu, 28 Oct 2004 12:05:35 +0000 Subject: [PATCH] * Landed b1_2_singleportals --- lustre/autoMakefile.am | 7 +- lustre/configure.in | 6 +- lustre/include/liblustre.h | 38 +- lustre/include/linux/lustre_net.h | 79 +- lustre/include/linux/obd_class.h | 2 +- lustre/ldlm/ldlm_lib.c | 15 +- lustre/ldlm/ldlm_lock.c | 12 +- lustre/ldlm/ldlm_lockd.c | 58 +- lustre/ldlm/ldlm_request.c | 8 +- lustre/llite/llite_lib.c | 23 +- lustre/mds/mds_reint.c | 5 +- lustre/obdclass/class_obd.c | 8 +- lustre/obdclass/lustre_peer.c | 30 +- lustre/obdclass/obd_config.c | 2 +- lustre/ost/ost_handler.c | 52 +- lustre/portals/archdep.m4 | 235 +- lustre/portals/build.m4 | 13 +- lustre/portals/include/linux/kp30.h | 151 +- lustre/portals/include/linux/kpr.h | 21 +- lustre/portals/include/linux/libcfs.h | 94 +- lustre/portals/include/linux/portals_compat25.h | 1 + lustre/portals/include/linux/portals_lib.h | 108 - lustre/portals/include/portals/Makefile.am | 6 +- lustre/portals/include/portals/api-support.h | 9 +- lustre/portals/include/portals/api.h | 61 +- lustre/portals/include/portals/arg-blocks.h | 265 -- lustre/portals/include/portals/build_check.h | 2 +- lustre/portals/include/portals/defines.h | 116 - lustre/portals/include/portals/errno.h | 77 +- lustre/portals/include/portals/lib-dispatch.h | 45 - lustre/portals/include/portals/lib-nal.h | 115 - lustre/portals/include/portals/lib-p30.h | 277 +- lustre/portals/include/portals/lib-types.h | 154 +- lustre/portals/include/portals/nal.h | 84 +- lustre/portals/include/portals/nalids.h | 6 +- lustre/portals/include/portals/p30.h | 49 +- lustre/portals/include/portals/ppid.h | 49 - lustre/portals/include/portals/ptlctl.h | 24 +- lustre/portals/include/portals/types.h | 134 +- lustre/portals/knals/Makefile.in | 3 +- lustre/portals/knals/autoMakefile.am | 2 +- lustre/portals/knals/gmnal/gmnal.h | 109 +- lustre/portals/knals/gmnal/gmnal_api.c | 362 +-- 
lustre/portals/knals/gmnal/gmnal_cb.c | 152 +- lustre/portals/knals/gmnal/gmnal_comm.c | 177 +- lustre/portals/knals/gmnal/gmnal_module.c | 37 +- lustre/portals/knals/ibnal/Makefile.in | 6 - lustre/portals/knals/ibnal/autoMakefile.am | 10 - lustre/portals/knals/ibnal/ibnal.c | 2146 -------------- lustre/portals/knals/ibnal/ibnal.h | 565 ---- lustre/portals/knals/ibnal/ibnal_cb.c | 1289 --------- .../knals/ibnal/ibnal_send_recv_self_testing.c | 116 - lustre/portals/knals/ibnal/uagent.c | 391 --- lustre/portals/knals/{ibnal => iibnal}/.cvsignore | 2 +- lustre/portals/knals/iibnal/Makefile.in | 6 + lustre/portals/knals/iibnal/Makefile.mk | 10 + lustre/portals/knals/iibnal/autoMakefile.am | 15 + lustre/portals/knals/iibnal/iibnal.c | 1713 +++++++++++ lustre/portals/knals/iibnal/iibnal.h | 892 ++++++ lustre/portals/knals/iibnal/iibnal_cb.c | 3018 ++++++++++++++++++++ .../knals/{scimacnal => openibnal}/.cvsignore | 2 +- lustre/portals/knals/openibnal/openibnal.c | 971 ++++--- lustre/portals/knals/openibnal/openibnal.h | 480 ++-- lustre/portals/knals/openibnal/openibnal_cb.c | 1387 ++++----- lustre/portals/knals/qswnal/Makefile.in | 2 +- lustre/portals/knals/qswnal/qswnal.c | 386 ++- lustre/portals/knals/qswnal/qswnal.h | 80 +- lustre/portals/knals/qswnal/qswnal_cb.c | 1004 ++++--- lustre/portals/knals/socknal/socknal.c | 1570 +++++++--- lustre/portals/knals/socknal/socknal.h | 95 +- lustre/portals/knals/socknal/socknal_cb.c | 801 +++--- lustre/portals/libcfs/.cvsignore | 1 - lustre/portals/libcfs/Makefile.in | 7 +- lustre/portals/libcfs/autoMakefile.am | 13 +- lustre/portals/libcfs/debug.c | 36 +- lustre/portals/libcfs/lwt.c | 62 +- lustre/portals/libcfs/module.c | 523 +--- lustre/portals/libcfs/proc.c | 24 +- lustre/portals/portals/Makefile.in | 8 +- lustre/portals/portals/Makefile.mk | 6 +- lustre/portals/portals/api-eq.c | 202 -- lustre/portals/portals/api-errno.c | 50 +- lustre/portals/portals/api-init.c | 63 - lustre/portals/portals/api-me.c | 42 - 
lustre/portals/portals/api-ni.c | 272 +- lustre/portals/portals/api-wrap.c | 689 ++--- lustre/portals/portals/autoMakefile.am | 12 +- lustre/portals/portals/lib-dispatch.c | 80 - lustre/portals/portals/lib-eq.c | 279 +- lustre/portals/portals/lib-init.c | 230 +- lustre/portals/portals/lib-md.c | 452 ++- lustre/portals/portals/lib-me.c | 210 +- lustre/portals/portals/lib-move.c | 854 +++--- lustre/portals/portals/lib-msg.c | 109 +- lustre/portals/portals/lib-ni.c | 126 +- lustre/portals/portals/lib-pid.c | 24 +- lustre/portals/portals/module.c | 6 +- lustre/portals/router/proc.c | 2 +- lustre/portals/router/router.c | 149 +- lustre/portals/router/router.h | 10 +- lustre/portals/tests/ping_cli.c | 69 +- lustre/portals/tests/ping_srv.c | 48 +- lustre/portals/tests/sping_cli.c | 62 +- lustre/portals/tests/sping_srv.c | 45 +- lustre/portals/unals/Makefile.am | 11 +- lustre/portals/unals/address.c | 13 +- lustre/portals/unals/bridge.h | 14 +- lustre/portals/unals/connection.c | 88 +- lustre/portals/unals/dispatch.h | 7 + lustre/portals/unals/procapi.c | 147 +- lustre/portals/unals/procbridge.h | 9 +- lustre/portals/unals/proclib.c | 139 +- lustre/portals/unals/select.c | 327 ++- lustre/portals/unals/tcpnal.c | 21 +- lustre/portals/utils/Makefile.am | 21 +- lustre/portals/utils/acceptor.c | 155 +- lustre/portals/utils/debug.c | 124 +- lustre/portals/utils/l_ioctl.c | 10 +- lustre/portals/utils/parser.c | 62 - lustre/portals/utils/parser.h | 6 - lustre/portals/utils/portals.c | 681 +++-- lustre/portals/utils/ptlctl.c | 12 +- lustre/portals/utils/wirecheck.c | 2 +- lustre/ptlrpc/Makefile.in | 2 +- lustre/ptlrpc/client.c | 60 +- lustre/ptlrpc/connection.c | 33 +- lustre/ptlrpc/events.c | 288 +- lustre/ptlrpc/niobuf.c | 175 +- lustre/ptlrpc/pers.c | 33 +- lustre/ptlrpc/ptlrpc_internal.h | 5 + lustre/ptlrpc/service.c | 47 +- lustre/utils/Lustre/lustredb.py | 7 + lustre/utils/lconf | 120 +- lustre/utils/lctl.c | 34 +- lustre/utils/lmc | 36 +- lustre/utils/wirecheck.c | 2 +- 136 
files changed, 14493 insertions(+), 13203 deletions(-) delete mode 100644 lustre/portals/include/portals/arg-blocks.h delete mode 100644 lustre/portals/include/portals/defines.h delete mode 100644 lustre/portals/include/portals/lib-dispatch.h delete mode 100644 lustre/portals/include/portals/lib-nal.h delete mode 100644 lustre/portals/include/portals/ppid.h delete mode 100644 lustre/portals/knals/ibnal/Makefile.in delete mode 100644 lustre/portals/knals/ibnal/autoMakefile.am delete mode 100644 lustre/portals/knals/ibnal/ibnal.c delete mode 100644 lustre/portals/knals/ibnal/ibnal.h delete mode 100644 lustre/portals/knals/ibnal/ibnal_cb.c delete mode 100644 lustre/portals/knals/ibnal/ibnal_send_recv_self_testing.c delete mode 100644 lustre/portals/knals/ibnal/uagent.c rename lustre/portals/knals/{ibnal => iibnal}/.cvsignore (100%) create mode 100644 lustre/portals/knals/iibnal/Makefile.in create mode 100644 lustre/portals/knals/iibnal/Makefile.mk create mode 100644 lustre/portals/knals/iibnal/autoMakefile.am create mode 100644 lustre/portals/knals/iibnal/iibnal.c create mode 100644 lustre/portals/knals/iibnal/iibnal.h create mode 100644 lustre/portals/knals/iibnal/iibnal_cb.c rename lustre/portals/knals/{scimacnal => openibnal}/.cvsignore (100%) delete mode 100644 lustre/portals/portals/api-eq.c delete mode 100644 lustre/portals/portals/api-init.c delete mode 100644 lustre/portals/portals/api-me.c delete mode 100644 lustre/portals/portals/lib-dispatch.c diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index 7cfb68e..cc851a9 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -40,7 +40,7 @@ if !LINUX25 DEP = dep dep: .depend -.depend: $(LDISKFS) lvfs-sources libcfs-sources +.depend: $(LDISKFS) lvfs-sources $(MAKE) $(ARCH_UM) -C $(LINUX) -f $(PWD)/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$(LINUX_CONFIG) -o scripts -o include/config/MARKER _sfdep_$(PWD) _FASTDEP_ALL_SUB_DIRS="$(PWD)" CLEANFILES = .depend @@ -55,10 +55,7 @@ endif 
lvfs-sources: $(MAKE) sources -C lvfs -libcfs-sources: - $(MAKE) sources -C portals/libcfs - -modules: lustre_build_version $(DEP) $(LDISKFS) lvfs-sources libcfs-sources +modules: lustre_build_version $(DEP) $(LDISKFS) lvfs-sources $(MAKE) $(ARCH_UM) -C $(LINUX) -f $(PWD)/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$(LINUX_CONFIG) $(MODULE_TARGET)=$(PWD) -o tmp_include_depends -o scripts -o include/config/MARKER $@ endif # MODULES diff --git a/lustre/configure.in b/lustre/configure.in index bcf65df..ffe07b0 100644 --- a/lustre/configure.in +++ b/lustre/configure.in @@ -231,8 +231,10 @@ portals/knals/Makefile portals/knals/autoMakefile portals/knals/gmnal/Makefile portals/knals/gmnal/autoMakefile -portals/knals/ibnal/Makefile -portals/knals/ibnal/autoMakefile +portals/knals/iibnal/Makefile +portals/knals/iibnal/autoMakefile +portals/knals/openibnal/Makefile +portals/knals/openibnal/autoMakefile portals/knals/qswnal/Makefile portals/knals/qswnal/autoMakefile portals/knals/socknal/Makefile diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index af80f44..c9e1274 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -25,14 +25,18 @@ #define LIBLUSTRE_H__ #include -#include -#ifndef __CYGWIN__ -#include -#include -#else -#include -#include "ioctl.h" +#ifdef HAVE_STDINT_H +# include +#endif +#ifdef HAVE_ASM_PAGE_H +# include #endif +#ifdef HAVE_SYS_USER_H +# include +#endif + +#include "ioctl.h" + #include #include #include @@ -116,9 +120,6 @@ static inline void *kmalloc(int size, int prot) #define PTR_ERR(a) ((long)(a)) #define ERR_PTR(a) ((void*)((long)(a))) -#define capable(foo) 1 -#define CAP_SYS_ADMIN 1 - typedef struct { void *cwd; }mm_segment_t; @@ -130,19 +131,12 @@ struct file; /* forward ref */ typedef int (write_proc_t)(struct file *file, const char *buffer, unsigned long count, void *data); -# define le16_to_cpu(x) __le16_to_cpu(x) -# define cpu_to_le16(x) __cpu_to_le16(x) -# define le32_to_cpu(x) __le32_to_cpu(x) -# 
define cpu_to_le32(x) __cpu_to_le32(x) -# define le64_to_cpu(x) __le64_to_cpu(x) -# define cpu_to_le64(x) __cpu_to_le64(x) - #define NIPQUAD(addr) \ ((unsigned char *)&addr)[0], \ ((unsigned char *)&addr)[1], \ ((unsigned char *)&addr)[2], \ ((unsigned char *)&addr)[3] - + #if defined(__LITTLE_ENDIAN) #define HIPQUAD(addr) \ ((unsigned char *)&addr)[3], \ @@ -362,9 +356,9 @@ static inline int kmem_cache_destroy(kmem_cache_t *a) #define kmem_cache_alloc(cache, prio) malloc(cache->size) #define kmem_cache_free(cache, obj) free(obj) -#define PAGE_CACHE_SIZE PAGE_SIZE -#define PAGE_CACHE_SHIFT 12 -#define PAGE_CACHE_MASK PAGE_MASK +#define PAGE_CACHE_SIZE PAGE_SIZE +#define PAGE_CACHE_SHIFT PAGE_SHIFT +#define PAGE_CACHE_MASK PAGE_MASK /* XXX * for this moment, liblusre will not rely OST for non-page-aligned write @@ -644,7 +638,7 @@ static inline int schedule_timeout(signed long t) _ret = tv.tv_sec; \ _ret; \ }) -#define time_after(a, b) ((long)(b) - (long)(a) > 0) +#define time_after(a, b) ((long)(b) - (long)(a) < 0) #define time_before(a, b) time_after(b,a) struct timer_list { diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 7e612eb..87064fb 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -35,32 +35,52 @@ #include // #include #include -#include /* FIXME (for PTL_MD_MAX_IOV) */ #include #include #include #include -/* Define some large-ish defaults for MTU and MAX_IOV if portals ones - * aren't defined (i.e. no limits) or too large */ -#if (defined(PTL_MTU) && (PTL_MTU <= (1 << 20))) -# define PTLRPC_MTU PTL_MTU +/* MD flags we _always_ use */ +#define PTLRPC_MD_OPTIONS (PTL_MD_EVENT_START_DISABLE | \ + PTL_MD_LUSTRE_COMPLETION_SEMANTICS) + +/* Define some large-ish maxima for bulk I/O + * CAVEAT EMPTOR, with multinet (i.e. gateways forwarding between networks) + * these limits are system wide and not interface-local. 
*/ +#define PTLRPC_MAX_BRW_SIZE (1 << 20) +#define PTLRPC_MAX_BRW_PAGES 512 + +/* ...reduce to fit... */ + +#if CRAY_PORTALS +/* include a cray header here if relevant + * NB liblustre SIZE/PAGES is affected too, but it merges contiguous + * chunks, so FTTB, it always used contiguous MDs */ #else -# define PTLRPC_MTU (1 << 20) +# include #endif -#if (defined(PTL_MAX_IOV) && (PTL_MAX_IOV <= 512)) -# define PTLRPC_MAX_IOV PTL_MAX_IOV -#else -# define PTLRPC_MAX_IOV 512 + +#if (defined(PTL_MTU) && (PTL_MTU < PTLRPC_MAX_BRW_SIZE)) +# undef PTLRPC_MAX_BRW_SIZE +# define PTLRPC_MAX_BRW_SIZE PTL_MTU +#endif +#if (defined(PTL_MD_MAX_IOV) && (PTL_MD_MAX_IOV < PTLRPC_MAX_BRW_PAGES )) +# undef PTLRPC_MAX_BRW_PAGES +# define PTLRPC_MAX_BRW_PAGES PTL_MD_MAX_IOV #endif -/* Define consistent max bulk size/pages */ -#if (PTLRPC_MTU > PTLRPC_MAX_IOV * PAGE_SIZE) -# define PTLRPC_MAX_BRW_PAGES PTLRPC_MAX_IOV -# define PTLRPC_MAX_BRW_SIZE (PTLRPC_MAX_IOV * PAGE_SIZE) +/* ...and make consistent... */ + +#if (PTLRPC_MAX_BRW_SIZE > PTLRPC_MAX_BRW_PAGES * PAGE_SIZE) +# undef PTLRPC_MAX_BRW_SIZE +# define PTLRPC_MAX_BRW_SIZE (PTLRPC_MAX_BRW_PAGES * PAGE_SIZE) #else -# define PTLRPC_MAX_BRW_PAGES (PTLRPC_MTU / PAGE_SIZE) -# define PTLRPC_MAX_BRW_SIZE PTLRPC_MTU +# undef PTLRPC_MAX_BRW_PAGES +# define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE / PAGE_SIZE) +#endif + +#if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) +#error "PTLRPC_MAX_BRW_PAGES isn't a power of two" #endif /* Size over which to OBD_VMALLOC() rather than OBD_ALLOC() service request @@ -126,7 +146,7 @@ #define PTLBD_MAXREQSIZE 1024 struct ptlrpc_peer { - ptl_nid_t peer_nid; + ptl_process_id_t peer_id; struct ptlrpc_ni *peer_ni; }; @@ -304,6 +324,7 @@ struct ptlrpc_request { struct ptlrpc_cb_id rq_reply_cbid; struct ptlrpc_peer rq_peer; /* XXX see service.c can this be factored away? 
*/ + char rq_peerstr[PTL_NALFMT_SIZE]; struct obd_export *rq_export; struct obd_import *rq_import; @@ -390,8 +411,8 @@ struct ptlrpc_bulk_desc { __u32 bd_portal; struct ptlrpc_request *bd_req; /* associated request */ wait_queue_head_t bd_waitq; /* server side only WQ */ - int bd_page_count; /* # pages (== entries in bd_iov) */ - int bd_max_pages; /* allocated size of bd_iov */ + int bd_iov_count; /* # entries in bd_iov */ + int bd_max_iov; /* allocated size of bd_iov */ int bd_nob; /* # bytes covered */ int bd_nob_transferred; /* # bytes GOT/PUT */ @@ -400,10 +421,10 @@ struct ptlrpc_bulk_desc { struct ptlrpc_cb_id bd_cbid; /* network callback info */ ptl_handle_md_t bd_md_h; /* associated MD */ -#ifdef __KERNEL__ - ptl_kiov_t bd_iov[PTL_MD_MAX_IOV]; +#if (!CRAY_PORTALS && defined(__KERNEL__)) + ptl_kiov_t bd_iov[0]; #else - struct iovec bd_iov[PTL_MD_MAX_IOV]; + ptl_md_iovec_t bd_iov[0]; #endif }; @@ -484,6 +505,18 @@ struct ptlrpc_service { struct ptlrpc_srv_ni srv_interfaces[0]; }; +static inline char *ptlrpc_peernid2str(struct ptlrpc_peer *p, char *str) +{ + LASSERT(p->peer_ni != NULL); + return (portals_nid2str(p->peer_ni->pni_number, p->peer_id.nid, str)); +} + +static inline char *ptlrpc_id2str(struct ptlrpc_peer *p, char *str) +{ + LASSERT(p->peer_ni != NULL); + return (portals_id2str(p->peer_ni->pni_number, p->peer_id, str)); +} + /* ptlrpc/events.c */ extern struct ptlrpc_ni ptlrpc_interfaces[]; extern int ptlrpc_ninterfaces; @@ -494,6 +527,7 @@ extern void client_bulk_callback (ptl_event_t *ev); extern void request_in_callback(ptl_event_t *ev); extern void reply_out_callback(ptl_event_t *ev); extern void server_bulk_callback (ptl_event_t *ev); +extern int ptlrpc_default_nal(void); /* ptlrpc/connection.c */ void ptlrpc_dump_connections(void); @@ -504,6 +538,7 @@ int ptlrpc_put_connection(struct ptlrpc_connection *c); struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); void ptlrpc_init_connection(void); void 
ptlrpc_cleanup_connection(void); +extern ptl_pid_t ptl_get_pid(void); /* ptlrpc/niobuf.c */ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc); diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index 6a3786e..e858012 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -996,7 +996,7 @@ typedef __u8 class_uuid_t[16]; void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); /* lustre_peer.c */ -int lustre_uuid_to_peer(char *uuid, struct lustre_peer *peer); +int lustre_uuid_to_peer(char *uuid, __u32 *peer_nal, ptl_nid_t *peer_nid); int class_add_uuid(char *uuid, __u64 nid, __u32 nal); int class_del_uuid (char *uuid); void class_init_uuidlist(void); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 836bd34..747f0c7 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -410,8 +410,19 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) obd_str2uuid (&cluuid, str); /* XXX extract a nettype and format accordingly */ - snprintf(remote_uuid.uuid, sizeof remote_uuid, - "NET_"LPX64"_UUID", req->rq_peer.peer_nid); + switch (sizeof(ptl_nid_t)) { + /* NB the casts only avoid compiler warnings */ + case 8: + snprintf(remote_uuid.uuid, sizeof remote_uuid, + "NET_"LPX64"_UUID", (__u64)req->rq_peer.peer_id.nid); + break; + case 4: + snprintf(remote_uuid.uuid, sizeof remote_uuid, + "NET_%x_UUID", (__u32)req->rq_peer.peer_id.nid); + break; + default: + LBUG(); + } spin_lock_bh(&target->obd_processing_task_lock); abort_recovery = target->obd_abort_recovery; diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 80f5bab..86550e9 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -1146,20 +1146,16 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock, int pos) if (lock->l_conn_export != NULL) obd = lock->l_conn_export->exp_obd; if (lock->l_export && lock->l_export->exp_connection) { - CDEBUG(level, " Node: NID 
"LPX64" (%s) on %s (rhandle: "LPX64")\n", - lock->l_export->exp_connection->c_peer.peer_nid, - portals_nid2str(lock->l_export->exp_connection->c_peer.peer_ni->pni_number, - lock->l_export->exp_connection->c_peer.peer_nid, str), + CDEBUG(level, " Node: NID %s on %s (rhandle: "LPX64")\n", + ptlrpc_peernid2str(&lock->l_export->exp_connection->c_peer, str), lock->l_export->exp_connection->c_peer.peer_ni->pni_name, lock->l_remote_handle.cookie); } else if (obd == NULL) { CDEBUG(level, " Node: local\n"); } else { struct obd_import *imp = obd->u.cli.cl_import; - CDEBUG(level, " Node: NID "LPX64" (%s) on %s (rhandle: "LPX64")\n", - imp->imp_connection->c_peer.peer_nid, - portals_nid2str(imp->imp_connection->c_peer.peer_ni->pni_number, - imp->imp_connection->c_peer.peer_nid, str), + CDEBUG(level, " Node: NID %s on %s (rhandle: "LPX64")\n", + ptlrpc_peernid2str(&imp->imp_connection->c_peer, str), imp->imp_connection->c_peer.peer_ni->pni_name, lock->l_remote_handle.cookie); } diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 3edfe7a..9446bfa 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -182,13 +182,10 @@ static void waiting_locks_callback(unsigned long unused) break; LDLM_ERROR(lock, "lock callback timer expired: evicting client " - "%s@%s nid "LPX64" (%s) ", + "%s@%s nid %s ", lock->l_export->exp_client_uuid.uuid, lock->l_export->exp_connection->c_remote_uuid.uuid, - lock->l_export->exp_connection->c_peer.peer_nid, - portals_nid2str(lock->l_export->exp_connection->c_peer.peer_ni->pni_number, - lock->l_export->exp_connection->c_peer.peer_nid, - str)); + ptlrpc_peernid2str(&lock->l_export->exp_connection->c_peer, str)); spin_lock_bh(&expired_lock_thread.elt_lock); list_del(&lock->l_pending_chain); @@ -307,14 +304,14 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,const char *ast_type) { - const struct ptlrpc_connection *conn = lock->l_export->exp_connection; + struct 
ptlrpc_connection *conn = lock->l_export->exp_connection; char str[PTL_NALFMT_SIZE]; LDLM_ERROR(lock, "%s AST failed (%d): evicting client %s@%s NID "LPX64 " (%s)", ast_type, rc, lock->l_export->exp_client_uuid.uuid, - conn->c_remote_uuid.uuid, conn->c_peer.peer_nid, - portals_nid2str(conn->c_peer.peer_ni->pni_number, - conn->c_peer.peer_nid, str)); + conn->c_remote_uuid.uuid, conn->c_peer.peer_id.nid, + ptlrpc_peernid2str(&conn->c_peer, str)); + ptlrpc_fail_export(lock->l_export); } @@ -322,12 +319,15 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, struct ptlrpc_request *req, int rc, const char *ast_type) { + struct ptlrpc_peer *peer = &req->rq_import->imp_connection->c_peer; + char str[PTL_NALFMT_SIZE]; + if (rc == -ETIMEDOUT || rc == -EINTR || rc == -ENOTCONN) { LASSERT(lock->l_export); if (lock->l_export->exp_libclient) { - LDLM_DEBUG(lock, "%s AST to liblustre client (nid " - LPU64") timeout, just cancelling lock", - ast_type, req->rq_peer.peer_nid); + LDLM_DEBUG(lock, "%s AST to liblustre client (nid %s)" + " timeout, just cancelling lock", ast_type, + ptlrpc_peernid2str(peer, str)); ldlm_lock_cancel(lock); rc = -ERESTART; } else { @@ -336,13 +336,13 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, } } else if (rc) { if (rc == -EINVAL) - LDLM_DEBUG(lock, "client (nid "LPU64") returned %d" + LDLM_DEBUG(lock, "client (nid %s) returned %d" " from %s AST - normal race", - req->rq_peer.peer_nid, + ptlrpc_peernid2str(peer, str), req->rq_repmsg->status, ast_type); else - LDLM_ERROR(lock, "client (nid "LPU64") returned %d " - "from %s AST", req->rq_peer.peer_nid, + LDLM_ERROR(lock, "client (nid %s) returned %d " + "from %s AST", ptlrpc_peernid2str(peer, str), (req->rq_repmsg != NULL) ? 
req->rq_repmsg->status : 0, ast_type); ldlm_lock_cancel(lock); @@ -771,7 +771,6 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) struct ldlm_request *dlm_req; struct ldlm_lock *lock; struct ldlm_resource *res; - char str[PTL_NALFMT_SIZE]; int rc; ENTRY; @@ -791,12 +790,10 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) lock = ldlm_handle2lock(&dlm_req->lock_handle1); if (!lock) { CERROR("received cancel for unknown lock cookie "LPX64 - " from client %s nid "LPX64" (%s)\n", + " from client %s id %s\n", dlm_req->lock_handle1.cookie, req->rq_export->exp_client_uuid.uuid, - req->rq_peer.peer_nid, - portals_nid2str(req->rq_peer.peer_ni->pni_number, - req->rq_peer.peer_nid, str)); + req->rq_peerstr); LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock " "(cookie "LPU64")", dlm_req->lock_handle1.cookie); @@ -1009,7 +1006,6 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) struct ldlm_namespace *ns; struct ldlm_request *dlm_req; struct ldlm_lock *lock; - char str[PTL_NALFMT_SIZE]; int rc; ENTRY; @@ -1021,14 +1017,12 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) if (req->rq_export == NULL) { struct ldlm_request *dlm_req; - CDEBUG(D_RPCTRACE, "operation %d from nid "LPX64" (%s) with bad " - "export cookie "LPX64" (ptl req %d/rep %d); this is " + CDEBUG(D_RPCTRACE, "operation %d from %s with bad " + "export cookie "LPX64"; this is " "normal if this node rebooted with a lock held\n", - req->rq_reqmsg->opc, req->rq_peer.peer_nid, - portals_nid2str(req->rq_peer.peer_ni->pni_number, - req->rq_peer.peer_nid, str), - req->rq_reqmsg->handle.cookie, - req->rq_request_portal, req->rq_reply_portal); + req->rq_reqmsg->opc, + req->rq_peerstr, + req->rq_reqmsg->handle.cookie); dlm_req = lustre_swab_reqbuf(req, 0, sizeof (*dlm_req), lustre_swab_ldlm_request); @@ -1150,9 +1144,9 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req) if (req->rq_export == NULL) { struct ldlm_request *dlm_req; - CERROR("operation %d with bad export (ptl 
req %d/rep %d)\n", - req->rq_reqmsg->opc, req->rq_request_portal, - req->rq_reply_portal); + CERROR("operation %d with bad export from %s\n", + req->rq_reqmsg->opc, + req->rq_peerstr); CERROR("--> export cookie: "LPX64"\n", req->rq_reqmsg->handle.cookie); dlm_req = lustre_swab_reqbuf(req, 0, sizeof (*dlm_req), diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 428338d..9b241f2 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -554,9 +554,11 @@ int ldlm_cli_cancel(struct lustre_handle *lockh) rc = ptlrpc_queue_wait(req); if (rc == ESTALE) { - CERROR("client/server (nid "LPU64") out of sync--not " - "fatal\n", - req->rq_import->imp_connection->c_peer.peer_nid); + char str[PTL_NALFMT_SIZE]; + CERROR("client/server (nid %s) out of sync" + " -- not fatal\n", + ptlrpc_peernid2str(&req->rq_import-> + imp_connection->c_peer, str)); } else if (rc == -ETIMEDOUT) { ptlrpc_req_finished(req); GOTO(restart, rc); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 57a5a2d..c6e54b4 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -392,20 +392,20 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile, PCFG_INIT(pcfg, NAL_CMD_REGISTER_MYNID); pcfg.pcfg_nal = lmd->lmd_nal; pcfg.pcfg_nid = lmd->lmd_local_nid; - err = kportal_nal_cmd(&pcfg); + err = libcfs_nal_cmd(&pcfg); if (err <0) GOTO(out, err); } - if (lmd->lmd_nal == SOCKNAL) { - PCFG_INIT(pcfg, NAL_CMD_ADD_AUTOCONN); + if (lmd->lmd_nal == SOCKNAL || + lmd->lmd_nal == OPENIBNAL || + lmd->lmd_nal == IIBNAL) { + PCFG_INIT(pcfg, NAL_CMD_ADD_PEER); pcfg.pcfg_nal = lmd->lmd_nal; pcfg.pcfg_nid = lmd->lmd_server_nid; pcfg.pcfg_id = lmd->lmd_server_ipaddr; pcfg.pcfg_misc = lmd->lmd_port; - pcfg.pcfg_size = 8388608; - pcfg.pcfg_flags = 0x4; /*share*/ - err = kportal_nal_cmd(&pcfg); + err = libcfs_nal_cmd(&pcfg); if (err <0) GOTO(out, err); } @@ -490,13 +490,14 @@ out_del_uuid: err = class_process_config(&lcfg); out_del_conn: - if 
(lmd->lmd_nal == SOCKNAL) { - PCFG_INIT(pcfg, NAL_CMD_DEL_AUTOCONN); + if (lmd->lmd_nal == SOCKNAL || + lmd->lmd_nal == OPENIBNAL || + lmd->lmd_nal == IIBNAL) { + PCFG_INIT(pcfg, NAL_CMD_DEL_PEER); pcfg.pcfg_nal = lmd->lmd_nal; pcfg.pcfg_nid = lmd->lmd_server_nid; - pcfg.pcfg_id = lmd->lmd_server_ipaddr; - pcfg.pcfg_flags = 1; /*share*/ - err = kportal_nal_cmd(&pcfg); + pcfg.pcfg_flags = 1; /* single_share */ + err = libcfs_nal_cmd(&pcfg); if (err <0) GOTO(out, err); } diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 7ab83fc..4730c58c 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -278,6 +278,7 @@ void mds_steal_ack_locks(struct ptlrpc_request *req) struct ptlrpc_reply_state *oldrep; struct ptlrpc_service *svc; unsigned long flags; + char str[PTL_NALFMT_SIZE]; int i; /* CAVEAT EMPTOR: spinlock order */ @@ -299,10 +300,10 @@ void mds_steal_ack_locks(struct ptlrpc_request *req) list_del_init (&oldrep->rs_exp_list); CWARN("Stealing %d locks from rs %p x"LPD64".t"LPD64 - " o%d NID"LPX64"\n", + " o%d NID %s\n", oldrep->rs_nlocks, oldrep, oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg.opc, - exp->exp_connection->c_peer.peer_nid); + ptlrpc_peernid2str(&exp->exp_connection->c_peer, str)); for (i = 0; i < oldrep->rs_nlocks; i++) ptlrpc_save_lock(req, diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 9992c02..e61023c 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -285,10 +285,11 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg) case OBD_IOC_CLOSE_UUID: { - struct lustre_peer peer; + ptl_nid_t peer_nid; + __u32 peer_nal; CDEBUG(D_IOCTL, "closing all connections to uuid %s\n", data->ioc_inlbuf1); - lustre_uuid_to_peer(data->ioc_inlbuf1, &peer); + lustre_uuid_to_peer(data->ioc_inlbuf1, &peer_nal, &peer_nid); GOTO(out, err = 0); } @@ -438,7 +439,8 @@ int obd_proc_read_version(char *page, char **start, off_t off, int count, return snprintf(page, count, "%s\n", 
BUILD_VERSION); } -int obd_proc_read_kernel_version(char *page, char **start, off_t off, int count, int *eof, void *data) +int obd_proc_read_kernel_version(char *page, char **start, off_t off, int count, + int *eof, void *data) { *eof = 1; return snprintf(page, count, "%u\n", LUSTRE_KERNEL_VERSION); diff --git a/lustre/obdclass/lustre_peer.c b/lustre/obdclass/lustre_peer.c index 3f172fe..4489671 100644 --- a/lustre/obdclass/lustre_peer.c +++ b/lustre/obdclass/lustre_peer.c @@ -42,7 +42,6 @@ struct uuid_nid_data { ptl_nid_t nid; char *uuid; __u32 nal; - ptl_handle_ni_t ni; }; /* FIXME: This should probably become more elegant than a global linked list */ @@ -61,7 +60,7 @@ void class_exit_uuidlist(void) class_del_uuid(NULL); } -int lustre_uuid_to_peer(char *uuid, struct lustre_peer *peer) +int lustre_uuid_to_peer(char *uuid, __u32 *peer_nal, ptl_nid_t *peer_nid) { struct list_head *tmp; @@ -72,8 +71,8 @@ int lustre_uuid_to_peer(char *uuid, struct lustre_peer *peer) list_entry(tmp, struct uuid_nid_data, head); if (strcmp(data->uuid, uuid) == 0) { - peer->peer_nid = data->nid; - peer->peer_ni = data->ni; + *peer_nid = data->nid; + *peer_nal = data->nal; spin_unlock (&g_uuid_lock); return 0; @@ -86,7 +85,6 @@ int lustre_uuid_to_peer(char *uuid, struct lustre_peer *peer) int class_add_uuid(char *uuid, __u64 nid, __u32 nal) { - const ptl_handle_ni_t *nip; struct uuid_nid_data *data; int rc; int nob = strnlen (uuid, PAGE_SIZE) + 1; @@ -94,26 +92,21 @@ int class_add_uuid(char *uuid, __u64 nid, __u32 nal) if (nob > PAGE_SIZE) return -EINVAL; - nip = kportal_get_ni (nal); - if (nip == NULL) { - CERROR("get_ni failed: is the NAL module loaded?\n"); - return -EIO; - } - rc = -ENOMEM; OBD_ALLOC(data, sizeof(*data)); if (data == NULL) - goto fail_0; + return -ENOMEM; OBD_ALLOC(data->uuid, nob); - if (data == NULL) - goto fail_1; + if (data == NULL) { + OBD_FREE(data, sizeof(*data)); + return -ENOMEM; + } CDEBUG(D_INFO, "add uuid %s "LPX64" %u\n", uuid, nid, nal); 
memcpy(data->uuid, uuid, nob); data->nid = nid; data->nal = nal; - data->ni = *nip; spin_lock (&g_uuid_lock); @@ -122,12 +115,6 @@ int class_add_uuid(char *uuid, __u64 nid, __u32 nal) spin_unlock (&g_uuid_lock); return 0; - - fail_1: - OBD_FREE (data, sizeof (*data)); - fail_0: - kportal_put_ni (nal); - return (rc); } /* delete only one entry if uuid is specified, otherwise delete all */ @@ -166,7 +153,6 @@ int class_del_uuid (char *uuid) list_del (&data->head); - kportal_put_ni (data->nal); OBD_FREE(data->uuid, strlen(data->uuid) + 1); OBD_FREE(data, sizeof(*data)); } while (!list_empty (&deathrow)); diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index b36cac8..afed455 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -587,7 +587,7 @@ static int class_config_llog_handler(struct llog_handle * handle, pcfg->pcfg_nid = cfg->cfg_local_nid; } - rc = kportal_nal_cmd(pcfg); + rc = libcfs_nal_cmd(pcfg); } out: RETURN(rc); diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 22a7d63..69449fe9 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -345,7 +345,7 @@ obd_count ost_checksum_bulk(struct ptlrpc_bulk_desc *desc) obd_count cksum = 0; int i; - for (i = 0; i < desc->bd_page_count; i++) { + for (i = 0; i < desc->bd_iov_count; i++) { struct page *page = desc->bd_iov[i].kiov_page; char *ptr = kmap(page); int psum, off = desc->bd_iov[i].kiov_offset & ~PAGE_MASK; @@ -377,7 +377,6 @@ static int ost_brw_read(struct ptlrpc_request *req) struct ost_body *body, *repbody; struct l_wait_info lwi; struct obd_trans_info oti = { 0 }; - char str[PTL_NALFMT_SIZE]; int size[1] = { sizeof(*body) }; int comms_error = 0; int niocount; @@ -529,23 +528,17 @@ static int ost_brw_read(struct ptlrpc_request *req) } else { if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) { CERROR("bulk IO comms error: " - "evicting %s@%s nid "LPX64" (%s)\n", + "evicting %s@%s id %s\n", 
req->rq_export->exp_client_uuid.uuid, req->rq_export->exp_connection->c_remote_uuid.uuid, - req->rq_peer.peer_nid, - portals_nid2str(req->rq_peer.peer_ni->pni_number, - req->rq_peer.peer_nid, - str)); + req->rq_peerstr); ptlrpc_fail_export(req->rq_export); } else { CERROR("ignoring bulk IO comms error: " - "client reconnected %s@%s nid "LPX64" (%s)\n", + "client reconnected %s@%s id %s\n", req->rq_export->exp_client_uuid.uuid, req->rq_export->exp_connection->c_remote_uuid.uuid, - req->rq_peer.peer_nid, - portals_nid2str(req->rq_peer.peer_ni->pni_number, - req->rq_peer.peer_nid, - str)); + req->rq_peerstr); } } @@ -566,7 +559,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) int objcount, niocount, npages; int comms_error = 0; int rc, swab, i, j; - char str[PTL_NALFMT_SIZE]; + struct timeval start; ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) @@ -678,20 +671,19 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) obd_count client_cksum = body->oa.o_cksum; obd_count cksum = ost_checksum_bulk(desc); - portals_nid2str(req->rq_peer.peer_ni->pni_number, - req->rq_peer.peer_nid, str); if (client_cksum != cksum) { - CERROR("Bad checksum: client %x, server %x, client NID " - LPX64" (%s)\n", client_cksum, cksum, - req->rq_peer.peer_nid, str); + CERROR("Bad checksum: client %x, server %x id %s\n", + client_cksum, cksum, + req->rq_peerstr); cksum_counter = 1; repbody->oa.o_cksum = cksum; } else { cksum_counter++; if ((cksum_counter & (-cksum_counter)) == cksum_counter) - CWARN("Checksum %u from "LPX64" (%s): %x OK\n", - cksum_counter, req->rq_peer.peer_nid, - str, cksum); + CWARN("Checksum %u from %s: %x OK\n", + cksum_counter, + req->rq_peerstr, + cksum); } } #endif @@ -733,25 +725,19 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) ptlrpc_error(req); } else { if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) { - CERROR("bulk IO comms error: " - 
"evicting %s@%s nid "LPX64" (%s)\n", + CERROR("%s: bulk IO comm error evicting %s@%s id %s\n", + req->rq_export->exp_obd->obd_name, req->rq_export->exp_client_uuid.uuid, req->rq_export->exp_connection->c_remote_uuid.uuid, - req->rq_peer.peer_nid, - portals_nid2str(req->rq_peer.peer_ni->pni_number, - req->rq_peer.peer_nid, - str)); + req->rq_peerstr); ptlrpc_fail_export(req->rq_export); } else { CERROR("ignoring bulk IO comms error: " - "client reconnected %s@%s nid "LPX64" (%s)\n", + "client reconnected %s@%s id %s\n", req->rq_export->exp_client_uuid.uuid, req->rq_export->exp_connection->c_remote_uuid.uuid, - req->rq_peer.peer_nid, - portals_nid2str(req->rq_peer.peer_ni->pni_number, - req->rq_peer.peer_nid, - str)); - } + req->rq_peerstr); + } } RETURN(rc); } diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 index 27704bd..021fa68 100644 --- a/lustre/portals/archdep.m4 +++ b/lustre/portals/archdep.m4 @@ -14,16 +14,108 @@ AC_MSG_RESULT([$enable_inkernel]) AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes) # -------- are we building against an external portals? 
------- -AC_MSG_CHECKING([if Cray portals should be used]) +AC_MSG_CHECKING([for Cray portals]) AC_ARG_WITH([cray-portals], AC_HELP_STRING([--with-cray-portals=path], [path to cray portals]), [ - CRAY_PORTALS_INCLUDE="-I$with_cray_portals" - AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals]) + if test "$with_cray_portals" != no; then + CRAY_PORTALS_PATH=$with_cray_portals + CRAY_PORTALS_INCLUDES="$with_cray_portals/include" + CRAY_PORTALS_LIBS="$with_cray_portals" + fi ],[with_cray_portals=no]) +AC_SUBST(CRAY_PORTALS_PATH) +AC_MSG_RESULT([$CRAY_PORTALS_PATH]) + +AC_MSG_CHECKING([for Cray portals includes]) +AC_ARG_WITH([cray-portals-includes], + AC_HELP_STRING([--with-cray-portals-includes=path], + [path to cray portals includes]), + [ + if test "$with_cray_portals_includes" != no; then + CRAY_PORTALS_INCLUDES="$with_cray_portals_includes" + fi + ]) +AC_SUBST(CRAY_PORTALS_INCLUDES) +AC_MSG_RESULT([$CRAY_PORTALS_INCLUDES]) + +AC_MSG_CHECKING([for Cray portals libs]) +AC_ARG_WITH([cray-portals-libs], + AC_HELP_STRING([--with-cray-portals-libs=path], + [path to cray portals libs]), + [ + if test "$with_cray_portals_libs" != no; then + CRAY_PORTALS_LIBS="$with_cray_portals_libs" + fi + ]) +AC_SUBST(CRAY_PORTALS_LIBS) +AC_MSG_RESULT([$CRAY_PORTALS_LIBS]) + +if test x$CRAY_PORTALS_INCLUDES != x ; then + if test ! -r $CRAY_PORTALS_INCLUDES/portals/api.h ; then + AC_MSG_ERROR([Cray portals headers were not found in $CRAY_PORTALS_INCLUDES. Please check the paths passed to --with-cray-portals or --with-cray-portals-includes.]) + fi +fi +if test x$CRAY_PORTALS_LIBS != x ; then + if test ! -r $CRAY_PORTALS_LIBS/libportals.a ; then + AC_MSG_ERROR([Cray portals libraries were not found in $CRAY_PORTALS_LIBS. 
Please check the paths passed to --with-cray-portals or --with-cray-portals-libs.]) + fi +fi + +AC_MSG_CHECKING([whether to use Cray portals]) +if test x$CRAY_PORTALS_INCLUDES != x -a x$CRAY_PORTALS_LIBS != x ; then + with_cray_portals=yes + AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals]) + CRAY_PORTALS_INCLUDES="-I$CRAY_PORTALS_INCLUDES" +else + with_cray_portals=no +fi AC_MSG_RESULT([$with_cray_portals]) AM_CONDITIONAL(CRAY_PORTALS, test x$with_cray_portals != xno) + +# ---------------------------------------- +# some tests for catamount-like systems +# ---------------------------------------- +AC_ARG_ENABLE([sysio_init], + AC_HELP_STRING([--disable-sysio-init], + [call sysio init functions when initializing liblustre]), + [],[enable_sysio_init=yes]) +AC_MSG_CHECKING([whether to initialize libsysio]) +AC_MSG_RESULT([$enable_sysio_init]) +if test x$enable_sysio_init != xno ; then + AC_DEFINE([INIT_SYSIO], 1, [call sysio init functions]) +fi + +AC_ARG_ENABLE([urandom], + AC_HELP_STRING([--disable-urandom], + [disable use of /dev/urandom for liblustre]), + [],[enable_urandom=yes]) +AC_MSG_CHECKING([whether to use /dev/urandom for liblustre]) +AC_MSG_RESULT([$enable_urandom]) +if test x$enable_urandom != xno ; then + AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data]) +fi + +# -------- check for -lcap and -lpthread ---- +if test x$enable_liblustre = xyes ; then + AC_CHECK_LIB([cap], [cap_get_proc], + [ + CAP_LIBS="-lcap" + AC_DEFINE([HAVE_LIBCAP], 1, [use libcap]) + ], + [CAP_LIBS=""]) + AC_SUBST(CAP_LIBS) + AC_CHECK_LIB([pthread], [pthread_create], + [ + PTHREAD_LIBS="-lpthread" + AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread]) + ], + [PTHREAD_LIBS=""]) + AC_SUBST(PTHREAD_LIBS) +fi + +# -------- enable tests and utils? 
------- if test x$enable_tests = xno ; then AC_MSG_NOTICE([disabling tests]) enable_tests=no @@ -117,7 +209,7 @@ AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno) # ------- Makeflags ------------------ -CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include" +CPPFLAGS="$CPPFLAGS $CRAY_PORTALS_INCLUDES -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include" # liblustre are all the same LLCPPFLAGS="-D__arch_lib__ -D_LARGEFILE64_SOURCE=1" @@ -135,7 +227,7 @@ if test x$enable_ldiskfs = xyes ; then AC_DEFINE(CONFIG_LDISKFS_FS_SECURITY, 1, [enable fs security]) fi -EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I$PWD/portals/include -I$PWD/include" +EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDES -I$PWD/portals/include -I$PWD/include" # these are like AC_TRY_COMPILE, but try to build modules against the # kernel, inside the kernel-tests directory @@ -330,7 +422,11 @@ if test x$enable_modules != xno ; then QSWCPPFLAGS="-DMULTIRAIL_EKC=1" else AC_MSG_RESULT([not supported]) - QSWCPPFLAGS="-I$LINUX/drivers/net/qsnet/include" + if test -d $LINUX/drivers/net/qsnet/include; then + QSWCPPFLAGS="-I$LINUX/drivers/net/qsnet/include" + else + QSWCPPFLAGS="-I$LINUX/include/linux" + fi fi else AC_MSG_RESULT([no]) @@ -370,39 +466,57 @@ if test x$enable_modules != xno ; then AC_SUBST(GMCPPFLAGS) AC_SUBST(GMNAL) - #fixme: where are the default IB includes? 
- default_ib_include_dir=/usr/local/ib/include - an_ib_include_file=vapi.h - - AC_MSG_CHECKING([if ib nal support was requested]) - AC_ARG_WITH([ib], - AC_HELP_STRING([--with-ib=yes/no/path], - [Path to IB includes]), + #### OpenIB + AC_MSG_CHECKING([if OpenIB kernel headers are present]) + OPENIBCPPFLAGS="-I$LINUX/drivers/infiniband/include -DIN_TREE_BUILD" + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$EXTRA_KCFLAGS $OPENIBCPPFLAGS" + LUSTRE_MODULE_TRY_COMPILE( [ - case $with_ib in - yes) - AC_MSG_RESULT([yes]) - IBCPPFLAGS="-I/usr/local/ib/include" - IBNAL="ibnal" - ;; - no) - AC_MSG_RESULT([no]) - IBCPPFLAGS="" - IBNAL="" - ;; - *) - AC_MSG_RESULT([yes]) - IBCPPFLAGS="-I$with_ib" - IBNAL="" - ;; - esac + #include + ],[ + struct ib_device_properties props; + return 0; + ],[ + AC_MSG_RESULT([yes]) + OPENIBNAL="openibnal" ],[ AC_MSG_RESULT([no]) - IBFLAGS="" - IBNAL="" + OPENIBNAL="" + OPENIBCPPFLAGS="" ]) - AC_SUBST(IBNAL) - AC_SUBST(IBCPPFLAGS) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" + AC_SUBST(OPENIBCPPFLAGS) + AC_SUBST(OPENIBNAL) + + #### Infinicon IB + AC_MSG_CHECKING([if Infinicon IB kernel headers are present]) + # for how the only infinicon ib build has headers in /usr/include/iba + IIBCPPFLAGS="-I/usr/include -DIN_TREE_BUILD" + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$EXTRA_KCFLAGS $IIBCPPFLAGS" + LUSTRE_MODULE_TRY_COMPILE( + [ + #include + ],[ + IBT_INTERFACE_UNION interfaces; + FSTATUS rc; + + rc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, + &interfaces); + + return rc == FSUCCESS ? 
0 : 1; + ],[ + AC_MSG_RESULT([yes]) + IIBNAL="iibnal" + ],[ + AC_MSG_RESULT([no]) + IIBNAL="" + IIBCPPFLAGS="" + ]) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" + AC_SUBST(IIBCPPFLAGS) + AC_SUBST(IIBNAL) # ---------- Red Hat 2.4.18 has iobuf->dovary -------------- # But other kernels don't @@ -419,7 +533,7 @@ if test x$enable_modules != xno ; then AC_DEFINE(HAVE_KIOBUF_DOVARY, 1, [struct kiobuf has a dovary field]) ],[ AC_MSG_RESULT([no]) - ]) + ]) # ----------- 2.6.4 no longer has page->list --------------- AC_MSG_CHECKING([if struct page has a list field]) @@ -468,6 +582,16 @@ if test x$enable_modules != xno ; then AC_MSG_RESULT([no]) ]) + # --------- zap_page_range(vma) -------------------------------- + AC_MSG_CHECKING([if zap_pag_range with vma parameter]) + ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`" + if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then + AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter]) + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + # ---------- Red Hat 2.4.21 backports some more 2.5 bits -------- AC_MSG_CHECKING([if kernel defines PDE]) @@ -500,7 +624,6 @@ if test x$enable_modules != xno ; then ],[ AC_MSG_RESULT([no]) ]) - AC_MSG_CHECKING([if kernel defines cpumask_t]) LUSTRE_MODULE_TRY_COMPILE( [ @@ -549,6 +672,7 @@ if test x$enable_modules != xno ; then AC_MSG_RESULT([no]) ]) + # ---------- modules? 
------------------------ AC_MSG_CHECKING([for module support]) LUSTRE_MODULE_TRY_COMPILE( @@ -650,18 +774,37 @@ if test x$enable_modules != xno ; then esac # $BACKINGFS fi -AM_CONDITIONAL(BUILD_IBNAL, test x$IBNAL = "xibnal") -AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal") AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal") +AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal") +AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal") +AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal") + +# portals/utils/portals.c +AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h]) +AC_CHECK_FUNCS([gethostbyname socket connect]) + +# portals/utils/debug.c +AC_CHECK_HEADERS([linux/version.h]) + +# include/liblustre.h +AC_CHECK_HEADERS([asm/page.h sys/user.h stdint.h]) + +# liblustre/llite_lib.h +AC_CHECK_HEADERS([xtio.h file.h]) + +# liblustre/dir.c +AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h]) + +# liblustre/lutil.c +AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h]) +AC_CHECK_FUNCS([inet_ntoa]) CPPFLAGS="-include \$(top_builddir)/include/config.h $CPPFLAGS" EXTRA_KCFLAGS="-include $PWD/include/config.h $EXTRA_KCFLAGS" AC_SUBST(EXTRA_KCFLAGS) -#echo "KCPPFLAGS: $KCPPFLAGS" -#echo "KCFLAGS: $KCFLAGS" -#echo "LLCPPFLAGS: $LLCPPFLAGS" -#echo "LLCFLAGS: $LLCFLAGS" -#echo "MOD_LINK: $MOD_LINK" -#echo "CFLAGS: $CFLAGS" -#echo "CPPFLAGS: $CPPFLAGS" +echo "CPPFLAGS: $CPPFLAGS" +echo "LLCPPFLAGS: $LLCPPFLAGS" +echo "CFLAGS: $CFLAGS" +echo "EXTRA_KCFLAGS: $EXTRA_KCFLAGS" +echo "LLCFLAGS: $LLCFLAGS" diff --git a/lustre/portals/build.m4 b/lustre/portals/build.m4 index e8a540a..f158396 100644 --- a/lustre/portals/build.m4 +++ b/lustre/portals/build.m4 @@ -20,12 +20,14 @@ ac_default_prefix=/usr # mount.lustre rootsbindir='/sbin' AC_SUBST(rootsbindir) +sysconfdir='/etc' +AC_SUBST(sysconfdir) # Directories for documentation and demos. 
docdir='${datadir}/doc/$(PACKAGE)' AC_SUBST(docdir) demodir='$(docdir)/demo' AC_SUBST(demodir) -pkgexampledir='${pkglibdir}/examples' +pkgexampledir='${pkgdatadir}/examples' AC_SUBST(pkgexampledir) pymoddir='${pkglibdir}/python/Lustre' AC_SUBST(pymoddir) @@ -59,6 +61,13 @@ case "$CC_VERSION" in "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)") bad_cc ;; + # unpatched 'gcc' on rh9. miscompiles a + # struct = (type) { .member = value, }; + # asignment in the iibnal where the struct is a mix + # of u64 and u32 bit-fields. + "gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5)") + bad_cc + ;; *) AC_MSG_RESULT([no known problems]) ;; @@ -114,3 +123,5 @@ else LIBWRAP="" fi AC_SUBST(LIBWRAP) + +AC_SUBST(LIBS) diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h index b5f1041..4e24c71d 100644 --- a/lustre/portals/include/linux/kp30.h +++ b/lustre/portals/include/linux/kp30.h @@ -7,12 +7,6 @@ #include #define PORTAL_DEBUG -#ifndef offsetof -# define offsetof(typ,memb) ((int)((char *)&(((typ *)0)->memb))) -#endif - -#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) - #ifdef __KERNEL__ # include # include @@ -300,7 +294,6 @@ extern void kportal_blockallsigs (void); # include # include # include -# include # ifndef DEBUG_SUBSYSTEM # define DEBUG_SUBSYSTEM S_UNDEFINED # endif @@ -308,7 +301,12 @@ extern void kportal_blockallsigs (void); # undef NDEBUG # include # define LASSERT(e) assert(e) -# define LASSERTF(cond, args...) assert(cond) +# define LASSERTF(cond, args...) \ +do { \ + if (!(cond)) \ + CERROR(args); \ + assert(cond); \ +} while (0) # else # define LASSERT(e) # define LASSERTF(cond, args...) 
do { } while (0) @@ -330,6 +328,7 @@ void portals_debug_dumplog(void); /* support decl needed both by kernel and liblustre */ char *portals_nid2str(int nal, ptl_nid_t nid, char *str); +char *portals_id2str(int nal, ptl_process_id_t nid, char *str); #ifndef CURRENT_TIME # define CURRENT_TIME time(0) @@ -340,25 +339,37 @@ char *portals_nid2str(int nal, ptl_nid_t nid, char *str); * Support for temporary event tracing with minimal Heisenberg effect. */ #define LWT_SUPPORT 0 -#define LWT_MEMORY (64<<20) -#define LWT_MAX_CPUS 4 +#define LWT_MEMORY (16<<20) + +#if !KLWT_SUPPORT +# if defined(__KERNEL__) +# if !defined(BITS_PER_LONG) +# error "BITS_PER_LONG not defined" +# endif +# elif !defined(__WORDSIZE) +# error "__WORDSIZE not defined" +# else +# define BITS_PER_LONG __WORDSIZE +# endif +/* kernel hasn't defined this? */ typedef struct { - cycles_t lwte_when; + long long lwte_when; char *lwte_where; void *lwte_task; long lwte_p1; long lwte_p2; long lwte_p3; long lwte_p4; -#if BITS_PER_LONG > 32 +# if BITS_PER_LONG > 32 long lwte_pad; -#endif +# endif } lwt_event_t; +#endif /* !KLWT_SUPPORT */ #if LWT_SUPPORT -#ifdef __KERNEL__ -#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t)) +# ifdef __KERNEL__ +# if !KLWT_SUPPORT typedef struct _lwt_page { struct list_head lwtp_list; @@ -374,20 +385,13 @@ typedef struct { extern int lwt_enabled; extern lwt_cpu_t lwt_cpus[]; -extern int lwt_init (void); -extern void lwt_fini (void); -extern int lwt_lookup_string (int *size, char *knlptr, - char *usrptr, int usrsize); -extern int lwt_control (int enable, int clear); -extern int lwt_snapshot (cycles_t *now, int *ncpu, int *total_size, - void *user_ptr, int user_size); - /* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set. * This stuff is meant for finding specific problems; it never stays in * production code... 
*/ #define LWTSTR(n) #n #define LWTWHERE(f,l) f ":" LWTSTR(l) +#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t)) #define LWT_EVENT(p1, p2, p3, p4) \ do { \ @@ -396,9 +400,9 @@ do { \ lwt_page_t *p; \ lwt_event_t *e; \ \ - local_irq_save (flags); \ - \ if (lwt_enabled) { \ + local_irq_save (flags); \ + \ cpu = &lwt_cpus[smp_processor_id()]; \ p = cpu->lwtc_current_page; \ e = &p->lwtp_events[cpu->lwtc_current_index++]; \ @@ -417,13 +421,23 @@ do { \ e->lwte_p2 = (long)(p2); \ e->lwte_p3 = (long)(p3); \ e->lwte_p4 = (long)(p4); \ - } \ \ - local_irq_restore (flags); \ + local_irq_restore (flags); \ + } \ } while (0) -#else /* __KERNEL__ */ -#define LWT_EVENT(p1,p2,p3,p4) /* no userland implementation yet */ -#endif /* __KERNEL__ */ + +#endif /* !KLWT_SUPPORT */ + +extern int lwt_init (void); +extern void lwt_fini (void); +extern int lwt_lookup_string (int *size, char *knlptr, + char *usrptr, int usrsize); +extern int lwt_control (int enable, int clear); +extern int lwt_snapshot (cycles_t *now, int *ncpu, int *total_size, + void *user_ptr, int user_size); +# else /* __KERNEL__ */ +# define LWT_EVENT(p1,p2,p3,p4) /* no userland implementation yet */ +# endif /* __KERNEL__ */ #endif /* LWT_SUPPORT */ struct portals_device_userstate @@ -572,49 +586,42 @@ static inline int portal_ioctl_getdata(char *buf, char *end, void *arg) data = (struct portal_ioctl_data *)buf; err = copy_from_user(buf, (void *)arg, sizeof(*hdr)); - if ( err ) { - EXIT; - return err; - } + if (err) + RETURN(err); if (hdr->ioc_version != PORTAL_IOCTL_VERSION) { - CERROR ("PORTALS: version mismatch kernel vs application\n"); - return -EINVAL; + CERROR("PORTALS: version mismatch kernel vs application\n"); + RETURN(-EINVAL); } if (hdr->ioc_len + buf >= end) { - CERROR ("PORTALS: user buffer exceeds kernel buffer\n"); - return -EINVAL; + CERROR("PORTALS: user buffer exceeds kernel buffer\n"); + RETURN(-EINVAL); } if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) { - CERROR ("PORTALS: user 
buffer too small for ioctl\n"); - return -EINVAL; + CERROR("PORTALS: user buffer too small for ioctl\n"); + RETURN(-EINVAL); } err = copy_from_user(buf, (void *)arg, hdr->ioc_len); - if ( err ) { - EXIT; - return err; - } + if (err) + RETURN(err); if (portal_ioctl_is_invalid(data)) { - CERROR ("PORTALS: ioctl not correctly formatted\n"); - return -EINVAL; + CERROR("PORTALS: ioctl not correctly formatted\n"); + RETURN(-EINVAL); } - if (data->ioc_inllen1) { + if (data->ioc_inllen1) data->ioc_inlbuf1 = &data->ioc_bulk[0]; - } - if (data->ioc_inllen2) { + if (data->ioc_inllen2) data->ioc_inlbuf2 = &data->ioc_bulk[0] + size_round(data->ioc_inllen1); - } - EXIT; - return 0; + RETURN(0); } #endif @@ -643,21 +650,13 @@ enum { GMNAL = 3, /* 4 unused */ TCPNAL = 5, - SCIMACNAL = 6, - ROUTER = 7, - IBNAL = 8, + ROUTER = 6, + OPENIBNAL = 7, + IIBNAL = 8, NAL_ENUM_END_MARKER }; -#ifdef __KERNEL__ -extern ptl_handle_ni_t kqswnal_ni; -extern ptl_handle_ni_t ksocknal_ni; -extern ptl_handle_ni_t kgmnal_ni; -extern ptl_handle_ni_t kibnal_ni; -extern ptl_handle_ni_t kscimacnal_ni; -#endif - -#define PTL_NALFMT_SIZE 26 /* %u:%u.%u.%u.%u (10+4+4+4+3+1) */ +#define PTL_NALFMT_SIZE 32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */ #define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1) @@ -666,14 +665,18 @@ extern ptl_handle_ni_t kscimacnal_ni; #define NAL_CMD_REGISTER_MYNID 102 #define NAL_CMD_PUSH_CONNECTION 103 #define NAL_CMD_GET_CONN 104 -#define NAL_CMD_DEL_AUTOCONN 105 -#define NAL_CMD_ADD_AUTOCONN 106 -#define NAL_CMD_GET_AUTOCONN 107 +#define NAL_CMD_DEL_PEER 105 +#define NAL_CMD_ADD_PEER 106 +#define NAL_CMD_GET_PEER 107 #define NAL_CMD_GET_TXDESC 108 #define NAL_CMD_ADD_ROUTE 109 #define NAL_CMD_DEL_ROUTE 110 #define NAL_CMD_GET_ROUTE 111 #define NAL_CMD_NOTIFY_ROUTER 112 +#define NAL_CMD_ADD_INTERFACE 113 +#define NAL_CMD_DEL_INTERFACE 114 +#define NAL_CMD_GET_INTERFACE 115 + enum { DEBUG_DAEMON_START = 1, @@ -682,16 +685,6 @@ enum { DEBUG_DAEMON_CONTINUE = 4, }; -/* XXX remove to lustre ASAP 
*/ -struct lustre_peer { - ptl_nid_t peer_nid; - ptl_handle_ni_t peer_ni; -}; - -/* module.c */ -typedef int (*nal_cmd_handler_t)(struct portals_cfg *, void * private); -int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private); -int kportal_nal_unregister(int nal); enum cfg_record_type { PORTALS_CFG_TYPE = 1, @@ -699,10 +692,6 @@ enum cfg_record_type { }; typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data); -int kportal_nal_cmd(struct portals_cfg *); - -ptl_handle_ni_t *kportal_get_ni (int nal); -void kportal_put_ni (int nal); #ifdef __CYGWIN__ # ifndef BITS_PER_LONG diff --git a/lustre/portals/include/linux/kpr.h b/lustre/portals/include/linux/kpr.h index ee50b59..1127698 100644 --- a/lustre/portals/include/linux/kpr.h +++ b/lustre/portals/include/linux/kpr.h @@ -1,10 +1,10 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * * vim:expandtab:shiftwidth=8:tabstop=8: - * */ + * vim:expandtab:shiftwidth=8:tabstop=8: + */ #ifndef _KPR_H #define _KPR_H -# include /* for ptl_hdr_t */ +# include /* for ptl_hdr_t */ /******************************************************************************/ /* Kernel Portals Router interface */ @@ -81,21 +81,6 @@ typedef struct { void *kpr_arg; } kpr_router_t; -/* Router's control interface (Kernel Portals Routing Control Interface) */ -typedef const struct { - int (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid, - ptl_nid_t lo_nid, ptl_nid_t hi_nid); - int (*kprci_del_route)(int gateway_nal, ptl_nid_t gateway_nid, - ptl_nid_t lo_nid, ptl_nid_t hi_nid); - int (*kprci_get_route)(int index, int *gateway_nal, - ptl_nid_t *gateway, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, - int *alive); - int (*kprci_notify)(int gateway_nal, ptl_nid_t gateway_nid, - int alive, time_t when); -} kpr_control_interface_t; - -extern kpr_control_interface_t kpr_control_interface; extern kpr_router_interface_t kpr_router_interface; static inline int diff --git 
a/lustre/portals/include/linux/libcfs.h b/lustre/portals/include/linux/libcfs.h index 66ee471..e3d58dd 100644 --- a/lustre/portals/include/linux/libcfs.h +++ b/lustre/portals/include/linux/libcfs.h @@ -4,14 +4,60 @@ #ifndef _LIBCFS_H #define _LIBCFS_H +#ifdef HAVE_ASM_TYPES_H +#include +#else +#include "types.h" +#endif + +#ifdef __KERNEL__ +# include +# include +#else +# include +# define do_gettimeofday(tv) gettimeofday(tv, NULL); +typedef unsigned long long cycles_t; +#endif + #define PORTAL_DEBUG #ifndef offsetof -# define offsetof(typ,memb) ((int)((char *)&(((typ *)0)->memb))) +# define offsetof(typ,memb) ((unsigned long)((char *)&(((typ *)0)->memb))) #endif #define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) +#ifndef __KERNEL__ +/* Userpace byte flipping */ +# include +# include +# define __swab16(x) bswap_16(x) +# define __swab32(x) bswap_32(x) +# define __swab64(x) bswap_64(x) +# define __swab16s(x) do {*(x) = bswap_16(*(x));} while (0) +# define __swab32s(x) do {*(x) = bswap_32(*(x));} while (0) +# define __swab64s(x) do {*(x) = bswap_64(*(x));} while (0) +# if __BYTE_ORDER == __LITTLE_ENDIAN +# define le16_to_cpu(x) (x) +# define cpu_to_le16(x) (x) +# define le32_to_cpu(x) (x) +# define cpu_to_le32(x) (x) +# define le64_to_cpu(x) (x) +# define cpu_to_le64(x) (x) +# else +# if __BYTE_ORDER == __BIG_ENDIAN +# define le16_to_cpu(x) bswap_16(x) +# define cpu_to_le16(x) bswap_16(x) +# define le32_to_cpu(x) bswap_32(x) +# define cpu_to_le32(x) bswap_32(x) +# define le64_to_cpu(x) bswap_64(x) +# define cpu_to_le64(x) bswap_64(x) +# else +# error "Unknown byte order" +# endif /* __BIG_ENDIAN */ +# endif /* __LITTLE_ENDIAN */ +#endif /* ! 
__KERNEL__ */ + /* * Debugging */ @@ -20,7 +66,6 @@ extern unsigned int portal_stack; extern unsigned int portal_debug; extern unsigned int portal_printk; -#include struct ptldebug_header { __u32 ph_len; __u32 ph_flags; @@ -60,7 +105,10 @@ struct ptldebug_header { #define S_GMNAL 0x00080000 #define S_PTLROUTER 0x00100000 #define S_COBD 0x00200000 -#define S_IBNAL 0x00400000 +#define S_IBNAL 0x00400000 /* All IB NALs */ +#define S_SM 0x00800000 +#define S_ASOBD 0x01000000 +#define S_CONFOBD 0x02000000 /* If you change these values, please keep portals/utils/debug.c * up to date! */ @@ -109,7 +157,7 @@ struct ptldebug_header { # define CDEBUG_STACK (THREAD_SIZE - \ ((unsigned long)__builtin_frame_address(0) & \ (THREAD_SIZE - 1))) -# endif +# endif /* __ia64__ */ #define CHECK_STACK(stack) \ do { \ @@ -121,7 +169,7 @@ struct ptldebug_header { /*panic("LBUG");*/ \ } \ } while (0) -#else /* __KERNEL__ */ +#else /* !__KERNEL__ */ #define CHECK_STACK(stack) do { } while(0) #define CDEBUG_STACK (0L) #endif /* __KERNEL__ */ @@ -152,14 +200,14 @@ do { \ if (cdebug_count) { \ portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, \ __FILE__, __FUNCTION__, __LINE__, \ - 0, cdebug_format, ## a); \ + CDEBUG_STACK, cdebug_format, ## a); \ cdebug_count = 0; \ } \ if (time_after(jiffies, cdebug_next+(CDEBUG_MAX_LIMIT+10)*HZ))\ cdebug_delay = cdebug_delay > 8 ? cdebug_delay/8 : 1; \ else \ - cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ? \ - CDEBUG_MAX_LIMIT * HZ : cdebug_delay*2; \ + cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ ?\ + CDEBUG_MAX_LIMIT*HZ : cdebug_delay*2; \ cdebug_next = jiffies + cdebug_delay; \ } else { \ portals_debug_msg(DEBUG_SUBSYSTEM, \ @@ -202,15 +250,33 @@ do { \ } while(0) #else #define CDEBUG(mask, format, a...) do { } while (0) -#define CWARN(format, a...) do { } while (0) -#define CERROR(format, a...) printk("<3>" format, ## a) -#define CEMERG(format, a...) printk("<0>" format, ## a) +#define CWARN(format, a...) 
printk(KERN_WARNING format, ## a) +#define CERROR(format, a...) printk(KERN_ERR format, ## a) +#define CEMERG(format, a...) printk(KERN_EMERG format, ## a) #define GOTO(label, rc) do { (void)(rc); goto label; } while (0) #define RETURN(rc) return (rc) #define ENTRY do { } while (0) #define EXIT do { } while (0) #endif +/* initial pid */ +# if CRAY_PORTALS +/* + * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this + * is too big. + * + * 2) the implementation of ernal in cray portals further restricts the pid + * space that may be used to 0 <= pid <= 255 (an 8 bit value). Returns + * an error at nal init time for any pid outside this range. Other nals + * in cray portals don't have this restriction. + * */ +#define LUSTRE_PTL_PID 9 +# else +#define LUSTRE_PTL_PID 12345 +# endif + +#define LUSTRE_SRV_PTL_PID LUSTRE_PTL_PID + #define PORTALS_CFG_VERSION 0x00010001; struct portals_cfg { @@ -245,6 +311,11 @@ do { \ \ } while (0) +typedef int (nal_cmd_handler_fn)(struct portals_cfg *, void *); +int libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *arg); +int libcfs_nal_cmd(struct portals_cfg *pcfg); +void libcfs_nal_cmd_unregister(int nal); + struct portal_ioctl_data { __u32 ioc_len; __u32 ioc_version; @@ -277,6 +348,7 @@ struct portal_ioctl_data { char ioc_bulk[0]; }; + #ifdef __KERNEL__ #include diff --git a/lustre/portals/include/linux/portals_compat25.h b/lustre/portals/include/linux/portals_compat25.h index 3d0aff0..5a43a45 100644 --- a/lustre/portals/include/linux/portals_compat25.h +++ b/lustre/portals/include/linux/portals_compat25.h @@ -30,6 +30,7 @@ # define CURRENT_SECONDS get_seconds() # define smp_num_cpus NR_CPUS + #elif defined(CONFIG_RH_2_4_20) /* RH 2.4.x */ # define SIGNAL_MASK_LOCK(task, flags) \ diff --git a/lustre/portals/include/linux/portals_lib.h b/lustre/portals/include/linux/portals_lib.h index 609290d..8778a52 100644 --- a/lustre/portals/include/linux/portals_lib.h +++ 
b/lustre/portals/include/linux/portals_lib.h @@ -64,114 +64,6 @@ static inline size_t round_strlen(char *fset) return size_round(strlen(fset) + 1); } -#ifdef __KERNEL__ -static inline char *strdup(const char *str) -{ - int len = strlen(str) + 1; - char *tmp = kmalloc(len, GFP_KERNEL); - if (tmp) - memcpy(tmp, str, len); - - return tmp; -} -#endif - -#ifdef __KERNEL__ -# define NTOH__u32(var) le32_to_cpu(var) -# define NTOH__u64(var) le64_to_cpu(var) -# define HTON__u32(var) cpu_to_le32(var) -# define HTON__u64(var) cpu_to_le64(var) -#else -# define expansion_u64(var) \ - ({ __u64 ret; \ - switch (sizeof(var)) { \ - case 8: (ret) = (var); break; \ - case 4: (ret) = (__u32)(var); break; \ - case 2: (ret) = (__u16)(var); break; \ - case 1: (ret) = (__u8)(var); break; \ - }; \ - (ret); \ - }) -# define NTOH__u32(var) (var) -# define NTOH__u64(var) (expansion_u64(var)) -# define HTON__u32(var) (var) -# define HTON__u64(var) (expansion_u64(var)) -#endif - -/* - * copy sizeof(type) bytes from pointer to var and move ptr forward. 
- * return EFAULT if pointer goes beyond end - */ -#define UNLOGV(var,type,ptr,end) \ -do { \ - var = *(type *)ptr; \ - ptr += sizeof(type); \ - if (ptr > end ) \ - return -EFAULT; \ -} while (0) - -/* the following two macros convert to little endian */ -/* type MUST be __u32 or __u64 */ -#define LUNLOGV(var,type,ptr,end) \ -do { \ - var = NTOH##type(*(type *)ptr); \ - ptr += sizeof(type); \ - if (ptr > end ) \ - return -EFAULT; \ -} while (0) - -/* now log values */ -#define LOGV(var,type,ptr) \ -do { \ - *((type *)ptr) = var; \ - ptr += sizeof(type); \ -} while (0) - -/* and in network order */ -#define LLOGV(var,type,ptr) \ -do { \ - *((type *)ptr) = HTON##type(var); \ - ptr += sizeof(type); \ -} while (0) - - -/* - * set var to point at (type *)ptr, move ptr forward with sizeof(type) - * return from function with EFAULT if ptr goes beyond end - */ -#define UNLOGP(var,type,ptr,end) \ -do { \ - var = (type *)ptr; \ - ptr += sizeof(type); \ - if (ptr > end ) \ - return -EFAULT; \ -} while (0) - -#define LOGP(var,type,ptr) \ -do { \ - memcpy(ptr, var, sizeof(type)); \ - ptr += sizeof(type); \ -} while (0) - -/* - * set var to point at (char *)ptr, move ptr forward by size_round(len); - * return from function with EFAULT if ptr goes beyond end - */ -#define UNLOGL(var,type,len,ptr,end) \ -do { \ - var = (type *)ptr; \ - ptr += size_round(len * sizeof(type)); \ - if (ptr > end ) \ - return -EFAULT; \ -} while (0) - -#define UNLOGL0(var,type,len,ptr,end) \ -do { \ - UNLOGL(var,type,len,ptr,end); \ - if ( *((char *)ptr - size_round(len) + len - 1) != '\0') \ - return -EFAULT; \ -} while (0) - #define LOGL(var,len,ptr) \ do { \ if (var) \ diff --git a/lustre/portals/include/portals/Makefile.am b/lustre/portals/include/portals/Makefile.am index 5ed6090..4043f66 100644 --- a/lustre/portals/include/portals/Makefile.am +++ b/lustre/portals/include/portals/Makefile.am @@ -4,7 +4,7 @@ if UTILS portals_HEADERS = list.h endif -EXTRA_DIST = api.h api-support.h arg-blocks.h 
defines.h errno.h \ - internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h \ - list.h lltrace.h myrnal.h nal.h nalids.h p30.h ppid.h ptlctl.h \ +EXTRA_DIST = api.h api-support.h build_check.h errno.h \ + internal.h lib-p30.h lib-types.h list.h \ + lltrace.h myrnal.h nal.h nalids.h p30.h ptlctl.h \ socknal.h stringtab.h types.h diff --git a/lustre/portals/include/portals/api-support.h b/lustre/portals/include/portals/api-support.h index af4a2dc..c5994c6 100644 --- a/lustre/portals/include/portals/api-support.h +++ b/lustre/portals/include/portals/api-support.h @@ -1,5 +1,5 @@ -# define DEBUG_SUBSYSTEM S_PORTALS -# define PORTAL_DEBUG + +#include "build_check.h" #ifndef __KERNEL__ # include @@ -19,9 +19,4 @@ #include #include -#include -/* Hack for 2.4.18 macro name collision */ -#ifdef yield -#undef yield -#endif diff --git a/lustre/portals/include/portals/api.h b/lustre/portals/include/portals/api.h index a83749b..56b7b99 100644 --- a/lustre/portals/include/portals/api.h +++ b/lustre/portals/include/portals/api.h @@ -1,56 +1,39 @@ #ifndef P30_API_H #define P30_API_H +#include "build_check.h" + #include -#ifndef PTL_NO_WRAP -int PtlInit(void); -int PtlInitialized(void); +int PtlInit(int *); void PtlFini(void); -int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size_in, - ptl_ac_index_t acl_size_in, ptl_pid_t requested_pid, - ptl_handle_ni_t * interface_out); +int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid, + ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits, + ptl_handle_ni_t *interface_out); int PtlNIInitialized(ptl_interface_t); int PtlNIFini(ptl_handle_ni_t interface_in); -#endif - int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id); +int PtlGetUid(ptl_handle_ni_t ni_handle, ptl_uid_t *uid); + /* * Network interfaces */ -#ifndef PTL_NO_WRAP -int PtlNIBarrier(ptl_handle_ni_t interface_in); -#endif - int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, ptl_sr_value_t * status_out); int 
PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, unsigned long *distance_out); -#ifndef PTL_NO_WRAP int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out); -#endif -/* - * PtlNIDebug: - * - * This is not an official Portals 3 API call. It is provided - * by the reference implementation to allow the maintainers an - * easy way to turn on and off debugging information in the - * library. Do not use it in code that is not intended for use - * with any version other than the portable reference library. - */ -unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in); - /* * PtlNIFailNid * @@ -62,6 +45,13 @@ unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in); */ int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold); +/* + * PtlSnprintHandle: + * + * This is not an official Portals 3 API call. It is provided + * so that an application can print an opaque handle. + */ +void PtlSnprintHandle (char *str, int str_len, ptl_handle_any_t handle); /* * Match entries @@ -81,28 +71,23 @@ int PtlMEUnlink(ptl_handle_me_t current_in); int PtlMEUnlinkList(ptl_handle_me_t current_in); -int PtlTblDump(ptl_handle_ni_t ni, int index_in); -int PtlMEDump(ptl_handle_me_t current_in); - /* * Memory descriptors */ -#ifndef PTL_NO_WRAP int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in, ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, - ptl_handle_md_t * handle_out); + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); int PtlMDUnlink(ptl_handle_md_t md_in); int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout, ptl_md_t * new_inout, ptl_handle_eq_t testq_in); -#endif /* These should not be called by users */ int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, @@ -115,24 +100,18 @@ int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, /* * Event queues */ -#ifndef PTL_NO_WRAP - -/* These should be 
called by users */ int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in, - int (*callback) (ptl_event_t * event), - ptl_handle_eq_t * handle_out); + ptl_eq_handler_t handler, + ptl_handle_eq_t *handle_out); int PtlEQFree(ptl_handle_eq_t eventq_in); -int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out); - int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); -int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, - int timeout); -#endif +int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, + ptl_event_t *event_out, int *which_out); /* * Access Control Table diff --git a/lustre/portals/include/portals/arg-blocks.h b/lustre/portals/include/portals/arg-blocks.h deleted file mode 100644 index 3c3b154..0000000 --- a/lustre/portals/include/portals/arg-blocks.h +++ /dev/null @@ -1,265 +0,0 @@ -#ifndef PTL_BLOCKS_H -#define PTL_BLOCKS_H - -/* - * blocks.h - * - * Argument block types for the Portals 3.0 library - * Generated by idl - * - */ - -#include - -/* put LIB_MAX_DISPATCH last here -- these must match the - assignements to the dispatch table in lib-p30/dispatch.c */ -#define PTL_GETID 1 -#define PTL_NISTATUS 2 -#define PTL_NIDIST 3 -#define PTL_NIDEBUG 4 -#define PTL_MEATTACH 5 -#define PTL_MEINSERT 6 -// #define PTL_MEPREPEND 7 -#define PTL_MEUNLINK 8 -#define PTL_TBLDUMP 9 -#define PTL_MEDUMP 10 -#define PTL_MDATTACH 11 -// #define PTL_MDINSERT 12 -#define PTL_MDBIND 13 -#define PTL_MDUPDATE 14 -#define PTL_MDUNLINK 15 -#define PTL_EQALLOC 16 -#define PTL_EQFREE 17 -#define PTL_ACENTRY 18 -#define PTL_PUT 19 -#define PTL_GET 20 -#define PTL_FAILNID 21 -#define LIB_MAX_DISPATCH 21 - -typedef struct PtlFailNid_in { - ptl_handle_ni_t interface; - ptl_nid_t nid; - unsigned int threshold; -} PtlFailNid_in; - -typedef struct PtlFailNid_out { - int rc; -} PtlFailNid_out; - -typedef struct PtlGetId_in { - ptl_handle_ni_t handle_in; -} 
PtlGetId_in; - -typedef struct PtlGetId_out { - int rc; - ptl_process_id_t id_out; -} PtlGetId_out; - -typedef struct PtlNIStatus_in { - ptl_handle_ni_t interface_in; - ptl_sr_index_t register_in; -} PtlNIStatus_in; - -typedef struct PtlNIStatus_out { - int rc; - ptl_sr_value_t status_out; -} PtlNIStatus_out; - - -typedef struct PtlNIDist_in { - ptl_handle_ni_t interface_in; - ptl_process_id_t process_in; -} PtlNIDist_in; - -typedef struct PtlNIDist_out { - int rc; - unsigned long distance_out; -} PtlNIDist_out; - - -typedef struct PtlNIDebug_in { - unsigned int mask_in; -} PtlNIDebug_in; - -typedef struct PtlNIDebug_out { - unsigned int rc; -} PtlNIDebug_out; - - -typedef struct PtlMEAttach_in { - ptl_handle_ni_t interface_in; - ptl_pt_index_t index_in; - ptl_ins_pos_t position_in; - ptl_process_id_t match_id_in; - ptl_match_bits_t match_bits_in; - ptl_match_bits_t ignore_bits_in; - ptl_unlink_t unlink_in; -} PtlMEAttach_in; - -typedef struct PtlMEAttach_out { - int rc; - ptl_handle_me_t handle_out; -} PtlMEAttach_out; - - -typedef struct PtlMEInsert_in { - ptl_handle_me_t current_in; - ptl_process_id_t match_id_in; - ptl_match_bits_t match_bits_in; - ptl_match_bits_t ignore_bits_in; - ptl_unlink_t unlink_in; - ptl_ins_pos_t position_in; -} PtlMEInsert_in; - -typedef struct PtlMEInsert_out { - int rc; - ptl_handle_me_t handle_out; -} PtlMEInsert_out; - -typedef struct PtlMEUnlink_in { - ptl_handle_me_t current_in; - ptl_unlink_t unlink_in; -} PtlMEUnlink_in; - -typedef struct PtlMEUnlink_out { - int rc; -} PtlMEUnlink_out; - - -typedef struct PtlTblDump_in { - int index_in; -} PtlTblDump_in; - -typedef struct PtlTblDump_out { - int rc; -} PtlTblDump_out; - - -typedef struct PtlMEDump_in { - ptl_handle_me_t current_in; -} PtlMEDump_in; - -typedef struct PtlMEDump_out { - int rc; -} PtlMEDump_out; - - -typedef struct PtlMDAttach_in { - ptl_handle_me_t me_in; - ptl_handle_eq_t eq_in; - ptl_md_t md_in; - ptl_unlink_t unlink_in; -} PtlMDAttach_in; - -typedef struct 
PtlMDAttach_out { - int rc; - ptl_handle_md_t handle_out; -} PtlMDAttach_out; - - -typedef struct PtlMDBind_in { - ptl_handle_ni_t ni_in; - ptl_handle_eq_t eq_in; - ptl_md_t md_in; -} PtlMDBind_in; - -typedef struct PtlMDBind_out { - int rc; - ptl_handle_md_t handle_out; -} PtlMDBind_out; - - -typedef struct PtlMDUpdate_internal_in { - ptl_handle_md_t md_in; - ptl_handle_eq_t testq_in; - ptl_seq_t sequence_in; - - ptl_md_t old_inout; - int old_inout_valid; - ptl_md_t new_inout; - int new_inout_valid; -} PtlMDUpdate_internal_in; - -typedef struct PtlMDUpdate_internal_out { - int rc; - ptl_md_t old_inout; - ptl_md_t new_inout; -} PtlMDUpdate_internal_out; - - -typedef struct PtlMDUnlink_in { - ptl_handle_md_t md_in; -} PtlMDUnlink_in; - -typedef struct PtlMDUnlink_out { - int rc; - ptl_md_t status_out; -} PtlMDUnlink_out; - - -typedef struct PtlEQAlloc_in { - ptl_handle_ni_t ni_in; - ptl_size_t count_in; - void *base_in; - int len_in; - int (*callback_in) (ptl_event_t * event); -} PtlEQAlloc_in; - -typedef struct PtlEQAlloc_out { - int rc; - ptl_handle_eq_t handle_out; -} PtlEQAlloc_out; - - -typedef struct PtlEQFree_in { - ptl_handle_eq_t eventq_in; -} PtlEQFree_in; - -typedef struct PtlEQFree_out { - int rc; -} PtlEQFree_out; - - -typedef struct PtlACEntry_in { - ptl_handle_ni_t ni_in; - ptl_ac_index_t index_in; - ptl_process_id_t match_id_in; - ptl_pt_index_t portal_in; -} PtlACEntry_in; - -typedef struct PtlACEntry_out { - int rc; -} PtlACEntry_out; - - -typedef struct PtlPut_in { - ptl_handle_md_t md_in; - ptl_ack_req_t ack_req_in; - ptl_process_id_t target_in; - ptl_pt_index_t portal_in; - ptl_ac_index_t cookie_in; - ptl_match_bits_t match_bits_in; - ptl_size_t offset_in; - ptl_hdr_data_t hdr_data_in; -} PtlPut_in; - -typedef struct PtlPut_out { - int rc; -} PtlPut_out; - - -typedef struct PtlGet_in { - ptl_handle_md_t md_in; - ptl_process_id_t target_in; - ptl_pt_index_t portal_in; - ptl_ac_index_t cookie_in; - ptl_match_bits_t match_bits_in; - ptl_size_t 
offset_in; -} PtlGet_in; - -typedef struct PtlGet_out { - int rc; -} PtlGet_out; - - -#endif diff --git a/lustre/portals/include/portals/build_check.h b/lustre/portals/include/portals/build_check.h index 5db1352..c219d2a 100644 --- a/lustre/portals/include/portals/build_check.h +++ b/lustre/portals/include/portals/build_check.h @@ -1,7 +1,7 @@ #ifndef _BUILD_CHECK_H #define _BUILD_CHECK_H -#ifdef CRAY_PORTALS +#if CRAY_PORTALS #error "an application got to me instead of cray's includes" #endif diff --git a/lustre/portals/include/portals/defines.h b/lustre/portals/include/portals/defines.h deleted file mode 100644 index 785ce73..0000000 --- a/lustre/portals/include/portals/defines.h +++ /dev/null @@ -1,116 +0,0 @@ -/* -** -** This files contains definitions that are used throughout the cplant code. -*/ - -#ifndef CPLANT_H -#define CPLANT_H - -#define TITLE(fname,zmig) - - -/* -** TRUE and FALSE -*/ -#undef TRUE -#define TRUE (1) -#undef FALSE -#define FALSE (0) - - -/* -** Return codes from functions -*/ -#undef OK -#define OK (0) -#undef ERROR -#define ERROR (-1) - - - -/* -** The GCC macro for a safe max() that works on all types arithmetic types. -*/ -#ifndef MAX -#define MAX(a, b) (a) > (b) ? (a) : (b) -#endif /* MAX */ - -#ifndef MIN -#define MIN(a, b) (a) < (b) ? 
(a) : (b) -#endif /* MIN */ - -/* -** The rest is from the old qkdefs.h -*/ - -#ifndef __linux__ -#define __inline__ -#endif - -#ifndef NULL -#define NULL ((void *)0) -#endif - -#ifndef __osf__ -#define PRIVATE static -#define PUBLIC -#endif - -#ifndef __osf__ -typedef unsigned char uchar; -#endif - -typedef char CHAR; -typedef unsigned char UCHAR; -typedef char INT8; -typedef unsigned char UINT8; -typedef short int INT16; -typedef unsigned short int UINT16; -typedef int INT32; -typedef unsigned int UINT32; -typedef long LONG32; -typedef unsigned long ULONG32; - -/* long may be 32 or 64, so we can't really append the size to the definition */ -typedef long LONG; -typedef unsigned long ULONG; - -#ifdef __alpha__ -typedef long int_t; -#ifndef __osf__ -typedef unsigned long uint_t; -#endif -#endif - -#ifdef __i386__ -typedef int int_t; -typedef unsigned int uint_t; -#endif - -typedef float FLOAT32; -typedef double FLOAT64; -typedef void VOID; -typedef INT32 BOOLEAN; -typedef void (*FCN_PTR)(void); - -#ifndef off64_t - -#if defined (__alpha__) || defined (__ia64__) -typedef long off64_t; -#else -typedef long long off64_t; -#endif - -#endif - -/* -** Process related typedefs -*/ -typedef UINT16 PID_TYPE; /* Type of Local process ID */ -typedef UINT16 NID_TYPE; /* Type of Physical node ID */ -typedef UINT16 GID_TYPE; /* Type of Group ID */ -typedef UINT16 RANK_TYPE; /* Type of Logical rank/process within a group */ - - - -#endif /* CPLANT_H */ diff --git a/lustre/portals/include/portals/errno.h b/lustre/portals/include/portals/errno.h index 08f084a..42f2626 100644 --- a/lustre/portals/include/portals/errno.h +++ b/lustre/portals/include/portals/errno.h @@ -1,6 +1,7 @@ #ifndef _P30_ERRNO_H_ #define _P30_ERRNO_H_ +#include "build_check.h" /* * include/portals/errno.h * @@ -9,49 +10,41 @@ /* If you change these, you must update the string table in api-errno.c */ typedef enum { - PTL_OK = 0, - PTL_SEGV = 1, - - PTL_NOSPACE = 2, - PTL_INUSE = 3, - PTL_VAL_FAILED = 4, - - 
PTL_NAL_FAILED = 5, - PTL_NOINIT = 6, - PTL_INIT_DUP = 7, - PTL_INIT_INV = 8, - PTL_AC_INV_INDEX = 9, - - PTL_INV_ASIZE = 10, - PTL_INV_HANDLE = 11, - PTL_INV_MD = 12, - PTL_INV_ME = 13, - PTL_INV_NI = 14, + PTL_OK = 0, + PTL_SEGV = 1, + + PTL_NO_SPACE = 2, + PTL_ME_IN_USE = 3, + PTL_VAL_FAILED = 4, + + PTL_NAL_FAILED = 5, + PTL_NO_INIT = 6, + PTL_IFACE_DUP = 7, + PTL_IFACE_INVALID = 8, + + PTL_HANDLE_INVALID = 9, + PTL_MD_INVALID = 10, + PTL_ME_INVALID = 11, /* If you change these, you must update the string table in api-errno.c */ - PTL_ILL_MD = 15, - PTL_INV_PROC = 16, - PTL_INV_PSIZE = 17, - PTL_INV_PTINDEX = 18, - PTL_INV_REG = 19, - - PTL_INV_SR_INDX = 20, - PTL_ML_TOOLONG = 21, - PTL_ADDR_UNKNOWN = 22, - PTL_INV_EQ = 23, - PTL_EQ_DROPPED = 24, - - PTL_EQ_EMPTY = 25, - PTL_NOUPDATE = 26, - PTL_FAIL = 27, - PTL_NOT_IMPLEMENTED = 28, - PTL_NO_ACK = 29, - - PTL_IOV_TOO_MANY = 30, - PTL_IOV_TOO_SMALL = 31, - - PTL_EQ_INUSE = 32, - - PTL_MAX_ERRNO = 32 + PTL_PROCESS_INVALID = 12, + PTL_PT_INDEX_INVALID = 13, + + PTL_SR_INDEX_INVALID = 14, + PTL_EQ_INVALID = 15, + PTL_EQ_DROPPED = 16, + + PTL_EQ_EMPTY = 17, + PTL_MD_NO_UPDATE = 18, + PTL_FAIL = 19, + + PTL_IOV_INVALID = 20, + + PTL_EQ_IN_USE = 21, + + PTL_NI_INVALID = 22, + PTL_MD_ILLEGAL = 23, + + PTL_MAX_ERRNO = 24 } ptl_err_t; /* If you change these, you must update the string table in api-errno.c */ diff --git a/lustre/portals/include/portals/lib-dispatch.h b/lustre/portals/include/portals/lib-dispatch.h deleted file mode 100644 index f87ff83..0000000 --- a/lustre/portals/include/portals/lib-dispatch.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef PTL_DISPATCH_H -#define PTL_DISPATCH_H - -/* - * include/dispatch.h - * - * Dispatch table header and externs for remote side - * operations - * - * Generated by idl - * - */ - -#include -#include - -extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret); -extern int 
do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlNIDebug(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret); - -extern char *dispatch_name(int index); -#endif diff --git a/lustre/portals/include/portals/lib-nal.h b/lustre/portals/include/portals/lib-nal.h deleted file mode 100644 index 0bf557e..0000000 --- a/lustre/portals/include/portals/lib-nal.h +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef _LIB_NAL_H_ -#define _LIB_NAL_H_ - -/* - * nal.h - * - * Library side headers that define the abstraction layer's - * responsibilities and interfaces - */ - -#include - -struct nal_cb_t { - /* - * Per interface portal table, access control table - * and NAL private data field; - */ - lib_ni_t ni; - void 
*nal_data; - /* - * send: Sends a preformatted header and payload data to a - * specified remote process. The payload is scattered over 'niov' - * fragments described by iov, starting at 'offset' for 'mlen' - * bytes. - * NB the NAL may NOT overwrite iov. - * PTL_OK on success => NAL has committed to send and will call - * lib_finalize on completion - */ - ptl_err_t (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen); - - /* as send, but with a set of page fragments (NULL if not supported) */ - ptl_err_t (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, ptl_kiov_t *iov, - size_t offset, size_t mlen); - /* - * recv: Receives an incoming message from a remote process. The - * payload is to be received into the scattered buffer of 'niov' - * fragments described by iov, starting at 'offset' for 'mlen' - * bytes. Payload bytes after 'mlen' up to 'rlen' are to be - * discarded. - * NB the NAL may NOT overwrite iov. 
- * PTL_OK on success => NAL has committed to receive and will call - * lib_finalize on completion - */ - ptl_err_t (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen); - - /* as recv, but with a set of page fragments (NULL if not supported) */ - ptl_err_t (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - unsigned int niov, ptl_kiov_t *iov, - size_t offset, size_t mlen, size_t rlen); - /* - * read: Reads a block of data from a specified user address - */ - ptl_err_t (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, - user_ptr src_addr, size_t len); - - /* - * write: Writes a block of data into a specified user address - */ - ptl_err_t (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr, - void *src_addr, size_t len); - - /* - * callback: Calls an event callback - * NULL => lib calls eq's callback (if any) directly. - */ - void (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, - ptl_event_t *ev); - - /* - * malloc: Acquire a block of memory in a system independent - * fashion. - */ - void *(*cb_malloc) (nal_cb_t * nal, size_t len); - - void (*cb_free) (nal_cb_t * nal, void *buf, size_t len); - - /* - * (un)map: Tell the NAL about some memory it will access. - * *addrkey passed to cb_unmap() is what cb_map() set it to. - * type of *iov depends on options. - * Set to NULL if not required. 
- */ - ptl_err_t (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, - void **addrkey); - void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, - void **addrkey); - - /* as (un)map, but with a set of page fragments */ - ptl_err_t (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); - void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); - - void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...); - - /* Turn interrupts off (begin of protected area) */ - void (*cb_cli) (nal_cb_t * nal, unsigned long *flags); - - /* Turn interrupts on (end of protected area) */ - void (*cb_sti) (nal_cb_t * nal, unsigned long *flags); - - /* - * Calculate a network "distance" to given node - */ - int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist); -}; - -#endif diff --git a/lustre/portals/include/portals/lib-p30.h b/lustre/portals/include/portals/lib-p30.h index b1a6e04..4daf219 100644 --- a/lustre/portals/include/portals/lib-p30.h +++ b/lustre/portals/include/portals/lib-p30.h @@ -9,19 +9,21 @@ #ifndef _LIB_P30_H_ #define _LIB_P30_H_ +#include "build_check.h" + #ifdef __KERNEL__ # include # include #else # include # include +# include #endif #include #include #include +#include #include -#include -#include static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) { @@ -29,17 +31,18 @@ static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); } -#define state_lock(nal,flagsp) \ -do { \ - CDEBUG(D_PORTALS, "taking state lock\n"); \ - nal->cb_cli(nal, flagsp); \ -} while (0) +#ifdef __KERNEL__ +#define LIB_LOCK(nal,flags) \ + spin_lock_irqsave(&(nal)->libnal_ni.ni_lock, flags) +#define LIB_UNLOCK(nal,flags) \ + spin_unlock_irqrestore(&(nal)->libnal_ni.ni_lock, flags) +#else +#define LIB_LOCK(nal,flags) \ + (pthread_mutex_lock(&(nal)->libnal_ni.ni_mutex), (flags) = 0) +#define 
LIB_UNLOCK(nal,flags) \ + pthread_mutex_unlock(&(nal)->libnal_ni.ni_mutex) +#endif -#define state_unlock(nal,flagsp) \ -{ \ - CDEBUG(D_PORTALS, "releasing state lock\n"); \ - nal->cb_sti(nal, flagsp); \ -} #ifdef PTL_USE_LIB_FREELIST @@ -48,13 +51,13 @@ do { \ #define MAX_MSGS 2048 /* Outstanding messages */ #define MAX_EQS 512 -extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize); -extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl); +extern int lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int nobj, int objsize); +extern void lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl); static inline void * lib_freelist_alloc (lib_freelist_t *fl) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_freeobj_t *o; if (list_empty (&fl->fl_list)) @@ -68,7 +71,7 @@ lib_freelist_alloc (lib_freelist_t *fl) static inline void lib_freelist_free (lib_freelist_t *fl, void *obj) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); list_add (&o->fo_list, &fl->fl_list); @@ -76,78 +79,78 @@ lib_freelist_free (lib_freelist_t *fl, void *obj) static inline lib_eq_t * -lib_eq_alloc (nal_cb_t *nal) +lib_eq_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_eq_t *eq; - state_lock (nal, &flags); - eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + eq = (lib_eq_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_eqs); + LIB_UNLOCK (nal, flags); return (eq); } static inline void -lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_eqs, eq); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_eqs, eq); } static inline 
lib_md_t * -lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) +lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_md_t *md; - state_lock (nal, &flags); - md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + md = (lib_md_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mds); + LIB_UNLOCK (nal, flags); return (md); } static inline void -lib_md_free (nal_cb_t *nal, lib_md_t *md) +lib_md_free (lib_nal_t *nal, lib_md_t *md) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_mds, md); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_mds, md); } static inline lib_me_t * -lib_me_alloc (nal_cb_t *nal) +lib_me_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_me_t *me; - state_lock (nal, &flags); - me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + me = (lib_me_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mes); + LIB_UNLOCK (nal, flags); return (me); } static inline void -lib_me_free (nal_cb_t *nal, lib_me_t *me) +lib_me_free (lib_nal_t *nal, lib_me_t *me) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_mes, me); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_mes, me); } static inline lib_msg_t * -lib_msg_alloc (nal_cb_t *nal) +lib_msg_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_msg_t *msg; - state_lock (nal, &flags); - msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + msg = (lib_msg_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_msgs); + LIB_UNLOCK (nal, flags); if (msg != NULL) { 
/* NULL pointers, clear flags etc */ @@ -158,18 +161,18 @@ lib_msg_alloc (nal_cb_t *nal) } static inline void -lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) +lib_msg_free (lib_nal_t *nal, lib_msg_t *msg) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_msgs, msg); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_msgs, msg); } #else static inline lib_eq_t * -lib_eq_alloc (nal_cb_t *nal) +lib_eq_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_eq_t *eq; PORTAL_ALLOC(eq, sizeof(*eq)); @@ -177,26 +180,26 @@ lib_eq_alloc (nal_cb_t *nal) } static inline void -lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(eq, sizeof(*eq)); } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) +lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_md_t *md; int size; int niov; if ((umd->options & PTL_MD_KIOV) != 0) { - niov = umd->niov; + niov = umd->length; size = offsetof(lib_md_t, md_iov.kiov[niov]); } else { - niov = ((umd->options & PTL_MD_IOV) != 0) ? - umd->niov : 1; + niov = ((umd->options & PTL_MD_IOVEC) != 0) ? 
+ umd->length : 1; size = offsetof(lib_md_t, md_iov.iov[niov]); } @@ -212,9 +215,9 @@ lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) } static inline void -lib_md_free (nal_cb_t *nal, lib_md_t *md) +lib_md_free (lib_nal_t *nal, lib_md_t *md) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ int size; if ((md->options & PTL_MD_KIOV) != 0) @@ -226,9 +229,9 @@ lib_md_free (nal_cb_t *nal, lib_md_t *md) } static inline lib_me_t * -lib_me_alloc (nal_cb_t *nal) +lib_me_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_me_t *me; PORTAL_ALLOC(me, sizeof(*me)); @@ -236,16 +239,16 @@ lib_me_alloc (nal_cb_t *nal) } static inline void -lib_me_free(nal_cb_t *nal, lib_me_t *me) +lib_me_free(lib_nal_t *nal, lib_me_t *me) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(me, sizeof(*me)); } static inline lib_msg_t * -lib_msg_alloc(nal_cb_t *nal) +lib_msg_alloc(lib_nal_t *nal) { - /* NEVER called with statelock held; may be in interrupt... */ + /* NEVER called with liblock held; may be in interrupt... 
*/ lib_msg_t *msg; if (in_interrupt()) @@ -262,27 +265,28 @@ lib_msg_alloc(nal_cb_t *nal) } static inline void -lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) +lib_msg_free(lib_nal_t *nal, lib_msg_t *msg) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(msg, sizeof(*msg)); } #endif -extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type); -extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type); -extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh); +extern lib_handle_t *lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type); +extern void lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type); +extern void lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh); static inline void -ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq) +ptl_eq2handle (ptl_handle_eq_t *handle, lib_nal_t *nal, lib_eq_t *eq) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = eq->eq_lh.lh_cookie; } static inline lib_eq_t * -ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) +ptl_handle2eq (ptl_handle_eq_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_EQ); if (lh == NULL) @@ -292,15 +296,16 @@ ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) } static inline void -ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md) +ptl_md2handle (ptl_handle_md_t *handle, lib_nal_t *nal, lib_md_t *md) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = md->md_lh.lh_cookie; } static inline lib_md_t * -ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) +ptl_handle2md (ptl_handle_md_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_MD); if (lh 
== NULL) @@ -310,12 +315,12 @@ ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) } static inline lib_md_t * -ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) +ptl_wire_handle2md (ptl_handle_wire_t *wh, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh; - if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie) + if (wh->wh_interface_cookie != nal->libnal_ni.ni_interface_cookie) return (NULL); lh = lib_lookup_cookie (nal, wh->wh_object_cookie, @@ -327,15 +332,16 @@ ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) } static inline void -ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me) +ptl_me2handle (ptl_handle_me_t *handle, lib_nal_t *nal, lib_me_t *me) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = me->me_lh.lh_cookie; } static inline lib_me_t * -ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) +ptl_handle2me (ptl_handle_me_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_ME); if (lh == NULL) @@ -344,34 +350,30 @@ ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) return (lh_entry (lh, lib_me_t, me_lh)); } -extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize, - ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size); -extern int lib_fini(nal_cb_t * cb); -extern void lib_dispatch(nal_cb_t * cb, void *private, int index, - void *arg_block, void *ret_block); -extern char *dispatch_name(int index); +extern int lib_init(lib_nal_t *libnal, nal_t *apinal, + ptl_process_id_t pid, + ptl_ni_limits_t *desired_limits, + ptl_ni_limits_t *actual_limits); +extern int lib_fini(lib_nal_t *libnal); /* - * When the NAL detects an incoming message, it should call - * lib_parse() decode it. 
The NAL callbacks will be handed - * the private cookie as a way for the NAL to maintain state - * about which transaction is being processed. An extra parameter, - * lib_cookie will contain the necessary information for - * finalizing the message. - * - * After it has finished the handling the message, it should - * call lib_finalize() with the lib_cookie parameter. - * Call backs will be made to write events, send acks or - * replies and so on. + * When the NAL detects an incoming message header, it should call + * lib_parse() decode it. If the message header is garbage, lib_parse() + * returns immediately with failure, otherwise the NAL callbacks will be + * called to receive the message body. They are handed the private cookie + * as a way for the NAL to maintain state about which transaction is being + * processed. An extra parameter, lib_msg contains the lib-level message + * state for passing to lib_finalize() when the message body has been + * received. */ -extern void lib_enq_event_locked (nal_cb_t *nal, void *private, +extern void lib_enq_event_locked (lib_nal_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev); -extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, - ptl_err_t status); -extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private); -extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, - lib_md_t *getmd); -extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr); +extern void lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, + ptl_ni_fail_t ni_fail_type); +extern ptl_err_t lib_parse (lib_nal_t *nal, ptl_hdr_t *hdr, void *private); +extern lib_msg_t *lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, + lib_msg_t *get_msg); +extern void print_hdr (lib_nal_t * nal, ptl_hdr_t * hdr); extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); @@ -394,14 +396,65 @@ extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, extern void lib_assert_wire_constants (void); -extern 
ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, +extern ptl_err_t lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); -extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, +extern ptl_err_t lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, lib_md_t *md, ptl_size_t offset, ptl_size_t len); -extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, - ptl_md_t * md_out); -extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in); -extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in); +extern int lib_api_ni_status (nal_t *nal, ptl_sr_index_t sr_idx, + ptl_sr_value_t *status); +extern int lib_api_ni_dist (nal_t *nal, ptl_process_id_t *pid, + unsigned long *dist); + +extern int lib_api_eq_alloc (nal_t *nal, ptl_size_t count, + ptl_eq_handler_t callback, + ptl_handle_eq_t *handle); +extern int lib_api_eq_free(nal_t *nal, ptl_handle_eq_t *eqh); +extern int lib_api_eq_poll (nal_t *nal, + ptl_handle_eq_t *eventqs, int neq, int timeout_ms, + ptl_event_t *event, int *which); + +extern int lib_api_me_attach(nal_t *nal, + ptl_pt_index_t portal, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); +extern int lib_api_me_insert(nal_t *nal, + ptl_handle_me_t *current_meh, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); +extern int lib_api_me_unlink (nal_t *nal, ptl_handle_me_t *meh); +extern void lib_me_unlink(lib_nal_t *nal, lib_me_t *me); + +extern int lib_api_get_id(nal_t *nal, ptl_process_id_t *pid); + +extern void lib_md_unlink(lib_nal_t *nal, lib_md_t *md); +extern void lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd); +extern int 
lib_api_md_attach(nal_t *nal, ptl_handle_me_t *meh, + ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle); +extern int lib_api_md_bind(nal_t *nal, ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle); +extern int lib_api_md_unlink (nal_t *nal, ptl_handle_md_t *mdh); +extern int lib_api_md_update (nal_t *nal, ptl_handle_md_t *mdh, + ptl_md_t *oldumd, ptl_md_t *newumd, + ptl_handle_eq_t *testqh); + +extern int lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, ptl_size_t offset); +extern int lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_ack_req_t ack, ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, + ptl_size_t offset, ptl_hdr_data_t hdr_data); +extern int lib_api_fail_nid(nal_t *apinal, ptl_nid_t nid, unsigned int threshold); + #endif diff --git a/lustre/portals/include/portals/lib-types.h b/lustre/portals/include/portals/lib-types.h index d05d3fa..cfcef2b 100644 --- a/lustre/portals/include/portals/lib-types.h +++ b/lustre/portals/include/portals/lib-types.h @@ -10,7 +10,10 @@ #ifndef _LIB_TYPES_H_ #define _LIB_TYPES_H_ +#include "build_check.h" + #include +#include #ifdef __KERNEL__ # include # include @@ -20,9 +23,6 @@ # include #endif -/* struct nal_cb_t is defined in lib-nal.h */ -typedef struct nal_cb_t nal_cb_t; - typedef char *user_ptr; typedef struct lib_msg_t lib_msg_t; typedef struct lib_ptl_t lib_ptl_t; @@ -124,8 +124,8 @@ typedef struct { #define PORTALS_PROTO_MAGIC 0xeebc0ded -#define PORTALS_PROTO_VERSION_MAJOR 0 -#define PORTALS_PROTO_VERSION_MINOR 3 +#define PORTALS_PROTO_VERSION_MAJOR 1 +#define PORTALS_PROTO_VERSION_MINOR 0 typedef struct { long recv_count, recv_length, send_count, send_length, drop_count, @@ -133,8 +133,8 @@ typedef struct { } lib_counters_t; /* temporary expedient: limit number of entries in discontiguous MDs */ -#define PTL_MTU (512<<10) -#define 
PTL_MD_MAX_IOV 128 +#define PTL_MTU (1<<20) +#define PTL_MD_MAX_IOV 256 struct lib_msg_t { struct list_head msg_list; @@ -163,11 +163,12 @@ typedef struct { struct lib_eq_t { struct list_head eq_list; lib_handle_t eq_lh; - ptl_seq_t sequence; - ptl_size_t size; - ptl_event_t *base; + ptl_seq_t eq_enq_seq; + ptl_seq_t eq_deq_seq; + ptl_size_t eq_size; + ptl_event_t *eq_events; int eq_refcount; - int (*event_callback) (ptl_event_t * event); + ptl_eq_handler_t eq_callback; void *eq_addrkey; }; @@ -190,7 +191,6 @@ struct lib_md_t { ptl_size_t max_size; int threshold; int pending; - ptl_unlink_t unlink; unsigned int options; unsigned int md_flags; void *user_ptr; @@ -203,7 +203,15 @@ struct lib_md_t { } md_iov; }; -#define PTL_MD_FLAG_UNLINK (1 << 0) +#define PTL_MD_FLAG_ZOMBIE (1 << 0) +#define PTL_MD_FLAG_AUTO_UNLINK (1 << 1) + +static inline int lib_md_exhausted (lib_md_t *md) +{ + return (md->threshold == 0 || + ((md->options & PTL_MD_MAX_SIZE) != 0 && + md->offset + md->max_size > md->length)); +} #ifdef PTL_USE_LIB_FREELIST typedef struct @@ -235,33 +243,117 @@ typedef struct { /* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be * extracted by masking with (PTL_COOKIE_TYPES - 1) */ -typedef struct { - int up; - int refcnt; - ptl_nid_t nid; - ptl_pid_t pid; - int num_nodes; - unsigned int debug; - lib_ptl_t tbl; - lib_ac_t ac; - lib_counters_t counters; +typedef struct lib_ni +{ + nal_t *ni_api; + ptl_process_id_t ni_pid; + lib_ptl_t ni_portals; + lib_counters_t ni_counters; + ptl_ni_limits_t ni_actual_limits; int ni_lh_hash_size; /* size of lib handle hash table */ struct list_head *ni_lh_hash_table; /* all extant lib handles, this interface */ __u64 ni_next_object_cookie; /* cookie generator */ __u64 ni_interface_cookie; /* uniquely identifies this ni in this epoch */ - struct list_head ni_test_peers; + struct list_head ni_test_peers; #ifdef PTL_USE_LIB_FREELIST - lib_freelist_t ni_free_mes; - lib_freelist_t ni_free_msgs; - lib_freelist_t 
ni_free_mds; - lib_freelist_t ni_free_eqs; + lib_freelist_t ni_free_mes; + lib_freelist_t ni_free_msgs; + lib_freelist_t ni_free_mds; + lib_freelist_t ni_free_eqs; +#endif + + struct list_head ni_active_msgs; + struct list_head ni_active_mds; + struct list_head ni_active_eqs; + +#ifdef __KERNEL__ + spinlock_t ni_lock; + wait_queue_head_t ni_waitq; +#else + pthread_mutex_t ni_mutex; + pthread_cond_t ni_cond; #endif - struct list_head ni_active_msgs; - struct list_head ni_active_mds; - struct list_head ni_active_eqs; } lib_ni_t; + +typedef struct lib_nal +{ + /* lib-level interface state */ + lib_ni_t libnal_ni; + + /* NAL-private data */ + void *libnal_data; + + /* + * send: Sends a preformatted header and payload data to a + * specified remote process. The payload is scattered over 'niov' + * fragments described by iov, starting at 'offset' for 'mlen' + * bytes. + * NB the NAL may NOT overwrite iov. + * PTL_OK on success => NAL has committed to send and will call + * lib_finalize on completion + */ + ptl_err_t (*libnal_send) + (struct lib_nal *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen); + + /* as send, but with a set of page fragments (NULL if not supported) */ + ptl_err_t (*libnal_send_pages) + (struct lib_nal *nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, ptl_kiov_t *iov, + size_t offset, size_t mlen); + /* + * recv: Receives an incoming message from a remote process. The + * payload is to be received into the scattered buffer of 'niov' + * fragments described by iov, starting at 'offset' for 'mlen' + * bytes. Payload bytes after 'mlen' up to 'rlen' are to be + * discarded. + * NB the NAL may NOT overwrite iov. 
+ * PTL_OK on success => NAL has committed to receive and will call + * lib_finalize on completion + */ + ptl_err_t (*libnal_recv) + (struct lib_nal *nal, void *private, lib_msg_t * cookie, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen); + + /* as recv, but with a set of page fragments (NULL if not supported) */ + ptl_err_t (*libnal_recv_pages) + (struct lib_nal *nal, void *private, lib_msg_t * cookie, + unsigned int niov, ptl_kiov_t *iov, + size_t offset, size_t mlen, size_t rlen); + + /* + * (un)map: Tell the NAL about some memory it will access. + * *addrkey passed to libnal_unmap() is what libnal_map() set it to. + * type of *iov depends on options. + * Set to NULL if not required. + */ + ptl_err_t (*libnal_map) + (struct lib_nal *nal, unsigned int niov, struct iovec *iov, + void **addrkey); + void (*libnal_unmap) + (struct lib_nal *nal, unsigned int niov, struct iovec *iov, + void **addrkey); + + /* as (un)map, but with a set of page fragments */ + ptl_err_t (*libnal_map_pages) + (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + void (*libnal_unmap_pages) + (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + + void (*libnal_printf)(struct lib_nal *nal, const char *fmt, ...); + + /* Calculate a network "distance" to given node */ + int (*libnal_dist) (struct lib_nal *nal, ptl_nid_t nid, unsigned long *dist); +} lib_nal_t; + #endif diff --git a/lustre/portals/include/portals/nal.h b/lustre/portals/include/portals/nal.h index 7cb3ab7..bf86569 100644 --- a/lustre/portals/include/portals/nal.h +++ b/lustre/portals/include/portals/nal.h @@ -1,6 +1,8 @@ #ifndef _NAL_H_ #define _NAL_H_ +#include "build_check.h" + /* * p30/nal.h * @@ -9,39 +11,77 @@ #include -#ifdef yield -#undef yield -#endif - typedef struct nal_t nal_t; struct nal_t { - ptl_ni_t ni; - int refct; - void *nal_data; - int *timeout; /* for libp30api users */ - int (*forward) (nal_t * nal, int index, /* Function 
ID */ - void *args, size_t arg_len, void *ret, size_t ret_len); + /* common interface state */ + int nal_refct; + ptl_handle_ni_t nal_handle; - int (*shutdown) (nal_t * nal, int interface); + /* NAL-private data */ + void *nal_data; - int (*validate) (nal_t * nal, void *base, size_t extent); + /* NAL API implementation + * NB only nal_ni_init needs to be set when the NAL registers itself */ + int (*nal_ni_init) (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *req, ptl_ni_limits_t *actual); + + void (*nal_ni_fini) (nal_t *nal); - void (*yield) (nal_t * nal); + int (*nal_get_id) (nal_t *nal, ptl_process_id_t *id); + int (*nal_ni_status) (nal_t *nal, ptl_sr_index_t register, ptl_sr_value_t *status); + int (*nal_ni_dist) (nal_t *nal, ptl_process_id_t *id, unsigned long *distance); + int (*nal_fail_nid) (nal_t *nal, ptl_nid_t nid, unsigned int threshold); - void (*lock) (nal_t * nal, unsigned long *flags); + int (*nal_me_attach) (nal_t *nal, ptl_pt_index_t portal, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); + int (*nal_me_insert) (nal_t *nal, ptl_handle_me_t *me, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); + int (*nal_me_unlink) (nal_t *nal, ptl_handle_me_t *me); + + int (*nal_md_attach) (nal_t *nal, ptl_handle_me_t *me, + ptl_md_t *md, ptl_unlink_t unlink, + ptl_handle_md_t *handle); + int (*nal_md_bind) (nal_t *nal, + ptl_md_t *md, ptl_unlink_t unlink, + ptl_handle_md_t *handle); + int (*nal_md_unlink) (nal_t *nal, ptl_handle_md_t *md); + int (*nal_md_update) (nal_t *nal, ptl_handle_md_t *md, + ptl_md_t *old_md, ptl_md_t *new_md, + ptl_handle_eq_t *testq); - void (*unlock) (nal_t * nal, unsigned long *flags); -}; + int (*nal_eq_alloc) (nal_t *nal, ptl_size_t count, + ptl_eq_handler_t handler, + ptl_handle_eq_t *handle); + int 
(*nal_eq_free) (nal_t *nal, ptl_handle_eq_t *eq); + int (*nal_eq_poll) (nal_t *nal, + ptl_handle_eq_t *eqs, int neqs, int timeout, + ptl_event_t *event, int *which); -typedef nal_t *(ptl_interface_t) (int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); -extern nal_t *PTL_IFACE_IP(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); -extern nal_t *PTL_IFACE_MYR(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); + int (*nal_ace_entry) (nal_t *nal, ptl_ac_index_t index, + ptl_process_id_t match_id, ptl_pt_index_t portal); + + int (*nal_put) (nal_t *nal, ptl_handle_md_t *md, ptl_ack_req_t ack, + ptl_process_id_t *target, ptl_pt_index_t portal, + ptl_ac_index_t ac, ptl_match_bits_t match, + ptl_size_t offset, ptl_hdr_data_t hdr_data); + int (*nal_get) (nal_t *nal, ptl_handle_md_t *md, + ptl_process_id_t *target, ptl_pt_index_t portal, + ptl_ac_index_t ac, ptl_match_bits_t match, + ptl_size_t offset); +}; -extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any); +extern nal_t *ptl_hndl2nal(ptl_handle_any_t *any); -#ifndef PTL_IFACE_DEFAULT -#define PTL_IFACE_DEFAULT (PTL_IFACE_IP) +#ifdef __KERNEL__ +extern int ptl_register_nal(ptl_interface_t interface, nal_t *nal); +extern void ptl_unregister_nal(ptl_interface_t interface); #endif #endif diff --git a/lustre/portals/include/portals/nalids.h b/lustre/portals/include/portals/nalids.h index 1b837b4..55a991b 100644 --- a/lustre/portals/include/portals/nalids.h +++ b/lustre/portals/include/portals/nalids.h @@ -1,4 +1,2 @@ -#define PTL_IFACE_TCP 1 -#define PTL_IFACE_ER 2 -#define PTL_IFACE_SS 3 -#define PTL_IFACE_MAX 4 +#include "build_check.h" + diff --git a/lustre/portals/include/portals/p30.h b/lustre/portals/include/portals/p30.h index 8b1495e..4b8631d 100644 --- a/lustre/portals/include/portals/p30.h +++ b/lustre/portals/include/portals/p30.h @@ -4,6 +4,8 @@ #ifndef _P30_H_ #define _P30_H_ +#include "build_check.h" + /* * p30.h * @@ -19,53 +21,6 @@ #endif #include -#include #include -#include 
- -extern int __p30_initialized; /* for libraries & test codes */ -extern int __p30_myr_initialized; /* that don't know if p30 */ -extern int __p30_ip_initialized; /* had been initialized yet */ -extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle; - -extern int __p30_myr_timeout; /* in seconds, for PtlNIBarrier, */ -extern int __p30_ip_timeout; /* PtlReduce_all, & PtlBroadcast_all */ - -/* - * Debugging flags reserved for the Portals reference library. - * These are not part of the API as described in the SAND report - * but are for the use of the maintainers of the reference implementation. - * - * It is not expected that the real implementations will export - * this functionality. - */ -#define PTL_DEBUG_NONE 0ul -#define PTL_DEBUG_ALL (0x0FFFul) /* Only the Portals flags */ - -#define __bit(x) ((unsigned long) 1<<(x)) -#define PTL_DEBUG_PUT __bit(0) -#define PTL_DEBUG_GET __bit(1) -#define PTL_DEBUG_REPLY __bit(2) -#define PTL_DEBUG_ACK __bit(3) -#define PTL_DEBUG_DROP __bit(4) -#define PTL_DEBUG_REQUEST __bit(5) -#define PTL_DEBUG_DELIVERY __bit(6) -#define PTL_DEBUG_UNLINK __bit(7) -#define PTL_DEBUG_THRESHOLD __bit(8) -#define PTL_DEBUG_API __bit(9) - -/* - * These eight are reserved for the NAL to define - * It should probably give them better names... 
- */ -#define PTL_DEBUG_NI_ALL (0xF000ul) /* Only the NAL flags */ -#define PTL_DEBUG_NI0 __bit(24) -#define PTL_DEBUG_NI1 __bit(25) -#define PTL_DEBUG_NI2 __bit(26) -#define PTL_DEBUG_NI3 __bit(27) -#define PTL_DEBUG_NI4 __bit(28) -#define PTL_DEBUG_NI5 __bit(29) -#define PTL_DEBUG_NI6 __bit(30) -#define PTL_DEBUG_NI7 __bit(31) #endif diff --git a/lustre/portals/include/portals/ppid.h b/lustre/portals/include/portals/ppid.h deleted file mode 100644 index 760f465..0000000 --- a/lustre/portals/include/portals/ppid.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef _INCppidh_ -#define _INCppidh_ - -#include "defines.h" -// #include "idtypes.h" - - -#define MAX_PPID 1000 /* this needs to fit into 16 bits so the - maximum value is 65535. having it "large" - can help w/ debugging process accounting - but there are reasons for making it - somewhat smaller than the maximum -- - requiring storage for arrays that index - on the ppid, eg... */ - -#define MAX_GID 1000 /* this needs to fit into 16 bits... */ - -#define MAX_FIXED_PPID 100 -#define MAX_FIXED_GID 100 -#define PPID_FLOATING MAX_FIXED_PPID+1 /* Floating area starts here */ -#define GID_FLOATING MAX_FIXED_GID+1 /* Floating area starts here */ -#define NUM_PTL_TASKS MAX_FIXED_PPID+80 /* Maximum no. 
portals tasks */ - -#define PPID_AUTO 0 - -/* Minimum PPID is 1 */ -#define PPID_BEBOPD 1 /* bebopd */ -#define GID_BEBOPD 1 /* bebopd */ - -#define PPID_PCT 2 /* pct */ -#define GID_PCT 2 /* pct */ - -#define PPID_FYOD 3 /* fyod */ -#define GID_FYOD 3 /* fyod */ - -#define PPID_GDBWRAP 11 /* portals proxy for gdb */ -#define GID_GDBWRAP 11 /* portals proxy for gdb */ - -#define PPID_TEST 15 /* for portals tests */ -#define GID_TEST 15 - -#define GID_YOD 5 /* yod */ -#define GID_PINGD 6 /* pingd */ -#define GID_BT 7 /* bt */ -#define GID_PTLTEST 8 /* ptltest */ -#define GID_CGDB 9 /* cgdb */ -#define GID_TVDSVR 10 /* start-tvdsvr */ - -#endif /* _INCppidh_ */ diff --git a/lustre/portals/include/portals/ptlctl.h b/lustre/portals/include/portals/ptlctl.h index 12ef47a..cfddde2 100644 --- a/lustre/portals/include/portals/ptlctl.h +++ b/lustre/portals/include/portals/ptlctl.h @@ -23,6 +23,10 @@ #ifndef _PTLCTL_H_ #define _PTLCTL_H_ +#include +#include +#include + #define PORTALS_DEV_ID 0 #define PORTALS_DEV_PATH "/dev/portals" #define OBD_DEV_ID 1 @@ -35,9 +39,12 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid); int ptl_initialize(int argc, char **argv); int jt_ptl_network(int argc, char **argv); -int jt_ptl_print_autoconnects (int argc, char **argv); -int jt_ptl_add_autoconnect (int argc, char **argv); -int jt_ptl_del_autoconnect (int argc, char **argv); +int jt_ptl_print_interfaces(int argc, char **argv); +int jt_ptl_add_interface(int argc, char **argv); +int jt_ptl_del_interface(int argc, char **argv); +int jt_ptl_print_peers (int argc, char **argv); +int jt_ptl_add_peer (int argc, char **argv); +int jt_ptl_del_peer (int argc, char **argv); int jt_ptl_print_connections (int argc, char **argv); int jt_ptl_connect(int argc, char **argv); int jt_ptl_disconnect(int argc, char **argv); @@ -50,9 +57,6 @@ int jt_ptl_add_uuid(int argc, char **argv); int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ int jt_ptl_close_uuid(int argc, char **argv); 
int jt_ptl_del_uuid(int argc, char **argv); -int jt_ptl_rxmem (int argc, char **argv); -int jt_ptl_txmem (int argc, char **argv); -int jt_ptl_nagle (int argc, char **argv); int jt_ptl_add_route (int argc, char **argv); int jt_ptl_del_route (int argc, char **argv); int jt_ptl_notify_router (int argc, char **argv); @@ -76,13 +80,15 @@ int jt_dbg_panic(int argc, char **argv); int ptl_set_cfg_record_cb(cfg_record_cb_t cb); /* l_ioctl.c */ -typedef int (ioc_handler_t)(int dev_id, int opc, void *buf); +typedef int (ioc_handler_t)(int dev_id, unsigned int opc, void *buf); void set_ioc_handler(ioc_handler_t *handler); int register_ioc_dev(int dev_id, const char * dev_name); void unregister_ioc_dev(int dev_id); int set_ioctl_dump(char * file); -int l_ioctl(int dev_id, int opc, void *buf); -int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)); +int l_ioctl(int dev_id, unsigned int opc, void *buf); +int parse_dump(char * dump_file, ioc_handler_t ioc_func); int jt_ioc_dump(int argc, char **argv); +extern char *dump_filename; +int dump(int dev_id, unsigned int opc, void *buf); #endif diff --git a/lustre/portals/include/portals/types.h b/lustre/portals/include/portals/types.h index 80995e9..0bada40 100644 --- a/lustre/portals/include/portals/types.h +++ b/lustre/portals/include/portals/types.h @@ -1,19 +1,18 @@ #ifndef _P30_TYPES_H_ #define _P30_TYPES_H_ -#include - -#ifdef __KERNEL__ -# include -# include -#else -# include -# define do_gettimeofday(tv) gettimeofday(tv, NULL); -typedef unsigned long long cycles_t; -#endif +#include "build_check.h" +#include #include +/* This implementation uses the same type for API function return codes and + * the completion status in an event */ +#define PTL_NI_OK PTL_OK +typedef ptl_err_t ptl_ni_fail_t; + +typedef __u32 ptl_uid_t; +typedef __u32 ptl_jid_t; typedef __u64 ptl_nid_t; typedef __u32 ptl_pid_t; typedef __u32 ptl_pt_index_t; @@ -22,6 +21,8 @@ typedef __u64 ptl_match_bits_t; typedef __u64 ptl_hdr_data_t; 
typedef __u32 ptl_size_t; +#define PTL_TIME_FOREVER (-1) + typedef struct { unsigned long nal_idx; /* which network interface */ __u64 cookie; /* which thing on that interface */ @@ -32,15 +33,17 @@ typedef ptl_handle_any_t ptl_handle_eq_t; typedef ptl_handle_any_t ptl_handle_md_t; typedef ptl_handle_any_t ptl_handle_me_t; -#define PTL_HANDLE_NONE \ +#define PTL_INVALID_HANDLE \ ((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1}) -#define PTL_EQ_NONE PTL_HANDLE_NONE +#define PTL_EQ_NONE PTL_INVALID_HANDLE -static inline int PtlHandleEqual (ptl_handle_any_t h1, ptl_handle_any_t h2) +static inline int PtlHandleIsEqual (ptl_handle_any_t h1, ptl_handle_any_t h2) { return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie); } +#define PTL_UID_ANY ((ptl_uid_t) -1) +#define PTL_JID_ANY ((ptl_jid_t) -1) #define PTL_NID_ANY ((ptl_nid_t) -1) #define PTL_PID_ANY ((ptl_pid_t) -1) @@ -60,41 +63,58 @@ typedef enum { } ptl_ins_pos_t; typedef struct { - struct page *kiov_page; - unsigned int kiov_len; - unsigned int kiov_offset; -} ptl_kiov_t; - -typedef struct { void *start; ptl_size_t length; int threshold; int max_size; unsigned int options; void *user_ptr; - ptl_handle_eq_t eventq; - unsigned int niov; + ptl_handle_eq_t eq_handle; } ptl_md_t; /* Options for the MD structure */ -#define PTL_MD_OP_PUT (1 << 0) -#define PTL_MD_OP_GET (1 << 1) -#define PTL_MD_MANAGE_REMOTE (1 << 2) -#define PTL_MD_AUTO_UNLINK (1 << 3) -#define PTL_MD_TRUNCATE (1 << 4) -#define PTL_MD_ACK_DISABLE (1 << 5) -#define PTL_MD_IOV (1 << 6) -#define PTL_MD_MAX_SIZE (1 << 7) -#define PTL_MD_KIOV (1 << 8) +#define PTL_MD_OP_PUT (1 << 0) +#define PTL_MD_OP_GET (1 << 1) +#define PTL_MD_MANAGE_REMOTE (1 << 2) +/* unused (1 << 3) */ +#define PTL_MD_TRUNCATE (1 << 4) +#define PTL_MD_ACK_DISABLE (1 << 5) +#define PTL_MD_IOVEC (1 << 6) +#define PTL_MD_MAX_SIZE (1 << 7) +#define PTL_MD_KIOV (1 << 8) +#define PTL_MD_EVENT_START_DISABLE (1 << 9) +#define PTL_MD_EVENT_END_DISABLE (1 << 10) + +/* For compatibility 
with Cray Portals */ +#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS 0 +#define PTL_MD_PHYS 0 #define PTL_MD_THRESH_INF (-1) +/* NB lustre portals uses struct iovec internally! */ +typedef struct iovec ptl_md_iovec_t; + +typedef struct { + struct page *kiov_page; + unsigned int kiov_len; + unsigned int kiov_offset; +} ptl_kiov_t; + typedef enum { - PTL_EVENT_GET, - PTL_EVENT_PUT, - PTL_EVENT_REPLY, + PTL_EVENT_GET_START, + PTL_EVENT_GET_END, + + PTL_EVENT_PUT_START, + PTL_EVENT_PUT_END, + + PTL_EVENT_REPLY_START, + PTL_EVENT_REPLY_END, + PTL_EVENT_ACK, - PTL_EVENT_SENT, + + PTL_EVENT_SEND_START, + PTL_EVENT_SEND_END, + PTL_EVENT_UNLINK, } ptl_event_kind_t; @@ -111,17 +131,21 @@ typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t; #endif typedef struct { ptl_event_kind_t type; - ptl_err_t status; - int unlinked; ptl_process_id_t initiator; - ptl_pt_index_t portal; + ptl_uid_t uid; + ptl_jid_t jid; + ptl_pt_index_t pt_index; ptl_match_bits_t match_bits; ptl_size_t rlength; - ptl_size_t mlength; - ptl_size_t offset; - ptl_md_t mem_desc; + ptl_size_t mlength; + ptl_size_t offset; + ptl_handle_md_t md_handle; + ptl_md_t md; ptl_hdr_data_t hdr_data; - struct timeval arrival_time; + ptl_seq_t link; + ptl_ni_fail_t ni_fail_type; + + int unlinked; volatile ptl_seq_t sequence; } ptl_event_t; @@ -134,23 +158,18 @@ typedef enum { PTL_NOACK_REQ } ptl_ack_req_t; -typedef struct { - volatile ptl_seq_t sequence; - ptl_size_t size; - ptl_event_t *base; - ptl_handle_any_t cb_eq_handle; -} ptl_eq_t; - -typedef struct { - ptl_eq_t *eq; -} ptl_ni_t; +typedef void (*ptl_eq_handler_t)(ptl_event_t *event); +#define PTL_EQ_HANDLER_NONE NULL typedef struct { - int max_match_entries; /* max number of match entries */ - int max_mem_descriptors; /* max number of memory descriptors */ - int max_event_queues; /* max number of event queues */ - int max_atable_index; /* maximum access control list table index */ - int max_ptable_index; /* maximum portals table index */ + int max_mes; + int max_mds; + int 
max_eqs; + int max_ac_index; + int max_pt_index; + int max_md_iovecs; + int max_me_list; + int max_getput_md; } ptl_ni_limits_t; /* @@ -168,4 +187,7 @@ typedef enum { typedef int ptl_sr_value_t; +typedef int ptl_interface_t; +#define PTL_IFACE_DEFAULT (-1) + #endif diff --git a/lustre/portals/knals/Makefile.in b/lustre/portals/knals/Makefile.in index b5ed168..9763d14 100644 --- a/lustre/portals/knals/Makefile.in +++ b/lustre/portals/knals/Makefile.in @@ -1,5 +1,6 @@ @BUILD_GMNAL_TRUE@subdir-m += gmnal -@BUILD_IBNAL_TRUE@subdir-m += ibnal +@BUILD_OPENIBNAL_TRUE@subdir-m += openibnal +@BUILD_IIBNAL_TRUE@subdir-m += iibnal @BUILD_QSWNAL_TRUE@subdir-m += qswnal subdir-m += socknal diff --git a/lustre/portals/knals/autoMakefile.am b/lustre/portals/knals/autoMakefile.am index 9d04a46..0090364 100644 --- a/lustre/portals/knals/autoMakefile.am +++ b/lustre/portals/knals/autoMakefile.am @@ -3,4 +3,4 @@ # This code is issued under the GNU General Public License. # See the file COPYING in this distribution -SUBDIRS = gmnal ibnal qswnal socknal +SUBDIRS = gmnal iibnal openibnal qswnal socknal diff --git a/lustre/portals/knals/gmnal/gmnal.h b/lustre/portals/knals/gmnal/gmnal.h index ad46b90..9c4425b 100644 --- a/lustre/portals/knals/gmnal/gmnal.h +++ b/lustre/portals/knals/gmnal/gmnal.h @@ -55,10 +55,13 @@ #include "linux/kp30.h" #include "portals/p30.h" -#include "portals/lib-nal.h" +#include "portals/nal.h" #include "portals/lib-p30.h" #define GM_STRONG_TYPES 1 +#ifdef VERSION +#undef VERSION +#endif #include "gm.h" #include "gm_internal.h" @@ -190,8 +193,6 @@ typedef struct _gmnal_rxtwe { #define NRXTHREADS 10 /* max number of receiver threads */ typedef struct _gmnal_data_t { - int refcnt; - spinlock_t cb_lock; spinlock_t stxd_lock; struct semaphore stxd_token; gmnal_stxd_t *stxd; @@ -206,7 +207,7 @@ typedef struct _gmnal_data_t { gmnal_srxd_t *srxd; struct gm_hash *srxd_hash; nal_t *nal; - nal_cb_t *nal_cb; + lib_nal_t *libnal; struct gm_port *gm_port; unsigned int 
gm_local_nid; unsigned int gm_global_nid; @@ -299,7 +300,6 @@ extern gmnal_data_t *global_nal_data; #define GMNAL_GM_LOCK_INIT(a) spin_lock_init(&a->gm_lock); #define GMNAL_GM_LOCK(a) spin_lock(&a->gm_lock); #define GMNAL_GM_UNLOCK(a) spin_unlock(&a->gm_lock); -#define GMNAL_CB_LOCK_INIT(a) spin_lock_init(&a->cb_lock); /* @@ -309,13 +309,16 @@ extern gmnal_data_t *global_nal_data; /* * API NAL */ +int gmnal_api_startup(nal_t *, ptl_pid_t, + ptl_ni_limits_t *, ptl_ni_limits_t *); + int gmnal_api_forward(nal_t *, int, void *, size_t, void *, size_t); -int gmnal_api_shutdown(nal_t *, int); +void gmnal_api_shutdown(nal_t *); int gmnal_api_validate(nal_t *, void *, size_t); -void gmnal_api_yield(nal_t *); +void gmnal_api_yield(nal_t *, unsigned long *, int); void gmnal_api_lock(nal_t *, unsigned long *); @@ -323,15 +326,9 @@ void gmnal_api_unlock(nal_t *, unsigned long *); #define GMNAL_INIT_NAL(a) do { \ - a->forward = gmnal_api_forward; \ - a->shutdown = gmnal_api_shutdown; \ - a->validate = NULL; \ - a->yield = gmnal_api_yield; \ - a->lock = gmnal_api_lock; \ - a->unlock = gmnal_api_unlock; \ - a->timeout = NULL; \ - a->refct = 1; \ - a->nal_data = NULL; \ + (a)->nal_ni_init = gmnal_api_startup; \ + (a)->nal_ni_fini = gmnal_api_shutdown; \ + (a)->nal_data = NULL; \ } while (0) @@ -339,63 +336,35 @@ void gmnal_api_unlock(nal_t *, unsigned long *); * CB NAL */ -int gmnal_cb_send(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, - int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t); - -int gmnal_cb_send_pages(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, - int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t); - -int gmnal_cb_recv(nal_cb_t *, void *, lib_msg_t *, - unsigned int, struct iovec *, size_t, size_t); - -int gmnal_cb_recv_pages(nal_cb_t *, void *, lib_msg_t *, - unsigned int, ptl_kiov_t *, size_t, size_t); - -int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t); - -int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void 
*, size_t); - -int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *); - -void *gmnal_cb_malloc(nal_cb_t *, size_t); - -void gmnal_cb_free(nal_cb_t *, void *, size_t); - -void gmnal_cb_unmap(nal_cb_t *, unsigned int, struct iovec*, void **); - -int gmnal_cb_map(nal_cb_t *, unsigned int, struct iovec*, void **); +ptl_err_t gmnal_cb_send(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, + int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t, size_t); -void gmnal_cb_printf(nal_cb_t *, const char *fmt, ...); +ptl_err_t gmnal_cb_send_pages(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, + int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t, size_t); -void gmnal_cb_cli(nal_cb_t *, unsigned long *); +ptl_err_t gmnal_cb_recv(lib_nal_t *, void *, lib_msg_t *, + unsigned int, struct iovec *, size_t, size_t, size_t); -void gmnal_cb_sti(nal_cb_t *, unsigned long *); +ptl_err_t gmnal_cb_recv_pages(lib_nal_t *, void *, lib_msg_t *, + unsigned int, ptl_kiov_t *, size_t, size_t, size_t); -int gmnal_cb_dist(nal_cb_t *, ptl_nid_t, unsigned long *); +int gmnal_cb_dist(lib_nal_t *, ptl_nid_t, unsigned long *); -nal_t *gmnal_init(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t rpid); +int gmnal_init(void); void gmnal_fini(void); #define GMNAL_INIT_NAL_CB(a) do { \ - a->cb_send = gmnal_cb_send; \ - a->cb_send_pages = gmnal_cb_send_pages; \ - a->cb_recv = gmnal_cb_recv; \ - a->cb_recv_pages = gmnal_cb_recv_pages; \ - a->cb_read = gmnal_cb_read; \ - a->cb_write = gmnal_cb_write; \ - a->cb_callback = gmnal_cb_callback; \ - a->cb_malloc = gmnal_cb_malloc; \ - a->cb_free = gmnal_cb_free; \ - a->cb_map = NULL; \ - a->cb_unmap = NULL; \ - a->cb_printf = gmnal_cb_printf; \ - a->cb_cli = gmnal_cb_cli; \ - a->cb_sti = gmnal_cb_sti; \ - a->cb_dist = gmnal_cb_dist; \ - a->nal_data = NULL; \ + a->libnal_send = gmnal_cb_send; \ + a->libnal_send_pages = gmnal_cb_send_pages; \ + a->libnal_recv = gmnal_cb_recv; \ + a->libnal_recv_pages = gmnal_cb_recv_pages; \ + 
a->libnal_map = NULL; \ + a->libnal_unmap = NULL; \ + a->libnal_dist = gmnal_cb_dist; \ + a->libnal_data = NULL; \ } while (0) @@ -450,11 +419,11 @@ void gmnal_remove_rxtwe(gmnal_data_t *); /* * Small messages */ -int gmnal_small_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int, - struct iovec *, size_t, size_t); -int gmnal_small_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, +int gmnal_small_rx(lib_nal_t *, void *, lib_msg_t *, unsigned int, + struct iovec *, size_t, size_t, size_t); +int gmnal_small_tx(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, - unsigned int, struct iovec*, int); + unsigned int, struct iovec*, size_t, int); void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); @@ -462,12 +431,12 @@ void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); /* * Large messages */ -int gmnal_large_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int, - struct iovec *, size_t, size_t); +int gmnal_large_rx(lib_nal_t *, void *, lib_msg_t *, unsigned int, + struct iovec *, size_t, size_t, size_t); -int gmnal_large_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, +int gmnal_large_tx(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, - struct iovec*, int); + struct iovec*, size_t, int); void gmnal_large_tx_callback(gm_port_t *, void *, gm_status_t); diff --git a/lustre/portals/knals/gmnal/gmnal_api.c b/lustre/portals/knals/gmnal/gmnal_api.c index 1442aa7..bd6c83e 100644 --- a/lustre/portals/knals/gmnal/gmnal_api.c +++ b/lustre/portals/knals/gmnal/gmnal_api.c @@ -30,6 +30,9 @@ gmnal_data_t *global_nal_data = NULL; #define GLOBAL_NID_STR_LEN 16 char global_nid_str[GLOBAL_NID_STR_LEN] = {0}; +ptl_handle_ni_t kgmnal_ni; + +extern int gmnal_cmd(struct portals_cfg *pcfg, void *private); /* * Write the global nid /proc/sys/gmnal/globalnid @@ -50,224 +53,112 @@ static ctl_table gmnalnal_top_sysctl_table[] = { { 0 } }; - - - - - -/* - * gmnal_api_forward - * This function takes a pack block of 
arguments from the NAL API - * module and passes them to the NAL CB module. The CB module unpacks - * the args and calls the appropriate function indicated by index. - * Typically this function is used to pass args between kernel and use - * space. - * As lgmanl exists entirely in kernel, just pass the arg block directly - * to the NAL CB, buy passing the args to lib_dispatch - * Arguments are - * nal_t nal Our nal - * int index the api function that initiated this call - * void *args packed block of function args - * size_t arg_len length of args block - * void *ret A return value for the API NAL - * size_t ret_len Size of the return value - * - */ - -int -gmnal_api_forward(nal_t *nal, int index, void *args, size_t arg_len, - void *ret, size_t ret_len) -{ - - nal_cb_t *nal_cb = NULL; - gmnal_data_t *nal_data = NULL; - - - - - - if (!nal || !args || (index < 0) || (arg_len < 0)) { - CDEBUG(D_ERROR, "Bad args to gmnal_api_forward\n"); - return (PTL_FAIL); - } - - if (ret && (ret_len <= 0)) { - CDEBUG(D_ERROR, "Bad args to gmnal_api_forward\n"); - return (PTL_FAIL); - } - - - if (!nal->nal_data) { - CDEBUG(D_ERROR, "bad nal, no nal data\n"); - return (PTL_FAIL); - } - - nal_data = nal->nal_data; - CDEBUG(D_INFO, "nal_data is [%p]\n", nal_data); - - if (!nal_data->nal_cb) { - CDEBUG(D_ERROR, "bad nal_data, no nal_cb\n"); - return (PTL_FAIL); - } - - nal_cb = nal_data->nal_cb; - CDEBUG(D_INFO, "nal_cb is [%p]\n", nal_cb); - - CDEBUG(D_PORTALS, "gmnal_api_forward calling lib_dispatch\n"); - lib_dispatch(nal_cb, NULL, index, args, ret); - CDEBUG(D_PORTALS, "gmnal_api_forward returns from lib_dispatch\n"); - - return(PTL_OK); -} - - /* * gmnal_api_shutdown + * nal_refct == 0 => called on last matching PtlNIFini() * Close down this interface and free any resources associated with it * nal_t nal our nal to shutdown */ -int -gmnal_api_shutdown(nal_t *nal, int interface) -{ - - gmnal_data_t *nal_data = nal->nal_data; - - CDEBUG(D_TRACE, "gmnal_api_shutdown: nal_data [%p]\n", 
nal_data); - - return(PTL_OK); -} - - -/* - * gmnal_api_validate - * validate a user address for use in communications - * There's nothing to be done here - */ -int -gmnal_api_validate(nal_t *nal, void *base, size_t extent) -{ - - return(PTL_OK); -} - - - -/* - * gmnal_api_yield - * Give up the processor - */ void -gmnal_api_yield(nal_t *nal) +gmnal_api_shutdown(nal_t *nal) { - CDEBUG(D_TRACE, "gmnal_api_yield : nal [%p]\n", nal); - - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - - return; -} - - - -/* - * gmnal_api_lock - * Take a threadsafe lock - */ -void -gmnal_api_lock(nal_t *nal, unsigned long *flags) -{ - gmnal_data_t *nal_data; - nal_cb_t *nal_cb; - - nal_data = nal->nal_data; - nal_cb = nal_data->nal_cb; + lib_nal_t *libnal; - nal_cb->cb_cli(nal_cb, flags); + if (nal->nal_refct != 0) + return; + - return; -} + LASSERT(nal == global_nal_data->nal); + libnal = (lib_nal_t *)nal->nal_data; + nal_data = (gmnal_data_t *)libnal->libnal_data; + LASSERT(nal_data == global_nal_data); + CDEBUG(D_TRACE, "gmnal_api_shutdown: nal_data [%p]\n", nal_data); -/* - * gmnal_api_unlock - * Release a threadsafe lock - */ -void -gmnal_api_unlock(nal_t *nal, unsigned long *flags) -{ - gmnal_data_t *nal_data; - nal_cb_t *nal_cb; + /* Stop portals calling our ioctl handler */ + libcfs_nal_cmd_unregister(GMNAL); - nal_data = nal->nal_data; - nal_cb = nal_data->nal_cb; + /* XXX for shutdown "under fire" we probably need to set a shutdown + * flag so when lib calls us we fail immediately and dont queue any + * more work but our threads can still call into lib OK. 
THEN + * shutdown our threads, THEN lib_fini() */ + lib_fini(libnal); - nal_cb->cb_sti(nal_cb, flags); + gmnal_stop_rxthread(nal_data); + gmnal_stop_ctthread(nal_data); + gmnal_free_txd(nal_data); + gmnal_free_srxd(nal_data); + GMNAL_GM_LOCK(nal_data); + gm_close(nal_data->gm_port); + gm_finalize(); + GMNAL_GM_UNLOCK(nal_data); + if (nal_data->sysctl) + unregister_sysctl_table (nal_data->sysctl); + /* Don't free 'nal'; it's a static struct */ + PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); - return; + global_nal_data = NULL; + PORTAL_MODULE_UNUSE; } -nal_t * -gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, - ptl_pid_t rpid) +int +gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) { - nal_t *nal = NULL; - nal_cb_t *nal_cb = NULL; + lib_nal_t *libnal = NULL; gmnal_data_t *nal_data = NULL; gmnal_srxd_t *srxd = NULL; gm_status_t gm_status; unsigned int local_nid = 0, global_nid = 0; - ptl_nid_t portals_nid; - ptl_pid_t portals_pid = 0; + ptl_process_id_t process_id; + + if (nal->nal_refct != 0) { + if (actual_limits != NULL) { + libnal = (lib_nal_t *)nal->nal_data; + *actual_limits = libnal->libnal_ni.ni_actual_limits; + } + return (PTL_OK); + } + /* Called on first PtlNIInit() */ - CDEBUG(D_TRACE, "gmnal_init : interface [%d], ptl_size [%d], " - "ac_size[%d]\n", interface, ptl_size, ac_size); + CDEBUG(D_TRACE, "startup\n"); + LASSERT(global_nal_data == NULL); PORTAL_ALLOC(nal_data, sizeof(gmnal_data_t)); if (!nal_data) { CDEBUG(D_ERROR, "can't get memory\n"); - return(NULL); + return(PTL_NO_SPACE); } memset(nal_data, 0, sizeof(gmnal_data_t)); /* * set the small message buffer size */ - nal_data->refcnt = 1; CDEBUG(D_INFO, "Allocd and reset nal_data[%p]\n", nal_data); CDEBUG(D_INFO, "small_msg_size is [%d]\n", nal_data->small_msg_size); - PORTAL_ALLOC(nal, sizeof(nal_t)); - if (!nal) { + PORTAL_ALLOC(libnal, sizeof(lib_nal_t)); 
+ if (!libnal) { PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - return(NULL); + return(PTL_NO_SPACE); } - memset(nal, 0, sizeof(nal_t)); - CDEBUG(D_INFO, "Allocd and reset nal[%p]\n", nal); + memset(libnal, 0, sizeof(lib_nal_t)); + CDEBUG(D_INFO, "Allocd and reset libnal[%p]\n", libnal); - PORTAL_ALLOC(nal_cb, sizeof(nal_cb_t)); - if (!nal_cb) { - PORTAL_FREE(nal, sizeof(nal_t)); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - return(NULL); - } - memset(nal_cb, 0, sizeof(nal_cb_t)); - CDEBUG(D_INFO, "Allocd and reset nal_cb[%p]\n", nal_cb); - - GMNAL_INIT_NAL(nal); - GMNAL_INIT_NAL_CB(nal_cb); + GMNAL_INIT_NAL_CB(libnal); /* * String them all together */ - nal->nal_data = (void*)nal_data; - nal_cb->nal_data = (void*)nal_data; + libnal->libnal_data = (void*)nal_data; nal_data->nal = nal; - nal_data->nal_cb = nal_cb; + nal_data->libnal = libnal; - GMNAL_CB_LOCK_INIT(nal_data); GMNAL_GM_LOCK_INIT(nal_data); @@ -277,15 +168,14 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, CDEBUG(D_INFO, "Calling gm_init\n"); if (gm_init() != GM_SUCCESS) { CDEBUG(D_ERROR, "call to gm_init failed\n"); - PORTAL_FREE(nal, sizeof(nal_t)); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); - return(NULL); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); + return(PTL_FAIL); } - CDEBUG(D_NET, "Calling gm_open with interface [%d], port [%d], " - "name [%s], version [%d]\n", interface, GMNAL_GM_PORT, + CDEBUG(D_NET, "Calling gm_open with port [%d], " + "name [%s], version [%d]\n", GMNAL_GM_PORT, "gmnal", GM_API_VERSION); GMNAL_GM_LOCK(nal_data); @@ -323,10 +213,9 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, GMNAL_GM_LOCK(nal_data); gm_finalize(); GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal, sizeof(nal_t)); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); - return(NULL); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); + return(PTL_FAIL); } @@ -341,10 +230,9 @@ 
gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, gm_close(nal_data->gm_port); gm_finalize(); GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal, sizeof(nal_t)); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); - return(NULL); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); + return(PTL_FAIL); } @@ -371,10 +259,9 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, gm_close(nal_data->gm_port); gm_finalize(); GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal, sizeof(nal_t)); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); - return(NULL); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); + return(PTL_FAIL); } gmnal_start_kernel_threads(nal_data); @@ -404,13 +291,14 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, gm_close(nal_data->gm_port); gm_finalize(); GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal, sizeof(nal_t)); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); - return(NULL); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); + return(PTL_FAIL); } + nal_data->gm_local_nid = local_nid; CDEBUG(D_INFO, "Local node id is [%u]\n", local_nid); + GMNAL_GM_LOCK(nal_data); gm_status = gm_node_id_to_global_id(nal_data->gm_port, local_nid, &global_nid); @@ -425,10 +313,9 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, gm_close(nal_data->gm_port); gm_finalize(); GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal, sizeof(nal_t)); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); - return(NULL); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); + return(PTL_FAIL); } CDEBUG(D_INFO, "Global node id is [%u]\n", global_nid); nal_data->gm_global_nid = global_nid; @@ -437,13 +324,15 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, /* pid = gm_getpid(); */ - CDEBUG(D_INFO, "portals_pid is [%u]\n", portals_pid); - portals_nid = (unsigned 
long)global_nid; - CDEBUG(D_INFO, "portals_nid is ["LPU64"]\n", portals_nid); + process_id.pid = requested_pid; + process_id.nid = global_nid; + + CDEBUG(D_INFO, "portals_pid is [%u]\n", process_id.pid); + CDEBUG(D_INFO, "portals_nid is ["LPU64"]\n", process_id.nid); CDEBUG(D_PORTALS, "calling lib_init\n"); - if (lib_init(nal_cb, portals_nid, portals_pid, 1024, ptl_size, - ac_size) != PTL_OK) { + if (lib_init(libnal, nal, process_id, + requested_limits, actual_limits) != PTL_OK) { CDEBUG(D_ERROR, "lib_init failed\n"); gmnal_stop_rxthread(nal_data); gmnal_stop_ctthread(nal_data); @@ -453,48 +342,83 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, gm_close(nal_data->gm_port); gm_finalize(); GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal, sizeof(nal_t)); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); - return(NULL); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); + return(PTL_FAIL); } + + if (libcfs_nal_cmd_register(GMNAL, &gmnal_cmd, libnal->libnal_data) != 0) { + CDEBUG(D_INFO, "libcfs_nal_cmd_register failed\n"); + + /* XXX these cleanup cases should be restructured to + * minimise duplication... 
*/ + lib_fini(libnal); + + gmnal_stop_rxthread(nal_data); + gmnal_stop_ctthread(nal_data); + gmnal_free_txd(nal_data); + gmnal_free_srxd(nal_data); + GMNAL_GM_LOCK(nal_data); + gm_close(nal_data->gm_port); + gm_finalize(); + GMNAL_GM_UNLOCK(nal_data); + PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); + return(PTL_FAIL); + } + + /* might be better to initialise this at module load rather than in + * NAL startup */ nal_data->sysctl = NULL; nal_data->sysctl = register_sysctl_table (gmnalnal_top_sysctl_table, 0); CDEBUG(D_INFO, "gmnal_init finished\n"); global_nal_data = nal->nal_data; - return(nal); + + /* no unload now until shutdown */ + PORTAL_MODULE_USE; + + return(PTL_OK); } +nal_t the_gm_nal; + +/* + * Called when module loaded + */ +int gmnal_init(void) +{ + int rc; + + memset(&the_gm_nal, 0, sizeof(nal_t)); + CDEBUG(D_INFO, "reset nal[%p]\n", &the_gm_nal); + GMNAL_INIT_NAL(&the_gm_nal); + + rc = ptl_register_nal(GMNAL, &the_gm_nal); + if (rc != PTL_OK) + CERROR("Can't register GMNAL: %d\n", rc); + rc = PtlNIInit(GMNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kgmnal_ni); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { + ptl_unregister_nal(GMNAL); + return (-ENODEV); + } + + return (rc); +} + /* * Called when module removed */ void gmnal_fini() { - gmnal_data_t *nal_data = global_nal_data; - nal_t *nal = nal_data->nal; - nal_cb_t *nal_cb = nal_data->nal_cb; - CDEBUG(D_TRACE, "gmnal_fini\n"); - PtlNIFini(kgmnal_ni); - lib_fini(nal_cb); + LASSERT(global_nal_data == NULL); + PtlNIFini(kgmnal_ni); - gmnal_stop_rxthread(nal_data); - gmnal_stop_ctthread(nal_data); - gmnal_free_txd(nal_data); - gmnal_free_srxd(nal_data); - GMNAL_GM_LOCK(nal_data); - gm_close(nal_data->gm_port); - gm_finalize(); - GMNAL_GM_UNLOCK(nal_data); - if (nal_data->sysctl) - unregister_sysctl_table (nal_data->sysctl); - PORTAL_FREE(nal, sizeof(nal_t)); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + ptl_unregister_nal(GMNAL); 
} diff --git a/lustre/portals/knals/gmnal/gmnal_cb.c b/lustre/portals/knals/gmnal/gmnal_cb.c index 1f287468..0ebf437 100644 --- a/lustre/portals/knals/gmnal/gmnal_cb.c +++ b/lustre/portals/knals/gmnal/gmnal_cb.c @@ -27,28 +27,28 @@ #include "gmnal.h" -int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, - unsigned int niov, struct iovec *iov, size_t mlen, - size_t rlen) +ptl_err_t gmnal_cb_recv(lib_nal_t *libnal, void *private, lib_msg_t *cookie, + unsigned int niov, struct iovec *iov, size_t offset, + size_t mlen, size_t rlen) { gmnal_srxd_t *srxd = (gmnal_srxd_t*)private; int status = PTL_OK; - CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p], " - "niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", - nal_cb, private, cookie, niov, iov, mlen, rlen); + CDEBUG(D_TRACE, "gmnal_cb_recv libnal [%p], private[%p], cookie[%p], " + "niov[%d], iov [%p], offset["LPSZ"], mlen["LPSZ"], rlen["LPSZ"]\n", + libnal, private, cookie, niov, iov, offset, mlen, rlen); switch(srxd->type) { case(GMNAL_SMALL_MESSAGE): CDEBUG(D_INFO, "gmnal_cb_recv got small message\n"); - status = gmnal_small_rx(nal_cb, private, cookie, niov, - iov, mlen, rlen); + status = gmnal_small_rx(libnal, private, cookie, niov, + iov, offset, mlen, rlen); break; case(GMNAL_LARGE_MESSAGE_INIT): CDEBUG(D_INFO, "gmnal_cb_recv got large message init\n"); - status = gmnal_large_rx(nal_cb, private, cookie, niov, - iov, mlen, rlen); + status = gmnal_large_rx(libnal, private, cookie, niov, + iov, offset, mlen, rlen); } @@ -56,9 +56,9 @@ int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, return(status); } -int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, - unsigned int kniov, ptl_kiov_t *kiov, size_t mlen, - size_t rlen) +ptl_err_t gmnal_cb_recv_pages(lib_nal_t *libnal, void *private, lib_msg_t *cookie, + unsigned int kniov, ptl_kiov_t *kiov, size_t offset, + size_t mlen, size_t rlen) { gmnal_srxd_t *srxd = (gmnal_srxd_t*)private; int status 
= PTL_OK; @@ -67,9 +67,9 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, ptl_kiov_t *kiov_dup = kiov;; - CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p], " - "cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", - nal_cb, private, cookie, kniov, kiov, mlen, rlen); + CDEBUG(D_TRACE, "gmnal_cb_recv_pages libnal [%p],private[%p], " + "cookie[%p], kniov[%d], kiov [%p], offset["LPSZ"], mlen["LPSZ"], rlen["LPSZ"]\n", + libnal, private, cookie, kniov, kiov, offset, mlen, rlen); if (srxd->type == GMNAL_SMALL_MESSAGE) { PORTAL_ALLOC(iovec, sizeof(struct iovec)*kniov); @@ -98,8 +98,8 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, kiov++; } CDEBUG(D_INFO, "calling gmnal_small_rx\n"); - status = gmnal_small_rx(nal_cb, private, cookie, kniov, - iovec_dup, mlen, rlen); + status = gmnal_small_rx(libnal, private, cookie, kniov, + iovec_dup, offset, mlen, rlen); for (i=0; ikiov_page); kiov_dup++; @@ -113,34 +113,35 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, } -int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +ptl_err_t gmnal_cb_send(lib_nal_t *libnal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, size_t len) + unsigned int niov, struct iovec *iov, size_t offset, size_t len) { gmnal_data_t *nal_data; - CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] len["LPSZ"] nid["LPU64"]\n", - niov, len, nid); - nal_data = nal_cb->nal_data; + CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] offset["LPSZ"] len["LPSZ"] nid["LPU64"]\n", + niov, offset, len, nid); + nal_data = libnal->libnal_data; if (GMNAL_IS_SMALL_MESSAGE(nal_data, niov, iov, len)) { CDEBUG(D_INFO, "This is a small message send\n"); - gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, pid, - niov, iov, len); + gmnal_small_tx(libnal, private, cookie, hdr, type, nid, pid, + niov, iov, offset, len); } else { 
CDEBUG(D_ERROR, "Large message send it is not supported\n"); - lib_finalize(nal_cb, private, cookie, PTL_FAIL); + lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); - gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, - niov, iov, len); + gmnal_large_tx(libnal, private, cookie, hdr, type, nid, pid, + niov, iov, offset, len); } return(PTL_OK); } -int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int kniov, ptl_kiov_t *kiov, size_t len) +ptl_err_t gmnal_cb_send_pages(lib_nal_t *libnal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int kniov, ptl_kiov_t *kiov, size_t offset, size_t len) { int i = 0; @@ -148,8 +149,9 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, struct iovec *iovec = NULL, *iovec_dup = NULL; ptl_kiov_t *kiov_dup = kiov; - CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] len["LPSZ"]\n", nid, kniov, len); - nal_data = nal_cb->nal_data; + CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] offset["LPSZ"] len["LPSZ"]\n", + nid, kniov, offset, len); + nal_data = libnal->libnal_data; PORTAL_ALLOC(iovec, kniov*sizeof(struct iovec)); iovec_dup = iovec; if (GMNAL_IS_SMALL_MESSAGE(nal_data, 0, NULL, len)) { @@ -168,8 +170,8 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, iovec++; kiov++; } - gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, - pid, kniov, iovec_dup, len); + gmnal_small_tx(libnal, private, cookie, hdr, type, nid, + pid, kniov, iovec_dup, offset, len); } else { CDEBUG(D_ERROR, "Large message send it is not supported yet\n"); return(PTL_FAIL); @@ -185,8 +187,8 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, iovec++; kiov++; } - gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, - pid, kniov, iovec, len); + gmnal_large_tx(libnal, private, cookie, hdr, 
type, nid, + pid, kniov, iovec, offset, len); } for (i=0; ikiov_page); @@ -196,83 +198,7 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, return(PTL_OK); } -int gmnal_cb_read(nal_cb_t *nal_cb, void *private, void *dst, - user_ptr src, size_t len) -{ - gm_bcopy(src, dst, len); - return(PTL_OK); -} - -int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst, - void *src, size_t len) -{ - gm_bcopy(src, dst, len); - return(PTL_OK); -} - -int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, - ptl_event_t *ev) -{ - - if (eq->event_callback != NULL) { - CDEBUG(D_INFO, "found callback\n"); - eq->event_callback(ev); - } - - return(PTL_OK); -} - -void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len) -{ - void *ptr = NULL; - CDEBUG(D_TRACE, "gmnal_cb_malloc len["LPSZ"]\n", len); - PORTAL_ALLOC(ptr, len); - return(ptr); -} - -void gmnal_cb_free(nal_cb_t *nal_cb, void *buf, size_t len) -{ - CDEBUG(D_TRACE, "gmnal_cb_free :: buf[%p] len["LPSZ"]\n", buf, len); - PORTAL_FREE(buf, len); - return; -} - -void gmnal_cb_unmap(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov, - void **addrkey) -{ - return; -} - -int gmnal_cb_map(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov, - void**addrkey) -{ - return(PTL_OK); -} - -void gmnal_cb_printf(nal_cb_t *nal_cb, const char *fmt, ...) 
-{ - CDEBUG(D_TRACE, "gmnal_cb_printf\n"); - printk(fmt); - return; -} - -void gmnal_cb_cli(nal_cb_t *nal_cb, unsigned long *flags) -{ - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; - - spin_lock_irqsave(&nal_data->cb_lock, *flags); - return; -} - -void gmnal_cb_sti(nal_cb_t *nal_cb, unsigned long *flags) -{ - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; - - spin_unlock_irqrestore(&nal_data->cb_lock, *flags); - return; -} - -int gmnal_cb_dist(nal_cb_t *nal_cb, ptl_nid_t nid, unsigned long *dist) +int gmnal_cb_dist(lib_nal_t *libnal, ptl_nid_t nid, unsigned long *dist) { CDEBUG(D_TRACE, "gmnal_cb_dist\n"); if (dist) diff --git a/lustre/portals/knals/gmnal/gmnal_comm.c b/lustre/portals/knals/gmnal/gmnal_comm.c index 1bcd9bd..6a8fcbc 100644 --- a/lustre/portals/knals/gmnal/gmnal_comm.c +++ b/lustre/portals/knals/gmnal/gmnal_comm.c @@ -189,6 +189,7 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) unsigned int snode, sport, type, length; gmnal_msghdr_t *gmnal_msghdr; ptl_hdr_t *portals_hdr; + int rc; CDEBUG(D_INFO, "nal_data [%p], we[%p] type [%d]\n", nal_data, we, gmnal_type); @@ -219,10 +220,12 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) */ srxd = gmnal_rxbuffer_to_srxd(nal_data, buffer); CDEBUG(D_INFO, "Back from gmnal_rxbuffer_to_srxd\n"); - srxd->nal_data = nal_data; if (!srxd) { CDEBUG(D_ERROR, "Failed to get receive descriptor\n"); - lib_parse(nal_data->nal_cb, portals_hdr, srxd); + /* I think passing a NULL srxd to lib_parse will crash + * gmnal_recv() */ + LBUG(); + lib_parse(nal_data->libnal, portals_hdr, srxd); return(GMNAL_STATUS_FAIL); } @@ -234,6 +237,7 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) return(GMNAL_STATUS_OK); } + srxd->nal_data = nal_data; srxd->type = gmnal_type; srxd->nsiov = gmnal_msghdr->niov; srxd->gm_source_node = gmnal_msghdr->sender_node_id; @@ -245,7 +249,12 @@ gmnal_pre_receive(gmnal_data_t *nal_data, 
gmnal_rxtwe_t *we, int gmnal_type) * cb_recv is responsible for returning the buffer * for future receive */ - lib_parse(nal_data->nal_cb, portals_hdr, srxd); + rc = lib_parse(nal_data->libnal, portals_hdr, srxd); + + if (rc != PTL_OK) { + /* I just received garbage; take appropriate action... */ + LBUG(); + } return(GMNAL_STATUS_OK); } @@ -309,19 +318,19 @@ gmnal_rx_bad(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, gmnal_srxd_t *srxd) * Call lib_finalize */ int -gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, - unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) +gmnal_small_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, + unsigned int niov, struct iovec *iov, size_t offset, size_t mlen, size_t rlen) { gmnal_srxd_t *srxd = NULL; void *buffer = NULL; - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; + gmnal_data_t *nal_data = (gmnal_data_t*)libnal->libnal_data; CDEBUG(D_TRACE, "niov [%d] mlen["LPSZ"]\n", niov, mlen); if (!private) { CDEBUG(D_ERROR, "gmnal_small_rx no context\n"); - lib_finalize(nal_cb, private, cookie, PTL_FAIL); + lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -331,11 +340,24 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, buffer += sizeof(ptl_hdr_t); while(niov--) { - CDEBUG(D_INFO, "processing [%p] len ["LPSZ"]\n", iov, - iov->iov_len); - gm_bcopy(buffer, iov->iov_base, iov->iov_len); - buffer += iov->iov_len; - iov++; + if (offset >= iov->iov_len) { + offset -= iov->iov_len; + } else if (offset > 0) { + CDEBUG(D_INFO, "processing [%p] base [%p] len %d, " + "offset %d, len ["LPSZ"]\n", iov, + iov->iov_base + offset, iov->iov_len, offset, + iov->iov_len - offset); + gm_bcopy(buffer, iov->iov_base + offset, + iov->iov_len - offset); + offset = 0; + buffer += iov->iov_len - offset; + } else { + CDEBUG(D_INFO, "processing [%p] len ["LPSZ"]\n", iov, + iov->iov_len); + gm_bcopy(buffer, iov->iov_base, iov->iov_len); + buffer += iov->iov_len; + } + iov++; } @@ 
-343,7 +365,7 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, * let portals library know receive is complete */ CDEBUG(D_PORTALS, "calling lib_finalize\n"); - lib_finalize(nal_cb, private, cookie, PTL_OK); + lib_finalize(libnal, private, cookie, PTL_OK); /* * return buffer so it can be used again */ @@ -365,11 +387,11 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, * The callback function informs when the send is complete. */ int -gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +gmnal_small_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, int size) + unsigned int niov, struct iovec *iov, size_t offset, int size) { - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; + gmnal_data_t *nal_data = (gmnal_data_t*)libnal->libnal_data; gmnal_stxd_t *stxd = NULL; void *buffer = NULL; gmnal_msghdr_t *msghdr = NULL; @@ -377,9 +399,9 @@ gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, unsigned int local_nid; gm_status_t gm_status = GM_SUCCESS; - CDEBUG(D_TRACE, "gmnal_small_tx nal_cb [%p] private [%p] cookie [%p] " + CDEBUG(D_TRACE, "gmnal_small_tx libnal [%p] private [%p] cookie [%p] " "hdr [%p] type [%d] global_nid ["LPU64"] pid [%d] niov [%d] " - "iov [%p] size [%d]\n", nal_cb, private, cookie, hdr, type, + "iov [%p] size [%d]\n", libnal, private, cookie, hdr, type, global_nid, pid, niov, iov, size); CDEBUG(D_INFO, "portals_hdr:: dest_nid ["LPU64"], src_nid ["LPU64"]\n", @@ -428,11 +450,21 @@ gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, buffer += sizeof(ptl_hdr_t); while(niov--) { - CDEBUG(D_INFO, "processing iov [%p] len ["LPSZ"] to [%p]\n", - iov, iov->iov_len, buffer); - gm_bcopy(iov->iov_base, buffer, iov->iov_len); - buffer+= iov->iov_len; - iov++; + if (offset >= iov->iov_len) { + offset -= iov->iov_len; + } else if (offset > 0) { + CDEBUG(D_INFO, 
"processing iov [%p] base [%p] len ["LPSZ"] to [%p]\n", + iov, iov->iov_base + offset, iov->iov_len - offset, buffer); + gm_bcopy(iov->iov_base + offset, buffer, iov->iov_len - offset); + buffer+= iov->iov_len - offset; + offset = 0; + } else { + CDEBUG(D_INFO, "processing iov [%p] len ["LPSZ"] to [%p]\n", + iov, iov->iov_len, buffer); + gm_bcopy(iov->iov_base, buffer, iov->iov_len); + buffer+= iov->iov_len; + } + iov++; } CDEBUG(D_INFO, "sending\n"); @@ -472,7 +504,7 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) gmnal_stxd_t *stxd = (gmnal_stxd_t*)context; lib_msg_t *cookie = stxd->cookie; gmnal_data_t *nal_data = (gmnal_data_t*)stxd->nal_data; - nal_cb_t *nal_cb = nal_data->nal_cb; + lib_nal_t *libnal = nal_data->libnal; if (!stxd) { CDEBUG(D_TRACE, "send completion event for unknown stxd\n"); @@ -592,7 +624,7 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) return; } gmnal_return_stxd(nal_data, stxd); - lib_finalize(nal_cb, stxd, cookie, PTL_OK); + lib_finalize(libnal, stxd, cookie, PTL_OK); return; } @@ -645,9 +677,9 @@ void gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, * this ack, deregister the memory. Only 1 send token is required here. 
*/ int -gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +gmnal_large_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, int size) + unsigned int niov, struct iovec *iov, size_t offset, int size) { gmnal_data_t *nal_data; @@ -661,15 +693,15 @@ gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int niov_dup; - CDEBUG(D_TRACE, "gmnal_large_tx nal_cb [%p] private [%p], cookie [%p] " + CDEBUG(D_TRACE, "gmnal_large_tx libnal [%p] private [%p], cookie [%p] " "hdr [%p], type [%d] global_nid ["LPU64"], pid [%d], niov [%d], " - "iov [%p], size [%d]\n", nal_cb, private, cookie, hdr, type, + "iov [%p], size [%d]\n", libnal, private, cookie, hdr, type, global_nid, pid, niov, iov, size); - if (nal_cb) - nal_data = (gmnal_data_t*)nal_cb->nal_data; + if (libnal) + nal_data = (gmnal_data_t*)libnal->libnal_data; else { - CDEBUG(D_ERROR, "no nal_cb.\n"); + CDEBUG(D_ERROR, "no libnal.\n"); return(GMNAL_STATUS_FAIL); } @@ -712,30 +744,39 @@ gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, mlen += sizeof(ptl_hdr_t); CDEBUG(D_INFO, "mlen is [%d]\n", mlen); + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + } + + LASSERT(offset >= 0); + /* + * Store the iovs in the stxd for we can get + * them later if we need them + */ + stxd->iov[0].iov_base = iov->iov_base + offset; + stxd->iov[0].iov_len = iov->iov_len - offset; + CDEBUG(D_NET, "Copying iov [%p] to [%p], niov=%d\n", iov, stxd->iov, niov); + if (niov > 1) + gm_bcopy(&iov[1], &stxd->iov[1], (niov-1)*sizeof(struct iovec)); + stxd->niov = niov; + /* * copy the iov to the buffer so target knows * where to get the data from */ CDEBUG(D_INFO, "processing iov to [%p]\n", buffer); - gm_bcopy(iov, buffer, niov*sizeof(struct iovec)); - mlen += niov*(sizeof(struct iovec)); + gm_bcopy(stxd->iov, buffer, stxd->niov*sizeof(struct iovec)); + mlen += 
stxd->niov*(sizeof(struct iovec)); CDEBUG(D_INFO, "mlen is [%d]\n", mlen); - - - /* - * Store the iovs in the stxd for we can get - * them later if we need them - */ - CDEBUG(D_NET, "Copying iov [%p] to [%p]\n", iov, stxd->iov); - gm_bcopy(iov, stxd->iov, niov*sizeof(struct iovec)); - stxd->niov = niov; - /* * register the memory so the NIC can get hold of the data * This is a slow process. it'd be good to overlap it * with something else. */ + iov = stxd->iov; iov_dup = iov; niov_dup = niov; while(niov--) { @@ -811,11 +852,11 @@ gmnal_large_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) * data from the sender. */ int -gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, - unsigned int nriov, struct iovec *riov, size_t mlen, - size_t rlen) +gmnal_large_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, + unsigned int nriov, struct iovec *riov, size_t offset, + size_t mlen, size_t rlen) { - gmnal_data_t *nal_data = nal_cb->nal_data; + gmnal_data_t *nal_data = libnal->libnal_data; gmnal_srxd_t *srxd = (gmnal_srxd_t*)private; void *buffer = NULL; struct iovec *riov_dup; @@ -823,13 +864,13 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, gmnal_msghdr_t *msghdr = NULL; gm_status_t gm_status; - CDEBUG(D_TRACE, "gmnal_large_rx :: nal_cb[%p], private[%p], " + CDEBUG(D_TRACE, "gmnal_large_rx :: libnal[%p], private[%p], " "cookie[%p], niov[%d], iov[%p], mlen["LPSZ"], rlen["LPSZ"]\n", - nal_cb, private, cookie, nriov, riov, mlen, rlen); + libnal, private, cookie, nriov, riov, mlen, rlen); if (!srxd) { CDEBUG(D_ERROR, "gmnal_large_rx no context\n"); - lib_finalize(nal_cb, private, cookie, PTL_FAIL); + lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -854,6 +895,25 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, * If the iovecs match, could interleave * gm_registers and gm_gets for each element */ + while (offset >= riov->iov_len) { + offset -= riov->iov_len; + riov++; + 
nriov--; + } + LASSERT (nriov >= 0); + LASSERT (offset >= 0); + /* + * do this so the final gm_get callback can deregister the memory + */ + PORTAL_ALLOC(srxd->riov, nriov*(sizeof(struct iovec))); + + srxd->riov[0].iov_base = riov->iov_base + offset; + srxd->riov[0].iov_len = riov->iov_len - offset; + if (nriov > 1) + gm_bcopy(&riov[1], &srxd->riov[1], (nriov-1)*(sizeof(struct iovec))); + srxd->nriov = nriov; + + riov = srxd->riov; nriov_dup = nriov; riov_dup = riov; while(nriov--) { @@ -879,17 +939,12 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, /* * give back srxd and buffer. Send NACK to sender */ + PORTAL_FREE(srxd->riov, nriov_dup*(sizeof(struct iovec))); return(PTL_FAIL); } GMNAL_GM_UNLOCK(nal_data); riov++; } - /* - * do this so the final gm_get callback can deregister the memory - */ - PORTAL_ALLOC(srxd->riov, nriov_dup*(sizeof(struct iovec))); - gm_bcopy(riov_dup, srxd->riov, nriov_dup*(sizeof(struct iovec))); - srxd->nriov = nriov_dup; /* * now do gm_get to get the data @@ -1092,7 +1147,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, gmnal_ltxd_t *ltxd = (gmnal_ltxd_t*)context; gmnal_srxd_t *srxd = ltxd->srxd; - nal_cb_t *nal_cb = srxd->nal_data->nal_cb; + lib_nal_t *libnal = srxd->nal_data->libnal; int lastone; struct iovec *riov; int nriov; @@ -1126,7 +1181,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, * Let our client application proceed */ CDEBUG(D_ERROR, "final callback context[%p]\n", srxd); - lib_finalize(nal_cb, srxd, srxd->cookie, PTL_OK); + lib_finalize(libnal, srxd, srxd->cookie, PTL_OK); /* * send an ack to the sender to let him know we got the data @@ -1276,7 +1331,7 @@ gmnal_large_tx_ack_callback(gm_port_t *gm_port, void *context, void gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) { - nal_cb_t *nal_cb = nal_data->nal_cb; + lib_nal_t *libnal = nal_data->libnal; gmnal_stxd_t *stxd = NULL; gmnal_msghdr_t *msghdr = NULL; void *buffer = NULL; @@ -1291,7 
+1346,7 @@ gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd); - lib_finalize(nal_cb, stxd, stxd->cookie, PTL_OK); + lib_finalize(libnal, stxd, stxd->cookie, PTL_OK); /* * extract the iovec from the stxd, deregister the memory. diff --git a/lustre/portals/knals/gmnal/gmnal_module.c b/lustre/portals/knals/gmnal/gmnal_module.c index 31f6819..3aca90f 100644 --- a/lustre/portals/knals/gmnal/gmnal_module.c +++ b/lustre/portals/knals/gmnal/gmnal_module.c @@ -32,9 +32,6 @@ int num_rx_threads = -1; int num_stxds = 5; int gm_port = 4; -ptl_handle_ni_t kgmnal_ni; - - int gmnal_cmd(struct portals_cfg *pcfg, void *private) { @@ -58,9 +55,15 @@ gmnal_cmd(struct portals_cfg *pcfg, void *private) copy_from_user(name, pcfg->pcfg_pbuf1, pcfg->pcfg_plen1); GMNAL_GM_LOCK(nal_data); - nid = gm_host_name_to_node_id(nal_data->gm_port, name); + //nid = gm_host_name_to_node_id(nal_data->gm_port, name); + gm_status = gm_host_name_to_node_id_ex (nal_data->gm_port, 0, name, &nid); GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_INFO, "Local node id is [%d]\n", nid); + if (gm_status != GM_SUCCESS) { + CDEBUG(D_INFO, "gm_host_name_to_node_id_ex(...host %s) failed[%d]\n", + name, gm_status); + return (-1); + } else + CDEBUG(D_INFO, "Local node %s id is [%d]\n", name, nid); GMNAL_GM_LOCK(nal_data); gm_status = gm_node_id_to_global_id(nal_data->gm_port, nid, &gnid); @@ -90,28 +93,16 @@ gmnal_load(void) CDEBUG(D_TRACE, "This is the gmnal module initialisation routine\n"); - CDEBUG(D_INFO, "Calling gmnal_init\n"); - status = PtlNIInit(gmnal_init, 32, 4, 0, &kgmnal_ni); + status = gmnal_init(); if (status == PTL_OK) { - CDEBUG(D_INFO, "Portals GMNAL initialised ok kgmnal_ni\n"); + CDEBUG(D_INFO, "Portals GMNAL initialised ok\n"); } else { CDEBUG(D_INFO, "Portals GMNAL Failed to initialise\n"); - return(1); + return(-ENODEV); } - CDEBUG(D_INFO, "Calling kportal_nal_register\n"); - /* - * global_nal_data is set by gmnal_init - 
*/ - if (kportal_nal_register(GMNAL, &gmnal_cmd, global_nal_data) != 0) { - CDEBUG(D_INFO, "kportal_nal_register failed\n"); - return(1); - } - - CDEBUG(D_INFO, "Calling PORTAL_SYMBOL_REGISTER\n"); - PORTAL_SYMBOL_REGISTER(kgmnal_ni); CDEBUG(D_INFO, "This is the end of the gmnal init routine"); @@ -122,11 +113,7 @@ gmnal_load(void) static void __exit gmnal_unload(void) { - - kportal_nal_unregister(GMNAL); - PORTAL_SYMBOL_UNREGISTER(kgmnal_ni); gmnal_fini(); - global_nal_data = NULL; return; } @@ -135,8 +122,6 @@ module_init(gmnal_load); module_exit(gmnal_unload); -EXPORT_SYMBOL(kgmnal_ni); - MODULE_PARM(gmnal_small_msg_size, "i"); MODULE_PARM(num_rx_threads, "i"); MODULE_PARM(num_stxds, "i"); diff --git a/lustre/portals/knals/ibnal/Makefile.in b/lustre/portals/knals/ibnal/Makefile.in deleted file mode 100644 index e180b3e..0000000 --- a/lustre/portals/knals/ibnal/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kibnal -kibnal-objs := ibnal.o ibnal_cb.o - -EXTRA_PRE_CFLAGS := @IBCPPFLAGS@ - -@INCLUDE_RULES@ diff --git a/lustre/portals/knals/ibnal/autoMakefile.am b/lustre/portals/knals/ibnal/autoMakefile.am deleted file mode 100644 index ffe084c..0000000 --- a/lustre/portals/knals/ibnal/autoMakefile.am +++ /dev/null @@ -1,10 +0,0 @@ -if MODULES -if !CRAY_PORTALS -if BUILD_IBNAL -modulenet_DATA = kibnal$(KMODEXT) -endif -endif -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(kibnal-objs:%.o=%.c) ibnal.h diff --git a/lustre/portals/knals/ibnal/ibnal.c b/lustre/portals/knals/ibnal/ibnal.c deleted file mode 100644 index 948badf..0000000 --- a/lustre/portals/knals/ibnal/ibnal.c +++ /dev/null @@ -1,2146 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Based on ksocknal, qswnal, and gmnal - * - * Copyright (C) 2003 LANL - * Author: HB Chen - * Los Alamos National Lab - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU 
General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include "ibnal.h" - -// portal handle ID for this IB-NAL -ptl_handle_ni_t kibnal_ni; - -// message send buffer mutex -spinlock_t MSBuf_mutex[NUM_MBUF]; - -// message recv buffer mutex -spinlock_t MRBuf_mutex[NUM_MBUF]; - -// IB-NAL API information -nal_t kibnal_api; - -// nal's private data -kibnal_data_t kibnal_data; - -int ibnal_debug = 0; -VAPI_pd_hndl_t Pd_hndl; -unsigned int Num_posted_recv_buf; - -// registered send buffer list -Memory_buffer_info MSbuf_list[NUM_MBUF]; - -// registered recv buffer list -Memory_buffer_info MRbuf_list[NUM_MBUF]; - -// -// for router -// currently there is no need fo IBA -// -kpr_nal_interface_t kibnal_router_interface = { - kprni_nalid: IBNAL, - kprni_arg: &kibnal_data, - kprni_fwd: kibnal_fwd_packet, // forward data to router - // is router invloving the - // data transmision -}; - - -// Queue-pair list -QP_info QP_list[NUM_QPS]; - -// information associated with a HCA -HCA_info Hca_data; - -// something about HCA -VAPI_hca_hndl_t Hca_hndl; // assume we only use one HCA now -VAPI_hca_vendor_t Hca_vendor; -VAPI_hca_cap_t Hca_cap; -VAPI_hca_port_t Hca_port_1_props; -VAPI_hca_port_t Hca_port_2_props; -VAPI_hca_attr_t Hca_attr; -VAPI_hca_attr_mask_t Hca_attr_mask; -VAPI_cq_hndl_t Cq_RQ_hndl; // CQ's handle -VAPI_cq_hndl_t Cq_SQ_hndl; // CQ's handle -VAPI_cq_hndl_t Cq_hndl; // CQ's handle -Remote_QP_Info L_QP_data; -Remote_QP_Info R_QP_data; - - -// -// forward API -// -int -kibnal_forward(nal_t *nal, - int id, - 
void *args, - size_t args_len, - void *ret, - size_t ret_len) -{ - kibnal_data_t *knal_data = nal->nal_data; - nal_cb_t *nal_cb = knal_data->kib_cb; - - // ASSERT checking - LASSERT (nal == &kibnal_api); - LASSERT (knal_data == &kibnal_data); - LASSERT (nal_cb == &kibnal_lib); - - // dispatch forward API function - - CDEBUG(D_NET,"kibnal_forward: function id = %d\n", id); - - lib_dispatch(nal_cb, knal_data, id, args, ret); - - CDEBUG(D_TRACE,"IBNAL- Done kibnal_forward\n"); - - return PTL_OK; // always return PTL_OK -} - -// -// lock API -// -void -kibnal_lock(nal_t *nal, unsigned long *flags) -{ - kibnal_data_t *knal_data = nal->nal_data; - nal_cb_t *nal_cb = knal_data->kib_cb; - - // ASSERT checking - LASSERT (nal == &kibnal_api); - LASSERT (knal_data == &kibnal_data); - LASSERT (nal_cb == &kibnal_lib); - - // disable logical interrrupt - nal_cb->cb_cli(nal_cb,flags); - - CDEBUG(D_TRACE,"IBNAL-Done kibnal_lock\n"); - -} - -// -// unlock API -// -void -kibnal_unlock(nal_t *nal, unsigned long *flags) -{ - kibnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kib_cb; - - // ASSERT checking - LASSERT (nal == &kibnal_api); - LASSERT (k == &kibnal_data); - LASSERT (nal_cb == &kibnal_lib); - - // enable logical interrupt - nal_cb->cb_sti(nal_cb,flags); - - CDEBUG(D_TRACE,"IBNAL-Done kibnal_unlock"); - -} - -// -// shutdown API -// showdown this network interface -// -int -kibnal_shutdown(nal_t *nal, int ni) -{ - VAPI_ret_t vstat; - kibnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kib_cb; - - // assert checking - LASSERT (nal == &kibnal_api); - LASSERT (k == &kibnal_data); - LASSERT (nal_cb == &kibnal_lib); - - // take down this IB network interface - // there is not corresponding cb function to hande this - // do we actually need this one - // reference to IB network interface shutdown - // - - vstat = IB_Close_HCA(); - - if (vstat != VAPI_OK) { - CERROR("Failed to close HCA - %s\n",VAPI_strerror(vstat)); - return (~PTL_OK); - } - - CDEBUG(D_TRACE,"IBNAL- 
Done kibnal_shutdown\n"); - - return PTL_OK; -} - -// -// yield -// when do we call this yield function -// -void -kibnal_yield( nal_t *nal ) -{ - kibnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kib_cb; - - // assert checking - LASSERT (nal == &kibnal_api); - LASSERT (k == &kibnal_data); - LASSERT (nal_cb == &kibnal_lib); - - // check under what condition that we need to - // call schedule() - // who set this need_resched - if (current->need_resched) - schedule(); - - CDEBUG(D_TRACE,"IBNAL-Done kibnal_yield"); - - return; -} - -// -// ibnal init -// -nal_t * -kibnal_init(int interface, // no use here - ptl_pt_index_t ptl_size, - ptl_ac_index_t ac_size, - ptl_pid_t requested_pid // no use here - ) -{ - nal_t *nal = NULL; - nal_cb_t *nal_cb = NULL; - kibnal_data_t *nal_data = NULL; - int rc; - - unsigned int nnids = 1; // number of nids - // do we know how many nodes are in this - // system related to this kib_nid - // - - CDEBUG(D_NET, "kibnal_init:calling lib_init with nid 0x%u\n", - kibnal_data.kib_nid); - - - CDEBUG(D_NET, "kibnal_init: interface [%d], ptl_size [%d], ac_size[%d]\n", - interface, ptl_size, ac_size); - CDEBUG(D_NET, "kibnal_init: &kibnal_lib 0x%X\n", &kibnal_lib); - CDEBUG(D_NET, "kibnal_init: kibnal_data.kib_nid %d\n", kibnal_data.kib_nid); - - rc = lib_init(&kibnal_lib, - kibnal_data.kib_nid, - 0, // process id is set as 0 - nnids, - ptl_size, - ac_size); - - if(rc != PTL_OK) { - CERROR("kibnal_init: Failed lib_init with nid 0x%u, rc=%d\n", - kibnal_data.kib_nid,rc); - } - else { - CDEBUG(D_NET,"kibnal_init: DONE lib_init with nid 0x%x%x\n", - kibnal_data.kib_nid); - } - - return &kibnal_api; - -} - - -// -// called before remove ibnal kernel module -// -void __exit -kibnal_finalize(void) -{ - struct list_head *tmp; - - inter_module_unregister("kibnal_ni"); - - // release resources allocated to this Infiniband network interface - PtlNIFini(kibnal_ni); - - lib_fini(&kibnal_lib); - - IB_Close_HCA(); - - // how much do we need to do here? 
- list_for_each(tmp, &kibnal_data.kib_list) { - kibnal_rx_t *conn; - conn = list_entry(tmp, kibnal_rx_t, krx_item); - CDEBUG(D_IOCTL, "freeing conn %p\n",conn); - tmp = tmp->next; - list_del(&conn->krx_item); - PORTAL_FREE(conn, sizeof(*conn)); - } - - CDEBUG(D_MALLOC,"done kmem %d\n",atomic_read(&portal_kmemory)); - CDEBUG(D_TRACE,"IBNAL-Done kibnal_finalize\n"); - - return; -} - - -// -// * k_server_thread is a kernel thread -// use a shared memory ro exchange HCA's data with a pthread in user -// address space -// * will be replaced when CM is used to handle communication management -// - -void k_server_thread(Remote_QP_Info *hca_data) -{ - int segment_id; - const int shared_segment_size = sizeof(Remote_QP_Info); - key_t key = HCA_EXCHANGE_SHM_KEY; - unsigned long raddr; - int exchanged_done = NO; - int i; - - Remote_QP_Info *exchange_hca_data; - - long *n; - long *uaddr; - long ret = 0; - - // create a shared memory with pre-agreement key - segment_id = sys_shmget(key, - shared_segment_size, - IPC_CREAT | 0666); - - - // attached to shared memoru - // raddr is pointed to an user address space - // use this address to update shared menory content - ret = sys_shmat(segment_id, 0 , SHM_RND, &raddr); - -#ifdef IBNAL_DEBUG - if(ret >= 0) { - CDEBUG(D_NET,"k_server_thread: Shared memory attach success ret = 0X%d,&raddr" - " 0X%x (*(&raddr))=0x%x \n", ret, &raddr, (*(&raddr))); - printk("k_server_thread: Shared memory attach success ret = 0X%d, &raddr" - " 0X%x (*(&raddr))=0x%x \n", ret, &raddr, (*(&raddr))); - } - else { - CERROR("k_server_thread: Shared memory attach failed ret = 0x%d \n", ret); - printk("k_server_thread: Shared memory attach failed ret = 0x%d \n", ret); - return; - } -#endif - - n = &raddr; - uaddr = *n; // get the U-address - /* cast uaddr to exchange_hca_data */ - exchange_hca_data = (Remote_QP_Info *) uaddr; - - /* copy data from local HCA to shared memory */ - exchange_hca_data->opcode = hca_data->opcode; - exchange_hca_data->length = 
hca_data->length; - - for(i=0; i < NUM_QPS; i++) { - exchange_hca_data->dlid[i] = hca_data->dlid[i]; - exchange_hca_data->rqp_num[i] = hca_data->rqp_num[i]; - } - - // periodically check shared memory until get updated - // remote HCA's data from user mode pthread - while(exchanged_done == NO) { - if(exchange_hca_data->opcode == RECV_QP_INFO){ - exchanged_done = YES; - /* copy data to local buffer from shared memory */ - hca_data->opcode = exchange_hca_data->opcode; - hca_data->length = exchange_hca_data->length; - - for(i=0; i < NUM_QPS; i++) { - hca_data->dlid[i] = exchange_hca_data->dlid[i]; - hca_data->rqp_num[i] = exchange_hca_data->rqp_num[i]; - } - break; - } - else { - schedule_timeout(1000); - } - } - - // detached shared memory - sys_shmdt(uaddr); - - CDEBUG(D_NET, "Exit from kernel thread: k_server_thread \n"); - printk("Exit from kernel thread: k_server_thread \n"); - - return; - -} - -// -// create QP -// -VAPI_ret_t -create_qp(QP_info *qp, int qp_index) -{ - - VAPI_ret_t vstat; - VAPI_qp_init_attr_t qp_init_attr; - VAPI_qp_prop_t qp_prop; - - qp->hca_hndl = Hca_hndl; - qp->port = 1; // default - qp->slid = Hca_port_1_props.lid; - qp->hca_port = Hca_port_1_props; - - - /* Queue Pair Creation Attributes */ - qp_init_attr.cap.max_oust_wr_rq = NUM_WQE; - qp_init_attr.cap.max_oust_wr_sq = NUM_WQE; - qp_init_attr.cap.max_sg_size_rq = NUM_SG; - qp_init_attr.cap.max_sg_size_sq = NUM_SG; - qp_init_attr.pd_hndl = qp->pd_hndl; - qp_init_attr.rdd_hndl = 0; - qp_init_attr.rq_cq_hndl = qp->rq_cq_hndl; - /* we use here polling */ - //qp_init_attr.rq_sig_type = VAPI_SIGNAL_REQ_WR; - qp_init_attr.rq_sig_type = VAPI_SIGNAL_ALL_WR; - qp_init_attr.sq_cq_hndl = qp->sq_cq_hndl; - /* we use here polling */ - //qp_init_attr.sq_sig_type = VAPI_SIGNAL_REQ_WR; - qp_init_attr.sq_sig_type = VAPI_SIGNAL_ALL_WR; - // transport servce - reliable connection - - qp_init_attr.ts_type = VAPI_TS_RC; - - vstat = VAPI_create_qp(qp->hca_hndl, - &qp_init_attr, - &qp->qp_hndl, &qp_prop); - - 
if (vstat != VAPI_OK) { - CERROR("Failed creating QP. Return Failed - %s\n",VAPI_strerror(vstat)); - return vstat; - } - - qp->qp_num = qp_prop.qp_num; // the qp number - qp->last_posted_send_id = 0; // user defined work request ID - qp->last_posted_rcv_id = 0; // user defined work request ID - qp->cur_send_outstanding = 0; - qp->cur_posted_rcv_bufs = 0; - qp->snd_rcv_balance = 0; - - CDEBUG(D_OTHER, "create_qp: qp_num = %d, slid = %d, qp_hndl = 0X%X", - qp->qp_num, qp->slid, qp->qp_hndl); - - // initialize spin-lock mutex variables - spin_lock_init(&(qp->snd_mutex)); - spin_lock_init(&(qp->rcv_mutex)); - spin_lock_init(&(qp->bl_mutex)); - spin_lock_init(&(qp->cln_mutex)); - // number of outstanding requests on the send Q - qp->cur_send_outstanding = 0; - // number of posted receive buffers - qp->cur_posted_rcv_bufs = 0; - qp->snd_rcv_balance = 0; - - return(VAPI_OK); - -} - -// -// initialize a UD qp state to RTR and RTS -// -VAPI_ret_t -init_qp_UD(QP_info *qp, int qp_index) -{ - VAPI_qp_attr_t qp_attr; - VAPI_qp_init_attr_t qp_init_attr; - VAPI_qp_attr_mask_t qp_attr_mask; - VAPI_qp_cap_t qp_cap; - VAPI_ret_t vstat; - - /* Move from RST to INIT */ - /* Change QP to INIT */ - - CDEBUG(D_OTHER, "Changing QP state to INIT qp-index = %d\n", qp_index); - - QP_ATTR_MASK_CLR_ALL(qp_attr_mask); - - qp_attr.qp_state = VAPI_INIT; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE); - - CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.pkey_ix = 0; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX); - - CDEBUG(D_OTHER, "pkey_ix qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.port = qp->port; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PORT); - - CDEBUG(D_OTHER, "port qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.qkey = 0; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QKEY); - - CDEBUG(D_OTHER, "qkey qp_attr_mask = 0X%x\n", qp_attr_mask); - - /* If I do not set this mask, I get an error from HH. 
QPM should catch it */ - - vstat = VAPI_modify_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_cap); - - if (vstat != VAPI_OK) { - CERROR("Failed modifying QP from RST to INIT. %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - CDEBUG(D_OTHER, "Modifying QP from RST to INIT.\n"); - - vstat= VAPI_query_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_init_attr); - - if (vstat != VAPI_OK) { - CERROR("Failed query QP. %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - /* Move from INIT to RTR */ - /* Change QP to RTR */ - CDEBUG(D_OTHER, "Changing QP state to RTR\n"); - - QP_ATTR_MASK_CLR_ALL(qp_attr_mask); - - qp_attr.qp_state = VAPI_RTR; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE); - - CDEBUG(D_OTHER, "INIT to RTR- qp_state : qp_attr_mask = 0X%x\n", qp_attr_mask); - - vstat = VAPI_modify_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_cap); - - if (vstat != VAPI_OK) { - CERROR("Failed modifying QP from INIT to RTR. %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - CDEBUG(D_OTHER, "Modifying QP from INIT to RTR.\n"); - - vstat= VAPI_query_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_init_attr); - - if (vstat != VAPI_OK) { - CERROR("Failed query QP. %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - /* RTR to RTS - Change QP to RTS */ - CDEBUG(D_OTHER, "Changing QP state to RTS\n"); - - QP_ATTR_MASK_CLR_ALL(qp_attr_mask); - - qp_attr.qp_state = VAPI_RTS; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE); - - qp_attr.sq_psn = START_SQ_PSN; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_SQ_PSN); - - vstat = VAPI_modify_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_cap); - - if (vstat != VAPI_OK) { - CERROR("Failed modifying QP from RTR to RTS. %s:%s\n", - VAPI_strerror_sym(vstat), - VAPI_strerror(vstat)); - return(vstat); - } - - CDEBUG(D_OTHER, "Modifying QP from RTR to RTS. 
\n"); - - vstat= VAPI_query_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_init_attr); - - if (vstat != VAPI_OK) { - CERROR("Failed query QP. %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - // - // a QP is at RTS state NOW - // - - CDEBUG(D_OTHER, "IBNAL- UD qp is at RTS NOW\n"); - - return(vstat); - -} - - - -// -// initialize a RC qp state to RTR and RTS -// RC transport service -// -VAPI_ret_t -init_qp_RC(QP_info *qp, int qp_index) -{ - VAPI_qp_attr_t qp_attr; - VAPI_qp_init_attr_t qp_init_attr; - VAPI_qp_attr_mask_t qp_attr_mask; - VAPI_qp_cap_t qp_cap; - VAPI_ret_t vstat; - - /* Move from RST to INIT */ - /* Change QP to INIT */ - - CDEBUG(D_OTHER, "Changing QP state to INIT qp-index = %d\n", qp_index); - - QP_ATTR_MASK_CLR_ALL(qp_attr_mask); - - qp_attr.qp_state = VAPI_INIT; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE); - - CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.pkey_ix = 0; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX); - - CDEBUG(D_OTHER, "pkey_ix qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.port = qp->port; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PORT); - - CDEBUG(D_OTHER, "port qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.remote_atomic_flags = VAPI_EN_REM_WRITE | VAPI_EN_REM_READ; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_REMOTE_ATOMIC_FLAGS); - - CDEBUG(D_OTHER, "remote_atomic_flags qp_attr_mask = 0X%x\n", qp_attr_mask); - - /* If I do not set this mask, I get an error from HH. QPM should catch it */ - - vstat = VAPI_modify_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_cap); - - if (vstat != VAPI_OK) { - CERROR("Failed modifying QP from RST to INIT. %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - vstat= VAPI_query_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_init_attr); - - if (vstat != VAPI_OK) { - CERROR("Failed query QP. 
%s\n",VAPI_strerror(vstat)); - return(vstat); - } - - /* Move from INIT to RTR */ - /* Change QP to RTR */ - CDEBUG(D_OTHER, "Changing QP state to RTR qp_indexi %d\n", qp_index); - - QP_ATTR_MASK_CLR_ALL(qp_attr_mask); - qp_attr.qp_state = VAPI_RTR; - - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE); - - CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.av.sl = 0;/* RESPONDER_SL */ - qp_attr.av.grh_flag = FALSE; - qp_attr.av.dlid = qp->dlid;/*RESPONDER_LID;*/ - qp_attr.av.static_rate = 0; - qp_attr.av.src_path_bits = 0; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_AV); - - CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.path_mtu = MTU_2048;// default is MTU_2048 - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PATH_MTU); - - CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.rq_psn = START_RQ_PSN; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RQ_PSN); - - CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.qp_ous_rd_atom = NUM_WQE; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_OUS_RD_ATOM); - - CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.pkey_ix = 0; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX); - - CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.min_rnr_timer = 10; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_MIN_RNR_TIMER); - - CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask); - - qp_attr.dest_qp_num = qp->rqp_num; - - CDEBUG(D_OTHER, "remore qp num %d\n", qp->rqp_num); - - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_DEST_QP_NUM); - - CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask); - - vstat = VAPI_modify_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_cap); - - - if (vstat != VAPI_OK) { - CERROR("Failed modifying QP from INIT to RTR. 
qp_index %d - %s\n", - qp_index, VAPI_strerror(vstat)); - return(vstat); - } - - vstat= VAPI_query_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_init_attr); - - if (vstat != VAPI_OK) { - CERROR("Failed query QP. %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - /* RTR to RTS - Change QP to RTS */ - CDEBUG(D_OTHER, "Changing QP state to RTS\n"); - - QP_ATTR_MASK_CLR_ALL(qp_attr_mask); - - qp_attr.qp_state = VAPI_RTS; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE); - - qp_attr.sq_psn = START_SQ_PSN; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_SQ_PSN); - - qp_attr.timeout = 0x18; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_TIMEOUT); - - qp_attr.retry_count = 10; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RETRY_COUNT); - - qp_attr.rnr_retry = 14; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RNR_RETRY); - - qp_attr.ous_dst_rd_atom = 100; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_OUS_DST_RD_ATOM); - - qp_attr.min_rnr_timer = 5; - QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_MIN_RNR_TIMER); - - vstat = VAPI_modify_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_cap); - - if (vstat != VAPI_OK) { - CERROR("Failed modifying QP from RTR to RTS. %s:%s\n", - VAPI_strerror_sym(vstat), VAPI_strerror(vstat)); - return(vstat); - } - - vstat= VAPI_query_qp(qp->hca_hndl, - qp->qp_hndl, - &qp_attr, - &qp_attr_mask, - &qp_init_attr); - - if (vstat != VAPI_OK) { - CERROR("Failed query QP. %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - // - // a QP is at RTS state NOW - // - - CDEBUG(D_OTHER, "IBNAL- RC qp is at RTS NOW\n"); - - return(vstat); -} - - - -VAPI_ret_t -IB_Open_HCA(kibnal_data_t *kib_data) -{ - - VAPI_ret_t vstat; - VAPI_cqe_num_t cqe_active_num; - QP_info *qp; - int i; - int Num_posted_recv_buf; - - /* Open HCA */ - CDEBUG(D_PORTALS, "Opening an HCA\n"); - - vstat = VAPI_open_hca(HCA_ID, &Hca_hndl); - vstat = EVAPI_get_hca_hndl(HCA_ID, &Hca_hndl); - if (vstat != VAPI_OK) { - CERROR("Failed opening the HCA: %s. 
%s...\n",HCA_ID,VAPI_strerror(vstat)); - return(vstat); - } - - /* Get HCA CAP */ - vstat = VAPI_query_hca_cap(Hca_hndl, &Hca_vendor, &Hca_cap); - if (vstat != VAPI_OK) { - CERROR("Failed query hca cap %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - /* Get port 1 info */ - vstat = VAPI_query_hca_port_prop(Hca_hndl, HCA_PORT_1 , &Hca_port_1_props); - if (vstat != VAPI_OK) { - CERROR("Failed query port cap %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - /* Get port 2 info */ - vstat = VAPI_query_hca_port_prop(Hca_hndl, HCA_PORT_2, &Hca_port_2_props); - if (vstat != VAPI_OK) { - CERROR("Failed query port cap %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - // Get a PD - CDEBUG(D_PORTALS, "Allocating PD \n"); - vstat = VAPI_alloc_pd(Hca_hndl,&Pd_hndl); - if (vstat != VAPI_OK) { - CERROR("Failed allocating a PD. %s\n",VAPI_strerror(vstat)); - return(vstat); - } - - vstat = createMemRegion(Hca_hndl, Pd_hndl); - if (vstat != VAPI_OK) { - CERROR("Failed registering a memory region.%s\n",VAPI_strerror(vstat)); - return(vstat); - } - - /* Create CQ for RQ*/ - CDEBUG(D_PORTALS, "Creating a send completion queue\n"); - - vstat = VAPI_create_cq(Hca_hndl, - NUM_CQE, - &Cq_hndl, - &cqe_active_num); - - if (vstat != VAPI_OK) { - CERROR("Failed creating a CQ. 
%s\n",VAPI_strerror(vstat)); - return(vstat); - } - - if(NUM_CQE == cqe_active_num) { - CERROR("VAPI_create_cq: NUM_CQE EQ cqe_active_num \n"); - } - else { - CDEBUG(D_NET, "VAPI_create_cq: NUM_CQE %d , actual cqe_active_num %d \n", - NUM_CQE, cqe_active_num); - } - - Cq_SQ_hndl = Cq_hndl; - Cq_RQ_hndl = Cq_hndl; - - // - // create QPs - // - for(i=0; i < NUM_QPS; i++) { - QP_list[i].pd_hndl = Pd_hndl; - QP_list[i].hca_hndl = Hca_hndl; - // sq rq use the same Cq_hndl - QP_list[i].sq_cq_hndl = Cq_hndl; - QP_list[i].rq_cq_hndl = Cq_hndl; - vstat = create_qp(&QP_list[i], i); - if (vstat != VAPI_OK) { - CERROR("Failed creating a QP %d %s\n",i, VAPI_strerror(vstat)); - return(vstat); - } - } - - // - // record HCA data - // - - Hca_data.hca_hndl = Hca_hndl; // HCA handle - Hca_data.pd_hndl = Pd_hndl; // protection domain - Hca_data.port = 1; // port number - Hca_data.num_qp = NUM_QPS; // number of qp used - - for(i=0; i < NUM_QPS; i++) { - Hca_data.qp_ptr[i] = &QP_list[i]; // point to QP_list - } - - Hca_data.num_cq = NUM_CQ; // number of cq used - Hca_data.cq_hndl = Cq_hndl; // - Hca_data.sq_cq_hndl = Cq_SQ_hndl; // - Hca_data.rq_cq_hndl = Cq_RQ_hndl; // - Hca_data.kib_data = kib_data; // - Hca_data.slid = QP_list[0].slid;// - - // prepare L_QP_data - -#ifdef USE_SHARED_MEMORY_AND_SOCKET - - /* - * + use a shared-memory between a user thread and a kernel thread - * for HCA's data exchange on the same node - * + use socket in user mode to exhange HCA's data with a remote node - */ - - - R_QP_data.opcode = SEND_QP_INFO; - R_QP_data.length = sizeof(L_QP_data); - - for(i=0; i < NUM_QPS; i++) { - // my slid will be used in a remote node as dlid - R_QP_data.dlid[i] = QP_list[i].slid; - // my qp_num will be used in remode node as remote_qp_number - // RC is used here so we need dlid and rqp_num - R_QP_data.rqp_num[i] = QP_list[i].qp_num ; - } - - // create a kernel thread for exchanging HCA's data - // R_QP_data will be exchanged with a remoe node - - 
kernel_thread(k_server_thread, &R_QP_data, 0); // - // check if the HCA'data have been updated by kernel_thread - // loop until the HCA's data is updated - // make sure that uagent is running - - // QP info is exchanged with a remote node - while (1) { - schedule_timeout(1000); - if(R_QP_data.opcode == RECV_QP_INFO) { - CDEBUG(D_NET, "HCA's data is being updated\n"); - break; - } - } - -#endif - -#ifdef USE_SHARED_MEMORY_AND_MULTICAST - - /* - * + use a shared-memory between a user thread and a kernel thread - * for HCA's data exchange on the same node - * + use Infinoband UR/multicast in user mode to exhange HCA's data with i - * a remote node - */ - - // use CM, opemSM - -#endif - - // - for(i=0; i < NUM_QPS; i++) { - qp = (QP_info *) &QP_list[i]; - QP_list[i].rqp_num = R_QP_data.rqp_num[i]; // remoter qp number - QP_list[i].dlid = R_QP_data.dlid[i]; // remote dlid - } - - // already have remote_qp_num adn dlid information - // initialize QP to RTR/RTS state - // - for(i=0; i < NUM_QPS; i++) { - vstat = init_qp_RC(&QP_list[i], i); - if (vstat != VAPI_OK) { - CERROR("Failed change a QP %d to RTS state%s\n", - i,VAPI_strerror(vstat)); - return(vstat); - } - } - - // post receiving buffer before any send happened - - Num_posted_recv_buf = post_recv_bufs( (VAPI_wr_id_t ) START_RECV_WRQ_ID); - - // for irregular completion event or some unexpected failure event - vstat = IB_Set_Async_Event_Handler(Hca_data, &kibnal_data); - if (vstat != VAPI_OK) { - CERROR("IB_Set_Async_Event_Handler failed: %d\n", vstat); - return vstat; - } - - - CDEBUG(D_PORTALS, "IBNAL- done with IB_Open_HCA\n"); - - for(i=0; i < NUM_MBUF; i++) { - spin_lock_init(&MSB_mutex[i]); - } - - return(VAPI_OK); - -} - - -/* - Function: IB_Set_Event_Handler() - - IN Hca_info hca_data - IN kibnal_data_t *kib_data -- private data - OUT NONE - - return: VAPI_OK - success - else - fail - -*/ - -VAPI_ret_t -IB_Set_Event_Handler(HCA_info hca_data, kibnal_data_t *kib_data) -{ - VAPI_ret_t vstat; - 
EVAPI_compl_handler_hndl_t comp_handler_hndl; - - // register CQE_Event_Hnadler - // VAPI function - vstat = VAPI_set_comp_event_handler(hca_data.hca_hndl, - CQE_event_handler, - &hca_data); - - /* - or use extended VAPI function - vstat = EVAPI_set_comp_eventh(hca_data.hca_hndl, - hca_data.cq_hndl, - CQE_event_handler, - &hca_data, - &comp_handler_hndl - ); - */ - - if (vstat != VAPI_OK) { - CERROR("IB_Set_Event_Handler: failed EVAPI_set_comp_eventh for" - " HCA ID = %s (%s).\n", HCA_ID, VAPI_strerror(vstat)); - return vstat; - } - - // issue a request for completion ievent notification - vstat = VAPI_req_comp_notif(hca_data.hca_hndl, - hca_data.cq_hndl, - VAPI_NEXT_COMP); - - if (vstat != VAPI_OK) { - CERROR("IB_Set_Event_Handler: failed VAPI_req_comp_notif for HCA ID" - " = %s (%s).\n", HCA_ID, VAPI_strerror(vstat)); - } - - return vstat; -} - - - -/* - Function: IB_Set_Async_Event_Handler() - - IN HCA_info hca_data - IN kibnal_data_t *kib_data -- private data - OUT NONE - - return: VAPI_OK - success - else - fail - -*/ - - -VAPI_ret_t -IB_Set_Async_Event_Handler(HCA_info hca_data, kibnal_data_t *kib_data) -{ - VAPI_ret_t vstat; - - // - // register an asynchronous event handler for this HCA - // - - vstat= VAPI_set_async_event_handler(hca_data.hca_hndl, - async_event_handler, - kib_data); - - if (vstat != VAPI_OK) { - CERROR("IB_Set_Async_Event_Handler: failed VAPI_set_async_comp_event_handler" - " for HCA ID = %s (%s).\n", HCA_ID, VAPI_strerror(vstat)); - } - - return vstat; -} - -// -// IB_Close_HCA -// close this Infiniband HCA interface -// release allocated resources to system -// -VAPI_ret_t -IB_Close_HCA(void ) -{ - - VAPI_ret_t vstat; - int ok = 1; - int i; - - /* Destroy QP */ - CDEBUG(D_PORTALS, "Destroying QP\n"); - - for(i=0; i < NUM_QPS; i++) { - vstat = VAPI_destroy_qp(QP_list[i].hca_hndl, QP_list[i].qp_hndl); - if (vstat != VAPI_OK) { - CERROR("Failed destroying QP %d. 
%s\n", i, VAPI_strerror(vstat)); - ok = 0; - } - } - - if (ok) { - /* Destroy CQ */ - CDEBUG(D_PORTALS, "Destroying CQ\n"); - for(i=0; i < NUM_QPS; i++) { - // send_cq adn receive_cq are shared the same CQ - // so only destroy one of them - vstat = VAPI_destroy_cq(QP_list[i].hca_hndl, QP_list[i].sq_cq_hndl); - if (vstat != VAPI_OK) { - CERROR("Failed destroying CQ %d. %s\n", i, VAPI_strerror(vstat)); - ok = 0; - } - } - } - - if (ok) { - /* Destroy Memory Region */ - CDEBUG(D_PORTALS, "Deregistering MR\n"); - for(i=0; i < NUM_QPS; i++) { - vstat = deleteMemRegion(&QP_list[i], i); - if (vstat != VAPI_OK) { - CERROR("Failed deregister mem reg %d. %s\n",i, VAPI_strerror(vstat)); - ok = 0; - break; - } - } - } - - if (ok) { - // finally - /* Close HCA */ - CDEBUG(D_PORTALS, "Closing HCA\n"); - vstat = VAPI_close_hca(Hca_hndl); - if (vstat != VAPI_OK) { - CERROR("Failed to close HCA. %s\n", VAPI_strerror(vstat)); - ok = 0; - } - } - - CDEBUG(D_PORTALS, "IBNAL- Done with closing HCA \n"); - - return vstat; -} - - -VAPI_ret_t -createMemRegion(VAPI_hca_hndl_t hca_hndl, - VAPI_pd_hndl_t pd_hndl) -{ - VAPI_ret_t vstat; - VAPI_mrw_t mrw; - VAPI_mrw_t rep_mr; - VAPI_mr_hndl_t rep_mr_hndl; - int buf_size; - char *bufptr; - int i; - - // send registered memory region - for(i=0; i < NUM_ENTRY; i++) { - MSbuf_list[i].buf_size = KB_32; - PORTAL_ALLOC(bufptr, MSbuf_list[i].buf_size); - if(bufptr == NULL) { - CDEBUG(D_MALLOC,"Failed to malloc a block of send memory, qix %d size %d\n", - i, MSbuf_list[i].buf_size); - CERROR("Failed to malloc a block of send memory, qix %d size %d\n", - i, MSbuf_list[i].buf_size); - return(VAPI_ENOMEM); - } - - mrw.type = VAPI_MR; - mrw.pd_hndl= pd_hndl; - mrw.start = MSbuf_list[i].buf_addr = (VAPI_virt_addr_t)(MT_virt_addr_t) bufptr; - mrw.size = MSbuf_list[i].buf_size; - mrw.acl = VAPI_EN_LOCAL_WRITE | - VAPI_EN_REMOTE_WRITE | - VAPI_EN_REMOTE_READ; - - // register send memory region - vstat = VAPI_register_mr(hca_hndl, - &mrw, - &rep_mr_hndl, - 
&rep_mr); - - // this memory region is going to be reused until deregister is called - if(vstat != VAPI_OK) { - CERROR("Failed registering a mem region qix %d Addr=%p, Len=%d. %s\n", - i, mrw.start, mrw.size, VAPI_strerror(vstat)); - return(vstat); - } - - MSbuf_list[i].mr = rep_mr; - MSbuf_list[i].mr_hndl = rep_mr_hndl; - MSbuf_list[i].bufptr = bufptr; - MSbuf_list[i].buf_addr = rep_mr.start; - MSbuf_list[i].status = BUF_REGISTERED; - MSbuf_list[i].ref_count = 0; - MSbuf_list[i].buf_type = REG_BUF; - MSbuf_list[i].raddr = 0x0; - MSbuf_list[i].rkey = 0x0; - } - - // RDAM buffer is not reserved for RDAM WRITE/READ - - for(i=NUM_ENTRY; i< NUM_MBUF; i++) { - MSbuf_list[i].status = BUF_UNREGISTERED; - MSbuf_list[i].buf_type = RDMA_BUF; - } - - - // recv registered memory region - for(i=0; i < NUM_ENTRY; i++) { - MRbuf_list[i].buf_size = KB_32; - PORTAL_ALLOC(bufptr, MRbuf_list[i].buf_size); - - if(bufptr == NULL) { - CDEBUG(D_MALLOC, "Failed to malloc a block of send memory, qix %d size %d\n", - i, MRbuf_list[i].buf_size); - return(VAPI_ENOMEM); - } - - mrw.type = VAPI_MR; - mrw.pd_hndl= pd_hndl; - mrw.start = (VAPI_virt_addr_t)(MT_virt_addr_t) bufptr; - mrw.size = MRbuf_list[i].buf_size; - mrw.acl = VAPI_EN_LOCAL_WRITE | - VAPI_EN_REMOTE_WRITE | - VAPI_EN_REMOTE_READ; - - // register send memory region - vstat = VAPI_register_mr(hca_hndl, - &mrw, - &rep_mr_hndl, - &rep_mr); - - // this memory region is going to be reused until deregister is called - if(vstat != VAPI_OK) { - CERROR("Failed registering a mem region qix %d Addr=%p, Len=%d. 
%s\n", - i, mrw.start, mrw.size, VAPI_strerror(vstat)); - return(vstat); - } - - MRbuf_list[i].mr = rep_mr; - MRbuf_list[i].mr_hndl = rep_mr_hndl; - MRbuf_list[i].bufptr = bufptr; - MRbuf_list[i].buf_addr = rep_mr.start; - MRbuf_list[i].status = BUF_REGISTERED; - MRbuf_list[i].ref_count = 0; - MRbuf_list[i].buf_type = REG_BUF; - MRbuf_list[i].raddr = 0x0; - MRbuf_list[i].rkey = rep_mr.r_key; - MRbuf_list[i].lkey = rep_mr.l_key; - - } - - // keep extra information for a qp - for(i=0; i < NUM_QPS; i++) { - QP_list[i].mr_hndl = MSbuf_list[i].mr_hndl; - QP_list[i].mr = MSbuf_list[i].mr; - QP_list[i].bufptr = MSbuf_list[i].bufptr; - QP_list[i].buf_addr = MSbuf_list[i].buf_addr; - QP_list[i].buf_size = MSbuf_list[i].buf_size; - QP_list[i].raddr = MSbuf_list[i].raddr; - QP_list[i].rkey = MSbuf_list[i].rkey; - QP_list[i].lkey = MSbuf_list[i].lkey; - } - - CDEBUG(D_PORTALS, "IBNAL- done VAPI_ret_t createMemRegion \n"); - - return vstat; - -} /* createMemRegion */ - - - -VAPI_ret_t -deleteMemRegion(QP_info *qp, int qix) -{ - VAPI_ret_t vstat; - - // - // free send memory assocaited with this memory region - // - PORTAL_FREE(MSbuf_list[qix].bufptr, MSbuf_list[qix].buf_size); - - // de-register it - vstat = VAPI_deregister_mr(qp->hca_hndl, MSbuf_list[qix].mr_hndl); - - if(vstat != VAPI_OK) { - CERROR("Failed deregistering a send mem region qix %d %s\n", - qix, VAPI_strerror(vstat)); - return vstat; - } - - // - // free recv memory assocaited with this memory region - // - PORTAL_FREE(MRbuf_list[qix].bufptr, MRbuf_list[qix].buf_size); - - // de-register it - vstat = VAPI_deregister_mr(qp->hca_hndl, MRbuf_list[qix].mr_hndl); - - if(vstat != VAPI_OK) { - CERROR("Failed deregistering a recv mem region qix %d %s\n", - qix, VAPI_strerror(vstat)); - return vstat; - } - - return vstat; -} - - -// -// polling based event handling -// + a daemon process -// + poll the CQ and check what is in the CQ -// + process incoming CQ event -// + -// - - -RDMA_Info_Exchange Rdma_info; -int 
Cts_Message_arrived = NO; - -void k_recv_thread(HCA_info *hca_data) -{ - VAPI_ret_t vstat; - VAPI_wc_desc_t comp_desc; - unsigned long polling_count = 0; - u_int32_t timeout_usec; - unsigned int priority = 100; - unsigned int length; - VAPI_wr_id_t wrq_id; - u_int32_t transferred_data_length; /* Num. of bytes transferred */ - void *bufdata; - VAPI_virt_addr_t bufaddr; - unsigned long buf_size = 0; - QP_info *qp; // point to QP_list - - kportal_daemonize("k_recv_thread"); // make it as a daemon process - - // tuning variable - timeout_usec = 100; // how is the impact on the performance - - // send Q and receive Q are using the same CQ - // so only poll one CQ for both operations - - CDEBUG(D_NET, "IBNAL- enter kibnal_recv_thread\n"); - CDEBUG(D_NET, "hca_hndl = 0X%x, cq_hndl=0X%x\n", - hca_data->hca_hndl,hca_data->cq_hndl); - - qp = hca_data->qp_ptr; - if(qp == NULL) { - CDEBUG(D_NET, "in recv_thread qp is NULL\n"); - CDEBUG(D_NET, "Exit from recv_thread qp is NULL\n"); - return; - } - else { - CDEBUG(D_NET, "in recv_thread qp is 0X%X\n", qp); - } - - CDEBUG(D_NET, "kibnal_recv_thread - enter event driver polling loop\n"); - - // - // use event driver - // - - - - while(1) { - polling_count++; - - // - // send Q and receive Q are using the same CQ - // so only poll one CQ for both operations - // - - vstat = VAPI_poll_cq(hca_data->hca_hndl,hca_data->cq_hndl, &comp_desc); - - if (vstat == VAPI_CQ_EMPTY) { - // there is no event in CQE - continue; - } - else { - if (vstat != (VAPI_OK)) { - CERROR("error while polling completion queuei vstat %d \n", vstat); - return; - } - } - - // process the complete event - switch(comp_desc.opcode) { - case VAPI_CQE_SQ_SEND_DATA: - // about the Send Q ,POST SEND completion - // who needs this information - // get wrq_id - // mark MSbuf_list[wr_id].status = BUF_REGISTERED - - wrq_id = comp_desc.id; - - if(RDMA_OP_ID < wrq_id) { - // this RDMA message id, adjust it to the right entry - wrq_id = wrq_id - RDMA_OP_ID; - vstat = 
VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.send_rdma_mr_hndl); - } - - if(vstat != VAPI_OK) { - CERROR("VAPI_CQE_SQ_SEND_DATA: Failed deregistering a RDMAi recv" " mem region %s\n", VAPI_strerror(vstat)); - } - - if((RDMA_CTS_ID <= wrq_id) && (RDMA_OP_ID < wrq_id)) { - // RTS or CTS send complete, release send buffer - if(wrq_id >= RDMA_RTS_ID) - wrq_id = wrq_id - RDMA_RTS_ID; - else - wrq_id = wrq_id - RDMA_CTS_ID; - } - - spin_lock(&MSB_mutex[(int) wrq_id]); - MRbuf_list[wrq_id].status = BUF_REGISTERED; - spin_unlock(&MSB_mutex[(int) wrq_id]); - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_SEND_DATA\n"); - break; - - case VAPI_CQE_SQ_RDMA_WRITE: - // about the Send Q, RDMA write completion - // who needs this information - // data is successfully write from pource to destionation - - // get wr_id - // mark MSbuf_list[wr_id].status = BUF_REGISTERED - // de-register rdma buffer - // - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_RDMA_WRITE\n"); - break; - - case VAPI_CQE_SQ_RDMA_READ: - // about the Send Q - // RDMA read completion - // who needs this information - // data is successfully read from destionation to source - CDEBUG(D_NET, "CQE opcode- VAPI_CQE_SQ_RDMA_READ\n"); - break; - - case VAPI_CQE_SQ_COMP_SWAP: - // about the Send Q - // RDMA write completion - // who needs this information - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_COMP_SWAP\n"); - break; - - case VAPI_CQE_SQ_FETCH_ADD: - // about the Send Q - // RDMA write completion - // who needs this information - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_FETCH_ADD\n"); - break; - - case VAPI_CQE_SQ_BIND_MRW: - // about the Send Q - // RDMA write completion - // who needs this information - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_BIND_MRW\n"); - break; - - case VAPI_CQE_RQ_SEND_DATA: - // about the Receive Q - // process the incoming data and - // forward it to ..... 
- // a completion recevie event is arriving at CQ - // issue a recevie to get this arriving data out from CQ - // pass the receiving data for further processing - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_SEND_DATA\n"); - wrq_id = comp_desc.id ; - transferred_data_length = comp_desc.byte_len; - - if((wrq_id >= RDMA_CTS_ID) && (wrq_id < RDMA_OP_ID)) { - // this is RTS/CTS message - // process it locally and don't pass it to portals layer - // adjust wrq_id to get the right entry in MRbfu_list - - if(wrq_id >= RDMA_RTS_ID) - wrq_id = wrq_id - RDMA_RTS_ID; - else - wrq_id = wrq_id - RDMA_CTS_ID; - - bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) MRbuf_list[wrq_id].buf_addr; - MRbuf_list[wrq_id].status = BUF_INUSE; - memcpy(&Rdma_info, &bufaddr, sizeof(RDMA_Info_Exchange)); - - if(Ready_To_send == Rdma_info.opcode) - // an RTS request message from remote node - // prepare local RDMA buffer and send local rdma info to - // remote node - CTS_handshaking_protocol(&Rdma_info); - else - if((Clear_To_send == Rdma_info.opcode) && - (RDMA_BUFFER_RESERVED == Rdma_info.flag)) - Cts_Message_arrived = YES; - else - if(RDMA_BUFFER_UNAVAILABLE == Rdma_info.flag) - CERROR("RDMA operation abort-RDMA_BUFFER_UNAVAILABLE\n"); - } - else { - // - // this is an incoming mesage for portals layer - // move to PORTALS layer for further processing - // - - bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) - MRbuf_list[wrq_id].buf_addr; - - MRbuf_list[wrq_id].status = BUF_INUSE; - transferred_data_length = comp_desc.byte_len; - - kibnal_rx(hca_data->kib_data, - bufaddr, - transferred_data_length, - MRbuf_list[wrq_id].buf_size, - priority); - } - - // repost this receiving buffer and makr it at BUF_REGISTERED - - vstat = repost_recv_buf(qp, wrq_id); - if(vstat != (VAPI_OK)) { - CERROR("error while polling completion queue\n"); - } - else { - MRbuf_list[wrq_id].status = BUF_REGISTERED; - } - - break; - - case VAPI_CQE_RQ_RDMA_WITH_IMM: - // about the Receive Q - CDEBUG(D_NET, "CQE 
opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n"); - - wrq_id = comp_desc.id ; - transferred_data_length = comp_desc.byte_len; - - if(wrq_id == RDMA_OP_ID) { - // this is RDAM op , locate the RDAM memory buffer address - - bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) Local_rdma_info.raddr; - - transferred_data_length = comp_desc.byte_len; - - kibnal_rx(hca_data->kib_data, - bufaddr, - transferred_data_length, - Local_rdma_info.buf_length, - priority); - - // de-regiser this RDAM receiving memory buffer - // too early ?? test & check - vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.recv_rdma_mr_hndl); - if(vstat != VAPI_OK) { - CERROR("VAPI_CQE_RQ_RDMA_WITH_IMM: Failed deregistering a RDMA" - " recv mem region %s\n", VAPI_strerror(vstat)); - } - } - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n"); - break; - - case VAPI_CQE_INVAL_OPCODE: - // - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_INVAL_OPCODE\n"); - break; - - default : - CDEBUG(D_NET, "CQE opcode-unknown opcode\n"); - break; - } // switch - - schedule_timeout(RECEIVING_THREAD_TIMEOUT);//how often do we need to poll CQ - - }// receiving while loop - - -} - - -void CQE_event_handler(VAPI_hca_hndl_t hca_hndl, - VAPI_cq_hndl_t cq_hndl, - void *private) -{ - VAPI_ret_t vstat; - VAPI_wc_desc_t comp_desc; - unsigned long polling_count = 0; - u_int32_t timeout_usec; - unsigned int priority = 100; - unsigned int length; - VAPI_wr_id_t wrq_id; - u_int32_t transferred_data_length; /* Num. 
of bytes transferred */ - void *bufdata; - VAPI_virt_addr_t bufaddr; - unsigned long buf_size = 0; - QP_info *qp; // point to QP_list - HCA_info *hca_data; - - // send Q and receive Q are using the same CQ - // so only poll one CQ for both operations - - CDEBUG(D_NET, "IBNAL- enter CQE_event_handler\n"); - printk("IBNAL- enter CQE_event_handler\n"); - - hca_data = (HCA_info *) private; - - // - // use event driven - // - - - vstat = VAPI_poll_cq(hca_data->hca_hndl,hca_data->cq_hndl, &comp_desc); - - if (vstat == VAPI_CQ_EMPTY) { - CDEBUG(D_NET, "CQE_event_handler: there is no event in CQE, how could" - " this " "happened \n"); - printk("CQE_event_handler: there is no event in CQE, how could" - " this " "happened \n"); - - } - else { - if (vstat != (VAPI_OK)) { - CDEBUG(D_NET, "error while polling completion queue vstat %d - %s\n", - vstat, VAPI_strerror(vstat)); - printk("error while polling completion queue vstat %d - %s\n", - vstat, VAPI_strerror(vstat)); - return; - } - } - - // process the complete event - switch(comp_desc.opcode) { - case VAPI_CQE_SQ_SEND_DATA: - // about the Send Q ,POST SEND completion - // who needs this information - // get wrq_id - // mark MSbuf_list[wr_id].status = BUF_REGISTERED - - wrq_id = comp_desc.id; - -#ifdef IBNAL_SELF_TESTING - if(wrq_id == SEND_RECV_TEST_ID) { - printk("IBNAL_SELF_TESTING - VAPI_CQE_SQ_SEND_DATA \n"); - } -#else - if(RDMA_OP_ID < wrq_id) { - // this RDMA message id, adjust it to the right entry - wrq_id = wrq_id - RDMA_OP_ID; - vstat = VAPI_deregister_mr(qp->hca_hndl, - Local_rdma_info.send_rdma_mr_hndl); - } - - if(vstat != VAPI_OK) { - CERROR(" VAPI_CQE_SQ_SEND_DATA: Failed deregistering a RDMA" - " recv mem region %s\n", VAPI_strerror(vstat)); - } - - if((RDMA_CTS_ID <= wrq_id) && (RDMA_OP_ID < wrq_id)) { - // RTS or CTS send complete, release send buffer - if(wrq_id >= RDMA_RTS_ID) - wrq_id = wrq_id - RDMA_RTS_ID; - else - wrq_id = wrq_id - RDMA_CTS_ID; - } - - spin_lock(&MSB_mutex[(int) wrq_id]); - 
MRbuf_list[wrq_id].status = BUF_REGISTERED; - spin_unlock(&MSB_mutex[(int) wrq_id]); -#endif - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_SEND_DATA\n"); - - break; - - case VAPI_CQE_SQ_RDMA_WRITE: - // about the Send Q, RDMA write completion - // who needs this information - // data is successfully write from pource to destionation - - // get wr_id - // mark MSbuf_list[wr_id].status = BUF_REGISTERED - // de-register rdma buffer - // - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_RDMA_WRITE\n"); - break; - - case VAPI_CQE_SQ_RDMA_READ: - // about the Send Q - // RDMA read completion - // who needs this information - // data is successfully read from destionation to source - CDEBUG(D_NET, "CQE opcode- VAPI_CQE_SQ_RDMA_READ\n"); - break; - - case VAPI_CQE_SQ_COMP_SWAP: - // about the Send Q - // RDMA write completion - // who needs this information - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_COMP_SWAP\n"); - break; - - case VAPI_CQE_SQ_FETCH_ADD: - // about the Send Q - // RDMA write completion - // who needs this information - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_FETCH_ADD\n"); - break; - - case VAPI_CQE_SQ_BIND_MRW: - // about the Send Q - // RDMA write completion - // who needs this information - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_BIND_MRW\n"); - break; - - case VAPI_CQE_RQ_SEND_DATA: - // about the Receive Q - // process the incoming data and - // forward it to ..... 
- // a completion recevie event is arriving at CQ - // issue a recevie to get this arriving data out from CQ - // pass the receiving data for further processing - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_SEND_DATA\n"); - - wrq_id = comp_desc.id ; - -#ifdef IBNAL_SELF_TESTING - - char rbuf[KB_32]; - int i; - - if(wrq_id == SEND_RECV_TEST_ID) { - printk("IBNAL_SELF_TESTING - VAPI_CQE_RQ_SEND_DATA\n"); - } - - bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) - MRbuf_list[ SEND_RECV_TEST_BUF_ID].buf_addr; - MRbuf_list[SEND_RECV_TEST_BUF_ID].status = BUF_INUSE; - memcpy(&rbuf, &bufaddr, KB_32); - - - for(i=0; i < 16; i++) - printk("rbuf[%d]=%c, ", rbuf[i]); - printk("\n"); - - // repost this receiving buffer and makr it at BUF_REGISTERED - vstat = repost_recv_buf(qp,SEND_RECV_TEST_BUF_ID); - if(vstat != (VAPI_OK)) { - printk("error while polling completion queue\n"); - } - else { - MRbuf_list[SEND_RECV_TEST_BUF_ID].status = BUF_REGISTERED; - } -#else - transferred_data_length = comp_desc.byte_len; - - if((wrq_id >= RDMA_CTS_ID) && (wrq_id < RDMA_OP_ID)) { - // this is RTS/CTS message - // process it locally and don't pass it to portals layer - // adjust wrq_id to get the right entry in MRbfu_list - - if(wrq_id >= RDMA_RTS_ID) - wrq_id = wrq_id - RDMA_RTS_ID; - else - wrq_id = wrq_id - RDMA_CTS_ID; - - bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) - MRbuf_list[wrq_id].buf_addr; - MRbuf_list[wrq_id].status = BUF_INUSE; - memcpy(&Rdma_info, &bufaddr, sizeof(RDMA_Info_Exchange)); - - if(Ready_To_send == Rdma_info.opcode) - // an RTS request message from remote node - // prepare local RDMA buffer and send local rdma info to - // remote node - CTS_handshaking_protocol(&Rdma_info); - else - if((Clear_To_send == Rdma_info.opcode) && - (RDMA_BUFFER_RESERVED == Rdma_info.flag)) - Cts_Message_arrived = YES; - else - if(RDMA_BUFFER_UNAVAILABLE == Rdma_info.flag) - CERROR("RDMA operation abort-RDMA_BUFFER_UNAVAILABLE\n"); - } - else { - // - // this is an incoming mesage for portals layer 
- // move to PORTALS layer for further processing - // - - bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) - MRbuf_list[wrq_id].buf_addr; - - MRbuf_list[wrq_id].status = BUF_INUSE; - transferred_data_length = comp_desc.byte_len; - - kibnal_rx(hca_data->kib_data, - bufaddr, - transferred_data_length, - MRbuf_list[wrq_id].buf_size, - priority); - } - - // repost this receiving buffer and makr it at BUF_REGISTERED - vstat = repost_recv_buf(qp, wrq_id); - if(vstat != (VAPI_OK)) { - CERROR("error while polling completion queue\n"); - } - else { - MRbuf_list[wrq_id].status = BUF_REGISTERED; - } -#endif - - break; - - case VAPI_CQE_RQ_RDMA_WITH_IMM: - // about the Receive Q - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n"); - - wrq_id = comp_desc.id ; - transferred_data_length = comp_desc.byte_len; - - if(wrq_id == RDMA_OP_ID) { - // this is RDAM op , locate the RDAM memory buffer address - - bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) Local_rdma_info.raddr; - - transferred_data_length = comp_desc.byte_len; - - kibnal_rx(hca_data->kib_data, - bufaddr, - transferred_data_length, - Local_rdma_info.buf_length, - priority); - - // de-regiser this RDAM receiving memory buffer - // too early ?? 
test & check - vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.recv_rdma_mr_hndl); - if(vstat != VAPI_OK) { - CERROR("VAPI_CQE_RQ_RDMA_WITH_IMM: Failed deregistering a RDMA" - " recv mem region %s\n", VAPI_strerror(vstat)); - } - } - - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n"); - break; - - case VAPI_CQE_INVAL_OPCODE: - // - CDEBUG(D_NET, "CQE opcode-VAPI_CQE_INVAL_OPCODE\n"); - break; - - default : - CDEBUG(D_NET, "CQE opcode-unknown opcode\n"); - - break; - } // switch - - // issue a new request for completion ievent notification - vstat = VAPI_req_comp_notif(hca_data->hca_hndl, - hca_data->cq_hndl, - VAPI_NEXT_COMP); - - - if(vstat != VAPI_OK) { - CERROR("PI_req_comp_notif: Failed %s\n", VAPI_strerror(vstat)); - } - - return; // end of event handler - -} - - - -int -kibnal_cmd(struct portal_ioctl_data * data, void * private) -{ - int rc ; - - CDEBUG(D_NET, "kibnal_cmd \n"); - - return YES; -} - - - -void ibnal_send_recv_self_testing(int *my_role) -{ - VAPI_ret_t vstat; - VAPI_sr_desc_t sr_desc; - VAPI_sg_lst_entry_t sr_sg; - QP_info *qp; - VAPI_wr_id_t send_id; - int buf_id; - char sbuf[KB_32]; - char rbuf[KB_32]; - int i; - int buf_length = KB_32; - VAPI_wc_desc_t comp_desc; - int num_send = 1; - int loop_count = 0; - - // make it as a daemon process - // kportal_daemonize("ibnal_send_recv_self_testing"); - - printk("My role is 0X%X\n", *my_role); - -if(*my_role == TEST_SEND_MESSAGE) { - printk("Enter ibnal_send_recv_self_testing\n"); - - memset(&sbuf, 'a', KB_32); - memset(&rbuf, ' ', KB_32); - - send_id = SEND_RECV_TEST_ID; - buf_id = SEND_RECV_TEST_BUF_ID; - - qp = &QP_list[buf_id]; - - sr_desc.opcode = VAPI_SEND; - sr_desc.comp_type = VAPI_SIGNALED; - sr_desc.id = send_id; - - // scatter and gather info - sr_sg.len = KB_32; - sr_sg.lkey = MSbuf_list[buf_id].mr.l_key; // use send MR - sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[buf_id].buf_addr; - - // copy data to register send buffer - memcpy(&sr_sg.addr, &sbuf, 
buf_length); - - sr_desc.sg_lst_p = &sr_sg; - sr_desc.sg_lst_len = 1; // only 1 entry is used - sr_desc.fence = TRUE; - sr_desc.set_se = FALSE; - - /* - // call VAPI_post_sr to send out this data - vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc); - - if (vstat != VAPI_OK) { - printk("VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat)); - } - - printk("VAPI_post_sr success.\n"); - */ - - } -else { - printk("I am a receiver and doing nothing here\n"); -} - - printk("ibnal_send_recv_self_testing thread exit \n"); - - return; - -} - - -// -// ibnal initialize process -// -// 1. Bring up Infiniband network interface -// * -// 2. Initialize a PORTALS nal interface -// -// -int __init -kibnal_initialize(void) -{ - int rc; - int ntok; - unsigned long sizemask; - unsigned int nid; - VAPI_ret_t vstat; - - - portals_debug_set_level(IBNAL_DEBUG_LEVEL_1); - - CDEBUG(D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory)); - - CDEBUG(D_PORTALS, "kibnal_initialize: Enter kibnal_initialize\n"); - - // set api functional pointers - kibnal_api.forward = kibnal_forward; - kibnal_api.shutdown = kibnal_shutdown; - kibnal_api.yield = kibnal_yield; - kibnal_api.validate = NULL; /* our api validate is a NOOP */ - kibnal_api.lock = kibnal_lock; - kibnal_api.unlock = kibnal_unlock; - kibnal_api.nal_data = &kibnal_data; // this is so called private data - kibnal_api.refct = 1; - kibnal_api.timeout = NULL; - kibnal_lib.nal_data = &kibnal_data; - - memset(&kibnal_data, 0, sizeof(kibnal_data)); - - // initialize kib_list list data structure - INIT_LIST_HEAD(&kibnal_data.kib_list); - - kibnal_data.kib_cb = &kibnal_lib; - - spin_lock_init(&kibnal_data.kib_dispatch_lock); - - - // - // bring up the IB inter-connect network interface - // setup QP, CQ - // - vstat = IB_Open_HCA(&kibnal_data); - - if(vstat != VAPI_OK) { - CERROR("kibnal_initialize: IB_Open_HCA failed: %d- %s\n", - vstat, VAPI_strerror(vstat)); - - printk("kibnal_initialize: IB_Open_HCA failed: %d- %s\n", - vstat, 
VAPI_strerror(vstat)); - return NO; - } - - kibnal_data.kib_nid = (__u64 )Hca_hndl;//convert Hca_hndl to 64-bit format - kibnal_data.kib_init = 1; - - CDEBUG(D_NET, " kibnal_data.kib_nid 0x%x%x\n", kibnal_data.kib_nid); - printk(" kibnal_data.kib_nid 0x%x%x\n", kibnal_data.kib_nid); - - /* Network interface ready to initialise */ - // get an entery in the PORTALS table for this IB protocol - - CDEBUG(D_PORTALS,"Call PtlNIInit to register this Infiniband Interface\n"); - printk("Call PtlNIInit to register this Infiniband Interface\n"); - - rc = PtlNIInit(kibnal_init, 32, 4, 0, &kibnal_ni); - - if(rc != PTL_OK) { - CERROR("kibnal_initialize: PtlNIInit failed %d\n", rc); - printk("kibnal_initialize: PtlNIInit failed %d\n", rc); - kibnal_finalize(); - return (-ENOMEM); - } - - CDEBUG(D_PORTALS,"kibnal_initialize: PtlNIInit DONE\n"); - printk("kibnal_initialize: PtlNIInit DONE\n"); - - - -#ifdef POLL_BASED_CQE_HANDLING - // create a receiving thread: main loopa - // this is polling based mail loop - kernel_thread(k_recv_thread, &Hca_data, 0); -#endif - -#ifdef EVENT_BASED_CQE_HANDLING - // for completion event handling, this is event based CQE handling - vstat = IB_Set_Event_Handler(Hca_data, &kibnal_data); - - if (vstat != VAPI_OK) { - CERROR("IB_Set_Event_Handler failed: %d - %s \n", - vstat, VAPI_strerror(vstat)); - return vstat; - } - - CDEBUG(D_PORTALS,"IB_Set_Event_Handler Done \n"); - printk("IB_Set_Event_Handler Done \n"); - -#endif - - PORTAL_SYMBOL_REGISTER(kibnal_ni); - -#ifdef IBNAL_SELF_TESTING - // - // test HCA send recv before normal event handling - // - int my_role; - my_role = TEST_SEND_MESSAGE; - - printk("my role is TEST_RECV_MESSAGE\n"); - - // kernel_thread(ibnal_send_recv_self_testing, &my_role, 0); - - ibnal_send_recv_self_testing(&my_role); - -#endif - - return 0; - -} - - - -MODULE_AUTHOR("Hsingbung(HB) Chen "); -MODULE_DESCRIPTION("Kernel Infiniband NAL v0.1"); -MODULE_LICENSE("GPL"); - -module_init (kibnal_initialize); -module_exit 
(kibnal_finalize); - -EXPORT_SYMBOL(kibnal_ni); - diff --git a/lustre/portals/knals/ibnal/ibnal.h b/lustre/portals/knals/ibnal/ibnal.h deleted file mode 100644 index 4a1f0d7..0000000 --- a/lustre/portals/knals/ibnal/ibnal.h +++ /dev/null @@ -1,565 +0,0 @@ -#ifndef _IBNAL_H -#define _IBNAL_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_IBNAL - -#include -#include -#include -#include - -// Infiniband VAPI/EVAPI header files -// Mellanox MT23108 VAPI -#include -#include -#include -#include - -// pick a port for this RDMA information exhange between two hosts -#define HOST_PORT 11211 -#define QUEUE_SIZE 1024 -#define HCA_PORT_1 1 -#define HCA_PORT_2 2 -#define DEBUG_SUBSYSTEM S_IBNAL - -#define START_SEND_WRQ_ID 0 -#define START_RECV_WRQ_ID 0 -#define START_RDMA_WRQ_ID 0 - -#define DEFAULT_PRIORITY 100 - -#define WAIT_FOT_R_RDMA_TIMEOUT 10000 -#define MAX_NUM_TRY 3000 - -#define MAX_NUM_POLL 300 -#define MAX_LOOP_COUNT 500 - -#define MAX_GID 32 -#define MCG_BUF_LENGTH 128 - -#define SHARED_SEGMENT_SIZE 0x10000 -#define HCA_EXCHANGE_SHM_KEY 999 // shared memory key for HCA data exchange - -// some internals opcodes for IB operations used in IBNAL -#define SEND_QP_INFO 0X00000001 -#define RECV_QP_INFO 0X00000010 - -// Mellanox InfiniHost MT23108 -// QP/CQ related information -// - -#define MTU_256 1 /* 1-256,2-512,3-1024,4-2048 */ -#define MTU_512 2 /* 1-256,2-512,3-1024,4-2048 */ -#define MTU_1024 3 /* 1-256,2-512,3-1024,4-2048 */ -#define MTU_2048 4 /* 1-256,2-512,3-1024,4-2048 */ - -// number of entries for each CQ and WQ -// how much do we need ? 
-#define NUM_CQE 1024 -#define NUM_WQE 1024 -#define MAX_OUT_SQ 64 -#define MAX_OUT_RQ 64 - -#define NUM_MBUF 256 -#define NUM_RDMA_RESERVED_ENTRY 128 -#define NUM_QPS 256 - -#define INVALID_WR_ID ((VAPI_wr_id_t) -1) - - -// for Vector IO -// scatter and gather -// Portals can support upto 64 IO-Vectors -// how much do we need ? -#define NUM_SGE 1 -#define NUM_SG 1 -#define NUM_CQ 1 - -#define ONE_KB 1024 -#define ONE_MB 1024 * ONE_KB -#define ONE_GB 1024 * ONE_MB - - -#define KB_4 1024 * 4 -#define KB_8 1024 * 8 -#define KB_16 1024 * 16 -#define KB_32 1024 * 32 -#define KB_64 1024 * 64 -#define KB_128 1024 * 128 -#define KB_256 1024 * 256 - -// 256 entry in registered buffer list -// small size message -#define Num_4_KB 64 -#define Num_8_KB 64 -#define Num_16_KB 40 -#define Num_32_KB 40 -#define Num_64_KB 40 -#define Num_128_KB 4 -#define Num_256_KB 4 - -#define SMALL_MSG_SIZE KB_32 - -#define MAX_MSG_SIZE ONE_MB * 512 - -// 128's 64KB bufer for send -// 128's 64KB bufer for recv -// used in RDAM operation only - -#define NUM_ENTRY 128 - -#define End_4_kb Num_4_KB -#define End_8_kb End_4_kb + Num_8_KB -#define End_16_kb End_8_kb + Num_16_KB -#define End_32_kb End_16_kb + Num_32_KB -#define End_64_kb End_32_kb + Num_64_KB -#define End_128_kb End_64_kb + Num_128_KB -#define End_256_kb End_128_kb+ Num_256_KB - - -#define SEND_BUF_SIZE KB_32 -#define RECV_BUF_SIZE SEND_BUF_SIZE - -// #define POLL_BASED_CQE_HANDLING 1 -#define EVENT_BASED_CQE_HANDLING 1 -#define IBNAL_SELF_TESTING 1 - -#ifdef IBNAL_SELF_TESTING -#undef IBNAL_SELF_TESTING -#endif - - -#define MSG_SIZE_SMALL 1 -#define MSG_SIZE_LARGE 2 - - - -// some defauly configuration values for early testing -#define DEFAULT_DLID 1 // default destination link ID -#define DEFAULT_QP_NUM 4 // default QP number -#define P_KEY 0xFFFF // do we need default value -#define PKEY_IX 0x0 // do we need default value -#define Q_KEY 0x012 // do we need default value -#define L_KEY 0x12345678 // do we need default value -#define 
R_KEY 0x87654321 // do we need default value -#define HCA_ID "InfiniHost0" // default -#define START_PSN 0 -#define START_SQ_PSN 0 -#define START_RQ_PSN 0 - - -#define __u_long_long unsigned long long - -#define IBNAL_DEBUG 1 - -#define USE_SHARED_MEMORY_AND_SOCKET 1 - -// operation type -#define TRY_SEND_ONLY 1 - -#define YES 1 -#define NO 0 - -// -// a common data structure for IB QP's operation -// each QP is associated with an QP_info structure -// -typedef struct QP_info -{ - VAPI_hca_hndl_t hca_hndl; // HCA handle - IB_port_t port; // port number - VAPI_qp_hndl_t qp_hndl; // QP's handle list - VAPI_qp_state_t qp_state; // QP's current state - VAPI_pd_hndl_t pd_hndl; // protection domain - VAPI_cq_hndl_t cq_hndl; // send-queue CQ's handle - VAPI_cq_hndl_t sq_cq_hndl; // send-queue CQ's handle - VAPI_cq_hndl_t rq_cq_hndl; // receive-queue CQ's handle - VAPI_ud_av_hndl_t av_hndl; // receive-queue CQ's handle - VAPI_qp_init_attr_t qp_init_attr; // QP's init attribute - VAPI_qp_attr_t qp_attr; // QP's attribute - dlid - VAPI_qp_prop_t qp_prop; // QP's propertities - VAPI_hca_port_t hca_port; - VAPI_qp_num_t qp_num; // QP's number - VAPI_qp_num_t rqp_num; // remote QP's number - IB_lid_t slid; - IB_lid_t dlid; - VAPI_gid_t src_gid; - - u_int32_t buf_size; - VAPI_virt_addr_t buf_addr; - char *bufptr; - VAPI_mrw_t mr; - VAPI_mr_hndl_t mr_hndl; - VAPI_virt_addr_t raddr; - VAPI_rkey_t rkey; - VAPI_lkey_t lkey; - - VAPI_wr_id_t last_posted_send_id; // user defined work request ID - VAPI_wr_id_t last_posted_rcv_id; // user defined work request ID - VAPI_mw_hndl_t mw_hndl; // memory window handle - VAPI_rkey_t mw_rkey; // memory window rkey - VAPI_sg_lst_entry_t sg_lst[256]; // scatter and gather list - int sg_list_sz; // set as NUM_SGE - VAPI_wr_id_t wr_id; // - spinlock_t snd_mutex; - spinlock_t rcv_mutex; - spinlock_t bl_mutex; - spinlock_t cln_mutex; - int cur_RDMA_outstanding; - int cur_send_outstanding; - int cur_posted_rcv_bufs; - int snd_rcv_balance; -} QP_info; - 
- -// buffer status -#define BUF_REGISTERED 0x10000000 -#define BUF_INUSE 0x01000000 -#define BUF_UNREGISTERED 0x00100000 - -// buffer type -#define REG_BUF 0x10000000 -#define RDMA_BUF 0x01000000 - -// -// IMM data -// -#define IMM_000 (0 << 32); -#define IMM_001 (1 << 32); -#define IMM_002 (2 << 32); -#define IMM_003 (3 << 32); -#define IMM_004 (4 << 32); -#define IMM_005 (5 << 32); -#define IMM_006 (6 << 32); -#define IMM_007 (7 << 32); -#define IMM_008 (8 << 32); -#define IMM_009 (9 << 32); -#define IMM_010 (10 << 32); -#define IMM_011 (11 << 32); -#define IMM_012 (12 << 32); -#define IMM_013 (13 << 32); -#define IMM_014 (14 << 32); -#define IMM_015 (15 << 32); -#define IMM_016 (16 << 32); -#define IMM_017 (17 << 32); -#define IMM_018 (18 << 32); -#define IMM_019 (19 << 32); -#define IMM_020 (20 << 32); -#define IMM_021 (21 << 32); -#define IMM_022 (22 << 32); -#define IMM_023 (23 << 32); -#define IMM_024 (24 << 32); -#define IMM_025 (25 << 32); -#define IMM_026 (26 << 32); -#define IMM_027 (27 << 32); -#define IMM_028 (28 << 32); -#define IMM_029 (29 << 32); -#define IMM_030 (30 << 32); -#define IMM_031 (31 << 32); - - - -typedef struct Memory_buffer_info{ - u_int32_t buf_size; - VAPI_virt_addr_t buf_addr; - char *bufptr; - VAPI_mrw_t mr; - VAPI_mr_hndl_t mr_hndl; - int status; - int ref_count; - int buf_type; - VAPI_virt_addr_t raddr; - VAPI_rkey_t rkey; - VAPI_lkey_t lkey; -} Memory_buffer_info; - -typedef struct RDMA_Info_Exchange { - int opcode; - int buf_length; - VAPI_mrw_t recv_rdma_mr; - VAPI_mr_hndl_t recv_rdma_mr_hndl; - VAPI_mrw_t send_rdma_mr; - VAPI_mr_hndl_t send_rdma_mr_hndl; - VAPI_virt_addr_t raddr; - VAPI_rkey_t rkey; - int flag; -} RDMA_Info_Exchange; - -// opcode for Rdma info exchange RTS/CTS -#define Ready_To_send 0x10000000 -#define Clear_To_send 0x01000000 - -#define RDMA_RTS_ID 5555 -#define RDMA_CTS_ID 7777 -#define RDMA_OP_ID 9999 -#define SEND_RECV_TEST_ID 2222 -#define SEND_RECV_TEST_BUF_ID 0 - -#define TEST_SEND_MESSAGE 0x00000001 
-#define TEST_RECV_MESSAGE 0x00000002 - - -#define RTS_CTS_TIMEOUT 50 -#define RECEIVING_THREAD_TIMEOUT 50 -#define WAIT_FOR_SEND_BUF_TIMEOUT 50 - -#define IBNAL_DEBUG_LEVEL_1 0XFFFFFFFF -#define IBNAL_DEBUG_LEVEL_2 D_PORTALS | D_NET | D_WARNING | D_MALLOC | \ - D_ERROR | D_OTHER | D_TRACE | D_INFO - - -// flag for Rdma info exhange -#define RDMA_BUFFER_RESERVED 0x10000000 -#define RDMA_BUFFER_UNAVAILABLE 0x01000000 - - -// receiving data structure -typedef struct { - ptl_hdr_t *krx_buffer; // pointer to receiving buffer - unsigned long krx_len; // length of buffer - unsigned int krx_size; // - unsigned int krx_priority; // do we need this - struct list_head krx_item; -} kibnal_rx_t; - -// transmitting data structure -typedef struct { - nal_cb_t *ktx_nal; - void *ktx_private; - lib_msg_t *ktx_cookie; - char *ktx_buffer; - size_t ktx_len; - unsigned long ktx_size; - int ktx_ndx; - unsigned int ktx_priority; - unsigned int ktx_tgt_node; - unsigned int ktx_tgt_port_id; -} kibnal_tx_t; - - -typedef struct { - char kib_init; - char kib_shuttingdown; - IB_port_t port_num; // IB port information - struct list_head kib_list; - ptl_nid_t kib_nid; - nal_t *kib_nal; - nal_cb_t *kib_cb; - struct kib_trans *kib_trans; // do I need this - struct tq_struct kib_ready_tq; - spinlock_t kib_dispatch_lock; -} kibnal_data_t; - - -// -// A data structure for keeping the HCA information in system -// information related to HCA and hca_handle will be kept here -// -typedef struct HCA_Info -{ - VAPI_hca_hndl_t hca_hndl; // HCA handle - VAPI_pd_hndl_t pd_hndl; // protection domain - IB_port_t port; // port number - int num_qp; // number of qp used - QP_info *qp_ptr[NUM_QPS]; // point to QP_list - int num_cq; // number of cq used - VAPI_cq_hndl_t cq_hndl; - VAPI_cq_hndl_t sq_cq_hndl; - VAPI_cq_hndl_t rq_cq_hndl; - IB_lid_t dlid; - IB_lid_t slid; - kibnal_data_t *kib_data; // for PORTALS operations -} HCA_info; - - - - -// Remote HCA Info information -typedef struct Remote_HCA_Info { - 
unsigned long opcode; - unsigned long length; - IB_lid_t dlid[NUM_QPS]; - VAPI_qp_num_t rqp_num[NUM_QPS]; -} Remote_QP_Info; - -typedef struct Bucket_index{ - int start; - int end; -} Bucket_index; - -// functional prototypes -// infiniband initialization -int kib_init(kibnal_data_t *); - -// receiving thread -void kibnal_recv_thread(HCA_info *); -void recv_thread(HCA_info *); - -// forward data packet -void kibnal_fwd_packet (void *, kpr_fwd_desc_t *); - -// global data structures -extern kibnal_data_t kibnal_data; -extern ptl_handle_ni_t kibnal_ni; -extern nal_t kibnal_api; -extern nal_cb_t kibnal_lib; -extern QP_info QP_list[]; -extern QP_info CQ_list[]; -extern HCA_info Hca_data; -extern VAPI_hca_hndl_t Hca_hndl; -extern VAPI_pd_hndl_t Pd_hndl; -extern VAPI_hca_vendor_t Hca_vendor; -extern VAPI_hca_cap_t Hca_cap; -extern VAPI_hca_port_t Hca_port_1_props; -extern VAPI_hca_port_t Hca_port_2_props; -extern VAPI_hca_attr_t Hca_attr; -extern VAPI_hca_attr_mask_t Hca_attr_mask; -extern VAPI_cq_hndl_t Cq_SQ_hndl; -extern VAPI_cq_hndl_t Cq_RQ_hndl; -extern VAPI_cq_hndl_t Cq_hndl; -extern unsigned long User_Defined_Small_Msg_Size; -extern Remote_QP_Info L_HCA_RDMA_Info; -extern Remote_QP_Info R_HCA_RDMA_Info; -extern unsigned int Num_posted_recv_buf; -extern int R_RDMA_DATA_ARRIVED; -extern Memory_buffer_info MRbuf_list[]; -extern Memory_buffer_info MSbuf_list[]; -extern Bucket_index Bucket[]; -extern RDMA_Info_Exchange Rdma_info; -extern int Cts_Message_arrived; -extern RDMA_Info_Exchange Local_rdma_info; -extern spinlock_t MSB_mutex[]; - - - -// kernel NAL API function prototype -int kibnal_forward(nal_t *,int ,void *,size_t ,void *,size_t ); -void kibnal_lock(nal_t *, unsigned long *); -void kibnal_unlock(nal_t *, unsigned long *); -int kibnal_shutdown(nal_t *, int ); -void kibnal_yield( nal_t * ); -void kibnal_invalidate(nal_cb_t *,void *,size_t ,void *); -int kibnal_validate(nal_cb_t *,void *,size_t ,void **); - - - -nal_t *kibnal_init(int , ptl_pt_index_t , 
ptl_ac_index_t , ptl_pid_t ); -void __exit kibnal_finalize(void ); -VAPI_ret_t create_qp(QP_info *, int ); -VAPI_ret_t init_qp(QP_info *, int ); -VAPI_ret_t IB_Open_HCA(kibnal_data_t *); -VAPI_ret_t IB_Close_HCA(void ); -VAPI_ret_t createMemRegion(VAPI_hca_hndl_t, VAPI_pd_hndl_t); -VAPI_ret_t deleteMemRegion(QP_info *, int ); - -void ibnal_send_recv_self_testing(int *); - -int __init kibnal_initialize(void); - - - -/* CB NAL functions */ -int kibnal_send(nal_cb_t *, - void *, - lib_msg_t *, - ptl_hdr_t *, - int, - ptl_nid_t, - ptl_pid_t, - unsigned int, - ptl_kiov_t *, - size_t); - -int kibnal_send_pages(nal_cb_t *, - void *, - lib_msg_t *, - ptl_hdr_t *, - int, - ptl_nid_t, - ptl_pid_t, - unsigned int, - ptl_kiov_t *, - size_t); -int kibnal_recv(nal_cb_t *, void *, lib_msg_t *, - unsigned int, struct iovec *, size_t, size_t); -int kibnal_recv_pages(nal_cb_t *, void *, lib_msg_t *, - unsigned int, ptl_kiov_t *, size_t, size_t); -int kibnal_read(nal_cb_t *,void *,void *,user_ptr ,size_t ); -int kibnal_write(nal_cb_t *,void *,user_ptr ,void *,size_t ); -int kibnal_callback(nal_cb_t * , void *, lib_eq_t *, ptl_event_t *); -void *kibnal_malloc(nal_cb_t *,size_t ); -void kibnal_free(nal_cb_t *,void *,size_t ); -int kibnal_map(nal_cb_t *, unsigned int , struct iovec *, void **); -void kibnal_unmap(nal_cb_t *, unsigned int , struct iovec *, void **); -int kibnal_map_pages(nal_cb_t *, unsigned int , ptl_kiov_t *, void **); -void kibnal_unmap_pages(nal_cb_t * , unsigned int , ptl_kiov_t *, void **); -void kibnal_printf(nal_cb_t *, const char *, ...); -void kibnal_cli(nal_cb_t *,unsigned long *); -void kibnal_sti(nal_cb_t *,unsigned long *); -int kibnal_dist(nal_cb_t *,ptl_nid_t ,unsigned long *); - -void kibnal_fwd_packet (void *, kpr_fwd_desc_t *); -void kibnal_rx(kibnal_data_t *, - VAPI_virt_addr_t , - u_int32_t, - u_int32_t, - unsigned int); - -int kibnal_end(kibnal_data_t *); - -void async_event_handler(VAPI_hca_hndl_t , VAPI_event_record_t *,void *); - -void 
CQE_event_handler(VAPI_hca_hndl_t ,VAPI_cq_hndl_t , void *); - - -VAPI_ret_t Send_Small_Msg(char *, int ); -VAPI_ret_t Send_Large_Msg(char *, int ); - -VAPI_ret_t repost_recv_buf(QP_info *, VAPI_wr_id_t ); -int post_recv_bufs(VAPI_wr_id_t ); -int server_listen_thread(void *); -VAPI_wr_id_t RTS_handshaking_protocol(int ); -VAPI_wr_id_t CTS_handshaking_protocol(RDMA_Info_Exchange *); - -VAPI_ret_t createMemRegion_RDMA(VAPI_hca_hndl_t , - VAPI_pd_hndl_t , - char *, - int , - VAPI_mr_hndl_t *, - VAPI_mrw_t *); - - -VAPI_ret_t IB_Set_Event_Handler(HCA_info , kibnal_data_t *); - -VAPI_ret_t IB_Set_Async_Event_Handler(HCA_info ,kibnal_data_t *); - -VAPI_wr_id_t find_available_buf(int ); -VAPI_wr_id_t search_send_buf(int ); -VAPI_wr_id_t find_filler_list(int ,int ); -int insert_MRbuf_list(int ); - - -#endif /* _IBNAL_H */ diff --git a/lustre/portals/knals/ibnal/ibnal_cb.c b/lustre/portals/knals/ibnal/ibnal_cb.c deleted file mode 100644 index 0688062..0000000 --- a/lustre/portals/knals/ibnal/ibnal_cb.c +++ /dev/null @@ -1,1289 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Based on ksocknal and qswnal - * - * Author: Hsing-bung Chen - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - - -#include "ibnal.h" - - - - -RDMA_Info_Exchange Rdma_nfo; -int Cts_Msg_Arrived = NO; - - -/* - * LIB functions follow - */ - -// -// read -// copy a block of data from scr_addr to dst_addr -// it all happens in kernel space - dst_addr and src_addr -// -// original definition is to read a block od data from a -// specified user address -// -// cb_read - -int kibnal_read (nal_cb_t *nal, - void *private, - void *dst_addr, - user_ptr src_addr, - size_t len) -{ - CDEBUG(D_NET, "kibnal_read: 0x%Lx: reading %ld bytes from %p -> %p\n", - nal->ni.nid, (long)len, src_addr, dst_addr ); - - memcpy( dst_addr, src_addr, len ); - - return 0; -} - -// -// it seems that read and write are doing the same thing -// because they all happen in kernel space -// why do we need two functions like read and write -// to make PORTALS API compatable -// - -// -// write -// copy a block of data from scr_addr to dst_addr -// it all happens in kernel space - dst_addr and src_addr -// -// original definition is to write a block od data to a -// specified user address -// -// cb_write - -int kibnal_write(nal_cb_t *nal, - void *private, - user_ptr dst_addr, - void *src_addr, - size_t len) -{ - CDEBUG(D_NET, "kibnal_write: 0x%Lx: writing %ld bytes from %p -> %p\n", - nal->ni.nid, (long)len, src_addr, dst_addr ); - - - memcpy( dst_addr, src_addr, len ); - - return 0; -} - -// -// malloc -// -// either vmalloc or kmalloc is used -// dynamically allocate a block of memory based on the size of buffer -// -// cb_malloc - -void * kibnal_malloc(nal_cb_t *nal, size_t length) -{ - void *buffer; - - // PORTAL_ALLOC will do the job - // allocate a buffer with size "length" - PORTAL_ALLOC(buffer, length); - - return buffer; -} - -// -// free -// release a dynamically allocated memory pointed by buffer pointer -// -// cb_free - -void kibnal_free(nal_cb_t *nal, void *buffer, size_t length) -{ - // - // release allocated buffer to system - // - PORTAL_FREE(buffer, length); -} - -// -// invalidate -// 
because evernthing is in kernel space (LUSTRE) -// there is no need to mark a piece of user memory as no longer in use by -// the system -// -// cb_invalidate - -void kibnal_invalidate(nal_cb_t *nal, - void *base, - size_t extent, - void *addrkey) -{ - // do nothing - CDEBUG(D_NET, "kibnal_invalidate: 0x%Lx: invalidating %p : %d\n", - nal->ni.nid, base, extent); - return; -} - - -// -// validate -// because everything is in kernel space (LUSTRE) -// there is no need to mark a piece of user memory in use by -// the system -// -// cb_validate - -int kibnal_validate(nal_cb_t *nal, - void *base, - size_t extent, - void **addrkey) -{ - // do nothing - CDEBUG(D_NET, "kibnal_validate: 0x%Lx: validating %p : %d\n", - nal->ni.nid, base, extent); - - return 0; -} - - -// -// log messages from kernel space -// printk() is used -// -// cb_printf - -void kibnal_printf(nal_cb_t *nal, const char *fmt, ...) -{ - va_list ap; - char msg[256]; - - if (portal_debug & D_NET) { - va_start( ap, fmt ); - vsnprintf( msg, sizeof(msg), fmt, ap ); - va_end( ap ); - - printk("CPUId: %d %s",smp_processor_id(), msg); - } -} - -// -// clear interrupt -// use spin_lock to lock protected area such as MD, ME... -// so a process can enter a protected area and do some works -// this won't physicall disable interrup but use a software -// spin-lock to control some protected areas -// -// cb_cli - -void kibnal_cli(nal_cb_t *nal, unsigned long *flags) -{ - kibnal_data_t *data= nal->nal_data; - - CDEBUG(D_NET, "kibnal_cli \n"); - - spin_lock_irqsave(&data->kib_dispatch_lock,*flags); - -} - -// -// set interrupt -// use spin_lock to unlock protected area such as MD, ME... 
-// this won't physicall enable interrup but use a software -// spin-lock to control some protected areas -// -// cb_sti - -void kibnal_sti(nal_cb_t *nal, unsigned long *flags) -{ - kibnal_data_t *data= nal->nal_data; - - CDEBUG(D_NET, "kibnal_sti \n"); - - spin_unlock_irqrestore(&data->kib_dispatch_lock,*flags); -} - - - -// -// nic distance -// -// network distance doesn't mean much for this nal -// here we only indicate -// 0 - operation is happened on the same node -// 1 - operation is happened on different nodes -// router will handle the data routing -// -// cb_dist - -int kibnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - CDEBUG(D_NET, "kibnal_dist \n"); - - if ( nal->ni.nid == nid ) { - *dist = 0; - } - else { - *dist = 1; - } - - return 0; // always retrun 0 -} - - -// -// This is the cb_send() on IB based interconnect system -// prepare a data package and use VAPI_post_sr() to send it -// down-link out-going message -// - - -int -kibnal_send(nal_cb_t *nal, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int niov, - ptl_kiov_t *iov, - size_t len) -{ - - int rc=0; - void *buf = NULL; - unsigned long buf_length = sizeof(ptl_hdr_t) + len; - int expected_buf_size = 0; - VAPI_ret_t vstat; - - PROF_START(kibnal_send); // time stamp send start - - CDEBUG(D_NET,"kibnal_send: sending %d bytes from %p to nid: 0x%Lx pid %d\n", - buf_length, iov, nid, HCA_PORT_1); - - - // do I need to check the gateway information - // do I have problem to send direct - // do I have to forward a data packet to gateway - // - // The current connection is back-to-back - // I always know that data will be send from one-side to - // the other side - // - - // - // check data buffer size - // - // MSG_SIZE_SMALL - // regular post send - // - // MSG_SIZE_LARGE - // rdma write - - if(buf_length <= SMALL_MSG_SIZE) { - expected_buf_size = MSG_SIZE_SMALL; - } - else { - if(buf_length > MAX_MSG_SIZE) { - 
CERROR("kibnal_send:request exceeds Transmit data size (%d).\n", - MAX_MSG_SIZE); - rc = PTL_FAIL; - return rc; - } - else { - expected_buf_size = MSG_SIZE_LARGE; // this is a large data package - } - } - - // prepare data packet for send operation - // - // allocate a data buffer "buf" with size of buf_len(header + payload) - // --------------- - // buf | hdr | size = sizeof(ptl_hdr_t) - // -------------- - // |payload data | size = len - // --------------- - - // copy header to buf - memcpy(buf, hdr, sizeof(ptl_hdr_t)); - - // copy payload data from iov to buf - // use portals library function lib_copy_iov2buf() - - if (len != 0) - lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t), - niov, - iov, - len); - - // buf is ready to do a post send - // the send method is base on the buf_size - - CDEBUG(D_NET,"ib_send %d bytes (size %d) from %p to nid: 0x%Lx " - " port %d\n", buf_length, expected_buf_size, iov, nid, HCA_PORT_1); - - switch(expected_buf_size) { - case MSG_SIZE_SMALL: - // send small message - if((vstat = Send_Small_Msg(buf, buf_length)) != VAPI_OK){ - CERROR("Send_Small_Msg() is failed\n"); - } - break; - - case MSG_SIZE_LARGE: - // send small message - if((vstat = Send_Large_Msg(buf, buf_length)) != VAPI_OK){ - CERROR("Send_Large_Msg() is failed\n"); - } - break; - - default: - CERROR("Unknown message size %d\n", expected_buf_size); - break; - } - - PROF_FINISH(kibnal_send); // time stapm of send operation - - rc = PTL_OK; - - return rc; -} - -// -// kibnal_send_pages -// -// no support -// -// do you need this -// -int kibnal_send_pages(nal_cb_t * nal, - void *private, - lib_msg_t * cookie, - ptl_hdr_t * hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int niov, - ptl_kiov_t *iov, - size_t mlen) -{ - int rc = PTL_FAIL; - - CDEBUG(D_NET, "kibnal_send_pages\n"); - - // do nothing now for Infiniband - - return rc; -} - - - - - -// -// kibnal_fwd_packet -// -// no support -// -// do you need this -// -void kibnal_fwd_packet (void *arg, 
kpr_fwd_desc_t *fwd) -{ - CDEBUG(D_NET, "forwarding not implemented\n"); - return; - -} - -// -// kibnal_callback -// -// no support -// -// do you need this -// -void kibnal_callback(nal_cb_t * nal, - void *private, - lib_eq_t *eq, - ptl_event_t *ev) -{ - CDEBUG(D_NET, "callback not implemented\n"); - return PTL_OK; -} - - -/* Process a received portals packet */ -// -// conver receiving data in to PORTALS header -// - -void kibnal_rx(kibnal_data_t *kib, - VAPI_virt_addr_t buffer_addr, - u_int32_t buffer_len, - u_int32_t buffer_size, - unsigned int priority) -{ - ptl_hdr_t *hdr = (ptl_hdr_t *) buffer_addr; // case to ptl header format - kibnal_rx_t krx; - - CDEBUG(D_NET,"kibnal_rx: buf %p, len %ld\n", buffer_addr, buffer_len); - - if ( buffer_len < sizeof( ptl_hdr_t ) ) { - /* XXX what's this for? */ - if (kib->kib_shuttingdown) - return; - CERROR("kibnal_rx: did not receive complete portal header, " - "len= %ld", buffer_len); - - return; - } - - // typedef struct { - // char *krx_buffer; // pointer to receiving buffer - // unsigned long krx_len; // length of buffer - // unsigned int krx_size; // - // unsigned int krx_priority; // do we need this - // struct list_head krx_item; - // } kibnal_rx_t; - // - krx.krx_buffer = hdr; - krx.krx_len = buffer_len; - krx.krx_size = buffer_size; - krx.krx_priority = priority; - - if ( hdr->dest_nid == kibnal_lib.ni.nid ) { - // this is my data - PROF_START(lib_parse); - - lib_parse(&kibnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx); - - PROF_FINISH(lib_parse); - } else { - /* forward to gateway */ - // Do we expect this happened ? 
- // - CERROR("kibnal_rx: forwarding not implemented yet"); - } - - return; -} - - - - -// -// kibnal_recv_pages -// -// no support -// -// do you need this -// -int -kibnal_recv_pages(nal_cb_t * nal, - void *private, - lib_msg_t * cookie, - unsigned int niov, - ptl_kiov_t *iov, - size_t mlen, - size_t rlen) -{ - - CDEBUG(D_NET, "recv_pages not implemented\n"); - return PTL_FAIL; - -} - - -int -kibnal_recv(nal_cb_t *nal, - void *private, - lib_msg_t *cookie, - unsigned int niov, - struct iovec *iov, - size_t mlen, - size_t rlen) -{ - kibnal_rx_t *krx = private; - - CDEBUG(D_NET,"kibnal_recv: mlen=%d, rlen=%d\n", mlen, rlen); - - /* What was actually received must be >= what sender claims to - * have sent. */ - LASSERT (mlen <= rlen); - - if (krx->krx_len < sizeof (ptl_hdr_t) + rlen) - return (PTL_FAIL); - - PROF_START(kibnal_recv); - - if(mlen != 0) { - PROF_START(memcpy); - lib_copy_buf2iov (niov, iov, krx->krx_buffer + - sizeof (ptl_hdr_t), mlen); - PROF_FINISH(memcpy); - } - - PROF_START(lib_finalize); - - lib_finalize(nal, private, cookie, PTL_OK); - - PROF_FINISH(lib_finalize); - PROF_FINISH(kibnal_recv); - - return PTL_OK; -} - -// -// kibnal_map -// no support -// do you need this -// -int kibnal_map(nal_cb_t * nal, - unsigned int niov, - struct iovec *iov, - void **addrkey) -{ - CDEBUG(D_NET, "map not implemented\n"); - return PTL_OK; -} - - - -// -// kibnal_unmap -// -// no support -// -// do you need this -// -void kibnal_unmap(nal_cb_t * nal, - unsigned int niov, - struct iovec *iov, - void **addrkey) -{ - CDEBUG(D_NET, "unmap not implemented\n"); - return; -} - - - -// -// kibnal_map_pages -// no support -// do you need this -/* as (un)map, but with a set of page fragments */ -int kibnal_map_pages(nal_cb_t * nal, - unsigned int niov, - ptl_kiov_t *iov, - void **addrkey) -{ - CDEBUG(D_NET, "map_pages not implemented\n"); - return PTL_OK; -} - - - -// -// kibnal_unmap_pages -// -// no support -// -// do you need this -// -void kibnal_unmap_pages(nal_cb_t 
* nal, - unsigned int niov, - ptl_kiov_t *iov, - void **addrkey) -{ - CDEBUG(D_NET, "unmap_pages not implemented\n"); - return ; -} - - -int kibnal_end(kibnal_data_t *kib) -{ - - /* wait for sends to finish ? */ - /* remove receive buffers */ - /* shutdown receive thread */ - - CDEBUG(D_NET, "kibnal_end\n"); - IB_Close_HCA(); - - return 0; -} - - -// -// -// asynchronous event handler: response to some unexpetced operation errors -// -// void async_event_handler(VAPI_hca_hndl_t hca_hndl, -// VAPI_event_record_t *event_record_p, -// void* private_data) -// the HCA drive will prepare evetn_record_p -// -// this handler is registered with VAPI_set_async_event_handler() -// VAPI_set_async_event_handler() is issued when an HCA is created -// -// -void async_event_handler(VAPI_hca_hndl_t hca_hndl, - VAPI_event_record_t *event_record_p, - void* private_data) -{ - // - // * event_record_p is prepared by the system when an async - // event happened - // * what to do with private_data - // * do we expect more async events happened if so what are they - // - // only log ERROR message now - - switch (event_record_p->type) { - case VAPI_PORT_ERROR: - printk("Got PORT_ERROR event. port number=%d\n", - event_record_p->modifier.port_num); - break; - case VAPI_PORT_ACTIVE: - printk("Got PORT_ACTIVE event. port number=%d\n", - event_record_p->modifier.port_num); - break; - case VAPI_QP_PATH_MIGRATED: /*QP*/ - printk("Got P_PATH_MIGRATED event. qp_hndl=%lu\n", - event_record_p->modifier.qp_hndl); - break; - case VAPI_EEC_PATH_MIGRATED: /*EEC*/ - printk("Got EEC_PATH_MIGRATED event. eec_hndl=%d\n", - event_record_p->modifier.eec_hndl); - break; - case VAPI_QP_COMM_ESTABLISHED: /*QP*/ - printk("Got QP_COMM_ESTABLISHED event. qp_hndl=%lu\n", - event_record_p->modifier.qp_hndl); - break; - case VAPI_EEC_COMM_ESTABLISHED: /*EEC*/ - printk("Got EEC_COMM_ESTABLISHED event. 
eec_hndl=%d\n", - event_record_p->modifier.eec_hndl); - break; - case VAPI_SEND_QUEUE_DRAINED: /*QP*/ - printk("Got SEND_QUEUE_DRAINED event. qp_hndl=%lu\n", - event_record_p->modifier.qp_hndl); - break; - case VAPI_CQ_ERROR: /*CQ*/ - printk("Got CQ_ERROR event. cq_hndl=%lu\n", - event_record_p->modifier.cq_hndl); - break; - case VAPI_LOCAL_WQ_INV_REQUEST_ERROR: /*QP*/ - printk("Got LOCAL_WQ_INV_REQUEST_ERROR event. qp_hndl=%lu\n", - event_record_p->modifier.qp_hndl); - break; - case VAPI_LOCAL_WQ_ACCESS_VIOL_ERROR: /*QP*/ - printk("Got LOCAL_WQ_ACCESS_VIOL_ERROR event. qp_hndl=%lu\n", - event_record_p->modifier.qp_hndl); - break; - case VAPI_LOCAL_WQ_CATASTROPHIC_ERROR: /*QP*/ - printk("Got LOCAL_WQ_CATASTROPHIC_ERROR event. qp_hndl=%lu\n", - event_record_p->modifier.qp_hndl); - break; - case VAPI_PATH_MIG_REQ_ERROR: /*QP*/ - printk("Got PATH_MIG_REQ_ERROR event. qp_hndl=%lu\n", - event_record_p->modifier.qp_hndl); - break; - case VAPI_LOCAL_CATASTROPHIC_ERROR: /*none*/ - printk("Got LOCAL_CATASTROPHIC_ERROR event. \n"); - break; - default: - printk(":got non-valid event type=%d. 
IGNORING\n", - event_record_p->type); - } - -} - - - - -VAPI_wr_id_t -search_send_buf(int buf_length) -{ - VAPI_wr_id_t send_id = -1; - u_int32_t i; - int flag = NO; - int loop_count = 0; - - CDEBUG(D_NET, "search_send_buf \n"); - - while((flag == NO) && (loop_count < MAX_LOOP_COUNT)) { - for(i=0; i < NUM_ENTRY; i++) { - // problem about using spinlock - spin_lock(&MSB_mutex[i]); - if(MSbuf_list[i].status == BUF_REGISTERED) { - MSbuf_list[i].status = BUF_INUSE;// make send buf as inuse - flag = YES; - spin_unlock(&MSB_mutex[i]); - break; - } - else - spin_unlock(&MSB_mutex[i]); - } - - loop_count++; - schedule_timeout(200); // wait for a while - } - - if(flag == NO) { - CDEBUG(D_NET, "search_send_buf: could not locate an entry in MSbuf_list\n"); - } - - send_id = (VAPI_wr_id_t ) i; - - return send_id; -} - - - -VAPI_wr_id_t -search_RDMA_recv_buf(int buf_length) -{ - VAPI_wr_id_t recv_id = -1; - u_int32_t i; - int flag = NO; - int loop_count = 0; - - CDEBUG(D_NET, "search_RDMA_recv_buf\n"); - - while((flag == NO) && (loop_count < MAX_LOOP_COUNT)) { - - for(i=NUM_ENTRY; i < NUM_MBUF; i++) { - - spin_lock(&MSB_mutex[i]); - - if((MRbuf_list[i].status == BUF_REGISTERED) && - (MRbuf_list[i].buf_size >= buf_length)) { - MSbuf_list[i].status = BUF_INUSE;// make send buf as inuse - flag = YES; - spin_unlock(&MSB_mutex[i]); - break; - } - else - spin_unlock(&MSB_mutex[i]); - } - - loop_count++; - - schedule_timeout(200); // wait for a while - } - - if(flag == NO) { - CERROR("search_RDMA_recv_buf: could not locate an entry in MBbuf_list\n"); - } - - recv_id = (VAPI_wr_id_t ) i; - - return recv_id; - -} - - - - - - - -VAPI_ret_t Send_Small_Msg(char *buf, int buf_length) -{ - VAPI_ret_t vstat; - VAPI_sr_desc_t sr_desc; - VAPI_sg_lst_entry_t sr_sg; - QP_info *qp; - VAPI_wr_id_t send_id; - - CDEBUG(D_NET, "Send_Small_Msg\n"); - - send_id = search_send_buf(buf_length); - - if(send_id < 0){ - CERROR("Send_Small_Msg: Can not find a QP \n"); - return(~VAPI_OK); - } - - qp = 
&QP_list[(int) send_id]; - - // find a suitable/registered send_buf from MSbuf_list - CDEBUG(D_NET, "Send_Small_Msg: current send id %d \n", send_id); - - sr_desc.opcode = VAPI_SEND; - sr_desc.comp_type = VAPI_SIGNALED; - sr_desc.id = send_id; - - - // scatter and gather info - sr_sg.len = buf_length; - sr_sg.lkey = MSbuf_list[send_id].mr.l_key; // use send MR - - sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[send_id].buf_addr; - - // copy data to register send buffer - memcpy(&sr_sg.addr, buf, buf_length); - - sr_desc.sg_lst_p = &sr_sg; - sr_desc.sg_lst_len = 1; // only 1 entry is used - sr_desc.fence = TRUE; - sr_desc.set_se = FALSE; - - // call VAPI_post_sr to send out this data - vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc); - - if (vstat != VAPI_OK) { - CERROR("VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat)); - } - - CDEBUG(D_NET, "VAPI_post_sr success.\n"); - - return (vstat); - -} - - - - -VAPI_wr_id_t -RTS_handshaking_protocol(int buf_length) -{ - - VAPI_ret_t vstat; - VAPI_sr_desc_t sr_desc; - VAPI_sg_lst_entry_t sr_sg; - VAPI_wr_id_t send_id; - - RDMA_Info_Exchange rdma_info; - - rdma_info.opcode = Ready_To_send; - rdma_info.buf_length = buf_length; - rdma_info.raddr = (VAPI_virt_addr_t) 0; - rdma_info.rkey = (VAPI_rkey_t) 0 ; - - QP_info *qp; - - CDEBUG(D_NET, "RTS_handshaking_protocol\n"); - - // find a suitable/registered send_buf from MSbuf_list - send_id = search_send_buf(sizeof(RDMA_Info_Exchange)); - - qp = &QP_list[(int) send_id]; - - CDEBUG(D_NET, "RTS_CTS: current send id %d \n", send_id); - sr_desc.opcode = VAPI_SEND; - sr_desc.comp_type = VAPI_SIGNALED; - sr_desc.id = send_id + RDMA_RTS_ID;// this RTS mesage ID - - // scatter and gather info - sr_sg.len = sizeof(RDMA_Info_Exchange); - sr_sg.lkey = MSbuf_list[send_id].mr.l_key; // use send MR - sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[send_id].buf_addr; - - // copy data to register send buffer - memcpy(&sr_sg.addr, &rdma_info, 
sizeof(RDMA_Info_Exchange)); - - sr_desc.sg_lst_p = &sr_sg; - sr_desc.sg_lst_len = 1; // only 1 entry is used - sr_desc.fence = TRUE; - sr_desc.set_se = FALSE; - - // call VAPI_post_sr to send out this RTS message data - vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc); - - if (vstat != VAPI_OK) { - CERROR("RTS: VAPI_post_sr failed (%s).\n",VAPI_strerror_sym(vstat)); - } - - return send_id; - -} - - - -// create local receiving Memory Region for a HCA -VAPI_ret_t -createMemRegion_RDMA(VAPI_hca_hndl_t hca_hndl, - VAPI_pd_hndl_t pd_hndl, - char *bufptr, - int buf_length, - VAPI_mr_hndl_t *rep_mr_hndl, - VAPI_mrw_t *rep_mr) -{ - VAPI_ret_t vstat; - VAPI_mrw_t mrw; - - CDEBUG(D_NET, "createMemRegion_RDMA\n"); - - // memory region address and size of memory region - // allocate a block of memory for this HCA - // RDMA data buffer - - - if(bufptr == NULL) { - // need to allcate a local buffer to receive data from a - // remore VAPI_RDMA_WRITE_IMM - PORTAL_ALLOC(bufptr, buf_length); - } - - if(bufptr == NULL) { - CDEBUG(D_MALLOC, "Failed to malloc a block of RDMA receiving memory, size %d\n", - buf_length); - return(VAPI_ENOMEM); - } - - /* Register RDAM data Memory region */ - CDEBUG(D_NET, "Register a RDMA data memory region\n"); - - mrw.type = VAPI_MR; - mrw.pd_hndl= pd_hndl; - mrw.start = (VAPI_virt_addr_t )(MT_virt_addr_t )bufptr; - mrw.size = buf_length; - mrw.acl = VAPI_EN_LOCAL_WRITE | - VAPI_EN_REMOTE_WRITE | - VAPI_EN_REMOTE_READ; - - // register send memory region - vstat = VAPI_register_mr(hca_hndl, - &mrw, - rep_mr_hndl, - rep_mr); - - // this memory region is going to be reused until deregister is called - if (vstat != VAPI_OK) { - CERROR("Failed registering a mem region Addr=%p, Len=%d. 
%s\n", - bufptr, buf_length, VAPI_strerror(vstat)); - } - - return(vstat); - -} - - - -RDMA_Info_Exchange Local_rdma_info; - -int insert_MRbuf_list(int buf_lenght) -{ - int recv_id = NUM_ENTRY; - - CDEBUG(D_NET, "insert_MRbuf_list\n"); - - for(recv_id= NUM_ENTRY; recv_id < NUM_MBUF; recv_id++){ - if(BUF_UNREGISTERED == MRbuf_list[recv_id].status) { - MRbuf_list[recv_id].status = BUF_UNREGISTERED; - MRbuf_list[recv_id].buf_size = buf_lenght; - break; - } - } - - return recv_id; - -} - -VAPI_wr_id_t -CTS_handshaking_protocol(RDMA_Info_Exchange *rdma_info) -{ - - VAPI_ret_t vstat; - VAPI_sr_desc_t sr_desc; - VAPI_sg_lst_entry_t sr_sg; - QP_info *qp; - VAPI_wr_id_t send_id; - VAPI_mr_hndl_t rep_mr_hndl; - VAPI_mrw_t rep_mr; - int recv_id; - char *bufptr = NULL; - - // search MRbuf_list for an available entry that - // has registered data buffer with size equal to rdma_info->buf_lenght - - CDEBUG(D_NET, "CTS_handshaking_protocol\n"); - - // register memory buffer for RDAM operation - - vstat = createMemRegion_RDMA(Hca_hndl, - Pd_hndl, - bufptr, - rdma_info->buf_length, - &rep_mr_hndl, - &rep_mr); - - - Local_rdma_info.opcode = Clear_To_send; - Local_rdma_info.recv_rdma_mr = rep_mr; - Local_rdma_info.recv_rdma_mr_hndl = rep_mr_hndl; - - if (vstat != VAPI_OK) { - CERROR("CST_handshaking_protocol: Failed registering a mem region" - "Len=%d. 
%s\n", rdma_info->buf_length, VAPI_strerror(vstat)); - Local_rdma_info.flag = RDMA_BUFFER_UNAVAILABLE; - } - else { - // successfully allcate reserved RDAM data buffer - recv_id = insert_MRbuf_list(rdma_info->buf_length); - - if(recv_id >= NUM_ENTRY) { - MRbuf_list[recv_id].buf_addr = rep_mr.start; - MRbuf_list[recv_id].mr = rep_mr; - MRbuf_list[recv_id].mr_hndl = rep_mr_hndl; - MRbuf_list[recv_id].ref_count = 0; - Local_rdma_info.flag = RDMA_BUFFER_RESERVED; - Local_rdma_info.buf_length = rdma_info->buf_length; - Local_rdma_info.raddr = rep_mr.start; - Local_rdma_info.rkey = rep_mr.r_key; - } - else { - CERROR("Can not find an entry in MRbuf_list - how could this happen\n"); - } - } - - // find a suitable/registered send_buf from MSbuf_list - send_id = search_send_buf(sizeof(RDMA_Info_Exchange)); - CDEBUG(D_NET, "CTS: current send id %d \n", send_id); - sr_desc.opcode = VAPI_SEND; - sr_desc.comp_type = VAPI_SIGNALED; - sr_desc.id = send_id + RDMA_CTS_ID; // this CST message ID - - // scatter and gather info - sr_sg.len = sizeof(RDMA_Info_Exchange); - sr_sg.lkey = MSbuf_list[send_id].mr.l_key; // use send MR - sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[send_id].buf_addr; - - // copy data to register send buffer - memcpy(&sr_sg.addr, &Local_rdma_info, sizeof(RDMA_Info_Exchange)); - - sr_desc.sg_lst_p = &sr_sg; - sr_desc.sg_lst_len = 1; // only 1 entry is used - sr_desc.fence = TRUE; - sr_desc.set_se = FALSE; - - // call VAPI_post_sr to send out this RTS message data - vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc); - - if (vstat != VAPI_OK) { - CERROR("CTS: VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat)); - } - - -} - - - -VAPI_ret_t Send_Large_Msg(char *buf, int buf_length) -{ - VAPI_ret_t vstat; - VAPI_sr_desc_t sr_desc; - VAPI_sg_lst_entry_t sr_sg; - QP_info *qp; - VAPI_mrw_t rep_mr; - VAPI_mr_hndl_t rep_mr_hndl; - int send_id; - VAPI_imm_data_t imm_data = 0XAAAA5555; - - - CDEBUG(D_NET, "Send_Large_Msg: Enter\n"); - - // register 
this large buf - // don't need to copy this buf to send buffer - vstat = createMemRegion_RDMA(Hca_hndl, - Pd_hndl, - buf, - buf_length, - &rep_mr_hndl, - &rep_mr); - - if (vstat != VAPI_OK) { - CERROR("Send_Large_M\sg: createMemRegion_RDMAi() failed (%s).\n", - VAPI_strerror(vstat)); - } - - - Local_rdma_info.send_rdma_mr = rep_mr; - Local_rdma_info.send_rdma_mr_hndl = rep_mr_hndl; - - // - // Prepare descriptor for send queue - // - - // ask for a remote rdma buffer with size buf_lenght - send_id = RTS_handshaking_protocol(buf_length); - - qp = &QP_list[send_id]; - - // wait for CTS message receiving from remote node - while(1){ - if(YES == Cts_Message_arrived) { - // receive CST message from remote node - // Rdma_info is available for use - break; - } - schedule_timeout(RTS_CTS_TIMEOUT); - } - - sr_desc.id = send_id + RDMA_OP_ID; - sr_desc.opcode = VAPI_RDMA_WRITE_WITH_IMM; - sr_desc.comp_type = VAPI_SIGNALED; - - // scatter and gather info - sr_sg.len = buf_length; - - // rdma mr - sr_sg.lkey = rep_mr.l_key; - sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) rep_mr.start; - sr_desc.sg_lst_p = &sr_sg; - sr_desc.sg_lst_len = 1; // only 1 entry is used - - // immediate data - not used here - sr_desc.imm_data = imm_data; - sr_desc.fence = TRUE; - sr_desc.set_se = FALSE; - - // RDAM operation only - // raddr and rkey is receiving from remote node - sr_desc.remote_addr = Rdma_info.raddr; - sr_desc.r_key = Rdma_info.rkey; - - // call VAPI_post_sr to send out this data - vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc); - - if (vstat != VAPI_OK) { - CERROR("VAPI_post_sr failed (%s).\n",VAPI_strerror_sym(vstat)); - } - -} - - - - - - -// -// repost_recv_buf -// post a used recv buffer back to recv WQE list -// wrq_id is used to indicate the starting position of recv-buffer -// -VAPI_ret_t -repost_recv_buf(QP_info *qp, - VAPI_wr_id_t wrq_id) -{ - VAPI_rr_desc_t rr; - VAPI_sg_lst_entry_t sg_entry; - VAPI_ret_t ret; - - CDEBUG(D_NET, "repost_recv_buf\n"); - - 
sg_entry.lkey = MRbuf_list[wrq_id].mr.l_key; - sg_entry.len = MRbuf_list[wrq_id].buf_size; - sg_entry.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MRbuf_list[wrq_id].buf_addr; - rr.opcode = VAPI_RECEIVE; - rr.comp_type = VAPI_SIGNALED; /* All with CQE (IB compliant) */ - rr.sg_lst_len = 1; /* single buffers */ - rr.sg_lst_p = &sg_entry; - rr.id = wrq_id; /* WQE id used is the index to buffers ptr array */ - - ret= VAPI_post_rr(qp->hca_hndl,qp->qp_hndl,&rr); - - if (ret != VAPI_OK){ - CERROR("failed reposting RQ WQE (%s) buffer \n",VAPI_strerror_sym(ret)); - return ret; - } - - CDEBUG(D_NET, "Successfully reposting an RQ WQE %d recv bufer \n", wrq_id); - - return ret ; -} - -// -// post_recv_bufs -// post "num_o_bufs" for receiving data -// each receiving buf (buffer starting address, size of buffer) -// each buffer is associated with an id -// -int -post_recv_bufs(VAPI_wr_id_t start_id) -{ - int i; - VAPI_rr_desc_t rr; - VAPI_sg_lst_entry_t sg_entry; - VAPI_ret_t ret; - - CDEBUG(D_NET, "post_recv_bufs\n"); - - for(i=0; i< NUM_ENTRY; i++) { - sg_entry.lkey = MRbuf_list[i].mr.l_key; - sg_entry.len = MRbuf_list[i].buf_size; - sg_entry.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MRbuf_list[i].buf_addr; - rr.opcode = VAPI_RECEIVE; - rr.comp_type = VAPI_SIGNALED; /* All with CQE (IB compliant) */ - rr.sg_lst_len = 1; /* single buffers */ - rr.sg_lst_p = &sg_entry; - rr.id = start_id+i; /* WQE id used is the index to buffers ptr array */ - - ret= VAPI_post_rr(QP_list[i].hca_hndl,QP_list[i].qp_hndl, &rr); - if (ret != VAPI_OK) { - CERROR("failed posting RQ WQE (%s)\n",VAPI_strerror_sym(ret)); - return i; - } - } - - return i; /* num of buffers posted */ -} - -int -post_RDMA_bufs(QP_info *qp, - void *buf_array, - unsigned int num_bufs, - unsigned int buf_size, - VAPI_wr_id_t start_id) -{ - - CDEBUG(D_NET, "post_RDMA_bufs \n"); - return YES; -} - - - -// -// LIB NAL -// assign function pointers to theirs corresponding entries -// - -nal_cb_t kibnal_lib = { - nal_data: 
&kibnal_data, /* NAL private data */ - cb_send: kibnal_send, - cb_send_pages: NULL, // not implemented - cb_recv: kibnal_recv, - cb_recv_pages: NULL, // not implemented - cb_read: kibnal_read, - cb_write: kibnal_write, - cb_callback: NULL, // not implemented - cb_malloc: kibnal_malloc, - cb_free: kibnal_free, - cb_map: NULL, // not implemented - cb_unmap: NULL, // not implemented - cb_map_pages: NULL, // not implemented - cb_unmap_pages: NULL, // not implemented - cb_printf: kibnal_printf, - cb_cli: kibnal_cli, - cb_sti: kibnal_sti, - cb_dist: kibnal_dist // no used at this moment -}; diff --git a/lustre/portals/knals/ibnal/ibnal_send_recv_self_testing.c b/lustre/portals/knals/ibnal/ibnal_send_recv_self_testing.c deleted file mode 100644 index 82defdb..0000000 --- a/lustre/portals/knals/ibnal/ibnal_send_recv_self_testing.c +++ /dev/null @@ -1,116 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * * - * * Based on ksocknal, qswnal, and gmnal - * * - * * Copyright (C) 2003 LANL - * * Author: HB Chen - * * Los Alamos National Lab - * * - * * Portals is free software; you can redistribute it and/or - * * modify it under the terms of version 2 of the GNU General Public - * * License as published by the Free Software Foundation. - * * - * * Portals is distributed in the hope that it will be useful, - * * but WITHOUT ANY WARRANTY; without even the implied warranty of - * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * * GNU General Public License for more details. - * * - * * You should have received a copy of the GNU General Public License - * * along with Portals; if not, write to the Free Software - * * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * * - * */ - -#include "ibnal.h" - - - -VAPI_ret_t ibnal_send_recv_self_testing() -{ - VAPI_ret_t vstat; - VAPI_sr_desc_t sr_desc; - VAPI_sg_lst_entry_t sr_sg; - QP_info *qp; - VAPI_wr_id_t send_id; - int buf_id; - char sbuf[KB_32]; - char rbuf[KB_32]; - int i; - int buf_length = KB_32; - VAPI_wc_desc_t comp_desc; - int num_send = 1; - int loop_count = 0; - - - printk("ibnal_send_recv_self_testing\n"); - - memset(&sbuf, 'a', KB_32); - memset(&rbuf, ' ', KB_32); - - send_id = 2222; - buf_id = 0; - - qp = &QP_list[0]; - - sr_desc.opcode = VAPI_SEND; - sr_desc.comp_type = VAPI_SIGNALED; - - // scatter and gather info - sr_sg.len = KB_32; - sr_sg.lkey = MSbuf_list[buf_id].mr.l_key; // use send MR - sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[buf_id].buf_addr; - - // copy data to register send buffer - memcpy(&sr_sg.addr, &buf, buf_length); - - sr_desc.sg_lst_p = &sr_sg; - sr_desc.sg_lst_len = 1; // only 1 entry is used - sr_desc.fence = TRUE; - sr_desc.set_se = FALSE; - - - // call VAPI_post_sr to send out this data - vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc); - - if (vstat != VAPI_OK) { - printk("VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat)); - } - - printk("VAPI_post_sr success.\n"); - - // poll for completion - - while( loop_count < 100 ){ - vstat = VAPI_poll_cq(qp->hca_hndl, qp->cq_hndl, &comp_desc); - if( vstat == VAPI_OK ) { - if(comp_desc.opcode == VAPI_CQE_SQ_SEND_DATA ) { - /* SEND completion */ - printk("received SQ completion\n"); - } - else { - if(comp_desc.opcode == VAPI_CQE_RQ_SEND_DATA ) { - /* RECEIVE completion */ - printk("received RQ completion\n"); - memcpy(&rbuf, (char *) MRbuf_list[buf_id].buf_addar, KB_32); - - int n; - - n = memcmp($sbuf, &rbuf, KB_32); - printk("compare sbuf and rbuf n = %d\n", n); - - } - else { - printk("unexpected completion opcode %d \n", comp_desc.opcode); - } - } - } - - loop_count++; - schedule_timeout(500); - } - - printk("end of ibnal_self_send_recv_testing\n"); - - -} diff --git 
a/lustre/portals/knals/ibnal/uagent.c b/lustre/portals/knals/ibnal/uagent.c deleted file mode 100644 index d7e939a..0000000 --- a/lustre/portals/knals/ibnal/uagent.c +++ /dev/null @@ -1,391 +0,0 @@ -#include -#include -#include - - -#include -#include -#include -#include - -#include -#include -#include -#include - -// Infiniband VAPI/EVAPI header files Mellanox MT23108 VAPI -#include -#include -#include -#include - -// Remote HCA Info information - typedef struct Remote_HCA_Info { - unsigned long opcode; - unsigned long length; - IB_lid_t dlid[256]; - VAPI_qp_num_t rqp_num[256]; - VAPI_rkey_t rkey; // for remote RDAM request - unsigned long vaddr1; // virtual address fisrt 4 bytes - unsigned long vaddr2; // virtual address second 4 bytes - u_int32_t size; // size of RDMA memory buffer - char dest_ip[256]; //destination server IP address - } Remote_HCA_Info; - -#define SHARED_SEGMENT_SIZE 0x10000 // 16KB shared memory between U and K - -// some internals opcodes for IB operations used in IBNAL -#define SEND_QP_INFO 0X00000001 -#define RECV_QP_INFO 0X00000010 -#define DEFAULT_SOCKET_PORT 11211 -#define LISTEN_QUEUE_SIZE 2048 -#define DEST_IP "10.128.105.26" - -// server_thread -// + wait for an incoming connection from remote node -// + receive remote HCA's data -// -// -// -// -// -void *server_thread(void *vargp) -{ - Remote_HCA_Info *hca_data; - Remote_HCA_Info hca_data_buffer; - - int serverfd; - int infd; - struct hostent *hp; - struct sockaddr_in serveraddr; - struct sockaddr_in clientaddr; - int sin_size=sizeof(struct sockaddr_in); - int bytes_recv; - int i; - - - hca_data = (Remote_HCA_Info *) vargp; - - if((serverfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - printf("server_thread couldnot create a socket \n"); - pthread_exit((void *) 0); - } - - printf("server_thread create a socket \n"); - - bzero((char *) &serveraddr, sizeof(serveraddr)); - - serveraddr.sin_family = AF_INET; - serveraddr.sin_addr.s_addr = htons(INADDR_ANY); - serveraddr.sin_port = 
htons((unsigned short) DEFAULT_SOCKET_PORT); - - if(bind(serverfd,(struct sockaddr *)&serveraddr,sizeof(struct sockaddr)) < 0) { - printf("server_thread couldnot bind to a socket \n"); - pthread_exit((void *) 0); - } - - printf("server_thread bind to a socket \n"); - - if(listen(serverfd, LISTEN_QUEUE_SIZE) < 0) { - printf("server_thread couldnot listen to a socket \n"); - pthread_exit((void *) 0); - } - - printf("server_thread listen to a socket \n"); - - // - // I only expect to receive one HCA data from a remote HCA - // - printf("server_thread: Waiting for a connection\n"); - infd= accept(serverfd,(struct sockaddr*)&clientaddr,&sin_size); - printf("server_thread: Got an incoming connection"); - - /* receive data from socket into buffer */ - bytes_recv = recv(infd, - &hca_data_buffer, - sizeof(Remote_HCA_Info), - 0); - - if(bytes_recv > 0) { -/* - printf("server_thread receive data\n"); - printf("opcode is 0x%X\n", hca_data_buffer.opcode); - printf("length is 0x%X\n", hca_data_buffer.length); - - for(i=0; i < 256; i++) { - printf("dlid %d is 0x%X\n", i, hca_data_buffer.dlid[i]); - printf("rqp_num %d is 0x%X\n", hca_data_buffer.rqp_num[i]); - } - - printf("rkey is 0x%X\n", hca_data_buffer.rkey); - printf("vaddr1 is 0x%X\n", hca_data_buffer.vaddr1); - printf("vaddr2 is 0x%X\n", hca_data_buffer.vaddr2); - printf("size is 0x%X\n", hca_data_buffer.size); - printf("After conversion hton \n"); - printf("opcode is 0x%X\n", htonl(hca_data_buffer.opcode)); - printf("length is 0x%X\n", htonl(hca_data_buffer.length)); - - for(i=0; i < 256; i++) { - printf("dlid %d is 0x%X\n", htons(hca_data_buffer.dlid[i])); - printf("rqp_num %d is 0x%X\n", htonl(hca_data_buffer.rqp_num[i])); - } - - printf("rkey is 0x%X\n", htonl(hca_data_buffer.rkey)); - printf("vaddr1 is 0x%X\n", htonl(hca_data_buffer.vaddr1)); - printf("vaddr2 is 0x%X\n", htonl(hca_data_buffer.vaddr2)); - printf("size is 0x%X\n", htonl(hca_data_buffer.size)); -*/ - - hca_data->opcode = ntohl(hca_data_buffer.opcode); // 
long - hca_data->length = ntohl(hca_data_buffer.length); // long - - for(i=0; i < 256; i++) { - hca_data->dlid[i] = ntohs(hca_data_buffer.dlid[i]); // u_int16 - hca_data->rqp_num[i] = ntohl(hca_data_buffer.rqp_num[i]);// u_int32 - } - - hca_data->rkey = ntohl(hca_data_buffer.rkey); // u_int32 - hca_data->vaddr1 = ntohl(hca_data_buffer.vaddr1); // first word u_int32 - hca_data->vaddr2 = ntohl(hca_data_buffer.vaddr2); // second word u_int32 - hca_data->size = ntohl(hca_data_buffer.size); // u_int32 - } - else { - printf("server_thread receive ERROR bytes_recv = %d\n", bytes_recv); - } - - close(infd); - close(serverfd); - - printf("server_thread EXIT \n"); - - pthread_exit((void *) 0); - -} - -// -// client_thread -// + connect to a remote server_thread -// + send local HCA's data to remote server_thread -// -void *client_thread(void *vargp) -{ - - Remote_HCA_Info *hca_data; - Remote_HCA_Info hca_data_buffer; - - int clientfd; - struct hostent *hp; - struct sockaddr_in clientaddr; - int bytes_send; - int i; - - hca_data = (Remote_HCA_Info *) vargp; - - if((clientfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - printf("client_thread couldnot create a socket \n"); - pthread_exit((void *) 0); - } - - printf("client_thread create a socket \n"); - - bzero((char *) &clientaddr, sizeof(clientaddr)); - - clientaddr.sin_family = AF_INET; - clientaddr.sin_addr.s_addr = inet_addr(hca_data->dest_ip); - printf("client_thread get server Ip address = %s\n", hca_data->dest_ip); - clientaddr.sin_port = htons((unsigned short) DEFAULT_SOCKET_PORT); - memset(&(clientaddr.sin_zero), '\0', 8); - - connect(clientfd, (struct sockaddr *) &clientaddr, sizeof(struct sockaddr)); - - printf("client_thread connect to server Ip address = %s\n", hca_data->dest_ip); - - hca_data_buffer.opcode = htonl(hca_data->opcode); // long - hca_data_buffer.length = htonl(hca_data->length); // long - - for(i=0; i < 256; i++) { - hca_data_buffer.dlid[i] = htons(hca_data->dlid[i]); // u_int16 - 
hca_data_buffer.rqp_num[i] = htonl(hca_data->rqp_num[i]);// u_int32 - } - - hca_data_buffer.rkey = htonl(hca_data->rkey); // u_int32 - hca_data_buffer.vaddr1 = htonl(hca_data->vaddr1); // first word u_int32 - hca_data_buffer.vaddr2 = htonl(hca_data->vaddr2); // second word u_int32 - hca_data_buffer.size = htonl(hca_data->size); // u_int32 - - bytes_send = send(clientfd, & hca_data_buffer, sizeof(Remote_HCA_Info), 0); - - if(bytes_send == sizeof(Remote_HCA_Info)) { - printf("client_thread: send successfully \n"); - } - else { - printf("client_thread: send failed \n"); - } - - printf("client_thread EXIT \n"); - - pthread_exit((void *) 0); -} - - -// -// main -// + create a shared-memory between this main()/user address and -// a kernel thread/kernel address space associated with inbal -// kernel module -// + access local HCA's data through this shared memory -// -// + create a server_thread for receiving remote HCA's data -// + create a client_thread for sending out local HCA's data -// + after receiving remote HCA's data update this shared memory -// -int main(int argc , char *argv[]) -{ - int segment_id; - struct shmid_ds shmbuffer; - int segment_size; - const int shared_segment_size = sizeof(Remote_HCA_Info); - key_t key = 999; - unsigned long raddr; - Remote_HCA_Info *shared_memory; - Remote_HCA_Info exchange_hca_data; - Remote_HCA_Info remote_hca_data; - int i; - - /* pthread */ - pthread_t sid; - pthread_t cid; - pthread_attr_t attr; - int rc, status; - - char dest_ip[256]; - - if(argc != 2) { - printf("USAGE: uagent server_ip_address\n"); - printf("argc = %d \n", argc); - exit(1); - } - - strcpy(&exchange_hca_data.dest_ip[0], argv[1]); - printf("the destinational server IP address = %s\n", - &exchange_hca_data.dest_ip); - - segment_id = shmget(key, shared_segment_size, IPC_CREAT | 0666); - - printf("sys_shmget is done segment_id = %d\n", segment_id); - - shared_memory = (Remote_HCA_Info *) shmat(segment_id, 0, 0); - - if(shared_memory == (char *) -1) { - 
printf("Shared memory attach failed shared_memory=%p\n",shared_memory); - exit(0); - } - - printf("shared menory attached at address %p\n", shared_memory); - - while (1) { - if(shared_memory->opcode == SEND_QP_INFO) { - printf("Local HCA data received from kernel thread\n"); - break; - } - usleep(1000); - continue; - } - - printf("Local HCA data received from kernel thread\n"); - - // save local HCA's data in exchange_hca_data - // - exchange_hca_data.opcode = shared_memory->opcode; - exchange_hca_data.length = shared_memory->length; - - for(i=0; i < 256; i++) { - exchange_hca_data.dlid[i] = shared_memory->dlid[i]; - exchange_hca_data.rqp_num[i] = shared_memory->rqp_num[i]; - } - - exchange_hca_data.rkey = shared_memory->rkey; - exchange_hca_data.vaddr1 = shared_memory->vaddr1; - exchange_hca_data.vaddr2 = shared_memory->vaddr2; - exchange_hca_data.size = shared_memory->size; - - /* Initialize and set thread detached attribute */ - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - - /* create a server thread for procsssing incoming remote node socket data */ - // - pthread_create(&sid, - &attr, - server_thread, - (Remote_HCA_Info *) &remote_hca_data); - - printf("Main: created a server thread \n"); - - sleep(10); - - /* create a clint thread to send out local HCA data to remote node */ - pthread_create(&cid, - &attr, - client_thread, - (Remote_HCA_Info *) &exchange_hca_data); - - printf("Main: created a client thread \n"); - - /* synchronization between server_thread and client_thread */ - pthread_attr_destroy(&attr); - - rc = pthread_join(sid, (void **) &status); - if(rc) { - printf("Error: return code from pthread_join() is %d\n", rc); - exit(-1); - } - - printf("completed join with thread %d status = %d\n", sid, status); - - rc = pthread_join(cid, (void **) &status); - if(rc) { - printf("Error: return code from pthread_join() is %d\n", rc); - exit(-1); - } - printf("completed join with thread %d status = %d\n", cid, 
status); - - // update shared memory with remote HCA's data - - shared_memory->opcode = RECV_QP_INFO; - shared_memory->length = remote_hca_data.length; - for(i=0; i < 256; i++) { - shared_memory->dlid[i] = remote_hca_data.dlid[i]; - shared_memory->rqp_num[i]= remote_hca_data.rqp_num[i]; - } - shared_memory->rkey = remote_hca_data.rkey; - shared_memory->vaddr1 = remote_hca_data.vaddr1; - shared_memory->vaddr2 = remote_hca_data.vaddr2; - shared_memory->size = remote_hca_data.size; - - sleep(5); - - shared_memory->opcode = RECV_QP_INFO; - shared_memory->length = remote_hca_data.length; - for(i=0; i < 256; i++) { - shared_memory->dlid[i] = remote_hca_data.dlid[i]; - shared_memory->rqp_num[i]= remote_hca_data.rqp_num[i]; - } - - shared_memory->rkey = remote_hca_data.rkey; - shared_memory->vaddr1 = remote_hca_data.vaddr1; - shared_memory->vaddr2 = remote_hca_data.vaddr2; - shared_memory->size = remote_hca_data.size; - - sleep(10); - -// shmdt(shared_memory); - - printf("uagent is DONE \n"); - - - - exit(0); - -} - diff --git a/lustre/portals/knals/ibnal/.cvsignore b/lustre/portals/knals/iibnal/.cvsignore similarity index 100% rename from lustre/portals/knals/ibnal/.cvsignore rename to lustre/portals/knals/iibnal/.cvsignore index 48b17e9..5ed596b 100644 --- a/lustre/portals/knals/ibnal/.cvsignore +++ b/lustre/portals/knals/iibnal/.cvsignore @@ -1,10 +1,10 @@ .deps Makefile +.*.cmd autoMakefile.in autoMakefile *.ko *.mod.c .*.flags -.*.cmd .tmp_versions .depend diff --git a/lustre/portals/knals/iibnal/Makefile.in b/lustre/portals/knals/iibnal/Makefile.in new file mode 100644 index 0000000..e7934e2 --- /dev/null +++ b/lustre/portals/knals/iibnal/Makefile.in @@ -0,0 +1,6 @@ +MODULES := kiibnal +kiibnal-objs := iibnal.o iibnal_cb.o + +EXTRA_POST_CFLAGS := @IIBCPPFLAGS@ + +@INCLUDE_RULES@ diff --git a/lustre/portals/knals/iibnal/Makefile.mk b/lustre/portals/knals/iibnal/Makefile.mk new file mode 100644 index 0000000..0459a20 --- /dev/null +++ 
b/lustre/portals/knals/iibnal/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include $(src)/../../Kernelenv + +obj-y += kiibnal.o +kiibnal-objs := iibnal.o iibnal_cb.o + diff --git a/lustre/portals/knals/iibnal/autoMakefile.am b/lustre/portals/knals/iibnal/autoMakefile.am new file mode 100644 index 0000000..251df66 --- /dev/null +++ b/lustre/portals/knals/iibnal/autoMakefile.am @@ -0,0 +1,15 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +if MODULES +if !CRAY_PORTALS +if BUILD_IIBNAL +modulenet_DATA = kiibnal$(KMODEXT) +endif +endif +endif + +MOSTLYCLEANFILES = *.o *.ko *.mod.c +DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h diff --git a/lustre/portals/knals/iibnal/iibnal.c b/lustre/portals/knals/iibnal/iibnal.c new file mode 100644 index 0000000..09908c9 --- /dev/null +++ b/lustre/portals/knals/iibnal/iibnal.c @@ -0,0 +1,1713 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "iibnal.h" + +nal_t kibnal_api; +ptl_handle_ni_t kibnal_ni; +kib_tunables_t kibnal_tunables; + +kib_data_t kibnal_data = { + .kib_service_id = IBNAL_SERVICE_NUMBER, +}; + +#ifdef CONFIG_SYSCTL +#define IBNAL_SYSCTL 202 + +#define IBNAL_SYSCTL_TIMEOUT 1 + +static ctl_table kibnal_ctl_table[] = { + {IBNAL_SYSCTL_TIMEOUT, "timeout", + &kibnal_tunables.kib_io_timeout, sizeof (int), + 0644, NULL, &proc_dointvec}, + { 0 } +}; + +static ctl_table kibnal_top_ctl_table[] = { + {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table}, + { 0 } +}; +#endif + +#ifdef unused +void +print_service(IB_SERVICE_RECORD *service, char *tag, int rc) +{ + char name[32]; + + if (service == NULL) + { + CWARN("tag : %s\n" + "status : %d (NULL)\n", tag, rc); + return; + } + strncpy (name, service->ServiceName, sizeof(name)-1); + name[sizeof(name)-1] = 0; + + CWARN("tag : %s\n" + "status : %d\n" + "service id: "LPX64"\n" + "name : %s\n" + "NID : "LPX64"\n", tag, rc, + service->RID.ServiceID, name, + *kibnal_service_nid_field(service)); +} +#endif + +static void +kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod, + FSTATUS frc, uint32 madrc) +{ + *(FSTATUS *)arg = frc; + up (&kibnal_data.kib_nid_signal); +} + +#if IBNAL_CHECK_ADVERT +static void +kibnal_service_query_done (void *arg, QUERY *qry, + QUERY_RESULT_VALUES *qry_result) +{ + FSTATUS frc = qry_result->Status; + + if (frc != FSUCCESS && + qry_result->ResultDataSize == 0) + frc = FERROR; + + *(FSTATUS *)arg = frc; + up (&kibnal_data.kib_nid_signal); +} + +static void +kibnal_check_advert (void) +{ + QUERY *qry; + IB_SERVICE_RECORD *svc; + FSTATUS frc; + FSTATUS frc2; + + PORTAL_ALLOC(qry, sizeof(*qry)); + if (qry == NULL) + return; + + memset (qry, 0, sizeof(*qry)); + qry->InputType = InputTypeServiceRecord; + qry->OutputType = OutputTypeServiceRecord; + qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; + svc = &qry->InputValue.ServiceRecordValue.ServiceRecord; + 
kibnal_set_service_keys(svc, kibnal_data.kib_nid); + + frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + qry, + kibnal_service_query_done, + NULL, &frc2); + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("Immediate error %d checking SM service\n", frc); + } else { + down (&kibnal_data.kib_nid_signal); + frc = frc2; + + if (frc != 0) + CERROR ("Error %d checking SM service\n", rc); + } + + return (rc); +} +#endif + +static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type) +{ + IB_SERVICE_RECORD *svc; + + memset (fod, 0, sizeof(*fod)); + fod->Type = type; + + svc = &fod->Value.ServiceRecordValue.ServiceRecord; + svc->RID.ServiceID = kibnal_data.kib_service_id; + svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid; + svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX; + svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey; + svc->ServiceLease = 0xffffffff; + + kibnal_set_service_keys(svc, kibnal_data.kib_nid); +} + +static int +kibnal_advertise (void) +{ + FABRIC_OPERATION_DATA *fod; + IB_SERVICE_RECORD *svc; + FSTATUS frc; + FSTATUS frc2; + + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + PORTAL_ALLOC(fod, sizeof(*fod)); + if (fod == NULL) + return (-ENOMEM); + + fill_fod(fod, FabOpSetServiceRecord); + svc = &fod->Value.ServiceRecordValue.ServiceRecord; + + CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", + svc->RID.ServiceID, + svc->ServiceName, *kibnal_service_nid_field(svc)); + + frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + fod, kibnal_service_setunset_done, + NULL, &frc2); + + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("Immediate error %d advertising NID "LPX64"\n", + frc, kibnal_data.kib_nid); + goto out; + } + + down (&kibnal_data.kib_nid_signal); + + frc = frc2; + if (frc != FSUCCESS) + CERROR ("Error %d advertising BUD "LPX64"\n", + frc, kibnal_data.kib_nid); +out: + PORTAL_FREE(fod, 
sizeof(*fod)); + return (frc == FSUCCESS) ? 0 : -EINVAL; +} + +static void +kibnal_unadvertise (int expect_success) +{ + FABRIC_OPERATION_DATA *fod; + IB_SERVICE_RECORD *svc; + FSTATUS frc; + FSTATUS frc2; + + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + PORTAL_ALLOC(fod, sizeof(*fod)); + if (fod == NULL) + return; + + fill_fod(fod, FabOpDeleteServiceRecord); + svc = &fod->Value.ServiceRecordValue.ServiceRecord; + + CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n", + svc->ServiceName, *kibnal_service_nid_field(svc)); + + frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + fod, kibnal_service_setunset_done, + NULL, &frc2); + + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("Immediate error %d unadvertising NID "LPX64"\n", + frc, kibnal_data.kib_nid); + goto out; + } + + down (&kibnal_data.kib_nid_signal); + + if ((frc2 == FSUCCESS) == !!expect_success) + goto out; + + if (expect_success) + CERROR("Error %d unadvertising NID "LPX64"\n", + frc2, kibnal_data.kib_nid); + else + CWARN("Removed conflicting NID "LPX64"\n", + kibnal_data.kib_nid); + out: + PORTAL_FREE(fod, sizeof(*fod)); +} + +static int +kibnal_set_mynid(ptl_nid_t nid) +{ + struct timeval tv; + lib_ni_t *ni = &kibnal_lib.libnal_ni; + int rc; + FSTATUS frc; + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", + nid, ni->ni_pid.nid); + + do_gettimeofday(&tv); + + down (&kibnal_data.kib_nid_mutex); + + if (nid == kibnal_data.kib_nid) { + /* no change of NID */ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", + kibnal_data.kib_nid, nid); + + if (kibnal_data.kib_nid != PTL_NID_ANY) { + + kibnal_unadvertise (1); + + frc = iibt_cm_cancel(kibnal_data.kib_cep); + if (frc != FSUCCESS && frc != FPENDING) + CERROR ("Error %d stopping listener\n", frc); + + frc = iibt_cm_destroy_cep(kibnal_data.kib_cep); + if (frc != FSUCCESS) + CERROR ("Error %d destroying CEP\n", frc); + + kibnal_data.kib_cep = NULL; + } + + 
kibnal_data.kib_nid = ni->ni_pid.nid = nid; + kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + /* Delete all existing peers and their connections after new + * NID/incarnation set to ensure no old connections in our brave + * new world. */ + kibnal_del_peer (PTL_NID_ANY, 0); + + if (kibnal_data.kib_nid == PTL_NID_ANY) { + /* No new NID to install */ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + /* remove any previous advert (crashed node etc) */ + kibnal_unadvertise(0); + + kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE); + if (kibnal_data.kib_cep == NULL) { + CERROR ("Can't create CEP\n"); + rc = -ENOMEM; + } else { + CM_LISTEN_INFO info; + memset (&info, 0, sizeof(info)); + info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id; + + frc = iibt_cm_listen(kibnal_data.kib_cep, &info, + kibnal_listen_callback, NULL); + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("iibt_cm_listen error: %d\n", frc); + rc = -EINVAL; + } else { + rc = 0; + } + } + + if (rc == 0) { + rc = kibnal_advertise(); + if (rc == 0) { +#if IBNAL_CHECK_ADVERT + kibnal_check_advert(); +#endif + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + iibt_cm_cancel (kibnal_data.kib_cep); + iibt_cm_destroy_cep (kibnal_data.kib_cep); + /* remove any peers that sprung up while I failed to + * advertise myself */ + kibnal_del_peer (PTL_NID_ANY, 0); + } + + kibnal_data.kib_nid = PTL_NID_ANY; + up (&kibnal_data.kib_nid_mutex); + return (rc); +} + +kib_peer_t * +kibnal_create_peer (ptl_nid_t nid) +{ + kib_peer_t *peer; + + LASSERT (nid != PTL_NID_ANY); + + PORTAL_ALLOC (peer, sizeof (*peer)); + if (peer == NULL) + return (NULL); + + memset(peer, 0, sizeof(*peer)); /* zero flags etc */ + + peer->ibp_nid = nid; + atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */ + + INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ + INIT_LIST_HEAD (&peer->ibp_conns); + INIT_LIST_HEAD (&peer->ibp_tx_queue); + + peer->ibp_reconnect_time = jiffies; + 
peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + + atomic_inc (&kibnal_data.kib_npeers); + return (peer); +} + +void +kibnal_destroy_peer (kib_peer_t *peer) +{ + + LASSERT (atomic_read (&peer->ibp_refcount) == 0); + LASSERT (peer->ibp_persistence == 0); + LASSERT (!kibnal_peer_active(peer)); + LASSERT (peer->ibp_connecting == 0); + LASSERT (list_empty (&peer->ibp_conns)); + LASSERT (list_empty (&peer->ibp_tx_queue)); + + PORTAL_FREE (peer, sizeof (*peer)); + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. */ + atomic_dec (&kibnal_data.kib_npeers); +} + +/* the caller is responsible for accounting for the additional reference + * that this creates */ +kib_peer_t * +kibnal_find_peer_locked (ptl_nid_t nid) +{ + struct list_head *peer_list = kibnal_nid2peerlist (nid); + struct list_head *tmp; + kib_peer_t *peer; + + list_for_each (tmp, peer_list) { + + peer = list_entry (tmp, kib_peer_t, ibp_list); + + LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ + peer->ibp_connecting != 0 || /* creating conns */ + !list_empty (&peer->ibp_conns)); /* active conn */ + + if (peer->ibp_nid != nid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", + peer, nid, atomic_read (&peer->ibp_refcount)); + return (peer); + } + return (NULL); +} + +kib_peer_t * +kibnal_get_peer (ptl_nid_t nid) +{ + kib_peer_t *peer; + + read_lock (&kibnal_data.kib_global_lock); + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) /* +1 ref for caller? 
*/ + kib_peer_addref(peer); + read_unlock (&kibnal_data.kib_global_lock); + + return (peer); +} + +void +kibnal_unlink_peer_locked (kib_peer_t *peer) +{ + LASSERT (peer->ibp_persistence == 0); + LASSERT (list_empty(&peer->ibp_conns)); + + LASSERT (kibnal_peer_active(peer)); + list_del_init (&peer->ibp_list); + /* lose peerlist's ref */ + kib_peer_decref(peer); +} + +static int +kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) +{ + kib_peer_t *peer; + struct list_head *ptmp; + int i; + + read_lock (&kibnal_data.kib_global_lock); + + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { + + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (index-- > 0) + continue; + + *nidp = peer->ibp_nid; + *persistencep = peer->ibp_persistence; + + read_unlock (&kibnal_data.kib_global_lock); + return (0); + } + } + + read_unlock (&kibnal_data.kib_global_lock); + return (-ENOENT); +} + +static int +kibnal_add_persistent_peer (ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + kib_peer_t *peer2; + + if (nid == PTL_NID_ANY) + return (-EINVAL); + + peer = kibnal_create_peer (nid); + if (peer == NULL) + return (-ENOMEM); + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + peer2 = kibnal_find_peer_locked (nid); + if (peer2 != NULL) { + kib_peer_decref (peer); + peer = peer2; + } else { + /* peer table takes existing ref on peer */ + list_add_tail (&peer->ibp_list, + kibnal_nid2peerlist (nid)); + } + + peer->ibp_persistence++; + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + return (0); +} + +static void +kibnal_del_peer_locked (kib_peer_t *peer, int single_share) +{ + struct list_head *ctmp; + struct list_head *cnxt; + kib_conn_t *conn; + + if (!single_share) + peer->ibp_persistence = 0; + else if (peer->ibp_persistence > 0) + peer->ibp_persistence--; + + 
if (peer->ibp_persistence != 0) + return; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + kibnal_close_conn_locked (conn, 0); + } + + /* NB peer unlinks itself when last conn is closed */ +} + +int +kibnal_del_peer (ptl_nid_t nid, int single_share) +{ + unsigned long flags; + struct list_head *ptmp; + struct list_head *pnxt; + kib_peer_t *peer; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; + else { + lo = 0; + hi = kibnal_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) + continue; + + kibnal_del_peer_locked (peer, single_share); + rc = 0; /* matched something */ + + if (single_share) + goto out; + } + } + out: + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + return (rc); +} + +static kib_conn_t * +kibnal_get_conn_by_idx (int index) +{ + kib_peer_t *peer; + struct list_head *ptmp; + kib_conn_t *conn; + struct list_head *ctmp; + int i; + + read_lock (&kibnal_data.kib_global_lock); + + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { + + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence > 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + list_for_each (ctmp, &peer->ibp_conns) { + if (index-- > 0) + continue; + + conn = list_entry (ctmp, kib_conn_t, ibc_list); + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); 
+ read_unlock (&kibnal_data.kib_global_lock); + return (conn); + } + } + } + + read_unlock (&kibnal_data.kib_global_lock); + return (NULL); +} + +kib_conn_t * +kibnal_create_conn (void) +{ + kib_conn_t *conn; + int i; + __u64 vaddr = 0; + __u64 vaddr_base; + int page_offset; + int ipage; + int rc; + FSTATUS frc; + union { + IB_QP_ATTRIBUTES_CREATE qp_create; + IB_QP_ATTRIBUTES_MODIFY qp_attr; + } params; + + PORTAL_ALLOC (conn, sizeof (*conn)); + if (conn == NULL) { + CERROR ("Can't allocate connection\n"); + return (NULL); + } + + /* zero flags, NULL pointers etc... */ + memset (conn, 0, sizeof (*conn)); + + INIT_LIST_HEAD (&conn->ibc_tx_queue); + INIT_LIST_HEAD (&conn->ibc_active_txs); + spin_lock_init (&conn->ibc_lock); + + atomic_inc (&kibnal_data.kib_nconns); + /* well not really, but I call destroy() on failure, which decrements */ + + PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); + if (conn->ibc_rxs == NULL) + goto failed; + memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); + + rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1); + if (rc != 0) + goto failed; + + vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; + + for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { + struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; + kib_rx_t *rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + if (kibnal_whole_mem()) + rx->rx_vaddr = kibnal_page2phys(page) + + page_offset + + kibnal_data.kib_md.md_addr; + else + rx->rx_vaddr = vaddr; + + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); + + page_offset += IBNAL_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= IBNAL_RX_MSG_PAGES); + } + } + + params.qp_create = (IB_QP_ATTRIBUTES_CREATE) { + .Type = QPTypeReliableConnected, + .SendQDepth = IBNAL_TX_MAX_SG * + 
IBNAL_MSG_QUEUE_SIZE, + .RecvQDepth = IBNAL_MSG_QUEUE_SIZE, + .SendDSListDepth = 1, + .RecvDSListDepth = 1, + .SendCQHandle = kibnal_data.kib_cq, + .RecvCQHandle = kibnal_data.kib_cq, + .PDHandle = kibnal_data.kib_pd, + .SendSignaledCompletions = TRUE, + }; + frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL, + &conn->ibc_qp, &conn->ibc_qp_attrs); + if (frc != FSUCCESS) { /* QP creation reports an FSTATUS in frc; rc is still 0 from kibnal_alloc_pages() */ + CERROR ("Failed to create queue pair: %d\n", frc); + goto failed; + } + + /* Mark QP created */ + conn->ibc_state = IBNAL_CONN_INIT_QP; + + params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateInit, + .Attrs = (IB_QP_ATTR_PORTGUID | + IB_QP_ATTR_PKEYINDEX | + IB_QP_ATTR_ACCESSCONTROL), + .PortGUID = kibnal_data.kib_port_guid, + .PkeyIndex = 0, + .AccessControl = { + .s = { + .RdmaWrite = 1, + .RdmaRead = 1, + }, + }, + }; + rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL); + if (rc != 0) { + CERROR ("Failed to modify queue pair: %d\n", rc); + goto failed; + } + + /* 1 ref for caller */ + atomic_set (&conn->ibc_refcount, 1); + return (conn); + + failed: + kibnal_destroy_conn (conn); + return (NULL); +} + +void +kibnal_destroy_conn (kib_conn_t *conn) +{ + int rc; + FSTATUS frc; + + CDEBUG (D_NET, "connection %p\n", conn); + + LASSERT (atomic_read (&conn->ibc_refcount) == 0); + LASSERT (list_empty(&conn->ibc_tx_queue)); + LASSERT (list_empty(&conn->ibc_active_txs)); + LASSERT (conn->ibc_nsends_posted == 0); + LASSERT (conn->ibc_connreq == NULL); + + switch (conn->ibc_state) { + case IBNAL_CONN_DISCONNECTED: + /* called after connection sequence initiated */ + /* fall through */ + + case IBNAL_CONN_INIT_QP: + /* _destroy includes an implicit Reset of the QP which + * discards posted work */ + rc = iibt_qp_destroy(conn->ibc_qp); + if (rc != 0) + CERROR("Can't destroy QP: %d\n", rc); + /* fall through */ + + case IBNAL_CONN_INIT_NOTHING: + break; + + default: + LASSERT (0); + } + + if (conn->ibc_cep != NULL) { + frc = iibt_cm_destroy_cep(conn->ibc_cep); + if (frc != 0) + 
CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, + frc); + } + + if (conn->ibc_rx_pages != NULL) + kibnal_free_pages(conn->ibc_rx_pages); + + if (conn->ibc_rxs != NULL) + PORTAL_FREE(conn->ibc_rxs, + IBNAL_RX_MSGS * sizeof(kib_rx_t)); + + if (conn->ibc_peer != NULL) + kib_peer_decref(conn->ibc_peer); + + PORTAL_FREE(conn, sizeof (*conn)); + + atomic_dec(&kibnal_data.kib_nconns); + + if (atomic_read (&kibnal_data.kib_nconns) == 0 && + kibnal_data.kib_shutdown) { + /* I just nuked the last connection on shutdown; wake up + * everyone so they can exit. */ + wake_up_all(&kibnal_data.kib_sched_waitq); + wake_up_all(&kibnal_data.kib_connd_waitq); + } +} + +void +kibnal_put_conn (kib_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + + LASSERT (atomic_read (&conn->ibc_refcount) > 0); + if (!atomic_dec_and_test (&conn->ibc_refcount)) + return; + + /* must disconnect before dropping the final ref */ + LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + + list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); +} + +static int +kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + count++; + kibnal_close_conn_locked (conn, why); + } + + return (count); +} + +int +kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + if (conn->ibc_incarnation 
== incarnation) + continue; + + CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", + peer->ibp_nid, conn->ibc_incarnation, incarnation); + + count++; + kibnal_close_conn_locked (conn, -ESTALE); + } + + return (count); +} + +static int +kibnal_close_matching_conns (ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; + else { + lo = 0; + hi = kibnal_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) + continue; + + count += kibnal_close_peer_conns_locked (peer, 0); + } + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == PTL_NID_ANY) + return (0); + + return (count == 0 ? 
-ENOENT : 0); +} + +static int +kibnal_cmd(struct portals_cfg *pcfg, void * private) +{ + int rc = -EINVAL; + ENTRY; + + LASSERT (pcfg != NULL); + + switch(pcfg->pcfg_command) { + case NAL_CMD_GET_PEER: { + ptl_nid_t nid = 0; + int share_count = 0; + + rc = kibnal_get_peer_info(pcfg->pcfg_count, + &nid, &share_count); + pcfg->pcfg_nid = nid; + pcfg->pcfg_size = 0; + pcfg->pcfg_id = 0; + pcfg->pcfg_misc = 0; + pcfg->pcfg_count = 0; + pcfg->pcfg_wait = share_count; + break; + } + case NAL_CMD_ADD_PEER: { + rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); + break; + } + case NAL_CMD_DEL_PEER: { + rc = kibnal_del_peer (pcfg->pcfg_nid, + /* flags == single_share */ + pcfg->pcfg_flags != 0); + break; + } + case NAL_CMD_GET_CONN: { + kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); + + if (conn == NULL) + rc = -ENOENT; + else { + rc = 0; + pcfg->pcfg_nid = conn->ibc_peer->ibp_nid; + pcfg->pcfg_id = 0; + pcfg->pcfg_misc = 0; + pcfg->pcfg_flags = 0; + kibnal_put_conn (conn); + } + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = kibnal_close_matching_conns (pcfg->pcfg_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + if (pcfg->pcfg_nid == PTL_NID_ANY) + rc = -EINVAL; + else + rc = kibnal_set_mynid (pcfg->pcfg_nid); + break; + } + } + + RETURN(rc); +} + +void +kibnal_free_pages (kib_pages_t *p) +{ + int npages = p->ibp_npages; + int rc; + int i; + + if (p->ibp_mapped) { + rc = iibt_deregister_memory(p->ibp_handle); + if (rc != 0) + CERROR ("Deregister error: %d\n", rc); + } + + for (i = 0; i < npages; i++) + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + + PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); +} + +int +kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) +{ + kib_pages_t *p; + __u64 *phys_pages; + int i; + FSTATUS frc; + IB_ACCESS_CONTROL access; + + memset(&access, 0, sizeof(access)); + access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaRead = 1; + access.s.RdmaWrite = 1; + + 
PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); + if (p == NULL) { + CERROR ("Can't allocate buffer %d\n", npages); + return (-ENOMEM); + } + + memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = alloc_page (GFP_KERNEL); + if (p->ibp_pages[i] == NULL) { + CERROR ("Can't allocate page %d of %d\n", i, npages); + kibnal_free_pages(p); + return (-ENOMEM); + } + } + + if (kibnal_whole_mem()) + goto out; + + PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); + if (phys_pages == NULL) { + CERROR ("Can't allocate physarray for %d pages\n", npages); + /* XXX free ibp_pages? */ + kibnal_free_pages(p); + return (-ENOMEM); + } + + /* if we were using the _contig_ registration variant we would have + * an array of PhysAddr/Length pairs, but the discontiguous variant + * just takes the PhysAddr */ + for (i = 0; i < npages; i++) + phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]); + + frc = iibt_register_physical_memory(kibnal_data.kib_hca, + 0, /* requested vaddr */ + phys_pages, npages, + 0, /* offset */ + kibnal_data.kib_pd, + access, + &p->ibp_handle, &p->ibp_vaddr, + &p->ibp_lkey, &p->ibp_rkey); + + PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); + + if (frc != FSUCCESS) { + CERROR ("Error %d mapping %d pages\n", frc, npages); + kibnal_free_pages(p); + return (-ENOMEM); + } + + CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" " + "lkey %x rkey %x\n", npages, p->ibp_handle, + p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey); + + p->ibp_mapped = 1; +out: + *pp = p; + return (0); +} + +static int +kibnal_setup_tx_descs (void) +{ + int ipage = 0; + int page_offset = 0; + __u64 vaddr; + __u64 vaddr_base; + struct page *page; + kib_tx_t *tx; + int i; + int rc; + + /* pre-mapped messages are not bigger than 1 page */ + LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); + + rc 
= kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, + 0); + if (rc != 0) + return (rc); + + /* ignored for the whole_mem case */ + vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; + + for (i = 0; i < IBNAL_TX_MSGS; i++) { + page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; + tx = &kibnal_data.kib_tx_descs[i]; + + memset (tx, 0, sizeof(*tx)); /* zero flags etc */ + + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + if (kibnal_whole_mem()) + tx->tx_vaddr = kibnal_page2phys(page) + + page_offset + + kibnal_data.kib_md.md_addr; + else + tx->tx_vaddr = vaddr; + + tx->tx_isnblk = (i >= IBNAL_NTX); + tx->tx_mapped = KIB_TX_UNMAPPED; + + CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", + i, tx, tx->tx_msg, tx->tx_vaddr); + + if (tx->tx_isnblk) + list_add (&tx->tx_list, + &kibnal_data.kib_idle_nblk_txs); + else + list_add (&tx->tx_list, + &kibnal_data.kib_idle_txs); + + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); + + page_offset += IBNAL_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= IBNAL_TX_MSG_PAGES); + } + } + + return (0); +} + +static void +kibnal_api_shutdown (nal_t *nal) +{ + int i; + int rc; + + if (nal->nal_refct != 0) { + /* This module got the first ref */ + PORTAL_MODULE_UNUSE; + return; + } + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + LASSERT(nal == &kibnal_api); + + switch (kibnal_data.kib_init) { + default: + CERROR ("Unexpected state %d\n", kibnal_data.kib_init); + LBUG(); + + case IBNAL_INIT_ALL: + /* stop calls to nal_cmd */ + libcfs_nal_cmd_unregister(IIBNAL); + /* No new peers */ + + /* resetting my NID to unadvertises me, removes my + * listener and nukes all current peers */ + kibnal_set_mynid (PTL_NID_ANY); + + /* Wait for all peer state to clean up (crazy) */ + i = 2; + while (atomic_read (&kibnal_data.kib_npeers) != 0) { + i++; + 
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d peers to disconnect (can take a few seconds)\n", + atomic_read (&kibnal_data.kib_npeers)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + /* fall through */ + + case IBNAL_INIT_CQ: + rc = iibt_cq_destroy(kibnal_data.kib_cq); + if (rc != 0) + CERROR ("Destroy CQ error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_TXD: + kibnal_free_pages (kibnal_data.kib_tx_pages); + /* fall through */ + + case IBNAL_INIT_MR: + if (kibnal_data.kib_md.md_handle != NULL) { + rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle); + if (rc != FSUCCESS) + CERROR ("Deregister memory: %d\n", rc); + } + /* fall through */ + +#if IBNAL_FMR + case IBNAL_INIT_FMR: + rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); + if (rc != 0) + CERROR ("Destroy FMR pool error: %d\n", rc); + /* fall through */ +#endif + case IBNAL_INIT_PD: + rc = iibt_pd_free(kibnal_data.kib_pd); + if (rc != 0) + CERROR ("Destroy PD error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_SD: + rc = iibt_sd_deregister(kibnal_data.kib_sd); + if (rc != 0) + CERROR ("Deregister SD error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_PORT: + /* XXX ??? 
*/ + /* fall through */ + + case IBNAL_INIT_PORTATTRS: + PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList, + kibnal_data.kib_hca_attrs.PortAttributesListSize); + /* fall through */ + + case IBNAL_INIT_HCA: + rc = iibt_close_hca(kibnal_data.kib_hca); + if (rc != 0) + CERROR ("Close HCA error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_LIB: + lib_fini(&kibnal_lib); + /* fall through */ + + case IBNAL_INIT_DATA: + /* Module refcount only gets to zero when all peers + * have been closed so all lists must be empty */ + LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (kibnal_data.kib_peers != NULL); + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + LASSERT (list_empty (&kibnal_data.kib_peers[i])); + } + LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); + LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); + LASSERT (list_empty (&kibnal_data.kib_sched_txq)); + LASSERT (list_empty (&kibnal_data.kib_connd_conns)); + LASSERT (list_empty (&kibnal_data.kib_connd_peers)); + + /* flag threads to terminate; wake and wait for them to die */ + kibnal_data.kib_shutdown = 1; + wake_up_all (&kibnal_data.kib_sched_waitq); + wake_up_all (&kibnal_data.kib_connd_waitq); + + i = 2; + while (atomic_read (&kibnal_data.kib_nthreads) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ + "Waiting for %d threads to terminate\n", + atomic_read (&kibnal_data.kib_nthreads)); + set_current_state (TASK_INTERRUPTIBLE); + schedule_timeout (HZ); + } + /* fall through */ + + case IBNAL_INIT_NOTHING: + break; + } + + if (kibnal_data.kib_tx_descs != NULL) + PORTAL_FREE (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + + if (kibnal_data.kib_peers != NULL) + PORTAL_FREE (kibnal_data.kib_peers, + sizeof (struct list_head) * + kibnal_data.kib_peer_hash_size); + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); + + kibnal_data.kib_init = IBNAL_INIT_NOTHING; +} + +#define roundup_power(val, power) \ + ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) ) + +/* this isn't very portable or sturdy in the face of funny mem/bus configs */ +static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr) +{ + struct sysinfo si; + __u64 ret; + + /* XXX we don't bother with first-gen cards */ + if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101) + return 0ULL; + + si_meminfo(&si); + ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit; + return roundup_power(ret, 128 * 1024 * 1024); +} +#undef roundup_power + +static int +kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) +{ + ptl_process_id_t process_id; + int pkmem = atomic_read(&portal_kmemory); + IB_PORT_ATTRIBUTES *pattr; + FSTATUS frc; + int rc; + int n; + int i; + + LASSERT (nal == &kibnal_api); + + if (nal->nal_refct != 0) { + if (actual_limits != NULL) + *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; + /* This module got the first ref */ + PORTAL_MODULE_USE; + return (PTL_OK); + } + + LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); + + frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, + &kibnal_data.kib_interfaces); + if (frc != FSUCCESS) { + 
CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n", + frc); + return -ENOSYS; + } + + init_MUTEX (&kibnal_data.kib_nid_mutex); + init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal); + kibnal_data.kib_nid = PTL_NID_ANY; + + rwlock_init(&kibnal_data.kib_global_lock); + + kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; + PORTAL_ALLOC (kibnal_data.kib_peers, + sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); + if (kibnal_data.kib_peers == NULL) { + goto failed; + } + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); + + spin_lock_init (&kibnal_data.kib_connd_lock); + INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); + INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); + init_waitqueue_head (&kibnal_data.kib_connd_waitq); + + spin_lock_init (&kibnal_data.kib_sched_lock); + INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); + INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); + init_waitqueue_head (&kibnal_data.kib_sched_waitq); + + spin_lock_init (&kibnal_data.kib_tx_lock); + INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); + INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); + init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); + + PORTAL_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + if (kibnal_data.kib_tx_descs == NULL) { + CERROR ("Can't allocate tx descs\n"); + goto failed; + } + + /* lists/ptrs/locks initialised */ + kibnal_data.kib_init = IBNAL_INIT_DATA; + /*****************************************************/ + + process_id.pid = 0; + process_id.nid = kibnal_data.kib_nid; + + rc = lib_init(&kibnal_lib, nal, process_id, + requested_limits, actual_limits); + if (rc != PTL_OK) { + CERROR("lib_init failed: error %d\n", rc); + goto failed; + } + + /* lib interface initialised */ + kibnal_data.kib_init = IBNAL_INIT_LIB; + /*****************************************************/ + + for (i = 0; i < IBNAL_N_SCHED; i++) { + rc = kibnal_thread_start (kibnal_scheduler, (void *)i); + if (rc != 0) 
{ + CERROR("Can't spawn iibnal scheduler[%d]: %d\n", + i, rc); + goto failed; + } + } + + rc = kibnal_thread_start (kibnal_connd, NULL); + if (rc != 0) { + CERROR ("Can't spawn iibnal connd: %d\n", rc); + goto failed; + } + + n = sizeof(kibnal_data.kib_hca_guids) / + sizeof(kibnal_data.kib_hca_guids[0]); + frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids); + if (frc != FSUCCESS) { + CERROR ("Can't get channel adapter guids: %d\n", frc); + goto failed; + } + if (n == 0) { + CERROR ("No channel adapters found\n"); + goto failed; + } + + /* Infinicon has per-HCA rather than per CQ completion handlers */ + frc = iibt_open_hca(kibnal_data.kib_hca_guids[0], + kibnal_ca_callback, + kibnal_ca_async_callback, + &kibnal_data.kib_hca, + &kibnal_data.kib_hca); + if (frc != FSUCCESS) { + CERROR ("Can't open CA[0]: %d\n", frc); + goto failed; + } + + /* Channel Adapter opened */ + kibnal_data.kib_init = IBNAL_INIT_HCA; + /*****************************************************/ + + kibnal_data.kib_hca_attrs.PortAttributesList = NULL; + kibnal_data.kib_hca_attrs.PortAttributesListSize = 0; + frc = iibt_query_hca(kibnal_data.kib_hca, + &kibnal_data.kib_hca_attrs, NULL); + if (frc != FSUCCESS) { + CERROR ("Can't size port attrs: %d\n", frc); + goto failed; + } + + PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList, + kibnal_data.kib_hca_attrs.PortAttributesListSize); + if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL) + goto failed; + + /* Port attrs allocated */ + kibnal_data.kib_init = IBNAL_INIT_PORTATTRS; + /*****************************************************/ + + frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs, + NULL); + if (frc != FSUCCESS) { + CERROR ("Can't get port attrs for CA 0: %d\n", frc); + goto failed; + } + + for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList; + pattr != NULL; + i++, pattr = pattr->Next) { + switch (pattr->PortState) { + default: + CERROR("Unexpected port[%d] state %d\n", + i, 
pattr->PortState); + continue; + case PortStateDown: + CDEBUG(D_NET, "port[%d] Down\n", i); + continue; + case PortStateInit: + CDEBUG(D_NET, "port[%d] Init\n", i); + continue; + case PortStateArmed: + CDEBUG(D_NET, "port[%d] Armed\n", i); + continue; + + case PortStateActive: + CDEBUG(D_NET, "port[%d] Active\n", i); + kibnal_data.kib_port = i; + kibnal_data.kib_port_guid = pattr->GUID; + kibnal_data.kib_port_pkey = pattr->PkeyTable[0]; + break; + } + break; + } + + if (pattr == NULL) { + CERROR ("Can't find an active port\n"); + goto failed; + } + + CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid); + + /* Active port found */ + kibnal_data.kib_init = IBNAL_INIT_PORT; + /*****************************************************/ + + frc = iibt_sd_register(&kibnal_data.kib_sd, NULL); + if (frc != FSUCCESS) { + CERROR ("Can't register with SD: %d\n", frc); + goto failed; + } + + /* Registered with SD OK */ + kibnal_data.kib_init = IBNAL_INIT_SD; + /*****************************************************/ + + frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd); + if (frc != FSUCCESS) { + CERROR ("Can't create PD: %d\n", frc); /* log the FSTATUS that failed, not the unrelated rc */ + goto failed; + } + + /* flag PD initialised */ + kibnal_data.kib_init = IBNAL_INIT_PD; + /*****************************************************/ + +#if IBNAL_FMR + { + const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; + struct ib_fmr_pool_param params = { + .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ), + .pool_size = pool_size, + .dirty_watermark = (pool_size * 3)/4, + .flush_function = NULL, + .flush_arg = NULL, + .cache = 1, + }; + rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params, + &kibnal_data.kib_fmr_pool); + if (rc != 0) { + CERROR ("Can't create FMR pool size %d: %d\n", + pool_size, rc); + goto failed; + } + } + + /* flag FMR pool initialised */ + kibnal_data.kib_init = IBNAL_INIT_FMR; +#endif + 
/*****************************************************/ + if (IBNAL_WHOLE_MEM) { + IB_MR_PHYS_BUFFER phys; + IB_ACCESS_CONTROL access; + kib_md_t *md = &kibnal_data.kib_md; + + memset(&access, 0, sizeof(access)); + access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaRead = 1; + access.s.RdmaWrite = 1; + + phys.PhysAddr = 0; + phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs); + if (phys.Length == 0) { + CERROR ("couldn't determine the end of phys mem\n"); + goto failed; + } + + rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca, + 0, + &phys, 1, + 0, + kibnal_data.kib_pd, + access, + &md->md_handle, + &md->md_addr, + &md->md_lkey, + &md->md_rkey); + if (rc != FSUCCESS) { + CERROR("registering physical memory failed: %d\n", + rc); + CERROR("falling back to registration per-rdma\n"); + md->md_handle = NULL; + } else { + CDEBUG(D_NET, "registered "LPU64" bytes of mem\n", + phys.Length); + kibnal_data.kib_init = IBNAL_INIT_MR; + } + } + + /*****************************************************/ + + rc = kibnal_setup_tx_descs(); + if (rc != 0) { + CERROR ("Can't register tx descs: %d\n", rc); + goto failed; + } + + /* flag TX descs initialised */ + kibnal_data.kib_init = IBNAL_INIT_TXD; + /*****************************************************/ + + { + uint32 nentries; + + frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, + &kibnal_data.kib_cq, &kibnal_data.kib_cq, + &nentries); + if (frc != FSUCCESS) { + CERROR ("Can't create RX CQ: %d\n", frc); + goto failed; + } + + /* flag CQ initialised */ + kibnal_data.kib_init = IBNAL_INIT_CQ; + + if (nentries < IBNAL_CQ_ENTRIES) { + CERROR ("CQ only has %d entries, need %d\n", + nentries, IBNAL_CQ_ENTRIES); + goto failed; + } + + rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC); + if (rc != 0) { + CERROR ("Failed to re-arm completion queue: %d\n", rc); + goto failed; + } + } + + /*****************************************************/ + + rc = libcfs_nal_cmd_register(IIBNAL, 
&kibnal_cmd, NULL); + if (rc != 0) { + CERROR ("Can't initialise command interface (rc = %d)\n", rc); + goto failed; + } + + /* flag everything initialised */ + kibnal_data.kib_init = IBNAL_INIT_ALL; + /*****************************************************/ + + printk(KERN_INFO "Lustre: Infinicon IB NAL loaded " + "(initial mem %d)\n", pkmem); + + return (PTL_OK); + + failed: + kibnal_api_shutdown (&kibnal_api); + return (PTL_FAIL); +} + +void __exit +kibnal_module_fini (void) +{ +#ifdef CONFIG_SYSCTL + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table (kibnal_tunables.kib_sysctl); +#endif + PtlNIFini(kibnal_ni); + + ptl_unregister_nal(IIBNAL); +} + +int __init +kibnal_module_init (void) +{ + int rc; + + if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) { + CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n"); + return -EINVAL; + } + + /* the following must be sizeof(int) for proc_dointvec() */ + if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) { + CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n"); + return -EINVAL; + } + + kibnal_api.nal_ni_init = kibnal_api_startup; + kibnal_api.nal_ni_fini = kibnal_api_shutdown; + + /* Initialise dynamic tunables to defaults once only */ + kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; + + rc = ptl_register_nal(IIBNAL, &kibnal_api); + if (rc != PTL_OK) { + CERROR("Can't register IBNAL: %d\n", rc); + return (-ENOMEM); /* or something... */ + } + + /* Pure gateways want the NAL started up at module load time... */ + rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { + ptl_unregister_nal(IIBNAL); + return (-ENODEV); + } + +#ifdef CONFIG_SYSCTL + /* Press on regardless even if registering sysctl doesn't work */ + kibnal_tunables.kib_sysctl = + register_sysctl_table (kibnal_top_ctl_table, 0); +#endif + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. 
"); +MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(kibnal_module_init); +module_exit(kibnal_module_fini); + diff --git a/lustre/portals/knals/iibnal/iibnal.h b/lustre/portals/knals/iibnal/iibnal.h new file mode 100644 index 0000000..0a25a9a --- /dev/null +++ b/lustre/portals/knals/iibnal/iibnal.h @@ -0,0 +1,892 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_IBNAL + +#include +#include +#include +#include + +#include + +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +/* Test for GCC > 3.2.2 */ +#if GCC_VERSION <= 30202 +/* GCC 3.2.2, and presumably several versions before it, will + * miscompile this driver. See + * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */ +#error Invalid GCC version. 
Must use GCC >= 3.2.3 +#endif + +#define IBNAL_SERVICE_NAME "iibnal" +#define IBNAL_SERVICE_NUMBER 0x11b9a1 + +#if CONFIG_SMP +# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ +#else +# define IBNAL_N_SCHED 1 /* # schedulers */ +#endif + +#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ +#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ + +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ + +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ +/* 7 indicates infinite retry attempts, Infinicon recommended 5 */ +#define IBNAL_RETRY 5 /* # times to retry */ +#define IBNAL_RNR_RETRY 5 /* */ +#define IBNAL_CM_RETRY 5 /* # times to retry connection */ +#define IBNAL_FLOW_CONTROL 1 +#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */ + +#define IBNAL_NTX 64 /* # tx descs */ +/* this had to be dropped down so that we only register < 255 pages per + * region. this will change if we register all memory. */ +#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ + +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ + +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ + +/* default vals for runtime tunables */ +#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ + +/************************/ +/* derived constants... 
*/ + +/* TX messages (shared by all connections) */ +#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) +#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) + +#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1) + +/* RX messages (per connection) */ +#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE +#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) + + +/* we may have up to 2 completions per transmit + + 1 completion per receive, per connection */ +#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ + (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) + +#define IBNAL_RDMA_BASE 0x0eeb0000 +#define IBNAL_FMR 0 +#define IBNAL_WHOLE_MEM 1 +#define IBNAL_CKSUM 0 +//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS +#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT + +/* XXX I have no idea. */ +#define IBNAL_STARTING_PSN 1 + +typedef struct +{ + int kib_io_timeout; /* comms timeout (seconds) */ + struct ctl_table_header *kib_sysctl; /* sysctl interface */ +} kib_tunables_t; + +/* some of these have specific types in the stack that just map back + * to the uFOO types, like IB_{L,R}_KEY. */ +typedef struct +{ + int ibp_npages; /* # pages */ + int ibp_mapped; /* mapped? */ + __u64 ibp_vaddr; /* mapped region vaddr */ + __u32 ibp_lkey; /* mapped region lkey */ + __u32 ibp_rkey; /* mapped region rkey */ + IB_HANDLE ibp_handle; /* mapped region handle */ + struct page *ibp_pages[0]; +} kib_pages_t; + +typedef struct +{ + IB_HANDLE md_handle; + __u32 md_lkey; + __u32 md_rkey; + __u64 md_addr; +} kib_md_t __attribute__((packed)); + +typedef struct +{ + int kib_init; /* initialisation state */ + __u64 kib_incarnation; /* which one am I */ + int kib_shutdown; /* shut down? 
*/ + atomic_t kib_nthreads; /* # live threads */ + + __u64 kib_service_id; /* service number I listen on */ + __u64 kib_port_guid; /* my GUID (lo 64 of GID)*/ + __u16 kib_port_pkey; /* my pkey, whatever that is */ + ptl_nid_t kib_nid; /* my NID */ + struct semaphore kib_nid_mutex; /* serialise NID ops */ + struct semaphore kib_nid_signal; /* signal completion */ + IB_HANDLE kib_cep; /* connection end point */ + + rwlock_t kib_global_lock; /* stabilize peer/conn ops */ + + struct list_head *kib_peers; /* hash table of all my known peers */ + int kib_peer_hash_size; /* size of kib_peers */ + atomic_t kib_npeers; /* # peers extant */ + atomic_t kib_nconns; /* # connections extant */ + + struct list_head kib_connd_conns; /* connections to progress */ + struct list_head kib_connd_peers; /* peers waiting for a connection */ + wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ + unsigned long kib_connd_waketime; /* when connd will wake */ + spinlock_t kib_connd_lock; /* serialise */ + + wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ + struct list_head kib_sched_txq; /* tx requiring attention */ + struct list_head kib_sched_rxq; /* rx requiring attention */ + spinlock_t kib_sched_lock; /* serialise */ + + struct kib_tx *kib_tx_descs; /* all the tx descriptors */ + kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ + + struct list_head kib_idle_txs; /* idle tx descriptors */ + struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ + wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ + __u64 kib_next_tx_cookie; /* RDMA completion cookie */ + spinlock_t kib_tx_lock; /* serialise */ + + IB_HANDLE kib_hca; /* The HCA */ + int kib_port; /* port on the device */ + IB_HANDLE kib_pd; /* protection domain */ + IB_HANDLE kib_sd; /* SD handle */ + IB_HANDLE kib_cq; /* completion queue */ + kib_md_t kib_md; /* full-mem registration */ + + void *kib_listen_handle; /* where I listen for connections */ + + 
IBT_INTERFACE_UNION kib_interfaces; /* The Infinicon IBT interface */ + + uint64 kib_hca_guids[8]; /* all the HCA guids */ + IB_CA_ATTRIBUTES kib_hca_attrs; /* where to get HCA attrs */ + FABRIC_OPERATION_DATA kib_fabopdata; /* (un)advertise service record */ +} kib_data_t; + +#define IBNAL_INIT_NOTHING 0 +#define IBNAL_INIT_DATA 1 +#define IBNAL_INIT_LIB 2 +#define IBNAL_INIT_HCA 3 +#define IBNAL_INIT_PORTATTRS 4 +#define IBNAL_INIT_PORT 5 +#define IBNAL_INIT_SD 6 +#define IBNAL_INIT_PD 7 +#define IBNAL_INIT_FMR 8 +#define IBNAL_INIT_MR 9 +#define IBNAL_INIT_TXD 10 +#define IBNAL_INIT_CQ 11 +#define IBNAL_INIT_ALL 12 + +/************************************************************************ + * Wire message structs. + * These are sent in sender's byte order (i.e. receiver flips). + * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD + * private data and SM service info), is LE on the wire. + */ + +/* also kib_md_t above */ + +typedef struct +{ + __u32 rd_key; /* remote key */ + __u32 rd_nob; /* # of bytes */ + __u64 rd_addr; /* remote io vaddr */ +} kib_rdma_desc_t __attribute__((packed)); + +typedef struct +{ + ptl_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} kib_immediate_msg_t __attribute__((packed)); + +/* these arrays serve two purposes during rdma. they are built on the passive + * side and sent to the active side as remote arguments. On the active side + * the descs are used as a data structure on the way to local gather items. 
+ * the different roles result in split local/remote meaning of desc->rd_key */ +typedef struct +{ + ptl_hdr_t ibrm_hdr; /* portals header */ + __u64 ibrm_cookie; /* opaque completion cookie */ + __u32 ibrm_num_descs; /* how many descs */ + kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */ +} kib_rdma_msg_t __attribute__((packed)); + +#define kib_rdma_msg_len(num_descs) \ + offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs]) + +typedef struct +{ + __u64 ibcm_cookie; /* opaque completion cookie */ + __u32 ibcm_status; /* completion status */ +} kib_completion_msg_t __attribute__((packed)); + +typedef struct +{ + __u32 ibm_magic; /* I'm an openibnal message */ + __u16 ibm_version; /* this is my version number */ + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ +#if IBNAL_CKSUM + __u32 ibm_nob; + __u32 ibm_cksum; +#endif + union { + kib_immediate_msg_t immediate; + kib_rdma_msg_t rdma; + kib_completion_msg_t completion; + } ibm_u __attribute__((packed)); +} kib_msg_t __attribute__((packed)); + +#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ +#define IBNAL_MSG_VERSION 1 /* current protocol version */ + +#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ +#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ +#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ +#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ +#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ + +/***********************************************************************/ + +typedef struct kib_rx /* receive message */ +{ + struct list_head rx_list; /* queue for attention */ + struct kib_conn *rx_conn; /* owning conn */ + int rx_rdma; /* RDMA completion posted? */ + int rx_posted; /* posted? 
*/ + __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ + kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ + IB_WORK_REQ rx_wrq; + IB_LOCAL_DATASEGMENT rx_gl; /* and it's memory */ +} kib_rx_t; + +typedef struct kib_tx /* transmit message */ +{ + struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ + int tx_isnblk; /* I'm reserved for non-blocking sends */ + struct kib_conn *tx_conn; /* owning conn */ + int tx_mapped; /* mapped for RDMA? */ + int tx_sending; /* # tx callbacks outstanding */ + int tx_status; /* completion status */ + unsigned long tx_deadline; /* completion deadline */ + int tx_passive_rdma; /* peer sucks/blows */ + int tx_passive_rdma_wait; /* waiting for peer to complete */ + __u64 tx_passive_rdma_cookie; /* completion cookie */ + lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ + kib_md_t tx_md; /* RDMA mapping (active/passive) */ + __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ + kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ + int tx_nsp; /* # send work items */ + IB_WORK_REQ tx_wrq[IBNAL_TX_MAX_SG]; /* send work items... 
*/ + IB_LOCAL_DATASEGMENT tx_gl[IBNAL_TX_MAX_SG]; /* ...and their memory */ +} kib_tx_t; + +#define KIB_TX_UNMAPPED 0 +#define KIB_TX_MAPPED 1 +#define KIB_TX_MAPPED_FMR 2 + +typedef struct kib_wire_connreq +{ + __u32 wcr_magic; /* I'm an openibnal connreq */ + __u16 wcr_version; /* this is my version number */ + __u16 wcr_queue_depth; /* this is my receive queue size */ + __u64 wcr_nid; /* peer's NID */ + __u64 wcr_incarnation; /* peer's incarnation */ +} kib_wire_connreq_t; + +typedef struct kib_gid +{ + __u64 hi, lo; +} kib_gid_t; + +typedef struct kib_connreq +{ + /* connection-in-progress */ + struct kib_conn *cr_conn; + kib_wire_connreq_t cr_wcr; + __u64 cr_tid; + IB_SERVICE_RECORD cr_service; + kib_gid_t cr_gid; + IB_PATH_RECORD cr_path; + CM_REQUEST_INFO cr_cmreq; + CM_CONN_INFO cr_discarded; + CM_REJECT_INFO cr_rej_info; +} kib_connreq_t; + +typedef struct kib_conn +{ + struct kib_peer *ibc_peer; /* owning peer */ + struct list_head ibc_list; /* stash on peer's conn list */ + __u64 ibc_incarnation; /* which instance of the peer */ + atomic_t ibc_refcount; /* # users */ + int ibc_state; /* what's happening */ + atomic_t ibc_nob; /* # bytes buffered */ + int ibc_nsends_posted; /* # uncompleted sends */ + int ibc_credits; /* # credits I have */ + int ibc_outstanding_credits; /* # credits to return */ + int ibc_rcvd_disconnect;/* received discon request */ + int ibc_sent_disconnect;/* sent discon request */ + struct list_head ibc_tx_queue; /* send queue */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ + spinlock_t ibc_lock; /* serialise */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + IB_HANDLE ibc_qp; /* queue pair */ + IB_HANDLE ibc_cep; /* connection ID? 
*/ + IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs; /* QP attrs */ + kib_connreq_t *ibc_connreq; /* connection request state */ +} kib_conn_t; + +#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ +#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ +#define IBNAL_CONN_CONNECTING 2 /* started to connect */ +#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ +#define IBNAL_CONN_SEND_DREQ 4 /* to send disconnect req */ +#define IBNAL_CONN_DREQ 5 /* sent disconnect req */ +#define IBNAL_CONN_DREP 6 /* sent disconnect rep */ +#define IBNAL_CONN_DISCONNECTED 7 /* no more QP or CM traffic */ + +#define KIB_ASSERT_CONN_STATE(conn, state) do { \ + LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state); \ +} while (0) + +#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do { \ + LASSERTF(low <= high, "%d %d\n", low, high); \ + LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \ + "%d\n", conn->ibc_state); \ +} while (0) + +typedef struct kib_peer +{ + struct list_head ibp_list; /* stash on global peer list */ + struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ + ptl_nid_t ibp_nid; /* who's on the other end(s) */ + atomic_t ibp_refcount; /* # users */ + int ibp_persistence; /* "known" peer refs */ + struct list_head ibp_conns; /* all active connections */ + struct list_head ibp_tx_queue; /* msgs waiting for a conn */ + int ibp_connecting; /* connecting+accepting */ + unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ + unsigned long ibp_reconnect_interval; /* exponential backoff */ +} kib_peer_t; + + +extern lib_nal_t kibnal_lib; +extern kib_data_t kibnal_data; +extern kib_tunables_t kibnal_tunables; + +/******************************************************************************/ +/* Infinicon IBT interface wrappers */ +#define IIBT_IF (kibnal_data.kib_interfaces.ver2) + +static inline FSTATUS +iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list) +{ + return IIBT_IF.GetCaGuids(hca_count, hca_guid_list); 
+} + +static inline FSTATUS +iibt_open_hca(EUI64 hca_guid, + IB_COMPLETION_CALLBACK completion_callback, + IB_ASYNC_EVENT_CALLBACK async_event_callback, + void *arg, + IB_HANDLE *handle) +{ + return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback, + async_event_callback, arg, handle); +} + +static inline FSTATUS +iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp) +{ + return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp); +} + +static inline FSTATUS +iibt_close_hca(IB_HANDLE hca_handle) +{ + return IIBT_IF.Vpi.CloseCA(hca_handle); +} + +static inline FSTATUS +iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle) +{ + return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle); +} + +static inline FSTATUS +iibt_pd_free(IB_HANDLE pd_handle) +{ + return IIBT_IF.Vpi.FreePD(pd_handle); +} + +static inline FSTATUS +iibt_register_physical_memory(IB_HANDLE hca_handle, + IB_VIRT_ADDR requested_io_va, + void *phys_buffers, uint64 nphys_buffers, + uint32 io_va_offset, IB_HANDLE pd_handle, + IB_ACCESS_CONTROL access, + IB_HANDLE *mem_handle, + IB_VIRT_ADDR *actual_io_va, + IB_L_KEY *lkey, IB_R_KEY *rkey) +{ + return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va, + phys_buffers, nphys_buffers, + io_va_offset, pd_handle, + access, + mem_handle, actual_io_va, + lkey, rkey); +} + +static inline FSTATUS +iibt_register_contig_physical_memory(IB_HANDLE hca_handle, + IB_VIRT_ADDR requested_io_va, + IB_MR_PHYS_BUFFER *phys_buffers, + uint64 nphys_buffers, + uint32 io_va_offset, IB_HANDLE pd_handle, + IB_ACCESS_CONTROL access, + IB_HANDLE *mem_handle, + IB_VIRT_ADDR *actual_io_va, + IB_L_KEY *lkey, IB_R_KEY *rkey) +{ + return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle, + requested_io_va, + phys_buffers, + nphys_buffers, + io_va_offset, pd_handle, + access, + mem_handle, actual_io_va, + lkey, rkey); +} + +static inline FSTATUS +iibt_register_memory(IB_HANDLE hca_handle, + void *virt_addr, unsigned int length, + 
IB_HANDLE pd_handle, + IB_ACCESS_CONTROL access, + IB_HANDLE *mem_handle, + IB_L_KEY *lkey, IB_R_KEY *rkey) +{ + return IIBT_IF.Vpi.RegisterMemRegion(hca_handle, + virt_addr, length, + pd_handle, + access, + mem_handle, + lkey, rkey); +} + +static inline FSTATUS +iibt_deregister_memory(IB_HANDLE mem_handle) +{ + return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle); +} + +static inline FSTATUS +iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size, + void *arg, IB_HANDLE *cq_handle, uint32 *actual_size) +{ + return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size, + arg, cq_handle, actual_size); +} + +static inline FSTATUS +iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc) +{ + return IIBT_IF.Vpi.PollCQ(cq_handle, wc); +} + +static inline FSTATUS +iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select) +{ + return IIBT_IF.Vpi.RearmCQ(cq_handle, select); +} + +static inline FSTATUS +iibt_cq_destroy(IB_HANDLE cq_handle) +{ + return IIBT_IF.Vpi.DestroyCQ(cq_handle); +} + +static inline FSTATUS +iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr, + void *arg, IB_HANDLE *cq_handle, + IB_QP_ATTRIBUTES_QUERY *query_attr) +{ + return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, cq_handle, + query_attr); +} + +static inline FSTATUS +iibt_qp_query(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_QUERY *query_attr, + void **arg_ptr) +{ + return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr); +} + +static inline FSTATUS +iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr, + IB_QP_ATTRIBUTES_QUERY *query_attr) +{ + return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr); +} + +static inline FSTATUS +iibt_qp_destroy(IB_HANDLE qp_handle) +{ + return IIBT_IF.Vpi.DestroyQP(qp_handle); +} + +static inline FSTATUS +iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req) +{ + return IIBT_IF.Vpi.PostRecv(qp_handle, work_req); +} + +static inline FSTATUS +iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req) +{ + 
return IIBT_IF.Vpi.PostSend(qp_handle, work_req); +} + +static inline FSTATUS +iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p) +{ + return IIBT_IF.Sdi.Register(sd_handle, p); +} + +static inline FSTATUS +iibt_sd_deregister(IB_HANDLE sd_handle) +{ + return IIBT_IF.Sdi.Deregister(sd_handle); +} + +static inline FSTATUS +iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid, + FABRIC_OPERATION_DATA *fod, + PFABRIC_OPERATION_CALLBACK callback, + COMMAND_CONTROL_PARAMETERS *p, void *arg) +{ + return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid, + fod, callback, p, arg); +} + +static inline FSTATUS +iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid, + QUERY *qry, + PQUERY_CALLBACK callback, + COMMAND_CONTROL_PARAMETERS *p, void *arg) +{ + return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid, + qry, callback, p, arg); +} + +static inline IB_HANDLE +iibt_cm_create_cep(CM_CEP_TYPE type) +{ + return IIBT_IF.Cmi.CmCreateCEP(type); +} + +static inline FSTATUS +iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len, + uint32 offset) +{ + return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset); +} + +static inline FSTATUS +iibt_cm_destroy_cep(IB_HANDLE cep_handle) +{ + return IIBT_IF.Cmi.CmDestroyCEP(cep_handle); +} + +static inline FSTATUS +iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info, + PFN_CM_CALLBACK callback, void *arg) +{ + return IIBT_IF.Cmi.CmListen(cep, info, callback, arg); +} + +static inline FSTATUS +iibt_cm_cancel(IB_HANDLE cep) +{ + return IIBT_IF.Cmi.CmCancel(cep); +} + +static inline FSTATUS +iibt_cm_accept(IB_HANDLE cep, + CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info, + PFN_CM_CALLBACK callback, void *arg, + IB_HANDLE *new_cep) +{ + return IIBT_IF.Cmi.CmAccept(cep, + send_info, recv_info, + callback, arg, new_cep); +} + +static inline FSTATUS +iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej) +{ + return IIBT_IF.Cmi.CmReject(cep, rej); +} + +static 
inline FSTATUS +iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req, + CM_DREPLY_INFO *reply) +{ + return IIBT_IF.Cmi.CmDisconnect(cep, req, reply); +} + +static inline FSTATUS +iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req, + PFN_CM_CALLBACK callback, void *arg) +{ + return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg); +} + +static inline int wrq_signals_completion(IB_WORK_REQ *wrq) +{ + return wrq->Req.SendRC.Options.s.SignaledCompletion == 1; +} + + +/******************************************************************************/ + +/* these are purposely avoiding using local vars so they don't increase + * stack consumption. */ + +#define kib_peer_addref(peer) do { \ + LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ + atomic_read(&peer->ibp_refcount)); \ + CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", \ + peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ + atomic_inc(&peer->ibp_refcount); \ +} while (0) + +#define kib_peer_decref(peer) do { \ + LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ + atomic_read(&peer->ibp_refcount)); \ + CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", \ + peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ + if (atomic_dec_and_test (&peer->ibp_refcount)) { \ + CDEBUG (D_NET, "destroying peer "LPX64" %p\n", \ + peer->ibp_nid, peer); \ + kibnal_destroy_peer (peer); \ + } \ +} while (0) + +/******************************************************************************/ + +static inline struct list_head * +kibnal_nid2peerlist (ptl_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; + + return (&kibnal_data.kib_peers [hash]); +} + +static inline int +kibnal_peer_active(kib_peer_t *peer) +{ + /* Am I in the peer hash table? 
*/
+        return (!list_empty(&peer->ibp_list));
+}
+
+/* Queue 'tx' at the tail of conn's send queue and stamp its completion
+ * deadline (now + kib_io_timeout seconds).
+ * NOTE(review): the '_locked' suffix suggests the caller holds the lock
+ * protecting ibc_tx_queue (presumably conn->ibc_lock) -- confirm against
+ * the callers. */
+static inline void
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+        /* CAVEAT EMPTOR: tx takes caller's ref on conn */
+
+        LASSERT (tx->tx_nsp > 0);               /* work items set up */
+        LASSERT (tx->tx_conn == NULL);          /* only set here */
+
+        tx->tx_conn = conn;
+        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+        list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+}
+
+/* Service record components used as the lookup key: the service name plus
+ * the first eight ServiceData8 bytes (which hold the NID, see
+ * kibnal_service_nid_field() below). */
+#define KIBNAL_SERVICE_KEY_MASK  (IB_SERVICE_RECORD_COMP_SERVICENAME |          \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 |       \
+                                  IB_SERVICE_RECORD_COMP_SERVICEDATA8_8)
+
+/* Return the start of srv->ServiceData8 viewed as a 64-bit field; this is
+ * where the NID is stored in the service record. */
+static inline __u64*
+kibnal_service_nid_field(IB_SERVICE_RECORD *srv)
+{
+        /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
+        return (__u64 *)srv->ServiceData8;
+}
+
+
+/* Fill in the key fields of a service record: the zero-padded service name
+ * and the NID, stored little-endian. */
+static inline void
+kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid)
+{
+        /* the name (and its NUL) must fit, or the strcpy below overruns */
+        LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName));
+        memset (srv->ServiceName, 0, sizeof(srv->ServiceName));
+        strcpy (srv->ServiceName, IBNAL_SERVICE_NAME);
+
+        *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
+}
+
+/* Compiled out: this debug helper is written against a different IB API
+ * (ib_qp_query / struct ib_qp_attribute) than the IIBT wrappers above. */
+#if 0
+static inline void
+kibnal_show_rdma_attr (kib_conn_t *conn)
+{
+        struct ib_qp_attribute qp_attr;
+        int                    rc;
+
+        memset (&qp_attr, 0, sizeof(qp_attr));
+        rc = ib_qp_query(conn->ibc_qp, &qp_attr);
+        if (rc != 0) {
+                CERROR ("Can't get qp attrs: %d\n", rc);
+                return;
+        }
+
+        CWARN ("RDMA CAPABILITY: write %s read %s\n",
+               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+               (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid",
+               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+               (qp_attr.enable_rdma_read ?
"enabled" : "disabled") : "invalid");
+}
+#endif
+
+/* Convert a struct page pointer into a physical address: the page's index
+ * relative to mem_map, shifted up by PAGE_SHIFT.  Only provided when
+ * CONFIG_X86 is set; any other configuration fails the build. */
+#if CONFIG_X86
+static inline __u64
+kibnal_page2phys (struct page *p)
+{
+        __u64 page_number = p - mem_map;
+
+        return (page_number << PAGE_SHIFT);
+}
+#else
+# error "no page->phys"
+#endif
+
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive.  It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+/* Encode a descriptor pointer as a work request id; bit 0 is set for a
+ * receive and clear for a transmit.  Asserts that the pointer's own bit 0
+ * is clear. */
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+        unsigned long lptr = (unsigned long)ptr;
+
+        LASSERT ((lptr & 1) == 0);
+        return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+/* Recover the descriptor pointer from a work request id by masking off the
+ * rx/tx flag in bit 0. */
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+        return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+/* Non-zero iff the work request id was encoded for a receive. */
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+        return (wreqid & 1) != 0;
+}
+
+/* Non-zero iff a whole-memory registration exists (kib_md.md_handle set). */
+static inline int
+kibnal_whole_mem(void)
+{
+        return kibnal_data.kib_md.md_handle != NULL;
+}
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_destroy_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer,
+                                              __u64 incarnation);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg);
+
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
+
+extern void kibnal_check_sends (kib_conn_t *conn);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+/* NB kibnal_destroy_conn() is also declared a few lines above; the repeat
+ * is harmless */
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int  kibnal_thread_start
(int (*fn)(void *arg), void *arg); +extern int kibnal_scheduler(void *arg); +extern int kibnal_connd (void *arg); +extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); +extern void kibnal_close_conn (kib_conn_t *conn, int why); +extern void kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob); + +void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev); +void kibnal_ca_callback (void *ca_arg, void *cq_arg); diff --git a/lustre/portals/knals/iibnal/iibnal_cb.c b/lustre/portals/knals/iibnal/iibnal_cb.c new file mode 100644 index 0000000..a827ba5 --- /dev/null +++ b/lustre/portals/knals/iibnal/iibnal_cb.c @@ -0,0 +1,3018 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "iibnal.h" + +/* + * LIB functions follow + * + */ +static void +kibnal_schedule_tx_done (kib_tx_t *tx) +{ + unsigned long flags; + + spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); + + list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); + wake_up (&kibnal_data.kib_sched_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); +} + +static void +kibnal_tx_done (kib_tx_t *tx) +{ + ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; + unsigned long flags; + int i; + FSTATUS frc; + + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ + LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ + + switch (tx->tx_mapped) { + default: + LBUG(); + + case KIB_TX_UNMAPPED: + break; + + case KIB_TX_MAPPED: + if (in_interrupt()) { + /* can't deregister memory in IRQ context... */ + kibnal_schedule_tx_done(tx); + return; + } + frc = iibt_deregister_memory(tx->tx_md.md_handle); + LASSERT (frc == FSUCCESS); + tx->tx_mapped = KIB_TX_UNMAPPED; + break; + +#if IBNAL_FMR + case KIB_TX_MAPPED_FMR: + if (in_interrupt() && tx->tx_status != 0) { + /* can't flush FMRs in IRQ context... 
*/ + kibnal_schedule_tx_done(tx); + return; + } + + rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); + LASSERT (rc == 0); + + if (tx->tx_status != 0) + ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); + tx->tx_mapped = KIB_TX_UNMAPPED; + break; +#endif + } + + for (i = 0; i < 2; i++) { + /* tx may have up to 2 libmsgs to finalise */ + if (tx->tx_libmsg[i] == NULL) + continue; + + lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); + tx->tx_libmsg[i] = NULL; + } + + if (tx->tx_conn != NULL) { + kibnal_put_conn (tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nsp = 0; + tx->tx_passive_rdma = 0; + tx->tx_status = 0; + + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + + if (tx->tx_isnblk) { + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); + } else { + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); + wake_up (&kibnal_data.kib_idle_tx_waitq); + } + + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); +} + +static kib_tx_t * +kibnal_get_idle_tx (int may_block) +{ + unsigned long flags; + kib_tx_t *tx = NULL; + ENTRY; + + for (;;) { + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + + /* "normal" descriptor is free */ + if (!list_empty (&kibnal_data.kib_idle_txs)) { + tx = list_entry (kibnal_data.kib_idle_txs.next, + kib_tx_t, tx_list); + break; + } + + if (!may_block) { + /* may dip into reserve pool */ + if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { + CERROR ("reserved tx desc pool exhausted\n"); + break; + } + + tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, + kib_tx_t, tx_list); + break; + } + + /* block for idle tx */ + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + + wait_event (kibnal_data.kib_idle_tx_waitq, + !list_empty (&kibnal_data.kib_idle_txs) || + kibnal_data.kib_shutdown); + } + + if (tx != NULL) { + list_del (&tx->tx_list); + + /* Allocate a new passive RDMA completion cookie. It might + * not be needed, but we've got a lock right now and we're + * unlikely to wrap... 
*/ + tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; + + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT (tx->tx_nsp == 0); + LASSERT (tx->tx_sending == 0); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (!tx->tx_passive_rdma); + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_libmsg[0] == NULL); + LASSERT (tx->tx_libmsg[1] == NULL); + } + + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + + RETURN(tx); +} + +static int +kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if kibnal_get_peer (nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->libnal_ni.ni_pid.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +static void +kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) +{ + struct list_head *ttmp; + unsigned long flags; + int idle; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + list_for_each (ttmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (!tx->tx_passive_rdma_wait || + tx->tx_passive_rdma_cookie != cookie) + continue; + + CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); + + tx->tx_status = status; + tx->tx_passive_rdma_wait = 0; + idle = (tx->tx_sending == 0); + + if (idle) + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + /* I could be racing with tx callbacks. It's whoever + * _makes_ tx idle that frees it */ + if (idle) + kibnal_tx_done (tx); + return; + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + CERROR ("Unmatched (late?) 
RDMA completion "LPX64" from "LPX64"\n", + cookie, conn->ibc_peer->ibp_nid); +} + +static __u32 +kibnal_lkey(kib_pages_t *ibp) +{ + if (kibnal_whole_mem()) + return kibnal_data.kib_md.md_lkey; + + return ibp->ibp_lkey; +} + +static void +kibnal_post_rx (kib_rx_t *rx, int do_credits) +{ + kib_conn_t *conn = rx->rx_conn; + int rc = 0; + unsigned long flags; + FSTATUS frc; + ENTRY; + + rx->rx_gl = (IB_LOCAL_DATASEGMENT) { + .Address = rx->rx_vaddr, + .Length = IBNAL_MSG_SIZE, + .Lkey = kibnal_lkey(conn->ibc_rx_pages), + }; + + rx->rx_wrq = (IB_WORK_REQ) { + .Operation = WROpRecv, + .DSListDepth = 1, + .MessageLen = IBNAL_MSG_SIZE, + .WorkReqId = kibnal_ptr2wreqid(rx, 1), + .DSList = &rx->rx_gl, + }; + + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, + IBNAL_CONN_DREP); + LASSERT (!rx->rx_posted); + rx->rx_posted = 1; + mb(); + + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) + rc = -ECONNABORTED; + else { + frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq); + if (frc != FSUCCESS) { + CDEBUG(D_NET, "post failed %d\n", frc); + rc = -EINVAL; + } + CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq); + } + + if (rc == 0) { + if (do_credits) { + spin_lock_irqsave(&conn->ibc_lock, flags); + conn->ibc_outstanding_credits++; + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); + } + EXIT; + return; + } + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + CERROR ("Error posting receive -> "LPX64": %d\n", + conn->ibc_peer->ibp_nid, rc); + kibnal_close_conn (rx->rx_conn, rc); + } else { + CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", + conn->ibc_peer->ibp_nid, rc); + } + + /* Drop rx's ref */ + kibnal_put_conn (conn); + EXIT; +} + +#if IBNAL_CKSUM +static inline __u32 kibnal_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + return (sum); +} +#endif + +static void hexdump(char *string, void *ptr, int len) +{ + unsigned char *c = ptr; + int i; + + return; + + 
if (len < 0 || len > 2048) { + printk("XXX what the hell? %d\n",len); + return; + } + + printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); + + for (i = 0; i < len;) { + printk("%02x",*(c++)); + i++; + if (!(i & 15)) { + printk("\n"); + } else if (!(i&1)) { + printk(" "); + } + } + + if(len & 15) { + printk("\n"); + } +} + +static void +kibnal_rx_callback (IB_WORK_COMPLETION *wc) +{ + kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId); + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + int nob = wc->Length; + const int base_nob = offsetof(kib_msg_t, ibm_u); + int credits; + int flipped; + unsigned long flags; + __u32 i; +#if IBNAL_CKSUM + __u32 msg_cksum; + __u32 computed_cksum; +#endif + + /* we set the QP to erroring after we've finished disconnecting, + * maybe we should do so sooner. */ + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, + IBNAL_CONN_DISCONNECTED); + + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + LASSERT (rx->rx_posted); + rx->rx_posted = 0; + mb(); + + /* receives complete with error in any case after we've started + * disconnecting */ + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) + goto failed; + + if (wc->Status != WRStatusSuccess) { + CERROR("Rx from "LPX64" failed: %d\n", + conn->ibc_peer->ibp_nid, wc->Status); + goto failed; + } + + if (nob < base_nob) { + CERROR ("Short rx from "LPX64": %d < expected %d\n", + conn->ibc_peer->ibp_nid, nob, base_nob); + goto failed; + } + + hexdump("rx", rx->rx_msg, sizeof(kib_msg_t)); + + /* Receiver does any byte flipping if necessary... 
*/ + + if (msg->ibm_magic == IBNAL_MSG_MAGIC) { + flipped = 0; + } else { + if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { + CERROR ("Unrecognised magic: %08x from "LPX64"\n", + msg->ibm_magic, conn->ibc_peer->ibp_nid); + goto failed; + } + flipped = 1; + __swab16s (&msg->ibm_version); + LASSERT (sizeof(msg->ibm_type) == 1); + LASSERT (sizeof(msg->ibm_credits) == 1); + } + + if (msg->ibm_version != IBNAL_MSG_VERSION) { + CERROR ("Incompatible msg version %d (%d expected)\n", + msg->ibm_version, IBNAL_MSG_VERSION); + goto failed; + } + +#if IBNAL_CKSUM + if (nob != msg->ibm_nob) { + CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); + goto failed; + } + + msg_cksum = le32_to_cpu(msg->ibm_cksum); + msg->ibm_cksum = 0; + computed_cksum = kibnal_cksum (msg, nob); + + if (msg_cksum != computed_cksum) { + CERROR ("Checksum failure %d: (%d expected)\n", + computed_cksum, msg_cksum); +// goto failed; + } + CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob); +#endif + + /* Have I received credits that will let me send? 
*/ + credits = msg->ibm_credits; + if (credits != 0) { + spin_lock_irqsave(&conn->ibc_lock, flags); + conn->ibc_credits += credits; + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); + } + + switch (msg->ibm_type) { + case IBNAL_MSG_NOOP: + kibnal_post_rx (rx, 1); + return; + + case IBNAL_MSG_IMMEDIATE: + if (nob < base_nob + sizeof (kib_immediate_msg_t)) { + CERROR ("Short IMMEDIATE from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + break; + + case IBNAL_MSG_PUT_RDMA: + case IBNAL_MSG_GET_RDMA: + if (nob < base_nob + sizeof (kib_rdma_msg_t)) { + CERROR ("Short RDMA msg from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + if (flipped) + __swab32(msg->ibm_u.rdma.ibrm_num_descs); + + CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n", + msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie); + + if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) || + (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > + min(nob, IBNAL_MSG_SIZE))) { + CERROR ("num_descs %d too large\n", + msg->ibm_u.rdma.ibrm_num_descs); + goto failed; + } + + for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) { + kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i]; + + if (flipped) { + __swab32(desc->rd_key); + __swab32(desc->rd_nob); + __swab64(desc->rd_addr); + } + + CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n", + desc->rd_key, desc->rd_addr, desc->rd_nob); + } + break; + + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + if (nob < base_nob + sizeof (kib_completion_msg_t)) { + CERROR ("Short COMPLETION msg from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + if (flipped) + __swab32s(&msg->ibm_u.completion.ibcm_status); + + CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", + msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + + kibnal_complete_passive_rdma (conn, + msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + kibnal_post_rx (rx, 1); + 
return; + + default: + CERROR ("Can't parse type from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, msg->ibm_type); + goto failed; + } + + /* schedule for kibnal_rx() in thread context */ + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + + list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); + wake_up (&kibnal_data.kib_sched_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + return; + + failed: + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + kibnal_close_conn(conn, -ECONNABORTED); + + /* Don't re-post rx & drop its ref on conn */ + kibnal_put_conn(conn); +} + +void +kibnal_rx (kib_rx_t *rx) +{ + kib_msg_t *msg = rx->rx_msg; + + /* Clear flag so I can detect if I've sent an RDMA completion */ + rx->rx_rdma = 0; + + switch (msg->ibm_type) { + case IBNAL_MSG_GET_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); + /* If the incoming get was matched, I'll have initiated the + * RDMA and the completion message... */ + if (rx->rx_rdma) + break; + + /* Otherwise, I'll send a failed completion now to prevent + * the peer's GET blocking for the full timeout. */ + CERROR ("Completing unmatched RDMA GET from "LPX64"\n", + rx->rx_conn->ibc_peer->ibp_nid); + kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, + rx, NULL, 0, NULL, NULL, 0, 0); + break; + + case IBNAL_MSG_PUT_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); + if (rx->rx_rdma) + break; + /* This is most unusual, since even if lib_parse() didn't + * match anything, it should have asked us to read (and + * discard) the payload. The portals header must be + * inconsistent with this message type, so it's the + * sender's fault for sending garbage and she can time + * herself out... 
*/ + CERROR ("Uncompleted RMDA PUT from "LPX64"\n", + rx->rx_conn->ibc_peer->ibp_nid); + break; + + case IBNAL_MSG_IMMEDIATE: + lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); + LASSERT (!rx->rx_rdma); + break; + + default: + LBUG(); + break; + } + + kibnal_post_rx (rx, 1); +} + +static struct page * +kibnal_kvaddr_to_page (unsigned long vaddr) +{ + struct page *page; + + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) + page = vmalloc_to_page ((void *)vaddr); +#if CONFIG_HIGHMEM + else if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) + page = vmalloc_to_page ((void *)vaddr); + /* in 2.4 ^ just walks the page tables */ +#endif + else + page = virt_to_page (vaddr); + + if (!VALID_PAGE (page)) + page = NULL; + + return page; +} + +static void +kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset, + unsigned long len, int active) +{ + kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma; + kib_rdma_desc_t *desc; + + LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", + ibrm->ibrm_num_descs); + + desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs]; + if (active) + desc->rd_key = kibnal_data.kib_md.md_lkey; + else + desc->rd_key = kibnal_data.kib_md.md_rkey; + desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */ + desc->rd_addr = kibnal_page2phys(page) + page_offset + + kibnal_data.kib_md.md_addr; + + ibrm->ibrm_num_descs++; +} + +static int +kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active) +{ + struct page *page; + int page_offset, len; + + while (nob > 0) { + page = kibnal_kvaddr_to_page(vaddr); + if (page == NULL) + return -EFAULT; + + page_offset = vaddr & (PAGE_SIZE - 1); + len = min(nob, (int)PAGE_SIZE - page_offset); + + kibnal_fill_ibrm(tx, page, page_offset, len, active); + nob -= len; + vaddr += len; + } + return 0; +} + +static int +kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access, + int niov, struct iovec *iov, int offset, int nob, int active) + +{ + void *vaddr; + 
FSTATUS frc; + + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + if (nob > iov->iov_len - offset) { + CERROR ("Can't map multiple vaddr fragments\n"); + return (-EMSGSIZE); + } + + /* our large contiguous iov could be backed by multiple physical + * pages. */ + if (kibnal_whole_mem()) { + int rc; + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; + rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + + offset, nob, active); + if (rc != 0) { + CERROR ("Can't map iov: %d\n", rc); + return rc; + } + return 0; + } + + vaddr = (void *)(((unsigned long)iov->iov_base) + offset); + tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); + + frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob, + kibnal_data.kib_pd, access, + &tx->tx_md.md_handle, &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); + if (frc != 0) { + CERROR ("Can't map vaddr %p: %d\n", vaddr, frc); + return -EINVAL; + } + + tx->tx_mapped = KIB_TX_MAPPED; + return (0); +} + +static int +kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access, + int nkiov, ptl_kiov_t *kiov, + int offset, int nob, int active) +{ + __u64 *phys = NULL; + int page_offset; + int nphys; + int resid; + int phys_size = 0; + FSTATUS frc; + int i, rc = 0; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); + } + + page_offset = kiov->kiov_offset + offset; + nphys = 1; + + if (!kibnal_whole_mem()) { + phys_size = nkiov * sizeof (*phys); + PORTAL_ALLOC(phys, phys_size); + if (phys == NULL) { + CERROR ("Can't allocate tmp phys\n"); + return (-ENOMEM); + } + + phys[0] = kibnal_page2phys(kiov->kiov_page); + } else { + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; + kibnal_fill_ibrm(tx, 
kiov->kiov_page, kiov->kiov_offset, + kiov->kiov_len, active); + } + + resid = nob - (kiov->kiov_len - offset); + + while (resid > 0) { + kiov++; + nkiov--; + LASSERT (nkiov > 0); + + if (kiov->kiov_offset != 0 || + ((resid > PAGE_SIZE) && + kiov->kiov_len < PAGE_SIZE)) { + /* Can't have gaps */ + CERROR ("Can't make payload contiguous in I/O VM:" + "page %d, offset %d, len %d \n", nphys, + kiov->kiov_offset, kiov->kiov_len); + + for (i = -nphys; i < nkiov; i++) + { + CERROR("kiov[%d] %p +%d for %d\n", + i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len); + } + + rc = -EINVAL; + goto out; + } + + if (nphys == PTL_MD_MAX_IOV) { + CERROR ("payload too big (%d)\n", nphys); + rc = -EMSGSIZE; + goto out; + } + + if (!kibnal_whole_mem()) { + LASSERT (nphys * sizeof (*phys) < phys_size); + phys[nphys] = kibnal_page2phys(kiov->kiov_page); + } else { + if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) { + CERROR ("payload too big (%d)\n", nphys); + rc = -EMSGSIZE; + goto out; + } + kibnal_fill_ibrm(tx, kiov->kiov_page, + kiov->kiov_offset, kiov->kiov_len, + active); + } + + nphys ++; + resid -= PAGE_SIZE; + } + + if (kibnal_whole_mem()) + goto out; + +#if 0 + CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset); + for (i = 0; i < nphys; i++) + CWARN (" [%d] "LPX64"\n", i, phys[i]); +#endif + +#if IBNAL_FMR +#error "iibnal hasn't learned about FMR yet" + rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, + phys, nphys, + &tx->tx_md.md_addr, + page_offset, + &tx->tx_md.md_handle.fmr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); +#else + frc = iibt_register_physical_memory(kibnal_data.kib_hca, + IBNAL_RDMA_BASE, + phys, nphys, + 0, /* offset */ + kibnal_data.kib_pd, + access, + &tx->tx_md.md_handle, + &tx->tx_md.md_addr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); +#endif + if (frc == FSUCCESS) { + CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n", + nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey); +#if 
IBNAL_FMR + tx->tx_mapped = KIB_TX_MAPPED_FMR; +#else + tx->tx_mapped = KIB_TX_MAPPED; +#endif + } else { + CERROR ("Can't map phys: %d\n", rc); + rc = -EFAULT; + } + + out: + if (phys != NULL) + PORTAL_FREE(phys, phys_size); + return (rc); +} + +static kib_conn_t * +kibnal_find_conn_locked (kib_peer_t *peer) +{ + struct list_head *tmp; + + /* just return the first connection */ + list_for_each (tmp, &peer->ibp_conns) { + return (list_entry(tmp, kib_conn_t, ibc_list)); + } + + return (NULL); +} + +void +kibnal_check_sends (kib_conn_t *conn) +{ + unsigned long flags; + kib_tx_t *tx; + int rc; + int i; + int done; + int nwork; + ENTRY; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); + + if (list_empty(&conn->ibc_tx_queue) && + conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + tx = kibnal_get_idle_tx(0); /* don't block */ + if (tx != NULL) + kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); + + spin_lock_irqsave(&conn->ibc_lock, flags); + + if (tx != NULL) { + atomic_inc(&conn->ibc_refcount); + kibnal_queue_tx_locked(tx, conn); + } + } + + while (!list_empty (&conn->ibc_tx_queue)) { + tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); + + /* We rely on this for QP sizing */ + LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG); + + LASSERT (conn->ibc_outstanding_credits >= 0); + LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_credits >= 0); + LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); + + /* Not on ibc_rdma_queue */ + LASSERT (!tx->tx_passive_rdma_wait); + + if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) + GOTO(out, 0); + + if (conn->ibc_credits == 0) /* no credits */ + GOTO(out, 1); + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + GOTO(out, 2); + + list_del (&tx->tx_list); + + if 
(tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && + (!list_empty(&conn->ibc_tx_queue) || + conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { + /* redundant NOOP */ + spin_unlock_irqrestore(&conn->ibc_lock, flags); + kibnal_tx_done(tx); + spin_lock_irqsave(&conn->ibc_lock, flags); + continue; + } + + tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; + conn->ibc_outstanding_credits = 0; + + conn->ibc_nsends_posted++; + conn->ibc_credits--; + + /* we only get a tx completion for the final rdma op */ + tx->tx_sending = min(tx->tx_nsp, 2); + tx->tx_passive_rdma_wait = tx->tx_passive_rdma; + list_add (&tx->tx_list, &conn->ibc_active_txs); +#if IBNAL_CKSUM + tx->tx_msg->ibm_cksum = 0; + tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); + CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); +#endif + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + /* NB the gap between removing tx from the queue and sending it + * allows message re-ordering to occur */ + + LASSERT (tx->tx_nsp > 0); + + rc = -ECONNABORTED; + nwork = 0; + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + tx->tx_status = 0; + /* Driver only accepts 1 item at a time */ + for (i = 0; i < tx->tx_nsp; i++) { + hexdump("tx", tx->tx_msg, sizeof(kib_msg_t)); + rc = iibt_postsend(conn->ibc_qp, + &tx->tx_wrq[i]); + if (rc != 0) + break; + if (wrq_signals_completion(&tx->tx_wrq[i])) + nwork++; + CDEBUG(D_NET, "posted tx wrq %p\n", + &tx->tx_wrq[i]); + } + } + + spin_lock_irqsave (&conn->ibc_lock, flags); + if (rc != 0) { + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; + conn->ibc_credits++; + conn->ibc_nsends_posted--; + + tx->tx_status = rc; + tx->tx_passive_rdma_wait = 0; + tx->tx_sending -= tx->tx_nsp - nwork; + + done = (tx->tx_sending == 0); + if (done) + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + if 
(conn->ibc_state == IBNAL_CONN_ESTABLISHED) + CERROR ("Error %d posting transmit to "LPX64"\n", + rc, conn->ibc_peer->ibp_nid); + else + CDEBUG (D_NET, "Error %d posting transmit to " + LPX64"\n", rc, conn->ibc_peer->ibp_nid); + + kibnal_close_conn (conn, rc); + + if (done) + kibnal_tx_done (tx); + return; + } + + } + + EXIT; +out: + spin_unlock_irqrestore (&conn->ibc_lock, flags); +} + +static void +kibnal_tx_callback (IB_WORK_COMPLETION *wc) +{ + kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId); + kib_conn_t *conn; + unsigned long flags; + int idle; + + conn = tx->tx_conn; + LASSERT (conn != NULL); + LASSERT (tx->tx_sending != 0); + + spin_lock_irqsave(&conn->ibc_lock, flags); + + CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx, + tx->tx_sending, tx->tx_nsp, wc->Status); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. If it's + * not me, then I take an extra ref on conn so it can't disappear + * under me. */ + + tx->tx_sending--; + idle = (tx->tx_sending == 0) && /* This is the final callback */ + (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ + if (idle) + list_del(&tx->tx_list); + + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + + if (tx->tx_sending == 0) + conn->ibc_nsends_posted--; + + if (wc->Status != WRStatusSuccess && + tx->tx_status == 0) + tx->tx_status = -ECONNABORTED; + + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + if (idle) + kibnal_tx_done (tx); + + if (wc->Status != WRStatusSuccess) { + CERROR ("Tx completion to "LPX64" failed: %d\n", + conn->ibc_peer->ibp_nid, wc->Status); + kibnal_close_conn (conn, -ENETDOWN); + } else { + /* can I shovel some more sends out the door? 
*/ + kibnal_check_sends(conn); + } + + kibnal_put_conn (conn); +} + +void +kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev) +{ + /* XXX flesh out. this seems largely for async errors */ + CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode); +} + +void +kibnal_ca_callback (void *ca_arg, void *cq_arg) +{ + IB_HANDLE cq = *(IB_HANDLE *)cq_arg; + IB_HANDLE ca = *(IB_HANDLE *)ca_arg; + IB_WORK_COMPLETION wc; + int armed = 0; + + CDEBUG(D_NET, "ca %p cq %p\n", ca, cq); + + for(;;) { + while (iibt_cq_poll(cq, &wc) == FSUCCESS) { + if (kibnal_wreqid_is_rx(wc.WorkReqId)) + kibnal_rx_callback(&wc); + else + kibnal_tx_callback(&wc); + } + if (armed) + return; + if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) { + CERROR("rearm failed?\n"); + return; + } + armed = 1; + } +} + +void +kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) +{ + IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp]; + IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nsp]; + int fence; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; + + LASSERT (tx->tx_nsp >= 0 && + tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0])); + LASSERT (nob <= IBNAL_MSG_SIZE); + + tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; + tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; + tx->tx_msg->ibm_type = type; +#if IBNAL_CKSUM + tx->tx_msg->ibm_nob = nob; +#endif + /* Fence the message if it's bundled with an RDMA read */ + fence = (tx->tx_nsp > 0) && + (type == IBNAL_MSG_PUT_DONE); + + *gl = (IB_LOCAL_DATASEGMENT) { + .Address = tx->tx_vaddr, + .Length = IBNAL_MSG_SIZE, + .Lkey = kibnal_lkey(kibnal_data.kib_tx_pages), + }; + + wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); + wrq->Operation = WROpSend; + wrq->DSList = gl; + wrq->DSListDepth = 1; + wrq->MessageLen = nob; + wrq->Req.SendRC.ImmediateData = 0; + wrq->Req.SendRC.Options.s.SolicitedEvent = 1; + wrq->Req.SendRC.Options.s.SignaledCompletion = 1; + wrq->Req.SendRC.Options.s.ImmediateData = 0; + wrq->Req.SendRC.Options.s.Fence = fence; + + tx->tx_nsp++; +} + 
+static void +kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +{ + unsigned long flags; + + spin_lock_irqsave(&conn->ibc_lock, flags); + + kibnal_queue_tx_locked (tx, conn); + + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); +} + +static void +kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + kib_conn_t *conn; + rwlock_t *g_lock = &kibnal_data.kib_global_lock; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT (tx->tx_nsp > 0); /* work items have been set up */ + + read_lock (g_lock); + + peer = kibnal_find_peer_locked (nid); + if (peer == NULL) { + read_unlock (g_lock); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ + read_unlock (g_lock); + + kibnal_queue_tx (tx, conn); + return; + } + + /* Making one or more connections; I'll need a write lock... 
*/ + read_unlock (g_lock); + write_lock_irqsave (g_lock, flags); + + peer = kibnal_find_peer_locked (nid); + if (peer == NULL) { + write_unlock_irqrestore (g_lock, flags); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + /* Connection exists; queue message on it */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ + write_unlock_irqrestore (g_lock, flags); + + kibnal_queue_tx (tx, conn); + return; + } + + if (peer->ibp_connecting == 0) { + if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { + write_unlock_irqrestore (g_lock, flags); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + peer->ibp_connecting = 1; + kib_peer_addref(peer); /* extra ref for connd */ + + spin_lock (&kibnal_data.kib_connd_lock); + + list_add_tail (&peer->ibp_connd_list, + &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock (&kibnal_data.kib_connd_lock); + } + + /* A connection is being established; queue the message... 
*/ + list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); + + write_unlock_irqrestore (g_lock, flags); +} + +static ptl_err_t +kibnal_start_passive_rdma (int type, ptl_nid_t nid, + lib_msg_t *libmsg, ptl_hdr_t *hdr) +{ + int nob = libmsg->md->length; + kib_tx_t *tx; + kib_msg_t *ibmsg; + int rc; + IB_ACCESS_CONTROL access = {0,}; + + LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA); + LASSERT (nob > 0); + LASSERT (!in_interrupt()); /* Mapping could block */ + + access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaRead = 1; + access.s.RdmaWrite = 1; + + tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ + LASSERT (tx != NULL); + + if ((libmsg->md->options & PTL_MD_KIOV) == 0) + rc = kibnal_map_iov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.iov, + 0, nob, 0); + else + rc = kibnal_map_kiov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.kiov, + 0, nob, 0); + + if (rc != 0) { + CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); + goto failed; + } + + if (type == IBNAL_MSG_GET_RDMA) { + /* reply gets finalized when tx completes */ + tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, + nid, libmsg); + if (tx->tx_libmsg[1] == NULL) { + CERROR ("Can't create reply for GET -> "LPX64"\n", + nid); + rc = -ENOMEM; + goto failed; + } + } + + tx->tx_passive_rdma = 1; + + ibmsg = tx->tx_msg; + + ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; + ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; + /* map_kiov alrady filled the rdma descs for the whole_mem case */ + if (!kibnal_whole_mem()) { + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey; + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; + ibmsg->ibm_u.rdma.ibrm_num_descs = 1; + } + + kibnal_init_tx_msg (tx, type, + kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs)); + + CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " + LPX64", nob %d\n", + tx, tx->tx_passive_rdma_cookie, 
tx->tx_md.md_rkey, + tx->tx_md.md_addr, nob); + + /* libmsg gets finalized when tx completes. */ + tx->tx_libmsg[0] = libmsg; + + kibnal_launch_tx(tx, nid); + return (PTL_OK); + + failed: + tx->tx_status = rc; + kibnal_tx_done (tx); + return (PTL_FAIL); +} + +void +kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob) +{ + kib_msg_t *rxmsg = rx->rx_msg; + kib_msg_t *txmsg; + kib_tx_t *tx; + IB_ACCESS_CONTROL access = {0,}; + IB_WR_OP rdma_op; + int rc; + __u32 i; + + CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n", + type, status, niov, offset, nob); + + /* Called by scheduler */ + LASSERT (!in_interrupt ()); + + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + /* No data if we're completing with failure */ + LASSERT (status == 0 || nob == 0); + + LASSERT (type == IBNAL_MSG_GET_DONE || + type == IBNAL_MSG_PUT_DONE); + + /* Flag I'm completing the RDMA. Even if I fail to send the + * completion message, I will have tried my best so further + * attempts shouldn't be tried. 
*/ + LASSERT (!rx->rx_rdma); + rx->rx_rdma = 1; + + if (type == IBNAL_MSG_GET_DONE) { + rdma_op = WROpRdmaWrite; + LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); + } else { + access.s.LocalWrite = 1; + rdma_op = WROpRdmaRead; + LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); + } + + tx = kibnal_get_idle_tx (0); /* Mustn't block */ + if (tx == NULL) { + CERROR ("tx descs exhausted on RDMA from "LPX64 + " completing locally with failure\n", + rx->rx_conn->ibc_peer->ibp_nid); + lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); + return; + } + LASSERT (tx->tx_nsp == 0); + + if (nob == 0) + GOTO(init_tx, 0); + + /* We actually need to transfer some data (the transfer + * size could get truncated to zero when the incoming + * message is matched) */ + if (kiov != NULL) + rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1); + else + rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1); + + if (rc != 0) { + CERROR ("Can't map RDMA -> "LPX64": %d\n", + rx->rx_conn->ibc_peer->ibp_nid, rc); + /* We'll skip the RDMA and complete with failure. */ + status = rc; + nob = 0; + GOTO(init_tx, rc); + } + + if (!kibnal_whole_mem()) { + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey; + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1; + } + + /* XXX ugh. different page-sized hosts. */ + if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs != + rxmsg->ibm_u.rdma.ibrm_num_descs) { + CERROR("tx descs (%u) != rx descs (%u)\n", + tx->tx_msg->ibm_u.rdma.ibrm_num_descs, + rxmsg->ibm_u.rdma.ibrm_num_descs); + /* We'll skip the RDMA and complete with failure. */ + status = rc; + nob = 0; + GOTO(init_tx, rc); + } + + /* map_kiov filled in the rdma descs which describe our side of the + * rdma transfer. 
*/ + /* ibrm_num_descs was verified in rx_callback */ + for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) { + kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */ + IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i]; + IB_WORK_REQ *wrq = &tx->tx_wrq[i]; + + ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i]; + rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i]; + + ds->Address = ldesc->rd_addr; + ds->Length = ldesc->rd_nob; + ds->Lkey = ldesc->rd_key; + + memset(wrq, 0, sizeof(*wrq)); + wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); + wrq->Operation = rdma_op; + wrq->DSList = ds; + wrq->DSListDepth = 1; + wrq->MessageLen = ds->Length; + wrq->Req.SendRC.ImmediateData = 0; + wrq->Req.SendRC.Options.s.SolicitedEvent = 0; + wrq->Req.SendRC.Options.s.SignaledCompletion = 0; + wrq->Req.SendRC.Options.s.ImmediateData = 0; + wrq->Req.SendRC.Options.s.Fence = 0; + wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr; + wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key; + + /* only the last rdma post triggers tx completion */ + if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1) + wrq->Req.SendRC.Options.s.SignaledCompletion = 1; + + tx->tx_nsp++; + } + +init_tx: + txmsg = tx->tx_msg; + + txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; + txmsg->ibm_u.completion.ibcm_status = status; + + kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); + + if (status == 0 && nob != 0) { + LASSERT (tx->tx_nsp > 1); + /* RDMA: libmsg gets finalized when the tx completes. This + * is after the completion message has been sent, which in + * turn is after the RDMA has finished. */ + tx->tx_libmsg[0] = libmsg; + } else { + LASSERT (tx->tx_nsp == 1); + /* No RDMA: local completion happens now! */ + CDEBUG(D_WARNING,"No data: immediate completion\n"); + lib_finalize (&kibnal_lib, NULL, libmsg, + status == 0 ? PTL_OK : PTL_FAIL); + } + + /* +1 ref for this tx... 
*/ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + rx->rx_conn, rx->rx_conn->ibc_state, + rx->rx_conn->ibc_peer->ibp_nid, + atomic_read (&rx->rx_conn->ibc_refcount)); + atomic_inc (&rx->rx_conn->ibc_refcount); + /* ...and queue it up */ + kibnal_queue_tx(tx, rx->rx_conn); +} + +static ptl_err_t +kibnal_sendmsg(lib_nal_t *nal, + void *private, + lib_msg_t *libmsg, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + size_t payload_offset, + size_t payload_nob) +{ + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + + /* NB 'private' is different depending on what we're sending.... */ + + CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64 + " pid %d\n", payload_nob, payload_niov, nid , pid); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + /* Thread context if we're sending payload */ + LASSERT (!in_interrupt() || payload_niov == 0); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + switch (type) { + default: + LBUG(); + return (PTL_FAIL); + + case PTL_MSG_REPLY: { + /* reply's 'private' is the incoming receive */ + kib_rx_t *rx = private; + + /* RDMA reply expected? */ + if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { + kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, + rx, libmsg, payload_niov, + payload_iov, payload_kiov, + payload_offset, payload_nob); + return (PTL_OK); + } + + /* Incoming message consistent with immediate reply? */ + if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { + CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", + nid, rx->rx_msg->ibm_type); + return (PTL_FAIL); + } + + /* Will it fit in a message? 
*/ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob >= IBNAL_MSG_SIZE) { + CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", + nid, payload_nob); + return (PTL_FAIL); + } + break; + } + + case PTL_MSG_GET: + /* might the REPLY message be big enough to need RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, + nid, libmsg, hdr)); + break; + + case PTL_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case PTL_MSG_PUT: + /* Is the payload big enough to need RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, + nid, libmsg, hdr)); + + break; + } + + tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); + if (tx == NULL) { + CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", + type, nid, in_interrupt() ? 
" (intr)" : ""); + return (PTL_NO_SPACE); + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_nob > 0) { + if (payload_kiov != NULL) + lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, + payload_niov, payload_iov, + payload_offset, payload_nob); + } + + kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, + offsetof(kib_immediate_msg_t, + ibim_payload[payload_nob])); + + /* libmsg gets finalized when tx completes */ + tx->tx_libmsg[0] = libmsg; + + kibnal_launch_tx(tx, nid); + return (PTL_OK); +} + +static ptl_err_t +kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, + size_t payload_offset, size_t payload_len) +{ + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, payload_iov, NULL, + payload_offset, payload_len)); +} + +static ptl_err_t +kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, ptl_kiov_t *payload_kiov, + size_t payload_offset, size_t payload_len) +{ + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, + payload_offset, payload_len)); +} + +static ptl_err_t +kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, + unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) +{ + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + int msg_nob; + + LASSERT (mlen <= rlen); + LASSERT (!in_interrupt ()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + return (PTL_FAIL); + + case IBNAL_MSG_IMMEDIATE: + msg_nob = offsetof(kib_msg_t, 
ibm_u.immediate.ibim_payload[rlen]); + if (msg_nob > IBNAL_MSG_SIZE) { + CERROR ("Immediate message from "LPX64" too big: %d\n", + rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); + return (PTL_FAIL); + } + + if (kiov != NULL) + lib_copy_buf2kiov(niov, kiov, offset, + rxmsg->ibm_u.immediate.ibim_payload, + mlen); + else + lib_copy_buf2iov(niov, iov, offset, + rxmsg->ibm_u.immediate.ibim_payload, + mlen); + + lib_finalize (nal, NULL, libmsg, PTL_OK); + return (PTL_OK); + + case IBNAL_MSG_GET_RDMA: + /* We get called here just to discard any junk after the + * GET hdr. */ + LASSERT (libmsg == NULL); + lib_finalize (nal, NULL, libmsg, PTL_OK); + return (PTL_OK); + + case IBNAL_MSG_PUT_RDMA: + kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, + rx, libmsg, + niov, iov, kiov, offset, mlen); + return (PTL_OK); + } +} + +static ptl_err_t +kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen) +{ + return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, + offset, mlen, rlen)); +} + +static ptl_err_t +kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) +{ + return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, + offset, mlen, rlen)); +} + +/***************************************************************************** + * the rest of this file concerns connection management. active connetions + * start with connect_peer, passive connections start with passive_callback. + * active disconnects start with conn_close, cm_callback starts passive + * disconnects and contains the guts of how the disconnect state machine + * progresses. 
+ *****************************************************************************/ + +int +kibnal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kibnal_data.kib_nthreads); + return (0); +} + +static void +kibnal_thread_fini (void) +{ + atomic_dec (&kibnal_data.kib_nthreads); +} + +/* this can be called by anyone at any time to close a connection. if + * the connection is still established it heads to the connd to start + * the disconnection in a safe context. It has no effect if called + * on a connection that is already disconnecting */ +void +kibnal_close_conn_locked (kib_conn_t *conn, int error) +{ + /* This just does the immmediate housekeeping, and schedules the + * connection for the connd to finish off. + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; + + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING, + IBNAL_CONN_DISCONNECTED); + + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) + return; /* already disconnecting */ + + CDEBUG (error == 0 ? D_NET : D_ERROR, + "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + /* kib_connd_conns takes ibc_list's ref */ + list_del (&conn->ibc_list); + } else { + /* new ref for kib_connd_conns */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + } + + if (list_empty (&peer->ibp_conns) && + peer->ibp_persistence == 0) { + /* Non-persistent peer with no more conns... 
*/ + kibnal_unlink_peer_locked (peer); + } + + conn->ibc_state = IBNAL_CONN_SEND_DREQ; + + spin_lock (&kibnal_data.kib_connd_lock); + + list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock (&kibnal_data.kib_connd_lock); +} + +void +kibnal_close_conn (kib_conn_t *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + kibnal_close_conn_locked (conn, error); + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); +} + +static void +kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) +{ + LIST_HEAD (zombies); + kib_tx_t *tx; + unsigned long flags; + + LASSERT (rc != 0); + LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + LASSERT (peer->ibp_connecting != 0); + peer->ibp_connecting--; + + if (peer->ibp_connecting != 0) { + /* another connection attempt under way (loopback?)... 
*/ + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + return; + } + + if (list_empty(&peer->ibp_conns)) { + /* Say when active connection can be re-attempted */ + peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; + /* Increase reconnection interval */ + peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, + IBNAL_MAX_RECONNECT_INTERVAL); + + /* Take peer's blocked blocked transmits; I'll complete + * them with error */ + while (!list_empty (&peer->ibp_tx_queue)) { + tx = list_entry (peer->ibp_tx_queue.next, + kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add_tail (&tx->tx_list, &zombies); + } + + if (kibnal_peer_active(peer) && + (peer->ibp_persistence == 0)) { + /* failed connection attempt on non-persistent peer */ + kibnal_unlink_peer_locked (peer); + } + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT (list_empty(&peer->ibp_tx_queue)); + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + if (!list_empty (&zombies)) + CERROR ("Deleting messages for "LPX64": connection failed\n", + peer->ibp_nid); + + while (!list_empty (&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + /* complete now */ + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + } +} + +static void +kibnal_connreq_done (kib_conn_t *conn, int active, int status) +{ + int state = conn->ibc_state; + kib_peer_t *peer = conn->ibc_peer; + kib_tx_t *tx; + unsigned long flags; + int i; + + /* passive connection has no connreq & vice versa */ + LASSERTF(!active == !(conn->ibc_connreq != NULL), + "%d %p\n", active, conn->ibc_connreq); + if (active) { + PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); + conn->ibc_connreq = NULL; + } + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + LASSERT (peer->ibp_connecting != 0); + + if (status == 0) { + /* connection established... 
*/ + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING); + conn->ibc_state = IBNAL_CONN_ESTABLISHED; + + if (!kibnal_peer_active(peer)) { + /* ...but peer deleted meantime */ + status = -ECONNABORTED; + } + } else { + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP, + IBNAL_CONN_CONNECTING); + } + + if (status == 0) { + /* Everything worked! */ + + peer->ibp_connecting--; + + /* +1 ref for ibc_list; caller(== CM)'s ref remains until + * the IB_CM_IDLE callback */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + list_add (&conn->ibc_list, &peer->ibp_conns); + + /* reset reconnect interval for next attempt */ + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + + /* post blocked sends to the new connection */ + spin_lock (&conn->ibc_lock); + + while (!list_empty (&peer->ibp_tx_queue)) { + tx = list_entry (peer->ibp_tx_queue.next, + kib_tx_t, tx_list); + + list_del (&tx->tx_list); + + /* +1 ref for each tx */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + kibnal_queue_tx_locked (tx, conn); + } + + spin_unlock (&conn->ibc_lock); + + /* Nuke any dangling conns from a different peer instance... 
*/ + kibnal_close_stale_conns_locked (conn->ibc_peer, + conn->ibc_incarnation); + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + /* queue up all the receives */ + for (i = 0; i < IBNAL_RX_MSGS; i++) { + /* +1 ref for rx desc */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + + CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n", + i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, + conn->ibc_rxs[i].rx_vaddr); + + kibnal_post_rx (&conn->ibc_rxs[i], 0); + } + + kibnal_check_sends (conn); + return; + } + + /* connection failed */ + if (state == IBNAL_CONN_CONNECTING) { + /* schedule for connd to close */ + kibnal_close_conn_locked (conn, status); + } else { + /* Don't have a CM comm_id; just wait for refs to drain */ + conn->ibc_state = IBNAL_CONN_DISCONNECTED; + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + kibnal_peer_connect_failed (conn->ibc_peer, active, status); + + /* If we didn't establish the connection we don't have to pass + * through the disconnect protocol before dropping the CM ref */ + if (state < IBNAL_CONN_CONNECTING) + kibnal_put_conn (conn); +} + +static int +kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep, + ptl_nid_t nid, __u64 incarnation, int queue_depth) +{ + kib_conn_t *conn = kibnal_create_conn(); + kib_peer_t *peer; + kib_peer_t *peer2; + unsigned long flags; + + if (conn == NULL) + return (-ENOMEM); + + if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { + CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", + nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); + atomic_dec (&conn->ibc_refcount); + kibnal_destroy_conn(conn); + return (-EPROTO); + } + + /* assume 'nid' is a new peer */ + peer = kibnal_create_peer (nid); + if (peer == NULL) { + CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read 
(&conn->ibc_refcount)); + atomic_dec (&conn->ibc_refcount); + kibnal_destroy_conn(conn); + return (-ENOMEM); + } + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + peer2 = kibnal_find_peer_locked(nid); + if (peer2 == NULL) { + /* peer table takes my ref on peer */ + list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid)); + } else { + kib_peer_decref (peer); + peer = peer2; + } + + kib_peer_addref(peer); /* +1 ref for conn */ + peer->ibp_connecting++; + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + conn->ibc_peer = peer; + conn->ibc_state = IBNAL_CONN_CONNECTING; + /* conn->ibc_cep is set when cm_accept is called */ + conn->ibc_incarnation = incarnation; + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + + *connp = conn; + return (0); +} + +static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state) +{ + IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,}; + FSTATUS frc; + + modify_attr.RequestState = state; + + frc = iibt_qp_modify(qp, &modify_attr, NULL); + if (frc != FSUCCESS) + CERROR("couldn't set qp state to %d, error %d\n", state, frc); +} + +static void kibnal_flush_pending(kib_conn_t *conn) +{ + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + unsigned long flags; + int done; + + /* NB we wait until the connection has closed before completing + * outstanding passive RDMAs so we can be sure the network can't + * touch the mapped memory any more. 
*/ + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED); + + /* set the QP to the error state so that we get flush callbacks + * on our posted receives which can then drop their conn refs */ + kibnal_set_qp_state(conn->ibc_qp, QPStateError); + + spin_lock_irqsave (&conn->ibc_lock, flags); + + /* grab passive RDMAs not waiting for the tx callback */ + list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + /* still waiting for tx callback? */ + if (!tx->tx_passive_rdma_wait) + continue; + + tx->tx_status = -ECONNABORTED; + tx->tx_passive_rdma_wait = 0; + done = (tx->tx_sending == 0); + + if (!done) + continue; + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + /* grab all blocked transmits */ + list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + while (!list_empty(&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + kibnal_tx_done (tx); + } +} + +static void +kibnal_reject (IB_HANDLE cep, uint16_t reason) +{ + CM_REJECT_INFO *rej; + + PORTAL_ALLOC(rej, sizeof(*rej)); + if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */ + return; + + rej->Reason = reason; + iibt_cm_reject(cep, rej); + PORTAL_FREE(rej, sizeof(*rej)); +} + +static FSTATUS +kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, + IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn) +{ + IB_QP_ATTRIBUTES_MODIFY modify_attr; + FSTATUS frc; + ENTRY; + + modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateReadyToRecv, + .RecvPSN = IBNAL_STARTING_PSN, + .DestQPNumber = qpn, + .ResponderResources = resp_res, + .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */ + 
.Attrs = (IB_QP_ATTR_RECVPSN | + IB_QP_ATTR_DESTQPNUMBER | + IB_QP_ATTR_RESPONDERRESOURCES | + IB_QP_ATTR_DESTAV | + IB_QP_ATTR_PATHMTU | + IB_QP_ATTR_MINRNRTIMER), + }; + GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, + &modify_attr.DestAV); + + frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); + if (frc != FSUCCESS) + RETURN(frc); + + modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateReadyToSend, + .FlowControl = TRUE, + .InitiatorDepth = init_depth, + .SendPSN = send_psn, + .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */ + .RetryCount = IBNAL_RETRY, + .RnrRetryCount = IBNAL_RNR_RETRY, + .Attrs = (IB_QP_ATTR_FLOWCONTROL | + IB_QP_ATTR_INITIATORDEPTH | + IB_QP_ATTR_SENDPSN | + IB_QP_ATTR_LOCALACKTIMEOUT | + IB_QP_ATTR_RETRYCOUNT | + IB_QP_ATTR_RNRRETRYCOUNT), + }; + + frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); + RETURN(frc); +}
+
+/* Handle the CM reply to an active connection request; called from
+ * kibnal_cm_callback() on FCM_CONNECT_REPLY with the conn in 'arg'.
+ * Checks the magic, version, queue depth and NID carried in the reply's
+ * private data, moves the QP through RTR/RTS and accepts the connection.
+ * A validation or QP failure rejects the reply and completes the connreq
+ * with -EPROTO; an accept failure completes it with -ECONNABORTED. */
+static void
+kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_conn_t *conn = arg;
+        kib_wire_connreq_t *wcr;
+        CM_REPLY_INFO *rep = &info->Info.Reply;
+        uint16_t reason;
+        FSTATUS frc;
+
+        wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
+
+        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+                CERROR ("Can't connect "LPX64": bad magic %08x\n",
+                        conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+
+        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+                /* report the offending version, not the magic */
+                CERROR ("Can't connect "LPX64": bad version %d\n",
+                        conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+
+        if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
+                CERROR ("Can't connect "LPX64": bad queue depth %d\n",
+                        conn->ibc_peer->ibp_nid,
+                        le16_to_cpu(wcr->wcr_queue_depth));
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+
+        if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
+                CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
+                        le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
+                GOTO(reject, reason = RC_USER_REJ);
+        }
+
+        CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+        conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+        /* each depth is clamped to what the local HCA supports */
+        frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN,
+                            min_t(__u8, rep->ArbInitiatorDepth,
+                                  ca_attr->MaxQPResponderResources),
+                            &conn->ibc_connreq->cr_path,
+                            min_t(__u8, rep->ArbResponderResources,
+                                  ca_attr->MaxQPInitiatorDepth),
+                            rep->StartingPSN);
+        if (frc != FSUCCESS) {
+                CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
+                       conn, conn->ibc_peer->ibp_nid, frc);
+                GOTO(reject, reason = RC_NO_QP);
+        }
+
+        /* the callback arguments are ignored for an active accept */
+        conn->ibc_connreq->cr_discarded.Status = FSUCCESS;
+        frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded,
+                             NULL, NULL, NULL, NULL);
+        if (frc != FCM_CONNECT_ESTABLISHED) {
+                CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n",
+                       conn, conn->ibc_peer->ibp_nid, frc);
+                kibnal_connreq_done (conn, 1, -ECONNABORTED);
+                /* XXX don't call reject after accept fails? */
+                return;
+        }
+
+        CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
+               conn, conn->ibc_peer->ibp_nid);
+
+        kibnal_connreq_done (conn, 1, 0);
+        return;
+
+reject:
+        kibnal_reject(cep, reason);
+        kibnal_connreq_done (conn, 1, -EPROTO);
+}
+
+/* ib_cm.h has a wealth of information on the CM procedures */ +static void +kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +{ + kib_conn_t *conn = arg; + + CDEBUG(D_NET, "status 0x%x\n", info->Status); + + /* Established Connection Notifier */ + switch (info->Status) { + default: + CERROR("unknown status %d on Connection %p -> "LPX64"\n", + info->Status, conn, conn->ibc_peer->ibp_nid); + LBUG(); + break; + + case FCM_CONNECT_REPLY: + kibnal_connect_reply(cep, info, arg); + break; + + case FCM_DISCONNECT_REQUEST: + /* XXX lock around these state management bits? 
*/ + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) + kibnal_close_conn (conn, 0); + conn->ibc_state = IBNAL_CONN_DREP; + iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); + break; + + /* these both guarantee that no more cm callbacks will occur */ + case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */ + case FCM_DISCONNECT_REPLY: + CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n", + conn, conn->ibc_peer->ibp_nid); + + conn->ibc_state = IBNAL_CONN_DISCONNECTED; + kibnal_flush_pending(conn); + kibnal_put_conn(conn); /* Lose CM's ref */ + break; + } + + return; +} + +static int +kibnal_set_cm_flags(IB_HANDLE cep) +{ + FSTATUS frc; + uint32 value = 1; + + frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK, + (char *)&value, sizeof(value), 0); + if (frc != FSUCCESS) { + CERROR("error setting timeout callback: %d\n", frc); + return -1; + } + +#if 0 + frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value, + sizeof(value), 0); + if (frc != FSUCCESS) { + CERROR("error setting async accept: %d\n", frc); + return -1; + } +#endif + + return 0; +} + +void +kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +{ + IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; + IB_QP_ATTRIBUTES_QUERY *query; + CM_REQUEST_INFO *req; + CM_CONN_INFO *rep = NULL, *rcv = NULL; + kib_wire_connreq_t *wcr; + kib_conn_t *conn = NULL; + uint16_t reason = 0; + FSTATUS frc; + int rc = 0; + + LASSERT(cep); + LASSERT(info); + LASSERT(arg == NULL); /* no conn yet for passive */ + + CDEBUG(D_NET, "status 0x%x\n", info->Status); + + req = &info->Info.Request; + wcr = (kib_wire_connreq_t *)req->PrivateData; + + CDEBUG(D_NET, "%d from "LPX64"\n", info->Status, + le64_to_cpu(wcr->wcr_nid)); + + if (info->Status == FCM_CONNECT_CANCEL) + return; + + LASSERT (info->Status == FCM_CONNECT_REQUEST); + + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { + CERROR ("Can't accept: bad magic %08x\n", + le32_to_cpu(wcr->wcr_magic)); + GOTO(out, reason = RC_USER_REJ); 
+ } + + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { + CERROR ("Can't accept: bad version %d\n", + le16_to_cpu(wcr->wcr_magic)); + GOTO(out, reason = RC_USER_REJ); + } + + rc = kibnal_accept(&conn, cep, + le64_to_cpu(wcr->wcr_nid), + le64_to_cpu(wcr->wcr_incarnation), + le16_to_cpu(wcr->wcr_queue_depth)); + if (rc != 0) { + CERROR ("Can't accept "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), rc); + GOTO(out, reason = RC_NO_RESOURCES); + } + + frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN, + min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, + ca_attr->MaxQPResponderResources), + &req->PathInfo.Path, + min_t(__u8, req->CEPInfo.OfferedResponderResources, + ca_attr->MaxQPInitiatorDepth), + req->CEPInfo.StartingPSN); + + if (frc != FSUCCESS) { + CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), frc); + GOTO(out, reason = RC_NO_QP); + } + + frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL); + if (frc != FSUCCESS) { + CERROR ("Couldn't query qp attributes "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), frc); + GOTO(out, reason = RC_NO_QP); + } + query = &conn->ibc_qp_attrs; + + PORTAL_ALLOC(rep, sizeof(*rep)); + PORTAL_ALLOC(rcv, sizeof(*rcv)); + if (rep == NULL || rcv == NULL) { + CERROR ("can't reply and receive buffers\n"); + GOTO(out, reason = RC_INSUFFICIENT_RESP_RES); + } + + /* don't try to deref this into the incoming wcr :) */ + wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData; + + rep->Info.Reply = (CM_REPLY_INFO) { + .QPN = query->QPNumber, + .QKey = query->Qkey, + .StartingPSN = query->RecvPSN, + .EndToEndFlowControl = query->FlowControl, + /* XXX Hmm. 
*/ + .ArbInitiatorDepth = query->InitiatorDepth, + .ArbResponderResources = query->ResponderResources, + .TargetAckDelay = 0, + .FailoverAccepted = 0, + .RnRRetryCount = req->CEPInfo.RnrRetryCount, + }; + + *wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), + }; + + frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, + &conn->ibc_cep); + + PORTAL_FREE(rep, sizeof(*rep)); + PORTAL_FREE(rcv, sizeof(*rcv)); + + if (frc != FCM_CONNECT_ESTABLISHED) { + /* XXX it seems we don't call reject after this point? */ + CERROR("iibt_cm_accept() failed: %d, aborting\n", frc); + rc = -ECONNABORTED; + goto out; + } + + if (kibnal_set_cm_flags(conn->ibc_cep)) { + rc = -ECONNABORTED; + goto out; + } + + CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", + conn, conn->ibc_peer->ibp_nid); + +out: + if (reason) { + kibnal_reject(cep, reason); + rc = -ECONNABORTED; + } + if (conn != NULL) + kibnal_connreq_done(conn, 0, rc); + + return; +} + +static void +dump_path_records(PATH_RESULTS *results) +{ + IB_PATH_RECORD *path; + int i; + + for(i = 0; i < results->NumPathRecords; i++) { + path = &results->PathRecords[i]; + CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid " + LPX64":"LPX64" pkey %x\n", + i, + path->SGID.Type.Global.SubnetPrefix, + path->SGID.Type.Global.InterfaceID, + path->DGID.Type.Global.SubnetPrefix, + path->DGID.Type.Global.InterfaceID, + path->P_Key); + } +} + +static void +kibnal_pathreq_callback (void *arg, QUERY *query, + QUERY_RESULT_VALUES *query_res) +{ + IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; + kib_conn_t *conn = arg; + PATH_RESULTS *path; + FSTATUS frc; + + if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { + CERROR ("status %d data size %d\n", query_res->Status, + 
query_res->ResultDataSize); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + path = (PATH_RESULTS *)query_res->QueryResult; + + if (path->NumPathRecords < 1) { + CERROR ("expected path records: %d\n", path->NumPathRecords); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + dump_path_records(path); + + /* just using the first. this is probably a horrible idea. */ + conn->ibc_connreq->cr_path = path->PathRecords[0]; + + conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE); + if (conn->ibc_cep == NULL) { + CERROR ("Can't create CEP\n"); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + if (kibnal_set_cm_flags(conn->ibc_cep)) { + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), + }; + + conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) { + .SID = conn->ibc_connreq->cr_service.RID.ServiceID, + .CEPInfo = (CM_CEP_INFO) { + .CaGUID = kibnal_data.kib_hca_guids[0], + .EndToEndFlowControl = FALSE, + .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID, + .RetryCount = IBNAL_RETRY, + .RnrRetryCount = IBNAL_RNR_RETRY, + .AckTimeout = IBNAL_ACK_TIMEOUT, + .StartingPSN = IBNAL_STARTING_PSN, + .QPN = conn->ibc_qp_attrs.QPNumber, + .QKey = conn->ibc_qp_attrs.Qkey, + .OfferedResponderResources = ca_attr->MaxQPResponderResources, + .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth, + }, + .PathInfo = (CM_CEP_PATHINFO) { + .bSubnetLocal = TRUE, + .Path = conn->ibc_connreq->cr_path, + }, + }; + +#if 0 + /* XXX set timeout just like SDP!!!*/ + conn->ibc_connreq->cr_path.packet_life = 13; +#endif + /* Flag I'm getting involved with the CM... 
*/ + conn->ibc_state = IBNAL_CONN_CONNECTING; + + CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", + conn->ibc_connreq->cr_service.RID.ServiceID, + *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + + memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, + CM_REQUEST_INFO_USER_LEN); + memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, + &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr)); + + /* kibnal_cm_callback gets my conn ref */ + frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq, + kibnal_cm_callback, conn); + if (frc != FPENDING && frc != FSUCCESS) { + CERROR ("Connect: %d\n", frc); + /* Back out state change as connect failed */ + conn->ibc_state = IBNAL_CONN_INIT_QP; + kibnal_connreq_done (conn, 1, -EINVAL); + } +} + +static void +dump_service_records(SERVICE_RECORD_RESULTS *results) +{ + IB_SERVICE_RECORD *svc; + int i; + + for(i = 0; i < results->NumServiceRecords; i++) { + svc = &results->ServiceRecords[i]; + CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n", + i, + svc->RID.ServiceID, + svc->RID.ServiceGID.Type.Global.SubnetPrefix, + svc->RID.ServiceGID.Type.Global.InterfaceID, + svc->RID.ServiceP_Key); + } +} + + +static void +kibnal_service_get_callback (void *arg, QUERY *query, + QUERY_RESULT_VALUES *query_res) +{ + kib_conn_t *conn = arg; + SERVICE_RECORD_RESULTS *svc; + COMMAND_CONTROL_PARAMETERS sd_params; + QUERY path_query; + FSTATUS frc; + + if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { + CERROR ("status %d data size %d\n", query_res->Status, + query_res->ResultDataSize); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult; + + if (svc->NumServiceRecords < 1) { + CERROR ("%d service records\n", svc->NumServiceRecords); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + dump_service_records(svc); + + conn->ibc_connreq->cr_service = svc->ServiceRecords[0]; + + CDEBUG(D_NET, "Got status %d, 
service id "LPX64", on "LPX64"\n", + query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID, + *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + + memset(&path_query, 0, sizeof(path_query)); + path_query.InputType = InputTypePortGuidPair; + path_query.OutputType = OutputTypePathRecord; + path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid; + path_query.InputValue.PortGuidPair.DestPortGuid = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID; + + memset(&sd_params, 0, sizeof(sd_params)); + sd_params.RetryCount = IBNAL_RETRY; + sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ + + /* kibnal_service_get_callback gets my conn ref */ + + frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + &path_query, + kibnal_pathreq_callback, + &sd_params, conn); + if (frc == FPENDING) + return; + + CERROR ("Path record request failed: %d\n", frc); + kibnal_connreq_done (conn, 1, -EINVAL); +} + +static void +kibnal_connect_peer (kib_peer_t *peer) +{ + COMMAND_CONTROL_PARAMETERS sd_params; + QUERY query; + FSTATUS frc; + kib_conn_t *conn = kibnal_create_conn(); + + LASSERT (peer->ibp_connecting != 0); + + if (conn == NULL) { + CERROR ("Can't allocate conn\n"); + kibnal_peer_connect_failed (peer, 1, -ENOMEM); + return; + } + + conn->ibc_peer = peer; + kib_peer_addref(peer); + + PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); + if (conn->ibc_connreq == NULL) { + CERROR ("Can't allocate connreq\n"); + kibnal_connreq_done (conn, 1, -ENOMEM); + return; + } + + memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); + + kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); + + memset(&query, 0, sizeof(query)); + query.InputType = InputTypeServiceRecord; + query.OutputType = OutputTypeServiceRecord; + query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service; + query.InputValue.ServiceRecordValue.ComponentMask = 
KIBNAL_SERVICE_KEY_MASK; + + memset(&sd_params, 0, sizeof(sd_params)); + sd_params.RetryCount = IBNAL_RETRY; + sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ + + /* kibnal_service_get_callback gets my conn ref */ + frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + &query, + kibnal_service_get_callback, + &sd_params, conn); + if (frc == FPENDING) + return; + + CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc); + kibnal_connreq_done (conn, 1, frc); +} + +static int +kibnal_conn_timed_out (kib_conn_t *conn) +{ + kib_tx_t *tx; + struct list_head *ttmp; + unsigned long flags; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + list_for_each (ttmp, &conn->ibc_tx_queue) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_sending == 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + + list_for_each (ttmp, &conn->ibc_active_txs) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + return 0; +} + +static void +kibnal_check_conns (int idx) +{ + struct list_head *peers = &kibnal_data.kib_peers[idx]; + struct list_head *ptmp; + kib_peer_t *peer; + kib_conn_t *conn; + struct list_head *ctmp; + + again: + /* NB. We expect to have a look at all the peers and not find any + * rdmas to time out, so we just use a shared lock while we + * take a look... 
*/ + read_lock (&kibnal_data.kib_global_lock); + + list_for_each (ptmp, peers) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); + + list_for_each (ctmp, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED); + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + kibnal_check_sends(conn); + + if (!kibnal_conn_timed_out(conn)) + continue; + + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + + atomic_inc (&conn->ibc_refcount); + read_unlock (&kibnal_data.kib_global_lock); + + CERROR("Timed out RDMA with "LPX64"\n", + peer->ibp_nid); + + kibnal_close_conn (conn, -ETIMEDOUT); + kibnal_put_conn (conn); + + /* start again now I've dropped the lock */ + goto again; + } + } + + read_unlock (&kibnal_data.kib_global_lock); +} + +static void +kib_connd_handle_state(kib_conn_t *conn) +{ + FSTATUS frc; + + switch (conn->ibc_state) { + /* all refs have gone, free and be done with it */ + case IBNAL_CONN_DISCONNECTED: + kibnal_destroy_conn (conn); + return; /* avoid put_conn */ + + case IBNAL_CONN_SEND_DREQ: + frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); + if (frc != FSUCCESS) /* XXX do real things */ + CERROR("disconnect failed: %d\n", frc); + conn->ibc_state = IBNAL_CONN_DREQ; + break; + + /* a callback got to the conn before we did */ + case IBNAL_CONN_DREP: + break; + + default: + CERROR ("Bad conn %p state: %d\n", conn, + conn->ibc_state); + LBUG(); + break; + } + + /* drop ref from close_conn */ + kibnal_put_conn(conn); +} + +int +kibnal_connd (void *arg) +{ + wait_queue_t wait; + unsigned long flags; + kib_conn_t *conn; + kib_peer_t *peer; + int timeout; + int i; + int peer_index = 0; + unsigned long deadline = jiffies; + + kportal_daemonize ("kibnal_connd"); + kportal_blockallsigs (); + + init_waitqueue_entry (&wait, 
current); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + + for (;;) { + if (!list_empty (&kibnal_data.kib_connd_conns)) { + conn = list_entry (kibnal_data.kib_connd_conns.next, + kib_conn_t, ibc_list); + list_del (&conn->ibc_list); + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + kib_connd_handle_state(conn); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + continue; + } + + if (!list_empty (&kibnal_data.kib_connd_peers)) { + peer = list_entry (kibnal_data.kib_connd_peers.next, + kib_peer_t, ibp_connd_list); + + list_del_init (&peer->ibp_connd_list); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + kibnal_connect_peer (peer); + kib_peer_decref (peer); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } + + /* shut down and nobody left to reap... */ + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) + break; + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + /* careful with the jiffy wrap... */ + while ((timeout = (int)(deadline - jiffies)) <= 0) { + const int n = 4; + const int p = 1; + int chunk = kibnal_data.kib_peer_hash_size; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. 
*/ + + if (kibnal_tunables.kib_io_timeout > n * p) + chunk = (chunk * n * p) / + kibnal_tunables.kib_io_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kibnal_check_conns (peer_index); + peer_index = (peer_index + 1) % + kibnal_data.kib_peer_hash_size; + } + + deadline += p * HZ; + } + + kibnal_data.kib_connd_waketime = jiffies + timeout; + + set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); + + if (!kibnal_data.kib_shutdown && + list_empty (&kibnal_data.kib_connd_conns) && + list_empty (&kibnal_data.kib_connd_peers)) + schedule_timeout (timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + kibnal_thread_fini (); + return (0); +} + +int +kibnal_scheduler(void *arg) +{ + long id = (long)arg; + char name[16]; + kib_rx_t *rx; + kib_tx_t *tx; + unsigned long flags; + int rc; + int counter = 0; + int did_something; + + snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); + kportal_daemonize(name); + kportal_blockallsigs(); + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + + for (;;) { + did_something = 0; + + while (!list_empty(&kibnal_data.kib_sched_txq)) { + tx = list_entry(kibnal_data.kib_sched_txq.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + kibnal_tx_done(tx); + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + + if (!list_empty(&kibnal_data.kib_sched_rxq)) { + rx = list_entry(kibnal_data.kib_sched_rxq.next, + kib_rx_t, rx_list); + list_del(&rx->rx_list); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + + kibnal_rx(rx); + + did_something = 1; + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + + /* shut down and no receives to complete... 
*/ + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) + break; + + /* nothing to do or hogging CPU */ + if (!did_something || counter++ == IBNAL_RESCHED) { + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + counter = 0; + + if (!did_something) { + rc = wait_event_interruptible( + kibnal_data.kib_sched_waitq, + !list_empty(&kibnal_data.kib_sched_txq) || + !list_empty(&kibnal_data.kib_sched_rxq) || + (kibnal_data.kib_shutdown && + atomic_read (&kibnal_data.kib_nconns) == 0)); + } else { + our_cond_resched(); + } + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + } + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + + kibnal_thread_fini(); + return (0); +} + + +lib_nal_t kibnal_lib = { + libnal_data: &kibnal_data, /* NAL private data */ + libnal_send: kibnal_send, + libnal_send_pages: kibnal_send_pages, + libnal_recv: kibnal_recv, + libnal_recv_pages: kibnal_recv_pages, + libnal_dist: kibnal_dist +}; diff --git a/lustre/portals/knals/scimacnal/.cvsignore b/lustre/portals/knals/openibnal/.cvsignore similarity index 100% rename from lustre/portals/knals/scimacnal/.cvsignore rename to lustre/portals/knals/openibnal/.cvsignore index 48b17e9..5ed596b 100644 --- a/lustre/portals/knals/scimacnal/.cvsignore +++ b/lustre/portals/knals/openibnal/.cvsignore @@ -1,10 +1,10 @@ .deps Makefile +.*.cmd autoMakefile.in autoMakefile *.ko *.mod.c .*.flags -.*.cmd .tmp_versions .depend diff --git a/lustre/portals/knals/openibnal/openibnal.c b/lustre/portals/knals/openibnal/openibnal.c index 6f66143..652eb34 100644 --- a/lustre/portals/knals/openibnal/openibnal.c +++ b/lustre/portals/knals/openibnal/openibnal.c @@ -23,26 +23,25 @@ #include "openibnal.h" -nal_t koibnal_api; -ptl_handle_ni_t koibnal_ni; -koib_data_t koibnal_data; -koib_tunables_t koibnal_tunables; +nal_t kibnal_api; +ptl_handle_ni_t kibnal_ni; +kib_data_t kibnal_data; +kib_tunables_t kibnal_tunables; #ifdef CONFIG_SYSCTL -#define OPENIBNAL_SYSCTL 202 
+#define IBNAL_SYSCTL 202 -#define OPENIBNAL_SYSCTL_TIMEOUT 1 -#define OPENIBNAL_SYSCTL_ZERO_COPY 2 +#define IBNAL_SYSCTL_TIMEOUT 1 -static ctl_table koibnal_ctl_table[] = { - {OPENIBNAL_SYSCTL_TIMEOUT, "timeout", - &koibnal_tunables.koib_io_timeout, sizeof (int), +static ctl_table kibnal_ctl_table[] = { + {IBNAL_SYSCTL_TIMEOUT, "timeout", + &kibnal_tunables.kib_io_timeout, sizeof (int), 0644, NULL, &proc_dointvec}, { 0 } }; -static ctl_table koibnal_top_ctl_table[] = { - {OPENIBNAL_SYSCTL, "openibnal", NULL, 0, 0555, koibnal_ctl_table}, +static ctl_table kibnal_top_ctl_table[] = { + {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table}, { 0 } }; #endif @@ -66,167 +65,183 @@ print_service(struct ib_common_attrib_service *service, char *tag, int rc) "service id: "LPX64"\n" "name : %s\n" "NID : "LPX64"\n", tag, rc, - service->service_id, name, service->service_data64[0]); + service->service_id, name, + *kibnal_service_nid_field(service)); } void -koibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status, +kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status, struct ib_common_attrib_service *service, void *arg) { *(int *)arg = status; - up (&koibnal_data.koib_nid_signal); + up (&kibnal_data.kib_nid_signal); } +#if IBNAL_CHECK_ADVERT +void +kibnal_check_advert (void) +{ + struct ib_common_attrib_service *svc; + __u64 tid; + int rc; + int rc2; + + PORTAL_ALLOC(svc, sizeof(*svc)); + if (svc == NULL) + return; + + memset (svc, 0, sizeof (*svc)); + kibnal_set_service_keys(svc, kibnal_data.kib_nid); + + rc = ib_service_get (kibnal_data.kib_device, + kibnal_data.kib_port, + svc, + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + kibnal_service_setunset_done, &rc2, + &tid); + + if (rc != 0) { + CERROR ("Immediate error %d checking SM service\n", rc); + } else { + down (&kibnal_data.kib_nid_signal); + rc = rc2; + + if (rc != 0) + CERROR ("Error %d checking SM service\n", rc); + } + + PORTAL_FREE(svc, sizeof(*svc)); +} 
+#endif + int -koibnal_advertise (void) +kibnal_advertise (void) { + struct ib_common_attrib_service *svc; __u64 tid; int rc; int rc2; - LASSERT (koibnal_data.koib_nid != PTL_NID_ANY); + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + PORTAL_ALLOC(svc, sizeof(*svc)); + if (svc == NULL) + return (-ENOMEM); - memset (&koibnal_data.koib_service, 0, - sizeof (koibnal_data.koib_service)); + memset (svc, 0, sizeof (*svc)); - koibnal_data.koib_service.service_id - = koibnal_data.koib_cm_service_id; + svc->service_id = kibnal_data.kib_service_id; - rc = ib_cached_gid_get(koibnal_data.koib_device, - koibnal_data.koib_port, + rc = ib_cached_gid_get(kibnal_data.kib_device, + kibnal_data.kib_port, 0, - koibnal_data.koib_service.service_gid); + svc->service_gid); if (rc != 0) { CERROR ("Can't get port %d GID: %d\n", - koibnal_data.koib_port, rc); - return (rc); + kibnal_data.kib_port, rc); + goto out; } - rc = ib_cached_pkey_get(koibnal_data.koib_device, - koibnal_data.koib_port, + rc = ib_cached_pkey_get(kibnal_data.kib_device, + kibnal_data.kib_port, 0, - &koibnal_data.koib_service.service_pkey); + &svc->service_pkey); if (rc != 0) { CERROR ("Can't get port %d PKEY: %d\n", - koibnal_data.koib_port, rc); - return (rc); + kibnal_data.kib_port, rc); + goto out; } - koibnal_data.koib_service.service_lease = 0xffffffff; + svc->service_lease = 0xffffffff; - koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid); + kibnal_set_service_keys(svc, kibnal_data.kib_nid); CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", - koibnal_data.koib_service.service_id, - koibnal_data.koib_service.service_name, - *koibnal_service_nid_field(&koibnal_data.koib_service)); + svc->service_id, + svc->service_name, *kibnal_service_nid_field(svc)); - rc = ib_service_set (koibnal_data.koib_device, - koibnal_data.koib_port, - &koibnal_data.koib_service, + rc = ib_service_set (kibnal_data.kib_device, + kibnal_data.kib_port, + svc, IB_SA_SERVICE_COMP_MASK_ID | 
IB_SA_SERVICE_COMP_MASK_GID | IB_SA_SERVICE_COMP_MASK_PKEY | IB_SA_SERVICE_COMP_MASK_LEASE | - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_setunset_done, &rc2, &tid); + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + kibnal_service_setunset_done, &rc2, &tid); - if (rc == 0) { - down (&koibnal_data.koib_nid_signal); - rc = rc2; + if (rc != 0) { + CERROR ("Immediate error %d advertising NID "LPX64"\n", + rc, kibnal_data.kib_nid); + goto out; } - - if (rc != 0) - CERROR ("Error %d advertising SM service\n", rc); + down (&kibnal_data.kib_nid_signal); + + rc = rc2; + if (rc != 0) + CERROR ("Error %d advertising NID "LPX64"\n", + rc, kibnal_data.kib_nid); + out: + PORTAL_FREE(svc, sizeof(*svc)); return (rc); } -int -koibnal_unadvertise (int expect_success) +void +kibnal_unadvertise (int expect_success) { + struct ib_common_attrib_service *svc; __u64 tid; int rc; int rc2; - LASSERT (koibnal_data.koib_nid != PTL_NID_ANY); + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); - memset (&koibnal_data.koib_service, 0, - sizeof (koibnal_data.koib_service)); + PORTAL_ALLOC(svc, sizeof(*svc)); + if (svc == NULL) + return; - koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid); + memset (svc, 0, sizeof(*svc)); + + kibnal_set_service_keys(svc, kibnal_data.kib_nid); CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n", - koibnal_data.koib_service.service_name, - *koibnal_service_nid_field(&koibnal_data.koib_service)); - - rc = ib_service_delete (koibnal_data.koib_device, - koibnal_data.koib_port, - &koibnal_data.koib_service, - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_setunset_done, &rc2, &tid); + svc->service_name, *kibnal_service_nid_field(svc)); + + rc = ib_service_delete (kibnal_data.kib_device, + kibnal_data.kib_port, + svc, + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + kibnal_service_setunset_done, &rc2, &tid); if (rc != 0) { CERROR 
("Immediate error %d unadvertising NID "LPX64"\n", - rc, koibnal_data.koib_nid); - return (rc); + rc, kibnal_data.kib_nid); + goto out; } - down (&koibnal_data.koib_nid_signal); + down (&kibnal_data.kib_nid_signal); if ((rc2 == 0) == !!expect_success) - return (0); + goto out; /* success: rc == 0 */ if (expect_success) CERROR("Error %d unadvertising NID "LPX64"\n", - rc, koibnal_data.koib_nid); + rc, kibnal_data.kib_nid); else CWARN("Removed conflicting NID "LPX64"\n", - koibnal_data.koib_nid); - - return (rc); -} - -int -koibnal_check_advert (void) -{ - __u64 tid; - int rc; - int rc2; - - static struct ib_common_attrib_service srv; - - memset (&srv, 0, sizeof (srv)); - - koibnal_set_service_keys(&srv, koibnal_data.koib_nid); - - rc = ib_service_get (koibnal_data.koib_device, - koibnal_data.koib_port, - &srv, - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_setunset_done, &rc2, - &tid); - - if (rc != 0) { - CERROR ("Immediate error %d checking SM service\n", rc); - } else { - down (&koibnal_data.koib_nid_signal); - rc = rc2; - - if (rc != 0) - CERROR ("Error %d checking SM service\n", rc); - } - - return (rc); + kibnal_data.kib_nid); + out: + PORTAL_FREE(svc, sizeof(*svc)); } int -koibnal_set_mynid(ptl_nid_t nid) +kibnal_set_mynid(ptl_nid_t nid) { struct timeval tv; - lib_ni_t *ni = &koibnal_lib.libnal_ni; + lib_ni_t *ni = &kibnal_lib.libnal_ni; int rc; CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", @@ -234,75 +249,76 @@ koibnal_set_mynid(ptl_nid_t nid) do_gettimeofday(&tv); - down (&koibnal_data.koib_nid_mutex); + down (&kibnal_data.kib_nid_mutex); - if (nid == koibnal_data.koib_nid) { + if (nid == kibnal_data.kib_nid) { /* no change of NID */ - up (&koibnal_data.koib_nid_mutex); + up (&kibnal_data.kib_nid_mutex); return (0); } CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", - koibnal_data.koib_nid, nid); + kibnal_data.kib_nid, nid); - if (koibnal_data.koib_nid != PTL_NID_ANY) { + if (kibnal_data.kib_nid != 
PTL_NID_ANY) { - koibnal_unadvertise (1); + kibnal_unadvertise (1); - rc = ib_cm_listen_stop (koibnal_data.koib_listen_handle); + rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle); if (rc != 0) CERROR ("Error %d stopping listener\n", rc); } - koibnal_data.koib_nid = ni->ni_pid.nid = nid; - koibnal_data.koib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + kibnal_data.kib_nid = ni->ni_pid.nid = nid; + kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; /* Delete all existing peers and their connections after new * NID/incarnation set to ensure no old connections in our brave * new world. */ - koibnal_del_peer (PTL_NID_ANY, 0); - - rc = 0; - if (koibnal_data.koib_nid != PTL_NID_ANY) { - /* New NID installed */ + kibnal_del_peer (PTL_NID_ANY, 0); - /* remove any previous advert (crashed node etc) */ - koibnal_unadvertise(0); + if (kibnal_data.kib_nid == PTL_NID_ANY) { + /* No new NID to install */ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + /* remove any previous advert (crashed node etc) */ + kibnal_unadvertise(0); - /* Assign new service number */ - koibnal_data.koib_cm_service_id = ib_cm_service_assign(); - CDEBUG(D_NET, "service_id "LPX64"\n", koibnal_data.koib_cm_service_id); + /* Assign new service number */ + kibnal_data.kib_service_id = ib_cm_service_assign(); + CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id); - rc = ib_cm_listen(koibnal_data.koib_cm_service_id, - TS_IB_CM_SERVICE_EXACT_MASK, - koibnal_passive_conn_callback, NULL, - &koibnal_data.koib_listen_handle); - if (rc != 0) { - CERROR ("ib_cm_listen error: %d\n", rc); - goto out; + rc = ib_cm_listen(kibnal_data.kib_service_id, + TS_IB_CM_SERVICE_EXACT_MASK, + kibnal_passive_conn_callback, NULL, + &kibnal_data.kib_listen_handle); + if (rc == 0) { + rc = kibnal_advertise(); + if (rc == 0) { +#if IBNAL_CHECK_ADVERT + kibnal_check_advert(); +#endif + up (&kibnal_data.kib_nid_mutex); + return (0); } - rc = koibnal_advertise(); - - 
koibnal_check_advert(); - } - - out: - if (rc != 0) { - koibnal_data.koib_nid = PTL_NID_ANY; + ib_cm_listen_stop(kibnal_data.kib_listen_handle); /* remove any peers that sprung up while I failed to * advertise myself */ - koibnal_del_peer (PTL_NID_ANY, 0); + kibnal_del_peer (PTL_NID_ANY, 0); } - - up (&koibnal_data.koib_nid_mutex); - return (0); + + kibnal_data.kib_nid = PTL_NID_ANY; + up (&kibnal_data.kib_nid_mutex); + return (rc); } -koib_peer_t * -koibnal_create_peer (ptl_nid_t nid) +kib_peer_t * +kibnal_create_peer (ptl_nid_t nid) { - koib_peer_t *peer; + kib_peer_t *peer; LASSERT (nid != PTL_NID_ANY); @@ -320,20 +336,20 @@ koibnal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD (&peer->ibp_tx_queue); peer->ibp_reconnect_time = jiffies; - peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; - atomic_inc (&koibnal_data.koib_npeers); + atomic_inc (&kibnal_data.kib_npeers); return (peer); } void -koibnal_destroy_peer (koib_peer_t *peer) +kibnal_destroy_peer (kib_peer_t *peer) { CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer); LASSERT (atomic_read (&peer->ibp_refcount) == 0); LASSERT (peer->ibp_persistence == 0); - LASSERT (!koibnal_peer_active(peer)); + LASSERT (!kibnal_peer_active(peer)); LASSERT (peer->ibp_connecting == 0); LASSERT (list_empty (&peer->ibp_conns)); LASSERT (list_empty (&peer->ibp_tx_queue)); @@ -344,11 +360,11 @@ koibnal_destroy_peer (koib_peer_t *peer) * they are destroyed, so we can be assured that _all_ state to do * with this peer has been cleaned up when its refcount drops to * zero. 
*/ - atomic_dec (&koibnal_data.koib_npeers); + atomic_dec (&kibnal_data.kib_npeers); } void -koibnal_put_peer (koib_peer_t *peer) +kibnal_put_peer (kib_peer_t *peer) { CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n", peer, peer->ibp_nid, @@ -358,19 +374,19 @@ koibnal_put_peer (koib_peer_t *peer) if (!atomic_dec_and_test (&peer->ibp_refcount)) return; - koibnal_destroy_peer (peer); + kibnal_destroy_peer (peer); } -koib_peer_t * -koibnal_find_peer_locked (ptl_nid_t nid) +kib_peer_t * +kibnal_find_peer_locked (ptl_nid_t nid) { - struct list_head *peer_list = koibnal_nid2peerlist (nid); + struct list_head *peer_list = kibnal_nid2peerlist (nid); struct list_head *tmp; - koib_peer_t *peer; + kib_peer_t *peer; list_for_each (tmp, peer_list) { - peer = list_entry (tmp, koib_peer_t, ibp_list); + peer = list_entry (tmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ peer->ibp_connecting != 0 || /* creating conns */ @@ -386,46 +402,46 @@ koibnal_find_peer_locked (ptl_nid_t nid) return (NULL); } -koib_peer_t * -koibnal_get_peer (ptl_nid_t nid) +kib_peer_t * +kibnal_get_peer (ptl_nid_t nid) { - koib_peer_t *peer; + kib_peer_t *peer; - read_lock (&koibnal_data.koib_global_lock); - peer = koibnal_find_peer_locked (nid); + read_lock (&kibnal_data.kib_global_lock); + peer = kibnal_find_peer_locked (nid); if (peer != NULL) /* +1 ref for caller? 
*/ atomic_inc (&peer->ibp_refcount); - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (peer); } void -koibnal_unlink_peer_locked (koib_peer_t *peer) +kibnal_unlink_peer_locked (kib_peer_t *peer) { LASSERT (peer->ibp_persistence == 0); LASSERT (list_empty(&peer->ibp_conns)); - LASSERT (koibnal_peer_active(peer)); + LASSERT (kibnal_peer_active(peer)); list_del_init (&peer->ibp_list); /* lose peerlist's ref */ - koibnal_put_peer (peer); + kibnal_put_peer (peer); } int -koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) +kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) { - koib_peer_t *peer; + kib_peer_t *peer; struct list_head *ptmp; int i; - read_lock (&koibnal_data.koib_global_lock); + read_lock (&kibnal_data.kib_global_lock); - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) { + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - list_for_each (ptmp, &koibnal_data.koib_peers[i]) { + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -436,53 +452,53 @@ koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) *nidp = peer->ibp_nid; *persistencep = peer->ibp_persistence; - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (0); } } - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (-ENOENT); } int -koibnal_add_persistent_peer (ptl_nid_t nid) +kibnal_add_persistent_peer (ptl_nid_t nid) { unsigned long flags; - koib_peer_t *peer; - koib_peer_t *peer2; + kib_peer_t *peer; + kib_peer_t *peer2; if (nid == PTL_NID_ANY) return (-EINVAL); - peer = koibnal_create_peer (nid); + peer = kibnal_create_peer (nid); if (peer == NULL) return (-ENOMEM); - write_lock_irqsave 
(&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - peer2 = koibnal_find_peer_locked (nid); + peer2 = kibnal_find_peer_locked (nid); if (peer2 != NULL) { - koibnal_put_peer (peer); + kibnal_put_peer (peer); peer = peer2; } else { /* peer table takes existing ref on peer */ list_add_tail (&peer->ibp_list, - koibnal_nid2peerlist (nid)); + kibnal_nid2peerlist (nid)); } peer->ibp_persistence++; - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return (0); } void -koibnal_del_peer_locked (koib_peer_t *peer, int single_share) +kibnal_del_peer_locked (kib_peer_t *peer, int single_share) { struct list_head *ctmp; struct list_head *cnxt; - koib_conn_t *conn; + kib_conn_t *conn; if (!single_share) peer->ibp_persistence = 0; @@ -493,38 +509,38 @@ koibnal_del_peer_locked (koib_peer_t *peer, int single_share) return; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, koib_conn_t, ibc_list); + conn = list_entry(ctmp, kib_conn_t, ibc_list); - koibnal_close_conn_locked (conn, 0); + kibnal_close_conn_locked (conn, 0); } /* NB peer unlinks itself when last conn is closed */ } int -koibnal_del_peer (ptl_nid_t nid, int single_share) +kibnal_del_peer (ptl_nid_t nid, int single_share) { unsigned long flags; struct list_head *ptmp; struct list_head *pnxt; - koib_peer_t *peer; + kib_peer_t *peer; int lo; int hi; int i; int rc = -ENOENT; - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); if (nid != PTL_NID_ANY) - lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers; + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; - hi = koibnal_data.koib_peer_hash_size - 1; + hi = kibnal_data.kib_peer_hash_size - 1; } for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) { - peer = list_entry (ptmp, 
koib_peer_t, ibp_list); + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -532,7 +548,7 @@ koibnal_del_peer (ptl_nid_t nid, int single_share) if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) continue; - koibnal_del_peer_locked (peer, single_share); + kibnal_del_peer_locked (peer, single_share); rc = 0; /* matched something */ if (single_share) @@ -540,26 +556,26 @@ koibnal_del_peer (ptl_nid_t nid, int single_share) } } out: - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return (rc); } -koib_conn_t * -koibnal_get_conn_by_idx (int index) +kib_conn_t * +kibnal_get_conn_by_idx (int index) { - koib_peer_t *peer; + kib_peer_t *peer; struct list_head *ptmp; - koib_conn_t *conn; + kib_conn_t *conn; struct list_head *ctmp; int i; - read_lock (&koibnal_data.koib_global_lock); + read_lock (&kibnal_data.kib_global_lock); - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) { - list_for_each (ptmp, &koibnal_data.koib_peers[i]) { + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence > 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -568,25 +584,25 @@ koibnal_get_conn_by_idx (int index) if (index-- > 0) continue; - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); atomic_inc (&conn->ibc_refcount); - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (conn); } } } - read_unlock 
(&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (NULL); } -koib_conn_t * -koibnal_create_conn (void) +kib_conn_t * +kibnal_create_conn (void) { - koib_conn_t *conn; + kib_conn_t *conn; int i; __u64 vaddr = 0; __u64 vaddr_base; @@ -608,57 +624,57 @@ koibnal_create_conn (void) memset (conn, 0, sizeof (*conn)); INIT_LIST_HEAD (&conn->ibc_tx_queue); - INIT_LIST_HEAD (&conn->ibc_rdma_queue); + INIT_LIST_HEAD (&conn->ibc_active_txs); spin_lock_init (&conn->ibc_lock); - atomic_inc (&koibnal_data.koib_nconns); + atomic_inc (&kibnal_data.kib_nconns); /* well not really, but I call destroy() on failure, which decrements */ - PORTAL_ALLOC (conn->ibc_rxs, OPENIBNAL_RX_MSGS * sizeof (koib_rx_t)); + PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); if (conn->ibc_rxs == NULL) goto failed; - memset (conn->ibc_rxs, 0, OPENIBNAL_RX_MSGS * sizeof(koib_rx_t)); + memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); - rc = koibnal_alloc_pages(&conn->ibc_rx_pages, - OPENIBNAL_RX_MSG_PAGES, - IB_ACCESS_LOCAL_WRITE); + rc = kibnal_alloc_pages(&conn->ibc_rx_pages, + IBNAL_RX_MSG_PAGES, + IB_ACCESS_LOCAL_WRITE); if (rc != 0) goto failed; - vaddr_base = vaddr = conn->ibc_rx_pages->oibp_vaddr; + vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; - for (i = ipage = page_offset = 0; i < OPENIBNAL_RX_MSGS; i++) { - struct page *page = conn->ibc_rx_pages->oibp_pages[ipage]; - koib_rx_t *rx = &conn->ibc_rxs[i]; + for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { + struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; + kib_rx_t *rx = &conn->ibc_rxs[i]; rx->rx_conn = conn; rx->rx_vaddr = vaddr; - rx->rx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset); + rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - vaddr += OPENIBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + OPENIBNAL_RX_MSG_BYTES); + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); - page_offset += 
OPENIBNAL_MSG_SIZE; + page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= OPENIBNAL_RX_MSG_PAGES); + LASSERT (ipage <= IBNAL_RX_MSG_PAGES); } } params.qp_create = (struct ib_qp_create_param) { .limit = { /* Sends have an optional RDMA */ - .max_outstanding_send_request = 2 * OPENIBNAL_MSG_QUEUE_SIZE, - .max_outstanding_receive_request = OPENIBNAL_MSG_QUEUE_SIZE, + .max_outstanding_send_request = 2 * IBNAL_MSG_QUEUE_SIZE, + .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE, .max_send_gather_element = 1, .max_receive_scatter_element = 1, }, - .pd = koibnal_data.koib_pd, - .send_queue = koibnal_data.koib_tx_cq, - .receive_queue = koibnal_data.koib_rx_cq, + .pd = kibnal_data.kib_pd, + .send_queue = kibnal_data.kib_cq, + .receive_queue = kibnal_data.kib_cq, .send_policy = IB_WQ_SIGNAL_SELECTABLE, .receive_policy = IB_WQ_SIGNAL_SELECTABLE, .rd_domain = 0, @@ -673,11 +689,11 @@ koibnal_create_conn (void) } /* Mark QP created */ - conn->ibc_state = OPENIBNAL_CONN_INIT_QP; + conn->ibc_state = IBNAL_CONN_INIT_QP; params.qp_attr = (struct ib_qp_attribute) { .state = IB_QP_STATE_INIT, - .port = koibnal_data.koib_port, + .port = kibnal_data.kib_port, .enable_rdma_read = 1, .enable_rdma_write = 1, .valid_fields = (IB_QP_ATTRIBUTE_STATE | @@ -696,12 +712,12 @@ koibnal_create_conn (void) return (conn); failed: - koibnal_destroy_conn (conn); + kibnal_destroy_conn (conn); return (NULL); } void -koibnal_destroy_conn (koib_conn_t *conn) +kibnal_destroy_conn (kib_conn_t *conn) { int rc; @@ -709,21 +725,21 @@ koibnal_destroy_conn (koib_conn_t *conn) LASSERT (atomic_read (&conn->ibc_refcount) == 0); LASSERT (list_empty(&conn->ibc_tx_queue)); - LASSERT (list_empty(&conn->ibc_rdma_queue)); + LASSERT (list_empty(&conn->ibc_active_txs)); LASSERT (conn->ibc_nsends_posted == 0); LASSERT (conn->ibc_connreq == NULL); switch (conn->ibc_state) { - case OPENIBNAL_CONN_ZOMBIE: + case IBNAL_CONN_ZOMBIE: 
/* called after connection sequence initiated */ - case OPENIBNAL_CONN_INIT_QP: + case IBNAL_CONN_INIT_QP: rc = ib_qp_destroy(conn->ibc_qp); if (rc != 0) CERROR("Can't destroy QP: %d\n", rc); /* fall through */ - case OPENIBNAL_CONN_INIT_NOTHING: + case IBNAL_CONN_INIT_NOTHING: break; default: @@ -731,30 +747,30 @@ koibnal_destroy_conn (koib_conn_t *conn) } if (conn->ibc_rx_pages != NULL) - koibnal_free_pages(conn->ibc_rx_pages); + kibnal_free_pages(conn->ibc_rx_pages); if (conn->ibc_rxs != NULL) PORTAL_FREE(conn->ibc_rxs, - OPENIBNAL_RX_MSGS * sizeof(koib_rx_t)); + IBNAL_RX_MSGS * sizeof(kib_rx_t)); if (conn->ibc_peer != NULL) - koibnal_put_peer(conn->ibc_peer); + kibnal_put_peer(conn->ibc_peer); PORTAL_FREE(conn, sizeof (*conn)); - atomic_dec(&koibnal_data.koib_nconns); + atomic_dec(&kibnal_data.kib_nconns); - if (atomic_read (&koibnal_data.koib_nconns) == 0 && - koibnal_data.koib_shutdown) { + if (atomic_read (&kibnal_data.kib_nconns) == 0 && + kibnal_data.kib_shutdown) { /* I just nuked the last connection on shutdown; wake up * everyone so they can exit. 
*/ - wake_up_all(&koibnal_data.koib_sched_waitq); - wake_up_all(&koibnal_data.koib_connd_waitq); + wake_up_all(&kibnal_data.kib_sched_waitq); + wake_up_all(&kibnal_data.kib_connd_waitq); } } void -koibnal_put_conn (koib_conn_t *conn) +kibnal_put_conn (kib_conn_t *conn) { unsigned long flags; @@ -767,44 +783,44 @@ koibnal_put_conn (koib_conn_t *conn) return; /* last ref only goes on zombies */ - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ZOMBIE); + LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - list_add (&conn->ibc_list, &koibnal_data.koib_connd_conns); - wake_up (&koibnal_data.koib_connd_waitq); + list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); } int -koibnal_close_peer_conns_locked (koib_peer_t *peer, int why) +kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) { - koib_conn_t *conn; + kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); count++; - koibnal_close_conn_locked (conn, why); + kibnal_close_conn_locked (conn, why); } return (count); } int -koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation) +kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) { - koib_conn_t *conn; + kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); if (conn->ibc_incarnation == incarnation) continue; @@ -813,17 +829,17 @@ koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 
incarnation) peer->ibp_nid, conn->ibc_incarnation, incarnation); count++; - koibnal_close_conn_locked (conn, -ESTALE); + kibnal_close_conn_locked (conn, -ESTALE); } return (count); } int -koibnal_close_matching_conns (ptl_nid_t nid) +kibnal_close_matching_conns (ptl_nid_t nid) { unsigned long flags; - koib_peer_t *peer; + kib_peer_t *peer; struct list_head *ptmp; struct list_head *pnxt; int lo; @@ -831,19 +847,19 @@ koibnal_close_matching_conns (ptl_nid_t nid) int i; int count = 0; - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); if (nid != PTL_NID_ANY) - lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers; + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; - hi = koibnal_data.koib_peer_hash_size - 1; + hi = kibnal_data.kib_peer_hash_size - 1; } for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -851,11 +867,11 @@ koibnal_close_matching_conns (ptl_nid_t nid) if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) continue; - count += koibnal_close_peer_conns_locked (peer, 0); + count += kibnal_close_peer_conns_locked (peer, 0); } } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); /* wildcards always succeed */ if (nid == PTL_NID_ANY) @@ -865,7 +881,7 @@ koibnal_close_matching_conns (ptl_nid_t nid) } int -koibnal_cmd(struct portals_cfg *pcfg, void * private) +kibnal_cmd(struct portals_cfg *pcfg, void * private) { int rc = -EINVAL; @@ -876,8 +892,8 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) ptl_nid_t nid = 0; int share_count = 0; - rc = 
koibnal_get_peer_info(pcfg->pcfg_count, - &nid, &share_count); + rc = kibnal_get_peer_info(pcfg->pcfg_count, + &nid, &share_count); pcfg->pcfg_nid = nid; pcfg->pcfg_size = 0; pcfg->pcfg_id = 0; @@ -887,17 +903,17 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) break; } case NAL_CMD_ADD_PEER: { - rc = koibnal_add_persistent_peer (pcfg->pcfg_nid); + rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); break; } case NAL_CMD_DEL_PEER: { - rc = koibnal_del_peer (pcfg->pcfg_nid, + rc = kibnal_del_peer (pcfg->pcfg_nid, /* flags == single_share */ pcfg->pcfg_flags != 0); break; } case NAL_CMD_GET_CONN: { - koib_conn_t *conn = koibnal_get_conn_by_idx (pcfg->pcfg_count); + kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); if (conn == NULL) rc = -ENOENT; @@ -907,19 +923,19 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) pcfg->pcfg_id = 0; pcfg->pcfg_misc = 0; pcfg->pcfg_flags = 0; - koibnal_put_conn (conn); + kibnal_put_conn (conn); } break; } case NAL_CMD_CLOSE_CONNECTION: { - rc = koibnal_close_matching_conns (pcfg->pcfg_nid); + rc = kibnal_close_matching_conns (pcfg->pcfg_nid); break; } case NAL_CMD_REGISTER_MYNID: { if (pcfg->pcfg_nid == PTL_NID_ANY) rc = -EINVAL; else - rc = koibnal_set_mynid (pcfg->pcfg_nid); + rc = kibnal_set_mynid (pcfg->pcfg_nid); break; } } @@ -928,47 +944,47 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) } void -koibnal_free_pages (koib_pages_t *p) +kibnal_free_pages (kib_pages_t *p) { - int npages = p->oibp_npages; + int npages = p->ibp_npages; int rc; int i; - if (p->oibp_mapped) { - rc = ib_memory_deregister(p->oibp_handle); + if (p->ibp_mapped) { + rc = ib_memory_deregister(p->ibp_handle); if (rc != 0) CERROR ("Deregister error: %d\n", rc); } for (i = 0; i < npages; i++) - if (p->oibp_pages[i] != NULL) - __free_page(p->oibp_pages[i]); + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); - PORTAL_FREE (p, offsetof(koib_pages_t, oibp_pages[npages])); + PORTAL_FREE (p, offsetof(kib_pages_t, 
ibp_pages[npages])); } int -koibnal_alloc_pages (koib_pages_t **pp, int npages, int access) +kibnal_alloc_pages (kib_pages_t **pp, int npages, int access) { - koib_pages_t *p; + kib_pages_t *p; struct ib_physical_buffer *phys_pages; int i; int rc; - PORTAL_ALLOC(p, offsetof(koib_pages_t, oibp_pages[npages])); + PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { CERROR ("Can't allocate buffer %d\n", npages); return (-ENOMEM); } - memset (p, 0, offsetof(koib_pages_t, oibp_pages[npages])); - p->oibp_npages = npages; + memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; for (i = 0; i < npages; i++) { - p->oibp_pages[i] = alloc_page (GFP_KERNEL); - if (p->oibp_pages[i] == NULL) { + p->ibp_pages[i] = alloc_page (GFP_KERNEL); + if (p->ibp_pages[i] == NULL) { CERROR ("Can't allocate page %d of %d\n", i, npages); - koibnal_free_pages(p); + kibnal_free_pages(p); return (-ENOMEM); } } @@ -976,96 +992,96 @@ koibnal_alloc_pages (koib_pages_t **pp, int npages, int access) PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); if (phys_pages == NULL) { CERROR ("Can't allocate physarray for %d pages\n", npages); - koibnal_free_pages(p); + kibnal_free_pages(p); return (-ENOMEM); } for (i = 0; i < npages; i++) { phys_pages[i].size = PAGE_SIZE; phys_pages[i].address = - koibnal_page2phys(p->oibp_pages[i]); + kibnal_page2phys(p->ibp_pages[i]); } - p->oibp_vaddr = 0; - rc = ib_memory_register_physical(koibnal_data.koib_pd, + p->ibp_vaddr = 0; + rc = ib_memory_register_physical(kibnal_data.kib_pd, phys_pages, npages, - &p->oibp_vaddr, + &p->ibp_vaddr, npages * PAGE_SIZE, 0, access, - &p->oibp_handle, - &p->oibp_lkey, - &p->oibp_rkey); + &p->ibp_handle, + &p->ibp_lkey, + &p->ibp_rkey); PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); if (rc != 0) { CERROR ("Error %d mapping %d pages\n", rc, npages); - koibnal_free_pages(p); + kibnal_free_pages(p); return (rc); } - p->oibp_mapped = 1; + p->ibp_mapped = 1; *pp = p; return (0); 
} int -koibnal_setup_tx_descs (void) +kibnal_setup_tx_descs (void) { int ipage = 0; int page_offset = 0; __u64 vaddr; __u64 vaddr_base; struct page *page; - koib_tx_t *tx; + kib_tx_t *tx; int i; int rc; /* pre-mapped messages are not bigger than 1 page */ - LASSERT (OPENIBNAL_MSG_SIZE <= PAGE_SIZE); + LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); /* No fancy arithmetic when we do the buffer calculations */ - LASSERT (PAGE_SIZE % OPENIBNAL_MSG_SIZE == 0); + LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - rc = koibnal_alloc_pages(&koibnal_data.koib_tx_pages, - OPENIBNAL_TX_MSG_PAGES, - 0); /* local read access only */ + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, + IBNAL_TX_MSG_PAGES, + 0); /* local read access only */ if (rc != 0) return (rc); - vaddr = vaddr_base = koibnal_data.koib_tx_pages->oibp_vaddr; + vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - for (i = 0; i < OPENIBNAL_TX_MSGS; i++) { - page = koibnal_data.koib_tx_pages->oibp_pages[ipage]; - tx = &koibnal_data.koib_tx_descs[i]; + for (i = 0; i < IBNAL_TX_MSGS; i++) { + page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; + tx = &kibnal_data.kib_tx_descs[i]; memset (tx, 0, sizeof(*tx)); /* zero flags etc */ - tx->tx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset); + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); tx->tx_vaddr = vaddr; - tx->tx_isnblk = (i >= OPENIBNAL_NTX); - tx->tx_mapped = KOIB_TX_UNMAPPED; + tx->tx_isnblk = (i >= IBNAL_NTX); + tx->tx_mapped = KIB_TX_UNMAPPED; CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", i, tx, tx->tx_msg, tx->tx_vaddr); if (tx->tx_isnblk) list_add (&tx->tx_list, - &koibnal_data.koib_idle_nblk_txs); + &kibnal_data.kib_idle_nblk_txs); else list_add (&tx->tx_list, - &koibnal_data.koib_idle_txs); + &kibnal_data.kib_idle_txs); - vaddr += OPENIBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + OPENIBNAL_TX_MSG_BYTES); + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); - page_offset += OPENIBNAL_MSG_SIZE; 
+ page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= OPENIBNAL_TX_MSG_PAGES); + LASSERT (ipage <= IBNAL_TX_MSG_PAGES); } } @@ -1073,7 +1089,7 @@ koibnal_setup_tx_descs (void) } void -koibnal_api_shutdown (nal_t *nal) +kibnal_api_shutdown (nal_t *nal) { int i; int rc; @@ -1087,119 +1103,113 @@ koibnal_api_shutdown (nal_t *nal) CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); - LASSERT(nal == &koibnal_api); + LASSERT(nal == &kibnal_api); - switch (koibnal_data.koib_init) { + switch (kibnal_data.kib_init) { default: - CERROR ("Unexpected state %d\n", koibnal_data.koib_init); + CERROR ("Unexpected state %d\n", kibnal_data.kib_init); LBUG(); - case OPENIBNAL_INIT_ALL: + case IBNAL_INIT_ALL: /* stop calls to nal_cmd */ libcfs_nal_cmd_unregister(OPENIBNAL); /* No new peers */ /* resetting my NID to unadvertises me, removes my * listener and nukes all current peers */ - koibnal_set_mynid (PTL_NID_ANY); + kibnal_set_mynid (PTL_NID_ANY); /* Wait for all peer state to clean up */ i = 2; - while (atomic_read (&koibnal_data.koib_npeers) != 0) { + while (atomic_read (&kibnal_data.kib_npeers) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ "waiting for %d peers to close down\n", - atomic_read (&koibnal_data.koib_npeers)); + atomic_read (&kibnal_data.kib_npeers)); set_current_state (TASK_INTERRUPTIBLE); schedule_timeout (HZ); } /* fall through */ - case OPENIBNAL_INIT_TX_CQ: - rc = ib_cq_destroy (koibnal_data.koib_tx_cq); - if (rc != 0) - CERROR ("Destroy tx CQ error: %d\n", rc); - /* fall through */ - - case OPENIBNAL_INIT_RX_CQ: - rc = ib_cq_destroy (koibnal_data.koib_rx_cq); + case IBNAL_INIT_CQ: + rc = ib_cq_destroy (kibnal_data.kib_cq); if (rc != 0) - CERROR ("Destroy rx CQ error: %d\n", rc); + CERROR ("Destroy CQ error: %d\n", rc); /* fall through */ - case OPENIBNAL_INIT_TXD: - koibnal_free_pages (koibnal_data.koib_tx_pages); + case IBNAL_INIT_TXD: + kibnal_free_pages (kibnal_data.kib_tx_pages); /* fall through */ -#if OPENIBNAL_FMR - case OPENIBNAL_INIT_FMR: - rc = ib_fmr_pool_destroy (koibnal_data.koib_fmr_pool); +#if IBNAL_FMR + case IBNAL_INIT_FMR: + rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); if (rc != 0) CERROR ("Destroy FMR pool error: %d\n", rc); /* fall through */ #endif - case OPENIBNAL_INIT_PD: - rc = ib_pd_destroy(koibnal_data.koib_pd); + case IBNAL_INIT_PD: + rc = ib_pd_destroy(kibnal_data.kib_pd); if (rc != 0) CERROR ("Destroy PD error: %d\n", rc); /* fall through */ - case OPENIBNAL_INIT_LIB: - lib_fini(&koibnal_lib); + case IBNAL_INIT_LIB: + lib_fini(&kibnal_lib); /* fall through */ - case OPENIBNAL_INIT_DATA: + case IBNAL_INIT_DATA: /* Module refcount only gets to zero when all peers * have been closed so all lists must be empty */ - LASSERT (atomic_read (&koibnal_data.koib_npeers) == 0); - LASSERT (koibnal_data.koib_peers != NULL); - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) { - LASSERT (list_empty (&koibnal_data.koib_peers[i])); + LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (kibnal_data.kib_peers != NULL); + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + LASSERT (list_empty (&kibnal_data.kib_peers[i])); } - LASSERT 
(atomic_read (&koibnal_data.koib_nconns) == 0); - LASSERT (list_empty (&koibnal_data.koib_sched_rxq)); - LASSERT (list_empty (&koibnal_data.koib_sched_txq)); - LASSERT (list_empty (&koibnal_data.koib_connd_conns)); - LASSERT (list_empty (&koibnal_data.koib_connd_peers)); + LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); + LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); + LASSERT (list_empty (&kibnal_data.kib_sched_txq)); + LASSERT (list_empty (&kibnal_data.kib_connd_conns)); + LASSERT (list_empty (&kibnal_data.kib_connd_peers)); /* flag threads to terminate; wake and wait for them to die */ - koibnal_data.koib_shutdown = 1; - wake_up_all (&koibnal_data.koib_sched_waitq); - wake_up_all (&koibnal_data.koib_connd_waitq); + kibnal_data.kib_shutdown = 1; + wake_up_all (&kibnal_data.kib_sched_waitq); + wake_up_all (&kibnal_data.kib_connd_waitq); i = 2; - while (atomic_read (&koibnal_data.koib_nthreads) != 0) { + while (atomic_read (&kibnal_data.kib_nthreads) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ "Waiting for %d threads to terminate\n", - atomic_read (&koibnal_data.koib_nthreads)); + atomic_read (&kibnal_data.kib_nthreads)); set_current_state (TASK_INTERRUPTIBLE); schedule_timeout (HZ); } /* fall through */ - case OPENIBNAL_INIT_NOTHING: + case IBNAL_INIT_NOTHING: break; } - if (koibnal_data.koib_tx_descs != NULL) - PORTAL_FREE (koibnal_data.koib_tx_descs, - OPENIBNAL_TX_MSGS * sizeof(koib_tx_t)); + if (kibnal_data.kib_tx_descs != NULL) + PORTAL_FREE (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); - if (koibnal_data.koib_peers != NULL) - PORTAL_FREE (koibnal_data.koib_peers, + if (kibnal_data.kib_peers != NULL) + PORTAL_FREE (kibnal_data.kib_peers, sizeof (struct list_head) * - koibnal_data.koib_peer_hash_size); + kibnal_data.kib_peer_hash_size); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n", atomic_read(&portal_kmemory)); - koibnal_data.koib_init = OPENIBNAL_INIT_NOTHING; + kibnal_data.kib_init = IBNAL_INIT_NOTHING; } int -koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, +kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { @@ -1208,65 +1218,66 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, int rc; int i; - LASSERT (nal == &koibnal_api); + LASSERT (nal == &kibnal_api); if (nal->nal_refct != 0) { if (actual_limits != NULL) - *actual_limits = koibnal_lib.libnal_ni.ni_actual_limits; + *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; /* This module got the first ref */ PORTAL_MODULE_USE; return (PTL_OK); } - LASSERT (koibnal_data.koib_init == OPENIBNAL_INIT_NOTHING); + LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); - memset (&koibnal_data, 0, sizeof (koibnal_data)); /* zero pointers, flags etc */ + memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ - init_MUTEX (&koibnal_data.koib_nid_mutex); - 
init_MUTEX_LOCKED (&koibnal_data.koib_nid_signal); - koibnal_data.koib_nid = PTL_NID_ANY; + init_MUTEX (&kibnal_data.kib_nid_mutex); + init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal); + kibnal_data.kib_nid = PTL_NID_ANY; - rwlock_init(&koibnal_data.koib_global_lock); + rwlock_init(&kibnal_data.kib_global_lock); - koibnal_data.koib_peer_hash_size = OPENIBNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (koibnal_data.koib_peers, - sizeof (struct list_head) * koibnal_data.koib_peer_hash_size); - if (koibnal_data.koib_peers == NULL) { + kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; + PORTAL_ALLOC (kibnal_data.kib_peers, + sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); + if (kibnal_data.kib_peers == NULL) { goto failed; } - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) - INIT_LIST_HEAD(&koibnal_data.koib_peers[i]); - - spin_lock_init (&koibnal_data.koib_connd_lock); - INIT_LIST_HEAD (&koibnal_data.koib_connd_peers); - INIT_LIST_HEAD (&koibnal_data.koib_connd_conns); - init_waitqueue_head (&koibnal_data.koib_connd_waitq); - - spin_lock_init (&koibnal_data.koib_sched_lock); - INIT_LIST_HEAD (&koibnal_data.koib_sched_txq); - INIT_LIST_HEAD (&koibnal_data.koib_sched_rxq); - init_waitqueue_head (&koibnal_data.koib_sched_waitq); - - spin_lock_init (&koibnal_data.koib_tx_lock); - INIT_LIST_HEAD (&koibnal_data.koib_idle_txs); - INIT_LIST_HEAD (&koibnal_data.koib_idle_nblk_txs); - init_waitqueue_head(&koibnal_data.koib_idle_tx_waitq); - - PORTAL_ALLOC (koibnal_data.koib_tx_descs, - OPENIBNAL_TX_MSGS * sizeof(koib_tx_t)); - if (koibnal_data.koib_tx_descs == NULL) { + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); + + spin_lock_init (&kibnal_data.kib_connd_lock); + INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); + INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); + init_waitqueue_head (&kibnal_data.kib_connd_waitq); + + spin_lock_init (&kibnal_data.kib_sched_lock); + INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); + 
INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); + init_waitqueue_head (&kibnal_data.kib_sched_waitq); + + spin_lock_init (&kibnal_data.kib_tx_lock); + INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); + INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); + init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); + + PORTAL_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + if (kibnal_data.kib_tx_descs == NULL) { CERROR ("Can't allocate tx descs\n"); goto failed; } /* lists/ptrs/locks initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_DATA; + kibnal_data.kib_init = IBNAL_INIT_DATA; /*****************************************************/ + process_id.pid = requested_pid; - process_id.nid = koibnal_data.koib_nid; + process_id.nid = kibnal_data.kib_nid; - rc = lib_init(&koibnal_lib, nal, process_id, + rc = lib_init(&kibnal_lib, nal, process_id, requested_limits, actual_limits); if (rc != PTL_OK) { CERROR("lib_init failed: error %d\n", rc); @@ -1274,11 +1285,11 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } /* lib interface initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_LIB; + kibnal_data.kib_init = IBNAL_INIT_LIB; /*****************************************************/ - for (i = 0; i < OPENIBNAL_N_SCHED; i++) { - rc = koibnal_thread_start (koibnal_scheduler, (void *)i); + for (i = 0; i < IBNAL_N_SCHED; i++) { + rc = kibnal_thread_start (kibnal_scheduler, (void *)i); if (rc != 0) { CERROR("Can't spawn openibnal scheduler[%d]: %d\n", i, rc); @@ -1286,56 +1297,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } } - rc = koibnal_thread_start (koibnal_connd, NULL); + rc = kibnal_thread_start (kibnal_connd, NULL); if (rc != 0) { CERROR ("Can't spawn openibnal connd: %d\n", rc); goto failed; } - koibnal_data.koib_device = ib_device_get_by_index(0); - if (koibnal_data.koib_device == NULL) { + kibnal_data.kib_device = ib_device_get_by_index(0); + if (kibnal_data.kib_device == NULL) { CERROR ("Can't open ib device 0\n"); goto 
failed; } - rc = ib_device_properties_get(koibnal_data.koib_device, - &koibnal_data.koib_device_props); + rc = ib_device_properties_get(kibnal_data.kib_device, + &kibnal_data.kib_device_props); if (rc != 0) { CERROR ("Can't get device props: %d\n", rc); goto failed; } CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n", - koibnal_data.koib_device_props.max_initiator_per_qp, - koibnal_data.koib_device_props.max_responder_per_qp); + kibnal_data.kib_device_props.max_initiator_per_qp, + kibnal_data.kib_device_props.max_responder_per_qp); - koibnal_data.koib_port = 0; + kibnal_data.kib_port = 0; for (i = 1; i <= 2; i++) { - rc = ib_port_properties_get(koibnal_data.koib_device, i, - &koibnal_data.koib_port_props); + rc = ib_port_properties_get(kibnal_data.kib_device, i, + &kibnal_data.kib_port_props); if (rc == 0) { - koibnal_data.koib_port = i; + kibnal_data.kib_port = i; break; } } - if (koibnal_data.koib_port == 0) { + if (kibnal_data.kib_port == 0) { CERROR ("Can't find a port\n"); goto failed; } - rc = ib_pd_create(koibnal_data.koib_device, - NULL, &koibnal_data.koib_pd); + rc = ib_pd_create(kibnal_data.kib_device, + NULL, &kibnal_data.kib_pd); if (rc != 0) { CERROR ("Can't create PD: %d\n", rc); goto failed; } /* flag PD initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_PD; + kibnal_data.kib_init = IBNAL_INIT_PD; /*****************************************************/ -#if OPENIBNAL_FMR +#if IBNAL_FMR { - const int pool_size = OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK; + const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; struct ib_fmr_pool_param params = { .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, .access = (IB_ACCESS_LOCAL_WRITE | @@ -1347,8 +1358,8 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, .flush_arg = NULL, .cache = 1, }; - rc = ib_fmr_pool_create(koibnal_data.koib_pd, ¶ms, - &koibnal_data.koib_fmr_pool); + rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms, + &kibnal_data.kib_fmr_pool); if (rc != 0) { CERROR ("Can't create FMR pool size %d: %d\n", 
pool_size, rc); @@ -1357,84 +1368,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } /* flag FMR pool initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_FMR; + kibnal_data.kib_init = IBNAL_INIT_FMR; #endif /*****************************************************/ - rc = koibnal_setup_tx_descs(); + rc = kibnal_setup_tx_descs(); if (rc != 0) { CERROR ("Can't register tx descs: %d\n", rc); goto failed; } /* flag TX descs initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_TXD; + kibnal_data.kib_init = IBNAL_INIT_TXD; /*****************************************************/ { struct ib_cq_callback callback = { - .context = OPENIBNAL_CALLBACK_CTXT, + .context = IBNAL_CALLBACK_CTXT, .policy = IB_CQ_PROVIDER_REARM, .function = { - .entry = koibnal_rx_callback, + .entry = kibnal_callback, }, .arg = NULL, }; - int nentries = OPENIBNAL_RX_CQ_ENTRIES; + int nentries = IBNAL_CQ_ENTRIES; - rc = ib_cq_create (koibnal_data.koib_device, + rc = ib_cq_create (kibnal_data.kib_device, &nentries, &callback, NULL, - &koibnal_data.koib_rx_cq); + &kibnal_data.kib_cq); if (rc != 0) { - CERROR ("Can't create RX CQ: %d\n", rc); + CERROR ("Can't create CQ: %d\n", rc); goto failed; } /* I only want solicited events */ - rc = ib_cq_request_notification(koibnal_data.koib_rx_cq, 1); + rc = ib_cq_request_notification(kibnal_data.kib_cq, 1); LASSERT (rc == 0); } - /* flag RX CQ initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_RX_CQ; - /*****************************************************/ - - { - struct ib_cq_callback callback = { - .context = OPENIBNAL_CALLBACK_CTXT, - .policy = IB_CQ_PROVIDER_REARM, - .function = { - .entry = koibnal_tx_callback, - }, - .arg = NULL, - }; - int nentries = OPENIBNAL_TX_CQ_ENTRIES; - - rc = ib_cq_create (koibnal_data.koib_device, - &nentries, &callback, NULL, - &koibnal_data.koib_tx_cq); - if (rc != 0) { - CERROR ("Can't create RX CQ: %d\n", rc); - goto failed; - } - - /* I only want solicited events */ - rc = 
ib_cq_request_notification(koibnal_data.koib_tx_cq, 1); - LASSERT (rc == 0); - } - - /* flag TX CQ initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_TX_CQ; + /* flag CQ initialised */ + kibnal_data.kib_init = IBNAL_INIT_CQ; /*****************************************************/ - rc = libcfs_nal_cmd_register(OPENIBNAL, &koibnal_cmd, NULL); + rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL); if (rc != 0) { CERROR ("Can't initialise command interface (rc = %d)\n", rc); goto failed; } /* flag everything initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_ALL; + kibnal_data.kib_init = IBNAL_INIT_ALL; /*****************************************************/ printk(KERN_INFO "Lustre: OpenIB NAL loaded " @@ -1443,44 +1426,44 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, return (PTL_OK); failed: - koibnal_api_shutdown (&koibnal_api); + kibnal_api_shutdown (&kibnal_api); return (PTL_FAIL); } void __exit -koibnal_module_fini (void) +kibnal_module_fini (void) { #ifdef CONFIG_SYSCTL - if (koibnal_tunables.koib_sysctl != NULL) - unregister_sysctl_table (koibnal_tunables.koib_sysctl); + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table (kibnal_tunables.kib_sysctl); #endif - PtlNIFini(koibnal_ni); + PtlNIFini(kibnal_ni); ptl_unregister_nal(OPENIBNAL); } int __init -koibnal_module_init (void) +kibnal_module_init (void) { int rc; /* the following must be sizeof(int) for proc_dointvec() */ - LASSERT(sizeof (koibnal_tunables.koib_io_timeout) == sizeof (int)); + LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int)); - koibnal_api.nal_ni_init = koibnal_api_startup; - koibnal_api.nal_ni_fini = koibnal_api_shutdown; + kibnal_api.nal_ni_init = kibnal_api_startup; + kibnal_api.nal_ni_fini = kibnal_api_shutdown; /* Initialise dynamic tunables to defaults once only */ - koibnal_tunables.koib_io_timeout = OPENIBNAL_IO_TIMEOUT; + kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; - rc = ptl_register_nal(OPENIBNAL, &koibnal_api); + 
rc = ptl_register_nal(OPENIBNAL, &kibnal_api); if (rc != PTL_OK) { - CERROR("Can't register OPENIBNAL: %d\n", rc); + CERROR("Can't register IBNAL: %d\n", rc); return (-ENOMEM); /* or something... */ } /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &koibnal_ni); + rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni); if (rc != PTL_OK && rc != PTL_IFACE_DUP) { ptl_unregister_nal(OPENIBNAL); return (-ENODEV); @@ -1488,8 +1471,8 @@ koibnal_module_init (void) #ifdef CONFIG_SYSCTL /* Press on regardless even if registering sysctl doesn't work */ - koibnal_tunables.koib_sysctl = - register_sysctl_table (koibnal_top_ctl_table, 0); + kibnal_tunables.kib_sysctl = + register_sysctl_table (kibnal_top_ctl_table, 0); #endif return (0); } @@ -1498,6 +1481,6 @@ MODULE_AUTHOR("Cluster File Systems, Inc. "); MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01"); MODULE_LICENSE("GPL"); -module_init(koibnal_module_init); -module_exit(koibnal_module_fini); +module_init(kibnal_module_init); +module_exit(kibnal_module_fini); diff --git a/lustre/portals/knals/openibnal/openibnal.h b/lustre/portals/knals/openibnal/openibnal.h index 301d3ae..f0610f2 100644 --- a/lustre/portals/knals/openibnal/openibnal.h +++ b/lustre/portals/knals/openibnal/openibnal.h @@ -48,7 +48,7 @@ #include #include -#define DEBUG_SUBSYSTEM S_OPENIBNAL +#define DEBUG_SUBSYSTEM S_IBNAL #include #include @@ -59,144 +59,140 @@ #include #include -#define OPENIBNAL_SERVICE_NAME "openibnal" +#define IBNAL_SERVICE_NAME "openibnal" #if CONFIG_SMP -# define OPENIBNAL_N_SCHED num_online_cpus() /* # schedulers */ +# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ #else -# define OPENIBNAL_N_SCHED 1 /* # schedulers */ +# define IBNAL_N_SCHED 1 /* # schedulers */ #endif -#define OPENIBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ -#define OPENIBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ +#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ +#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ -#define OPENIBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ -#define OPENIBNAL_MSG_QUEUE_SIZE 8 /* # messages in-flight */ -#define OPENIBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */ -#define OPENIBNAL_RETRY 7 /* # times to retry */ -#define OPENIBNAL_RNR_RETRY 7 /* */ -#define OPENIBNAL_CM_RETRY 7 /* # times to retry connection */ -#define OPENIBNAL_FLOW_CONTROL 1 -#define OPENIBNAL_RESPONDER_RESOURCES 8 +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */ +#define IBNAL_RETRY 7 /* # times to retry */ +#define IBNAL_RNR_RETRY 7 /* */ +#define IBNAL_CM_RETRY 7 /* # times to retry connection */ +#define IBNAL_FLOW_CONTROL 1 +#define IBNAL_RESPONDER_RESOURCES 8 -#define OPENIBNAL_NTX 64 /* # tx descs */ -#define OPENIBNAL_NTX_NBLK 256 /* # reserved tx descs */ +#define IBNAL_NTX 64 /* # tx descs */ +#define IBNAL_NTX_NBLK 256 /* # reserved tx descs */ -#define OPENIBNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define OPENIBNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define OPENIBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ +#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ /* default vals for runtime tunables */ -#define OPENIBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ +#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ /************************/ /* derived constants... 
*/ /* TX messages (shared by all connections) */ -#define OPENIBNAL_TX_MSGS (OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK) -#define OPENIBNAL_TX_MSG_BYTES (OPENIBNAL_TX_MSGS * OPENIBNAL_MSG_SIZE) -#define OPENIBNAL_TX_MSG_PAGES ((OPENIBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -/* we may have up to 2 completions per transmit */ -#define OPENIBNAL_TX_CQ_ENTRIES (2*OPENIBNAL_TX_MSGS) +#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) +#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) /* RX messages (per connection) */ -#define OPENIBNAL_RX_MSGS OPENIBNAL_MSG_QUEUE_SIZE -#define OPENIBNAL_RX_MSG_BYTES (OPENIBNAL_RX_MSGS * OPENIBNAL_MSG_SIZE) -#define OPENIBNAL_RX_MSG_PAGES ((OPENIBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) +#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE +#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) -/* 1 completion per receive, per connection */ -#define OPENIBNAL_RX_CQ_ENTRIES (OPENIBNAL_RX_MSGS * OPENIBNAL_CONCURRENT_PEERS) +/* we may have up to 2 completions per transmit + + 1 completion per receive, per connection */ +#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ + (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) -#define OPENIBNAL_RDMA_BASE 0x0eeb0000 -#define OPENIBNAL_FMR 1 -#define OPENIBNAL_CKSUM 0 -//#define OPENIBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS -#define OPENIBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT +#define IBNAL_RDMA_BASE 0x0eeb0000 +#define IBNAL_FMR 1 +#define IBNAL_CKSUM 0 +//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS +#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT typedef struct { - int koib_io_timeout; /* comms timeout (seconds) */ - struct ctl_table_header *koib_sysctl; /* sysctl interface */ -} koib_tunables_t; + int kib_io_timeout; /* comms timeout (seconds) */ + struct ctl_table_header *kib_sysctl; /* sysctl interface */ +} kib_tunables_t; 
typedef struct { - int oibp_npages; /* # pages */ - int oibp_mapped; /* mapped? */ - __u64 oibp_vaddr; /* mapped region vaddr */ - __u32 oibp_lkey; /* mapped region lkey */ - __u32 oibp_rkey; /* mapped region rkey */ - struct ib_mr *oibp_handle; /* mapped region handle */ - struct page *oibp_pages[0]; -} koib_pages_t; + int ibp_npages; /* # pages */ + int ibp_mapped; /* mapped? */ + __u64 ibp_vaddr; /* mapped region vaddr */ + __u32 ibp_lkey; /* mapped region lkey */ + __u32 ibp_rkey; /* mapped region rkey */ + struct ib_mr *ibp_handle; /* mapped region handle */ + struct page *ibp_pages[0]; +} kib_pages_t; typedef struct { - int koib_init; /* initialisation state */ - __u64 koib_incarnation; /* which one am I */ - int koib_shutdown; /* shut down? */ - atomic_t koib_nthreads; /* # live threads */ - - __u64 koib_cm_service_id; /* service number I listen on */ - ptl_nid_t koib_nid; /* my NID */ - struct semaphore koib_nid_mutex; /* serialise NID ops */ - struct semaphore koib_nid_signal; /* signal completion */ - - rwlock_t koib_global_lock; /* stabilize peer/conn ops */ - - struct list_head *koib_peers; /* hash table of all my known peers */ - int koib_peer_hash_size; /* size of koib_peers */ - atomic_t koib_npeers; /* # peers extant */ - atomic_t koib_nconns; /* # connections extant */ - - struct list_head koib_connd_conns; /* connections to progress */ - struct list_head koib_connd_peers; /* peers waiting for a connection */ - wait_queue_head_t koib_connd_waitq; /* connection daemons sleep here */ - unsigned long koib_connd_waketime; /* when connd will wake */ - spinlock_t koib_connd_lock; /* serialise */ - - wait_queue_head_t koib_sched_waitq; /* schedulers sleep here */ - struct list_head koib_sched_txq; /* tx requiring attention */ - struct list_head koib_sched_rxq; /* rx requiring attention */ - spinlock_t koib_sched_lock; /* serialise */ + int kib_init; /* initialisation state */ + __u64 kib_incarnation; /* which one am I */ + int kib_shutdown; /* shut down? 
*/ + atomic_t kib_nthreads; /* # live threads */ + + __u64 kib_service_id; /* service number I listen on */ + ptl_nid_t kib_nid; /* my NID */ + struct semaphore kib_nid_mutex; /* serialise NID ops */ + struct semaphore kib_nid_signal; /* signal completion */ + + rwlock_t kib_global_lock; /* stabilize peer/conn ops */ + + struct list_head *kib_peers; /* hash table of all my known peers */ + int kib_peer_hash_size; /* size of kib_peers */ + atomic_t kib_npeers; /* # peers extant */ + atomic_t kib_nconns; /* # connections extant */ + + struct list_head kib_connd_conns; /* connections to progress */ + struct list_head kib_connd_peers; /* peers waiting for a connection */ + wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ + unsigned long kib_connd_waketime; /* when connd will wake */ + spinlock_t kib_connd_lock; /* serialise */ + + wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ + struct list_head kib_sched_txq; /* tx requiring attention */ + struct list_head kib_sched_rxq; /* rx requiring attention */ + spinlock_t kib_sched_lock; /* serialise */ - struct koib_tx *koib_tx_descs; /* all the tx descriptors */ - koib_pages_t *koib_tx_pages; /* premapped tx msg pages */ - - struct list_head koib_idle_txs; /* idle tx descriptors */ - struct list_head koib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t koib_idle_tx_waitq; /* block here for tx descriptor */ - __u64 koib_next_tx_cookie; /* RDMA completion cookie */ - spinlock_t koib_tx_lock; /* serialise */ + struct kib_tx *kib_tx_descs; /* all the tx descriptors */ + kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ + + struct list_head kib_idle_txs; /* idle tx descriptors */ + struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ + wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ + __u64 kib_next_tx_cookie; /* RDMA completion cookie */ + spinlock_t kib_tx_lock; /* serialise */ - struct ib_device *koib_device; /* "the" 
device */ - struct ib_device_properties koib_device_props; /* its properties */ - int koib_port; /* port on the device */ - struct ib_port_properties koib_port_props; /* its properties */ - struct ib_pd *koib_pd; /* protection domain */ -#if OPENIBNAL_FMR - struct ib_fmr_pool *koib_fmr_pool; /* fast memory region pool */ + struct ib_device *kib_device; /* "the" device */ + struct ib_device_properties kib_device_props; /* its properties */ + int kib_port; /* port on the device */ + struct ib_port_properties kib_port_props; /* its properties */ + struct ib_pd *kib_pd; /* protection domain */ +#if IBNAL_FMR + struct ib_fmr_pool *kib_fmr_pool; /* fast memory region pool */ #endif - struct ib_cq *koib_rx_cq; /* receive completion queue */ - struct ib_cq *koib_tx_cq; /* transmit completion queue */ - void *koib_listen_handle; /* where I listen for connections */ - struct ib_common_attrib_service koib_service; /* SM service */ + struct ib_cq *kib_cq; /* completion queue */ + void *kib_listen_handle; /* where I listen for connections */ -} koib_data_t; - -#define OPENIBNAL_INIT_NOTHING 0 -#define OPENIBNAL_INIT_DATA 1 -#define OPENIBNAL_INIT_LIB 2 -#define OPENIBNAL_INIT_PD 3 -#define OPENIBNAL_INIT_FMR 4 -#define OPENIBNAL_INIT_TXD 5 -#define OPENIBNAL_INIT_RX_CQ 6 -#define OPENIBNAL_INIT_TX_CQ 7 -#define OPENIBNAL_INIT_ALL 8 +} kib_data_t; + +#define IBNAL_INIT_NOTHING 0 +#define IBNAL_INIT_DATA 1 +#define IBNAL_INIT_LIB 2 +#define IBNAL_INIT_PD 3 +#define IBNAL_INIT_FMR 4 +#define IBNAL_INIT_TXD 5 +#define IBNAL_INIT_CQ 6 +#define IBNAL_INIT_ALL 7 /************************************************************************ * Wire message structs. 
@@ -214,125 +210,125 @@ typedef struct __u32 md_lkey; __u32 md_rkey; __u64 md_addr; -} koib_md_t; +} kib_md_t; typedef struct { __u32 rd_key; /* remote key */ __u32 rd_nob; /* # of bytes */ __u64 rd_addr; /* remote io vaddr */ -} koib_rdma_desc_t; +} kib_rdma_desc_t; typedef struct { - ptl_hdr_t oibim_hdr; /* portals header */ - char oibim_payload[0]; /* piggy-backed payload */ -} koib_immediate_msg_t; + ptl_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} kib_immediate_msg_t; typedef struct { - ptl_hdr_t oibrm_hdr; /* portals header */ - __u64 oibrm_cookie; /* opaque completion cookie */ - koib_rdma_desc_t oibrm_desc; /* where to suck/blow */ -} koib_rdma_msg_t; + ptl_hdr_t ibrm_hdr; /* portals header */ + __u64 ibrm_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibrm_desc; /* where to suck/blow */ +} kib_rdma_msg_t; typedef struct { - __u64 oibcm_cookie; /* opaque completion cookie */ - __u32 oibcm_status; /* completion status */ -} koib_completion_msg_t; + __u64 ibcm_cookie; /* opaque completion cookie */ + __u32 ibcm_status; /* completion status */ +} kib_completion_msg_t; typedef struct { - __u32 oibm_magic; /* I'm an openibnal message */ - __u16 oibm_version; /* this is my version number */ - __u8 oibm_type; /* msg type */ - __u8 oibm_credits; /* returned credits */ -#if OPENIBNAL_CKSUM - __u32 oibm_nob; - __u32 oibm_cksum; + __u32 ibm_magic; /* I'm an openibnal message */ + __u16 ibm_version; /* this is my version number */ + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ +#if IBNAL_CKSUM + __u32 ibm_nob; + __u32 ibm_cksum; #endif union { - koib_immediate_msg_t immediate; - koib_rdma_msg_t rdma; - koib_completion_msg_t completion; - } oibm_u; -} koib_msg_t; - -#define OPENIBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ -#define OPENIBNAL_MSG_VERSION 1 /* current protocol version */ - -#define OPENIBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define OPENIBNAL_MSG_IMMEDIATE 0xd1 
/* portals hdr + payload */ -#define OPENIBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ -#define OPENIBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ -#define OPENIBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ -#define OPENIBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ + kib_immediate_msg_t immediate; + kib_rdma_msg_t rdma; + kib_completion_msg_t completion; + } ibm_u; +} kib_msg_t; + +#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ +#define IBNAL_MSG_VERSION 1 /* current protocol version */ + +#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ +#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ +#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ +#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ +#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ /***********************************************************************/ -typedef struct koib_rx /* receive message */ +typedef struct kib_rx /* receive message */ { struct list_head rx_list; /* queue for attention */ - struct koib_conn *rx_conn; /* owning conn */ + struct kib_conn *rx_conn; /* owning conn */ int rx_rdma; /* RDMA completion posted? */ int rx_posted; /* posted? */ __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ - koib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ + kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ struct ib_receive_param rx_sp; /* receive work item */ struct ib_gather_scatter rx_gl; /* and it's memory */ -} koib_rx_t; +} kib_rx_t; -typedef struct koib_tx /* transmit message */ +typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ int tx_isnblk; /* I'm reserved for non-blocking sends */ - struct koib_conn *tx_conn; /* owning conn */ + struct kib_conn *tx_conn; /* owning conn */ int tx_mapped; /* mapped for RDMA? 
*/ int tx_sending; /* # tx callbacks outstanding */ int tx_status; /* completion status */ - int tx_passive_rdma; /* waiting for peer to RDMA? */ - int tx_passive_rdma_wait; /* on ibc_rdma_queue */ - unsigned long tx_passive_rdma_deadline; /* completion deadline */ + unsigned long tx_deadline; /* completion deadline */ + int tx_passive_rdma; /* peer sucks/blows */ + int tx_passive_rdma_wait; /* waiting for peer to complete */ __u64 tx_passive_rdma_cookie; /* completion cookie */ lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ - koib_md_t tx_md; /* RDMA mapping (active/passive) */ + kib_md_t tx_md; /* RDMA mapping (active/passive) */ __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ - koib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ + kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ int tx_nsp; /* # send work items */ struct ib_send_param tx_sp[2]; /* send work items... */ struct ib_gather_scatter tx_gl[2]; /* ...and their memory */ -} koib_tx_t; +} kib_tx_t; -#define KOIB_TX_UNMAPPED 0 -#define KOIB_TX_MAPPED 1 -#define KOIB_TX_MAPPED_FMR 2 +#define KIB_TX_UNMAPPED 0 +#define KIB_TX_MAPPED 1 +#define KIB_TX_MAPPED_FMR 2 -typedef struct koib_wire_connreq +typedef struct kib_wire_connreq { __u32 wcr_magic; /* I'm an openibnal connreq */ __u16 wcr_version; /* this is my version number */ __u16 wcr_queue_depth; /* this is my receive queue size */ __u64 wcr_nid; /* peer's NID */ __u64 wcr_incarnation; /* peer's incarnation */ -} koib_wire_connreq_t; +} kib_wire_connreq_t; -typedef struct koib_connreq +typedef struct kib_connreq { /* connection-in-progress */ - struct koib_conn *cr_conn; - koib_wire_connreq_t cr_wcr; + struct kib_conn *cr_conn; + kib_wire_connreq_t cr_wcr; __u64 cr_tid; struct ib_common_attrib_service cr_service; tTS_IB_GID cr_gid; struct ib_path_record cr_path; struct ib_cm_active_param cr_connparam; -} koib_connreq_t; +} kib_connreq_t; -typedef struct koib_conn +typedef struct kib_conn { - struct koib_peer *ibc_peer; 
/* owning peer */ + struct kib_peer *ibc_peer; /* owning peer */ struct list_head ibc_list; /* stash on peer's conn list */ __u64 ibc_incarnation; /* which instance of the peer */ atomic_t ibc_refcount; /* # users */ @@ -342,27 +338,27 @@ typedef struct koib_conn int ibc_credits; /* # credits I have */ int ibc_outstanding_credits; /* # credits to return */ struct list_head ibc_tx_queue; /* send queue */ - struct list_head ibc_rdma_queue; /* tx awaiting RDMA completion */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ spinlock_t ibc_lock; /* serialise */ - koib_rx_t *ibc_rxs; /* the rx descs */ - koib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ struct ib_qp *ibc_qp; /* queue pair */ __u32 ibc_qpn; /* queue pair number */ tTS_IB_CM_COMM_ID ibc_comm_id; /* connection ID? */ - koib_connreq_t *ibc_connreq; /* connection request state */ -} koib_conn_t; + kib_connreq_t *ibc_connreq; /* connection request state */ +} kib_conn_t; -#define OPENIBNAL_CONN_INIT_NOTHING 0 /* initial state */ -#define OPENIBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ -#define OPENIBNAL_CONN_CONNECTING 2 /* started to connect */ -#define OPENIBNAL_CONN_ESTABLISHED 3 /* connection established */ -#define OPENIBNAL_CONN_DEATHROW 4 /* waiting to be closed */ -#define OPENIBNAL_CONN_ZOMBIE 5 /* waiting to be freed */ +#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ +#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ +#define IBNAL_CONN_CONNECTING 2 /* started to connect */ +#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ +#define IBNAL_CONN_DEATHROW 4 /* waiting to be closed */ +#define IBNAL_CONN_ZOMBIE 5 /* waiting to be freed */ -typedef struct koib_peer +typedef struct kib_peer { struct list_head ibp_list; /* stash on global peer list */ - struct list_head ibp_connd_list; /* schedule on koib_connd_peers */ + struct list_head ibp_connd_list; /* schedule on 
kib_connd_peers */ ptl_nid_t ibp_nid; /* who's on the other end(s) */ atomic_t ibp_refcount; /* # users */ int ibp_persistence; /* "known" peer refs */ @@ -371,30 +367,30 @@ typedef struct koib_peer int ibp_connecting; /* connecting+accepting */ unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ unsigned long ibp_reconnect_interval; /* exponential backoff */ -} koib_peer_t; +} kib_peer_t; -extern lib_nal_t koibnal_lib; -extern koib_data_t koibnal_data; -extern koib_tunables_t koibnal_tunables; +extern lib_nal_t kibnal_lib; +extern kib_data_t kibnal_data; +extern kib_tunables_t kibnal_tunables; static inline struct list_head * -koibnal_nid2peerlist (ptl_nid_t nid) +kibnal_nid2peerlist (ptl_nid_t nid) { - unsigned int hash = ((unsigned int)nid) % koibnal_data.koib_peer_hash_size; + unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; - return (&koibnal_data.koib_peers [hash]); + return (&kibnal_data.kib_peers [hash]); } static inline int -koibnal_peer_active(koib_peer_t *peer) +kibnal_peer_active(kib_peer_t *peer) { /* Am I in the peer hash table? 
*/ return (!list_empty(&peer->ibp_list)); } static inline void -koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn) +kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) { /* CAVEAT EMPTOR: tx takes caller's ref on conn */ @@ -402,40 +398,41 @@ koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn) LASSERT (tx->tx_conn == NULL); /* only set here */ tx->tx_conn = conn; + tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); } -#define KOIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \ - IB_SA_SERVICE_COMP_MASK_DATA8_1 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_2 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_3 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_4 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_5 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_6 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_7 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_8) +#define KIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \ + IB_SA_SERVICE_COMP_MASK_DATA8_1 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_2 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_3 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_4 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_5 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_6 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_7 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_8) static inline __u64* -koibnal_service_nid_field(struct ib_common_attrib_service *srv) +kibnal_service_nid_field(struct ib_common_attrib_service *srv) { - /* must be consistent with KOIBNAL_SERVICE_KEY_MASK */ + /* must be consistent with KIBNAL_SERVICE_KEY_MASK */ return (__u64 *)srv->service_data8; } static inline void -koibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid) +kibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid) { - LASSERT (strlen (OPENIBNAL_SERVICE_NAME) < sizeof(srv->service_name)); + LASSERT (strlen (IBNAL_SERVICE_NAME) < sizeof(srv->service_name)); memset (srv->service_name, 0, sizeof(srv->service_name)); - strcpy (srv->service_name, OPENIBNAL_SERVICE_NAME); + strcpy 
(srv->service_name, IBNAL_SERVICE_NAME); - *koibnal_service_nid_field(srv) = cpu_to_le64(nid); + *kibnal_service_nid_field(srv) = cpu_to_le64(nid); } #if 0 static inline void -koibnal_show_rdma_attr (koib_conn_t *conn) +kibnal_show_rdma_attr (kib_conn_t *conn) { struct ib_qp_attribute qp_attr; int rc; @@ -457,7 +454,7 @@ koibnal_show_rdma_attr (koib_conn_t *conn) #if CONFIG_X86 static inline __u64 -koibnal_page2phys (struct page *p) +kibnal_page2phys (struct page *p) { __u64 page_number = p - mem_map; @@ -467,42 +464,69 @@ koibnal_page2phys (struct page *p) # error "no page->phys" #endif -extern koib_peer_t *koibnal_create_peer (ptl_nid_t nid); -extern void koibnal_put_peer (koib_peer_t *peer); -extern int koibnal_del_peer (ptl_nid_t nid, int single_share); -extern koib_peer_t *koibnal_find_peer_locked (ptl_nid_t nid); -extern void koibnal_unlink_peer_locked (koib_peer_t *peer); -extern int koibnal_close_stale_conns_locked (koib_peer_t *peer, +/* CAVEAT EMPTOR: + * We rely on tx/rx descriptor alignment to allow us to use the lowest bit + * of the work request id as a flag to determine if the completion is for a + * transmit or a receive. It seems that the CQ entry's 'op' field + * isn't always set correctly on completions that occur after QP teardown. */ + +static inline __u64 +kibnal_ptr2wreqid (void *ptr, int isrx) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & 1) == 0); + return (__u64)(lptr | (isrx ?
1 : 0)); +} + +static inline void * +kibnal_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~1UL); +} + +static inline int +kibnal_wreqid_is_rx (__u64 wreqid) +{ + return (wreqid & 1) != 0; +} + +extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); +extern void kibnal_put_peer (kib_peer_t *peer); +extern int kibnal_del_peer (ptl_nid_t nid, int single_share); +extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); +extern void kibnal_unlink_peer_locked (kib_peer_t *peer); +extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation); -extern koib_conn_t *koibnal_create_conn (void); -extern void koibnal_put_conn (koib_conn_t *conn); -extern void koibnal_destroy_conn (koib_conn_t *conn); -extern int koibnal_alloc_pages (koib_pages_t **pp, int npages, int access); -extern void koibnal_free_pages (koib_pages_t *p); +extern kib_conn_t *kibnal_create_conn (void); +extern void kibnal_put_conn (kib_conn_t *conn); +extern void kibnal_destroy_conn (kib_conn_t *conn); +extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); +extern void kibnal_free_pages (kib_pages_t *p); -extern void koibnal_check_sends (koib_conn_t *conn); +extern void kibnal_check_sends (kib_conn_t *conn); extern tTS_IB_CM_CALLBACK_RETURN -koibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, +kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg); extern tTS_IB_CM_CALLBACK_RETURN -koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, +kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg); -extern void koibnal_close_conn_locked (koib_conn_t *conn, int error); -extern void koibnal_destroy_conn (koib_conn_t *conn); -extern int koibnal_thread_start (int (*fn)(void *arg), void *arg); -extern int koibnal_scheduler(void *arg); -extern int koibnal_connd (void *arg); -extern void koibnal_rx_callback (struct ib_cq *cq, struct 
ib_cq_entry *e, void *arg); -extern void koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg); -extern void koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob); -extern int koibnal_close_conn (koib_conn_t *conn, int why); -extern void koibnal_start_active_rdma (int type, int status, - koib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t nob); +extern void kibnal_close_conn_locked (kib_conn_t *conn, int error); +extern void kibnal_destroy_conn (kib_conn_t *conn); +extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); +extern int kibnal_scheduler(void *arg); +extern int kibnal_connd (void *arg); +extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg); +extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); +extern int kibnal_close_conn (kib_conn_t *conn, int why); +extern void kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob); + diff --git a/lustre/portals/knals/openibnal/openibnal_cb.c b/lustre/portals/knals/openibnal/openibnal_cb.c index 79bf37a..d774853 100644 --- a/lustre/portals/knals/openibnal/openibnal_cb.c +++ b/lustre/portals/knals/openibnal/openibnal_cb.c @@ -28,20 +28,20 @@ * */ void -koibnal_schedule_tx_done (koib_tx_t *tx) +kibnal_schedule_tx_done (kib_tx_t *tx) { unsigned long flags; - spin_lock_irqsave (&koibnal_data.koib_sched_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); - list_add_tail(&tx->tx_list, &koibnal_data.koib_sched_txq); - wake_up (&koibnal_data.koib_sched_waitq); + list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); + wake_up (&kibnal_data.kib_sched_waitq); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); } void -koibnal_tx_done (koib_tx_t *tx) +kibnal_tx_done (kib_tx_t 
*tx) { ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; unsigned long flags; @@ -49,31 +49,31 @@ koibnal_tx_done (koib_tx_t *tx) int rc; LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ - LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be on ibc_rdma_queue */ + LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ switch (tx->tx_mapped) { default: LBUG(); - case KOIB_TX_UNMAPPED: + case KIB_TX_UNMAPPED: break; - case KOIB_TX_MAPPED: + case KIB_TX_MAPPED: if (in_interrupt()) { /* can't deregister memory in IRQ context... */ - koibnal_schedule_tx_done(tx); + kibnal_schedule_tx_done(tx); return; } rc = ib_memory_deregister(tx->tx_md.md_handle.mr); LASSERT (rc == 0); - tx->tx_mapped = KOIB_TX_UNMAPPED; + tx->tx_mapped = KIB_TX_UNMAPPED; break; -#if OPENIBNAL_FMR - case KOIB_TX_MAPPED_FMR: +#if IBNAL_FMR + case KIB_TX_MAPPED_FMR: if (in_interrupt() && tx->tx_status != 0) { /* can't flush FMRs in IRQ context... */ - koibnal_schedule_tx_done(tx); + kibnal_schedule_tx_done(tx); return; } @@ -81,8 +81,8 @@ koibnal_tx_done (koib_tx_t *tx) LASSERT (rc == 0); if (tx->tx_status != 0) - ib_fmr_pool_force_flush(koibnal_data.koib_fmr_pool); - tx->tx_mapped = KOIB_TX_UNMAPPED; + ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); + tx->tx_mapped = KIB_TX_UNMAPPED; break; #endif } @@ -92,12 +92,12 @@ koibnal_tx_done (koib_tx_t *tx) if (tx->tx_libmsg[i] == NULL) continue; - lib_finalize (&koibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); + lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); tx->tx_libmsg[i] = NULL; } if (tx->tx_conn != NULL) { - koibnal_put_conn (tx->tx_conn); + kibnal_put_conn (tx->tx_conn); tx->tx_conn = NULL; } @@ -105,52 +105,52 @@ koibnal_tx_done (koib_tx_t *tx) tx->tx_passive_rdma = 0; tx->tx_status = 0; - spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_nblk_txs); + list_add_tail 
(&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); } else { - list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_txs); - wake_up (&koibnal_data.koib_idle_tx_waitq); + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); + wake_up (&kibnal_data.kib_idle_tx_waitq); } - spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); } -koib_tx_t * -koibnal_get_idle_tx (int may_block) +kib_tx_t * +kibnal_get_idle_tx (int may_block) { - unsigned long flags; - koib_tx_t *tx = NULL; + unsigned long flags; + kib_tx_t *tx = NULL; for (;;) { - spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); /* "normal" descriptor is free */ - if (!list_empty (&koibnal_data.koib_idle_txs)) { - tx = list_entry (koibnal_data.koib_idle_txs.next, - koib_tx_t, tx_list); + if (!list_empty (&kibnal_data.kib_idle_txs)) { + tx = list_entry (kibnal_data.kib_idle_txs.next, + kib_tx_t, tx_list); break; } if (!may_block) { /* may dip into reserve pool */ - if (list_empty (&koibnal_data.koib_idle_nblk_txs)) { + if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { CERROR ("reserved tx desc pool exhausted\n"); break; } - tx = list_entry (koibnal_data.koib_idle_nblk_txs.next, - koib_tx_t, tx_list); + tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, + kib_tx_t, tx_list); break; } /* block for idle tx */ - spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - wait_event (koibnal_data.koib_idle_tx_waitq, - !list_empty (&koibnal_data.koib_idle_txs) || - koibnal_data.koib_shutdown); + wait_event (kibnal_data.kib_idle_tx_waitq, + !list_empty (&kibnal_data.kib_idle_txs) || + kibnal_data.kib_shutdown); } if (tx != NULL) { @@ -159,9 +159,9 @@ koibnal_get_idle_tx (int may_block) /* Allocate a new passive RDMA completion cookie. It might * not be needed, but we've got a lock right now and we're * unlikely to wrap... 
*/ - tx->tx_passive_rdma_cookie = koibnal_data.koib_next_tx_cookie++; + tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; - LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); LASSERT (tx->tx_nsp == 0); LASSERT (tx->tx_sending == 0); LASSERT (tx->tx_status == 0); @@ -172,15 +172,15 @@ koibnal_get_idle_tx (int may_block) LASSERT (tx->tx_libmsg[1] == NULL); } - spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); return (tx); } int -koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { - /* I would guess that if koibnal_get_peer (nid) == NULL, + /* I would guess that if kibnal_get_peer (nid) == NULL, and we're not routing, then 'nid' is very distant :) */ if ( nal->libnal_ni.ni_pid.nid == nid ) { *dist = 0; @@ -192,7 +192,7 @@ koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) } void -koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status) +kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) { struct list_head *ttmp; unsigned long flags; @@ -200,30 +200,34 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status) spin_lock_irqsave (&conn->ibc_lock, flags); - list_for_each (ttmp, &conn->ibc_rdma_queue) { - koib_tx_t *tx = list_entry(ttmp, koib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma); - LASSERT (tx->tx_passive_rdma_wait); + list_for_each (ttmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); - if (tx->tx_passive_rdma_cookie != cookie) - continue; + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); - CDEBUG(D_NET, "Complete %p "LPD64"\n", tx, cookie); + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); - list_del (&tx->tx_list); + if (!tx->tx_passive_rdma_wait || + tx->tx_passive_rdma_cookie != cookie) + continue; + + CDEBUG(D_NET, "Complete 
%p "LPD64": %d\n", tx, cookie, status); + tx->tx_status = status; tx->tx_passive_rdma_wait = 0; idle = (tx->tx_sending == 0); - tx->tx_status = status; + if (idle) + list_del (&tx->tx_list); spin_unlock_irqrestore (&conn->ibc_lock, flags); /* I could be racing with tx callbacks. It's whoever * _makes_ tx idle that frees it */ if (idle) - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } @@ -234,32 +238,32 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status) } void -koibnal_post_rx (koib_rx_t *rx, int do_credits) +kibnal_post_rx (kib_rx_t *rx, int do_credits) { - koib_conn_t *conn = rx->rx_conn; + kib_conn_t *conn = rx->rx_conn; int rc; unsigned long flags; rx->rx_gl = (struct ib_gather_scatter) { .address = rx->rx_vaddr, - .length = OPENIBNAL_MSG_SIZE, - .key = conn->ibc_rx_pages->oibp_lkey, + .length = IBNAL_MSG_SIZE, + .key = conn->ibc_rx_pages->ibp_lkey, }; - + rx->rx_sp = (struct ib_receive_param) { - .work_request_id = (__u64)(unsigned long)rx, + .work_request_id = kibnal_ptr2wreqid(rx, 1), .scatter_list = &rx->rx_gl, .num_scatter_entries = 1, .device_specific = NULL, .signaled = 1, }; - LASSERT (conn->ibc_state >= OPENIBNAL_CONN_ESTABLISHED); + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); LASSERT (!rx->rx_posted); rx->rx_posted = 1; mb(); - if (conn->ibc_state != OPENIBNAL_CONN_ESTABLISHED) + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) rc = -ECONNABORTED; else rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1); @@ -270,26 +274,26 @@ koibnal_post_rx (koib_rx_t *rx, int do_credits) conn->ibc_outstanding_credits++; spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_check_sends(conn); + kibnal_check_sends(conn); } return; } - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) { + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { CERROR ("Error posting receive -> "LPX64": %d\n", conn->ibc_peer->ibp_nid, rc); - koibnal_close_conn (rx->rx_conn, rc); + kibnal_close_conn (rx->rx_conn, rc); } else { CDEBUG (D_NET, "Error 
posting receive -> "LPX64": %d\n", conn->ibc_peer->ibp_nid, rc); } /* Drop rx's ref */ - koibnal_put_conn (conn); + kibnal_put_conn (conn); } -#if OPENIBNAL_CKSUM -__u32 koibnal_cksum (void *ptr, int nob) +#if IBNAL_CKSUM +__u32 kibnal_cksum (void *ptr, int nob) { char *c = ptr; __u32 sum = 0; @@ -302,17 +306,17 @@ __u32 koibnal_cksum (void *ptr, int nob) #endif void -koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) +kibnal_rx_callback (struct ib_cq_entry *e) { - koib_rx_t *rx = (koib_rx_t *)((unsigned long)e->work_request_id); - koib_msg_t *msg = rx->rx_msg; - koib_conn_t *conn = rx->rx_conn; + kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id); + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; int nob = e->bytes_transferred; - const int base_nob = offsetof(koib_msg_t, oibm_u); + const int base_nob = offsetof(kib_msg_t, ibm_u); int credits; int flipped; unsigned long flags; -#if OPENIBNAL_CKSUM +#if IBNAL_CKSUM __u32 msg_cksum; __u32 computed_cksum; #endif @@ -324,11 +328,11 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) /* receives complete with error in any case after we've started * closing the QP */ - if (conn->ibc_state >= OPENIBNAL_CONN_DEATHROW) + if (conn->ibc_state >= IBNAL_CONN_DEATHROW) goto failed; /* We don't post receives until the conn is established */ - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED); + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); if (e->status != IB_COMPLETION_STATUS_SUCCESS) { CERROR("Rx from "LPX64" failed: %d\n", @@ -344,35 +348,35 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) /* Receiver does any byte flipping if necessary... 
*/ - if (msg->oibm_magic == OPENIBNAL_MSG_MAGIC) { + if (msg->ibm_magic == IBNAL_MSG_MAGIC) { flipped = 0; } else { - if (msg->oibm_magic != __swab32(OPENIBNAL_MSG_MAGIC)) { + if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { CERROR ("Unrecognised magic: %08x from "LPX64"\n", - msg->oibm_magic, conn->ibc_peer->ibp_nid); + msg->ibm_magic, conn->ibc_peer->ibp_nid); goto failed; } flipped = 1; - __swab16s (&msg->oibm_version); - LASSERT (sizeof(msg->oibm_type) == 1); - LASSERT (sizeof(msg->oibm_credits) == 1); + __swab16s (&msg->ibm_version); + LASSERT (sizeof(msg->ibm_type) == 1); + LASSERT (sizeof(msg->ibm_credits) == 1); } - if (msg->oibm_version != OPENIBNAL_MSG_VERSION) { + if (msg->ibm_version != IBNAL_MSG_VERSION) { CERROR ("Incompatible msg version %d (%d expected)\n", - msg->oibm_version, OPENIBNAL_MSG_VERSION); + msg->ibm_version, IBNAL_MSG_VERSION); goto failed; } -#if OPENIBNAL_CKSUM - if (nob != msg->oibm_nob) { - CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->oibm_nob); +#if IBNAL_CKSUM + if (nob != msg->ibm_nob) { + CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); goto failed; } - msg_cksum = le32_to_cpu(msg->oibm_cksum); - msg->oibm_cksum = 0; - computed_cksum = koibnal_cksum (msg, nob); + msg_cksum = le32_to_cpu(msg->ibm_cksum); + msg->ibm_cksum = 0; + computed_cksum = kibnal_cksum (msg, nob); if (msg_cksum != computed_cksum) { CERROR ("Checksum failure %d: (%d expected)\n", @@ -383,101 +387,101 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) #endif /* Have I received credits that will let me send? 
*/ - credits = msg->oibm_credits; + credits = msg->ibm_credits; if (credits != 0) { spin_lock_irqsave(&conn->ibc_lock, flags); conn->ibc_credits += credits; spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_check_sends(conn); + kibnal_check_sends(conn); } - switch (msg->oibm_type) { - case OPENIBNAL_MSG_NOOP: - koibnal_post_rx (rx, 1); + switch (msg->ibm_type) { + case IBNAL_MSG_NOOP: + kibnal_post_rx (rx, 1); return; - case OPENIBNAL_MSG_IMMEDIATE: - if (nob < base_nob + sizeof (koib_immediate_msg_t)) { + case IBNAL_MSG_IMMEDIATE: + if (nob < base_nob + sizeof (kib_immediate_msg_t)) { CERROR ("Short IMMEDIATE from "LPX64": %d\n", conn->ibc_peer->ibp_nid, nob); goto failed; } break; - case OPENIBNAL_MSG_PUT_RDMA: - case OPENIBNAL_MSG_GET_RDMA: - if (nob < base_nob + sizeof (koib_rdma_msg_t)) { + case IBNAL_MSG_PUT_RDMA: + case IBNAL_MSG_GET_RDMA: + if (nob < base_nob + sizeof (kib_rdma_msg_t)) { CERROR ("Short RDMA msg from "LPX64": %d\n", conn->ibc_peer->ibp_nid, nob); goto failed; } if (flipped) { - __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_key); - __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_nob); - __swab64s(&msg->oibm_u.rdma.oibrm_desc.rd_addr); + __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key); + __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob); + __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr); } CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n", - msg->oibm_type, msg->oibm_u.rdma.oibrm_cookie, - msg->oibm_u.rdma.oibrm_desc.rd_key, - msg->oibm_u.rdma.oibrm_desc.rd_addr, - msg->oibm_u.rdma.oibrm_desc.rd_nob); + msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie, + msg->ibm_u.rdma.ibrm_desc.rd_key, + msg->ibm_u.rdma.ibrm_desc.rd_addr, + msg->ibm_u.rdma.ibrm_desc.rd_nob); break; - case OPENIBNAL_MSG_PUT_DONE: - case OPENIBNAL_MSG_GET_DONE: - if (nob < base_nob + sizeof (koib_completion_msg_t)) { + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + if (nob < base_nob + sizeof (kib_completion_msg_t)) { CERROR ("Short COMPLETION msg from "LPX64": 
%d\n", conn->ibc_peer->ibp_nid, nob); goto failed; } if (flipped) - __swab32s(&msg->oibm_u.completion.oibcm_status); + __swab32s(&msg->ibm_u.completion.ibcm_status); CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", - msg->oibm_type, msg->oibm_u.completion.oibcm_cookie, - msg->oibm_u.completion.oibcm_status); + msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); - koibnal_complete_passive_rdma (conn, - msg->oibm_u.completion.oibcm_cookie, - msg->oibm_u.completion.oibcm_status); - koibnal_post_rx (rx, 1); + kibnal_complete_passive_rdma (conn, + msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + kibnal_post_rx (rx, 1); return; default: CERROR ("Can't parse type from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, msg->oibm_type); + conn->ibc_peer->ibp_nid, msg->ibm_type); goto failed; } - /* schedule for koibnal_rx() in thread context */ - spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags); + /* schedule for kibnal_rx() in thread context */ + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - list_add_tail (&rx->rx_list, &koibnal_data.koib_sched_rxq); - wake_up (&koibnal_data.koib_sched_waitq); + list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); + wake_up (&kibnal_data.kib_sched_waitq); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); return; failed: CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - koibnal_close_conn(conn, -ECONNABORTED); + kibnal_close_conn(conn, -ECONNABORTED); /* Don't re-post rx & drop its ref on conn */ - koibnal_put_conn(conn); + kibnal_put_conn(conn); } void -koibnal_rx (koib_rx_t *rx) +kibnal_rx (kib_rx_t *rx) { - koib_msg_t *msg = rx->rx_msg; + kib_msg_t *msg = rx->rx_msg; /* Clear flag so I can detect if I've sent an RDMA completion */ rx->rx_rdma = 0; - switch (msg->oibm_type) { - case OPENIBNAL_MSG_GET_RDMA: - lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx); + switch 
(msg->ibm_type) { + case IBNAL_MSG_GET_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); /* If the incoming get was matched, I'll have initiated the * RDMA and the completion message... */ if (rx->rx_rdma) @@ -487,12 +491,12 @@ koibnal_rx (koib_rx_t *rx) * the peer's GET blocking for the full timeout. */ CERROR ("Completing unmatched RDMA GET from "LPX64"\n", rx->rx_conn->ibc_peer->ibp_nid); - koibnal_start_active_rdma (OPENIBNAL_MSG_GET_DONE, -EIO, - rx, NULL, 0, NULL, NULL, 0, 0); + kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, + rx, NULL, 0, NULL, NULL, 0, 0); break; - case OPENIBNAL_MSG_PUT_RDMA: - lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx); + case IBNAL_MSG_PUT_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); if (rx->rx_rdma) break; /* This is most unusual, since even if lib_parse() didn't @@ -505,8 +509,8 @@ koibnal_rx (koib_rx_t *rx) rx->rx_conn->ibc_peer->ibp_nid); break; - case OPENIBNAL_MSG_IMMEDIATE: - lib_parse(&koibnal_lib, &msg->oibm_u.immediate.oibim_hdr, rx); + case IBNAL_MSG_IMMEDIATE: + lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); LASSERT (!rx->rx_rdma); break; @@ -515,12 +519,12 @@ koibnal_rx (koib_rx_t *rx) break; } - koibnal_post_rx (rx, 1); + kibnal_post_rx (rx, 1); } #if 0 int -koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) +kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) { struct page *page; @@ -531,7 +535,7 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) else if (vaddr >= PKMAP_BASE && vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) page = vmalloc_to_page ((void *)vaddr); - /* in 2.4 ^ just walks the page tables */ + /* in 2.4 ^ just walks the page tables */ #endif else page = virt_to_page (vaddr); @@ -540,13 +544,13 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) !VALID_PAGE (page)) return (-EFAULT); - *physp = koibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1)); + *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1)); 
return (0); } #endif int -koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, +kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access, int niov, struct iovec *iov, int offset, int nob) { @@ -555,7 +559,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, LASSERT (nob > 0); LASSERT (niov > 0); - LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); while (offset >= iov->iov_len) { offset -= iov->iov_len; @@ -572,7 +576,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, vaddr = (void *)(((unsigned long)iov->iov_base) + offset); tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); - rc = ib_memory_register (koibnal_data.koib_pd, + rc = ib_memory_register (kibnal_data.kib_pd, vaddr, nob, access, &tx->tx_md.md_handle.mr, @@ -584,21 +588,21 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, return (rc); } - tx->tx_mapped = KOIB_TX_MAPPED; + tx->tx_mapped = KIB_TX_MAPPED; return (0); } int -koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, +kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access, int nkiov, ptl_kiov_t *kiov, int offset, int nob) { -#if OPENIBNAL_FMR +#if IBNAL_FMR __u64 *phys; - const int mapped = KOIB_TX_MAPPED_FMR; + const int mapped = KIB_TX_MAPPED_FMR; #else struct ib_physical_buffer *phys; - const int mapped = KOIB_TX_MAPPED; + const int mapped = KIB_TX_MAPPED; #endif int page_offset; int nphys; @@ -610,7 +614,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, LASSERT (nob > 0); LASSERT (nkiov > 0); - LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); while (offset >= kiov->kiov_len) { offset -= kiov->kiov_len; @@ -627,10 +631,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, } page_offset = kiov->kiov_offset + offset; -#if OPENIBNAL_FMR - phys[0] = koibnal_page2phys(kiov->kiov_page); +#if IBNAL_FMR + phys[0] = kibnal_page2phys(kiov->kiov_page); #else - 
phys[0].address = koibnal_page2phys(kiov->kiov_page); + phys[0].address = kibnal_page2phys(kiov->kiov_page); phys[0].size = PAGE_SIZE; #endif nphys = 1; @@ -667,10 +671,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, } LASSERT (nphys * sizeof (*phys) < phys_size); -#if OPENIBNAL_FMR - phys[nphys] = koibnal_page2phys(kiov->kiov_page); +#if IBNAL_FMR + phys[nphys] = kibnal_page2phys(kiov->kiov_page); #else - phys[nphys].address = koibnal_page2phys(kiov->kiov_page); + phys[nphys].address = kibnal_page2phys(kiov->kiov_page); phys[nphys].size = PAGE_SIZE; #endif nphys++; @@ -683,10 +687,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, for (rc = 0; rc < nphys; rc++) CWARN (" [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size); #endif - tx->tx_md.md_addr = OPENIBNAL_RDMA_BASE; + tx->tx_md.md_addr = IBNAL_RDMA_BASE; -#if OPENIBNAL_FMR - rc = ib_fmr_register_physical (koibnal_data.koib_fmr_pool, +#if IBNAL_FMR + rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, phys, nphys, &tx->tx_md.md_addr, page_offset, @@ -694,7 +698,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey); #else - rc = ib_memory_register_physical (koibnal_data.koib_pd, + rc = ib_memory_register_physical (kibnal_data.kib_pd, phys, nphys, &tx->tx_md.md_addr, nob, page_offset, @@ -717,24 +721,24 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, return (rc); } -koib_conn_t * -koibnal_find_conn_locked (koib_peer_t *peer) +kib_conn_t * +kibnal_find_conn_locked (kib_peer_t *peer) { struct list_head *tmp; /* just return the first connection */ list_for_each (tmp, &peer->ibp_conns) { - return (list_entry(tmp, koib_conn_t, ibc_list)); + return (list_entry(tmp, kib_conn_t, ibc_list)); } return (NULL); } void -koibnal_check_sends (koib_conn_t *conn) +kibnal_check_sends (kib_conn_t *conn) { unsigned long flags; - koib_tx_t *tx; + kib_tx_t *tx; int rc; int i; int done; @@ -742,39 +746,39 @@ 
koibnal_check_sends (koib_conn_t *conn) spin_lock_irqsave (&conn->ibc_lock, flags); + LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); + if (list_empty(&conn->ibc_tx_queue) && - conn->ibc_outstanding_credits >= OPENIBNAL_CREDIT_HIGHWATER) { + conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { spin_unlock_irqrestore(&conn->ibc_lock, flags); - - tx = koibnal_get_idle_tx(0); /* don't block */ + + tx = kibnal_get_idle_tx(0); /* don't block */ if (tx != NULL) - koibnal_init_tx_msg(tx, OPENIBNAL_MSG_NOOP, 0); + kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); spin_lock_irqsave(&conn->ibc_lock, flags); - + if (tx != NULL) { atomic_inc(&conn->ibc_refcount); - koibnal_queue_tx_locked(tx, conn); + kibnal_queue_tx_locked(tx, conn); } } - LASSERT (conn->ibc_nsends_posted <= OPENIBNAL_MSG_QUEUE_SIZE); - while (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, koib_tx_t, tx_list); + tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); /* We rely on this for QP sizing */ LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2); LASSERT (conn->ibc_outstanding_credits >= 0); - LASSERT (conn->ibc_outstanding_credits <= OPENIBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); LASSERT (conn->ibc_credits >= 0); - LASSERT (conn->ibc_credits <= OPENIBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); /* Not on ibc_rdma_queue */ LASSERT (!tx->tx_passive_rdma_wait); - if (conn->ibc_nsends_posted == OPENIBNAL_MSG_QUEUE_SIZE) + if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) break; if (conn->ibc_credits == 0) /* no credits */ @@ -786,37 +790,29 @@ koibnal_check_sends (koib_conn_t *conn) list_del (&tx->tx_list); - if (tx->tx_msg->oibm_type == OPENIBNAL_MSG_NOOP && + if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && (!list_empty(&conn->ibc_tx_queue) || - conn->ibc_outstanding_credits < OPENIBNAL_CREDIT_HIGHWATER)) { - /* Redundant NOOP */ + conn->ibc_outstanding_credits < 
IBNAL_CREDIT_HIGHWATER)) { + /* redundant NOOP */ spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_tx_done(tx); + kibnal_tx_done(tx); spin_lock_irqsave(&conn->ibc_lock, flags); continue; } - - /* incoming RDMA completion can find this one now */ - if (tx->tx_passive_rdma) { - list_add (&tx->tx_list, &conn->ibc_rdma_queue); - tx->tx_passive_rdma_wait = 1; - tx->tx_passive_rdma_deadline = - jiffies + koibnal_tunables.koib_io_timeout * HZ; - } - tx->tx_msg->oibm_credits = conn->ibc_outstanding_credits; + tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; conn->ibc_outstanding_credits = 0; - /* use the free memory barrier when we unlock to ensure - * sending set before we can get the tx callback. */ conn->ibc_nsends_posted++; conn->ibc_credits--; - tx->tx_sending = tx->tx_nsp; -#if OPENIBNAL_CKSUM - tx->tx_msg->oibm_cksum = 0; - tx->tx_msg->oibm_cksum = koibnal_cksum(tx->tx_msg, tx->tx_msg->oibm_nob); - CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->oibm_cksum, tx->tx_msg->oibm_nob); + tx->tx_sending = tx->tx_nsp; + tx->tx_passive_rdma_wait = tx->tx_passive_rdma; + list_add (&tx->tx_list, &conn->ibc_active_txs); +#if IBNAL_CKSUM + tx->tx_msg->ibm_cksum = 0; + tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); + CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); #endif spin_unlock_irqrestore (&conn->ibc_lock, flags); @@ -827,7 +823,7 @@ koibnal_check_sends (koib_conn_t *conn) rc = -ECONNABORTED; nwork = 0; - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) { + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { tx->tx_status = 0; /* Driver only accepts 1 item at a time */ for (i = 0; i < tx->tx_nsp; i++) { @@ -842,31 +838,31 @@ koibnal_check_sends (koib_conn_t *conn) if (rc != 0) { /* NB credits are transferred in the actual * message, which can only be the last work item */ - conn->ibc_outstanding_credits += tx->tx_msg->oibm_credits; + conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; 
conn->ibc_credits++; conn->ibc_nsends_posted--; - tx->tx_sending -= tx->tx_nsp - nwork; + tx->tx_status = rc; + tx->tx_passive_rdma_wait = 0; + tx->tx_sending -= tx->tx_nsp - nwork; + done = (tx->tx_sending == 0); - - if (tx->tx_passive_rdma) { - tx->tx_passive_rdma_wait = 0; + if (done) list_del (&tx->tx_list); - } spin_unlock_irqrestore (&conn->ibc_lock, flags); - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) CERROR ("Error %d posting transmit to "LPX64"\n", rc, conn->ibc_peer->ibp_nid); else CDEBUG (D_NET, "Error %d posting transmit to " LPX64"\n", rc, conn->ibc_peer->ibp_nid); - koibnal_close_conn (conn, rc); + kibnal_close_conn (conn, rc); if (done) - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } @@ -876,10 +872,10 @@ koibnal_check_sends (koib_conn_t *conn) } void -koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) +kibnal_tx_callback (struct ib_cq_entry *e) { - koib_tx_t *tx = (koib_tx_t *)((unsigned long)e->work_request_id); - koib_conn_t *conn; + kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id); + kib_conn_t *conn; unsigned long flags; int idle; @@ -901,6 +897,8 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) tx->tx_sending--; idle = (tx->tx_sending == 0) && /* This is the final callback */ (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ + if (idle) + list_del(&tx->tx_list); CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, @@ -917,53 +915,62 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) spin_unlock_irqrestore(&conn->ibc_lock, flags); if (idle) - koibnal_tx_done (tx); + kibnal_tx_done (tx); if (e->status != IB_COMPLETION_STATUS_SUCCESS) { CERROR ("Tx completion to "LPX64" failed: %d\n", conn->ibc_peer->ibp_nid, e->status); - koibnal_close_conn (conn, -ENETDOWN); + kibnal_close_conn (conn, -ENETDOWN); } else { /* can I shovel some 
more sends out the door? */ - koibnal_check_sends(conn); + kibnal_check_sends(conn); } - koibnal_put_conn (conn); + kibnal_put_conn (conn); } void -koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob) +kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) +{ + if (kibnal_wreqid_is_rx(e->work_request_id)) + kibnal_rx_callback (e); + else + kibnal_tx_callback (e); +} + +void +kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) { struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp]; struct ib_send_param *sp = &tx->tx_sp[tx->tx_nsp]; int fence; - int nob = offsetof (koib_msg_t, oibm_u) + body_nob; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; LASSERT (tx->tx_nsp >= 0 && tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0])); - LASSERT (nob <= OPENIBNAL_MSG_SIZE); + LASSERT (nob <= IBNAL_MSG_SIZE); - tx->tx_msg->oibm_magic = OPENIBNAL_MSG_MAGIC; - tx->tx_msg->oibm_version = OPENIBNAL_MSG_VERSION; - tx->tx_msg->oibm_type = type; -#if OPENIBNAL_CKSUM - tx->tx_msg->oibm_nob = nob; + tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; + tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; + tx->tx_msg->ibm_type = type; +#if IBNAL_CKSUM + tx->tx_msg->ibm_nob = nob; #endif /* Fence the message if it's bundled with an RDMA read */ fence = (tx->tx_nsp > 0) && - (type == OPENIBNAL_MSG_PUT_DONE); + (type == IBNAL_MSG_PUT_DONE); *gl = (struct ib_gather_scatter) { .address = tx->tx_vaddr, .length = nob, - .key = koibnal_data.koib_tx_pages->oibp_lkey, + .key = kibnal_data.kib_tx_pages->ibp_lkey, }; /* NB If this is an RDMA read, the completion message must wait for * the RDMA to complete. Sends wait for previous RDMA writes * anyway... 
*/ *sp = (struct ib_send_param) { - .work_request_id = (__u64)((unsigned long)tx), + .work_request_id = kibnal_ptr2wreqid(tx, 0), .op = IB_OP_SEND, .gather_list = gl, .num_gather_entries = 1, @@ -979,26 +986,26 @@ koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob) } void -koibnal_queue_tx (koib_tx_t *tx, koib_conn_t *conn) +kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) { unsigned long flags; spin_lock_irqsave(&conn->ibc_lock, flags); - koibnal_queue_tx_locked (tx, conn); + kibnal_queue_tx_locked (tx, conn); spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_check_sends(conn); + kibnal_check_sends(conn); } void -koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) +kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) { unsigned long flags; - koib_peer_t *peer; - koib_conn_t *conn; - rwlock_t *g_lock = &koibnal_data.koib_global_lock; + kib_peer_t *peer; + kib_conn_t *conn; + rwlock_t *g_lock = &kibnal_data.kib_global_lock; /* If I get here, I've committed to send, so I complete the tx with * failure on any problems */ @@ -1008,15 +1015,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) read_lock (g_lock); - peer = koibnal_find_peer_locked (nid); + peer = kibnal_find_peer_locked (nid); if (peer == NULL) { read_unlock (g_lock); tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } - conn = koibnal_find_conn_locked (peer); + conn = kibnal_find_conn_locked (peer); if (conn != NULL) { CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, @@ -1024,7 +1031,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ read_unlock (g_lock); - koibnal_queue_tx (tx, conn); + kibnal_queue_tx (tx, conn); return; } @@ -1032,15 +1039,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) read_unlock (g_lock); write_lock_irqsave (g_lock, flags); - peer = koibnal_find_peer_locked (nid); + peer = kibnal_find_peer_locked (nid); if (peer == 
NULL) { write_unlock_irqrestore (g_lock, flags); tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } - conn = koibnal_find_conn_locked (peer); + conn = kibnal_find_conn_locked (peer); if (conn != NULL) { /* Connection exists; queue message on it */ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", @@ -1049,7 +1056,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ write_unlock_irqrestore (g_lock, flags); - koibnal_queue_tx (tx, conn); + kibnal_queue_tx (tx, conn); return; } @@ -1057,20 +1064,20 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { write_unlock_irqrestore (g_lock, flags); tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } peer->ibp_connecting = 1; atomic_inc (&peer->ibp_refcount); /* extra ref for connd */ - spin_lock (&koibnal_data.koib_connd_lock); + spin_lock (&kibnal_data.kib_connd_lock); list_add_tail (&peer->ibp_connd_list, - &koibnal_data.koib_connd_peers); - wake_up (&koibnal_data.koib_connd_waitq); + &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock (&koibnal_data.koib_connd_lock); + spin_unlock (&kibnal_data.kib_connd_lock); } /* A connection is being established; queue the message... 
*/ @@ -1080,49 +1087,49 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) } ptl_err_t -koibnal_start_passive_rdma (int type, ptl_nid_t nid, +kibnal_start_passive_rdma (int type, ptl_nid_t nid, lib_msg_t *libmsg, ptl_hdr_t *hdr) { int nob = libmsg->md->length; - koib_tx_t *tx; - koib_msg_t *oibmsg; + kib_tx_t *tx; + kib_msg_t *ibmsg; int rc; int access; - LASSERT (type == OPENIBNAL_MSG_PUT_RDMA || - type == OPENIBNAL_MSG_GET_RDMA); + LASSERT (type == IBNAL_MSG_PUT_RDMA || + type == IBNAL_MSG_GET_RDMA); LASSERT (nob > 0); LASSERT (!in_interrupt()); /* Mapping could block */ - if (type == OPENIBNAL_MSG_PUT_RDMA) { + if (type == IBNAL_MSG_PUT_RDMA) { access = IB_ACCESS_REMOTE_READ; } else { access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; } - tx = koibnal_get_idle_tx (1); /* May block; caller is an app thread */ + tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ LASSERT (tx != NULL); if ((libmsg->md->options & PTL_MD_KIOV) == 0) - rc = koibnal_map_iov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, nob); + rc = kibnal_map_iov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.iov, + 0, nob); else - rc = koibnal_map_kiov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, nob); + rc = kibnal_map_kiov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.kiov, + 0, nob); if (rc != 0) { CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); goto failed; } - if (type == OPENIBNAL_MSG_GET_RDMA) { + if (type == IBNAL_MSG_GET_RDMA) { /* reply gets finalized when tx completes */ - tx->tx_libmsg[1] = lib_create_reply_msg(&koibnal_lib, + tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg); if (tx->tx_libmsg[1] == NULL) { CERROR ("Can't create reply for GET -> "LPX64"\n", @@ -1134,15 +1141,15 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid, tx->tx_passive_rdma = 1; - oibmsg = tx->tx_msg; + ibmsg = tx->tx_msg; - oibmsg->oibm_u.rdma.oibrm_hdr = *hdr; - oibmsg->oibm_u.rdma.oibrm_cookie = 
tx->tx_passive_rdma_cookie; - oibmsg->oibm_u.rdma.oibrm_desc.rd_key = tx->tx_md.md_rkey; - oibmsg->oibm_u.rdma.oibrm_desc.rd_addr = tx->tx_md.md_addr; - oibmsg->oibm_u.rdma.oibrm_desc.rd_nob = nob; + ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; + ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; + ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey; + ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr; + ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob; - koibnal_init_tx_msg (tx, type, sizeof (koib_rdma_msg_t)); + kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t)); CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " LPX64", nob %d\n", @@ -1152,25 +1159,25 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid, /* libmsg gets finalized when tx completes. */ tx->tx_libmsg[0] = libmsg; - koibnal_launch_tx(tx, nid); + kibnal_launch_tx(tx, nid); return (PTL_OK); failed: tx->tx_status = rc; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return (PTL_FAIL); } void -koibnal_start_active_rdma (int type, int status, - koib_rx_t *rx, lib_msg_t *libmsg, +kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, size_t offset, size_t nob) { - koib_msg_t *rxmsg = rx->rx_msg; - koib_msg_t *txmsg; - koib_tx_t *tx; + kib_msg_t *rxmsg = rx->rx_msg; + kib_msg_t *txmsg; + kib_tx_t *tx; int access; int rdma_op; int rc; @@ -1187,8 +1194,8 @@ koibnal_start_active_rdma (int type, int status, /* No data if we're completing with failure */ LASSERT (status == 0 || nob == 0); - LASSERT (type == OPENIBNAL_MSG_GET_DONE || - type == OPENIBNAL_MSG_PUT_DONE); + LASSERT (type == IBNAL_MSG_GET_DONE || + type == IBNAL_MSG_PUT_DONE); /* Flag I'm completing the RDMA. 
Even if I fail to send the * completion message, I will have tried my best so further @@ -1196,22 +1203,22 @@ koibnal_start_active_rdma (int type, int status, LASSERT (!rx->rx_rdma); rx->rx_rdma = 1; - if (type == OPENIBNAL_MSG_GET_DONE) { + if (type == IBNAL_MSG_GET_DONE) { access = 0; rdma_op = IB_OP_RDMA_WRITE; - LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_GET_RDMA); + LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); } else { access = IB_ACCESS_LOCAL_WRITE; rdma_op = IB_OP_RDMA_READ; - LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_PUT_RDMA); + LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); } - tx = koibnal_get_idle_tx (0); /* Mustn't block */ + tx = kibnal_get_idle_tx (0); /* Mustn't block */ if (tx == NULL) { CERROR ("tx descs exhausted on RDMA from "LPX64 " completing locally with failure\n", - rx->rx_conn->ibc_peer->ibp_nid); - lib_finalize (&koibnal_lib, NULL, libmsg, PTL_NO_SPACE); + rx->rx_conn->ibc_peer->ibp_nid); + lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); return; } LASSERT (tx->tx_nsp == 0); @@ -1222,11 +1229,11 @@ koibnal_start_active_rdma (int type, int status, * message is matched) */ if (kiov != NULL) - rc = koibnal_map_kiov (tx, access, - niov, kiov, offset, nob); + rc = kibnal_map_kiov (tx, access, + niov, kiov, offset, nob); else - rc = koibnal_map_iov (tx, access, - niov, iov, offset, nob); + rc = kibnal_map_iov (tx, access, + niov, iov, offset, nob); if (rc != 0) { CERROR ("Can't map RDMA -> "LPX64": %d\n", @@ -1242,12 +1249,12 @@ koibnal_start_active_rdma (int type, int status, }; tx->tx_sp[0] = (struct ib_send_param) { - .work_request_id = (__u64)((unsigned long)tx), + .work_request_id = kibnal_ptr2wreqid(tx, 0), .op = rdma_op, .gather_list = &tx->tx_gl[0], .num_gather_entries = 1, - .remote_address = rxmsg->oibm_u.rdma.oibrm_desc.rd_addr, - .rkey = rxmsg->oibm_u.rdma.oibrm_desc.rd_key, + .remote_address = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr, + .rkey = rxmsg->ibm_u.rdma.ibrm_desc.rd_key, .device_specific = NULL, .solicited_event = 
0, .signaled = 1, @@ -1262,10 +1269,10 @@ koibnal_start_active_rdma (int type, int status, txmsg = tx->tx_msg; - txmsg->oibm_u.completion.oibcm_cookie = rxmsg->oibm_u.rdma.oibrm_cookie; - txmsg->oibm_u.completion.oibcm_status = status; + txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; + txmsg->ibm_u.completion.ibcm_status = status; - koibnal_init_tx_msg(tx, type, sizeof (koib_completion_msg_t)); + kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); if (status == 0 && nob != 0) { LASSERT (tx->tx_nsp > 1); @@ -1277,7 +1284,7 @@ koibnal_start_active_rdma (int type, int status, LASSERT (tx->tx_nsp == 1); /* No RDMA: local completion happens now! */ CDEBUG(D_WARNING,"No data: immediate completion\n"); - lib_finalize (&koibnal_lib, NULL, libmsg, + lib_finalize (&kibnal_lib, NULL, libmsg, status == 0 ? PTL_OK : PTL_FAIL); } @@ -1288,11 +1295,11 @@ koibnal_start_active_rdma (int type, int status, atomic_read (&rx->rx_conn->ibc_refcount)); atomic_inc (&rx->rx_conn->ibc_refcount); /* ...and queue it up */ - koibnal_queue_tx(tx, rx->rx_conn); + kibnal_queue_tx(tx, rx->rx_conn); } ptl_err_t -koibnal_sendmsg(lib_nal_t *nal, +kibnal_sendmsg(lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -1305,8 +1312,8 @@ koibnal_sendmsg(lib_nal_t *nal, size_t payload_offset, size_t payload_nob) { - koib_msg_t *oibmsg; - koib_tx_t *tx; + kib_msg_t *ibmsg; + kib_tx_t *tx; int nob; /* NB 'private' is different depending on what we're sending.... */ @@ -1329,27 +1336,27 @@ koibnal_sendmsg(lib_nal_t *nal, case PTL_MSG_REPLY: { /* reply's 'private' is the incoming receive */ - koib_rx_t *rx = private; + kib_rx_t *rx = private; /* RDMA reply expected? 
*/ - if (rx->rx_msg->oibm_type == OPENIBNAL_MSG_GET_RDMA) { - koibnal_start_active_rdma(OPENIBNAL_MSG_GET_DONE, 0, - rx, libmsg, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); + if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { + kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, + rx, libmsg, payload_niov, + payload_iov, payload_kiov, + payload_offset, payload_nob); return (PTL_OK); } /* Incoming message consistent with immediate reply? */ - if (rx->rx_msg->oibm_type != OPENIBNAL_MSG_IMMEDIATE) { + if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", - nid, rx->rx_msg->oibm_type); + nid, rx->rx_msg->ibm_type); return (PTL_FAIL); } /* Will it fit in a message? */ - nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]); - if (nob >= OPENIBNAL_MSG_SIZE) { + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob >= IBNAL_MSG_SIZE) { CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", nid, payload_nob); return (PTL_FAIL); @@ -1359,10 +1366,10 @@ koibnal_sendmsg(lib_nal_t *nal, case PTL_MSG_GET: /* might the REPLY message be big enough to need RDMA? */ - nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[libmsg->md->length]); - if (nob > OPENIBNAL_MSG_SIZE) - return (koibnal_start_passive_rdma(OPENIBNAL_MSG_GET_RDMA, - nid, libmsg, hdr)); + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, + nid, libmsg, hdr)); break; case PTL_MSG_ACK: @@ -1371,181 +1378,181 @@ koibnal_sendmsg(lib_nal_t *nal, case PTL_MSG_PUT: /* Is the payload big enough to need RDMA? 
*/ - nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]); - if (nob > OPENIBNAL_MSG_SIZE) - return (koibnal_start_passive_rdma(OPENIBNAL_MSG_PUT_RDMA, - nid, libmsg, hdr)); + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, + nid, libmsg, hdr)); break; } - tx = koibnal_get_idle_tx(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); + tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); if (tx == NULL) { CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", type, nid, in_interrupt() ? " (intr)" : ""); return (PTL_NO_SPACE); } - oibmsg = tx->tx_msg; - oibmsg->oibm_u.immediate.oibim_hdr = *hdr; + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; if (payload_nob > 0) { if (payload_kiov != NULL) - lib_copy_kiov2buf(oibmsg->oibm_u.immediate.oibim_payload, + lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, payload_niov, payload_kiov, payload_offset, payload_nob); else - lib_copy_iov2buf(oibmsg->oibm_u.immediate.oibim_payload, + lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, payload_niov, payload_iov, payload_offset, payload_nob); } - koibnal_init_tx_msg (tx, OPENIBNAL_MSG_IMMEDIATE, - offsetof(koib_immediate_msg_t, - oibim_payload[payload_nob])); + kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, + offsetof(kib_immediate_msg_t, + ibim_payload[payload_nob])); /* libmsg gets finalized when tx completes */ tx->tx_libmsg[0] = libmsg; - koibnal_launch_tx(tx, nid); + kibnal_launch_tx(tx, nid); return (PTL_OK); } ptl_err_t -koibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, +kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, size_t payload_offset, size_t payload_len) { - return (koibnal_sendmsg(nal, private, cookie, - hdr, type, nid, 
pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, payload_iov, NULL, + payload_offset, payload_len)); } ptl_err_t -koibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, +kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, size_t payload_offset, size_t payload_len) { - return (koibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, + payload_offset, payload_len)); } ptl_err_t -koibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, +kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, size_t offset, size_t mlen, size_t rlen) { - koib_rx_t *rx = private; - koib_msg_t *rxmsg = rx->rx_msg; - int msg_nob; + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + int msg_nob; LASSERT (mlen <= rlen); LASSERT (!in_interrupt ()); /* Either all pages or all vaddrs */ LASSERT (!(kiov != NULL && iov != NULL)); - switch (rxmsg->oibm_type) { + switch (rxmsg->ibm_type) { default: LBUG(); return (PTL_FAIL); - case OPENIBNAL_MSG_IMMEDIATE: - msg_nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[rlen]); - if (msg_nob > OPENIBNAL_MSG_SIZE) { + case IBNAL_MSG_IMMEDIATE: + msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (msg_nob > IBNAL_MSG_SIZE) { CERROR ("Immediate message from "LPX64" too big: %d\n", - rxmsg->oibm_u.immediate.oibim_hdr.src_nid, rlen); + rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); return (PTL_FAIL); } if (kiov != NULL) lib_copy_buf2kiov(niov, kiov, offset, - rxmsg->oibm_u.immediate.oibim_payload, + 
rxmsg->ibm_u.immediate.ibim_payload, mlen); else lib_copy_buf2iov(niov, iov, offset, - rxmsg->oibm_u.immediate.oibim_payload, + rxmsg->ibm_u.immediate.ibim_payload, mlen); lib_finalize (nal, NULL, libmsg, PTL_OK); return (PTL_OK); - case OPENIBNAL_MSG_GET_RDMA: + case IBNAL_MSG_GET_RDMA: /* We get called here just to discard any junk after the * GET hdr. */ LASSERT (libmsg == NULL); lib_finalize (nal, NULL, libmsg, PTL_OK); return (PTL_OK); - case OPENIBNAL_MSG_PUT_RDMA: - koibnal_start_active_rdma (OPENIBNAL_MSG_PUT_DONE, 0, - rx, libmsg, - niov, iov, kiov, offset, mlen); + case IBNAL_MSG_PUT_RDMA: + kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, + rx, libmsg, + niov, iov, kiov, offset, mlen); return (PTL_OK); } } ptl_err_t -koibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, +kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, struct iovec *iov, size_t offset, size_t mlen, size_t rlen) { - return (koibnal_recvmsg (nal, private, msg, niov, iov, NULL, - offset, mlen, rlen)); + return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, + offset, mlen, rlen)); } ptl_err_t -koibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, +kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, ptl_kiov_t *kiov, size_t offset, size_t mlen, size_t rlen) { - return (koibnal_recvmsg (nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen)); + return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, + offset, mlen, rlen)); } int -koibnal_thread_start (int (*fn)(void *arg), void *arg) +kibnal_thread_start (int (*fn)(void *arg), void *arg) { long pid = kernel_thread (fn, arg, 0); if (pid < 0) return ((int)pid); - atomic_inc (&koibnal_data.koib_nthreads); + atomic_inc (&kibnal_data.kib_nthreads); return (0); } void -koibnal_thread_fini (void) +kibnal_thread_fini (void) { - atomic_dec (&koibnal_data.koib_nthreads); + atomic_dec (&kibnal_data.kib_nthreads); } void -koibnal_close_conn_locked 
(koib_conn_t *conn, int error) +kibnal_close_conn_locked (kib_conn_t *conn, int error) { /* This just does the immmediate housekeeping, and schedules the * connection for the connd to finish off. - * Caller holds koib_global_lock exclusively in irq context */ - koib_peer_t *peer = conn->ibc_peer; + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; CDEBUG (error == 0 ? D_NET : D_ERROR, "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED || - conn->ibc_state == OPENIBNAL_CONN_CONNECTING); + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED || + conn->ibc_state == IBNAL_CONN_CONNECTING); - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) { - /* koib_connd_conns takes ibc_list's ref */ + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + /* kib_connd_conns takes ibc_list's ref */ list_del (&conn->ibc_list); } else { - /* new ref for koib_connd_conns */ + /* new ref for kib_connd_conns */ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); @@ -1555,57 +1562,57 @@ koibnal_close_conn_locked (koib_conn_t *conn, int error) if (list_empty (&peer->ibp_conns) && peer->ibp_persistence == 0) { /* Non-persistent peer with no more conns... 
*/ - koibnal_unlink_peer_locked (peer); + kibnal_unlink_peer_locked (peer); } - conn->ibc_state = OPENIBNAL_CONN_DEATHROW; + conn->ibc_state = IBNAL_CONN_DEATHROW; /* Schedule conn for closing/destruction */ - spin_lock (&koibnal_data.koib_connd_lock); + spin_lock (&kibnal_data.kib_connd_lock); - list_add_tail (&conn->ibc_list, &koibnal_data.koib_connd_conns); - wake_up (&koibnal_data.koib_connd_waitq); + list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock (&koibnal_data.koib_connd_lock); + spin_unlock (&kibnal_data.kib_connd_lock); } int -koibnal_close_conn (koib_conn_t *conn, int why) +kibnal_close_conn (kib_conn_t *conn, int why) { unsigned long flags; int count = 0; - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - LASSERT (conn->ibc_state >= OPENIBNAL_CONN_CONNECTING); + LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING); - if (conn->ibc_state <= OPENIBNAL_CONN_ESTABLISHED) { + if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) { count = 1; - koibnal_close_conn_locked (conn, why); + kibnal_close_conn_locked (conn, why); } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return (count); } void -koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc) +kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) { LIST_HEAD (zombies); - koib_tx_t *tx; + kib_tx_t *tx; unsigned long flags; LASSERT (rc != 0); - LASSERT (peer->ibp_reconnect_interval >= OPENIBNAL_MIN_RECONNECT_INTERVAL); + LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); LASSERT (peer->ibp_connecting != 0); peer->ibp_connecting--; if (peer->ibp_connecting != 0) { /* another connection attempt under way (loopback?)... 
*/ - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return; } @@ -1614,50 +1621,50 @@ koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc) peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; /* Increase reconnection interval */ peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, - OPENIBNAL_MAX_RECONNECT_INTERVAL); + IBNAL_MAX_RECONNECT_INTERVAL); /* Take peer's blocked blocked transmits; I'll complete * them with error */ while (!list_empty (&peer->ibp_tx_queue)) { tx = list_entry (peer->ibp_tx_queue.next, - koib_tx_t, tx_list); + kib_tx_t, tx_list); list_del (&tx->tx_list); list_add_tail (&tx->tx_list, &zombies); } - if (koibnal_peer_active(peer) && + if (kibnal_peer_active(peer) && (peer->ibp_persistence == 0)) { /* failed connection attempt on non-persistent peer */ - koibnal_unlink_peer_locked (peer); + kibnal_unlink_peer_locked (peer); } } else { /* Can't have blocked transmits if there are connections */ LASSERT (list_empty(&peer->ibp_tx_queue)); } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); if (!list_empty (&zombies)) CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid); while (!list_empty (&zombies)) { - tx = list_entry (zombies.next, koib_tx_t, tx_list); + tx = list_entry (zombies.next, kib_tx_t, tx_list); list_del (&tx->tx_list); /* complete now */ tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); } } void -koibnal_connreq_done (koib_conn_t *conn, int active, int status) +kibnal_connreq_done (kib_conn_t *conn, int active, int status) { int state = conn->ibc_state; - koib_peer_t *peer = conn->ibc_peer; - koib_tx_t *tx; + kib_peer_t *peer = conn->ibc_peer; + kib_tx_t *tx; unsigned long flags; int rc; int i; @@ -1669,31 +1676,31 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int 
status) conn->ibc_connreq = NULL; } - if (state == OPENIBNAL_CONN_CONNECTING) { + if (state == IBNAL_CONN_CONNECTING) { /* Install common (active/passive) callback for * disconnect/idle notification if I got as far as getting * a CM comm_id */ rc = tsIbCmCallbackModify(conn->ibc_comm_id, - koibnal_conn_callback, conn); + kibnal_conn_callback, conn); LASSERT (rc == 0); } - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); LASSERT (peer->ibp_connecting != 0); if (status == 0) { /* connection established... */ - LASSERT (state == OPENIBNAL_CONN_CONNECTING); - conn->ibc_state = OPENIBNAL_CONN_ESTABLISHED; + LASSERT (state == IBNAL_CONN_CONNECTING); + conn->ibc_state = IBNAL_CONN_ESTABLISHED; - if (!koibnal_peer_active(peer)) { + if (!kibnal_peer_active(peer)) { /* ...but peer deleted meantime */ status = -ECONNABORTED; } } else { - LASSERT (state == OPENIBNAL_CONN_INIT_QP || - state == OPENIBNAL_CONN_CONNECTING); + LASSERT (state == IBNAL_CONN_INIT_QP || + state == IBNAL_CONN_CONNECTING); } if (status == 0) { @@ -1710,14 +1717,14 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) list_add (&conn->ibc_list, &peer->ibp_conns); /* reset reconnect interval for next attempt */ - peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; /* post blocked sends to the new connection */ spin_lock (&conn->ibc_lock); while (!list_empty (&peer->ibp_tx_queue)) { tx = list_entry (peer->ibp_tx_queue.next, - koib_tx_t, tx_list); + kib_tx_t, tx_list); list_del (&tx->tx_list); @@ -1726,19 +1733,19 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); atomic_inc (&conn->ibc_refcount); - koibnal_queue_tx_locked (tx, conn); + kibnal_queue_tx_locked (tx, conn); } spin_unlock (&conn->ibc_lock); /* Nuke any dangling conns from a different peer 
instance... */ - koibnal_close_stale_conns_locked (conn->ibc_peer, - conn->ibc_incarnation); + kibnal_close_stale_conns_locked (conn->ibc_peer, + conn->ibc_incarnation); - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); /* queue up all the receives */ - for (i = 0; i < OPENIBNAL_RX_MSGS; i++) { + for (i = 0; i < IBNAL_RX_MSGS; i++) { /* +1 ref for rx desc */ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, @@ -1749,71 +1756,71 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, conn->ibc_rxs[i].rx_vaddr); - koibnal_post_rx (&conn->ibc_rxs[i], 0); + kibnal_post_rx (&conn->ibc_rxs[i], 0); } - koibnal_check_sends (conn); + kibnal_check_sends (conn); return; } /* connection failed */ - if (state == OPENIBNAL_CONN_CONNECTING) { + if (state == IBNAL_CONN_CONNECTING) { /* schedule for connd to close */ - koibnal_close_conn_locked (conn, status); + kibnal_close_conn_locked (conn, status); } else { /* Don't have a CM comm_id; just wait for refs to drain */ - conn->ibc_state = OPENIBNAL_CONN_ZOMBIE; + conn->ibc_state = IBNAL_CONN_ZOMBIE; } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - koibnal_peer_connect_failed (conn->ibc_peer, active, status); + kibnal_peer_connect_failed (conn->ibc_peer, active, status); - if (state != OPENIBNAL_CONN_CONNECTING) { + if (state != IBNAL_CONN_CONNECTING) { /* drop caller's ref if we're not waiting for the * IB_CM_IDLE callback */ - koibnal_put_conn (conn); + kibnal_put_conn (conn); } } int -koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid, +kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, ptl_nid_t nid, __u64 incarnation, int queue_depth) { - koib_conn_t *conn = koibnal_create_conn(); - koib_peer_t *peer; - koib_peer_t *peer2; + kib_conn_t 
*conn = kibnal_create_conn(); + kib_peer_t *peer; + kib_peer_t *peer2; unsigned long flags; if (conn == NULL) return (-ENOMEM); - if (queue_depth != OPENIBNAL_MSG_QUEUE_SIZE) { + if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", - nid, queue_depth, OPENIBNAL_MSG_QUEUE_SIZE); + nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); return (-EPROTO); } /* assume 'nid' is a new peer */ - peer = koibnal_create_peer (nid); + peer = kibnal_create_peer (nid); if (peer == NULL) { CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); atomic_dec (&conn->ibc_refcount); - koibnal_destroy_conn(conn); + kibnal_destroy_conn(conn); return (-ENOMEM); } - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - peer2 = koibnal_find_peer_locked(nid); + peer2 = kibnal_find_peer_locked(nid); if (peer2 == NULL) { /* peer table takes my ref on peer */ list_add_tail (&peer->ibp_list, - koibnal_nid2peerlist(nid)); + kibnal_nid2peerlist(nid)); } else { - koibnal_put_peer (peer); + kibnal_put_peer (peer); peer = peer2; } @@ -1821,20 +1828,20 @@ koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid, atomic_inc (&peer->ibp_refcount); peer->ibp_connecting++; - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); conn->ibc_peer = peer; - conn->ibc_state = OPENIBNAL_CONN_CONNECTING; + conn->ibc_state = IBNAL_CONN_CONNECTING; conn->ibc_comm_id = cid; conn->ibc_incarnation = incarnation; - conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE; + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; *connp = conn; return (0); } tTS_IB_CM_CALLBACK_RETURN -koibnal_idle_conn_callback (tTS_IB_CM_EVENT event, +kibnal_idle_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) @@ -1846,13 +1853,19 @@ 
koibnal_idle_conn_callback (tTS_IB_CM_EVENT event, } tTS_IB_CM_CALLBACK_RETURN -koibnal_conn_callback (tTS_IB_CM_EVENT event, +kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) { - koib_conn_t *conn = arg; - int rc; + kib_conn_t *conn = arg; + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + unsigned long flags; + int done; + int rc; /* Established Connection Notifier */ @@ -1860,24 +1873,72 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event, default: CERROR("Connection %p -> "LPX64" ERROR %d\n", conn, conn->ibc_peer->ibp_nid, event); - koibnal_close_conn (conn, -ECONNABORTED); + kibnal_close_conn (conn, -ECONNABORTED); break; case TS_IB_CM_DISCONNECTED: CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n", conn, conn->ibc_peer->ibp_nid); - koibnal_close_conn (conn, 0); + kibnal_close_conn (conn, 0); break; case TS_IB_CM_IDLE: CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n", conn, conn->ibc_peer->ibp_nid); - koibnal_put_conn (conn); /* Lose CM's ref */ + kibnal_put_conn (conn); /* Lose CM's ref */ /* LASSERT (no further callbacks) */ rc = tsIbCmCallbackModify(cid, - koibnal_idle_conn_callback, conn); + kibnal_idle_conn_callback, conn); LASSERT (rc == 0); + + /* NB we wait until the connection has closed before + * completing outstanding passive RDMAs so we can be sure + * the network can't touch the mapped memory any more. */ + + spin_lock_irqsave (&conn->ibc_lock, flags); + + /* grab passive RDMAs not waiting for the tx callback */ + list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + /* still waiting for tx callback? 
*/ + if (!tx->tx_passive_rdma_wait) + continue; + + tx->tx_status = -ECONNABORTED; + tx->tx_passive_rdma_wait = 0; + done = (tx->tx_sending == 0); + + if (!done) + continue; + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + /* grab all blocked transmits */ + list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + while (!list_empty(&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + kibnal_tx_done (tx); + } break; } @@ -1885,12 +1946,12 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event, } tTS_IB_CM_CALLBACK_RETURN -koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, +kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; int rc; switch (event) { @@ -1903,12 +1964,12 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, CERROR ("Unexpected event %p -> "LPX64": %d\n", conn, conn->ibc_peer->ibp_nid, event); - koibnal_connreq_done (conn, 0, -ECONNABORTED); + kibnal_connreq_done (conn, 0, -ECONNABORTED); break; case TS_IB_CM_REQ_RECEIVED: { struct ib_cm_req_received_param *req = param; - koib_wire_connreq_t *wcr = req->remote_private_data; + kib_wire_connreq_t *wcr = req->remote_private_data; LASSERT (conn == NULL); @@ -1920,23 +1981,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, return TS_IB_CM_CALLBACK_ABORT; } - if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) { + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { CERROR ("Can't accept LID %04x: bad magic %08x\n", req->dlid, le32_to_cpu(wcr->wcr_magic)); return TS_IB_CM_CALLBACK_ABORT; } - if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) { + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { CERROR ("Can't accept LID %04x: bad version %d\n", 
req->dlid, le16_to_cpu(wcr->wcr_magic)); return TS_IB_CM_CALLBACK_ABORT; } - rc = koibnal_accept(&conn, - cid, - le64_to_cpu(wcr->wcr_nid), - le64_to_cpu(wcr->wcr_incarnation), - le16_to_cpu(wcr->wcr_queue_depth)); + rc = kibnal_accept(&conn, + cid, + le64_to_cpu(wcr->wcr_nid), + le64_to_cpu(wcr->wcr_incarnation), + le16_to_cpu(wcr->wcr_queue_depth)); if (rc != 0) { CERROR ("Can't accept "LPX64": %d\n", le64_to_cpu(wcr->wcr_nid), rc); @@ -1945,23 +2006,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, /* update 'arg' for next callback */ rc = tsIbCmCallbackModify(cid, - koibnal_passive_conn_callback, conn); + kibnal_passive_conn_callback, conn); LASSERT (rc == 0); req->accept_param.qp = conn->ibc_qp; - *((koib_wire_connreq_t *)req->accept_param.reply_private_data) - = (koib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(OPENIBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(OPENIBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le32(OPENIBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(koibnal_data.koib_nid), - .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation), + *((kib_wire_connreq_t *)req->accept_param.reply_private_data) + = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), }; - req->accept_param.reply_private_data_len = sizeof(koib_wire_connreq_t); - req->accept_param.responder_resources = OPENIBNAL_RESPONDER_RESOURCES; - req->accept_param.initiator_depth = OPENIBNAL_RESPONDER_RESOURCES; - req->accept_param.rnr_retry_count = OPENIBNAL_RNR_RETRY; - req->accept_param.flow_control = OPENIBNAL_FLOW_CONTROL; + req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t); + req->accept_param.responder_resources = IBNAL_RESPONDER_RESOURCES; + req->accept_param.initiator_depth = IBNAL_RESPONDER_RESOURCES; + 
req->accept_param.rnr_retry_count = IBNAL_RNR_RETRY; + req->accept_param.flow_control = IBNAL_FLOW_CONTROL; CDEBUG(D_NET, "Proceeding\n"); break; @@ -1972,60 +2033,60 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", conn, conn->ibc_peer->ibp_nid); - koibnal_connreq_done (conn, 0, 0); + kibnal_connreq_done (conn, 0, 0); break; } - /* NB if the connreq is done, we switch to koibnal_conn_callback */ + /* NB if the connreq is done, we switch to kibnal_conn_callback */ return TS_IB_CM_CALLBACK_PROCEED; } tTS_IB_CM_CALLBACK_RETURN -koibnal_active_conn_callback (tTS_IB_CM_EVENT event, +kibnal_active_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; switch (event) { case TS_IB_CM_REP_RECEIVED: { struct ib_cm_rep_received_param *rep = param; - koib_wire_connreq_t *wcr = rep->remote_private_data; + kib_wire_connreq_t *wcr = rep->remote_private_data; if (rep->remote_private_data_len < sizeof (*wcr)) { CERROR ("Short reply from "LPX64": %d\n", conn->ibc_peer->ibp_nid, rep->remote_private_data_len); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } - if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) { + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { CERROR ("Can't connect "LPX64": bad magic %08x\n", conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic)); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } - if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) { + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { CERROR ("Can't connect "LPX64": bad version %d\n", conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic)); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } - if (wcr->wcr_queue_depth != cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE)) { + if (wcr->wcr_queue_depth != 
cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) { CERROR ("Can't connect "LPX64": bad queue depth %d\n", conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth)); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { CERROR ("Unexpected NID "LPX64" from "LPX64"\n", le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } @@ -2033,7 +2094,7 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event, conn, conn->ibc_peer->ibp_nid); conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); - conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE; + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; break; } @@ -2041,86 +2102,86 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event, CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n", conn, conn->ibc_peer->ibp_nid); - koibnal_connreq_done (conn, 1, 0); + kibnal_connreq_done (conn, 1, 0); break; case TS_IB_CM_IDLE: CERROR("Connection %p -> "LPX64" IDLE\n", conn, conn->ibc_peer->ibp_nid); /* Back out state change: I'm disengaged from CM */ - conn->ibc_state = OPENIBNAL_CONN_INIT_QP; + conn->ibc_state = IBNAL_CONN_INIT_QP; - koibnal_connreq_done (conn, 1, -ECONNABORTED); + kibnal_connreq_done (conn, 1, -ECONNABORTED); break; default: CERROR("Connection %p -> "LPX64" ERROR %d\n", conn, conn->ibc_peer->ibp_nid, event); - koibnal_connreq_done (conn, 1, -ECONNABORTED); + kibnal_connreq_done (conn, 1, -ECONNABORTED); break; } - /* NB if the connreq is done, we switch to koibnal_conn_callback */ + /* NB if the connreq is done, we switch to kibnal_conn_callback */ return TS_IB_CM_CALLBACK_PROCEED; } int -koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, +kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, struct ib_path_record *resp, int remaining, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; if (status != 0) { 
CERROR ("status %d\n", status); - koibnal_connreq_done (conn, 1, status); + kibnal_connreq_done (conn, 1, status); goto out; } conn->ibc_connreq->cr_path = *resp; - conn->ibc_connreq->cr_wcr = (koib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(OPENIBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(OPENIBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(koibnal_data.koib_nid), - .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation), + conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), }; conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) { .qp = conn->ibc_qp, .req_private_data = &conn->ibc_connreq->cr_wcr, .req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr), - .responder_resources = OPENIBNAL_RESPONDER_RESOURCES, - .initiator_depth = OPENIBNAL_RESPONDER_RESOURCES, - .retry_count = OPENIBNAL_RETRY, - .rnr_retry_count = OPENIBNAL_RNR_RETRY, - .cm_response_timeout = koibnal_tunables.koib_io_timeout, - .max_cm_retries = OPENIBNAL_CM_RETRY, - .flow_control = OPENIBNAL_FLOW_CONTROL, + .responder_resources = IBNAL_RESPONDER_RESOURCES, + .initiator_depth = IBNAL_RESPONDER_RESOURCES, + .retry_count = IBNAL_RETRY, + .rnr_retry_count = IBNAL_RNR_RETRY, + .cm_response_timeout = kibnal_tunables.kib_io_timeout, + .max_cm_retries = IBNAL_CM_RETRY, + .flow_control = IBNAL_FLOW_CONTROL, }; /* XXX set timeout just like SDP!!!*/ conn->ibc_connreq->cr_path.packet_life = 13; /* Flag I'm getting involved with the CM... 
*/ - conn->ibc_state = OPENIBNAL_CONN_CONNECTING; + conn->ibc_state = IBNAL_CONN_CONNECTING; CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", conn->ibc_connreq->cr_service.service_id, - *koibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); - /* koibnal_connect_callback gets my conn ref */ + /* kibnal_connect_callback gets my conn ref */ status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, &conn->ibc_connreq->cr_path, NULL, conn->ibc_connreq->cr_service.service_id, 0, - koibnal_active_conn_callback, conn, + kibnal_active_conn_callback, conn, &conn->ibc_comm_id); if (status != 0) { CERROR ("Connect: %d\n", status); /* Back out state change: I've not got a CM comm_id yet... */ - conn->ibc_state = OPENIBNAL_CONN_INIT_QP; - koibnal_connreq_done (conn, 1, status); + conn->ibc_state = IBNAL_CONN_INIT_QP; + kibnal_connreq_done (conn, 1, status); } out: @@ -2129,58 +2190,58 @@ koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, } void -koibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, - struct ib_common_attrib_service *resp, void *arg) +kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, + struct ib_common_attrib_service *resp, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; if (status != 0) { CERROR ("status %d\n", status); - koibnal_connreq_done (conn, 1, status); + kibnal_connreq_done (conn, 1, status); return; } CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n", status, resp->service_id, - *koibnal_service_nid_field(resp)); + *kibnal_service_nid_field(resp)); conn->ibc_connreq->cr_service = *resp; - status = ib_cached_gid_get(koibnal_data.koib_device, - koibnal_data.koib_port, 0, + status = ib_cached_gid_get(kibnal_data.kib_device, + kibnal_data.kib_port, 0, conn->ibc_connreq->cr_gid); LASSERT (status == 0); - /* koibnal_pathreq_callback gets my conn ref */ - status = tsIbPathRecordRequest 
(koibnal_data.koib_device, - koibnal_data.koib_port, + /* kibnal_pathreq_callback gets my conn ref */ + status = tsIbPathRecordRequest (kibnal_data.kib_device, + kibnal_data.kib_port, conn->ibc_connreq->cr_gid, conn->ibc_connreq->cr_service.service_gid, conn->ibc_connreq->cr_service.service_pkey, 0, - koibnal_tunables.koib_io_timeout * HZ, + kibnal_tunables.kib_io_timeout * HZ, 0, - koibnal_pathreq_callback, conn, + kibnal_pathreq_callback, conn, &conn->ibc_connreq->cr_tid); if (status == 0) return; CERROR ("Path record request: %d\n", status); - koibnal_connreq_done (conn, 1, status); + kibnal_connreq_done (conn, 1, status); } void -koibnal_connect_peer (koib_peer_t *peer) +kibnal_connect_peer (kib_peer_t *peer) { - koib_conn_t *conn = koibnal_create_conn(); + kib_conn_t *conn = kibnal_create_conn(); int rc; LASSERT (peer->ibp_connecting != 0); if (conn == NULL) { CERROR ("Can't allocate conn\n"); - koibnal_peer_connect_failed (peer, 1, -ENOMEM); + kibnal_peer_connect_failed (peer, 1, -ENOMEM); return; } @@ -2190,85 +2251,101 @@ koibnal_connect_peer (koib_peer_t *peer) PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); if (conn->ibc_connreq == NULL) { CERROR ("Can't allocate connreq\n"); - koibnal_connreq_done (conn, 1, -ENOMEM); + kibnal_connreq_done (conn, 1, -ENOMEM); return; } memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); - koibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); + kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); - /* koibnal_service_get_callback gets my conn ref */ - rc = ib_service_get (koibnal_data.koib_device, - koibnal_data.koib_port, + /* kibnal_service_get_callback gets my conn ref */ + rc = ib_service_get (kibnal_data.kib_device, + kibnal_data.kib_port, &conn->ibc_connreq->cr_service, - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_get_callback, conn, + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + 
kibnal_service_get_callback, conn, &conn->ibc_connreq->cr_tid); if (rc == 0) return; CERROR ("ib_service_get: %d\n", rc); - koibnal_connreq_done (conn, 1, rc); + kibnal_connreq_done (conn, 1, rc); } int -koibnal_conn_timed_out (koib_conn_t *conn) +kibnal_conn_timed_out (kib_conn_t *conn) { - koib_tx_t *tx; + kib_tx_t *tx; struct list_head *ttmp; unsigned long flags; - int rc = 0; spin_lock_irqsave (&conn->ibc_lock, flags); - list_for_each (ttmp, &conn->ibc_rdma_queue) { - tx = list_entry (ttmp, koib_tx_t, tx_list); + list_for_each (ttmp, &conn->ibc_tx_queue) { + tx = list_entry (ttmp, kib_tx_t, tx_list); - LASSERT (tx->tx_passive_rdma); - LASSERT (tx->tx_passive_rdma_wait); + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_sending == 0); - if (time_after_eq (jiffies, tx->tx_passive_rdma_deadline)) { - rc = 1; - break; + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; } } + + list_for_each (ttmp, &conn->ibc_active_txs) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + spin_unlock_irqrestore (&conn->ibc_lock, flags); - return rc; + return 0; } void -koibnal_check_conns (int idx) +kibnal_check_conns (int idx) { - struct list_head *peers = &koibnal_data.koib_peers[idx]; + struct list_head *peers = &kibnal_data.kib_peers[idx]; struct list_head *ptmp; - koib_peer_t *peer; - koib_conn_t *conn; + kib_peer_t *peer; + kib_conn_t *conn; struct list_head *ctmp; again: /* NB. We expect to have a look at all the peers and not find any * rdmas to time out, so we just use a shared lock while we * take a look... 
*/ - read_lock (&koibnal_data.koib_global_lock); + read_lock (&kibnal_data.kib_global_lock); list_for_each (ptmp, peers) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); list_for_each (ctmp, &peer->ibp_conns) { - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED); /* In case we have enough credits to return via a * NOOP, but there were no non-blocking tx descs * free to do it last time... */ - koibnal_check_sends(conn); + kibnal_check_sends(conn); - if (!koibnal_conn_timed_out(conn)) + if (!kibnal_conn_timed_out(conn)) continue; CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", @@ -2276,108 +2353,76 @@ koibnal_check_conns (int idx) atomic_read (&conn->ibc_refcount)); atomic_inc (&conn->ibc_refcount); - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); CERROR("Timed out RDMA with "LPX64"\n", peer->ibp_nid); - koibnal_close_conn (conn, -ETIMEDOUT); - koibnal_put_conn (conn); + kibnal_close_conn (conn, -ETIMEDOUT); + kibnal_put_conn (conn); /* start again now I've dropped the lock */ goto again; } } - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); } void -koibnal_terminate_conn (koib_conn_t *conn) +kibnal_terminate_conn (kib_conn_t *conn) { - unsigned long flags; int rc; - int done; CDEBUG(D_NET, "conn %p\n", conn); - LASSERT (conn->ibc_state == OPENIBNAL_CONN_DEATHROW); - conn->ibc_state = OPENIBNAL_CONN_ZOMBIE; + LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW); + conn->ibc_state = IBNAL_CONN_ZOMBIE; rc = ib_cm_disconnect (conn->ibc_comm_id); if (rc != 0) CERROR ("Error %d disconnecting conn %p -> "LPX64"\n", rc, conn, conn->ibc_peer->ibp_nid); - - /* complete blocked passive RDMAs */ - spin_lock_irqsave (&conn->ibc_lock, flags); - - while (!list_empty 
(&conn->ibc_rdma_queue)) { - koib_tx_t *tx = list_entry (conn->ibc_rdma_queue.next, - koib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma); - LASSERT (tx->tx_passive_rdma_wait); - - list_del (&tx->tx_list); - - tx->tx_passive_rdma_wait = 0; - done = (tx->tx_sending == 0); - - tx->tx_status = -ECONNABORTED; - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - if (done) - koibnal_tx_done (tx); - - spin_lock_irqsave (&conn->ibc_lock, flags); - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* Complete all blocked transmits */ - koibnal_check_sends(conn); } int -koibnal_connd (void *arg) +kibnal_connd (void *arg) { wait_queue_t wait; unsigned long flags; - koib_conn_t *conn; - koib_peer_t *peer; + kib_conn_t *conn; + kib_peer_t *peer; int timeout; int i; int peer_index = 0; unsigned long deadline = jiffies; - kportal_daemonize ("koibnal_connd"); + kportal_daemonize ("kibnal_connd"); kportal_blockallsigs (); init_waitqueue_entry (&wait, current); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); for (;;) { - if (!list_empty (&koibnal_data.koib_connd_conns)) { - conn = list_entry (koibnal_data.koib_connd_conns.next, - koib_conn_t, ibc_list); + if (!list_empty (&kibnal_data.kib_connd_conns)) { + conn = list_entry (kibnal_data.kib_connd_conns.next, + kib_conn_t, ibc_list); list_del (&conn->ibc_list); - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); switch (conn->ibc_state) { - case OPENIBNAL_CONN_DEATHROW: + case IBNAL_CONN_DEATHROW: LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID); /* Disconnect: conn becomes a zombie in the * callback and last ref reschedules it * here... 
*/ - koibnal_terminate_conn(conn); - koibnal_put_conn (conn); + kibnal_terminate_conn(conn); + kibnal_put_conn (conn); break; - case OPENIBNAL_CONN_ZOMBIE: - koibnal_destroy_conn (conn); + case IBNAL_CONN_ZOMBIE: + kibnal_destroy_conn (conn); break; default: @@ -2386,35 +2431,35 @@ koibnal_connd (void *arg) LBUG(); } - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); continue; } - if (!list_empty (&koibnal_data.koib_connd_peers)) { - peer = list_entry (koibnal_data.koib_connd_peers.next, - koib_peer_t, ibp_connd_list); + if (!list_empty (&kibnal_data.kib_connd_peers)) { + peer = list_entry (kibnal_data.kib_connd_peers.next, + kib_peer_t, ibp_connd_list); list_del_init (&peer->ibp_connd_list); - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - koibnal_connect_peer (peer); - koibnal_put_peer (peer); + kibnal_connect_peer (peer); + kibnal_put_peer (peer); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } /* shut down and nobody left to reap... */ - if (koibnal_data.koib_shutdown && - atomic_read(&koibnal_data.koib_nconns) == 0) + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) break; - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); /* careful with the jiffy wrap... */ while ((timeout = (int)(deadline - jiffies)) <= 0) { const int n = 4; const int p = 1; - int chunk = koibnal_data.koib_peer_hash_size; + int chunk = kibnal_data.kib_peer_hash_size; /* Time to check for RDMA timeouts on a few more * peers: I do checks every 'p' seconds on a @@ -2424,129 +2469,129 @@ koibnal_connd (void *arg) * connection within (n+1)/n times the timeout * interval. 
*/ - if (koibnal_tunables.koib_io_timeout > n * p) + if (kibnal_tunables.kib_io_timeout > n * p) chunk = (chunk * n * p) / - koibnal_tunables.koib_io_timeout; + kibnal_tunables.kib_io_timeout; if (chunk == 0) chunk = 1; for (i = 0; i < chunk; i++) { - koibnal_check_conns (peer_index); + kibnal_check_conns (peer_index); peer_index = (peer_index + 1) % - koibnal_data.koib_peer_hash_size; + kibnal_data.kib_peer_hash_size; } deadline += p * HZ; } - koibnal_data.koib_connd_waketime = jiffies + timeout; + kibnal_data.kib_connd_waketime = jiffies + timeout; set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&koibnal_data.koib_connd_waitq, &wait); + add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - if (!koibnal_data.koib_shutdown && - list_empty (&koibnal_data.koib_connd_conns) && - list_empty (&koibnal_data.koib_connd_peers)) + if (!kibnal_data.kib_shutdown && + list_empty (&kibnal_data.kib_connd_conns) && + list_empty (&kibnal_data.kib_connd_peers)) schedule_timeout (timeout); set_current_state (TASK_RUNNING); - remove_wait_queue (&koibnal_data.koib_connd_waitq, &wait); + remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - koibnal_thread_fini (); + kibnal_thread_fini (); return (0); } int -koibnal_scheduler(void *arg) +kibnal_scheduler(void *arg) { long id = (long)arg; char name[16]; - koib_rx_t *rx; - koib_tx_t *tx; + kib_rx_t *rx; + kib_tx_t *tx; unsigned long flags; int rc; int counter = 0; int did_something; - snprintf(name, sizeof(name), "koibnal_sd_%02ld", id); + snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); kportal_daemonize(name); kportal_blockallsigs(); - spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags); + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); for (;;) { did_something 
= 0; - while (!list_empty(&koibnal_data.koib_sched_txq)) { - tx = list_entry(koibnal_data.koib_sched_txq.next, - koib_tx_t, tx_list); + while (!list_empty(&kibnal_data.kib_sched_txq)) { + tx = list_entry(kibnal_data.kib_sched_txq.next, + kib_tx_t, tx_list); list_del(&tx->tx_list); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - koibnal_tx_done(tx); + kibnal_tx_done(tx); - spin_lock_irqsave(&koibnal_data.koib_sched_lock, + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } - if (!list_empty(&koibnal_data.koib_sched_rxq)) { - rx = list_entry(koibnal_data.koib_sched_rxq.next, - koib_rx_t, rx_list); + if (!list_empty(&kibnal_data.kib_sched_rxq)) { + rx = list_entry(kibnal_data.kib_sched_rxq.next, + kib_rx_t, rx_list); list_del(&rx->rx_list); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - koibnal_rx(rx); + kibnal_rx(rx); did_something = 1; - spin_lock_irqsave(&koibnal_data.koib_sched_lock, + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } /* shut down and no receives to complete... 
*/ - if (koibnal_data.koib_shutdown && - atomic_read(&koibnal_data.koib_nconns) == 0) + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) break; /* nothing to do or hogging CPU */ - if (!did_something || counter++ == OPENIBNAL_RESCHED) { - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, + if (!did_something || counter++ == IBNAL_RESCHED) { + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); counter = 0; if (!did_something) { rc = wait_event_interruptible( - koibnal_data.koib_sched_waitq, - !list_empty(&koibnal_data.koib_sched_txq) || - !list_empty(&koibnal_data.koib_sched_rxq) || - (koibnal_data.koib_shutdown && - atomic_read (&koibnal_data.koib_nconns) == 0)); + kibnal_data.kib_sched_waitq, + !list_empty(&kibnal_data.kib_sched_txq) || + !list_empty(&kibnal_data.kib_sched_rxq) || + (kibnal_data.kib_shutdown && + atomic_read (&kibnal_data.kib_nconns) == 0)); } else { our_cond_resched(); } - spin_lock_irqsave(&koibnal_data.koib_sched_lock, + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } } - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - koibnal_thread_fini(); + kibnal_thread_fini(); return (0); } -lib_nal_t koibnal_lib = { - libnal_data: &koibnal_data, /* NAL private data */ - libnal_send: koibnal_send, - libnal_send_pages: koibnal_send_pages, - libnal_recv: koibnal_recv, - libnal_recv_pages: koibnal_recv_pages, - libnal_dist: koibnal_dist +lib_nal_t kibnal_lib = { + libnal_data: &kibnal_data, /* NAL private data */ + libnal_send: kibnal_send, + libnal_send_pages: kibnal_send_pages, + libnal_recv: kibnal_recv, + libnal_recv_pages: kibnal_recv_pages, + libnal_dist: kibnal_dist }; diff --git a/lustre/portals/knals/qswnal/Makefile.in b/lustre/portals/knals/qswnal/Makefile.in index 60d09c8..d27240c 100644 --- a/lustre/portals/knals/qswnal/Makefile.in +++ b/lustre/portals/knals/qswnal/Makefile.in @@ -1,6 +1,6 @@ MODULES := kqswnal 
kqswnal-objs := qswnal.o qswnal_cb.o -EXTRA_PRE_CFLAGS := @QSWCPPFLAGS@ -I/usr/include +EXTRA_POST_CFLAGS := @QSWCPPFLAGS@ -I/usr/include @INCLUDE_RULES@ diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c index e7691a0..5aff4e9 100644 --- a/lustre/portals/knals/qswnal/qswnal.c +++ b/lustre/portals/knals/qswnal/qswnal.c @@ -24,9 +24,10 @@ #include "qswnal.h" -ptl_handle_ni_t kqswnal_ni; nal_t kqswnal_api; kqswnal_data_t kqswnal_data; +ptl_handle_ni_t kqswnal_ni; +kqswnal_tunables_t kqswnal_tunables; kpr_nal_interface_t kqswnal_router_interface = { kprni_nalid: QSWNAL, @@ -39,14 +40,14 @@ kpr_nal_interface_t kqswnal_router_interface = { #define QSWNAL_SYSCTL 201 #define QSWNAL_SYSCTL_OPTIMIZED_GETS 1 -#define QSWNAL_SYSCTL_COPY_SMALL_FWD 2 +#define QSWNAL_SYSCTL_OPTIMIZED_PUTS 2 static ctl_table kqswnal_ctl_table[] = { - {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets", - &kqswnal_data.kqn_optimized_gets, sizeof (int), + {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts", + &kqswnal_tunables.kqn_optimized_puts, sizeof (int), 0644, NULL, &proc_dointvec}, - {QSWNAL_SYSCTL_COPY_SMALL_FWD, "copy_small_fwd", - &kqswnal_data.kqn_copy_small_fwd, sizeof (int), + {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets", + &kqswnal_tunables.kqn_optimized_gets, sizeof (int), 0644, NULL, &proc_dointvec}, {0} }; @@ -57,88 +58,13 @@ static ctl_table kqswnal_top_ctl_table[] = { }; #endif -static int -kqswnal_forward(nal_t *nal, - int id, - void *args, size_t args_len, - void *ret, size_t ret_len) -{ - kqswnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kqn_cb; - - LASSERT (nal == &kqswnal_api); - LASSERT (k == &kqswnal_data); - LASSERT (nal_cb == &kqswnal_lib); - - lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */ - return (PTL_OK); -} - -static void -kqswnal_lock (nal_t *nal, unsigned long *flags) -{ - kqswnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kqn_cb; - - LASSERT (nal == &kqswnal_api); - LASSERT (k == &kqswnal_data); - 
LASSERT (nal_cb == &kqswnal_lib); - - nal_cb->cb_cli(nal_cb,flags); -} - -static void -kqswnal_unlock(nal_t *nal, unsigned long *flags) -{ - kqswnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kqn_cb; - - LASSERT (nal == &kqswnal_api); - LASSERT (k == &kqswnal_data); - LASSERT (nal_cb == &kqswnal_lib); - - nal_cb->cb_sti(nal_cb,flags); -} - -static int -kqswnal_shutdown(nal_t *nal, int ni) -{ - CDEBUG (D_NET, "shutdown\n"); - - LASSERT (nal == &kqswnal_api); - return (0); -} - -static void -kqswnal_yield( nal_t *nal ) -{ - CDEBUG (D_NET, "yield\n"); - - if (need_resched()) - schedule(); - return; -} - -static nal_t * -kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, - ptl_pid_t requested_pid) -{ - ptl_nid_t mynid = kqswnal_elanid2nid (kqswnal_data.kqn_elanid); - int nnids = kqswnal_data.kqn_nnodes; - - CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", mynid, nnids); - - lib_init(&kqswnal_lib, mynid, 0, nnids, ptl_size, ac_size); - - return (&kqswnal_api); -} - int kqswnal_get_tx_desc (struct portals_cfg *pcfg) { unsigned long flags; struct list_head *tmp; kqswnal_tx_t *ktx; + ptl_hdr_t *hdr; int index = pcfg->pcfg_count; int rc = -ENOENT; @@ -149,11 +75,12 @@ kqswnal_get_tx_desc (struct portals_cfg *pcfg) continue; ktx = list_entry (tmp, kqswnal_tx_t, ktx_list); + hdr = (ptl_hdr_t *)ktx->ktx_buffer; pcfg->pcfg_pbuf1 = (char *)ktx; - pcfg->pcfg_count = NTOH__u32(ktx->ktx_wire_hdr->type); - pcfg->pcfg_size = NTOH__u32(ktx->ktx_wire_hdr->payload_length); - pcfg->pcfg_nid = NTOH__u64(ktx->ktx_wire_hdr->dest_nid); + pcfg->pcfg_count = le32_to_cpu(hdr->type); + pcfg->pcfg_size = le32_to_cpu(hdr->payload_length); + pcfg->pcfg_nid = le64_to_cpu(hdr->dest_nid); pcfg->pcfg_nid2 = ktx->ktx_nid; pcfg->pcfg_misc = ktx->ktx_launcher; pcfg->pcfg_flags = (list_empty (&ktx->ktx_delayed_list) ? 
0 : 1) | @@ -182,7 +109,7 @@ kqswnal_cmd (struct portals_cfg *pcfg, void *private) kqswnal_data.kqn_nid_offset); kqswnal_data.kqn_nid_offset = pcfg->pcfg_nid - kqswnal_data.kqn_elanid; - kqswnal_lib.ni.nid = pcfg->pcfg_nid; + kqswnal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid; return (0); default: @@ -190,11 +117,22 @@ kqswnal_cmd (struct portals_cfg *pcfg, void *private) } } -void __exit -kqswnal_finalise (void) +static void +kqswnal_shutdown(nal_t *nal) { + unsigned long flags; kqswnal_tx_t *ktx; kqswnal_rx_t *krx; + int do_lib_fini = 0; + + /* NB The first ref was this module! */ + if (nal->nal_refct != 0) { + PORTAL_MODULE_UNUSE; + return; + } + + CDEBUG (D_NET, "shutdown\n"); + LASSERT (nal == &kqswnal_api); switch (kqswnal_data.kqn_init) { @@ -202,17 +140,11 @@ kqswnal_finalise (void) LASSERT (0); case KQN_INIT_ALL: -#if CONFIG_SYSCTL - if (kqswnal_data.kqn_sysctl != NULL) - unregister_sysctl_table (kqswnal_data.kqn_sysctl); -#endif - PORTAL_SYMBOL_UNREGISTER (kqswnal_ni); - kportal_nal_unregister(QSWNAL); + libcfs_nal_cmd_unregister(QSWNAL); /* fall through */ - case KQN_INIT_PTL: - PtlNIFini (kqswnal_ni); - lib_fini (&kqswnal_lib); + case KQN_INIT_LIB: + do_lib_fini = 1; /* fall through */ case KQN_INIT_DATA: @@ -223,18 +155,24 @@ kqswnal_finalise (void) } /**********************************************************************/ - /* Make router stop her calling me and fail any more call-ins */ + /* Tell router we're shutting down. Any router calls my threads + * make will now fail immediately and the router will stop calling + * into me. */ kpr_shutdown (&kqswnal_data.kqn_router); - + /**********************************************************************/ - /* flag threads we've started to terminate and wait for all to ack */ - + /* Signal the start of shutdown... 
*/ + spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags); kqswnal_data.kqn_shuttingdown = 1; - wake_up_all (&kqswnal_data.kqn_sched_waitq); + spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags); - while (atomic_read (&kqswnal_data.kqn_nthreads_running) != 0) { - CDEBUG(D_NET, "waiting for %d threads to start shutting down\n", - atomic_read (&kqswnal_data.kqn_nthreads_running)); + wake_up_all(&kqswnal_data.kqn_idletxd_waitq); + + /**********************************************************************/ + /* wait for sends that have allocated a tx desc to launch or give up */ + while (atomic_read (&kqswnal_data.kqn_pending_txs) != 0) { + CDEBUG(D_NET, "waiting for %d pending sends\n", + atomic_read (&kqswnal_data.kqn_pending_txs)); set_current_state (TASK_UNINTERRUPTIBLE); schedule_timeout (HZ); } @@ -242,18 +180,27 @@ kqswnal_finalise (void) /**********************************************************************/ /* close elan comms */ #if MULTIRAIL_EKC + /* Shut down receivers first; rx callbacks might try sending... */ if (kqswnal_data.kqn_eprx_small != NULL) ep_free_rcvr (kqswnal_data.kqn_eprx_small); if (kqswnal_data.kqn_eprx_large != NULL) ep_free_rcvr (kqswnal_data.kqn_eprx_large); + /* NB ep_free_rcvr() returns only after we've freed off all receive + * buffers (see shutdown handling in kqswnal_requeue_rx()). This + * means we must have completed any messages we passed to + * lib_parse() or kpr_fwd_start(). */ + if (kqswnal_data.kqn_eptx != NULL) ep_free_xmtr (kqswnal_data.kqn_eptx); - /* freeing the xmtr completes all txs pdq */ + /* NB ep_free_xmtr() returns only after all outstanding transmits + * have called their callback... 
*/ LASSERT(list_empty(&kqswnal_data.kqn_activetxds)); #else + /* "Old" EKC just pretends to shutdown cleanly but actually + * provides no guarantees */ if (kqswnal_data.kqn_eprx_small != NULL) ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small); @@ -272,7 +219,6 @@ kqswnal_finalise (void) #endif /**********************************************************************/ /* flag threads to terminate, wake them and wait for them to die */ - kqswnal_data.kqn_shuttingdown = 2; wake_up_all (&kqswnal_data.kqn_sched_waitq); @@ -290,10 +236,12 @@ kqswnal_finalise (void) #if MULTIRAIL_EKC LASSERT (list_empty (&kqswnal_data.kqn_readyrxds)); + LASSERT (list_empty (&kqswnal_data.kqn_delayedtxds)); + LASSERT (list_empty (&kqswnal_data.kqn_delayedfwds)); #endif /**********************************************************************/ - /* Complete any blocked forwarding packets with error + /* Complete any blocked forwarding packets, with error */ while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq)) @@ -301,23 +249,16 @@ kqswnal_finalise (void) kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next, kpr_fwd_desc_t, kprfd_list); list_del (&fwd->kprfd_list); - kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH); - } - - while (!list_empty (&kqswnal_data.kqn_delayedfwds)) - { - kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, - kpr_fwd_desc_t, kprfd_list); - list_del (&fwd->kprfd_list); - kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH); + kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -ESHUTDOWN); } /**********************************************************************/ - /* Wait for router to complete any packets I sent her - */ + /* finalise router and portals lib */ kpr_deregister (&kqswnal_data.kqn_router); + if (do_lib_fini) + lib_fini (&kqswnal_lib); /**********************************************************************/ /* Unmap message buffers and free all descriptors and buffers @@ -328,7 +269,7 @@ kqswnal_finalise (void) * 
ep_dvma_release() get fixed (and releases any mappings in the * region), we can delete all the code from here --------> */ - for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx =ktx->ktx_alloclist){ + for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) { /* If ktx has a buffer, it got mapped; unmap now. NB only * the pre-mapped stuff is still mapped since all tx descs * must be idle */ @@ -339,8 +280,8 @@ kqswnal_finalise (void) &ktx->ktx_ebuffer); } - for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx =krx->krx_alloclist){ - /* If krx_kiov[0].kiov_page got allocated, it got mapped. + for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { + /* If krx_kiov[0].kiov_page got allocated, it got mapped. * NB subsequent pages get merged */ if (krx->krx_kiov[0].kiov_page != NULL) @@ -351,10 +292,10 @@ kqswnal_finalise (void) /* <----------- to here */ if (kqswnal_data.kqn_ep_rx_nmh != NULL) - ep_dvma_release(kqswnal_data.kqn_ep,kqswnal_data.kqn_ep_rx_nmh); + ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh); if (kqswnal_data.kqn_ep_tx_nmh != NULL) - ep_dvma_release(kqswnal_data.kqn_ep,kqswnal_data.kqn_ep_tx_nmh); + ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh); #else if (kqswnal_data.kqn_eprxdmahandle != NULL) { @@ -410,8 +351,10 @@ kqswnal_finalise (void) atomic_read(&portal_kmemory)); } -static int __init -kqswnal_initialise (void) +static int +kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) { #if MULTIRAIL_EKC EP_RAILMASK all_rails = EP_RAILMASK_ALL; @@ -423,37 +366,26 @@ kqswnal_initialise (void) kqswnal_rx_t *krx; kqswnal_tx_t *ktx; int elan_page_idx; + ptl_process_id_t my_process_id; int pkmem = atomic_read(&portal_kmemory); - LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING); + LASSERT (nal == &kqswnal_api); - CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory)); + if (nal->nal_refct != 0) { + if 
(actual_limits != NULL) + *actual_limits = kqswnal_lib.libnal_ni.ni_actual_limits; + /* This module got the first ref */ + PORTAL_MODULE_USE; + return (PTL_OK); + } - kqswnal_api.forward = kqswnal_forward; - kqswnal_api.shutdown = kqswnal_shutdown; - kqswnal_api.yield = kqswnal_yield; - kqswnal_api.validate = NULL; /* our api validate is a NOOP */ - kqswnal_api.lock = kqswnal_lock; - kqswnal_api.unlock = kqswnal_unlock; - kqswnal_api.nal_data = &kqswnal_data; + LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING); - kqswnal_lib.nal_data = &kqswnal_data; + CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory)); - memset(&kqswnal_rpc_success, 0, sizeof(kqswnal_rpc_success)); - memset(&kqswnal_rpc_failed, 0, sizeof(kqswnal_rpc_failed)); -#if MULTIRAIL_EKC - kqswnal_rpc_failed.Data[0] = -ECONNREFUSED; -#else - kqswnal_rpc_failed.Status = -ECONNREFUSED; -#endif /* ensure all pointers NULL etc */ memset (&kqswnal_data, 0, sizeof (kqswnal_data)); - kqswnal_data.kqn_optimized_gets = KQSW_OPTIMIZED_GETS; - kqswnal_data.kqn_copy_small_fwd = KQSW_COPY_SMALL_FWD; - - kqswnal_data.kqn_cb = &kqswnal_lib; - INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds); INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds); INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds); @@ -468,22 +400,28 @@ kqswnal_initialise (void) spin_lock_init (&kqswnal_data.kqn_sched_lock); init_waitqueue_head (&kqswnal_data.kqn_sched_waitq); - spin_lock_init (&kqswnal_data.kqn_statelock); + /* Leave kqn_rpc_success zeroed */ +#if MULTIRAIL_EKC + kqswnal_data.kqn_rpc_failed.Data[0] = -ECONNREFUSED; +#else + kqswnal_data.kqn_rpc_failed.Status = -ECONNREFUSED; +#endif /* pointers/lists/locks initialised */ kqswnal_data.kqn_init = KQN_INIT_DATA; - + #if MULTIRAIL_EKC kqswnal_data.kqn_ep = ep_system(); if (kqswnal_data.kqn_ep == NULL) { CERROR("Can't initialise EKC\n"); - return (-ENODEV); + kqswnal_shutdown(nal); + return (PTL_IFACE_INVALID); } if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) { 
CERROR("Can't get elan ID\n"); - kqswnal_finalise(); - return (-ENODEV); + kqswnal_shutdown(nal); + return (PTL_IFACE_INVALID); } #else /**********************************************************************/ @@ -493,7 +431,8 @@ kqswnal_initialise (void) if (kqswnal_data.kqn_ep == NULL) { CERROR ("Can't get elan device 0\n"); - return (-ENODEV); + kqswnal_shutdown(nal); + return (PTL_IFACE_INVALID); } #endif @@ -508,8 +447,8 @@ kqswnal_initialise (void) if (kqswnal_data.kqn_eptx == NULL) { CERROR ("Can't allocate transmitter\n"); - kqswnal_finalise (); - return (-ENOMEM); + kqswnal_shutdown (nal); + return (PTL_NO_SPACE); } /**********************************************************************/ @@ -521,8 +460,8 @@ kqswnal_initialise (void) if (kqswnal_data.kqn_eprx_small == NULL) { CERROR ("Can't install small msg receiver\n"); - kqswnal_finalise (); - return (-ENOMEM); + kqswnal_shutdown (nal); + return (PTL_NO_SPACE); } kqswnal_data.kqn_eprx_large = ep_alloc_rcvr (kqswnal_data.kqn_ep, @@ -531,8 +470,8 @@ kqswnal_initialise (void) if (kqswnal_data.kqn_eprx_large == NULL) { CERROR ("Can't install large msg receiver\n"); - kqswnal_finalise (); - return (-ENOMEM); + kqswnal_shutdown (nal); + return (PTL_NO_SPACE); } /**********************************************************************/ @@ -546,8 +485,8 @@ kqswnal_initialise (void) EP_PERM_WRITE); if (kqswnal_data.kqn_ep_tx_nmh == NULL) { CERROR("Can't reserve tx dma space\n"); - kqswnal_finalise(); - return (-ENOMEM); + kqswnal_shutdown(nal); + return (PTL_NO_SPACE); } #else dmareq.Waitfn = DDI_DMA_SLEEP; @@ -561,8 +500,8 @@ kqswnal_initialise (void) if (rc != DDI_SUCCESS) { CERROR ("Can't reserve rx dma space\n"); - kqswnal_finalise (); - return (-ENOMEM); + kqswnal_shutdown (nal); + return (PTL_NO_SPACE); } #endif /**********************************************************************/ @@ -575,8 +514,8 @@ kqswnal_initialise (void) EP_PERM_WRITE); if (kqswnal_data.kqn_ep_tx_nmh == NULL) { CERROR("Can't reserve rx 
dma space\n"); - kqswnal_finalise(); - return (-ENOMEM); + kqswnal_shutdown(nal); + return (PTL_NO_SPACE); } #else dmareq.Waitfn = DDI_DMA_SLEEP; @@ -591,8 +530,8 @@ kqswnal_initialise (void) if (rc != DDI_SUCCESS) { CERROR ("Can't reserve rx dma space\n"); - kqswnal_finalise (); - return (-ENOMEM); + kqswnal_shutdown (nal); + return (PTL_NO_SPACE); } #endif /**********************************************************************/ @@ -606,18 +545,19 @@ kqswnal_initialise (void) PORTAL_ALLOC (ktx, sizeof(*ktx)); if (ktx == NULL) { - kqswnal_finalise (); - return (-ENOMEM); + kqswnal_shutdown (nal); + return (PTL_NO_SPACE); } + memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */ ktx->ktx_alloclist = kqswnal_data.kqn_txds; kqswnal_data.kqn_txds = ktx; PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); if (ktx->ktx_buffer == NULL) { - kqswnal_finalise (); - return (-ENOMEM); + kqswnal_shutdown (nal); + return (PTL_NO_SPACE); } /* Map pre-allocated buffer NOW, to save latency on transmit */ @@ -640,6 +580,9 @@ kqswnal_initialise (void) INIT_LIST_HEAD (&ktx->ktx_delayed_list); ktx->ktx_state = KTX_IDLE; +#if MULTIRAIL_EKC + ktx->ktx_rail = -1; /* unset rail */ +#endif ktx->ktx_isnblk = (i >= KQSW_NTXMSGS); list_add_tail (&ktx->ktx_list, ktx->ktx_isnblk ? 
&kqswnal_data.kqn_nblk_idletxds : @@ -648,7 +591,6 @@ kqswnal_initialise (void) /**********************************************************************/ /* Allocate/Initialise receive descriptors */ - kqswnal_data.kqn_rxds = NULL; elan_page_idx = 0; for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) @@ -662,10 +604,11 @@ kqswnal_initialise (void) PORTAL_ALLOC(krx, sizeof(*krx)); if (krx == NULL) { - kqswnal_finalise(); - return (-ENOSPC); + kqswnal_shutdown(nal); + return (PTL_NO_SPACE); } + memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */ krx->krx_alloclist = kqswnal_data.kqn_rxds; kqswnal_data.kqn_rxds = krx; @@ -686,8 +629,8 @@ kqswnal_initialise (void) struct page *page = alloc_page(GFP_KERNEL); if (page == NULL) { - kqswnal_finalise (); - return (-ENOMEM); + kqswnal_shutdown (nal); + return (PTL_NO_SPACE); } krx->krx_kiov[j].kiov_page = page; @@ -731,21 +674,26 @@ kqswnal_initialise (void) /**********************************************************************/ /* Network interface ready to initialise */ - rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni); - if (rc != 0) + my_process_id.nid = kqswnal_elanid2nid(kqswnal_data.kqn_elanid); + my_process_id.pid = requested_pid; + + rc = lib_init(&kqswnal_lib, nal, my_process_id, + requested_limits, actual_limits); + if (rc != PTL_OK) { - CERROR ("PtlNIInit failed %d\n", rc); - kqswnal_finalise (); - return (-ENOMEM); + CERROR ("lib_init failed %d\n", rc); + kqswnal_shutdown (nal); + return (rc); } - kqswnal_data.kqn_init = KQN_INIT_PTL; + kqswnal_data.kqn_init = KQN_INIT_LIB; /**********************************************************************/ /* Queue receives, now that it's OK to run their completion callbacks */ - for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx =krx->krx_alloclist){ + for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { /* NB this enqueue can allocate/sleep (attr == 0) */ + krx->krx_state = KRX_POSTED; #if MULTIRAIL_EKC rc = 
ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, &krx->krx_elanbuffer, 0); @@ -757,8 +705,8 @@ kqswnal_initialise (void) if (rc != EP_SUCCESS) { CERROR ("failed ep_queue_receive %d\n", rc); - kqswnal_finalise (); - return (-ENOMEM); + kqswnal_shutdown (nal); + return (PTL_FAIL); } } @@ -769,8 +717,8 @@ kqswnal_initialise (void) if (rc != 0) { CERROR ("failed to spawn scheduling thread: %d\n", rc); - kqswnal_finalise (); - return (rc); + kqswnal_shutdown (nal); + return (PTL_FAIL); } } @@ -779,19 +727,13 @@ kqswnal_initialise (void) rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface); CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc); - rc = kportal_nal_register (QSWNAL, &kqswnal_cmd, NULL); + rc = libcfs_nal_cmd_register (QSWNAL, &kqswnal_cmd, NULL); if (rc != 0) { CERROR ("Can't initialise command interface (rc = %d)\n", rc); - kqswnal_finalise (); - return (rc); + kqswnal_shutdown (nal); + return (PTL_FAIL); } -#if CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - kqswnal_data.kqn_sysctl = register_sysctl_table (kqswnal_top_ctl_table, 0); -#endif - - PORTAL_SYMBOL_REGISTER(kqswnal_ni); kqswnal_data.kqn_init = KQN_INIT_ALL; printk(KERN_INFO "Lustre: Routing QSW NAL loaded on node %d of %d " @@ -800,9 +742,55 @@ kqswnal_initialise (void) kpr_routing (&kqswnal_data.kqn_router) ? 
"enabled" : "disabled", pkmem); - return (0); + return (PTL_OK); +} + +void __exit +kqswnal_finalise (void) +{ +#if CONFIG_SYSCTL + if (kqswnal_tunables.kqn_sysctl != NULL) + unregister_sysctl_table (kqswnal_tunables.kqn_sysctl); +#endif + PtlNIFini(kqswnal_ni); + + ptl_unregister_nal(QSWNAL); } +static int __init +kqswnal_initialise (void) +{ + int rc; + + kqswnal_api.nal_ni_init = kqswnal_startup; + kqswnal_api.nal_ni_fini = kqswnal_shutdown; + + /* Initialise dynamic tunables to defaults once only */ + kqswnal_tunables.kqn_optimized_puts = KQSW_OPTIMIZED_PUTS; + kqswnal_tunables.kqn_optimized_gets = KQSW_OPTIMIZED_GETS; + + rc = ptl_register_nal(QSWNAL, &kqswnal_api); + if (rc != PTL_OK) { + CERROR("Can't register QSWNAL: %d\n", rc); + return (-ENOMEM); /* or something... */ + } + + /* Pure gateways, and the workaround for 'EKC blocks forever until + * the service is active' want the NAL started up at module load + * time... */ + rc = PtlNIInit(QSWNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kqswnal_ni); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { + ptl_unregister_nal(QSWNAL); + return (-ENODEV); + } + +#if CONFIG_SYSCTL + /* Press on regardless even if registering sysctl doesn't work */ + kqswnal_tunables.kqn_sysctl = + register_sysctl_table (kqswnal_top_ctl_table, 0); +#endif + return (0); +} MODULE_AUTHOR("Cluster File Systems, Inc. "); MODULE_DESCRIPTION("Kernel Quadrics/Elan NAL v1.01"); @@ -810,5 +798,3 @@ MODULE_LICENSE("GPL"); module_init (kqswnal_initialise); module_exit (kqswnal_finalise); - -EXPORT_SYMBOL (kqswnal_ni); diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h index f96893f..b08d710 100644 --- a/lustre/portals/knals/qswnal/qswnal.h +++ b/lustre/portals/knals/qswnal/qswnal.h @@ -18,7 +18,7 @@ * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Basic library routines. + * Basic library routines. 
* */ @@ -99,17 +99,18 @@ typedef unsigned long kqsw_csum_t; #define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ #define KQSW_NTXMSGS 8 /* # normal transmit messages */ -#define KQSW_NNBLK_TXMSGS 256 /* # reserved transmit messages if can't block */ +#define KQSW_NNBLK_TXMSGS 512 /* # reserved transmit messages if can't block */ #define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ -#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ +#define KQSW_EP_ENVELOPES_LARGE 256 /* # large ep envelopes */ #define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ #define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ #define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ -#define KQSW_OPTIMIZED_GETS 1 /* optimized gets? */ +#define KQSW_OPTIMIZED_GETS 1 /* optimize gets >= this size */ +#define KQSW_OPTIMIZED_PUTS (32<<10) /* optimize puts >= this size */ #define KQSW_COPY_SMALL_FWD 0 /* copy small fwd messages to pre-mapped buffer? 
*/ /* @@ -157,12 +158,18 @@ typedef struct kqswnal_rx int krx_npages; /* # pages in receive buffer */ int krx_nob; /* Number Of Bytes received into buffer */ int krx_rpc_reply_needed; /* peer waiting for EKC RPC reply */ - int krx_rpc_reply_sent; /* rpc reply sent */ + int krx_rpc_reply_status; /* what status to send */ + int krx_state; /* what this RX is doing */ atomic_t krx_refcount; /* how to tell when rpc is done */ kpr_fwd_desc_t krx_fwd; /* embedded forwarding descriptor */ ptl_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */ } kqswnal_rx_t; +#define KRX_POSTED 1 /* receiving */ +#define KRX_PARSE 2 /* ready to be parsed */ +#define KRX_COMPLETING 3 /* waiting to be completed */ + + typedef struct kqswnal_tx { struct list_head ktx_list; /* enqueue idle/active */ @@ -176,16 +183,16 @@ typedef struct kqswnal_tx int ktx_nmappedpages; /* # pages mapped for current message */ int ktx_port; /* destination ep port */ ptl_nid_t ktx_nid; /* destination node */ - void *ktx_args[2]; /* completion passthru */ + void *ktx_args[3]; /* completion passthru */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */ /* debug/info fields */ pid_t ktx_launcher; /* pid of launching process */ - ptl_hdr_t *ktx_wire_hdr; /* portals header (wire endian) */ int ktx_nfrag; /* # message frags */ #if MULTIRAIL_EKC + int ktx_rail; /* preferred rail */ EP_NMD ktx_ebuffer; /* elan mapping of ktx_buffer */ EP_NMD ktx_frags[EP_MAXFRAG];/* elan mapping of msg frags */ #else @@ -195,23 +202,28 @@ typedef struct kqswnal_tx } kqswnal_tx_t; #define KTX_IDLE 0 /* on kqn_(nblk_)idletxds */ -#define KTX_SENDING 1 /* local send */ -#define KTX_FORWARDING 2 /* routing a packet */ -#define KTX_GETTING 3 /* local optimised get */ +#define KTX_FORWARDING 1 /* sending a forwarded packet */ +#define KTX_SENDING 2 /* normal send */ +#define KTX_GETTING 3 /* sending optimised get */ +#define 
KTX_PUTTING 4 /* sending optimised put */ +#define KTX_RDMAING 5 /* handling optimised put/get */ + +typedef struct +{ + /* dynamic tunables... */ + int kqn_optimized_puts; /* optimized PUTs? */ + int kqn_optimized_gets; /* optimized GETs? */ +#if CONFIG_SYSCTL + struct ctl_table_header *kqn_sysctl; /* sysctl interface */ +#endif +} kqswnal_tunables_t; typedef struct { char kqn_init; /* what's been initialised */ char kqn_shuttingdown; /* I'm trying to shut down */ - atomic_t kqn_nthreads; /* # threads not terminated */ - atomic_t kqn_nthreads_running;/* # threads still running */ - - int kqn_optimized_gets; /* optimized GETs? */ - int kqn_copy_small_fwd; /* fwd small msgs from pre-allocated buffer? */ + atomic_t kqn_nthreads; /* # threads running */ -#if CONFIG_SYSCTL - struct ctl_table_header *kqn_sysctl; /* sysctl interface */ -#endif kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */ kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */ @@ -221,6 +233,7 @@ typedef struct spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ wait_queue_head_t kqn_idletxd_waitq; /* sender blocks here waiting for idle txd */ struct list_head kqn_idletxd_fwdq; /* forwarded packets block here waiting for idle txd */ + atomic_t kqn_pending_txs; /* # transmits being prepped */ spinlock_t kqn_sched_lock; /* serialise packet schedulers */ wait_queue_head_t kqn_sched_waitq; /* scheduler blocks here */ @@ -229,8 +242,6 @@ typedef struct struct list_head kqn_delayedfwds; /* delayed forwards */ struct list_head kqn_delayedtxds; /* delayed transmits */ - spinlock_t kqn_statelock; /* cb_cli/cb_sti */ - nal_cb_t *kqn_cb; /* -> kqswnal_lib */ #if MULTIRAIL_EKC EP_SYS *kqn_ep; /* elan system */ EP_NMH *kqn_ep_tx_nmh; /* elan reserved tx vaddrs */ @@ -248,28 +259,27 @@ typedef struct ptl_nid_t kqn_nid_offset; /* this cluster's NID offset */ int kqn_nnodes; /* this cluster's size */ int kqn_elanid; /* this nodes's elan ID */ + + EP_STATUSBLK kqn_rpc_success; /* 
preset RPC reply status blocks */ + EP_STATUSBLK kqn_rpc_failed; } kqswnal_data_t; /* kqn_init state */ #define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ #define KQN_INIT_DATA 1 -#define KQN_INIT_PTL 2 +#define KQN_INIT_LIB 2 #define KQN_INIT_ALL 3 -extern nal_cb_t kqswnal_lib; -extern nal_t kqswnal_api; -extern kqswnal_data_t kqswnal_data; - -/* global pre-prepared replies to keep off the stack */ -extern EP_STATUSBLK kqswnal_rpc_success; -extern EP_STATUSBLK kqswnal_rpc_failed; +extern lib_nal_t kqswnal_lib; +extern nal_t kqswnal_api; +extern kqswnal_tunables_t kqswnal_tunables; +extern kqswnal_data_t kqswnal_data; extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg); extern void kqswnal_rxhandler(EP_RXD *rxd); extern int kqswnal_scheduler (void *); extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); -extern void kqswnal_dma_reply_complete (EP_RXD *rxd); -extern void kqswnal_requeue_rx (kqswnal_rx_t *krx); +extern void kqswnal_rx_done (kqswnal_rx_t *krx); static inline ptl_nid_t kqswnal_elanid2nid (int elanid) @@ -288,6 +298,12 @@ kqswnal_nid2elanid (ptl_nid_t nid) return (nid - kqswnal_data.kqn_nid_offset); } +static inline ptl_nid_t +kqswnal_rx_nid(kqswnal_rx_t *krx) +{ + return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd))); +} + static inline int kqswnal_pages_spanned (void *base, int nob) { @@ -310,11 +326,11 @@ static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob) } #endif -static inline void kqswnal_rx_done (kqswnal_rx_t *krx) +static inline void kqswnal_rx_decref (kqswnal_rx_t *krx) { LASSERT (atomic_read (&krx->krx_refcount) > 0); if (atomic_dec_and_test (&krx->krx_refcount)) - kqswnal_requeue_rx(krx); + kqswnal_rx_done(krx); } #if MULTIRAIL_EKC diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c index 08453a0..97b5a26 100644 --- a/lustre/portals/knals/qswnal/qswnal_cb.c +++ b/lustre/portals/knals/qswnal/qswnal_cb.c @@ -26,91 +26,14 @@ 
#include "qswnal.h" -EP_STATUSBLK kqswnal_rpc_success; -EP_STATUSBLK kqswnal_rpc_failed; - /* * LIB functions follow * */ -static ptl_err_t -kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, - size_t len) -{ - CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n", - nal->ni.nid, len, src_addr, dst_addr ); - memcpy( dst_addr, src_addr, len ); - - return (PTL_OK); -} - -static ptl_err_t -kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, - size_t len) -{ - CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n", - nal->ni.nid, len, src_addr, dst_addr ); - memcpy( dst_addr, src_addr, len ); - - return (PTL_OK); -} - -static void * -kqswnal_malloc(nal_cb_t *nal, size_t len) -{ - void *buf; - - PORTAL_ALLOC(buf, len); - return (buf); -} - -static void -kqswnal_free(nal_cb_t *nal, void *buf, size_t len) -{ - PORTAL_FREE(buf, len); -} - -static void -kqswnal_printf (nal_cb_t * nal, const char *fmt, ...) -{ - va_list ap; - char msg[256]; - - va_start (ap, fmt); - vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ - va_end (ap); - - msg[sizeof (msg) - 1] = 0; /* ensure terminated */ - - CDEBUG (D_NET, "%s", msg); -} - -#if (defined(CONFIG_SPARC32) || defined(CONFIG_SPARC64)) -# error "Can't save/restore irq contexts in different procedures" -#endif - -static void -kqswnal_cli(nal_cb_t *nal, unsigned long *flags) -{ - kqswnal_data_t *data= nal->nal_data; - - spin_lock_irqsave(&data->kqn_statelock, *flags); -} - - -static void -kqswnal_sti(nal_cb_t *nal, unsigned long *flags) -{ - kqswnal_data_t *data= nal->nal_data; - - spin_unlock_irqrestore(&data->kqn_statelock, *flags); -} - - static int -kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +kqswnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { - if (nid == nal->ni.nid) + if (nid == nal->libnal_ni.ni_pid.nid) *dist = 0; /* it's me */ else if (kqswnal_nid2elanid (nid) >= 0) *dist = 1; /* it's my peer */ @@ -136,6 +59,8 @@ 
kqswnal_unmap_tx (kqswnal_tx_t *ktx) { #if MULTIRAIL_EKC int i; + + ktx->ktx_rail = -1; /* unset rail */ #endif if (ktx->ktx_nmappedpages == 0) @@ -174,10 +99,13 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_ char *ptr; #if MULTIRAIL_EKC EP_RAILMASK railmask; - int rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx, - EP_RAILMASK_ALL, - kqswnal_nid2elanid(ktx->ktx_nid)); - + int rail; + + if (ktx->ktx_rail < 0) + ktx->ktx_rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx, + EP_RAILMASK_ALL, + kqswnal_nid2elanid(ktx->ktx_nid)); + rail = ktx->ktx_rail; if (rail < 0) { CERROR("No rails available for "LPX64"\n", ktx->ktx_nid); return (-ENETDOWN); @@ -201,11 +129,12 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_ do { int fraglen = kiov->kiov_len - offset; - /* nob exactly spans the iovs */ - LASSERT (fraglen <= nob); - /* each frag fits in a page */ + /* each page frag is contained in one page */ LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE); + if (fraglen > nob) + fraglen = nob; + nmapped++; if (nmapped > maxmapped) { CERROR("Can't map message in %d pages (max %d)\n", @@ -291,10 +220,13 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, uint32_t basepage = ktx->ktx_basepage + nmapped; #if MULTIRAIL_EKC EP_RAILMASK railmask; - int rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx, - EP_RAILMASK_ALL, - kqswnal_nid2elanid(ktx->ktx_nid)); + int rail; + if (ktx->ktx_rail < 0) + ktx->ktx_rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx, + EP_RAILMASK_ALL, + kqswnal_nid2elanid(ktx->ktx_nid)); + rail = ktx->ktx_rail; if (rail < 0) { CERROR("No rails available for "LPX64"\n", ktx->ktx_nid); return (-ENETDOWN); @@ -317,11 +249,12 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, do { int fraglen = iov->iov_len - offset; - long npages = kqswnal_pages_spanned (iov->iov_base, fraglen); - - /* nob exactly spans the iovs */ - LASSERT (fraglen <= nob); + long npages; + if (fraglen > nob) + 
fraglen = nob; + npages = kqswnal_pages_spanned (iov->iov_base, fraglen); + nmapped += npages; if (nmapped > maxmapped) { CERROR("Can't map message in %d pages (max %d)\n", @@ -415,7 +348,8 @@ kqswnal_put_idle_tx (kqswnal_tx_t *ktx) list_add (&ktx->ktx_list, &kqswnal_data.kqn_idletxds); /* anything blocking for a tx descriptor? */ - if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */ + if (!kqswnal_data.kqn_shuttingdown && + !list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */ { CDEBUG(D_NET,"wakeup fwd\n"); @@ -449,6 +383,9 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) for (;;) { spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + if (kqswnal_data.kqn_shuttingdown) + break; + /* "normal" descriptor is free */ if (!list_empty (&kqswnal_data.kqn_idletxds)) { ktx = list_entry (kqswnal_data.kqn_idletxds.next, @@ -456,14 +393,8 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) break; } - /* "normal" descriptor pool is empty */ - - if (fwd != NULL) { /* forwarded packet => queue for idle txd */ - CDEBUG (D_NET, "blocked fwd [%p]\n", fwd); - list_add_tail (&fwd->kprfd_list, - &kqswnal_data.kqn_idletxd_fwdq); + if (fwd != NULL) /* forwarded packet? 
*/ break; - } /* doing a local transmit */ if (!may_block) { @@ -483,13 +414,20 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) CDEBUG (D_NET, "blocking for tx desc\n"); wait_event (kqswnal_data.kqn_idletxd_waitq, - !list_empty (&kqswnal_data.kqn_idletxds)); + !list_empty (&kqswnal_data.kqn_idletxds) || + kqswnal_data.kqn_shuttingdown); } if (ktx != NULL) { list_del (&ktx->ktx_list); list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds); ktx->ktx_launcher = current->pid; + atomic_inc(&kqswnal_data.kqn_pending_txs); + } else if (fwd != NULL) { + /* queue forwarded packet until idle txd available */ + CDEBUG (D_NET, "blocked fwd [%p]\n", fwd); + list_add_tail (&fwd->kprfd_list, + &kqswnal_data.kqn_idletxd_fwdq); } spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); @@ -503,40 +441,29 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) void kqswnal_tx_done (kqswnal_tx_t *ktx, int error) { - lib_msg_t *msg; - lib_msg_t *repmsg = NULL; - switch (ktx->ktx_state) { case KTX_FORWARDING: /* router asked me to forward this packet */ kpr_fwd_done (&kqswnal_data.kqn_router, (kpr_fwd_desc_t *)ktx->ktx_args[0], error); break; - case KTX_SENDING: /* packet sourced locally */ - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], + case KTX_RDMAING: /* optimized GET/PUT handled */ + case KTX_PUTTING: /* optimized PUT sent */ + case KTX_SENDING: /* normal send */ + lib_finalize (&kqswnal_lib, NULL, (lib_msg_t *)ktx->ktx_args[1], - (error == 0) ? PTL_OK : - (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL); + (error == 0) ? PTL_OK : PTL_FAIL); break; - case KTX_GETTING: /* Peer has DMA-ed direct? 
*/ - msg = (lib_msg_t *)ktx->ktx_args[1]; - - if (error == 0) { - repmsg = lib_fake_reply_msg (&kqswnal_lib, - ktx->ktx_nid, msg->md); - if (repmsg == NULL) - error = -ENOMEM; - } - - if (error == 0) { - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], - msg, PTL_OK); - lib_finalize (&kqswnal_lib, NULL, repmsg, PTL_OK); - } else { - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg, - (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL); - } + case KTX_GETTING: /* optimized GET sent & REPLY received */ + /* Complete the GET with success since we can't avoid + * delivering a REPLY event; we committed to it when we + * launched the GET */ + lib_finalize (&kqswnal_lib, NULL, + (lib_msg_t *)ktx->ktx_args[1], PTL_OK); + lib_finalize (&kqswnal_lib, NULL, + (lib_msg_t *)ktx->ktx_args[2], + (error == 0) ? PTL_OK : PTL_FAIL); break; default: @@ -564,16 +491,27 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) kqswnal_notify_peer_down(ktx); status = -EHOSTDOWN; - } else if (ktx->ktx_state == KTX_GETTING) { - /* RPC completed OK; what did our peer put in the status + } else switch (ktx->ktx_state) { + + case KTX_GETTING: + case KTX_PUTTING: + /* RPC completed OK; but what did our peer put in the status * block? */ #if MULTIRAIL_EKC status = ep_txd_statusblk(txd)->Data[0]; #else status = ep_txd_statusblk(txd)->Status; #endif - } else { + break; + + case KTX_FORWARDING: + case KTX_SENDING: status = 0; + break; + + default: + LBUG(); + break; } kqswnal_tx_done (ktx, status); @@ -590,22 +528,29 @@ kqswnal_launch (kqswnal_tx_t *ktx) ktx->ktx_launchtime = jiffies; + if (kqswnal_data.kqn_shuttingdown) + return (-ESHUTDOWN); + LASSERT (dest >= 0); /* must be a peer */ - if (ktx->ktx_state == KTX_GETTING) { - /* NB ktx_frag[0] is the GET hdr + kqswnal_remotemd_t. 
The - * other frags are the GET sink which we obviously don't - * send here :) */ + #if MULTIRAIL_EKC + if (ktx->ktx_nmappedpages != 0) + attr = EP_SET_PREFRAIL(attr, ktx->ktx_rail); +#endif + + switch (ktx->ktx_state) { + case KTX_GETTING: + case KTX_PUTTING: + /* NB ktx_frag[0] is the GET/PUT hdr + kqswnal_remotemd_t. + * The other frags are the payload, awaiting RDMA */ rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest, ktx->ktx_port, attr, kqswnal_txhandler, ktx, NULL, ktx->ktx_frags, 1); -#else - rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest, - ktx->ktx_port, attr, kqswnal_txhandler, - ktx, NULL, ktx->ktx_frags, 1); -#endif - } else { + break; + + case KTX_FORWARDING: + case KTX_SENDING: #if MULTIRAIL_EKC rc = ep_transmit_message(kqswnal_data.kqn_eptx, dest, ktx->ktx_port, attr, @@ -617,6 +562,12 @@ kqswnal_launch (kqswnal_tx_t *ktx) kqswnal_txhandler, ktx, ktx->ktx_frags, ktx->ktx_nfrag); #endif + break; + + default: + LBUG(); + rc = -EINVAL; /* no compiler warning please */ + break; } switch (rc) { @@ -624,8 +575,6 @@ kqswnal_launch (kqswnal_tx_t *ktx) return (0); case EP_ENOMEM: /* can't allocate ep txd => queue for later */ - LASSERT (in_interrupt()); - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); @@ -641,6 +590,7 @@ kqswnal_launch (kqswnal_tx_t *ktx) } } +#if 0 static char * hdr_type_string (ptl_hdr_t *hdr) { @@ -664,42 +614,42 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr) char *type_str = hdr_type_string (hdr); CERROR("P3 Header at %p of type %s length %d\n", hdr, type_str, - NTOH__u32(hdr->payload_length)); - CERROR(" From nid/pid "LPU64"/%u\n", NTOH__u64(hdr->src_nid), - NTOH__u32(hdr->src_pid)); - CERROR(" To nid/pid "LPU64"/%u\n", NTOH__u64(hdr->dest_nid), - NTOH__u32(hdr->dest_pid)); + le32_to_cpu(hdr->payload_length)); + CERROR(" From nid/pid "LPU64"/%u\n", le64_to_cpu(hdr->src_nid), + le32_to_cpu(hdr->src_pid)); + CERROR(" To nid/pid "LPU64"/%u\n", 
le64_to_cpu(hdr->dest_nid), + le32_to_cpu(hdr->dest_pid)); - switch (NTOH__u32(hdr->type)) { + switch (le32_to_cpu(hdr->type)) { case PTL_MSG_PUT: CERROR(" Ptl index %d, ack md "LPX64"."LPX64", " "match bits "LPX64"\n", - NTOH__u32 (hdr->msg.put.ptl_index), + le32_to_cpu(hdr->msg.put.ptl_index), hdr->msg.put.ack_wmd.wh_interface_cookie, hdr->msg.put.ack_wmd.wh_object_cookie, - NTOH__u64 (hdr->msg.put.match_bits)); + le64_to_cpu(hdr->msg.put.match_bits)); CERROR(" offset %d, hdr data "LPX64"\n", - NTOH__u32(hdr->msg.put.offset), + le32_to_cpu(hdr->msg.put.offset), hdr->msg.put.hdr_data); break; case PTL_MSG_GET: CERROR(" Ptl index %d, return md "LPX64"."LPX64", " "match bits "LPX64"\n", - NTOH__u32 (hdr->msg.get.ptl_index), + le32_to_cpu(hdr->msg.get.ptl_index), hdr->msg.get.return_wmd.wh_interface_cookie, hdr->msg.get.return_wmd.wh_object_cookie, hdr->msg.get.match_bits); CERROR(" Length %d, src offset %d\n", - NTOH__u32 (hdr->msg.get.sink_length), - NTOH__u32 (hdr->msg.get.src_offset)); + le32_to_cpu(hdr->msg.get.sink_length), + le32_to_cpu(hdr->msg.get.src_offset)); break; case PTL_MSG_ACK: CERROR(" dst md "LPX64"."LPX64", manipulated length %d\n", hdr->msg.ack.dst_wmd.wh_interface_cookie, hdr->msg.ack.dst_wmd.wh_object_cookie, - NTOH__u32 (hdr->msg.ack.mlength)); + le32_to_cpu(hdr->msg.ack.mlength)); break; case PTL_MSG_REPLY: @@ -709,6 +659,7 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr) } } /* end of print_hdr() */ +#endif #if !MULTIRAIL_EKC void @@ -770,114 +721,297 @@ kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv, CERROR ("DATAVEC too small\n"); return (-E2BIG); } +#else +int +kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag, + int nrfrag, EP_NMD *rfrag) +{ + int i; + + if (nlfrag != nrfrag) { + CERROR("Can't cope with unequal # frags: %d local %d remote\n", + nlfrag, nrfrag); + return (-EINVAL); + } + + for (i = 0; i < nlfrag; i++) + if (lfrag[i].nmd_len != rfrag[i].nmd_len) { + CERROR("Can't cope with unequal frags %d(%d):" + " %d local %d remote\n", + i, nlfrag, 
lfrag[i].nmd_len, rfrag[i].nmd_len); + return (-EINVAL); + } + + return (0); +} #endif -int -kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, - struct iovec *iov, ptl_kiov_t *kiov, - int offset, int nob) +kqswnal_remotemd_t * +kqswnal_parse_rmd (kqswnal_rx_t *krx, int type, ptl_nid_t expected_nid) { - kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page); + ptl_hdr_t *hdr = (ptl_hdr_t *)buffer; kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE); - int rc; -#if MULTIRAIL_EKC - int i; -#else - EP_DATAVEC datav[EP_MAXFRAG]; - int ndatav; -#endif - LASSERT (krx->krx_rpc_reply_needed); - LASSERT ((iov == NULL) != (kiov == NULL)); + ptl_nid_t nid = kqswnal_rx_nid(krx); + + /* Note (1) lib_parse has already flipped hdr. + * (2) RDMA addresses are sent in native endian-ness. When + * EKC copes with different endian nodes, I'll fix this (and + * eat my hat :) */ + + LASSERT (krx->krx_nob >= sizeof(*hdr)); + + if (hdr->type != type) { + CERROR ("Unexpected optimized get/put type %d (%d expected)" + "from "LPX64"\n", hdr->type, type, nid); + return (NULL); + } + + if (hdr->src_nid != nid) { + CERROR ("Unexpected optimized get/put source NID " + LPX64" from "LPX64"\n", hdr->src_nid, nid); + return (NULL); + } + + LASSERT (nid == expected_nid); - /* see kqswnal_sendmsg comment regarding endian-ness */ if (buffer + krx->krx_nob < (char *)(rmd + 1)) { /* msg too small to discover rmd size */ CERROR ("Incoming message [%d] too small for RMD (%d needed)\n", krx->krx_nob, (int)(((char *)(rmd + 1)) - buffer)); - return (-EINVAL); + return (NULL); } - + if (buffer + krx->krx_nob < (char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) { /* rmd doesn't fit in the incoming message */ CERROR ("Incoming message [%d] too small for RMD[%d] (%d needed)\n", krx->krx_nob, rmd->kqrmd_nfrag, (int)(((char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) - buffer)); - return (-EINVAL); + return (NULL); } - /* Map the source data... 
*/ + return (rmd); +} + +void +kqswnal_rdma_store_complete (EP_RXD *rxd) +{ + int status = ep_rxd_status(rxd); + kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); + kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; + + CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR, + "rxd %p, ktx %p, status %d\n", rxd, ktx, status); + + LASSERT (ktx->ktx_state == KTX_RDMAING); + LASSERT (krx->krx_rxd == rxd); + LASSERT (krx->krx_rpc_reply_needed); + + krx->krx_rpc_reply_needed = 0; + kqswnal_rx_decref (krx); + + /* free ktx & finalize() its lib_msg_t */ + kqswnal_tx_done(ktx, (status == EP_SUCCESS) ? 0 : -ECONNABORTED); +} + +void +kqswnal_rdma_fetch_complete (EP_RXD *rxd) +{ + /* Completed fetching the PUT data */ + int status = ep_rxd_status(rxd); + kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); + kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; + unsigned long flags; + + CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR, + "rxd %p, ktx %p, status %d\n", rxd, ktx, status); + + LASSERT (ktx->ktx_state == KTX_RDMAING); + LASSERT (krx->krx_rxd == rxd); + LASSERT (krx->krx_rpc_reply_needed); + + /* Set the RPC completion status */ + status = (status == EP_SUCCESS) ? 
0 : -ECONNABORTED; + krx->krx_rpc_reply_status = status; + + /* free ktx & finalize() its lib_msg_t */ + kqswnal_tx_done(ktx, status); + + if (!in_interrupt()) { + /* OK to complete the RPC now (iff I had the last ref) */ + kqswnal_rx_decref (krx); + return; + } + + LASSERT (krx->krx_state == KRX_PARSE); + krx->krx_state = KRX_COMPLETING; + + /* Complete the RPC in thread context */ + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds); + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); +} + +int +kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type, + int niov, struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t len) +{ + kqswnal_remotemd_t *rmd; + kqswnal_tx_t *ktx; + int eprc; + int rc; +#if !MULTIRAIL_EKC + EP_DATAVEC datav[EP_MAXFRAG]; + int ndatav; +#endif + + LASSERT (type == PTL_MSG_GET || type == PTL_MSG_PUT); + /* Not both mapped and paged payload */ + LASSERT (iov == NULL || kiov == NULL); + /* RPC completes with failure by default */ + LASSERT (krx->krx_rpc_reply_needed); + LASSERT (krx->krx_rpc_reply_status != 0); + + rmd = kqswnal_parse_rmd(krx, type, libmsg->ev.initiator.nid); + if (rmd == NULL) + return (-EPROTO); + + if (len == 0) { + /* data got truncated to nothing. 
*/ + lib_finalize(&kqswnal_lib, krx, libmsg, PTL_OK); + /* Let kqswnal_rx_done() complete the RPC with success */ + krx->krx_rpc_reply_status = 0; + return (0); + } + + /* NB I'm using 'ktx' just to map the local RDMA buffers; I'm not + actually sending a portals message with it */ + ktx = kqswnal_get_idle_tx(NULL, 0); + if (ktx == NULL) { + CERROR ("Can't get txd for RDMA with "LPX64"\n", + libmsg->ev.initiator.nid); + return (-ENOMEM); + } + + ktx->ktx_state = KTX_RDMAING; + ktx->ktx_nid = libmsg->ev.initiator.nid; + ktx->ktx_args[0] = krx; + ktx->ktx_args[1] = libmsg; + +#if MULTIRAIL_EKC + /* Map on the rail the RPC prefers */ + ktx->ktx_rail = ep_rcvr_prefrail(krx->krx_eprx, + ep_rxd_railmask(krx->krx_rxd)); +#endif + + /* Start mapping at offset 0 (we're not mapping any headers) */ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; + if (kiov != NULL) - rc = kqswnal_map_tx_kiov (ktx, offset, nob, nfrag, kiov); + rc = kqswnal_map_tx_kiov(ktx, offset, len, niov, kiov); else - rc = kqswnal_map_tx_iov (ktx, offset, nob, nfrag, iov); + rc = kqswnal_map_tx_iov(ktx, offset, len, niov, iov); if (rc != 0) { - CERROR ("Can't map source data: %d\n", rc); - return (rc); + CERROR ("Can't map local RDMA data: %d\n", rc); + goto out; } #if MULTIRAIL_EKC - if (ktx->ktx_nfrag != rmd->kqrmd_nfrag) { - CERROR("Can't cope with unequal # frags: %d local %d remote\n", - ktx->ktx_nfrag, rmd->kqrmd_nfrag); - return (-EINVAL); + rc = kqswnal_check_rdma (ktx->ktx_nfrag, ktx->ktx_frags, + rmd->kqrmd_nfrag, rmd->kqrmd_frag); + if (rc != 0) { + CERROR ("Incompatible RDMA descriptors\n"); + goto out; } - - for (i = 0; i < rmd->kqrmd_nfrag; i++) - if (ktx->ktx_frags[i].nmd_len != rmd->kqrmd_frag[i].nmd_len) { - CERROR("Can't cope with unequal frags %d(%d):" - " %d local %d remote\n", - i, rmd->kqrmd_nfrag, - ktx->ktx_frags[i].nmd_len, - rmd->kqrmd_frag[i].nmd_len); - return (-EINVAL); - } #else - ndatav = kqswnal_eiovs2datav (EP_MAXFRAG, datav, - ktx->ktx_nfrag, ktx->ktx_frags, - 
rmd->kqrmd_nfrag, rmd->kqrmd_frag); + switch (type) { + default: + LBUG(); + + case PTL_MSG_GET: + ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, + ktx->ktx_nfrag, ktx->ktx_frags, + rmd->kqrmd_nfrag, rmd->kqrmd_frag); + break; + + case PTL_MSG_PUT: + ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, + rmd->kqrmd_nfrag, rmd->kqrmd_frag, + ktx->ktx_nfrag, ktx->ktx_frags); + break; + } + if (ndatav < 0) { CERROR ("Can't create datavec: %d\n", ndatav); - return (ndatav); + rc = ndatav; + goto out; } #endif - /* Our caller will start to race with kqswnal_dma_reply_complete... */ - LASSERT (atomic_read (&krx->krx_refcount) == 1); - atomic_set (&krx->krx_refcount, 2); + LASSERT (atomic_read(&krx->krx_refcount) > 0); + /* Take an extra ref for the completion callback */ + atomic_inc(&krx->krx_refcount); -#if MULTIRAIL_EKC - rc = ep_complete_rpc(krx->krx_rxd, kqswnal_dma_reply_complete, ktx, - &kqswnal_rpc_success, - ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag); - if (rc == EP_SUCCESS) - return (0); + switch (type) { + default: + LBUG(); - /* Well we tried... 
*/ - krx->krx_rpc_reply_needed = 0; + case PTL_MSG_GET: +#if MULTIRAIL_EKC + eprc = ep_complete_rpc(krx->krx_rxd, + kqswnal_rdma_store_complete, ktx, + &kqswnal_data.kqn_rpc_success, + ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag); #else - rc = ep_complete_rpc (krx->krx_rxd, kqswnal_dma_reply_complete, ktx, - &kqswnal_rpc_success, datav, ndatav); - if (rc == EP_SUCCESS) - return (0); - - /* "old" EKC destroys rxd on failed completion */ - krx->krx_rxd = NULL; + eprc = ep_complete_rpc (krx->krx_rxd, + kqswnal_rdma_store_complete, ktx, + &kqswnal_data.kqn_rpc_success, + datav, ndatav); + if (eprc != EP_SUCCESS) /* "old" EKC destroys rxd on failed completion */ + krx->krx_rxd = NULL; #endif + if (eprc != EP_SUCCESS) { + CERROR("can't complete RPC: %d\n", eprc); + /* don't re-attempt RPC completion */ + krx->krx_rpc_reply_needed = 0; + rc = -ECONNABORTED; + } + break; + + case PTL_MSG_PUT: +#if MULTIRAIL_EKC + eprc = ep_rpc_get (krx->krx_rxd, + kqswnal_rdma_fetch_complete, ktx, + rmd->kqrmd_frag, ktx->ktx_frags, ktx->ktx_nfrag); +#else + eprc = ep_rpc_get (krx->krx_rxd, + kqswnal_rdma_fetch_complete, ktx, + datav, ndatav); +#endif + if (eprc != EP_SUCCESS) { + CERROR("ep_rpc_get failed: %d\n", eprc); + rc = -ECONNABORTED; + } + break; + } - CERROR("can't complete RPC: %d\n", rc); - - /* reset refcount back to 1: we're not going to be racing with - * kqswnal_dma_reply_complete. */ - atomic_set (&krx->krx_refcount, 1); + out: + if (rc != 0) { + kqswnal_rx_decref(krx); /* drop callback's ref */ + kqswnal_put_idle_tx (ktx); + } - return (-ECONNABORTED); + atomic_dec(&kqswnal_data.kqn_pending_txs); + return (rc); } static ptl_err_t -kqswnal_sendmsg (nal_cb_t *nal, +kqswnal_sendmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -899,6 +1033,8 @@ kqswnal_sendmsg (nal_cb_t *nal, int sumoff; int sumnob; #endif + /* NB 1. hdr is in network byte order */ + /* 2. 
'private' depends on the message type */ CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 " pid %u\n", payload_nob, payload_niov, nid, pid); @@ -910,13 +1046,22 @@ kqswnal_sendmsg (nal_cb_t *nal, LASSERT (payload_kiov == NULL || !in_interrupt ()); /* payload is either all vaddrs or all pages */ LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - + if (payload_nob > KQSW_MAXPAYLOAD) { CERROR ("request exceeds MTU size "LPSZ" (max %u).\n", payload_nob, KQSW_MAXPAYLOAD); return (PTL_FAIL); } + if (type == PTL_MSG_REPLY && /* can I look in 'private' */ + ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { /* is it an RPC */ + /* Must be a REPLY for an optimized GET */ + rc = kqswnal_rdma ((kqswnal_rx_t *)private, libmsg, PTL_MSG_GET, + payload_niov, payload_iov, payload_kiov, + payload_offset, payload_nob); + return ((rc == 0) ? PTL_OK : PTL_FAIL); + } + targetnid = nid; if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? */ rc = kpr_lookup (&kqswnal_data.kqn_router, nid, @@ -939,40 +1084,18 @@ kqswnal_sendmsg (nal_cb_t *nal, type == PTL_MSG_REPLY || in_interrupt())); if (ktx == NULL) { - kqswnal_cerror_hdr (hdr); - return (PTL_NOSPACE); + CERROR ("Can't get txd for msg type %d for "LPX64"\n", + type, libmsg->ev.initiator.nid); + return (PTL_NO_SPACE); } + ktx->ktx_state = KTX_SENDING; ktx->ktx_nid = targetnid; ktx->ktx_args[0] = private; ktx->ktx_args[1] = libmsg; - - if (type == PTL_MSG_REPLY && - ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { - if (nid != targetnid || - kqswnal_nid2elanid(nid) != - ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)) { - CERROR("Optimized reply nid conflict: " - "nid "LPX64" via "LPX64" elanID %d\n", - nid, targetnid, - ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)); - return (PTL_FAIL); - } - - /* peer expects RPC completion with GET data */ - rc = kqswnal_dma_reply (ktx, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); - if (rc == 0) - return (PTL_OK); - - CERROR 
("Can't DMA reply to "LPX64": %d\n", nid, rc); - kqswnal_put_idle_tx (ktx); - return (PTL_FAIL); - } + ktx->ktx_args[2] = NULL; /* set when a GET commits to REPLY */ memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ - ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; #if KQSW_CHECKSUM csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); @@ -1012,28 +1135,31 @@ kqswnal_sendmsg (nal_cb_t *nal, memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum)); #endif - if (kqswnal_data.kqn_optimized_gets && - type == PTL_MSG_GET && /* doing a GET */ - nid == targetnid) { /* not forwarding */ + /* The first frag will be the pre-mapped buffer for (at least) the + * portals header. */ + ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; + + if (nid == targetnid && /* not forwarding */ + ((type == PTL_MSG_GET && /* optimize GET? */ + kqswnal_tunables.kqn_optimized_gets != 0 && + le32_to_cpu(hdr->msg.get.sink_length) >= kqswnal_tunables.kqn_optimized_gets) || + (type == PTL_MSG_PUT && /* optimize PUT? */ + kqswnal_tunables.kqn_optimized_puts != 0 && + payload_nob >= kqswnal_tunables.kqn_optimized_puts))) { lib_md_t *md = libmsg->md; kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(ktx->ktx_buffer + KQSW_HDR_SIZE); - /* Optimised path: I send over the Elan vaddrs of the get - * sink buffers, and my peer DMAs directly into them. + /* Optimised path: I send over the Elan vaddrs of the local + * buffers, and my peer DMAs directly to/from them. * * First I set up ktx as if it was going to send this * payload, (it needs to map it anyway). This fills * ktx_frags[1] and onward with the network addresses * of the GET sink frags. I copy these into ktx_buffer, - * immediately after the header, and send that as my GET - * message. - * - * Note that the addresses are sent in native endian-ness. - * When EKC copes with different endian nodes, I'll fix - * this (and eat my hat :) */ + * immediately after the header, and send that as my + * message. 
*/ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_state = KTX_GETTING; + ktx->ktx_state = (type == PTL_MSG_PUT) ? KTX_PUTTING : KTX_GETTING; if ((libmsg->md->options & PTL_MD_KIOV) != 0) rc = kqswnal_map_tx_kiov (ktx, 0, md->length, @@ -1041,11 +1167,8 @@ kqswnal_sendmsg (nal_cb_t *nal, else rc = kqswnal_map_tx_iov (ktx, 0, md->length, md->md_niov, md->md_iov.iov); - - if (rc < 0) { - kqswnal_put_idle_tx (ktx); - return (PTL_FAIL); - } + if (rc != 0) + goto out; rmd->kqrmd_nfrag = ktx->ktx_nfrag - 1; @@ -1066,12 +1189,21 @@ kqswnal_sendmsg (nal_cb_t *nal, ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob; #endif + if (type == PTL_MSG_GET) { + /* Allocate reply message now while I'm in thread context */ + ktx->ktx_args[2] = lib_create_reply_msg (&kqswnal_lib, + nid, libmsg); + if (ktx->ktx_args[2] == NULL) + goto out; + + /* NB finalizing the REPLY message is my + * responsibility now, whatever happens. */ + } + } else if (payload_nob <= KQSW_TX_MAXCONTIG) { /* small message: single frag copied into the pre-mapped buffer */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_state = KTX_SENDING; #if MULTIRAIL_EKC ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, KQSW_HDR_SIZE + payload_nob); @@ -1093,8 +1225,6 @@ kqswnal_sendmsg (nal_cb_t *nal, /* large message: multiple frags: first is hdr in pre-mapped buffer */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_state = KTX_SENDING; #if MULTIRAIL_EKC ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, KQSW_HDR_SIZE); @@ -1108,29 +1238,44 @@ kqswnal_sendmsg (nal_cb_t *nal, else rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob, payload_niov, payload_iov); - if (rc != 0) { - kqswnal_put_idle_tx (ktx); - return (PTL_FAIL); - } + if (rc != 0) + goto out; } ktx->ktx_port = (payload_nob <= KQSW_SMALLPAYLOAD) ? EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE; rc = kqswnal_launch (ktx); - if (rc != 0) { /* failed? 
*/ - CERROR ("Failed to send packet to "LPX64": %d\n", targetnid, rc); + + out: + CDEBUG(rc == 0 ? D_NET : D_ERROR, + "%s "LPSZ" bytes to "LPX64" via "LPX64": rc %d\n", + rc == 0 ? "Sent" : "Failed to send", + payload_nob, nid, targetnid, rc); + + if (rc != 0) { + if (ktx->ktx_state == KTX_GETTING && + ktx->ktx_args[2] != NULL) { + /* We committed to reply, but there was a problem + * launching the GET. We can't avoid delivering a + * REPLY event since we committed above, so we + * pretend the GET succeeded but the REPLY + * failed. */ + rc = 0; + lib_finalize (&kqswnal_lib, private, libmsg, PTL_OK); + lib_finalize (&kqswnal_lib, private, + (lib_msg_t *)ktx->ktx_args[2], PTL_FAIL); + } + kqswnal_put_idle_tx (ktx); - return (PTL_FAIL); } - - CDEBUG(D_NET, "sent "LPSZ" bytes to "LPX64" via "LPX64"\n", - payload_nob, nid, targetnid); - return (PTL_OK); + + atomic_dec(&kqswnal_data.kqn_pending_txs); + return (rc == 0 ? PTL_OK : PTL_FAIL); } static ptl_err_t -kqswnal_send (nal_cb_t *nal, +kqswnal_send (lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -1148,7 +1293,7 @@ kqswnal_send (nal_cb_t *nal, } static ptl_err_t -kqswnal_send_pages (nal_cb_t *nal, +kqswnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -1187,18 +1332,17 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) if (ktx == NULL) /* can't get txd right now */ return; /* fwd will be scheduled when tx desc freed */ - if (nid == kqswnal_lib.ni.nid) /* gateway is me */ + if (nid == kqswnal_lib.libnal_ni.ni_pid.nid) /* gateway is me */ nid = fwd->kprfd_target_nid; /* target is final dest */ if (kqswnal_nid2elanid (nid) < 0) { CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid); rc = -EHOSTUNREACH; - goto failed; + goto out; } /* copy hdr into pre-mapped buffer */ memcpy(ktx->ktx_buffer, fwd->kprfd_hdr, sizeof(ptl_hdr_t)); - ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; ktx->ktx_port = (nob <= KQSW_SMALLPAYLOAD) ? 
EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE; @@ -1233,20 +1377,19 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) #endif rc = kqswnal_map_tx_kiov (ktx, 0, nob, niov, kiov); if (rc != 0) - goto failed; + goto out; } rc = kqswnal_launch (ktx); - if (rc == 0) - return; + out: + if (rc != 0) { + CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc); - failed: - LASSERT (rc != 0); - CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc); + /* complete now (with failure) */ + kqswnal_tx_done (ktx, rc); + } - kqswnal_put_idle_tx (ktx); - /* complete now (with failure) */ - kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc); + atomic_dec(&kqswnal_data.kqn_pending_txs); } void @@ -1261,32 +1404,51 @@ kqswnal_fwd_callback (void *arg, int error) ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page); CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", - NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error); + le64_to_cpu(hdr->src_nid), le64_to_cpu(hdr->dest_nid),error); } - kqswnal_requeue_rx (krx); + LASSERT (atomic_read(&krx->krx_refcount) == 1); + kqswnal_rx_decref (krx); } void -kqswnal_dma_reply_complete (EP_RXD *rxd) +kqswnal_requeue_rx (kqswnal_rx_t *krx) { - int status = ep_rxd_status(rxd); - kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - lib_msg_t *msg = (lib_msg_t *)ktx->ktx_args[1]; - - CDEBUG((status == EP_SUCCESS) ? 
D_NET : D_ERROR, - "rxd %p, ktx %p, status %d\n", rxd, ktx, status); + LASSERT (atomic_read(&krx->krx_refcount) == 0); + LASSERT (!krx->krx_rpc_reply_needed); - LASSERT (krx->krx_rxd == rxd); - LASSERT (krx->krx_rpc_reply_needed); + krx->krx_state = KRX_POSTED; - krx->krx_rpc_reply_needed = 0; - kqswnal_rx_done (krx); +#if MULTIRAIL_EKC + if (kqswnal_data.kqn_shuttingdown) { + /* free EKC rxd on shutdown */ + ep_complete_receive(krx->krx_rxd); + } else { + /* repost receive */ + ep_requeue_receive(krx->krx_rxd, + kqswnal_rxhandler, krx, + &krx->krx_elanbuffer, 0); + } +#else + if (kqswnal_data.kqn_shuttingdown) + return; - lib_finalize (&kqswnal_lib, NULL, msg, - (status == EP_SUCCESS) ? PTL_OK : PTL_FAIL); - kqswnal_put_idle_tx (ktx); + if (krx->krx_rxd == NULL) { + /* We had a failed ep_complete_rpc() which nukes the + * descriptor in "old" EKC */ + int eprc = ep_queue_receive(krx->krx_eprx, + kqswnal_rxhandler, krx, + krx->krx_elanbuffer, + krx->krx_npages * PAGE_SIZE, 0); + LASSERT (eprc == EP_SUCCESS); + /* We don't handle failure here; it's incredibly rare + * (never reported?) and only happens with "old" EKC */ + } else { + ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, + krx->krx_elanbuffer, + krx->krx_npages * PAGE_SIZE); + } +#endif } void @@ -1306,97 +1468,74 @@ kqswnal_rpc_complete (EP_RXD *rxd) } void -kqswnal_requeue_rx (kqswnal_rx_t *krx) +kqswnal_rx_done (kqswnal_rx_t *krx) { - int rc; + int rc; + EP_STATUSBLK *sblk; LASSERT (atomic_read(&krx->krx_refcount) == 0); if (krx->krx_rpc_reply_needed) { + /* We've not completed the peer's RPC yet... */ + sblk = (krx->krx_rpc_reply_status == 0) ? + &kqswnal_data.kqn_rpc_success : + &kqswnal_data.kqn_rpc_failed; - /* We failed to complete the peer's optimized GET (e.g. we - * couldn't map the source buffers). We complete the - * peer's EKC rpc now with failure. 
*/ + LASSERT (!in_interrupt()); #if MULTIRAIL_EKC - rc = ep_complete_rpc(krx->krx_rxd, kqswnal_rpc_complete, krx, - &kqswnal_rpc_failed, NULL, NULL, 0); + rc = ep_complete_rpc(krx->krx_rxd, + kqswnal_rpc_complete, krx, + sblk, NULL, NULL, 0); if (rc == EP_SUCCESS) return; - - CERROR("can't complete RPC: %d\n", rc); #else - if (krx->krx_rxd != NULL) { - /* We didn't try (and fail) to complete earlier... */ - rc = ep_complete_rpc(krx->krx_rxd, - kqswnal_rpc_complete, krx, - &kqswnal_rpc_failed, NULL, 0); - if (rc == EP_SUCCESS) - return; - - CERROR("can't complete RPC: %d\n", rc); - } - - /* NB the old ep_complete_rpc() frees rxd on failure, so we - * have to requeue from scratch here, unless we're shutting - * down */ - if (kqswnal_data.kqn_shuttingdown) + rc = ep_complete_rpc(krx->krx_rxd, + kqswnal_rpc_complete, krx, + sblk, NULL, 0); + if (rc == EP_SUCCESS) return; - rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, - krx->krx_elanbuffer, - krx->krx_npages * PAGE_SIZE, 0); - LASSERT (rc == EP_SUCCESS); - /* We don't handle failure here; it's incredibly rare - * (never reported?) 
and only happens with "old" EKC */ - return; + /* "old" EKC destroys rxd on failed completion */ + krx->krx_rxd = NULL; #endif + CERROR("can't complete RPC: %d\n", rc); + krx->krx_rpc_reply_needed = 0; } -#if MULTIRAIL_EKC - if (kqswnal_data.kqn_shuttingdown) { - /* free EKC rxd on shutdown */ - ep_complete_receive(krx->krx_rxd); - } else { - /* repost receive */ - ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, - &krx->krx_elanbuffer, 0); - } -#else - /* don't actually requeue on shutdown */ - if (!kqswnal_data.kqn_shuttingdown) - ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, - krx->krx_elanbuffer, krx->krx_npages * PAGE_SIZE); -#endif + kqswnal_requeue_rx(krx); } void -kqswnal_rx (kqswnal_rx_t *krx) +kqswnal_parse (kqswnal_rx_t *krx) { ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(krx->krx_kiov[0].kiov_page); - ptl_nid_t dest_nid = NTOH__u64 (hdr->dest_nid); + ptl_nid_t dest_nid = le64_to_cpu(hdr->dest_nid); int payload_nob; int nob; int niov; - LASSERT (atomic_read(&krx->krx_refcount) == 0); + LASSERT (atomic_read(&krx->krx_refcount) == 1); + + if (dest_nid == kqswnal_lib.libnal_ni.ni_pid.nid) { /* It's for me :) */ + /* I ignore parse errors since I'm not consuming a byte + * stream */ + (void)lib_parse (&kqswnal_lib, hdr, krx); - if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */ - atomic_set(&krx->krx_refcount, 1); - lib_parse (&kqswnal_lib, hdr, krx); - kqswnal_rx_done(krx); + /* Drop my ref; any RDMA activity takes an additional ref */ + kqswnal_rx_decref(krx); return; } #if KQSW_CHECKSUM - CERROR ("checksums for forwarded packets not implemented\n"); - LBUG (); + LASSERTF (0, "checksums for forwarded packets not implemented\n"); #endif + if (kqswnal_nid2elanid (dest_nid) >= 0) /* should have gone direct to peer */ { CERROR("dropping packet from "LPX64" for "LPX64 - ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid); + ": target is peer\n", le64_to_cpu(hdr->src_nid), dest_nid); - kqswnal_requeue_rx (krx); + kqswnal_rx_decref 
(krx); return; } @@ -1438,7 +1577,9 @@ kqswnal_rxhandler(EP_RXD *rxd) rxd, krx, nob, status); LASSERT (krx != NULL); - + LASSERT (krx->krx_state = KRX_POSTED); + + krx->krx_state = KRX_PARSE; krx->krx_rxd = rxd; krx->krx_nob = nob; #if MULTIRAIL_EKC @@ -1446,7 +1587,10 @@ kqswnal_rxhandler(EP_RXD *rxd) #else krx->krx_rpc_reply_needed = ep_rxd_isrpc(rxd); #endif - + /* Default to failure if an RPC reply is requested but not handled */ + krx->krx_rpc_reply_status = -EPROTO; + atomic_set (&krx->krx_refcount, 1); + /* must receive a whole header to be able to parse */ if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t)) { @@ -1462,12 +1606,12 @@ kqswnal_rxhandler(EP_RXD *rxd) CERROR("receive status failed with status %d nob %d\n", ep_rxd_status(rxd), nob); #endif - kqswnal_requeue_rx (krx); + kqswnal_rx_decref(krx); return; } if (!in_interrupt()) { - kqswnal_rx (krx); + kqswnal_parse(krx); return; } @@ -1488,30 +1632,30 @@ kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64 ", dpid %d, spid %d, type %d\n", ishdr ? 
"Header" : "Payload", krx, - NTOH__u64(hdr->dest_nid), NTOH__u64(hdr->src_nid) - NTOH__u32(hdr->dest_pid), NTOH__u32(hdr->src_pid), - NTOH__u32(hdr->type)); + le64_to_cpu(hdr->dest_nid), le64_to_cpu(hdr->src_nid) + le32_to_cpu(hdr->dest_pid), le32_to_cpu(hdr->src_pid), + le32_to_cpu(hdr->type)); - switch (NTOH__u32 (hdr->type)) + switch (le32_to_cpu(hdr->type)) { case PTL_MSG_ACK: CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64 " len %u\n", - NTOH__u32(hdr->msg.ack.mlength), + le32_to_cpu(hdr->msg.ack.mlength), hdr->msg.ack.dst_wmd.handle_cookie, hdr->msg.ack.dst_wmd.handle_idx, - NTOH__u64(hdr->msg.ack.match_bits), - NTOH__u32(hdr->msg.ack.length)); + le64_to_cpu(hdr->msg.ack.match_bits), + le32_to_cpu(hdr->msg.ack.length)); break; case PTL_MSG_PUT: CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64 " len %u off %u data "LPX64"\n", - NTOH__u32(hdr->msg.put.ptl_index), + le32_to_cpu(hdr->msg.put.ptl_index), hdr->msg.put.ack_wmd.handle_cookie, hdr->msg.put.ack_wmd.handle_idx, - NTOH__u64(hdr->msg.put.match_bits), - NTOH__u32(hdr->msg.put.length), - NTOH__u32(hdr->msg.put.offset), + le64_to_cpu(hdr->msg.put.match_bits), + le32_to_cpu(hdr->msg.put.length), + le32_to_cpu(hdr->msg.put.offset), hdr->msg.put.hdr_data); break; case PTL_MSG_GET: @@ -1527,7 +1671,7 @@ kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) #endif static ptl_err_t -kqswnal_recvmsg (nal_cb_t *nal, +kqswnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, @@ -1539,16 +1683,18 @@ kqswnal_recvmsg (nal_cb_t *nal, { kqswnal_rx_t *krx = (kqswnal_rx_t *)private; char *buffer = page_address(krx->krx_kiov[0].kiov_page); + ptl_hdr_t *hdr = (ptl_hdr_t *)buffer; int page; char *page_ptr; int page_nob; char *iov_ptr; int iov_nob; int frag; + int rc; #if KQSW_CHECKSUM kqsw_csum_t senders_csum; kqsw_csum_t payload_csum = 0; - kqsw_csum_t hdr_csum = kqsw_csum(0, buffer, sizeof(ptl_hdr_t)); + kqsw_csum_t hdr_csum = kqsw_csum(0, hdr, sizeof(*hdr)); size_t csum_len = mlen; int 
csum_frags = 0; int csum_nob = 0; @@ -1561,8 +1707,18 @@ kqswnal_recvmsg (nal_cb_t *nal, if (senders_csum != hdr_csum) kqswnal_csum_error (krx, 1); #endif + /* NB lib_parse() has already flipped *hdr */ + CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen); + if (krx->krx_rpc_reply_needed && + hdr->type == PTL_MSG_PUT) { + /* This must be an optimized PUT */ + rc = kqswnal_rdma (krx, libmsg, PTL_MSG_PUT, + niov, iov, kiov, offset, mlen); + return (rc == 0 ? PTL_OK : PTL_FAIL); + } + /* What was actually received must be >= payload. */ LASSERT (mlen <= rlen); if (krx->krx_nob < KQSW_HDR_SIZE + mlen) { @@ -1678,7 +1834,7 @@ kqswnal_recvmsg (nal_cb_t *nal, } static ptl_err_t -kqswnal_recv(nal_cb_t *nal, +kqswnal_recv(lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, @@ -1693,7 +1849,7 @@ kqswnal_recv(nal_cb_t *nal, } static ptl_err_t -kqswnal_recv_pages (nal_cb_t *nal, +kqswnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, @@ -1716,7 +1872,6 @@ kqswnal_thread_start (int (*fn)(void *arg), void *arg) return ((int)pid); atomic_inc (&kqswnal_data.kqn_nthreads); - atomic_inc (&kqswnal_data.kqn_nthreads_running); return (0); } @@ -1735,7 +1890,6 @@ kqswnal_scheduler (void *arg) unsigned long flags; int rc; int counter = 0; - int shuttingdown = 0; int did_something; kportal_daemonize ("kqswnal_sched"); @@ -1745,18 +1899,6 @@ kqswnal_scheduler (void *arg) for (;;) { - if (kqswnal_data.kqn_shuttingdown != shuttingdown) { - - if (kqswnal_data.kqn_shuttingdown == 2) - break; - - /* During stage 1 of shutdown we are still responsive - * to receives */ - - atomic_dec (&kqswnal_data.kqn_nthreads_running); - shuttingdown = kqswnal_data.kqn_shuttingdown; - } - did_something = 0; if (!list_empty (&kqswnal_data.kqn_readyrxds)) @@ -1767,14 +1909,24 @@ kqswnal_scheduler (void *arg) spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); - kqswnal_rx (krx); + switch (krx->krx_state) { + case KRX_PARSE: + 
kqswnal_parse (krx); + break; + case KRX_COMPLETING: + /* Drop last ref to reply to RPC and requeue */ + LASSERT (krx->krx_rpc_reply_needed); + kqswnal_rx_decref (krx); + break; + default: + LBUG(); + } did_something = 1; spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); } - if (!shuttingdown && - !list_empty (&kqswnal_data.kqn_delayedtxds)) + if (!list_empty (&kqswnal_data.kqn_delayedtxds)) { ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, kqswnal_tx_t, ktx_list); @@ -1783,31 +1935,31 @@ kqswnal_scheduler (void *arg) flags); rc = kqswnal_launch (ktx); - if (rc != 0) /* failed: ktx_nid down? */ - { + if (rc != 0) { CERROR("Failed delayed transmit to "LPX64 ": %d\n", ktx->ktx_nid, rc); kqswnal_tx_done (ktx, rc); } + atomic_dec (&kqswnal_data.kqn_pending_txs); did_something = 1; spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); } - if (!shuttingdown & - !list_empty (&kqswnal_data.kqn_delayedfwds)) + if (!list_empty (&kqswnal_data.kqn_delayedfwds)) { fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list); list_del (&fwd->kprfd_list); spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + /* If we're shutting down, this will just requeue fwd on kqn_idletxd_fwdq */ kqswnal_fwd_packet (NULL, fwd); did_something = 1; spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); } - /* nothing to do or hogging CPU */ + /* nothing to do or hogging CPU */ if (!did_something || counter++ == KQSW_RESCHED) { spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); @@ -1815,8 +1967,13 @@ kqswnal_scheduler (void *arg) counter = 0; if (!did_something) { + if (kqswnal_data.kqn_shuttingdown == 2) { + /* We only exit in stage 2 of shutdown when + * there's nothing left to do */ + break; + } rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq, - kqswnal_data.kqn_shuttingdown != shuttingdown || + kqswnal_data.kqn_shuttingdown == 2 || !list_empty(&kqswnal_data.kqn_readyrxds) || !list_empty(&kqswnal_data.kqn_delayedtxds) || 
!list_empty(&kqswnal_data.kqn_delayedfwds)); @@ -1828,25 +1985,16 @@ kqswnal_scheduler (void *arg) } } - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); - kqswnal_thread_fini (); return (0); } -nal_cb_t kqswnal_lib = +lib_nal_t kqswnal_lib = { - nal_data: &kqswnal_data, /* NAL private data */ - cb_send: kqswnal_send, - cb_send_pages: kqswnal_send_pages, - cb_recv: kqswnal_recv, - cb_recv_pages: kqswnal_recv_pages, - cb_read: kqswnal_read, - cb_write: kqswnal_write, - cb_malloc: kqswnal_malloc, - cb_free: kqswnal_free, - cb_printf: kqswnal_printf, - cb_cli: kqswnal_cli, - cb_sti: kqswnal_sti, - cb_dist: kqswnal_dist + libnal_data: &kqswnal_data, /* NAL private data */ + libnal_send: kqswnal_send, + libnal_send_pages: kqswnal_send_pages, + libnal_recv: kqswnal_recv, + libnal_recv_pages: kqswnal_recv_pages, + libnal_dist: kqswnal_dist }; diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c index bbe19cf..7642770 100644 --- a/lustre/portals/knals/socknal/socknal.c +++ b/lustre/portals/knals/socknal/socknal.c @@ -25,13 +25,10 @@ #include "socknal.h" +nal_t ksocknal_api; +ksock_nal_data_t ksocknal_data; ptl_handle_ni_t ksocknal_ni; -static nal_t ksocknal_api; -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -ksock_nal_data_t ksocknal_data; -#else -static ksock_nal_data_t ksocknal_data; -#endif +ksock_tunables_t ksocknal_tunables; kpr_nal_interface_t ksocknal_router_interface = { kprni_nalid: SOCKNAL, @@ -40,31 +37,58 @@ kpr_nal_interface_t ksocknal_router_interface = { kprni_notify: ksocknal_notify, }; +#ifdef CONFIG_SYSCTL #define SOCKNAL_SYSCTL 200 -#define SOCKNAL_SYSCTL_TIMEOUT 1 -#define SOCKNAL_SYSCTL_EAGER_ACK 2 -#define SOCKNAL_SYSCTL_ZERO_COPY 3 -#define SOCKNAL_SYSCTL_TYPED 4 -#define SOCKNAL_SYSCTL_MIN_BULK 5 +#define SOCKNAL_SYSCTL_TIMEOUT 1 +#define SOCKNAL_SYSCTL_EAGER_ACK 2 +#define SOCKNAL_SYSCTL_ZERO_COPY 3 +#define SOCKNAL_SYSCTL_TYPED 4 +#define SOCKNAL_SYSCTL_MIN_BULK 5 +#define 
SOCKNAL_SYSCTL_BUFFER_SIZE 6 +#define SOCKNAL_SYSCTL_NAGLE 7 +#define SOCKNAL_SYSCTL_IRQ_AFFINITY 8 +#define SOCKNAL_SYSCTL_KEEPALIVE_IDLE 9 +#define SOCKNAL_SYSCTL_KEEPALIVE_COUNT 10 +#define SOCKNAL_SYSCTL_KEEPALIVE_INTVL 11 static ctl_table ksocknal_ctl_table[] = { {SOCKNAL_SYSCTL_TIMEOUT, "timeout", - &ksocknal_data.ksnd_io_timeout, sizeof (int), + &ksocknal_tunables.ksnd_io_timeout, sizeof (int), 0644, NULL, &proc_dointvec}, {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", - &ksocknal_data.ksnd_eager_ack, sizeof (int), + &ksocknal_tunables.ksnd_eager_ack, sizeof (int), 0644, NULL, &proc_dointvec}, #if SOCKNAL_ZC {SOCKNAL_SYSCTL_ZERO_COPY, "zero_copy", - &ksocknal_data.ksnd_zc_min_frag, sizeof (int), + &ksocknal_tunables.ksnd_zc_min_frag, sizeof (int), 0644, NULL, &proc_dointvec}, #endif {SOCKNAL_SYSCTL_TYPED, "typed", - &ksocknal_data.ksnd_typed_conns, sizeof (int), + &ksocknal_tunables.ksnd_typed_conns, sizeof (int), 0644, NULL, &proc_dointvec}, {SOCKNAL_SYSCTL_MIN_BULK, "min_bulk", - &ksocknal_data.ksnd_min_bulk, sizeof (int), + &ksocknal_tunables.ksnd_min_bulk, sizeof (int), + 0644, NULL, &proc_dointvec}, + {SOCKNAL_SYSCTL_BUFFER_SIZE, "buffer_size", + &ksocknal_tunables.ksnd_buffer_size, sizeof(int), + 0644, NULL, &proc_dointvec}, + {SOCKNAL_SYSCTL_NAGLE, "nagle", + &ksocknal_tunables.ksnd_nagle, sizeof(int), + 0644, NULL, &proc_dointvec}, +#if CPU_AFFINITY + {SOCKNAL_SYSCTL_IRQ_AFFINITY, "irq_affinity", + &ksocknal_tunables.ksnd_irq_affinity, sizeof(int), + 0644, NULL, &proc_dointvec}, +#endif + {SOCKNAL_SYSCTL_KEEPALIVE_IDLE, "keepalive_idle", + &ksocknal_tunables.ksnd_keepalive_idle, sizeof(int), + 0644, NULL, &proc_dointvec}, + {SOCKNAL_SYSCTL_KEEPALIVE_COUNT, "keepalive_count", + &ksocknal_tunables.ksnd_keepalive_count, sizeof(int), + 0644, NULL, &proc_dointvec}, + {SOCKNAL_SYSCTL_KEEPALIVE_INTVL, "keepalive_intvl", + &ksocknal_tunables.ksnd_keepalive_intvl, sizeof(int), 0644, NULL, &proc_dointvec}, { 0 } }; @@ -73,73 +97,12 @@ static ctl_table 
ksocknal_top_ctl_table[] = { {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, { 0 } }; - -int -ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, - void *ret, size_t ret_len) -{ - ksock_nal_data_t *k; - nal_cb_t *nal_cb; - - k = nal->nal_data; - nal_cb = k->ksnd_nal_cb; - - lib_dispatch(nal_cb, k, id, args, ret); /* ksocknal_send needs k */ - return PTL_OK; -} - -int -ksocknal_api_shutdown(nal_t *nal, int ni) -{ - return PTL_OK; -} - -void -ksocknal_api_yield(nal_t *nal) -{ - our_cond_resched(); - return; -} - -void -ksocknal_api_lock(nal_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *k; - nal_cb_t *nal_cb; - - k = nal->nal_data; - nal_cb = k->ksnd_nal_cb; - nal_cb->cb_cli(nal_cb,flags); -} - -void -ksocknal_api_unlock(nal_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *k; - nal_cb_t *nal_cb; - - k = nal->nal_data; - nal_cb = k->ksnd_nal_cb; - nal_cb->cb_sti(nal_cb,flags); -} - -nal_t * -ksocknal_init(int interface, ptl_pt_index_t ptl_size, - ptl_ac_index_t ac_size, ptl_pid_t requested_pid) -{ - CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", (ptl_nid_t)0); - lib_init(&ksocknal_lib, (ptl_nid_t)0, 0, 10, ptl_size, ac_size); - return (&ksocknal_api); -} - -/* - * EXTRA functions follow - */ +#endif int ksocknal_set_mynid(ptl_nid_t nid) { - lib_ni_t *ni = &ksocknal_lib.ni; + lib_ni_t *ni = &ksocknal_lib.libnal_ni; /* FIXME: we have to do this because we call lib_init() at module * insertion time, which is before we have 'mynid' available. lib_init @@ -148,9 +111,9 @@ ksocknal_set_mynid(ptl_nid_t nid) * problem. 
*/ CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->nid); + nid, ni->ni_pid.nid); - ni->nid = nid; + ni->ni_pid.nid = nid; return (0); } @@ -202,9 +165,25 @@ ksocknal_bind_irq (unsigned int irq) #endif } +ksock_interface_t * +ksocknal_ip2iface(__u32 ip) +{ + int i; + ksock_interface_t *iface; + + for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) { + LASSERT(i < SOCKNAL_MAX_INTERFACES); + iface = &ksocknal_data.ksnd_interfaces[i]; + + if (iface->ksni_ipaddr == ip) + return (iface); + } + + return (NULL); +} + ksock_route_t * -ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, - int irq_affinity, int eager) +ksocknal_create_route (__u32 ipaddr, int port) { ksock_route_t *route; @@ -213,19 +192,16 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, return (NULL); atomic_set (&route->ksnr_refcount, 1); - route->ksnr_sharecount = 0; route->ksnr_peer = NULL; route->ksnr_timeout = jiffies; route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; route->ksnr_ipaddr = ipaddr; route->ksnr_port = port; - route->ksnr_buffer_size = buffer_size; - route->ksnr_irq_affinity = irq_affinity; - route->ksnr_eager = eager; route->ksnr_connecting = 0; route->ksnr_connected = 0; route->ksnr_deleted = 0; route->ksnr_conn_count = 0; + route->ksnr_share_count = 0; return (route); } @@ -233,8 +209,6 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, void ksocknal_destroy_route (ksock_route_t *route) { - LASSERT (route->ksnr_sharecount == 0); - if (route->ksnr_peer != NULL) ksocknal_put_peer (route->ksnr_peer); @@ -265,7 +239,7 @@ ksocknal_create_peer (ptl_nid_t nid) if (peer == NULL) return (NULL); - memset (peer, 0, sizeof (*peer)); + memset (peer, 0, sizeof (*peer)); /* NULL pointers/clear flags etc */ peer->ksnp_nid = nid; atomic_set (&peer->ksnp_refcount, 1); /* 1 ref for caller */ @@ -323,8 +297,6 @@ ksocknal_find_peer_locked (ptl_nid_t nid) peer = list_entry (tmp, ksock_peer_t, ksnp_list); LASSERT 
(!peer->ksnp_closing); - LASSERT (!(list_empty (&peer->ksnp_routes) && - list_empty (&peer->ksnp_conns))); if (peer->ksnp_nid != nid) continue; @@ -353,6 +325,18 @@ ksocknal_get_peer (ptl_nid_t nid) void ksocknal_unlink_peer_locked (ksock_peer_t *peer) { + int i; + __u32 ip; + + for (i = 0; i < peer->ksnp_n_passive_ips; i++) { + LASSERT (i < SOCKNAL_MAX_INTERFACES); + ip = peer->ksnp_passive_ips[i]; + + ksocknal_ip2iface(ip)->ksni_npeers--; + } + + LASSERT (list_empty(&peer->ksnp_conns)); + LASSERT (list_empty(&peer->ksnp_routes)); LASSERT (!peer->ksnp_closing); peer->ksnp_closing = 1; list_del (&peer->ksnp_list); @@ -360,49 +344,210 @@ ksocknal_unlink_peer_locked (ksock_peer_t *peer) ksocknal_put_peer (peer); } -ksock_route_t * -ksocknal_get_route_by_idx (int index) +int +ksocknal_get_peer_info (int index, ptl_nid_t *nid, + __u32 *myip, __u32 *peer_ip, int *port, + int *conn_count, int *share_count) { ksock_peer_t *peer; struct list_head *ptmp; ksock_route_t *route; struct list_head *rtmp; int i; + int j; + int rc = -ENOENT; read_lock (&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - LASSERT (!(list_empty (&peer->ksnp_routes) && - list_empty (&peer->ksnp_conns))); + if (peer->ksnp_n_passive_ips == 0 && + list_empty(&peer->ksnp_routes)) { + if (index-- > 0) + continue; + + *nid = peer->ksnp_nid; + *myip = 0; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + for (j = 0; j < peer->ksnp_n_passive_ips; j++) { + if (index-- > 0) + continue; + + *nid = peer->ksnp_nid; + *myip = peer->ksnp_passive_ips[j]; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + list_for_each (rtmp, &peer->ksnp_routes) { if (index-- > 0) continue; - route = list_entry (rtmp, ksock_route_t, ksnr_list); - atomic_inc (&route->ksnr_refcount); - read_unlock 
(&ksocknal_data.ksnd_global_lock); - return (route); + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + *nid = peer->ksnp_nid; + *myip = route->ksnr_myipaddr; + *peer_ip = route->ksnr_ipaddr; + *port = route->ksnr_port; + *conn_count = route->ksnr_conn_count; + *share_count = route->ksnr_share_count; + rc = 0; + goto out; } } } - + out: read_unlock (&ksocknal_data.ksnd_global_lock); - return (NULL); + return (rc); +} + +void +ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) +{ + ksock_peer_t *peer = route->ksnr_peer; + int type = conn->ksnc_type; + ksock_interface_t *iface; + + conn->ksnc_route = route; + atomic_inc (&route->ksnr_refcount); + + if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { + if (route->ksnr_myipaddr == 0) { + /* route wasn't bound locally yet (the initial route) */ + CWARN("Binding "LPX64" %u.%u.%u.%u to %u.%u.%u.%u\n", + peer->ksnp_nid, + HIPQUAD(route->ksnr_ipaddr), + HIPQUAD(conn->ksnc_myipaddr)); + } else { + CWARN("Rebinding "LPX64" %u.%u.%u.%u from " + "%u.%u.%u.%u to %u.%u.%u.%u\n", + peer->ksnp_nid, + HIPQUAD(route->ksnr_ipaddr), + HIPQUAD(route->ksnr_myipaddr), + HIPQUAD(conn->ksnc_myipaddr)); + + iface = ksocknal_ip2iface(route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + route->ksnr_myipaddr = conn->ksnc_myipaddr; + iface = ksocknal_ip2iface(route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes++; + } + + route->ksnr_connected |= (1<<type); + route->ksnr_connecting &= ~(1<<type); + route->ksnr_conn_count++; + + /* Successful connection => further attempts can + * proceed immediately */ + route->ksnr_timeout = jiffies; + route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; +} + +void +ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route) +{ + struct list_head *tmp; + ksock_conn_t *conn; + int type; + ksock_route_t *route2; + + LASSERT (route->ksnr_peer == NULL); + LASSERT (route->ksnr_connecting == 0); + LASSERT (route->ksnr_connected == 0); + + /* LASSERT(unique) */ +
list_for_each(tmp, &peer->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { + CERROR ("Duplicate route "LPX64" %u.%u.%u.%u\n", + peer->ksnp_nid, HIPQUAD(route->ksnr_ipaddr)); + LBUG(); + } + } + + route->ksnr_peer = peer; + atomic_inc (&peer->ksnp_refcount); + /* peer's routelist takes over my ref on 'route' */ + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + type = conn->ksnc_type; + + if (conn->ksnc_ipaddr != route->ksnr_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + /* keep going (typed routes) */ + } +} + +void +ksocknal_del_route_locked (ksock_route_t *route) +{ + ksock_peer_t *peer = route->ksnr_peer; + ksock_interface_t *iface; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + + LASSERT (!route->ksnr_deleted); + + /* Close associated conns */ + list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_route != route) + continue; + + ksocknal_close_conn_locked (conn, 0); + } + + if (route->ksnr_myipaddr != 0) { + iface = ksocknal_ip2iface(route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + + route->ksnr_deleted = 1; + list_del (&route->ksnr_list); + ksocknal_put_route (route); /* drop peer's ref */ + + if (list_empty (&peer->ksnp_routes) && + list_empty (&peer->ksnp_conns)) { + /* I've just removed the last autoconnect route of a peer + * with no active connections */ + ksocknal_unlink_peer_locked (peer); + } } int -ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, - int bind_irq, int share, int eager) +ksocknal_add_peer (ptl_nid_t nid, __u32 ipaddr, int port) { unsigned long flags; + struct list_head *tmp; ksock_peer_t *peer; ksock_peer_t *peer2; ksock_route_t *route; - struct list_head *rtmp; ksock_route_t *route2; if 
(nid == PTL_NID_ANY) @@ -413,8 +558,7 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, if (peer == NULL) return (-ENOMEM); - route = ksocknal_create_route (ipaddr, port, bufnob, - bind_irq, eager); + route = ksocknal_create_route (ipaddr, port); if (route == NULL) { ksocknal_put_peer (peer); return (-ENOMEM); @@ -427,36 +571,27 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, ksocknal_put_peer (peer); peer = peer2; } else { - /* peer table takes existing ref on peer */ - list_add (&peer->ksnp_list, - ksocknal_nid2peerlist (nid)); + /* peer table takes my ref on peer */ + list_add_tail (&peer->ksnp_list, + ksocknal_nid2peerlist (nid)); } route2 = NULL; - if (share) { - /* check for existing route to this NID via this ipaddr */ - list_for_each (rtmp, &peer->ksnp_routes) { - route2 = list_entry (rtmp, ksock_route_t, ksnr_list); - - if (route2->ksnr_ipaddr == ipaddr) - break; - - route2 = NULL; - } + list_for_each (tmp, &peer->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == ipaddr) + break; + + route2 = NULL; } - - if (route2 != NULL) { - ksocknal_put_route (route); - route = route2; + if (route2 == NULL) { + ksocknal_add_route_locked(peer, route); + route->ksnr_share_count++; } else { - /* route takes a ref on peer */ - route->ksnr_peer = peer; - atomic_inc (&peer->ksnp_refcount); - /* peer's route list takes existing ref on route */ - list_add_tail (&route->ksnr_list, &peer->ksnp_routes); + ksocknal_put_route(route); + route2->ksnr_share_count++; } - - route->ksnr_sharecount++; write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); @@ -464,59 +599,75 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, } void -ksocknal_del_route_locked (ksock_route_t *route, int share, int keep_conn) +ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share) { - ksock_peer_t *peer = route->ksnr_peer; ksock_conn_t *conn; - struct 
list_head *ctmp; - struct list_head *cnxt; + ksock_route_t *route; + struct list_head *tmp; + struct list_head *nxt; + int nshared; - if (!share) - route->ksnr_sharecount = 0; - else { - route->ksnr_sharecount--; - if (route->ksnr_sharecount != 0) - return; - } + LASSERT (!peer->ksnp_closing); - list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); - if (conn->ksnc_route != route) + if (single_share && route->ksnr_share_count == 0) continue; - - if (!keep_conn) { - ksocknal_close_conn_locked (conn, 0); + + /* no match */ + if (!(ip == 0 || route->ksnr_ipaddr == ip)) continue; + + if (!single_share) + route->ksnr_share_count = 0; + else if (route->ksnr_share_count > 0) + route->ksnr_share_count--; + + if (route->ksnr_share_count == 0) { + /* This deletes associated conns too */ + ksocknal_del_route_locked (route); } - /* keeping the conn; just dissociate it and route... 
*/ - conn->ksnc_route = NULL; - ksocknal_put_route (route); /* drop conn's ref on route */ + if (single_share) + break; } - - route->ksnr_deleted = 1; - list_del (&route->ksnr_list); - ksocknal_put_route (route); /* drop peer's ref */ - if (list_empty (&peer->ksnp_routes) && - list_empty (&peer->ksnp_conns)) { - /* I've just removed the last autoconnect route of a peer - * with no active connections */ - ksocknal_unlink_peer_locked (peer); + nshared = 0; + list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + nshared += route->ksnr_share_count; + } + + if (nshared == 0) { + /* remove everything else if there are no explicit entries + * left */ + + list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* we should only be removing auto-entries */ + LASSERT(route->ksnr_share_count == 0); + ksocknal_del_route_locked (route); + } + + list_for_each_safe (tmp, nxt, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + ksocknal_close_conn_locked(conn, 0); + } } + + /* NB peer unlinks itself when last conn/route is removed */ } int -ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, int share, int keep_conn) +ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share) { unsigned long flags; struct list_head *ptmp; struct list_head *pnxt; ksock_peer_t *peer; - struct list_head *rtmp; - struct list_head *rnxt; - ksock_route_t *route; int lo; int hi; int i; @@ -538,22 +689,14 @@ ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, int share, int keep_conn) if (!(nid == PTL_NID_ANY || peer->ksnp_nid == nid)) continue; - list_for_each_safe (rtmp, rnxt, &peer->ksnp_routes) { - route = list_entry (rtmp, ksock_route_t, - ksnr_list); - - if (!(ipaddr == 0 || - route->ksnr_ipaddr == ipaddr)) - continue; + ksocknal_del_peer_locked (peer, ip, single_share); + rc = 0; /* matched! 
*/ - ksocknal_del_route_locked (route, share, keep_conn); - rc = 0; /* matched something */ - if (share) - goto out; - } + if (single_share) + break; } } - out: + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); return (rc); @@ -574,8 +717,7 @@ ksocknal_get_conn_by_idx (int index) list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - LASSERT (!(list_empty (&peer->ksnp_routes) && - list_empty (&peer->ksnp_conns))); + LASSERT (!peer->ksnp_closing); list_for_each (ctmp, &peer->ksnp_conns) { if (index-- > 0) @@ -593,8 +735,8 @@ ksocknal_get_conn_by_idx (int index) return (NULL); } -void -ksocknal_get_peer_addr (ksock_conn_t *conn) +int +ksocknal_get_conn_addrs (ksock_conn_t *conn) { struct sockaddr_in sin; int len = sizeof (sin); @@ -604,24 +746,37 @@ ksocknal_get_peer_addr (ksock_conn_t *conn) (struct sockaddr *)&sin, &len, 2); /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ LASSERT (!conn->ksnc_closing); - LASSERT (len <= sizeof (sin)); if (rc != 0) { CERROR ("Error %d getting sock peer IP\n", rc); - return; + return rc; } conn->ksnc_ipaddr = ntohl (sin.sin_addr.s_addr); conn->ksnc_port = ntohs (sin.sin_port); + + rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, + (struct sockaddr *)&sin, &len, 0); + if (rc != 0) { + CERROR ("Error %d getting sock local IP\n", rc); + return rc; + } + + conn->ksnc_myipaddr = ntohl (sin.sin_addr.s_addr); + + return 0; } unsigned int -ksocknal_conn_irq (ksock_conn_t *conn) +ksocknal_sock_irq (struct socket *sock) { int irq = 0; struct dst_entry *dst; - dst = sk_dst_get (conn->ksnc_sock->sk); + if (!ksocknal_tunables.ksnd_irq_affinity) + return 0; + + dst = sk_dst_get (sock->sk); if (dst != NULL) { if (dst->dev != NULL) { irq = dst->dev->irq; @@ -633,8 +788,6 @@ ksocknal_conn_irq (ksock_conn_t *conn) dst_release (dst); } - /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
*/ - LASSERT (!conn->ksnc_closing); return (irq); } @@ -656,7 +809,7 @@ ksocknal_choose_scheduler_locked (unsigned int irq) /* software NIC (irq == 0) || not associated with a scheduler yet. * Choose the CPU with the fewest connections... */ sched = &ksocknal_data.ksnd_schedulers[0]; - for (i = 1; i < SOCKNAL_N_SCHED; i++) + for (i = 1; i < ksocknal_data.ksnd_nschedulers; i++) if (sched->kss_nconns > ksocknal_data.ksnd_schedulers[i].kss_nconns) sched = &ksocknal_data.ksnd_schedulers[i]; @@ -665,22 +818,286 @@ ksocknal_choose_scheduler_locked (unsigned int irq) info->ksni_valid = 1; info->ksni_sched = sched - ksocknal_data.ksnd_schedulers; - /* no overflow... */ - LASSERT (info->ksni_sched == sched - ksocknal_data.ksnd_schedulers); + /* no overflow... */ + LASSERT (info->ksni_sched == sched - ksocknal_data.ksnd_schedulers); + } + + return (sched); +} + +int +ksocknal_local_ipvec (__u32 *ipaddrs) +{ + int i; + int nip; + + read_lock (&ksocknal_data.ksnd_global_lock); + + nip = ksocknal_data.ksnd_ninterfaces; + for (i = 0; i < nip; i++) { + LASSERT (i < SOCKNAL_MAX_INTERFACES); + + ipaddrs[i] = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr; + LASSERT (ipaddrs[i] != 0); + } + + read_unlock (&ksocknal_data.ksnd_global_lock); + return (nip); +} + +int +ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) +{ + int best_netmatch = 0; + int best_xor = 0; + int best = -1; + int this_xor; + int this_netmatch; + int i; + + for (i = 0; i < nips; i++) { + if (ips[i] == 0) + continue; + + this_xor = (ips[i] ^ iface->ksni_ipaddr); + this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 
1 : 0; + + if (!(best < 0 || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_xor > this_xor))) + continue; + + best = i; + best_netmatch = this_netmatch; + best_xor = this_xor; + } + + LASSERT (best >= 0); + return (best); +} + +int +ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + unsigned long flags; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int n_ips; + int i; + int j; + int k; + __u32 ip; + __u32 xor; + int this_netmatch; + int best_netmatch; + int best_npeers; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness shouldn't matter */ + + /* Also note that I'm not going to return more than n_peerips + * interfaces, even if I have more myself */ + + write_lock_irqsave(global_lock, flags); + + LASSERT (n_peerips <= SOCKNAL_MAX_INTERFACES); + LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES); + + n_ips = MIN(n_peerips, ksocknal_data.ksnd_ninterfaces); + + for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) { + /* ^ yes really... */ + + /* If we have any new interfaces, first tick off all the + * peer IPs that match old interfaces, then choose new + * interfaces to match the remaining peer IPS. + * We don't forget interfaces we've stopped using; we might + * start using them again... */ + + if (i < peer->ksnp_n_passive_ips) { + /* Old interface. 
*/ + ip = peer->ksnp_passive_ips[i]; + best_iface = ksocknal_ip2iface(ip); + + /* peer passive ips are kept up to date */ + LASSERT(best_iface != NULL); + } else { + /* choose a new interface */ + LASSERT (i == peer->ksnp_n_passive_ips); + + best_iface = NULL; + best_netmatch = 0; + best_npeers = 0; + + for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) { + iface = &ksocknal_data.ksnd_interfaces[j]; + ip = iface->ksni_ipaddr; + + for (k = 0; k < peer->ksnp_n_passive_ips; k++) + if (peer->ksnp_passive_ips[k] == ip) + break; + + if (k < peer->ksnp_n_passive_ips) /* using it already */ + continue; + + k = ksocknal_match_peerip(iface, peerips, n_peerips); + xor = (ip ^ peerips[k]); + this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_npeers > iface->ksni_npeers))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_npeers = iface->ksni_npeers; + } + + best_iface->ksni_npeers++; + ip = best_iface->ksni_ipaddr; + peer->ksnp_passive_ips[i] = ip; + peer->ksnp_n_passive_ips = i+1; + } + + LASSERT (best_iface != NULL); + + /* mark the best matching peer IP used */ + j = ksocknal_match_peerip(best_iface, peerips, n_peerips); + peerips[j] = 0; + } + + /* Overwrite input peer IP addresses */ + memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips)); + + write_unlock_irqrestore(global_lock, flags); + + return (n_ips); +} + +void +ksocknal_create_routes(ksock_peer_t *peer, int port, + __u32 *peer_ipaddrs, int npeer_ipaddrs) +{ + ksock_route_t *newroute = NULL; + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + unsigned long flags; + struct list_head *rtmp; + ksock_route_t *route; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int best_netmatch; + int this_netmatch; + int best_nroutes; + int i; + int j; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at 
IRQ priority. We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness here shouldn't matter */ + + write_lock_irqsave(global_lock, flags); + + LASSERT (npeer_ipaddrs <= SOCKNAL_MAX_INTERFACES); + + for (i = 0; i < npeer_ipaddrs; i++) { + if (newroute != NULL) { + newroute->ksnr_ipaddr = peer_ipaddrs[i]; + } else { + write_unlock_irqrestore(global_lock, flags); + + newroute = ksocknal_create_route(peer_ipaddrs[i], port); + if (newroute == NULL) + return; + + write_lock_irqsave(global_lock, flags); + } + + /* Already got a route? */ + route = NULL; + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + best_iface = NULL; + best_nroutes = 0; + best_netmatch = 0; + + LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES); + + /* Select interface to connect from */ + for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) { + iface = &ksocknal_data.ksnd_interfaces[j]; + + /* Using this interface already? */ + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, ksnr_list); + + if (route->ksnr_myipaddr == iface->ksni_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + this_netmatch = (((iface->ksni_ipaddr ^ + newroute->ksnr_ipaddr) & + iface->ksni_netmask) == 0) ? 
1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_nroutes > iface->ksni_nroutes))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_nroutes = iface->ksni_nroutes; + } + + if (best_iface == NULL) + continue; + + newroute->ksnr_myipaddr = best_iface->ksni_ipaddr; + best_iface->ksni_nroutes++; + + ksocknal_add_route_locked(peer, newroute); + newroute = NULL; } - - return (sched); + + write_unlock_irqrestore(global_lock, flags); + if (newroute != NULL) + ksocknal_put_route(newroute); } int -ksocknal_create_conn (ksock_route_t *route, struct socket *sock, - int bind_irq, int type) +ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) { + int passive = (type == SOCKNAL_CONN_NONE); + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + __u32 ipaddrs[SOCKNAL_MAX_INTERFACES]; + int nipaddrs; ptl_nid_t nid; + struct list_head *tmp; __u64 incarnation; unsigned long flags; ksock_conn_t *conn; - ksock_peer_t *peer; + ksock_conn_t *conn2; + ksock_peer_t *peer = NULL; ksock_peer_t *peer2; ksock_sched_t *sched; unsigned int irq; @@ -693,45 +1110,23 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, * it, and sock->file has that pre-cooked... 
*/ LASSERT (sock->file != NULL); LASSERT (file_count(sock->file) > 0); + LASSERT (route == NULL || !passive); rc = ksocknal_setup_sock (sock); if (rc != 0) return (rc); - if (route == NULL) { - /* acceptor or explicit connect */ - nid = PTL_NID_ANY; - } else { - LASSERT (type != SOCKNAL_CONN_NONE); - /* autoconnect: expect this nid on exchange */ - nid = route->ksnr_peer->ksnp_nid; - } - - rc = ksocknal_hello (sock, &nid, &type, &incarnation); - if (rc != 0) - return (rc); - - peer = NULL; - if (route == NULL) { /* not autoconnect */ - /* Assume this socket connects to a brand new peer */ - peer = ksocknal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); - } + irq = ksocknal_sock_irq (sock); PORTAL_ALLOC(conn, sizeof(*conn)); - if (conn == NULL) { - if (peer != NULL) - ksocknal_put_peer (peer); + if (conn == NULL) return (-ENOMEM); - } memset (conn, 0, sizeof (*conn)); conn->ksnc_peer = NULL; conn->ksnc_route = NULL; conn->ksnc_sock = sock; conn->ksnc_type = type; - conn->ksnc_incarnation = incarnation; conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; conn->ksnc_saved_write_space = sock->sk->sk_write_space; atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for me */ @@ -745,73 +1140,147 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, conn->ksnc_tx_scheduled = 0; atomic_set (&conn->ksnc_tx_nob, 0); - ksocknal_get_peer_addr (conn); + /* stash conn's local and remote addrs */ + rc = ksocknal_get_conn_addrs (conn); + if (rc != 0) + goto failed_0; - irq = ksocknal_conn_irq (conn); + if (!passive) { + /* Active connection sends HELLO eagerly */ + rc = ksocknal_local_ipvec(ipaddrs); + if (rc < 0) + goto failed_0; + nipaddrs = rc; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + rc = ksocknal_send_hello (conn, ipaddrs, nipaddrs); + if (rc != 0) + goto failed_0; + } + + /* Find out/confirm peer's NID and connection type and get the + * vector of interfaces she's willing to let me connect to */ + nid = (route == NULL) ? 
PTL_NID_ANY : route->ksnr_peer->ksnp_nid; + rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs); + if (rc < 0) + goto failed_0; + nipaddrs = rc; + LASSERT (nid != PTL_NID_ANY); if (route != NULL) { - /* Autoconnected! */ - LASSERT ((route->ksnr_connected & (1 << type)) == 0); - LASSERT ((route->ksnr_connecting & (1 << type)) != 0); - - if (route->ksnr_deleted) { - /* This conn was autoconnected, but the autoconnect - * route got deleted while it was being - * established! */ - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, - flags); - PORTAL_FREE (conn, sizeof (*conn)); - return (-ESTALE); + peer = route->ksnr_peer; + atomic_inc(&peer->ksnp_refcount); + } else { + peer = ksocknal_create_peer(nid); + if (peer == NULL) { + rc = -ENOMEM; + goto failed_0; } + write_lock_irqsave(global_lock, flags); - /* associate conn/route */ - conn->ksnc_route = route; - atomic_inc (&route->ksnr_refcount); - - route->ksnr_connecting &= ~(1 << type); - route->ksnr_connected |= (1 << type); - route->ksnr_conn_count++; - route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; + peer2 = ksocknal_find_peer_locked(nid); + if (peer2 == NULL) { + /* NB this puts an "empty" peer in the peer + * table (which takes my ref) */ + list_add_tail(&peer->ksnp_list, + ksocknal_nid2peerlist(nid)); + } else { + ksocknal_put_peer(peer); + peer = peer2; + } + /* +1 ref for me */ + atomic_inc(&peer->ksnp_refcount); - peer = route->ksnr_peer; + write_unlock_irqrestore(global_lock, flags); + } + + if (!passive) { + ksocknal_create_routes(peer, conn->ksnc_port, + ipaddrs, nipaddrs); + rc = 0; } else { - /* Not an autoconnected connection; see if there is an - * existing peer for this NID */ - peer2 = ksocknal_find_peer_locked (nid); - if (peer2 != NULL) { - ksocknal_put_peer (peer); - peer = peer2; - } else { - list_add (&peer->ksnp_list, - ksocknal_nid2peerlist (nid)); - /* peer list takes over existing ref */ + rc = ksocknal_select_ips(peer, ipaddrs, nipaddrs); + LASSERT (rc >= 0); 
+ rc = ksocknal_send_hello (conn, ipaddrs, rc); + } + if (rc < 0) + goto failed_1; + + write_lock_irqsave (global_lock, flags); + + if (peer->ksnp_closing || + (route != NULL && route->ksnr_deleted)) { + /* route/peer got closed under me */ + rc = -ESTALE; + goto failed_2; + } + + /* Refuse to duplicate an existing connection (both sides might + * autoconnect at once), unless this is a loopback connection */ + if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { + list_for_each(tmp, &peer->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || + conn2->ksnc_myipaddr != conn->ksnc_myipaddr || + conn2->ksnc_type != conn->ksnc_type || + conn2->ksnc_incarnation != incarnation) + continue; + + CWARN("Not creating duplicate connection to " + "%u.%u.%u.%u type %d\n", + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type); + rc = -EALREADY; + goto failed_2; } } + /* If the connection created by this route didn't bind to the IP + * address the route connected to, the connection/route matching + * code below probably isn't going to work. */ + if (route != NULL && + route->ksnr_ipaddr != conn->ksnc_ipaddr) { + CERROR("Route "LPX64" %u.%u.%u.%u connected to %u.%u.%u.%u\n", + peer->ksnp_nid, + HIPQUAD(route->ksnr_ipaddr), + HIPQUAD(conn->ksnc_ipaddr)); + } + + /* Search for a route corresponding to the new connection and + * create an association. This allows incoming connections created + * by routes in my peer to match my own route entries so I don't + * continually create duplicate routes. 
*/ + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr != conn->ksnc_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + break; + } + /* Give conn a ref on sock->file since we're going to return success */ get_file(sock->file); - LASSERT (!peer->ksnp_closing); - - conn->ksnc_peer = peer; - atomic_inc (&peer->ksnp_refcount); + conn->ksnc_peer = peer; /* conn takes my ref on peer */ + conn->ksnc_incarnation = incarnation; peer->ksnp_last_alive = jiffies; peer->ksnp_error = 0; + sched = ksocknal_choose_scheduler_locked (irq); + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + /* Set the deadline for the outgoing HELLO to drain */ + conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; conn->ksnc_tx_deadline = jiffies + - ksocknal_data.ksnd_io_timeout * HZ; + ksocknal_tunables.ksnd_io_timeout * HZ; + mb(); /* order with adding to peer's conn list */ list_add (&conn->ksnc_list, &peer->ksnp_conns); atomic_inc (&conn->ksnc_refcount); - sched = ksocknal_choose_scheduler_locked (irq); - sched->kss_nconns++; - conn->ksnc_scheduler = sched; - /* NB my callbacks block while I hold ksnd_global_lock */ sock->sk->sk_user_data = conn; sock->sk->sk_data_ready = ksocknal_data_ready; @@ -819,10 +1288,7 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, /* Take all the packets blocking for a connection. * NB, it might be nicer to share these blocked packets among any - * other connections that are becoming established, however that - * confuses the normal packet launching operation, which selects a - * connection and queues the packet on it without needing an - * exclusive lock on ksnd_global_lock. */ + * other connections that are becoming established. 
*/ while (!list_empty (&peer->ksnp_tx_queue)) { tx = list_entry (peer->ksnp_tx_queue.next, ksock_tx_t, tx_list); @@ -831,27 +1297,47 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, ksocknal_queue_tx_locked (tx, conn); } - rc = ksocknal_close_stale_conns_locked (peer, incarnation); - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - + rc = ksocknal_close_stale_conns_locked(peer, incarnation); if (rc != 0) CERROR ("Closed %d stale conns to nid "LPX64" ip %d.%d.%d.%d\n", rc, conn->ksnc_peer->ksnp_nid, HIPQUAD(conn->ksnc_ipaddr)); - if (bind_irq) /* irq binding required */ - ksocknal_bind_irq (irq); + write_unlock_irqrestore (global_lock, flags); + + ksocknal_bind_irq (irq); /* Call the callbacks right now to get things going. */ - ksocknal_data_ready (sock->sk, 0); - ksocknal_write_space (sock->sk); + if (ksocknal_getconnsock(conn) == 0) { + ksocknal_data_ready (sock->sk, 0); + ksocknal_write_space (sock->sk); + ksocknal_putconnsock(conn); + } - CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64" ip %d.%d.%d.%d\n", - conn, conn->ksnc_peer->ksnp_nid, HIPQUAD(conn->ksnc_ipaddr)); + CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d" + " incarnation:"LPX64" sched[%d]/%d\n", + nid, HIPQUAD(conn->ksnc_myipaddr), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation, + (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq); ksocknal_put_conn (conn); return (0); + + failed_2: + if (!peer->ksnp_closing && + list_empty (&peer->ksnp_conns) && + list_empty (&peer->ksnp_routes)) + ksocknal_unlink_peer_locked(peer); + write_unlock_irqrestore(global_lock, flags); + + failed_1: + ksocknal_put_peer (peer); + + failed_0: + PORTAL_FREE (conn, sizeof(*conn)); + + LASSERT (rc != 0); + return (rc); } void @@ -860,14 +1346,19 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) /* This just does the immmediate housekeeping, and queues the * connection for the reaper to terminate. 
* Caller holds ksnd_global_lock exclusively in irq context */ - ksock_peer_t *peer = conn->ksnc_peer; - ksock_route_t *route; + ksock_peer_t *peer = conn->ksnc_peer; + ksock_route_t *route; + ksock_conn_t *conn2; + struct list_head *tmp; LASSERT (peer->ksnp_error == 0); LASSERT (!conn->ksnc_closing); conn->ksnc_closing = 1; atomic_inc (&ksocknal_data.ksnd_nclosing_conns); + /* ksnd_deathrow_conns takes over peer's ref */ + list_del (&conn->ksnc_list); + route = conn->ksnc_route; if (route != NULL) { /* dissociate conn from route... */ @@ -875,18 +1366,28 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) LASSERT ((route->ksnr_connecting & (1 << conn->ksnc_type)) == 0); LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0); - route->ksnr_connected &= ~(1 << conn->ksnc_type); + conn2 = NULL; + list_for_each(tmp, &peer->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_route == route && + conn2->ksnc_type == conn->ksnc_type) + break; + + conn2 = NULL; + } + if (conn2 == NULL) + route->ksnr_connected &= ~(1 << conn->ksnc_type); + conn->ksnc_route = NULL; +#if 0 /* irrelevent with only eager routes */ list_del (&route->ksnr_list); /* make route least favourite */ list_add_tail (&route->ksnr_list, &peer->ksnp_routes); - +#endif ksocknal_put_route (route); /* drop conn's ref on route */ } - /* ksnd_deathrow_conns takes over peer's ref */ - list_del (&conn->ksnc_list); - if (list_empty (&peer->ksnp_conns)) { /* No more connections to this peer */ @@ -1076,6 +1577,11 @@ ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation) if (conn->ksnc_incarnation == incarnation) continue; + + CWARN("Closing stale conn nid:"LPX64" ip:%08x/%d " + "incarnation:"LPX64"("LPX64")\n", + peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_incarnation, incarnation); count++; ksocknal_close_conn_locked (conn, -ESTALE); @@ -1296,44 +1802,213 @@ ksocknal_push (ptl_nid_t nid) } int -ksocknal_cmd(struct 
portals_cfg *pcfg, void * private) +ksocknal_add_interface(__u32 ipaddress, __u32 netmask) +{ + unsigned long flags; + ksock_interface_t *iface; + int rc; + int i; + int j; + struct list_head *ptmp; + ksock_peer_t *peer; + struct list_head *rtmp; + ksock_route_t *route; + + if (ipaddress == 0 || + netmask == 0) + return (-EINVAL); + + write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); + + iface = ksocknal_ip2iface(ipaddress); + if (iface != NULL) { + /* silently ignore dups */ + rc = 0; + } else if (ksocknal_data.ksnd_ninterfaces == SOCKNAL_MAX_INTERFACES) { + rc = -ENOSPC; + } else { + iface = &ksocknal_data.ksnd_interfaces[ksocknal_data.ksnd_ninterfaces++]; + + iface->ksni_ipaddr = ipaddress; + iface->ksni_netmask = netmask; + iface->ksni_nroutes = 0; + iface->ksni_npeers = 0; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + for (j = 0; j < peer->ksnp_n_passive_ips; j++) + if (peer->ksnp_passive_ips[j] == ipaddress) + iface->ksni_npeers++; + + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, ksnr_list); + + if (route->ksnr_myipaddr == ipaddress) + iface->ksni_nroutes++; + } + } + } + + rc = 0; + /* NB only new connections will pay attention to the new interface! 
*/ + } + + write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); + + return (rc); +} + +void +ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr) +{ + struct list_head *tmp; + struct list_head *nxt; + ksock_route_t *route; + ksock_conn_t *conn; + int i; + int j; + + for (i = 0; i < peer->ksnp_n_passive_ips; i++) + if (peer->ksnp_passive_ips[i] == ipaddr) { + for (j = i+1; j < peer->ksnp_n_passive_ips; j++) + peer->ksnp_passive_ips[j-1] = + peer->ksnp_passive_ips[j]; + peer->ksnp_n_passive_ips--; + break; + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_myipaddr != ipaddr) + continue; + + if (route->ksnr_share_count != 0) { + /* Manually created; keep, but unbind */ + route->ksnr_myipaddr = 0; + } else { + ksocknal_del_route_locked(route); + } + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_myipaddr == ipaddr) + ksocknal_close_conn_locked (conn, 0); + } +} + +int +ksocknal_del_interface(__u32 ipaddress) { - int rc = -EINVAL; + int rc = -ENOENT; + unsigned long flags; + struct list_head *tmp; + struct list_head *nxt; + ksock_peer_t *peer; + __u32 this_ip; + int i; + int j; + + write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); + + for (i = ksocknal_data.ksnd_ninterfaces - 1; i >= 0; i--) { + this_ip = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr; + + if (!(ipaddress == 0 || + ipaddress == this_ip)) + continue; + + rc = 0; + + for (j = i+1; j < ksocknal_data.ksnd_ninterfaces; j++) + ksocknal_data.ksnd_interfaces[j-1] = + ksocknal_data.ksnd_interfaces[j]; + + ksocknal_data.ksnd_ninterfaces--; + + for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { + list_for_each_safe(tmp, nxt, &ksocknal_data.ksnd_peers[j]) { + peer = list_entry(tmp, ksock_peer_t, ksnp_list); + + ksocknal_peer_del_interface_locked(peer, this_ip); + } + } + } + + 
write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); + + return (rc); +} - LASSERT (pcfg != NULL); +int +ksocknal_cmd(struct portals_cfg *pcfg, void * private) +{ + int rc; switch(pcfg->pcfg_command) { - case NAL_CMD_GET_AUTOCONN: { - ksock_route_t *route = ksocknal_get_route_by_idx (pcfg->pcfg_count); + case NAL_CMD_GET_INTERFACE: { + ksock_interface_t *iface; + + read_lock (&ksocknal_data.ksnd_global_lock); - if (route == NULL) + if (pcfg->pcfg_count < 0 || + pcfg->pcfg_count >= ksocknal_data.ksnd_ninterfaces) { rc = -ENOENT; - else { + } else { rc = 0; - pcfg->pcfg_nid = route->ksnr_peer->ksnp_nid; - pcfg->pcfg_id = route->ksnr_ipaddr; - pcfg->pcfg_misc = route->ksnr_port; - pcfg->pcfg_count = route->ksnr_conn_count; - pcfg->pcfg_size = route->ksnr_buffer_size; - pcfg->pcfg_wait = route->ksnr_sharecount; - pcfg->pcfg_flags = (route->ksnr_irq_affinity ? 2 : 0) | - (route->ksnr_eager ? 4 : 0); - ksocknal_put_route (route); + iface = &ksocknal_data.ksnd_interfaces[pcfg->pcfg_count]; + + pcfg->pcfg_id = iface->ksni_ipaddr; + pcfg->pcfg_misc = iface->ksni_netmask; + pcfg->pcfg_fd = iface->ksni_npeers; + pcfg->pcfg_count = iface->ksni_nroutes; } + + read_unlock (&ksocknal_data.ksnd_global_lock); + break; + } + case NAL_CMD_ADD_INTERFACE: { + rc = ksocknal_add_interface(pcfg->pcfg_id, /* IP address */ + pcfg->pcfg_misc); /* net mask */ + break; + } + case NAL_CMD_DEL_INTERFACE: { + rc = ksocknal_del_interface(pcfg->pcfg_id); /* IP address */ + break; + } + case NAL_CMD_GET_PEER: { + ptl_nid_t nid = 0; + __u32 myip = 0; + __u32 ip = 0; + int port = 0; + int conn_count = 0; + int share_count = 0; + + rc = ksocknal_get_peer_info(pcfg->pcfg_count, &nid, + &myip, &ip, &port, + &conn_count, &share_count); + pcfg->pcfg_nid = nid; + pcfg->pcfg_size = myip; + pcfg->pcfg_id = ip; + pcfg->pcfg_misc = port; + pcfg->pcfg_count = conn_count; + pcfg->pcfg_wait = share_count; break; } - case NAL_CMD_ADD_AUTOCONN: { - rc = ksocknal_add_route (pcfg->pcfg_nid, pcfg->pcfg_id, 
- pcfg->pcfg_misc, pcfg->pcfg_size, - (pcfg->pcfg_flags & 0x02) != 0, - (pcfg->pcfg_flags & 0x04) != 0, - (pcfg->pcfg_flags & 0x08) != 0); + case NAL_CMD_ADD_PEER: { + rc = ksocknal_add_peer (pcfg->pcfg_nid, + pcfg->pcfg_id, /* IP */ + pcfg->pcfg_misc); /* port */ break; } - case NAL_CMD_DEL_AUTOCONN: { - rc = ksocknal_del_route (pcfg->pcfg_nid, pcfg->pcfg_id, - (pcfg->pcfg_flags & 1) != 0, - (pcfg->pcfg_flags & 2) != 0); + case NAL_CMD_DEL_PEER: { + rc = ksocknal_del_peer (pcfg->pcfg_nid, + pcfg->pcfg_id, /* IP */ + pcfg->pcfg_flags); /* single_share? */ break; } case NAL_CMD_GET_CONN: { @@ -1342,11 +2017,23 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private) if (conn == NULL) rc = -ENOENT; else { + int txmem; + int rxmem; + int nagle; + + ksocknal_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + rc = 0; - pcfg->pcfg_nid = conn->ksnc_peer->ksnp_nid; - pcfg->pcfg_id = conn->ksnc_ipaddr; - pcfg->pcfg_misc = conn->ksnc_port; - pcfg->pcfg_flags = conn->ksnc_type; + pcfg->pcfg_nid = conn->ksnc_peer->ksnp_nid; + pcfg->pcfg_id = conn->ksnc_ipaddr; + pcfg->pcfg_misc = conn->ksnc_port; + pcfg->pcfg_fd = conn->ksnc_myipaddr; + pcfg->pcfg_flags = conn->ksnc_type; + pcfg->pcfg_gw_nal = conn->ksnc_scheduler - + ksocknal_data.ksnd_schedulers; + pcfg->pcfg_count = txmem; + pcfg->pcfg_size = rxmem; + pcfg->pcfg_wait = nagle; ksocknal_put_conn (conn); } break; @@ -1364,12 +2051,13 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private) case SOCKNAL_CONN_CONTROL: case SOCKNAL_CONN_BULK_IN: case SOCKNAL_CONN_BULK_OUT: - rc = ksocknal_create_conn(NULL, sock, pcfg->pcfg_flags, type); + rc = ksocknal_create_conn(NULL, sock, type); + break; default: + rc = -EINVAL; break; } - if (rc != 0) - fput (sock->file); + fput (sock->file); break; } case NAL_CMD_CLOSE_CONNECTION: { @@ -1385,6 +2073,9 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private) rc = ksocknal_push (pcfg->pcfg_nid); break; } + default: + rc = -EINVAL; + break; } return rc; @@ -1424,7 +2115,7 @@ 
ksocknal_free_buffers (void) if (ksocknal_data.ksnd_schedulers != NULL) PORTAL_FREE (ksocknal_data.ksnd_schedulers, - sizeof (ksock_sched_t) * SOCKNAL_N_SCHED); + sizeof (ksock_sched_t) * ksocknal_data.ksnd_nschedulers); PORTAL_FREE (ksocknal_data.ksnd_peers, sizeof (struct list_head) * @@ -1432,37 +2123,39 @@ ksocknal_free_buffers (void) } void -ksocknal_module_fini (void) +ksocknal_api_shutdown (nal_t *nal) { - int i; + ksock_sched_t *sched; + int i; + + if (nal->nal_refct != 0) { + /* This module got the first ref */ + PORTAL_MODULE_UNUSE; + return; + } CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); + LASSERT(nal == &ksocknal_api); + switch (ksocknal_data.ksnd_init) { default: LASSERT (0); case SOCKNAL_INIT_ALL: -#if CONFIG_SYSCTL - if (ksocknal_data.ksnd_sysctl != NULL) - unregister_sysctl_table (ksocknal_data.ksnd_sysctl); -#endif - kportal_nal_unregister(SOCKNAL); - PORTAL_SYMBOL_UNREGISTER (ksocknal_ni); + libcfs_nal_cmd_unregister(SOCKNAL); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; /* fall through */ - case SOCKNAL_INIT_PTL: + case SOCKNAL_INIT_LIB: /* No more calls to ksocknal_cmd() to create new * autoroutes/connections since we're being unloaded. */ - PtlNIFini(ksocknal_ni); - /* Delete all autoroute entries */ - ksocknal_del_route(PTL_NID_ANY, 0, 0, 0); + /* Delete all peers */ + ksocknal_del_peer(PTL_NID_ANY, 0, 0); - /* Delete all connections */ - ksocknal_close_matching_conns (PTL_NID_ANY, 0); - /* Wait for all peer state to clean up */ i = 2; while (atomic_read (&ksocknal_data.ksnd_npeers) != 0) { @@ -1476,11 +2169,11 @@ ksocknal_module_fini (void) /* Tell lib we've stopped calling into her. 
*/ lib_fini(&ksocknal_lib); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; /* fall through */ case SOCKNAL_INIT_DATA: - /* Module refcount only gets to zero when all peers - * have been closed so all lists must be empty */ LASSERT (atomic_read (&ksocknal_data.ksnd_npeers) == 0); LASSERT (ksocknal_data.ksnd_peers != NULL); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { @@ -1493,7 +2186,7 @@ ksocknal_module_fini (void) LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns)); if (ksocknal_data.ksnd_schedulers != NULL) - for (i = 0; i < SOCKNAL_N_SCHED; i++) { + for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; @@ -1510,19 +2203,30 @@ ksocknal_module_fini (void) wake_up_all (&ksocknal_data.ksnd_autoconnectd_waitq); wake_up_all (&ksocknal_data.ksnd_reaper_waitq); - for (i = 0; i < SOCKNAL_N_SCHED; i++) - wake_up_all(&ksocknal_data.ksnd_schedulers[i].kss_waitq); + for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { + sched = &ksocknal_data.ksnd_schedulers[i]; + wake_up_all(&sched->kss_waitq); + } - while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) { - CDEBUG (D_NET, "waitinf for %d threads to terminate\n", - atomic_read (&ksocknal_data.ksnd_nthreads)); + i = 4; + read_lock(&ksocknal_data.ksnd_global_lock); + while (ksocknal_data.ksnd_nthreads != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ + "waiting for %d threads to terminate\n", + ksocknal_data.ksnd_nthreads); + read_unlock(&ksocknal_data.ksnd_global_lock); set_current_state (TASK_UNINTERRUPTIBLE); schedule_timeout (HZ); + read_lock(&ksocknal_data.ksnd_global_lock); } + read_unlock(&ksocknal_data.ksnd_global_lock); kpr_deregister (&ksocknal_data.ksnd_router); ksocknal_free_buffers(); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; /* fall through */ case SOCKNAL_INIT_NOTHING: @@ -1537,7 +2241,7 @@ ksocknal_module_fini (void) } -void __init +void ksocknal_init_incarnation (void) { struct timeval tv; @@ -1553,43 +2257,31 @@ ksocknal_init_incarnation (void) (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; } -int __init -ksocknal_module_init (void) +int +ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) { - int pkmem = atomic_read(&portal_kmemory); - int rc; - int i; - int j; + ptl_process_id_t process_id; + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; + int j; - /* packet descriptor must fit in a router descriptor's scratchpad */ - LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); - /* the following must be sizeof(int) for proc_dointvec() */ - LASSERT(sizeof (ksocknal_data.ksnd_io_timeout) == sizeof (int)); - LASSERT(sizeof (ksocknal_data.ksnd_eager_ack) == sizeof (int)); - /* check ksnr_connected/connecting field large enough */ - LASSERT(SOCKNAL_CONN_NTYPES <= 4); - - LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT (nal == &ksocknal_api); - ksocknal_api.forward = ksocknal_api_forward; - ksocknal_api.shutdown = ksocknal_api_shutdown; - ksocknal_api.yield = ksocknal_api_yield; - ksocknal_api.validate = NULL; /* our api validate is a NOOP */ - ksocknal_api.lock = ksocknal_api_lock; - ksocknal_api.unlock = ksocknal_api_unlock; - ksocknal_api.nal_data = &ksocknal_data; + if (nal->nal_refct != 0) { + if (actual_limits != NULL) + *actual_limits = 
ksocknal_lib.libnal_ni.ni_actual_limits; + /* This module got the first ref */ + PORTAL_MODULE_USE; + return (PTL_OK); + } - ksocknal_lib.nal_data = &ksocknal_data; + LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ - ksocknal_data.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; - ksocknal_data.ksnd_eager_ack = SOCKNAL_EAGER_ACK; - ksocknal_data.ksnd_typed_conns = SOCKNAL_TYPED_CONNS; - ksocknal_data.ksnd_min_bulk = SOCKNAL_MIN_BULK; -#if SOCKNAL_ZC - ksocknal_data.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; -#endif ksocknal_init_incarnation(); ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; @@ -1603,9 +2295,6 @@ ksocknal_module_init (void) rwlock_init(&ksocknal_data.ksnd_global_lock); - ksocknal_data.ksnd_nal_cb = &ksocknal_lib; - spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock); - spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); @@ -1632,14 +2321,15 @@ ksocknal_module_init (void) /* flag lists/ptrs/locks initialised */ ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + ksocknal_data.ksnd_nschedulers = ksocknal_nsched(); PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, - sizeof(ksock_sched_t) * SOCKNAL_N_SCHED); + sizeof(ksock_sched_t) * ksocknal_data.ksnd_nschedulers); if (ksocknal_data.ksnd_schedulers == NULL) { - ksocknal_module_fini (); + ksocknal_api_shutdown (nal); return (-ENOMEM); } - for (i = 0; i < SOCKNAL_N_SCHED; i++) { + for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; spin_lock_init (&kss->kss_lock); @@ -1651,23 +2341,27 @@ ksocknal_module_init (void) init_waitqueue_head (&kss->kss_waitq); } - rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni); - if (rc != 0) { - CERROR("ksocknal: PtlNIInit failed: error %d\n", rc); - ksocknal_module_fini (); + /* NB we have to wait to be told our true 
NID... */ + process_id.pid = requested_pid; + process_id.nid = 0; + + rc = lib_init(&ksocknal_lib, nal, process_id, + requested_limits, actual_limits); + if (rc != PTL_OK) { + CERROR("lib_init failed: error %d\n", rc); + ksocknal_api_shutdown (nal); return (rc); } - PtlNIDebug(ksocknal_ni, ~0); - ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL; // flag PtlNIInit() called + ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; // flag lib_init() called - for (i = 0; i < SOCKNAL_N_SCHED; i++) { + for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { rc = ksocknal_thread_start (ksocknal_scheduler, &ksocknal_data.ksnd_schedulers[i]); if (rc != 0) { CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc); - ksocknal_module_fini (); + ksocknal_api_shutdown (nal); return (rc); } } @@ -1676,7 +2370,7 @@ ksocknal_module_init (void) rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i)); if (rc != 0) { CERROR("Can't spawn socknal autoconnectd: %d\n", rc); - ksocknal_module_fini (); + ksocknal_api_shutdown (nal); return (rc); } } @@ -1684,7 +2378,7 @@ ksocknal_module_init (void) rc = ksocknal_thread_start (ksocknal_reaper, NULL); if (rc != 0) { CERROR ("Can't spawn socknal reaper: %d\n", rc); - ksocknal_module_fini (); + ksocknal_api_shutdown (nal); return (rc); } @@ -1694,7 +2388,7 @@ ksocknal_module_init (void) CDEBUG(D_NET, "Can't initialise routing interface " "(rc = %d): not routing\n", rc); } else { - /* Only allocate forwarding buffers if I'm on a gateway */ + /* Only allocate forwarding buffers if there's a router */ for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++) { @@ -1710,7 +2404,7 @@ ksocknal_module_init (void) PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t, fmb_kiov[pool->fmp_buff_pages])); if (fmb == NULL) { - ksocknal_module_fini(); + ksocknal_api_shutdown(nal); return (-ENOMEM); } @@ -1720,7 +2414,7 @@ ksocknal_module_init (void) fmb->fmb_kiov[j].kiov_page = alloc_page(GFP_KERNEL); if (fmb->fmb_kiov[j].kiov_page == NULL) { - 
ksocknal_module_fini (); + ksocknal_api_shutdown (nal); return (-ENOMEM); } @@ -1731,27 +2425,100 @@ ksocknal_module_init (void) } } - rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL); + rc = libcfs_nal_cmd_register(SOCKNAL, &ksocknal_cmd, NULL); if (rc != 0) { CERROR ("Can't initialise command interface (rc = %d)\n", rc); - ksocknal_module_fini (); + ksocknal_api_shutdown (nal); return (rc); } - PORTAL_SYMBOL_REGISTER(ksocknal_ni); - -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - ksocknal_data.ksnd_sysctl = register_sysctl_table (ksocknal_top_ctl_table, 0); -#endif /* flag everything initialised */ ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; printk(KERN_INFO "Lustre: Routing socket NAL loaded " - "(Routing %s, initial mem %d)\n", + "(Routing %s, initial mem %d, incarnation "LPX64")\n", kpr_routing (&ksocknal_data.ksnd_router) ? - "enabled" : "disabled", pkmem); + "enabled" : "disabled", pkmem, ksocknal_data.ksnd_incarnation); + + return (0); +} + +void __exit +ksocknal_module_fini (void) +{ +#ifdef CONFIG_SYSCTL + if (ksocknal_tunables.ksnd_sysctl != NULL) + unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl); +#endif + PtlNIFini(ksocknal_ni); + + ptl_unregister_nal(SOCKNAL); +} + +int __init +ksocknal_module_init (void) +{ + int rc; + + /* packet descriptor must fit in a router descriptor's scratchpad */ + LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + /* the following must be sizeof(int) for proc_dointvec() */ + LASSERT(sizeof (ksocknal_tunables.ksnd_io_timeout) == sizeof (int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_eager_ack) == sizeof (int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_typed_conns) == sizeof (int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_min_bulk) == sizeof (int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_buffer_size) == sizeof (int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_nagle) == sizeof (int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_idle) == sizeof 
(int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_count) == sizeof (int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_intvl) == sizeof (int)); +#if CPU_AFFINITY + LASSERT(sizeof (ksocknal_tunables.ksnd_irq_affinity) == sizeof (int)); +#endif +#if SOCKNAL_ZC + LASSERT(sizeof (ksocknal_tunables.ksnd_zc_min_frag) == sizeof (int)); +#endif + /* check ksnr_connected/connecting field large enough */ + LASSERT(SOCKNAL_CONN_NTYPES <= 4); + + ksocknal_api.nal_ni_init = ksocknal_api_startup; + ksocknal_api.nal_ni_fini = ksocknal_api_shutdown; + + /* Initialise dynamic tunables to defaults once only */ + ksocknal_tunables.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; + ksocknal_tunables.ksnd_eager_ack = SOCKNAL_EAGER_ACK; + ksocknal_tunables.ksnd_typed_conns = SOCKNAL_TYPED_CONNS; + ksocknal_tunables.ksnd_min_bulk = SOCKNAL_MIN_BULK; + ksocknal_tunables.ksnd_buffer_size = SOCKNAL_BUFFER_SIZE; + ksocknal_tunables.ksnd_nagle = SOCKNAL_NAGLE; + ksocknal_tunables.ksnd_keepalive_idle = SOCKNAL_KEEPALIVE_IDLE; + ksocknal_tunables.ksnd_keepalive_count = SOCKNAL_KEEPALIVE_COUNT; + ksocknal_tunables.ksnd_keepalive_intvl = SOCKNAL_KEEPALIVE_INTVL; +#if CPU_AFFINITY + ksocknal_tunables.ksnd_irq_affinity = SOCKNAL_IRQ_AFFINITY; +#endif +#if SOCKNAL_ZC + ksocknal_tunables.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; +#endif + + rc = ptl_register_nal(SOCKNAL, &ksocknal_api); + if (rc != PTL_OK) { + CERROR("Can't register SOCKNAL: %d\n", rc); + return (-ENOMEM); /* or something... */ + } + /* Pure gateways want the NAL started up at module load time... 
*/ + rc = PtlNIInit(SOCKNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &ksocknal_ni); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { + ptl_unregister_nal(SOCKNAL); + return (-ENODEV); + } + +#ifdef CONFIG_SYSCTL + /* Press on regardless even if registering sysctl doesn't work */ + ksocknal_tunables.ksnd_sysctl = + register_sysctl_table (ksocknal_top_ctl_table, 0); +#endif return (0); } @@ -1762,4 +2529,3 @@ MODULE_LICENSE("GPL"); module_init(ksocknal_module_init); module_exit(ksocknal_module_fini); -EXPORT_SYMBOL (ksocknal_ni); diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h index 2bef800..b8bbefd 100644 --- a/lustre/portals/knals/socknal/socknal.h +++ b/lustre/portals/knals/socknal/socknal.h @@ -67,7 +67,6 @@ #include #include -#define SOCKNAL_N_SCHED ksocknal_nsched() /* # socknal schedulers */ #define SOCKNAL_N_AUTOCONNECTD 4 /* # socknal autoconnect daemons */ #define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ @@ -79,7 +78,12 @@ #define SOCKNAL_TYPED_CONNS 1 /* unidirectional large, bidirectional small? */ #define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ #define SOCKNAL_MIN_BULK (1<<10) /* smallest "large" message */ -#define SOCKNAL_USE_KEEPALIVES 0 /* use tcp/ip keepalive? */ +#define SOCKNAL_BUFFER_SIZE (8<<20) /* default socket buffer size */ +#define SOCKNAL_NAGLE 0 /* enable/disable NAGLE? */ +#define SOCKNAL_IRQ_AFFINITY 1 /* enable/disable IRQ affinity? 
*/ +#define SOCKNAL_KEEPALIVE_IDLE 0 /* # seconds idle before 1st probe */ +#define SOCKNAL_KEEPALIVE_COUNT 10 /* # unanswered probes to determine peer death */ +#define SOCKNAL_KEEPALIVE_INTVL 1 /* seconds between probes */ #define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ @@ -94,9 +98,14 @@ #define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ #define SOCKNAL_ENOMEM_RETRY 1 /* jiffies between retries */ +#define SOCKNAL_MAX_INTERFACES 16 /* Largest number of interfaces we bind */ + +#define SOCKNAL_ROUND_ROBIN 0 /* round robin / load balance */ + #define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10) #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72)) +# define sk_allocation allocation # define sk_data_ready data_ready # define sk_write_space write_space # define sk_user_data user_data @@ -107,6 +116,7 @@ #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) # define sk_wmem_queued wmem_queued +# define sk_err err #endif typedef struct /* pool of forwarding buffers */ @@ -131,34 +141,52 @@ typedef struct /* per scheduler state */ int kss_nconns; /* # connections assigned to this scheduler */ } ksock_sched_t; -typedef struct { +typedef struct +{ int ksni_valid:1; /* been set yet? */ int ksni_bound:1; /* bound to a cpu yet? */ int ksni_sched:6; /* which scheduler (assumes < 64) */ } ksock_irqinfo_t; -typedef struct { - int ksnd_init; /* initialisation state */ +typedef struct +{ + __u32 ksni_ipaddr; /* interface's IP address */ + __u32 ksni_netmask; /* interface's network mask */ + int ksni_nroutes; /* # routes using (active) */ + int ksni_npeers; /* # peers using (passive) */ +} ksock_interface_t; + +typedef struct +{ int ksnd_io_timeout; /* "stuck" socket timeout (seconds) */ int ksnd_eager_ack; /* make TCP ack eagerly? */ int ksnd_typed_conns; /* drive sockets by type? */ int ksnd_min_bulk; /* smallest "large" message */ + int ksnd_buffer_size; /* socket buffer size */ + int ksnd_nagle; /* enable NAGLE? 
*/ + int ksnd_irq_affinity; /* enable IRQ affinity? */ + int ksnd_keepalive_idle; /* # idle secs before 1st probe */ + int ksnd_keepalive_count; /* # probes */ + int ksnd_keepalive_intvl; /* time between probes */ #if SOCKNAL_ZC unsigned int ksnd_zc_min_frag; /* minimum zero copy frag size */ #endif struct ctl_table_header *ksnd_sysctl; /* sysctl interface */ +} ksock_tunables_t; + +typedef struct +{ + int ksnd_init; /* initialisation state */ __u64 ksnd_incarnation; /* my epoch */ rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */ struct list_head *ksnd_peers; /* hash table of all my known peers */ int ksnd_peer_hash_size; /* size of ksnd_peers */ - nal_cb_t *ksnd_nal_cb; - spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ - - atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_nthreads; /* # live threads */ int ksnd_shuttingdown; /* tell threads to exit */ - ksock_sched_t *ksnd_schedulers; /* scheduler state */ + int ksnd_nschedulers; /* # schedulers */ + ksock_sched_t *ksnd_schedulers; /* their state */ atomic_t ksnd_npeers; /* total # peers extant */ atomic_t ksnd_nclosing_conns; /* # closed conns extant */ @@ -186,11 +214,14 @@ typedef struct { spinlock_t ksnd_autoconnectd_lock; /* serialise */ ksock_irqinfo_t ksnd_irqinfo[NR_IRQS];/* irq->scheduler lookup */ + + int ksnd_ninterfaces; + ksock_interface_t ksnd_interfaces[SOCKNAL_MAX_INTERFACES]; /* published interfaces */ } ksock_nal_data_t; #define SOCKNAL_INIT_NOTHING 0 #define SOCKNAL_INIT_DATA 1 -#define SOCKNAL_INIT_PTL 2 +#define SOCKNAL_INIT_LIB 2 #define SOCKNAL_INIT_ALL 3 /* A packet just assembled for transmission is represented by 1 or more @@ -286,6 +317,7 @@ typedef struct ksock_conn void *ksnc_saved_write_space; /* socket's original write_space() callback */ atomic_t ksnc_refcount; /* # users */ ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + __u32 ksnc_myipaddr; /* my IP */ __u32 ksnc_ipaddr; /* peer's IP */ int ksnc_port; /* peer's port */ int ksnc_closing; /* 
being shut down */ @@ -313,6 +345,7 @@ typedef struct ksock_conn struct list_head ksnc_tx_list; /* where I enq waiting for output space */ struct list_head ksnc_tx_queue; /* packets waiting to be sent */ unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */ + int ksnc_tx_bufnob; /* send buffer marker */ atomic_t ksnc_tx_nob; /* # bytes queued */ int ksnc_tx_ready; /* write space */ int ksnc_tx_scheduled; /* being progressed */ @@ -328,17 +361,15 @@ typedef struct ksock_route struct list_head ksnr_connect_list; /* chain on autoconnect list */ struct ksock_peer *ksnr_peer; /* owning peer */ atomic_t ksnr_refcount; /* # users */ - int ksnr_sharecount; /* lconf usage counter */ unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */ unsigned int ksnr_retry_interval; /* how long between retries */ - __u32 ksnr_ipaddr; /* an IP address for this peer */ + __u32 ksnr_myipaddr; /* my IP */ + __u32 ksnr_ipaddr; /* IP address to connect to */ int ksnr_port; /* port to connect to */ - int ksnr_buffer_size; /* size of socket buffers */ - unsigned int ksnr_irq_affinity:1; /* set affinity? */ - unsigned int ksnr_eager:1; /* connect eagery? */ unsigned int ksnr_connecting:4; /* autoconnects in progress by type */ unsigned int ksnr_connected:4; /* connections established by type */ unsigned int ksnr_deleted:1; /* been removed from peer? */ + unsigned int ksnr_share_count; /* created explicitly? 
*/ int ksnr_conn_count; /* # conns established by this route */ } ksock_route_t; @@ -347,31 +378,35 @@ typedef struct ksock_peer struct list_head ksnp_list; /* stash on global peer list */ ptl_nid_t ksnp_nid; /* who's on the other end(s) */ atomic_t ksnp_refcount; /* # users */ + int ksnp_sharecount; /* lconf usage counter */ int ksnp_closing; /* being closed */ int ksnp_error; /* errno on closing last conn */ struct list_head ksnp_conns; /* all active connections */ struct list_head ksnp_routes; /* routes */ struct list_head ksnp_tx_queue; /* waiting packets */ unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */ + int ksnp_n_passive_ips; /* # of... */ + __u32 ksnp_passive_ips[SOCKNAL_MAX_INTERFACES]; /* preferred local interfaces */ } ksock_peer_t; -extern nal_cb_t ksocknal_lib; +extern lib_nal_t ksocknal_lib; extern ksock_nal_data_t ksocknal_data; +extern ksock_tunables_t ksocknal_tunables; static inline struct list_head * -ksocknal_nid2peerlist (ptl_nid_t nid) +ksocknal_nid2peerlist (ptl_nid_t nid) { unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; - + return (&ksocknal_data.ksnd_peers [hash]); } static inline int -ksocknal_getconnsock (ksock_conn_t *conn) +ksocknal_getconnsock (ksock_conn_t *conn) { int rc = -ESHUTDOWN; - + read_lock (&ksocknal_data.ksnd_global_lock); if (!conn->ksnc_closing) { rc = 0; @@ -389,7 +424,7 @@ ksocknal_putconnsock (ksock_conn_t *conn) } #ifndef CONFIG_SMP -static inline +static inline int ksocknal_nsched(void) { return 1; @@ -414,7 +449,7 @@ ksocknal_irqsched2cpu(int i) { return i; } -# else +# else static inline int ksocknal_nsched(void) { @@ -431,16 +466,13 @@ ksocknal_sched2cpu(int i) { if (smp_num_siblings == 1) return i; - + return (i * 2); } static inline int ksocknal_irqsched2cpu(int i) { - if (smp_num_siblings == 1) - return ksocknal_sched2cpu(i); - return (ksocknal_sched2cpu(i) + 1); } # endif @@ -453,7 +485,7 @@ extern ksock_peer_t *ksocknal_get_peer (ptl_nid_t nid); extern 
int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, int single, int keep_conn); extern int ksocknal_create_conn (ksock_route_t *route, - struct socket *sock, int bind_irq, int type); + struct socket *sock, int type); extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); extern void ksocknal_terminate_conn (ksock_conn_t *conn); extern void ksocknal_destroy_conn (ksock_conn_t *conn); @@ -474,6 +506,9 @@ extern void ksocknal_data_ready(struct sock *sk, int n); extern void ksocknal_write_space(struct sock *sk); extern int ksocknal_autoconnectd (void *arg); extern int ksocknal_reaper (void *arg); +extern int ksocknal_get_conn_tunables (ksock_conn_t *conn, int *txmem, + int *rxmem, int *nagle); extern int ksocknal_setup_sock (struct socket *sock); -extern int ksocknal_hello (struct socket *sock, - ptl_nid_t *nid, int *type, __u64 *incarnation); +extern int ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs); +extern int ksocknal_recv_hello (ksock_conn_t *conn, + ptl_nid_t *nid, __u64 *incarnation, __u32 *ipaddrs); diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c index f6ac855..762133e 100644 --- a/lustre/portals/knals/socknal/socknal_cb.c +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -32,87 +32,12 @@ * LIB functions follow * */ -ptl_err_t -ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, - user_ptr src_addr, size_t len) -{ - CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", - nal->ni.nid, (long)len, src_addr, dst_addr); - - memcpy( dst_addr, src_addr, len ); - return PTL_OK; -} - -ptl_err_t -ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, - void *src_addr, size_t len) -{ - CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", - nal->ni.nid, (long)len, src_addr, dst_addr); - - memcpy( dst_addr, src_addr, len ); - return PTL_OK; -} - -void * -ksocknal_malloc(nal_cb_t *nal, size_t len) -{ - void *buf; - - PORTAL_ALLOC(buf, len); - - if (buf != 
NULL) - memset(buf, 0, len); - - return (buf); -} - -void -ksocknal_free(nal_cb_t *nal, void *buf, size_t len) -{ - PORTAL_FREE(buf, len); -} - -void -ksocknal_printf(nal_cb_t *nal, const char *fmt, ...) -{ - va_list ap; - char msg[256]; - - va_start (ap, fmt); - vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ - va_end (ap); - - msg[sizeof (msg) - 1] = 0; /* ensure terminated */ - - CDEBUG (D_NET, "%s", msg); -} - -void -ksocknal_cli(nal_cb_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *data = nal->nal_data; - - /* OK to ignore 'flags'; we're only ever serialise threads and - * never need to lock out interrupts */ - spin_lock(&data->ksnd_nal_cb_lock); -} - -void -ksocknal_sti(nal_cb_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *data; - data = nal->nal_data; - - spin_unlock(&data->ksnd_nal_cb_lock); -} - int -ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +ksocknal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { /* I would guess that if ksocknal_get_peer (nid) == NULL, and we're not routing, then 'nid' is very distant :) */ - if ( nal->ni.nid == nid ) { + if (nal->libnal_ni.ni_pid.nid == nid) { *dist = 0; } else { *dist = 1; @@ -251,7 +176,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (tx->tx_nkiov > 0); #if SOCKNAL_ZC - if (fragsize >= ksocknal_data.ksnd_zc_min_frag && + if (fragsize >= ksocknal_tunables.ksnd_zc_min_frag && (sock->sk->route_caps & NETIF_F_SG) && (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) { @@ -304,6 +229,7 @@ int ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) { int rc; + int bufnob; if (ksocknal_data.ksnd_stall_tx != 0) { set_current_state (TASK_UNINTERRUPTIBLE); @@ -329,6 +255,20 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) rc = ksocknal_send_kiov (conn, tx); } + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + if (rc > 0) /* sent something? 
*/ + conn->ksnc_tx_bufnob += rc; /* account it */ + + if (bufnob < conn->ksnc_tx_bufnob) { + /* allocated send buffer bytes < computed; infer + * something got ACKed */ + conn->ksnc_tx_deadline = jiffies + + ksocknal_tunables.ksnd_io_timeout * HZ; + conn->ksnc_peer->ksnp_last_alive = jiffies; + conn->ksnc_tx_bufnob = bufnob; + mb(); + } + if (rc <= 0) { /* Didn't write anything. * @@ -361,18 +301,10 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) break; } + /* socket's wmem_queued now includes 'rc' bytes */ + atomic_sub (rc, &conn->ksnc_tx_nob); rc = 0; - /* Consider the connection alive since we managed to chuck - * more data into it. Really, we'd like to consider it - * alive only when the peer ACKs something, but - * write_space() only gets called back while SOCK_NOSPACE - * is set. Instead, we presume peer death has occurred if - * the socket doesn't drain within a timout */ - conn->ksnc_tx_deadline = jiffies + - ksocknal_data.ksnd_io_timeout * HZ; - conn->ksnc_peer->ksnp_last_alive = jiffies; - } while (tx->tx_resid != 0); ksocknal_putconnsock (conn); @@ -433,7 +365,7 @@ ksocknal_recv_iov (ksock_conn_t *conn) /* received something... */ conn->ksnc_peer->ksnp_last_alive = jiffies; conn->ksnc_rx_deadline = jiffies + - ksocknal_data.ksnd_io_timeout * HZ; + ksocknal_tunables.ksnd_io_timeout * HZ; mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; @@ -492,7 +424,7 @@ ksocknal_recv_kiov (ksock_conn_t *conn) /* received something... 
*/ conn->ksnc_peer->ksnp_last_alive = jiffies; conn->ksnc_rx_deadline = jiffies + - ksocknal_data.ksnd_io_timeout * HZ; + ksocknal_tunables.ksnd_io_timeout * HZ; mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; @@ -551,7 +483,7 @@ ksocknal_receive (ksock_conn_t *conn) if (conn->ksnc_rx_nob_wanted == 0) { /* Completed a message segment (header or payload) */ - if ((ksocknal_data.ksnd_eager_ack & conn->ksnc_type) != 0 && + if ((ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0 && (conn->ksnc_rx_state == SOCKNAL_RX_BODY || conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) { /* Remind the socket to ack eagerly... */ @@ -594,8 +526,6 @@ ksocknal_tx_done (ksock_tx_t *tx, int asynch) ENTRY; if (tx->tx_conn != NULL) { - /* This tx got queued on a conn; do the accounting... */ - atomic_sub (tx->tx_nob, &tx->tx_conn->ksnc_tx_nob); #if SOCKNAL_ZC /* zero copy completion isn't always from * process_transmit() so it needs to keep a ref on @@ -710,9 +640,9 @@ ksocknal_launch_autoconnect_locked (ksock_route_t *route) LASSERT (!route->ksnr_deleted); LASSERT ((route->ksnr_connected & (1 << SOCKNAL_CONN_ANY)) == 0); LASSERT ((route->ksnr_connected & KSNR_TYPED_ROUTES) != KSNR_TYPED_ROUTES); - LASSERT (!route->ksnr_connecting); + LASSERT (route->ksnr_connecting == 0); - if (ksocknal_data.ksnd_typed_conns) + if (ksocknal_tunables.ksnd_typed_conns) route->ksnr_connecting = KSNR_TYPED_ROUTES & ~route->ksnr_connected; else @@ -772,13 +702,16 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) int tnob = 0; ksock_conn_t *fallback = NULL; int fnob = 0; + ksock_conn_t *conn; - /* Find the conn with the shortest tx queue */ list_for_each (tmp, &peer->ksnp_conns) { ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); +#if SOCKNAL_ROUND_ROBIN + const int nob = 0; +#else int nob = atomic_read(&c->ksnc_tx_nob) + c->ksnc_sock->sk->sk_wmem_queued; - +#endif LASSERT (!c->ksnc_closing); if (fallback == NULL || nob < fnob) { @@ -786,7 +719,7 @@ 
ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) fnob = nob; } - if (!ksocknal_data.ksnd_typed_conns) + if (!ksocknal_tunables.ksnd_typed_conns) continue; switch (c->ksnc_type) { @@ -797,11 +730,11 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) case SOCKNAL_CONN_BULK_IN: continue; case SOCKNAL_CONN_BULK_OUT: - if (tx->tx_nob < ksocknal_data.ksnd_min_bulk) + if (tx->tx_nob < ksocknal_tunables.ksnd_min_bulk) continue; break; case SOCKNAL_CONN_CONTROL: - if (tx->tx_nob >= ksocknal_data.ksnd_min_bulk) + if (tx->tx_nob >= ksocknal_tunables.ksnd_min_bulk) continue; break; } @@ -813,7 +746,16 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) } /* prefer the typed selection */ - return ((typed != NULL) ? typed : fallback); + conn = (typed != NULL) ? typed : fallback; + +#if SOCKNAL_ROUND_ROBIN + if (conn != NULL) { + /* round-robin all else being equal */ + list_del (&conn->ksnc_list); + list_add_tail (&conn->ksnc_list, &peer->ksnp_conns); + } +#endif + return conn; } void @@ -844,9 +786,14 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) #endif spin_lock_irqsave (&sched->kss_lock, flags); - conn->ksnc_tx_deadline = jiffies + - ksocknal_data.ksnd_io_timeout * HZ; - mb(); /* order with list_add_tail */ + if (list_empty(&conn->ksnc_tx_queue) && + conn->ksnc_sock->sk->sk_wmem_queued == 0) { + /* First packet starts the timeout */ + conn->ksnc_tx_deadline = jiffies + + ksocknal_tunables.ksnd_io_timeout * HZ; + conn->ksnc_tx_bufnob = 0; + mb(); /* order with adding to tx_queue */ + } list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); @@ -868,42 +815,32 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) { struct list_head *tmp; ksock_route_t *route; - ksock_route_t *candidate = NULL; - int found = 0; int bits; list_for_each (tmp, &peer->ksnp_routes) { route = list_entry (tmp, ksock_route_t, ksnr_list); bits = route->ksnr_connected; - - if ((bits & KSNR_TYPED_ROUTES) == KSNR_TYPED_ROUTES || - (bits & (1 
<< SOCKNAL_CONN_ANY)) != 0 || - route->ksnr_connecting != 0) { - /* All typed connections have been established, or - * an untyped connection has been established, or - * connections are currently being established */ - found = 1; + + /* All typed connections established? */ + if ((bits & KSNR_TYPED_ROUTES) == KSNR_TYPED_ROUTES) + continue; + + /* Untyped connection established? */ + if ((bits & (1 << SOCKNAL_CONN_ANY)) != 0) + continue; + + /* connection being established? */ + if (route->ksnr_connecting != 0) continue; - } /* too soon to retry this guy? */ if (!time_after_eq (jiffies, route->ksnr_timeout)) continue; - /* always do eager routes */ - if (route->ksnr_eager) - return (route); - - if (candidate == NULL) { - /* If we don't find any other route that is fully - * connected or connecting, the first connectable - * route is returned. If it fails to connect, it - * will get placed at the end of the list */ - candidate = route; - } + return (route); } - - return (found ? NULL : candidate); + + return (NULL); } ksock_route_t * @@ -951,8 +888,9 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) tx->tx_hdr = (ptl_hdr_t *)tx->tx_iov[0].iov_base; g_lock = &ksocknal_data.ksnd_global_lock; +#if !SOCKNAL_ROUND_ROBIN read_lock (g_lock); - + peer = ksocknal_find_target_peer_locked (tx, nid); if (peer == NULL) { read_unlock (g_lock); @@ -969,19 +907,17 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) return (0); } } - - /* Making one or more connections; I'll need a write lock... */ - - atomic_inc (&peer->ksnp_refcount); /* +1 ref for me while I unlock */ + + /* I'll need a write lock... */ read_unlock (g_lock); - write_lock_irqsave (g_lock, flags); - - if (peer->ksnp_closing) { /* peer deleted as I blocked! 
*/ - write_unlock_irqrestore (g_lock, flags); - ksocknal_put_peer (peer); +#endif + write_lock_irqsave(g_lock, flags); + + peer = ksocknal_find_target_peer_locked (tx, nid); + if (peer == NULL) { + write_unlock_irqrestore(g_lock, flags); return (-EHOSTUNREACH); } - ksocknal_put_peer (peer); /* drop ref I got above */ for (;;) { /* launch any/all autoconnections that need it */ @@ -1014,7 +950,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) } ptl_err_t -ksocknal_sendmsg(nal_cb_t *nal, +ksocknal_sendmsg(lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, @@ -1063,7 +999,7 @@ ksocknal_sendmsg(nal_cb_t *nal, if (ltx == NULL) { CERROR("Can't allocate tx desc type %d size %d %s\n", type, desc_size, in_interrupt() ? "(intr)" : ""); - return (PTL_NOSPACE); + return (PTL_NO_SPACE); } atomic_inc(&ksocknal_data.ksnd_nactive_ltxs); @@ -1111,7 +1047,7 @@ ksocknal_sendmsg(nal_cb_t *nal, } ptl_err_t -ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, +ksocknal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, size_t payload_offset, size_t payload_len) @@ -1123,7 +1059,7 @@ ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, } ptl_err_t -ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, +ksocknal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, size_t payload_offset, size_t payload_len) @@ -1145,7 +1081,7 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); /* I'm the gateway; must be the last hop */ - if (nid == ksocknal_lib.ni.nid) + if (nid == ksocknal_lib.libnal_ni.ni_pid.nid) nid = fwd->kprfd_target_nid; /* setup iov for hdr */ @@ -1167,19 +1103,26 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) int 
ksocknal_thread_start (int (*fn)(void *arg), void *arg) { - long pid = kernel_thread (fn, arg, 0); + long pid = kernel_thread (fn, arg, 0); + unsigned long flags; if (pid < 0) return ((int)pid); - atomic_inc (&ksocknal_data.ksnd_nthreads); + write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); + ksocknal_data.ksnd_nthreads++; + write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); return (0); } void ksocknal_thread_fini (void) { - atomic_dec (&ksocknal_data.ksnd_nthreads); + unsigned long flags; + + write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); + ksocknal_data.ksnd_nthreads--; + write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); } void @@ -1197,14 +1140,14 @@ ksocknal_fmb_callback (void *arg, int error) if (error != 0) CERROR("Failed to route packet from " LPX64" %s to "LPX64" %s: %d\n", - NTOH__u64(hdr->src_nid), - portals_nid2str(SOCKNAL, NTOH__u64(hdr->src_nid), ipbuf), - NTOH__u64(hdr->dest_nid), - portals_nid2str(SOCKNAL, NTOH__u64(hdr->dest_nid), ipbuf2), + le64_to_cpu(hdr->src_nid), + portals_nid2str(SOCKNAL, le64_to_cpu(hdr->src_nid), ipbuf), + le64_to_cpu(hdr->dest_nid), + portals_nid2str(SOCKNAL, le64_to_cpu(hdr->dest_nid), ipbuf2), error); else CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n", - NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid)); + le64_to_cpu(hdr->src_nid), le64_to_cpu(hdr->dest_nid)); /* drop peer ref taken on init */ ksocknal_put_peer (fmb->fmb_peer); @@ -1284,7 +1227,7 @@ int ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) { int payload_nob = conn->ksnc_rx_nob_left; - ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); + ptl_nid_t dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid); int niov = 0; int nob = payload_nob; @@ -1321,7 +1264,7 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) if (payload_nob == 0) { /* got complete packet already */ CDEBUG (D_NET, "%p "LPX64"->"LPX64" fwd_start (immediate)\n", - conn, NTOH__u64 (conn->ksnc_hdr.src_nid), 
dest_nid); + conn, le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid); kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); @@ -1342,7 +1285,7 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) memcpy(conn->ksnc_rx_kiov, fmb->fmb_kiov, niov * sizeof(ptl_kiov_t)); CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn, - NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob); + le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid, payload_nob); return (0); } @@ -1350,9 +1293,9 @@ void ksocknal_fwd_parse (ksock_conn_t *conn) { ksock_peer_t *peer; - ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); - ptl_nid_t src_nid = NTOH__u64 (conn->ksnc_hdr.src_nid); - int body_len = NTOH__u32 (conn->ksnc_hdr.payload_length); + ptl_nid_t dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid); + ptl_nid_t src_nid = le64_to_cpu(conn->ksnc_hdr.src_nid); + int body_len = le32_to_cpu(conn->ksnc_hdr.payload_length); char str[PTL_NALFMT_SIZE]; char str2[PTL_NALFMT_SIZE]; @@ -1529,8 +1472,9 @@ ksocknal_process_receive (ksock_conn_t *conn) switch (conn->ksnc_rx_state) { case SOCKNAL_RX_HEADER: - if (conn->ksnc_hdr.type != HTON__u32(PTL_MSG_HELLO) && - NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) { + if (conn->ksnc_hdr.type != cpu_to_le32(PTL_MSG_HELLO) && + le64_to_cpu(conn->ksnc_hdr.dest_nid) != + ksocknal_lib.libnal_ni.ni_pid.nid) { /* This packet isn't for me */ ksocknal_fwd_parse (conn); switch (conn->ksnc_rx_state) { @@ -1547,7 +1491,13 @@ ksocknal_process_receive (ksock_conn_t *conn) } /* sets wanted_len, iovs etc */ - lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); + rc = lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); + + if (rc != PTL_OK) { + /* I just received garbage: give up on this conn */ + ksocknal_close_conn_and_siblings (conn, rc); + return (-EPROTO); + } if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? 
*/ conn->ksnc_rx_state = SOCKNAL_RX_BODY; @@ -1569,8 +1519,8 @@ ksocknal_process_receive (ksock_conn_t *conn) case SOCKNAL_RX_BODY_FWD: /* payload all received */ CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", - conn, NTOH__u64 (conn->ksnc_hdr.src_nid), - NTOH__u64 (conn->ksnc_hdr.dest_nid), + conn, le64_to_cpu(conn->ksnc_hdr.src_nid), + le64_to_cpu(conn->ksnc_hdr.dest_nid), conn->ksnc_rx_nob_left); /* forward the packet. NB ksocknal_init_fmb() put fmb into @@ -1594,7 +1544,7 @@ ksocknal_process_receive (ksock_conn_t *conn) } ptl_err_t -ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, +ksocknal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, struct iovec *iov, size_t offset, size_t mlen, size_t rlen) { @@ -1622,7 +1572,7 @@ ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, } ptl_err_t -ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, +ksocknal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, ptl_kiov_t *kiov, size_t offset, size_t mlen, size_t rlen) { @@ -1649,6 +1599,25 @@ ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, return (PTL_OK); } +static inline int +ksocknal_sched_cansleep(ksock_sched_t *sched) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave(&sched->kss_lock, flags); + + rc = (!ksocknal_data.ksnd_shuttingdown && +#if SOCKNAL_ZC + list_empty(&sched->kss_zctxdone_list) && +#endif + list_empty(&sched->kss_rx_conns) && + list_empty(&sched->kss_tx_conns)); + + spin_unlock_irqrestore(&sched->kss_lock, flags); + return (rc); +} + int ksocknal_scheduler (void *arg) { ksock_sched_t *sched = (ksock_sched_t *)arg; @@ -1665,12 +1634,13 @@ int ksocknal_scheduler (void *arg) kportal_blockallsigs (); #if (CONFIG_SMP && CPU_AFFINITY) + id = ksocknal_sched2cpu(id); if (cpu_online(id)) { cpumask_t m; cpu_set(id, m); set_cpus_allowed(current, m); } else { - CERROR ("Can't set CPU affinity for %s\n", name); + CERROR ("Can't set CPU 
affinity for %s to %d\n", name, id); } #endif /* CONFIG_SMP && CPU_AFFINITY */ @@ -1798,18 +1768,8 @@ int ksocknal_scheduler (void *arg) nloops = 0; if (!did_something) { /* wait for something to do */ -#if SOCKNAL_ZC rc = wait_event_interruptible (sched->kss_waitq, - ksocknal_data.ksnd_shuttingdown || - !list_empty(&sched->kss_rx_conns) || - !list_empty(&sched->kss_tx_conns) || - !list_empty(&sched->kss_zctxdone_list)); -#else - rc = wait_event_interruptible (sched->kss_waitq, - ksocknal_data.ksnd_shuttingdown || - !list_empty(&sched->kss_rx_conns) || - !list_empty(&sched->kss_tx_conns)); -#endif + !ksocknal_sched_cansleep(sched)); LASSERT (rc == 0); } else our_cond_resched(); @@ -1997,133 +1957,245 @@ ksocknal_sock_read (struct socket *sock, void *buffer, int nob) } int -ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, - __u64 *incarnation) +ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs) { - int rc; + /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ + struct socket *sock = conn->ksnc_sock; ptl_hdr_t hdr; ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; - char ipbuf[PTL_NALFMT_SIZE]; - char ipbuf2[PTL_NALFMT_SIZE]; + int i; + int rc; - LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + LASSERT (conn->ksnc_type != SOCKNAL_CONN_NONE); + LASSERT (nipaddrs <= SOCKNAL_MAX_INTERFACES); - memset (&hdr, 0, sizeof (hdr)); - hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); - hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); - hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); + /* No need for getconnsock/putconnsock */ + LASSERT (!conn->ksnc_closing); + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + hmv->magic = cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); - hdr.src_nid = __cpu_to_le64 (ksocknal_lib.ni.nid); - hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + hdr.src_nid 
= cpu_to_le64 (ksocknal_lib.libnal_ni.ni_pid.nid); + hdr.type = cpu_to_le32 (PTL_MSG_HELLO); + hdr.payload_length = cpu_to_le32 (nipaddrs * sizeof(*ipaddrs)); - hdr.msg.hello.type = __cpu_to_le32 (*type); + hdr.msg.hello.type = cpu_to_le32 (conn->ksnc_type); hdr.msg.hello.incarnation = - __cpu_to_le64 (ksocknal_data.ksnd_incarnation); + cpu_to_le64 (ksocknal_data.ksnd_incarnation); - /* Assume sufficient socket buffering for this message */ - rc = ksocknal_sock_write (sock, &hdr, sizeof (hdr)); + /* Receiver is eager */ + rc = ksocknal_sock_write (sock, &hdr, sizeof(hdr)); if (rc != 0) { - CERROR ("Error %d sending HELLO to "LPX64" %s\n", - rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf)); + CERROR ("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n", + rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); return (rc); } + + if (nipaddrs == 0) + return (0); + + for (i = 0; i < nipaddrs; i++) { + ipaddrs[i] = __cpu_to_le32 (ipaddrs[i]); + } + + rc = ksocknal_sock_write (sock, ipaddrs, nipaddrs * sizeof(*ipaddrs)); + if (rc != 0) + CERROR ("Error %d sending HELLO payload (%d)" + " to %u.%u.%u.%u/%d\n", rc, nipaddrs, + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + return (rc); +} + +int +ksocknal_invert_type(int type) +{ + switch (type) + { + case SOCKNAL_CONN_ANY: + case SOCKNAL_CONN_CONTROL: + return (type); + case SOCKNAL_CONN_BULK_IN: + return SOCKNAL_CONN_BULK_OUT; + case SOCKNAL_CONN_BULK_OUT: + return SOCKNAL_CONN_BULK_IN; + default: + return (SOCKNAL_CONN_NONE); + } +} + +int +ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, + __u64 *incarnation, __u32 *ipaddrs) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + int nips; + int i; + int type; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv; + + hmv = (ptl_magicversion_t *)&hdr.dest_nid; + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); rc = ksocknal_sock_read (sock, hmv, sizeof (*hmv)); if (rc != 0) { - CERROR ("Error %d reading HELLO from "LPX64" %s\n", - rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf)); 
+ CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); return (rc); } - if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) { - CERROR ("Bad magic %#08x (%#08x expected) from "LPX64" %s\n", - __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, *nid, - portals_nid2str(SOCKNAL, *nid, ipbuf)); + if (hmv->magic != le32_to_cpu (PORTALS_PROTO_MAGIC)) { + CERROR ("Bad magic %#08x (%#08x expected) from %u.%u.%u.%u\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, + HIPQUAD(conn->ksnc_ipaddr)); return (-EPROTO); } - if (hmv->version_major != __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || - hmv->version_minor != __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { + if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || + hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { CERROR ("Incompatible protocol version %d.%d (%d.%d expected)" - " from "LPX64" %s\n", - __le16_to_cpu (hmv->version_major), - __le16_to_cpu (hmv->version_minor), + " from %u.%u.%u.%u\n", + le16_to_cpu (hmv->version_major), + le16_to_cpu (hmv->version_minor), PORTALS_PROTO_VERSION_MAJOR, PORTALS_PROTO_VERSION_MINOR, - *nid, portals_nid2str(SOCKNAL, *nid, ipbuf)); + HIPQUAD(conn->ksnc_ipaddr)); return (-EPROTO); } -#if (PORTALS_PROTO_VERSION_MAJOR != 0) -# error "This code only understands protocol version 0.x" +#if (PORTALS_PROTO_VERSION_MAJOR != 1) +# error "This code only understands protocol version 1.x" #endif - /* version 0 sends magic/version as the dest_nid of a 'hello' header, - * so read the rest of it in now... */ + /* version 1 sends magic/version as the dest_nid of a 'hello' + * header, followed by payload full of interface IP addresses. + * Read the rest of it in now... 
*/ rc = ksocknal_sock_read (sock, hmv + 1, sizeof (hdr) - sizeof (*hmv)); if (rc != 0) { - CERROR ("Error %d reading rest of HELLO hdr from "LPX64" %s\n", - rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf)); + CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); return (rc); } /* ...and check we got what we expected */ - if (hdr.type != __cpu_to_le32 (PTL_MSG_HELLO) || - hdr.payload_length != __cpu_to_le32 (0)) { - CERROR ("Expecting a HELLO hdr with 0 payload," - " but got type %d with %d payload from "LPX64" %s\n", - __le32_to_cpu (hdr.type), - __le32_to_cpu (hdr.payload_length), *nid, - portals_nid2str(SOCKNAL, *nid, ipbuf)); + if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) { + CERROR ("Expecting a HELLO hdr," + " but got type %d from %u.%u.%u.%u\n", + le32_to_cpu (hdr.type), + HIPQUAD(conn->ksnc_ipaddr)); return (-EPROTO); } - if (__le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) { - CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY\n"); + if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) { + CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY" + "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr)); return (-EPROTO); } if (*nid == PTL_NID_ANY) { /* don't know peer's nid yet */ - *nid = __le64_to_cpu(hdr.src_nid); - } else if (*nid != __le64_to_cpu (hdr.src_nid)) { - CERROR ("Connected to nid "LPX64" %s, but expecting "LPX64" %s\n", - __le64_to_cpu (hdr.src_nid), - portals_nid2str(SOCKNAL, - __le64_to_cpu(hdr.src_nid), - ipbuf), - *nid, portals_nid2str(SOCKNAL, *nid, ipbuf2)); + *nid = le64_to_cpu(hdr.src_nid); + } else if (*nid != le64_to_cpu (hdr.src_nid)) { + CERROR ("Connected to nid "LPX64"@%u.%u.%u.%u " + "but expecting "LPX64"\n", + le64_to_cpu (hdr.src_nid), + HIPQUAD(conn->ksnc_ipaddr), *nid); return (-EPROTO); } - if (*type == SOCKNAL_CONN_NONE) { + type = __le32_to_cpu(hdr.msg.hello.type); + + if (conn->ksnc_type == SOCKNAL_CONN_NONE) { /* I've accepted this connection; peer determines type */ - *type 
= __le32_to_cpu(hdr.msg.hello.type); - switch (*type) { - case SOCKNAL_CONN_ANY: - case SOCKNAL_CONN_CONTROL: - break; - case SOCKNAL_CONN_BULK_IN: - *type = SOCKNAL_CONN_BULK_OUT; - break; - case SOCKNAL_CONN_BULK_OUT: - *type = SOCKNAL_CONN_BULK_IN; - break; - default: - CERROR ("Unexpected type %d from "LPX64" %s\n", - *type, *nid, - portals_nid2str(SOCKNAL, *nid, ipbuf)); + conn->ksnc_type = ksocknal_invert_type(type); + if (conn->ksnc_type == SOCKNAL_CONN_NONE) { + CERROR ("Unexpected type %d from "LPX64"@%u.%u.%u.%u\n", + type, *nid, HIPQUAD(conn->ksnc_ipaddr)); return (-EPROTO); } - } else if (__le32_to_cpu(hdr.msg.hello.type) != SOCKNAL_CONN_NONE) { - CERROR ("Mismatched types: me %d "LPX64" %s %d\n", - *type, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf), - __le32_to_cpu(hdr.msg.hello.type)); + } else if (ksocknal_invert_type(type) != conn->ksnc_type) { + CERROR ("Mismatched types: me %d, "LPX64"@%u.%u.%u.%u %d\n", + conn->ksnc_type, *nid, HIPQUAD(conn->ksnc_ipaddr), + le32_to_cpu(hdr.msg.hello.type)); return (-EPROTO); } - *incarnation = __le64_to_cpu(hdr.msg.hello.incarnation); + *incarnation = le64_to_cpu(hdr.msg.hello.incarnation); - return (0); + nips = __le32_to_cpu (hdr.payload_length) / sizeof (__u32); + + if (nips > SOCKNAL_MAX_INTERFACES || + nips * sizeof(__u32) != __le32_to_cpu (hdr.payload_length)) { + CERROR("Bad payload length %d from "LPX64"@%u.%u.%u.%u\n", + __le32_to_cpu (hdr.payload_length), + *nid, HIPQUAD(conn->ksnc_ipaddr)); + } + + if (nips == 0) + return (0); + + rc = ksocknal_sock_read (sock, ipaddrs, nips * sizeof(*ipaddrs)); + if (rc != 0) { + CERROR ("Error %d reading IPs from "LPX64"@%u.%u.%u.%u\n", + rc, *nid, HIPQUAD(conn->ksnc_ipaddr)); + return (rc); + } + + for (i = 0; i < nips; i++) { + ipaddrs[i] = __le32_to_cpu(ipaddrs[i]); + + if (ipaddrs[i] == 0) { + CERROR("Zero IP[%d] from "LPX64"@%u.%u.%u.%u\n", + i, *nid, HIPQUAD(conn->ksnc_ipaddr)); + return (-EPROTO); + } + } + + return (nips); +} + +int 
+ksocknal_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) +{ + mm_segment_t oldmm = get_fs (); + struct socket *sock = conn->ksnc_sock; + int len; + int rc; + + rc = ksocknal_getconnsock (conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + *txmem = *rxmem = *nagle = 0; + return (-ESHUTDOWN); + } + + set_fs (KERNEL_DS); + + len = sizeof(*txmem); + rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char *)txmem, &len); + if (rc == 0) { + len = sizeof(*rxmem); + rc = sock_getsockopt(sock, SOL_SOCKET, SO_RCVBUF, + (char *)rxmem, &len); + } + if (rc == 0) { + len = sizeof(*nagle); + rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)nagle, &len); + } + + set_fs (oldmm); + ksocknal_putconnsock (conn); + + if (rc == 0) + *nagle = !*nagle; + else + *txmem = *rxmem = *nagle = 0; + + return (rc); } int @@ -2132,13 +2204,13 @@ ksocknal_setup_sock (struct socket *sock) mm_segment_t oldmm = get_fs (); int rc; int option; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; struct linger linger; -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) sock->sk->sk_allocation = GFP_NOFS; -#else - sock->sk->allocation = GFP_NOFS; -#endif /* Ensure this socket aborts active sends immediately when we close * it. */ @@ -2165,55 +2237,95 @@ ksocknal_setup_sock (struct socket *sock) return (rc); } -#if SOCKNAL_USE_KEEPALIVES - /* Keepalives: If 3/4 of the timeout elapses, start probing every - * second until the timeout elapses. 
*/ + if (!ksocknal_tunables.ksnd_nagle) { + option = 1; + + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't disable nagle: %d\n", rc); + return (rc); + } + } + + if (ksocknal_tunables.ksnd_buffer_size > 0) { + option = ksocknal_tunables.ksnd_buffer_size; + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set send buffer %d: %d\n", + option, rc); + return (rc); + } + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set receive buffer %d: %d\n", + option, rc); + return (rc); + } + } + + /* snapshot tunables */ + keep_idle = ksocknal_tunables.ksnd_keepalive_idle; + keep_count = ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl; + + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); - option = (ksocknal_data.ksnd_io_timeout * 3) / 4; + option = (do_keepalive ? 
1 : 0); set_fs (KERNEL_DS); - rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&option, sizeof (option)); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof (option)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc); + CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); return (rc); } - - option = 1; + + if (!do_keepalive) + return (0); + set_fs (KERNEL_DS); - rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&option, sizeof (option)); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&keep_idle, sizeof (keep_idle)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); + CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc); return (rc); } - - option = ksocknal_data.ksnd_io_timeout / 4; + set_fs (KERNEL_DS); - rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT, - (char *)&option, sizeof (option)); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&keep_intvl, sizeof (keep_intvl)); set_fs (oldmm); if (rc != 0) { CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); return (rc); } - option = 1; set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&option, sizeof (option)); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT, + (char *)&keep_count, sizeof (keep_count)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); + CERROR ("Can't set TCP_KEEPCNT: %d\n", rc); return (rc); } -#endif + return (0); } static int -ksocknal_connect_sock(struct socket **sockp, int *may_retry, +ksocknal_connect_sock(struct socket **sockp, int *may_retry, ksock_route_t *route, int local_port) { struct sockaddr_in locaddr; @@ -2227,7 +2339,9 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry, memset(&locaddr, 0, sizeof(locaddr)); locaddr.sin_family = AF_INET; locaddr.sin_port = htons(local_port); - locaddr.sin_addr.s_addr = INADDR_ANY; + locaddr.sin_addr.s_addr = + 
(route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr) + : INADDR_ANY; memset (&srvaddr, 0, sizeof (srvaddr)); srvaddr.sin_family = AF_INET; @@ -2266,7 +2380,7 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry, /* Set the socket timeouts, so our connection attempt completes in * finite time */ - tv.tv_sec = ksocknal_data.ksnd_io_timeout; + tv.tv_sec = ksocknal_tunables.ksnd_io_timeout; tv.tv_usec = 0; set_fs (KERNEL_DS); @@ -2274,8 +2388,8 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry, (char *)&tv, sizeof (tv)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set send timeout %d: %d\n", - ksocknal_data.ksnd_io_timeout, rc); + CERROR ("Can't set send timeout %d: %d\n", + ksocknal_tunables.ksnd_io_timeout, rc); goto failed; } @@ -2285,12 +2399,12 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry, set_fs (oldmm); if (rc != 0) { CERROR ("Can't set receive timeout %d: %d\n", - ksocknal_data.ksnd_io_timeout, rc); + ksocknal_tunables.ksnd_io_timeout, rc); goto failed; } - option = 1; set_fs (KERNEL_DS); + option = 1; rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&option, sizeof (option)); set_fs (oldmm); @@ -2298,29 +2412,6 @@ ksocknal_connect_sock(struct socket **sockp, int *may_retry, CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); goto failed; } - - if (route->ksnr_buffer_size != 0) { - option = route->ksnr_buffer_size; - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set send buffer %d: %d\n", - route->ksnr_buffer_size, rc); - goto failed; - } - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set receive buffer %d: %d\n", - route->ksnr_buffer_size, rc); - goto failed; - } - } rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr, sizeof(locaddr)); @@ -2348,7 +2439,8 @@ 
ksocknal_connect_sock(struct socket **sockp, int *may_retry, *may_retry = (rc == -EADDRNOTAVAIL); CDEBUG(*may_retry ? D_NET : D_ERROR, - "Error %d connecting to %u.%u.%u.%u/%d\n", rc, + "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, + HIPQUAD(route->ksnr_myipaddr), local_port, HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); failed: @@ -2374,12 +2466,11 @@ ksocknal_connect_peer (ksock_route_t *route, int type) rc = ksocknal_connect_sock(&sock, &may_retry, route, port); if (rc == 0) { - rc = ksocknal_create_conn(route, sock, - route->ksnr_irq_affinity, type); + rc = ksocknal_create_conn(route, sock, type); fput(sock->file); return rc; } - + if (!may_retry) return rc; } @@ -2405,7 +2496,6 @@ ksocknal_autoconnect (ksock_route_t *route) LASSERT (type < SOCKNAL_CONN_NTYPES); rc = ksocknal_connect_peer (route, type); - if (rc != 0) break; @@ -2445,12 +2535,13 @@ ksocknal_autoconnect (ksock_route_t *route) } while (!list_empty (&peer->ksnp_tx_queue)); } - /* make this route least-favourite for re-selection */ +#if 0 /* irrelevent with only eager routes */ if (!route->ksnr_deleted) { + /* make this route least-favourite for re-selection */ list_del(&route->ksnr_list); list_add_tail(&route->ksnr_list, &peer->ksnp_routes); } - +#endif write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); while (!list_empty (&zombies)) { @@ -2459,15 +2550,15 @@ ksocknal_autoconnect (ksock_route_t *route) tx = list_entry (zombies.next, ksock_tx_t, tx_list); CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n", - NTOH__u32 (tx->tx_hdr->type), - NTOH__u32 (tx->tx_hdr->payload_length), - NTOH__u64 (tx->tx_hdr->src_nid), + le32_to_cpu (tx->tx_hdr->type), + le32_to_cpu (tx->tx_hdr->payload_length), + le64_to_cpu (tx->tx_hdr->src_nid), portals_nid2str(SOCKNAL, - NTOH__u64(tx->tx_hdr->src_nid), + le64_to_cpu(tx->tx_hdr->src_nid), ipbuf), - NTOH__u64 (tx->tx_hdr->dest_nid), + le64_to_cpu (tx->tx_hdr->dest_nid), portals_nid2str(SOCKNAL, - 
NTOH__u64(tx->tx_hdr->src_nid), + le64_to_cpu(tx->tx_hdr->src_nid), ipbuf2)); list_del (&tx->tx_list); @@ -2496,24 +2587,26 @@ ksocknal_autoconnectd (void *arg) if (!list_empty (&ksocknal_data.ksnd_autoconnectd_routes)) { route = list_entry (ksocknal_data.ksnd_autoconnectd_routes.next, ksock_route_t, ksnr_connect_list); - + list_del (&route->ksnr_connect_list); spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); ksocknal_autoconnect (route); ksocknal_put_route (route); - spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); + spin_lock_irqsave(&ksocknal_data.ksnd_autoconnectd_lock, + flags); continue; } - - spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); - rc = wait_event_interruptible (ksocknal_data.ksnd_autoconnectd_waitq, - ksocknal_data.ksnd_shuttingdown || - !list_empty (&ksocknal_data.ksnd_autoconnectd_routes)); + spin_unlock_irqrestore(&ksocknal_data.ksnd_autoconnectd_lock, + flags); - spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); + rc = wait_event_interruptible(ksocknal_data.ksnd_autoconnectd_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&ksocknal_data.ksnd_autoconnectd_routes)); + + spin_lock_irqsave(&ksocknal_data.ksnd_autoconnectd_lock, flags); } spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); @@ -2528,32 +2621,39 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer) /* We're called with a shared lock on ksnd_global_lock */ ksock_conn_t *conn; struct list_head *ctmp; - ksock_sched_t *sched; list_for_each (ctmp, &peer->ksnp_conns) { conn = list_entry (ctmp, ksock_conn_t, ksnc_list); - sched = conn->ksnc_scheduler; /* Don't need the {get,put}connsock dance to deref ksnc_sock... */ LASSERT (!conn->ksnc_closing); - + + if (conn->ksnc_sock->sk->sk_err != 0) { + /* Something (e.g. 
failed keepalive) set the socket error */ + atomic_inc (&conn->ksnc_refcount); + CERROR ("Socket error %d: "LPX64" %p %d.%d.%d.%d\n", + conn->ksnc_sock->sk->sk_err, peer->ksnp_nid, + conn, HIPQUAD(conn->ksnc_ipaddr)); + return (conn); + } + if (conn->ksnc_rx_started && time_after_eq (jiffies, conn->ksnc_rx_deadline)) { /* Timed out incomplete incoming message */ atomic_inc (&conn->ksnc_refcount); CERROR ("Timed out RX from "LPX64" %p %d.%d.%d.%d\n", - peer->ksnp_nid, conn, HIPQUAD(conn->ksnc_ipaddr)); + peer->ksnp_nid,conn,HIPQUAD(conn->ksnc_ipaddr)); return (conn); } - + if ((!list_empty (&conn->ksnc_tx_queue) || conn->ksnc_sock->sk->sk_wmem_queued != 0) && time_after_eq (jiffies, conn->ksnc_tx_deadline)) { - /* Timed out messages queued for sending, or - * messages buffered in the socket's send buffer */ + /* Timed out messages queued for sending or + * buffered in the socket's send buffer */ atomic_inc (&conn->ksnc_refcount); - CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n", - peer->ksnp_nid, + CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n", + peer->ksnp_nid, list_empty (&conn->ksnc_tx_queue) ? "" : "Q ", conn->ksnc_sock->sk->sk_wmem_queued, conn, HIPQUAD(conn->ksnc_ipaddr)); @@ -2693,9 +2793,9 @@ ksocknal_reaper (void *arg) * timeout on any connection within (n+1)/n times the * timeout interval. 
*/ - if (ksocknal_data.ksnd_io_timeout > n * p) + if (ksocknal_tunables.ksnd_io_timeout > n * p) chunk = (chunk * n * p) / - ksocknal_data.ksnd_io_timeout; + ksocknal_tunables.ksnd_io_timeout; if (chunk == 0) chunk = 1; @@ -2716,8 +2816,8 @@ ksocknal_reaper (void *arg) } ksocknal_data.ksnd_reaper_waketime = jiffies + timeout; - add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); if (!ksocknal_data.ksnd_shuttingdown && list_empty (&ksocknal_data.ksnd_deathrow_conns) && @@ -2736,18 +2836,11 @@ ksocknal_reaper (void *arg) return (0); } -nal_cb_t ksocknal_lib = { - nal_data: &ksocknal_data, /* NAL private data */ - cb_send: ksocknal_send, - cb_send_pages: ksocknal_send_pages, - cb_recv: ksocknal_recv, - cb_recv_pages: ksocknal_recv_pages, - cb_read: ksocknal_read, - cb_write: ksocknal_write, - cb_malloc: ksocknal_malloc, - cb_free: ksocknal_free, - cb_printf: ksocknal_printf, - cb_cli: ksocknal_cli, - cb_sti: ksocknal_sti, - cb_dist: ksocknal_dist +lib_nal_t ksocknal_lib = { + libnal_data: &ksocknal_data, /* NAL private data */ + libnal_send: ksocknal_send, + libnal_send_pages: ksocknal_send_pages, + libnal_recv: ksocknal_recv, + libnal_recv_pages: ksocknal_recv_pages, + libnal_dist: ksocknal_dist }; diff --git a/lustre/portals/libcfs/.cvsignore b/lustre/portals/libcfs/.cvsignore index df12db6..c6f0aa4 100644 --- a/lustre/portals/libcfs/.cvsignore +++ b/lustre/portals/libcfs/.cvsignore @@ -4,7 +4,6 @@ link-stamp .*.cmd autoMakefile.in autoMakefile -sources *.ko *.mod.c .*.flags diff --git a/lustre/portals/libcfs/Makefile.in b/lustre/portals/libcfs/Makefile.in index 598adc1..0967123 100644 --- a/lustre/portals/libcfs/Makefile.in +++ b/lustre/portals/libcfs/Makefile.in @@ -1,9 +1,4 @@ -MODULES = portals +MODULES = libcfs libcfs-objs := debug.o lwt.o module.o proc.o tracefile.o -api-sources := $(wildcard @LUSTRE@/portals/portals/api-*.c) -lib-sources := $(wildcard 
@LUSTRE@/portals/portals/lib-*.c) - -portals-objs += $(libcfs-objs) $(patsubst %.c,%.o,$(notdir $(api-sources) $(lib-sources))) - @INCLUDE_RULES@ diff --git a/lustre/portals/libcfs/autoMakefile.am b/lustre/portals/libcfs/autoMakefile.am index cacd769..9c27693 100644 --- a/lustre/portals/libcfs/autoMakefile.am +++ b/lustre/portals/libcfs/autoMakefile.am @@ -4,17 +4,8 @@ # See the file COPYING in this distribution if MODULES -modulenet_DATA := portals$(KMODEXT) +modulenet_DATA := libcfs$(KMODEXT) endif -sources: - rm -f sources - @for i in $(api-sources) $(lib-sources) ; do \ - echo ln -sf $$i . ; \ - ln -sf $$i . || exit 1 ; \ - done - touch sources - MOSTLYCLEANFILES = *.o *.ko *.mod.c -CLEANFILES = sources lib-*.c api-*.c -DIST_SOURCES = $(libcfs-objs:%.o=%.c) *.h +DIST_SOURCES = $(libcfs-objs:%.o=%.c) tracefile.h diff --git a/lustre/portals/libcfs/debug.c b/lustre/portals/libcfs/debug.c index 3e5531a..f571958 100644 --- a/lustre/portals/libcfs/debug.c +++ b/lustre/portals/libcfs/debug.c @@ -191,7 +191,8 @@ int portals_debug_mark_buffer(char *text) void portals_debug_set_level(unsigned int debug_level) { - printk("Lustre: Setting portals debug level to %08x\n", debug_level); + printk(KERN_WARNING "Lustre: Setting portals debug level to %08x\n", + debug_level); portal_debug = debug_level; } @@ -250,31 +251,47 @@ void portals_run_lbug_upcall(char *file, const char *fn, const int line) char *portals_nid2str(int nal, ptl_nid_t nid, char *str) { + if (nid == PTL_NID_ANY) { + snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY"); + return str; + } + switch(nal){ /* XXX this could be a nal method of some sort, 'cept it's config * dependent whether (say) socknal NIDs are actually IP addresses... 
*/ -#ifndef CRAY_PORTALS +#if !CRAY_PORTALS case TCPNAL: /* userspace NAL */ + case IIBNAL: + case OPENIBNAL: case SOCKNAL: - snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u", + snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u", (__u32)(nid >> 32), HIPQUAD(nid)); break; case QSWNAL: case GMNAL: - case IBNAL: - case SCIMACNAL: - snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u", + snprintf(str, PTL_NALFMT_SIZE, "%u:%u", (__u32)(nid >> 32), (__u32)nid); break; #endif default: - snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx", + snprintf(str, PTL_NALFMT_SIZE, "?%x? %llx", nal, (long long)nid); + break; } return str; } +char *portals_id2str(int nal, ptl_process_id_t id, char *str) +{ + int len; + + portals_nid2str(nal, id.nid, str); + len = strlen(str); + snprintf(str + len, PTL_NALFMT_SIZE, "-%u", id.pid); + return str; +} + #ifdef __KERNEL__ char stack_backtrace[LUSTRE_TRACE_SIZE]; spinlock_t stack_backtrace_lock = SPIN_LOCK_UNLOCKED; @@ -350,7 +367,9 @@ out: char *portals_debug_dumpstack(void) { - return "dump_stack\n"; + char *buf = stack_backtrace; + buf[0] = '\0'; + return buf; } #endif /* __arch_um__ */ @@ -370,3 +389,4 @@ EXPORT_SYMBOL(portals_debug_set_level); EXPORT_SYMBOL(portals_run_upcall); EXPORT_SYMBOL(portals_run_lbug_upcall); EXPORT_SYMBOL(portals_nid2str); +EXPORT_SYMBOL(portals_id2str); diff --git a/lustre/portals/libcfs/lwt.c b/lustre/portals/libcfs/lwt.c index a24423e..3f6a9c2 100644 --- a/lustre/portals/libcfs/lwt.c +++ b/lustre/portals/libcfs/lwt.c @@ -45,9 +45,12 @@ #if LWT_SUPPORT +#if !KLWT_SUPPORT int lwt_enabled; +lwt_cpu_t lwt_cpus[NR_CPUS]; +#endif + int lwt_pages_per_cpu; -lwt_cpu_t lwt_cpus[LWT_MAX_CPUS]; /* NB only root is allowed to retrieve LWT info; it's an open door into the * kernel... 
*/ @@ -97,23 +100,35 @@ lwt_control (int enable, int clear) if (!capable(CAP_SYS_ADMIN)) return (-EPERM); - if (clear) - for (i = 0; i < num_online_cpus(); i++) { - p = lwt_cpus[i].lwtc_current_page; + if (!enable) { + LWT_EVENT(0,0,0,0); + lwt_enabled = 0; + mb(); + /* give people some time to stop adding traces */ + schedule_timeout(10); + } - for (j = 0; j < lwt_pages_per_cpu; j++) { - memset (p->lwtp_events, 0, PAGE_SIZE); + for (i = 0; i < num_online_cpus(); i++) { + p = lwt_cpus[i].lwtc_current_page; - p = list_entry (p->lwtp_list.next, - lwt_page_t, lwtp_list); - } + if (p == NULL) + return (-ENODATA); + + if (!clear) + continue; + + for (j = 0; j < lwt_pages_per_cpu; j++) { + memset (p->lwtp_events, 0, PAGE_SIZE); + + p = list_entry (p->lwtp_list.next, + lwt_page_t, lwtp_list); + } } - lwt_enabled = enable; - mb(); - if (!enable) { - /* give people some time to stop adding traces */ - schedule_timeout(10); + if (enable) { + lwt_enabled = 1; + mb(); + LWT_EVENT(0,0,0,0); } return (0); @@ -141,6 +156,9 @@ lwt_snapshot (cycles_t *now, int *ncpu, int *total_size, for (i = 0; i < num_online_cpus(); i++) { p = lwt_cpus[i].lwtc_current_page; + + if (p == NULL) + return (-ENODATA); for (j = 0; j < lwt_pages_per_cpu; j++) { if (copy_to_user(user_ptr, p->lwtp_events, @@ -162,11 +180,12 @@ lwt_init () { int i; int j; + + for (i = 0; i < num_online_cpus(); i++) + if (lwt_cpus[i].lwtc_current_page != NULL) + return (-EALREADY); - if (num_online_cpus() > LWT_MAX_CPUS) { - CERROR ("Too many CPUs\n"); - return (-EINVAL); - } + LASSERT (!lwt_enabled); /* NULL pointers, zero scalars */ memset (lwt_cpus, 0, sizeof (lwt_cpus)); @@ -207,6 +226,8 @@ lwt_init () lwt_enabled = 1; mb(); + LWT_EVENT(0,0,0,0); + return (0); } @@ -214,10 +235,9 @@ void lwt_fini () { int i; - - if (num_online_cpus() > LWT_MAX_CPUS) - return; + lwt_control(0, 0); + for (i = 0; i < num_online_cpus(); i++) while (lwt_cpus[i].lwtc_current_page != NULL) { lwt_page_t *lwtp = lwt_cpus[i].lwtc_current_page; 
diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c index f1d086b..a2422e3 100644 --- a/lustre/portals/libcfs/module.c +++ b/lustre/portals/libcfs/module.c @@ -47,19 +47,17 @@ #include #include #include -#include #include #define PORTAL_MINOR 240 -extern void (kping_client)(struct portal_ioctl_data *); - struct nal_cmd_handler { - nal_cmd_handler_t nch_handler; - void * nch_private; + int nch_number; + nal_cmd_handler_fn *nch_handler; + void *nch_private; }; -static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1]; +static struct nal_cmd_handler nal_cmd[16]; static DECLARE_MUTEX(nal_cmd_sem); #ifdef PORTAL_DEBUG @@ -204,7 +202,7 @@ kportal_blockallsigs () } /* called when opening /dev/device */ -static int kportal_psdev_open(struct inode * inode, struct file * file) +static int libcfs_psdev_open(struct inode * inode, struct file * file) { struct portals_device_userstate *pdu; ENTRY; @@ -225,7 +223,7 @@ static int kportal_psdev_open(struct inode * inode, struct file * file) } /* called when closing /dev/device */ -static int kportal_psdev_release(struct inode * inode, struct file * file) +static int libcfs_psdev_release(struct inode * inode, struct file * file) { struct portals_device_userstate *pdu; ENTRY; @@ -248,265 +246,139 @@ static inline void freedata(void *data, int len) PORTAL_FREE(data, len); } -static int -kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, - ptl_nid_t lo_nid, ptl_nid_t hi_nid) +struct nal_cmd_handler * +libcfs_find_nal_cmd_handler(int nal) { - int rc; - kpr_control_interface_t *ci; - - ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET (kpr_control_interface); - if (ci == NULL) - return (-ENODEV); + int i; - rc = ci->kprci_add_route (gateway_nalid, gateway_nid, lo_nid, hi_nid); + for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++) + if (nal_cmd[i].nch_handler != NULL && + nal_cmd[i].nch_number == nal) + return (&nal_cmd[i]); - PORTAL_SYMBOL_PUT(kpr_control_interface); - return (rc); + return (NULL); } 
-static int -kportal_del_route(int gw_nalid, ptl_nid_t gw_nid, - ptl_nid_t lo, ptl_nid_t hi) -{ - int rc; - kpr_control_interface_t *ci; - - ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface); - if (ci == NULL) - return (-ENODEV); - - rc = ci->kprci_del_route (gw_nalid, gw_nid, lo, hi); - - PORTAL_SYMBOL_PUT(kpr_control_interface); - return (rc); -} - -static int -kportal_notify_router (int gw_nalid, ptl_nid_t gw_nid, - int alive, time_t when) +int +libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *private) { - int rc; - kpr_control_interface_t *ci; - - /* No error if router not preset. Sysadmin is allowed to notify - * _everywhere_ when a NID boots or crashes, even if they know - * nothing of the peer. */ - ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface); - if (ci == NULL) - return (0); + struct nal_cmd_handler *cmd; + int i; + int rc; - rc = ci->kprci_notify (gw_nalid, gw_nid, alive, when); - - PORTAL_SYMBOL_PUT(kpr_control_interface); - return (rc); -} - -static int -kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, - ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp, int *alivep) -{ - int gateway_nalid; - ptl_nid_t gateway_nid; - ptl_nid_t lo_nid; - ptl_nid_t hi_nid; - int alive; - int rc; - kpr_control_interface_t *ci; + CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler); - ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET(kpr_control_interface); - if (ci == NULL) - return (-ENODEV); + down(&nal_cmd_sem); - rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, - &lo_nid, &hi_nid, &alive); + if (libcfs_find_nal_cmd_handler(nal) != NULL) { + up (&nal_cmd_sem); + return (-EBUSY); + } - if (rc == 0) { - CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64", %s\n", - index, gateway_nalid, gateway_nid, lo_nid, hi_nid, - alive ? 
"up" : "down"); - - *gateway_nalidp = (__u32)gateway_nalid; - *gateway_nidp = gateway_nid; - *lo_nidp = lo_nid; - *hi_nidp = hi_nid; - *alivep = alive; + cmd = NULL; + for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++) + if (nal_cmd[i].nch_handler == NULL) { + cmd = &nal_cmd[i]; + break; + } + + if (cmd == NULL) { + rc = -EBUSY; + } else { + rc = 0; + cmd->nch_number = nal; + cmd->nch_handler = handler; + cmd->nch_private = private; } - PORTAL_SYMBOL_PUT (kpr_control_interface); - return (rc); + up(&nal_cmd_sem); + + return rc; } +EXPORT_SYMBOL(libcfs_nal_cmd_register); -static int -kportal_router_cmd(struct portals_cfg *pcfg, void * private) +void +libcfs_nal_cmd_unregister(int nal) { - int err = -EINVAL; - ENTRY; - - switch(pcfg->pcfg_command) { - default: - CDEBUG(D_IOCTL, "Inappropriate cmd: %d\n", pcfg->pcfg_command); - break; - - case NAL_CMD_ADD_ROUTE: - CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n", - pcfg->pcfg_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - err = kportal_add_route(pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - break; + struct nal_cmd_handler *cmd; - case NAL_CMD_DEL_ROUTE: - CDEBUG (D_IOCTL, "Removing routes via [%d] "LPU64" : "LPU64" - "LPU64"\n", - pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - err = kportal_del_route (pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - break; + CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal); - case NAL_CMD_NOTIFY_ROUTER: { - CDEBUG (D_IOCTL, "Notifying peer [%d] "LPU64" %s @ %ld\n", - pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_flags ? 
"Enabling" : "Disabling", - (time_t)pcfg->pcfg_nid3); - - err = kportal_notify_router (pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_flags, - (time_t)pcfg->pcfg_nid3); - break; - } - - case NAL_CMD_GET_ROUTE: - CDEBUG (D_IOCTL, "Getting route [%d]\n", pcfg->pcfg_count); - err = kportal_get_route(pcfg->pcfg_count, &pcfg->pcfg_gw_nal, - &pcfg->pcfg_nid, - &pcfg->pcfg_nid2, &pcfg->pcfg_nid3, - &pcfg->pcfg_flags); - break; - } - RETURN(err); + down(&nal_cmd_sem); + cmd = libcfs_find_nal_cmd_handler(nal); + LASSERT (cmd != NULL); + cmd->nch_handler = NULL; + cmd->nch_private = NULL; + up(&nal_cmd_sem); } +EXPORT_SYMBOL(libcfs_nal_cmd_unregister); int -kportal_nal_cmd(struct portals_cfg *pcfg) +libcfs_nal_cmd(struct portals_cfg *pcfg) { + struct nal_cmd_handler *cmd; __u32 nal = pcfg->pcfg_nal; - int rc = -EINVAL; - + int rc = -EINVAL; ENTRY; down(&nal_cmd_sem); - if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) { + cmd = libcfs_find_nal_cmd_handler(nal); + if (cmd != NULL) { CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, pcfg->pcfg_command); - rc = nal_cmd[nal].nch_handler(pcfg, nal_cmd[nal].nch_private); + rc = cmd->nch_handler(pcfg, cmd->nch_private); } else { CERROR("invalid nal: %d, cmd: %d\n", nal, pcfg->pcfg_command); } up(&nal_cmd_sem); - RETURN(rc); -} - -ptl_handle_ni_t * -kportal_get_ni (int nal) -{ - switch (nal) - { - case QSWNAL: - return (PORTAL_SYMBOL_GET(kqswnal_ni)); - case SOCKNAL: - return (PORTAL_SYMBOL_GET(ksocknal_ni)); - case GMNAL: - return (PORTAL_SYMBOL_GET(kgmnal_ni)); - case IBNAL: - return (PORTAL_SYMBOL_GET(kibnal_ni)); - case TCPNAL: - /* userspace NAL */ - return (NULL); - case SCIMACNAL: - return (PORTAL_SYMBOL_GET(kscimacnal_ni)); - default: - /* A warning to a naive caller */ - CERROR ("unknown nal: %d\n", nal); - return (NULL); - } + RETURN(rc); } +EXPORT_SYMBOL(libcfs_nal_cmd); -void -kportal_put_ni (int nal) -{ +static DECLARE_RWSEM(ioctl_list_sem); +static LIST_HEAD(ioctl_list); - switch (nal) - { - case 
QSWNAL: - PORTAL_SYMBOL_PUT(kqswnal_ni); - break; - case SOCKNAL: - PORTAL_SYMBOL_PUT(ksocknal_ni); - break; - case GMNAL: - PORTAL_SYMBOL_PUT(kgmnal_ni); - break; - case IBNAL: - PORTAL_SYMBOL_PUT(kibnal_ni); - break; - case TCPNAL: - /* A lesson to a malicious caller */ - LBUG (); - case SCIMACNAL: - PORTAL_SYMBOL_PUT(kscimacnal_ni); - break; - default: - CERROR ("unknown nal: %d\n", nal); - } -} - -int -kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private) +int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand) { int rc = 0; + down_read(&ioctl_list_sem); + if (!list_empty(&hand->item)) + rc = -EBUSY; + up_read(&ioctl_list_sem); - CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler); - - if (nal > 0 && nal <= NAL_MAX_NR) { - down(&nal_cmd_sem); - if (nal_cmd[nal].nch_handler != NULL) - rc = -EBUSY; - else { - nal_cmd[nal].nch_handler = handler; - nal_cmd[nal].nch_private = private; - } - up(&nal_cmd_sem); + if (rc == 0) { + down_write(&ioctl_list_sem); + list_add_tail(&hand->item, &ioctl_list); + up_write(&ioctl_list_sem); } - return rc; + RETURN(0); } +EXPORT_SYMBOL(libcfs_register_ioctl); -int -kportal_nal_unregister(int nal) +int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand) { int rc = 0; + down_read(&ioctl_list_sem); + if (list_empty(&hand->item)) + rc = -ENOENT; + up_read(&ioctl_list_sem); - CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal); - - if (nal > 0 && nal <= NAL_MAX_NR) { - down(&nal_cmd_sem); - nal_cmd[nal].nch_handler = NULL; - nal_cmd[nal].nch_private = NULL; - up(&nal_cmd_sem); + if (rc == 0) { + down_write(&ioctl_list_sem); + list_del_init(&hand->item); + up_write(&ioctl_list_sem); } - return rc; + RETURN(0); } +EXPORT_SYMBOL(libcfs_deregister_ioctl); - -static int kportal_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +static int libcfs_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) { - int err = 0; + int err = -EINVAL; char 
buf[1024]; struct portal_ioctl_data *data; - char str[PTL_NALFMT_SIZE]; - ENTRY; if (current->fsuid != 0) @@ -542,101 +414,67 @@ static int kportal_ioctl(struct inode *inode, struct file *file, RETURN(-EINVAL); portals_debug_mark_buffer(data->ioc_inlbuf1); RETURN(0); - case IOC_PORTAL_PING: { - void (*ping)(struct portal_ioctl_data *); - - CDEBUG(D_IOCTL, "doing %d pings to nid "LPX64" (%s)\n", - data->ioc_count, data->ioc_nid, - portals_nid2str(data->ioc_nal, data->ioc_nid, str)); - ping = PORTAL_SYMBOL_GET(kping_client); - if (!ping) - CERROR("PORTAL_SYMBOL_GET failed\n"); - else { - ping(data); - PORTAL_SYMBOL_PUT(kping_client); - } - RETURN(0); - } - - case IOC_PORTAL_GET_NID: { - const ptl_handle_ni_t *nip; - ptl_process_id_t pid; +#if LWT_SUPPORT + case IOC_PORTAL_LWT_CONTROL: + err = lwt_control (data->ioc_flags, data->ioc_misc); + break; - CDEBUG (D_IOCTL, "Getting nid for nal [%d]\n", data->ioc_nal); + case IOC_PORTAL_LWT_SNAPSHOT: { + cycles_t now; + int ncpu; + int total_size; - nip = kportal_get_ni (data->ioc_nal); - if (nip == NULL) - RETURN (-EINVAL); + err = lwt_snapshot (&now, &ncpu, &total_size, + data->ioc_pbuf1, data->ioc_plen1); + data->ioc_nid = now; + data->ioc_count = ncpu; + data->ioc_misc = total_size; - err = PtlGetId (*nip, &pid); - LASSERT (err == PTL_OK); - kportal_put_ni (data->ioc_nal); + /* Hedge against broken user/kernel typedefs (e.g. 
cycles_t) */ + data->ioc_nid2 = sizeof(lwt_event_t); + data->ioc_nid3 = offsetof(lwt_event_t, lwte_where); - data->ioc_nid = pid.nid; - if (copy_to_user ((char *)arg, data, sizeof (*data))) + if (err == 0 && + copy_to_user((char *)arg, data, sizeof (*data))) err = -EFAULT; break; } + case IOC_PORTAL_LWT_LOOKUP_STRING: + err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, + data->ioc_pbuf2, data->ioc_plen2); + if (err == 0 && + copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; +#endif case IOC_PORTAL_NAL_CMD: { struct portals_cfg pcfg; - LASSERT (data->ioc_plen1 == sizeof(pcfg)); - err = copy_from_user(&pcfg, (void *)data->ioc_pbuf1, - sizeof(pcfg)); - if ( err ) { - EXIT; - return err; + if (data->ioc_plen1 != sizeof(pcfg)) { + CERROR("Bad ioc_plen1 %d (wanted %d)\n", + data->ioc_plen1, sizeof(pcfg)); + err = -EINVAL; + break; } - CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", pcfg.pcfg_nal, - pcfg.pcfg_command); - err = kportal_nal_cmd(&pcfg); - if (err == 0) { - if (copy_to_user((char *)data->ioc_pbuf1, &pcfg, - sizeof (pcfg))) - err = -EFAULT; - if (copy_to_user((char *)arg, data, sizeof (*data))) - err = -EFAULT; + if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, + sizeof(pcfg))) { + err = -EFAULT; + break; } - break; - } - case IOC_PORTAL_FAIL_NID: { - const ptl_handle_ni_t *nip; - CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n", - data->ioc_nal, data->ioc_nid, data->ioc_count); - - nip = kportal_get_ni (data->ioc_nal); - if (nip == NULL) - return (-EINVAL); + CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", pcfg.pcfg_nal, + pcfg.pcfg_command); + err = libcfs_nal_cmd(&pcfg); - err = PtlFailNid (*nip, data->ioc_nid, data->ioc_count); - kportal_put_ni (data->ioc_nal); - break; - } -#if LWT_SUPPORT - case IOC_PORTAL_LWT_CONTROL: - err = lwt_control (data->ioc_flags, data->ioc_misc); - break; - - case IOC_PORTAL_LWT_SNAPSHOT: - err = lwt_snapshot (&data->ioc_nid, - &data->ioc_count, &data->ioc_misc, - data->ioc_pbuf1, 
data->ioc_plen1); - if (err == 0 && - copy_to_user((char *)arg, data, sizeof (*data))) - err = -EFAULT; - break; - - case IOC_PORTAL_LWT_LOOKUP_STRING: - err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, - data->ioc_pbuf2, data->ioc_plen2); if (err == 0 && - copy_to_user((char *)arg, data, sizeof (*data))) + copy_to_user((char *)data->ioc_pbuf1, &pcfg, + sizeof (pcfg))) err = -EFAULT; break; -#endif + } + case IOC_PORTAL_MEMHOG: if (!capable (CAP_SYS_ADMIN)) err = -EPERM; @@ -652,26 +490,34 @@ static int kportal_ioctl(struct inode *inode, struct file *file, } break; - default: + default: { + struct libcfs_ioctl_handler *hand; err = -EINVAL; - break; + down_read(&ioctl_list_sem); + list_for_each_entry(hand, &ioctl_list, item) { + err = hand->handle_ioctl(data, cmd, arg); + if (err != -EINVAL) + break; + } + up_read(&ioctl_list_sem); + } break; } RETURN(err); } -static struct file_operations portalsdev_fops = { - ioctl: kportal_ioctl, - open: kportal_psdev_open, - release: kportal_psdev_release +static struct file_operations libcfs_fops = { + ioctl: libcfs_ioctl, + open: libcfs_psdev_open, + release: libcfs_psdev_release }; -static struct miscdevice portal_dev = { +static struct miscdevice libcfs_dev = { PORTAL_MINOR, "portals", - &portalsdev_fops + &libcfs_fops }; extern int insert_proc(void); @@ -680,7 +526,7 @@ MODULE_AUTHOR("Peter J. 
Braam "); MODULE_DESCRIPTION("Portals v3.1"); MODULE_LICENSE("GPL"); -static int init_kportals_module(void) +static int init_libcfs_module(void) { int rc; @@ -697,41 +543,23 @@ static int init_kportals_module(void) goto cleanup_debug; } #endif - sema_init(&nal_cmd_sem, 1); - - rc = misc_register(&portal_dev); + rc = misc_register(&libcfs_dev); if (rc) { CERROR("misc_register: error %d\n", rc); goto cleanup_lwt; } - rc = PtlInit(); - if (rc) { - CERROR("PtlInit: error %d\n", rc); - goto cleanup_deregister; - } - rc = insert_proc(); if (rc) { CERROR("insert_proc: error %d\n", rc); - goto cleanup_fini; - } - - rc = kportal_nal_register(ROUTER, kportal_router_cmd, NULL); - if (rc) { - CERROR("kportal_nal_registre: ROUTER error %d\n", rc); - goto cleanup_proc; + goto cleanup_deregister; } CDEBUG (D_OTHER, "portals setup OK\n"); return (0); - cleanup_proc: - remove_proc(); - cleanup_fini: - PtlFini(); cleanup_deregister: - misc_deregister(&portal_dev); + misc_deregister(&libcfs_dev); cleanup_lwt: #if LWT_SUPPORT lwt_fini(); @@ -741,18 +569,16 @@ static int init_kportals_module(void) return rc; } -static void exit_kportals_module(void) +static void exit_libcfs_module(void) { int rc; - kportal_nal_unregister(ROUTER); remove_proc(); - PtlFini(); CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", atomic_read(&portal_kmemory)); - rc = misc_deregister(&portal_dev); + rc = misc_deregister(&libcfs_dev); if (rc) CERROR("misc_deregister error %d\n", rc); @@ -769,48 +595,9 @@ static void exit_kportals_module(void) printk(KERN_ERR "LustreError: portals_debug_cleanup: %d\n", rc); } -EXPORT_SYMBOL(lib_dispatch); -EXPORT_SYMBOL(PtlMEAttach); -EXPORT_SYMBOL(PtlMEInsert); -EXPORT_SYMBOL(PtlMEUnlink); -EXPORT_SYMBOL(PtlEQAlloc); -EXPORT_SYMBOL(PtlMDAttach); -EXPORT_SYMBOL(PtlMDUnlink); -EXPORT_SYMBOL(PtlNIInit); -EXPORT_SYMBOL(PtlNIFini); -EXPORT_SYMBOL(PtlNIDebug); -EXPORT_SYMBOL(PtlInit); -EXPORT_SYMBOL(PtlFini); -EXPORT_SYMBOL(PtlPut); -EXPORT_SYMBOL(PtlGet); 
-EXPORT_SYMBOL(ptl_err_str); -EXPORT_SYMBOL(PtlEQWait); -EXPORT_SYMBOL(PtlEQFree); -EXPORT_SYMBOL(PtlEQGet); -EXPORT_SYMBOL(PtlGetId); -EXPORT_SYMBOL(PtlMDBind); -EXPORT_SYMBOL(lib_iov_nob); -EXPORT_SYMBOL(lib_copy_iov2buf); -EXPORT_SYMBOL(lib_copy_buf2iov); -EXPORT_SYMBOL(lib_extract_iov); -EXPORT_SYMBOL(lib_kiov_nob); -EXPORT_SYMBOL(lib_copy_kiov2buf); -EXPORT_SYMBOL(lib_copy_buf2kiov); -EXPORT_SYMBOL(lib_extract_kiov); -EXPORT_SYMBOL(lib_finalize); -EXPORT_SYMBOL(lib_parse); -EXPORT_SYMBOL(lib_fake_reply_msg); -EXPORT_SYMBOL(lib_init); -EXPORT_SYMBOL(lib_fini); -EXPORT_SYMBOL(dispatch_name); EXPORT_SYMBOL(kportal_daemonize); EXPORT_SYMBOL(kportal_blockallsigs); -EXPORT_SYMBOL(kportal_nal_register); -EXPORT_SYMBOL(kportal_nal_unregister); EXPORT_SYMBOL(kportal_assertion_failed); -EXPORT_SYMBOL(kportal_get_ni); -EXPORT_SYMBOL(kportal_put_ni); -EXPORT_SYMBOL(kportal_nal_cmd); -module_init(init_kportals_module); -module_exit (exit_kportals_module); +module_init(init_libcfs_module); +module_exit(exit_libcfs_module); diff --git a/lustre/portals/libcfs/proc.c b/lustre/portals/libcfs/proc.c index 4b39902..08446a0 100644 --- a/lustre/portals/libcfs/proc.c +++ b/lustre/portals/libcfs/proc.c @@ -62,16 +62,18 @@ extern char debug_file_path[1024]; extern char portals_upcall[1024]; #define PSDEV_PORTALS (0x100) -#define PSDEV_DEBUG 1 /* control debugging */ -#define PSDEV_SUBSYSTEM_DEBUG 2 /* control debugging */ -#define PSDEV_PRINTK 3 /* force all errors to console */ -#define PSDEV_CONSOLE 4 /* allow _any_ messages to console */ -#define PSDEV_DEBUG_PATH 5 /* crashdump log location */ -#define PSDEV_DEBUG_DUMP_PATH 6 /* crashdump tracelog location */ -#define PSDEV_PORTALS_UPCALL 7 /* User mode upcall script */ - -#define PORTALS_PRIMARY_CTLCNT 7 -static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = { +enum { + PSDEV_DEBUG = 1, /* control debugging */ + PSDEV_SUBSYSTEM_DEBUG, /* control debugging */ + PSDEV_PRINTK, /* force all errors to console */ + 
PSDEV_CONSOLE, /* allow _any_ messages to console */ + PSDEV_DEBUG_PATH, /* crashdump log location */ + PSDEV_DEBUG_DUMP_PATH, /* crashdump tracelog location */ + PSDEV_PORTALS_UPCALL, /* User mode upcall script */ + PSDEV_PORTALS_MEMUSED, /* bytes currently PORTAL_ALLOCated */ +}; + +static struct ctl_table portals_table[] = { {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL, &proc_dointvec}, {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug, @@ -83,6 +85,8 @@ static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = { {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall, sizeof(portals_upcall), 0644, NULL, &proc_dostring, &sysctl_string}, + {PSDEV_PORTALS_MEMUSED, "memused", (int *)&portal_kmemory.counter, + sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; diff --git a/lustre/portals/portals/Makefile.in b/lustre/portals/portals/Makefile.in index 71067ac..c0f2e71 100644 --- a/lustre/portals/portals/Makefile.in +++ b/lustre/portals/portals/Makefile.in @@ -1,6 +1,6 @@ -#MODULES := portals -#portals-objs := api-eq.o api-init.o api-me.o api-errno.o api-ni.o api-wrap.o -#portals-objs += lib-dispatch.o lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o -#portals-objs += lib-move.o lib-ni.o lib-pid.o +MODULES := portals +portals-objs := api-errno.o api-ni.o api-wrap.o +portals-objs += lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o +portals-objs += lib-move.o lib-ni.o lib-pid.o module.o @INCLUDE_RULES@ diff --git a/lustre/portals/portals/Makefile.mk b/lustre/portals/portals/Makefile.mk index de01765..088902a 100644 --- a/lustre/portals/portals/Makefile.mk +++ b/lustre/portals/portals/Makefile.mk @@ -6,7 +6,7 @@ include $(src)/../Kernelenv obj-y += portals.o -portals-objs := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o \ +portals-objs := lib-eq.o lib-init.o lib-md.o lib-me.o \ lib-move.o lib-msg.o lib-ni.o lib-pid.o \ - api-eq.o api-errno.o api-init.o api-me.o api-ni.o \ - api-wrap.o module.o + api-errno.o api-ni.o api-wrap.o \ + 
module.o diff --git a/lustre/portals/portals/api-eq.c b/lustre/portals/portals/api-eq.c deleted file mode 100644 index 964b9d8..0000000 --- a/lustre/portals/portals/api-eq.c +++ /dev/null @@ -1,202 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-eq.c - * User-level event queue management routines - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include - -int ptl_eq_init(void) -{ - /* Nothing to do anymore... */ - return PTL_OK; -} - -void ptl_eq_fini(void) -{ - /* Nothing to do anymore... */ -} - -int ptl_eq_ni_init(nal_t * nal) -{ - /* Nothing to do anymore... */ - return PTL_OK; -} - -void ptl_eq_ni_fini(nal_t * nal) -{ - /* Nothing to do anymore... 
*/ -} - -int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev) -{ - ptl_eq_t *eq; - int rc, new_index; - unsigned long flags; - ptl_event_t *new_event; - nal_t *nal; - ENTRY; - - if (!ptl_init) - RETURN(PTL_NOINIT); - - nal = ptl_hndl2nal(&eventq); - if (!nal) - RETURN(PTL_INV_EQ); - - eq = ptl_handle2usereq(&eventq); - nal->lock(nal, &flags); - - /* size must be a power of 2 to handle a wrapped sequence # */ - LASSERT (eq->size != 0 && - eq->size == LOWEST_BIT_SET (eq->size)); - - new_index = eq->sequence & (eq->size - 1); - new_event = &eq->base[new_index]; - CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n", - new_event, eq->sequence, eq->size); - if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) { - nal->unlock(nal, &flags); - RETURN(PTL_EQ_EMPTY); - } - - *ev = *new_event; - - /* ensure event is delivered correctly despite possible - races with lib_finalize */ - if (eq->sequence != new_event->sequence) { - CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n", - eq->sequence, new_event->sequence); - rc = PTL_EQ_DROPPED; - } else { - rc = PTL_OK; - } - - eq->sequence = new_event->sequence + 1; - nal->unlock(nal, &flags); - RETURN(rc); -} - - -int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) -{ - int rc; - - /* PtlEQGet does the handle checking */ - while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) { - nal_t *nal = ptl_hndl2nal(&eventq_in); - - if (nal->yield) - nal->yield(nal); - } - - return rc; -} - -#ifndef __KERNEL__ -#if 0 -static jmp_buf eq_jumpbuf; - -static void eq_timeout(int signal) -{ - sigset_t set; - - /* signal will be automatically disabled in sig handler, - * must enable it before long jump - */ - sigemptyset(&set); - sigaddset(&set, SIGALRM); - sigprocmask(SIG_UNBLOCK, &set, NULL); - - longjmp(eq_jumpbuf, -1); -} - -int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, - int timeout) -{ - static void (*prev) (int) = NULL; - static int left_over; - time_t time_at_start; - int rc; - - if 
(setjmp(eq_jumpbuf)) { - signal(SIGALRM, prev); - alarm(left_over - timeout); - return PTL_EQ_EMPTY; - } - - left_over = alarm(timeout); - prev = signal(SIGALRM, eq_timeout); - time_at_start = time(NULL); - if (left_over && left_over < timeout) - alarm(left_over); - - rc = PtlEQWait(eventq_in, event_out); - - signal(SIGALRM, prev); - alarm(left_over); /* Should compute how long we waited */ - - return rc; -} -#else -#include - -/* FIXME - * Here timeout need a trick with tcpnal, definitely unclean but OK for - * this moment. - */ - -/* global variables defined by tcpnal */ -extern int __tcpnal_eqwait_timeout_value; -extern int __tcpnal_eqwait_timedout; - -int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, - int timeout) -{ - int rc; - - if (!timeout) - return PtlEQWait(eventq_in, event_out); - - __tcpnal_eqwait_timeout_value = timeout; - - while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) { - nal_t *nal = ptl_hndl2nal(&eventq_in); - - if (nal->yield) - nal->yield(nal); - - if (__tcpnal_eqwait_timedout) { - if (__tcpnal_eqwait_timedout != ETIMEDOUT) - printf("Warning: yield return error %d\n", - __tcpnal_eqwait_timedout); - rc = PTL_EQ_EMPTY; - break; - } - } - - __tcpnal_eqwait_timeout_value = 0; - - return rc; -} -#endif -#endif /* __KERNEL__ */ diff --git a/lustre/portals/portals/api-errno.c b/lustre/portals/portals/api-errno.c index b5e7aa1..9a4e5ac 100644 --- a/lustre/portals/portals/api-errno.c +++ b/lustre/portals/portals/api-errno.c @@ -12,43 +12,37 @@ const char *ptl_err_str[] = { "PTL_OK", "PTL_SEGV", - "PTL_NOSPACE", - "PTL_INUSE", + "PTL_NO_SPACE", + "PTL_ME_IN_USE", "PTL_VAL_FAILED", "PTL_NAL_FAILED", - "PTL_NOINIT", - "PTL_INIT_DUP", - "PTL_INIT_INV", - "PTL_AC_INV_INDEX", - - "PTL_INV_ASIZE", - "PTL_INV_HANDLE", - "PTL_INV_MD", - "PTL_INV_ME", - "PTL_INV_NI", + "PTL_NO_INIT", + "PTL_IFACE_DUP", + "PTL_IFACE_INVALID", + + "PTL_HANDLE_INVALID", + "PTL_MD_INVALID", + "PTL_ME_INVALID", /* If you change these, you must 
update the number table in portals/errno.h */ - "PTL_ILL_MD", - "PTL_INV_PROC", - "PTL_INV_PSIZE", - "PTL_INV_PTINDEX", - "PTL_INV_REG", - - "PTL_INV_SR_INDX", - "PTL_ML_TOOLONG", - "PTL_ADDR_UNKNOWN", - "PTL_INV_EQ", + "PTL_PROCESS_INVALID", + "PTL_PT_INDEX_INVALID", + + "PTL_SR_INDEX_INVALID", + "PTL_EQ_INVALID", "PTL_EQ_DROPPED", "PTL_EQ_EMPTY", - "PTL_NOUPDATE", + "PTL_MD_NO_UPDATE", "PTL_FAIL", - "PTL_NOT_IMPLEMENTED", - "PTL_NO_ACK", - "PTL_IOV_TOO_MANY", - "PTL_IOV_TOO_SMALL", + "PTL_IOV_INVALID", + + "PTL_EQ_IN_USE", + + "PTL_NI_INVALID", + "PTL_MD_ILLEGAL", - "PTL_EQ_INUSE", + "PTL_MAX_ERRNO" }; /* If you change these, you must update the number table in portals/errno.h */ diff --git a/lustre/portals/portals/api-init.c b/lustre/portals/portals/api-init.c deleted file mode 100644 index 0a64864..0000000 --- a/lustre/portals/portals/api-init.c +++ /dev/null @@ -1,63 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-init.c - * Initialization and global data for the p30 user side library - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include - -int ptl_init; - -int __p30_initialized; -int __p30_myr_initialized; -int __p30_ip_initialized; -ptl_handle_ni_t __myr_ni_handle; -ptl_handle_ni_t __ip_ni_handle; - -int __p30_myr_timeout = 10; -int __p30_ip_timeout; - -int PtlInit(void) -{ - - if (ptl_init) - return PTL_OK; - - ptl_ni_init(); - ptl_me_init(); - ptl_eq_init(); - ptl_init = 1; - __p30_initialized = 1; - - return PTL_OK; -} - - -void PtlFini(void) -{ - - /* Reverse order of initialization */ - ptl_eq_fini(); - ptl_me_fini(); - ptl_ni_fini(); - ptl_init = 0; -} diff --git a/lustre/portals/portals/api-me.c b/lustre/portals/portals/api-me.c deleted file mode 100644 index e724e58..0000000 --- a/lustre/portals/portals/api-me.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-me.c - * Match Entry local operations. - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include - -int ptl_me_init(void) -{ - return PTL_OK; -} -void ptl_me_fini(void) -{ /* Nothing to do */ -} -int ptl_me_ni_init(nal_t * nal) -{ - return PTL_OK; -} - -void ptl_me_ni_fini(nal_t * nal) -{ /* Nothing to do... 
*/ -} diff --git a/lustre/portals/portals/api-ni.c b/lustre/portals/portals/api-ni.c index 18eea91..72d3b41 100644 --- a/lustre/portals/portals/api-ni.c +++ b/lustre/portals/portals/api-ni.c @@ -23,15 +23,39 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#define DEBUG_SUBSYSTEM S_PORTALS #include +int ptl_init; + /* Put some magic in the NI handle so uninitialised/zeroed handles are easy * to spot */ #define NI_HANDLE_MAGIC 0xebc0de00 #define NI_HANDLE_MASK 0x000000ff -#define MAX_NIS 8 -static nal_t *ptl_interfaces[MAX_NIS]; -int ptl_num_interfaces = 0; + +static struct nal_t *ptl_nal_table[NAL_MAX_NR + 1]; + +#ifdef __KERNEL__ +DECLARE_MUTEX(ptl_mutex); + +static void ptl_mutex_enter (void) +{ + down (&ptl_mutex); +} + +static void ptl_mutex_exit (void) +{ + up (&ptl_mutex); +} +#else +static void ptl_mutex_enter (void) +{ +} + +static void ptl_mutex_exit (void) +{ +} +#endif nal_t *ptl_hndl2nal(ptl_handle_any_t *handle) { @@ -42,156 +66,200 @@ nal_t *ptl_hndl2nal(ptl_handle_any_t *handle) * invalidated out from under her (or worse, swapped for a * completely different interface!) 
*/ + LASSERT (ptl_init); + if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0) return NULL; idx &= NI_HANDLE_MASK; - if (idx < MAX_NIS) - return ptl_interfaces[idx]; + + if (idx > NAL_MAX_NR || + ptl_nal_table[idx] == NULL || + ptl_nal_table[idx]->nal_refct == 0) + return NULL; - return NULL; + return ptl_nal_table[idx]; } -int ptl_ni_init(void) +int ptl_register_nal (ptl_interface_t interface, nal_t *nal) { - int i; - - LASSERT (MAX_NIS <= (NI_HANDLE_MASK + 1)); + int rc; - for (i = 0; i < MAX_NIS; i++) - ptl_interfaces[i] = NULL; + ptl_mutex_enter(); + + if (interface < 0 || interface > NAL_MAX_NR) + rc = PTL_IFACE_INVALID; + else if (ptl_nal_table[interface] != NULL) + rc = PTL_IFACE_DUP; + else { + rc = PTL_OK; + ptl_nal_table[interface] = nal; + LASSERT(nal->nal_refct == 0); + } - return PTL_OK; + ptl_mutex_exit(); + return (rc); } -void ptl_ni_fini(void) +void ptl_unregister_nal (ptl_interface_t interface) { - int i; - - for (i = 0; i < MAX_NIS; i++) { - nal_t *nal = ptl_interfaces[i]; - if (!nal) - continue; + LASSERT(interface >= 0 && interface <= NAL_MAX_NR); + LASSERT(ptl_nal_table[interface] != NULL); + LASSERT(ptl_nal_table[interface]->nal_refct == 0); + + ptl_mutex_enter(); + + ptl_nal_table[interface] = NULL; - if (nal->shutdown) - nal->shutdown(nal, i); - } + ptl_mutex_exit(); } -#ifdef __KERNEL__ -DECLARE_MUTEX(ptl_ni_init_mutex); - -static void ptl_ni_init_mutex_enter (void) +int PtlInit(int *max_interfaces) { - down (&ptl_ni_init_mutex); -} + LASSERT(!strcmp(ptl_err_str[PTL_MAX_ERRNO], "PTL_MAX_ERRNO")); -static void ptl_ni_init_mutex_exit (void) -{ - up (&ptl_ni_init_mutex); -} + /* If this assertion fails, we need more bits in NI_HANDLE_MASK and + * to shift NI_HANDLE_MAGIC left appropriately */ + LASSERT (NAL_MAX_NR < (NI_HANDLE_MASK + 1)); + + if (max_interfaces != NULL) + *max_interfaces = NAL_MAX_NR + 1; + + ptl_mutex_enter(); + + if (!ptl_init) { + /* NULL pointers, clear flags */ + memset(ptl_nal_table, 0, sizeof(ptl_nal_table)); 
+#ifndef __KERNEL__ + /* Kernel NALs register themselves when their module loads, + * and unregister themselves when their module is unloaded. + * Userspace NALs, are plugged in explicitly here... */ + { + extern nal_t procapi_nal; + + /* XXX pretend it's socknal to keep liblustre happy... */ + ptl_nal_table[SOCKNAL] = &procapi_nal; + LASSERT (procapi_nal.nal_refct == 0); + } +#endif + ptl_init = 1; + } -#else -static void ptl_ni_init_mutex_enter (void) -{ + ptl_mutex_exit(); + + return PTL_OK; } -static void ptl_ni_init_mutex_exit (void) +void PtlFini(void) { -} + nal_t *nal; + int i; + + ptl_mutex_enter(); + + if (ptl_init) { + for (i = 0; i <= NAL_MAX_NR; i++) { + + nal = ptl_nal_table[i]; + if (nal == NULL) + continue; + + if (nal->nal_refct != 0) { + CWARN("NAL %d has outstanding refcount %d\n", + i, nal->nal_refct); + nal->nal_ni_fini(nal); + } + + ptl_nal_table[i] = NULL; + } -#endif + ptl_init = 0; + } + + ptl_mutex_exit(); +} -int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size, - ptl_ac_index_t acl_size, ptl_pid_t requested_pid, - ptl_handle_ni_t * handle) +int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid, + ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits, + ptl_handle_ni_t *handle) { nal_t *nal; - int i; + int i; + int rc; if (!ptl_init) - return PTL_NOINIT; - - ptl_ni_init_mutex_enter (); - - nal = interface(ptl_num_interfaces, ptl_size, acl_size, requested_pid); - - if (!nal) { - ptl_ni_init_mutex_exit (); - return PTL_NAL_FAILED; + return PTL_NO_INIT; + + ptl_mutex_enter (); + + if (interface == PTL_IFACE_DEFAULT) { + for (i = 0; i <= NAL_MAX_NR; i++) + if (ptl_nal_table[i] != NULL) { + interface = i; + break; + } + /* NB if no interfaces are registered, 'interface' will + * fail the valid test below */ } - - for (i = 0; i < ptl_num_interfaces; i++) { - if (ptl_interfaces[i] == nal) { - nal->refct++; - handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i; - CDEBUG(D_OTHER, "Returning existing NAL 
(%d)\n", i); - ptl_ni_init_mutex_exit (); - return PTL_OK; - } + + if (interface < 0 || + interface > NAL_MAX_NR || + ptl_nal_table[interface] == NULL) { + GOTO(out, rc = PTL_IFACE_INVALID); } - nal->refct = 1; - if (ptl_num_interfaces >= MAX_NIS) { - if (nal->shutdown) - nal->shutdown (nal, ptl_num_interfaces); - ptl_ni_init_mutex_exit (); - return PTL_NOSPACE; - } + nal = ptl_nal_table[interface]; + nal->nal_handle.nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | interface; + nal->nal_handle.cookie = 0; + + CDEBUG(D_OTHER, "Starting up NAL (%d) refs %d\n", interface, nal->nal_refct); + rc = nal->nal_ni_init(nal, requested_pid, desired_limits, actual_limits); - handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | ptl_num_interfaces; - ptl_interfaces[ptl_num_interfaces++] = nal; + if (rc != PTL_OK) { + CERROR("Error %d starting up NAL %d, refs %d\n", rc, + interface, nal->nal_refct); + GOTO(out, rc); + } + + if (nal->nal_refct != 0) { + /* Caller gets to know if this was the first ref or not */ + rc = PTL_IFACE_DUP; + } + + nal->nal_refct++; + *handle = nal->nal_handle; - ptl_eq_ni_init(nal); - ptl_me_ni_init(nal); + out: + ptl_mutex_exit (); - ptl_ni_init_mutex_exit (); - return PTL_OK; + return rc; } - int PtlNIFini(ptl_handle_ni_t ni) { nal_t *nal; - int idx; - int rc; + int idx; if (!ptl_init) - return PTL_NOINIT; + return PTL_NO_INIT; - ptl_ni_init_mutex_enter (); + ptl_mutex_enter (); nal = ptl_hndl2nal (&ni); if (nal == NULL) { - ptl_ni_init_mutex_exit (); - return PTL_INV_HANDLE; + ptl_mutex_exit (); + return PTL_HANDLE_INVALID; } idx = ni.nal_idx & NI_HANDLE_MASK; - nal->refct--; - if (nal->refct > 0) { - ptl_ni_init_mutex_exit (); - return PTL_OK; - } - - ptl_me_ni_fini(nal); - ptl_eq_ni_fini(nal); - - rc = PTL_OK; - if (nal->shutdown) - rc = nal->shutdown(nal, idx); + LASSERT(nal->nal_refct > 0); - ptl_interfaces[idx] = NULL; - ptl_num_interfaces--; + nal->nal_refct--; - ptl_ni_init_mutex_exit (); - return rc; -} - -int PtlNIHandle(ptl_handle_any_t 
handle_in, ptl_handle_ni_t * ni_out) -{ - *ni_out = handle_in; + /* nal_refct == 0 tells nal->shutdown to really shut down */ + nal->nal_ni_fini(nal); + ptl_mutex_exit (); return PTL_OK; } diff --git a/lustre/portals/portals/api-wrap.c b/lustre/portals/portals/api-wrap.c index d23a6aa..37f6c0b 100644 --- a/lustre/portals/portals/api-wrap.c +++ b/lustre/portals/portals/api-wrap.c @@ -26,152 +26,114 @@ # define DEBUG_SUBSYSTEM S_PORTALS #include -static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf, - int argsize, void *retbuf, int retsize) +void PtlSnprintHandle(char *str, int len, ptl_handle_any_t h) { - nal_t *nal; - - if (!ptl_init) { - CERROR("Not initialized\n"); - return PTL_NOINIT; - } - - nal = ptl_hndl2nal(&any_h); - if (!nal) - return PTL_INV_HANDLE; - - nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize); + snprintf(str, len, "0x%lx."LPX64, h.nal_idx, h.cookie); +} +int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t *ni_out) +{ + if (!ptl_init) + return PTL_NO_INIT; + + if (ptl_hndl2nal(&handle_in) == NULL) + return PTL_HANDLE_INVALID; + + *ni_out = handle_in; return PTL_OK; } int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id) { - PtlGetId_in args; - PtlGetId_out ret; - int rc; - - args.handle_in = ni_handle; + nal_t *nal; - rc = do_forward(ni_handle, PTL_GETID, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - return rc; + if (!ptl_init) + return PTL_NO_INIT; - if (id) - *id = ret.id_out; + nal = ptl_hndl2nal(&ni_handle); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return nal->nal_get_id(nal, id); } -int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) +int PtlGetUid(ptl_handle_ni_t ni_handle, ptl_uid_t *uid) { - PtlFailNid_in args; - PtlFailNid_out ret; - int rc; - - args.interface = interface; - args.nid = nid; - args.threshold = threshold; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; - rc = do_forward (interface, PTL_FAILNID, - &args, 
sizeof(args), &ret, sizeof (ret)); + nal = ptl_hndl2nal(&ni_handle); + if (nal == NULL) + return PTL_NI_INVALID; - return ((rc != PTL_OK) ? rc : ret.rc); + /* We don't support different uids yet */ + *uid = 0; + return PTL_OK; } -int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, - ptl_sr_value_t * status_out) +int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) { - PtlNIStatus_in args; - PtlNIStatus_out ret; - int rc; - - args.interface_in = interface_in; - args.register_in = register_in; - - rc = do_forward(interface_in, PTL_NISTATUS, &args, sizeof(args), &ret, - sizeof(ret)); - - if (rc != PTL_OK) - return rc; + nal_t *nal; - if (status_out) - *status_out = ret.status_out; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return nal->nal_fail_nid(nal, nid, threshold); } -int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, - unsigned long *distance_out) +int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, + ptl_sr_value_t *status_out) { - PtlNIDist_in args; - PtlNIDist_out ret; - int rc; - - args.interface_in = interface_in; - args.process_in = process_in; - - rc = do_forward(interface_in, PTL_NIDIST, &args, sizeof(args), &ret, - sizeof(ret)); + nal_t *nal; - if (rc != PTL_OK) - return rc; - - if (distance_out) - *distance_out = ret.distance_out; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface_in); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return nal->nal_ni_status(nal, register_in, status_out); } - - -unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in) +int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, + unsigned long *distance_out) { - PtlNIDebug_in args; - PtlNIDebug_out ret; - int rc; - - args.mask_in = mask_in; + nal_t *nal; - rc = do_forward(ni, PTL_NIDEBUG, &args, sizeof(args), &ret, - 
sizeof(ret)); - - if (rc != PTL_OK) - return rc; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface_in); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return nal->nal_ni_dist(nal, &process_in, distance_out); } int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, - ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out) + ptl_ins_pos_t pos_in, ptl_handle_me_t *handle_out) { - PtlMEAttach_in args; - PtlMEAttach_out ret; - int rc; - - args.interface_in = interface_in; - args.index_in = index_in; - args.match_id_in = match_id_in; - args.match_bits_in = match_bits_in; - args.ignore_bits_in = ignore_bits_in; - args.unlink_in = unlink_in; - args.position_in = pos_in; - - rc = do_forward(interface_in, PTL_MEATTACH, &args, sizeof(args), &ret, - sizeof(ret)); - - if (rc != PTL_OK) - return rc; - - if (handle_out) { - handle_out->nal_idx = interface_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - - return ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface_in); + if (nal == NULL) + return PTL_NI_INVALID; + + return nal->nal_me_attach(nal, index_in, match_id_in, + match_bits_in, ignore_bits_in, + unlink_in, pos_in, handle_out); } int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, @@ -179,421 +141,226 @@ int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, ptl_handle_me_t * handle_out) { - PtlMEInsert_in args; - PtlMEInsert_out ret; - int rc; - - args.current_in = current_in; - args.match_id_in = match_id_in; - args.match_bits_in = match_bits_in; - args.ignore_bits_in = ignore_bits_in; - args.unlink_in = unlink_in; - args.position_in = position_in; - - rc = do_forward(current_in, PTL_MEINSERT, &args, sizeof(args), &ret, - sizeof(ret)); - - if (rc != 
PTL_OK) - return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc; - - if (handle_out) { - handle_out->nal_idx = current_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - return ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&current_in); + if (nal == NULL) + return PTL_ME_INVALID; + + return nal->nal_me_insert(nal, &current_in, match_id_in, + match_bits_in, ignore_bits_in, + unlink_in, position_in, handle_out); } int PtlMEUnlink(ptl_handle_me_t current_in) { - PtlMEUnlink_in args; - PtlMEUnlink_out ret; - int rc; - - args.current_in = current_in; - args.unlink_in = PTL_RETAIN; - - rc = do_forward(current_in, PTL_MEUNLINK, &args, sizeof(args), &ret, - sizeof(ret)); + nal_t *nal; - if (rc != PTL_OK) - return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&current_in); + if (nal == NULL) + return PTL_ME_INVALID; - return ret.rc; + return nal->nal_me_unlink(nal, &current_in); } -int PtlTblDump(ptl_handle_ni_t ni, int index_in) +int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) { - PtlTblDump_in args; - PtlTblDump_out ret; - int rc; + nal_t *nal; - args.index_in = index_in; - - rc = do_forward(ni, PTL_TBLDUMP, &args, sizeof(args), &ret, - sizeof(ret)); + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&me_in); + if (nal == NULL) + return PTL_ME_INVALID; - if (rc != PTL_OK) - return rc; + if (!PtlHandleIsEqual(md_in.eq_handle, PTL_EQ_NONE) && + ptl_hndl2nal(&md_in.eq_handle) != nal) + return PTL_MD_ILLEGAL; - return ret.rc; + return (nal->nal_md_attach)(nal, &me_in, &md_in, + unlink_in, handle_out); } -int PtlMEDump(ptl_handle_me_t current_in) +int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t *handle_out) { - PtlMEDump_in args; - PtlMEDump_out ret; - int rc; - - args.current_in = current_in; + nal_t *nal; - rc = do_forward(current_in, PTL_MEDUMP, &args, sizeof(args), &ret, -
sizeof(ret)); + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&ni_in); + if (nal == NULL) + return PTL_NI_INVALID; - if (rc != PTL_OK) - return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc; + if (!PtlHandleIsEqual(md_in.eq_handle, PTL_EQ_NONE) && + ptl_hndl2nal(&md_in.eq_handle) != nal) + return PTL_MD_ILLEGAL; - return ret.rc; + return (nal->nal_md_bind)(nal, &md_in, unlink_in, handle_out); } -static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in) +int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout, + ptl_md_t *new_inout, ptl_handle_eq_t testq_in) { - nal_t *nal; - int rc; - int i; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; - if (!ptl_init) { - CERROR("PtlMDAttach/Bind/Update: Not initialized\n"); - return PTL_NOINIT; - } + if (!PtlHandleIsEqual(testq_in, PTL_EQ_NONE) && + ptl_hndl2nal(&testq_in) != nal) + return PTL_EQ_INVALID; - nal = ptl_hndl2nal(&current_in); - if (!nal) - return PTL_INV_HANDLE; - - if (nal->validate != NULL) /* nal->validate not a NOOP */ - { - if ((md_in.options & PTL_MD_IOV) == 0) /* contiguous */ - { - rc = nal->validate (nal, md_in.start, md_in.length); - if (rc) - return (PTL_SEGV); - } - else - { - struct iovec *iov = (struct iovec *)md_in.start; - - for (i = 0; i < md_in.niov; i++, iov++) - { - rc = nal->validate (nal, iov->iov_base, iov->iov_len); - if (rc) - return (PTL_SEGV); - } - } - } - - return 0; + return (nal->nal_md_update)(nal, &md_in, + old_inout, new_inout, &testq_in); } -static ptl_handle_eq_t md2eq (ptl_md_t *md) +int PtlMDUnlink(ptl_handle_md_t md_in) { - if (PtlHandleEqual (md->eventq, PTL_EQ_NONE)) - return (PTL_EQ_NONE); + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; - return (ptl_handle2usereq (&md->eventq)->cb_eq_handle); + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + + return (nal->nal_md_unlink)(nal, &md_in); } - -int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t
md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) +int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, + ptl_eq_handler_t callback, + ptl_handle_eq_t *handle_out) { - PtlMDAttach_in args; - PtlMDAttach_out ret; - int rc; - - rc = validate_md(me_in, md_in); - if (rc == PTL_OK) { - args.eq_in = md2eq(&md_in); - args.me_in = me_in; - args.md_in = md_in; - args.unlink_in = unlink_in; - - rc = do_forward(me_in, PTL_MDATTACH, - &args, sizeof(args), &ret, sizeof(ret)); - } - - if (rc != PTL_OK) - return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc; - - if (handle_out) { - handle_out->nal_idx = me_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - return ret.rc; -} - + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface); + if (nal == NULL) + return PTL_NI_INVALID; + return (nal->nal_eq_alloc)(nal, count, callback, handle_out); +} -int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, - ptl_handle_md_t * handle_out) +int PtlEQFree(ptl_handle_eq_t eventq) { - PtlMDBind_in args; - PtlMDBind_out ret; - int rc; - - rc = validate_md(ni_in, md_in); - if (rc != PTL_OK) - return rc; - - args.eq_in = md2eq(&md_in); - args.ni_in = ni_in; - args.md_in = md_in; + nal_t *nal; - rc = do_forward(ni_in, PTL_MDBIND, - &args, sizeof(args), &ret, sizeof(ret)); - - if (rc != PTL_OK) - return rc; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&eventq); + if (nal == NULL) + return PTL_EQ_INVALID; - if (handle_out) { - handle_out->nal_idx = ni_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - return ret.rc; + return (nal->nal_eq_free)(nal, &eventq); } -int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout, - ptl_md_t *new_inout, ptl_handle_eq_t testq_in) +int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t *ev) { - PtlMDUpdate_internal_in args; - PtlMDUpdate_internal_out ret; - int rc; - - args.md_in = md_in; - - if (old_inout) { - args.old_inout = *old_inout; - args.old_inout_valid = 1; - } else - 
args.old_inout_valid = 0; - - if (new_inout) { - rc = validate_md (md_in, *new_inout); - if (rc != PTL_OK) - return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc; - args.new_inout = *new_inout; - args.new_inout_valid = 1; - } else - args.new_inout_valid = 0; - - if (PtlHandleEqual (testq_in, PTL_EQ_NONE)) { - args.testq_in = PTL_EQ_NONE; - args.sequence_in = -1; - } else { - ptl_eq_t *eq = ptl_handle2usereq (&testq_in); - - args.testq_in = eq->cb_eq_handle; - args.sequence_in = eq->sequence; - } - - rc = do_forward(md_in, PTL_MDUPDATE, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc; - - if (old_inout) - *old_inout = ret.old_inout; - - return ret.rc; + int which; + + return (PtlEQPoll (&eventq, 1, 0, ev, &which)); } -int PtlMDUnlink(ptl_handle_md_t md_in) +int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) { - PtlMDUnlink_in args; - PtlMDUnlink_out ret; - int rc; - - args.md_in = md_in; - rc = do_forward(md_in, PTL_MDUNLINK, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc; - - return ret.rc; + int which; + + return (PtlEQPoll (&eventq_in, 1, PTL_TIME_FOREVER, + event_out, &which)); } -int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, - int (*callback) (ptl_event_t * event), - ptl_handle_eq_t * handle_out) +int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, + ptl_event_t *event_out, int *which_out) { - ptl_eq_t *eq = NULL; - ptl_event_t *ev = NULL; - PtlEQAlloc_in args; - PtlEQAlloc_out ret; - int rc, i; - nal_t *nal; + int i; + nal_t *nal; if (!ptl_init) - return PTL_NOINIT; - - nal = ptl_hndl2nal (&interface); - if (nal == NULL) - return PTL_INV_HANDLE; - - if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ - do { /* knock off all but the top bit... 
*/ - count &= ~LOWEST_BIT_SET (count); - } while (count != LOWEST_BIT_SET(count)); - - count <<= 1; /* ...and round up */ - } - - if (count == 0) /* catch bad parameter / overflow on roundup */ - return (PTL_VAL_FAILED); - - PORTAL_ALLOC(ev, count * sizeof(ptl_event_t)); - if (!ev) - return PTL_NOSPACE; - - for (i = 0; i < count; i++) - ev[i].sequence = 0; - - if (nal->validate != NULL) { - rc = nal->validate(nal, ev, count * sizeof(ptl_event_t)); - if (rc != PTL_OK) - goto fail; - } - - args.ni_in = interface; - args.count_in = count; - args.base_in = ev; - args.len_in = count * sizeof(*ev); - args.callback_in = callback; - - rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - goto fail; - if (ret.rc) - GOTO(fail, rc = ret.rc); - - PORTAL_ALLOC(eq, sizeof(*eq)); - if (!eq) { - rc = PTL_NOSPACE; - goto fail; - } - - eq->sequence = 1; - eq->size = count; - eq->base = ev; - - /* EQ handles are a little wierd. PtlEQGet() just looks at the - * queued events in shared memory. It doesn't want to do_forward() - * at all, so the cookie in the EQ handle we pass out of here is - * simply a pointer to the event queue we just set up. We stash - * the handle returned by do_forward(), so we can pass it back via - * do_forward() when we need to. 
*/ - - eq->cb_eq_handle.nal_idx = interface.nal_idx; - eq->cb_eq_handle.cookie = ret.handle_out.cookie; - - handle_out->nal_idx = interface.nal_idx; - handle_out->cookie = (__u64)((unsigned long)eq); - return PTL_OK; - -fail: - PORTAL_FREE(ev, count * sizeof(ptl_event_t)); - return rc; -} + return PTL_NO_INIT; -int PtlEQFree(ptl_handle_eq_t eventq) -{ - PtlEQFree_in args; - PtlEQFree_out ret; - ptl_eq_t *eq; - int rc; + if (neq_in < 1) + return PTL_EQ_INVALID; - eq = ptl_handle2usereq (&eventq); - args.eventq_in = eq->cb_eq_handle; - - rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args, - sizeof(args), &ret, sizeof(ret)); + nal = ptl_hndl2nal(&eventqs_in[0]); + if (nal == NULL) + return PTL_EQ_INVALID; - /* XXX we're betting rc == PTL_OK here */ - PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t)); - PORTAL_FREE(eq, sizeof(*eq)); + for (i = 1; i < neq_in; i++) + if (ptl_hndl2nal(&eventqs_in[i]) != nal) + return PTL_EQ_INVALID; - return rc; + return (nal->nal_eq_poll)(nal, eventqs_in, neq_in, timeout, + event_out, which_out); } + int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, ptl_process_id_t match_id_in, ptl_pt_index_t portal_in) { - PtlACEntry_in args; - PtlACEntry_out ret; - int rc; - - /* - * Copy arguments into the argument block to - * hand to the forwarding object - */ - args.ni_in = ni_in; - args.index_in = index_in; - args.match_id_in = match_id_in; - args.portal_in = portal_in; - - rc = do_forward(ni_in, PTL_ACENTRY, &args, sizeof(args), &ret, - sizeof(ret)); - - return (rc != PTL_OK) ? 
rc : ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&ni_in); + if (nal == NULL) + return PTL_NI_INVALID; + + return (nal->nal_ace_entry)(nal, index_in, match_id_in, portal_in); } int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, ptl_process_id_t target_in, ptl_pt_index_t portal_in, - ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_ac_index_t ac_in, ptl_match_bits_t match_bits_in, ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in) { - PtlPut_in args; - PtlPut_out ret; - int rc; - - /* - * Copy arguments into the argument block to - * hand to the forwarding object - */ - args.md_in = md_in; - args.ack_req_in = ack_req_in; - args.target_in = target_in; - args.portal_in = portal_in; - args.cookie_in = cookie_in; - args.match_bits_in = match_bits_in; - args.offset_in = offset_in; - args.hdr_data_in = hdr_data_in; - - rc = do_forward(md_in, PTL_PUT, &args, sizeof(args), &ret, sizeof(ret)); - - return (rc != PTL_OK) ? rc : ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + + return (nal->nal_put)(nal, &md_in, ack_req_in, + &target_in, portal_in, ac_in, + match_bits_in, offset_in, hdr_data_in); } int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, - ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_pt_index_t portal_in, ptl_ac_index_t ac_in, ptl_match_bits_t match_bits_in, ptl_size_t offset_in) { - PtlGet_in args; - PtlGet_out ret; - int rc; - - /* - * Copy arguments into the argument block to - * hand to the forwarding object - */ - args.md_in = md_in; - args.target_in = target_in; - args.portal_in = portal_in; - args.cookie_in = cookie_in; - args.match_bits_in = match_bits_in; - args.offset_in = offset_in; - - rc = do_forward(md_in, PTL_GET, &args, sizeof(args), &ret, sizeof(ret)); - - return (rc != PTL_OK) ? 
rc : ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + + return (nal->nal_get)(nal, &md_in, + &target_in, portal_in, ac_in, + match_bits_in, offset_in); } + diff --git a/lustre/portals/portals/autoMakefile.am b/lustre/portals/portals/autoMakefile.am index 22565dd..285f8fe 100644 --- a/lustre/portals/portals/autoMakefile.am +++ b/lustre/portals/portals/autoMakefile.am @@ -3,8 +3,8 @@ # This code is issued under the GNU General Public License. # See the file COPYING in this distribution -my_sources = api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c \ - lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c \ +my_sources = api-errno.c api-ni.c api-wrap.c \ + lib-init.c lib-me.c lib-msg.c lib-eq.c \ lib-md.c lib-move.c lib-ni.c lib-pid.c if !CRAY_PORTALS @@ -16,11 +16,11 @@ libportals_a_CPPFLAGS = $(LLCPPFLAGS) libportals_a_CFLAGS = $(LLCFLAGS) endif -#if MODULES -#modulenet_DATA = portals$(KMODEXT) -#endif # MODULES +if MODULES +modulenet_DATA = portals$(KMODEXT) +endif # MODULES endif # CRAY_PORTALS MOSTLYCLEANFILES = *.o *.ko *.mod.c -#DIST_SOURCES = $(portals-objs:%.o=%.c) +DIST_SOURCES = $(portals-objs:%.o=%.c) diff --git a/lustre/portals/portals/lib-dispatch.c b/lustre/portals/portals/lib-dispatch.c deleted file mode 100644 index 13036c7..0000000 --- a/lustre/portals/portals/lib-dispatch.c +++ /dev/null @@ -1,80 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-dispatch.c - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include -#include - -typedef struct { - int (*fun) (nal_cb_t * nal, void *private, void *in, void *out); - char *name; -} dispatch_table_t; - -static dispatch_table_t dispatch_table[] = { - [PTL_GETID] {do_PtlGetId, "PtlGetId"}, - [PTL_NISTATUS] {do_PtlNIStatus, "PtlNIStatus"}, - [PTL_NIDIST] {do_PtlNIDist, "PtlNIDist"}, - [PTL_NIDEBUG] {do_PtlNIDebug, "PtlNIDebug"}, - [PTL_MEATTACH] {do_PtlMEAttach, "PtlMEAttach"}, - [PTL_MEINSERT] {do_PtlMEInsert, "PtlMEInsert"}, - [PTL_MEUNLINK] {do_PtlMEUnlink, "PtlMEUnlink"}, - [PTL_TBLDUMP] {do_PtlTblDump, "PtlTblDump"}, - [PTL_MEDUMP] {do_PtlMEDump, "PtlMEDump"}, - [PTL_MDATTACH] {do_PtlMDAttach, "PtlMDAttach"}, - [PTL_MDBIND] {do_PtlMDBind, "PtlMDBind"}, - [PTL_MDUPDATE] {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"}, - [PTL_MDUNLINK] {do_PtlMDUnlink, "PtlMDUnlink"}, - [PTL_EQALLOC] {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"}, - [PTL_EQFREE] {do_PtlEQFree_internal, "PtlEQFree_internal"}, - [PTL_PUT] {do_PtlPut, "PtlPut"}, - [PTL_GET] {do_PtlGet, "PtlGet"}, - [PTL_FAILNID] {do_PtlFailNid, "PtlFailNid"}, - /* */ {0, ""} -}; - -/* - * This really should be elsewhere, but lib-p30/dispatch.c is - * an automatically generated file. 
- */ -void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block, - void *ret_block) -{ - lib_ni_t *ni = &nal->ni; - - if (index < 0 || index > LIB_MAX_DISPATCH || - !dispatch_table[index].fun) { - CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index); - return; - } - - CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid, - dispatch_table[index].name, index); - - dispatch_table[index].fun(nal, private, arg_block, ret_block); -} - -char *dispatch_name(int index) -{ - return dispatch_table[index].name; -} diff --git a/lustre/portals/portals/lib-eq.c b/lustre/portals/portals/lib-eq.c index ce343c1..8ea6fdd 100644 --- a/lustre/portals/portals/lib-eq.c +++ b/lustre/portals/portals/lib-eq.c @@ -25,104 +25,241 @@ #define DEBUG_SUBSYSTEM S_PORTALS #include -#include -int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args, - void *v_ret) +int +lib_api_eq_alloc (nal_t *apinal, ptl_size_t count, + ptl_eq_handler_t callback, + ptl_handle_eq_t *handle) { - /* - * Incoming: - * ptl_handle_ni_t ni_in - * ptl_size_t count_in - * void * base_in - * - * Outgoing: - * ptl_handle_eq_t * handle_out - */ - - PtlEQAlloc_in *args = v_args; - PtlEQAlloc_out *ret = v_ret; - - lib_eq_t *eq; - unsigned long flags; - - /* api should have rounded up */ - if (args->count_in != LOWEST_BIT_SET (args->count_in)) - return ret->rc = PTL_VAL_FAILED; + lib_nal_t *nal = apinal->nal_data; + lib_eq_t *eq; + unsigned long flags; + int rc; + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq + * overflow, they don't skip entries, so the queue has the same + * apparant capacity at all times */ + + if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ + do { /* knock off all but the top bit... 
*/ + count &= ~LOWEST_BIT_SET (count); + } while (count != LOWEST_BIT_SET(count)); + + count <<= 1; /* ...and round up */ + } + + if (count == 0) /* catch bad parameter / overflow on roundup */ + return (PTL_VAL_FAILED); + eq = lib_eq_alloc (nal); if (eq == NULL) - return (ret->rc = PTL_NOSPACE); + return (PTL_NO_SPACE); - state_lock(nal, &flags); + PORTAL_ALLOC(eq->eq_events, count * sizeof(ptl_event_t)); + if (eq->eq_events == NULL) { + LIB_LOCK(nal, flags); + lib_eq_free (nal, eq); + LIB_UNLOCK(nal, flags); + } - if (nal->cb_map != NULL) { + if (nal->libnal_map != NULL) { struct iovec iov = { - .iov_base = args->base_in, - .iov_len = args->count_in * sizeof (ptl_event_t) }; + .iov_base = eq->eq_events, + .iov_len = count * sizeof(ptl_event_t)}; - ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey); - if (ret->rc != PTL_OK) { + rc = nal->libnal_map(nal, 1, &iov, &eq->eq_addrkey); + if (rc != PTL_OK) { + LIB_LOCK(nal, flags); lib_eq_free (nal, eq); - - state_unlock (nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + return (rc); } } - eq->sequence = 1; - eq->base = args->base_in; - eq->size = args->count_in; + /* NB this resets all event sequence numbers to 0, to be earlier + * than eq_deq_seq */ + memset(eq->eq_events, 0, count * sizeof(ptl_event_t)); + + eq->eq_deq_seq = 1; + eq->eq_enq_seq = 1; + eq->eq_size = count; eq->eq_refcount = 0; - eq->event_callback = args->callback_in; + eq->eq_callback = callback; + + LIB_LOCK(nal, flags); lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ); - list_add (&eq->eq_list, &nal->ni.ni_active_eqs); + list_add (&eq->eq_list, &nal->libnal_ni.ni_active_eqs); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - ptl_eq2handle(&ret->handle_out, eq); - return (ret->rc = PTL_OK); + ptl_eq2handle(handle, nal, eq); + return (PTL_OK); } -int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args, - void *v_ret) +int +lib_api_eq_free(nal_t *apinal, ptl_handle_eq_t *eqh) { - /* - * Incoming: - * 
ptl_handle_eq_t eventq_in - * - * Outgoing: - */ - - PtlEQFree_in *args = v_args; - PtlEQFree_out *ret = v_ret; - lib_eq_t *eq; - long flags; + lib_nal_t *nal = apinal->nal_data; + lib_eq_t *eq; + int size; + ptl_event_t *events; + void *addrkey; + unsigned long flags; - state_lock (nal, &flags); + LIB_LOCK(nal, flags); - eq = ptl_handle2eq(&args->eventq_in, nal); + eq = ptl_handle2eq(eqh, nal); if (eq == NULL) { - ret->rc = PTL_INV_EQ; - } else if (eq->eq_refcount != 0) { - ret->rc = PTL_EQ_INUSE; + LIB_UNLOCK(nal, flags); + return (PTL_EQ_INVALID); + } + + if (eq->eq_refcount != 0) { + LIB_UNLOCK(nal, flags); + return (PTL_EQ_IN_USE); + } + + /* stash for free after lock dropped */ + events = eq->eq_events; + size = eq->eq_size; + addrkey = eq->eq_addrkey; + + lib_invalidate_handle (nal, &eq->eq_lh); + list_del (&eq->eq_list); + lib_eq_free (nal, eq); + + LIB_UNLOCK(nal, flags); + + if (nal->libnal_unmap != NULL) { + struct iovec iov = { + .iov_base = events, + .iov_len = size * sizeof(ptl_event_t)}; + + nal->libnal_unmap(nal, 1, &iov, &addrkey); + } + + PORTAL_FREE(events, size * sizeof (ptl_event_t)); + + return (PTL_OK); +} + +int +lib_get_event (lib_eq_t *eq, ptl_event_t *ev) +{ + int new_index = eq->eq_deq_seq & (eq->eq_size - 1); + ptl_event_t *new_event = &eq->eq_events[new_index]; + int rc; + ENTRY; + + CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->eq_deq_seq, eq->eq_size); + + if (PTL_SEQ_GT (eq->eq_deq_seq, new_event->sequence)) { + RETURN(PTL_EQ_EMPTY); + } + + /* We've got a new event... */ + *ev = *new_event; + + /* ...but did it overwrite an event we've not seen yet? 
*/ + if (eq->eq_deq_seq == new_event->sequence) { + rc = PTL_OK; } else { - if (nal->cb_unmap != NULL) { - struct iovec iov = { - .iov_base = eq->base, - .iov_len = eq->size * sizeof (ptl_event_t) }; - - nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey); + CERROR("Event Queue Overflow: eq seq %lu ev seq %lu\n", + eq->eq_deq_seq, new_event->sequence); + rc = PTL_EQ_DROPPED; + } + + eq->eq_deq_seq = new_event->sequence + 1; + RETURN(rc); +} + + +int +lib_api_eq_poll (nal_t *apinal, + ptl_handle_eq_t *eventqs, int neq, int timeout_ms, + ptl_event_t *event, int *which) +{ + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; + unsigned long flags; + int i; + int rc; +#ifdef __KERNEL__ + wait_queue_t wq; + unsigned long now; +#else + struct timeval then; + struct timeval now; + struct timespec ts; +#endif + ENTRY; + + LIB_LOCK(nal, flags); + + for (;;) { + for (i = 0; i < neq; i++) { + lib_eq_t *eq = ptl_handle2eq(&eventqs[i], nal); + + rc = lib_get_event (eq, event); + if (rc != PTL_EQ_EMPTY) { + LIB_UNLOCK(nal, flags); + *which = i; + RETURN(rc); + } + } + + if (timeout_ms == 0) { + LIB_UNLOCK (nal, flags); + RETURN (PTL_EQ_EMPTY); } - lib_invalidate_handle (nal, &eq->eq_lh); - list_del (&eq->eq_list); - lib_eq_free (nal, eq); - ret->rc = PTL_OK; - } + /* Some architectures force us to do spin locking/unlocking + * in the same stack frame, means we can abstract the + * locking here */ +#ifdef __KERNEL__ + init_waitqueue_entry(&wq, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ni->ni_waitq, &wq); - state_unlock (nal, &flags); + LIB_UNLOCK(nal, flags); - return (ret->rc); + if (timeout_ms < 0) { + schedule (); + } else { + now = jiffies; + schedule_timeout((timeout_ms * HZ)/1000); + timeout_ms -= ((jiffies - now) * 1000)/HZ; + if (timeout_ms < 0) + timeout_ms = 0; + } + + LIB_LOCK(nal, flags); +#else + if (timeout_ms < 0) { + pthread_cond_wait(&ni->ni_cond, &ni->ni_mutex); + } else { + gettimeofday(&then, NULL); + + ts.tv_sec = 
then.tv_sec + timeout_ms/1000; + ts.tv_nsec = then.tv_usec * 1000 + + (timeout_ms%1000) * 1000000; + if (ts.tv_nsec >= 1000000000) { + ts.tv_sec++; + ts.tv_nsec -= 1000000000; + } + + pthread_cond_timedwait(&ni->ni_cond, + &ni->ni_mutex, &ts); + + gettimeofday(&now, NULL); + timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 + + (now.tv_usec - then.tv_usec) / 1000; + + if (timeout_ms < 0) + timeout_ms = 0; + } +#endif + } } diff --git a/lustre/portals/portals/lib-init.c b/lustre/portals/portals/lib-init.c index d4d8860..9d97bc1 100644 --- a/lustre/portals/portals/lib-init.c +++ b/lustre/portals/portals/lib-init.c @@ -41,19 +41,26 @@ #ifndef PTL_USE_LIB_FREELIST int -kportal_descriptor_setup (nal_cb_t *nal) +kportal_descriptor_setup (lib_nal_t *nal, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) { + /* Ignore requested limits! */ + actual_limits->max_mes = INT_MAX; + actual_limits->max_mds = INT_MAX; + actual_limits->max_eqs = INT_MAX; + return PTL_OK; } void -kportal_descriptor_cleanup (nal_cb_t *nal) +kportal_descriptor_cleanup (lib_nal_t *nal) { } #else int -lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) +lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int n, int size) { char *space; @@ -61,9 +68,9 @@ lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) size += offsetof (lib_freeobj_t, fo_contents); - space = nal->cb_malloc (nal, n * size); + PORTAL_ALLOC(space, n * size); if (space == NULL) - return (PTL_NOSPACE); + return (PTL_NO_SPACE); INIT_LIST_HEAD (&fl->fl_list); fl->fl_objs = space; @@ -81,7 +88,7 @@ lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) } void -lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl) +lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl) { struct list_head *el; int count; @@ -95,55 +102,67 @@ lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl) LASSERT (count == fl->fl_nobjs); - nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * 
fl->fl_objsize); + PORTAL_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); memset (fl, 0, sizeof (fl)); } int -kportal_descriptor_setup (nal_cb_t *nal) +kportal_descriptor_setup (lib_nal_t *nal, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) { /* NB on failure caller must still call kportal_descriptor_cleanup */ /* ****** */ - int rc; - - memset (&nal->ni.ni_free_mes, 0, sizeof (nal->ni.ni_free_mes)); - memset (&nal->ni.ni_free_msgs, 0, sizeof (nal->ni.ni_free_msgs)); - memset (&nal->ni.ni_free_mds, 0, sizeof (nal->ni.ni_free_mds)); - memset (&nal->ni.ni_free_eqs, 0, sizeof (nal->ni.ni_free_eqs)); - - rc = lib_freelist_init (nal, &nal->ni.ni_free_mes, + lib_ni_t *ni = &nal->libnal_ni; + int rc; + + memset (&ni->ni_free_mes, 0, sizeof (ni->ni_free_mes)); + memset (&ni->ni_free_msgs, 0, sizeof (ni->ni_free_msgs)); + memset (&ni->ni_free_mds, 0, sizeof (ni->ni_free_mds)); + memset (&ni->ni_free_eqs, 0, sizeof (ni->ni_free_eqs)); + + /* Ignore requested limits! */ + actual_limits->max_mes = MAX_MES; + actual_limits->max_mds = MAX_MDS; + actual_limits->max_eqs = MAX_EQS; + /* Hahahah what a load of bollocks. 
There's nowhere to + * specify the max # messages in-flight */ + + rc = lib_freelist_init (nal, &ni->ni_free_mes, MAX_MES, sizeof (lib_me_t)); if (rc != PTL_OK) return (rc); - rc = lib_freelist_init (nal, &nal->ni.ni_free_msgs, + rc = lib_freelist_init (nal, &ni->ni_free_msgs, MAX_MSGS, sizeof (lib_msg_t)); if (rc != PTL_OK) return (rc); - rc = lib_freelist_init (nal, &nal->ni.ni_free_mds, + rc = lib_freelist_init (nal, &ni->ni_free_mds, MAX_MDS, sizeof (lib_md_t)); if (rc != PTL_OK) return (rc); - rc = lib_freelist_init (nal, &nal->ni.ni_free_eqs, + rc = lib_freelist_init (nal, &ni->ni_free_eqs, MAX_EQS, sizeof (lib_eq_t)); return (rc); } void -kportal_descriptor_cleanup (nal_cb_t *nal) +kportal_descriptor_cleanup (lib_nal_t *nal) { - lib_freelist_fini (nal, &nal->ni.ni_free_mes); - lib_freelist_fini (nal, &nal->ni.ni_free_msgs); - lib_freelist_fini (nal, &nal->ni.ni_free_mds); - lib_freelist_fini (nal, &nal->ni.ni_free_eqs); + lib_ni_t *ni = &nal->libnal_ni; + + lib_freelist_fini (nal, &ni->ni_free_mes); + lib_freelist_fini (nal, &ni->ni_free_msgs); + lib_freelist_fini (nal, &ni->ni_free_mds); + lib_freelist_fini (nal, &ni->ni_free_eqs); } #endif __u64 -lib_create_interface_cookie (nal_cb_t *nal) +lib_create_interface_cookie (lib_nal_t *nal) { /* NB the interface cookie in wire handles guards against delayed * replies and ACKs appearing valid in a new instance of the same @@ -164,9 +183,9 @@ lib_create_interface_cookie (nal_cb_t *nal) } int -lib_setup_handle_hash (nal_cb_t *nal) +lib_setup_handle_hash (lib_nal_t *nal) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; int i; /* Arbitrary choice of hash table size */ @@ -175,11 +194,10 @@ lib_setup_handle_hash (nal_cb_t *nal) #else ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4; #endif - ni->ni_lh_hash_table = - (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size - * sizeof (struct list_head)); + PORTAL_ALLOC(ni->ni_lh_hash_table, + ni->ni_lh_hash_size * sizeof (struct list_head)); if 
(ni->ni_lh_hash_table == NULL) - return (PTL_NOSPACE); + return (PTL_NO_SPACE); for (i = 0; i < ni->ni_lh_hash_size; i++) INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]); @@ -190,22 +208,22 @@ lib_setup_handle_hash (nal_cb_t *nal) } void -lib_cleanup_handle_hash (nal_cb_t *nal) +lib_cleanup_handle_hash (lib_nal_t *nal) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; if (ni->ni_lh_hash_table == NULL) return; - nal->cb_free (nal, ni->ni_lh_hash_table, - ni->ni_lh_hash_size * sizeof (struct list_head)); + PORTAL_FREE(ni->ni_lh_hash_table, + ni->ni_lh_hash_size * sizeof (struct list_head)); } lib_handle_t * -lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) +lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type) { /* ALWAYS called with statelock held */ - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; struct list_head *list; struct list_head *el; unsigned int hash; @@ -227,10 +245,10 @@ lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) } void -lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) +lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type) { /* ALWAYS called with statelock held */ - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; unsigned int hash; LASSERT (type >= 0 && type < PTL_COOKIE_TYPES); @@ -242,99 +260,131 @@ lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) } void -lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh) +lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh) { list_del (&lh->lh_hash_chain); } int -lib_init(nal_cb_t * nal, ptl_nid_t nid, ptl_pid_t pid, int gsize, - ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size) +lib_init(lib_nal_t *libnal, nal_t *apinal, + ptl_process_id_t process_id, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) { int rc = PTL_OK; - lib_ni_t *ni = &nal->ni; - int i; + lib_ni_t *ni = &libnal->libnal_ni; + int ptl_size; + int i; ENTRY; /* NB serialised in PtlNIInit() */ - if (ni->refcnt != 0) { 
/* already initialised */ - ni->refcnt++; - goto out; - } - lib_assert_wire_constants (); - - /* - * Allocate the portal table for this interface - * and all per-interface objects. - */ - memset(&ni->counters, 0, sizeof(lib_counters_t)); - rc = kportal_descriptor_setup (nal); + /* Setup the API nal with the lib API handling functions */ + apinal->nal_get_id = lib_api_get_id; + apinal->nal_ni_status = lib_api_ni_status; + apinal->nal_ni_dist = lib_api_ni_dist; + apinal->nal_fail_nid = lib_api_fail_nid; + apinal->nal_me_attach = lib_api_me_attach; + apinal->nal_me_insert = lib_api_me_insert; + apinal->nal_me_unlink = lib_api_me_unlink; + apinal->nal_md_attach = lib_api_md_attach; + apinal->nal_md_bind = lib_api_md_bind; + apinal->nal_md_unlink = lib_api_md_unlink; + apinal->nal_md_update = lib_api_md_update; + apinal->nal_eq_alloc = lib_api_eq_alloc; + apinal->nal_eq_free = lib_api_eq_free; + apinal->nal_eq_poll = lib_api_eq_poll; + apinal->nal_put = lib_api_put; + apinal->nal_get = lib_api_get; + + apinal->nal_data = libnal; + ni->ni_api = apinal; + + rc = kportal_descriptor_setup (libnal, requested_limits, + &ni->ni_actual_limits); if (rc != PTL_OK) goto out; + memset(&ni->ni_counters, 0, sizeof(lib_counters_t)); + INIT_LIST_HEAD (&ni->ni_active_msgs); INIT_LIST_HEAD (&ni->ni_active_mds); INIT_LIST_HEAD (&ni->ni_active_eqs); - INIT_LIST_HEAD (&ni->ni_test_peers); - ni->ni_interface_cookie = lib_create_interface_cookie (nal); +#ifdef __KERNEL__ + spin_lock_init (&ni->ni_lock); + init_waitqueue_head (&ni->ni_waitq); +#else + pthread_mutex_init(&ni->ni_mutex, NULL); + pthread_cond_init(&ni->ni_cond, NULL); +#endif + + ni->ni_interface_cookie = lib_create_interface_cookie (libnal); ni->ni_next_object_cookie = 0; - rc = lib_setup_handle_hash (nal); + rc = lib_setup_handle_hash (libnal); if (rc != PTL_OK) goto out; - ni->nid = nid; - ni->pid = pid; - - ni->num_nodes = gsize; - ni->tbl.size = ptl_size; - - ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * 
ptl_size); - if (ni->tbl.tbl == NULL) { - rc = PTL_NOSPACE; + ni->ni_pid = process_id; + + if (requested_limits != NULL) + ptl_size = requested_limits->max_pt_index + 1; + else + ptl_size = 64; + + ni->ni_portals.size = ptl_size; + PORTAL_ALLOC(ni->ni_portals.tbl, + ptl_size * sizeof(struct list_head)); + if (ni->ni_portals.tbl == NULL) { + rc = PTL_NO_SPACE; goto out; } for (i = 0; i < ptl_size; i++) - INIT_LIST_HEAD(&(ni->tbl.tbl[i])); + INIT_LIST_HEAD(&(ni->ni_portals.tbl[i])); + + /* max_{mes,mds,eqs} set in kportal_descriptor_setup */ + + /* We don't have an access control table! */ + ni->ni_actual_limits.max_ac_index = -1; + + ni->ni_actual_limits.max_pt_index = ptl_size - 1; + ni->ni_actual_limits.max_md_iovecs = PTL_MD_MAX_IOV; + ni->ni_actual_limits.max_me_list = INT_MAX; + + /* We don't support PtlGetPut! */ + ni->ni_actual_limits.max_getput_md = 0; - ni->debug = PTL_DEBUG_NONE; - ni->up = 1; - ni->refcnt++; + if (actual_limits != NULL) + *actual_limits = ni->ni_actual_limits; out: if (rc != PTL_OK) { - lib_cleanup_handle_hash (nal); - kportal_descriptor_cleanup (nal); + lib_cleanup_handle_hash (libnal); + kportal_descriptor_cleanup (libnal); } RETURN (rc); } int -lib_fini(nal_cb_t * nal) +lib_fini(lib_nal_t *nal) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; int idx; - ni->refcnt--; - - if (ni->refcnt != 0) - goto out; - - /* NB no stat_lock() since this is the last reference. The NAL + /* NB no state_lock() since this is the last reference. 
The NAL * should have shut down already, so it should be safe to unlink * and free all descriptors, even those that appear committed to a * network op (eg MD with non-zero pending count) */ - for (idx = 0; idx < ni->tbl.size; idx++) - while (!list_empty (&ni->tbl.tbl[idx])) { - lib_me_t *me = list_entry (ni->tbl.tbl[idx].next, + for (idx = 0; idx < ni->ni_portals.size; idx++) + while (!list_empty (&ni->ni_portals.tbl[idx])) { + lib_me_t *me = list_entry (ni->ni_portals.tbl[idx].next, lib_me_t, me_list); CERROR ("Active me %p on exit\n", me); @@ -369,12 +419,16 @@ lib_fini(nal_cb_t * nal) lib_msg_free (nal, msg); } - nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size); - ni->up = 0; + PORTAL_FREE(ni->ni_portals.tbl, + ni->ni_portals.size * sizeof(struct list_head)); lib_cleanup_handle_hash (nal); kportal_descriptor_cleanup (nal); - out: +#ifndef __KERNEL__ + pthread_mutex_destroy(&ni->ni_mutex); + pthread_cond_destroy(&ni->ni_cond); +#endif + return (PTL_OK); } diff --git a/lustre/portals/portals/lib-md.c b/lustre/portals/portals/lib-md.c index a1ed583..6deadb8 100644 --- a/lustre/portals/portals/lib-md.c +++ b/lustre/portals/portals/lib-md.c @@ -31,164 +31,176 @@ #endif #include -#include -/* - * must be called with state lock held - */ -void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) +/* must be called with state lock held */ +void +lib_md_unlink(lib_nal_t *nal, lib_md_t *md) { - lib_me_t *me = md->me; + if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) == 0) { + /* first unlink attempt... 
*/ + lib_me_t *me = md->me; + + md->md_flags |= PTL_MD_FLAG_ZOMBIE; + + /* Disassociate from ME (if any), and unlink it if it was created + * with PTL_UNLINK */ + if (me != NULL) { + me->md = NULL; + if (me->unlink == PTL_UNLINK) + lib_me_unlink(nal, me); + } + + /* emsure all future handle lookups fail */ + lib_invalidate_handle(nal, &md->md_lh); + } if (md->pending != 0) { CDEBUG(D_NET, "Queueing unlink of md %p\n", md); - md->md_flags |= PTL_MD_FLAG_UNLINK; return; } CDEBUG(D_NET, "Unlinking md %p\n", md); if ((md->options & PTL_MD_KIOV) != 0) { - if (nal->cb_unmap_pages != NULL) - nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov, - &md->md_addrkey); - } else if (nal->cb_unmap != NULL) - nal->cb_unmap (nal, md->md_niov, md->md_iov.iov, - &md->md_addrkey); - - if (me) { - me->md = NULL; - if (me->unlink == PTL_UNLINK) - lib_me_unlink(nal, me); + if (nal->libnal_unmap_pages != NULL) + nal->libnal_unmap_pages (nal, + md->md_niov, + md->md_iov.kiov, + &md->md_addrkey); + } else if (nal->libnal_unmap != NULL) { + nal->libnal_unmap (nal, + md->md_niov, md->md_iov.iov, + &md->md_addrkey); } - if (md->eq != NULL) - { + if (md->eq != NULL) { md->eq->eq_refcount--; LASSERT (md->eq->eq_refcount >= 0); } - lib_invalidate_handle (nal, &md->md_lh); list_del (&md->md_list); lib_md_free(nal, md); } /* must be called with state lock held */ -static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, - ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink) +static int +lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink) { - const int max_size_opts = PTL_MD_AUTO_UNLINK | - PTL_MD_MAX_SIZE; lib_eq_t *eq = NULL; int rc; int i; + int niov; + int total_length = 0; /* NB we are passed an allocated, but uninitialised/active md. * if we return success, caller may lib_md_unlink() it. * otherwise caller may only lib_md_free() it. 
*/ - if (!PtlHandleEqual (*eqh, PTL_EQ_NONE)) { - eq = ptl_handle2eq(eqh, nal); + if (!PtlHandleIsEqual (umd->eq_handle, PTL_EQ_NONE)) { + eq = ptl_handle2eq(&umd->eq_handle, nal); if (eq == NULL) - return PTL_INV_EQ; + return PTL_EQ_INVALID; } - /* Must check this _before_ allocation. Also, note that non-iov - * MDs must set md_niov to 0. */ - LASSERT((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0 || - md->niov <= PTL_MD_MAX_IOV); - - if ((md->options & max_size_opts) != 0 && /* max size used */ - (md->max_size < 0 || md->max_size > md->length)) // illegal max_size - return PTL_INV_MD; - - new->me = NULL; - new->start = md->start; - new->length = md->length; - new->offset = 0; - new->max_size = md->max_size; - new->unlink = unlink; - new->options = md->options; - new->user_ptr = md->user_ptr; - new->eq = eq; - new->threshold = md->threshold; - new->pending = 0; - new->md_flags = 0; - - if ((md->options & PTL_MD_IOV) != 0) { - int total_length = 0; - - if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */ - return PTL_INV_MD; - - new->md_niov = md->niov; - - if (nal->cb_read (nal, private, new->md_iov.iov, md->start, - md->niov * sizeof (new->md_iov.iov[0]))) - return PTL_SEGV; - - for (i = 0; i < new->md_niov; i++) { + /* This implementation doesn't know how to create START events or + * disable END events. Best to LASSERT our caller is compliant so + * we find out quickly... */ + LASSERT (eq == NULL || + ((umd->options & PTL_MD_EVENT_START_DISABLE) != 0 && + (umd->options & PTL_MD_EVENT_END_DISABLE) == 0)); + + lmd->me = NULL; + lmd->start = umd->start; + lmd->offset = 0; + lmd->max_size = umd->max_size; + lmd->options = umd->options; + lmd->user_ptr = umd->user_ptr; + lmd->eq = eq; + lmd->threshold = umd->threshold; + lmd->pending = 0; + lmd->md_flags = (unlink == PTL_UNLINK) ? 
PTL_MD_FLAG_AUTO_UNLINK : 0; + + if ((umd->options & PTL_MD_IOVEC) != 0) { + + if ((umd->options & PTL_MD_KIOV) != 0) /* Can't specify both */ + return PTL_MD_ILLEGAL; + + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.iov, umd->start, + niov * sizeof (lmd->md_iov.iov[0])); + + for (i = 0; i < niov; i++) { /* We take the base address on trust */ - if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */ - return PTL_VAL_FAILED; + if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return PTL_MD_ILLEGAL; - total_length += new->md_iov.iov[i].iov_len; + total_length += lmd->md_iov.iov[i].iov_len; } - if (md->length > total_length) - return PTL_IOV_TOO_SMALL; - - if (nal->cb_map != NULL) { - rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, - &new->md_addrkey); + lmd->length = total_length; + + if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return PTL_MD_ILLEGAL; + + if (nal->libnal_map != NULL) { + rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, + &lmd->md_addrkey); if (rc != PTL_OK) return (rc); } - } else if ((md->options & PTL_MD_KIOV) != 0) { + } else if ((umd->options & PTL_MD_KIOV) != 0) { #ifndef __KERNEL__ - return PTL_INV_MD; -#else - int total_length = 0; - + return PTL_MD_ILLEGAL; +#else /* Trap attempt to use paged I/O if unsupported early. 
*/ - if (nal->cb_send_pages == NULL || - nal->cb_recv_pages == NULL) - return PTL_INV_MD; + if (nal->libnal_send_pages == NULL || + nal->libnal_recv_pages == NULL) + return PTL_MD_INVALID; - new->md_niov = md->niov; + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.kiov, umd->start, + niov * sizeof (lmd->md_iov.kiov[0])); - if (nal->cb_read (nal, private, new->md_iov.kiov, md->start, - md->niov * sizeof (new->md_iov.kiov[0]))) - return PTL_SEGV; - - for (i = 0; i < new->md_niov; i++) { + for (i = 0; i < niov; i++) { /* We take the page pointer on trust */ - if (new->md_iov.kiov[i].kiov_offset + - new->md_iov.kiov[i].kiov_len > PAGE_SIZE ) + if (lmd->md_iov.kiov[i].kiov_offset + + lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE ) return PTL_VAL_FAILED; /* invalid length */ - total_length += new->md_iov.kiov[i].kiov_len; + total_length += lmd->md_iov.kiov[i].kiov_len; } - if (md->length > total_length) - return PTL_IOV_TOO_SMALL; + lmd->length = total_length; + + if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return PTL_MD_ILLEGAL; - if (nal->cb_map_pages != NULL) { - rc = nal->cb_map_pages (nal, new->md_niov, new->md_iov.kiov, - &new->md_addrkey); + if (nal->libnal_map_pages != NULL) { + rc = nal->libnal_map_pages (nal, niov, lmd->md_iov.kiov, + &lmd->md_addrkey); if (rc != PTL_OK) return (rc); } #endif } else { /* contiguous */ - new->md_niov = 1; - new->md_iov.iov[0].iov_base = md->start; - new->md_iov.iov[0].iov_len = md->length; - - if (nal->cb_map != NULL) { - rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, - &new->md_addrkey); + lmd->length = umd->length; + lmd->md_niov = niov = 1; + lmd->md_iov.iov[0].iov_base = umd->start; + lmd->md_iov.iov[0].iov_len = umd->length; + + if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > umd->length)) // illegal max_size + return PTL_MD_ILLEGAL; + + if (nal->libnal_map 
!= NULL) { + rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, + &lmd->md_addrkey); if (rc != PTL_OK) return (rc); } @@ -198,140 +210,125 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, eq->eq_refcount++; /* It's good; let handle2md succeed and add to active mds */ - lib_initialise_handle (nal, &new->md_lh, PTL_COOKIE_TYPE_MD); - list_add (&new->md_list, &nal->ni.ni_active_mds); + lib_initialise_handle (nal, &lmd->md_lh, PTL_COOKIE_TYPE_MD); + list_add (&lmd->md_list, &nal->libnal_ni.ni_active_mds); return PTL_OK; } /* must be called with state lock held */ -void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new) +void +lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd) { /* NB this doesn't copy out all the iov entries so when a * discontiguous MD is copied out, the target gets to know the * original iov pointer (in start) and the number of entries it had * and that's all. */ - new->start = md->start; - new->length = md->length; - new->threshold = md->threshold; - new->max_size = md->max_size; - new->options = md->options; - new->user_ptr = md->user_ptr; - ptl_eq2handle(&new->eventq, md->eq); - new->niov = ((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0) ? 0 : md->md_niov; + umd->start = lmd->start; + umd->length = ((lmd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0) ? 
+ lmd->length : lmd->md_niov; + umd->threshold = lmd->threshold; + umd->max_size = lmd->max_size; + umd->options = lmd->options; + umd->user_ptr = lmd->user_ptr; + ptl_eq2handle(&umd->eq_handle, nal, lmd->eq); } -int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_md_attach(nal_t *apinal, ptl_handle_me_t *meh, + ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle) { - /* - * Incoming: - * ptl_handle_me_t current_in - * ptl_md_t md_in - * ptl_unlink_t unlink_in - * - * Outgoing: - * ptl_handle_md_t * handle_out - */ - - PtlMDAttach_in *args = v_args; - PtlMDAttach_out *ret = v_ret; - lib_me_t *me; - lib_md_t *md; + lib_nal_t *nal = apinal->nal_data; + lib_me_t *me; + lib_md_t *md; unsigned long flags; + int rc; - if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 && - args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */ - return (ret->rc = PTL_IOV_TOO_MANY); + if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && + umd->length > PTL_MD_MAX_IOV) /* too many fragments */ + return PTL_IOV_INVALID; - md = lib_md_alloc(nal, &args->md_in); + md = lib_md_alloc(nal, umd); if (md == NULL) - return (ret->rc = PTL_NOSPACE); + return PTL_NO_SPACE; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me = ptl_handle2me(&args->me_in, nal); + me = ptl_handle2me(meh, nal); if (me == NULL) { - ret->rc = PTL_INV_ME; + rc = PTL_ME_INVALID; } else if (me->md != NULL) { - ret->rc = PTL_INUSE; + rc = PTL_ME_IN_USE; } else { - ret->rc = lib_md_build(nal, md, private, &args->md_in, - &args->eq_in, args->unlink_in); - - if (ret->rc == PTL_OK) { + rc = lib_md_build(nal, md, umd, unlink); + if (rc == PTL_OK) { me->md = md; md->me = me; - ptl_md2handle(&ret->handle_out, md); + ptl_md2handle(handle, nal, md); - state_unlock (nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_OK); } } lib_md_free (nal, md); - state_unlock (nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + return (rc); } -int do_PtlMDBind(nal_cb_t * 
nal, void *private, void *v_args, void *v_ret) +int +lib_api_md_bind(nal_t *apinal, + ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle) { - /* - * Incoming: - * ptl_handle_ni_t ni_in - * ptl_md_t md_in - * - * Outgoing: - * ptl_handle_md_t * handle_out - */ - - PtlMDBind_in *args = v_args; - PtlMDBind_out *ret = v_ret; - lib_md_t *md; + lib_nal_t *nal = apinal->nal_data; + lib_md_t *md; unsigned long flags; + int rc; - if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 && - args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */ - return (ret->rc = PTL_IOV_TOO_MANY); + if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && + umd->length > PTL_MD_MAX_IOV) /* too many fragments */ + return PTL_IOV_INVALID; - md = lib_md_alloc(nal, &args->md_in); + md = lib_md_alloc(nal, umd); if (md == NULL) - return (ret->rc = PTL_NOSPACE); + return PTL_NO_SPACE; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - ret->rc = lib_md_build(nal, md, private, - &args->md_in, &args->eq_in, PTL_UNLINK); + rc = lib_md_build(nal, md, umd, unlink); - if (ret->rc == PTL_OK) { - ptl_md2handle(&ret->handle_out, md); + if (rc == PTL_OK) { + ptl_md2handle(handle, nal, md); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_OK); } lib_md_free (nal, md); - state_unlock(nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + return (rc); } -int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_md_unlink (nal_t *apinal, ptl_handle_md_t *mdh) { - PtlMDUnlink_in *args = v_args; - PtlMDUnlink_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; ptl_event_t ev; lib_md_t *md; unsigned long flags; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL) { - state_unlock(nal, &flags); - return (ret->rc = PTL_INV_MD); + LIB_UNLOCK(nal, flags); + return PTL_MD_INVALID; } /* If the MD is busy, lib_md_unlink just marks it for deletion, and 
@@ -343,104 +340,87 @@ int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) memset(&ev, 0, sizeof(ev)); ev.type = PTL_EVENT_UNLINK; - ev.status = PTL_OK; + ev.ni_fail_type = PTL_OK; ev.unlinked = 1; - lib_md_deconstruct(nal, md, &ev.mem_desc); + lib_md_deconstruct(nal, md, &ev.md); + ptl_md2handle(&ev.md_handle, nal, md); - lib_enq_event_locked(nal, private, md->eq, &ev); + lib_enq_event_locked(nal, NULL, md->eq, &ev); } - lib_md_deconstruct(nal, md, &ret->status_out); lib_md_unlink(nal, md); - ret->rc = PTL_OK; - - state_unlock(nal, &flags); - return (PTL_OK); + LIB_UNLOCK(nal, flags); + return PTL_OK; } -int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, - void *v_ret) +int +lib_api_md_update (nal_t *apinal, + ptl_handle_md_t *mdh, + ptl_md_t *oldumd, ptl_md_t *newumd, + ptl_handle_eq_t *testqh) { - /* - * Incoming: - * ptl_handle_md_t md_in - * ptl_md_t * old_inout - * ptl_md_t * new_inout - * ptl_handle_eq_t testq_in - * ptl_seq_t sequence_in - * - * Outgoing: - * ptl_md_t * old_inout - * ptl_md_t * new_inout - */ - PtlMDUpdate_internal_in *args = v_args; - PtlMDUpdate_internal_out *ret = v_ret; - lib_md_t *md; - lib_eq_t *test_eq = NULL; - ptl_md_t *new = &args->new_inout; + lib_nal_t *nal = apinal->nal_data; + lib_md_t *md; + lib_eq_t *test_eq = NULL; unsigned long flags; + int rc; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL) { - ret->rc = PTL_INV_MD; + rc = PTL_MD_INVALID; goto out; } - if (args->old_inout_valid) - lib_md_deconstruct(nal, md, &ret->old_inout); + if (oldumd != NULL) + lib_md_deconstruct(nal, md, oldumd); - if (!args->new_inout_valid) { - ret->rc = PTL_OK; + if (newumd == NULL) { + rc = PTL_OK; goto out; } - /* XXX fttb, the new MD must be the same type wrt fragmentation */ - if (((new->options ^ md->options) & - (PTL_MD_IOV | PTL_MD_KIOV)) != 0) { - ret->rc = PTL_INV_MD; - goto out; - } - - if (new->niov 
> md->md_niov) { - ret->rc = PTL_IOV_TOO_MANY; + /* XXX fttb, the new MD must be the same "shape" wrt fragmentation, + * since we simply overwrite the old lib-md */ + if ((((newumd->options ^ md->options) & + (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0) || + ((newumd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0 && + newumd->length != md->md_niov)) { + rc = PTL_IOV_INVALID; goto out; } - if (new->niov < md->md_niov) { - ret->rc = PTL_IOV_TOO_SMALL; - goto out; - } - - if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) { - test_eq = ptl_handle2eq(&args->testq_in, nal); + if (!PtlHandleIsEqual (*testqh, PTL_EQ_NONE)) { + test_eq = ptl_handle2eq(testqh, nal); if (test_eq == NULL) { - ret->rc = PTL_INV_EQ; + rc = PTL_EQ_INVALID; goto out; } } if (md->pending != 0) { - ret->rc = PTL_NOUPDATE; - goto out; + rc = PTL_MD_NO_UPDATE; + goto out; } if (test_eq == NULL || - test_eq->sequence == args->sequence_in) { + test_eq->eq_deq_seq == test_eq->eq_enq_seq) { lib_me_t *me = md->me; + int unlink = (md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) ? 
+ PTL_UNLINK : PTL_RETAIN; // #warning this does not track eq refcounts properly - ret->rc = lib_md_build(nal, md, private, - new, &new->eventq, md->unlink); + rc = lib_md_build(nal, md, newumd, unlink); md->me = me; } else { - ret->rc = PTL_NOUPDATE; + rc = PTL_MD_NO_UPDATE; } out: - state_unlock(nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + + return rc; } diff --git a/lustre/portals/portals/lib-me.c b/lustre/portals/portals/lib-me.c index 31ac214..9665b4f 100644 --- a/lustre/portals/portals/lib-me.c +++ b/lustre/portals/portals/lib-me.c @@ -31,128 +31,130 @@ #endif #include -#include -static void lib_me_dump(nal_cb_t * nal, lib_me_t * me); - -int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_me_attach(nal_t *apinal, + ptl_pt_index_t portal, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle) { - PtlMEAttach_in *args = v_args; - PtlMEAttach_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; - lib_ptl_t *tbl = &ni->tbl; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; + lib_ptl_t *tbl = &ni->ni_portals; + lib_me_t *me; unsigned long flags; - lib_me_t *me; - if (args->index_in >= tbl->size) - return ret->rc = PTL_INV_PTINDEX; + if (portal >= tbl->size) + return PTL_PT_INDEX_INVALID; /* Should check for valid matchid, but not yet */ - if (0) - return ret->rc = PTL_INV_PROC; me = lib_me_alloc (nal); if (me == NULL) - return (ret->rc = PTL_NOSPACE); + return PTL_NO_SPACE; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me->match_id = args->match_id_in; - me->match_bits = args->match_bits_in; - me->ignore_bits = args->ignore_bits_in; - me->unlink = args->unlink_in; + me->match_id = match_id; + me->match_bits = match_bits; + me->ignore_bits = ignore_bits; + me->unlink = unlink; me->md = NULL; lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME); - if (args->position_in == 
PTL_INS_AFTER) - list_add_tail(&me->me_list, &(tbl->tbl[args->index_in])); + if (pos == PTL_INS_AFTER) + list_add_tail(&me->me_list, &(tbl->tbl[portal])); else - list_add(&me->me_list, &(tbl->tbl[args->index_in])); + list_add(&me->me_list, &(tbl->tbl[portal])); - ptl_me2handle(&ret->handle_out, me); + ptl_me2handle(handle, nal, me); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return ret->rc = PTL_OK; + return PTL_OK; } -int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_me_insert(nal_t *apinal, + ptl_handle_me_t *current_meh, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle) { - PtlMEInsert_in *args = v_args; - PtlMEInsert_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; + lib_me_t *current_me; + lib_me_t *new_me; unsigned long flags; - lib_me_t *me; - lib_me_t *new; - new = lib_me_alloc (nal); - if (new == NULL) - return (ret->rc = PTL_NOSPACE); + new_me = lib_me_alloc (nal); + if (new_me == NULL) + return PTL_NO_SPACE; /* Should check for valid matchid, but not yet */ - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me = ptl_handle2me(&args->current_in, nal); - if (me == NULL) { - lib_me_free (nal, new); + current_me = ptl_handle2me(current_meh, nal); + if (current_me == NULL) { + lib_me_free (nal, new_me); - state_unlock (nal, &flags); - return (ret->rc = PTL_INV_ME); + LIB_UNLOCK(nal, flags); + return PTL_ME_INVALID; } - new->match_id = args->match_id_in; - new->match_bits = args->match_bits_in; - new->ignore_bits = args->ignore_bits_in; - new->unlink = args->unlink_in; - new->md = NULL; + new_me->match_id = match_id; + new_me->match_bits = match_bits; + new_me->ignore_bits = ignore_bits; + new_me->unlink = unlink; + new_me->md = NULL; - lib_initialise_handle (nal, &new->me_lh, PTL_COOKIE_TYPE_ME); + lib_initialise_handle (nal, &new_me->me_lh, PTL_COOKIE_TYPE_ME); - if (args->position_in == 
PTL_INS_AFTER) - list_add_tail(&new->me_list, &me->me_list); + if (pos == PTL_INS_AFTER) + list_add_tail(&new_me->me_list, &current_me->me_list); else - list_add(&new->me_list, &me->me_list); + list_add(&new_me->me_list, &current_me->me_list); - ptl_me2handle(&ret->handle_out, new); + ptl_me2handle(handle, nal, new_me); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return ret->rc = PTL_OK; + return PTL_OK; } -int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_me_unlink (nal_t *apinal, ptl_handle_me_t *meh) { - PtlMEUnlink_in *args = v_args; - PtlMEUnlink_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; unsigned long flags; - lib_me_t *me; + lib_me_t *me; + int rc; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me = ptl_handle2me(&args->current_in, nal); + me = ptl_handle2me(meh, nal); if (me == NULL) { - ret->rc = PTL_INV_ME; + rc = PTL_ME_INVALID; } else { lib_me_unlink(nal, me); - ret->rc = PTL_OK; + rc = PTL_OK; } - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return (ret->rc); + return (rc); } /* call with state_lock please */ -void lib_me_unlink(nal_cb_t *nal, lib_me_t *me) +void +lib_me_unlink(lib_nal_t *nal, lib_me_t *me) { - lib_ni_t *ni = &nal->ni; - - if (ni->debug & PTL_DEBUG_UNLINK) { - ptl_handle_any_t handle; - ptl_me2handle(&handle, me); - } - list_del (&me->me_list); if (me->md) { @@ -164,64 +166,20 @@ void lib_me_unlink(nal_cb_t *nal, lib_me_t *me) lib_me_free(nal, me); } -int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) -{ - PtlTblDump_in *args = v_args; - PtlTblDump_out *ret = v_ret; - lib_ptl_t *tbl = &nal->ni.tbl; - ptl_handle_any_t handle; - struct list_head *tmp; - unsigned long flags; - - if (args->index_in < 0 || args->index_in >= tbl->size) - return ret->rc = PTL_INV_PTINDEX; - - nal->cb_printf(nal, "Portal table index %d\n", args->index_in); - - state_lock(nal, &flags); - list_for_each(tmp, &(tbl->tbl[args->index_in])) { - lib_me_t *me = 
list_entry(tmp, lib_me_t, me_list); - ptl_me2handle(&handle, me); - lib_me_dump(nal, me); - } - state_unlock(nal, &flags); - - return ret->rc = PTL_OK; -} - -int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +#if 0 +static void +lib_me_dump(lib_nal_t *nal, lib_me_t * me) { - PtlMEDump_in *args = v_args; - PtlMEDump_out *ret = v_ret; - lib_me_t *me; - unsigned long flags; - - state_lock(nal, &flags); - - me = ptl_handle2me(&args->current_in, nal); - if (me == NULL) { - ret->rc = PTL_INV_ME; - } else { - lib_me_dump(nal, me); - ret->rc = PTL_OK; - } + CWARN("Match Entry %p ("LPX64")\n", me, + me->me_lh.lh_cookie); - state_unlock(nal, &flags); + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->match_bits, me->ignore_bits); - return ret->rc; -} - -static void lib_me_dump(nal_cb_t * nal, lib_me_t * me) -{ - nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, - me->me_lh.lh_cookie); - - nal->cb_printf(nal, "\tMatch/Ignore\t= %016lx / %016lx\n", - me->match_bits, me->ignore_bits); - - nal->cb_printf(nal, "\tMD\t= %p\n", me->md); - nal->cb_printf(nal, "\tprev\t= %p\n", - list_entry(me->me_list.prev, lib_me_t, me_list)); - nal->cb_printf(nal, "\tnext\t= %p\n", - list_entry(me->me_list.next, lib_me_t, me_list)); + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, lib_me_t, me_list)); + CWARN("\tnext\t= %p\n", + list_entry(me->me_list.next, lib_me_t, me_list)); } +#endif diff --git a/lustre/portals/portals/lib-move.c b/lustre/portals/portals/lib-move.c index ecd543c..d584f1c 100644 --- a/lustre/portals/portals/lib-move.c +++ b/lustre/portals/portals/lib-move.c @@ -31,37 +31,32 @@ #endif #include #include -#include -/* - * Right now it does not check access control lists. - * - * We only support one MD per ME, which is how the Portals 3.1 spec is written. - * All previous complication is removed. 
- */ +/* forward ref */ +static void lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg); -static lib_me_t * -lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid, - ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset, - ptl_match_bits_t match_bits, ptl_size_t *mlength_out, - ptl_size_t *offset_out, int *unlink_out) +static lib_md_t * +lib_match_md(lib_nal_t *nal, int index, int op_mask, + ptl_nid_t src_nid, ptl_pid_t src_pid, + ptl_size_t rlength, ptl_size_t roffset, + ptl_match_bits_t match_bits, lib_msg_t *msg, + ptl_size_t *mlength_out, ptl_size_t *offset_out) { - lib_ni_t *ni = &nal->ni; - struct list_head *match_list = &ni->tbl.tbl[index]; + lib_ni_t *ni = &nal->libnal_ni; + struct list_head *match_list = &ni->ni_portals.tbl[index]; struct list_head *tmp; lib_me_t *me; lib_md_t *md; ptl_size_t mlength; ptl_size_t offset; - ENTRY; CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d " "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits); - if (index < 0 || index >= ni->tbl.size) { + if (index < 0 || index >= ni->ni_portals.size) { CERROR("Invalid portal %d not in [0-%d]\n", - index, ni->tbl.size); + index, ni->ni_portals.size); goto failed; } @@ -75,18 +70,21 @@ lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid, LASSERT (me == md->me); - /* MD deactivated */ - if (md->threshold == 0) - continue; - /* mismatched MD op */ if ((md->options & op_mask) == 0) continue; + /* MD exhausted */ + if (lib_md_exhausted(md)) + continue; + /* mismatched ME nid/pid? 
*/ if (me->match_id.nid != PTL_NID_ANY && me->match_id.nid != src_nid) continue; + + CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n", + me->match_id.pid, src_pid); if (me->match_id.pid != PTL_PID_ANY && me->match_id.pid != src_pid) @@ -103,10 +101,12 @@ lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid, else offset = roffset; - mlength = md->length - offset; - if ((md->options & PTL_MD_MAX_SIZE) != 0 && - mlength > md->max_size) + if ((md->options & PTL_MD_MAX_SIZE) != 0) { mlength = md->max_size; + LASSERT (md->offset + mlength <= md->length); + } else { + mlength = md->length - offset; + } if (rlength <= mlength) { /* fits in allowed space */ mlength = rlength; @@ -118,78 +118,103 @@ lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid, goto failed; } + /* Commit to this ME/MD */ + CDEBUG(D_NET, "Incoming %s index %x from "LPU64"/%u of " + "length %d/%d into md "LPX64" [%d] + %d\n", + (op_mask == PTL_MD_OP_PUT) ? "put" : "get", + index, src_nid, src_pid, mlength, rlength, + md->md_lh.lh_cookie, md->md_niov, offset); + + lib_commit_md(nal, md, msg); md->offset = offset + mlength; + /* NB Caller sets ev.type and ev.hdr_data */ + msg->ev.initiator.nid = src_nid; + msg->ev.initiator.pid = src_pid; + msg->ev.pt_index = index; + msg->ev.match_bits = match_bits; + msg->ev.rlength = rlength; + msg->ev.mlength = mlength; + msg->ev.offset = offset; + + lib_md_deconstruct(nal, md, &msg->ev.md); + ptl_md2handle(&msg->ev.md_handle, nal, md); + *offset_out = offset; *mlength_out = mlength; - *unlink_out = ((md->options & PTL_MD_AUTO_UNLINK) != 0 && - md->offset >= (md->length - md->max_size)); - RETURN (me); + + /* Auto-unlink NOW, so the ME gets unlinked if required. + * We bumped md->pending above so the MD just gets flagged + * for unlink when it is finalized. 
*/ + if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) != 0 && + lib_md_exhausted(md)) + lib_md_unlink(nal, md); + + RETURN (md); } failed: CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64 " offset %d length %d: no match\n", - ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT", + ni->ni_pid.nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT", src_nid, src_pid, index, match_bits, roffset, rlength); RETURN(NULL); } -int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret) +int lib_api_fail_nid (nal_t *apinal, ptl_nid_t nid, unsigned int threshold) { - PtlFailNid_in *args = v_args; - PtlFailNid_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; lib_test_peer_t *tp; unsigned long flags; struct list_head *el; struct list_head *next; struct list_head cull; - if (args->threshold != 0) { + if (threshold != 0) { /* Adding a new entry */ - tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp)); + PORTAL_ALLOC(tp, sizeof(*tp)); if (tp == NULL) - return (ret->rc = PTL_FAIL); + return PTL_NO_SPACE; - tp->tp_nid = args->nid; - tp->tp_threshold = args->threshold; + tp->tp_nid = nid; + tp->tp_threshold = threshold; - state_lock (nal, &flags); - list_add (&tp->tp_list, &nal->ni.ni_test_peers); - state_unlock (nal, &flags); - return (ret->rc = PTL_OK); + LIB_LOCK(nal, flags); + list_add_tail (&tp->tp_list, &nal->libnal_ni.ni_test_peers); + LIB_UNLOCK(nal, flags); + return PTL_OK; } /* removing entries */ INIT_LIST_HEAD (&cull); - state_lock (nal, &flags); + LIB_LOCK(nal, flags); - list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) { tp = list_entry (el, lib_test_peer_t, tp_list); if (tp->tp_threshold == 0 || /* needs culling anyway */ - args->nid == PTL_NID_ANY || /* removing all entries */ - tp->tp_nid == args->nid) /* matched this one */ + nid == PTL_NID_ANY || /* removing all entries */ + tp->tp_nid == nid) /* matched this one */ { list_del (&tp->tp_list); list_add 
(&tp->tp_list, &cull); } } - state_unlock (nal, &flags); + LIB_UNLOCK(nal, flags); while (!list_empty (&cull)) { tp = list_entry (cull.next, lib_test_peer_t, tp_list); list_del (&tp->tp_list); - nal->cb_free (nal, tp, sizeof (*tp)); + PORTAL_FREE(tp, sizeof (*tp)); } - return (ret->rc = PTL_OK); + return PTL_OK; } static int -fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) +fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing) { lib_test_peer_t *tp; struct list_head *el; @@ -200,9 +225,9 @@ fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) INIT_LIST_HEAD (&cull); - state_lock (nal, &flags); + LIB_LOCK (nal, flags); - list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) { tp = list_entry (el, lib_test_peer_t, tp_list); if (tp->tp_threshold == 0) { @@ -234,13 +259,13 @@ fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) } } - state_unlock (nal, &flags); + LIB_UNLOCK (nal, flags); while (!list_empty (&cull)) { tp = list_entry (cull.next, lib_test_peer_t, tp_list); list_del (&tp->tp_list); - nal->cb_free (nal, tp, sizeof (*tp)); + PORTAL_FREE(tp, sizeof (*tp)); } return (fail); @@ -531,52 +556,52 @@ lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, #endif ptl_err_t -lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, +lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) { if (mlen == 0) - return (nal->cb_recv(nal, private, msg, - 0, NULL, - offset, mlen, rlen)); + return (nal->libnal_recv(nal, private, msg, + 0, NULL, + offset, mlen, rlen)); if ((md->options & PTL_MD_KIOV) == 0) - return (nal->cb_recv(nal, private, msg, - md->md_niov, md->md_iov.iov, - offset, mlen, rlen)); + return (nal->libnal_recv(nal, private, msg, + md->md_niov, md->md_iov.iov, + offset, mlen, rlen)); - return (nal->cb_recv_pages(nal, private, msg, - md->md_niov, md->md_iov.kiov, - offset, mlen, rlen)); + return 
(nal->libnal_recv_pages(nal, private, msg, + md->md_niov, md->md_iov.kiov, + offset, mlen, rlen)); } ptl_err_t -lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, +lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, lib_md_t *md, ptl_size_t offset, ptl_size_t len) { if (len == 0) - return (nal->cb_send(nal, private, msg, - hdr, type, nid, pid, - 0, NULL, - offset, len)); + return (nal->libnal_send(nal, private, msg, + hdr, type, nid, pid, + 0, NULL, + offset, len)); if ((md->options & PTL_MD_KIOV) == 0) - return (nal->cb_send(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.iov, - offset, len)); - - return (nal->cb_send_pages(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.kiov, - offset, len)); + return (nal->libnal_send(nal, private, msg, + hdr, type, nid, pid, + md->md_niov, md->md_iov.iov, + offset, len)); + + return (nal->libnal_send_pages(nal, private, msg, + hdr, type, nid, pid, + md->md_niov, md->md_iov.kiov, + offset, len)); } static void -lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg) +lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg) { - /* ALWAYS called holding the state_lock */ - lib_counters_t *counters = &nal->ni.counters; + /* ALWAYS called holding the LIB_LOCK */ + lib_counters_t *counters = &nal->libnal_ni.ni_counters; /* Here, we commit the MD to a network OP by marking it busy and * decrementing its threshold. 
Come what may, the network "owns" @@ -593,11 +618,11 @@ lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg) if (counters->msgs_alloc > counters->msgs_max) counters->msgs_max = counters->msgs_alloc; - list_add (&msg->msg_list, &nal->ni.ni_active_msgs); + list_add (&msg->msg_list, &nal->libnal_ni.ni_active_msgs); } static void -lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) +lib_drop_message (lib_nal_t *nal, void *private, ptl_hdr_t *hdr) { unsigned long flags; @@ -605,10 +630,10 @@ lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) * to receive (init_msg() not called) and therefore can't cause an * event. */ - state_lock(nal, &flags); - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += hdr->payload_length; - state_unlock(nal, &flags); + LIB_LOCK(nal, flags); + nal->libnal_ni.ni_counters.drop_count++; + nal->libnal_ni.ni_counters.drop_length += hdr->payload_length; + LIB_UNLOCK(nal, flags); /* NULL msg => if NAL calls lib_finalize it will be a noop */ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); @@ -622,146 +647,98 @@ lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) * */ static ptl_err_t -parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) +parse_put(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; ptl_size_t mlength = 0; ptl_size_t offset = 0; - int unlink = 0; ptl_err_t rc; - lib_me_t *me; lib_md_t *md; unsigned long flags; /* Convert put fields to host byte order */ - hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits); - hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index); - hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset); - - state_lock(nal, &flags); - - me = lib_find_me(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT, - hdr->src_nid, hdr->src_pid, - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.match_bits, - &mlength, &offset, &unlink); - if (me 
== NULL) { - state_unlock(nal, &flags); + hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); + hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + + LIB_LOCK(nal, flags); + + md = lib_match_md(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT, + hdr->src_nid, hdr->src_pid, + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.match_bits, msg, + &mlength, &offset); + if (md == NULL) { + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } - md = me->md; - CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d " - "into md "LPX64" [%d] + %d\n", hdr->msg.put.ptl_index, - hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, - md->md_lh.lh_cookie, md->md_niov, offset); - - lib_commit_md(nal, md, msg); - - msg->ev.type = PTL_EVENT_PUT; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.portal = hdr->msg.put.ptl_index; - msg->ev.match_bits = hdr->msg.put.match_bits; - msg->ev.rlength = hdr->payload_length; - msg->ev.mlength = mlength; - msg->ev.offset = offset; + msg->ev.type = PTL_EVENT_PUT_END; msg->ev.hdr_data = hdr->msg.put.hdr_data; - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) && !(md->options & PTL_MD_ACK_DISABLE)) { msg->ack_wmd = hdr->msg.put.ack_wmd; } - ni->counters.recv_count++; - ni->counters.recv_length += mlength; + ni->ni_counters.recv_count++; + ni->ni_counters.recv_length += mlength; - /* only unlink after MD's pending count has been bumped in - * lib_commit_md() otherwise lib_me_unlink() will nuke it */ - if (unlink) - lib_me_unlink (nal, me); - - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); rc = lib_recv(nal, private, msg, md, offset, mlength, hdr->payload_length); if (rc != PTL_OK) CERROR(LPU64": error on receiving PUT from "LPU64": %d\n", - ni->nid, hdr->src_nid, rc); + ni->ni_pid.nid, hdr->src_nid, rc); return (rc); } static ptl_err_t -parse_get(nal_cb_t 
*nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) +parse_get(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; ptl_size_t mlength = 0; ptl_size_t offset = 0; - int unlink = 0; - lib_me_t *me; lib_md_t *md; ptl_hdr_t reply; unsigned long flags; int rc; /* Convert get fields to host byte order */ - hdr->msg.get.match_bits = NTOH__u64 (hdr->msg.get.match_bits); - hdr->msg.get.ptl_index = NTOH__u32 (hdr->msg.get.ptl_index); - hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length); - hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset); - - state_lock(nal, &flags); - - me = lib_find_me(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET, - hdr->src_nid, hdr->src_pid, - hdr->msg.get.sink_length, hdr->msg.get.src_offset, - hdr->msg.get.match_bits, - &mlength, &offset, &unlink); - if (me == NULL) { - state_unlock(nal, &flags); + hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); + hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); + + LIB_LOCK(nal, flags); + + md = lib_match_md(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET, + hdr->src_nid, hdr->src_pid, + hdr->msg.get.sink_length, hdr->msg.get.src_offset, + hdr->msg.get.match_bits, msg, + &mlength, &offset); + if (md == NULL) { + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } - md = me->md; - CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d " - "from md "LPX64" [%d] + %d\n", hdr->msg.get.ptl_index, - hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, - md->md_lh.lh_cookie, md->md_niov, offset); - - lib_commit_md(nal, md, msg); - - msg->ev.type = PTL_EVENT_GET; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.portal = hdr->msg.get.ptl_index; - msg->ev.match_bits = hdr->msg.get.match_bits; - msg->ev.rlength = 
hdr->payload_length; - msg->ev.mlength = mlength; - msg->ev.offset = offset; + msg->ev.type = PTL_EVENT_GET_END; msg->ev.hdr_data = 0; - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - - ni->counters.send_count++; - ni->counters.send_length += mlength; + ni->ni_counters.send_count++; + ni->ni_counters.send_length += mlength; - /* only unlink after MD's refcount has been bumped in - * lib_commit_md() otherwise lib_me_unlink() will nuke it */ - if (unlink) - lib_me_unlink (nal, me); - - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); memset (&reply, 0, sizeof (reply)); - reply.type = HTON__u32 (PTL_MSG_REPLY); - reply.dest_nid = HTON__u64 (hdr->src_nid); - reply.src_nid = HTON__u64 (ni->nid); - reply.dest_pid = HTON__u32 (hdr->src_pid); - reply.src_pid = HTON__u32 (ni->pid); - reply.payload_length = HTON__u32 (mlength); + reply.type = cpu_to_le32(PTL_MSG_REPLY); + reply.dest_nid = cpu_to_le64(hdr->src_nid); + reply.dest_pid = cpu_to_le32(hdr->src_pid); + reply.src_nid = cpu_to_le64(ni->ni_pid.nid); + reply.src_pid = cpu_to_le32(ni->ni_pid.pid); + reply.payload_length = cpu_to_le32(mlength); reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd; @@ -772,7 +749,7 @@ parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->src_nid, hdr->src_pid, md, offset, mlength); if (rc != PTL_OK) CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n", - ni->nid, hdr->src_nid, rc); + ni->ni_pid.nid, hdr->src_nid, rc); /* Discard any junk after the hdr */ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); @@ -781,27 +758,27 @@ parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) } static ptl_err_t -parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) +parse_reply(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; lib_md_t *md; int rlength; int length; unsigned long flags; ptl_err_t rc; - state_lock(nal, &flags); + 
LIB_LOCK(nal, flags); /* NB handles only looked up by creator (no flips) */ md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal); if (md == NULL || md->threshold == 0) { CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n", - ni->nid, hdr->src_nid, + ni->ni_pid.nid, hdr->src_nid, md == NULL ? "invalid" : "inactive", hdr->msg.reply.dst_wmd.wh_interface_cookie, hdr->msg.reply.dst_wmd.wh_object_cookie); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } @@ -813,10 +790,10 @@ parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) if ((md->options & PTL_MD_TRUNCATE) == 0) { CERROR (LPU64": Dropping REPLY from "LPU64 " length %d for MD "LPX64" would overflow (%d)\n", - ni->nid, hdr->src_nid, length, + ni->ni_pid.nid, hdr->src_nid, length, hdr->msg.reply.dst_wmd.wh_object_cookie, md->length); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } length = md->length; @@ -828,56 +805,57 @@ parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) lib_commit_md(nal, md, msg); - msg->ev.type = PTL_EVENT_REPLY; + msg->ev.type = PTL_EVENT_REPLY_END; msg->ev.initiator.nid = hdr->src_nid; msg->ev.initiator.pid = hdr->src_pid; msg->ev.rlength = rlength; msg->ev.mlength = length; msg->ev.offset = 0; - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + lib_md_deconstruct(nal, md, &msg->ev.md); + ptl_md2handle(&msg->ev.md_handle, nal, md); - ni->counters.recv_count++; - ni->counters.recv_length += length; + ni->ni_counters.recv_count++; + ni->ni_counters.recv_length += length; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); rc = lib_recv(nal, private, msg, md, 0, length, rlength); if (rc != PTL_OK) CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n", - ni->nid, hdr->src_nid, rc); + ni->ni_pid.nid, hdr->src_nid, rc); return (rc); } static ptl_err_t -parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) +parse_ack(lib_nal_t *nal, ptl_hdr_t *hdr, void 
*private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; lib_md_t *md; unsigned long flags; /* Convert ack fields to host byte order */ - hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits); - hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength); + hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); - state_lock(nal, &flags); + LIB_LOCK(nal, flags); /* NB handles only looked up by creator (no flips) */ md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal); if (md == NULL || md->threshold == 0) { CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD " - LPX64"."LPX64"\n", ni->nid, hdr->src_nid, + LPX64"."LPX64"\n", ni->ni_pid.nid, hdr->src_nid, (md == NULL) ? "invalid" : "inactive", hdr->msg.ack.dst_wmd.wh_interface_cookie, hdr->msg.ack.dst_wmd.wh_object_cookie); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", - ni->nid, hdr->src_nid, + ni->ni_pid.nid, hdr->src_nid, hdr->msg.ack.dst_wmd.wh_object_cookie); lib_commit_md(nal, md, msg); @@ -888,11 +866,12 @@ parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) msg->ev.mlength = hdr->msg.ack.mlength; msg->ev.match_bits = hdr->msg.ack.match_bits; - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + lib_md_deconstruct(nal, md, &msg->ev.md); + ptl_md2handle(&msg->ev.md_handle, nal, md); - ni->counters.recv_count++; + ni->ni_counters.recv_count++; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); /* We have received and matched up the ack OK, create the * completion event now... 
*/ @@ -923,129 +902,154 @@ hdr_type_string (ptl_hdr_t *hdr) } } -void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr) +void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr) { char *type_str = hdr_type_string (hdr); - nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str); - nal->cb_printf(nal, " From nid/pid %Lu/%Lu", hdr->src_nid, - hdr->src_pid); - nal->cb_printf(nal, " To nid/pid %Lu/%Lu\n", hdr->dest_nid, - hdr->dest_pid); + CWARN("P3 Header at %p of type %s\n", hdr, type_str); + CWARN(" From nid/pid "LPX64"/%u", hdr->src_nid, hdr->src_pid); + CWARN(" To nid/pid "LPX64"/%u\n", hdr->dest_nid, hdr->dest_pid); switch (hdr->type) { default: break; case PTL_MSG_PUT: - nal->cb_printf(nal, - " Ptl index %d, ack md "LPX64"."LPX64", " - "match bits "LPX64"\n", - hdr->msg.put.ptl_index, - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - hdr->msg.put.match_bits); - nal->cb_printf(nal, - " Length %d, offset %d, hdr data "LPX64"\n", - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.hdr_data); + CWARN(" Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPX64"\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + CWARN(" Length %d, offset %d, hdr data "LPX64"\n", + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.hdr_data); break; case PTL_MSG_GET: - nal->cb_printf(nal, - " Ptl index %d, return md "LPX64"."LPX64", " - "match bits "LPX64"\n", hdr->msg.get.ptl_index, - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - nal->cb_printf(nal, - " Length %d, src offset %d\n", - hdr->msg.get.sink_length, - hdr->msg.get.src_offset); + CWARN(" Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPX64"\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CWARN(" Length %d, 
src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); break; case PTL_MSG_ACK: - nal->cb_printf(nal, " dst md "LPX64"."LPX64", " - "manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - hdr->msg.ack.mlength); + CWARN(" dst md "LPX64"."LPX64", " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); break; case PTL_MSG_REPLY: - nal->cb_printf(nal, " dst md "LPX64"."LPX64", " - "length %d\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie, - hdr->payload_length); + CWARN(" dst md "LPX64"."LPX64", " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + hdr->payload_length); } } /* end of print_hdr() */ -void -lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) +ptl_err_t +lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, void *private) { unsigned long flags; ptl_err_t rc; lib_msg_t *msg; + + /* NB we return PTL_OK if we manage to parse the header and believe + * it looks OK. 
Anything that goes wrong with receiving the + * message after that point is the responsibility of the NAL */ /* convert common fields to host byte order */ - hdr->dest_nid = NTOH__u64 (hdr->dest_nid); - hdr->src_nid = NTOH__u64 (hdr->src_nid); - hdr->dest_pid = NTOH__u32 (hdr->dest_pid); - hdr->src_pid = NTOH__u32 (hdr->src_pid); - hdr->type = NTOH__u32 (hdr->type); - hdr->payload_length = NTOH__u32(hdr->payload_length); -#if 0 - nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n", - nal->ni.nid, nal, hdr, hdr->type); - print_hdr(nal, hdr); -#endif - if (hdr->type == PTL_MSG_HELLO) { + hdr->type = le32_to_cpu(hdr->type); + hdr->src_nid = le64_to_cpu(hdr->src_nid); + hdr->src_pid = le32_to_cpu(hdr->src_pid); + hdr->dest_pid = le32_to_cpu(hdr->dest_pid); + hdr->payload_length = le32_to_cpu(hdr->payload_length); + + switch (hdr->type) { + case PTL_MSG_HELLO: { /* dest_nid is really ptl_magicversion_t */ ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid; - CERROR (LPU64": Dropping unexpected HELLO message: " + mv->magic = le32_to_cpu(mv->magic); + mv->version_major = le16_to_cpu(mv->version_major); + mv->version_minor = le16_to_cpu(mv->version_minor); + + if (mv->magic == PORTALS_PROTO_MAGIC && + mv->version_major == PORTALS_PROTO_VERSION_MAJOR && + mv->version_minor == PORTALS_PROTO_VERSION_MINOR) { + CWARN (LPU64": Dropping unexpected HELLO message: " + "magic %d, version %d.%d from "LPD64"\n", + nal->libnal_ni.ni_pid.nid, mv->magic, + mv->version_major, mv->version_minor, + hdr->src_nid); + + /* it's good but we don't want it */ + lib_drop_message(nal, private, hdr); + return PTL_OK; + } + + /* we got garbage */ + CERROR (LPU64": Bad HELLO message: " "magic %d, version %d.%d from "LPD64"\n", - nal->ni.nid, mv->magic, + nal->libnal_ni.ni_pid.nid, mv->magic, mv->version_major, mv->version_minor, hdr->src_nid); - lib_drop_message(nal, private, hdr); - return; + return PTL_FAIL; } - - if (hdr->dest_nid != nal->ni.nid) { - CERROR(LPU64": Dropping %s 
message from "LPU64" to "LPU64 - " (not me)\n", nal->ni.nid, hdr_type_string (hdr), - hdr->src_nid, hdr->dest_nid); - lib_drop_message(nal, private, hdr); - return; + + case PTL_MSG_ACK: + case PTL_MSG_PUT: + case PTL_MSG_GET: + case PTL_MSG_REPLY: + hdr->dest_nid = le64_to_cpu(hdr->dest_nid); + if (hdr->dest_nid != nal->libnal_ni.ni_pid.nid) { + CERROR(LPU64": BAD dest NID in %s message from" + LPU64" to "LPU64" (not me)\n", + nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), + hdr->src_nid, hdr->dest_nid); + return PTL_FAIL; + } + break; + + default: + CERROR(LPU64": Bad message type 0x%x from "LPU64"\n", + nal->libnal_ni.ni_pid.nid, hdr->type, hdr->src_nid); + return PTL_FAIL; } - if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + /* We've decided we're not receiving garbage since we can parse the + * header. We will return PTL_OK come what may... */ + + if (!list_empty (&nal->libnal_ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, hdr->src_nid, 0)) /* shall we now? 
*/ { CERROR(LPU64": Dropping incoming %s from "LPU64 ": simulated failure\n", - nal->ni.nid, hdr_type_string (hdr), + nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), hdr->src_nid); lib_drop_message(nal, private, hdr); - return; + return PTL_OK; } msg = lib_msg_alloc(nal); if (msg == NULL) { CERROR(LPU64": Dropping incoming %s from "LPU64 ": can't allocate a lib_msg_t\n", - nal->ni.nid, hdr_type_string (hdr), + nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), hdr->src_nid); lib_drop_message(nal, private, hdr); - return; + return PTL_OK; } - do_gettimeofday(&msg->ev.arrival_time); - switch (hdr->type) { case PTL_MSG_ACK: rc = parse_ack(nal, hdr, private, msg); @@ -1060,10 +1064,8 @@ lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) rc = parse_reply(nal, hdr, private, msg); break; default: - CERROR(LPU64": Dropping message from "LPU64 - ": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid, - hdr->type); - rc = PTL_FAIL; + LASSERT(0); + rc = PTL_FAIL; /* no compiler warning please */ break; } @@ -1072,138 +1074,129 @@ lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) /* committed... 
*/ lib_finalize(nal, private, msg, rc); } else { - state_lock(nal, &flags); - lib_msg_free(nal, msg); /* expects state_lock held */ - state_unlock(nal, &flags); + LIB_LOCK(nal, flags); + lib_msg_free(nal, msg); /* expects LIB_LOCK held */ + LIB_UNLOCK(nal, flags); lib_drop_message(nal, private, hdr); } } + + return PTL_OK; + /* That's "OK I can parse it", not "OK I like it" :) */ } int -do_PtlPut(nal_cb_t *nal, void *private, void *v_args, void *v_ret) +lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_ack_req_t ack, ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, + ptl_size_t offset, ptl_hdr_data_t hdr_data) { - /* - * Incoming: - * ptl_handle_md_t md_in - * ptl_ack_req_t ack_req_in - * ptl_process_id_t target_in - * ptl_pt_index_t portal_in - * ptl_ac_index_t cookie_in - * ptl_match_bits_t match_bits_in - * ptl_size_t offset_in - * - * Outgoing: - */ - - PtlPut_in *args = v_args; - ptl_process_id_t *id = &args->target_in; - PtlPut_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; lib_msg_t *msg; ptl_hdr_t hdr; lib_md_t *md; unsigned long flags; int rc; - if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + if (!list_empty (&ni->ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? 
*/ { - CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", - nal->ni.nid, id->nid); - return (ret->rc = PTL_INV_PROC); + CERROR("Dropping PUT to "LPU64": simulated failure\n", + id->nid); + return PTL_PROCESS_INVALID; } msg = lib_msg_alloc(nal); if (msg == NULL) { CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n", - ni->nid, id->nid); - return (ret->rc = PTL_NOSPACE); + ni->ni_pid.nid, id->nid); + return PTL_NO_SPACE; } - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL || md->threshold == 0) { lib_msg_free(nal, msg); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return (ret->rc = PTL_INV_MD); + return PTL_MD_INVALID; } - CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid, - (unsigned long)id->pid); + CDEBUG(D_NET, "PtlPut -> "LPX64"\n", id->nid); memset (&hdr, 0, sizeof (hdr)); - hdr.type = HTON__u32 (PTL_MSG_PUT); - hdr.dest_nid = HTON__u64 (id->nid); - hdr.src_nid = HTON__u64 (ni->nid); - hdr.dest_pid = HTON__u32 (id->pid); - hdr.src_pid = HTON__u32 (ni->pid); - hdr.payload_length = HTON__u32 (md->length); + hdr.type = cpu_to_le32(PTL_MSG_PUT); + hdr.dest_nid = cpu_to_le64(id->nid); + hdr.dest_pid = cpu_to_le32(id->pid); + hdr.src_nid = cpu_to_le64(ni->ni_pid.nid); + hdr.src_pid = cpu_to_le32(ni->ni_pid.pid); + hdr.payload_length = cpu_to_le32(md->length); /* NB handles only looked up by creator (no flips) */ - if (args->ack_req_in == PTL_ACK_REQ) { + if (ack == PTL_ACK_REQ) { hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie; hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie; } else { hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE; } - hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in); - hdr.msg.put.ptl_index = HTON__u32 (args->portal_in); - hdr.msg.put.offset = HTON__u32 (args->offset_in); - hdr.msg.put.hdr_data = args->hdr_data_in; + hdr.msg.put.match_bits = cpu_to_le64(match_bits); + 
hdr.msg.put.ptl_index = cpu_to_le32(portal); + hdr.msg.put.offset = cpu_to_le32(offset); + hdr.msg.put.hdr_data = hdr_data; lib_commit_md(nal, md, msg); - msg->ev.type = PTL_EVENT_SENT; - msg->ev.initiator.nid = ni->nid; - msg->ev.initiator.pid = ni->pid; - msg->ev.portal = args->portal_in; - msg->ev.match_bits = args->match_bits_in; + msg->ev.type = PTL_EVENT_SEND_END; + msg->ev.initiator.nid = ni->ni_pid.nid; + msg->ev.initiator.pid = ni->ni_pid.pid; + msg->ev.pt_index = portal; + msg->ev.match_bits = match_bits; msg->ev.rlength = md->length; msg->ev.mlength = md->length; - msg->ev.offset = args->offset_in; - msg->ev.hdr_data = args->hdr_data_in; + msg->ev.offset = offset; + msg->ev.hdr_data = hdr_data; - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + lib_md_deconstruct(nal, md, &msg->ev.md); + ptl_md2handle(&msg->ev.md_handle, nal, md); - ni->counters.send_count++; - ni->counters.send_length += md->length; + ni->ni_counters.send_count++; + ni->ni_counters.send_length += md->length; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, + rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_PUT, id->nid, id->pid, md, 0, md->length); if (rc != PTL_OK) { - CERROR(LPU64": error sending PUT to "LPU64": %d\n", - ni->nid, id->nid, rc); - lib_finalize (nal, private, msg, rc); + CERROR("Error sending PUT to "LPX64": %d\n", + id->nid, rc); + lib_finalize (nal, NULL, msg, rc); } /* completion will be signalled by an event */ - return ret->rc = PTL_OK; + return PTL_OK; } lib_msg_t * -lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd) +lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) { /* The NAL can DMA direct to the GET md (i.e. no REPLY msg). This - * returns a msg the NAL can pass to lib_finalize() so that a REPLY - * event still occurs. + * returns a msg for the NAL to pass to lib_finalize() when the sink + * data has been received. 
* - * CAVEAT EMPTOR: 'getmd' is passed by pointer so it MUST be valid. - * This can only be guaranteed while a lib_msg_t holds a reference - * on it (ie. pending > 0), so best call this before the - * lib_finalize() of the original GET. */ + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lib_finalize() is called on it, so the NAL must call this first */ - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; lib_msg_t *msg = lib_msg_alloc(nal); + lib_md_t *getmd = getmsg->md; unsigned long flags; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); LASSERT (getmd->pending > 0); @@ -1225,143 +1218,132 @@ lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd) lib_commit_md (nal, getmd, msg); - msg->ev.type = PTL_EVENT_REPLY; + msg->ev.type = PTL_EVENT_REPLY_END; msg->ev.initiator.nid = peer_nid; msg->ev.initiator.pid = 0; /* XXX FIXME!!! */ msg->ev.rlength = msg->ev.mlength = getmd->length; msg->ev.offset = 0; - lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc); + lib_md_deconstruct(nal, getmd, &msg->ev.md); + ptl_md2handle(&msg->ev.md_handle, nal, getmd); - ni->counters.recv_count++; - ni->counters.recv_length += getmd->length; + ni->ni_counters.recv_count++; + ni->ni_counters.recv_length += getmd->length; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return msg; drop_msg: lib_msg_free(nal, msg); drop: - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += getmd->length; + nal->libnal_ni.ni_counters.drop_count++; + nal->libnal_ni.ni_counters.drop_length += getmd->length; - state_unlock (nal, &flags); + LIB_UNLOCK (nal, flags); return NULL; } int -do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret) +lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, ptl_size_t offset) { - /* - * Incoming: - * ptl_handle_md_t md_in - * ptl_process_id_t target_in - * ptl_pt_index_t portal_in - * ptl_ac_index_t 
cookie_in - * ptl_match_bits_t match_bits_in - * ptl_size_t offset_in - * - * Outgoing: - */ - - PtlGet_in *args = v_args; - ptl_process_id_t *id = &args->target_in; - PtlGet_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; lib_msg_t *msg; ptl_hdr_t hdr; lib_md_t *md; unsigned long flags; int rc; - if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + if (!list_empty (&ni->ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? */ { - CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", - nal->ni.nid, id->nid); - return (ret->rc = PTL_INV_PROC); + CERROR("Dropping PUT to "LPX64": simulated failure\n", + id->nid); + return PTL_PROCESS_INVALID; } msg = lib_msg_alloc(nal); if (msg == NULL) { - CERROR(LPU64": Dropping GET to "LPU64": ENOMEM on lib_msg_t\n", - ni->nid, id->nid); - return (ret->rc = PTL_NOSPACE); + CERROR("Dropping GET to "LPU64": ENOMEM on lib_msg_t\n", + id->nid); + return PTL_NO_SPACE; } - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL || !md->threshold) { lib_msg_free(nal, msg); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return ret->rc = PTL_INV_MD; + return PTL_MD_INVALID; } CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, (unsigned long)id->pid); memset (&hdr, 0, sizeof (hdr)); - hdr.type = HTON__u32 (PTL_MSG_GET); - hdr.dest_nid = HTON__u64 (id->nid); - hdr.src_nid = HTON__u64 (ni->nid); - hdr.dest_pid = HTON__u32 (id->pid); - hdr.src_pid = HTON__u32 (ni->pid); + hdr.type = cpu_to_le32(PTL_MSG_GET); + hdr.dest_nid = cpu_to_le64(id->nid); + hdr.dest_pid = cpu_to_le32(id->pid); + hdr.src_nid = cpu_to_le64(ni->ni_pid.nid); + hdr.src_pid = cpu_to_le32(ni->ni_pid.pid); hdr.payload_length = 0; /* NB handles only looked up by creator (no flips) */ hdr.msg.get.return_wmd.wh_interface_cookie = 
ni->ni_interface_cookie; hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie; - hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in); - hdr.msg.get.ptl_index = HTON__u32 (args->portal_in); - hdr.msg.get.src_offset = HTON__u32 (args->offset_in); - hdr.msg.get.sink_length = HTON__u32 (md->length); + hdr.msg.get.match_bits = cpu_to_le64(match_bits); + hdr.msg.get.ptl_index = cpu_to_le32(portal); + hdr.msg.get.src_offset = cpu_to_le32(offset); + hdr.msg.get.sink_length = cpu_to_le32(md->length); lib_commit_md(nal, md, msg); - msg->ev.type = PTL_EVENT_SENT; - msg->ev.initiator.nid = ni->nid; - msg->ev.initiator.pid = ni->pid; - msg->ev.portal = args->portal_in; - msg->ev.match_bits = args->match_bits_in; + msg->ev.type = PTL_EVENT_SEND_END; + msg->ev.initiator = ni->ni_pid; + msg->ev.pt_index = portal; + msg->ev.match_bits = match_bits; msg->ev.rlength = md->length; msg->ev.mlength = md->length; - msg->ev.offset = args->offset_in; + msg->ev.offset = offset; msg->ev.hdr_data = 0; - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + lib_md_deconstruct(nal, md, &msg->ev.md); + ptl_md2handle(&msg->ev.md_handle, nal, md); - ni->counters.send_count++; + ni->ni_counters.send_count++; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET, + rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_GET, id->nid, id->pid, NULL, 0, 0); if (rc != PTL_OK) { CERROR(LPU64": error sending GET to "LPU64": %d\n", - ni->nid, id->nid, rc); - lib_finalize (nal, private, msg, rc); + ni->ni_pid.nid, id->nid, rc); + lib_finalize (nal, NULL, msg, rc); } /* completion will be signalled by an event */ - return ret->rc = PTL_OK; + return PTL_OK; } void lib_assert_wire_constants (void) { /* Wire protocol assertions generated by 'wirecheck' - * running on Linux robert.bartonsoftware.com 2.4.20-18.9 #1 Thu May 29 06:54:41 EDT 2003 i68 - * with gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5) */ + * running on Linux mdevi 2.4.21-p4smp-55chaos #1 
SMP Tue Jun 8 14:38:44 PDT 2004 i686 i686 i + * with gcc version 3.2.3 20030502 (Red Hat Linux 3.2.3-34) */ /* Constants... */ LASSERT (PORTALS_PROTO_MAGIC == 0xeebc0ded); - LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); - LASSERT (PORTALS_PROTO_VERSION_MINOR == 3); + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 1); + LASSERT (PORTALS_PROTO_VERSION_MINOR == 0); LASSERT (PTL_MSG_ACK == 0); LASSERT (PTL_MSG_PUT == 1); LASSERT (PTL_MSG_GET == 2); @@ -1370,76 +1352,76 @@ void lib_assert_wire_constants (void) /* Checks for struct ptl_handle_wire_t */ LASSERT ((int)sizeof(ptl_handle_wire_t) == 16); - LASSERT (offsetof(ptl_handle_wire_t, wh_interface_cookie) == 0); + LASSERT ((int)offsetof(ptl_handle_wire_t, wh_interface_cookie) == 0); LASSERT ((int)sizeof(((ptl_handle_wire_t *)0)->wh_interface_cookie) == 8); - LASSERT (offsetof(ptl_handle_wire_t, wh_object_cookie) == 8); + LASSERT ((int)offsetof(ptl_handle_wire_t, wh_object_cookie) == 8); LASSERT ((int)sizeof(((ptl_handle_wire_t *)0)->wh_object_cookie) == 8); /* Checks for struct ptl_magicversion_t */ LASSERT ((int)sizeof(ptl_magicversion_t) == 8); - LASSERT (offsetof(ptl_magicversion_t, magic) == 0); + LASSERT ((int)offsetof(ptl_magicversion_t, magic) == 0); LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->magic) == 4); - LASSERT (offsetof(ptl_magicversion_t, version_major) == 4); + LASSERT ((int)offsetof(ptl_magicversion_t, version_major) == 4); LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->version_major) == 2); - LASSERT (offsetof(ptl_magicversion_t, version_minor) == 6); + LASSERT ((int)offsetof(ptl_magicversion_t, version_minor) == 6); LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->version_minor) == 2); /* Checks for struct ptl_hdr_t */ LASSERT ((int)sizeof(ptl_hdr_t) == 72); - LASSERT (offsetof(ptl_hdr_t, dest_nid) == 0); + LASSERT ((int)offsetof(ptl_hdr_t, dest_nid) == 0); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->dest_nid) == 8); - LASSERT (offsetof(ptl_hdr_t, src_nid) == 8); + LASSERT ((int)offsetof(ptl_hdr_t, src_nid) 
== 8); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->src_nid) == 8); - LASSERT (offsetof(ptl_hdr_t, dest_pid) == 16); + LASSERT ((int)offsetof(ptl_hdr_t, dest_pid) == 16); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->dest_pid) == 4); - LASSERT (offsetof(ptl_hdr_t, src_pid) == 20); + LASSERT ((int)offsetof(ptl_hdr_t, src_pid) == 20); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->src_pid) == 4); - LASSERT (offsetof(ptl_hdr_t, type) == 24); + LASSERT ((int)offsetof(ptl_hdr_t, type) == 24); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->type) == 4); - LASSERT (offsetof(ptl_hdr_t, payload_length) == 28); + LASSERT ((int)offsetof(ptl_hdr_t, payload_length) == 28); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->payload_length) == 4); - LASSERT (offsetof(ptl_hdr_t, msg) == 32); + LASSERT ((int)offsetof(ptl_hdr_t, msg) == 32); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg) == 40); /* Ack */ - LASSERT (offsetof(ptl_hdr_t, msg.ack.dst_wmd) == 32); + LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.dst_wmd) == 32); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.dst_wmd) == 16); - LASSERT (offsetof(ptl_hdr_t, msg.ack.match_bits) == 48); + LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.match_bits) == 48); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.match_bits) == 8); - LASSERT (offsetof(ptl_hdr_t, msg.ack.mlength) == 56); + LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.mlength) == 56); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.mlength) == 4); /* Put */ - LASSERT (offsetof(ptl_hdr_t, msg.put.ack_wmd) == 32); + LASSERT ((int)offsetof(ptl_hdr_t, msg.put.ack_wmd) == 32); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.ack_wmd) == 16); - LASSERT (offsetof(ptl_hdr_t, msg.put.match_bits) == 48); + LASSERT ((int)offsetof(ptl_hdr_t, msg.put.match_bits) == 48); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.match_bits) == 8); - LASSERT (offsetof(ptl_hdr_t, msg.put.hdr_data) == 56); + LASSERT ((int)offsetof(ptl_hdr_t, msg.put.hdr_data) == 56); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.hdr_data) == 8); - LASSERT (offsetof(ptl_hdr_t, 
msg.put.ptl_index) == 64); + LASSERT ((int)offsetof(ptl_hdr_t, msg.put.ptl_index) == 64); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.ptl_index) == 4); - LASSERT (offsetof(ptl_hdr_t, msg.put.offset) == 68); + LASSERT ((int)offsetof(ptl_hdr_t, msg.put.offset) == 68); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.offset) == 4); /* Get */ - LASSERT (offsetof(ptl_hdr_t, msg.get.return_wmd) == 32); + LASSERT ((int)offsetof(ptl_hdr_t, msg.get.return_wmd) == 32); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.return_wmd) == 16); - LASSERT (offsetof(ptl_hdr_t, msg.get.match_bits) == 48); + LASSERT ((int)offsetof(ptl_hdr_t, msg.get.match_bits) == 48); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.match_bits) == 8); - LASSERT (offsetof(ptl_hdr_t, msg.get.ptl_index) == 56); + LASSERT ((int)offsetof(ptl_hdr_t, msg.get.ptl_index) == 56); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.ptl_index) == 4); - LASSERT (offsetof(ptl_hdr_t, msg.get.src_offset) == 60); + LASSERT ((int)offsetof(ptl_hdr_t, msg.get.src_offset) == 60); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.src_offset) == 4); - LASSERT (offsetof(ptl_hdr_t, msg.get.sink_length) == 64); + LASSERT ((int)offsetof(ptl_hdr_t, msg.get.sink_length) == 64); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.sink_length) == 4); /* Reply */ - LASSERT (offsetof(ptl_hdr_t, msg.reply.dst_wmd) == 32); + LASSERT ((int)offsetof(ptl_hdr_t, msg.reply.dst_wmd) == 32); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.reply.dst_wmd) == 16); /* Hello */ - LASSERT (offsetof(ptl_hdr_t, msg.hello.incarnation) == 32); + LASSERT ((int)offsetof(ptl_hdr_t, msg.hello.incarnation) == 32); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.hello.incarnation) == 8); - LASSERT (offsetof(ptl_hdr_t, msg.hello.type) == 40); + LASSERT ((int)offsetof(ptl_hdr_t, msg.hello.type) == 40); LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.hello.type) == 4); } diff --git a/lustre/portals/portals/lib-msg.c b/lustre/portals/portals/lib-msg.c index 04c69b1..54e89bc 100644 --- 
a/lustre/portals/portals/lib-msg.c +++ b/lustre/portals/portals/lib-msg.c @@ -33,55 +33,47 @@ #include void -lib_enq_event_locked (nal_cb_t *nal, void *private, +lib_enq_event_locked (lib_nal_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev) { ptl_event_t *eq_slot; - int rc; - - ev->sequence = eq->sequence++; /* Allocate the next queue slot */ - - /* size must be a power of 2 to handle a wrapped sequence # */ - LASSERT (eq->size != 0 && - eq->size == LOWEST_BIT_SET (eq->size)); - eq_slot = eq->base + (ev->sequence & (eq->size - 1)); - - /* Copy the event into the allocated slot, ensuring all the rest of - * the event's contents have been copied _before_ the sequence - * number gets updated. A processes 'getting' an event waits on - * the next queue slot's sequence to be 'new'. When it is, _all_ - * other event fields had better be consistent. I assert - * 'sequence' is the last member, so I only need a 2 stage copy. */ - LASSERT(sizeof (ptl_event_t) == - offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); - - rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, - offsetof (ptl_event_t, sequence)); - LASSERT (rc == PTL_OK); - -#ifdef __KERNEL__ - barrier(); -#endif - /* Updating the sequence number is what makes the event 'new' NB if - * the cb_write below isn't atomic, this could cause a race with - * PtlEQGet */ - rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, - (void *)&ev->sequence,sizeof (ev->sequence)); - LASSERT (rc == PTL_OK); + /* Allocate the next queue slot */ + ev->link = ev->sequence = eq->eq_enq_seq++; + /* NB we don't support START events yet and we don't create a separate + * UNLINK event unless an explicit unlink succeeds, so the link + * sequence is pretty useless */ + /* We don't support different uid/jids yet */ + ev->uid = 0; + ev->jid = 0; + + /* size must be a power of 2 to handle sequence # overflow */ + LASSERT (eq->eq_size != 0 && + eq->eq_size == LOWEST_BIT_SET (eq->eq_size)); + eq_slot = eq->eq_events + 
(ev->sequence & (eq->eq_size - 1)); + + /* There is no race since both event consumers and event producers + * take the LIB_LOCK(), so we don't screw around with memory + * barriers, setting the sequence number last or wierd structure + * layout assertions. */ + *eq_slot = *ev; + + /* Call the callback handler (if any) */ + if (eq->eq_callback != NULL) + eq->eq_callback (eq_slot); + + /* Wake anyone sleeping for an event (see lib-eq.c) */ #ifdef __KERNEL__ - barrier(); + if (waitqueue_active(&nal->libnal_ni.ni_waitq)) + wake_up_all(&nal->libnal_ni.ni_waitq); +#else + pthread_cond_broadcast(&nal->libnal_ni.ni_cond); #endif - - if (nal->cb_callback != NULL) - nal->cb_callback(nal, private, eq, ev); - else if (eq->event_callback != NULL) - eq->event_callback(ev); } void -lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) +lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) { lib_md_t *md; int unlink; @@ -89,10 +81,6 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) int rc; ptl_hdr_t ack; - /* ni went down while processing this message */ - if (nal->ni.up == 0) - return; - if (msg == NULL) return; @@ -100,19 +88,19 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) if (status == PTL_OK && !ptl_is_wire_handle_none(&msg->ack_wmd)) { - LASSERT(msg->ev.type == PTL_EVENT_PUT); + LASSERT(msg->ev.type == PTL_EVENT_PUT_END); memset (&ack, 0, sizeof (ack)); - ack.type = HTON__u32 (PTL_MSG_ACK); - ack.dest_nid = HTON__u64 (msg->ev.initiator.nid); - ack.src_nid = HTON__u64 (nal->ni.nid); - ack.dest_pid = HTON__u32 (msg->ev.initiator.pid); - ack.src_pid = HTON__u32 (nal->ni.pid); + ack.type = cpu_to_le32(PTL_MSG_ACK); + ack.dest_nid = cpu_to_le64(msg->ev.initiator.nid); + ack.dest_pid = cpu_to_le32(msg->ev.initiator.pid); + ack.src_nid = cpu_to_le64(nal->libnal_ni.ni_pid.nid); + ack.src_pid = cpu_to_le32(nal->libnal_ni.ni_pid.pid); ack.payload_length = 0; 
ack.msg.ack.dst_wmd = msg->ack_wmd; ack.msg.ack.match_bits = msg->ev.match_bits; - ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength); + ack.msg.ack.mlength = cpu_to_le32(msg->ev.mlength); rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, msg->ev.initiator.nid, msg->ev.initiator.pid, @@ -126,18 +114,23 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) md = msg->md; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); /* Now it's safe to drop my caller's ref */ md->pending--; LASSERT (md->pending >= 0); /* Should I unlink this MD? */ - unlink = (md->pending == 0 && /* No other refs */ - (md->threshold == 0 || /* All ops done */ - md->md_flags & PTL_MD_FLAG_UNLINK) != 0); /* black spot */ - - msg->ev.status = status; + if (md->pending != 0) /* other refs */ + unlink = 0; + else if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) != 0) + unlink = 1; + else if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) == 0) + unlink = 0; + else + unlink = lib_md_exhausted(md); + + msg->ev.ni_fail_type = status; msg->ev.unlinked = unlink; if (md->eq != NULL) @@ -147,8 +140,8 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) lib_md_unlink(nal, md); list_del (&msg->msg_list); - nal->ni.counters.msgs_alloc--; + nal->libnal_ni.ni_counters.msgs_alloc--; lib_msg_free(nal, msg); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); } diff --git a/lustre/portals/portals/lib-ni.c b/lustre/portals/portals/lib-ni.c index 9e90576..0f298a0 100644 --- a/lustre/portals/portals/lib-ni.c +++ b/lustre/portals/portals/lib-ni.c @@ -25,104 +25,48 @@ #define DEBUG_SUBSYSTEM S_PORTALS #include -#include #define MAX_DIST 18446744073709551615ULL -int do_PtlNIDebug(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int lib_api_ni_status (nal_t *apinal, ptl_sr_index_t sr_idx, + ptl_sr_value_t *status) { - PtlNIDebug_in *args = v_args; - PtlNIDebug_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; - - ret->rc = ni->debug; - ni->debug = args->mask_in; - - return 
0; -} - -int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret) -{ - /* - * Incoming: - * ptl_handle_ni_t interface_in - * ptl_sr_index_t register_in - * - * Outgoing: - * ptl_sr_value_t * status_out - */ - - PtlNIStatus_in *args = v_args; - PtlNIStatus_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; - lib_counters_t *count = &ni->counters; - - if (!args) - return ret->rc = PTL_SEGV; - - ret->rc = PTL_OK; - ret->status_out = 0; - - /* - * I hate this sort of code.... Hash tables, offset lists? - * Treat the counters as an array of ints? - */ - if (args->register_in == PTL_SR_DROP_COUNT) - ret->status_out = count->drop_count; - - else if (args->register_in == PTL_SR_DROP_LENGTH) - ret->status_out = count->drop_length; - - else if (args->register_in == PTL_SR_RECV_COUNT) - ret->status_out = count->recv_count; - - else if (args->register_in == PTL_SR_RECV_LENGTH) - ret->status_out = count->recv_length; - - else if (args->register_in == PTL_SR_SEND_COUNT) - ret->status_out = count->send_count; - - else if (args->register_in == PTL_SR_SEND_LENGTH) - ret->status_out = count->send_length; - - else if (args->register_in == PTL_SR_MSGS_MAX) - ret->status_out = count->msgs_max; - else - ret->rc = PTL_INV_SR_INDX; - - return ret->rc; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; + lib_counters_t *count = &ni->ni_counters; + + switch (sr_idx) { + case PTL_SR_DROP_COUNT: + *status = count->drop_count; + return PTL_OK; + case PTL_SR_DROP_LENGTH: + *status = count->drop_length; + return PTL_OK; + case PTL_SR_RECV_COUNT: + *status = count->recv_count; + return PTL_OK; + case PTL_SR_RECV_LENGTH: + *status = count->recv_length; + return PTL_OK; + case PTL_SR_SEND_COUNT: + *status = count->send_count; + return PTL_OK; + case PTL_SR_SEND_LENGTH: + *status = count->send_length; + return PTL_OK; + case PTL_SR_MSGS_MAX: + *status = count->msgs_max; + return PTL_OK; + default: + *status = 0; + return PTL_SR_INDEX_INVALID; + } } -int 
do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int lib_api_ni_dist (nal_t *apinal, ptl_process_id_t *pid, unsigned long *dist) { - /* - * Incoming: - * ptl_handle_ni_t interface_in - * ptl_process_id_t process_in - - * - * Outgoing: - * unsigned long * distance_out - - */ - - PtlNIDist_in *args = v_args; - PtlNIDist_out *ret = v_ret; - - unsigned long dist; - ptl_process_id_t id_in = args->process_in; - ptl_nid_t nid; - int rc; - - nid = id_in.nid; - - if ((rc = nal->cb_dist(nal, nid, &dist)) != 0) { - ret->distance_out = (unsigned long) MAX_DIST; - return PTL_INV_PROC; - } - - ret->distance_out = dist; + lib_nal_t *nal = apinal->nal_data; - return ret->rc = PTL_OK; + return (nal->libnal_dist(nal, pid->nid, dist)); } diff --git a/lustre/portals/portals/lib-pid.c b/lustre/portals/portals/lib-pid.c index 12eebb5..ff2a601 100644 --- a/lustre/portals/portals/lib-pid.c +++ b/lustre/portals/portals/lib-pid.c @@ -35,24 +35,12 @@ extern int getpid(void); # include #endif #include -#include -int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_get_id(nal_t *apinal, ptl_process_id_t *pid) { - /* - * Incoming: - * ptl_handle_ni_t handle_in - * - * Outgoing: - * ptl_process_id_t * id_out - * ptl_id_t * gsize_out - */ - - PtlGetId_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; - - ret->id_out.nid = ni->nid; - ret->id_out.pid = ni->pid; - - return ret->rc = PTL_OK; + lib_nal_t *nal = apinal->nal_data; + + *pid = nal->libnal_ni.ni_pid; + return PTL_OK; } diff --git a/lustre/portals/portals/module.c b/lustre/portals/portals/module.c index eb41dfd..61ef372 100644 --- a/lustre/portals/portals/module.c +++ b/lustre/portals/portals/module.c @@ -83,7 +83,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data, CDEBUG (D_IOCTL, "Getting nid for nal [%d]\n", data->ioc_nal); - err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih); + err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, + NULL, &nih); if 
(!(err == PTL_OK || err == PTL_IFACE_DUP)) RETURN (-EINVAL); @@ -104,7 +105,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data, CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n", data->ioc_nal, data->ioc_nid, data->ioc_count); - err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih); + err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, + NULL, &nih); if (!(err == PTL_OK || err == PTL_IFACE_DUP)) return (-EINVAL); diff --git a/lustre/portals/router/proc.c b/lustre/portals/router/proc.c index ad4dd87..a1397d2 100644 --- a/lustre/portals/router/proc.c +++ b/lustre/portals/router/proc.c @@ -42,7 +42,7 @@ struct name2num { { "elan", QSWNAL}, { "tcp", SOCKNAL}, { "gm", GMNAL}, - { "ib", IBNAL}, + { "ib", OPENIBNAL}, { NULL, -1} }; diff --git a/lustre/portals/router/router.c b/lustre/portals/router/router.c index 6fcd83a..448ab1f 100644 --- a/lustre/portals/router/router.c +++ b/lustre/portals/router/router.c @@ -49,13 +49,6 @@ kpr_router_interface_t kpr_router_interface = { kprri_deregister: kpr_deregister_nal, }; -kpr_control_interface_t kpr_control_interface = { - kprci_add_route: kpr_add_route, - kprci_del_route: kpr_del_route, - kprci_get_route: kpr_get_route, - kprci_notify: kpr_sys_notify, -}; - int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) { @@ -290,18 +283,9 @@ kpr_shutdown_nal (void *arg) LASSERT (!ne->kpne_shutdown); LASSERT (!in_interrupt()); - write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... 
*/ + write_lock_irqsave (&kpr_rwlock, flags); ne->kpne_shutdown = 1; - write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */ - - while (atomic_read (&ne->kpne_refcount) != 0) - { - CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n", - ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount)); - - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } + write_unlock_irqrestore (&kpr_rwlock, flags); } void @@ -313,15 +297,22 @@ kpr_deregister_nal (void *arg) CDEBUG (D_NET, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */ - LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */ LASSERT (!in_interrupt()); write_lock_irqsave (&kpr_rwlock, flags); - list_del (&ne->kpne_list); - write_unlock_irqrestore (&kpr_rwlock, flags); + /* Wait until all outstanding messages/notifications have completed */ + while (atomic_read (&ne->kpne_refcount) != 0) + { + CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n", + ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount)); + + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + PORTAL_FREE (ne, sizeof (*ne)); PORTAL_MODULE_UNUSE; } @@ -378,12 +369,15 @@ kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, CDEBUG (D_NET, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid); - - if (ne->kpne_shutdown) /* caller is shutting down */ - return (-ENOENT); + LASSERT (!in_interrupt()); read_lock (&kpr_rwlock); + if (ne->kpne_shutdown) { /* caller is shutting down */ + read_unlock (&kpr_rwlock); + return (-ENOENT); + } + /* Search routes for one that has a gateway to target_nid on the callers network */ list_for_each (e, &kpr_routes) { @@ -453,25 +447,26 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) struct list_head *e; kpr_route_entry_t *re; kpr_nal_entry_t *tmp_ne; + int rc; CDEBUG (D_NET, 
"forward [%p] "LPX64" from NAL %d\n", fwd, target_nid, src_ne->kpne_interface.kprni_nalid); LASSERT (nob == lib_kiov_nob (fwd->kprfd_niov, fwd->kprfd_kiov)); - - atomic_inc (&kpr_queue_depth); - atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */ + LASSERT (!in_interrupt()); + + read_lock (&kpr_rwlock); kpr_fwd_packets++; /* (loose) stats accounting */ kpr_fwd_bytes += nob + sizeof(ptl_hdr_t); - if (src_ne->kpne_shutdown) /* caller is shutting down */ + if (src_ne->kpne_shutdown) { /* caller is shutting down */ + rc = -ESHUTDOWN; goto out; + } fwd->kprfd_router_arg = src_ne; /* stash caller's nal entry */ - read_lock (&kpr_rwlock); - /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ list_for_each (e, &kpr_routes) { @@ -508,7 +503,9 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) kpr_update_weight (ge, nob); fwd->kprfd_gateway_nid = ge->kpge_nid; - atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */ + atomic_inc (&src_ne->kpne_refcount); /* source and dest nals are */ + atomic_inc (&dst_ne->kpne_refcount); /* busy until fwd completes */ + atomic_inc (&kpr_queue_depth); read_unlock (&kpr_rwlock); @@ -521,18 +518,16 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) return; } - read_unlock (&kpr_rwlock); + rc = -EHOSTUNREACH; out: kpr_fwd_errors++; - CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, - target_nid, src_ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %d: %d\n", + fwd, target_nid, src_ne->kpne_interface.kprni_nalid, rc); - /* Can't find anywhere to forward to */ - (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH); + (fwd->kprfd_callback)(fwd->kprfd_callback_arg, rc); - atomic_dec (&kpr_queue_depth); - atomic_dec (&src_ne->kpne_refcount); + read_unlock (&kpr_rwlock); } void @@ -635,7 +630,7 @@ kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, int kpr_sys_notify (int 
gateway_nalid, ptl_nid_t gateway_nid, - int alive, time_t when) + int alive, time_t when) { return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when)); } @@ -695,11 +690,12 @@ kpr_del_route (int gw_nalid, ptl_nid_t gw_nid, } int -kpr_get_route (int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive) +kpr_get_route (int idx, __u32 *gateway_nalid, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, __u32 *alive) { struct list_head *e; + LASSERT (!in_interrupt()); read_lock(&kpr_rwlock); for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { @@ -723,11 +719,67 @@ kpr_get_route (int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, return (-ENOENT); } +static int +kpr_nal_cmd(struct portals_cfg *pcfg, void * private) +{ + int err = -EINVAL; + ENTRY; + + switch(pcfg->pcfg_command) { + default: + CDEBUG(D_IOCTL, "Inappropriate cmd: %d\n", pcfg->pcfg_command); + break; + + case NAL_CMD_ADD_ROUTE: + CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n", + pcfg->pcfg_nal, pcfg->pcfg_nid, + pcfg->pcfg_nid2, pcfg->pcfg_nid3); + err = kpr_add_route(pcfg->pcfg_gw_nal, pcfg->pcfg_nid, + pcfg->pcfg_nid2, pcfg->pcfg_nid3); + break; + + case NAL_CMD_DEL_ROUTE: + CDEBUG (D_IOCTL, "Removing routes via [%d] "LPU64" : "LPU64" - "LPU64"\n", + pcfg->pcfg_gw_nal, pcfg->pcfg_nid, + pcfg->pcfg_nid2, pcfg->pcfg_nid3); + err = kpr_del_route (pcfg->pcfg_gw_nal, pcfg->pcfg_nid, + pcfg->pcfg_nid2, pcfg->pcfg_nid3); + break; + + case NAL_CMD_NOTIFY_ROUTER: { + CDEBUG (D_IOCTL, "Notifying peer [%d] "LPU64" %s @ %ld\n", + pcfg->pcfg_gw_nal, pcfg->pcfg_nid, + pcfg->pcfg_flags ? 
"Enabling" : "Disabling", + (time_t)pcfg->pcfg_nid3); + + err = kpr_sys_notify (pcfg->pcfg_gw_nal, pcfg->pcfg_nid, + pcfg->pcfg_flags, (time_t)pcfg->pcfg_nid3); + break; + } + + case NAL_CMD_GET_ROUTE: + CDEBUG (D_IOCTL, "Getting route [%d]\n", pcfg->pcfg_count); + err = kpr_get_route(pcfg->pcfg_count, &pcfg->pcfg_gw_nal, + &pcfg->pcfg_nid, + &pcfg->pcfg_nid2, &pcfg->pcfg_nid3, + &pcfg->pcfg_flags); + break; + } + RETURN(err); +} + + static void /*__exit*/ kpr_finalise (void) { LASSERT (list_empty (&kpr_nals)); + libcfs_nal_cmd_unregister(ROUTER); + + PORTAL_SYMBOL_UNREGISTER(kpr_router_interface); + + kpr_proc_fini(); + while (!list_empty (&kpr_routes)) { kpr_route_entry_t *re = list_entry(kpr_routes.next, kpr_route_entry_t, @@ -737,11 +789,6 @@ kpr_finalise (void) PORTAL_FREE(re, sizeof (*re)); } - kpr_proc_fini(); - - PORTAL_SYMBOL_UNREGISTER(kpr_router_interface); - PORTAL_SYMBOL_UNREGISTER(kpr_control_interface); - CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n", atomic_read(&portal_kmemory)); } @@ -749,14 +796,21 @@ kpr_finalise (void) static int __init kpr_initialise (void) { + int rc; + CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n", atomic_read(&portal_kmemory)); kpr_routes_generation = 0; kpr_proc_init(); + rc = libcfs_nal_cmd_register(ROUTER, kpr_nal_cmd, NULL); + if (rc != 0) { + CERROR("Can't register nal cmd handler\n"); + return (rc); + } + PORTAL_SYMBOL_REGISTER(kpr_router_interface); - PORTAL_SYMBOL_REGISTER(kpr_control_interface); return (0); } @@ -767,5 +821,4 @@ MODULE_LICENSE("GPL"); module_init (kpr_initialise); module_exit (kpr_finalise); -EXPORT_SYMBOL (kpr_control_interface); EXPORT_SYMBOL (kpr_router_interface); diff --git a/lustre/portals/router/router.h b/lustre/portals/router/router.h index 611d808..27e4983 100644 --- a/lustre/portals/router/router.h +++ b/lustre/portals/router/router.h @@ -93,20 +93,12 @@ extern void kpr_deregister_nal (void *arg); extern void kpr_proc_init (void); extern void kpr_proc_fini (void); -extern int 
kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, - ptl_nid_t lo_nid, ptl_nid_t hi_nid); -extern int kpr_del_route (int gw_nal, ptl_nid_t gw_nid, - ptl_nid_t lo, ptl_nid_t hi); -extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive); -extern int kpr_sys_notify (int gw_nalid, ptl_nid_t gw_nid, - int alive, time_t when); - extern unsigned int kpr_routes_generation; extern unsigned long long kpr_fwd_bytes; extern unsigned long kpr_fwd_packets; extern unsigned long kpr_fwd_errors; extern atomic_t kpr_queue_depth; + extern struct list_head kpr_routes; extern rwlock_t kpr_rwlock; diff --git a/lustre/portals/tests/ping_cli.c b/lustre/portals/tests/ping_cli.c index 85c0d71..7a3f8a0 100644 --- a/lustre/portals/tests/ping_cli.c +++ b/lustre/portals/tests/ping_cli.c @@ -46,7 +46,7 @@ static struct pingcli_data *client = NULL; static int count = 0; static void -pingcli_shutdown(int err) +pingcli_shutdown(ptl_handle_ni_t nih, int err) { int rc; @@ -70,7 +70,7 @@ pingcli_shutdown(int err) if ((rc = PtlMEUnlink (client->me))) PDEBUG ("PtlMEUnlink", rc); case 3: - kportal_put_ni (client->args->ioc_nal); + PtlNIFini(nih); case 4: /* Free our buffers */ @@ -84,29 +84,27 @@ pingcli_shutdown(int err) CDEBUG (D_OTHER, "ping client released resources\n"); } /* pingcli_shutdown() */ -static int pingcli_callback(ptl_event_t *ev) +static void pingcli_callback(ptl_event_t *ev) { int i, magic; - i = *(int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned)); - magic = *(int *)(ev->mem_desc.start + ev->offset); + i = *(int *)(ev->md.start + ev->offset + sizeof(unsigned)); + magic = *(int *)(ev->md.start + ev->offset); if(magic != 0xcafebabe) { - printk ("LustreError: Unexpected response \n"); - return 1; + CERROR("Unexpected response %x\n", magic); } if((i == count) || !count) wake_up_process (client->tsk); else - printk ("LustreError: Received response after timeout for %d\n",i); - return 1; + CERROR("Received 
response after timeout for %d\n",i); } static struct pingcli_data * pingcli_start(struct portal_ioctl_data *args) { - ptl_handle_ni_t *nip; + ptl_handle_ni_t nih = PTL_INVALID_HANDLE; unsigned ping_head_magic = PING_HEADER_MAGIC; unsigned ping_bulk_magic = PING_BULK_MAGIC; int rc; @@ -127,7 +125,7 @@ pingcli_start(struct portal_ioctl_data *args) if (client->outbuf == NULL) { CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (4); + pingcli_shutdown (nih, 4); return (NULL); } @@ -136,23 +134,24 @@ pingcli_start(struct portal_ioctl_data *args) if (client->inbuf == NULL) { CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (4); + pingcli_shutdown (nih, 4); return (NULL); } /* Aquire and initialize the proper nal for portals. */ - if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) + rc = PtlNIInit(args->ioc_nal, 0, NULL, NULL, &nih); + if (rc != PTL_OK || rc != PTL_IFACE_DUP) { CERROR ("NAL %d not loaded\n", args->ioc_nal); - pingcli_shutdown (4); + pingcli_shutdown (nih, 4); return (NULL); } /* Based on the initialization aquire our unique portal ID. 
*/ - if ((rc = PtlGetId (*nip, &client->myid))) + if ((rc = PtlGetId (nih, &client->myid))) { CERROR ("PtlGetId error %d\n", rc); - pingcli_shutdown (2); + pingcli_shutdown (nih, 2); return (NULL); } @@ -164,20 +163,20 @@ pingcli_start(struct portal_ioctl_data *args) client->id_remote.nid = args->ioc_nid; client->id_remote.pid = 0; - if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT, + if ((rc = PtlMEAttach (nih, PTL_PING_CLIENT, client->id_local, 0, ~0, PTL_RETAIN, PTL_INS_AFTER, &client->me))) { CERROR ("PtlMEAttach error %d\n", rc); - pingcli_shutdown (2); + pingcli_shutdown (nih, 2); return (NULL); } /* Allocate the event queue for this network interface */ - if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) + if ((rc = PtlEQAlloc (nih, 64, pingcli_callback, &client->eq))) { CERROR ("PtlEQAlloc error %d\n", rc); - pingcli_shutdown (2); + pingcli_shutdown (nih, 2); return (NULL); } @@ -187,35 +186,35 @@ pingcli_start(struct portal_ioctl_data *args) client->md_in_head.length = (args->ioc_size + STDSIZE) * count; client->md_in_head.threshold = PTL_MD_THRESH_INF; - client->md_in_head.options = PTL_MD_OP_PUT; + client->md_in_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; client->md_in_head.user_ptr = NULL; - client->md_in_head.eventq = client->eq; + client->md_in_head.eq_handle = client->eq; memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count); /* Attach the incoming buffer */ if ((rc = PtlMDAttach (client->me, client->md_in_head, PTL_UNLINK, &client->md_in_head_h))) { CERROR ("PtlMDAttach error %d\n", rc); - pingcli_shutdown (1); + pingcli_shutdown (nih, 1); return (NULL); } /* Setup the outgoing ping header */ client->md_out_head.start = client->outbuf; client->md_out_head.length = STDSIZE + args->ioc_size; client->md_out_head.threshold = args->ioc_count; - client->md_out_head.options = PTL_MD_OP_PUT; + client->md_out_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; client->md_out_head.user_ptr = NULL; - 
client->md_out_head.eventq = PTL_EQ_NONE; + client->md_out_head.eq_handle = PTL_EQ_NONE; memcpy (client->outbuf, &ping_head_magic, sizeof(ping_bulk_magic)); count = 0; /* Bind the outgoing ping header */ - if ((rc=PtlMDBind (*nip, client->md_out_head, - &client->md_out_head_h))) { + if ((rc=PtlMDBind (nih, client->md_out_head, + PTL_UNLINK, &client->md_out_head_h))) { CERROR ("PtlMDBind error %d\n", rc); - pingcli_shutdown (1); + pingcli_shutdown (nih, 1); return NULL; } while ((args->ioc_count - count)) { @@ -230,20 +229,20 @@ pingcli_start(struct portal_ioctl_data *args) if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) { PDEBUG ("PtlPut (header)", rc); - pingcli_shutdown (1); + pingcli_shutdown (nih, 1); return NULL; } - printk ("Lustre: sent msg no %d", count); + CWARN ("Lustre: sent msg no %d", count); set_current_state (TASK_INTERRUPTIBLE); rc = schedule_timeout (20 * args->ioc_timeout); if (rc == 0) { - printk ("LustreError: :: timeout .....\n"); + CERROR ("timeout .....\n"); } else { do_gettimeofday (&tv2); - printk("Lustre: :: Reply in %u usec\n", - (unsigned)((tv2.tv_sec - tv1.tv_sec) - * 1000000 + (tv2.tv_usec - tv1.tv_usec))); + CWARN("Reply in %u usec\n", + (unsigned)((tv2.tv_sec - tv1.tv_sec) + * 1000000 + (tv2.tv_usec - tv1.tv_usec))); } count++; } @@ -255,7 +254,7 @@ pingcli_start(struct portal_ioctl_data *args) PORTAL_FREE (client->inbuf, (args->ioc_size + STDSIZE) * args->ioc_count); - pingcli_shutdown (2); + pingcli_shutdown (nih, 2); /* Success! 
*/ return NULL; diff --git a/lustre/portals/tests/ping_srv.c b/lustre/portals/tests/ping_srv.c index 1e40ed8..dec806a 100644 --- a/lustre/portals/tests/ping_srv.c +++ b/lustre/portals/tests/ping_srv.c @@ -81,7 +81,7 @@ static void *pingsrv_shutdown(int err) PDEBUG ("PtlMEUnlink", rc); case 3: - kportal_put_ni (nal); + PtlNIFini (server->ni); case 4: @@ -116,12 +116,12 @@ int pingsrv_thread(void *arg) continue; } - magic = *((int *)(server->evnt.mem_desc.start + magic = *((int *)(server->evnt.md.start + server->evnt.offset)); if(magic != 0xdeadbeef) { - printk("LustreError: Unexpected Packet to the server\n"); + CERROR("Unexpected Packet to the server\n"); } memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic)); @@ -129,13 +129,13 @@ int pingsrv_thread(void *arg) server->mdout.length = server->evnt.rlength; server->mdout.start = server->in_buf; server->mdout.threshold = 1; - server->mdout.options = PTL_MD_OP_PUT; + server->mdout.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; server->mdout.user_ptr = NULL; - server->mdout.eventq = PTL_EQ_NONE; + server->mdout.eq_handle = PTL_EQ_NONE; /* Bind the outgoing buffer */ if ((rc = PtlMDBind (server->ni, server->mdout, - &server->mdout_h))) { + PTL_UNLINK, &server->mdout_h))) { PDEBUG ("PtlMDBind", rc); pingsrv_shutdown (1); return 1; @@ -145,9 +145,9 @@ int pingsrv_thread(void *arg) server->mdin.start = server->in_buf; server->mdin.length = MAXSIZE; server->mdin.threshold = 1; - server->mdin.options = PTL_MD_OP_PUT; + server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; server->mdin.user_ptr = NULL; - server->mdin.eventq = server->eq; + server->mdin.eq_handle = server->eq; if ((rc = PtlMDAttach (server->me, server->mdin, PTL_UNLINK, &server->mdin_h))) { @@ -167,49 +167,49 @@ int pingsrv_thread(void *arg) return 0; } -static int pingsrv_packet(ptl_event_t *ev) +static void pingsrv_packet(ptl_event_t *ev) { atomic_inc (&pkt); wake_up_process (server->tsk); - return 1; } /* pingsrv_head() */ 
-static int pingsrv_callback(ptl_event_t *ev) +static void pingsrv_callback(ptl_event_t *ev) { if (ev == NULL) { CERROR ("null in callback, ev=%p\n", ev); - return 0; + return; } server->evnt = *ev; - printk ("Lustre: received ping from nid "LPX64" " + CWARN ("received ping from nid "LPX64" " "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n", ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, - *((int *)(ev->mem_desc.start + ev->offset)), - *((int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned))), - *((int *)(ev->mem_desc.start + ev->offset + 2 * + *((int *)(ev->md.start + ev->offset)), + *((int *)(ev->md.start + ev->offset + sizeof(unsigned))), + *((int *)(ev->md.start + ev->offset + 2 * sizeof(unsigned)))); packets_valid++; - return pingsrv_packet(ev); + pingsrv_packet(ev); } /* pingsrv_callback() */ static struct pingsrv_data *pingsrv_setup(void) { - ptl_handle_ni_t *nip; int rc; + server->ni = PTL_INVALID_HANDLE; + /* Aquire and initialize the proper nal for portals. */ - if ((nip = kportal_get_ni (nal)) == NULL) { + rc = PtlNIInit(nal, 0, NULL, NULL, &server->ni); + if (!(rc == PTL_OK || rc == PTL_IFACE_DUP)) { CDEBUG (D_OTHER, "NAL %d not loaded\n", nal); return pingsrv_shutdown (4); } - server->ni= *nip; /* Based on the initialization aquire our unique portal ID. 
*/ if ((rc = PtlGetId (server->ni, &server->my_id))) { @@ -229,7 +229,7 @@ static struct pingsrv_data *pingsrv_setup(void) } - if ((rc = PtlEQAlloc (server->ni, 1024, pingsrv_callback, + if ((rc = PtlEQAlloc (server->ni, 1024, &pingsrv_callback, &server->eq))) { PDEBUG ("PtlEQAlloc (callback)", rc); return pingsrv_shutdown (2); @@ -245,9 +245,9 @@ static struct pingsrv_data *pingsrv_setup(void) server->mdin.start = server->in_buf; server->mdin.length = MAXSIZE; server->mdin.threshold = 1; - server->mdin.options = PTL_MD_OP_PUT; + server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; server->mdin.user_ptr = NULL; - server->mdin.eventq = server->eq; + server->mdin.eq_handle = server->eq; memset (server->in_buf, 0, STDSIZE); if ((rc = PtlMDAttach (server->me, server->mdin, @@ -298,7 +298,7 @@ static void /*__exit*/ pingsrv_cleanup(void) MODULE_PARM(nal, "i"); MODULE_PARM_DESC(nal, "Use the specified NAL " - "(6-kscimacnal, 2-ksocknal, 1-kqswnal)"); + "(2-ksocknal, 1-kqswnal)"); MODULE_AUTHOR("Brian Behlendorf (LLNL)"); MODULE_DESCRIPTION("A kernel space ping server for portals testing"); diff --git a/lustre/portals/tests/sping_cli.c b/lustre/portals/tests/sping_cli.c index 64a1dd2..730ba00 100644 --- a/lustre/portals/tests/sping_cli.c +++ b/lustre/portals/tests/sping_cli.c @@ -51,7 +51,7 @@ static struct pingcli_data *client = NULL; static int count = 0; static void -pingcli_shutdown(int err) +pingcli_shutdown(ptl_handle_ni_t nih, int err) { int rc; @@ -72,7 +72,7 @@ pingcli_shutdown(int err) if ((rc = PtlMEUnlink (client->me))) PDEBUG ("PtlMEUnlink", rc); case 3: - kportal_put_ni (client->args->ioc_nal); + PtlNIFini (nih); case 4: /* Free our buffers */ @@ -92,17 +92,16 @@ pingcli_shutdown(int err) CDEBUG (D_OTHER, "ping client released resources\n"); } /* pingcli_shutdown() */ -static int pingcli_callback(ptl_event_t *ev) +static void pingcli_callback(ptl_event_t *ev) { - wake_up_process (client->tsk); - return 1; + wake_up_process (client->tsk); } 
static struct pingcli_data * pingcli_start(struct portal_ioctl_data *args) { - const ptl_handle_ni_t *nip; + ptl_handle_ni_t nih = PTL_INVALID_HANDLE; unsigned ping_head_magic = PING_HEADER_MAGIC; char str[PTL_NALFMT_SIZE]; int rc; @@ -122,7 +121,7 @@ pingcli_start(struct portal_ioctl_data *args) if (client->outbuf == NULL) { CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (4); + pingcli_shutdown (nih, 4); return (NULL); } @@ -131,23 +130,24 @@ pingcli_start(struct portal_ioctl_data *args) if (client->inbuf == NULL) { CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (4); + pingcli_shutdown (nih, 4); return (NULL); } /* Aquire and initialize the proper nal for portals. */ - if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) + rc = PtlNIInit(args->ioc_nal, 0, NULL, NULL, &nih); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { CERROR ("NAL %d not loaded.\n", args->ioc_nal); - pingcli_shutdown (4); + pingcli_shutdown (nih, 4); return (NULL); } /* Based on the initialization aquire our unique portal ID. 
*/ - if ((rc = PtlGetId (*nip, &client->myid))) + if ((rc = PtlGetId (nih, &client->myid))) { CERROR ("PtlGetId error %d\n", rc); - pingcli_shutdown (2); + pingcli_shutdown (nih, 2); return (NULL); } @@ -159,20 +159,20 @@ pingcli_start(struct portal_ioctl_data *args) client->id_remote.nid = args->ioc_nid; client->id_remote.pid = 0; - if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT, + if ((rc = PtlMEAttach (nih, PTL_PING_CLIENT, client->id_local, 0, ~0, PTL_RETAIN, PTL_INS_AFTER, &client->me))) { CERROR ("PtlMEAttach error %d\n", rc); - pingcli_shutdown (2); + pingcli_shutdown (nih, 2); return (NULL); } /* Allocate the event queue for this network interface */ - if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) + if ((rc = PtlEQAlloc (nih, 64, pingcli_callback, &client->eq))) { CERROR ("PtlEQAlloc error %d\n", rc); - pingcli_shutdown (2); + pingcli_shutdown (nih, 2); return (NULL); } @@ -180,16 +180,16 @@ pingcli_start(struct portal_ioctl_data *args) client->md_in_head.start = client->inbuf; client->md_in_head.length = STDSIZE; client->md_in_head.threshold = 1; - client->md_in_head.options = PTL_MD_OP_PUT; + client->md_in_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; client->md_in_head.user_ptr = NULL; - client->md_in_head.eventq = client->eq; + client->md_in_head.eq_handle = client->eq; memset (client->inbuf, 0, STDSIZE); /* Attach the incoming buffer */ if ((rc = PtlMDAttach (client->me, client->md_in_head, PTL_UNLINK, &client->md_in_head_h))) { CERROR ("PtlMDAttach error %d\n", rc); - pingcli_shutdown (1); + pingcli_shutdown (nih, 1); return (NULL); } @@ -197,24 +197,24 @@ pingcli_start(struct portal_ioctl_data *args) client->md_out_head.start = client->outbuf; client->md_out_head.length = STDSIZE; client->md_out_head.threshold = 1; - client->md_out_head.options = PTL_MD_OP_PUT; + client->md_out_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; client->md_out_head.user_ptr = NULL; - client->md_out_head.eventq = PTL_EQ_NONE; + 
client->md_out_head.eq_handle = PTL_EQ_NONE; memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic)); /* Bind the outgoing ping header */ - if ((rc=PtlMDBind (*nip, client->md_out_head, - &client->md_out_head_h))) { + if ((rc=PtlMDBind (nih, client->md_out_head, + PTL_UNLINK, &client->md_out_head_h))) { CERROR ("PtlMDBind error %d\n", rc); - pingcli_shutdown (1); + pingcli_shutdown (nih, 1); return (NULL); } /* Put the ping packet */ if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) { PDEBUG ("PtlPut (header)", rc); - pingcli_shutdown (1); + pingcli_shutdown (nih, 1); return NULL; } @@ -222,14 +222,14 @@ pingcli_start(struct portal_ioctl_data *args) set_current_state (TASK_INTERRUPTIBLE); rc = schedule_timeout (20 * args->ioc_timeout); if (rc == 0) { - printk ("LustreError: Time out on the server\n"); - pingcli_shutdown (2); + CERROR ("Time out on the server\n"); + pingcli_shutdown (nih, 2); return NULL; - } else - printk("Lustre: Received respose from the server \n"); - + } else { + CWARN("Received respose from the server \n"); + } - pingcli_shutdown (2); + pingcli_shutdown (nih, 2); /* Success! 
*/ return NULL; diff --git a/lustre/portals/tests/sping_srv.c b/lustre/portals/tests/sping_srv.c index b8bda29..f2382d1 100644 --- a/lustre/portals/tests/sping_srv.c +++ b/lustre/portals/tests/sping_srv.c @@ -53,7 +53,7 @@ #define STDSIZE (sizeof(int) + sizeof(int) + 4) -static int nal = 0; // Your NAL, +static int nal = PTL_IFACE_DEFAULT; // Your NAL, static unsigned long packets_valid = 0; // Valid packets static int running = 1; atomic_t pkt; @@ -86,7 +86,7 @@ static void *pingsrv_shutdown(int err) PDEBUG ("PtlMEUnlink", rc); case 3: - kportal_put_ni (nal); + PtlNIFini(server->ni); case 4: @@ -121,13 +121,13 @@ int pingsrv_thread(void *arg) server->mdout.start = server->in_buf; server->mdout.length = STDSIZE; server->mdout.threshold = 1; - server->mdout.options = PTL_MD_OP_PUT; + server->mdout.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; server->mdout.user_ptr = NULL; - server->mdout.eventq = PTL_EQ_NONE; + server->mdout.eq_handle = PTL_EQ_NONE; /* Bind the outgoing buffer */ if ((rc = PtlMDBind (server->ni, server->mdout, - &server->mdout_h))) { + PTL_UNLINK, &server->mdout_h))) { PDEBUG ("PtlMDBind", rc); pingsrv_shutdown (1); return 1; @@ -137,9 +137,9 @@ int pingsrv_thread(void *arg) server->mdin.start = server->in_buf; server->mdin.length = STDSIZE; server->mdin.threshold = 1; - server->mdin.options = PTL_MD_OP_PUT; + server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; server->mdin.user_ptr = NULL; - server->mdin.eventq = server->eq; + server->mdin.eq_handle = server->eq; if ((rc = PtlMDAttach (server->me, server->mdin, PTL_UNLINK, &server->mdin_h))) { @@ -159,47 +159,46 @@ int pingsrv_thread(void *arg) return 0; } -static int pingsrv_packet(ptl_event_t *ev) +static void pingsrv_packet(ptl_event_t *ev) { atomic_inc (&pkt); wake_up_process (server->tsk); - return 1; } /* pingsrv_head() */ -static int pingsrv_callback(ptl_event_t *ev) +static void pingsrv_callback(ptl_event_t *ev) { if (ev == NULL) { CERROR ("null in callback, 
ev=%p\n", ev); - return 0; + return; } server->evnt = *ev; - printk ("Lustre: received ping from nid "LPX64" " - "(off=%u rlen=%u mlen=%u head=%x)\n", - ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, - *((int *)(ev->mem_desc.start + ev->offset))); + CWARN("Lustre: received ping from nid "LPX64" " + "(off=%u rlen=%u mlen=%u head=%x)\n", + ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, + *((int *)(ev->md.start + ev->offset))); packets_valid++; - return pingsrv_packet(ev); + pingsrv_packet(ev); } /* pingsrv_callback() */ static struct pingsrv_data *pingsrv_setup(void) { - ptl_handle_ni_t *nip; int rc; /* Aquire and initialize the proper nal for portals. */ - if ((nip = kportal_get_ni (nal)) == NULL) { + server->ni = PTL_INVALID_HANDLE; + + rc = PtlNIInit(nal, 0, NULL, NULL, &server->ni); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { CDEBUG (D_OTHER, "Nal %d not loaded.\n", nal); return pingsrv_shutdown (4); } - server->ni= *nip; - /* Based on the initialization aquire our unique portal ID. 
*/ if ((rc = PtlGetId (server->ni, &server->my_id))) { PDEBUG ("PtlGetId", rc); @@ -234,9 +233,9 @@ static struct pingsrv_data *pingsrv_setup(void) server->mdin.start = server->in_buf; server->mdin.length = STDSIZE; server->mdin.threshold = 1; - server->mdin.options = PTL_MD_OP_PUT; + server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; server->mdin.user_ptr = NULL; - server->mdin.eventq = server->eq; + server->mdin.eq_handle = server->eq; memset (server->in_buf, 0, STDSIZE); if ((rc = PtlMDAttach (server->me, server->mdin, @@ -285,7 +284,7 @@ static void /*__exit*/ pingsrv_cleanup(void) MODULE_PARM(nal, "i"); MODULE_PARM_DESC(nal, "Use the specified NAL " - "(6-kscimacnal, 2-ksocknal, 1-kqswnal)"); + "(2-ksocknal, 1-kqswnal)"); MODULE_AUTHOR("Brian Behlendorf (LLNL)"); MODULE_DESCRIPTION("A kernel space ping server for portals testing"); diff --git a/lustre/portals/unals/Makefile.am b/lustre/portals/unals/Makefile.am index 15080b0..3437d39 100644 --- a/lustre/portals/unals/Makefile.am +++ b/lustre/portals/unals/Makefile.am @@ -1,13 +1,10 @@ if LIBLUSTRE +if !CRAY_PORTALS noinst_LIBRARIES = libtcpnal.a endif +endif -noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h \ - ipmap.h bridge.h procbridge.h - -libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h \ - dispatch.h table.h timer.h address.c procapi.c proclib.c \ - connection.c tcpnal.c connection.h - +noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h +libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS) libtcpnal_a_CFLAGS = $(LLCFLAGS) diff --git a/lustre/portals/unals/address.c b/lustre/portals/unals/address.c index 6507924..f329e2a 100644 --- a/lustre/portals/unals/address.c +++ b/lustre/portals/unals/address.c @@ -91,8 +91,8 @@ void set_address(bridge t,ptl_pid_t pidrequest) int 
port; if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; else port=pidrequest; - t->nal_cb->ni.nid=get_node_id(); - t->nal_cb->ni.pid=port; + t->lib_nal->libnal_ni.ni_pid.nid=get_node_id(); + t->lib_nal->libnal_ni.ni_pid.pid=port; } #else @@ -120,10 +120,9 @@ void set_address(bridge t,ptl_pid_t pidrequest) in_addr = get_node_id(); t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ - t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) - << PNAL_VNODE_SHIFT) - + virtnode; - + t->lib_nal->libnal_ni.ni_pid.nid = ((in_addr & PNAL_HOSTID_MASK) + << PNAL_VNODE_SHIFT) + + virtnode; pid=pidrequest; /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ #ifdef notyet @@ -141,6 +140,6 @@ void set_address(bridge t,ptl_pid_t pidrequest) return; } else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; - t->nal_cb->ni.pid=pid; + t->lib_nal->libnal_ni.ni_pid.pid=pid; } #endif diff --git a/lustre/portals/unals/bridge.h b/lustre/portals/unals/bridge.h index 9a90ab8..d2f0f2c 100644 --- a/lustre/portals/unals/bridge.h +++ b/lustre/portals/unals/bridge.h @@ -10,10 +10,16 @@ #define TCPNAL_PROCBRIDGE_H #include +#include + +#define PTL_IFACE_TCP 1 +#define PTL_IFACE_ER 2 +#define PTL_IFACE_SS 3 +#define PTL_IFACE_MAX 4 typedef struct bridge { int alive; - nal_cb_t *nal_cb; + lib_nal_t *lib_nal; void *lower; void *local; void (*shutdown)(struct bridge *); @@ -22,12 +28,6 @@ typedef struct bridge { } *bridge; -nal_t *bridge_init(ptl_interface_t nal, - ptl_pid_t pid_request, - ptl_ni_limits_t *desired, - ptl_ni_limits_t *actual, - int *rc); - typedef int (*nal_initialize)(bridge); extern nal_initialize nal_table[PTL_IFACE_MAX]; diff --git a/lustre/portals/unals/connection.c b/lustre/portals/unals/connection.c index 7b4cecd..b399fcf 100644 --- a/lustre/portals/unals/connection.c +++ b/lustre/portals/unals/connection.c @@ -201,35 +201,30 @@ static int new_connection(void *z) return(1); } -/* FIXME assuming little endian, cleanup!! 
*/ -#define __cpu_to_le64(x) ((__u64)(x)) -#define __le64_to_cpu(x) ((__u64)(x)) -#define __cpu_to_le32(x) ((__u32)(x)) -#define __le32_to_cpu(x) ((__u32)(x)) -#define __cpu_to_le16(x) ((__u16)(x)) -#define __le16_to_cpu(x) ((__u16)(x)) - extern ptl_nid_t tcpnal_mynid; int tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) { int rc; + int nob; ptl_hdr_t hdr; ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); memset (&hdr, 0, sizeof (hdr)); - hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); - hmv->version_major = __cpu_to_le32 (PORTALS_PROTO_VERSION_MAJOR); - hmv->version_minor = __cpu_to_le32 (PORTALS_PROTO_VERSION_MINOR); + hmv->magic = cpu_to_le32(PORTALS_PROTO_MAGIC); + hmv->version_major = cpu_to_le32(PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = cpu_to_le32(PORTALS_PROTO_VERSION_MINOR); - hdr.src_nid = __cpu_to_le64 (tcpnal_mynid); - hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + hdr.src_nid = cpu_to_le64(tcpnal_mynid); + hdr.type = cpu_to_le32(PTL_MSG_HELLO); + + hdr.msg.hello.type = cpu_to_le32(type); + hdr.msg.hello.incarnation = cpu_to_le64(incarnation); - hdr.msg.hello.type = __cpu_to_le32 (type); - hdr.msg.hello.incarnation = 0; + /* I don't send any interface info */ /* Assume sufficient socket buffering for this message */ rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr)); @@ -244,28 +239,28 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) return (rc); } - if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) { + if (hmv->magic != le32_to_cpu(PORTALS_PROTO_MAGIC)) { CERROR ("Bad magic %#08x (%#08x expected) from "LPX64"\n", - __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, *nid); + cpu_to_le32(hmv->magic), PORTALS_PROTO_MAGIC, *nid); return (-EPROTO); } - if (hmv->version_major != __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || - hmv->version_minor != __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { + if (hmv->version_major != cpu_to_le16 
(PORTALS_PROTO_VERSION_MAJOR) || + hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { CERROR ("Incompatible protocol version %d.%d (%d.%d expected)" " from "LPX64"\n", - __le16_to_cpu (hmv->version_major), - __le16_to_cpu (hmv->version_minor), + le16_to_cpu (hmv->version_major), + le16_to_cpu (hmv->version_minor), PORTALS_PROTO_VERSION_MAJOR, PORTALS_PROTO_VERSION_MINOR, *nid); return (-EPROTO); } -#if (PORTALS_PROTO_VERSION_MAJOR != 0) -# error "This code only understands protocol version 0.x" +#if (PORTALS_PROTO_VERSION_MAJOR != 1) +# error "This code only understands protocol version 1.x" #endif - /* version 0 sends magic/version as the dest_nid of a 'hello' header, + /* version 1 sends magic/version as the dest_nid of a 'hello' header, * so read the rest of it in now... */ rc = syscall(SYS_read, sockfd, hmv + 1, sizeof(hdr) - sizeof(*hmv)); @@ -276,27 +271,49 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) } /* ...and check we got what we expected */ - if (hdr.type != __cpu_to_le32 (PTL_MSG_HELLO) || - hdr.payload_length != __cpu_to_le32 (0)) { - CERROR ("Expecting a HELLO hdr with 0 payload," + if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) { + CERROR ("Expecting a HELLO hdr " " but got type %d with %d payload from "LPX64"\n", - __le32_to_cpu (hdr.type), - __le32_to_cpu (hdr.payload_length), *nid); + le32_to_cpu (hdr.type), + le32_to_cpu (hdr.payload_length), *nid); return (-EPROTO); } - if (__le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) { + if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) { CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY\n"); return (-EPROTO); } if (*nid == PTL_NID_ANY) { /* don't know peer's nid yet */ - *nid = __le64_to_cpu(hdr.src_nid); - } else if (*nid != __le64_to_cpu (hdr.src_nid)) { + *nid = le64_to_cpu(hdr.src_nid); + } else if (*nid != le64_to_cpu (hdr.src_nid)) { CERROR ("Connected to nid "LPX64", but expecting "LPX64"\n", - __le64_to_cpu (hdr.src_nid), *nid); + le64_to_cpu 
(hdr.src_nid), *nid); + return (-EPROTO); + } + + /* Ignore any interface info in the payload */ + nob = le32_to_cpu(hdr.payload_length); + if (nob > getpagesize()) { + CERROR("Unexpected HELLO payload %d from "LPX64"\n", + nob, *nid); return (-EPROTO); } + if (nob > 0) { + char *space = (char *)malloc(nob); + + if (space == NULL) { + CERROR("Can't allocate scratch buffer %d\n", nob); + return (-ENOMEM); + } + + rc = syscall(SYS_read, sockfd, space, nob); + if (rc <= 0) { + CERROR("Error %d skipping HELLO payload from " + LPX64"\n", rc, *nid); + return (rc); + } + } return (0); } @@ -325,6 +342,8 @@ connection force_tcp_connection(manager m, int rport; ptl_nid_t peernid = PTL_NID_ANY; + port = tcpnal_acceptor_port; + id[0] = ip; id[1] = port; @@ -366,7 +385,7 @@ connection force_tcp_connection(manager m, sizeof(struct sockaddr_in)); if (rc == 0) { break; - } else if (errno != EADDRINUSE) { + } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) { perror("Error connecting to remote host"); close(fd); goto out; @@ -411,6 +430,7 @@ out: return (conn); } + /* Function: bind_socket * Arguments: t: the nal state for this interface * port: the port to attempt to bind to diff --git a/lustre/portals/unals/dispatch.h b/lustre/portals/unals/dispatch.h index 34dd070..a8f916d9 100644 --- a/lustre/portals/unals/dispatch.h +++ b/lustre/portals/unals/dispatch.h @@ -37,3 +37,10 @@ void remove_io_handler (io_handler i); void init_unix_timer(void); void select_timer_block(when until); when now(void); + +/* + * hacking for CFS internal MPI testing + */ +#if !CRAY_PORTALS +#define ENABLE_SELECT_DISPATCH +#endif diff --git a/lustre/portals/unals/procapi.c b/lustre/portals/unals/procapi.c index c27f555..6b471c0 100644 --- a/lustre/portals/unals/procapi.c +++ b/lustre/portals/unals/procapi.c @@ -60,34 +60,6 @@ void procbridge_wakeup_nal(procbridge p) syscall(SYS_write, p->notifier[0], buf, sizeof(buf)); } -/* Function: forward - * Arguments: nal_t *nal: pointer to my top-side nal 
structure - * id: the command to pass to the lower layer - * args, args_len:pointer to and length of the request - * ret, ret_len: pointer to and size of the result - * Returns: a portals status code - * - * forwards a packaged api call from the 'api' side to the 'library' - * side, and collects the result - */ -static int procbridge_forward(nal_t *n, int id, void *args, size_t args_len, - void *ret, size_t ret_len) -{ - bridge b = (bridge) n->nal_data; - - if (id == PTL_FINI) { - lib_fini(b->nal_cb); - - if (b->shutdown) - (*b->shutdown)(b); - } - - lib_dispatch(b->nal_cb, NULL, id, args, ret); - - return (PTL_OK); -} - - /* Function: shutdown * Arguments: nal: a pointer to my top side nal structure * ni: my network interface index @@ -95,9 +67,10 @@ static int procbridge_forward(nal_t *n, int id, void *args, size_t args_len, * cleanup nal state, reclaim the lower side thread and * its state using PTL_FINI codepoint */ -static int procbridge_shutdown(nal_t *n, int ni) +static void procbridge_shutdown(nal_t *n) { - bridge b=(bridge)n->nal_data; + lib_nal_t *nal = n->nal_data; + bridge b=(bridge)nal->libnal_data; procbridge p=(procbridge)b->local; p->nal_flags |= NAL_FLAG_STOPPING; @@ -114,77 +87,31 @@ static int procbridge_shutdown(nal_t *n, int ni) } while (1); free(p); - return(0); -} - - -/* Function: validate - * useless stub - */ -static int procbridge_validate(nal_t *nal, void *base, size_t extent) -{ - return(0); } -/* FIXME cfs temporary workaround! FIXME - * global time out value - */ -int __tcpnal_eqwait_timeout_value = 0; -int __tcpnal_eqwait_timedout = 0; - -/* Function: yield - * Arguments: pid: - * - * this function was originally intended to allow the - * lower half thread to be scheduled to allow progress. we - * overload it to explicitly block until signalled by the - * lower half. 
- */ -static void procbridge_yield(nal_t *n) -{ - bridge b=(bridge)n->nal_data; - procbridge p=(procbridge)b->local; +/* forward decl */ +extern int procbridge_startup (nal_t *, ptl_pid_t, + ptl_ni_limits_t *, ptl_ni_limits_t *); - pthread_mutex_lock(&p->mutex); - if (!__tcpnal_eqwait_timeout_value) { - pthread_cond_wait(&p->cond,&p->mutex); - } else { - struct timeval now; - struct timespec timeout; - - gettimeofday(&now, NULL); - timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value; - timeout.tv_nsec = now.tv_usec * 1000; - - __tcpnal_eqwait_timedout = - pthread_cond_timedwait(&p->cond, &p->mutex, &timeout); - } - pthread_mutex_unlock(&p->mutex); -} - - -static void procbridge_lock(nal_t * nal, unsigned long *flags){} -static void procbridge_unlock(nal_t * nal, unsigned long *flags){} /* api_nal * the interface vector to allow the generic code to access - * this nal. this is seperate from the library side nal_cb. + * this nal. this is seperate from the library side lib_nal. * TODO: should be dyanmically allocated */ -static nal_t api_nal = { - ni: {0}, +nal_t procapi_nal = { nal_data: NULL, - forward: procbridge_forward, - shutdown: procbridge_shutdown, - validate: procbridge_validate, - yield: procbridge_yield, - lock: procbridge_lock, - unlock: procbridge_unlock + nal_ni_init: procbridge_startup, + nal_ni_fini: procbridge_shutdown, }; ptl_nid_t tcpnal_mynid; -/* Function: procbridge_interface +#ifdef ENABLE_SELECT_DISPATCH +procbridge __global_procbridge = NULL; +#endif + +/* Function: procbridge_startup * * Arguments: pid: requested process id (port offset) * PTL_ID_ANY not supported. @@ -192,65 +119,62 @@ ptl_nid_t tcpnal_mynid; * and effectively ignored * actual: limits actually allocated and returned * - * Returns: a pointer to my statically allocated top side NAL - * structure + * Returns: portals rc * * initializes the tcp nal. we define unix_failure as an * error wrapper to cut down clutter. 
*/ -nal_t *procbridge_interface(int num_interface, - ptl_pt_index_t ptl_size, - ptl_ac_index_t acl_size, - ptl_pid_t requested_pid) +int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) { nal_init_args_t args; + procbridge p; bridge b; - static int initialized=0; - ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + /* XXX nal_type is purely private to tcpnal here */ int nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */ - if(initialized) return (&api_nal); + LASSERT(nal == &procapi_nal); init_unix_timer(); b=(bridge)malloc(sizeof(struct bridge)); p=(procbridge)malloc(sizeof(struct procbridge)); - api_nal.nal_data=b; b->local=p; - if (ptl_size) - limits.max_ptable_index = ptl_size; - if (acl_size) - limits.max_atable_index = acl_size; - args.nia_requested_pid = requested_pid; - args.nia_limits = &limits; + args.nia_requested_limits = requested_limits; + args.nia_actual_limits = actual_limits; args.nia_nal_type = nal_type; args.nia_bridge = b; + args.nia_apinal = nal; /* init procbridge */ pthread_mutex_init(&p->mutex,0); pthread_cond_init(&p->cond, 0); p->nal_flags = 0; - pthread_mutex_init(&p->nal_cb_lock, 0); /* initialize notifier */ if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) { perror("socketpair failed"); - return NULL; + return PTL_FAIL; } if (!register_io_handler(p->notifier[1], READ_HANDLER, procbridge_notifier_handler, p)) { perror("fail to register notifier handler"); - return NULL; + return PTL_FAIL; } +#ifdef ENABLE_SELECT_DISPATCH + __global_procbridge = p; +#endif + /* create nal thread */ if (pthread_create(&p->t, NULL, nal_thread, &args)) { perror("nal_init: pthread_create"); - return(NULL); + return PTL_FAIL; } do { @@ -264,10 +188,9 @@ nal_t *procbridge_interface(int num_interface, } while (1); if (p->nal_flags & NAL_FLAG_STOPPED) - return (NULL); + return PTL_FAIL; - b->nal_cb->ni.nid = tcpnal_mynid; - initialized = 1; + b->lib_nal->libnal_ni.ni_pid.nid = 
tcpnal_mynid; - return (&api_nal); + return PTL_OK; } diff --git a/lustre/portals/unals/procbridge.h b/lustre/portals/unals/procbridge.h index 965f83d..1f91ced 100644 --- a/lustre/portals/unals/procbridge.h +++ b/lustre/portals/unals/procbridge.h @@ -30,14 +30,15 @@ typedef struct procbridge { int nal_flags; - pthread_mutex_t nal_cb_lock; } *procbridge; typedef struct nal_init_args { ptl_pid_t nia_requested_pid; - ptl_ni_limits_t *nia_limits; + ptl_ni_limits_t *nia_requested_limits; + ptl_ni_limits_t *nia_actual_limits; int nia_nal_type; bridge nia_bridge; + nal_t *nia_apinal; } nal_init_args_t; extern void *nal_thread(void *); @@ -50,10 +51,6 @@ extern void *nal_thread(void *); #define MAX_PTLS 128 extern void set_address(bridge t,ptl_pid_t pidrequest); -extern nal_t *procbridge_interface(int num_interface, - ptl_pt_index_t ptl_size, - ptl_ac_index_t acl_size, - ptl_pid_t requested_pid); extern void procbridge_wakeup_nal(procbridge p); #endif diff --git a/lustre/portals/unals/proclib.c b/lustre/portals/unals/proclib.c index 2a5ba0d..7ee7c71 100644 --- a/lustre/portals/unals/proclib.c +++ b/lustre/portals/unals/proclib.c @@ -43,94 +43,27 @@ /* the following functions are stubs to satisfy the nal definition without doing anything particularily useful*/ -static ptl_err_t nal_write(nal_cb_t *nal, - void *private, - user_ptr dst_addr, - void *src_addr, - size_t len) -{ - memcpy(dst_addr, src_addr, len); - return PTL_OK; -} - -static ptl_err_t nal_read(nal_cb_t * nal, - void *private, - void *dst_addr, - user_ptr src_addr, - size_t len) -{ - memcpy(dst_addr, src_addr, len); - return PTL_OK; -} - -static void *nal_malloc(nal_cb_t *nal, - size_t len) -{ - void *buf = malloc(len); - return buf; -} - -static void nal_free(nal_cb_t *nal, - void *buf, - size_t len) -{ - free(buf); -} - -static void nal_printf(nal_cb_t *nal, - const char *fmt, - ...) 
-{ - va_list ap; - - va_start(ap, fmt); - vprintf(fmt, ap); - va_end(ap); -} - - -static void nal_cli(nal_cb_t *nal, - unsigned long *flags) -{ - bridge b = (bridge) nal->nal_data; - procbridge p = (procbridge) b->local; - - pthread_mutex_lock(&p->nal_cb_lock); -} - - -static void nal_sti(nal_cb_t *nal, - unsigned long *flags) -{ - bridge b = (bridge)nal->nal_data; - procbridge p = (procbridge) b->local; - - pthread_mutex_unlock(&p->nal_cb_lock); -} - - -static int nal_dist(nal_cb_t *nal, +static int nal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { return 0; } -static void wakeup_topside(void *z) +static void check_stopping(void *z) { bridge b = z; procbridge p = b->local; - int stop; + if ((p->nal_flags & NAL_FLAG_STOPPING) == 0) + return; + pthread_mutex_lock(&p->mutex); - stop = p->nal_flags & NAL_FLAG_STOPPING; - if (stop) - p->nal_flags |= NAL_FLAG_STOPPED; + p->nal_flags |= NAL_FLAG_STOPPED; pthread_cond_broadcast(&p->cond); pthread_mutex_unlock(&p->mutex); - if (stop) - pthread_exit(0); + pthread_exit(0); } @@ -146,9 +79,6 @@ static void wakeup_topside(void *z) * We define a limit macro to place a ceiling on limits * for syntactic convenience */ -#define LIMIT(x,y,max)\ - if ((unsigned int)x > max) y = max; - extern int tcpnal_init(bridge); nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0}; @@ -159,46 +89,30 @@ void *nal_thread(void *z) bridge b = args->nia_bridge; procbridge p=b->local; int rc; - ptl_pid_t pid_request; + ptl_process_id_t process_id; int nal_type; - ptl_ni_limits_t desired; - ptl_ni_limits_t actual; - b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t)); - b->nal_cb->nal_data=b; - b->nal_cb->cb_read=nal_read; - b->nal_cb->cb_write=nal_write; - b->nal_cb->cb_malloc=nal_malloc; - b->nal_cb->cb_free=nal_free; - b->nal_cb->cb_map=NULL; - b->nal_cb->cb_unmap=NULL; - b->nal_cb->cb_printf=nal_printf; - b->nal_cb->cb_cli=nal_cli; - b->nal_cb->cb_sti=nal_sti; - b->nal_cb->cb_dist=nal_dist; - - pid_request = args->nia_requested_pid; - 
desired = *args->nia_limits; - nal_type = args->nia_nal_type; + b->lib_nal=(lib_nal_t *)malloc(sizeof(lib_nal_t)); + b->lib_nal->libnal_data=b; + b->lib_nal->libnal_map=NULL; + b->lib_nal->libnal_unmap=NULL; + b->lib_nal->libnal_dist=nal_dist; - actual = desired; - LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES); - LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS); - LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS); - LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS); - LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS); + nal_type = args->nia_nal_type; - set_address(b,pid_request); + /* Wierd, but this sets b->lib_nal->libnal_ni.ni_pid.{nid,pid}, which + * lib_init() is about to do from the process_id passed to it...*/ + set_address(b,args->nia_requested_pid); + process_id = b->lib_nal->libnal_ni.ni_pid; + if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); /* initialize the generic 'library' level code */ - rc = lib_init(b->nal_cb, - b->nal_cb->ni.nid, - b->nal_cb->ni.pid, - 10, - actual.max_ptable_index, - actual.max_atable_index); + rc = lib_init(b->lib_nal, args->nia_apinal, + process_id, + args->nia_requested_limits, + args->nia_actual_limits); /* * Whatever the initialization returned is passed back to the @@ -207,18 +121,17 @@ void *nal_thread(void *z) */ /* this should perform error checking */ pthread_mutex_lock(&p->mutex); - p->nal_flags |= rc ? NAL_FLAG_STOPPED : NAL_FLAG_RUNNING; + p->nal_flags |= (rc != PTL_OK) ? NAL_FLAG_STOPPED : NAL_FLAG_RUNNING; pthread_cond_broadcast(&p->cond); pthread_mutex_unlock(&p->mutex); - if (!rc) { + if (rc == PTL_OK) { /* the thunk function is called each time the timer loop performs an operation and returns to blocking mode. 
we overload this function to inform the api side that it may be interested in looking at the event queue */ - register_thunk(wakeup_topside,b); + register_thunk(check_stopping,b); timer_loop(); } return(0); } -#undef LIMIT diff --git a/lustre/portals/unals/select.c b/lustre/portals/unals/select.c index c4ccae1..09e1542 100644 --- a/lustre/portals/unals/select.c +++ b/lustre/portals/unals/select.c @@ -34,8 +34,12 @@ #include #include #include +#include +#include +#include #include #include +#include static struct timeval beginning_of_epoch; @@ -95,40 +99,22 @@ void remove_io_handler (io_handler i) i->disabled=1; } -static void set_flag(io_handler n,fd_set *fds) +static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e) { - if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]); - if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]); - if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]); + if (n->type & READ_HANDLER) FD_SET(n->fd, r); + if (n->type & WRITE_HANDLER) FD_SET(n->fd, w); + if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e); } - -/* Function: select_timer_block - * Arguments: until: an absolute time when the select should return - * - * This function dispatches the various file descriptors' handler - * functions, if the kernel indicates there is io available. 
- */ -void select_timer_block(when until) +static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e) { - fd_set fds[3]; - struct timeval timeout; - struct timeval *timeout_pointer; - int result; io_handler j; io_handler *k; + int max = 0; - /* TODO: loop until the entire interval is expired*/ - if (until){ - when interval=until-now(); - timeout.tv_sec=(interval>>32); - timeout.tv_usec=((interval<<32)/1000000)>>32; - timeout_pointer=&timeout; - } else timeout_pointer=0; - - FD_ZERO(&fds[0]); - FD_ZERO(&fds[1]); - FD_ZERO(&fds[2]); + FD_ZERO(r); + FD_ZERO(w); + FD_ZERO(e); for (k=&io_handlers;*k;){ if ((*k)->disabled){ j=*k; @@ -136,24 +122,291 @@ void select_timer_block(when until) free(j); } if (*k) { - set_flag(*k,fds); + set_flag(*k,r,w,e); + if ((*k)->fd > max) + max = (*k)->fd; k=&(*k)->next; } } + return max + 1; +} + +static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e) +{ + io_handler j; + int n = 0, t; + + for (j = io_handlers; j; j = j->next) { + if (j->disabled) + continue; + + t = 0; + if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) { + FD_CLR(j->fd, r); + t++; + } + if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) { + FD_CLR(j->fd, w); + t++; + } + if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) { + FD_CLR(j->fd, e); + t++; + } + if (t == 0) + continue; + + if (!(*j->function)(j->argument)) + j->disabled = 1; + + n += t; + } + + return n; +} - result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer); +#ifdef ENABLE_SELECT_DISPATCH - if (result > 0) - for (j=io_handlers;j;j=j->next){ - if (!(j->disabled) && - ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) || - (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) || - (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){ - if (!(*j->function)(j->argument)) - j->disabled=1; +static struct { + pthread_mutex_t mutex; + pthread_cond_t cond; + int submitted; + int nready; + int maxfd; + fd_set *rset; + fd_set *wset; + fd_set *eset; + struct timeval 
*timeout; + struct timeval submit_time; +} fd_extra = { + PTHREAD_MUTEX_INITIALIZER, + PTHREAD_COND_INITIALIZER, + 0, 0, 0, + NULL, NULL, NULL, NULL, +}; + +extern int liblustre_wait_event(int timeout); +extern procbridge __global_procbridge; + +/* + * this will intercept syscall select() of user apps + * such as MPI libs. + */ +int select(int n, fd_set *rset, fd_set *wset, fd_set *eset, + struct timeval *timeout) +{ + LASSERT(fd_extra.submitted == 0); + + fd_extra.nready = 0; + fd_extra.maxfd = n; + fd_extra.rset = rset; + fd_extra.wset = wset; + fd_extra.eset = eset; + fd_extra.timeout = timeout; + + liblustre_wait_event(0); + pthread_mutex_lock(&fd_extra.mutex); + gettimeofday(&fd_extra.submit_time, NULL); + fd_extra.submitted = 1; + LASSERT(__global_procbridge); + procbridge_wakeup_nal(__global_procbridge); + +again: + if (fd_extra.submitted) + pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex); + pthread_mutex_unlock(&fd_extra.mutex); + + liblustre_wait_event(0); + + pthread_mutex_lock(&fd_extra.mutex); + if (fd_extra.submitted) + goto again; + pthread_mutex_unlock(&fd_extra.mutex); + + LASSERT(fd_extra.nready >= 0); + LASSERT(fd_extra.submitted == 0); + return fd_extra.nready; +} + +static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset) +{ + int i; + + LASSERT(rset); + LASSERT(wset); + LASSERT(eset); + + for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) { + LASSERT(!fd_extra.rset || + !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i])); + LASSERT(!fd_extra.wset || + !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i])); + LASSERT(!fd_extra.eset || + !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i])); + + if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i]) + __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i]; + if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i]) + __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i]; + if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i]) + __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i]; + } + + return 
(fd_extra.maxfd > max ? fd_extra.maxfd : max); +} + +static inline +int timeval_ge(struct timeval *tv1, struct timeval *tv2) +{ + LASSERT(tv1 && tv2); + return ((tv1->tv_sec - tv2->tv_sec) * 1000000 + + (tv1->tv_usec - tv2->tv_usec) >= 0); +} + +/* + * choose the most recent timeout value + */ +static struct timeval *choose_timeout(struct timeval *tv1, + struct timeval *tv2) +{ + if (!tv1) + return tv2; + else if (!tv2) + return tv1; + + if (timeval_ge(tv1, tv2)) + return tv2; + else + return tv1; +} + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. + */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer, *select_timeout; + int max, nready, nexec; + int fd_handling; + +again: + if (until) { + when interval; + + interval = until - now(); + timeout.tv_sec = (interval >> 32); + timeout.tv_usec = ((interval << 32) / 1000000) >> 32; + timeout_pointer = &timeout; + } else + timeout_pointer = NULL; + + fd_handling = 0; + max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); + select_timeout = timeout_pointer; + + pthread_mutex_lock(&fd_extra.mutex); + fd_handling = fd_extra.submitted; + pthread_mutex_unlock(&fd_extra.mutex); + if (fd_handling) { + max = merge_fds(max, &fds[0], &fds[1], &fds[2]); + select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout); + } + + /* XXX only compile for linux */ +#if __WORDSIZE == 64 + nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2], + select_timeout); +#else + nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2], + select_timeout); +#endif + if (nready < 0) { + CERROR("select return err %d, errno %d\n", nready, errno); + return; + } + + if (nready) { + nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]); + nready -= nexec; + } else + nexec = 0; + + /* even both 
nready & nexec are 0, we still need try to wakeup + * upper thread since it may have timed out + */ + if (fd_handling) { + LASSERT(nready >= 0); + + pthread_mutex_lock(&fd_extra.mutex); + if (nready) { + if (fd_extra.rset) + *fd_extra.rset = fds[0]; + if (fd_extra.wset) + *fd_extra.wset = fds[1]; + if (fd_extra.eset) + *fd_extra.eset = fds[2]; + fd_extra.nready = nready; + fd_extra.submitted = 0; + } else { + struct timeval t; + + fd_extra.nready = 0; + if (fd_extra.timeout) { + gettimeofday(&t, NULL); + if (timeval_ge(&t, &fd_extra.submit_time)) + fd_extra.submitted = 0; } } + + pthread_cond_signal(&fd_extra.cond); + pthread_mutex_unlock(&fd_extra.mutex); + } + + /* haven't found portals event, go back to loop if time + * is not expired */ + if (!nexec) { + if (timeout_pointer == NULL || now() >= until) + goto again; + } +} + +#else /* !ENABLE_SELECT_DISPATCH */ + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. 
+ */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer; + int max, nready; + +again: + if (until) { + when interval; + interval = until - now(); + timeout.tv_sec = (interval >> 32); + timeout.tv_usec = ((interval << 32) / 1000000) >> 32; + timeout_pointer = &timeout; + } else + timeout_pointer = NULL; + + max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); + + nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer); + if (nready > 0) + execute_callbacks(&fds[0], &fds[1], &fds[2]); } +#endif /* ENABLE_SELECT_DISPATCH */ /* Function: init_unix_timer() * is called to initialize the library diff --git a/lustre/portals/unals/tcpnal.c b/lustre/portals/unals/tcpnal.c index 0c47f42..abb6d01 100644 --- a/lustre/portals/unals/tcpnal.c +++ b/lustre/portals/unals/tcpnal.c @@ -55,7 +55,7 @@ * * sends a packet to the peer, after insuring that a connection exists */ -ptl_err_t tcpnal_send(nal_cb_t *n, +ptl_err_t tcpnal_send(lib_nal_t *n, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, @@ -68,7 +68,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, size_t len) { connection c; - bridge b=(bridge)n->nal_data; + bridge b=(bridge)n->libnal_data; struct iovec tiov[257]; static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER; ptl_err_t rc = PTL_OK; @@ -142,7 +142,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, /* Function: tcpnal_recv - * Arguments: nal_cb_t *nal: pointer to my nal control block + * Arguments: lib_nal_t *nal: pointer to my nal control block * void *private: connection pointer passed through * lib_parse() * lib_msg_t *cookie: passed back to portals library @@ -154,7 +154,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, * blocking read of the requested data. 
must drain out the * difference of mainpulated and requested lengths from the network */ -ptl_err_t tcpnal_recv(nal_cb_t *n, +ptl_err_t tcpnal_recv(lib_nal_t *n, void *private, lib_msg_t *cookie, unsigned int niov, @@ -217,7 +217,8 @@ static int from_connection(void *a, void *d) ptl_hdr_t hdr; if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ - lib_parse(b->nal_cb, &hdr, c); + lib_parse(b->lib_nal, &hdr, c); + /*TODO: check error status*/ return(1); } return(0); @@ -239,19 +240,17 @@ int tcpnal_init(bridge b) { manager m; - b->nal_cb->cb_send=tcpnal_send; - b->nal_cb->cb_recv=tcpnal_recv; + b->lib_nal->libnal_send=tcpnal_send; + b->lib_nal->libnal_recv=tcpnal_recv; b->shutdown=tcpnal_shutdown; - if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid, - b->nal_cb->ni.pid), + if (!(m=init_connections(PNAL_PORT(b->lib_nal->libnal_ni.ni_pid.nid, + b->lib_nal->libnal_ni.ni_pid.pid), from_connection,b))){ /* TODO: this needs to shut down the newly created junk */ return(PTL_NAL_FAILED); } - /* XXX cfs hack */ - b->nal_cb->ni.pid=0; b->lower=m; return(PTL_OK); } diff --git a/lustre/portals/utils/Makefile.am b/lustre/portals/utils/Makefile.am index 051bcd9..1d9f905 100644 --- a/lustre/portals/utils/Makefile.am +++ b/lustre/portals/utils/Makefile.am @@ -9,14 +9,22 @@ if LIBLUSTRE noinst_LIBRARIES = libuptlctl.a -libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h +endif + +libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS) libuptlctl_a_CFLAGS = $(LLCFLAGS) -endif -if UTILS -sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid +sbin_PROGRAMS = debugctl + lib_LIBRARIES = libptlctl.a + +libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h + +if UTILS +if !CRAY_PORTALS +sbin_PROGRAMS += acceptor ptlctl routerstat wirecheck gmnalnid +endif endif acceptor_SOURCES = acceptor.c @@ -24,16 +32,15 @@ acceptor_LDADD = $(LIBWRAP) wirecheck_SOURCES = wirecheck.c -libptlctl_a_SOURCES = 
portals.c debug.c l_ioctl.c parser.c parser.h - gmnalnid_SOURCES = gmnalnid.c ptlctl_SOURCES = ptlctl.c ptlctl_LDADD = -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE) ptlctl_DEPENDENCIES = libptlctl.a +routerstat_SOURCES = routerstat.c + debugctl_SOURCES = debugctl.c debugctl_LDADD = -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE) debugctl_DEPENDENCIES = libptlctl.a -routerstat_SOURCES = routerstat.c diff --git a/lustre/portals/utils/acceptor.c b/lustre/portals/utils/acceptor.c index cff2235..524d128 100644 --- a/lustre/portals/utils/acceptor.c +++ b/lustre/portals/utils/acceptor.c @@ -69,83 +69,31 @@ int pidfile_exists(char *name, int port) return (0); } -int -parse_size (int *sizep, char *str) -{ - int size; - char mod[32]; - - switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) - { - default: - return (-1); - - case 1: - *sizep = size; - return (0); - - case 2: - switch (*mod) - { - case 'g': - case 'G': - *sizep = size << 30; - return (0); - - case 'm': - case 'M': - *sizep = size << 20; - return (0); - - case 'k': - case 'K': - *sizep = size << 10; - return (0); - - default: - *sizep = size; - return (0); - } - } -} - void show_connection (int fd, __u32 net_ip) { struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET); __u32 host_ip = ntohl (net_ip); - int rxmem = 0; - int txmem = 0; - int nonagle = 0; int len; char host[1024]; - len = sizeof (txmem); - if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &len) != 0) - perror ("Cannot get write buffer size"); - - len = sizeof (rxmem); - if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &len) != 0) - perror ("Cannot get read buffer size"); - - len = sizeof (nonagle); - if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &len) != 0) - perror ("Cannot get nagle"); - if (h == NULL) snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff, (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff); else snprintf (host, sizeof(host), "%s", h->h_name); - syslog (LOG_INFO, "Accepted host: %s snd: 
%d rcv %d nagle: %s\n", - host, txmem, rxmem, nonagle ? "disabled" : "enabled"); + syslog (LOG_INFO, "Accepted host: %s\n", host); } void usage (char *myname) { - fprintf (stderr, "Usage: %s [-r recv_mem] [-s send_mem] [-n] [-p] [-N nal_id] port\n", myname); + fprintf (stderr, + "Usage: %s [-N nal_id] [-p] [-l] port\n\n" + " -l\tKeep stdin/stdout open\n" + " -p\tAllow connections from non-privileged ports\n", + myname); exit (1); } @@ -154,52 +102,29 @@ int main(int argc, char **argv) int o, fd, rc, port, pfd; struct sockaddr_in srvaddr; int c; - int rxmem = 0; - int txmem = 0; int noclose = 0; - int nonagle = 1; int nal = SOCKNAL; - int bind_irq = 0; int rport; int require_privports = 1; - while ((c = getopt (argc, argv, "N:pr:s:nli")) != -1) - switch (c) - { - case 'r': - if (parse_size (&rxmem, optarg) != 0 || rxmem < 0) - usage (argv[0]); - break; - - case 's': - if (parse_size (&txmem, optarg) != 0 || txmem < 0) - usage (argv[0]); - break; - - case 'n': - nonagle = 0; + while ((c = getopt (argc, argv, "N:lp")) != -1) { + switch (c) { + case 'N': + if (sscanf(optarg, "%d", &nal) != 1 || + nal < 0 || nal > NAL_MAX_NR) + usage(argv[0]); break; - case 'l': noclose = 1; break; - - case 'i': - bind_irq = 1; - break; case 'p': require_privports = 0; break; - case 'N': - if (parse_size(&nal, optarg) != 0 || - nal < 0 || nal > NAL_MAX_NR) - usage(argv[0]); - break; - default: usage (argv[0]); break; } + } if (optind >= argc) usage (argv[0]); @@ -226,37 +151,6 @@ int main(int argc, char **argv) exit(1); } - if (nonagle) - { - o = 1; - rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)); - if (rc != 0) - { - perror ("Cannot disable nagle"); - exit (1); - } - } - - if (txmem != 0) - { - rc = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, sizeof (txmem)); - if (rc != 0) - { - perror ("Cannot set write buffer size"); - exit (1); - } - } - - if (rxmem != 0) - { - rc = setsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, sizeof (rxmem)); - if (rc != 0) - { - perror 
("Cannot set read buffer size"); - exit (1); - } - } - rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); if ( rc == -1 ) { perror("bind: "); @@ -291,12 +185,11 @@ int main(int argc, char **argv) int cfd; struct portal_ioctl_data data; struct portals_cfg pcfg; - int privileged = 0; - char addrstr[INET_ADDRSTRLEN]; #ifdef HAVE_LIBWRAP struct request_info request; #endif - + char addrstr[INET_ADDRSTRLEN]; + cfd = accept(fd, (struct sockaddr *)&clntaddr, &len); if ( cfd < 0 ) { perror("accept"); @@ -304,7 +197,6 @@ int main(int argc, char **argv) continue; } - rport = ntohs(clntaddr.sin_port); #ifdef HAVE_LIBWRAP /* libwrap access control */ request_init(&request, RQ_DAEMON, "lustre", RQ_FILE, cfd, 0); @@ -313,18 +205,20 @@ int main(int argc, char **argv) inet_ntop(AF_INET, &clntaddr.sin_addr, addrstr, INET_ADDRSTRLEN); syslog(LOG_WARNING, "Unauthorized access from %s:%hd\n", - addrstr, rport); + addrstr, ntohs(clntaddr.sin_port)); close (cfd); continue; } #endif - if (require_privports && rport >= IPPORT_RESERVED) { + if (require_privports && ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) { inet_ntop(AF_INET, &clntaddr.sin_addr, addrstr, INET_ADDRSTRLEN); - syslog(LOG_ERR, "Closing non-privileged connection from %s:%d\n", - addrstr, rport); - close(cfd); + syslog(LOG_ERR, "Closing non-privileged connection from %s:%d\n", + addrstr, ntohs(clntaddr.sin_port)); + rc = close(cfd); + if (rc) + perror ("close un-privileged client failed"); continue; } @@ -333,13 +227,12 @@ int main(int argc, char **argv) PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD); pcfg.pcfg_nal = nal; pcfg.pcfg_fd = cfd; - pcfg.pcfg_flags = bind_irq; pcfg.pcfg_misc = SOCKNAL_CONN_NONE; /* == incoming connection */ - + PORTAL_IOC_INIT(data); data.ioc_pbuf1 = (char*)&pcfg; data.ioc_plen1 = sizeof(pcfg); - + if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) { perror("ioctl failed"); } else { diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c index 538af44..afbf1cb 100644 --- 
a/lustre/portals/utils/debug.c +++ b/lustre/portals/utils/debug.c @@ -24,17 +24,20 @@ */ #define __USE_FILE_OFFSET64 +#define _GNU_SOURCE #include #include +#ifdef HAVE_NETDB_H #include +#endif #include #include +#include "ioctl.h" #include #include #include -#include #ifndef __CYGWIN__ # include #endif @@ -45,36 +48,47 @@ #include #include +#ifdef HAVE_LINUX_VERSION_H #include #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) #define BUG() /* workaround for module.h includes */ #include #endif +#endif /* !HAVE_LINUX_VERSION_H */ + +#include #include #include #include "parser.h" +#include + static char rawbuf[8192]; static char *buf = rawbuf; static int max = 8192; -//static int g_pfd = -1; +/*static int g_pfd = -1;*/ static int subsystem_mask = ~0; static int debug_mask = ~0; #define MAX_MARK_SIZE 100 static const char *portal_debug_subsystems[] = - {"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite", - "rpc", "mgmt", "portals", "socknal", "qswnal", "pinger", "filter", - "ptlbd", "echo", "ldlm", "lov", "gmnal", "router", "cobd", "ibnal", - NULL}; + {"undefined", "mdc", "mds", "osc", + "ost", "class", "log", "llite", + "rpc", "mgmt", "portals", "socknal", + "qswnal", "pinger", "filter", "ptlbd", + "echo", "ldlm", "lov", "gmnal", + "router", "cobd", "ibnal", "sm", + "asobd", "confobd", NULL}; static const char *portal_debug_masks[] = - {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", - "blocks", "net", "warning", "buffs", "other", "dentry", "portals", - "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", - "reada", NULL}; + {"trace", "inode", "super", "ext2", + "malloc", "cache", "info", "ioctl", + "blocks", "net", "warning", "buffs", + "other", "dentry", "portals", "page", + "dlmtrace", "error", "emerg", "ha", + "rpctrace", "vfstrace", "reada", NULL}; struct debug_daemon_cmd { char *cmd; @@ -183,9 +197,6 @@ static int applymask(char* procpath, int value) return 0; } -extern char *dump_filename; -extern int 
dump(int dev_id, int opc, void *buf); - static void applymask_all(unsigned int subs_mask, unsigned int debug_mask) { if (!dump_filename) { @@ -243,7 +254,7 @@ struct dbg_line { static void list_add_ordered(struct dbg_line *new, struct list_head *head) { struct list_head *pos; - struct dbg_line *curr, *next; + struct dbg_line *curr; list_for_each(pos, head) { curr = list_entry(pos, struct dbg_line, chain); @@ -289,7 +300,7 @@ static int parse_buffer(FILE *in, FILE *out) char buf[4097], *p; int rc; unsigned long dropped = 0, kept = 0; - struct list_head chunk_list, *pos; + struct list_head chunk_list; INIT_LIST_HEAD(&chunk_list); @@ -371,15 +382,24 @@ int jt_dbg_debug_kernel(int argc, char **argv) fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]); return 0; } - sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] : "/tmp/lustre-log", - time(NULL), getpid()); - if (argc > 2) + if (argc > 2) { raw = atoi(argv[2]); + } else if (argc > 1 && (argv[1][0] == '0' || argv[1][0] == '1')) { + raw = atoi(argv[1]); + argc--; + } else { + sprintf(filename, "%s.%lu.%u", argc > 1 ? 
argv[1] : + "/tmp/lustre-log", time(NULL), getpid()); + } + unlink(filename); fd = open("/proc/sys/portals/dump_kernel", O_WRONLY); if (fd < 0) { + if (errno == ENOENT) /* no dump file created */ + return 0; + fprintf(stderr, "open(dump_kernel) failed: %s\n", strerror(errno)); return 1; @@ -411,11 +431,15 @@ int jt_dbg_debug_kernel(int argc, char **argv) if (out == NULL) { fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], strerror(errno)); + fclose(in); return 1; } } rc = parse_buffer(in, out); + fclose(in); + if (argc > 1) + fclose(out); if (rc) { fprintf(stderr, "parse_buffer failed; leaving tmp file %s " "behind.\n", filename); @@ -431,23 +455,40 @@ int jt_dbg_debug_kernel(int argc, char **argv) int jt_dbg_debug_file(int argc, char **argv) { + int fdin,fdout; FILE *in, *out = stdout; if (argc > 3 || argc < 2) { fprintf(stderr, "usage: %s [output]\n", argv[0]); return 0; } - in = fopen(argv[1], "r"); + fdin = open(argv[1], O_RDONLY | O_LARGEFILE); + if (fdin == -1) { + fprintf(stderr, "open(%s) failed: %s\n", argv[1], + strerror(errno)); + return 1; + } + in = fdopen(fdin, "r"); if (in == NULL) { fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], strerror(errno)); + close(fdin); return 1; } if (argc > 2) { - out = fopen(argv[2], "w"); + fdout = open(argv[2], O_CREAT | O_WRONLY | O_LARGEFILE); + if (fdout == -1) { + fprintf(stderr, "open(%s) failed: %s\n", argv[2], + strerror(errno)); + fclose(in); + return 1; + } + out = fdopen(fdout, "w"); if (out == NULL) { fprintf(stderr, "fopen(%s) failed: %s\n", argv[2], strerror(errno)); + fclose(in); + close(fdout); return 1; } } @@ -489,7 +530,8 @@ int jt_dbg_debug_daemon(int argc, char **argv) strncat(size, argv[3], sizeof(size) - 6); rc = write(fd, size, strlen(size)); if (rc != strlen(size)) { - fprintf(stderr, "set %s failed: %s\n", size, strerror(errno)); + fprintf(stderr, "set %s failed: %s\n", + size, strerror(errno)); } } } @@ -590,7 +632,8 @@ int jt_dbg_mark_debug_buf(int argc, char **argv) static struct 
mod_paths { char *name, *path; } mod_paths[] = { - {"portals", "lustre/portals/libcfs"}, + {"libcfs", "lustre/portals/libcfs"}, + {"portals", "lustre/portals/portals"}, {"ksocknal", "lustre/portals/knals/socknal"}, {"kptlrouter", "lustre/portals/router"}, {"lvfs", "lustre/lvfs"}, @@ -603,6 +646,7 @@ static struct mod_paths { {"mds", "lustre/mds"}, {"mdc", "lustre/mdc"}, {"llite", "lustre/llite"}, + {"smfs", "lustre/smfs"}, {"obdecho", "lustre/obdecho"}, {"ldlm", "lustre/ldlm"}, {"obdfilter", "lustre/obdfilter"}, @@ -611,18 +655,22 @@ static struct mod_paths { {"fsfilt_ext3", "lustre/lvfs"}, {"fsfilt_extN", "lustre/lvfs"}, {"fsfilt_reiserfs", "lustre/lvfs"}, + {"fsfilt_smfs", "lustre/lvfs"}, + {"fsfilt_ldiskfs", "lustre/lvfs"}, {"mds_ext2", "lustre/mds"}, {"mds_ext3", "lustre/mds"}, {"mds_extN", "lustre/mds"}, {"ptlbd", "lustre/ptlbd"}, {"mgmt_svc", "lustre/mgmt"}, {"mgmt_cli", "lustre/mgmt"}, + {"conf_obd", "lustre/obdclass"}, {NULL, NULL} }; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -int jt_dbg_modules(int argc, char **argv) +static int jt_dbg_modules_2_4(int argc, char **argv) { +#ifdef HAVE_LINUX_VERSION_H +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct mod_paths *mp; char *path = ".."; char *kernel = "linux"; @@ -657,9 +705,12 @@ int jt_dbg_modules(int argc, char **argv) } return 0; +#endif /* Headers are 2.6-only */ +#endif /* !HAVE_LINUX_VERSION_H */ + return -EINVAL; } -#else -int jt_dbg_modules(int argc, char **argv) + +static int jt_dbg_modules_2_5(int argc, char **argv) { struct mod_paths *mp; char *path = ".."; @@ -699,7 +750,26 @@ int jt_dbg_modules(int argc, char **argv) return 0; } -#endif /* linux 2.5 */ + +int jt_dbg_modules(int argc, char **argv) +{ + int rc = 0; + struct utsname sysinfo; + + rc = uname(&sysinfo); + if (rc) { + printf("uname() failed: %s\n", strerror(errno)); + return 0; + } + + if (sysinfo.release[2] > '4') { + return jt_dbg_modules_2_5(argc, argv); + } else { + return jt_dbg_modules_2_4(argc, argv); + } + + 
return 0; +} int jt_dbg_panic(int argc, char **argv) { diff --git a/lustre/portals/utils/l_ioctl.c b/lustre/portals/utils/l_ioctl.c index 1adcc8e..0671c24 100644 --- a/lustre/portals/utils/l_ioctl.c +++ b/lustre/portals/utils/l_ioctl.c @@ -56,7 +56,7 @@ static struct ioc_dev ioc_dev_list[10]; struct dump_hdr { int magic; int dev_id; - int opc; + unsigned int opc; }; char *dump_filename; @@ -101,7 +101,7 @@ open_ioc_dev(int dev_id) static int -do_ioctl(int dev_id, int opc, void *buf) +do_ioctl(int dev_id, unsigned int opc, void *buf) { int fd, rc; @@ -131,7 +131,7 @@ get_dump_file() * used, but for now it will assumed whatever app reads the file will * know what to do. */ int -dump(int dev_id, int opc, void *buf) +dump(int dev_id, unsigned int opc, void *buf) { FILE *fp; struct dump_hdr dump_hdr; @@ -212,7 +212,7 @@ set_ioctl_dump(char * file) } int -l_ioctl(int dev_id, int opc, void *buf) +l_ioctl(int dev_id, unsigned int opc, void *buf) { return current_ioc_handler(dev_id, opc, buf); } @@ -226,7 +226,7 @@ l_ioctl(int dev_id, int opc, void *buf) * each device used in the dump. 
*/ int -parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)) +parse_dump(char * dump_file, ioc_handler_t ioc_func) { int line =0; struct stat st; diff --git a/lustre/portals/utils/parser.c b/lustre/portals/utils/parser.c index 82b4022..b91295b 100644 --- a/lustre/portals/utils/parser.c +++ b/lustre/portals/utils/parser.c @@ -642,68 +642,6 @@ int Parser_arg2int(const char *inp, long *result, int base) return 1; } -/* Convert human readable size string to and int; "1k" -> 1000 */ -int Parser_size (int *sizep, char *str) { - int size; - char mod[32]; - - switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) { - default: - return (-1); - - case 1: - *sizep = size; - return (0); - - case 2: - switch (*mod) { - case 'g': - case 'G': - *sizep = size << 30; - return (0); - - case 'm': - case 'M': - *sizep = size << 20; - return (0); - - case 'k': - case 'K': - *sizep = size << 10; - return (0); - - default: - *sizep = size; - return (0); - } - } -} - -/* Convert a string boolean to an int; "enable" -> 1 */ -int Parser_bool (int *b, char *str) { - if (!strcasecmp (str, "no") || - !strcasecmp (str, "n") || - !strcasecmp (str, "off") || - !strcasecmp (str, "down") || - !strcasecmp (str, "disable")) - { - *b = 0; - return (0); - } - - if (!strcasecmp (str, "yes") || - !strcasecmp (str, "y") || - !strcasecmp (str, "on") || - !strcasecmp (str, "up") || - !strcasecmp (str, "enable")) - { - *b = 1; - return (0); - } - - return (-1); -} - int Parser_quit(int argc, char **argv) { argc = argc; diff --git a/lustre/portals/utils/parser.h b/lustre/portals/utils/parser.h index 44e8f2a..9e7e95a 100644 --- a/lustre/portals/utils/parser.h +++ b/lustre/portals/utils/parser.h @@ -64,10 +64,4 @@ char *Parser_strarg(char *inp, const char *prompt, const char *deft, /* Extracts an integer from a string with a base */ int Parser_arg2int(const char *inp, long *result, int base); -/* Convert human readable size string to and int; "1k" -> 1000 */ -int Parser_size(int *sizep, char 
*str); - -/* Convert a string boolean to an int; "enable" -> 1 */ -int Parser_bool(int *b, char *str); - #endif diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c index 6025ee6..d5d29dc 100644 --- a/lustre/portals/utils/portals.c +++ b/lustre/portals/utils/portals.c @@ -22,13 +22,17 @@ #include #include +#ifdef HAVE_NETDB_H #include +#endif #include +#ifdef HAVE_NETINET_TCP_H #include -#include +#endif #include #include #include +#include "ioctl.h" #include #include #include @@ -40,15 +44,6 @@ #include -#warning assuming little endian - -#define __cpu_to_le64(x) ((__u64)(x)) -#define __le64_to_cpu(x) ((__u64)(x)) -#define __cpu_to_le32(x) ((__u32)(x)) -#define __le32_to_cpu(x) ((__u32)(x)) -#define __cpu_to_le16(x) ((__u16)(x)) -#define __le16_to_cpu(x) ((__u16)(x)) - #endif /* __CYGWIN__ */ #include @@ -60,14 +55,9 @@ unsigned int portal_debug; unsigned int portal_printk; -unsigned int portal_stack; static unsigned int g_nal = 0; -static int g_socket_txmem = 0; -static int g_socket_rxmem = 0; -static int g_socket_nonagle = 1; - typedef struct { char *name; @@ -79,13 +69,75 @@ static name2num_t nalnames[] = { {"tcp", SOCKNAL}, {"elan", QSWNAL}, {"gm", GMNAL}, - {"ib", IBNAL}, - {"scimac", SCIMACNAL}, + {"openib", OPENIBNAL}, + {"iib", IIBNAL}, {NULL, -1} }; static cfg_record_cb_t g_record_cb; +/* Convert a string boolean to an int; "enable" -> 1 */ +int ptl_parse_bool (int *b, char *str) { + if (!strcasecmp (str, "no") || + !strcasecmp (str, "n") || + !strcasecmp (str, "off") || + !strcasecmp (str, "down") || + !strcasecmp (str, "disable")) + { + *b = 0; + return (0); + } + + if (!strcasecmp (str, "yes") || + !strcasecmp (str, "y") || + !strcasecmp (str, "on") || + !strcasecmp (str, "up") || + !strcasecmp (str, "enable")) + { + *b = 1; + return (0); + } + + return (-1); +} + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int ptl_parse_size (int *sizep, char *str) { + int size; + char mod[32]; + + switch (sscanf (str, 
"%d%1[gGmMkK]", &size, mod)) { + default: + return (-1); + + case 1: + *sizep = size; + return (0); + + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return (0); + + case 'm': + case 'M': + *sizep = size << 20; + return (0); + + case 'k': + case 'K': + *sizep = size << 10; + return (0); + + default: + *sizep = size; + return (0); + } + } +} + int ptl_set_cfg_record_cb(cfg_record_cb_t cb) { @@ -158,6 +210,7 @@ nal2name (int nal) return ((e == NULL) ? "???" : e->name); } +#ifdef HAVE_GETHOSTBYNAME static struct hostent * ptl_gethostbyname(char * hname) { struct hostent *he; @@ -178,6 +231,7 @@ ptl_gethostbyname(char * hname) { } return he; } +#endif int ptl_parse_port (int *port, char *str) @@ -223,20 +277,13 @@ ptl_parse_time (time_t *t, char *str) } int -ptl_parse_ipaddr (__u32 *ipaddrp, char *str) +ptl_parse_ipquad (__u32 *ipaddrp, char *str) { - struct hostent *he; int a; int b; int c; int d; - if (!strcmp (str, "_all_")) - { - *ipaddrp = 0; - return (0); - } - if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 && (a & ~0xff) == 0 && (b & ~0xff) == 0 && (c & ~0xff) == 0 && (d & ~0xff) == 0) @@ -244,7 +291,27 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str) *ipaddrp = (a<<24)|(b<<16)|(c<<8)|d; return (0); } - + + return (-1); +} + +int +ptl_parse_ipaddr (__u32 *ipaddrp, char *str) +{ +#ifdef HAVE_GETHOSTBYNAME + struct hostent *he; +#endif + + if (!strcmp (str, "_all_")) + { + *ipaddrp = 0; + return (0); + } + + if (ptl_parse_ipquad(ipaddrp, str) == 0) + return (0); + +#if HAVE_GETHOSTBYNAME if ((('a' <= str[0] && str[0] <= 'z') || ('A' <= str[0] && str[0] <= 'Z')) && (he = ptl_gethostbyname (str)) != NULL) @@ -254,21 +321,28 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str) *ipaddrp = ntohl(addr); /* HOST byte order */ return (0); } +#endif return (-1); } char * -ptl_ipaddr_2_str (__u32 ipaddr, char *str) +ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup) { +#ifdef HAVE_GETHOSTBYNAME __u32 net_ip; struct hostent *he; - - net_ip = 
htonl (ipaddr); - he = gethostbyaddr (&net_ip, sizeof (net_ip), AF_INET); - if (he != NULL) - return (he->h_name); - + + if (lookup) { + net_ip = htonl (ipaddr); + he = gethostbyaddr (&net_ip, sizeof (net_ip), AF_INET); + if (he != NULL) { + strcpy(str, he->h_name); + return (str); + } + } +#endif + sprintf (str, "%d.%d.%d.%d", (ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff, (ipaddr >> 8) & 0xff, ipaddr & 0xff); @@ -302,22 +376,42 @@ ptl_parse_nid (ptl_nid_t *nidp, char *str) return (-1); } +__u64 ptl_nid2u64(ptl_nid_t nid) +{ + switch (sizeof (nid)) { + case 8: + return (nid); + case 4: + return ((__u32)nid); + default: + fprintf(stderr, "Unexpected sizeof(ptl_nid_t) == %u\n", sizeof(nid)); + abort(); + /* notreached */ + return (-1); + } +} + char * ptl_nid2str (char *buffer, ptl_nid_t nid) { - struct hostent *he = NULL; + __u64 nid64 = ptl_nid2u64(nid); +#ifdef HAVE_GETHOSTBYNAME + struct hostent *he = 0; /* Don't try to resolve NIDs that are e.g. Elan host IDs. Assume * TCP addresses in the 0.x.x.x subnet are not in use. This can * happen on routers and slows things down a _lot_. Bug 3442. 
*/ if (nid & 0xff000000) { __u32 addr = htonl((__u32)nid); /* back to NETWORK byte order */ - he = gethostbyaddr((const char *)&addr, sizeof(addr), AF_INET); + + he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET); } + if (he != NULL) - sprintf(buffer, "%#x:%s", (int)(nid >> 32), he->h_name); + sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name); else - sprintf(buffer, LPX64, nid); +#endif /* HAVE_GETHOSTBYNAME */ + sprintf(buffer, LPX64, nid64); return (buffer); } @@ -441,11 +535,11 @@ int jt_ptl_network(int argc, char **argv) return (-1); } -int -jt_ptl_print_autoconnects (int argc, char **argv) +int +jt_ptl_print_interfaces (int argc, char **argv) { - struct portals_cfg pcfg; - char buffer[64]; + struct portals_cfg pcfg; + char buffer[3][64]; int index; int rc; @@ -453,99 +547,193 @@ jt_ptl_print_autoconnects (int argc, char **argv) return -1; for (index = 0;;index++) { - PCFG_INIT (pcfg, NAL_CMD_GET_AUTOCONN); - pcfg.pcfg_count = index; + PCFG_INIT (pcfg, NAL_CMD_GET_INTERFACE); + pcfg.pcfg_count = index; rc = pcfg_ioctl (&pcfg); if (rc != 0) break; - printf (LPX64"@%s:%d #%d buffer %d " - "nonagle %s affinity %s eager %s share %d\n", - pcfg.pcfg_nid, ptl_ipaddr_2_str (pcfg.pcfg_id, buffer), - pcfg.pcfg_misc, pcfg.pcfg_count, pcfg.pcfg_size, - (pcfg.pcfg_flags & 1) ? "on" : "off", - (pcfg.pcfg_flags & 2) ? "on" : "off", - (pcfg.pcfg_flags & 4) ? 
"on" : "off", - pcfg.pcfg_wait); + printf ("%s: (%s/%s) npeer %d nroute %d\n", + ptl_ipaddr_2_str(pcfg.pcfg_id, buffer[2], 1), + ptl_ipaddr_2_str(pcfg.pcfg_id, buffer[0], 0), + ptl_ipaddr_2_str(pcfg.pcfg_misc, buffer[1], 0), + pcfg.pcfg_fd, pcfg.pcfg_count); } if (index == 0) - printf ("\n"); + printf ("\n"); return 0; } -int -jt_ptl_add_autoconnect (int argc, char **argv) +int +jt_ptl_add_interface (int argc, char **argv) { - struct portals_cfg pcfg; - ptl_nid_t nid; - __u32 ip; - int port; - int irq_affinity = 0; - int share = 0; - int eager = 0; + struct portals_cfg pcfg; + __u32 ipaddr; int rc; + __u32 netmask = 0xffffff00; + int i; + int count; + char *end; - if (argc < 4 || argc > 5) { - fprintf (stderr, "usage: %s nid ipaddr port [ise]\n", argv[0]); + if (argc < 2 || argc > 3) { + fprintf (stderr, "usage: %s ipaddr [netmask]\n", argv[0]); return 0; } - if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) + if (!g_nal_is_compatible(argv[0], SOCKNAL, 0)) return -1; - if (ptl_parse_nid (&nid, argv[1]) != 0 || - nid == PTL_NID_ANY) { - fprintf (stderr, "Can't parse NID: %s\n", argv[1]); + if (ptl_parse_ipaddr(&ipaddr, argv[1]) != 0) { + fprintf (stderr, "Can't parse ip: %s\n", argv[1]); return -1; } - if (ptl_parse_ipaddr (&ip, argv[2]) != 0) { - fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]); + if (argc > 2 ) { + count = strtol(argv[2], &end, 0); + if (count > 0 && count < 32 && *end == 0) { + netmask = 0; + for (i = count; i > 0; i--) + netmask = netmask|(1<<(32-i)); + } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) { + fprintf (stderr, "Can't parse netmask: %s\n", argv[2]); + return -1; + } + } + + PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE); + pcfg.pcfg_id = ipaddr; + pcfg.pcfg_misc = netmask; + + rc = pcfg_ioctl (&pcfg); + if (rc != 0) { + fprintf (stderr, "failed to add interface: %s\n", + strerror (errno)); return -1; } - if (ptl_parse_port (&port, argv[3]) != 0) { - fprintf (stderr, "Can't parse port: %s\n", argv[3]); + return 0; +} + +int 
+jt_ptl_del_interface (int argc, char **argv) +{ + struct portals_cfg pcfg; + int rc; + __u32 ipaddr = 0; + + if (argc > 2) { + fprintf (stderr, "usage: %s [ipaddr]\n", argv[0]); + return 0; + } + + if (!g_nal_is_compatible(argv[0], SOCKNAL, 0)) + return -1; + + if (argc == 2 && + ptl_parse_ipaddr(&ipaddr, argv[1]) != 0) { + fprintf (stderr, "Can't parse ip: %s\n", argv[1]); return -1; } + + PCFG_INIT(pcfg, NAL_CMD_DEL_INTERFACE); + pcfg.pcfg_id = ipaddr; - if (argc > 4) { - char *opts = argv[4]; - - while (*opts != 0) - switch (*opts++) { - case 'i': - irq_affinity = 1; - break; - case 's': - share = 1; - break; - case 'e': - eager = 1; - break; - default: - fprintf (stderr, "Can't parse options: %s\n", - argv[4]); - return -1; - } + rc = pcfg_ioctl (&pcfg); + if (rc != 0) { + fprintf (stderr, "failed to delete interface: %s\n", + strerror (errno)); + return -1; } - PCFG_INIT(pcfg, NAL_CMD_ADD_AUTOCONN); + return 0; +} + +int +jt_ptl_print_peers (int argc, char **argv) +{ + struct portals_cfg pcfg; + char buffer[2][64]; + int index; + int rc; + + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) + return -1; + + for (index = 0;;index++) { + PCFG_INIT (pcfg, NAL_CMD_GET_PEER); + pcfg.pcfg_count = index; + + rc = pcfg_ioctl (&pcfg); + if (rc != 0) + break; + + if (g_nal_is_compatible(NULL, SOCKNAL, 0)) + printf (LPX64"[%d]%s@%s:%d #%d\n", + pcfg.pcfg_nid, pcfg.pcfg_wait, + ptl_ipaddr_2_str (pcfg.pcfg_size, buffer[0], 1), + ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), + pcfg.pcfg_misc, pcfg.pcfg_count); + else + printf (LPX64"[%d]\n", + pcfg.pcfg_nid, pcfg.pcfg_wait); + } + + if (index == 0) + printf ("\n"); + return 0; +} + +int +jt_ptl_add_peer (int argc, char **argv) +{ + struct portals_cfg pcfg; + ptl_nid_t nid; + __u32 ip = 0; + int port = 0; + int rc; + + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) + return -1; + + if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { + if (argc != 4) { + fprintf (stderr, "usage(tcp): %s nid 
ipaddr port\n", + argv[0]); + return 0; + } + } else if (argc != 2) { + fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]); + return 0; + } + + if (ptl_parse_nid (&nid, argv[1]) != 0 || + nid == PTL_NID_ANY) { + fprintf (stderr, "Can't parse NID: %s\n", argv[1]); + return -1; + } + + if (g_nal_is_compatible (NULL, SOCKNAL, 0)) { + if (ptl_parse_ipaddr (&ip, argv[2]) != 0) { + fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]); + return -1; + } + + if (ptl_parse_port (&port, argv[3]) != 0) { + fprintf (stderr, "Can't parse port: %s\n", argv[3]); + return -1; + } + } + + PCFG_INIT(pcfg, NAL_CMD_ADD_PEER); pcfg.pcfg_nid = nid; pcfg.pcfg_id = ip; pcfg.pcfg_misc = port; - /* only passing one buffer size! */ - pcfg.pcfg_size = MAX (g_socket_rxmem, g_socket_txmem); - pcfg.pcfg_flags = (g_socket_nonagle ? 0x01 : 0) | - (irq_affinity ? 0x02 : 0) | - (share ? 0x04 : 0) | - (eager ? 0x08 : 0); rc = pcfg_ioctl (&pcfg); if (rc != 0) { - fprintf (stderr, "failed to enable autoconnect: %s\n", + fprintf (stderr, "failed to add peer: %s\n", strerror (errno)); return -1; } @@ -554,63 +742,63 @@ jt_ptl_add_autoconnect (int argc, char **argv) } int -jt_ptl_del_autoconnect (int argc, char **argv) +jt_ptl_del_peer (int argc, char **argv) { struct portals_cfg pcfg; ptl_nid_t nid = PTL_NID_ANY; - __u32 ip = 0; - int share = 0; - int keep_conn = 0; + __u32 ip = 0; + int single_share = 0; + int argidx; int rc; - if (argc > 4) { - fprintf (stderr, "usage: %s [nid] [ipaddr] [sk]\n", - argv[0]); - return 0; - } - - if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) return -1; + if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { + if (argc > 4) { + fprintf (stderr, "usage: %s [nid] [ipaddr] [single_share]\n", + argv[0]); + return 0; + } + } else if (argc > 3) { + fprintf (stderr, "usage: %s [nid] [single_share]\n", argv[0]); + return 0; + } + if (argc > 1 && ptl_parse_nid (&nid, argv[1]) != 0) { fprintf (stderr, "Can't parse 
nid: %s\n", argv[1]); return -1; } - if (argc > 2 && - ptl_parse_ipaddr (&ip, argv[2]) != 0) { - fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]); - return -1; + argidx = 2; + if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { + if (argc > argidx && + ptl_parse_ipaddr (&ip, argv[argidx]) != 0) { + fprintf (stderr, "Can't parse ip addr: %s\n", + argv[argidx]); + return -1; + } + argidx++; } - - if (argc > 3) { - char *opts = argv[3]; - - while (*opts != 0) - switch (*opts++) { - case 's': - share = 1; - break; - case 'k': - keep_conn = 1; - break; - default: - fprintf (stderr, "Can't parse flags: %s\n", - argv[3]); - return -1; - } + + if (argc > argidx) { + if (!strcmp (argv[argidx], "single_share")) { + single_share = 1; + } else { + fprintf (stderr, "Unrecognised arg %s'\n", argv[3]); + return -1; + } } - PCFG_INIT(pcfg, NAL_CMD_DEL_AUTOCONN); - pcfg.pcfg_nid = nid; - pcfg.pcfg_id = ip; - pcfg.pcfg_flags = (share ? 1 : 0) | - (keep_conn ? 2 : 0); + PCFG_INIT(pcfg, NAL_CMD_DEL_PEER); + pcfg.pcfg_nid = nid; + pcfg.pcfg_id = ip; + pcfg.pcfg_flags = single_share; rc = pcfg_ioctl (&pcfg); if (rc != 0) { - fprintf (stderr, "failed to remove autoconnect route: %s\n", + fprintf (stderr, "failed to remove peer: %s\n", strerror (errno)); return -1; } @@ -622,11 +810,11 @@ int jt_ptl_print_connections (int argc, char **argv) { struct portals_cfg pcfg; - char buffer[64]; + char buffer[2][64]; int index; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) return -1; for (index = 0;;index++) { @@ -637,14 +825,23 @@ jt_ptl_print_connections (int argc, char **argv) if (rc != 0) break; - printf (LPX64"@%s:%d:%s\n", - pcfg.pcfg_nid, - ptl_ipaddr_2_str (pcfg.pcfg_id, buffer), - pcfg.pcfg_misc, - (pcfg.pcfg_flags == SOCKNAL_CONN_ANY) ? "A" : - (pcfg.pcfg_flags == SOCKNAL_CONN_CONTROL) ? "C" : - (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_IN) ? "I" : - (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_OUT) ? 
"O" : "?"); + if (g_nal_is_compatible (NULL, SOCKNAL, 0)) + printf ("[%d]%s:"LPX64"@%s:%d:%s %d/%d %s\n", + pcfg.pcfg_gw_nal, /* scheduler */ + ptl_ipaddr_2_str (pcfg.pcfg_fd, buffer[0], 1), /* local IP addr */ + pcfg.pcfg_nid, + ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), /* remote IP addr */ + pcfg.pcfg_misc, /* remote port */ + (pcfg.pcfg_flags == SOCKNAL_CONN_ANY) ? "A" : + (pcfg.pcfg_flags == SOCKNAL_CONN_CONTROL) ? "C" : + (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_IN) ? "I" : + (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_OUT) ? "O" : "?", + pcfg.pcfg_count, /* tx buffer size */ + pcfg.pcfg_size, /* rx buffer size */ + pcfg.pcfg_wait ? "nagle" : "nonagle"); + else + printf (LPX64"\n", + pcfg.pcfg_nid); } if (index == 0) @@ -654,23 +851,22 @@ jt_ptl_print_connections (int argc, char **argv) int jt_ptl_connect(int argc, char **argv) { +#ifndef HAVE_CONNECT + /* no connect() support */ + return -1; +#else /* HAVE_CONNECT */ struct portals_cfg pcfg; struct sockaddr_in srvaddr; struct sockaddr_in locaddr; __u32 ipaddr; char *flag; int fd, rc; - int nonagle = 0; - int rxmem = 0; - int txmem = 0; - int bind_irq = 0; int type = SOCKNAL_CONN_ANY; int port, rport; int o; - int olen; if (argc < 3) { - fprintf(stderr, "usage: %s ip port [xibctr]\n", argv[0]); + fprintf(stderr, "usage: %s ip port [type]\n", argv[0]); return 0; } @@ -692,10 +888,6 @@ int jt_ptl_connect(int argc, char **argv) for (flag = argv[3]; *flag != 0; flag++) switch (*flag) { - case 'i': - bind_irq = 1; - break; - case 'I': if (type != SOCKNAL_CONN_ANY) { fprintf(stderr, "Can't flag type twice\n"); @@ -726,8 +918,8 @@ int jt_ptl_connect(int argc, char **argv) return (-1); } - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; locaddr.sin_addr.s_addr = INADDR_ANY; memset(&srvaddr, 0, sizeof(srvaddr)); @@ -735,6 +927,7 @@ int jt_ptl_connect(int argc, char **argv) srvaddr.sin_port = htons(port); srvaddr.sin_addr.s_addr = 
htonl(ipaddr); + for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { fd = socket(PF_INET, SOCK_STREAM, 0); if ( fd < 0 ) { @@ -745,35 +938,6 @@ int jt_ptl_connect(int argc, char **argv) o = 1; rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o)); - - if (g_socket_nonagle) { - o = 1; - rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)); - if (rc != 0) { - fprintf(stderr, "cannot disable nagle: %s\n", - strerror(errno)); - return (-1); - } - } - - if (g_socket_rxmem != 0) { - o = g_socket_rxmem; - rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)); - if (rc != 0) { - fprintf(stderr, "cannot set receive buffer size: %s\n", - strerror(errno)); - return (-1); - } - } - - if (g_socket_txmem != 0) { - o = g_socket_txmem; - rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)); - if (rc != 0) { - fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno)); - return (-1); - } - } locaddr.sin_port = htons(rport); rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); @@ -799,18 +963,8 @@ int jt_ptl_connect(int argc, char **argv) return -1; } - olen = sizeof (txmem); - if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0) - fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno)); - olen = sizeof (rxmem); - if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0) - fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno)); - olen = sizeof (nonagle); - if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0) - fprintf (stderr, "Can't get nagle: %s\n", strerror (errno)); - - printf("Connected host: %s snd: %d rcv: %d nagle: %s type: %s\n", - argv[1], txmem, rxmem, nonagle ? "Disabled" : "Enabled", + printf("Connected host: %s type: %s\n", + argv[1], (type == SOCKNAL_CONN_ANY) ? "A" : (type == SOCKNAL_CONN_CONTROL) ? "C" : (type == SOCKNAL_CONN_BULK_IN) ? 
"I" : @@ -819,7 +973,6 @@ int jt_ptl_connect(int argc, char **argv) PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD); pcfg.pcfg_nal = g_nal; pcfg.pcfg_fd = fd; - pcfg.pcfg_flags = bind_irq; pcfg.pcfg_misc = type; rc = pcfg_ioctl(&pcfg); @@ -837,11 +990,12 @@ int jt_ptl_connect(int argc, char **argv) fprintf(stderr, "close failed: %d\n", rc); return 0; +#endif /* HAVE_CONNECT */ } int jt_ptl_disconnect(int argc, char **argv) { - struct portals_cfg pcfg; + struct portals_cfg pcfg; ptl_nid_t nid = PTL_NID_ANY; __u32 ipaddr = 0; int rc; @@ -851,7 +1005,7 @@ int jt_ptl_disconnect(int argc, char **argv) return 0; } - if (!g_nal_is_compatible (NULL, SOCKNAL, 0)) + if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, 0)) return 0; if (argc >= 2 && @@ -860,7 +1014,8 @@ int jt_ptl_disconnect(int argc, char **argv) return -1; } - if (argc >= 3 && + if (g_nal_is_compatible (NULL, SOCKNAL, 0) && + argc >= 3 && ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) { fprintf (stderr, "Can't parse ip addr %s\n", argv[2]); return -1; @@ -882,7 +1037,7 @@ int jt_ptl_disconnect(int argc, char **argv) int jt_ptl_push_connection (int argc, char **argv) { - struct portals_cfg pcfg; + struct portals_cfg pcfg; int rc; ptl_nid_t nid = PTL_NID_ANY; __u32 ipaddr = 0; @@ -923,7 +1078,7 @@ int jt_ptl_push_connection (int argc, char **argv) int jt_ptl_print_active_txs (int argc, char **argv) { - struct portals_cfg pcfg; + struct portals_cfg pcfg; int index; int rc; @@ -1045,7 +1200,7 @@ int jt_ptl_mynid(int argc, char **argv) char *nidstr; struct portals_cfg pcfg; ptl_nid_t mynid; - + if (argc > 2) { fprintf(stderr, "usage: %s [NID]\n", argv[0]); fprintf(stderr, "NID defaults to the primary IP address of the machine.\n"); @@ -1079,7 +1234,8 @@ int jt_ptl_mynid(int argc, char **argv) fprintf(stderr, "setting my NID failed: %s\n", strerror(errno)); else - printf("registered my nid "LPX64" (%s)\n", mynid, hostname); + printf("registered my nid "LPX64" (%s)\n", + ptl_nid2u64(mynid), hostname); return 0; } @@ 
-1131,61 +1287,6 @@ jt_ptl_fail_nid (int argc, char **argv) } int -jt_ptl_rxmem (int argc, char **argv) -{ - int size; - - if (argc > 1) - { - if (Parser_size (&size, argv[1]) != 0 || size < 0) - { - fprintf (stderr, "Can't parse size %s\n", argv[1]); - return (0); - } - - g_socket_rxmem = size; - } - printf ("Socket rmem = %d\n", g_socket_rxmem); - return (0); -} - -int -jt_ptl_txmem (int argc, char **argv) -{ - int size; - - if (argc > 1) - { - if (Parser_size (&size, argv[1]) != 0 || size < 0) - { - fprintf (stderr, "Can't parse size %s\n", argv[1]); - return (0); - } - g_socket_txmem = size; - } - printf ("Socket txmem = %d\n", g_socket_txmem); - return (0); -} - -int -jt_ptl_nagle (int argc, char **argv) -{ - int enable; - - if (argc > 1) - { - if (Parser_bool (&enable, argv[1]) != 0) - { - fprintf (stderr, "Can't parse boolean %s\n", argv[1]); - return (-1); - } - g_socket_nonagle = !enable; - } - printf ("Nagle %s\n", g_socket_nonagle ? "disabled" : "enabled"); - return (0); -} - -int jt_ptl_add_route (int argc, char **argv) { struct portals_cfg pcfg; @@ -1297,7 +1398,8 @@ jt_ptl_del_route (int argc, char **argv) rc = pcfg_ioctl(&pcfg); if (rc != 0) { - fprintf (stderr, "NAL_CMD_DEL_ROUTE ("LPX64") failed: %s\n", nid, strerror (errno)); + fprintf (stderr, "NAL_CMD_DEL_ROUTE ("LPX64") failed: %s\n", + ptl_nid2u64(nid), strerror (errno)); return (-1); } @@ -1327,7 +1429,7 @@ jt_ptl_notify_router (int argc, char **argv) return (-1); } - if (Parser_bool (&enable, argv[2]) != 0) { + if (ptl_parse_bool (&enable, argv[2]) != 0) { fprintf (stderr, "Can't parse boolean %s\n", argv[2]); return (-1); } @@ -1359,7 +1461,7 @@ jt_ptl_notify_router (int argc, char **argv) if (rc != 0) { fprintf (stderr, "NAL_CMD_NOTIFY_ROUTER ("LPX64") failed: %s\n", - nid, strerror (errno)); + ptl_nid2u64(nid), strerror (errno)); return (-1); } @@ -1442,9 +1544,19 @@ lwt_snapshot(cycles_t *now, int *ncpu, int *totalsize, return (-1); } + /* crappy overloads */ + if (data.ioc_nid2 != 
sizeof(lwt_event_t) || + data.ioc_nid3 != offsetof(lwt_event_t, lwte_where)) { + fprintf(stderr,"kernel/user LWT event mismatch %d(%d),%d(%d)\n", + (int)data.ioc_nid2, sizeof(lwt_event_t), + (int)data.ioc_nid3, + (int)offsetof(lwt_event_t, lwte_where)); + return (-1); + } + LASSERT (data.ioc_count != 0); LASSERT (data.ioc_misc != 0); - + if (now != NULL) *now = data.ioc_nid; @@ -1515,15 +1627,21 @@ lwt_put_string(char *ustr) static int lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e) { - char whenstr[32]; +#ifndef __WORDSIZE +# error "__WORDSIZE not defined" +#elif __WORDSIZE == 32 +# define XFMT "%#010lx" +#elif __WORDSIZE== 64 +# define XFMT "%#018lx" +#else +# error "Unexpected __WORDSIZE" +#endif char *where = lwt_get_string(e->lwte_where); if (where == NULL) return (-1); - sprintf(whenstr, LPD64, e->lwte_when - t0); - - fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n", + fprintf(f, XFMT" "XFMT" "XFMT" "XFMT": "XFMT" %2d %10.6f %10.2f %s\n", e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4, (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0), (t0 == e->lwte_when) ? 
0.0 : (e->lwte_when - tlast) / mhz, @@ -1532,6 +1650,7 @@ lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t lwt_put_string(where); return (0); +#undef XFMT } double @@ -1557,13 +1676,14 @@ get_cycles_per_usec () int jt_ptl_lwt(int argc, char **argv) { + const int lwt_max_cpus = 32; int ncpus; int totalspace; int nevents_per_cpu; lwt_event_t *events; - lwt_event_t *cpu_event[LWT_MAX_CPUS + 1]; - lwt_event_t *next_event[LWT_MAX_CPUS]; - lwt_event_t *first_event[LWT_MAX_CPUS]; + lwt_event_t *cpu_event[lwt_max_cpus + 1]; + lwt_event_t *next_event[lwt_max_cpus]; + lwt_event_t *first_event[lwt_max_cpus]; int cpu; lwt_event_t *e; int rc; @@ -1574,6 +1694,7 @@ jt_ptl_lwt(int argc, char **argv) cycles_t tnow; struct timeval tvnow; int printed_date = 0; + int nlines = 0; FILE *f = stdout; if (argc < 2 || @@ -1604,9 +1725,9 @@ jt_ptl_lwt(int argc, char **argv) if (lwt_snapshot(NULL, &ncpus, &totalspace, NULL, 0) != 0) return (-1); - if (ncpus > LWT_MAX_CPUS) { + if (ncpus > lwt_max_cpus) { fprintf(stderr, "Too many cpus: %d (%d)\n", - ncpus, LWT_MAX_CPUS); + ncpus, lwt_max_cpus); return (-1); } @@ -1723,6 +1844,12 @@ jt_ptl_lwt(int argc, char **argv) rc = lwt_print(f, t0, tlast, mhz, cpu, next_event[cpu]); if (rc != 0) break; + + if (++nlines % 10000 == 0 && f != stdout) { + /* show some activity... 
*/ + printf("."); + fflush (stdout); + } } tlast = next_event[cpu]->lwte_when; @@ -1736,8 +1863,10 @@ jt_ptl_lwt(int argc, char **argv) next_event[cpu] = NULL; } - if (f != stdout) + if (f != stdout) { + printf("\n"); fclose(f); + } free(events); return (0); diff --git a/lustre/portals/utils/ptlctl.c b/lustre/portals/utils/ptlctl.c index c65ecb2..03cfe77 100644 --- a/lustre/portals/utils/ptlctl.c +++ b/lustre/portals/utils/ptlctl.c @@ -30,9 +30,12 @@ command_t list[] = { {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"}, - {"print_autoconns", jt_ptl_print_autoconnects, 0, "print autoconnect entries (no args)"}, - {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ise])"}, - {"del_autoconn", jt_ptl_del_autoconnect, 0, "delete autoconnect entry (args: [nid] [host] [ks])"}, + {"print_interfaces", jt_ptl_print_interfaces, 0, "print interface entries (no args)"}, + {"add_interface", jt_ptl_add_interface, 0, "add interface entry (args: ip [netmask])"}, + {"del_interface", jt_ptl_del_interface, 0, "delete interface entries (args: [ip])"}, + {"print_peers", jt_ptl_print_peers, 0, "print peer entries (no args)"}, + {"add_peer", jt_ptl_add_peer, 0, "add peer entry (args: nid host port)"}, + {"del_peer", jt_ptl_del_peer, 0, "delete peer entry (args: [nid] [host])"}, {"print_conns", jt_ptl_print_connections, 0, "print connections (no args)"}, {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: host port [iIOC])"}, {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [nid] [host]"}, @@ -48,9 +51,6 @@ command_t list[] = { {"set_route", jt_ptl_notify_router, 0, "enable/disable a route in the routing table (args: gatewayNID up/down [time]"}, {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"}, - {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"}, - {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"}, - {"nagle", 
jt_ptl_nagle, 0, "Enable/Disable Nagle (args: [on/off])"}, {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"}, {"help", Parser_help, 0, "help"}, diff --git a/lustre/portals/utils/wirecheck.c b/lustre/portals/utils/wirecheck.c index a73a5217..6316290 100644 --- a/lustre/portals/utils/wirecheck.c +++ b/lustre/portals/utils/wirecheck.c @@ -34,7 +34,7 @@ do { \ #define CHECK_MEMBER_OFFSET(s,m) \ do { \ - CHECK_VALUE(offsetof(s, m)); \ + CHECK_VALUE((int)offsetof(s, m)); \ } while (0) #define CHECK_MEMBER_SIZEOF(s,m) \ diff --git a/lustre/ptlrpc/Makefile.in b/lustre/ptlrpc/Makefile.in index abc403b..946aa0c 100644 --- a/lustre/ptlrpc/Makefile.in +++ b/lustre/ptlrpc/Makefile.in @@ -12,7 +12,7 @@ ldlm_objs += $(LDLM)ldlm_flock.o ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o recov_thread.o ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o -ptlrpc_objs += lproc_ptlrpc.o +ptlrpc_objs += pers.o lproc_ptlrpc.o ptlrpc-objs := $(ldlm_objs) $(ptlrpc_objs) default: all diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 8321e73..b82c5ce 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -92,9 +92,9 @@ static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal spin_lock_init(&desc->bd_lock); init_waitqueue_head(&desc->bd_waitq); - desc->bd_max_pages = npages; - desc->bd_page_count = 0; - desc->bd_md_h = PTL_HANDLE_NONE; + desc->bd_max_iov = npages; + desc->bd_iov_count = 0; + desc->bd_md_h = PTL_INVALID_HANDLE; desc->bd_portal = portal; desc->bd_type = type; @@ -152,27 +152,15 @@ struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp (struct ptlrpc_request *req, void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, int pageoffset, int len) { -#ifdef __KERNEL__ - ptl_kiov_t *kiov = &desc->bd_iov[desc->bd_page_count]; -#else - 
struct iovec *iov = &desc->bd_iov[desc->bd_page_count]; -#endif - LASSERT(desc->bd_page_count < desc->bd_max_pages); + LASSERT(desc->bd_iov_count < desc->bd_max_iov); LASSERT(page != NULL); LASSERT(pageoffset >= 0); LASSERT(len > 0); LASSERT(pageoffset + len <= PAGE_SIZE); -#ifdef __KERNEL__ - kiov->kiov_page = page; - kiov->kiov_offset = pageoffset; - kiov->kiov_len = len; -#else - iov->iov_base = page->addr + pageoffset; - iov->iov_len = len; -#endif - desc->bd_page_count++; desc->bd_nob += len; + + ptlrpc_add_bulk_page(desc, page, pageoffset, len); } void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) @@ -180,7 +168,7 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) ENTRY; LASSERT(desc != NULL); - LASSERT(desc->bd_page_count != LI_POISON); /* not freed already */ + LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ LASSERT(!desc->bd_network_rw); /* network hands off or */ LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); if (desc->bd_export) @@ -188,8 +176,8 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) else class_import_put(desc->bd_import); - OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, - bd_iov[desc->bd_max_pages])); + OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, + bd_iov[desc->bd_max_iov])); EXIT; } @@ -535,6 +523,7 @@ static int after_reply(struct ptlrpc_request *req) static int ptlrpc_send_new_req(struct ptlrpc_request *req) { + char str[PTL_NALFMT_SIZE]; struct obd_import *imp; unsigned long flags; int rc; @@ -579,11 +568,11 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) req->rq_reqmsg->status = current->pid; CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:ni:nid:opc" - " %s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm, + " %s:%s:%d:"LPU64":%s:%s:%d\n", current->comm, imp->imp_obd->obd_uuid.uuid, req->rq_reqmsg->status, req->rq_xid, imp->imp_connection->c_peer.peer_ni->pni_name, - imp->imp_connection->c_peer.peer_nid, + ptlrpc_peernid2str(&imp->imp_connection->c_peer, str), 
req->rq_reqmsg->opc); rc = ptl_send_rpc(req); @@ -597,6 +586,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) int ptlrpc_check_set(struct ptlrpc_request_set *set) { + char str[PTL_NALFMT_SIZE]; unsigned long flags; struct list_head *tmp; int force_timer_recalc = 0; @@ -797,11 +787,11 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) } CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:ni:nid:" - "opc %s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm, + "opc %s:%s:%d:"LPU64":%s:%s:%d\n", current->comm, imp->imp_obd->obd_uuid.uuid, req->rq_reqmsg->status, req->rq_xid, imp->imp_connection->c_peer.peer_ni->pni_name, - imp->imp_connection->c_peer.peer_nid, + ptlrpc_peernid2str(&imp->imp_connection->c_peer, str), req->rq_reqmsg->opc); set->set_remaining--; @@ -1123,13 +1113,10 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request) if (!ptlrpc_client_receiving_reply(request)) return; - rc = PtlMDUnlink (request->rq_reply_md_h); - if (rc == PTL_INV_MD) { - LASSERT (!ptlrpc_client_receiving_reply(request)); - return; - } - - LASSERT (rc == PTL_OK); + PtlMDUnlink (request->rq_reply_md_h); + + /* We have to l_wait_event() whatever the result, to give liblustre + * a chance to run reply_in_callback() */ if (request->rq_set != NULL) wq = &request->rq_set->set_waitq; @@ -1320,6 +1307,7 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, int ptlrpc_queue_wait(struct ptlrpc_request *req) { + char str[PTL_NALFMT_SIZE]; int rc = 0; int brc; struct l_wait_info lwi; @@ -1336,11 +1324,11 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) req->rq_reqmsg->status = current->pid; LASSERT(imp->imp_obd != NULL); CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:ni:nid:opc " - "%s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm, + "%s:%s:%d:"LPU64":%s:%s:%d\n", current->comm, imp->imp_obd->obd_uuid.uuid, req->rq_reqmsg->status, req->rq_xid, imp->imp_connection->c_peer.peer_ni->pni_name, - imp->imp_connection->c_peer.peer_nid, + 
ptlrpc_peernid2str(&imp->imp_connection->c_peer, str), req->rq_reqmsg->opc); /* Mark phase here for a little debug help */ @@ -1423,11 +1411,11 @@ restart: DEBUG_REQ(D_NET, req, "-- done sleeping"); CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:ni:nid:opc " - "%s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm, + "%s:%s:%d:"LPU64":%s:%s:%d\n", current->comm, imp->imp_obd->obd_uuid.uuid, req->rq_reqmsg->status, req->rq_xid, imp->imp_connection->c_peer.peer_ni->pni_name, - imp->imp_connection->c_peer.peer_nid, + ptlrpc_peernid2str(&imp->imp_connection->c_peer, str), req->rq_reqmsg->opc); spin_lock_irqsave(&imp->imp_lock, flags); diff --git a/lustre/ptlrpc/connection.c b/lustre/ptlrpc/connection.c index c6a4163..c2c5288 100644 --- a/lustre/ptlrpc/connection.c +++ b/lustre/ptlrpc/connection.c @@ -37,15 +37,17 @@ static struct list_head conn_unused_list; void ptlrpc_dump_connections(void) { + char str[PTL_NALFMT_SIZE]; struct list_head *tmp; struct ptlrpc_connection *c; ENTRY; list_for_each(tmp, &conn_list) { c = list_entry(tmp, struct ptlrpc_connection, c_link); - CERROR("Connection %p/%s has refcount %d (nid="LPX64" on %s)\n", + CERROR("Connection %p/%s has refcount %d (nid=%s on %s)\n", c, c->c_remote_uuid.uuid, atomic_read(&c->c_refcount), - c->c_peer.peer_nid, c->c_peer.peer_ni->pni_name); + ptlrpc_peernid2str(&c->c_peer, str), + c->c_peer.peer_ni->pni_name); } EXIT; } @@ -53,18 +55,19 @@ void ptlrpc_dump_connections(void) struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer, struct obd_uuid *uuid) { + char str[PTL_NALFMT_SIZE]; struct list_head *tmp, *pos; struct ptlrpc_connection *c; ENTRY; - CDEBUG(D_INFO, "peer is "LPX64" on %s\n", - peer->peer_nid, peer->peer_ni->pni_name); + CDEBUG(D_INFO, "peer is %s on %s\n", + ptlrpc_id2str(peer, str), peer->peer_ni->pni_name); spin_lock(&conn_lock); list_for_each(tmp, &conn_list) { c = list_entry(tmp, struct ptlrpc_connection, c_link); - if (peer->peer_nid == c->c_peer.peer_nid && + if 
(memcmp(peer, &c->c_peer, sizeof(*peer)) == 0 && peer->peer_ni == c->c_peer.peer_ni) { ptlrpc_connection_addref(c); GOTO(out, c); @@ -73,7 +76,7 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer, list_for_each_safe(tmp, pos, &conn_unused_list) { c = list_entry(tmp, struct ptlrpc_connection, c_link); - if (peer->peer_nid == c->c_peer.peer_nid && + if (memcmp(peer, &c->c_peer, sizeof(*peer)) == 0 && peer->peer_ni == c->c_peer.peer_ni) { ptlrpc_connection_addref(c); list_del(&c->c_link); @@ -106,6 +109,7 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer, int ptlrpc_put_connection(struct ptlrpc_connection *c) { + char str[PTL_NALFMT_SIZE]; int rc = 0; ENTRY; @@ -114,8 +118,9 @@ int ptlrpc_put_connection(struct ptlrpc_connection *c) RETURN(0); } - CDEBUG (D_INFO, "connection=%p refcount %d to "LPX64" on %s\n", - c, atomic_read(&c->c_refcount) - 1, c->c_peer.peer_nid, + CDEBUG (D_INFO, "connection=%p refcount %d to %s on %s\n", + c, atomic_read(&c->c_refcount) - 1, + ptlrpc_peernid2str(&c->c_peer, str), c->c_peer.peer_ni->pni_name); if (atomic_dec_and_test(&c->c_refcount)) { @@ -134,10 +139,12 @@ int ptlrpc_put_connection(struct ptlrpc_connection *c) struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *c) { + char str[PTL_NALFMT_SIZE]; ENTRY; atomic_inc(&c->c_refcount); - CDEBUG (D_INFO, "connection=%p refcount %d to "LPX64" on %s\n", - c, atomic_read(&c->c_refcount), c->c_peer.peer_nid, + CDEBUG (D_INFO, "connection=%p refcount %d to %s on %s\n", + c, atomic_read(&c->c_refcount), + ptlrpc_peernid2str(&c->c_peer, str), c->c_peer.peer_ni->pni_name); RETURN(c); } @@ -151,6 +158,7 @@ void ptlrpc_init_connection(void) void ptlrpc_cleanup_connection(void) { + char str[PTL_NALFMT_SIZE]; struct list_head *tmp, *pos; struct ptlrpc_connection *c; @@ -162,9 +170,10 @@ void ptlrpc_cleanup_connection(void) } list_for_each_safe(tmp, pos, &conn_list) { c = list_entry(tmp, struct ptlrpc_connection, c_link); - 
CERROR("Connection %p/%s has refcount %d (nid="LPX64" on %s)\n", + CERROR("Connection %p/%s has refcount %d (nid=%s on %s)\n", c, c->c_remote_uuid.uuid, atomic_read(&c->c_refcount), - c->c_peer.peer_nid, c->c_peer.peer_ni->pni_name); + ptlrpc_peernid2str(&c->c_peer, str), + c->c_peer.peer_ni->pni_name); list_del(&c->c_link); OBD_FREE(c, sizeof(*c)); } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index aab86ea..a2e5bc2 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -29,8 +29,15 @@ #endif #include #include +#include "ptlrpc_internal.h" -struct ptlrpc_ni ptlrpc_interfaces[NAL_MAX_NR]; +#if !defined(__KERNEL__) && CRAY_PORTALS +/* forward ref in events.c */ +static void cray_portals_callback(ptl_event_t *ev); +#endif + + +struct ptlrpc_ni ptlrpc_interfaces[8]; int ptlrpc_ninterfaces; /* @@ -38,20 +45,20 @@ int ptlrpc_ninterfaces; */ void request_out_callback(ptl_event_t *ev) { - struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; struct ptlrpc_request *req = cbid->cbid_arg; unsigned long flags; ENTRY; - LASSERT (ev->type == PTL_EVENT_SENT || + LASSERT (ev->type == PTL_EVENT_SEND_END || ev->type == PTL_EVENT_UNLINK); LASSERT (ev->unlinked); - DEBUG_REQ((ev->status == PTL_OK) ? D_NET : D_ERROR, req, - "type %d, status %d", ev->type, ev->status); + DEBUG_REQ((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR, req, + "type %d, status %d", ev->type, ev->ni_fail_type); if (ev->type == PTL_EVENT_UNLINK || - ev->status != PTL_OK) { + ev->ni_fail_type != PTL_NI_OK) { /* Failed send: make it seem like the reply timed out, just * like failing sends in client.c does currently... 
*/ @@ -73,28 +80,28 @@ void request_out_callback(ptl_event_t *ev) */ void reply_in_callback(ptl_event_t *ev) { - struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; struct ptlrpc_request *req = cbid->cbid_arg; unsigned long flags; ENTRY; - LASSERT (ev->type == PTL_EVENT_PUT || + LASSERT (ev->type == PTL_EVENT_PUT_END || ev->type == PTL_EVENT_UNLINK); LASSERT (ev->unlinked); - LASSERT (ev->mem_desc.start == req->rq_repmsg); + LASSERT (ev->md.start == req->rq_repmsg); LASSERT (ev->offset == 0); LASSERT (ev->mlength <= req->rq_replen); - DEBUG_REQ((ev->status == PTL_OK) ? D_NET : D_ERROR, req, - "type %d, status %d", ev->type, ev->status); + DEBUG_REQ((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR, req, + "type %d, status %d", ev->type, ev->ni_fail_type); spin_lock_irqsave (&req->rq_lock, flags); LASSERT (req->rq_receiving_reply); req->rq_receiving_reply = 0; - if (ev->type == PTL_EVENT_PUT && - ev->status == PTL_OK) { + if (ev->type == PTL_EVENT_PUT_END && + ev->ni_fail_type == PTL_NI_OK) { req->rq_replied = 1; req->rq_nob_received = ev->mlength; } @@ -112,21 +119,21 @@ void reply_in_callback(ptl_event_t *ev) */ void client_bulk_callback (ptl_event_t *ev) { - struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; unsigned long flags; ENTRY; LASSERT ((desc->bd_type == BULK_PUT_SINK && - ev->type == PTL_EVENT_PUT) || + ev->type == PTL_EVENT_PUT_END) || (desc->bd_type == BULK_GET_SOURCE && - ev->type == PTL_EVENT_GET) || + ev->type == PTL_EVENT_GET_END) || ev->type == PTL_EVENT_UNLINK); LASSERT (ev->unlinked); - CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR, + CDEBUG((ev->ni_fail_type == PTL_NI_OK) ? 
D_NET : D_ERROR, "event type %d, status %d, desc %p\n", - ev->type, ev->status, desc); + ev->type, ev->ni_fail_type, desc); spin_lock_irqsave (&desc->bd_lock, flags); @@ -134,7 +141,7 @@ void client_bulk_callback (ptl_event_t *ev) desc->bd_network_rw = 0; if (ev->type != PTL_EVENT_UNLINK && - ev->status == PTL_OK) { + ev->ni_fail_type == PTL_NI_OK) { desc->bd_success = 1; desc->bd_nob_transferred = ev->mlength; } @@ -152,23 +159,24 @@ void client_bulk_callback (ptl_event_t *ev) */ void request_in_callback(ptl_event_t *ev) { - struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg; struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni; struct ptlrpc_service *service = srv_ni->sni_service; struct ptlrpc_request *req; + char str[PTL_NALFMT_SIZE]; unsigned long flags; ENTRY; - LASSERT (ev->type == PTL_EVENT_PUT || + LASSERT (ev->type == PTL_EVENT_PUT_END || ev->type == PTL_EVENT_UNLINK); - LASSERT ((char *)ev->mem_desc.start >= rqbd->rqbd_buffer); - LASSERT ((char *)ev->mem_desc.start + ev->offset + ev->mlength <= + LASSERT ((char *)ev->md.start >= rqbd->rqbd_buffer); + LASSERT ((char *)ev->md.start + ev->offset + ev->mlength <= rqbd->rqbd_buffer + service->srv_buf_size); - CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR, + CDEBUG((ev->ni_fail_type == PTL_OK) ? D_NET : D_ERROR, "event type %d, status %d, service %s\n", - ev->type, ev->status, service->srv_name); + ev->type, ev->ni_fail_type, service->srv_name); if (ev->unlinked) { /* If this is the last request message to fit in the @@ -179,16 +187,18 @@ void request_in_callback(ptl_event_t *ev) req = &rqbd->rqbd_req; memset(req, 0, sizeof (*req)); } else { - LASSERT (ev->type == PTL_EVENT_PUT); - if (ev->status != PTL_OK) { + LASSERT (ev->type == PTL_EVENT_PUT_END); + if (ev->ni_fail_type != PTL_NI_OK) { /* We moaned above already... 
*/ return; } OBD_ALLOC_GFP(req, sizeof(*req), GFP_ATOMIC); if (req == NULL) { CERROR("Can't allocate incoming request descriptor: " - "Dropping %s RPC from "LPX64"\n", - service->srv_name, ev->initiator.nid); + "Dropping %s RPC from %s\n", + service->srv_name, + portals_id2str(srv_ni->sni_ni->pni_number, + ev->initiator, str)); return; } } @@ -197,15 +207,16 @@ void request_in_callback(ptl_event_t *ev) * flags are reset and scalars are zero. We only set the message * size to non-zero if this was a successful receive. */ req->rq_xid = ev->match_bits; - req->rq_reqmsg = ev->mem_desc.start + ev->offset; - if (ev->type == PTL_EVENT_PUT && - ev->status == PTL_OK) + req->rq_reqmsg = ev->md.start + ev->offset; + if (ev->type == PTL_EVENT_PUT_END && + ev->ni_fail_type == PTL_NI_OK) req->rq_reqlen = ev->mlength; - req->rq_arrival_time = ev->arrival_time; - req->rq_peer.peer_nid = ev->initiator.nid; + do_gettimeofday(&req->rq_arrival_time); + req->rq_peer.peer_id = ev->initiator; req->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni; + ptlrpc_id2str(&req->rq_peer, req->rq_peerstr); req->rq_rqbd = rqbd; - + spin_lock_irqsave (&service->srv_lock, flags); if (ev->unlinked) { @@ -242,14 +253,14 @@ void request_in_callback(ptl_event_t *ev) */ void reply_out_callback(ptl_event_t *ev) { - struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; struct ptlrpc_reply_state *rs = cbid->cbid_arg; struct ptlrpc_srv_ni *sni = rs->rs_srv_ni; struct ptlrpc_service *svc = sni->sni_service; unsigned long flags; ENTRY; - LASSERT (ev->type == PTL_EVENT_SENT || + LASSERT (ev->type == PTL_EVENT_SEND_END || ev->type == PTL_EVENT_ACK || ev->type == PTL_EVENT_UNLINK); @@ -280,27 +291,27 @@ void reply_out_callback(ptl_event_t *ev) */ void server_bulk_callback (ptl_event_t *ev) { - struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; unsigned long flags; ENTRY; - LASSERT 
(ev->type == PTL_EVENT_SENT || + LASSERT (ev->type == PTL_EVENT_SEND_END || ev->type == PTL_EVENT_UNLINK || (desc->bd_type == BULK_PUT_SOURCE && ev->type == PTL_EVENT_ACK) || (desc->bd_type == BULK_GET_SINK && - ev->type == PTL_EVENT_REPLY)); + ev->type == PTL_EVENT_REPLY_END)); - CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR, + CDEBUG((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR, "event type %d, status %d, desc %p\n", - ev->type, ev->status, desc); + ev->type, ev->ni_fail_type, desc); spin_lock_irqsave (&desc->bd_lock, flags); if ((ev->type == PTL_EVENT_ACK || - ev->type == PTL_EVENT_REPLY) && - ev->status == PTL_OK) { + ev->type == PTL_EVENT_REPLY_END) && + ev->ni_fail_type == PTL_NI_OK) { /* We heard back from the peer, so even if we get this * before the SENT event (oh yes we can), we know we * read/wrote the peer buffer and how much... */ @@ -318,9 +329,9 @@ void server_bulk_callback (ptl_event_t *ev) EXIT; } -static int ptlrpc_master_callback(ptl_event_t *ev) +static void ptlrpc_master_callback(ptl_event_t *ev) { - struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; void (*callback)(ptl_event_t *ev) = cbid->cbid_fn; /* Honestly, it's best to find out early. 
*/ @@ -333,32 +344,33 @@ static int ptlrpc_master_callback(ptl_event_t *ev) callback == server_bulk_callback); callback (ev); - return (0); } int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer) { struct ptlrpc_ni *pni; - struct lustre_peer lpeer; + __u32 peer_nal; + ptl_nid_t peer_nid; int i; - int rc = lustre_uuid_to_peer (uuid->uuid, &lpeer); - + char str[PTL_NALFMT_SIZE]; + int rc = lustre_uuid_to_peer(uuid->uuid, + &peer_nal, &peer_nid); if (rc != 0) RETURN (rc); for (i = 0; i < ptlrpc_ninterfaces; i++) { pni = &ptlrpc_interfaces[i]; - if (!memcmp(&lpeer.peer_ni, &pni->pni_ni_h, - sizeof (lpeer.peer_ni))) { - peer->peer_nid = lpeer.peer_nid; + if (pni->pni_number == peer_nal) { + peer->peer_id.nid = peer_nid; + peer->peer_id.pid = LUSTRE_SRV_PTL_PID; peer->peer_ni = pni; return (0); } } - CERROR("Can't find ptlrpc interface for "LPX64" ni handle %08lx."LPX64"\n", - lpeer.peer_nid, lpeer.peer_ni.nal_idx, lpeer.peer_ni.cookie); + CERROR("Can't find ptlrpc interface for NAL %d, NID %s\n", + peer_nal, portals_nid2str(peer_nal, peer_nid, str)); return (-ENOENT); } @@ -381,10 +393,10 @@ void ptlrpc_ni_fini(struct ptlrpc_ni *pni) LBUG(); case PTL_OK: - kportal_put_ni (pni->pni_number); + PtlNIFini(pni->pni_ni_h); return; - case PTL_EQ_INUSE: + case PTL_EQ_IN_USE: if (retries != 0) CWARN("Event queue for %s still busy\n", pni->pni_name); @@ -399,33 +411,68 @@ void ptlrpc_ni_fini(struct ptlrpc_ni *pni) /* notreached */ } +ptl_pid_t ptl_get_pid(void) +{ + ptl_pid_t pid; + +#ifndef __KERNEL__ + pid = getpid(); +#else + pid = LUSTRE_SRV_PTL_PID; +#endif + return pid; +} + int ptlrpc_ni_init(int number, char *name, struct ptlrpc_ni *pni) { int rc; - ptl_handle_ni_t *nip = kportal_get_ni (number); - - if (nip == NULL) { - CDEBUG (D_NET, "Network interface %s not loaded\n", name); + char str[20]; + ptl_handle_ni_t nih; + ptl_pid_t pid; + + pid = ptl_get_pid(); + + /* We're not passing any limits yet... 
*/ + rc = PtlNIInit(number, pid, NULL, NULL, &nih); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { + CDEBUG (D_NET, "Can't init network interface %s: %d\n", + name, rc); return (-ENOENT); } - CDEBUG (D_NET, "init %d %s: nal_idx %ld\n", number, name, nip->nal_idx); + CDEBUG(D_NET, "My pid is: %x\n", ptl_get_pid()); + + PtlSnprintHandle(str, sizeof(str), nih); + CDEBUG (D_NET, "init %d %s: %s\n", number, name, str); pni->pni_name = name; pni->pni_number = number; - pni->pni_ni_h = *nip; + pni->pni_ni_h = nih; - pni->pni_eq_h = PTL_HANDLE_NONE; + pni->pni_eq_h = PTL_INVALID_HANDLE; + /* CAVEAT EMPTOR: how we process portals events is _radically_ + * different depending on... */ #ifdef __KERNEL__ - /* kernel: portals calls the callback when the event is added to the - * queue, so we don't care if we lose events */ + /* kernel portals calls our master callback when events are added to + * the event queue. In fact lustre never pulls events off this queue, + * so it's only sized for some debug history. */ rc = PtlEQAlloc(pni->pni_ni_h, 1024, ptlrpc_master_callback, &pni->pni_eq_h); #else - /* liblustre: no asynchronous callback and allocate a nice big event - * queue so we don't drop any events... */ - rc = PtlEQAlloc(pni->pni_ni_h, 10240, NULL, &pni->pni_eq_h); + /* liblustre calls the master callback when it removes events from the + * event queue. The event queue has to be big enough not to drop + * anything */ +# if CRAY_PORTALS + /* cray portals implements a non-standard callback to notify us there + * are buffered events even when the app is not doing a filesystem + * call. 
*/ + rc = PtlEQAlloc(pni->pni_ni_h, 10240, cray_portals_callback, + &pni->pni_eq_h); +# else + rc = PtlEQAlloc(pni->pni_ni_h, 10240, PTL_EQ_HANDLER_NONE, + &pni->pni_eq_h); +# endif #endif if (rc != PTL_OK) GOTO (fail, rc = -ENOMEM); @@ -473,19 +520,16 @@ liblustre_check_events (int timeout) { ptl_event_t ev; int rc; + int i; ENTRY; - if (timeout) { - rc = PtlEQWait_timeout(ptlrpc_interfaces[0].pni_eq_h, &ev, timeout); - } else { - rc = PtlEQGet (ptlrpc_interfaces[0].pni_eq_h, &ev); - } + rc = PtlEQPoll(&ptlrpc_interfaces[0].pni_eq_h, 1, timeout * 1000, + &ev, &i); if (rc == PTL_EQ_EMPTY) RETURN(0); LASSERT (rc == PTL_EQ_DROPPED || rc == PTL_OK); -#ifndef __KERNEL__ /* liblustre: no asynch callback so we can't affort to miss any * events... */ if (rc == PTL_EQ_DROPPED) { @@ -494,10 +538,11 @@ liblustre_check_events (int timeout) } ptlrpc_master_callback (&ev); -#endif RETURN(1); } +int liblustre_waiting = 0; + int liblustre_wait_event (int timeout) { @@ -505,40 +550,63 @@ liblustre_wait_event (int timeout) struct liblustre_wait_callback *llwc; int found_something = 0; - /* First check for any new events */ - if (liblustre_check_events(0)) - found_something = 1; + /* single threaded recursion check... */ + liblustre_waiting = 1; - /* Now give all registered callbacks a bite at the cherry */ - list_for_each(tmp, &liblustre_wait_callbacks) { - llwc = list_entry(tmp, struct liblustre_wait_callback, - llwc_list); - - if (llwc->llwc_fn(llwc->llwc_arg)) + for (;;) { + /* Deal with all pending events */ + while (liblustre_check_events(0)) found_something = 1; - } - /* return to caller if something happened */ - if (found_something) - return 1; - - /* block for an event, returning immediately on timeout */ - if (!liblustre_check_events(timeout)) - return 0; - - /* an event occurred; let all registered callbacks progress... 
*/ - list_for_each(tmp, &liblustre_wait_callbacks) { - llwc = list_entry(tmp, struct liblustre_wait_callback, - llwc_list); + /* Give all registered callbacks a bite at the cherry */ + list_for_each(tmp, &liblustre_wait_callbacks) { + llwc = list_entry(tmp, struct liblustre_wait_callback, + llwc_list); - if (llwc->llwc_fn(llwc->llwc_arg)) - found_something = 1; + if (llwc->llwc_fn(llwc->llwc_arg)) + found_something = 1; + } + + if (found_something || timeout == 0) + break; + + /* Nothing so far, but I'm allowed to block... */ + found_something = liblustre_check_events(timeout); + if (!found_something) /* still nothing */ + break; /* I timed out */ } - /* ...and tell caller something happened */ - return 1; + liblustre_waiting = 0; + + return found_something; +} + +#if CRAY_PORTALS +static void cray_portals_callback(ptl_event_t *ev) +{ + /* We get a callback from the client Cray portals implementation + * whenever anyone calls PtlEQPoll(), and an event queue with a + * callback handler has outstanding events. + * + * If it's not liblustre calling PtlEQPoll(), this lets us know we + * have outstanding events which we handle with + * liblustre_wait_event(). + * + * Otherwise, we're already eagerly consuming events and we'd + * handle events out of order if we recursed. 
*/ + if (!liblustre_waiting) + liblustre_wait_event(0); } #endif +#endif /* __KERNEL__ */ + +int ptlrpc_default_nal(void) +{ + if (ptlrpc_ninterfaces == 0) + return (-ENOENT); + + return (ptlrpc_interfaces[0].pni_number); +} int ptlrpc_init_portals(void) { @@ -548,11 +616,17 @@ int ptlrpc_init_portals(void) int number; char *name; } ptl_nis[] = { - {QSWNAL, "qswnal"}, - {SOCKNAL, "socknal"}, - {GMNAL, "gmnal"}, - {IBNAL, "ibnal"}, - {TCPNAL, "tcpnal"}}; +#if !CRAY_PORTALS + {QSWNAL, "qswnal"}, + {SOCKNAL, "socknal"}, + {GMNAL, "gmnal"}, + {OPENIBNAL, "openibnal"}, + {IIBNAL, "iibnal"}, + {TCPNAL, "tcpnal"}, +#else + {CRAY_KB_ERNAL, "cray_kb_ernal"}, +#endif + }; int rc; int i; diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index f6affa8..1171fb5 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -23,7 +23,6 @@ #define DEBUG_SUBSYSTEM S_RPC #ifndef __KERNEL__ #include -#include #endif #include #include @@ -35,7 +34,6 @@ static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, ptl_ack_req_t ack, struct ptlrpc_cb_id *cbid, struct ptlrpc_connection *conn, int portal, __u64 xid) { - ptl_process_id_t remote_id; int rc; ptl_md_t md; char str[PTL_NALFMT_SIZE]; @@ -43,22 +41,16 @@ static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, LASSERT (portal != 0); LASSERT (conn != NULL); - CDEBUG (D_INFO, "conn=%p ni %s nid "LPX64" (%s) on %s\n", + CDEBUG (D_INFO, "conn=%p ni %s id %s on %s\n", conn, conn->c_peer.peer_ni->pni_name, - conn->c_peer.peer_nid, - portals_nid2str(conn->c_peer.peer_ni->pni_number, - conn->c_peer.peer_nid, str), + ptlrpc_id2str(&conn->c_peer, str), conn->c_peer.peer_ni->pni_name); - - remote_id.nid = conn->c_peer.peer_nid, - remote_id.pid = 0; - md.start = base; md.length = len; md.threshold = (ack == PTL_ACK_REQ) ? 
2 : 1; - md.options = 0; + md.options = PTLRPC_MD_OPTIONS; md.user_ptr = cbid; - md.eventq = conn->c_peer.peer_ni->pni_eq_h; + md.eq_handle = conn->c_peer.peer_ni->pni_eq_h; if (ack == PTL_ACK_REQ && OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_ACK | OBD_FAIL_ONCE)) { @@ -67,24 +59,26 @@ static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED; } - rc = PtlMDBind (conn->c_peer.peer_ni->pni_ni_h, md, mdh); + rc = PtlMDBind (conn->c_peer.peer_ni->pni_ni_h, md, + PTL_UNLINK, mdh); if (rc != PTL_OK) { CERROR ("PtlMDBind failed: %d\n", rc); - LASSERT (rc == PTL_NOSPACE); + LASSERT (rc == PTL_NO_SPACE); RETURN (-ENOMEM); } CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n", len, portal, xid); - rc = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0); + rc = PtlPut (*mdh, ack, conn->c_peer.peer_id, portal, 0, xid, 0, 0); if (rc != PTL_OK) { int rc2; /* We're going to get an UNLINK event when I unlink below, * which will complete just like any other failed send, so * I fall through and return success here! 
*/ - CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n", - remote_id.nid, portal, xid, rc); + CERROR("PtlPut(%s, %d, "LPD64") failed: %d\n", + ptlrpc_id2str(&conn->c_peer, str), + portal, xid, rc); rc2 = PtlMDUnlink(*mdh); LASSERTF(rc2 == PTL_OK, "rc2 = %d\n", rc2); } @@ -97,9 +91,9 @@ int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc) int rc; int rc2; struct ptlrpc_peer *peer; - ptl_process_id_t remote_id; ptl_md_t md; __u64 xid; + char str[PTL_NALFMT_SIZE]; ENTRY; if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_PTLRPC_BULK_PUT_NET)) @@ -112,56 +106,50 @@ int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc) desc->bd_success = 0; peer = &desc->bd_export->exp_connection->c_peer; - md.start = &desc->bd_iov[0]; - md.niov = desc->bd_page_count; - md.length = desc->bd_nob; - md.eventq = peer->peer_ni->pni_eq_h; - md.threshold = 2; /* SENT and ACK/REPLY */ -#ifdef __KERNEL__ - md.options = PTL_MD_KIOV; -#else - md.options = PTL_MD_IOV; -#endif md.user_ptr = &desc->bd_cbid; + md.eq_handle = peer->peer_ni->pni_eq_h; + md.threshold = 2; /* SENT and ACK/REPLY */ + md.options = PTLRPC_MD_OPTIONS; + ptlrpc_fill_bulk_md(&md, desc); + LASSERT (desc->bd_cbid.cbid_fn == server_bulk_callback); LASSERT (desc->bd_cbid.cbid_arg == desc); /* NB total length may be 0 for a read past EOF, so we send a 0 * length bulk, since the client expects a bulk event. 
*/ - rc = PtlMDBind(peer->peer_ni->pni_ni_h, md, &desc->bd_md_h); + rc = PtlMDBind(peer->peer_ni->pni_ni_h, md, + PTL_UNLINK, &desc->bd_md_h); if (rc != PTL_OK) { CERROR("PtlMDBind failed: %d\n", rc); - LASSERT (rc == PTL_NOSPACE); + LASSERT (rc == PTL_NO_SPACE); RETURN(-ENOMEM); } /* Client's bulk and reply matchbits are the same */ xid = desc->bd_req->rq_xid; - remote_id.nid = peer->peer_nid; - remote_id.pid = 0; - CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d on %s " - "nid "LPX64" pid %d xid "LPX64"\n", - md.niov, md.length, desc->bd_portal, peer->peer_ni->pni_name, - remote_id.nid, remote_id.pid, xid); + "nid %s pid %d xid "LPX64"\n", desc->bd_iov_count, + desc->bd_nob, desc->bd_portal, peer->peer_ni->pni_name, + ptlrpc_id2str(peer, str), peer->peer_id.pid, xid); /* Network is about to get at the memory */ desc->bd_network_rw = 1; if (desc->bd_type == BULK_PUT_SOURCE) - rc = PtlPut (desc->bd_md_h, PTL_ACK_REQ, remote_id, + rc = PtlPut (desc->bd_md_h, PTL_ACK_REQ, peer->peer_id, desc->bd_portal, 0, xid, 0, 0); else - rc = PtlGet (desc->bd_md_h, remote_id, + rc = PtlGet (desc->bd_md_h, peer->peer_id, desc->bd_portal, 0, xid, 0); - + if (rc != PTL_OK) { /* Can't send, so we unlink the MD bound above. The UNLINK * event this creates will signal completion with failure, * so we return SUCCESS here! */ - CERROR("Transfer("LPU64", %d, "LPX64") failed: %d\n", - remote_id.nid, desc->bd_portal, xid, rc); + CERROR("Transfer(%s, %d, "LPX64") failed: %d\n", + ptlrpc_id2str(peer, str), + desc->bd_portal, xid, rc); rc2 = PtlMDUnlink(desc->bd_md_h); LASSERT (rc2 == PTL_OK); } @@ -182,16 +170,11 @@ void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc) return; /* never started */ /* The unlink ensures the callback happens ASAP and is the last - * one. If it fails, it must be because completion just - * happened. */ + * one. 
If it fails, it must be because completion just happened, + * but we must still l_wait_event() in this case, to give liblustre + * a chance to run server_bulk_callback()*/ - rc = PtlMDUnlink (desc->bd_md_h); - if (rc == PTL_INV_MD) { - LASSERT(!ptlrpc_bulk_active(desc)); - return; - } - - LASSERT (rc == PTL_OK); + PtlMDUnlink (desc->bd_md_h); for (;;) { /* Network access will complete in finite time but the HUGE @@ -213,7 +196,6 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req) struct ptlrpc_peer *peer; int rc; int rc2; - ptl_process_id_t source_id; ptl_handle_me_t me_h; ptl_md_t md; ENTRY; @@ -224,7 +206,7 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req) /* NB no locking required until desc is on the network */ LASSERT (desc->bd_nob > 0); LASSERT (!desc->bd_network_rw); - LASSERT (desc->bd_page_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT (desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); LASSERT (desc->bd_req != NULL); LASSERT (desc->bd_type == BULK_PUT_SINK || desc->bd_type == BULK_GET_SOURCE); @@ -233,19 +215,14 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req) peer = &desc->bd_import->imp_connection->c_peer; - md.start = &desc->bd_iov[0]; - md.niov = desc->bd_page_count; - md.length = desc->bd_nob; - md.eventq = peer->peer_ni->pni_eq_h; - md.threshold = 1; /* PUT or GET */ - md.options = (desc->bd_type == BULK_GET_SOURCE) ? - PTL_MD_OP_GET : PTL_MD_OP_PUT; -#ifdef __KERNEL__ - md.options |= PTL_MD_KIOV; -#else - md.options |= PTL_MD_IOV; -#endif md.user_ptr = &desc->bd_cbid; + md.eq_handle = peer->peer_ni->pni_eq_h; + md.threshold = 1; /* PUT or GET */ + md.options = PTLRPC_MD_OPTIONS | + ((desc->bd_type == BULK_GET_SOURCE) ? 
+ PTL_MD_OP_GET : PTL_MD_OP_PUT); + ptlrpc_fill_bulk_md(&md, desc); + LASSERT (desc->bd_cbid.cbid_fn == client_bulk_callback); LASSERT (desc->bd_cbid.cbid_arg == desc); @@ -256,15 +233,12 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req) desc->bd_registered = 1; desc->bd_last_xid = req->rq_xid; - source_id.nid = desc->bd_import->imp_connection->c_peer.peer_nid; - source_id.pid = PTL_PID_ANY; - - rc = PtlMEAttach(peer->peer_ni->pni_ni_h, - desc->bd_portal, source_id, req->rq_xid, 0, - PTL_UNLINK, PTL_INS_AFTER, &me_h); + rc = PtlMEAttach(peer->peer_ni->pni_ni_h, desc->bd_portal, + desc->bd_import->imp_connection->c_peer.peer_id, + req->rq_xid, 0, PTL_UNLINK, PTL_INS_AFTER, &me_h); if (rc != PTL_OK) { CERROR("PtlMEAttach failed: %d\n", rc); - LASSERT (rc == PTL_NOSPACE); + LASSERT (rc == PTL_NO_SPACE); RETURN (-ENOMEM); } @@ -273,7 +247,7 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req) rc = PtlMDAttach(me_h, md, PTL_UNLINK, &desc->bd_md_h); if (rc != PTL_OK) { CERROR("PtlMDAttach failed: %d\n", rc); - LASSERT (rc == PTL_NOSPACE); + LASSERT (rc == PTL_NO_SPACE); desc->bd_network_rw = 0; rc2 = PtlMEUnlink (me_h); LASSERT (rc2 == PTL_OK); @@ -283,7 +257,7 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req) CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPX64", " "portal %u on %s\n", desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink", - md.niov, md.length, + desc->bd_iov_count, desc->bd_nob, req->rq_xid, desc->bd_portal, peer->peer_ni->pni_name); RETURN(0); } @@ -305,17 +279,12 @@ void ptlrpc_unregister_bulk (struct ptlrpc_request *req) LASSERT (desc->bd_req == req); /* bd_req NULL until registered */ /* the unlink ensures the callback happens ASAP and is the last - * one. If it fails, it must be because completion just - * happened. */ - - rc = PtlMDUnlink (desc->bd_md_h); - if (rc == PTL_INV_MD) { - LASSERT(!ptlrpc_bulk_active(desc)); - return; - } - - LASSERT (rc == PTL_OK); + * one. 
If it fails, it must be because completion just happened, + * but we must still l_wait_event() in this case to give liblustre + * a chance to run client_bulk_callback() */ + PtlMDUnlink (desc->bd_md_h); + if (req->rq_set != NULL) wq = &req->rq_set->set_waitq; else @@ -416,7 +385,6 @@ int ptl_send_rpc(struct ptlrpc_request *request) int rc2; struct ptlrpc_connection *connection; unsigned long flags; - ptl_process_id_t source_id; ptl_handle_me_t reply_me_h; ptl_md_t reply_md; ENTRY; @@ -438,10 +406,7 @@ int ptl_send_rpc(struct ptlrpc_request *request) request->rq_reqmsg->handle = request->rq_import->imp_remote_handle; request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST; request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt; - - source_id.nid = connection->c_peer.peer_nid; - source_id.pid = PTL_PID_ANY; - + LASSERT (request->rq_replen != 0); if (request->rq_repmsg == NULL) OBD_ALLOC(request->rq_repmsg, request->rq_replen); @@ -450,11 +415,11 @@ int ptl_send_rpc(struct ptlrpc_request *request) rc = PtlMEAttach(connection->c_peer.peer_ni->pni_ni_h, request->rq_reply_portal, /* XXX FIXME bug 249 */ - source_id, request->rq_xid, 0, PTL_UNLINK, - PTL_INS_AFTER, &reply_me_h); + connection->c_peer.peer_id, request->rq_xid, 0, + PTL_UNLINK, PTL_INS_AFTER, &reply_me_h); if (rc != PTL_OK) { CERROR("PtlMEAttach failed: %d\n", rc); - LASSERT (rc == PTL_NOSPACE); + LASSERT (rc == PTL_NO_SPACE); GOTO(cleanup_repmsg, rc = -ENOMEM); } @@ -473,15 +438,15 @@ int ptl_send_rpc(struct ptlrpc_request *request) reply_md.start = request->rq_repmsg; reply_md.length = request->rq_replen; reply_md.threshold = 1; - reply_md.options = PTL_MD_OP_PUT; + reply_md.options = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT; reply_md.user_ptr = &request->rq_reply_cbid; - reply_md.eventq = connection->c_peer.peer_ni->pni_eq_h; + reply_md.eq_handle = connection->c_peer.peer_ni->pni_eq_h; rc = PtlMDAttach(reply_me_h, reply_md, PTL_UNLINK, &request->rq_reply_md_h); if (rc != PTL_OK) { CERROR("PtlMDAttach failed: 
%d\n", rc); - LASSERT (rc == PTL_NOSPACE); + LASSERT (rc == PTL_NO_SPACE); GOTO(cleanup_me, rc -ENOMEM); } @@ -537,10 +502,8 @@ int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd) ptl_md_t md; ptl_handle_me_t me_h; - CDEBUG(D_NET, "PtlMEAttach: portal %d on %s h %lx."LPX64"\n", - service->srv_req_portal, srv_ni->sni_ni->pni_name, - srv_ni->sni_ni->pni_ni_h.nal_idx, - srv_ni->sni_ni->pni_ni_h.cookie); + CDEBUG(D_NET, "PtlMEAttach: portal %d on %s\n", + service->srv_req_portal, srv_ni->sni_ni->pni_name); if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_PTLRPC_RQBD)) return (-ENOMEM); @@ -555,20 +518,20 @@ int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd) LASSERT(rqbd->rqbd_refcount == 0); rqbd->rqbd_refcount = 1; - md.start = rqbd->rqbd_buffer; - md.length = service->srv_buf_size; - md.max_size = service->srv_max_req_size; - md.threshold = PTL_MD_THRESH_INF; - md.options = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE | PTL_MD_AUTO_UNLINK; - md.user_ptr = &rqbd->rqbd_cbid; - md.eventq = srv_ni->sni_ni->pni_eq_h; + md.start = rqbd->rqbd_buffer; + md.length = service->srv_buf_size; + md.max_size = service->srv_max_req_size; + md.threshold = PTL_MD_THRESH_INF; + md.options = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT | PTL_MD_MAX_SIZE; + md.user_ptr = &rqbd->rqbd_cbid; + md.eq_handle = srv_ni->sni_ni->pni_eq_h; rc = PtlMDAttach(me_h, md, PTL_UNLINK, &rqbd->rqbd_md_h); if (rc == PTL_OK) return (0); CERROR("PtlMDAttach failed: %d; \n", rc); - LASSERT (rc == PTL_NOSPACE); + LASSERT (rc == PTL_NO_SPACE); rc = PtlMEUnlink (me_h); LASSERT (rc == PTL_OK); rqbd->rqbd_refcount = 0; diff --git a/lustre/ptlrpc/pers.c b/lustre/ptlrpc/pers.c index bcbf095..6f5d086 100644 --- a/lustre/ptlrpc/pers.c +++ b/lustre/ptlrpc/pers.c @@ -35,7 +35,8 @@ #include "ptlrpc_internal.h" #ifdef __KERNEL__ -#ifndef CRAY_PORTALS +#if !CRAY_PORTALS + void ptlrpc_fill_bulk_md (ptl_md_t *md, struct ptlrpc_bulk_desc *desc) { LASSERT (desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); @@ -57,11 +58,16 @@ void 
ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, desc->bd_iov_count++; } -#else + +#else /* CRAY_PORTALS */ +#ifdef PTL_MD_KIOV +#error "Conflicting compilation directives" +#endif + void ptlrpc_fill_bulk_md (ptl_md_t *md, struct ptlrpc_bulk_desc *desc) { LASSERT (desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); - LASSERT (!(md->options & (PTL_MD_IOVEC | PTL_MD_KIOV | PTL_MD_PHYS))); + LASSERT (!(md->options & (PTL_MD_IOVEC | PTL_MD_PHYS))); md->options |= (PTL_MD_IOVEC | PTL_MD_PHYS); md->start = &desc->bd_iov[0]; @@ -79,22 +85,24 @@ void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, desc->bd_iov_count++; } -#endif +#endif /* CRAY_PORTALS */ #else /* !__KERNEL__ */ + void ptlrpc_fill_bulk_md(ptl_md_t *md, struct ptlrpc_bulk_desc *desc) { +#if CRAY_PORTALS + LASSERT (!(md->options & (PTL_MD_IOVEC | PTL_MD_PHYS))); + LASSERT (desc->bd_iov_count == 1); +#else LASSERT (!(md->options & (PTL_MD_IOVEC | PTL_MD_KIOV | PTL_MD_PHYS))); - +#endif if (desc->bd_iov_count == 1) { md->start = desc->bd_iov[0].iov_base; md->length = desc->bd_iov[0].iov_len; return; } -#if CRAY_PORTALS - LBUG(); -#endif md->options |= PTL_MD_IOVEC; md->start = &desc->bd_iov[0]; md->length = desc->bd_iov_count; @@ -104,14 +112,12 @@ static int can_merge_iovs(ptl_md_iovec_t *existing, ptl_md_iovec_t *candidate) { if (existing->iov_base + existing->iov_len == candidate->iov_base) return 1; - /* XXX it's good to have an warning here, but user-level echo_client - * will hit this. reenable it when we fixed echo_client. 
- */ #if 0 + /* Enable this section to provide earlier evidence of fragmented bulk */ CERROR("Can't merge iovs %p for %x, %p for %x\n", existing->iov_base, existing->iov_len, candidate->iov_base, candidate->iov_len); -#endif +#endif return 0; } @@ -129,4 +135,5 @@ void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, desc->bd_iov_count++; } } -#endif + +#endif /* !__KERNEL__ */ diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 1db774e..e49b5f9 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -107,6 +107,11 @@ enum { int ptlrpc_expire_one_request(struct ptlrpc_request *req); +/* pers.c */ +void ptlrpc_fill_bulk_md(ptl_md_t *md, struct ptlrpc_bulk_desc *desc); +void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, + int pageoffset, int len); + /* pinger.c */ int ptlrpc_start_pinger(void); int ptlrpc_stop_pinger(void); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index f3caf6a..94eb45d 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -48,17 +48,17 @@ ptlrpc_free_server_req (struct ptlrpc_request *req) OBD_FREE(req, sizeof(*req)); } - + static char * ptlrpc_alloc_request_buffer (int size) { char *ptr; - + if (size > SVC_BUF_VMALLOC_THRESHOLD) OBD_VMALLOC(ptr, size); else OBD_ALLOC(ptr, size); - + return (ptr); } @@ -372,7 +372,7 @@ ptlrpc_server_free_request(struct ptlrpc_service *svc, struct ptlrpc_request *re ptlrpc_free_server_req(req); } -static int +static int ptlrpc_server_handle_request (struct ptlrpc_service *svc) { struct ptlrpc_request *request; @@ -419,17 +419,16 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc) #endif rc = lustre_unpack_msg (request->rq_reqmsg, request->rq_reqlen); if (rc != 0) { - CERROR ("error unpacking request: ptl %d from "LPX64 + CERROR ("error unpacking request: ptl %d from %s" " xid "LPU64"\n", svc->srv_req_portal, - request->rq_peer.peer_nid, request->rq_xid); + 
request->rq_peerstr, request->rq_xid); goto out; } rc = -EINVAL; if (request->rq_reqmsg->type != PTL_RPC_MSG_REQUEST) { - CERROR("wrong packet type received (type=%u) from " - LPX64"\n", request->rq_reqmsg->type, - request->rq_peer.peer_nid); + CERROR("wrong packet type received (type=%u) from %s\n", + request->rq_reqmsg->type, request->rq_peerstr); goto out; } @@ -439,9 +438,10 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc) * client's timeout is similar to mine, she'll be timing out this * REQ anyway (bug 1502) */ if (timediff / 1000000 > (long)obd_timeout) { - CERROR("Dropping timed-out opc %d request from "LPX64 + CERROR("Dropping timed-out opc %d request from %s" ": %ld seconds old\n", request->rq_reqmsg->opc, - request->rq_peer.peer_nid, timediff / 1000000); + request->rq_peerstr, + timediff / 1000000); goto out; } @@ -461,26 +461,27 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc) } CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:ni:nid:opc " - "%s:%s+%d:%d:"LPU64":%s:"LPX64":%d\n", current->comm, + "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm, (request->rq_export ? (char *)request->rq_export->exp_client_uuid.uuid : "0"), (request->rq_export ? atomic_read(&request->rq_export->exp_refcount) : -99), request->rq_reqmsg->status, request->rq_xid, request->rq_peer.peer_ni->pni_name, - request->rq_peer.peer_nid, + request->rq_peerstr, request->rq_reqmsg->opc); rc = svc->srv_handler(request); + CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:ni:nid:opc " - "%s:%s+%d:%d:"LPU64":%s:"LPX64":%d\n", current->comm, + "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm, (request->rq_export ? (char *)request->rq_export->exp_client_uuid.uuid : "0"), (request->rq_export ? 
atomic_read(&request->rq_export->exp_refcount) : -99), request->rq_reqmsg->status, request->rq_xid, request->rq_peer.peer_ni->pni_name, - request->rq_peer.peer_nid, + request->rq_peerstr, request->rq_reqmsg->opc); put_conn: @@ -493,9 +494,9 @@ put_conn: timediff = timeval_sub(&work_end, &work_start); CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA, - "request "LPU64" opc %u from NID "LPX64" processed in %ldus " + "request "LPU64" opc %u from %s processed in %ldus " "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc, - request->rq_peer.peer_nid, + request->rq_peerstr, timediff, timeval_sub(&work_end, &request->rq_arrival_time)); if (svc->srv_stats != NULL) { @@ -522,6 +523,7 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc) struct obd_device *obd; int nlocks; int been_handled; + char str[PTL_NALFMT_SIZE]; ENTRY; spin_lock_irqsave (&svc->srv_lock, flags); @@ -566,10 +568,11 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc) /* If we see this, we should already have seen the warning * in mds_steal_ack_locks() */ CWARN("All locks stolen from rs %p x"LPD64".t"LPD64 - " o%d NID"LPX64"\n", + " o%d NID %s\n", rs, rs->rs_xid, rs->rs_transno, - rs->rs_msg.opc, exp->exp_connection->c_peer.peer_nid); + rs->rs_msg.opc, + ptlrpc_peernid2str(&exp->exp_connection->c_peer, str)); } if ((!been_handled && rs->rs_on_net) || @@ -662,7 +665,8 @@ static void ptlrpc_check_rqbd_pools(struct ptlrpc_service *svc) { struct ptlrpc_srv_ni *sni; - int i, avail = 0; + int i; + int avail = 0; int low_water = svc->srv_nbuf_per_group/2; for (i = 0; i < ptlrpc_ninterfaces; i++) { @@ -673,6 +677,7 @@ ptlrpc_check_rqbd_pools(struct ptlrpc_service *svc) if (sni->sni_nrqbd_receiving <= low_water) ptlrpc_grow_req_bufs(sni); } + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_AVAIL_CNTR, avail); } @@ -897,7 +902,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) rqbd_list); rc = PtlMDUnlink(rqbd->rqbd_md_h); - LASSERT (rc == PTL_OK || rc == 
PTL_INV_MD); + LASSERT (rc == PTL_OK || rc == PTL_MD_INVALID); } /* Wait for the network to release any buffers it's diff --git a/lustre/utils/Lustre/lustredb.py b/lustre/utils/Lustre/lustredb.py index eda5779..fb578fa 100644 --- a/lustre/utils/Lustre/lustredb.py +++ b/lustre/utils/Lustre/lustredb.py @@ -269,6 +269,13 @@ class LustreDB_XML(LustreDB): ret.append((net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)) return ret + def get_hostaddr(self): + ret = [] + list = self.dom_node.getElementsByTagName('hostaddr') + for node in list: + ret.append(node.firstChild.data) + return ret + def _update_active(self, tgt, new): raise Lustre.LconfError("updates not implemented for XML") diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 0a1d5bc..aa7b1aa 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -250,21 +250,16 @@ class DaemonHandler: log(self.pidfile(), e) class AcceptorHandler(DaemonHandler): - def __init__(self, port, net_type, send_mem, recv_mem, irq_aff): + def __init__(self, port, net_type): DaemonHandler.__init__(self, "acceptor") self.port = port self.flags = '' - self.send_mem = send_mem - self.recv_mem = recv_mem - - if irq_aff: - self.flags = self.flags + ' -i' def pidfile(self): return "/var/run/%s-%d.pid" % (self.command, self.port) def command_line(self): - return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port))) + return string.join(map(str,(self.flags, self.port))) acceptors = {} @@ -418,33 +413,51 @@ class LCTLInterface: quit """ % (net, nid) self.run(cmds) + # add an interface + def add_interface(self, net, ip, netmask = ""): + """ add an interface """ + cmds = """ + network %s + add_interface %s %s + quit """ % (net, ip, netmask) + self.run(cmds) + + # delete an interface + def del_interface(self, net, ip): + """ delete an interface """ + cmds = """ + network %s + del_interface %s + quit """ % (net, ip) + self.run(cmds) + # create a new connection def add_uuid(self, net_type, uuid, nid): cmds 
= "\n add_uuid %s %s %s" %(uuid, nid, net_type) self.run(cmds) - def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr, - port, flags): + def add_peer(self, net_type, nid, hostaddr, port): if net_type in ('tcp',) and not config.lctl_dump: cmds = """ network %s - send_mem %d - recv_mem %d - add_autoconn %s %s %d %s + add_peer %s %s %d quit""" % (net_type, - send_mem, - recv_mem, - nid, hostaddr, port, flags ) + nid, hostaddr, port ) + self.run(cmds) + elif net_type in ('openib','iib',) and not config.lctl_dump: + cmds = """ + network %s + add_peer %s + quit""" % (net_type, + nid ) self.run(cmds) def connect(self, srv): self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid) - if srv.net_type in ('tcp',) and not config.lctl_dump: - flags = 'se' - if srv.irq_affinity: - flags = flags + 'i' - self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem, - srv.nid, srv.hostaddr, srv.port, flags) + if srv.net_type in ('tcp','openib','iib',) and not config.lctl_dump: + if srv.hostaddr[0]: + hostaddr = string.split(srv.hostaddr[0], '/')[0] + self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port) # Recover a device def recover(self, dev_name, new_conn): @@ -499,21 +512,31 @@ class LCTLInterface: self.run(cmds) - def del_autoconn(self, net_type, nid, hostaddr): + def del_peer(self, net_type, nid, hostaddr): if net_type in ('tcp',) and not config.lctl_dump: cmds = """ ignore_errors network %s - del_autoconn %s %s s + del_peer %s %s single_share quit""" % (net_type, nid, hostaddr) self.run(cmds) + elif net_type in ('openib','iib',) and not config.lctl_dump: + cmds = """ + ignore_errors + network %s + del_peer %s single_share + quit""" % (net_type, + nid) + self.run(cmds) # disconnect one connection def disconnect(self, srv): self.del_uuid(srv.nid_uuid) - if srv.net_type in ('tcp',) and not config.lctl_dump: - self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr) + if srv.net_type in ('tcp','openib','iib',) and not config.lctl_dump: + if srv.hostaddr[0]: + hostaddr = 
string.split(srv.hostaddr[0], '/')[0] + self.del_peer(srv.net_type, srv.nid, hostaddr) def del_uuid(self, uuid): cmds = """ @@ -928,7 +951,7 @@ def sys_get_local_nid(net_type, wildcard, cluster_id): def sys_get_local_address(net_type, wildcard, cluster_id): """Return the local address for the network type.""" local = "" - if net_type in ('tcp',): + if net_type in ('tcp','openib','iib',): if ':' in wildcard: iface, star = string.split(wildcard, ':') local = if2addr(iface) @@ -1124,9 +1147,6 @@ class Network(Module): self.nid = self.db.get_val('nid', '*') self.cluster_id = self.db.get_val('clusterid', "0") self.port = self.db.get_val_int('port', 0) - self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF) - self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF) - self.irq_affinity = self.db.get_val_int('irqaffinity', 0) if '*' in self.nid: self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id) @@ -1139,14 +1159,17 @@ class Network(Module): self.nid_uuid = self.nid_to_uuid(self.nid) - self.hostaddr = self.db.get_val('hostaddr', self.nid) - if '*' in self.hostaddr: - self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id) - if not self.hostaddr: - panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id) - debug("hostaddr:", self.hostaddr) - - self.add_portals_module("libcfs", 'portals') + self.hostaddr = self.db.get_hostaddr() + if len(self.hostaddr) == 0: + self.hostaddr.append(self.nid) + if '*' in self.hostaddr[0]: + self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id) + if not self.hostaddr[0]: + panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id) + debug("hostaddr:", self.hostaddr[0]) + + self.add_portals_module("libcfs", 'libcfs') + self.add_portals_module("portals", 'portals') if node_needs_router(): self.add_portals_module("router", 'kptlrouter') if self.net_type == 'tcp': @@ -1155,6 +1178,10 @@ class 
Network(Module): self.add_portals_module("knals/qswnal", 'kqswnal') if self.net_type == 'gm': self.add_portals_module("knals/gmnal", 'kgmnal') + if self.net_type == 'openib': + self.add_portals_module("knals/openibnal", 'kopenibnal') + if self.net_type == 'iib': + self.add_portals_module("knals/iibnal", 'kiibnal') def nid_to_uuid(self, nid): return "NID_%s_UUID" %(nid,) @@ -1167,6 +1194,13 @@ class Network(Module): lctl.network(self.net_type, self.nid) if self.net_type == 'tcp': sys_tweak_socknal() + for hostaddr in self.db.get_hostaddr(): + ip = string.split(hostaddr, '/')[0] + if len(string.split(hostaddr, '/')) == 2: + netmask = string.split(hostaddr, '/')[1] + else: + netmask = "" + lctl.add_interface(self.net_type, ip, netmask) if self.net_type == 'elan': sys_optimize_elan() if self.port and node_is_router(): @@ -1209,6 +1243,10 @@ class Network(Module): stop_acceptor(self.port) if node_is_router(): self.disconnect_peer_gateways() + if self.net_type == 'tcp': + for hostaddr in self.db.get_hostaddr(): + ip = string.split(hostaddr, '/')[0] + lctl.del_interface(self.net_type, ip) class RouteTable(Module): def __init__(self,db): @@ -1216,9 +1254,9 @@ class RouteTable(Module): def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi): - # only setup connections for tcp NALs + # only setup connections for tcp, openib, and iib NALs srvdb = None - if not net_type in ('tcp',): + if not net_type in ('tcp','openib','iib',): return None # connect to target if route is to single node and this node is the gw @@ -2104,9 +2142,7 @@ def find_local_clusters(node_db): if srv.port > 0: if acceptors.has_key(srv.port): panic("duplicate port:", srv.port) - acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type, - srv.send_mem, srv.recv_mem, - srv.irq_affinity) + acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type) # This node is a gateway. 
is_router = 0 diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index 6603aca..fad8fb6 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -27,7 +27,6 @@ #include #include -#include #include #include "obdctl.h" #include "parser.h" @@ -68,14 +67,20 @@ command_t cmdlist[] = { "usage: --net "}, {"network", jt_ptl_network, 0, "commands that follow apply to net\n" "usage: network "}, - {"autoconn_list", jt_ptl_print_autoconnects, 0, "print autoconnect entries\n" - "usage: print_autoconns"}, - {"add_autoconn", jt_ptl_add_autoconnect, 0, "add an autoconnect entry\n" - "usage: add_autoconn [ise]"}, - {"del_autoconn",jt_ptl_del_autoconnect,0,"remove an autoconnect entry\n" + {"interface_list", jt_ptl_print_interfaces,0,"print interface entries\n" + "usage: interface_list"}, + {"add_interface", jt_ptl_add_interface, 0, "add interface entry\n" + "usage: add_interface ip [netmask]"}, + {"del_interface", jt_ptl_del_interface, 0, "del interface entry\n" + "usage: del_interface [ip]"}, + {"peer_list", jt_ptl_print_peers, 0, "print peer entries\n" + "usage: peer_list"}, + {"add_peer", jt_ptl_add_peer, 0, "add an peer entry\n" + "usage: add_peer "}, + {"del_peer", jt_ptl_del_peer, 0, "remove an peer entry\n" "usage: del_autoconn [] [] [ks]"}, - {"conn_list", jt_ptl_print_connections, 0, "connect to a remote nid\n" - "usage: print_conns"}, + {"conn_list", jt_ptl_print_connections, 0, "print all the connected remote nid\n" + "usage: conn_list"}, {"connect", jt_ptl_connect, 0, "connect to a remote nid\n" "usage: connect [iIOC]"}, {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid\n" @@ -91,7 +96,7 @@ command_t cmdlist[] = { {"add_uuid", jt_lcfg_add_uuid, 0, "associate a UUID with a nid\n" "usage: add_uuid "}, {"close_uuid", jt_obd_close_uuid, 0, "disconnect a UUID\n" - "usage: close_uuid )"}, + "usage: close_uuid "}, {"del_uuid", jt_lcfg_del_uuid, 0, "delete a UUID association\n" "usage: del_uuid "}, {"add_route", jt_ptl_add_route, 0, @@ -109,15 +114,6 
@@ command_t cmdlist[] = { {"show_route", jt_ptl_print_routes, 0, "print the portals routing table, same as route_list\n" "usage: show_route"}, - {"recv_mem", jt_ptl_rxmem, 0, "set socket receive buffer size, " - "if size is omited the current size is reported.\n" - "usage: recv_mem [size]"}, - {"send_mem", jt_ptl_txmem, 0, "set socket send buffer size, " - "if size is omited the current size is reported.\n" - "usage: send_mem [size]"}, - {"nagle", jt_ptl_nagle, 0, "enable/disable nagle, omitting the " - "argument will cause the current nagle setting to be reported.\n" - "usage: nagle [on/off]"}, {"fail", jt_ptl_fail_nid, 0, "fail/restore communications.\n" "Omitting the count means indefinitely, 0 means restore, " "otherwise fail 'count' messages.\n" @@ -286,7 +282,7 @@ command_t cmdlist[] = { "usage: mark "}, {"filter", jt_dbg_filter, 0, "filter message type\n" "usage: filter "}, - {"show", jt_dbg_show, 0, "show type of messages\n" + {"show", jt_dbg_show, 0, "Show specific type of messages\n" "usage: show "}, {"debug_list", jt_dbg_list, 0, "list subsystem and debug types\n" "usage: debug_list "}, diff --git a/lustre/utils/lmc b/lustre/utils/lmc index 53985a7..33d6839 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -74,8 +74,8 @@ Object creation command summary: --node node_name --nid nid --cluster_id - --nettype tcp|elan|gm - --hostaddr addr + --nettype tcp|elan|gm|openib|iib + --hostaddr ip[/netmask] --port port --tcpbuf size --irq_affinity 0|1 @@ -108,7 +108,7 @@ Object creation command summary: --fstype extN|ext3 --journal_size size --inode_size size - --obdtype obdecho|obdfilter + --osdtype obdecho|obdfilter --ostuuid uuid --add mtpt - Mountpoint @@ -119,8 +119,10 @@ Object creation command summary: --add route --node nodename + --router --gw nid - --tgt nid + --gateway_cluster_id nid + --target_cluster_id nid --lo nid --hi nid @@ -133,6 +135,7 @@ Object creation command summary: """ PARAM = Lustre.Options.PARAM +PARAMLIST = Lustre.Options.PARAMLIST 
lmc_options = [ # lmc input/output options ('reference', "Print short reference for commands."), @@ -155,12 +158,10 @@ lmc_options = [ ('subsystem', "Specify which Lustre subsystems have debug output recorded in the log", PARAM), # network - ('nettype', "Specify the network type. This can be tcp/elan/gm.", PARAM), + ('nettype', "Specify the network type. This can be tcp/elan/gm/openib/iib.", PARAM), ('nid', "Give the network ID, e.g ElanID/IP Address as used by portals.", PARAM), - ('tcpbuf', "Optional argument to specify the TCP buffer size.", PARAM, "0"), ('port', "Optional argument to specify the TCP port number.", PARAM, DEFAULT_PORT), - ('irq_affinity', "Optional argument.", PARAM, 0), - ('hostaddr', "", PARAM,""), + ('hostaddr', "Optional argument to specify the host address.", PARAMLIST), ('cluster_id', "Specify the cluster ID", PARAM, "0"), # routes @@ -317,21 +318,16 @@ class GenConfig: return new def network(self, name, uuid, nid, cluster_id, net, hostaddr="", - port=0, tcpbuf=0, irq_aff=0): + port=0): """create node""" network = self.newService("network", name, uuid) network.setAttribute("nettype", net); self.addElement(network, "nid", nid) self.addElement(network, "clusterid", cluster_id) - if hostaddr: - self.addElement(network, "hostaddr", hostaddr) + for host in hostaddr: + self.addElement(network, "hostaddr", host) if port: self.addElement(network, "port", "%d" %(port)) - if tcpbuf: - self.addElement(network, "sendmem", "%d" %(tcpbuf)) - self.addElement(network, "recvmem", "%d" %(tcpbuf)) - if irq_aff: - self.addElement(network, "irqaffinity", "%d" %(irq_aff)) return network @@ -637,12 +633,8 @@ def add_net(gen, lustre, options): if net_type in ('tcp',): port = get_option_int(options, 'port') - tcpbuf = get_option_int(options, 'tcpbuf') - irq_aff = get_option_int(options, 'irq_affinity') - elif net_type in ('elan', 'gm'): + elif net_type in ('elan', 'gm', 'openib','iib'): port = 0 - tcpbuf = 0 - irq_aff = 0 else: print "Unknown net_type: ", net_type 
sys.exit(2) @@ -657,7 +649,7 @@ def add_net(gen, lustre, options): net_name = new_name('NET_'+ node_name +'_'+ net_type) net_uuid = new_uuid(net_name) node.appendChild(gen.network(net_name, net_uuid, nid, cluster_id, net_type, - hostaddr, port, tcpbuf, irq_aff)) + hostaddr, port)) node_add_profile(gen, node, "network", net_uuid) diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 8fb5d2e..49f4bbe 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -36,7 +36,7 @@ do { \ #define CHECK_MEMBER_OFFSET(s,m) \ do { \ - CHECK_VALUE(offsetof(struct s, m)); \ + CHECK_VALUE((int)offsetof(struct s, m)); \ } while(0) #define CHECK_MEMBER_SIZEOF(s,m) \ -- 1.8.3.1