From d6781ceec3d62da95602cd19711bda6c3dc08947 Mon Sep 17 00:00:00 2001 From: ericm Date: Sat, 4 Oct 2003 11:34:42 +0000 Subject: [PATCH] merge b_devel -> b_eq 20031004 kernel only --- lnet/archdep.m4 | 2 +- lnet/include/linux/kp30.h | 154 +- lnet/include/lnet/lnetctl.h | 2 + lnet/include/lnet/ptlctl.h | 2 + lnet/klnds/gmlnd/Makefile.am | 4 +- lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch | 43 - lnet/klnds/gmlnd/gmlnd.h | 510 +- lnet/klnds/gmlnd/gmlnd_cb.c | 659 +-- lnet/klnds/gmlnd/gmnal.c | 284 - lnet/klnds/qswlnd/qswlnd.c | 5 +- lnet/klnds/qswlnd/qswlnd.h | 1 + lnet/klnds/qswlnd/qswlnd_cb.c | 65 +- lnet/klnds/scimaclnd/scimacnal_cb.c | 2 +- lnet/klnds/socklnd/socklnd.c | 167 +- lnet/klnds/socklnd/socklnd.h | 47 +- lnet/klnds/socklnd/socklnd_cb.c | 364 +- lnet/klnds/toelnd/toenal.c | 4 +- lnet/klnds/toelnd/toenal_cb.c | 3 +- lnet/libcfs/Makefile.am | 2 +- lnet/libcfs/debug.c | 88 +- lnet/libcfs/module.c | 120 +- lnet/libcfs/proc.c | 11 +- lnet/lnet/api-init.c | 1 + lnet/lnet/lib-move.c | 1 + lnet/router/router.c | 513 +- lnet/router/router.h | 36 +- lnet/tests/ping_cli.c | 10 +- lnet/tests/ping_srv.c | 4 +- lnet/tests/sping_cli.c | 4 +- lnet/tests/sping_srv.c | 2 +- lnet/tests/startclient.sh | 9 +- lnet/tests/startserver.sh | 9 +- lnet/utils/.cvsignore | 1 + lnet/utils/Makefile.am | 4 +- lnet/utils/parser.c | 2 + lnet/utils/portals.c | 587 ++- lnet/utils/ptlctl.c | 10 +- lustre/Makefile.mk | 1 + .../kernel-2.4.20-rh-2.4-i686-smp.config | 1861 +++++++ .../kernel_configs/linux-2.4.20-rh-i686-smp.config | 1861 +++++++ lustre/kernel_patches/lustre-kernel.spec.in | 113 + .../patches/ext3-htree-2.4.22-rh.patch | 2569 ++++++++++ lustre/kernel_patches/patches/ext3-htree.patch | 2570 ++++++++++ .../patches/ext3-orphan_lock-2.4.22-rh.patch | 158 + .../kernel_patches/patches/ext3-wantedi-2.6.patch | 192 + .../patches/extN-wantedi-2.4.22-rh.patch | 217 + .../patches/iod-stock-exports-2.4.22-rh.patch | 48 + .../patches/socket-exports-2.4.22-rh.patch | 41 + .../patches/tcp-zero-copy-2.4.22-rh.patch | 459 ++ .../patches/vfs_intent-2.4.22-rh.patch | 1809 +++++++ .../patches/xattr-0.8.54-2.4.22-rh.patch | 5403 ++++++++++++++++++++ lustre/kernel_patches/pc/ext3-htree-2.4.22-rh.pc | 11 + .../pc/ext3-orphan_lock-2.4.22-rh.pc | 5 + lustre/kernel_patches/pc/ext3-wantedi-2.6.pc | 6 + lustre/kernel_patches/pc/extN-wantedi-2.4.22-rh.pc | 6 + .../pc/iod-stock-exports-2.4.22-rh.pc | 3 + .../kernel_patches/pc/socket-exports-2.4.22-rh.pc | 3 + .../kernel_patches/pc/tcp-zero-copy-2.4.22-rh.pc | 5 + lustre/kernel_patches/pc/vfs_intent-2.4.22-rh.pc | 12 + lustre/kernel_patches/pc/xattr-0.8.54-2.4.22-rh.pc | 58 + lustre/kernel_patches/series/rh-2.4.22 | 23 + lustre/kernel_patches/targets/rh-2.4.target | 5 + lustre/portals/archdep.m4 | 2 +- lustre/portals/include/linux/kp30.h | 154 +- lustre/portals/include/portals/ptlctl.h | 2 + lustre/portals/knals/gmnal/Makefile.am | 4 +- .../portals/knals/gmnal/gm-1.5.2.1-exports.patch | 43 - lustre/portals/knals/gmnal/gmnal.c | 284 - lustre/portals/knals/gmnal/gmnal.h | 510 +- lustre/portals/knals/gmnal/gmnal_cb.c | 659 +-- lustre/portals/knals/qswnal/qswnal.c | 5 +- lustre/portals/knals/qswnal/qswnal.h | 1 + lustre/portals/knals/qswnal/qswnal_cb.c | 65 +- lustre/portals/knals/scimacnal/scimacnal_cb.c | 2 +- lustre/portals/knals/socknal/socknal.c | 167 +- lustre/portals/knals/socknal/socknal.h | 47 +- lustre/portals/knals/socknal/socknal_cb.c | 364 +- lustre/portals/knals/toenal/toenal.c | 4 +- lustre/portals/knals/toenal/toenal_cb.c | 3 +- lustre/portals/libcfs/Makefile.am | 2 +- lustre/portals/libcfs/debug.c | 88 +- lustre/portals/libcfs/module.c | 120 +- lustre/portals/libcfs/proc.c | 11 +- lustre/portals/portals/api-init.c | 1 + lustre/portals/portals/lib-move.c | 1 + lustre/portals/router/router.c | 513 +- lustre/portals/router/router.h | 36 +- lustre/portals/tests/ping_cli.c | 10 +- lustre/portals/tests/ping_srv.c | 4 +- lustre/portals/tests/sping_cli.c | 4 +- lustre/portals/tests/sping_srv.c | 2 +- lustre/portals/tests/startclient.sh | 9 +- lustre/portals/tests/startserver.sh | 9 +- lustre/portals/utils/.cvsignore | 1 + lustre/portals/utils/Makefile.am | 4 +- lustre/portals/utils/parser.c | 2 + lustre/portals/utils/portals.c | 587 ++- lustre/portals/utils/ptlctl.c | 10 +- lustre/scripts/lmake | 340 ++ lustre/scripts/lustre-kernel-2.4.spec.in | 113 + lustre/scripts/lwizard | 319 ++ lustre/tests/filter_survey.sh | 230 + lustre/tests/ll_sparseness_verify.c | 238 + lustre/tests/ll_sparseness_write.c | 59 + lustre/tests/mrename.c | 19 + lustre/tests/run-llog.sh | 16 + lustre/utils/lfs.c | 180 + lustre/utils/liblustreapi.c | 415 ++ lustre/utils/lwizard | 319 ++ 109 files changed, 24521 insertions(+), 2628 deletions(-) delete mode 100644 lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch delete mode 100644 lnet/klnds/gmlnd/gmnal.c create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/linux-2.4.20-rh-i686-smp.config create mode 100644 lustre/kernel_patches/lustre-kernel.spec.in create mode 100644 lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree.patch create mode 100644 lustre/kernel_patches/patches/ext3-orphan_lock-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/patches/ext3-wantedi-2.6.patch create mode 100644 lustre/kernel_patches/patches/extN-wantedi-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/patches/iod-stock-exports-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/patches/socket-exports-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/patches/xattr-0.8.54-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/pc/ext3-htree-2.4.22-rh.pc create mode 100644 lustre/kernel_patches/pc/ext3-orphan_lock-2.4.22-rh.pc create mode 100644 lustre/kernel_patches/pc/ext3-wantedi-2.6.pc create mode 100644 lustre/kernel_patches/pc/extN-wantedi-2.4.22-rh.pc create mode 100644 lustre/kernel_patches/pc/iod-stock-exports-2.4.22-rh.pc create mode 100644 lustre/kernel_patches/pc/socket-exports-2.4.22-rh.pc create mode 100644 lustre/kernel_patches/pc/tcp-zero-copy-2.4.22-rh.pc create mode 100644 lustre/kernel_patches/pc/vfs_intent-2.4.22-rh.pc create mode 100644 lustre/kernel_patches/pc/xattr-0.8.54-2.4.22-rh.pc create mode 100644 lustre/kernel_patches/series/rh-2.4.22 create mode 100644 lustre/kernel_patches/targets/rh-2.4.target delete mode 100644 lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch delete mode 100644 lustre/portals/knals/gmnal/gmnal.c create mode 100755 lustre/scripts/lmake create mode 100644 lustre/scripts/lustre-kernel-2.4.spec.in create mode 100755 lustre/scripts/lwizard create mode 100644 lustre/tests/filter_survey.sh create mode 100644 lustre/tests/ll_sparseness_verify.c create mode 100644 lustre/tests/ll_sparseness_write.c create mode 100644 lustre/tests/mrename.c create mode 100644 lustre/tests/run-llog.sh create mode 100644 lustre/utils/lfs.c create mode 100644 lustre/utils/liblustreapi.c create mode 100755 lustre/utils/lwizard diff --git a/lnet/archdep.m4 b/lnet/archdep.m4 index 428fe42..0aa83b7 100644 --- a/lnet/archdep.m4 +++ b/lnet/archdep.m4 @@ -286,7 +286,7 @@ if test "${with_gm+set}" = set; then if test "${with_gm}" = yes; then with_gm="-I/usr/local/gm/include" else - with_gm=-I"$with_gm/include" + with_gm="-I$with_gm/include -I$with_gm/drivers -I$with_gm/drivers/linux/gm" fi GMNAL="gmnal" else diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h index a0626cc..69ae296 100644 --- a/lnet/include/linux/kp30.h +++ b/lnet/include/linux/kp30.h @@ -19,6 +19,7 @@ extern unsigned int portal_subsystem_debug; extern unsigned int portal_stack; extern unsigned int portal_debug; extern unsigned int portal_printk; +extern unsigned int portal_cerror; /* Debugging subsystems (32 bits, non-overlapping) */ #define S_UNDEFINED (1 << 0) #define S_MDC (1 << 1) @@ -106,6 +107,8 @@ extern unsigned int portal_printk; #if 1 #define CDEBUG(mask, format, a...) \ do { \ + if (portal_cerror == 0) \ + break; \ CHECK_STACK(CDEBUG_STACK); \ if (!(mask) || ((mask) & (D_ERROR | D_EMERG)) || \ (portal_debug & (mask) && \ @@ -207,8 +210,22 @@ extern void kportal_assertion_failed(char *expr, char *file, const char *func, const int line); #define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__, \ __FUNCTION__, __LINE__)) +/* it would be great to dump_stack() here, but some kernels + * export it as show_stack() and I can't be bothered to + * proprely engage in that dance right now */ +#define LASSERTF(cond, fmt...) \ + do { \ + if (unlikely(!(cond))) { \ + portals_debug_msg(0, D_EMERG, __FILE__, __FUNCTION__,\ + __LINE__, CDEBUG_STACK, \ + "ASSERTION(" #cond ") failed:" fmt);\ + LBUG(); \ + } \ + } while (0) + #else #define LASSERT(e) +#define LASSERTF(cond, fmt...) do { } while (0) #endif #ifdef __arch_um__ @@ -371,12 +388,14 @@ typedef struct { } kpr_fwd_desc_t; typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); +typedef void (*kpr_notify_t)(void *arg, ptl_nid_t peer, int alive); /* NAL's routing interface (Kernel Portals Routing Nal Interface) */ typedef const struct { int kprni_nalid; /* NAL's id */ void *kprni_arg; /* Arg to pass when calling into NAL */ kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */ + kpr_notify_t kprni_notify; /* NAL's notification entrypoint */ } kpr_nal_interface_t; /* Router's routing interface (Kernel Portals Routing Router Interface) */ @@ -386,9 +405,10 @@ typedef const struct { int (*kprri_register) (kpr_nal_interface_t *nal_interface, void **router_arg); - /* ask the router to find a gateway that forwards to 'nid' and is a peer - * of the calling NAL */ - int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, + /* ask the router to find a gateway that forwards to 'nid' and is a + * peer of the calling NAL; assume caller will send 'nob' bytes of + * payload there */ + int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid); /* hand a packet over to the router for forwarding */ @@ -398,6 +418,10 @@ typedef const struct { void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd, int error); + /* notify the router about peer state */ + void (*kprri_notify) (void *router_arg, ptl_nid_t peer, + int alive, time_t when); + /* the calling NAL is shutting down */ void (*kprri_shutdown) (void *router_arg); @@ -416,10 +440,14 @@ typedef struct { typedef const struct { int (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, ptl_nid_t hi_nid); - int (*kprci_del_route)(ptl_nid_t nid); + int (*kprci_del_route)(int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); int (*kprci_get_route)(int index, int *gateway_nal, - ptl_nid_t *gateway, ptl_nid_t *lo_nid, - ptl_nid_t *hi_nid); + ptl_nid_t *gateway, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, + int *alive); + int (*kprci_notify)(int gateway_nal, ptl_nid_t gateway_nid, + int alive, time_t when); } kpr_control_interface_t; extern kpr_control_interface_t kpr_control_interface; @@ -449,12 +477,12 @@ kpr_routing (kpr_router_t *router) } static inline int -kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid) +kpr_lookup (kpr_router_t *router, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid) { if (!kpr_routing (router)) - return (-EHOSTUNREACH); + return (-ENETUNREACH); - return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, + return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, nob, gateway_nid)); } @@ -476,7 +504,7 @@ static inline void kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd) { if (!kpr_routing (router)) - fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH); + fwd->kprfd_callback (fwd->kprfd_callback_arg, -ENETUNREACH); else router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd); } @@ -489,6 +517,16 @@ kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error) } static inline void +kpr_notify (kpr_router_t *router, + ptl_nid_t peer, int alive, time_t when) +{ + if (!kpr_routing (router)) + return; + + router->kpr_interface->kprri_notify(router->kpr_arg, peer, alive, when); +} + +static inline void kpr_shutdown (kpr_router_t *router) { if (kpr_routing (router)) @@ -554,6 +592,7 @@ extern struct prof_ent prof_ents[MAX_PROFS]; #endif /* PORTALS_PROFILING */ /* debug.c */ +void portals_run_upcall(char **argv); void portals_run_lbug_upcall(char * file, const char *fn, const int line); void portals_debug_dumplog(void); int portals_debug_init(unsigned long bufsize); @@ -608,8 +647,10 @@ extern void kportal_blockallsigs (void); # undef NDEBUG # include # define LASSERT(e) assert(e) +# define LASSERTF(cond, args...) assert(cond) # else # define LASSERT(e) +# define LASSERTF(cond, args...) do { } while (0) # endif # define printk(format, args...) printf (format, ## args) # define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0); @@ -624,6 +665,92 @@ extern void kportal_blockallsigs (void); # define CURRENT_TIME time(0) #endif +/******************************************************************************/ +/* Light-weight trace + * Support for temporary event tracing with minimal Heisenberg effect. */ +#define LWT_SUPPORT 1 + +typedef struct { + cycles_t lwte_when; + char *lwte_where; + void *lwte_task; + long lwte_p1; + long lwte_p2; + long lwte_p3; + long lwte_p4; +} lwt_event_t; + +#if LWT_SUPPORT +#ifdef __KERNEL__ +#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t)) + +typedef struct _lwt_page { + struct list_head lwtp_list; + struct page *lwtp_page; + lwt_event_t *lwtp_events; +} lwt_page_t; + +typedef struct { + int lwtc_current_index; + lwt_page_t *lwtc_current_page; +} lwt_cpu_t; + +extern int lwt_enabled; +extern lwt_cpu_t lwt_cpus[]; + +extern int lwt_init (void); +extern void lwt_fini (void); +extern int lwt_lookup_string (int *size, char *knlptr, + char *usrptr, int usrsize); +extern int lwt_control (int enable, int clear); +extern int lwt_snapshot (int *ncpu, int *total_size, + void *user_ptr, int user_size); + +/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set. + * This stuff is meant for finding specific problems; it never stays in + * production code... */ + +#define LWTSTR(n) #n +#define LWTWHERE(f,l) f ":" LWTSTR(l) + +#define LWT_EVENT(p1, p2, p3, p4) \ +do { \ + unsigned long flags; \ + lwt_cpu_t *cpu; \ + lwt_page_t *p; \ + lwt_event_t *e; \ + \ + local_irq_save (flags); \ + \ + if (lwt_enabled) { \ + cpu = &lwt_cpus[smp_processor_id()]; \ + p = cpu->lwtc_current_page; \ + e = &p->lwtp_events[cpu->lwtc_current_index++]; \ + \ + if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) { \ + cpu->lwtc_current_page = \ + list_entry (p->lwtp_list.next, \ + lwt_page_t, lwtp_list); \ + cpu->lwtc_current_index = 0; \ + } \ + \ + e->lwte_when = get_cycles(); \ + e->lwte_where = LWTWHERE(__FILE__,__LINE__); \ + e->lwte_task = current; \ + e->lwte_p1 = (long)(p1); \ + e->lwte_p2 = (long)(p2); \ + e->lwte_p3 = (long)(p3); \ + e->lwte_p4 = (long)(p4); \ + } \ + \ + local_irq_restore (flags); \ +} while (0) +#else /* __KERNEL__ */ +#define LWT_EVENT(p1,p2,p3,p4) /* no userland implementation yet */ +#endif /* __KERNEL__ */ +#endif /* LWT_SUPPORT */ + + #include /* @@ -858,8 +985,11 @@ static inline int portal_ioctl_getdata(char *buf, char *end, void *arg) #define IOC_PORTAL_GET_NID _IOWR('e', 39, long) #define IOC_PORTAL_FAIL_NID _IOWR('e', 40, long) #define IOC_PORTAL_SET_DAEMON _IOWR('e', 41, long) - -#define IOC_PORTAL_MAX_NR 41 +#define IOC_PORTAL_NOTIFY_ROUTER _IOWR('e', 42, long) +#define IOC_PORTAL_LWT_CONTROL _IOWR('e', 43, long) +#define IOC_PORTAL_LWT_SNAPSHOT _IOWR('e', 44, long) +#define IOC_PORTAL_LWT_LOOKUP_STRING _IOWR('e', 45, long) +#define IOC_PORTAL_MAX_NR 45 enum { QSWNAL = 1, diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h index 8278111..7763f1b 100644 --- a/lnet/include/lnet/lnetctl.h +++ b/lnet/include/lnet/lnetctl.h @@ -54,8 +54,10 @@ int jt_ptl_txmem (int argc, char **argv); int jt_ptl_nagle (int argc, char **argv); int jt_ptl_add_route (int argc, char **argv); int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_notify_router (int argc, char **argv); int jt_ptl_print_routes (int argc, char **argv); int jt_ptl_fail_nid (int argc, char **argv); +int jt_ptl_lwt(int argc, char **argv); int dbg_initialize(int argc, char **argv); int jt_dbg_filter(int argc, char **argv); diff --git a/lnet/include/lnet/ptlctl.h b/lnet/include/lnet/ptlctl.h index 8278111..7763f1b 100644 --- a/lnet/include/lnet/ptlctl.h +++ b/lnet/include/lnet/ptlctl.h @@ -54,8 +54,10 @@ int jt_ptl_txmem (int argc, char **argv); int jt_ptl_nagle (int argc, char **argv); int jt_ptl_add_route (int argc, char **argv); int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_notify_router (int argc, char **argv); int jt_ptl_print_routes (int argc, char **argv); int jt_ptl_fail_nid (int argc, char **argv); +int jt_ptl_lwt(int argc, char **argv); int dbg_initialize(int argc, char **argv); int jt_dbg_filter(int argc, char **argv); diff --git a/lnet/klnds/gmlnd/Makefile.am b/lnet/klnds/gmlnd/Makefile.am index 1dc6f4e..bac4680 100644 --- a/lnet/klnds/gmlnd/Makefile.am +++ b/lnet/klnds/gmlnd/Makefile.am @@ -9,5 +9,5 @@ MODULE = kgmnal modulenet_DATA = kgmnal.o EXTRA_PROGRAMS = kgmnal -DEFS = -kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h +DEFS = -DGM_KERNEL +kgmnal_SOURCES = gmnal.h gmnal_api.c gmnal_cb.c gmnal_comm.c gmnal_utils.c gmnal_module.c diff --git a/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch b/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch deleted file mode 100644 index 23c80d9..0000000 --- a/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch +++ /dev/null @@ -1,43 +0,0 @@ -diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c ---- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c Mon Jul 1 10:35:09 2002 -+++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c Thu Sep 19 14:19:38 2002 -@@ -30,6 +30,8 @@ - * - ************************************************************************/ - -+#define EXPORT_SYMTAB -+ - #include - #include - -@@ -4075,6 +4077,28 @@ - return 0; - } - -+EXPORT_SYMBOL(gm_blocking_receive_no_spin); -+EXPORT_SYMBOL(gm_close); -+EXPORT_SYMBOL(gm_dma_free); -+EXPORT_SYMBOL(gm_dma_malloc); -+EXPORT_SYMBOL(gm_drop_sends); -+EXPORT_SYMBOL(gm_finalize); -+EXPORT_SYMBOL(gm_get_node_id); -+EXPORT_SYMBOL(gm_init); -+EXPORT_SYMBOL(gm_initialize_alarm); -+EXPORT_SYMBOL(gm_max_node_id_in_use); -+EXPORT_SYMBOL(gm_min_size_for_length); -+EXPORT_SYMBOL(gm_num_receive_tokens); -+EXPORT_SYMBOL(gm_num_send_tokens); -+EXPORT_SYMBOL(gm_open); -+EXPORT_SYMBOL(gm_provide_receive_buffer); -+EXPORT_SYMBOL(gm_resume_sending); -+EXPORT_SYMBOL(gm_send_with_callback); -+EXPORT_SYMBOL(gm_set_acceptable_sizes); -+EXPORT_SYMBOL(gm_set_alarm); -+EXPORT_SYMBOL(gm_unknown); -+ -+ - /* - This file uses GM standard indentation. - -Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~ -Only in gm-1.5.2.1_Linux-cfs/: trace diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h index 47e8c3c..fdde839 100644 --- a/lnet/klnds/gmlnd/gmlnd.h +++ b/lnet/klnds/gmlnd/gmlnd.h @@ -1,101 +1,455 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2003 Los Alamos National Laboratory (LANL) + * + * This file is part of Lustre, http://www.lustre.org/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#ifndef _GMNAL_H -#define _GMNAL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include + + +/* + * Portals GM kernel NAL header file + * This file makes all declaration and prototypes + * for the API side and CB side of the NAL + */ +#ifndef __INCLUDE_GMNAL_H__ +#define __INCLUDE_GMNAL_H__ + +#include "linux/config.h" +#include "linux/module.h" +#include "linux/tty.h" +#include "linux/kernel.h" +#include "linux/mm.h" +#include "linux/string.h" +#include "linux/stat.h" +#include "linux/errno.h" +#include "linux/locks.h" +#include "linux/unistd.h" +#include "linux/init.h" +#include "linux/sem.h" +#include "linux/vmalloc.h" +#ifdef MODVERSIONS +#include +#endif #define DEBUG_SUBSYSTEM S_GMNAL -#include -#include -#include +#include "portals/nal.h" +#include "portals/api.h" +#include "portals/errno.h" +#include "linux/kp30.h" +#include "portals/p30.h" + +#include "portals/lib-nal.h" +#include "portals/lib-p30.h" + +#define GM_STRONG_TYPES 1 +#include "gm.h" +#include "gm_internal.h" + + +/* + * Defines for the API NAL + */ + +/* + * Small message size is configurable + * insmod can set small_msg_size + * which is used to populate nal_data.small_msg_size + */ +#define GMNAL_SMALL_MESSAGE 1078 +#define GMNAL_LARGE_MESSAGE_INIT 1079 +#define GMNAL_LARGE_MESSAGE_ACK 1080 +#define GMNAL_LARGE_MESSAGE_FINI 1081 + +extern int gmnal_small_msg_size; +extern int num_rx_threads; +extern int num_stxds; +#define GMNAL_SMALL_MSG_SIZE(a) a->small_msg_size +#define GMNAL_IS_SMALL_MESSAGE(n,a,b,c) gmnal_is_small_msg(n, a, b, c) +#define GMNAL_MAGIC 0x1234abcd + + +/* + * Small Transmit Descriptor + * A structre to keep track of a small transmit operation + * This structure has a one-to-one relationship with a small + * transmit buffer (both create by gmnal_stxd_alloc). + * There are two free list of stxd. One for use by clients of the NAL + * and the other by the NAL rxthreads when doing sends. + * This helps prevent deadlock caused by stxd starvation. + */ +typedef struct _gmnal_stxd_t { + void *buffer; + int buffer_size; + gm_size_t gm_size; + int msg_size; + int gm_target_node; + int gm_priority; + int type; + struct _gmnal_data_t *nal_data; + lib_msg_t *cookie; + int niov; + struct iovec iov[PTL_MD_MAX_IOV]; + struct _gmnal_srxd_t *srxd; + struct _gmnal_stxd_t *next; + int rxt; + int kniov; + struct iovec *iovec_dup; +} gmnal_stxd_t; + +/* + * as for gmnal_stxd_t + * a hash table in nal_data find srxds from + * the rx buffer address. hash table populated at init time + */ +typedef struct _gmnal_srxd_t { + void *buffer; + int size; + gm_size_t gmsize; + unsigned int gm_source_node; + gmnal_stxd_t *source_stxd; + int type; + int nsiov; + int nriov; + struct iovec *riov; + int ncallbacks; + spinlock_t callback_lock; + int callback_status; + lib_msg_t *cookie; + struct _gmnal_srxd_t *next; + struct _gmnal_data_t *nal_data; +} gmnal_srxd_t; + +/* + * Header which lmgnal puts at the start of each message + */ +typedef struct _gmnal_msghdr { + int magic; + int type; + unsigned int sender_node_id; + gmnal_stxd_t *stxd; + int niov; + } gmnal_msghdr_t; +#define GMNAL_MSGHDR_SIZE sizeof(gmnal_msghdr_t) + +/* + * the caretaker thread (ct_thread) gets receive events + * (and other events) from the myrinet device via the GM2 API. + * caretaker thread populates one work entry for each receive event, + * puts it on a Q in nal_data and wakes a receive thread to + * process the receive. + * Processing a portals receive can involve a transmit operation. + * Because of this the caretaker thread cannot process receives + * as it may get deadlocked when supply of transmit descriptors + * is exhausted (as caretaker thread is responsible for replacing + * transmit descriptors on the free list) + */ +typedef struct _gmnal_rxtwe { + gm_recv_event_t *rx; + struct _gmnal_rxtwe *next; +} gmnal_rxtwe_t; + +/* + * 1 receive thread started on each CPU + */ +#define NRXTHREADS 10 /* max number of receiver threads */ + +typedef struct _gmnal_data_t { + int refcnt; + spinlock_t cb_lock; + spinlock_t stxd_lock; + struct semaphore stxd_token; + gmnal_stxd_t *stxd; + spinlock_t rxt_stxd_lock; + struct semaphore rxt_stxd_token; + gmnal_stxd_t *rxt_stxd; + spinlock_t srxd_lock; + struct semaphore srxd_token; + gmnal_srxd_t *srxd; + struct gm_hash *srxd_hash; + nal_t *nal; + nal_cb_t *nal_cb; + struct gm_port *gm_port; + unsigned int gm_local_nid; + unsigned int gm_global_nid; + spinlock_t gm_lock; + long rxthread_pid[NRXTHREADS]; + int rxthread_stop_flag; + spinlock_t rxthread_flag_lock; + long rxthread_flag; + long ctthread_pid; + int ctthread_flag; + gm_alarm_t ctthread_alarm; + int small_msg_size; + int small_msg_gmsize; + gmnal_rxtwe_t *rxtwe_head; + gmnal_rxtwe_t *rxtwe_tail; + spinlock_t rxtwe_lock; + struct semaphore rxtwe_wait; +} gmnal_data_t; + +/* + * Flags to start/stop and check status of threads + * each rxthread sets 1 bit (any bit) of the flag on startup + * and clears 1 bit when exiting + */ +#define GMNAL_THREAD_RESET 0 +#define GMNAL_THREAD_STOP 666 +#define GMNAL_CTTHREAD_STARTED 333 +#define GMNAL_RXTHREADS_STARTED ( (1< +/* + * FUNCTION PROTOTYPES + */ + +/* + * Locking macros + */ /* - * Myrinet GM NAL + * For the Small tx and rx descriptor lists */ -#define NPAGES_LARGE 16 -#define NPAGES_SMALL 1 -#define MSG_LEN_LARGE NPAGES_LARGE*PAGE_SIZE -#define MSG_LEN_SMALL NPAGES_SMALL*PAGE_SIZE -#define MSG_SIZE_LARGE (gm_min_size_for_length(MSG_LEN_LARGE)) -#define MSG_SIZE_SMALL (gm_min_size_for_length(MSG_LEN_SMALL)) +#define GMNAL_TXD_LOCK_INIT(a) spin_lock_init(&a->stxd_lock); +#define GMNAL_TXD_LOCK(a) spin_lock(&a->stxd_lock); +#define GMNAL_TXD_UNLOCK(a) spin_unlock(&a->stxd_lock); +#define GMNAL_TXD_TOKEN_INIT(a, n) sema_init(&a->stxd_token, n); +#define GMNAL_TXD_GETTOKEN(a) down(&a->stxd_token); +#define GMNAL_TXD_TRYGETTOKEN(a) down_trylock(&a->stxd_token) +#define GMNAL_TXD_RETURNTOKEN(a) up(&a->stxd_token); -#define TXMSGS 64 /* Number of Transmit Messages */ -#define ENVELOPES 8 /* Number of outstanding receive msgs */ +#define GMNAL_RXT_TXD_LOCK_INIT(a) spin_lock_init(&a->rxt_stxd_lock); +#define GMNAL_RXT_TXD_LOCK(a) spin_lock(&a->rxt_stxd_lock); +#define GMNAL_RXT_TXD_UNLOCK(a) spin_unlock(&a->rxt_stxd_lock); +#define GMNAL_RXT_TXD_TOKEN_INIT(a, n) sema_init(&a->rxt_stxd_token, n); +#define GMNAL_RXT_TXD_GETTOKEN(a) down(&a->rxt_stxd_token); +#define GMNAL_RXT_TXD_TRYGETTOKEN(a) down_trylock(&a->rxt_stxd_token) +#define GMNAL_RXT_TXD_RETURNTOKEN(a) up(&a->rxt_stxd_token); -#define KGM_PORT_NUM 3 -#define KGM_HOSTNAME "kgmnal" +#define GMNAL_RXD_LOCK_INIT(a) spin_lock_init(&a->srxd_lock); +#define GMNAL_RXD_LOCK(a) spin_lock(&a->srxd_lock); +#define GMNAL_RXD_UNLOCK(a) spin_unlock(&a->srxd_lock); +#define GMNAL_RXD_TOKEN_INIT(a, n) sema_init(&a->srxd_token, n); +#define GMNAL_RXD_GETTOKEN(a) down(&a->srxd_token); +#define GMNAL_RXD_TRYGETTOKEN(a) down_trylock(&a->srxd_token) +#define GMNAL_RXD_RETURNTOKEN(a) up(&a->srxd_token); +#define GMNAL_GM_LOCK_INIT(a) spin_lock_init(&a->gm_lock); +#define GMNAL_GM_LOCK(a) spin_lock(&a->gm_lock); +#define GMNAL_GM_UNLOCK(a) spin_unlock(&a->gm_lock); +#define GMNAL_CB_LOCK_INIT(a) spin_lock_init(&a->cb_lock); -typedef struct { - char *krx_buffer; - unsigned long krx_len; - unsigned int krx_size; - unsigned int krx_priority; - struct list_head krx_item; -} kgmnal_rx_t; +/* + * Memory Allocator + */ + +/* + * API NAL + */ +int gmnal_api_forward(nal_t *, int, void *, size_t, void *, size_t); + +int gmnal_api_shutdown(nal_t *, int); + +int gmnal_api_validate(nal_t *, void *, size_t); + +void gmnal_api_yield(nal_t *); + +void gmnal_api_lock(nal_t *, unsigned long *); + +void gmnal_api_unlock(nal_t *, unsigned long *); + + +#define GMNAL_INIT_NAL(a) do { \ + a->forward = gmnal_api_forward; \ + a->shutdown = gmnal_api_shutdown; \ + a->validate = NULL; \ + a->yield = gmnal_api_yield; \ + a->lock = gmnal_api_lock; \ + a->unlock = gmnal_api_unlock; \ + a->timeout = NULL; \ + a->refct = 1; \ + a->nal_data = NULL; \ + } while (0) + + +/* + * CB NAL + */ + +int gmnal_cb_send(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, + int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t); + +int gmnal_cb_send_pages(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, + int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t); + +int gmnal_cb_recv(nal_cb_t *, void *, lib_msg_t *, + unsigned int, struct iovec *, size_t, size_t); + +int gmnal_cb_recv_pages(nal_cb_t *, void *, lib_msg_t *, + unsigned int, ptl_kiov_t *, size_t, size_t); + +int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t); + +int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t); + +int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *); + +void *gmnal_cb_malloc(nal_cb_t *, size_t); + +void gmnal_cb_free(nal_cb_t *, void *, size_t); + +void gmnal_cb_unmap(nal_cb_t *, unsigned int, struct iovec*, void **); + +int gmnal_cb_map(nal_cb_t *, unsigned int, struct iovec*, void **); + +void gmnal_cb_printf(nal_cb_t *, const char *fmt, ...); + +void gmnal_cb_cli(nal_cb_t *, unsigned long *); + +void gmnal_cb_sti(nal_cb_t *, unsigned long *); + +int gmnal_cb_dist(nal_cb_t *, ptl_nid_t, unsigned long *); + +nal_t *gmnal_init(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t rpid); + +void gmnal_fini(void); + + + +#define GMNAL_INIT_NAL_CB(a) do { \ + a->cb_send = gmnal_cb_send; \ + a->cb_send_pages = gmnal_cb_send_pages; \ + a->cb_recv = gmnal_cb_recv; \ + a->cb_recv_pages = gmnal_cb_recv_pages; \ + a->cb_read = gmnal_cb_read; \ + a->cb_write = gmnal_cb_write; \ + a->cb_callback = gmnal_cb_callback; \ + a->cb_malloc = gmnal_cb_malloc; \ + a->cb_free = gmnal_cb_free; \ + a->cb_map = NULL; \ + a->cb_unmap = NULL; \ + a->cb_printf = gmnal_cb_printf; \ + a->cb_cli = gmnal_cb_cli; \ + a->cb_sti = gmnal_cb_sti; \ + a->cb_dist = gmnal_cb_dist; \ + a->nal_data = NULL; \ + } while (0) + + +/* + * Small Transmit and Receive Descriptor Functions + */ +int gmnal_alloc_stxd(gmnal_data_t *); +void gmnal_free_stxd(gmnal_data_t *); +gmnal_stxd_t* gmnal_get_stxd(gmnal_data_t *, int); +void gmnal_return_stxd(gmnal_data_t *, gmnal_stxd_t *); + +int gmnal_alloc_srxd(gmnal_data_t *); +void gmnal_free_srxd(gmnal_data_t *); +gmnal_srxd_t* gmnal_get_srxd(gmnal_data_t *, int); +void gmnal_return_srxd(gmnal_data_t *, gmnal_srxd_t *); + +/* + * general utility functions + */ +gmnal_srxd_t *gmnal_rxbuffer_to_srxd(gmnal_data_t *, void*); +void gmnal_stop_rxthread(gmnal_data_t *); +void gmnal_stop_ctthread(gmnal_data_t *); +void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); +void gmnal_drop_sends_callback(gm_port_t *, void *, gm_status_t); +char *gmnal_gm_error(gm_status_t); +char *gmnal_rxevent(gm_recv_event_t*); +int gmnal_is_small_msg(gmnal_data_t*, int, struct iovec*, int); +void gmnal_yield(int); +int gmnal_start_kernel_threads(gmnal_data_t *); + + +/* + * Communication functions + */ + +/* + * Receive threads + */ +int gmnal_ct_thread(void *); /* caretaker thread */ +int gmnal_rx_thread(void *); /* receive thread */ +int gmnal_pre_receive(gmnal_data_t*, gm_recv_t*, int); +int gmnal_rx_bad(gmnal_data_t *, gm_recv_t *, gmnal_srxd_t *); +int gmnal_rx_requeue_buffer(gmnal_data_t *, gmnal_srxd_t *); +int gmnal_add_rxtwe(gmnal_data_t *, gm_recv_event_t *); +gmnal_rxtwe_t * gmnal_get_rxtwe(gmnal_data_t *); +void gmnal_remove_rxtwe(gmnal_data_t *); + + +/* + * Small messages + */ +int gmnal_small_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int, + struct iovec *, size_t, size_t); +int gmnal_small_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, + int, ptl_nid_t, ptl_pid_t, + unsigned int, struct iovec*, int); +void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); + + + +/* + * Large messages + */ +int gmnal_large_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int, + struct iovec *, size_t, size_t); -typedef struct { - nal_cb_t *ktx_nal; - void *ktx_private; - lib_msg_t *ktx_cookie; - char *ktx_buffer; - size_t ktx_len; - unsigned long ktx_size; - int ktx_ndx; - unsigned int ktx_priority; - unsigned int ktx_tgt_node; - unsigned int ktx_tgt_port_id; -} kgmnal_tx_t; +int gmnal_large_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, + int, ptl_nid_t, ptl_pid_t, unsigned int, + struct iovec*, int); +void gmnal_large_tx_callback(gm_port_t *, void *, gm_status_t); -typedef struct { - char kgm_init; - char kgm_shuttingdown; - struct gm_port *kgm_port; - struct list_head kgm_list; - ptl_nid_t kgm_nid; - nal_cb_t *kgm_cb; - struct kgm_trans *kgm_trans; - struct tq_struct kgm_ready_tq; - spinlock_t kgm_dispatch_lock; - spinlock_t kgm_update_lock; - spinlock_t kgm_send_lock; -} kgmnal_data_t; +int gmnal_remote_get(gmnal_srxd_t *, int, struct iovec*, int, + struct iovec*); -int kgm_init(kgmnal_data_t *kgm_data); -int kgmnal_recv_thread(void *); -int gm_return_mynid(void); -void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +void gmnal_remote_get_callback(gm_port_t *, void *, gm_status_t); -extern kgmnal_data_t kgmnal_data; -extern nal_t kgmnal_api; -extern nal_cb_t kgmnal_lib; +int gmnal_copyiov(int, gmnal_srxd_t *, int, struct iovec*, int, + struct iovec*); -#endif /* _GMNAL_H */ +void gmnal_large_tx_ack(gmnal_data_t *, gmnal_srxd_t *); +void gmnal_large_tx_ack_callback(gm_port_t *, void *, gm_status_t); +void gmnal_large_tx_ack_received(gmnal_data_t *, gmnal_srxd_t *); +#endif /*__INCLUDE_GMNAL_H__*/ diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c index 3d4c86d..093ee64 100644 --- a/lnet/klnds/gmlnd/gmlnd_cb.c +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -1,517 +1,290 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Based on ksocknal and qswnal + * Copyright (c) 2003 Los Alamos National Laboratory (LANL) * - * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Robert Read + * This file is part of Lustre, http://www.lustre.org/ * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or + * Lustre is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * - * Portals is distributed in the hope that it will be useful, + * Lustre is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software + * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/* TODO - * preallocate send buffers, store on list - * put receive buffers on queue, handle with receive threads - * use routing - */ - -#include "gmnal.h" - -extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int); - -static kgmnal_tx_t * -get_trans(void) -{ - kgmnal_tx_t *t; - PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t))); - return t; -} - -static void -put_trans(kgmnal_tx_t *t) -{ - PORTAL_FREE(t, sizeof(kgmnal_tx_t)); -} - -int -kgmnal_ispeer (ptl_nid_t nid) -{ - unsigned int gmnid = (unsigned int)nid; - unsigned int nnids; - - gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids); - - return ((ptl_nid_t)gmnid == nid &&/* didn't lose high bits on conversion ? */ - gmnid < nnids); /* it's in this machine */ -} /* - * LIB functions follow - * + * This file implements the nal cb functions */ -static int -kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, - size_t len) -{ - CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n", - nal->ni.nid, (long)len, src_addr, dst_addr ); - memcpy( dst_addr, src_addr, len ); - return 0; -} - -static int -kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, - size_t len) -{ - CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n", - nal->ni.nid, (long)len, src_addr, dst_addr ); - memcpy( dst_addr, src_addr, len ); - return 0; -} -static void * -kgmnal_malloc(nal_cb_t *nal, size_t len) -{ - void *buf; - PORTAL_ALLOC(buf, len); - return buf; -} +#include "gmnal.h" -static void -kgmnal_free(nal_cb_t *nal, void *buf, size_t len) +int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, + unsigned int niov, struct iovec *iov, size_t mlen, + size_t rlen) { - PORTAL_FREE(buf, len); + gmnal_srxd_t *srxd = (gmnal_srxd_t*)private; + int status = PTL_OK; + + + CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p], + niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", + nal_cb, private, cookie, niov, iov, mlen, rlen); + + switch(srxd->type) { + case(GMNAL_SMALL_MESSAGE): + CDEBUG(D_INFO, "gmnal_cb_recv got small message\n"); + status = gmnal_small_rx(nal_cb, private, cookie, niov, + iov, mlen, rlen); + break; + case(GMNAL_LARGE_MESSAGE_INIT): + CDEBUG(D_INFO, "gmnal_cb_recv got large message init\n"); + status = gmnal_large_rx(nal_cb, private, cookie, niov, + iov, mlen, rlen); + } + + + CDEBUG(D_INFO, "gmnal_cb_recv gmnal_return status [%d]\n", status); + return(status); } -static void -kgmnal_printf(nal_cb_t *nal, const char *fmt, ...) +int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, + unsigned int kniov, ptl_kiov_t *kiov, size_t mlen, + size_t rlen) { - va_list ap; - char msg[256]; - - if (portal_debug & D_NET) { - va_start( ap, fmt ); - vsnprintf( msg, sizeof(msg), fmt, ap ); - va_end( ap ); - - printk("CPUId: %d %s",smp_processor_id(), msg); - } + gmnal_srxd_t *srxd = (gmnal_srxd_t*)private; + int status = PTL_OK; + struct iovec *iovec = NULL, *iovec_dup = NULL; + int i = 0; + + + CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p], + cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", + nal_cb, private, cookie, kniov, kiov, mlen, rlen); + + if (srxd->type == GMNAL_SMALL_MESSAGE) { + PORTAL_ALLOC(iovec, sizeof(struct iovec)*kniov); + if (!iovec) { + CDEBUG(D_ERROR, "Can't malloc\n"); + return(GMNAL_STATUS_FAIL); + } + iovec_dup = iovec; + + /* + * map each page and create an iovec for it + */ + for (i=0; ikiov_page, kiov->kiov_len, + kiov->kiov_offset); + iovec->iov_len = kiov->kiov_len; + CDEBUG(D_INFO, "Calling kmap[%p]", kiov->kiov_page); + + iovec->iov_base = kmap(kiov->kiov_page) + + kiov->kiov_offset; + + CDEBUG(D_INFO, "iov_base is [%p]\n", iovec->iov_base); + iovec++; + kiov++; + } + CDEBUG(D_INFO, "calling gmnal_small_rx\n"); + status = gmnal_small_rx(nal_cb, private, cookie, kniov, + iovec_dup, mlen, rlen); + PORTAL_FREE(iovec_dup, sizeof(struct iovec)*kniov); + } + + + CDEBUG(D_INFO, "gmnal_return status [%d]\n", status); + return(status); } -static void -kgmnal_cli(nal_cb_t *nal, unsigned long *flags) +int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, struct iovec *iov, size_t len) { - kgmnal_data_t *data= nal->nal_data; - spin_lock_irqsave(&data->kgm_dispatch_lock,*flags); + gmnal_data_t *nal_data; + + + CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] len["LPSZ"] nid["LPU64"]\n", + niov, len, nid); + nal_data = nal_cb->nal_data; + + if (GMNAL_IS_SMALL_MESSAGE(nal_data, niov, iov, len)) { + CDEBUG(D_INFO, "This is a small message send\n"); + gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, pid, + niov, iov, len); + } else { + CDEBUG(D_ERROR, "Large message send it is not supported\n"); + lib_finalize(nal_cb, private, cookie); + return(PTL_FAIL); + gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, + niov, iov, len); + } + return(PTL_OK); } - -static void -kgmnal_sti(nal_cb_t *nal, unsigned long *flags) +int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int kniov, ptl_kiov_t *kiov, size_t len) { - kgmnal_data_t *data= nal->nal_data; - spin_unlock_irqrestore(&data->kgm_dispatch_lock,*flags); + int i = 0; + gmnal_data_t *nal_data; + struct iovec *iovec = NULL, *iovec_dup = NULL; + + CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] len["LPSZ"]\n", nid, kniov, len); + nal_data = nal_cb->nal_data; + PORTAL_ALLOC(iovec, kniov*sizeof(struct iovec)); + iovec_dup = iovec; + if (GMNAL_IS_SMALL_MESSAGE(nal_data, 0, NULL, len)) { + CDEBUG(D_INFO, "This is a small message send\n"); + + for (i=0; ikiov_page, kiov->kiov_len, + kiov->kiov_offset); + + iovec->iov_base = kmap(kiov->kiov_page) + + kiov->kiov_offset; + + iovec->iov_len = kiov->kiov_len; + iovec++; + kiov++; + } + gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, + pid, kniov, iovec_dup, len); + } else { + CDEBUG(D_ERROR, "Large message send it is not supported yet\n"); + return(PTL_FAIL); + for (i=0; ikiov_page, kiov->kiov_len, + kiov->kiov_offset); + + iovec->iov_base = kmap(kiov->kiov_page) + + kiov->kiov_offset; + iovec->iov_len = kiov->kiov_len; + iovec++; + kiov++; + } + gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, + pid, kniov, iovec, len); + } + PORTAL_FREE(iovec_dup, kniov*sizeof(struct iovec)); + return(PTL_OK); } - -static int -kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +int gmnal_cb_read(nal_cb_t *nal_cb, void *private, void *dst, + user_ptr src, size_t len) { - /* network distance doesn't mean much for this nal */ - if ( nal->ni.nid == nid ) { - *dist = 0; - } else { - *dist = 1; - } - - return 0; + gm_bcopy(src, dst, len); + return(PTL_OK); } -/* FIXME rmr: add rounting code here */ -static void -kgmnal_tx_done(kgmnal_tx_t *trans, int error) -{ - lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie); - - gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer); - - trans->ktx_buffer = NULL; - trans->ktx_len = 0; - - put_trans(trans); -} -static char * gm_error_strings[GM_NUM_STATUS_CODES] = { - [GM_SUCCESS] = "GM_SUCCESS", - [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT", - [GM_SEND_REJECTED] = "GM_SEND_REJECTED", - [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED", - [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE", - [GM_SEND_DROPPED] = "GM_SEND_DROPPED", - [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED", -}; - -inline char * get_error(int status) +int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst, + void *src, size_t len) { - if (gm_error_strings[status] != NULL) - return gm_error_strings[status]; - else - return "Unknown error"; + gm_bcopy(src, dst, len); + return(PTL_OK); } -static void -kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status) +int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, + ptl_event_t *ev) { - CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status); -} -static void -kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status) -{ - kgmnal_tx_t *ktx = (kgmnal_tx_t *)context; - int err = 0; - - LASSERT (p != NULL); - LASSERT (ktx != NULL); - - CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status, - ktx->ktx_tgt_node, ktx->ktx_tgt_port_id); - - switch((int)status) { - case GM_SUCCESS: /* normal */ - break; - case GM_SEND_TIMED_OUT: /* application error */ - case GM_SEND_REJECTED: /* size of msg unacceptable */ - case GM_SEND_TARGET_PORT_CLOSED: - CERROR("%s (%d):\n", get_error(status), status); - gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority, - ktx->ktx_tgt_node, ktx->ktx_tgt_port_id, - kgmnal_errhandler, NULL); - err = -EIO; - break; - case GM_SEND_TARGET_NODE_UNREACHABLE: - case GM_SEND_PORT_CLOSED: - CERROR("%s (%d):\n", get_error(status), status); - gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority, - ktx->ktx_tgt_node, ktx->ktx_tgt_port_id, - kgmnal_errhandler, NULL); - err = -EIO; - break; - case GM_SEND_DROPPED: - CERROR("%s (%d):\n", get_error(status), status); - err = -EIO; - break; - default: - CERROR("Unknown status: %d\n", status); - err = -EIO; - break; - } - - kgmnal_tx_done(ktx, err); + if (eq->event_callback != NULL) { + CDEBUG(D_INFO, "found callback\n"); + eq->event_callback(ev); + } + + return(PTL_OK); } -/* - */ - -static int -kgmnal_send(nal_cb_t *nal, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - int options, - unsigned int niov, - lib_md_iov_t *iov, - size_t len) +void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len) { - /* - * ipnal assumes that this is the private as passed to lib_dispatch.. - * so do we :/ - */ - kgmnal_tx_t *ktx=NULL; - int rc=0; - void * buf; - int buf_len = sizeof(ptl_hdr_t) + len; - int buf_size = 0; - - LASSERT ((options & PTL_MD_KIOV) == 0); - - PROF_START(gmnal_send); - - - CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n", - len, iov, nid, KGM_PORT_NUM); - - /* ensure there is an available tx handle */ - - /* save transaction info to trans for later finalize and cleanup */ - ktx = get_trans(); - if (ktx == NULL) { - rc = -ENOMEM; - goto send_exit; - } - - /* hmmm... GM doesn't support vectored write, so need to allocate buffer to coalesce - header and data. - Also, memory must be dma'able or registered with GM. */ - - if (buf_len <= MSG_LEN_SMALL) { - buf_size = MSG_SIZE_SMALL; - } else if (buf_len <= MSG_LEN_LARGE) { - buf_size = MSG_SIZE_LARGE; - } else { - printk("kgmnal:request exceeds TX MTU size (%d).\n", - MSG_SIZE_LARGE); - rc = -1; - goto send_exit; - } - - buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len); - if (buf == NULL) { - rc = -ENOMEM; - goto send_exit; - } - memcpy(buf, hdr, sizeof(ptl_hdr_t)); - - if (len != 0) - lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t), - options, niov, iov, len); - - ktx->ktx_nal = nal; - ktx->ktx_private = private; - ktx->ktx_cookie = cookie; - ktx->ktx_len = buf_len; - ktx->ktx_size = buf_size; - ktx->ktx_buffer = buf; - ktx->ktx_priority = GM_LOW_PRIORITY; - ktx->ktx_tgt_node = nid; - ktx->ktx_tgt_port_id = KGM_PORT_NUM; - - CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx " - "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM, - GM_LOW_PRIORITY); - - gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size, - buf_len, GM_LOW_PRIORITY, - nid, KGM_PORT_NUM, - kgmnal_txhandler, ktx); - - PROF_FINISH(gmnal_send); - send_exit: - return rc; -} -void -kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) -{ - CERROR ("forwarding not implemented\n"); + void *ptr = NULL; + CDEBUG(D_TRACE, "gmnal_cb_malloc len["LPSZ"]\n", len); + PORTAL_ALLOC(ptr, len); + return(ptr); } -void -kqswnal_fwd_callback (void *arg, int error) +void gmnal_cb_free(nal_cb_t *nal_cb, void *buf, size_t len) { - CERROR ("forwarding not implemented\n"); + CDEBUG(D_TRACE, "gmnal_cb_free :: buf[%p] len["LPSZ"]\n", buf, len); + PORTAL_FREE(buf, len); + return; } - -static inline void -kgmnal_requeue_rx(kgmnal_rx_t *krx) +void gmnal_cb_unmap(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov, + void **addrkey) { - gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer, - krx->krx_size, krx->krx_priority); + return; } -/* Process a received portals packet */ - -/* Receive Interrupt Handler */ -static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size, - void * buf, unsigned int pri) +int gmnal_cb_map(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov, + void**addrkey) { - ptl_hdr_t *hdr = buf; - kgmnal_rx_t krx; - - CDEBUG(D_NET,"buf %p, len %ld\n", buf, len); - - if ( len < sizeof( ptl_hdr_t ) ) { - /* XXX what's this for? */ - if (kgm->kgm_shuttingdown) - return; - CERROR("kgmnal: did not receive complete portal header, " - "len= %ld", len); - gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri); - return; - } - - /* might want to use seperate threads to handle receive */ - krx.krx_buffer = buf; - krx.krx_len = len; - krx.krx_size = size; - krx.krx_priority = pri; - - if ( hdr->dest_nid == kgmnal_lib.ni.nid ) { - PROF_START(lib_parse); - lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx); - PROF_FINISH(lib_parse); - } else if (kgmnal_ispeer(hdr->dest_nid)) { - /* should have gone direct to peer */ - CERROR("dropping packet from 0x%llx to 0x%llx: target is " - "a peer", hdr->src_nid, hdr->dest_nid); - kgmnal_requeue_rx(&krx); - } else { - /* forward to gateway */ - CERROR("forwarding not implemented yet"); - kgmnal_requeue_rx(&krx); - } - - return; + return(PTL_OK); } - -static int kgmnal_recv(nal_cb_t *nal, - void *private, - lib_msg_t *cookie, - int options, - unsigned int niov, - lib_md_iov_t *iov, - size_t mlen, - size_t rlen) +void gmnal_cb_printf(nal_cb_t *nal_cb, const char *fmt, ...) { - kgmnal_rx_t *krx = private; - - LASSERT ((options & PTL_MD_KIOV) == 0); - - CDEBUG(D_NET,"mlen=%d, rlen=%d\n", mlen, rlen); - - /* What was actually received must be >= what sender claims to - * have sent. This is an LASSERT, since lib-move doesn't - * check cb return code yet. */ - LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen); - LASSERT (mlen <= rlen); - - PROF_START(gmnal_recv); - - if(mlen != 0) { - PROF_START(memcpy); - lib_copy_buf2iov (options, niov, iov, - krx->krx_buffer + sizeof (ptl_hdr_t), mlen); - PROF_FINISH(memcpy); - } - - PROF_START(lib_finalize); - lib_finalize(nal, private, cookie); - PROF_FINISH(lib_finalize); - - kgmnal_requeue_rx(krx); - - PROF_FINISH(gmnal_recv); - - return rlen; + CDEBUG(D_TRACE, "gmnal_cb_printf\n"); + printk(fmt); + return; } - -static void kgmnal_shutdown(void * none) +void gmnal_cb_cli(nal_cb_t *nal_cb, unsigned long *flags) { - CERROR("called\n"); - return; + gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; + + spin_lock_irqsave(&nal_data->cb_lock, *flags); + return; } -/* - * Set terminate and use alarm to wake up the recv thread. - */ -static void recv_shutdown(kgmnal_data_t *kgm) +void gmnal_cb_sti(nal_cb_t *nal_cb, unsigned long *flags) { - gm_alarm_t alarm; + gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; - kgm->kgm_shuttingdown = 1; - gm_initialize_alarm(&alarm); - gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL); + spin_unlock_irqrestore(&nal_data->cb_lock, *flags); + return; } -int kgmnal_end(kgmnal_data_t *kgm) +int gmnal_cb_dist(nal_cb_t *nal_cb, ptl_nid_t nid, unsigned long *dist) { + CDEBUG(D_TRACE, "gmnal_cb_dist\n"); + if (dist) + *dist = 27; + return(PTL_OK); +} - /* wait for sends to finish ? */ - /* remove receive buffers */ - /* shutdown receive thread */ - recv_shutdown(kgm); - return 0; -} - -/* Used only for the spinner */ -int kgmnal_recv_thread(void *arg) -{ - kgmnal_data_t *kgm = arg; - - LASSERT(kgm != NULL); - - kportal_daemonize("kgmnal_rx"); - - while(1) { - gm_recv_event_t *e; - int priority = GM_LOW_PRIORITY; - if (kgm->kgm_shuttingdown) - break; - - e = gm_blocking_receive_no_spin(kgm->kgm_port); - if (e == NULL) { - CERROR("gm_blocking_receive returned NULL\n"); - break; - } - - switch(gm_ntohc(e->recv.type)) { - case GM_HIGH_RECV_EVENT: - priority = GM_HIGH_PRIORITY; - /* fall through */ - case GM_RECV_EVENT: - kgmnal_rx(kgm, gm_ntohl(e->recv.length), - gm_ntohc(e->recv.size), - gm_ntohp(e->recv.buffer), priority); - break; - case GM_ALARM_EVENT: - CERROR("received alarm"); - gm_unknown(kgm->kgm_port, e); - break; - case GM_BAD_SEND_DETECTED_EVENT: /* ?? */ - CERROR("received bad send!\n"); - break; - default: - gm_unknown(kgm->kgm_port, e); - } - } - - CERROR("shuttting down.\n"); - return 0; -} -nal_cb_t kgmnal_lib = { - nal_data: &kgmnal_data, /* NAL private data */ - cb_send: kgmnal_send, - cb_recv: kgmnal_recv, - cb_read: kgmnal_read, - cb_write: kgmnal_write, - cb_malloc: kgmnal_malloc, - cb_free: kgmnal_free, - cb_printf: kgmnal_printf, - cb_cli: kgmnal_cli, - cb_sti: kgmnal_sti, - cb_dist: kgmnal_dist -}; +EXPORT_SYMBOL(gmnal_cb_send); +EXPORT_SYMBOL(gmnal_cb_send_pages); +EXPORT_SYMBOL(gmnal_cb_recv); +EXPORT_SYMBOL(gmnal_cb_recv_pages); +EXPORT_SYMBOL(gmnal_cb_read); +EXPORT_SYMBOL(gmnal_cb_write); +EXPORT_SYMBOL(gmnal_cb_cli); +EXPORT_SYMBOL(gmnal_cb_sti); +EXPORT_SYMBOL(gmnal_cb_dist); +EXPORT_SYMBOL(gmnal_cb_printf); +EXPORT_SYMBOL(gmnal_cb_map); +EXPORT_SYMBOL(gmnal_cb_unmap); +EXPORT_SYMBOL(gmnal_cb_callback); +EXPORT_SYMBOL(gmnal_cb_free); +EXPORT_SYMBOL(gmnal_cb_malloc); diff --git a/lnet/klnds/gmlnd/gmnal.c b/lnet/klnds/gmlnd/gmnal.c deleted file mode 100644 index 0cffc158..0000000 --- a/lnet/klnds/gmlnd/gmnal.c +++ /dev/null @@ -1,284 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Based on ksocknal and qswnal - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Robert Read - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include "gmnal.h" - -ptl_handle_ni_t kgmnal_ni; -nal_t kgmnal_api; - -kgmnal_data_t kgmnal_data; -int gmnal_debug = 0; - -kpr_nal_interface_t kqswnal_router_interface = { - kprni_nalid: GMNAL, - kprni_arg: NULL, - kprni_fwd: kgmnal_fwd_packet, -}; - -static int kgmnal_forward(nal_t *nal, - int id, - void *args, size_t args_len, - void *ret, size_t ret_len) -{ - kgmnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kgm_cb; - - LASSERT (nal == &kgmnal_api); - LASSERT (k == &kgmnal_data); - LASSERT (nal_cb == &kgmnal_lib); - - lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */ - return PTL_OK; -} - -static void kgmnal_lock(nal_t *nal, unsigned long *flags) -{ - kgmnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kgm_cb; - - - LASSERT (nal == &kgmnal_api); - LASSERT (k == &kgmnal_data); - LASSERT (nal_cb == &kgmnal_lib); - - nal_cb->cb_cli(nal_cb,flags); -} - -static void kgmnal_unlock(nal_t *nal, unsigned long *flags) -{ - kgmnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kgm_cb; - - - LASSERT (nal == &kgmnal_api); - LASSERT (k == &kgmnal_data); - LASSERT (nal_cb == &kgmnal_lib); - - nal_cb->cb_sti(nal_cb,flags); -} - -static int kgmnal_shutdown(nal_t *nal, int ni) -{ - LASSERT (nal == &kgmnal_api); - return 0; -} - -static void kgmnal_yield( nal_t *nal ) -{ - LASSERT (nal == &kgmnal_api); - - if (current->need_resched) - schedule(); - return; -} - -kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx) -{ - kgmnal_rx_t *conn; - - PORTAL_ALLOC(conn, sizeof(kgmnal_rx_t)); - /* Check for out of mem here */ - if (conn==NULL) { - printk("kgm_add_recv: memory alloc failed\n"); - return NULL; - } - - list_add(&conn->krx_item,(struct list_head *)&data->kgm_list); - // conn->ndx=ndx; - // conn->len=conn->ptlhdr_copied=0; - // conn->loopback=0; - return conn; -} - -static nal_t *kgmnal_init(int interface, ptl_pt_index_t ptl_size, - ptl_ac_index_t ac_size, ptl_pid_t requested_pid) -{ - unsigned int nnids; - - gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids); - - CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n", - kgmnal_data.kgm_nid, nnids); - lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, nnids,ptl_size, ac_size); - return &kgmnal_api; -} - -static void /*__exit*/ -kgmnal_finalize(void) -{ - struct list_head *tmp; - - PORTAL_SYMBOL_UNREGISTER (kgmnal_ni); - PtlNIFini(kgmnal_ni); - lib_fini(&kgmnal_api); - - if (kgmnal_data.kgm_port) { - gm_close(kgmnal_data.kgm_port); - } - - /* FIXME: free dma buffers */ - /* FIXME: kill receiver thread */ - - PORTAL_FREE (kgmnal_data.kgm_trans, bsizeof(kgmnal_tx_t)*TXMSGS); - - list_for_each(tmp, &kgmnal_data.kgm_list) { - kgmnal_rx_t *conn; - conn = list_entry(tmp, kgmnal_rx_t, krx_item); - CDEBUG(D_IOCTL, "freeing conn %p\n",conn); - tmp = tmp->next; - list_del(&conn->krx_item); - PORTAL_FREE(conn, sizeof(*conn)); - } - - CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory)); - - return; -} - -static int __init -kgmnal_initialize(void) -{ - int rc; - int ntok; - unsigned long sizemask; - unsigned int nid; - - CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory)); - - kgmnal_api.forward = kgmnal_forward; - kgmnal_api.shutdown = kgmnal_shutdown; - kgmnal_api.yield = kgmnal_yield; - kgmnal_api.validate = NULL; /* our api validate is a NOOP */ - kgmnal_api.lock= kgmnal_lock; - kgmnal_api.unlock= kgmnal_unlock; - kgmnal_api.nal_data = &kgmnal_data; - - kgmnal_lib.nal_data = &kgmnal_data; - - memset(&kgmnal_data, 0, sizeof(kgmnal_data)); - - INIT_LIST_HEAD(&kgmnal_data.kgm_list); - kgmnal_data.kgm_cb = &kgmnal_lib; - - /* Allocate transmit descriptors */ - PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS); - if (kgmnal_data.kgm_trans==NULL) { - printk("kgmnal: init: failed to allocate transmit " - "descriptors\n"); - return -1; - } - memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS)); - - spin_lock_init(&kgmnal_data.kgm_dispatch_lock); - spin_lock_init(&kgmnal_data.kgm_update_lock); - spin_lock_init(&kgmnal_data.kgm_send_lock); - - /* Do the receiver and xmtr allocation */ - - rc = gm_init(); - if (rc != GM_SUCCESS) { - CERROR("gm_init failed: %d\n", rc); - return -1; - } - - rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME, - GM_API_VERSION_1_1); - if (rc != GM_SUCCESS) { - gm_finalize(); - kgmnal_data.kgm_port = NULL; - CERROR("gm_open failed: %d\n", rc); - return -1; - } - gm_get_node_id(kgmnal_data.kgm_port, &nid); - kgmnal_data.kgm_nid = nid; - /* Allocate 2 different sizes of buffers. For new, use half - the tokens for each. */ - ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2; - CDEBUG(D_NET, "gmnal_init: creating %d large %d byte recv buffers\n", - ntok, MSG_LEN_LARGE); - while (ntok-- > 0) { - void * buffer = gm_dma_malloc(kgmnal_data.kgm_port, - MSG_LEN_LARGE); - if (buffer == NULL) { - CERROR("gm_init failed: %d\n", rc); - return (-ENOMEM); - } - CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d " - "pri %d\n ", kgmnal_data.kgm_port, buffer, - MSG_LEN_LARGE, MSG_SIZE_LARGE, GM_LOW_PRIORITY); - - gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer, - MSG_SIZE_LARGE, GM_LOW_PRIORITY); - } - - ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2; - CDEBUG(D_NET, "gmnal_init: creating %d small %d byte recv buffers\n", - ntok, MSG_LEN_SMALL); - while (ntok-- > 0) { - void * buffer = gm_dma_malloc(kgmnal_data.kgm_port, - MSG_LEN_SMALL); - if (buffer == NULL) { - CERROR("gm_init failed: %d\n", rc); - return (-ENOMEM); - } - CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d " - "pri %d\n ", kgmnal_data.kgm_port, buffer, - MSG_LEN_SMALL, MSG_SIZE_SMALL, GM_LOW_PRIORITY); - - gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer, - MSG_SIZE_SMALL, GM_LOW_PRIORITY); - - } - sizemask = (1 << MSG_SIZE_LARGE) | (1 << MSG_SIZE_SMALL); - CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%x\n", - kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask); - gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY, - sizemask); - gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0); - - /* Initialize Network Interface */ - rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni); - if (rc) { - CERROR("PtlNIInit failed %d\n", rc); - return (-ENOMEM); - } - - /* Start receiver thread */ - kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0); - - PORTAL_SYMBOL_REGISTER(kgmnal_ni); - - kgmnal_data.kgm_init = 1; - - return 0; -} - -MODULE_AUTHOR("Robert Read "); -MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1"); -MODULE_LICENSE("GPL"); - -module_init (kgmnal_initialize); -module_exit (kgmnal_finalize); - -EXPORT_SYMBOL (kgmnal_ni); diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c index d4ee960..cdafba9 100644 --- a/lnet/klnds/qswlnd/qswlnd.c +++ b/lnet/klnds/qswlnd/qswlnd.c @@ -32,6 +32,7 @@ kpr_nal_interface_t kqswnal_router_interface = { kprni_nalid: QSWNAL, kprni_arg: NULL, kprni_fwd: kqswnal_fwd_packet, + kprni_notify: NULL, /* we're connectionless */ }; @@ -322,7 +323,7 @@ kqswnal_finalise (void) CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory)); - printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n", + printk (KERN_INFO "Lustre: Routing QSW NAL unloaded (final mem %d)\n", atomic_read(&portal_kmemory)); } @@ -630,7 +631,7 @@ kqswnal_initialise (void) PORTAL_SYMBOL_REGISTER(kqswnal_ni); kqswnal_data.kqn_init = KQN_INIT_ALL; - printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d " + printk(KERN_INFO "Lustre: Routing QSW NAL loaded on node %d of %d " "(Routing %s, initial mem %d)\n", kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes, kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled", diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index a27239c..4cb9ad9 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -169,6 +169,7 @@ typedef struct void *ktx_args[2]; /* completion passthru */ E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ + unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */ /* debug/info fields */ pid_t ktx_launcher; /* pid of launching process */ diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 6390137..6e19783 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -114,6 +114,18 @@ kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) } void +kqswnal_notify_peer_down(kqswnal_tx_t *ktx) +{ + struct timeval now; + time_t then; + + do_gettimeofday (&now); + then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ; + + kpr_notify(&kqswnal_data.kqn_router, ktx->ktx_nid, 0, then); +} + +void kqswnal_unmap_tx (kqswnal_tx_t *ktx) { if (ktx->ktx_nmappedpages == 0) @@ -451,7 +463,10 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) if (status != EP_SUCCESS) { - CERROR ("kqswnal: Transmit failed with %d\n", status); + CERROR ("Tx completion to "LPX64" failed: %d\n", + ktx->ktx_nid, status); + + kqswnal_notify_peer_down(ktx); status = -EIO; } else if (ktx->ktx_state == KTX_GETTING) { @@ -474,7 +489,9 @@ kqswnal_launch (kqswnal_tx_t *ktx) int dest = kqswnal_nid2elanid (ktx->ktx_nid); long flags; int rc; - + + ktx->ktx_launchtime = jiffies; + LASSERT (dest >= 0); /* must be a peer */ if (ktx->ktx_state == KTX_GETTING) { LASSERT (KQSW_OPTIMIZE_GETS); @@ -487,25 +504,29 @@ kqswnal_launch (kqswnal_tx_t *ktx) ktx, ktx->ktx_frags.iov, ktx->ktx_nfrag); } - if (rc != ENOMEM) - return (rc); - - /* can't allocate ep txd => queue for later */ + switch (rc) { + case ESUCCESS: /* success */ + return (0); - LASSERT (in_interrupt()); /* not called by thread (not looping) */ + case ENOMEM: /* can't allocate ep txd => queue for later */ + LASSERT (in_interrupt()); - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); - if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) - wake_up (&kqswnal_data.kqn_sched_waitq); + list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + return (0); - return (0); + default: /* fatal error */ + CERROR ("Tx to "LPX64" failed: %d\n", ktx->ktx_nid, rc); + kqswnal_notify_peer_down(ktx); + return (rc); + } } - static char * hdr_type_string (ptl_hdr_t *hdr) { @@ -748,7 +769,8 @@ kqswnal_sendmsg (nal_cb_t *nal, targetnid = nid; if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? */ - rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &targetnid); + rc = kpr_lookup (&kqswnal_data.kqn_router, nid, + sizeof (ptl_hdr_t) + payload_nob, &targetnid); if (rc != 0) { CERROR("Can't route to "LPX64": router error %d\n", nid, rc); @@ -777,6 +799,16 @@ kqswnal_sendmsg (nal_cb_t *nal, #if KQSW_OPTIMIZE_GETS if (type == PTL_MSG_REPLY && ep_rxd_isrpc(((kqswnal_rx_t *)private)->krx_rxd)) { + if (nid != targetnid || + kqswnal_nid2elanid(nid) != + ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)) { + CERROR("Optimized reply nid conflict: " + "nid "LPX64" via "LPX64" elanID %d\n", + nid, targetnid, + ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)); + return(PTL_FAIL); + } + /* peer expects RPC completion with GET data */ rc = kqswnal_dma_reply (ktx, payload_niov, payload_iov, @@ -901,7 +933,8 @@ kqswnal_sendmsg (nal_cb_t *nal, return (PTL_FAIL); } - CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, targetnid); + CDEBUG(D_NET, "sent "LPSZ" bytes to "LPX64" via "LPX64"\n", + payload_nob, nid, targetnid); return (PTL_OK); } diff --git a/lnet/klnds/scimaclnd/scimacnal_cb.c b/lnet/klnds/scimaclnd/scimacnal_cb.c index cc0c102..7d5796e 100644 --- a/lnet/klnds/scimaclnd/scimacnal_cb.c +++ b/lnet/klnds/scimaclnd/scimacnal_cb.c @@ -74,7 +74,7 @@ kscimacnal_printf(nal_cb_t *nal, const char *fmt, ...) vsnprintf( msg, sizeof(msg), fmt, ap ); va_end( ap ); - printk("CPUId: %d %s",smp_processor_id(), msg); + printk("Lustre: CPUId: %d %s",smp_processor_id(), msg); } } diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index e7232a0..33f950e 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -37,8 +37,34 @@ kpr_nal_interface_t ksocknal_router_interface = { kprni_nalid: SOCKNAL, kprni_arg: &ksocknal_data, kprni_fwd: ksocknal_fwd_packet, + kprni_notify: ksocknal_notify, }; +#define SOCKNAL_SYSCTL 200 + +#define SOCKNAL_SYSCTL_TIMEOUT 1 +#define SOCKNAL_SYSCTL_EAGER_ACK 2 +#define SOCKNAL_SYSCTL_ZERO_COPY 3 + +static ctl_table ksocknal_ctl_table[] = { + {SOCKNAL_SYSCTL_TIMEOUT, "timeout", + &ksocknal_data.ksnd_io_timeout, sizeof (int), + 0644, NULL, &proc_dointvec}, + {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", + &ksocknal_data.ksnd_eager_ack, sizeof (int), + 0644, NULL, &proc_dointvec}, +#if SOCKNAL_ZC + {SOCKNAL_SYSCTL_EAGER_ACK, "zero_copy", + &ksocknal_data.ksnd_zc_min_frag, sizeof (int), + 0644, NULL, &proc_dointvec}, +#endif + { 0 } +}; + +static ctl_table ksocknal_top_ctl_table[] = { + {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, + { 0 } +}; int ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, @@ -160,7 +186,7 @@ ksocknal_bind_irq (unsigned int irq) snprintf (cmdline, sizeof (cmdline), "echo %d > /proc/irq/%u/smp_affinity", 1 << info->ksni_sched, irq); - printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n", + printk (KERN_INFO "Lustre: Binding irq %u to CPU %d with cmd: %s\n", irq, info->ksni_sched, cmdline); /* FIXME: Find a better method of setting IRQ affinity... @@ -172,7 +198,7 @@ ksocknal_bind_irq (unsigned int irq) ksock_route_t * ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, - int irq_affinity, int xchange_nids, int nonagel) + int nonagel, int xchange_nids, int irq_affinity, int eager) { ksock_route_t *route; @@ -183,7 +209,7 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, atomic_set (&route->ksnr_refcount, 1); route->ksnr_sharecount = 0; route->ksnr_peer = NULL; - route->ksnr_timeout = jiffies_64; + route->ksnr_timeout = jiffies; route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; route->ksnr_ipaddr = ipaddr; route->ksnr_port = port; @@ -191,6 +217,7 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, route->ksnr_irq_affinity = irq_affinity; route->ksnr_xchange_nids = xchange_nids; route->ksnr_nonagel = nonagel; + route->ksnr_eager = eager; route->ksnr_connecting = 0; route->ksnr_deleted = 0; route->ksnr_generation = 0; @@ -371,7 +398,8 @@ ksocknal_get_route_by_idx (int index) int ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, - int nonagle, int xchange_nids, int bind_irq, int share) + int nonagle, int xchange_nids, int bind_irq, + int share, int eager) { unsigned long flags; ksock_peer_t *peer; @@ -388,8 +416,8 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, if (peer == NULL) return (-ENOMEM); - route = ksocknal_create_route (ipaddr, port, bufnob, - nonagle, xchange_nids, bind_irq); + route = ksocknal_create_route (ipaddr, port, bufnob, nonagle, + xchange_nids, bind_irq, eager); if (route == NULL) { ksocknal_put_peer (peer); return (-ENOMEM); @@ -454,7 +482,7 @@ ksocknal_del_route_locked (ksock_route_t *route, int share, int keep_conn) if (conn != NULL) { if (!keep_conn) - ksocknal_close_conn_locked (conn); + ksocknal_close_conn_locked (conn, 0); else { /* keeping the conn; just dissociate it and route... */ conn->ksnc_route = NULL; @@ -568,14 +596,12 @@ ksocknal_get_peer_addr (ksock_conn_t *conn) struct sockaddr_in sin; int len = sizeof (sin); int rc; - - rc = ksocknal_getconnsock (conn); - LASSERT (rc == 0); rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, (struct sockaddr *)&sin, &len, 2); + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); LASSERT (len <= sizeof (sin)); - ksocknal_putconnsock (conn); if (rc != 0) { CERROR ("Error %d getting sock peer IP\n", rc); @@ -590,12 +616,8 @@ unsigned int ksocknal_conn_irq (ksock_conn_t *conn) { int irq = 0; - int rc; struct dst_entry *dst; - rc = ksocknal_getconnsock (conn); - LASSERT (rc == 0); - dst = sk_dst_get (conn->ksnc_sock->sk); if (dst != NULL) { if (dst->dev != NULL) { @@ -608,7 +630,8 @@ ksocknal_conn_irq (ksock_conn_t *conn) dst_release (dst); } - ksocknal_putconnsock (conn); + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); return (irq); } @@ -660,11 +683,13 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, int rc; /* NB, sock has an associated file since (a) this connection might - * have been created in userland and (b) we need the refcounting so - * that we don't close the socket while I/O is being done on it. */ + * have been created in userland and (b) we need to refcount the + * socket so that we don't close it while I/O is being done on + * it, and sock->file has that pre-cooked... */ LASSERT (sock->file != NULL); + LASSERT (file_count(sock->file) > 0); - rc = ksocknal_set_linger (sock); + rc = ksocknal_setup_sock (sock); if (rc != 0) return (rc); @@ -696,9 +721,6 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, ksocknal_new_packet (conn, 0); INIT_LIST_HEAD (&conn->ksnc_tx_queue); -#if SOCKNAL_ZC - INIT_LIST_HEAD (&conn->ksnc_tx_pending); -#endif conn->ksnc_tx_ready = 0; conn->ksnc_tx_scheduled = 0; atomic_set (&conn->ksnc_tx_nob, 0); @@ -753,6 +775,8 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, conn->ksnc_peer = peer; atomic_inc (&peer->ksnp_refcount); + peer->ksnp_last_alive = jiffies; + peer->ksnp_error = 0; list_add (&conn->ksnc_list, &peer->ksnp_conns); atomic_inc (&conn->ksnc_refcount); @@ -797,7 +821,7 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, } void -ksocknal_close_conn_locked (ksock_conn_t *conn) +ksocknal_close_conn_locked (ksock_conn_t *conn, int error) { /* This just does the immmediate housekeeping, and queues the * connection for the reaper to terminate. @@ -805,6 +829,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn) ksock_peer_t *peer = conn->ksnc_peer; ksock_route_t *route; + LASSERT (peer->ksnp_error == 0); LASSERT (!conn->ksnc_closing); conn->ksnc_closing = 1; atomic_inc (&ksocknal_data.ksnd_nclosing_conns); @@ -825,11 +850,16 @@ ksocknal_close_conn_locked (ksock_conn_t *conn) /* ksnd_deathrow_conns takes over peer's ref */ list_del (&conn->ksnc_list); - if (list_empty (&peer->ksnp_conns) && - list_empty (&peer->ksnp_routes)) { - /* I've just closed last conn belonging to a - * non-autoconnecting peer */ - ksocknal_unlink_peer_locked (peer); + if (list_empty (&peer->ksnp_conns)) { + /* No more connections to this peer */ + + peer->ksnp_error = error; /* stash last conn close reason */ + + if (list_empty (&peer->ksnp_routes)) { + /* I've just closed last conn belonging to a + * non-autoconnecting peer */ + ksocknal_unlink_peer_locked (peer); + } } spin_lock (&ksocknal_data.ksnd_reaper_lock); @@ -842,7 +872,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn) } int -ksocknal_close_conn_unlocked (ksock_conn_t *conn) +ksocknal_close_conn_unlocked (ksock_conn_t *conn, int why) { unsigned long flags; int did_it = 0; @@ -851,7 +881,7 @@ ksocknal_close_conn_unlocked (ksock_conn_t *conn) if (!conn->ksnc_closing) { did_it = 1; - ksocknal_close_conn_locked (conn); + ksocknal_close_conn_locked (conn, why); } write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); @@ -867,6 +897,10 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * ksnc_refcount will eventually hit zero, and then the reaper will * destroy it. */ unsigned long flags; + ksock_peer_t *peer = conn->ksnc_peer; + struct timeval now; + time_t then = 0; + int notify = 0; /* serialise with callbacks */ write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); @@ -886,6 +920,16 @@ ksocknal_terminate_conn (ksock_conn_t *conn) conn->ksnc_scheduler->kss_nconns--; + if (peer->ksnp_error != 0) { + /* peer's last conn closed in error */ + LASSERT (list_empty (&peer->ksnp_conns)); + + /* convert peer's last-known-alive timestamp from jiffies */ + do_gettimeofday (&now); + then = now.tv_sec - (jiffies - peer->ksnp_last_alive)/HZ; + notify = 1; + } + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); /* The socket is closed on the final put; either here, or in @@ -894,6 +938,10 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * immediately, aborting anything buffered in it. Any hung * zero-copy transmits will therefore complete in finite time. */ ksocknal_putconnsock (conn); + + if (notify) + kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid, + 0, then); } void @@ -906,9 +954,7 @@ ksocknal_destroy_conn (ksock_conn_t *conn) LASSERT (conn->ksnc_route == NULL); LASSERT (!conn->ksnc_tx_scheduled); LASSERT (!conn->ksnc_rx_scheduled); -#if SOCKNAL_ZC - LASSERT (list_empty (&conn->ksnc_tx_pending)); -#endif + /* complete queued packets */ while (!list_empty (&conn->ksnc_tx_queue)) { ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, @@ -1010,7 +1056,7 @@ ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr) continue; rc = 0; - ksocknal_close_conn_locked (conn); + ksocknal_close_conn_locked (conn, 0); } } } @@ -1020,6 +1066,24 @@ ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr) return (rc); } +void +ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive) +{ + /* The router is telling me she's been notified of a change in + * gateway state.... */ + + CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down"); + + if (!alive) { + /* If the gateway crashed, close all open connections... */ + ksocknal_close_conn (gw_nid, 0); + return; + } + + /* ...otherwise do nothing. We can only establish new connections + * if we have autroutes, and these connect on demand. */ +} + #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct tcp_opt *sock2tcp_opt(struct sock *sk) { @@ -1177,7 +1241,8 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) data->ioc_wait = route->ksnr_sharecount; data->ioc_flags = (route->ksnr_nonagel ? 1 : 0) | (route->ksnr_xchange_nids ? 2 : 0) | - (route->ksnr_irq_affinity ? 4 : 0); + (route->ksnr_irq_affinity ? 4 : 0) | + (route->ksnr_eager ? 8 : 0); ksocknal_put_route (route); } break; @@ -1185,10 +1250,11 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) case NAL_CMD_ADD_AUTOCONN: { rc = ksocknal_add_route (data->ioc_nid, data->ioc_id, data->ioc_misc, data->ioc_size, - (data->ioc_flags & 1) != 0, - (data->ioc_flags & 2) != 0, - (data->ioc_flags & 4) != 0, - (data->ioc_flags & 8) != 0); + (data->ioc_flags & 0x01) != 0, + (data->ioc_flags & 0x02) != 0, + (data->ioc_flags & 0x04) != 0, + (data->ioc_flags & 0x08) != 0, + (data->ioc_flags & 0x10) != 0); break; } case NAL_CMD_DEL_AUTOCONN: { @@ -1287,6 +1353,10 @@ ksocknal_module_fini (void) LASSERT (0); case SOCKNAL_INIT_ALL: +#if CONFIG_SYSCTL + if (ksocknal_data.ksnd_sysctl != NULL) + unregister_sysctl_table (ksocknal_data.ksnd_sysctl); +#endif kportal_nal_unregister(SOCKNAL); PORTAL_SYMBOL_UNREGISTER (ksocknal_ni); /* fall through */ @@ -1349,7 +1419,7 @@ ksocknal_module_fini (void) CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); - printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n", atomic_read(&portal_kmemory)); } @@ -1364,6 +1434,9 @@ ksocknal_module_init (void) /* packet descriptor must fit in a router descriptor's scratchpad */ LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + /* the following must be sizeof(int) for proc_dointvec() */ + LASSERT(sizeof (ksocknal_data.ksnd_io_timeout) == sizeof (int)); + LASSERT(sizeof (ksocknal_data.ksnd_eager_ack) == sizeof (int)); LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); @@ -1379,6 +1452,12 @@ ksocknal_module_init (void) memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + ksocknal_data.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; + ksocknal_data.ksnd_eager_ack = SOCKNAL_EAGER_ACK; +#if SOCKNAL_ZC + ksocknal_data.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; +#endif + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; PORTAL_ALLOC (ksocknal_data.ksnd_peers, sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size); @@ -1561,11 +1640,15 @@ ksocknal_module_init (void) PORTAL_SYMBOL_REGISTER(ksocknal_ni); +#ifdef CONFIG_SYSCTL + /* Press on regardless even if registering sysctl doesn't work */ + ksocknal_data.ksnd_sysctl = register_sysctl_table (ksocknal_top_ctl_table, 0); +#endif /* flag everything initialised */ ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; - printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial " - "mem %d)\n", + printk(KERN_INFO "Lustre: Routing socket NAL loaded " + "(Routing %s, initial mem %d)\n", kpr_routing (&ksocknal_data.ksnd_router) ? "enabled" : "disabled", pkmem); diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 7a13396..a345ff7 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -68,7 +69,12 @@ #define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ #define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ -#define SOCKNAL_IO_TIMEOUT (60*HZ) /* default comms timeout */ +/* default vals for runtime tunables */ +#define SOCKNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ +#define SOCKNAL_EAGER_ACK 1 /* default eager ack (boolean) */ +#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ + +#define SOCKNAL_USE_KEEPALIVES 0 /* use tcp/ip keepalive? */ #define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ @@ -78,8 +84,6 @@ # define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ #endif -#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ - #define SOCKNAL_NLTXS 128 /* # normal transmit messages */ #define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ @@ -95,10 +99,6 @@ #define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10) -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -# define jiffies_64 jiffies -#endif - #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72)) # define sk_data_ready data_ready # define sk_write_space write_space @@ -136,6 +136,12 @@ typedef struct { typedef struct { int ksnd_init; /* initialisation state */ + int ksnd_io_timeout; /* "stuck" socket timeout (seconds) */ + int ksnd_eager_ack; /* make TCP ack eagerly? */ +#if SOCKNAL_ZC + unsigned int ksnd_zc_min_frag; /* minimum zero copy frag size */ +#endif + struct ctl_table_header *ksnd_sysctl; /* sysctl interface */ rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */ struct list_head *ksnd_peers; /* hash table of all my known peers */ @@ -205,7 +211,6 @@ struct ksock_route; /* forward ref */ typedef struct /* transmit packet */ { struct list_head tx_list; /* queue on conn for transmission etc */ - __u64 tx_deadline; /* when (in jiffies) tx times out */ char tx_isfwd; /* forwarding / sourced here */ int tx_nob; /* # packet bytes */ int tx_resid; /* residual bytes */ @@ -291,10 +296,11 @@ typedef struct ksock_conn __u32 ksnc_ipaddr; /* peer's IP */ int ksnc_port; /* peer's port */ int ksnc_closing; /* being shut down */ - + /* READER */ struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ - __u64 ksnc_rx_deadline; /* when receive times out */ + unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */ + int ksnc_rx_started; /* started receiving a message */ int ksnc_rx_ready; /* data ready to read */ int ksnc_rx_scheduled; /* being progressed */ int ksnc_rx_state; /* what is being read */ @@ -311,9 +317,7 @@ typedef struct ksock_conn /* WRITER */ struct list_head ksnc_tx_list; /* where I enq waiting for output space */ struct list_head ksnc_tx_queue; /* packets waiting to be sent */ -#if SOCKNAL_ZC - struct list_head ksnc_tx_pending; /* zc packets pending callback */ -#endif + unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */ atomic_t ksnc_tx_nob; /* # bytes queued */ int ksnc_tx_ready; /* write space */ int ksnc_tx_scheduled; /* being progressed */ @@ -326,7 +330,7 @@ typedef struct ksock_route struct ksock_peer *ksnr_peer; /* owning peer */ atomic_t ksnr_refcount; /* # users */ int ksnr_sharecount; /* lconf usage counter */ - __u64 ksnr_timeout; /* when reconnection can happen next */ + unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */ unsigned int ksnr_retry_interval; /* how long between retries */ __u32 ksnr_ipaddr; /* an IP address for this peer */ int ksnr_port; /* port to connect to */ @@ -334,8 +338,9 @@ typedef struct ksock_route unsigned int ksnr_irq_affinity:1; /* set affinity? */ unsigned int ksnr_xchange_nids:1; /* do hello protocol? */ unsigned int ksnr_nonagel:1; /* disable nagle? */ - unsigned int ksnr_connecting; /* autoconnect in progress? */ - unsigned int ksnr_deleted; /* been removed from peer? */ + unsigned int ksnr_eager:1; /* connect eagery? */ + unsigned int ksnr_connecting:1; /* autoconnect in progress? */ + unsigned int ksnr_deleted:1; /* been removed from peer? */ int ksnr_generation; /* connection incarnation # */ ksock_conn_t *ksnr_conn; /* NULL/active connection */ } ksock_route_t; @@ -346,13 +351,14 @@ typedef struct ksock_peer ptl_nid_t ksnp_nid; /* who's on the other end(s) */ atomic_t ksnp_refcount; /* # users */ int ksnp_closing; /* being closed */ + int ksnp_error; /* errno on closing last conn */ struct list_head ksnp_conns; /* all active connections */ struct list_head ksnp_routes; /* routes */ struct list_head ksnp_tx_queue; /* waiting packets */ + unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */ } ksock_peer_t; - extern nal_cb_t ksocknal_lib; extern ksock_nal_data_t ksocknal_data; @@ -393,8 +399,8 @@ extern int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, int single, int keep_conn); extern int ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, struct socket *sock, int bind_irq); -extern void ksocknal_close_conn_locked (ksock_conn_t *conn); -extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn); +extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); +extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn, int why); extern void ksocknal_terminate_conn (ksock_conn_t *conn); extern void ksocknal_destroy_conn (ksock_conn_t *conn); extern void ksocknal_put_conn (ksock_conn_t *conn); @@ -404,6 +410,7 @@ extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch); extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); extern void ksocknal_fmb_callback (void *arg, int error); +extern void ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive); extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); extern int ksocknal_scheduler (void *arg); @@ -411,4 +418,4 @@ extern void ksocknal_data_ready(struct sock *sk, int n); extern void ksocknal_write_space(struct sock *sk); extern int ksocknal_autoconnectd (void *arg); extern int ksocknal_reaper (void *arg); -extern int ksocknal_set_linger (struct socket *sock); +extern int ksocknal_setup_sock (struct socket *sock); diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 656a0c5..65db867 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -25,12 +25,6 @@ #include "socknal.h" -int ksocknal_io_timeout = SOCKNAL_IO_TIMEOUT; -#if SOCKNAL_ZC -int ksocknal_do_zc = 1; -int ksocknal_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; -#endif - /* * LIB functions follow * @@ -225,7 +219,7 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *iov = tx->tx_iov; int fragsize = iov->iov_len; unsigned long vaddr = (unsigned long)iov->iov_base; - int more = !list_empty (&conn->ksnc_tx_queue) | + int more = (!list_empty (&conn->ksnc_tx_queue)) | (tx->tx_niov > 1) | (tx->tx_nkiov > 1); #if SOCKNAL_ZC @@ -241,10 +235,9 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (tx->tx_niov > 0); #if SOCKNAL_ZC - if (ksocknal_do_zc && + if (zcsize >= ksocknal_data.ksnd_zc_min_frag && (sock->sk->route_caps & NETIF_F_SG) && (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && - zcsize >= ksocknal_zc_min_frag && (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", @@ -306,7 +299,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) int fragsize = kiov->kiov_len; struct page *page = kiov->kiov_page; int offset = kiov->kiov_offset; - int more = !list_empty (&conn->ksnc_tx_queue) | + int more = (!list_empty (&conn->ksnc_tx_queue)) | (tx->tx_nkiov > 1); int rc; @@ -318,10 +311,9 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (tx->tx_nkiov > 0); #if SOCKNAL_ZC - if (ksocknal_do_zc && + if (fragsize >= ksocknal_data.ksnd_zc_min_frag && (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && - fragsize >= ksocknal_zc_min_frag) { + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) { CDEBUG(D_NET, "page %p + offset %x for %d\n", page, offset, fragsize); @@ -349,6 +341,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) set_fs (KERNEL_DS); rc = sock_sendmsg(sock, &msg, fragsize); set_fs (oldmm); + kunmap (page); } @@ -410,6 +403,16 @@ ksocknal_sendmsg (ksock_conn_t *conn, ksock_tx_t *tx) break; } + /* Consider the connection alive since we managed to chuck + * more data into it. Really, we'd like to consider it + * alive only when the peer ACKs something, but + * write_space() only gets called back while SOCK_NOSPACE + * is set. Instead, we presume peer death has occurred if + * the socket doesn't drain within a timout */ + conn->ksnc_tx_deadline = jiffies + + ksocknal_data.ksnd_io_timeout * HZ; + conn->ksnc_peer->ksnp_last_alive = jiffies; + if (tx->tx_resid == 0) { /* sent everything */ rc = 0; break; @@ -420,6 +423,24 @@ ksocknal_sendmsg (ksock_conn_t *conn, ksock_tx_t *tx) RETURN (rc); } +void +ksocknal_eager_ack (ksock_conn_t *conn) +{ + int opt = 1; + mm_segment_t oldmm = get_fs(); + struct socket *sock = conn->ksnc_sock; + + /* Remind the socket to ACK eagerly. If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK + * on, introducing delay in completing zero-copy sends in my + * peer. */ + + set_fs(KERNEL_DS); + sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof (opt)); + set_fs(oldmm); +} + int ksocknal_recv_iov (ksock_conn_t *conn) { @@ -453,6 +474,13 @@ ksocknal_recv_iov (ksock_conn_t *conn) if (rc <= 0) return (rc); + /* received something... */ + conn->ksnc_peer->ksnp_last_alive = jiffies; + conn->ksnc_rx_deadline = jiffies + + ksocknal_data.ksnd_io_timeout * HZ; + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + conn->ksnc_rx_nob_wanted -= rc; conn->ksnc_rx_nob_left -= rc; @@ -499,11 +527,19 @@ ksocknal_recv_kiov (ksock_conn_t *conn) rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT); /* NB this is just a boolean............................^ */ set_fs (oldmm); + kunmap (page); if (rc <= 0) return (rc); + /* received something... */ + conn->ksnc_peer->ksnp_last_alive = jiffies; + conn->ksnc_rx_deadline = jiffies + + ksocknal_data.ksnd_io_timeout * HZ; + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + conn->ksnc_rx_nob_wanted -= rc; conn->ksnc_rx_nob_left -= rc; @@ -541,20 +577,33 @@ ksocknal_recvmsg (ksock_conn_t *conn) rc = -ESHUTDOWN; break; } - + if (conn->ksnc_rx_niov != 0) rc = ksocknal_recv_iov (conn); else rc = ksocknal_recv_kiov (conn); - + if (rc <= 0) { /* error/EOF or partial receive */ - if (rc == -EAGAIN) + if (rc == -EAGAIN) { rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } break; } + /* Completed a fragment */ + if (conn->ksnc_rx_nob_wanted == 0) { + /* Completed a message segment (header or payload) */ + if (ksocknal_data.ksnd_eager_ack && + (conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) { + /* Remind the socket to ack eagerly... */ + ksocknal_eager_ack(conn); + } rc = 1; break; } @@ -577,7 +626,6 @@ ksocknal_zc_callback (zccd_t *zcd) spin_lock_irqsave (&sched->kss_lock, flags); - list_del (&tx->tx_list); /* remove from kss_zctxpending_list */ list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list); if (waitqueue_active (&sched->kss_waitq)) wake_up (&sched->kss_waitq); @@ -628,23 +676,12 @@ ksocknal_tx_launched (ksock_tx_t *tx) { #if SOCKNAL_ZC if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { - unsigned long flags; ksock_conn_t *conn = tx->tx_conn; - ksock_sched_t *sched = conn->ksnc_scheduler; /* zccd skbufs are still in-flight. First take a ref on * conn, so it hangs about for ksocknal_tx_done... */ atomic_inc (&conn->ksnc_refcount); - /* Stash it for timeout... - * NB We have to hold a lock to stash the tx, and we have - * stash it before we zcc_put(), but we have to _not_ hold - * this lock when we zcc_put(), otherwise we could deadlock - * if it turns out to be the last put. Aaaaarrrrggghhh! */ - spin_lock_irqsave (&sched->kss_lock, flags); - list_add_tail (&tx->tx_list, &conn->ksnc_tx_pending); - spin_unlock_irqrestore (&sched->kss_lock, flags); - /* ...then drop the initial ref on zccd, so the zero copy * callback can occur */ zccd_put (&tx->tx_zccd); @@ -659,10 +696,10 @@ ksocknal_tx_launched (ksock_tx_t *tx) void ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) { - ksock_conn_t *conn; - ksock_tx_t *tx; - int rc; - + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + LASSERT (!list_empty (&sched->kss_tx_conns)); conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list); list_del (&conn->ksnc_tx_list); @@ -686,7 +723,7 @@ ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); if (rc != 0) { - if (ksocknal_close_conn_unlocked (conn)) { + if (ksocknal_close_conn_unlocked (conn, rc)) { /* I'm the first to close */ CERROR ("[%p] Error %d on write to "LPX64" ip %08x:%d\n", conn, rc, conn->ksnc_peer->ksnp_nid, @@ -696,7 +733,6 @@ ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) spin_lock_irqsave (&sched->kss_lock, *irq_flags); } else if (tx->tx_resid == 0) { - /* everything went; assume more can go, and avoid * write_space locking */ conn->ksnc_tx_ready = 1; @@ -763,7 +799,8 @@ ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid) return (NULL); } - rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &target_nid); + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob, + &target_nid); if (rc != 0) { CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); return (NULL); @@ -820,8 +857,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) #endif spin_lock_irqsave (&sched->kss_lock, flags); - - tx->tx_deadline = jiffies_64 + ksocknal_io_timeout; + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); if (conn->ksnc_tx_ready && /* able to send */ @@ -839,7 +875,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) } ksock_route_t * -ksocknal_find_connectable_route_locked (ksock_peer_t *peer) +ksocknal_find_connectable_route_locked (ksock_peer_t *peer, int eager_only) { struct list_head *tmp; ksock_route_t *route; @@ -849,7 +885,8 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) if (route->ksnr_conn == NULL && /* not connected */ !route->ksnr_connecting && /* not connecting */ - route->ksnr_timeout <= jiffies_64) /* OK to retry */ + (!eager_only || route->ksnr_eager) && /* wants to connect */ + time_after_eq (jiffies, route->ksnr_timeout)) /* OK to retry */ return (route); } @@ -880,7 +917,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) ksock_conn_t *conn; ksock_route_t *route; rwlock_t *g_lock; - + /* Ensure the frags we've been given EXACTLY match the number of * bytes we want to send. Many TCP/IP stacks disregard any total * size parameters passed to them and just look at the frags. @@ -907,17 +944,19 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) return (PTL_FAIL); } - /* Any routes need to be connected? (need write lock if so) */ - if (ksocknal_find_connectable_route_locked (peer) == NULL) { + if (ksocknal_find_connectable_route_locked(peer, 1) == NULL) { conn = ksocknal_find_conn_locked (tx, peer); if (conn != NULL) { + /* I've got no unconnected autoconnect routes that + * need to be connected, and I do have an actual + * connection... */ ksocknal_queue_tx_locked (tx, conn); read_unlock (g_lock); return (PTL_OK); } } - /* need a write lock now to change peer state... */ + /* Making one or more connections; I'll need a write lock... */ atomic_inc (&peer->ksnp_refcount); /* +1 ref for me while I unlock */ read_unlock (g_lock); @@ -930,23 +969,37 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) } ksocknal_put_peer (peer); /* drop ref I got above */ - /* I may launch autoconnects, now we're write locked... */ - while ((route = ksocknal_find_connectable_route_locked (peer)) != NULL) + + for (;;) { + /* launch all eager autoconnections */ + route = ksocknal_find_connectable_route_locked (peer, 1); + if (route == NULL) + break; + ksocknal_launch_autoconnect_locked (route); + } conn = ksocknal_find_conn_locked (tx, peer); if (conn != NULL) { + /* Connection exists; queue message on it */ ksocknal_queue_tx_locked (tx, conn); write_unlock_irqrestore (g_lock, flags); return (PTL_OK); } - + if (ksocknal_find_connecting_route_locked (peer) == NULL) { - /* no routes actually connecting now */ - write_unlock_irqrestore (g_lock, flags); - return (PTL_FAIL); + /* no autoconnect routes actually connecting now. Scrape + * the barrel for non-eager autoconnects */ + route = ksocknal_find_connectable_route_locked (peer, 0); + if (route != NULL) { + ksocknal_launch_autoconnect_locked (route); + } else { + write_unlock_irqrestore (g_lock, flags); + return (PTL_FAIL); + } } + /* At least 1 connection is being established; queue the message... */ list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); write_unlock_irqrestore (g_lock, flags); @@ -1127,6 +1180,9 @@ ksocknal_fmb_callback (void *arg, int error) CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n", NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid)); + /* drop peer ref taken on init */ + ksocknal_put_peer (fmb->fmb_peer); + spin_lock_irqsave (&fmp->fmp_lock, flags); list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs); @@ -1139,9 +1195,6 @@ ksocknal_fmb_callback (void *arg, int error) spin_unlock_irqrestore (&fmp->fmp_lock, flags); - /* drop peer ref taken on init */ - ksocknal_put_peer (fmb->fmb_peer); - if (conn == NULL) return; @@ -1279,7 +1332,6 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) conn->ksnc_cookie = fmb; /* stash fmb for later */ conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ - conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; /* start timeout */ /* payload is desc's iov-ed buffer, but skipping the hdr */ LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / @@ -1323,7 +1375,7 @@ ksocknal_fwd_parse (ksock_conn_t *conn) dest_nid, body_len); ksocknal_new_packet (conn, 0); /* on to new packet */ - ksocknal_close_conn_unlocked (conn); /* give up on conn */ + ksocknal_close_conn_unlocked (conn, -EINVAL); /* give up on conn */ return; } @@ -1373,6 +1425,9 @@ ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) int skipped; if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + mb (); /* racing with timeout thread */ + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); @@ -1464,20 +1519,21 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) rc = ksocknal_recvmsg(conn); if (rc <= 0) { - if (ksocknal_close_conn_unlocked (conn)) { + if (ksocknal_close_conn_unlocked (conn, rc)) { /* I'm the first to close */ if (rc < 0) CERROR ("[%p] Error %d on read from "LPX64" ip %08x:%d\n", conn, rc, conn->ksnc_peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port); else - CERROR ("[%p] EOF from "LPX64" ip %08x:%d\n", - conn, conn->ksnc_peer->ksnp_nid, - conn->ksnc_ipaddr, conn->ksnc_port); + CWARN ("[%p] EOF from "LPX64" ip %08x:%d\n", + conn, conn->ksnc_peer->ksnp_nid, + conn->ksnc_ipaddr, conn->ksnc_port); } goto out; } + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ goto out; /* try again later */ @@ -1506,9 +1562,6 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) /* sets wanted_len, iovs etc */ lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); - /* start timeout (lib is waiting for finalize) */ - conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; - if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ conn->ksnc_rx_state = SOCKNAL_RX_BODY; goto try_read; /* go read the payload */ @@ -1517,7 +1570,6 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) case SOCKNAL_RX_BODY: /* payload all received */ - conn->ksnc_rx_deadline = 0; /* cancel timeout */ lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); /* Fall through */ @@ -1534,9 +1586,6 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) NTOH__u64 (conn->ksnc_hdr.dest_nid), conn->ksnc_rx_nob_left); - /* cancel timeout (only needed it while fmb allocated) */ - conn->ksnc_rx_deadline = 0; - /* forward the packet. NB ksocknal_init_fmb() put fmb into * conn->ksnc_cookie */ fmb = (ksock_fmb_t *)conn->ksnc_cookie; @@ -1631,15 +1680,20 @@ int ksocknal_scheduler (void *arg) int id = sched - ksocknal_data.ksnd_schedulers; char name[16]; - snprintf (name, sizeof (name),"ksocknald[%d]", id); + snprintf (name, sizeof (name),"ksocknald_%02d", id); kportal_daemonize (name); kportal_blockallsigs (); #if (CONFIG_SMP && CPU_AFFINITY) - if ((cpu_online_map & (1 << id)) != 0) + if ((cpu_online_map & (1 << id)) != 0) { +#if 1 current->cpus_allowed = (1 << id); - else +#else + set_cpus_allowed (current, 1<kss_lock, flags); @@ -1722,7 +1776,10 @@ ksocknal_data_ready (struct sock *sk, int n) if (conn == NULL) { /* raced with ksocknal_close_sock */ LASSERT (sk->sk_data_ready != &ksocknal_data_ready); sk->sk_data_ready (sk, n); - } else if (!conn->ksnc_rx_ready) { /* new news */ + goto out; + } + + if (!conn->ksnc_rx_ready) { /* new news */ /* Set ASAP in case of concurrent calls to me */ conn->ksnc_rx_ready = 1; @@ -1747,6 +1804,7 @@ ksocknal_data_ready (struct sock *sk, int n) spin_unlock_irqrestore (&sched->kss_lock, flags); } + out: read_unlock (&ksocknal_data.ksnd_global_lock); EXIT; @@ -1776,7 +1834,12 @@ ksocknal_write_space (struct sock *sk) if (conn == NULL) { /* raced with ksocknal_close_sock */ LASSERT (sk->sk_write_space != &ksocknal_write_space); sk->sk_write_space (sk); - } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ + + read_unlock (&ksocknal_data.ksnd_global_lock); + return; + } + + if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); if (!conn->ksnc_tx_ready) { /* new news */ @@ -1967,7 +2030,7 @@ ksocknal_exchange_nids (struct socket *sock, ptl_nid_t nid) } int -ksocknal_set_linger (struct socket *sock) +ksocknal_setup_sock (struct socket *sock) { mm_segment_t oldmm = get_fs (); int rc; @@ -1998,7 +2061,51 @@ ksocknal_set_linger (struct socket *sock) CERROR ("Can't set SO_LINGER2: %d\n", rc); return (rc); } + +#if SOCKNAL_USE_KEEPALIVES + /* Keepalives: If 3/4 of the timeout elapses, start probing every + * second until the timeout elapses. */ + + option = (ksocknal_data.ksnd_io_timeout * 3) / 4; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc); + return (rc); + } + + option = 1; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); + return (rc); + } + option = ksocknal_data.ksnd_io_timeout / 4; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); + return (rc); + } + + option = 1; + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); + return (rc); + } +#endif return (0); } @@ -2007,7 +2114,6 @@ ksocknal_connect_peer (ksock_route_t *route) { struct sockaddr_in peer_addr; mm_segment_t oldmm = get_fs(); - __u64 n; struct timeval tv; int fd; struct socket *sock; @@ -2020,8 +2126,8 @@ ksocknal_connect_peer (ksock_route_t *route) } /* Ugh; have to map_fd for compatibility with sockets passed in - * from userspace. And we actually need the refcounting that - * this gives you :) */ + * from userspace. And we actually need the sock->file refcounting + * that this gives you :) */ fd = sock_map_fd (sock); if (fd < 0) { @@ -2033,22 +2139,19 @@ ksocknal_connect_peer (ksock_route_t *route) /* NB the fd now owns the ref on sock->file */ LASSERT (sock->file != NULL); LASSERT (file_count(sock->file) == 1); - + /* Set the socket timeouts, so our connection attempt completes in * finite time */ - tv.tv_sec = ksocknal_io_timeout / HZ; - n = ksocknal_io_timeout % HZ; - n = n * 1000000 + HZ - 1; - do_div (n, HZ); - tv.tv_usec = n; + tv.tv_sec = ksocknal_data.ksnd_io_timeout; + tv.tv_usec = 0; set_fs (KERNEL_DS); rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof (tv)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set send timeout %d (in HZ): %d\n", - ksocknal_io_timeout, rc); + CERROR ("Can't set send timeout %d: %d\n", + ksocknal_data.ksnd_io_timeout, rc); goto out; } @@ -2057,8 +2160,8 @@ ksocknal_connect_peer (ksock_route_t *route) (char *)&tv, sizeof (tv)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set receive timeout %d (in HZ): %d\n", - ksocknal_io_timeout, rc); + CERROR ("Can't set receive timeout %d: %d\n", + ksocknal_data.ksnd_io_timeout, rc); goto out; } @@ -2154,7 +2257,7 @@ ksocknal_autoconnect (ksock_route_t *route) route->ksnr_connecting = 0; LASSERT (route->ksnr_retry_interval != 0); - route->ksnr_timeout = jiffies_64 + route->ksnr_retry_interval; + route->ksnr_timeout = jiffies + route->ksnr_retry_interval; route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2, SOCKNAL_MAX_RECONNECT_INTERVAL); @@ -2198,7 +2301,7 @@ ksocknal_autoconnectd (void *arg) ksock_route_t *route; int rc; - snprintf (name, sizeof (name), "ksocknal_ad[%ld]", id); + snprintf (name, sizeof (name), "ksocknal_ad%02ld", id); kportal_daemonize (name); kportal_blockallsigs (); @@ -2239,55 +2342,47 @@ ksock_conn_t * ksocknal_find_timed_out_conn (ksock_peer_t *peer) { /* We're called with a shared lock on ksnd_global_lock */ - unsigned long flags; ksock_conn_t *conn; struct list_head *ctmp; - ksock_tx_t *tx; - struct list_head *ttmp; ksock_sched_t *sched; list_for_each (ctmp, &peer->ksnp_conns) { conn = list_entry (ctmp, ksock_conn_t, ksnc_list); sched = conn->ksnc_scheduler; - - if (conn->ksnc_rx_deadline != 0 && - conn->ksnc_rx_deadline <= jiffies_64) - goto timed_out; - spin_lock_irqsave (&sched->kss_lock, flags); + /* Don't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); - list_for_each (ttmp, &conn->ksnc_tx_queue) { - tx = list_entry (ttmp, ksock_tx_t, tx_list); - LASSERT (tx->tx_deadline != 0); - - if (tx->tx_deadline <= jiffies_64) - goto timed_out_locked; + if (conn->ksnc_rx_started && + time_after_eq (jiffies, conn->ksnc_rx_deadline)) { + /* Timed out incomplete incoming message */ + atomic_inc (&conn->ksnc_refcount); + CERROR ("Timed out RX from "LPX64" %p\n", + peer->ksnp_nid, conn); + return (conn); } -#if SOCKNAL_ZC - list_for_each (ttmp, &conn->ksnc_tx_pending) { - tx = list_entry (ttmp, ksock_tx_t, tx_list); - LASSERT (tx->tx_deadline != 0); - - if (tx->tx_deadline <= jiffies_64) - goto timed_out_locked; + + if ((!list_empty (&conn->ksnc_tx_queue) || + conn->ksnc_sock->sk->wmem_queued != 0) && + time_after_eq (jiffies, conn->ksnc_tx_deadline)) { + /* Timed out messages queued for sending, or + * messages buffered in the socket's send buffer */ + atomic_inc (&conn->ksnc_refcount); + CERROR ("Timed out TX to "LPX64" %s%d %p\n", + peer->ksnp_nid, + list_empty (&conn->ksnc_tx_queue) ? "" : "Q ", + conn->ksnc_sock->sk->wmem_queued, conn); + return (conn); } -#endif - spin_unlock_irqrestore (&sched->kss_lock, flags); - continue; - - timed_out_locked: - spin_unlock_irqrestore (&sched->kss_lock, flags); - timed_out: - atomic_inc (&conn->ksnc_refcount); - return (conn); } return (NULL); } void -ksocknal_check_peer_timeouts (struct list_head *peers) +ksocknal_check_peer_timeouts (int idx) { + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; struct list_head *ptmp; ksock_peer_t *peer; ksock_conn_t *conn; @@ -2305,7 +2400,7 @@ ksocknal_check_peer_timeouts (struct list_head *peers) if (conn != NULL) { read_unlock (&ksocknal_data.ksnd_global_lock); - if (ksocknal_close_conn_unlocked (conn)) { + if (ksocknal_close_conn_unlocked (conn, -ETIMEDOUT)) { /* I actually closed... */ CERROR ("Timeout out conn->"LPX64" ip %x:%d\n", peer->ksnp_nid, conn->ksnc_ipaddr, @@ -2330,8 +2425,9 @@ ksocknal_reaper (void *arg) unsigned long flags; ksock_conn_t *conn; int timeout; + int i; int peer_index = 0; - __u64 deadline = jiffies_64; + unsigned long deadline = jiffies; kportal_daemonize ("ksocknal_reaper"); kportal_blockallsigs (); @@ -2371,12 +2467,32 @@ ksocknal_reaper (void *arg) spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); - while ((timeout = deadline - jiffies_64) <= 0) { - /* Time to check for timeouts on a few more peers */ - ksocknal_check_peer_timeouts (&ksocknal_data.ksnd_peers[peer_index]); + /* careful with the jiffy wrap... */ + while ((timeout = ((int)deadline - (int)jiffies)) <= 0) { + const int n = 4; + const int p = 1; + int chunk = ksocknal_data.ksnd_peer_hash_size; + + /* Time to check for timeouts on a few more peers: I do + * checks every 'p' seconds on a proportion of the peer + * table and I need to check every connection 'n' times + * within a timeout interval, to ensure I detect a + * timeout on any connection within (n+1)/n times the + * timeout interval. */ + + if (ksocknal_data.ksnd_io_timeout > n * p) + chunk = (chunk * n * p) / + ksocknal_data.ksnd_io_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts (peer_index); + peer_index = (peer_index + 1) % + ksocknal_data.ksnd_peer_hash_size; + } - peer_index = (peer_index + 1) % SOCKNAL_PEER_HASH_SIZE; - deadline += HZ; + deadline += p * HZ; } add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); diff --git a/lnet/klnds/toelnd/toenal.c b/lnet/klnds/toelnd/toenal.c index 77ee473..dc7e447 100644 --- a/lnet/klnds/toelnd/toenal.c +++ b/lnet/klnds/toelnd/toenal.c @@ -438,7 +438,7 @@ ktoenal_module_fini (void) CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); - printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n", atomic_read(&portal_kmemory)); } @@ -612,7 +612,7 @@ ktoenal_module_init (void) /* flag everything initialised */ ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL; - printk(KERN_INFO"Routing TOE NAL loaded (Routing %s, initial mem %d)\n", + printk(KERN_INFO "Lustre: Routing TOE NAL loaded (Routing %s, initial mem %d)\n", kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled", pkmem); diff --git a/lnet/klnds/toelnd/toenal_cb.c b/lnet/klnds/toelnd/toenal_cb.c index 2800788..8282141 100644 --- a/lnet/klnds/toelnd/toenal_cb.c +++ b/lnet/klnds/toelnd/toenal_cb.c @@ -431,7 +431,8 @@ ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie, if ((conn = ktoenal_get_conn (nid)) == NULL) { /* It's not a peer; try to find a gateway */ - rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid); + rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, payload_niov, + &gatewaynid); if (rc != 0) { CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); diff --git a/lnet/libcfs/Makefile.am b/lnet/libcfs/Makefile.am index 20d7fbd..cf9220b 100644 --- a/lnet/libcfs/Makefile.am +++ b/lnet/libcfs/Makefile.am @@ -20,7 +20,7 @@ link-stamp: echo timestamp > link-stamp DEFS = -portals_SOURCES = $(LINKS) module.c proc.c debug.c +portals_SOURCES = $(LINKS) module.c proc.c debug.c lwt.c # Don't distribute any patched files. dist-hook: diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c index 16ef401..1c5cf47 100644 --- a/lnet/libcfs/debug.c +++ b/lnet/libcfs/debug.c @@ -244,7 +244,7 @@ int portals_do_debug_dumplog(void *arg) PTR_ERR(file)); GOTO(out, PTR_ERR(file)); } else { - printk(KERN_ALERT "dumping log to %s ... writing ...\n", + printk(KERN_ALERT "LustreError: dumping log to %s ... writing ...\n", debug_file_name); } @@ -259,7 +259,7 @@ int portals_do_debug_dumplog(void *arg) } else { rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos); } - printk("wrote %d bytes\n", rc); + printk("LustreError: wrote %d bytes\n", rc); set_fs(oldfs); rc = file->f_op->fsync(file, file->f_dentry, 1); @@ -293,7 +293,7 @@ int portals_debug_daemon(void *arg) CERROR("cannot open %s for logging", debug_daemon_file_path); GOTO(out1, PTR_ERR(file)); } else { - printk(KERN_ALERT "daemon dumping log to %s ... writing ...\n", + printk(KERN_ALERT "LustreError: daemon dumping log to %s ... writing ...\n", debug_daemon_file_path); } @@ -340,7 +340,7 @@ int portals_debug_daemon(void *arg) size, &file->f_pos); if (rc < 0) { printk(KERN_ALERT - "Debug_daemon write error %d\n", rc); + "LustreError: Debug_daemon write error %d\n", rc); goto out; } start += rc; @@ -362,7 +362,7 @@ int portals_debug_daemon(void *arg) rc = file->f_op->fsync(file, file->f_dentry, 1); if (rc < 0) { printk(KERN_ALERT - "Debug_daemon sync error %d\n", rc); + "LustreError: Debug_daemon sync error %d\n", rc); goto out; } if (debug_daemon_state.stopped) @@ -406,12 +406,12 @@ void portals_debug_print(void) while (start1 < end1) { int count = MIN(1024, end1 - start1); - printk("%*s", count, start1); + printk("LustreError: %*s", count, start1); start1 += 1024; } while (start2 < end2) { int count = MIN(1024, end2 - start2); - printk("%*s", count, start2); + printk("LustreError: %*s", count, start2); start2 += 1024; } } @@ -426,7 +426,7 @@ void portals_debug_dumplog(void) rc = kernel_thread(portals_do_debug_dumplog, NULL, CLONE_VM | CLONE_FS | CLONE_FILES); if (rc < 0) { - printk(KERN_ERR "cannot start dump thread\n"); + printk(KERN_ERR "LustreError: cannot start dump thread\n"); return; } sleep_on(&debug_ctlwq); @@ -450,7 +450,7 @@ int portals_debug_daemon_start(char *file, unsigned int size) debug_daemon_state.lctl_event = 0; rc = kernel_thread(portals_debug_daemon, NULL, 0); if (rc < 0) { - printk(KERN_ERR "cannot start debug daemon thread\n"); + printk(KERN_ERR "LustreError: cannot start debug daemon thread\n"); strncpy(debug_daemon_file_path, "\0", 1); return rc; } @@ -753,7 +753,7 @@ portals_debug_msg(int subsys, int mask, char *file, const char *fn, unsigned long debug_off; if (debug_buf == NULL) { - printk("portals_debug_msg: debug_buf is NULL!\n"); + printk("LustreError: portals_debug_msg: debug_buf is NULL!\n"); return; } @@ -784,7 +784,7 @@ portals_debug_msg(int subsys, int mask, char *file, const char *fn, max_nob = debug_size - debug_off + DEBUG_OVERFLOW; if (max_nob <= 0) { spin_unlock_irqrestore(&portals_debug_lock, flags); - printk("logic error in portals_debug_msg: <0 bytes to write\n"); + printk("LustreError: logic error in portals_debug_msg: <0 bytes to write\n"); return; } @@ -828,11 +828,13 @@ portals_debug_msg(int subsys, int mask, char *file, const char *fn, /* Print to console, while msg is contiguous in debug_buf */ /* NB safely terminated see above */ if ((mask & D_EMERG) != 0) - printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob); + printk(KERN_EMERG "LustreError: %s", + debug_buf + debug_off + prefix_nob); if ((mask & D_ERROR) != 0) - printk(KERN_ERR "%s", debug_buf + debug_off + prefix_nob); + printk(KERN_ERR "LustreError: %s", + debug_buf + debug_off + prefix_nob); else if (portal_printk) - printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob); + printk("<%d>LustreError: %s", portal_printk, debug_buf+debug_off+prefix_nob); base_offset = debug_off & 0xFFFF; debug_off += prefix_nob + msg_nob; @@ -855,45 +857,65 @@ out: void portals_debug_set_level(unsigned int debug_level) { - printk("Setting portals debug level to %08x\n", debug_level); + printk("Lustre: Setting portals debug level to %08x\n", debug_level); portal_debug = debug_level; } +void portals_run_upcall(char **argv) +{ + int rc; + int argc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + ENTRY; + + argv[0] = portals_upcall; + argc = 1; + while (argv[argc] != NULL) + argc++; + + LASSERT(argc >= 2); + + rc = call_usermodehelper(argv[0], argv, envp); + if (rc < 0) { + CERROR("Error %d invoking portals upcall %s %s%s%s%s%s%s%s%s; " + "check /proc/sys/portals/upcall\n", + rc, argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } else { + CERROR("Invoked portals upcall %s %s%s%s%s%s%s%s%s\n", + argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } +} + void portals_run_lbug_upcall(char *file, const char *fn, const int line) { char *argv[6]; - char *envp[3]; char buf[32]; - int rc; ENTRY; snprintf (buf, sizeof buf, "%d", line); - argv[0] = portals_upcall; argv[1] = "LBUG"; argv[2] = file; argv[3] = (char *)fn; argv[4] = buf; argv[5] = NULL; - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - rc = USERMODEHELPER(argv[0], argv, envp); - if (rc < 0) { - CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check " - "/proc/sys/portals/upcall\n", - argv[0], argv[1], argv[2], argv[3], argv[4], rc); - - } else { - CERROR("Invoked upcall %s %s %s %s %s\n", - argv[0], argv[1], argv[2], argv[3], argv[4]); - } + portals_run_upcall (argv); } - EXPORT_SYMBOL(portals_debug_dumplog); EXPORT_SYMBOL(portals_debug_msg); EXPORT_SYMBOL(portals_debug_set_level); +EXPORT_SYMBOL(portals_run_upcall); EXPORT_SYMBOL(portals_run_lbug_upcall); diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c index 7ecbe43..308158b 100644 --- a/lnet/libcfs/module.c +++ b/lnet/libcfs/module.c @@ -122,8 +122,8 @@ static inline void freedata(void *data, int len) } static int -kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, - ptl_nid_t hi_nid) +kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid) { int rc; kpr_control_interface_t *ci; @@ -139,7 +139,8 @@ kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, } static int -kportal_del_route(ptl_nid_t target) +kportal_del_route(int gw_nalid, ptl_nid_t gw_nid, + ptl_nid_t lo, ptl_nid_t hi) { int rc; kpr_control_interface_t *ci; @@ -148,7 +149,24 @@ kportal_del_route(ptl_nid_t target) if (ci == NULL) return (-ENODEV); - rc = ci->kprci_del_route (target); + rc = ci->kprci_del_route (gw_nalid, gw_nid, lo, hi); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +static int +kportal_notify_router (int gw_nalid, ptl_nid_t gw_nid, + int alive, time_t when) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_notify (gw_nalid, gw_nid, alive, when); PORTAL_SYMBOL_PUT(kpr_control_interface); return (rc); @@ -156,12 +174,13 @@ kportal_del_route(ptl_nid_t target) static int kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, - ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp) + ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp, int *alivep) { int gateway_nalid; ptl_nid_t gateway_nid; ptl_nid_t lo_nid; ptl_nid_t hi_nid; + int alive; int rc; kpr_control_interface_t *ci; @@ -169,17 +188,19 @@ kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, if (ci == NULL) return (-ENODEV); - rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid, - &hi_nid); + rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, + &lo_nid, &hi_nid, &alive); if (rc == 0) { - CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n", - index, gateway_nalid, gateway_nid, lo_nid, hi_nid); + CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64", %s\n", + index, gateway_nalid, gateway_nid, lo_nid, hi_nid, + alive ? "up" : "down"); *gateway_nalidp = (__u32)gateway_nalid; - *gateway_nidp = (__u32)gateway_nid; - *lo_nidp = (__u32)lo_nid; - *hi_nidp = (__u32)hi_nid; + *gateway_nidp = gateway_nid; + *lo_nidp = lo_nid; + *hi_nidp = hi_nid; + *alivep = alive; } PORTAL_SYMBOL_PUT (kpr_control_interface); @@ -367,23 +388,38 @@ static int kportal_ioctl(struct inode *inode, struct file *file, case IOC_PORTAL_ADD_ROUTE: CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n", - data->ioc_nal, data->ioc_nid, data->ioc_nid2, - data->ioc_nid3); + data->ioc_nal, data->ioc_nid, + data->ioc_nid2, data->ioc_nid3); err = kportal_add_route(data->ioc_nal, data->ioc_nid, - MIN (data->ioc_nid2, data->ioc_nid3), - MAX (data->ioc_nid2, data->ioc_nid3)); + data->ioc_nid2, data->ioc_nid3); break; case IOC_PORTAL_DEL_ROUTE: - CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid); - err = kportal_del_route (data->ioc_nid); + CDEBUG (D_IOCTL, "Removing routes via [%d] "LPU64" : "LPU64" - "LPU64"\n", + data->ioc_nal, data->ioc_nid, + data->ioc_nid2, data->ioc_nid3); + err = kportal_del_route (data->ioc_nal, data->ioc_nid, + data->ioc_nid2, data->ioc_nid3); break; + case IOC_PORTAL_NOTIFY_ROUTER: { + CDEBUG (D_IOCTL, "Notifying peer [%d] "LPU64" %s @ %ld\n", + data->ioc_nal, data->ioc_nid, + data->ioc_flags ? "Enabling" : "Disabling", + (time_t)data->ioc_nid3); + + err = kportal_notify_router (data->ioc_nal, data->ioc_nid, + data->ioc_flags, + (time_t)data->ioc_nid3); + break; + } + case IOC_PORTAL_GET_ROUTE: CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count); err = kportal_get_route(data->ioc_count, &data->ioc_nal, - &data->ioc_nid, &data->ioc_nid2, - &data->ioc_nid3); + &data->ioc_nid, + &data->ioc_nid2, &data->ioc_nid3, + &data->ioc_flags); if (err == 0) if (copy_to_user((char *)arg, data, sizeof (*data))) err = -EFAULT; @@ -432,7 +468,27 @@ static int kportal_ioctl(struct inode *inode, struct file *file, kportal_put_ni (data->ioc_nal); break; } - +#if LWT_SUPPORT + case IOC_PORTAL_LWT_CONTROL: + err = lwt_control (data->ioc_flags, data->ioc_misc); + break; + + case IOC_PORTAL_LWT_SNAPSHOT: + err = lwt_snapshot (&data->ioc_count, &data->ioc_misc, + data->ioc_pbuf1, data->ioc_plen1); + if (err == 0 && + copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_LWT_LOOKUP_STRING: + err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, + data->ioc_pbuf2, data->ioc_plen2); + if (err == 0 && + copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; +#endif default: err = -EINVAL; break; @@ -467,16 +523,23 @@ static int init_kportals_module(void) rc = portals_debug_init(5 * 1024 * 1024); if (rc < 0) { - printk(KERN_ERR "portals_debug_init: %d\n", rc); + printk(KERN_ERR "LustreError: portals_debug_init: %d\n", rc); return (rc); } +#if LWT_SUPPORT + rc = lwt_init(); + if (rc != 0) { + CERROR("lwt_init: error %d\n", rc); + goto cleanup_debug; + } +#endif sema_init(&nal_cmd_sem, 1); rc = misc_register(&portal_dev); if (rc) { CERROR("misc_register: error %d\n", rc); - goto cleanup_debug; + goto cleanup_lwt; } rc = PtlInit(); @@ -498,6 +561,10 @@ static int init_kportals_module(void) PtlFini(); cleanup_deregister: misc_deregister(&portal_dev); + cleanup_lwt: +#if LWT_SUPPORT + lwt_fini(); +#endif cleanup_debug: portals_debug_cleanup(); return rc; @@ -518,13 +585,17 @@ static void exit_kportals_module(void) if (rc) CERROR("misc_deregister error %d\n", rc); +#if LWT_SUPPORT + lwt_fini(); +#endif + if (atomic_read(&portal_kmemory) != 0) CERROR("Portals memory leaked: %d bytes\n", atomic_read(&portal_kmemory)); rc = portals_debug_cleanup(); if (rc) - printk(KERN_ERR "portals_debug_cleanup: %d\n", rc); + printk(KERN_ERR "LustreError: portals_debug_cleanup: %d\n", rc); } EXPORT_SYMBOL(lib_dispatch); @@ -546,6 +617,7 @@ EXPORT_SYMBOL(portal_subsystem_debug); EXPORT_SYMBOL(portal_debug); EXPORT_SYMBOL(portal_stack); EXPORT_SYMBOL(portal_printk); +EXPORT_SYMBOL(portal_cerror); EXPORT_SYMBOL(PtlEQWait); EXPORT_SYMBOL(PtlEQFree); EXPORT_SYMBOL(PtlEQGet); diff --git a/lnet/libcfs/proc.c b/lnet/libcfs/proc.c index 8817ace..c1b2aec 100644 --- a/lnet/libcfs/proc.c +++ b/lnet/libcfs/proc.c @@ -65,11 +65,12 @@ extern char portals_upcall[1024]; #define PSDEV_DEBUG 1 /* control debugging */ #define PSDEV_SUBSYSTEM_DEBUG 2 /* control debugging */ #define PSDEV_PRINTK 3 /* force all errors to console */ -#define PSDEV_DEBUG_PATH 4 /* crashdump log location */ -#define PSDEV_DEBUG_DUMP_PATH 5 /* crashdump tracelog location */ -#define PSDEV_PORTALS_UPCALL 6 /* User mode upcall script */ +#define PSDEV_CONSOLE 4 /* allow _any_ messages to console */ +#define PSDEV_DEBUG_PATH 5 /* crashdump log location */ +#define PSDEV_DEBUG_DUMP_PATH 6 /* crashdump tracelog location */ +#define PSDEV_PORTALS_UPCALL 7 /* User mode upcall script */ -#define PORTALS_PRIMARY_CTLCNT 6 +#define PORTALS_PRIMARY_CTLCNT 7 static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = { {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL, &proc_dointvec}, @@ -77,6 +78,8 @@ static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = { sizeof(int), 0644, NULL, &proc_dointvec}, {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL, &proc_dointvec}, + {PSDEV_CONSOLE, "console", &portal_cerror, sizeof(int), 0644, NULL, + &proc_dointvec}, {PSDEV_DEBUG_PATH, "debug_path", debug_file_path, sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string}, {PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path, diff --git a/lnet/lnet/api-init.c b/lnet/lnet/api-init.c index dc1fead..f77a439 100644 --- a/lnet/lnet/api-init.c +++ b/lnet/lnet/api-init.c @@ -28,6 +28,7 @@ int ptl_init; unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL | S_GMNAL); unsigned int portal_debug = ~0; +unsigned int portal_cerror = 1; unsigned int portal_printk; unsigned int portal_stack; diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 62db766..f710476 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1052,6 +1052,7 @@ int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) ": simulated failure\n", nal->ni.nid, hdr_type_string (hdr), hdr->src_nid); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); return (-1); } diff --git a/lnet/router/router.c b/lnet/router/router.c index 27a7fba..32f741f 100644 --- a/lnet/router/router.c +++ b/lnet/router/router.c @@ -24,6 +24,7 @@ #include "router.h" LIST_HEAD(kpr_routes); +LIST_HEAD(kpr_gateways); LIST_HEAD(kpr_nals); unsigned long long kpr_fwd_bytes; @@ -42,6 +43,7 @@ kpr_router_interface_t kpr_router_interface = { kprri_lookup: kpr_lookup_target, kprri_fwd_start: kpr_forward_packet, kprri_fwd_done: kpr_complete_packet, + kprri_notify: kpr_nal_notify, kprri_shutdown: kpr_shutdown_nal, kprri_deregister: kpr_deregister_nal, }; @@ -50,6 +52,7 @@ kpr_control_interface_t kpr_control_interface = { kprci_add_route: kpr_add_route, kprci_del_route: kpr_del_route, kprci_get_route: kpr_get_route, + kprci_notify: kpr_sys_notify, }; int @@ -59,7 +62,7 @@ kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) struct list_head *e; kpr_nal_entry_t *ne; - CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid); + CDEBUG (D_NET, "Registering NAL %d\n", nalif->kprni_nalid); PORTAL_ALLOC (ne, sizeof (*ne)); if (ne == NULL) @@ -96,12 +99,186 @@ kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) } void +kpr_do_upcall (void *arg) +{ + kpr_upcall_t *u = (kpr_upcall_t *)arg; + char nalstr[10]; + char nidstr[36]; + char whenstr[36]; + char *argv[] = { + NULL, + "ROUTER_NOTIFY", + nalstr, + nidstr, + u->kpru_alive ? "up" : "down", + whenstr, + NULL}; + + snprintf (nalstr, sizeof(nalstr), "%d", u->kpru_nal_id); + snprintf (nidstr, sizeof(nidstr), LPX64, u->kpru_nid); + snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when); + + portals_run_upcall (argv); + + kfree (u); +} + +void +kpr_upcall (int gw_nalid, ptl_nid_t gw_nid, int alive, time_t when) +{ + /* May be in arbitrary context */ + kpr_upcall_t *u = kmalloc (sizeof (kpr_upcall_t), GFP_ATOMIC); + + if (u == NULL) { + CERROR ("Upcall out of memory: nal %d nid "LPX64" %s\n", + gw_nalid, gw_nid, alive ? "up" : "down"); + return; + } + + u->kpru_nal_id = gw_nalid; + u->kpru_nid = gw_nid; + u->kpru_alive = alive; + u->kpru_when = when; + + prepare_work (&u->kpru_tq, kpr_do_upcall, u); + schedule_work (&u->kpru_tq); +} + +int +kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, + int alive, time_t when) +{ + unsigned long flags; + int rc = -ENOENT; + kpr_nal_entry_t *ne = NULL; + kpr_gateway_entry_t *ge = NULL; + struct timeval now; + struct list_head *e; + struct list_head *n; + + CDEBUG (D_ERROR, "%s notifying [%d] "LPX64": %s\n", + byNal ? "NAL" : "userspace", + gateway_nalid, gateway_nid, alive ? "up" : "down"); + + /* can't do predictions... */ + do_gettimeofday (&now); + if (when > now.tv_sec) { + CWARN ("Ignoring prediction from %s of [%d] "LPX64" %s " + "%ld seconds in the future\n", + byNal ? "NAL" : "userspace", + gateway_nalid, gateway_nid, + alive ? "up" : "down", + when - now.tv_sec); + return (EINVAL); + } + + LASSERT (when <= now.tv_sec); + + /* Serialise with lookups (i.e. write lock) */ + write_lock_irqsave(&kpr_rwlock, flags); + + list_for_each_safe (e, n, &kpr_gateways) { + + ge = list_entry(e, kpr_gateway_entry_t, kpge_list); + if ((gateway_nalid != 0 && + ge->kpge_nalid != gateway_nalid) || + ge->kpge_nid != gateway_nid) + continue; + + rc = 0; + break; + } + + if (rc != 0) { + /* gateway not found */ + write_unlock_irqrestore(&kpr_rwlock, flags); + CDEBUG (D_NET, "Gateway not found\n"); + return (rc); + } + + if (when < ge->kpge_timestamp) { + /* out of date information */ + write_unlock_irqrestore (&kpr_rwlock, flags); + CDEBUG (D_NET, "Out of date\n"); + return (0); + } + + /* update timestamp */ + ge->kpge_timestamp = when; + + if ((!ge->kpge_alive) == (!alive)) { + /* new date for old news */ + write_unlock_irqrestore (&kpr_rwlock, flags); + CDEBUG (D_NET, "Old news\n"); + return (0); + } + + ge->kpge_alive = alive; + CDEBUG(D_NET, "set "LPX64" [%p] %d\n", gateway_nid, ge, alive); + + if (alive) { + /* Reset all gateway weights so the newly-enabled gateway + * doesn't have to play catch-up */ + list_for_each_safe (e, n, &kpr_gateways) { + kpr_gateway_entry_t *ge = list_entry(e, kpr_gateway_entry_t, + kpge_list); + atomic_set (&ge->kpge_weight, 0); + } + } + + if (!byNal) { + /* userland notified me: notify NAL? */ + ne = kpr_find_nal_entry_locked (ge->kpge_nalid); + if (ne != NULL) { + if (ne->kpne_shutdown || + ne->kpne_interface.kprni_notify == NULL) { + /* no need to notify */ + ne = NULL; + } else { + /* take a ref on this NAL until notifying + * it has completed... */ + atomic_inc (&ne->kpne_refcount); + } + } + } + + write_unlock_irqrestore(&kpr_rwlock, flags); + + if (ne != NULL) { + ne->kpne_interface.kprni_notify (ne->kpne_interface.kprni_arg, + gateway_nid, alive); + /* 'ne' can disappear now... */ + atomic_dec (&ne->kpne_refcount); + } + + if (byNal) { + /* It wasn't userland that notified me... */ + CWARN ("Upcall: NAL %d NID "LPX64" is %s\n", + gateway_nalid, gateway_nid, + alive ? "alive" : "dead"); + kpr_upcall (gateway_nalid, gateway_nid, alive, when); + } else { + CDEBUG (D_NET, " NOT Doing upcall\n"); + } + + return (0); +} + +void +kpr_nal_notify (void *arg, ptl_nid_t peer, int alive, time_t when) +{ + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + kpr_do_notify (1, ne->kpne_interface.kprni_nalid, peer, alive, when); +} + +void kpr_shutdown_nal (void *arg) { unsigned long flags; kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid); LASSERT (!ne->kpne_shutdown); LASSERT (!in_interrupt()); @@ -126,7 +303,7 @@ kpr_deregister_nal (void *arg) unsigned long flags; kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */ LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */ @@ -142,15 +319,58 @@ kpr_deregister_nal (void *arg) PORTAL_MODULE_UNUSE; } +int +kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2) +{ + const int significant_bits = 0x00ffffff; + /* We use atomic_t to record/compare route weights for + * load-balancing. Here we limit ourselves to only using + * 'significant_bits' when we do an 'after' comparison */ + + int diff = (atomic_read (&ge1->kpge_weight) - + atomic_read (&ge2->kpge_weight)) & significant_bits; + int rc = (diff > (significant_bits >> 1)); + + CDEBUG(D_NET, "[%p]"LPX64"=%d %s [%p]"LPX64"=%d\n", + ge1, ge1->kpge_nid, atomic_read (&ge1->kpge_weight), + rc ? ">" : "<", + ge2, ge2->kpge_nid, atomic_read (&ge2->kpge_weight)); + + return (rc); +} + +void +kpr_update_weight (kpr_gateway_entry_t *ge, int nob) +{ + int weight = 1 + (nob + sizeof (ptl_hdr_t)/2)/sizeof (ptl_hdr_t); + + /* We've chosen this route entry (i.e. gateway) to forward payload + * of length 'nob'; update the route's weight to make it less + * favoured. Note that the weight is 1 plus the payload size + * rounded and scaled to the portals header size, so we get better + * use of the significant bits in kpge_weight. */ + + CDEBUG(D_NET, "gateway [%p]"LPX64" += %d\n", ge, + ge->kpge_nid, weight); + + atomic_add (weight, &ge->kpge_weight); +} int -kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) +kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, + ptl_nid_t *gateway_nidp) { - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - struct list_head *e; - int rc = -ENOENT; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + struct list_head *e; + kpr_route_entry_t *re; + kpr_gateway_entry_t *ge = NULL; + int rc = -ENOENT; + + /* Caller wants to know if 'target_nid' can be reached via a gateway + * ON HER OWN NETWORK */ - CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "lookup "LPX64" from NAL %d\n", target_nid, + ne->kpne_interface.kprni_nalid); if (ne->kpne_shutdown) /* caller is shutting down */ return (-ENOENT); @@ -159,9 +379,8 @@ kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) /* Search routes for one that has a gateway to target_nid on the callers network */ - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) - { - kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + list_for_each (e, &kpr_routes) { + re = list_entry (e, kpr_route_entry_t, kpre_list); if (re->kpre_lo_nid > target_nid || re->kpre_hi_nid < target_nid) @@ -169,33 +388,66 @@ kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) /* found table entry */ - if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */ - rc = -EHOSTUNREACH; - else - { - rc = 0; - *gateway_nidp = re->kpre_gateway_nid; - } - break; + if (re->kpre_gateway->kpge_nalid != ne->kpne_interface.kprni_nalid || + !re->kpre_gateway->kpge_alive) { + /* different NAL or gateway down */ + rc = -EHOSTUNREACH; + continue; + } + + if (ge == NULL || + kpr_ge_isbetter (re->kpre_gateway, ge)) + ge = re->kpre_gateway; } + if (ge != NULL) { + kpr_update_weight (ge, nob); + *gateway_nidp = ge->kpge_nid; + rc = 0; + } + read_unlock (&kpr_rwlock); - CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n", + /* NB can't deref 're' now; it might have been removed! */ + + CDEBUG (D_NET, "lookup "LPX64" from NAL %d: %d ("LPX64")\n", target_nid, ne->kpne_interface.kprni_nalid, rc, (rc == 0) ? *gateway_nidp : (ptl_nid_t)0); return (rc); } +kpr_nal_entry_t * +kpr_find_nal_entry_locked (int nal_id) +{ + struct list_head *e; + + /* Called with kpr_rwlock held */ + + list_for_each (e, &kpr_nals) { + kpr_nal_entry_t *ne = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (nal_id != ne->kpne_interface.kprni_nalid) /* no match */ + continue; + + return (ne); + } + + return (NULL); +} + void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) { - kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; - ptl_nid_t target_nid = fwd->kprfd_target_nid; - int nob = fwd->kprfd_nob; - struct list_head *e; - - CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd, + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; + ptl_nid_t target_nid = fwd->kprfd_target_nid; + int nob = fwd->kprfd_nob; + kpr_gateway_entry_t *ge = NULL; + kpr_nal_entry_t *dst_ne = NULL; + struct list_head *e; + kpr_route_entry_t *re; + kpr_nal_entry_t *tmp_ne; + + CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d\n", fwd, target_nid, src_ne->kpne_interface.kprni_nalid); LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */ @@ -216,53 +468,58 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) - { - kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + list_for_each (e, &kpr_routes) { + re = list_entry (e, kpr_route_entry_t, kpre_list); if (re->kpre_lo_nid > target_nid || /* no match */ re->kpre_hi_nid < target_nid) continue; - CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd, - target_nid, src_ne->kpne_interface.kprni_nalid, - re->kpre_gateway_nid, re->kpre_gateway_nalid); + if (re->kpre_gateway->kpge_nalid == src_ne->kpne_interface.kprni_nalid) + continue; /* don't route to same NAL */ - if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid) - break; /* don't route to same NAL */ + if (!re->kpre_gateway->kpge_alive) + continue; /* gateway is dead */ + + tmp_ne = kpr_find_nal_entry_locked (re->kpre_gateway->kpge_nalid); - /* Search for gateway's NAL's entry */ - - for (e = kpr_nals.next; e != &kpr_nals; e = e->next) - { - kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list); - - if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */ - continue; + if (tmp_ne == NULL || + tmp_ne->kpne_shutdown) { + /* NAL must be registered and not shutting down */ + continue; + } - if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */ - break; + if (ge == NULL || + kpr_ge_isbetter (re->kpre_gateway, ge)) { + ge = re->kpre_gateway; + dst_ne = tmp_ne; + } + } + + if (ge != NULL) { + LASSERT (dst_ne != NULL); + + kpr_update_weight (ge, nob); - fwd->kprfd_gateway_nid = re->kpre_gateway_nid; - atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */ + fwd->kprfd_gateway_nid = ge->kpge_nid; + atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */ - read_unlock (&kpr_rwlock); + read_unlock (&kpr_rwlock); - CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd, - target_nid, src_ne->kpne_interface.kprni_nalid, - fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d: " + "to "LPX64" on NAL %d\n", + fwd, target_nid, src_ne->kpne_interface.kprni_nalid, + fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); - dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); - return; - } - break; + dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); + return; } - read_unlock (&kpr_rwlock); + read_unlock (&kpr_rwlock); out: kpr_fwd_errors++; - CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, + CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, target_nid, src_ne->kpne_interface.kprni_nalid); /* Can't find anywhere to forward to */ @@ -278,14 +535,14 @@ kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg; kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg; - CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd, + CDEBUG (D_NET, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd, src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error); atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! */ (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error); - CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd, + CDEBUG (D_NET, "complete(2) [%p] from NAL %d: %d\n", fwd, src_ne->kpne_interface.kprni_nalid, error); atomic_dec (&kpr_queue_depth); @@ -293,49 +550,76 @@ kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) } int -kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, - ptl_nid_t hi_nid) +kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid) { - unsigned long flags; - struct list_head *e; - kpr_route_entry_t *re; + unsigned long flags; + struct list_head *e; + kpr_route_entry_t *re; + kpr_gateway_entry_t *ge; + int dup = 0; - CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n", + CDEBUG(D_NET, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n", gateway_nalid, gateway_nid, lo_nid, hi_nid); - LASSERT(lo_nid <= hi_nid); + if (gateway_nalid == PTL_NID_ANY || + lo_nid == PTL_NID_ANY || + hi_nid == PTL_NID_ANY || + lo_nid > hi_nid) + return (-EINVAL); + + PORTAL_ALLOC (ge, sizeof (*ge)); + if (ge == NULL) + return (-ENOMEM); + + ge->kpge_nalid = gateway_nalid; + ge->kpge_nid = gateway_nid; + ge->kpge_alive = 1; + ge->kpge_timestamp = 0; + ge->kpge_refcount = 0; + atomic_set (&ge->kpge_weight, 0); PORTAL_ALLOC (re, sizeof (*re)); if (re == NULL) return (-ENOMEM); - re->kpre_gateway_nalid = gateway_nalid; - re->kpre_gateway_nid = gateway_nid; re->kpre_lo_nid = lo_nid; re->kpre_hi_nid = hi_nid; LASSERT(!in_interrupt()); write_lock_irqsave (&kpr_rwlock, flags); - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t, - kpre_list); - - if (re->kpre_lo_nid > re2->kpre_hi_nid || - re->kpre_hi_nid < re2->kpre_lo_nid) - continue; + list_for_each (e, &kpr_gateways) { + kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, + kpge_list); + + if (ge2->kpge_nalid == gateway_nalid && + ge2->kpge_nid == gateway_nid) { + PORTAL_FREE (ge, sizeof (*ge)); + ge = ge2; + dup = 1; + break; + } + } - CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"]" - "to ["LPX64" - "LPX64"]\n", - re->kpre_lo_nid, re->kpre_hi_nid, - re2->kpre_lo_nid, re2->kpre_hi_nid); + if (!dup) { + /* Adding a new gateway... */ + + list_add (&ge->kpge_list, &kpr_gateways); - write_unlock_irqrestore (&kpr_rwlock, flags); + /* ...zero all gateway weights so this one doesn't have to + * play catch-up */ - PORTAL_FREE (re, sizeof (*re)); - return (-EINVAL); + list_for_each (e, &kpr_gateways) { + kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, + kpge_list); + atomic_set (&ge2->kpge_weight, 0); + } + } + re->kpre_gateway = ge; + ge->kpge_refcount++; list_add (&re->kpre_list, &kpr_routes); write_unlock_irqrestore (&kpr_rwlock, flags); @@ -343,49 +627,82 @@ kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, } int -kpr_del_route (ptl_nid_t nid) +kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid, + int alive, time_t when) { + return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when)); +} + +int +kpr_del_route (int gw_nalid, ptl_nid_t gw_nid, + ptl_nid_t lo, ptl_nid_t hi) +{ + int specific = (lo != PTL_NID_ANY); unsigned long flags; + int rc = -ENOENT; struct list_head *e; + struct list_head *n; - CDEBUG(D_OTHER, "Del route "LPX64"\n", nid); + CDEBUG(D_NET, "Del route [%d] "LPX64" : "LPX64" - "LPX64"\n", + gw_nalid, gw_nid, lo, hi); LASSERT(!in_interrupt()); + + /* NB Caller may specify either all routes via the given gateway + * (lo/hi == PTL_NID_ANY) or a specific route entry (lo/hi are + * actual NIDs) */ + + if (specific ? (hi == PTL_NID_ANY || hi < lo) : (hi != PTL_NID_ANY)) + return (-EINVAL); + write_lock_irqsave(&kpr_rwlock, flags); - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + list_for_each_safe (e, n, &kpr_routes) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, kpre_list); - - if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid) + kpr_gateway_entry_t *ge = re->kpre_gateway; + + if (ge->kpge_nalid != gw_nalid || + ge->kpge_nid != gw_nid || + (specific && + (lo != re->kpre_lo_nid || hi != re->kpre_hi_nid))) continue; - list_del (&re->kpre_list); - write_unlock_irqrestore(&kpr_rwlock, flags); + rc = 0; + if (--ge->kpge_refcount == 0) { + list_del (&ge->kpge_list); + PORTAL_FREE (ge, sizeof (*ge)); + } + + list_del (&re->kpre_list); PORTAL_FREE(re, sizeof (*re)); - return (0); + + if (specific) + break; } write_unlock_irqrestore(&kpr_rwlock, flags); - return (-ENOENT); + return (rc); } int -kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid) +kpr_get_route (int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive) { struct list_head *e; read_lock(&kpr_rwlock); for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, - kpre_list); - + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + kpr_gateway_entry_t *ge = re->kpre_gateway; + if (idx-- == 0) { - *gateway_nalid = re->kpre_gateway_nalid; - *gateway_nid = re->kpre_gateway_nid; + *gateway_nalid = ge->kpge_nalid; + *gateway_nid = ge->kpge_nid; + *alive = ge->kpge_alive; *lo_nid = re->kpre_lo_nid; *hi_nid = re->kpre_hi_nid; diff --git a/lnet/router/router.h b/lnet/router/router.h index 19159ab..dc9e4ed 100644 --- a/lnet/router/router.h +++ b/lnet/router/router.h @@ -52,17 +52,40 @@ typedef struct typedef struct { + struct list_head kpge_list; + atomic_t kpge_weight; + time_t kpge_timestamp; + int kpge_alive; + int kpge_nalid; + int kpge_refcount; + ptl_nid_t kpge_nid; +} kpr_gateway_entry_t; + +typedef struct +{ struct list_head kpre_list; - int kpre_gateway_nalid; - ptl_nid_t kpre_gateway_nid; + kpr_gateway_entry_t *kpre_gateway; ptl_nid_t kpre_lo_nid; ptl_nid_t kpre_hi_nid; } kpr_route_entry_t; +typedef struct +{ + struct tq_struct kpru_tq; + int kpru_nal_id; + ptl_nid_t kpru_nid; + int kpru_alive; + time_t kpru_when; +} kpr_upcall_t; + extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp); -extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp); +extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, + ptl_nid_t *gateway_nidp); +extern kpr_nal_entry_t *kpr_find_nal_entry_locked (int nal_id); extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd); extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error); +extern void kpr_nal_notify (void *arg, ptl_nid_t peer, + int alive, time_t when); extern void kpr_shutdown_nal (void *arg); extern void kpr_deregister_nal (void *arg); @@ -71,9 +94,12 @@ extern void kpr_proc_fini (void); extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, ptl_nid_t hi_nid); -extern int kpr_del_route (ptl_nid_t nid); +extern int kpr_del_route (int gw_nal, ptl_nid_t gw_nid, + ptl_nid_t lo, ptl_nid_t hi); extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid); + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive); +extern int kpr_sys_notify (int gw_nalid, ptl_nid_t gw_nid, + int alive, time_t when); extern unsigned long long kpr_fwd_bytes; extern unsigned long kpr_fwd_packets; diff --git a/lnet/tests/ping_cli.c b/lnet/tests/ping_cli.c index 4d04ffb..22bdb45 100644 --- a/lnet/tests/ping_cli.c +++ b/lnet/tests/ping_cli.c @@ -91,14 +91,14 @@ static int pingcli_callback(ptl_event_t *ev) magic = *(int *)(ev->mem_desc.start + ev->offset); if(magic != 0xcafebabe) { - printk ("Unexpected response \n"); + printk ("LustreError: Unexpected response \n"); return 1; } if((i == count) || !count) wake_up_process (client->tsk); else - printk ("Received response after timeout for %d\n",i); + printk ("LustreError: Received response after timeout for %d\n",i); return 1; } @@ -229,15 +229,15 @@ pingcli_start(struct portal_ioctl_data *args) pingcli_shutdown (1); return NULL; } - printk ("sent msg no %d", count); + printk ("Lustre: sent msg no %d", count); set_current_state (TASK_INTERRUPTIBLE); rc = schedule_timeout (20 * args->ioc_timeout); if (rc == 0) { - printk (" :: timeout .....\n"); + printk ("LustreError: :: timeout .....\n"); } else { do_gettimeofday (&tv2); - printk(" :: Reply in %u usec\n", + printk("Lustre: :: Reply in %u usec\n", (unsigned)((tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec))); } diff --git a/lnet/tests/ping_srv.c b/lnet/tests/ping_srv.c index 873e11c..2a96f55 100644 --- a/lnet/tests/ping_srv.c +++ b/lnet/tests/ping_srv.c @@ -121,7 +121,7 @@ int pingsrv_thread(void *arg) if(magic != 0xdeadbeef) { - printk("Unexpected Packet to the server\n"); + printk("LustreError: Unexpected Packet to the server\n"); } memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic)); @@ -183,7 +183,7 @@ static int pingsrv_callback(ptl_event_t *ev) } server->evnt = *ev; - printk ("received ping from nid "LPX64" " + printk ("Lustre: received ping from nid "LPX64" " "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n", ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, *((int *)(ev->mem_desc.start + ev->offset)), diff --git a/lnet/tests/sping_cli.c b/lnet/tests/sping_cli.c index 35e114b..c37db4c 100644 --- a/lnet/tests/sping_cli.c +++ b/lnet/tests/sping_cli.c @@ -219,11 +219,11 @@ pingcli_start(struct portal_ioctl_data *args) set_current_state (TASK_INTERRUPTIBLE); rc = schedule_timeout (20 * args->ioc_timeout); if (rc == 0) { - printk (" Time out on the server\n"); + printk ("LustreError: Time out on the server\n"); pingcli_shutdown (2); return NULL; } else - printk("Received respose from the server \n"); + printk("Lustre: Received respose from the server \n"); pingcli_shutdown (2); diff --git a/lnet/tests/sping_srv.c b/lnet/tests/sping_srv.c index 2b45a46..0d52e1f 100644 --- a/lnet/tests/sping_srv.c +++ b/lnet/tests/sping_srv.c @@ -175,7 +175,7 @@ static int pingsrv_callback(ptl_event_t *ev) } server->evnt = *ev; - printk ("received ping from nid "LPX64" " + printk ("Lustre: received ping from nid "LPX64" " "(off=%u rlen=%u mlen=%u head=%x)\n", ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, *((int *)(ev->mem_desc.start + ev->offset))); diff --git a/lnet/tests/startclient.sh b/lnet/tests/startclient.sh index c9b7c16..de01bc7 100644 --- a/lnet/tests/startclient.sh +++ b/lnet/tests/startclient.sh @@ -29,9 +29,16 @@ case "$1" in /sbin/insmod ./$PING echo kqswnal > /tmp/nal ;; + + gm) + /sbin/insmod portals + /sbin/insmod kgmnal + /sbin/insmod ./$PING + echo kgmnal > /tmp/nal + ;; *) - echo "Usage : ${0} < tcp | toe | elan >" + echo "Usage : ${0} < tcp | toe | elan | gm>" exit 1; esac exit 0; diff --git a/lnet/tests/startserver.sh b/lnet/tests/startserver.sh index 942300e..4f66eeb 100644 --- a/lnet/tests/startserver.sh +++ b/lnet/tests/startserver.sh @@ -29,9 +29,16 @@ case "$1" in /sbin/insmod ./$PING nal=4 echo kqswnal > /tmp/nal ;; + + gm) + /sbin/insmod portals + /sbin/insmod kgmnal + /sbin/insmod ./$PING nal=3 + echo kgmnal > /tmp/nal + ;; *) - echo "Usage : ${0} < tcp | toe | elan >" + echo "Usage : ${0} < tcp | toe | elan | gm>" exit 1; esac ../utils/acceptor 9999& diff --git a/lnet/utils/.cvsignore b/lnet/utils/.cvsignore index 8e474ad..e2a0d44 100644 --- a/lnet/utils/.cvsignore +++ b/lnet/utils/.cvsignore @@ -6,4 +6,5 @@ ptlctl .deps routerstat wirecheck +gmnalnid .*.cmd diff --git a/lnet/utils/Makefile.am b/lnet/utils/Makefile.am index 5b3968d..7d38ba8 100644 --- a/lnet/utils/Makefile.am +++ b/lnet/utils/Makefile.am @@ -10,7 +10,7 @@ LINK = $(CC) -o $@ if LIBLUSTRE sbin_PROGRAMS = ptlctl debugctl routerstat wirecheck else -sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck +sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid endif lib_LIBRARIES = libptlctl.a @@ -20,6 +20,8 @@ wirecheck_SOURCES = wirecheck.c libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h +gmnalnid_SOURCES = gmnalnid.c + ptlctl_SOURCES = ptlctl.c ptlctl_LDADD = -L. -lptlctl -lncurses # -lefence ptlctl_DEPENDENCIES = libptlctl.a diff --git a/lnet/utils/parser.c b/lnet/utils/parser.c index 4d93645..eccf507 100644 --- a/lnet/utils/parser.c +++ b/lnet/utils/parser.c @@ -676,6 +676,7 @@ int Parser_bool (int *b, char *str) { if (!strcasecmp (str, "no") || !strcasecmp (str, "n") || !strcasecmp (str, "off") || + !strcasecmp (str, "down") || !strcasecmp (str, "disable")) { *b = 0; @@ -685,6 +686,7 @@ int Parser_bool (int *b, char *str) { if (!strcasecmp (str, "yes") || !strcasecmp (str, "y") || !strcasecmp (str, "on") || + !strcasecmp (str, "up") || !strcasecmp (str, "enable")) { *b = 1; diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index b68bf34..c5e374f 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -60,7 +60,7 @@ unsigned int portal_debug; unsigned int portal_printk; unsigned int portal_stack; - +unsigned int portal_cerror; static unsigned int g_nal = 0; @@ -75,6 +75,7 @@ typedef struct } name2num_t; static name2num_t nalnames[] = { + {"any", 0}, {"tcp", SOCKNAL}, {"toe", TOENAL}, {"elan", QSWNAL}, @@ -110,7 +111,7 @@ ptl_name2nal (char *str) { name2num_t *e = name2num_lookup_name (nalnames, str); - return ((e == NULL) ? 0 : e->num); + return ((e == NULL) ? -1 : e->num); } static char * @@ -143,6 +144,49 @@ ptl_gethostbyname(char * hname) { } int +ptl_parse_port (int *port, char *str) +{ + char *end; + + *port = strtol (str, &end, 0); + + if (*end == 0 && /* parsed whole string */ + *port > 0 && *port < 65536) /* minimal sanity check */ + return (0); + + return (-1); +} + +int +ptl_parse_time (time_t *t, char *str) +{ + char *end; + int n; + struct tm tm; + + *t = strtol (str, &end, 0); + if (*end == 0) /* parsed whole string */ + return (0); + + memset (&tm, 0, sizeof (tm)); + n = sscanf (str, "%d-%d-%d %d:%d:%d", + &tm.tm_year, &tm.tm_mon, &tm.tm_mday, + &tm.tm_hour, &tm.tm_min, &tm.tm_sec); + if (n != 6) + return (-1); + + tm.tm_mon--; /* convert to 0 == Jan */ + tm.tm_year -= 1900; /* y2k quirk */ + tm.tm_isdst = -1; /* dunno if it's daylight savings... */ + + *t = mktime (&tm); + if (*t == (time_t)-1) + return (-1); + + return (0); +} + +int ptl_parse_ipaddr (__u32 *ipaddrp, char *str) { struct hostent *he; @@ -198,8 +242,9 @@ ptl_ipaddr_2_str (__u32 ipaddr, char *str) int ptl_parse_nid (ptl_nid_t *nidp, char *str) { - __u32 ipaddr; - long lval; + __u32 ipaddr; + char *end; + unsigned long long ullval; if (!strcmp (str, "_all_")) { *nidp = PTL_NID_ANY; @@ -211,15 +256,10 @@ ptl_parse_nid (ptl_nid_t *nidp, char *str) return (0); } - if (sscanf (str, "%li", &lval) == 1) - { - *nidp = (ptl_nid_t)lval; - return (0); - } - - if (sscanf (str, "%lx", &lval) == 1) - { - *nidp = (ptl_nid_t)lval; + ullval = strtoull(str, &end, 0); + if (*end == 0) { + /* parsed whole string */ + *nidp = (ptl_nid_t)ullval; return (0); } @@ -235,21 +275,29 @@ ptl_nid2str (char *buffer, ptl_nid_t nid) if (he != NULL) strcpy (buffer, he->h_name); else - sprintf (buffer, "0x"LPX64, nid); + sprintf (buffer, LPX64, nid); return (buffer); } -int g_nal_is_compatible (char *cmd, ...) +int g_nal_is_set () { - va_list ap; - int nal; - if (g_nal == 0) { fprintf (stderr, "Error: you must run the 'network' command first.\n"); return (0); } - + + return (1); +} + +int g_nal_is_compatible (char *cmd, ...) +{ + va_list ap; + int nal; + + if (!g_nal_is_set ()) + return (0); + va_start (ap, cmd); do { @@ -260,9 +308,13 @@ int g_nal_is_compatible (char *cmd, ...) if (g_nal == nal) return (1); - - fprintf (stderr, "Command %s not compatible with nal %s\n", - cmd, nal2name (g_nal)); + + if (cmd != NULL) { + /* Don't complain verbosely if we've not been passed a command + * name to complain about! */ + fprintf (stderr, "Command %s not compatible with nal %s\n", + cmd, nal2name (g_nal)); + } return (0); } @@ -335,7 +387,7 @@ int jt_ptl_network(int argc, char **argv) int nal; if (argc == 2 && - (nal = ptl_name2nal (argv[1])) != 0) { + (nal = ptl_name2nal (argv[1])) >= 0) { g_nal = nal; return (0); } @@ -368,12 +420,14 @@ jt_ptl_print_autoconnects (int argc, char **argv) if (rc != 0) break; - printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s affinity %s share %d\n", + printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s " + "affinity %s eager %s share %d\n", data.ioc_nid, ptl_ipaddr_2_str (data.ioc_id, buffer), data.ioc_misc, data.ioc_count, data.ioc_size, (data.ioc_flags & 1) ? "on" : "off", (data.ioc_flags & 2) ? "on" : "off", (data.ioc_flags & 4) ? "on" : "off", + (data.ioc_flags & 8) ? "on" : "off", data.ioc_wait); } @@ -392,10 +446,11 @@ jt_ptl_add_autoconnect (int argc, char **argv) int xchange_nids = 0; int irq_affinity = 0; int share = 0; + int eager = 0; int rc; if (argc < 4 || argc > 5) { - fprintf (stderr, "usage: %s nid ipaddr port [ixs]\n", argv[0]); + fprintf (stderr, "usage: %s nid ipaddr port [ixse]\n", argv[0]); return 0; } @@ -413,8 +468,11 @@ jt_ptl_add_autoconnect (int argc, char **argv) return -1; } - port = atol (argv[3]); - + if (ptl_parse_port (&port, argv[3]) != 0) { + fprintf (stderr, "Can't parse port: %s\n", argv[3]); + return -1; + } + if (argc > 4) { char *opts = argv[4]; @@ -429,6 +487,9 @@ jt_ptl_add_autoconnect (int argc, char **argv) case 's': share = 1; break; + case 'e': + eager = 1; + break; default: fprintf (stderr, "Can't parse options: %s\n", argv[4]); @@ -444,10 +505,11 @@ jt_ptl_add_autoconnect (int argc, char **argv) data.ioc_misc = port; /* only passing one buffer size! */ data.ioc_size = MAX (g_socket_rxmem, g_socket_txmem); - data.ioc_flags = (g_socket_nonagle ? 1 : 0) | - (xchange_nids ? 2 : 0) | - (irq_affinity ? 4 : 0) | - (share ? 8 : 0); + data.ioc_flags = (g_socket_nonagle ? 0x01 : 0) | + (xchange_nids ? 0x02 : 0) | + (irq_affinity ? 0x04 : 0) | + (share ? 0x08 : 0) | + (eager ? 0x10 : 0); rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); if (rc != 0) { @@ -547,7 +609,7 @@ jt_ptl_print_connections (int argc, char **argv) if (rc != 0) break; - printf (LPD64"@%s:%d\n", + printf (LPX64"@%s:%d\n", data.ioc_nid, ptl_ipaddr_2_str (data.ioc_id, buffer), data.ioc_misc); @@ -661,7 +723,11 @@ int jt_ptl_connect(int argc, char **argv) return -1; } - port = atol(argv[2]); + if (ptl_parse_port (&port, argv[2]) != 0) { + fprintf (stderr, "Can't parse port: %s\n", argv[2]); + return -1; + } + if (argc > 3) for (flag = argv[3]; *flag != 0; flag++) switch (*flag) @@ -790,8 +856,8 @@ int jt_ptl_disconnect(int argc, char **argv) return 0; } - if (!g_nal_is_compatible (argv[0], SOCKNAL, TOENAL, 0)) - return -1; + if (!g_nal_is_compatible (NULL, SOCKNAL, TOENAL, 0)) + return 0; if (argc >= 2 && ptl_parse_nid (&nid, argv[1]) != 0) { @@ -917,11 +983,8 @@ int jt_ptl_ping(int argc, char **argv) return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return -1; - } if (ptl_parse_nid (&nid, argv[1]) != 0) { @@ -972,11 +1035,9 @@ int jt_ptl_shownid(int argc, char **argv) return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command first\n"); + if (!g_nal_is_set()) return -1; - } - + PORTAL_IOC_INIT (data); data.ioc_nal = g_nal; rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); @@ -1002,11 +1063,8 @@ int jt_ptl_mynid(int argc, char **argv) return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return -1; - } if (argc >= 2) nidstr = argv[1]; @@ -1052,11 +1110,8 @@ jt_ptl_fail_nid (int argc, char **argv) return (0); } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return (-1); - } if (!strcmp (argv[1], "_all_")) nid = PTL_NID_ANY; @@ -1135,7 +1190,7 @@ jt_ptl_nagle (int argc, char **argv) if (Parser_bool (&enable, argv[1]) != 0) { fprintf (stderr, "Can't parse boolean %s\n", argv[1]); - return (0); + return (-1); } g_socket_nonagle = !enable; } @@ -1158,11 +1213,8 @@ jt_ptl_add_route (int argc, char **argv) return (0); } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return (-1); - } if (ptl_parse_nid (&gateway_nid, argv[1]) != 0) { @@ -1205,6 +1257,8 @@ jt_ptl_del_route (int argc, char **argv) { struct portal_ioctl_data data; ptl_nid_t nid; + ptl_nid_t nid1 = PTL_NID_ANY; + ptl_nid_t nid2 = PTL_NID_ANY; int rc; if (argc < 2) @@ -1213,14 +1267,43 @@ jt_ptl_del_route (int argc, char **argv) return (0); } + if (!g_nal_is_set()) + return (-1); + if (ptl_parse_nid (&nid, argv[1]) != 0) { - fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]); + fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); return (-1); } + if (argc >= 3 && + ptl_parse_nid (&nid1, argv[2]) != 0) + { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[2]); + return (-1); + } + + if (argc < 4) { + nid2 = nid1; + } else { + if (ptl_parse_nid (&nid2, argv[3]) != 0) { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[3]); + return (-1); + } + + if (nid1 > nid2) { + ptl_nid_t tmp = nid1; + + nid1 = nid2; + nid2 = tmp; + } + } + PORTAL_IOC_INIT(data); + data.ioc_nal = g_nal; data.ioc_nid = nid; + data.ioc_nid2 = nid1; + data.ioc_nid3 = nid2; rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data); if (rc != 0) @@ -1233,6 +1316,67 @@ jt_ptl_del_route (int argc, char **argv) } int +jt_ptl_notify_router (int argc, char **argv) +{ + struct portal_ioctl_data data; + int enable; + ptl_nid_t nid; + int rc; + struct timeval now; + time_t when; + + if (argc < 3) + { + fprintf (stderr, "usage: %s targetNID [