From 23de47e82bd999ec651f927097922413527cca71 Mon Sep 17 00:00:00 2001 From: phil Date: Fri, 4 Jun 2004 15:14:58 +0000 Subject: [PATCH] - merge 2 weeks of b1_4 fixes onto HEAD - b1_4 is basically the parent of HEAD, because that's the direction that changes flow, as strange as that sounds. so there's a HEAD_BASE tag which sits on b1_4 --- .../kernel_patches/patches/iopen-2.6-suse.patch | 58 +- .../kernel_patches/series/ldiskfs-2.6-suse.series | 1 + ldiskfs/ldiskfs/autoMakefile.am | 2 + lnet/archdep.m4 | 5 +- lnet/include/linux/kp30.h | 7 - lnet/include/linux/kpr.h | 2 +- lnet/include/linux/libcfs.h | 8 +- lnet/include/linux/portals_lib.h | 4 + lnet/include/lnet/api-support.h | 5 - lnet/include/lnet/api.h | 16 - lnet/include/lnet/arg-blocks.h | 268 ------- lnet/include/lnet/errno.h | 5 +- lnet/include/lnet/internal.h | 13 - lnet/include/lnet/lib-dispatch.h | 45 -- lnet/include/lnet/lib-lnet.h | 262 ++++--- lnet/include/lnet/lib-nal.h | 116 --- lnet/include/lnet/lib-p30.h | 262 ++++--- lnet/include/lnet/lib-types.h | 129 +++- lnet/include/lnet/nal.h | 67 +- lnet/include/lnet/types.h | 11 - lnet/klnds/gmlnd/gmlnd.h | 66 +- lnet/klnds/gmlnd/gmlnd_api.c | 202 +---- lnet/klnds/gmlnd/gmlnd_cb.c | 128 +--- lnet/klnds/gmlnd/gmlnd_comm.c | 65 +- lnet/klnds/qswlnd/qswlnd.c | 159 +--- lnet/klnds/qswlnd/qswlnd.h | 47 +- lnet/klnds/qswlnd/qswlnd_cb.c | 817 ++++++++++++--------- lnet/klnds/socklnd/socklnd.c | 120 +-- lnet/klnds/socklnd/socklnd.h | 6 +- lnet/klnds/socklnd/socklnd_cb.c | 170 ++--- lnet/libcfs/module.c | 69 +- lnet/lnet/Makefile.in | 4 +- lnet/lnet/Makefile.mk | 6 +- lnet/lnet/api-eq.c | 120 --- lnet/lnet/api-errno.c | 3 + lnet/lnet/api-init.c | 49 -- lnet/lnet/api-me.c | 28 - lnet/lnet/api-ni.c | 31 +- lnet/lnet/api-wrap.c | 602 ++++++--------- lnet/lnet/autoMakefile.am | 4 +- lnet/lnet/lib-dispatch.c | 79 -- lnet/lnet/lib-eq.c | 279 +++++-- lnet/lnet/lib-init.c | 175 +++-- lnet/lnet/lib-md.c | 383 +++++----- lnet/lnet/lib-me.c | 203 +++-- lnet/lnet/lib-move.c | 
555 +++++++------- lnet/lnet/lib-msg.c | 66 +- lnet/lnet/lib-ni.c | 114 +-- lnet/lnet/lib-pid.c | 24 +- lnet/lnet/module.c | 2 - lnet/ulnds/address.c | 13 +- lnet/ulnds/bridge.h | 2 +- lnet/ulnds/procapi.c | 105 +-- lnet/ulnds/procbridge.h | 2 +- lnet/ulnds/proclib.c | 108 +-- lnet/ulnds/socklnd/address.c | 13 +- lnet/ulnds/socklnd/bridge.h | 2 +- lnet/ulnds/socklnd/procapi.c | 105 +-- lnet/ulnds/socklnd/procbridge.h | 2 +- lnet/ulnds/socklnd/proclib.c | 108 +-- lnet/ulnds/socklnd/tcplnd.c | 21 +- lnet/ulnds/tcplnd.c | 21 +- lustre/ChangeLog | 12 +- lustre/autoMakefile.am | 6 +- lustre/autogen.sh | 18 +- lustre/configure.in | 2 +- lustre/include/linux/lustre_compat25.h | 6 +- lustre/include/linux/lustre_net.h | 2 + .../patches/ext-2.4-patch-1-chaos.patch | 10 +- .../patches/ext-2.4-patch-1-suse-2.4.19.patch | 8 +- .../patches/ext-2.4-patch-1-suse.patch | 10 +- .../kernel_patches/patches/ext-2.4-patch-1.patch | 10 +- .../kernel_patches/patches/ext-2.4-patch-4.patch | 18 +- .../patches/ext3-htree-2.4.19-pre1.patch | 24 +- .../patches/ext3-htree-2.4.21-chaos.patch | 24 +- .../patches/ext3-htree-2.4.22-rh.patch | 24 +- .../patches/ext3-htree-rename_fix.patch | 24 + .../kernel_patches/patches/ext3-htree-suse.patch | 24 +- lustre/kernel_patches/patches/ext3-htree.patch | 24 +- .../patches/ext3-pdirops-2.4.24-chaos.patch | 66 +- .../kernel_patches/patches/htree-ext3-2.4.18.patch | 39 +- lustre/kernel_patches/patches/iopen-2.6-suse.patch | 58 +- .../patches/loop-sync-2.4.21-suse.patch | 11 + lustre/kernel_patches/patches/lustre_version.patch | 3 +- .../patches/md_path_lookup-2.6-suse.patch | 25 + .../patches/vfs_intent-2.6-suse.patch | 348 +++++---- .../patches/vfs_nointent-2.6-suse.patch | 163 ++-- lustre/kernel_patches/series/2.6-suse.series | 1 + .../kernel_patches/series/ldiskfs-2.6-suse.series | 1 + lustre/kernel_patches/series/suse-2.4.21-2 | 1 + lustre/kernel_patches/targets/2.6-suse.target | 6 +- lustre/ldiskfs/autoMakefile.am | 2 + lustre/ldlm/ldlm_lockd.c | 5 + 
lustre/ldlm/ldlm_resource.c | 4 +- lustre/liblustre/tests/Makefile.am | 4 +- lustre/llite/file.c | 6 +- lustre/lov/lov_obd.c | 22 +- lustre/mds/handler.c | 48 +- lustre/mds/mds_internal.h | 2 + lustre/mds/mds_lov.c | 140 ++-- lustre/mds/mds_open.c | 6 + lustre/mds/mds_unlink_open.c | 4 +- lustre/obdclass/class_obd.c | 2 +- lustre/obdclass/simple.c | 266 ------- lustre/obdfilter/filter_io_26.c | 8 +- lustre/osc/osc_create.c | 53 +- lustre/osc/osc_request.c | 5 +- lustre/portals/archdep.m4 | 5 +- lustre/portals/include/linux/kp30.h | 7 - lustre/portals/include/linux/kpr.h | 2 +- lustre/portals/include/linux/libcfs.h | 8 +- lustre/portals/include/linux/portals_lib.h | 4 + lustre/portals/include/portals/api-support.h | 5 - lustre/portals/include/portals/api.h | 16 - lustre/portals/include/portals/arg-blocks.h | 268 ------- lustre/portals/include/portals/errno.h | 5 +- lustre/portals/include/portals/lib-dispatch.h | 45 -- lustre/portals/include/portals/lib-nal.h | 116 --- lustre/portals/include/portals/lib-p30.h | 262 ++++--- lustre/portals/include/portals/lib-types.h | 129 +++- lustre/portals/include/portals/nal.h | 67 +- lustre/portals/include/portals/types.h | 11 - lustre/portals/knals/gmnal/gmnal.h | 66 +- lustre/portals/knals/gmnal/gmnal_api.c | 202 +---- lustre/portals/knals/gmnal/gmnal_cb.c | 128 +--- lustre/portals/knals/gmnal/gmnal_comm.c | 65 +- lustre/portals/knals/qswnal/qswnal.c | 159 +--- lustre/portals/knals/qswnal/qswnal.h | 47 +- lustre/portals/knals/qswnal/qswnal_cb.c | 817 ++++++++++++--------- lustre/portals/knals/socknal/socknal.c | 120 +-- lustre/portals/knals/socknal/socknal.h | 6 +- lustre/portals/knals/socknal/socknal_cb.c | 170 ++--- lustre/portals/libcfs/module.c | 69 +- lustre/portals/portals/Makefile.in | 4 +- lustre/portals/portals/Makefile.mk | 6 +- lustre/portals/portals/api-eq.c | 120 --- lustre/portals/portals/api-errno.c | 3 + lustre/portals/portals/api-init.c | 49 -- lustre/portals/portals/api-me.c | 28 - 
lustre/portals/portals/api-ni.c | 31 +- lustre/portals/portals/api-wrap.c | 602 ++++++--------- lustre/portals/portals/autoMakefile.am | 4 +- lustre/portals/portals/lib-dispatch.c | 79 -- lustre/portals/portals/lib-eq.c | 279 +++++-- lustre/portals/portals/lib-init.c | 175 +++-- lustre/portals/portals/lib-md.c | 383 +++++----- lustre/portals/portals/lib-me.c | 203 +++-- lustre/portals/portals/lib-move.c | 555 +++++++------- lustre/portals/portals/lib-msg.c | 66 +- lustre/portals/portals/lib-ni.c | 114 +-- lustre/portals/portals/lib-pid.c | 24 +- lustre/portals/portals/module.c | 2 - lustre/portals/unals/address.c | 13 +- lustre/portals/unals/bridge.h | 2 +- lustre/portals/unals/procapi.c | 105 +-- lustre/portals/unals/procbridge.h | 2 +- lustre/portals/unals/proclib.c | 108 +-- lustre/portals/unals/tcpnal.c | 21 +- lustre/ptlbd/autoMakefile.am | 2 + lustre/ptlrpc/events.c | 8 +- lustre/ptlrpc/ptlrpc_internal.h | 2 - lustre/ptlrpc/ptlrpc_module.c | 1 + lustre/scripts/cvsdiffclient | 19 +- lustre/scripts/land1.sh | 20 +- lustre/scripts/lmake | 23 +- lustre/scripts/lustre-kernel-2.4.spec.in | 17 +- lustre/scripts/merge1.sh | 18 +- lustre/tests/.cvsignore | 1 + lustre/tests/Makefile.am | 3 +- lustre/tests/cfg/local.sh | 2 +- lustre/tests/recovery-small.sh | 14 + lustre/tests/rename_many.c | 263 +++++++ lustre/tests/replay-dual.sh | 21 + lustre/tests/replay-single.sh | 27 +- lustre/tests/sanity.sh | 30 +- lustre/utils/lconf | 2 +- lustre/utils/llmount.c | 327 +++++++-- 177 files changed, 6295 insertions(+), 8191 deletions(-) delete mode 100644 lnet/include/lnet/arg-blocks.h delete mode 100644 lnet/include/lnet/lib-dispatch.h delete mode 100644 lnet/include/lnet/lib-nal.h delete mode 100644 lnet/lnet/api-eq.c delete mode 100644 lnet/lnet/api-init.c delete mode 100644 lnet/lnet/api-me.c delete mode 100644 lnet/lnet/lib-dispatch.c create mode 100644 lustre/kernel_patches/patches/ext3-htree-rename_fix.patch create mode 100644 
lustre/kernel_patches/patches/loop-sync-2.4.21-suse.patch create mode 100644 lustre/kernel_patches/patches/md_path_lookup-2.6-suse.patch delete mode 100644 lustre/obdclass/simple.c delete mode 100644 lustre/portals/include/portals/arg-blocks.h delete mode 100644 lustre/portals/include/portals/lib-dispatch.h delete mode 100644 lustre/portals/include/portals/lib-nal.h delete mode 100644 lustre/portals/portals/api-eq.c delete mode 100644 lustre/portals/portals/api-init.c delete mode 100644 lustre/portals/portals/api-me.c delete mode 100644 lustre/portals/portals/lib-dispatch.c create mode 100644 lustre/tests/rename_many.c diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch b/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch index 2133355..8a8d115 100644 --- a/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch @@ -8,8 +8,8 @@ Index: linux-stage/fs/ext3/Makefile =================================================================== ---- linux-stage.orig/fs/ext3/Makefile 2004-05-07 16:00:16.000000000 -0400 -+++ linux-stage/fs/ext3/Makefile 2004-05-07 16:00:17.000000000 -0400 +--- linux-stage.orig/fs/ext3/Makefile 2004-05-11 17:21:20.000000000 -0400 ++++ linux-stage/fs/ext3/Makefile 2004-05-11 17:21:21.000000000 -0400 @@ -4,7 +4,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o @@ -21,8 +21,8 @@ Index: linux-stage/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o Index: linux-stage/fs/ext3/inode.c =================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2004-05-07 16:00:16.000000000 -0400 -+++ linux-stage/fs/ext3/inode.c 2004-05-07 17:21:59.000000000 -0400 +--- linux-stage.orig/fs/ext3/inode.c 2004-05-11 17:21:21.000000000 -0400 ++++ linux-stage/fs/ext3/inode.c 2004-05-11 17:21:21.000000000 -0400 @@ -37,6 +37,7 @@ #include #include @@ -43,8 +43,8 @@ Index: linux-stage/fs/ext3/inode.c bh = iloc.bh; Index: linux-stage/fs/ext3/iopen.c 
=================================================================== ---- linux-stage.orig/fs/ext3/iopen.c 2004-05-07 16:00:17.000000000 -0400 -+++ linux-stage/fs/ext3/iopen.c 2004-05-07 17:22:37.000000000 -0400 +--- linux-stage.orig/fs/ext3/iopen.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-stage/fs/ext3/iopen.c 2004-05-11 17:21:21.000000000 -0400 @@ -0,0 +1,272 @@ +/* + * linux/fs/ext3/iopen.c @@ -320,8 +320,8 @@ Index: linux-stage/fs/ext3/iopen.c +} Index: linux-stage/fs/ext3/iopen.h =================================================================== ---- linux-stage.orig/fs/ext3/iopen.h 2004-05-07 16:00:17.000000000 -0400 -+++ linux-stage/fs/ext3/iopen.h 2004-05-07 16:00:17.000000000 -0400 +--- linux-stage.orig/fs/ext3/iopen.h 1969-12-31 19:00:00.000000000 -0500 ++++ linux-stage/fs/ext3/iopen.h 2004-05-11 17:21:21.000000000 -0400 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -340,8 +340,8 @@ Index: linux-stage/fs/ext3/iopen.h + struct inode *inode, int rehash); Index: linux-stage/fs/ext3/namei.c =================================================================== ---- linux-stage.orig/fs/ext3/namei.c 2004-05-07 16:00:16.000000000 -0400 -+++ linux-stage/fs/ext3/namei.c 2004-05-07 16:00:17.000000000 -0400 +--- linux-stage.orig/fs/ext3/namei.c 2004-05-11 17:21:20.000000000 -0400 ++++ linux-stage/fs/ext3/namei.c 2004-05-11 17:21:21.000000000 -0400 @@ -37,6 +37,7 @@ #include #include @@ -420,30 +420,30 @@ Index: linux-stage/fs/ext3/namei.c } Index: linux-stage/fs/ext3/super.c =================================================================== ---- linux-stage.orig/fs/ext3/super.c 2004-05-07 16:00:16.000000000 -0400 -+++ linux-stage/fs/ext3/super.c 2004-05-07 17:21:59.000000000 -0400 +--- linux-stage.orig/fs/ext3/super.c 2004-05-11 17:21:21.000000000 -0400 ++++ linux-stage/fs/ext3/super.c 2004-05-11 17:44:53.000000000 -0400 @@ -536,7 +536,7 @@ Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, 
Opt_data_journal, Opt_data_ordered, Opt_data_writeback, -- Opt_ignore, Opt_err, -+ Opt_ignore, Opt_err, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_ignore, Opt_barrier, ++ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_err, }; - static match_table_t tokens = { -@@ -575,6 +575,9 @@ - {Opt_ignore, "noquota"}, +@@ -577,6 +577,9 @@ {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, -+ {Opt_iopen, "iopen"}, -+ {Opt_noiopen, "noiopen"}, -+ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_err, NULL} }; -@@ -762,6 +765,18 @@ - case Opt_abort: - set_opt(sbi->s_mount_opt, ABORT); +@@ -772,6 +775,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); break; + case Opt_iopen: + set_opt (sbi->s_mount_opt, IOPEN); @@ -462,14 +462,14 @@ Index: linux-stage/fs/ext3/super.c default: Index: linux-stage/include/linux/ext3_fs.h =================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2004-05-07 16:00:16.000000000 -0400 -+++ linux-stage/include/linux/ext3_fs.h 2004-05-07 16:00:17.000000000 -0400 -@@ -325,6 +325,8 @@ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ +--- linux-stage.orig/include/linux/ext3_fs.h 2004-05-11 17:21:20.000000000 -0400 ++++ linux-stage/include/linux/ext3_fs.h 2004-05-11 17:21:21.000000000 -0400 +@@ -326,6 +326,8 @@ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ -+#define EXT3_MOUNT_IOPEN 0x10000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x20000 /* Make iopen world-readable */ + #define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */ ++#define EXT3_MOUNT_IOPEN 0x20000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x40000 /* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at 
once */ #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series index cff99dd..d27088e 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series @@ -7,3 +7,4 @@ ext3-init-generation-2.6-suse.patch ext3-ea-in-inode-2.6-suse.patch export-ext3-2.6-suse.patch ext3-include-fixes-2.6-suse.patch +ext3-htree-rename_fix.patch diff --git a/ldiskfs/ldiskfs/autoMakefile.am b/ldiskfs/ldiskfs/autoMakefile.am index b24081e..11838d6 100644 --- a/ldiskfs/ldiskfs/autoMakefile.am +++ b/ldiskfs/ldiskfs/autoMakefile.am @@ -1,6 +1,8 @@ +if MODULES if LDISKFS modulefs_DATA = ldiskfs$(KMODEXT) endif +endif ldiskfs_linux_headers := $(addprefix linux/,$(subst ext3,ldiskfs,$(notdir $(linux_headers)))) diff --git a/lnet/archdep.m4 b/lnet/archdep.m4 index 636ee1d..cb6e0a2 100644 --- a/lnet/archdep.m4 +++ b/lnet/archdep.m4 @@ -218,12 +218,13 @@ if test x$enable_modules != xno ; then fi LUSTRE_MODULE_TRY_MAKE( [#include ], - [LINUXRELEASE=UTS_RELEASE], + [char *LINUXRELEASE; + LINUXRELEASE=UTS_RELEASE;], [$makerule LUSTRE_KERNEL_TEST=conftest.i], [test -s kernel-tests/conftest.i], [ # LINUXRELEASE="UTS_RELEASE" - eval $(grep LINUXRELEASE kernel-tests/conftest.i) + eval $(grep "LINUXRELEASE=" kernel-tests/conftest.i) ],[ AC_MSG_RESULT([unknown]) AC_MSG_ERROR([Could not preprocess test program. 
Consult config.log for details.]) diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h index c55dd37..6ef28a8 100644 --- a/lnet/include/linux/kp30.h +++ b/lnet/include/linux/kp30.h @@ -7,12 +7,6 @@ #include #define PORTAL_DEBUG -#ifndef offsetof -# define offsetof(typ,memb) ((unsigned long)((char *)&(((typ *)0)->memb))) -#endif - -#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) - #ifdef __KERNEL__ # include # include @@ -647,7 +641,6 @@ enum { TCPNAL = 5, ROUTER = 6, IBNAL = 7, - CRAY_KB_ERNAL = 8, NAL_ENUM_END_MARKER }; diff --git a/lnet/include/linux/kpr.h b/lnet/include/linux/kpr.h index 51d2d2f..1127698 100644 --- a/lnet/include/linux/kpr.h +++ b/lnet/include/linux/kpr.h @@ -4,7 +4,7 @@ #ifndef _KPR_H #define _KPR_H -# include /* for ptl_hdr_t */ +# include /* for ptl_hdr_t */ /******************************************************************************/ /* Kernel Portals Router interface */ diff --git a/lnet/include/linux/libcfs.h b/lnet/include/linux/libcfs.h index c2a15f4..a205163 100644 --- a/lnet/include/linux/libcfs.h +++ b/lnet/include/linux/libcfs.h @@ -79,9 +79,11 @@ extern unsigned int portal_cerror; #define S_PTLROUTER 0x00100000 #define S_COBD 0x00200000 #define S_IBNAL 0x00400000 -#define S_LMV 0x00800000 -#define S_SM 0x01000000 -#define S_CMOBD 0x02000000 +#define S_SM 0x00800000 +#define S_ASOBD 0x01000000 +#define S_LMV 0x02000000 +#define S_CMOBD 0x04000000 + /* If you change these values, please keep portals/utils/debug.c * up to date! 
*/ diff --git a/lnet/include/linux/portals_lib.h b/lnet/include/linux/portals_lib.h index 609290d..b4741cc 100644 --- a/lnet/include/linux/portals_lib.h +++ b/lnet/include/linux/portals_lib.h @@ -77,8 +77,10 @@ static inline char *strdup(const char *str) #endif #ifdef __KERNEL__ +# define NTOH__u16(var) le16_to_cpu(var) # define NTOH__u32(var) le32_to_cpu(var) # define NTOH__u64(var) le64_to_cpu(var) +# define HTON__u16(var) cpu_to_le16(var) # define HTON__u32(var) cpu_to_le32(var) # define HTON__u64(var) cpu_to_le64(var) #else @@ -92,8 +94,10 @@ static inline char *strdup(const char *str) }; \ (ret); \ }) +# define NTOH__u16(var) (var) # define NTOH__u32(var) (var) # define NTOH__u64(var) (expansion_u64(var)) +# define HTON__u16(var) (var) # define HTON__u32(var) (var) # define HTON__u64(var) (expansion_u64(var)) #endif diff --git a/lnet/include/lnet/api-support.h b/lnet/include/lnet/api-support.h index cfae78c..c5994c6 100644 --- a/lnet/include/lnet/api-support.h +++ b/lnet/include/lnet/api-support.h @@ -19,9 +19,4 @@ #include #include -#include -/* Hack for 2.4.18 macro name collision */ -#ifdef yield -#undef yield -#endif diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h index 6d382bb..c7aaced 100644 --- a/lnet/include/lnet/api.h +++ b/lnet/include/lnet/api.h @@ -5,7 +5,6 @@ #include -#ifndef PTL_NO_WRAP int PtlInit(int *); void PtlFini(void); @@ -17,8 +16,6 @@ int PtlNIInitialized(ptl_interface_t); int PtlNIFini(ptl_handle_ni_t interface_in); -#endif - int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id); @@ -32,9 +29,7 @@ int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, unsigned long *distance_out); -#ifndef PTL_NO_WRAP int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out); -#endif /* @@ -74,16 +69,12 @@ int PtlMEUnlink(ptl_handle_me_t current_in); int PtlMEUnlinkList(ptl_handle_me_t current_in); -int 
PtlTblDump(ptl_handle_ni_t ni, int index_in); -int PtlMEDump(ptl_handle_me_t current_in); - /* * Memory descriptors */ -#ifndef PTL_NO_WRAP int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in, ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); @@ -95,7 +86,6 @@ int PtlMDUnlink(ptl_handle_md_t md_in); int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout, ptl_md_t * new_inout, ptl_handle_eq_t testq_in); -#endif /* These should not be called by users */ int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, @@ -108,16 +98,11 @@ int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, /* * Event queues */ -#ifndef PTL_NO_WRAP - -/* These should be called by users */ int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in, ptl_eq_handler_t handler, ptl_handle_eq_t *handle_out); int PtlEQFree(ptl_handle_eq_t eventq_in); -int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out); - int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); @@ -125,7 +110,6 @@ int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, ptl_event_t *event_out, int *which_out); -#endif /* * Access Control Table diff --git a/lnet/include/lnet/arg-blocks.h b/lnet/include/lnet/arg-blocks.h deleted file mode 100644 index 21e30d5..0000000 --- a/lnet/include/lnet/arg-blocks.h +++ /dev/null @@ -1,268 +0,0 @@ -#ifndef PTL_BLOCKS_H -#define PTL_BLOCKS_H - -#include "build_check.h" - -/* - * blocks.h - * - * Argument block types for the Portals 3.0 library - * Generated by idl - * - */ - -#include - -/* put LIB_MAX_DISPATCH last here -- these must match the - assignements to the dispatch table in lib-p30/dispatch.c */ -#define PTL_GETID 1 -#define PTL_NISTATUS 2 -#define PTL_NIDIST 3 -// #define PTL_NIDEBUG 4 -#define PTL_MEATTACH 5 -#define PTL_MEINSERT 6 -// #define PTL_MEPREPEND 7 -#define PTL_MEUNLINK 8 -#define PTL_TBLDUMP 9 -#define PTL_MEDUMP 10 -#define 
PTL_MDATTACH 11 -// #define PTL_MDINSERT 12 -#define PTL_MDBIND 13 -#define PTL_MDUPDATE 14 -#define PTL_MDUNLINK 15 -#define PTL_EQALLOC 16 -#define PTL_EQFREE 17 -#define PTL_ACENTRY 18 -#define PTL_PUT 19 -#define PTL_GET 20 -#define PTL_FAILNID 21 -#define LIB_MAX_DISPATCH 21 - -typedef struct PtlFailNid_in { - ptl_handle_ni_t interface; - ptl_nid_t nid; - unsigned int threshold; -} PtlFailNid_in; - -typedef struct PtlFailNid_out { - int rc; -} PtlFailNid_out; - -typedef struct PtlGetId_in { - ptl_handle_ni_t handle_in; -} PtlGetId_in; - -typedef struct PtlGetId_out { - int rc; - ptl_process_id_t id_out; -} PtlGetId_out; - -typedef struct PtlNIStatus_in { - ptl_handle_ni_t interface_in; - ptl_sr_index_t register_in; -} PtlNIStatus_in; - -typedef struct PtlNIStatus_out { - int rc; - ptl_sr_value_t status_out; -} PtlNIStatus_out; - - -typedef struct PtlNIDist_in { - ptl_handle_ni_t interface_in; - ptl_process_id_t process_in; -} PtlNIDist_in; - -typedef struct PtlNIDist_out { - int rc; - unsigned long distance_out; -} PtlNIDist_out; - - -typedef struct PtlNIDebug_in { - unsigned int mask_in; -} PtlNIDebug_in; - -typedef struct PtlNIDebug_out { - unsigned int rc; -} PtlNIDebug_out; - - -typedef struct PtlMEAttach_in { - ptl_handle_ni_t interface_in; - ptl_pt_index_t index_in; - ptl_ins_pos_t position_in; - ptl_process_id_t match_id_in; - ptl_match_bits_t match_bits_in; - ptl_match_bits_t ignore_bits_in; - ptl_unlink_t unlink_in; -} PtlMEAttach_in; - -typedef struct PtlMEAttach_out { - int rc; - ptl_handle_me_t handle_out; -} PtlMEAttach_out; - - -typedef struct PtlMEInsert_in { - ptl_handle_me_t current_in; - ptl_process_id_t match_id_in; - ptl_match_bits_t match_bits_in; - ptl_match_bits_t ignore_bits_in; - ptl_unlink_t unlink_in; - ptl_ins_pos_t position_in; -} PtlMEInsert_in; - -typedef struct PtlMEInsert_out { - int rc; - ptl_handle_me_t handle_out; -} PtlMEInsert_out; - -typedef struct PtlMEUnlink_in { - ptl_handle_me_t current_in; - ptl_unlink_t unlink_in; 
-} PtlMEUnlink_in; - -typedef struct PtlMEUnlink_out { - int rc; -} PtlMEUnlink_out; - - -typedef struct PtlTblDump_in { - int index_in; -} PtlTblDump_in; - -typedef struct PtlTblDump_out { - int rc; -} PtlTblDump_out; - - -typedef struct PtlMEDump_in { - ptl_handle_me_t current_in; -} PtlMEDump_in; - -typedef struct PtlMEDump_out { - int rc; -} PtlMEDump_out; - - -typedef struct PtlMDAttach_in { - ptl_handle_me_t me_in; - ptl_handle_eq_t eq_in; - ptl_md_t md_in; - ptl_unlink_t unlink_in; -} PtlMDAttach_in; - -typedef struct PtlMDAttach_out { - int rc; - ptl_handle_md_t handle_out; -} PtlMDAttach_out; - - -typedef struct PtlMDBind_in { - ptl_handle_ni_t ni_in; - ptl_handle_eq_t eq_in; - ptl_md_t md_in; - ptl_unlink_t unlink_in; -} PtlMDBind_in; - -typedef struct PtlMDBind_out { - int rc; - ptl_handle_md_t handle_out; -} PtlMDBind_out; - - -typedef struct PtlMDUpdate_internal_in { - ptl_handle_md_t md_in; - ptl_handle_eq_t testq_in; - ptl_seq_t sequence_in; - - ptl_md_t old_inout; - int old_inout_valid; - ptl_md_t new_inout; - int new_inout_valid; -} PtlMDUpdate_internal_in; - -typedef struct PtlMDUpdate_internal_out { - int rc; - ptl_md_t old_inout; - ptl_md_t new_inout; -} PtlMDUpdate_internal_out; - - -typedef struct PtlMDUnlink_in { - ptl_handle_md_t md_in; -} PtlMDUnlink_in; - -typedef struct PtlMDUnlink_out { - int rc; - ptl_md_t status_out; -} PtlMDUnlink_out; - - -typedef struct PtlEQAlloc_in { - ptl_handle_ni_t ni_in; - ptl_size_t count_in; - void *base_in; - int len_in; - ptl_eq_handler_t callback_in; -} PtlEQAlloc_in; - -typedef struct PtlEQAlloc_out { - int rc; - ptl_handle_eq_t handle_out; -} PtlEQAlloc_out; - - -typedef struct PtlEQFree_in { - ptl_handle_eq_t eventq_in; -} PtlEQFree_in; - -typedef struct PtlEQFree_out { - int rc; -} PtlEQFree_out; - - -typedef struct PtlACEntry_in { - ptl_handle_ni_t ni_in; - ptl_ac_index_t index_in; - ptl_process_id_t match_id_in; - ptl_pt_index_t portal_in; -} PtlACEntry_in; - -typedef struct PtlACEntry_out { - int 
rc; -} PtlACEntry_out; - - -typedef struct PtlPut_in { - ptl_handle_md_t md_in; - ptl_ack_req_t ack_req_in; - ptl_process_id_t target_in; - ptl_pt_index_t portal_in; - ptl_ac_index_t cookie_in; - ptl_match_bits_t match_bits_in; - ptl_size_t offset_in; - ptl_hdr_data_t hdr_data_in; -} PtlPut_in; - -typedef struct PtlPut_out { - int rc; -} PtlPut_out; - - -typedef struct PtlGet_in { - ptl_handle_md_t md_in; - ptl_process_id_t target_in; - ptl_pt_index_t portal_in; - ptl_ac_index_t cookie_in; - ptl_match_bits_t match_bits_in; - ptl_size_t offset_in; -} PtlGet_in; - -typedef struct PtlGet_out { - int rc; -} PtlGet_out; - - -#endif diff --git a/lnet/include/lnet/errno.h b/lnet/include/lnet/errno.h index a98bfd9..42f2626 100644 --- a/lnet/include/lnet/errno.h +++ b/lnet/include/lnet/errno.h @@ -41,7 +41,10 @@ typedef enum { PTL_EQ_IN_USE = 21, - PTL_MAX_ERRNO = 22 + PTL_NI_INVALID = 22, + PTL_MD_ILLEGAL = 23, + + PTL_MAX_ERRNO = 24 } ptl_err_t; /* If you change these, you must update the string table in api-errno.c */ diff --git a/lnet/include/lnet/internal.h b/lnet/include/lnet/internal.h index 25778e4..eae00a0 100644 --- a/lnet/include/lnet/internal.h +++ b/lnet/include/lnet/internal.h @@ -13,17 +13,4 @@ extern int ptl_init; /* Has the library been initialized */ -extern int ptl_ni_init(void); -extern void ptl_ni_fini(void); - -static inline ptl_eq_t * -ptl_handle2usereq (ptl_handle_eq_t *handle) -{ - /* EQ handles are a little wierd. On the "user" side, the cookie - * is just a pointer to a queue of events in shared memory. It's - * cb_eq_handle is the "real" handle which we pass when we - * call do_forward(). 
*/ - return (ptl_eq_t *)((unsigned long)handle->cookie); -} - #endif diff --git a/lnet/include/lnet/lib-dispatch.h b/lnet/include/lnet/lib-dispatch.h deleted file mode 100644 index 610c776..0000000 --- a/lnet/include/lnet/lib-dispatch.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef PTL_DISPATCH_H -#define PTL_DISPATCH_H - -#include "build_check.h" -/* - * include/dispatch.h - * - * Dispatch table header and externs for remote side - * operations - * - * Generated by idl - * - */ - -#include -#include - -extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret); - 
-extern char *dispatch_name(int index); -#endif diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index efa929c..4daf219 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -17,13 +17,13 @@ #else # include # include +# include #endif #include #include #include +#include #include -#include -#include static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) { @@ -31,17 +31,18 @@ static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); } -#define state_lock(nal,flagsp) \ -do { \ - CDEBUG(D_PORTALS, "taking state lock\n"); \ - nal->cb_cli(nal, flagsp); \ -} while (0) +#ifdef __KERNEL__ +#define LIB_LOCK(nal,flags) \ + spin_lock_irqsave(&(nal)->libnal_ni.ni_lock, flags) +#define LIB_UNLOCK(nal,flags) \ + spin_unlock_irqrestore(&(nal)->libnal_ni.ni_lock, flags) +#else +#define LIB_LOCK(nal,flags) \ + (pthread_mutex_lock(&(nal)->libnal_ni.ni_mutex), (flags) = 0) +#define LIB_UNLOCK(nal,flags) \ + pthread_mutex_unlock(&(nal)->libnal_ni.ni_mutex) +#endif -#define state_unlock(nal,flagsp) \ -{ \ - CDEBUG(D_PORTALS, "releasing state lock\n"); \ - nal->cb_sti(nal, flagsp); \ -} #ifdef PTL_USE_LIB_FREELIST @@ -50,13 +51,13 @@ do { \ #define MAX_MSGS 2048 /* Outstanding messages */ #define MAX_EQS 512 -extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize); -extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl); +extern int lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int nobj, int objsize); +extern void lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl); static inline void * lib_freelist_alloc (lib_freelist_t *fl) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_freeobj_t *o; if (list_empty (&fl->fl_list)) @@ -70,7 +71,7 @@ lib_freelist_alloc (lib_freelist_t *fl) static inline void lib_freelist_free (lib_freelist_t *fl, void *obj) { - /* ALWAYS called 
with statelock held */ + /* ALWAYS called with liblock held */ lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); list_add (&o->fo_list, &fl->fl_list); @@ -78,78 +79,78 @@ lib_freelist_free (lib_freelist_t *fl, void *obj) static inline lib_eq_t * -lib_eq_alloc (nal_cb_t *nal) +lib_eq_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_eq_t *eq; - state_lock (nal, &flags); - eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + eq = (lib_eq_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_eqs); + LIB_UNLOCK (nal, flags); return (eq); } static inline void -lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_eqs, eq); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_eqs, eq); } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) +lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_md_t *md; - state_lock (nal, &flags); - md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + md = (lib_md_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mds); + LIB_UNLOCK (nal, flags); return (md); } static inline void -lib_md_free (nal_cb_t *nal, lib_md_t *md) +lib_md_free (lib_nal_t *nal, lib_md_t *md) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_mds, md); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_mds, md); } static inline lib_me_t * -lib_me_alloc (nal_cb_t *nal) +lib_me_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_me_t *me; - state_lock (nal, 
&flags); - me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + me = (lib_me_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mes); + LIB_UNLOCK (nal, flags); return (me); } static inline void -lib_me_free (nal_cb_t *nal, lib_me_t *me) +lib_me_free (lib_nal_t *nal, lib_me_t *me) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_mes, me); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_mes, me); } static inline lib_msg_t * -lib_msg_alloc (nal_cb_t *nal) +lib_msg_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_msg_t *msg; - state_lock (nal, &flags); - msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + msg = (lib_msg_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_msgs); + LIB_UNLOCK (nal, flags); if (msg != NULL) { /* NULL pointers, clear flags etc */ @@ -160,18 +161,18 @@ lib_msg_alloc (nal_cb_t *nal) } static inline void -lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) +lib_msg_free (lib_nal_t *nal, lib_msg_t *msg) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_msgs, msg); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_msgs, msg); } #else static inline lib_eq_t * -lib_eq_alloc (nal_cb_t *nal) +lib_eq_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_eq_t *eq; PORTAL_ALLOC(eq, sizeof(*eq)); @@ -179,16 +180,16 @@ lib_eq_alloc (nal_cb_t *nal) } static inline void -lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(eq, sizeof(*eq)); } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) +lib_md_alloc (lib_nal_t *nal, 
ptl_md_t *umd) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_md_t *md; int size; int niov; @@ -214,9 +215,9 @@ lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) } static inline void -lib_md_free (nal_cb_t *nal, lib_md_t *md) +lib_md_free (lib_nal_t *nal, lib_md_t *md) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ int size; if ((md->options & PTL_MD_KIOV) != 0) @@ -228,9 +229,9 @@ lib_md_free (nal_cb_t *nal, lib_md_t *md) } static inline lib_me_t * -lib_me_alloc (nal_cb_t *nal) +lib_me_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_me_t *me; PORTAL_ALLOC(me, sizeof(*me)); @@ -238,16 +239,16 @@ lib_me_alloc (nal_cb_t *nal) } static inline void -lib_me_free(nal_cb_t *nal, lib_me_t *me) +lib_me_free(lib_nal_t *nal, lib_me_t *me) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(me, sizeof(*me)); } static inline lib_msg_t * -lib_msg_alloc(nal_cb_t *nal) +lib_msg_alloc(lib_nal_t *nal) { - /* NEVER called with statelock held; may be in interrupt... */ + /* NEVER called with liblock held; may be in interrupt... 
*/ lib_msg_t *msg; if (in_interrupt()) @@ -264,27 +265,28 @@ lib_msg_alloc(nal_cb_t *nal) } static inline void -lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) +lib_msg_free(lib_nal_t *nal, lib_msg_t *msg) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(msg, sizeof(*msg)); } #endif -extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type); -extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type); -extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh); +extern lib_handle_t *lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type); +extern void lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type); +extern void lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh); static inline void -ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq) +ptl_eq2handle (ptl_handle_eq_t *handle, lib_nal_t *nal, lib_eq_t *eq) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = eq->eq_lh.lh_cookie; } static inline lib_eq_t * -ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) +ptl_handle2eq (ptl_handle_eq_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_EQ); if (lh == NULL) @@ -294,15 +296,16 @@ ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) } static inline void -ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md) +ptl_md2handle (ptl_handle_md_t *handle, lib_nal_t *nal, lib_md_t *md) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = md->md_lh.lh_cookie; } static inline lib_md_t * -ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) +ptl_handle2md (ptl_handle_md_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_MD); if (lh 
== NULL) @@ -312,12 +315,12 @@ ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) } static inline lib_md_t * -ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) +ptl_wire_handle2md (ptl_handle_wire_t *wh, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh; - if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie) + if (wh->wh_interface_cookie != nal->libnal_ni.ni_interface_cookie) return (NULL); lh = lib_lookup_cookie (nal, wh->wh_object_cookie, @@ -329,15 +332,16 @@ ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) } static inline void -ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me) +ptl_me2handle (ptl_handle_me_t *handle, lib_nal_t *nal, lib_me_t *me) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = me->me_lh.lh_cookie; } static inline lib_me_t * -ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) +ptl_handle2me (ptl_handle_me_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_ME); if (lh == NULL) @@ -346,35 +350,30 @@ ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) return (lh_entry (lh, lib_me_t, me_lh)); } -extern int lib_init(nal_cb_t *cb, ptl_process_id_t pid, +extern int lib_init(lib_nal_t *libnal, nal_t *apinal, + ptl_process_id_t pid, ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits); -extern int lib_fini(nal_cb_t * cb); -extern void lib_dispatch(nal_cb_t * cb, void *private, int index, - void *arg_block, void *ret_block); -extern char *dispatch_name(int index); +extern int lib_fini(lib_nal_t *libnal); /* - * When the NAL detects an incoming message, it should call - * lib_parse() decode it. The NAL callbacks will be handed - * the private cookie as a way for the NAL to maintain state - * about which transaction is being processed. 
An extra parameter, - * lib_cookie will contain the necessary information for - * finalizing the message. - * - * After it has finished the handling the message, it should - * call lib_finalize() with the lib_cookie parameter. - * Call backs will be made to write events, send acks or - * replies and so on. + * When the NAL detects an incoming message header, it should call + * lib_parse() decode it. If the message header is garbage, lib_parse() + * returns immediately with failure, otherwise the NAL callbacks will be + * called to receive the message body. They are handed the private cookie + * as a way for the NAL to maintain state about which transaction is being + * processed. An extra parameter, lib_msg contains the lib-level message + * state for passing to lib_finalize() when the message body has been + * received. */ -extern void lib_enq_event_locked (nal_cb_t *nal, void *private, +extern void lib_enq_event_locked (lib_nal_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev); -extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, +extern void lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_ni_fail_t ni_fail_type); -extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private); -extern lib_msg_t *lib_create_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, +extern ptl_err_t lib_parse (lib_nal_t *nal, ptl_hdr_t *hdr, void *private); +extern lib_msg_t *lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, lib_msg_t *get_msg); -extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr); +extern void print_hdr (lib_nal_t * nal, ptl_hdr_t * hdr); extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); @@ -397,14 +396,65 @@ extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, extern void lib_assert_wire_constants (void); -extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, +extern ptl_err_t lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, ptl_size_t offset, 
ptl_size_t mlen, ptl_size_t rlen); -extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, +extern ptl_err_t lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, lib_md_t *md, ptl_size_t offset, ptl_size_t len); -extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, - ptl_md_t * md_out); -extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in); -extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in); +extern int lib_api_ni_status (nal_t *nal, ptl_sr_index_t sr_idx, + ptl_sr_value_t *status); +extern int lib_api_ni_dist (nal_t *nal, ptl_process_id_t *pid, + unsigned long *dist); + +extern int lib_api_eq_alloc (nal_t *nal, ptl_size_t count, + ptl_eq_handler_t callback, + ptl_handle_eq_t *handle); +extern int lib_api_eq_free(nal_t *nal, ptl_handle_eq_t *eqh); +extern int lib_api_eq_poll (nal_t *nal, + ptl_handle_eq_t *eventqs, int neq, int timeout_ms, + ptl_event_t *event, int *which); + +extern int lib_api_me_attach(nal_t *nal, + ptl_pt_index_t portal, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); +extern int lib_api_me_insert(nal_t *nal, + ptl_handle_me_t *current_meh, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); +extern int lib_api_me_unlink (nal_t *nal, ptl_handle_me_t *meh); +extern void lib_me_unlink(lib_nal_t *nal, lib_me_t *me); + +extern int lib_api_get_id(nal_t *nal, ptl_process_id_t *pid); + +extern void lib_md_unlink(lib_nal_t *nal, lib_md_t *md); +extern void lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd); +extern int lib_api_md_attach(nal_t *nal, ptl_handle_me_t *meh, + ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle); +extern int lib_api_md_bind(nal_t *nal, ptl_md_t *umd, ptl_unlink_t unlink, + 
ptl_handle_md_t *handle); +extern int lib_api_md_unlink (nal_t *nal, ptl_handle_md_t *mdh); +extern int lib_api_md_update (nal_t *nal, ptl_handle_md_t *mdh, + ptl_md_t *oldumd, ptl_md_t *newumd, + ptl_handle_eq_t *testqh); + +extern int lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, ptl_size_t offset); +extern int lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_ack_req_t ack, ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, + ptl_size_t offset, ptl_hdr_data_t hdr_data); +extern int lib_api_fail_nid(nal_t *apinal, ptl_nid_t nid, unsigned int threshold); + #endif diff --git a/lnet/include/lnet/lib-nal.h b/lnet/include/lnet/lib-nal.h deleted file mode 100644 index d1d0495..0000000 --- a/lnet/include/lnet/lib-nal.h +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef _LIB_NAL_H_ -#define _LIB_NAL_H_ - -#include "build_check.h" -/* - * nal.h - * - * Library side headers that define the abstraction layer's - * responsibilities and interfaces - */ - -#include - -struct nal_cb_t { - /* - * Per interface portal table, access control table - * and NAL private data field; - */ - lib_ni_t ni; - void *nal_data; - /* - * send: Sends a preformatted header and payload data to a - * specified remote process. The payload is scattered over 'niov' - * fragments described by iov, starting at 'offset' for 'mlen' - * bytes. - * NB the NAL may NOT overwrite iov. 
- * PTL_OK on success => NAL has committed to send and will call - * lib_finalize on completion - */ - ptl_err_t (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen); - - /* as send, but with a set of page fragments (NULL if not supported) */ - ptl_err_t (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, ptl_kiov_t *iov, - size_t offset, size_t mlen); - /* - * recv: Receives an incoming message from a remote process. The - * payload is to be received into the scattered buffer of 'niov' - * fragments described by iov, starting at 'offset' for 'mlen' - * bytes. Payload bytes after 'mlen' up to 'rlen' are to be - * discarded. - * NB the NAL may NOT overwrite iov. - * PTL_OK on success => NAL has committed to receive and will call - * lib_finalize on completion - */ - ptl_err_t (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen); - - /* as recv, but with a set of page fragments (NULL if not supported) */ - ptl_err_t (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - unsigned int niov, ptl_kiov_t *iov, - size_t offset, size_t mlen, size_t rlen); - /* - * read: Reads a block of data from a specified user address - */ - ptl_err_t (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, - user_ptr src_addr, size_t len); - - /* - * write: Writes a block of data into a specified user address - */ - ptl_err_t (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr, - void *src_addr, size_t len); - - /* - * callback: Calls an event callback - * NULL => lib calls eq's callback (if any) directly. 
- */ - void (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, - ptl_event_t *ev); - - /* - * malloc: Acquire a block of memory in a system independent - * fashion. - */ - void *(*cb_malloc) (nal_cb_t * nal, size_t len); - - void (*cb_free) (nal_cb_t * nal, void *buf, size_t len); - - /* - * (un)map: Tell the NAL about some memory it will access. - * *addrkey passed to cb_unmap() is what cb_map() set it to. - * type of *iov depends on options. - * Set to NULL if not required. - */ - ptl_err_t (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, - void **addrkey); - void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, - void **addrkey); - - /* as (un)map, but with a set of page fragments */ - ptl_err_t (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); - void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); - - void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...); - - /* Turn interrupts off (begin of protected area) */ - void (*cb_cli) (nal_cb_t * nal, unsigned long *flags); - - /* Turn interrupts on (end of protected area) */ - void (*cb_sti) (nal_cb_t * nal, unsigned long *flags); - - /* - * Calculate a network "distance" to given node - */ - int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist); -}; - -#endif diff --git a/lnet/include/lnet/lib-p30.h b/lnet/include/lnet/lib-p30.h index efa929c..4daf219 100644 --- a/lnet/include/lnet/lib-p30.h +++ b/lnet/include/lnet/lib-p30.h @@ -17,13 +17,13 @@ #else # include # include +# include #endif #include #include #include +#include #include -#include -#include static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) { @@ -31,17 +31,18 @@ static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); } -#define state_lock(nal,flagsp) \ -do { \ - CDEBUG(D_PORTALS, "taking state lock\n"); \ - nal->cb_cli(nal, 
flagsp); \ -} while (0) +#ifdef __KERNEL__ +#define LIB_LOCK(nal,flags) \ + spin_lock_irqsave(&(nal)->libnal_ni.ni_lock, flags) +#define LIB_UNLOCK(nal,flags) \ + spin_unlock_irqrestore(&(nal)->libnal_ni.ni_lock, flags) +#else +#define LIB_LOCK(nal,flags) \ + (pthread_mutex_lock(&(nal)->libnal_ni.ni_mutex), (flags) = 0) +#define LIB_UNLOCK(nal,flags) \ + pthread_mutex_unlock(&(nal)->libnal_ni.ni_mutex) +#endif -#define state_unlock(nal,flagsp) \ -{ \ - CDEBUG(D_PORTALS, "releasing state lock\n"); \ - nal->cb_sti(nal, flagsp); \ -} #ifdef PTL_USE_LIB_FREELIST @@ -50,13 +51,13 @@ do { \ #define MAX_MSGS 2048 /* Outstanding messages */ #define MAX_EQS 512 -extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize); -extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl); +extern int lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int nobj, int objsize); +extern void lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl); static inline void * lib_freelist_alloc (lib_freelist_t *fl) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_freeobj_t *o; if (list_empty (&fl->fl_list)) @@ -70,7 +71,7 @@ lib_freelist_alloc (lib_freelist_t *fl) static inline void lib_freelist_free (lib_freelist_t *fl, void *obj) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); list_add (&o->fo_list, &fl->fl_list); @@ -78,78 +79,78 @@ lib_freelist_free (lib_freelist_t *fl, void *obj) static inline lib_eq_t * -lib_eq_alloc (nal_cb_t *nal) +lib_eq_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_eq_t *eq; - state_lock (nal, &flags); - eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + eq = (lib_eq_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_eqs); + LIB_UNLOCK (nal, 
flags); return (eq); } static inline void -lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_eqs, eq); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_eqs, eq); } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) +lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_md_t *md; - state_lock (nal, &flags); - md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + md = (lib_md_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mds); + LIB_UNLOCK (nal, flags); return (md); } static inline void -lib_md_free (nal_cb_t *nal, lib_md_t *md) +lib_md_free (lib_nal_t *nal, lib_md_t *md) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_mds, md); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_mds, md); } static inline lib_me_t * -lib_me_alloc (nal_cb_t *nal) +lib_me_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_me_t *me; - state_lock (nal, &flags); - me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + me = (lib_me_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mes); + LIB_UNLOCK (nal, flags); return (me); } static inline void -lib_me_free (nal_cb_t *nal, lib_me_t *me) +lib_me_free (lib_nal_t *nal, lib_me_t *me) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_mes, me); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_mes, me); } static inline lib_msg_t * -lib_msg_alloc (nal_cb_t *nal) +lib_msg_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER 
called with liblock held */ unsigned long flags; lib_msg_t *msg; - state_lock (nal, &flags); - msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + msg = (lib_msg_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_msgs); + LIB_UNLOCK (nal, flags); if (msg != NULL) { /* NULL pointers, clear flags etc */ @@ -160,18 +161,18 @@ lib_msg_alloc (nal_cb_t *nal) } static inline void -lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) +lib_msg_free (lib_nal_t *nal, lib_msg_t *msg) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_msgs, msg); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_msgs, msg); } #else static inline lib_eq_t * -lib_eq_alloc (nal_cb_t *nal) +lib_eq_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_eq_t *eq; PORTAL_ALLOC(eq, sizeof(*eq)); @@ -179,16 +180,16 @@ lib_eq_alloc (nal_cb_t *nal) } static inline void -lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(eq, sizeof(*eq)); } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) +lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_md_t *md; int size; int niov; @@ -214,9 +215,9 @@ lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) } static inline void -lib_md_free (nal_cb_t *nal, lib_md_t *md) +lib_md_free (lib_nal_t *nal, lib_md_t *md) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ int size; if ((md->options & PTL_MD_KIOV) != 0) @@ -228,9 +229,9 @@ lib_md_free (nal_cb_t *nal, lib_md_t *md) } static inline lib_me_t * -lib_me_alloc (nal_cb_t *nal) +lib_me_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_me_t 
*me; PORTAL_ALLOC(me, sizeof(*me)); @@ -238,16 +239,16 @@ lib_me_alloc (nal_cb_t *nal) } static inline void -lib_me_free(nal_cb_t *nal, lib_me_t *me) +lib_me_free(lib_nal_t *nal, lib_me_t *me) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(me, sizeof(*me)); } static inline lib_msg_t * -lib_msg_alloc(nal_cb_t *nal) +lib_msg_alloc(lib_nal_t *nal) { - /* NEVER called with statelock held; may be in interrupt... */ + /* NEVER called with liblock held; may be in interrupt... */ lib_msg_t *msg; if (in_interrupt()) @@ -264,27 +265,28 @@ lib_msg_alloc(nal_cb_t *nal) } static inline void -lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) +lib_msg_free(lib_nal_t *nal, lib_msg_t *msg) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(msg, sizeof(*msg)); } #endif -extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type); -extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type); -extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh); +extern lib_handle_t *lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type); +extern void lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type); +extern void lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh); static inline void -ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq) +ptl_eq2handle (ptl_handle_eq_t *handle, lib_nal_t *nal, lib_eq_t *eq) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = eq->eq_lh.lh_cookie; } static inline lib_eq_t * -ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) +ptl_handle2eq (ptl_handle_eq_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_EQ); if (lh == NULL) @@ -294,15 +296,16 @@ ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) } static inline void -ptl_md2handle 
(ptl_handle_md_t *handle, lib_md_t *md) +ptl_md2handle (ptl_handle_md_t *handle, lib_nal_t *nal, lib_md_t *md) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = md->md_lh.lh_cookie; } static inline lib_md_t * -ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) +ptl_handle2md (ptl_handle_md_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_MD); if (lh == NULL) @@ -312,12 +315,12 @@ ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) } static inline lib_md_t * -ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) +ptl_wire_handle2md (ptl_handle_wire_t *wh, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh; - if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie) + if (wh->wh_interface_cookie != nal->libnal_ni.ni_interface_cookie) return (NULL); lh = lib_lookup_cookie (nal, wh->wh_object_cookie, @@ -329,15 +332,16 @@ ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) } static inline void -ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me) +ptl_me2handle (ptl_handle_me_t *handle, lib_nal_t *nal, lib_me_t *me) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = me->me_lh.lh_cookie; } static inline lib_me_t * -ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) +ptl_handle2me (ptl_handle_me_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_ME); if (lh == NULL) @@ -346,35 +350,30 @@ ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) return (lh_entry (lh, lib_me_t, me_lh)); } -extern int lib_init(nal_cb_t *cb, ptl_process_id_t pid, +extern int lib_init(lib_nal_t *libnal, nal_t *apinal, + ptl_process_id_t pid, ptl_ni_limits_t *desired_limits, 
ptl_ni_limits_t *actual_limits); -extern int lib_fini(nal_cb_t * cb); -extern void lib_dispatch(nal_cb_t * cb, void *private, int index, - void *arg_block, void *ret_block); -extern char *dispatch_name(int index); +extern int lib_fini(lib_nal_t *libnal); /* - * When the NAL detects an incoming message, it should call - * lib_parse() decode it. The NAL callbacks will be handed - * the private cookie as a way for the NAL to maintain state - * about which transaction is being processed. An extra parameter, - * lib_cookie will contain the necessary information for - * finalizing the message. - * - * After it has finished the handling the message, it should - * call lib_finalize() with the lib_cookie parameter. - * Call backs will be made to write events, send acks or - * replies and so on. + * When the NAL detects an incoming message header, it should call + * lib_parse() decode it. If the message header is garbage, lib_parse() + * returns immediately with failure, otherwise the NAL callbacks will be + * called to receive the message body. They are handed the private cookie + * as a way for the NAL to maintain state about which transaction is being + * processed. An extra parameter, lib_msg contains the lib-level message + * state for passing to lib_finalize() when the message body has been + * received. 
*/ -extern void lib_enq_event_locked (nal_cb_t *nal, void *private, +extern void lib_enq_event_locked (lib_nal_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev); -extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, +extern void lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_ni_fail_t ni_fail_type); -extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private); -extern lib_msg_t *lib_create_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, +extern ptl_err_t lib_parse (lib_nal_t *nal, ptl_hdr_t *hdr, void *private); +extern lib_msg_t *lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, lib_msg_t *get_msg); -extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr); +extern void print_hdr (lib_nal_t * nal, ptl_hdr_t * hdr); extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); @@ -397,14 +396,65 @@ extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, extern void lib_assert_wire_constants (void); -extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, +extern ptl_err_t lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); -extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, +extern ptl_err_t lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, lib_md_t *md, ptl_size_t offset, ptl_size_t len); -extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, - ptl_md_t * md_out); -extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in); -extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in); +extern int lib_api_ni_status (nal_t *nal, ptl_sr_index_t sr_idx, + ptl_sr_value_t *status); +extern int lib_api_ni_dist (nal_t *nal, ptl_process_id_t *pid, + unsigned long *dist); + +extern int lib_api_eq_alloc (nal_t *nal, ptl_size_t count, + ptl_eq_handler_t callback, + ptl_handle_eq_t *handle); +extern int 
lib_api_eq_free(nal_t *nal, ptl_handle_eq_t *eqh); +extern int lib_api_eq_poll (nal_t *nal, + ptl_handle_eq_t *eventqs, int neq, int timeout_ms, + ptl_event_t *event, int *which); + +extern int lib_api_me_attach(nal_t *nal, + ptl_pt_index_t portal, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); +extern int lib_api_me_insert(nal_t *nal, + ptl_handle_me_t *current_meh, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); +extern int lib_api_me_unlink (nal_t *nal, ptl_handle_me_t *meh); +extern void lib_me_unlink(lib_nal_t *nal, lib_me_t *me); + +extern int lib_api_get_id(nal_t *nal, ptl_process_id_t *pid); + +extern void lib_md_unlink(lib_nal_t *nal, lib_md_t *md); +extern void lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd); +extern int lib_api_md_attach(nal_t *nal, ptl_handle_me_t *meh, + ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle); +extern int lib_api_md_bind(nal_t *nal, ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle); +extern int lib_api_md_unlink (nal_t *nal, ptl_handle_md_t *mdh); +extern int lib_api_md_update (nal_t *nal, ptl_handle_md_t *mdh, + ptl_md_t *oldumd, ptl_md_t *newumd, + ptl_handle_eq_t *testqh); + +extern int lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, ptl_size_t offset); +extern int lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_ack_req_t ack, ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, + ptl_size_t offset, ptl_hdr_data_t hdr_data); +extern int lib_api_fail_nid(nal_t *apinal, ptl_nid_t nid, unsigned int threshold); + #endif diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 
ef618c7..6549988 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -13,6 +13,7 @@ #include "build_check.h" #include +#include #ifdef __KERNEL__ # include # include @@ -22,9 +23,6 @@ # include #endif -/* struct nal_cb_t is defined in lib-nal.h */ -typedef struct nal_cb_t nal_cb_t; - typedef char *user_ptr; typedef struct lib_msg_t lib_msg_t; typedef struct lib_ptl_t lib_ptl_t; @@ -165,11 +163,12 @@ typedef struct { struct lib_eq_t { struct list_head eq_list; lib_handle_t eq_lh; - ptl_seq_t sequence; - ptl_size_t size; - ptl_event_t *base; + ptl_seq_t eq_enq_seq; + ptl_seq_t eq_deq_seq; + ptl_size_t eq_size; + ptl_event_t *eq_events; int eq_refcount; - ptl_eq_handler_t event_callback; + ptl_eq_handler_t eq_callback; void *eq_addrkey; }; @@ -244,29 +243,117 @@ typedef struct { /* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be * extracted by masking with (PTL_COOKIE_TYPES - 1) */ -typedef struct { - ptl_nid_t nid; - ptl_pid_t pid; - lib_ptl_t tbl; - lib_counters_t counters; - ptl_ni_limits_t actual_limits; +typedef struct lib_ni +{ + nal_t *ni_api; + ptl_process_id_t ni_pid; + lib_ptl_t ni_portals; + lib_counters_t ni_counters; + ptl_ni_limits_t ni_actual_limits; int ni_lh_hash_size; /* size of lib handle hash table */ struct list_head *ni_lh_hash_table; /* all extant lib handles, this interface */ __u64 ni_next_object_cookie; /* cookie generator */ __u64 ni_interface_cookie; /* uniquely identifies this ni in this epoch */ - struct list_head ni_test_peers; + struct list_head ni_test_peers; #ifdef PTL_USE_LIB_FREELIST - lib_freelist_t ni_free_mes; - lib_freelist_t ni_free_msgs; - lib_freelist_t ni_free_mds; - lib_freelist_t ni_free_eqs; + lib_freelist_t ni_free_mes; + lib_freelist_t ni_free_msgs; + lib_freelist_t ni_free_mds; + lib_freelist_t ni_free_eqs; +#endif + + struct list_head ni_active_msgs; + struct list_head ni_active_mds; + struct list_head ni_active_eqs; + +#ifdef __KERNEL__ + spinlock_t ni_lock; + 
wait_queue_head_t ni_waitq; +#else + pthread_mutex_t ni_mutex; + pthread_cond_t ni_cond; #endif - struct list_head ni_active_msgs; - struct list_head ni_active_mds; - struct list_head ni_active_eqs; } lib_ni_t; + +typedef struct lib_nal +{ + /* lib-level interface state */ + lib_ni_t libnal_ni; + + /* NAL-private data */ + void *libnal_data; + + /* + * send: Sends a preformatted header and payload data to a + * specified remote process. The payload is scattered over 'niov' + * fragments described by iov, starting at 'offset' for 'mlen' + * bytes. + * NB the NAL may NOT overwrite iov. + * PTL_OK on success => NAL has committed to send and will call + * lib_finalize on completion + */ + ptl_err_t (*libnal_send) + (struct lib_nal *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen); + + /* as send, but with a set of page fragments (NULL if not supported) */ + ptl_err_t (*libnal_send_pages) + (struct lib_nal *nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, ptl_kiov_t *iov, + size_t offset, size_t mlen); + /* + * recv: Receives an incoming message from a remote process. The + * payload is to be received into the scattered buffer of 'niov' + * fragments described by iov, starting at 'offset' for 'mlen' + * bytes. Payload bytes after 'mlen' up to 'rlen' are to be + * discarded. + * NB the NAL may NOT overwrite iov. 
+ * PTL_OK on success => NAL has committed to receive and will call + * lib_finalize on completion + */ + ptl_err_t (*libnal_recv) + (struct lib_nal *nal, void *private, lib_msg_t * cookie, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen); + + /* as recv, but with a set of page fragments (NULL if not supported) */ + ptl_err_t (*libnal_recv_pages) + (struct lib_nal *nal, void *private, lib_msg_t * cookie, + unsigned int niov, ptl_kiov_t *iov, + size_t offset, size_t mlen, size_t rlen); + + /* + * (un)map: Tell the NAL about some memory it will access. + * *addrkey passed to libnal_unmap() is what libnal_map() set it to. + * type of *iov depends on options. + * Set to NULL if not required. + */ + ptl_err_t (*libnal_map) + (struct lib_nal *nal, unsigned int niov, struct iovec *iov, + void **addrkey); + void (*libnal_unmap) + (struct lib_nal *nal, unsigned int niov, struct iovec *iov, + void **addrkey); + + /* as (un)map, but with a set of page fragments */ + ptl_err_t (*libnal_map_pages) + (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + void (*libnal_unmap_pages) + (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + + void (*libnal_printf)(struct lib_nal *nal, const char *fmt, ...); + + /* Calculate a network "distance" to given node */ + int (*libnal_dist) (struct lib_nal *nal, ptl_nid_t nid, unsigned long *dist); +} lib_nal_t; + #endif diff --git a/lnet/include/lnet/nal.h b/lnet/include/lnet/nal.h index 1f925c1..bf86569 100644 --- a/lnet/include/lnet/nal.h +++ b/lnet/include/lnet/nal.h @@ -11,32 +11,73 @@ #include -#ifdef yield -#undef yield -#endif - typedef struct nal_t nal_t; struct nal_t { + /* common interface state */ int nal_refct; + ptl_handle_ni_t nal_handle; + + /* NAL-private data */ void *nal_data; - int (*startup) (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *req, ptl_ni_limits_t *actual); + /* NAL API implementation + * NB only nal_ni_init needs to be 
set when the NAL registers itself */ + int (*nal_ni_init) (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *req, ptl_ni_limits_t *actual); - void (*shutdown) (nal_t *nal); + void (*nal_ni_fini) (nal_t *nal); - int (*forward) (nal_t *nal, int index, /* Function ID */ - void *args, size_t arg_len, void *ret, size_t ret_len); + int (*nal_get_id) (nal_t *nal, ptl_process_id_t *id); + int (*nal_ni_status) (nal_t *nal, ptl_sr_index_t register, ptl_sr_value_t *status); + int (*nal_ni_dist) (nal_t *nal, ptl_process_id_t *id, unsigned long *distance); + int (*nal_fail_nid) (nal_t *nal, ptl_nid_t nid, unsigned int threshold); - int (*yield) (nal_t *nal, unsigned long *flags, int milliseconds); + int (*nal_me_attach) (nal_t *nal, ptl_pt_index_t portal, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); + int (*nal_me_insert) (nal_t *nal, ptl_handle_me_t *me, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); + int (*nal_me_unlink) (nal_t *nal, ptl_handle_me_t *me); + + int (*nal_md_attach) (nal_t *nal, ptl_handle_me_t *me, + ptl_md_t *md, ptl_unlink_t unlink, + ptl_handle_md_t *handle); + int (*nal_md_bind) (nal_t *nal, + ptl_md_t *md, ptl_unlink_t unlink, + ptl_handle_md_t *handle); + int (*nal_md_unlink) (nal_t *nal, ptl_handle_md_t *md); + int (*nal_md_update) (nal_t *nal, ptl_handle_md_t *md, + ptl_md_t *old_md, ptl_md_t *new_md, + ptl_handle_eq_t *testq); - void (*lock) (nal_t *nal, unsigned long *flags); + int (*nal_eq_alloc) (nal_t *nal, ptl_size_t count, + ptl_eq_handler_t handler, + ptl_handle_eq_t *handle); + int (*nal_eq_free) (nal_t *nal, ptl_handle_eq_t *eq); + int (*nal_eq_poll) (nal_t *nal, + ptl_handle_eq_t *eqs, int neqs, int timeout, + ptl_event_t *event, int *which); - void (*unlock) (nal_t *nal, unsigned long *flags); + int 
(*nal_ace_entry) (nal_t *nal, ptl_ac_index_t index, + ptl_process_id_t match_id, ptl_pt_index_t portal); + + int (*nal_put) (nal_t *nal, ptl_handle_md_t *md, ptl_ack_req_t ack, + ptl_process_id_t *target, ptl_pt_index_t portal, + ptl_ac_index_t ac, ptl_match_bits_t match, + ptl_size_t offset, ptl_hdr_data_t hdr_data); + int (*nal_get) (nal_t *nal, ptl_handle_md_t *md, + ptl_process_id_t *target, ptl_pt_index_t portal, + ptl_ac_index_t ac, ptl_match_bits_t match, + ptl_size_t offset); }; -extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any); +extern nal_t *ptl_hndl2nal(ptl_handle_any_t *any); #ifdef __KERNEL__ extern int ptl_register_nal(ptl_interface_t interface, nal_t *nal); diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h index ef2712b..250b954 100644 --- a/lnet/include/lnet/types.h +++ b/lnet/include/lnet/types.h @@ -153,17 +153,6 @@ typedef void (*ptl_eq_handler_t)(ptl_event_t *event); #define PTL_EQ_HANDLER_NONE NULL typedef struct { - volatile ptl_seq_t sequence; - ptl_size_t size; - ptl_event_t *base; - ptl_handle_any_t cb_eq_handle; -} ptl_eq_t; - -typedef struct { - ptl_eq_t *eq; -} ptl_ni_t; - -typedef struct { int max_mes; int max_mds; int max_eqs; diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h index e48552e..ca98f84 100644 --- a/lnet/klnds/gmlnd/gmlnd.h +++ b/lnet/klnds/gmlnd/gmlnd.h @@ -190,7 +190,6 @@ typedef struct _gmnal_rxtwe { #define NRXTHREADS 10 /* max number of receiver threads */ typedef struct _gmnal_data_t { - spinlock_t cb_lock; spinlock_t stxd_lock; struct semaphore stxd_token; gmnal_stxd_t *stxd; @@ -205,7 +204,7 @@ typedef struct _gmnal_data_t { gmnal_srxd_t *srxd; struct gm_hash *srxd_hash; nal_t *nal; - nal_cb_t *nal_cb; + lib_nal_t *libnal; struct gm_port *gm_port; unsigned int gm_local_nid; unsigned int gm_global_nid; @@ -298,7 +297,6 @@ extern gmnal_data_t *global_nal_data; #define GMNAL_GM_LOCK_INIT(a) spin_lock_init(&a->gm_lock); #define GMNAL_GM_LOCK(a) spin_lock(&a->gm_lock); #define 
GMNAL_GM_UNLOCK(a) spin_unlock(&a->gm_lock); -#define GMNAL_CB_LOCK_INIT(a) spin_lock_init(&a->cb_lock); /* @@ -340,39 +338,19 @@ void gmnal_api_unlock(nal_t *, unsigned long *); * CB NAL */ -int gmnal_cb_send(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, +int gmnal_cb_send(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t); -int gmnal_cb_send_pages(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, +int gmnal_cb_send_pages(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t); -int gmnal_cb_recv(nal_cb_t *, void *, lib_msg_t *, +int gmnal_cb_recv(lib_nal_t *, void *, lib_msg_t *, unsigned int, struct iovec *, size_t, size_t); -int gmnal_cb_recv_pages(nal_cb_t *, void *, lib_msg_t *, +int gmnal_cb_recv_pages(lib_nal_t *, void *, lib_msg_t *, unsigned int, ptl_kiov_t *, size_t, size_t); -int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t); - -int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t); - -int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *); - -void *gmnal_cb_malloc(nal_cb_t *, size_t); - -void gmnal_cb_free(nal_cb_t *, void *, size_t); - -void gmnal_cb_unmap(nal_cb_t *, unsigned int, struct iovec*, void **); - -int gmnal_cb_map(nal_cb_t *, unsigned int, struct iovec*, void **); - -void gmnal_cb_printf(nal_cb_t *, const char *fmt, ...); - -void gmnal_cb_cli(nal_cb_t *, unsigned long *); - -void gmnal_cb_sti(nal_cb_t *, unsigned long *); - -int gmnal_cb_dist(nal_cb_t *, ptl_nid_t, unsigned long *); +int gmnal_cb_dist(lib_nal_t *, ptl_nid_t, unsigned long *); int gmnal_init(void); @@ -381,22 +359,14 @@ void gmnal_fini(void); #define GMNAL_INIT_NAL_CB(a) do { \ - a->cb_send = gmnal_cb_send; \ - a->cb_send_pages = gmnal_cb_send_pages; \ - a->cb_recv = gmnal_cb_recv; \ - a->cb_recv_pages = gmnal_cb_recv_pages; \ - a->cb_read = gmnal_cb_read; \ - a->cb_write = gmnal_cb_write; \ - a->cb_callback = 
gmnal_cb_callback; \ - a->cb_malloc = gmnal_cb_malloc; \ - a->cb_free = gmnal_cb_free; \ - a->cb_map = NULL; \ - a->cb_unmap = NULL; \ - a->cb_printf = gmnal_cb_printf; \ - a->cb_cli = gmnal_cb_cli; \ - a->cb_sti = gmnal_cb_sti; \ - a->cb_dist = gmnal_cb_dist; \ - a->nal_data = NULL; \ + a->libnal_send = gmnal_cb_send; \ + a->libnal_send_pages = gmnal_cb_send_pages; \ + a->libnal_recv = gmnal_cb_recv; \ + a->libnal_recv_pages = gmnal_cb_recv_pages; \ + a->libnal_map = NULL; \ + a->libnal_unmap = NULL; \ + a->libnal_dist = gmnal_cb_dist; \ + a->libnal_data = NULL; \ } while (0) @@ -451,9 +421,9 @@ void gmnal_remove_rxtwe(gmnal_data_t *); /* * Small messages */ -int gmnal_small_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int, +int gmnal_small_rx(lib_nal_t *, void *, lib_msg_t *, unsigned int, struct iovec *, size_t, size_t); -int gmnal_small_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, +int gmnal_small_tx(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec*, int); void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); @@ -463,10 +433,10 @@ void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); /* * Large messages */ -int gmnal_large_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int, +int gmnal_large_rx(lib_nal_t *, void *, lib_msg_t *, unsigned int, struct iovec *, size_t, size_t); -int gmnal_large_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, +int gmnal_large_tx(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec*, int); diff --git a/lnet/klnds/gmlnd/gmlnd_api.c b/lnet/klnds/gmlnd/gmlnd_api.c index 7c94f93..002587d 100644 --- a/lnet/klnds/gmlnd/gmlnd_api.c +++ b/lnet/klnds/gmlnd/gmlnd_api.c @@ -50,77 +50,6 @@ static ctl_table gmnalnal_top_sysctl_table[] = { { 0 } }; - - - - - -/* - * gmnal_api_forward - * This function takes a pack block of arguments from the NAL API - * module and passes them to the NAL CB module. 
The CB module unpacks - * the args and calls the appropriate function indicated by index. - * Typically this function is used to pass args between kernel and use - * space. - * As lgmanl exists entirely in kernel, just pass the arg block directly - * to the NAL CB, buy passing the args to lib_dispatch - * Arguments are - * nal_t nal Our nal - * int index the api function that initiated this call - * void *args packed block of function args - * size_t arg_len length of args block - * void *ret A return value for the API NAL - * size_t ret_len Size of the return value - * - */ - -int -gmnal_api_forward(nal_t *nal, int index, void *args, size_t arg_len, - void *ret, size_t ret_len) -{ - - nal_cb_t *nal_cb = NULL; - gmnal_data_t *nal_data = NULL; - - - - - - if (!nal || !args || (index < 0) || (arg_len < 0)) { - CDEBUG(D_ERROR, "Bad args to gmnal_api_forward\n"); - return (PTL_FAIL); - } - - if (ret && (ret_len <= 0)) { - CDEBUG(D_ERROR, "Bad args to gmnal_api_forward\n"); - return (PTL_FAIL); - } - - - if (!nal->nal_data) { - CDEBUG(D_ERROR, "bad nal, no nal data\n"); - return (PTL_FAIL); - } - - nal_data = nal->nal_data; - CDEBUG(D_INFO, "nal_data is [%p]\n", nal_data); - - if (!nal_data->nal_cb) { - CDEBUG(D_ERROR, "bad nal_data, no nal_cb\n"); - return (PTL_FAIL); - } - - nal_cb = nal_data->nal_cb; - CDEBUG(D_INFO, "nal_cb is [%p]\n", nal_cb); - - CDEBUG(D_PORTALS, "gmnal_api_forward calling lib_dispatch\n"); - lib_dispatch(nal_cb, NULL, index, args, ret); - CDEBUG(D_PORTALS, "gmnal_api_forward returns from lib_dispatch\n"); - - return(PTL_OK); -} - - /* * gmnal_api_shutdown * nal_refct == 0 => called on last matching PtlNIFini() @@ -131,7 +60,7 @@ void gmnal_api_shutdown(nal_t *nal, int interface) { gmnal_data_t *nal_data; - nal_cb_t *nal_cb; + lib_nal_t *libnal; if (nal->nal_refct != 0) return; @@ -139,9 +68,9 @@ gmnal_api_shutdown(nal_t *nal, int interface) CDEBUG(D_TRACE, "gmnal_api_shutdown: nal_data [%p]\n", nal_data); LASSERT(nal == global_nal_data->nal); - 
nal_data = nal->nal_data; + libnal = (lib_nal_t *)nal->nal_data; + nal_data = (gmnal_data_t *)libnal->libnal_data; LASSERT(nal_data == global_nal_data); - nal_cb = nal_data->nal_cb; /* Stop portals calling our ioctl handler */ libcfs_nal_cmd_unregister(GMNAL); @@ -150,7 +79,7 @@ gmnal_api_shutdown(nal_t *nal, int interface) * flag so when lib calls us we fail immediately and dont queue any * more work but our threads can still call into lib OK. THEN * shutdown our threads, THEN lib_fini() */ - lib_fini(nal_cb); + lib_fini(libnal); gmnal_stop_rxthread(nal_data); gmnal_stop_ctthread(nal_data); @@ -162,94 +91,22 @@ gmnal_api_shutdown(nal_t *nal, int interface) GMNAL_GM_UNLOCK(nal_data); if (nal_data->sysctl) unregister_sysctl_table (nal_data->sysctl); - PORTAL_FREE(nal, sizeof(nal_t)); + /* Don't free 'nal'; it's a static struct */ PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); global_nal_data = NULL; PORTAL_MODULE_UNUSE; } -/* - * gmnal_api_validate - * validate a user address for use in communications - * There's nothing to be done here - */ -int -gmnal_api_validate(nal_t *nal, void *base, size_t extent) -{ - - return(PTL_OK); -} - - - -/* - * gmnal_api_yield - * Give up the processor - */ -void -gmnal_api_yield(nal_t *nal, unsigned long *flags, int milliseconds) -{ - CDEBUG(D_TRACE, "gmnal_api_yield : nal [%p]\n", nal); - - if (milliseconds != 0) { - CERROR("Blocking yield not implemented yet\n"); - LBUG(); - } - - our_cond_resched(); - return; -} - - - -/* - * gmnal_api_lock - * Take a threadsafe lock - */ -void -gmnal_api_lock(nal_t *nal, unsigned long *flags) -{ - - gmnal_data_t *nal_data; - nal_cb_t *nal_cb; - - nal_data = nal->nal_data; - nal_cb = nal_data->nal_cb; - - nal_cb->cb_cli(nal_cb, flags); - - return; -} - -/* - * gmnal_api_unlock - * Release a threadsafe lock - */ -void -gmnal_api_unlock(nal_t *nal, unsigned long *flags) -{ - gmnal_data_t *nal_data; - nal_cb_t *nal_cb; 
- - nal_data = nal->nal_data; - nal_cb = nal_data->nal_cb; - - nal_cb->cb_sti(nal_cb, flags); - - return; -} - - int gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { - nal_cb_t *nal_cb = NULL; + lib_nal_t *libnal = NULL; gmnal_data_t *nal_data = NULL; gmnal_srxd_t *srxd = NULL; gm_status_t gm_status; @@ -258,9 +115,8 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, if (nal->nal_refct != 0) { if (actual_limits != NULL) { - nal_data = (gmnal_data_t *)nal->nal_data; - nal_cb = nal_data->nal_cb; - *actual_limits = nal->_cb->ni.actual_limits; + libnal = (lib_nal_t *)nal->nal_data; + *actual_limits = nal->libnal_ni.ni_actual_limits; return (PTL_OK); } @@ -283,24 +139,22 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, CDEBUG(D_INFO, "Allocd and reset nal_data[%p]\n", nal_data); CDEBUG(D_INFO, "small_msg_size is [%d]\n", nal_data->small_msg_size); - PORTAL_ALLOC(nal_cb, sizeof(nal_cb_t)); - if (!nal_cb) { + PORTAL_ALLOC(libnal, sizeof(lib_nal_t)); + if (!libnal) { PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); return(PTL_NO_SPACE); } - memset(nal_cb, 0, sizeof(nal_cb_t)); - CDEBUG(D_INFO, "Allocd and reset nal_cb[%p]\n", nal_cb); + memset(libnal, 0, sizeof(lib_nal_t)); + CDEBUG(D_INFO, "Allocd and reset libnal[%p]\n", libnal); - GMNAL_INIT_NAL_CB(nal_cb); + GMNAL_INIT_NAL_CB(libnal); /* * String them all together */ - nal->nal_data = (void*)nal_data; - nal_cb->nal_data = (void*)nal_data; + libnal->libnal_data = (void*)nal_data; nal_data->nal = nal; - nal_data->nal_cb = nal_cb; + nal_data->libnal = libnal; - GMNAL_CB_LOCK_INIT(nal_data); GMNAL_GM_LOCK_INIT(nal_data); @@ -311,7 +165,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, if (gm_init() != GM_SUCCESS) { CDEBUG(D_ERROR, "call to gm_init failed\n"); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -356,7 +210,7 @@ 
gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -373,7 +227,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -402,7 +256,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -434,7 +288,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } nal_data->gm_local_nid = local_nid; @@ -454,7 +308,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } CDEBUG(D_INFO, "Global node id is [%u]\n", global_nid); @@ -471,7 +325,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, CDEBUG(D_INFO, "portals_nid is ["LPU64"]\n", process_id.nid); CDEBUG(D_PORTALS, "calling lib_init\n"); - if (lib_init(nal_cb, process_id, + if (lib_init(libnal, nal, process_id, requested_limits, actual_limits) != PTL_OK) { CDEBUG(D_ERROR, "lib_init failed\n"); gmnal_stop_rxthread(nal_data); @@ -483,7 +337,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, 
sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -493,7 +347,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, /* XXX these cleanup cases should be restructured to * minimise duplication... */ - lib_fini(nal_cb); + lib_fini(libnal); gmnal_stop_rxthread(nal_data); gmnal_stop_ctthread(nal_data); @@ -504,7 +358,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -550,10 +404,6 @@ int gmnal_init(void) */ void gmnal_fini() { - gmnal_data_t *nal_data = global_nal_data; - nal_t *nal = nal_data->nal; - nal_cb_t *nal_cb = nal_data->nal_cb; - CDEBUG(D_TRACE, "gmnal_fini\n"); LASSERT(global_nal_data == NULL); diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c index ece1380..e99d3ec 100644 --- a/lnet/klnds/gmlnd/gmlnd_cb.c +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -27,7 +27,7 @@ #include "gmnal.h" -int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +int gmnal_cb_recv(lib_nal_t *libnal, void *private, lib_msg_t *cookie, unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) { @@ -35,19 +35,19 @@ int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int status = PTL_OK; - CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p], " + CDEBUG(D_TRACE, "gmnal_cb_recv libnal [%p], private[%p], cookie[%p], " "niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", - nal_cb, private, cookie, niov, iov, mlen, rlen); + libnal, private, cookie, niov, iov, mlen, rlen); switch(srxd->type) { case(GMNAL_SMALL_MESSAGE): CDEBUG(D_INFO, "gmnal_cb_recv got small message\n"); - status = gmnal_small_rx(nal_cb, private, cookie, niov, + status = gmnal_small_rx(libnal, private, cookie, niov, iov, mlen, rlen); break; case(GMNAL_LARGE_MESSAGE_INIT): CDEBUG(D_INFO, "gmnal_cb_recv got large message init\n"); - status = 
gmnal_large_rx(nal_cb, private, cookie, niov, + status = gmnal_large_rx(libnal, private, cookie, niov, iov, mlen, rlen); } @@ -56,7 +56,7 @@ int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, return(status); } -int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +int gmnal_cb_recv_pages(lib_nal_t *libnal, void *private, lib_msg_t *cookie, unsigned int kniov, ptl_kiov_t *kiov, size_t mlen, size_t rlen) { @@ -67,9 +67,9 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, ptl_kiov_t *kiov_dup = kiov;; - CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p], " + CDEBUG(D_TRACE, "gmnal_cb_recv_pages libnal [%p],private[%p], " "cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", - nal_cb, private, cookie, kniov, kiov, mlen, rlen); + libnal, private, cookie, kniov, kiov, mlen, rlen); if (srxd->type == GMNAL_SMALL_MESSAGE) { PORTAL_ALLOC(iovec, sizeof(struct iovec)*kniov); @@ -98,7 +98,7 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, kiov++; } CDEBUG(D_INFO, "calling gmnal_small_rx\n"); - status = gmnal_small_rx(nal_cb, private, cookie, kniov, + status = gmnal_small_rx(libnal, private, cookie, kniov, iovec_dup, mlen, rlen); for (i=0; ikiov_page); @@ -113,7 +113,7 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, } -int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +int gmnal_cb_send(lib_nal_t *libnal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int niov, struct iovec *iov, size_t len) { @@ -123,24 +123,25 @@ int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] len["LPSZ"] nid["LPU64"]\n", niov, len, nid); - nal_data = nal_cb->nal_data; + nal_data = libnal->libnal_data; if (GMNAL_IS_SMALL_MESSAGE(nal_data, niov, iov, len)) { CDEBUG(D_INFO, "This is a small message send\n"); - 
gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, pid, + gmnal_small_tx(libnal, private, cookie, hdr, type, nid, pid, niov, iov, len); } else { CDEBUG(D_ERROR, "Large message send it is not supported\n"); - lib_finalize(nal_cb, private, cookie, PTL_FAIL); + lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); - gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, + gmnal_large_tx(libnal, private, cookie, hdr, type, nid, pid, niov, iov, len); } return(PTL_OK); } -int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int kniov, ptl_kiov_t *kiov, size_t len) +int gmnal_cb_send_pages(lib_nal_t *libnal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int kniov, ptl_kiov_t *kiov, size_t len) { int i = 0; @@ -149,7 +150,7 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, ptl_kiov_t *kiov_dup = kiov; CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] len["LPSZ"]\n", nid, kniov, len); - nal_data = nal_cb->nal_data; + nal_data = libnal->libnal_data; PORTAL_ALLOC(iovec, kniov*sizeof(struct iovec)); iovec_dup = iovec; if (GMNAL_IS_SMALL_MESSAGE(nal_data, 0, NULL, len)) { @@ -168,7 +169,7 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, iovec++; kiov++; } - gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, + gmnal_small_tx(libnal, private, cookie, hdr, type, nid, pid, kniov, iovec_dup, len); } else { CDEBUG(D_ERROR, "Large message send it is not supported yet\n"); @@ -185,7 +186,7 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, iovec++; kiov++; } - gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, + gmnal_large_tx(libnal, private, cookie, hdr, type, nid, pid, kniov, iovec, len); } for (i=0; ievent_callback != NULL) { - CDEBUG(D_INFO, "found callback\n"); - eq->event_callback(ev); - } - - 
return(PTL_OK); -} - -void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len) -{ - void *ptr = NULL; - CDEBUG(D_TRACE, "gmnal_cb_malloc len["LPSZ"]\n", len); - PORTAL_ALLOC(ptr, len); - return(ptr); -} - -void gmnal_cb_free(nal_cb_t *nal_cb, void *buf, size_t len) -{ - CDEBUG(D_TRACE, "gmnal_cb_free :: buf[%p] len["LPSZ"]\n", buf, len); - PORTAL_FREE(buf, len); - return; -} - -void gmnal_cb_unmap(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov, - void **addrkey) -{ - return; -} - -int gmnal_cb_map(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov, - void**addrkey) -{ - return(PTL_OK); -} - -void gmnal_cb_printf(nal_cb_t *nal_cb, const char *fmt, ...) -{ - CDEBUG(D_TRACE, "gmnal_cb_printf\n"); - printk(fmt); - return; -} - -void gmnal_cb_cli(nal_cb_t *nal_cb, unsigned long *flags) -{ - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; - - spin_lock_irqsave(&nal_data->cb_lock, *flags); - return; -} - -void gmnal_cb_sti(nal_cb_t *nal_cb, unsigned long *flags) -{ - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; - - spin_unlock_irqrestore(&nal_data->cb_lock, *flags); - return; -} - -void gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, ptl_event_t *ev) -{ - /* holding cb_lock */ - - if (eq->event_callback != NULL) - eq->event_callback(ev); - - /* We will wake theads sleeping in yield() here, AFTER the - * callback, when we implement blocking yield */ -} - -int gmnal_cb_dist(nal_cb_t *nal_cb, ptl_nid_t nid, unsigned long *dist) +int gmnal_cb_dist(lib_nal_t *libnal, ptl_nid_t nid, unsigned long *dist) { CDEBUG(D_TRACE, "gmnal_cb_dist\n"); if (dist) diff --git a/lnet/klnds/gmlnd/gmlnd_comm.c b/lnet/klnds/gmlnd/gmlnd_comm.c index 1bcd9bd..4af7186 100644 --- a/lnet/klnds/gmlnd/gmlnd_comm.c +++ b/lnet/klnds/gmlnd/gmlnd_comm.c @@ -189,6 +189,7 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) unsigned int snode, sport, type, length; gmnal_msghdr_t *gmnal_msghdr; ptl_hdr_t *portals_hdr; + int rc; 
CDEBUG(D_INFO, "nal_data [%p], we[%p] type [%d]\n", nal_data, we, gmnal_type); @@ -219,10 +220,12 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) */ srxd = gmnal_rxbuffer_to_srxd(nal_data, buffer); CDEBUG(D_INFO, "Back from gmnal_rxbuffer_to_srxd\n"); - srxd->nal_data = nal_data; if (!srxd) { CDEBUG(D_ERROR, "Failed to get receive descriptor\n"); - lib_parse(nal_data->nal_cb, portals_hdr, srxd); + /* I think passing a NULL srxd to lib_parse will crash + * gmnal_recv() */ + LBUG(); + lib_parse(nal_data->libnal, portals_hdr, srxd); return(GMNAL_STATUS_FAIL); } @@ -234,6 +237,7 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) return(GMNAL_STATUS_OK); } + srxd->nal_data = nal_data; srxd->type = gmnal_type; srxd->nsiov = gmnal_msghdr->niov; srxd->gm_source_node = gmnal_msghdr->sender_node_id; @@ -245,7 +249,12 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) * cb_recv is responsible for returning the buffer * for future receive */ - lib_parse(nal_data->nal_cb, portals_hdr, srxd); + rc = lib_parse(nal_data->libnal, portals_hdr, srxd); + + if (rc != PTL_OK) { + /* I just received garbage; take appropriate action... 
*/ + LBUG(); + } return(GMNAL_STATUS_OK); } @@ -309,19 +318,19 @@ gmnal_rx_bad(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, gmnal_srxd_t *srxd) * Call lib_finalize */ int -gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +gmnal_small_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) { gmnal_srxd_t *srxd = NULL; void *buffer = NULL; - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; + gmnal_data_t *nal_data = (gmnal_data_t*)libnal->nal_data; CDEBUG(D_TRACE, "niov [%d] mlen["LPSZ"]\n", niov, mlen); if (!private) { CDEBUG(D_ERROR, "gmnal_small_rx no context\n"); - lib_finalize(nal_cb, private, cookie, PTL_FAIL); + lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -343,7 +352,7 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, * let portals library know receive is complete */ CDEBUG(D_PORTALS, "calling lib_finalize\n"); - lib_finalize(nal_cb, private, cookie, PTL_OK); + lib_finalize(libnal, private, cookie, PTL_OK); /* * return buffer so it can be used again */ @@ -365,11 +374,11 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, * The callback function informs when the send is complete. 
*/ int -gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +gmnal_small_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid, unsigned int niov, struct iovec *iov, int size) { - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; + gmnal_data_t *nal_data = (gmnal_data_t*)libnal->nal_data; gmnal_stxd_t *stxd = NULL; void *buffer = NULL; gmnal_msghdr_t *msghdr = NULL; @@ -377,9 +386,9 @@ gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, unsigned int local_nid; gm_status_t gm_status = GM_SUCCESS; - CDEBUG(D_TRACE, "gmnal_small_tx nal_cb [%p] private [%p] cookie [%p] " + CDEBUG(D_TRACE, "gmnal_small_tx libnal [%p] private [%p] cookie [%p] " "hdr [%p] type [%d] global_nid ["LPU64"] pid [%d] niov [%d] " - "iov [%p] size [%d]\n", nal_cb, private, cookie, hdr, type, + "iov [%p] size [%d]\n", libnal, private, cookie, hdr, type, global_nid, pid, niov, iov, size); CDEBUG(D_INFO, "portals_hdr:: dest_nid ["LPU64"], src_nid ["LPU64"]\n", @@ -472,7 +481,7 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) gmnal_stxd_t *stxd = (gmnal_stxd_t*)context; lib_msg_t *cookie = stxd->cookie; gmnal_data_t *nal_data = (gmnal_data_t*)stxd->nal_data; - nal_cb_t *nal_cb = nal_data->nal_cb; + lib_nal_t *libnal = nal_data->libnal; if (!stxd) { CDEBUG(D_TRACE, "send completion event for unknown stxd\n"); @@ -592,7 +601,7 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) return; } gmnal_return_stxd(nal_data, stxd); - lib_finalize(nal_cb, stxd, cookie, PTL_OK); + lib_finalize(libnal, stxd, cookie, PTL_OK); return; } @@ -645,7 +654,7 @@ void gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, * this ack, deregister the memory. Only 1 send token is required here. 
*/ int -gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +gmnal_large_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid, unsigned int niov, struct iovec *iov, int size) { @@ -661,15 +670,15 @@ gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int niov_dup; - CDEBUG(D_TRACE, "gmnal_large_tx nal_cb [%p] private [%p], cookie [%p] " + CDEBUG(D_TRACE, "gmnal_large_tx libnal [%p] private [%p], cookie [%p] " "hdr [%p], type [%d] global_nid ["LPU64"], pid [%d], niov [%d], " - "iov [%p], size [%d]\n", nal_cb, private, cookie, hdr, type, + "iov [%p], size [%d]\n", libnal, private, cookie, hdr, type, global_nid, pid, niov, iov, size); - if (nal_cb) - nal_data = (gmnal_data_t*)nal_cb->nal_data; + if (libnal) + nal_data = (gmnal_data_t*)libnal->nal_data; else { - CDEBUG(D_ERROR, "no nal_cb.\n"); + CDEBUG(D_ERROR, "no libnal.\n"); return(GMNAL_STATUS_FAIL); } @@ -811,11 +820,11 @@ gmnal_large_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) * data from the sender. 
*/ int -gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +gmnal_large_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, unsigned int nriov, struct iovec *riov, size_t mlen, size_t rlen) { - gmnal_data_t *nal_data = nal_cb->nal_data; + gmnal_data_t *nal_data = libnal->nal_data; gmnal_srxd_t *srxd = (gmnal_srxd_t*)private; void *buffer = NULL; struct iovec *riov_dup; @@ -823,13 +832,13 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, gmnal_msghdr_t *msghdr = NULL; gm_status_t gm_status; - CDEBUG(D_TRACE, "gmnal_large_rx :: nal_cb[%p], private[%p], " + CDEBUG(D_TRACE, "gmnal_large_rx :: libnal[%p], private[%p], " "cookie[%p], niov[%d], iov[%p], mlen["LPSZ"], rlen["LPSZ"]\n", - nal_cb, private, cookie, nriov, riov, mlen, rlen); + libnal, private, cookie, nriov, riov, mlen, rlen); if (!srxd) { CDEBUG(D_ERROR, "gmnal_large_rx no context\n"); - lib_finalize(nal_cb, private, cookie, PTL_FAIL); + lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -1092,7 +1101,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, gmnal_ltxd_t *ltxd = (gmnal_ltxd_t*)context; gmnal_srxd_t *srxd = ltxd->srxd; - nal_cb_t *nal_cb = srxd->nal_data->nal_cb; + lib_nal_t *libnal = srxd->nal_data->libnal; int lastone; struct iovec *riov; int nriov; @@ -1126,7 +1135,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, * Let our client application proceed */ CDEBUG(D_ERROR, "final callback context[%p]\n", srxd); - lib_finalize(nal_cb, srxd, srxd->cookie, PTL_OK); + lib_finalize(libnal, srxd, srxd->cookie, PTL_OK); /* * send an ack to the sender to let him know we got the data @@ -1276,7 +1285,7 @@ gmnal_large_tx_ack_callback(gm_port_t *gm_port, void *context, void gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) { - nal_cb_t *nal_cb = nal_data->nal_cb; + lib_nal_t *libnal = nal_data->libnal; gmnal_stxd_t *stxd = NULL; gmnal_msghdr_t *msghdr = NULL; void *buffer = NULL; @@ -1291,7 +1300,7 @@ 
gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd); - lib_finalize(nal_cb, stxd, stxd->cookie, PTL_OK); + lib_finalize(libnal, stxd, stxd->cookie, PTL_OK); /* * extract the iovec from the stxd, deregister the memory. diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c index f4005de..c595450 100644 --- a/lnet/klnds/qswlnd/qswlnd.c +++ b/lnet/klnds/qswlnd/qswlnd.c @@ -43,6 +43,9 @@ kpr_nal_interface_t kqswnal_router_interface = { #define QSWNAL_SYSCTL_COPY_SMALL_FWD 2 static ctl_table kqswnal_ctl_table[] = { + {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_puts", + &kqswnal_tunables.kqn_optimized_puts, sizeof (int), + 0644, NULL, &proc_dointvec}, {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets", &kqswnal_tunables.kqn_optimized_gets, sizeof (int), 0644, NULL, &proc_dointvec}, @@ -55,88 +58,6 @@ static ctl_table kqswnal_top_ctl_table[] = { }; #endif -static int -kqswnal_forward(nal_t *nal, - int id, - void *args, size_t args_len, - void *ret, size_t ret_len) -{ - kqswnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kqn_cb; - - LASSERT (nal == &kqswnal_api); - LASSERT (k == &kqswnal_data); - LASSERT (nal_cb == &kqswnal_lib); - - lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */ - return (PTL_OK); -} - -static void -kqswnal_lock (nal_t *nal, unsigned long *flags) -{ - kqswnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kqn_cb; - - LASSERT (nal == &kqswnal_api); - LASSERT (k == &kqswnal_data); - LASSERT (nal_cb == &kqswnal_lib); - - nal_cb->cb_cli(nal_cb,flags); -} - -static void -kqswnal_unlock(nal_t *nal, unsigned long *flags) -{ - kqswnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kqn_cb; - - LASSERT (nal == &kqswnal_api); - LASSERT (k == &kqswnal_data); - LASSERT (nal_cb == &kqswnal_lib); - - nal_cb->cb_sti(nal_cb,flags); -} - -static int -kqswnal_yield(nal_t *nal, unsigned long *flags, int milliseconds) -{ - /* NB called holding statelock */ - 
wait_queue_t wait; - unsigned long now = jiffies; - - CDEBUG (D_NET, "yield\n"); - - if (milliseconds == 0) { - if (need_resched()) - schedule(); - return 0; - } - - init_waitqueue_entry(&wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kqswnal_data.kqn_yield_waitq, &wait); - - kqswnal_unlock(nal, flags); - - if (milliseconds < 0) - schedule (); - else - schedule_timeout((milliseconds * HZ) / 1000); - - kqswnal_lock(nal, flags); - - remove_wait_queue(&kqswnal_data.kqn_yield_waitq, &wait); - - if (milliseconds > 0) { - milliseconds -= ((jiffies - now) * 1000) / HZ; - if (milliseconds < 0) - milliseconds = 0; - } - - return (milliseconds); -} - int kqswnal_get_tx_desc (struct portals_cfg *pcfg) { @@ -186,7 +107,7 @@ kqswnal_cmd (struct portals_cfg *pcfg, void *private) kqswnal_data.kqn_nid_offset); kqswnal_data.kqn_nid_offset = pcfg->pcfg_nid - kqswnal_data.kqn_elanid; - kqswnal_lib.ni.nid = pcfg->pcfg_nid; + kqswnal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid; return (0); default: @@ -469,9 +390,11 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, ptl_process_id_t my_process_id; int pkmem = atomic_read(&portal_kmemory); + LASSERT (nal == &kqswnal_api); + if (nal->nal_refct != 0) { if (actual_limits != NULL) - *actual_limits = kqswnal_lib.ni.actual_limits; + *actual_limits = kqswnal_lib.libnal_ni.ni_actual_limits; /* This module got the first ref */ PORTAL_MODULE_USE; return (PTL_OK); @@ -481,18 +404,9 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory)); - memset(&kqswnal_rpc_success, 0, sizeof(kqswnal_rpc_success)); - memset(&kqswnal_rpc_failed, 0, sizeof(kqswnal_rpc_failed)); -#if MULTIRAIL_EKC - kqswnal_rpc_failed.Data[0] = -ECONNREFUSED; -#else - kqswnal_rpc_failed.Status = -ECONNREFUSED; -#endif /* ensure all pointers NULL etc */ memset (&kqswnal_data, 0, sizeof (kqswnal_data)); - kqswnal_data.kqn_cb = &kqswnal_lib; - INIT_LIST_HEAD 
(&kqswnal_data.kqn_idletxds); INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds); INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds); @@ -507,8 +421,12 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, spin_lock_init (&kqswnal_data.kqn_sched_lock); init_waitqueue_head (&kqswnal_data.kqn_sched_waitq); - spin_lock_init (&kqswnal_data.kqn_statelock); - init_waitqueue_head (&kqswnal_data.kqn_yield_waitq); + /* Leave kqn_rpc_success zeroed */ +#if MULTIRAIL_EKC + kqswnal_data.kqn_rpc_failed.Data[0] = -ECONNREFUSED; +#else + kqswnal_data.kqn_rpc_failed.Status = -ECONNREFUSED; +#endif /* pointers/lists/locks initialised */ kqswnal_data.kqn_init = KQN_INIT_DATA; @@ -517,13 +435,13 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, kqswnal_data.kqn_ep = ep_system(); if (kqswnal_data.kqn_ep == NULL) { CERROR("Can't initialise EKC\n"); - kqswnal_shutdown(&kqswnal_api); + kqswnal_shutdown(nal); return (PTL_IFACE_INVALID); } if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) { CERROR("Can't get elan ID\n"); - kqswnal_shutdown(&kqswnal_api); + kqswnal_shutdown(nal); return (PTL_IFACE_INVALID); } #else @@ -534,7 +452,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (kqswnal_data.kqn_ep == NULL) { CERROR ("Can't get elan device 0\n"); - kqswnal_shutdown(&kqswnal_api); + kqswnal_shutdown(nal); return (PTL_IFACE_INVALID); } #endif @@ -550,7 +468,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (kqswnal_data.kqn_eptx == NULL) { CERROR ("Can't allocate transmitter\n"); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -563,7 +481,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (kqswnal_data.kqn_eprx_small == NULL) { CERROR ("Can't install small msg receiver\n"); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -573,7 +491,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (kqswnal_data.kqn_eprx_large == NULL) { CERROR ("Can't install 
large msg receiver\n"); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -588,7 +506,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, EP_PERM_WRITE); if (kqswnal_data.kqn_ep_tx_nmh == NULL) { CERROR("Can't reserve tx dma space\n"); - kqswnal_shutdown(&kqswnal_api); + kqswnal_shutdown(nal); return (PTL_NO_SPACE); } #else @@ -603,7 +521,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != DDI_SUCCESS) { CERROR ("Can't reserve rx dma space\n"); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } #endif @@ -617,7 +535,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, EP_PERM_WRITE); if (kqswnal_data.kqn_ep_tx_nmh == NULL) { CERROR("Can't reserve rx dma space\n"); - kqswnal_shutdown(&kqswnal_api); + kqswnal_shutdown(nal); return (PTL_NO_SPACE); } #else @@ -633,7 +551,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != DDI_SUCCESS) { CERROR ("Can't reserve rx dma space\n"); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } #endif @@ -644,7 +562,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS)); if (kqswnal_data.kqn_txds == NULL) { - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -660,7 +578,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); if (ktx->ktx_buffer == NULL) { - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -697,7 +615,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE)); if (kqswnal_data.kqn_rxds == NULL) { - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -732,7 +650,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, struct page *page = alloc_page(GFP_KERNEL); if (page == 
NULL) { - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -780,12 +698,12 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, my_process_id.nid = kqswnal_elanid2nid(kqswnal_data.kqn_elanid); my_process_id.pid = 0; - rc = lib_init(&kqswnal_lib, my_process_id, + rc = lib_init(&kqswnal_lib, nal, my_process_id, requested_limits, actual_limits); if (rc != PTL_OK) { CERROR ("lib_init failed %d\n", rc); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (rc); } @@ -799,6 +717,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; /* NB this enqueue can allocate/sleep (attr == 0) */ + krx->krx_state = KRX_POSTED; #if MULTIRAIL_EKC rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, &krx->krx_elanbuffer, 0); @@ -810,7 +729,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != EP_SUCCESS) { CERROR ("failed ep_queue_receive %d\n", rc); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_FAIL); } } @@ -822,7 +741,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != 0) { CERROR ("failed to spawn scheduling thread: %d\n", rc); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_FAIL); } } @@ -835,7 +754,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, rc = libcfs_nal_cmd_register (QSWNAL, &kqswnal_cmd, NULL); if (rc != 0) { CERROR ("Can't initialise command interface (rc = %d)\n", rc); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_FAIL); } @@ -867,17 +786,11 @@ kqswnal_initialise (void) { int rc; - kqswnal_api.startup = kqswnal_startup; - kqswnal_api.shutdown = kqswnal_shutdown; - kqswnal_api.forward = kqswnal_forward; - kqswnal_api.yield = kqswnal_yield; - kqswnal_api.lock = kqswnal_lock; - kqswnal_api.unlock = kqswnal_unlock; - kqswnal_api.nal_data = &kqswnal_data; - - kqswnal_lib.nal_data = &kqswnal_data; + kqswnal_api.nal_ni_init = kqswnal_startup; 
+ kqswnal_api.nal_ni_fini = kqswnal_shutdown; /* Initialise dynamic tunables to defaults once only */ + kqswnal_tunables.kqn_optimized_puts = KQSW_OPTIMIZED_PUTS; kqswnal_tunables.kqn_optimized_gets = KQSW_OPTIMIZED_GETS; rc = ptl_register_nal(QSWNAL, &kqswnal_api); diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index 6978aa0..b085caa 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -109,7 +109,8 @@ typedef unsigned long kqsw_csum_t; #define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ -#define KQSW_OPTIMIZED_GETS 1 /* optimized gets? */ +#define KQSW_OPTIMIZED_GETS 1 /* optimize gets >= this size */ +#define KQSW_OPTIMIZED_PUTS (32<<10) /* optimize puts >= this size */ #define KQSW_COPY_SMALL_FWD 0 /* copy small fwd messages to pre-mapped buffer? */ /* @@ -156,12 +157,18 @@ typedef struct int krx_npages; /* # pages in receive buffer */ int krx_nob; /* Number Of Bytes received into buffer */ int krx_rpc_reply_needed; /* peer waiting for EKC RPC reply */ - int krx_rpc_reply_sent; /* rpc reply sent */ + int krx_rpc_reply_status; /* what status to send */ + int krx_state; /* what this RX is doing */ atomic_t krx_refcount; /* how to tell when rpc is done */ kpr_fwd_desc_t krx_fwd; /* embedded forwarding descriptor */ ptl_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */ } kqswnal_rx_t; +#define KRX_POSTED 1 /* receiving */ +#define KRX_PARSE 2 /* ready to be parsed */ +#define KRX_COMPLETING 3 /* waiting to be completed */ + + typedef struct { struct list_head ktx_list; /* enqueue idle/active */ @@ -174,7 +181,7 @@ typedef struct int ktx_nmappedpages; /* # pages mapped for current message */ int ktx_port; /* destination ep port */ ptl_nid_t ktx_nid; /* destination node */ - void *ktx_args[2]; /* completion passthru */ + void *ktx_args[3]; /* completion passthru */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ unsigned long ktx_launchtime; /* when (in 
jiffies) the transmit was launched */ @@ -193,13 +200,16 @@ typedef struct } kqswnal_tx_t; #define KTX_IDLE 0 /* on kqn_(nblk_)idletxds */ -#define KTX_SENDING 1 /* local send */ -#define KTX_FORWARDING 2 /* routing a packet */ -#define KTX_GETTING 3 /* local optimised get */ +#define KTX_FORWARDING 1 /* sending a forwarded packet */ +#define KTX_SENDING 2 /* normal send */ +#define KTX_GETTING 3 /* sending optimised get */ +#define KTX_PUTTING 4 /* sending optimised put */ +#define KTX_RDMAING 5 /* handling optimised put/get */ typedef struct { /* dynamic tunables... */ + int kqn_optimized_puts; /* optimized PUTs? */ int kqn_optimized_gets; /* optimized GETs? */ #if CONFIG_SYSCTL struct ctl_table_header *kqn_sysctl; /* sysctl interface */ @@ -230,9 +240,6 @@ typedef struct struct list_head kqn_delayedfwds; /* delayed forwards */ struct list_head kqn_delayedtxds; /* delayed transmits */ - spinlock_t kqn_statelock; /* cb_cli/cb_sti */ - wait_queue_head_t kqn_yield_waitq; /* where yield waits */ - nal_cb_t *kqn_cb; /* -> kqswnal_lib */ #if MULTIRAIL_EKC EP_SYS *kqn_ep; /* elan system */ EP_NMH *kqn_ep_tx_nmh; /* elan reserved tx vaddrs */ @@ -250,6 +257,9 @@ typedef struct ptl_nid_t kqn_nid_offset; /* this cluster's NID offset */ int kqn_nnodes; /* this cluster's size */ int kqn_elanid; /* this nodes's elan ID */ + + EP_STATUSBLK kqn_rpc_success; /* preset RPC reply status blocks */ + EP_STATUSBLK kqn_rpc_failed; } kqswnal_data_t; /* kqn_init state */ @@ -258,21 +268,16 @@ typedef struct #define KQN_INIT_LIB 2 #define KQN_INIT_ALL 3 -extern nal_cb_t kqswnal_lib; +extern lib_nal_t kqswnal_lib; extern nal_t kqswnal_api; extern kqswnal_tunables_t kqswnal_tunables; extern kqswnal_data_t kqswnal_data; -/* global pre-prepared replies to keep off the stack */ -extern EP_STATUSBLK kqswnal_rpc_success; -extern EP_STATUSBLK kqswnal_rpc_failed; - extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg); extern void kqswnal_rxhandler(EP_RXD *rxd); extern int 
kqswnal_scheduler (void *); extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); -extern void kqswnal_dma_reply_complete (EP_RXD *rxd); -extern void kqswnal_requeue_rx (kqswnal_rx_t *krx); +extern void kqswnal_rx_done (kqswnal_rx_t *krx); static inline ptl_nid_t kqswnal_elanid2nid (int elanid) @@ -291,6 +296,12 @@ kqswnal_nid2elanid (ptl_nid_t nid) return (nid - kqswnal_data.kqn_nid_offset); } +static inline ptl_nid_t +kqswnal_rx_nid(kqswnal_rx_t *krx) +{ + return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd))); +} + static inline int kqswnal_pages_spanned (void *base, int nob) { @@ -313,11 +324,11 @@ static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob) } #endif -static inline void kqswnal_rx_done (kqswnal_rx_t *krx) +static inline void kqswnal_rx_decref (kqswnal_rx_t *krx) { LASSERT (atomic_read (&krx->krx_refcount) > 0); if (atomic_dec_and_test (&krx->krx_refcount)) - kqswnal_requeue_rx(krx); + kqswnal_rx_done(krx); } #if MULTIRAIL_EKC diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 2bcb853..e1237a8 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -26,102 +26,14 @@ #include "qswnal.h" -EP_STATUSBLK kqswnal_rpc_success; -EP_STATUSBLK kqswnal_rpc_failed; - /* * LIB functions follow * */ -static ptl_err_t -kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, - size_t len) -{ - CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n", - nal->ni.nid, len, src_addr, dst_addr ); - memcpy( dst_addr, src_addr, len ); - - return (PTL_OK); -} - -static ptl_err_t -kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, - size_t len) -{ - CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n", - nal->ni.nid, len, src_addr, dst_addr ); - memcpy( dst_addr, src_addr, len ); - - return (PTL_OK); -} - -static void * -kqswnal_malloc(nal_cb_t *nal, size_t len) -{ - void *buf; - - PORTAL_ALLOC(buf, len); - return (buf); -} - 
-static void -kqswnal_free(nal_cb_t *nal, void *buf, size_t len) -{ - PORTAL_FREE(buf, len); -} - -static void -kqswnal_printf (nal_cb_t * nal, const char *fmt, ...) -{ - va_list ap; - char msg[256]; - - va_start (ap, fmt); - vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ - va_end (ap); - - msg[sizeof (msg) - 1] = 0; /* ensure terminated */ - - CDEBUG (D_NET, "%s", msg); -} - -#if (defined(CONFIG_SPARC32) || defined(CONFIG_SPARC64)) -# error "Can't save/restore irq contexts in different procedures" -#endif - -static void -kqswnal_cli(nal_cb_t *nal, unsigned long *flags) -{ - kqswnal_data_t *data= nal->nal_data; - - spin_lock_irqsave(&data->kqn_statelock, *flags); -} - - -static void -kqswnal_sti(nal_cb_t *nal, unsigned long *flags) -{ - kqswnal_data_t *data= nal->nal_data; - - spin_unlock_irqrestore(&data->kqn_statelock, *flags); -} - -static void -kqswnal_callback(nal_cb_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev) -{ - /* holding kqn_statelock */ - - if (eq->event_callback != NULL) - eq->event_callback(ev); - - if (waitqueue_active(&kqswnal_data.kqn_yield_waitq)) - wake_up_all(&kqswnal_data.kqn_yield_waitq); -} - static int -kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +kqswnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { - if (nid == nal->ni.nid) + if (nid == nal->libnal_ni.ni_pid.nid) *dist = 0; /* it's me */ else if (kqswnal_nid2elanid (nid) >= 0) *dist = 1; /* it's my peer */ @@ -212,11 +124,12 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_ do { int fraglen = kiov->kiov_len - offset; - /* nob exactly spans the iovs */ - LASSERT (fraglen <= nob); - /* each frag fits in a page */ + /* each page frag is contained in one page */ LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE); + if (fraglen > nob) + fraglen = nob; + nmapped++; if (nmapped > maxmapped) { CERROR("Can't map message in %d pages (max %d)\n", @@ -328,11 +241,12 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, 
int offset, int nob, do { int fraglen = iov->iov_len - offset; - long npages = kqswnal_pages_spanned (iov->iov_base, fraglen); - - /* nob exactly spans the iovs */ - LASSERT (fraglen <= nob); + long npages; + if (fraglen > nob) + fraglen = nob; + npages = kqswnal_pages_spanned (iov->iov_base, fraglen); + nmapped += npages; if (nmapped > maxmapped) { CERROR("Can't map message in %d pages (max %d)\n", @@ -519,40 +433,29 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) void kqswnal_tx_done (kqswnal_tx_t *ktx, int error) { - lib_msg_t *msg; - lib_msg_t *repmsg = NULL; - switch (ktx->ktx_state) { case KTX_FORWARDING: /* router asked me to forward this packet */ kpr_fwd_done (&kqswnal_data.kqn_router, (kpr_fwd_desc_t *)ktx->ktx_args[0], error); break; - case KTX_SENDING: /* packet sourced locally */ - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], + case KTX_RDMAING: /* optimized GET/PUT handled */ + case KTX_PUTTING: /* optimized PUT sent */ + case KTX_SENDING: /* normal send */ + lib_finalize (&kqswnal_lib, NULL, (lib_msg_t *)ktx->ktx_args[1], - (error == 0) ? PTL_OK : - (error == -ENOMEM) ? PTL_NO_SPACE : PTL_FAIL); + (error == 0) ? PTL_OK : PTL_FAIL); break; - case KTX_GETTING: /* Peer has DMA-ed direct? */ - msg = (lib_msg_t *)ktx->ktx_args[1]; - - if (error == 0) { - repmsg = lib_create_reply_msg (&kqswnal_lib, - ktx->ktx_nid, msg); - if (repmsg == NULL) - error = -ENOMEM; - } - - if (error == 0) { - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], - msg, PTL_OK); - lib_finalize (&kqswnal_lib, NULL, repmsg, PTL_OK); - } else { - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg, - (error == -ENOMEM) ? 
PTL_NO_SPACE : PTL_FAIL); - } + case KTX_GETTING: /* optimized GET sent & REPLY received */ + /* Complete the GET with success since we can't avoid + * delivering a REPLY event; we committed to it when we + * launched the GET */ + lib_finalize (&kqswnal_lib, NULL, + (lib_msg_t *)ktx->ktx_args[1], PTL_OK); + lib_finalize (&kqswnal_lib, NULL, + (lib_msg_t *)ktx->ktx_args[2], + (error == 0) ? PTL_OK : PTL_FAIL); break; default: @@ -580,16 +483,27 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) kqswnal_notify_peer_down(ktx); status = -EHOSTDOWN; - } else if (ktx->ktx_state == KTX_GETTING) { - /* RPC completed OK; what did our peer put in the status + } else switch (ktx->ktx_state) { + + case KTX_GETTING: + case KTX_PUTTING: + /* RPC completed OK; but what did our peer put in the status * block? */ #if MULTIRAIL_EKC status = ep_txd_statusblk(txd)->Data[0]; #else status = ep_txd_statusblk(txd)->Status; #endif - } else { + break; + + case KTX_FORWARDING: + case KTX_SENDING: status = 0; + break; + + default: + LBUG(); + break; } kqswnal_tx_done (ktx, status); @@ -610,21 +524,20 @@ kqswnal_launch (kqswnal_tx_t *ktx) return (-ESHUTDOWN); LASSERT (dest >= 0); /* must be a peer */ - if (ktx->ktx_state == KTX_GETTING) { - /* NB ktx_frag[0] is the GET hdr + kqswnal_remotemd_t. The - * other frags are the GET sink which we obviously don't - * send here :) */ -#if MULTIRAIL_EKC + + switch (ktx->ktx_state) { + case KTX_GETTING: + case KTX_PUTTING: + /* NB ktx_frag[0] is the GET/PUT hdr + kqswnal_remotemd_t. 
+ * The other frags are the payload, awaiting RDMA */ rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest, ktx->ktx_port, attr, kqswnal_txhandler, ktx, NULL, ktx->ktx_frags, 1); -#else - rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest, - ktx->ktx_port, attr, kqswnal_txhandler, - ktx, NULL, ktx->ktx_frags, 1); -#endif - } else { + break; + + case KTX_FORWARDING: + case KTX_SENDING: #if MULTIRAIL_EKC rc = ep_transmit_message(kqswnal_data.kqn_eptx, dest, ktx->ktx_port, attr, @@ -636,6 +549,12 @@ kqswnal_launch (kqswnal_tx_t *ktx) kqswnal_txhandler, ktx, ktx->ktx_frags, ktx->ktx_nfrag); #endif + break; + + default: + LBUG(); + rc = -EINVAL; /* no compiler warning please */ + break; } switch (rc) { @@ -658,6 +577,7 @@ kqswnal_launch (kqswnal_tx_t *ktx) } } +#if 0 static char * hdr_type_string (ptl_hdr_t *hdr) { @@ -726,6 +646,7 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr) } } /* end of print_hdr() */ +#endif #if !MULTIRAIL_EKC void @@ -787,114 +708,291 @@ kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv, CERROR ("DATAVEC too small\n"); return (-E2BIG); } +#else +int +kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag, + int nrfrag, EP_NMD *rfrag) +{ + int i; + + if (nlfrag != nrfrag) { + CERROR("Can't cope with unequal # frags: %d local %d remote\n", + nlfrag, nrfrag); + return (-EINVAL); + } + + for (i = 0; i < nlfrag; i++) + if (lfrag[i].nmd_len != rfrag[i].nmd_len) { + CERROR("Can't cope with unequal frags %d(%d):" + " %d local %d remote\n", + i, nlfrag, lfrag[i].nmd_len, rfrag[i].nmd_len); + return (-EINVAL); + } + + return (0); +} #endif -int -kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, - struct iovec *iov, ptl_kiov_t *kiov, - int offset, int nob) +kqswnal_remotemd_t * +kqswnal_parse_rmd (kqswnal_rx_t *krx, int type, ptl_nid_t expected_nid) { - kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page); + ptl_hdr_t *hdr = (ptl_hdr_t *)buffer; kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE); - 
int rc; -#if MULTIRAIL_EKC - int i; -#else - EP_DATAVEC datav[EP_MAXFRAG]; - int ndatav; -#endif - LASSERT (krx->krx_rpc_reply_needed); - LASSERT ((iov == NULL) != (kiov == NULL)); + ptl_nid_t nid = kqswnal_rx_nid(krx); + + /* Note (1) lib_parse has already flipped hdr. + * (2) RDMA addresses are sent in native endian-ness. When + * EKC copes with different endian nodes, I'll fix this (and + * eat my hat :) */ + + LASSERT (krx->krx_nob >= sizeof(*hdr)); + + if (hdr->type != type) { + CERROR ("Unexpected optimized get/put type %d (%d expected)" + "from "LPX64"\n", hdr->type, type, nid); + return (NULL); + } + + if (hdr->src_nid != nid) { + CERROR ("Unexpected optimized get/put source NID " + LPX64" from "LPX64"\n", hdr->src_nid, nid); + return (NULL); + } + + LASSERT (nid == expected_nid); - /* see kqswnal_sendmsg comment regarding endian-ness */ if (buffer + krx->krx_nob < (char *)(rmd + 1)) { /* msg too small to discover rmd size */ CERROR ("Incoming message [%d] too small for RMD (%d needed)\n", krx->krx_nob, (int)(((char *)(rmd + 1)) - buffer)); - return (-EINVAL); + return (NULL); } - + if (buffer + krx->krx_nob < (char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) { /* rmd doesn't fit in the incoming message */ CERROR ("Incoming message [%d] too small for RMD[%d] (%d needed)\n", krx->krx_nob, rmd->kqrmd_nfrag, (int)(((char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) - buffer)); - return (-EINVAL); + return (NULL); } - /* Map the source data... */ + return (rmd); +} + +void +kqswnal_rdma_store_complete (EP_RXD *rxd) +{ + int status = ep_rxd_status(rxd); + kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); + kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; + + CDEBUG((status == EP_SUCCESS) ? 
D_NET : D_ERROR, + "rxd %p, ktx %p, status %d\n", rxd, ktx, status); + + LASSERT (ktx->ktx_state == KTX_RDMAING); + LASSERT (krx->krx_rxd == rxd); + LASSERT (krx->krx_rpc_reply_needed); + + krx->krx_rpc_reply_needed = 0; + kqswnal_rx_decref (krx); + + /* free ktx & finalize() its lib_msg_t */ + kqswnal_tx_done(ktx, (status == EP_SUCCESS) ? 0 : -ECONNABORTED); +} + +void +kqswnal_rdma_fetch_complete (EP_RXD *rxd) +{ + /* Completed fetching the PUT data */ + int status = ep_rxd_status(rxd); + kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); + kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; + unsigned long flags; + + CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR, + "rxd %p, ktx %p, status %d\n", rxd, ktx, status); + + LASSERT (ktx->ktx_state == KTX_RDMAING); + LASSERT (krx->krx_rxd == rxd); + LASSERT (krx->krx_rpc_reply_needed); + + /* Set the RPC completion status */ + status = (status == EP_SUCCESS) ? 0 : -ECONNABORTED; + krx->krx_rpc_reply_status = status; + + /* free ktx & finalize() its lib_msg_t */ + kqswnal_tx_done(ktx, status); + + if (!in_interrupt()) { + /* OK to complete the RPC now (iff I had the last ref) */ + kqswnal_rx_decref (krx); + return; + } + + LASSERT (krx->krx_state == KRX_PARSE); + krx->krx_state = KRX_COMPLETING; + + /* Complete the RPC in thread context */ + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds); + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); +} + +int +kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type, + int niov, struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t len) +{ + kqswnal_remotemd_t *rmd; + kqswnal_tx_t *ktx; + int eprc; + int rc; +#if !MULTIRAIL_EKC + EP_DATAVEC datav[EP_MAXFRAG]; + int ndatav; +#endif + + LASSERT (type == PTL_MSG_GET || type == PTL_MSG_PUT); + /* Not both mapped and paged payload */ + LASSERT (iov == NULL || kiov == NULL); + /* RPC 
completes with failure by default */ + LASSERT (krx->krx_rpc_reply_needed); + LASSERT (krx->krx_rpc_reply_status != 0); + + rmd = kqswnal_parse_rmd(krx, type, libmsg->ev.initiator.nid); + if (rmd == NULL) + return (-EPROTO); + + if (len == 0) { + /* data got truncated to nothing. */ + lib_finalize(&kqswnal_lib, krx, libmsg, PTL_OK); + /* Let kqswnal_rx_done() complete the RPC with success */ + krx->krx_rpc_reply_status = 0; + return (0); + } + + /* NB I'm using 'ktx' just to map the local RDMA buffers; I'm not + actually sending a portals message with it */ + ktx = kqswnal_get_idle_tx(NULL, 0); + if (ktx == NULL) { + CERROR ("Can't get txd for RDMA with "LPX64"\n", + libmsg->ev.initiator.nid); + return (-ENOMEM); + } + + ktx->ktx_state = KTX_RDMAING; + ktx->ktx_nid = libmsg->ev.initiator.nid; + ktx->ktx_args[0] = krx; + ktx->ktx_args[1] = libmsg; + + /* Start mapping at offset 0 (we're not mapping any headers) */ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; + if (kiov != NULL) - rc = kqswnal_map_tx_kiov (ktx, offset, nob, nfrag, kiov); + rc = kqswnal_map_tx_kiov(ktx, offset, len, niov, kiov); else - rc = kqswnal_map_tx_iov (ktx, offset, nob, nfrag, iov); + rc = kqswnal_map_tx_iov(ktx, offset, len, niov, iov); if (rc != 0) { - CERROR ("Can't map source data: %d\n", rc); - return (rc); + CERROR ("Can't map local RDMA data: %d\n", rc); + goto out; } #if MULTIRAIL_EKC - if (ktx->ktx_nfrag != rmd->kqrmd_nfrag) { - CERROR("Can't cope with unequal # frags: %d local %d remote\n", - ktx->ktx_nfrag, rmd->kqrmd_nfrag); - return (-EINVAL); + rc = kqswnal_check_rdma (ktx->ktx_nfrag, ktx->ktx_frags, + rmd->kqrmd_nfrag, rmd->kqrmd_frag); + if (rc != 0) { + CERROR ("Incompatible RDMA descriptors\n"); + goto out; } - - for (i = 0; i < rmd->kqrmd_nfrag; i++) - if (ktx->ktx_frags[i].nmd_len != rmd->kqrmd_frag[i].nmd_len) { - CERROR("Can't cope with unequal frags %d(%d):" - " %d local %d remote\n", - i, rmd->kqrmd_nfrag, - ktx->ktx_frags[i].nmd_len, - rmd->kqrmd_frag[i].nmd_len); - 
return (-EINVAL); - } #else - ndatav = kqswnal_eiovs2datav (EP_MAXFRAG, datav, - ktx->ktx_nfrag, ktx->ktx_frags, - rmd->kqrmd_nfrag, rmd->kqrmd_frag); + switch (type) { + default: + LBUG(); + + case PTL_MSG_GET: + ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, + ktx->ktx_nfrag, ktx->ktx_frags, + rmd->kqrmd_nfrag, rmd->kqrmd_frag); + break; + + case PTL_MSG_PUT: + ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, + rmd->kqrmd_nfrag, rmd->kqrmd_frag, + ktx->ktx_nfrag, ktx->ktx_frags); + break; + } + if (ndatav < 0) { CERROR ("Can't create datavec: %d\n", ndatav); - return (ndatav); + rc = ndatav; + goto out; } #endif - /* Our caller will start to race with kqswnal_dma_reply_complete... */ - LASSERT (atomic_read (&krx->krx_refcount) == 1); - atomic_set (&krx->krx_refcount, 2); + LASSERT (atomic_read(&krx->krx_refcount) > 0); + /* Take an extra ref for the completion callback */ + atomic_inc(&krx->krx_refcount); -#if MULTIRAIL_EKC - rc = ep_complete_rpc(krx->krx_rxd, kqswnal_dma_reply_complete, ktx, - &kqswnal_rpc_success, - ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag); - if (rc == EP_SUCCESS) - return (0); + switch (type) { + default: + LBUG(); - /* Well we tried... 
*/ - krx->krx_rpc_reply_needed = 0; + case PTL_MSG_GET: +#if MULTIRAIL_EKC + eprc = ep_complete_rpc(krx->krx_rxd, + kqswnal_rdma_store_complete, ktx, + &kqswnal_data.kqn_rpc_success, + ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag); #else - rc = ep_complete_rpc (krx->krx_rxd, kqswnal_dma_reply_complete, ktx, - &kqswnal_rpc_success, datav, ndatav); - if (rc == EP_SUCCESS) - return (0); - - /* "old" EKC destroys rxd on failed completion */ - krx->krx_rxd = NULL; + eprc = ep_complete_rpc (krx->krx_rxd, + kqswnal_rdma_store_complete, ktx, + &kqswnal_data.kqn_rpc_success, + datav, ndatav); + if (eprc != EP_SUCCESS) /* "old" EKC destroys rxd on failed completion */ + krx->krx_rxd = NULL; #endif + if (eprc != EP_SUCCESS) { + CERROR("can't complete RPC: %d\n", eprc); + /* don't re-attempt RPC completion */ + krx->krx_rpc_reply_needed = 0; + rc = -ECONNABORTED; + } + break; + + case PTL_MSG_PUT: +#if MULTIRAIL_EKC + eprc = ep_rpc_get (krx->krx_rxd, + kqswnal_rdma_fetch_complete, ktx, + rmd->kqrmd_frag, ktx->ktx_frags, ktx->ktx_nfrag); +#else + eprc = ep_rpc_get (krx->krx_rxd, + kqswnal_rdma_fetch_complete, ktx, + datav, ndatav); +#endif + if (eprc != EP_SUCCESS) { + CERROR("ep_rpc_get failed: %d\n", eprc); + rc = -ECONNABORTED; + } + break; + } - CERROR("can't complete RPC: %d\n", rc); - - /* reset refcount back to 1: we're not going to be racing with - * kqswnal_dma_reply_complete. */ - atomic_set (&krx->krx_refcount, 1); + out: + if (rc != 0) { + kqswnal_rx_decref(krx); /* drop callback's ref */ + kqswnal_put_idle_tx (ktx); + } - return (-ECONNABORTED); + atomic_dec(&kqswnal_data.kqn_pending_txs); + return (rc); } static ptl_err_t -kqswnal_sendmsg (nal_cb_t *nal, +kqswnal_sendmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -916,6 +1014,8 @@ kqswnal_sendmsg (nal_cb_t *nal, int sumoff; int sumnob; #endif + /* NB 1. hdr is in network byte order */ + /* 2. 
'private' depends on the message type */ CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 " pid %u\n", payload_nob, payload_niov, nid, pid); @@ -934,6 +1034,15 @@ kqswnal_sendmsg (nal_cb_t *nal, return (PTL_FAIL); } + if (type == PTL_MSG_REPLY && /* can I look in 'private' */ + ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { /* is it an RPC */ + /* Must be a REPLY for an optimized GET */ + rc = kqswnal_rdma ((kqswnal_rx_t *)private, libmsg, PTL_MSG_GET, + payload_niov, payload_iov, payload_kiov, + payload_offset, payload_nob); + return ((rc == 0) ? PTL_OK : PTL_FAIL); + } + targetnid = nid; if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? */ rc = kpr_lookup (&kqswnal_data.kqn_router, nid, @@ -956,35 +1065,16 @@ kqswnal_sendmsg (nal_cb_t *nal, type == PTL_MSG_REPLY || in_interrupt())); if (ktx == NULL) { - kqswnal_cerror_hdr (hdr); + CERROR ("Can't get txd for msg type %d for "LPX64"\n", + type, libmsg->ev.initiator.nid); return (PTL_NO_SPACE); } + ktx->ktx_state = KTX_SENDING; ktx->ktx_nid = targetnid; ktx->ktx_args[0] = private; ktx->ktx_args[1] = libmsg; - - if (type == PTL_MSG_REPLY && - ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { - if (nid != targetnid || - kqswnal_nid2elanid(nid) != - ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)) { - CERROR("Optimized reply nid conflict: " - "nid "LPX64" via "LPX64" elanID %d\n", - nid, targetnid, - ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)); - rc = -EINVAL; - goto out; - } - - /* peer expects RPC completion with GET data */ - rc = kqswnal_dma_reply (ktx, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); - if (rc != 0) - CERROR ("Can't DMA reply to "LPX64": %d\n", nid, rc); - goto out; - } + ktx->ktx_args[2] = NULL; /* set when a GET commits to REPLY */ memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; @@ -1027,28 +1117,31 @@ kqswnal_sendmsg (nal_cb_t *nal, 
memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum)); #endif - if (kqswnal_tunables.kqn_optimized_gets && - type == PTL_MSG_GET && /* doing a GET */ - nid == targetnid) { /* not forwarding */ + /* The first frag will be the pre-mapped buffer for (at least) the + * portals header. */ + ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; + + if (nid == targetnid && /* not forwarding */ + ((type == PTL_MSG_GET && /* optimize GET? */ + kqswnal_tunables.kqn_optimized_gets != 0 && + NTOH__u32(hdr->msg.get.sink_length) >= kqswnal_tunables.kqn_optimized_gets) || + (type == PTL_MSG_PUT && /* optimize PUT? */ + kqswnal_tunables.kqn_optimized_puts != 0 && + payload_nob >= kqswnal_tunables.kqn_optimized_puts))) { lib_md_t *md = libmsg->md; kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(ktx->ktx_buffer + KQSW_HDR_SIZE); - /* Optimised path: I send over the Elan vaddrs of the get - * sink buffers, and my peer DMAs directly into them. + /* Optimised path: I send over the Elan vaddrs of the local + * buffers, and my peer DMAs directly to/from them. * * First I set up ktx as if it was going to send this * payload, (it needs to map it anyway). This fills * ktx_frags[1] and onward with the network addresses * of the GET sink frags. I copy these into ktx_buffer, - * immediately after the header, and send that as my GET - * message. - * - * Note that the addresses are sent in native endian-ness. - * When EKC copes with different endian nodes, I'll fix - * this (and eat my hat :) */ + * immediately after the header, and send that as my + * message. */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_state = KTX_GETTING; + ktx->ktx_state = (type == PTL_MSG_PUT) ? 
KTX_PUTTING : KTX_GETTING; if ((libmsg->md->options & PTL_MD_KIOV) != 0) rc = kqswnal_map_tx_kiov (ktx, 0, md->length, @@ -1078,12 +1171,21 @@ kqswnal_sendmsg (nal_cb_t *nal, ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob; #endif + if (type == PTL_MSG_GET) { + /* Allocate reply message now while I'm in thread context */ + ktx->ktx_args[2] = lib_create_reply_msg (&kqswnal_lib, + nid, libmsg); + if (ktx->ktx_args[2] == NULL) + goto out; + + /* NB finalizing the REPLY message is my + * responsibility now, whatever happens. */ + } + } else if (payload_nob <= KQSW_TX_MAXCONTIG) { /* small message: single frag copied into the pre-mapped buffer */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_state = KTX_SENDING; #if MULTIRAIL_EKC ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, KQSW_HDR_SIZE + payload_nob); @@ -1105,8 +1207,6 @@ kqswnal_sendmsg (nal_cb_t *nal, /* large message: multiple frags: first is hdr in pre-mapped buffer */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_state = KTX_SENDING; #if MULTIRAIL_EKC ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, KQSW_HDR_SIZE); @@ -1135,15 +1235,29 @@ kqswnal_sendmsg (nal_cb_t *nal, rc == 0 ? "Sent" : "Failed to send", payload_nob, nid, targetnid, rc); - if (rc != 0) + if (rc != 0) { + if (ktx->ktx_state == KTX_GETTING && + ktx->ktx_args[2] != NULL) { + /* We committed to reply, but there was a problem + * launching the GET. We can't avoid delivering a + * REPLY event since we committed above, so we + * pretend the GET succeeded but the REPLY + * failed. */ + rc = 0; + lib_finalize (&kqswnal_lib, private, libmsg, PTL_OK); + lib_finalize (&kqswnal_lib, private, + (lib_msg_t *)ktx->ktx_args[2], PTL_FAIL); + } + kqswnal_put_idle_tx (ktx); - + } + atomic_dec(&kqswnal_data.kqn_pending_txs); return (rc == 0 ? 
PTL_OK : PTL_FAIL); } static ptl_err_t -kqswnal_send (nal_cb_t *nal, +kqswnal_send (lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -1161,7 +1275,7 @@ kqswnal_send (nal_cb_t *nal, } static ptl_err_t -kqswnal_send_pages (nal_cb_t *nal, +kqswnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -1200,7 +1314,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) if (ktx == NULL) /* can't get txd right now */ return; /* fwd will be scheduled when tx desc freed */ - if (nid == kqswnal_lib.ni.nid) /* gateway is me */ + if (nid == kqswnal_lib.libnal_ni.ni_pid.nid) /* gateway is me */ nid = fwd->kprfd_target_nid; /* target is final dest */ if (kqswnal_nid2elanid (nid) < 0) { @@ -1254,9 +1368,8 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) if (rc != 0) { CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc); - kqswnal_put_idle_tx (ktx); /* complete now (with failure) */ - kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc); + kqswnal_tx_done (ktx, rc); } atomic_dec(&kqswnal_data.kqn_pending_txs); @@ -1277,29 +1390,48 @@ kqswnal_fwd_callback (void *arg, int error) NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error); } - kqswnal_requeue_rx (krx); + LASSERT (atomic_read(&krx->krx_refcount) == 1); + kqswnal_rx_decref (krx); } void -kqswnal_dma_reply_complete (EP_RXD *rxd) +kqswnal_requeue_rx (kqswnal_rx_t *krx) { - int status = ep_rxd_status(rxd); - kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - lib_msg_t *msg = (lib_msg_t *)ktx->ktx_args[1]; - - CDEBUG((status == EP_SUCCESS) ? 
D_NET : D_ERROR, - "rxd %p, ktx %p, status %d\n", rxd, ktx, status); + LASSERT (atomic_read(&krx->krx_refcount) == 0); + LASSERT (!krx->krx_rpc_reply_needed); - LASSERT (krx->krx_rxd == rxd); - LASSERT (krx->krx_rpc_reply_needed); + krx->krx_state = KRX_POSTED; - krx->krx_rpc_reply_needed = 0; - kqswnal_rx_done (krx); +#if MULTIRAIL_EKC + if (kqswnal_data.kqn_shuttingdown) { + /* free EKC rxd on shutdown */ + ep_complete_receive(krx->krx_rxd); + } else { + /* repost receive */ + ep_requeue_receive(krx->krx_rxd, + kqswnal_rxhandler, krx, + &krx->krx_elanbuffer, 0); + } +#else + if (kqswnal_data.kqn_shuttingdown) + return; - lib_finalize (&kqswnal_lib, NULL, msg, - (status == EP_SUCCESS) ? PTL_OK : PTL_FAIL); - kqswnal_put_idle_tx (ktx); + if (krx->krx_rxd == NULL) { + /* We had a failed ep_complete_rpc() which nukes the + * descriptor in "old" EKC */ + int eprc = ep_queue_receive(krx->krx_eprx, + kqswnal_rxhandler, krx, + krx->krx_elanbuffer, + krx->krx_npages * PAGE_SIZE, 0); + LASSERT (eprc == EP_SUCCESS); + /* We don't handle failure here; it's incredibly rare + * (never reported?) and only happens with "old" EKC */ + } else { + ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, + krx->krx_elanbuffer, + krx->krx_npages * PAGE_SIZE); + } +#endif } void @@ -1319,71 +1451,45 @@ kqswnal_rpc_complete (EP_RXD *rxd) } void -kqswnal_requeue_rx (kqswnal_rx_t *krx) +kqswnal_rx_done (kqswnal_rx_t *krx) { - int rc; + int rc; + EP_STATUSBLK *sblk; LASSERT (atomic_read(&krx->krx_refcount) == 0); if (krx->krx_rpc_reply_needed) { + /* We've not completed the peer's RPC yet... */ + sblk = (krx->krx_rpc_reply_status == 0) ? + &kqswnal_data.kqn_rpc_success : + &kqswnal_data.kqn_rpc_failed; - /* We failed to complete the peer's optimized GET (e.g. we - * couldn't map the source buffers). We complete the - * peer's EKC rpc now with failure. 
*/ + LASSERT (!in_interrupt()); #if MULTIRAIL_EKC - rc = ep_complete_rpc(krx->krx_rxd, kqswnal_rpc_complete, krx, - &kqswnal_rpc_failed, NULL, NULL, 0); + rc = ep_complete_rpc(krx->krx_rxd, + kqswnal_rpc_complete, krx, + sblk, NULL, NULL, 0); if (rc == EP_SUCCESS) return; - - CERROR("can't complete RPC: %d\n", rc); #else - if (krx->krx_rxd != NULL) { - /* We didn't try (and fail) to complete earlier... */ - rc = ep_complete_rpc(krx->krx_rxd, - kqswnal_rpc_complete, krx, - &kqswnal_rpc_failed, NULL, 0); - if (rc == EP_SUCCESS) - return; - - CERROR("can't complete RPC: %d\n", rc); - } - - /* NB the old ep_complete_rpc() frees rxd on failure, so we - * have to requeue from scratch here, unless we're shutting - * down */ - if (kqswnal_data.kqn_shuttingdown) + rc = ep_complete_rpc(krx->krx_rxd, + kqswnal_rpc_complete, krx, + sblk, NULL, 0); + if (rc == EP_SUCCESS) return; - rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, - krx->krx_elanbuffer, - krx->krx_npages * PAGE_SIZE, 0); - LASSERT (rc == EP_SUCCESS); - /* We don't handle failure here; it's incredibly rare - * (never reported?) 
and only happens with "old" EKC */ - return; + /* "old" EKC destroys rxd on failed completion */ + krx->krx_rxd = NULL; #endif + CERROR("can't complete RPC: %d\n", rc); + krx->krx_rpc_reply_needed = 0; } -#if MULTIRAIL_EKC - if (kqswnal_data.kqn_shuttingdown) { - /* free EKC rxd on shutdown */ - ep_complete_receive(krx->krx_rxd); - } else { - /* repost receive */ - ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, - &krx->krx_elanbuffer, 0); - } -#else - /* don't actually requeue on shutdown */ - if (!kqswnal_data.kqn_shuttingdown) - ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, - krx->krx_elanbuffer, krx->krx_npages * PAGE_SIZE); -#endif + kqswnal_requeue_rx(krx); } void -kqswnal_rx (kqswnal_rx_t *krx) +kqswnal_parse (kqswnal_rx_t *krx) { ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(krx->krx_kiov[0].kiov_page); ptl_nid_t dest_nid = NTOH__u64 (hdr->dest_nid); @@ -1391,25 +1497,28 @@ kqswnal_rx (kqswnal_rx_t *krx) int nob; int niov; - LASSERT (atomic_read(&krx->krx_refcount) == 0); + LASSERT (atomic_read(&krx->krx_refcount) == 1); + + if (dest_nid == kqswnal_lib.libnal_ni.ni_pid.nid) { /* It's for me :) */ + /* I ignore parse errors since I'm not consuming a byte + * stream */ + (void)lib_parse (&kqswnal_lib, hdr, krx); - if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */ - atomic_set(&krx->krx_refcount, 1); - lib_parse (&kqswnal_lib, hdr, krx); - kqswnal_rx_done(krx); + /* Drop my ref; any RDMA activity takes an additional ref */ + kqswnal_rx_decref(krx); return; } #if KQSW_CHECKSUM - CERROR ("checksums for forwarded packets not implemented\n"); - LBUG (); + LASSERTF (0, "checksums for forwarded packets not implemented\n"); #endif + if (kqswnal_nid2elanid (dest_nid) >= 0) /* should have gone direct to peer */ { CERROR("dropping packet from "LPX64" for "LPX64 ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid); - kqswnal_requeue_rx (krx); + kqswnal_rx_decref (krx); return; } @@ -1451,7 +1560,9 @@ kqswnal_rxhandler(EP_RXD *rxd) rxd, krx, 
nob, status); LASSERT (krx != NULL); - + LASSERT (krx->krx_state == KRX_POSTED); + + krx->krx_state = KRX_PARSE; krx->krx_rxd = rxd; krx->krx_nob = nob; #if MULTIRAIL_EKC @@ -1459,7 +1570,10 @@ kqswnal_rxhandler(EP_RXD *rxd) #else krx->krx_rpc_reply_needed = ep_rxd_isrpc(rxd); #endif - + /* Default to failure if an RPC reply is requested but not handled */ + krx->krx_rpc_reply_status = -EPROTO; + atomic_set (&krx->krx_refcount, 1); + /* must receive a whole header to be able to parse */ if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t)) { @@ -1475,12 +1589,12 @@ kqswnal_rxhandler(EP_RXD *rxd) CERROR("receive status failed with status %d nob %d\n", ep_rxd_status(rxd), nob); #endif - kqswnal_requeue_rx (krx); + kqswnal_rx_decref(krx); return; } if (!in_interrupt()) { - kqswnal_rx (krx); + kqswnal_parse(krx); return; } @@ -1540,7 +1654,7 @@ kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) #endif static ptl_err_t -kqswnal_recvmsg (nal_cb_t *nal, +kqswnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, @@ -1552,16 +1666,18 @@ kqswnal_recvmsg (nal_cb_t *nal, { kqswnal_rx_t *krx = (kqswnal_rx_t *)private; char *buffer = page_address(krx->krx_kiov[0].kiov_page); + ptl_hdr_t *hdr = (ptl_hdr_t *)buffer; int page; char *page_ptr; int page_nob; char *iov_ptr; int iov_nob; int frag; + int rc; #if KQSW_CHECKSUM kqsw_csum_t senders_csum; kqsw_csum_t payload_csum = 0; - kqsw_csum_t hdr_csum = kqsw_csum(0, buffer, sizeof(ptl_hdr_t)); + kqsw_csum_t hdr_csum = kqsw_csum(0, hdr, sizeof(*hdr)); size_t csum_len = mlen; int csum_frags = 0; int csum_nob = 0; @@ -1574,8 +1690,18 @@ kqswnal_recvmsg (nal_cb_t *nal, if (senders_csum != hdr_csum) kqswnal_csum_error (krx, 1); #endif + /* NB lib_parse() has already flipped *hdr */ + CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen); + if (krx->krx_rpc_reply_needed && + hdr->type == PTL_MSG_PUT) { + /* This must be an optimized PUT */ + rc = kqswnal_rdma (krx, libmsg, PTL_MSG_PUT, + niov, iov, 
kiov, offset, mlen); + return (rc == 0 ? PTL_OK : PTL_FAIL); + } + /* What was actually received must be >= payload. */ LASSERT (mlen <= rlen); if (krx->krx_nob < KQSW_HDR_SIZE + mlen) { @@ -1691,7 +1817,7 @@ kqswnal_recvmsg (nal_cb_t *nal, } static ptl_err_t -kqswnal_recv(nal_cb_t *nal, +kqswnal_recv(lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, @@ -1706,7 +1832,7 @@ kqswnal_recv(nal_cb_t *nal, } static ptl_err_t -kqswnal_recv_pages (nal_cb_t *nal, +kqswnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, @@ -1766,7 +1892,18 @@ kqswnal_scheduler (void *arg) spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); - kqswnal_rx (krx); + switch (krx->krx_state) { + case KRX_PARSE: + kqswnal_parse (krx); + break; + case KRX_COMPLETING: + /* Drop last ref to reply to RPC and requeue */ + LASSERT (krx->krx_rpc_reply_needed); + kqswnal_rx_decref (krx); + break; + default: + LBUG(); + } did_something = 1; spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); @@ -1835,20 +1972,12 @@ kqswnal_scheduler (void *arg) return (0); } -nal_cb_t kqswnal_lib = +lib_nal_t kqswnal_lib = { - nal_data: &kqswnal_data, /* NAL private data */ - cb_send: kqswnal_send, - cb_send_pages: kqswnal_send_pages, - cb_recv: kqswnal_recv, - cb_recv_pages: kqswnal_recv_pages, - cb_read: kqswnal_read, - cb_write: kqswnal_write, - cb_malloc: kqswnal_malloc, - cb_free: kqswnal_free, - cb_printf: kqswnal_printf, - cb_cli: kqswnal_cli, - cb_sti: kqswnal_sti, - cb_callback: kqswnal_callback, - cb_dist: kqswnal_dist + libnal_data: &kqswnal_data, /* NAL private data */ + libnal_send: kqswnal_send, + libnal_send_pages: kqswnal_send_pages, + libnal_recv: kqswnal_recv, + libnal_recv_pages: kqswnal_recv_pages, + libnal_dist: kqswnal_dist }; diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 32bbbec..9d39cb1b 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -74,83 +74,9 @@ static ctl_table 
ksocknal_top_ctl_table[] = { #endif int -ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, - void *ret, size_t ret_len) -{ - ksock_nal_data_t *k; - nal_cb_t *nal_cb; - - k = nal->nal_data; - nal_cb = k->ksnd_nal_cb; - - lib_dispatch(nal_cb, k, id, args, ret); /* ksocknal_send needs k */ - return PTL_OK; -} - -void -ksocknal_api_lock(nal_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *k; - nal_cb_t *nal_cb; - - k = nal->nal_data; - nal_cb = k->ksnd_nal_cb; - nal_cb->cb_cli(nal_cb,flags); -} - -void -ksocknal_api_unlock(nal_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *k; - nal_cb_t *nal_cb; - - k = nal->nal_data; - nal_cb = k->ksnd_nal_cb; - nal_cb->cb_sti(nal_cb,flags); -} - -int -ksocknal_api_yield(nal_t *nal, unsigned long *flags, int milliseconds) -{ - /* NB called holding statelock */ - wait_queue_t wait; - unsigned long now = jiffies; - - CDEBUG (D_NET, "yield\n"); - - if (milliseconds == 0) { - our_cond_resched(); - return 0; - } - - init_waitqueue_entry(&wait, current); - set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&ksocknal_data.ksnd_yield_waitq, &wait); - - ksocknal_api_unlock(nal, flags); - - if (milliseconds < 0) - schedule (); - else - schedule_timeout((milliseconds * HZ) / 1000); - - ksocknal_api_lock(nal, flags); - - remove_wait_queue (&ksocknal_data.ksnd_yield_waitq, &wait); - - if (milliseconds > 0) { - milliseconds -= ((jiffies - now) * 1000) / HZ; - if (milliseconds < 0) - milliseconds = 0; - } - - return (milliseconds); -} - -int ksocknal_set_mynid(ptl_nid_t nid) { - lib_ni_t *ni = &ksocknal_lib.ni; + lib_ni_t *ni = &ksocknal_lib.libnal_ni; /* FIXME: we have to do this because we call lib_init() at module * insertion time, which is before we have 'mynid' available. lib_init @@ -159,9 +85,9 @@ ksocknal_set_mynid(ptl_nid_t nid) * problem. 
*/ CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->nid); + nid, ni->ni_pid.nid); - ni->nid = nid; + ni->ni_pid.nid = nid; return (0); } @@ -1527,14 +1453,18 @@ ksocknal_api_shutdown (nal_t *nal) /* flag threads to terminate; wake and wait for them to die */ ksocknal_data.ksnd_shuttingdown = 1; + mb(); wake_up_all (&ksocknal_data.ksnd_autoconnectd_waitq); wake_up_all (&ksocknal_data.ksnd_reaper_waitq); for (i = 0; i < SOCKNAL_N_SCHED; i++) wake_up_all(&ksocknal_data.ksnd_schedulers[i].kss_waitq); + i = 4; while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) { - CDEBUG (D_NET, "waitinf for %d threads to terminate\n", + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d threads to terminate\n", atomic_read (&ksocknal_data.ksnd_nthreads)); set_current_state (TASK_UNINTERRUPTIBLE); schedule_timeout (HZ); @@ -1590,7 +1520,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, if (nal->nal_refct != 0) { if (actual_limits != NULL) - *actual_limits = ksocknal_lib.ni.actual_limits; + *actual_limits = ksocknal_lib.libnal_ni.ni_actual_limits; /* This module got the first ref */ PORTAL_MODULE_USE; return (PTL_OK); @@ -1613,10 +1543,6 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, rwlock_init(&ksocknal_data.ksnd_global_lock); - ksocknal_data.ksnd_nal_cb = &ksocknal_lib; - spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock); - init_waitqueue_head(&ksocknal_data.ksnd_yield_waitq); - spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); @@ -1646,7 +1572,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, sizeof(ksock_sched_t) * SOCKNAL_N_SCHED); if (ksocknal_data.ksnd_schedulers == NULL) { - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (-ENOMEM); } @@ -1666,11 +1592,11 @@ 
ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, process_id.pid = 0; process_id.nid = 0; - rc = lib_init(&ksocknal_lib, process_id, + rc = lib_init(&ksocknal_lib, nal, process_id, requested_limits, actual_limits); if (rc != PTL_OK) { CERROR("lib_init failed: error %d\n", rc); - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (rc); } @@ -1682,7 +1608,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != 0) { CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc); - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (rc); } } @@ -1691,7 +1617,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i)); if (rc != 0) { CERROR("Can't spawn socknal autoconnectd: %d\n", rc); - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (rc); } } @@ -1699,7 +1625,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, rc = ksocknal_thread_start (ksocknal_reaper, NULL); if (rc != 0) { CERROR ("Can't spawn socknal reaper: %d\n", rc); - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (rc); } @@ -1725,7 +1651,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t, fmb_kiov[pool->fmp_buff_pages])); if (fmb == NULL) { - ksocknal_api_shutdown(&ksocknal_api); + ksocknal_api_shutdown(nal); return (-ENOMEM); } @@ -1735,7 +1661,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, fmb->fmb_kiov[j].kiov_page = alloc_page(GFP_KERNEL); if (fmb->fmb_kiov[j].kiov_page == NULL) { - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (-ENOMEM); } @@ -1749,7 +1675,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, rc = libcfs_nal_cmd_register(SOCKNAL, &ksocknal_cmd, NULL); if (rc != 0) { CERROR ("Can't initialise command interface (rc = %d)\n", rc); - ksocknal_api_shutdown 
(&ksocknal_api); + ksocknal_api_shutdown (nal); return (rc); } @@ -1794,14 +1720,8 @@ ksocknal_module_init (void) /* check ksnr_connected/connecting field large enough */ LASSERT(SOCKNAL_CONN_NTYPES <= 4); - ksocknal_api.startup = ksocknal_api_startup; - ksocknal_api.forward = ksocknal_api_forward; - ksocknal_api.shutdown = ksocknal_api_shutdown; - ksocknal_api.lock = ksocknal_api_lock; - ksocknal_api.unlock = ksocknal_api_unlock; - ksocknal_api.nal_data = &ksocknal_data; - - ksocknal_lib.nal_data = &ksocknal_data; + ksocknal_api.nal_ni_init = ksocknal_api_startup; + ksocknal_api.nal_ni_fini = ksocknal_api_shutdown; /* Initialise dynamic tunables to defaults once only */ ksocknal_tunables.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 87b23dc..ff73f71 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -160,10 +160,6 @@ typedef struct { struct list_head *ksnd_peers; /* hash table of all my known peers */ int ksnd_peer_hash_size; /* size of ksnd_peers */ - nal_cb_t *ksnd_nal_cb; - spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ - wait_queue_head_t ksnd_yield_waitq; /* where yield waits */ - atomic_t ksnd_nthreads; /* # live threads */ int ksnd_shuttingdown; /* tell threads to exit */ ksock_sched_t *ksnd_schedulers; /* scheduler state */ @@ -364,7 +360,7 @@ typedef struct ksock_peer } ksock_peer_t; -extern nal_cb_t ksocknal_lib; +extern lib_nal_t ksocknal_lib; extern ksock_nal_data_t ksocknal_data; extern ksock_tunables_t ksocknal_tunables; diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 21e0abe..5815d16 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -32,101 +32,12 @@ * LIB functions follow * */ -ptl_err_t -ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, - user_ptr src_addr, size_t len) -{ - CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", - nal->ni.nid, (long)len, 
src_addr, dst_addr); - - memcpy( dst_addr, src_addr, len ); - return PTL_OK; -} - -ptl_err_t -ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, - void *src_addr, size_t len) -{ - CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", - nal->ni.nid, (long)len, src_addr, dst_addr); - - memcpy( dst_addr, src_addr, len ); - return PTL_OK; -} - -void * -ksocknal_malloc(nal_cb_t *nal, size_t len) -{ - void *buf; - - PORTAL_ALLOC(buf, len); - - if (buf != NULL) - memset(buf, 0, len); - - return (buf); -} - -void -ksocknal_free(nal_cb_t *nal, void *buf, size_t len) -{ - PORTAL_FREE(buf, len); -} - -void -ksocknal_printf(nal_cb_t *nal, const char *fmt, ...) -{ - va_list ap; - char msg[256]; - - va_start (ap, fmt); - vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ - va_end (ap); - - msg[sizeof (msg) - 1] = 0; /* ensure terminated */ - - CDEBUG (D_NET, "%s", msg); -} - -void -ksocknal_cli(nal_cb_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *data = nal->nal_data; - - /* OK to ignore 'flags'; we're only ever serialise threads and - * never need to lock out interrupts */ - spin_lock(&data->ksnd_nal_cb_lock); -} - -void -ksocknal_sti(nal_cb_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *data; - data = nal->nal_data; - - /* OK to ignore 'flags'; we're only ever serialise threads and - * never need to lock out interrupts */ - spin_unlock(&data->ksnd_nal_cb_lock); -} - -void -ksocknal_callback(nal_cb_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev) -{ - /* holding ksnd_nal_cb_lock */ - - if (eq->event_callback != NULL) - eq->event_callback(ev); - - if (waitqueue_active(&ksocknal_data.ksnd_yield_waitq)) - wake_up_all(&ksocknal_data.ksnd_yield_waitq); -} - int -ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +ksocknal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { /* I would guess that if ksocknal_get_peer (nid) == NULL, and we're not routing, then 'nid' is very distant :) */ - if ( nal->ni.nid == nid ) { + if 
(nal->libnal_ni.ni_pid.nid == nid) { *dist = 0; } else { *dist = 1; @@ -882,8 +793,8 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) { struct list_head *tmp; ksock_route_t *route; - ksock_route_t *candidate = NULL; - int found = 0; + ksock_route_t *first_lazy = NULL; + int found_connecting_or_connected = 0; int bits; list_for_each (tmp, &peer->ksnp_routes) { @@ -896,7 +807,7 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) /* All typed connections have been established, or * an untyped connection has been established, or * connections are currently being established */ - found = 1; + found_connecting_or_connected = 1; continue; } @@ -904,20 +815,24 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) if (!time_after_eq (jiffies, route->ksnr_timeout)) continue; - /* always do eager routes */ + /* eager routes always want to be connected */ if (route->ksnr_eager) return (route); - if (candidate == NULL) { - /* If we don't find any other route that is fully - * connected or connecting, the first connectable - * route is returned. If it fails to connect, it - * will get placed at the end of the list */ - candidate = route; - } + if (first_lazy == NULL) + first_lazy = route; } - - return (found ? NULL : candidate); + + /* No eager routes need to be connected. If some connection has + * already been established, or is being established there's nothing to + * do. Otherwise we return the first lazy route we found. If it fails + * to connect, it will go to the end of the list. 
*/ + + if (!list_empty (&peer->ksnp_conns) || + found_connecting_or_connected) + return (NULL); + + return (first_lazy); } ksock_route_t * @@ -1028,7 +943,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) } ptl_err_t -ksocknal_sendmsg(nal_cb_t *nal, +ksocknal_sendmsg(lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, @@ -1125,7 +1040,7 @@ ksocknal_sendmsg(nal_cb_t *nal, } ptl_err_t -ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, +ksocknal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, size_t payload_offset, size_t payload_len) @@ -1137,7 +1052,7 @@ ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, } ptl_err_t -ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, +ksocknal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, size_t payload_offset, size_t payload_len) @@ -1159,7 +1074,7 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); /* I'm the gateway; must be the last hop */ - if (nid == ksocknal_lib.ni.nid) + if (nid == ksocknal_lib.libnal_ni.ni_pid.nid) nid = fwd->kprfd_target_nid; /* setup iov for hdr */ @@ -1544,7 +1459,8 @@ ksocknal_process_receive (ksock_conn_t *conn) switch (conn->ksnc_rx_state) { case SOCKNAL_RX_HEADER: if (conn->ksnc_hdr.type != HTON__u32(PTL_MSG_HELLO) && - NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) { + NTOH__u64(conn->ksnc_hdr.dest_nid) != + ksocknal_lib.libnal_ni.ni_pid.nid) { /* This packet isn't for me */ ksocknal_fwd_parse (conn); switch (conn->ksnc_rx_state) { @@ -1561,7 +1477,13 @@ ksocknal_process_receive (ksock_conn_t *conn) } /* sets wanted_len, iovs etc */ - lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); + rc = lib_parse(&ksocknal_lib, 
&conn->ksnc_hdr, conn); + + if (rc != PTL_OK) { + /* I just received garbage: give up on this conn */ + ksocknal_close_conn_and_siblings (conn, rc); + return (-EPROTO); + } if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ conn->ksnc_rx_state = SOCKNAL_RX_BODY; @@ -1608,7 +1530,7 @@ ksocknal_process_receive (ksock_conn_t *conn) } ptl_err_t -ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, +ksocknal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, struct iovec *iov, size_t offset, size_t mlen, size_t rlen) { @@ -1636,7 +1558,7 @@ ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, } ptl_err_t -ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, +ksocknal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, ptl_kiov_t *kiov, size_t offset, size_t mlen, size_t rlen) { @@ -2029,7 +1951,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); - hdr.src_nid = __cpu_to_le64 (ksocknal_lib.ni.nid); + hdr.src_nid = __cpu_to_le64 (ksocknal_lib.libnal_ni.ni_pid.nid); hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); hdr.msg.hello.type = __cpu_to_le32 (*type); @@ -2698,19 +2620,11 @@ ksocknal_reaper (void *arg) return (0); } -nal_cb_t ksocknal_lib = { - nal_data: &ksocknal_data, /* NAL private data */ - cb_send: ksocknal_send, - cb_send_pages: ksocknal_send_pages, - cb_recv: ksocknal_recv, - cb_recv_pages: ksocknal_recv_pages, - cb_read: ksocknal_read, - cb_write: ksocknal_write, - cb_malloc: ksocknal_malloc, - cb_free: ksocknal_free, - cb_printf: ksocknal_printf, - cb_cli: ksocknal_cli, - cb_sti: ksocknal_sti, - cb_callback: ksocknal_callback, - cb_dist: ksocknal_dist +lib_nal_t ksocknal_lib = { + libnal_data: &ksocknal_data, /* NAL private data */ + libnal_send: ksocknal_send, + libnal_send_pages: ksocknal_send_pages, + libnal_recv: 
ksocknal_recv, + libnal_recv_pages: ksocknal_recv_pages, + libnal_dist: ksocknal_dist }; diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c index 4e63c86..06f1578 100644 --- a/lnet/libcfs/module.c +++ b/lnet/libcfs/module.c @@ -52,11 +52,12 @@ #define PORTAL_MINOR 240 struct nal_cmd_handler { + int nch_number; nal_cmd_handler_fn *nch_handler; void *nch_private; }; -static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1]; +static struct nal_cmd_handler nal_cmd[16]; static DECLARE_MUTEX(nal_cmd_sem); #ifdef PORTAL_DEBUG @@ -245,23 +246,53 @@ static inline void freedata(void *data, int len) PORTAL_FREE(data, len); } +struct nal_cmd_handler * +libcfs_find_nal_cmd_handler(int nal) +{ + int i; + + for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++) + if (nal_cmd[i].nch_handler != NULL && + nal_cmd[i].nch_number == nal) + return (&nal_cmd[i]); + + return (NULL); +} + int libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *private) { - int rc = 0; + struct nal_cmd_handler *cmd; + int i; + int rc; CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler); - if (nal > 0 && nal <= NAL_MAX_NR) { - down(&nal_cmd_sem); - if (nal_cmd[nal].nch_handler != NULL) - rc = -EBUSY; - else { - nal_cmd[nal].nch_handler = handler; - nal_cmd[nal].nch_private = private; + down(&nal_cmd_sem); + + if (libcfs_find_nal_cmd_handler(nal) != NULL) { + up (&nal_cmd_sem); + return (-EBUSY); + } + + cmd = NULL; + for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++) + if (nal_cmd[i].nch_handler == NULL) { + cmd = &nal_cmd[i]; + break; } - up(&nal_cmd_sem); + + if (cmd == NULL) { + rc = -EBUSY; + } else { + rc = 0; + cmd->nch_number = nal; + cmd->nch_handler = handler; + cmd->nch_private = private; } + + up(&nal_cmd_sem); + return rc; } EXPORT_SYMBOL(libcfs_nal_cmd_register); @@ -269,14 +300,15 @@ EXPORT_SYMBOL(libcfs_nal_cmd_register); void libcfs_nal_cmd_unregister(int nal) { - CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal); + struct nal_cmd_handler *cmd; - LASSERT(nal > 
0 && nal <= NAL_MAX_NR); - LASSERT(nal_cmd[nal].nch_handler != NULL); + CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal); down(&nal_cmd_sem); - nal_cmd[nal].nch_handler = NULL; - nal_cmd[nal].nch_private = NULL; + cmd = libcfs_find_nal_cmd_handler(nal); + LASSERT (cmd != NULL); + cmd->nch_handler = NULL; + cmd->nch_private = NULL; up(&nal_cmd_sem); } EXPORT_SYMBOL(libcfs_nal_cmd_unregister); @@ -284,16 +316,17 @@ EXPORT_SYMBOL(libcfs_nal_cmd_unregister); int libcfs_nal_cmd(struct portals_cfg *pcfg) { + struct nal_cmd_handler *cmd; __u32 nal = pcfg->pcfg_nal; int rc = -EINVAL; ENTRY; down(&nal_cmd_sem); - if (nal > 0 && nal <= NAL_MAX_NR && - nal_cmd[nal].nch_handler != NULL) { + cmd = libcfs_find_nal_cmd_handler(nal); + if (cmd != NULL) { CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, pcfg->pcfg_command); - rc = nal_cmd[nal].nch_handler(pcfg, nal_cmd[nal].nch_private); + rc = cmd->nch_handler(pcfg, cmd->nch_private); } up(&nal_cmd_sem); diff --git a/lnet/lnet/Makefile.in b/lnet/lnet/Makefile.in index 6ce334b..c0f2e71 100644 --- a/lnet/lnet/Makefile.in +++ b/lnet/lnet/Makefile.in @@ -1,6 +1,6 @@ MODULES := portals -portals-objs := api-eq.o api-init.o api-me.o api-errno.o api-ni.o api-wrap.o -portals-objs += lib-dispatch.o lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o +portals-objs := api-errno.o api-ni.o api-wrap.o +portals-objs += lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o portals-objs += lib-move.o lib-ni.o lib-pid.o module.o @INCLUDE_RULES@ diff --git a/lnet/lnet/Makefile.mk b/lnet/lnet/Makefile.mk index de01765..088902a 100644 --- a/lnet/lnet/Makefile.mk +++ b/lnet/lnet/Makefile.mk @@ -6,7 +6,7 @@ include $(src)/../Kernelenv obj-y += portals.o -portals-objs := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o \ +portals-objs := lib-eq.o lib-init.o lib-md.o lib-me.o \ lib-move.o lib-msg.o lib-ni.o lib-pid.o \ - api-eq.o api-errno.o api-init.o api-me.o api-ni.o \ - api-wrap.o module.o + api-errno.o api-ni.o api-wrap.o \ + module.o diff --git 
a/lnet/lnet/api-eq.c b/lnet/lnet/api-eq.c deleted file mode 100644 index 0306043..0000000 --- a/lnet/lnet/api-eq.c +++ /dev/null @@ -1,120 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-eq.c - * User-level event queue management routines - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - -int ptl_get_event (ptl_eq_t *eq, ptl_event_t *ev) -{ - int new_index = eq->sequence & (eq->size - 1); - ptl_event_t *new_event = &eq->base[new_index]; - ENTRY; - - CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n", - new_event, eq->sequence, eq->size); - - if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) { - RETURN(PTL_EQ_EMPTY); - } - - *ev = *new_event; - - /* ensure event is delivered correctly despite possible - races with lib_finalize */ - if (eq->sequence != new_event->sequence) { - CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n", - eq->sequence, new_event->sequence); - RETURN(PTL_EQ_DROPPED); - } - - eq->sequence = new_event->sequence + 1; - RETURN(PTL_OK); -} - -int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev) -{ - int which; - - return (PtlEQPoll (&eventq, 1, 0, ev, &which)); -} - -int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) -{ - int which; - - return (PtlEQPoll (&eventq_in, 1, PTL_TIME_FOREVER, - event_out, &which)); -} - -int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, - ptl_event_t *event_out, int *which_out) -{ - nal_t *nal; - int i; - int rc; - unsigned long flags; - - if (!ptl_init) - RETURN(PTL_NO_INIT); - - if (neq_in < 1) - RETURN(PTL_EQ_INVALID); - - nal = ptl_hndl2nal(&eventqs_in[0]); - if (nal == NULL) - RETURN(PTL_EQ_INVALID); - - nal->lock(nal, &flags); - - for (;;) { - for (i = 0; i < neq_in; i++) { - ptl_eq_t *eq = ptl_handle2usereq(&eventqs_in[i]); - - if (i > 0 && - ptl_hndl2nal(&eventqs_in[i]) != nal) { - nal->unlock(nal, &flags); - RETURN (PTL_EQ_INVALID); - } - - /* size must be a power of 2 to handle a wrapped sequence # */ - LASSERT (eq->size != 0 && - eq->size == LOWEST_BIT_SET (eq->size)); - - rc = ptl_get_event (eq, event_out); - if (rc != PTL_EQ_EMPTY) { - nal->unlock(nal, &flags); - *which_out = i; - RETURN(rc); - } - } - - if (timeout == 0) { - nal->unlock(nal, &flags); - RETURN (PTL_EQ_EMPTY); - } - - timeout = 
nal->yield(nal, &flags, timeout); - } -} diff --git a/lnet/lnet/api-errno.c b/lnet/lnet/api-errno.c index 1c01c88..9a4e5ac 100644 --- a/lnet/lnet/api-errno.c +++ b/lnet/lnet/api-errno.c @@ -40,6 +40,9 @@ const char *ptl_err_str[] = { "PTL_EQ_IN_USE", + "PTL_NI_INVALID", + "PTL_MD_ILLEGAL", + "PTL_MAX_ERRNO" }; /* If you change these, you must update the number table in portals/errno.h */ diff --git a/lnet/lnet/api-init.c b/lnet/lnet/api-init.c deleted file mode 100644 index 9a98714..0000000 --- a/lnet/lnet/api-init.c +++ /dev/null @@ -1,49 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-init.c - * Initialization and global data for the p30 user side library - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - -int PtlInit(int *max_interfaces) -{ - if (max_interfaces != NULL) - *max_interfaces = NAL_MAX_NR; - - LASSERT(!strcmp(ptl_err_str[PTL_MAX_ERRNO], "PTL_MAX_ERRNO")); - - return ptl_ni_init(); -} - - -void PtlFini(void) -{ - ptl_ni_fini(); -} - - -void PtlSnprintHandle(char *str, int len, ptl_handle_any_t h) -{ - snprintf(str, len, "0x%lx."LPX64, h.nal_idx, h.cookie); -} diff --git a/lnet/lnet/api-me.c b/lnet/lnet/api-me.c deleted file mode 100644 index 37f0150..0000000 --- a/lnet/lnet/api-me.c +++ /dev/null @@ -1,28 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-me.c - * Match Entry local operations. - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 4f37d13..56afd45 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -66,6 +66,8 @@ nal_t *ptl_hndl2nal(ptl_handle_any_t *handle) * invalidated out from under her (or worse, swapped for a * completely different interface!) 
*/ + LASSERT (ptl_init); + if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0) return NULL; @@ -112,12 +114,17 @@ void ptl_unregister_nal (ptl_interface_t interface) ptl_mutex_exit(); } -int ptl_ni_init(void) +int PtlInit(int *max_interfaces) { + LASSERT(!strcmp(ptl_err_str[PTL_MAX_ERRNO], "PTL_MAX_ERRNO")); + /* If this assertion fails, we need more bits in NI_HANDLE_MASK and * to shift NI_HANDLE_MAGIC left appropriately */ LASSERT (NAL_MAX_NR <= (NI_HANDLE_MASK + 1)); + if (max_interfaces != NULL) + *max_interfaces = NAL_MAX_NR; + ptl_mutex_enter(); if (!ptl_init) { @@ -143,7 +150,7 @@ int ptl_ni_init(void) return PTL_OK; } -void ptl_ni_fini(void) +void PtlFini(void) { nal_t *nal; int i; @@ -160,7 +167,7 @@ void ptl_ni_fini(void) if (nal->nal_refct != 0) { CWARN("NAL %d has outstanding refcount %d\n", i, nal->nal_refct); - nal->shutdown(nal); + nal->nal_ni_fini(nal); } ptl_nal_table[i] = NULL; @@ -202,9 +209,11 @@ int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid, } nal = ptl_nal_table[interface]; - + nal->nal_handle.nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | interface; + nal->nal_handle.cookie = 0; + CDEBUG(D_OTHER, "Starting up NAL (%d) refs %d\n", interface, nal->nal_refct); - rc = nal->startup(nal, requested_pid, desired_limits, actual_limits); + rc = nal->nal_ni_init(nal, requested_pid, desired_limits, actual_limits); if (rc != PTL_OK) { CERROR("Error %d starting up NAL %d, refs %d\n", rc, @@ -218,10 +227,11 @@ int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid, } nal->nal_refct++; - handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | interface; + *handle = nal->nal_handle; out: ptl_mutex_exit (); + return rc; } @@ -248,15 +258,8 @@ int PtlNIFini(ptl_handle_ni_t ni) nal->nal_refct--; /* nal_refct == 0 tells nal->shutdown to really shut down */ - nal->shutdown(nal); + nal->nal_ni_fini(nal); ptl_mutex_exit (); return PTL_OK; } - -int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * ni_out) -{ - *ni_out = 
handle_in; - - return PTL_OK; -} diff --git a/lnet/lnet/api-wrap.c b/lnet/lnet/api-wrap.c index 3e6f9ce..d7ff020 100644 --- a/lnet/lnet/api-wrap.c +++ b/lnet/lnet/api-wrap.c @@ -26,133 +26,98 @@ # define DEBUG_SUBSYSTEM S_PORTALS #include -static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf, - int argsize, void *retbuf, int retsize) +void PtlSnprintHandle(char *str, int len, ptl_handle_any_t h) { - nal_t *nal; + snprintf(str, len, "0x%lx."LPX64, h.nal_idx, h.cookie); +} - if (!ptl_init) { - CERROR("Not initialized\n"); +int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t *ni_out) +{ + if (!ptl_init) return PTL_NO_INIT; - } - - nal = ptl_hndl2nal(&any_h); - if (!nal) + + if (ptl_hndl2nal(&handle_in) == NULL) return PTL_HANDLE_INVALID; - - nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize); - + + *ni_out = handle_in; return PTL_OK; } int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id) { - PtlGetId_in args; - PtlGetId_out ret; - int rc; - - args.handle_in = ni_handle; + nal_t *nal; - rc = do_forward(ni_handle, PTL_GETID, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - return rc; + if (!ptl_init) + return PTL_NO_INIT; - if (id) - *id = ret.id_out; + nal = ptl_hndl2nal(&ni_handle); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return nal->nal_get_id(nal, id); } int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) { - PtlFailNid_in args; - PtlFailNid_out ret; - int rc; - - args.interface = interface; - args.nid = nid; - args.threshold = threshold; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; - rc = do_forward (interface, PTL_FAILNID, - &args, sizeof(args), &ret, sizeof (ret)); + nal = ptl_hndl2nal(&interface); + if (nal == NULL) + return PTL_NI_INVALID; - return ((rc != PTL_OK) ? 
rc : ret.rc); + return nal->nal_fail_nid(nal, nid, threshold); } int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, - ptl_sr_value_t * status_out) + ptl_sr_value_t *status_out) { - PtlNIStatus_in args; - PtlNIStatus_out ret; - int rc; + nal_t *nal; - args.interface_in = interface_in; - args.register_in = register_in; - - rc = do_forward(interface_in, PTL_NISTATUS, &args, sizeof(args), &ret, - sizeof(ret)); - - if (rc != PTL_OK) - return rc; - - if (status_out) - *status_out = ret.status_out; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface_in); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return nal->nal_ni_status(nal, register_in, status_out); } int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, unsigned long *distance_out) { - PtlNIDist_in args; - PtlNIDist_out ret; - int rc; - - args.interface_in = interface_in; - args.process_in = process_in; - - rc = do_forward(interface_in, PTL_NIDIST, &args, sizeof(args), &ret, - sizeof(ret)); + nal_t *nal; - if (rc != PTL_OK) - return rc; - - if (distance_out) - *distance_out = ret.distance_out; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface_in); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return nal->nal_ni_dist(nal, &process_in, distance_out); } int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, - ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out) + ptl_ins_pos_t pos_in, ptl_handle_me_t *handle_out) { - PtlMEAttach_in args; - PtlMEAttach_out ret; - int rc; - - args.interface_in = interface_in; - args.index_in = index_in; - args.match_id_in = match_id_in; - args.match_bits_in = match_bits_in; - args.ignore_bits_in = ignore_bits_in; - args.unlink_in = unlink_in; - args.position_in = pos_in; - - rc = do_forward(interface_in, PTL_MEATTACH, &args, 
sizeof(args), &ret, - sizeof(ret)); - - if (rc != PTL_OK) - return rc; - - if (handle_out) { - handle_out->nal_idx = interface_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - - return ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface_in); + if (nal == NULL) + return PTL_NI_INVALID; + + return nal->nal_me_attach(nal, index_in, match_id_in, + match_bits_in, ignore_bits_in, + unlink_in, pos_in, handle_out); } int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, @@ -160,367 +125,226 @@ int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, ptl_handle_me_t * handle_out) { - PtlMEInsert_in args; - PtlMEInsert_out ret; - int rc; - - args.current_in = current_in; - args.match_id_in = match_id_in; - args.match_bits_in = match_bits_in; - args.ignore_bits_in = ignore_bits_in; - args.unlink_in = unlink_in; - args.position_in = position_in; - - rc = do_forward(current_in, PTL_MEINSERT, &args, sizeof(args), &ret, - sizeof(ret)); - - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? PTL_ME_INVALID : rc; - - if (handle_out) { - handle_out->nal_idx = current_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - return ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(¤t_in); + if (nal == NULL) + return PTL_ME_INVALID; + + return nal->nal_me_insert(nal, ¤t_in, match_id_in, + match_bits_in, ignore_bits_in, + unlink_in, position_in, handle_out); } int PtlMEUnlink(ptl_handle_me_t current_in) { - PtlMEUnlink_in args; - PtlMEUnlink_out ret; - int rc; + nal_t *nal; - args.current_in = current_in; - args.unlink_in = PTL_RETAIN; - - rc = do_forward(current_in, PTL_MEUNLINK, &args, sizeof(args), &ret, - sizeof(ret)); - - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? 
PTL_ME_INVALID : rc; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(¤t_in); + if (nal == NULL) + return PTL_ME_INVALID; - return ret.rc; + return nal->nal_me_unlink(nal, ¤t_in); } -int PtlTblDump(ptl_handle_ni_t ni, int index_in) +int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) { - PtlTblDump_in args; - PtlTblDump_out ret; - int rc; + nal_t *nal; - args.index_in = index_in; - - rc = do_forward(ni, PTL_TBLDUMP, &args, sizeof(args), &ret, - sizeof(ret)); + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&me_in); + if (nal == NULL) + return PTL_ME_INVALID; - if (rc != PTL_OK) - return rc; + if (!PtlHandleIsEqual(md_in.eventq, PTL_EQ_NONE) && + ptl_hndl2nal(&md_in.eventq) != nal) + return PTL_MD_ILLEGAL; - return ret.rc; + return (nal->nal_md_attach)(nal, &me_in, &md_in, + unlink_in, handle_out); } -int PtlMEDump(ptl_handle_me_t current_in) +int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t *handle_out) { - PtlMEDump_in args; - PtlMEDump_out ret; - int rc; + nal_t *nal; - args.current_in = current_in; - - rc = do_forward(current_in, PTL_MEDUMP, &args, sizeof(args), &ret, - sizeof(ret)); + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&ni_in); + if (nal == NULL) + return PTL_NI_INVALID; - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? 
PTL_ME_INVALID : rc; + if (!PtlHandleIsEqual(md_in.eventq, PTL_EQ_NONE) && + ptl_hndl2nal(&md_in.eventq) != nal) + return PTL_MD_ILLEGAL; - return ret.rc; + return (nal->nal_md_bind)(nal, &md_in, unlink_in, handle_out); } -static ptl_handle_eq_t md2eq (ptl_md_t *md) +int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout, + ptl_md_t *new_inout, ptl_handle_eq_t testq_in) { - if (PtlHandleIsEqual (md->eventq, PTL_EQ_NONE)) - return (PTL_EQ_NONE); + nal_t *nal; - return (ptl_handle2usereq (&md->eventq)->cb_eq_handle); -} - - -int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) -{ - PtlMDAttach_in args; - PtlMDAttach_out ret; - int rc; - - args.eq_in = md2eq(&md_in); - args.me_in = me_in; - args.md_in = md_in; - args.unlink_in = unlink_in; - - rc = do_forward(me_in, PTL_MDATTACH, - &args, sizeof(args), &ret, sizeof(ret)); - - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? PTL_ME_INVALID : rc; - - if (handle_out) { - handle_out->nal_idx = me_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - return ret.rc; -} - + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + if (!PtlHandleIsEqual(testq_in, PTL_EQ_NONE) && + ptl_hndl2nal(&testq_in) != nal) + return PTL_EQ_INVALID; -int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) -{ - PtlMDBind_in args; - PtlMDBind_out ret; - int rc; - - args.eq_in = md2eq(&md_in); - args.ni_in = ni_in; - args.md_in = md_in; - args.unlink_in = unlink_in; - - rc = do_forward(ni_in, PTL_MDBIND, - &args, sizeof(args), &ret, sizeof(ret)); - - if (rc != PTL_OK) - return rc; - - if (handle_out) { - handle_out->nal_idx = ni_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - return ret.rc; + return (nal->nal_md_update)(nal, &md_in, + old_inout, new_inout, &testq_in); } -int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout, - ptl_md_t 
*new_inout, ptl_handle_eq_t testq_in) +int PtlMDUnlink(ptl_handle_md_t md_in) { - PtlMDUpdate_internal_in args; - PtlMDUpdate_internal_out ret; - int rc; - - args.md_in = md_in; - - if (old_inout) { - args.old_inout = *old_inout; - args.old_inout_valid = 1; - } else - args.old_inout_valid = 0; - - if (new_inout) { - args.new_inout = *new_inout; - args.new_inout_valid = 1; - } else - args.new_inout_valid = 0; - - if (PtlHandleIsEqual (testq_in, PTL_EQ_NONE)) { - args.testq_in = PTL_EQ_NONE; - args.sequence_in = -1; - } else { - ptl_eq_t *eq = ptl_handle2usereq (&testq_in); - - args.testq_in = eq->cb_eq_handle; - args.sequence_in = eq->sequence; - } - - rc = do_forward(md_in, PTL_MDUPDATE, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? PTL_MD_INVALID : rc; - - if (old_inout) - *old_inout = ret.old_inout; - - return ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + + return (nal->nal_md_unlink)(nal, &md_in); } -int PtlMDUnlink(ptl_handle_md_t md_in) +int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, + ptl_eq_handler_t callback, + ptl_handle_eq_t *handle_out) { - PtlMDUnlink_in args; - PtlMDUnlink_out ret; - int rc; - - args.md_in = md_in; - rc = do_forward(md_in, PTL_MDUNLINK, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? 
PTL_MD_INVALID : rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return (nal->nal_eq_alloc)(nal, count, callback, handle_out); } -int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, - ptl_eq_handler_t callback, - ptl_handle_eq_t * handle_out) +int PtlEQFree(ptl_handle_eq_t eventq) { - ptl_eq_t *eq = NULL; - ptl_event_t *ev = NULL; - PtlEQAlloc_in args; - PtlEQAlloc_out ret; - int rc, i; - nal_t *nal; + nal_t *nal; if (!ptl_init) return PTL_NO_INIT; - nal = ptl_hndl2nal (&interface); + nal = ptl_hndl2nal(&eventq); if (nal == NULL) - return PTL_HANDLE_INVALID; + return PTL_EQ_INVALID; - if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ - do { /* knock off all but the top bit... */ - count &= ~LOWEST_BIT_SET (count); - } while (count != LOWEST_BIT_SET(count)); - - count <<= 1; /* ...and round up */ - } - - if (count == 0) /* catch bad parameter / overflow on roundup */ - return (PTL_VAL_FAILED); - - PORTAL_ALLOC(ev, count * sizeof(ptl_event_t)); - if (!ev) - return PTL_NO_SPACE; - - for (i = 0; i < count; i++) - ev[i].sequence = 0; - - args.ni_in = interface; - args.count_in = count; - args.base_in = ev; - args.len_in = count * sizeof(*ev); - args.callback_in = callback; - - rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - goto fail; - if (ret.rc) - GOTO(fail, rc = ret.rc); - - PORTAL_ALLOC(eq, sizeof(*eq)); - if (!eq) { - rc = PTL_NO_SPACE; - goto fail; - } - - eq->sequence = 1; - eq->size = count; - eq->base = ev; - - /* EQ handles are a little wierd. PtlEQGet() just looks at the - * queued events in shared memory. It doesn't want to do_forward() - * at all, so the cookie in the EQ handle we pass out of here is - * simply a pointer to the event queue we just set up. We stash - * the handle returned by do_forward(), so we can pass it back via - * do_forward() when we need to. 
*/ - - eq->cb_eq_handle.nal_idx = interface.nal_idx; - eq->cb_eq_handle.cookie = ret.handle_out.cookie; - - handle_out->nal_idx = interface.nal_idx; - handle_out->cookie = (__u64)((unsigned long)eq); - return PTL_OK; + return (nal->nal_eq_free)(nal, &eventq); +} -fail: - PORTAL_FREE(ev, count * sizeof(ptl_event_t)); - return rc; +int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t *ev) +{ + int which; + + return (PtlEQPoll (&eventq, 1, 0, ev, &which)); } -int PtlEQFree(ptl_handle_eq_t eventq) +int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) { - PtlEQFree_in args; - PtlEQFree_out ret; - ptl_eq_t *eq; - int rc; + int which; + + return (PtlEQPoll (&eventq_in, 1, PTL_TIME_FOREVER, + event_out, &which)); +} - eq = ptl_handle2usereq (&eventq); - args.eventq_in = eq->cb_eq_handle; +int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, + ptl_event_t *event_out, int *which_out) +{ + int i; + nal_t *nal; - rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args, - sizeof(args), &ret, sizeof(ret)); + if (!ptl_init) + return PTL_NO_INIT; + + if (neq_in < 1) + return PTL_EQ_INVALID; + + nal = ptl_hndl2nal(&eventqs_in[0]); + if (nal == NULL) + return PTL_EQ_INVALID; - /* XXX we're betting rc == PTL_OK here */ - PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t)); - PORTAL_FREE(eq, sizeof(*eq)); + for (i = 1; i < neq_in; i++) + if (ptl_hndl2nal(&eventqs_in[i]) != nal) + return PTL_EQ_INVALID; - return rc; + return (nal->nal_eq_poll)(nal, eventqs_in, neq_in, timeout, + event_out, which_out); } + int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, ptl_process_id_t match_id_in, ptl_pt_index_t portal_in) { - PtlACEntry_in args; - PtlACEntry_out ret; - int rc; - - /* - * Copy arguments into the argument block to - * hand to the forwarding object - */ - args.ni_in = ni_in; - args.index_in = index_in; - args.match_id_in = match_id_in; - args.portal_in = portal_in; - - rc = do_forward(ni_in, PTL_ACENTRY, &args, sizeof(args), &ret, - sizeof(ret)); - - 
return (rc != PTL_OK) ? rc : ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&ni_in); + if (nal == NULL) + return PTL_NI_INVALID; + + return (nal->nal_ace_entry)(nal, index_in, match_id_in, portal_in); } int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, ptl_process_id_t target_in, ptl_pt_index_t portal_in, - ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_ac_index_t ac_in, ptl_match_bits_t match_bits_in, ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in) { - PtlPut_in args; - PtlPut_out ret; - int rc; - - /* - * Copy arguments into the argument block to - * hand to the forwarding object - */ - args.md_in = md_in; - args.ack_req_in = ack_req_in; - args.target_in = target_in; - args.portal_in = portal_in; - args.cookie_in = cookie_in; - args.match_bits_in = match_bits_in; - args.offset_in = offset_in; - args.hdr_data_in = hdr_data_in; - - rc = do_forward(md_in, PTL_PUT, &args, sizeof(args), &ret, sizeof(ret)); - - return (rc != PTL_OK) ? rc : ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + + return (nal->nal_put)(nal, &md_in, ack_req_in, + &target_in, portal_in, ac_in, + match_bits_in, offset_in, hdr_data_in); } int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, - ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_pt_index_t portal_in, ptl_ac_index_t ac_in, ptl_match_bits_t match_bits_in, ptl_size_t offset_in) { - PtlGet_in args; - PtlGet_out ret; - int rc; - - /* - * Copy arguments into the argument block to - * hand to the forwarding object - */ - args.md_in = md_in; - args.target_in = target_in; - args.portal_in = portal_in; - args.cookie_in = cookie_in; - args.match_bits_in = match_bits_in; - args.offset_in = offset_in; - - rc = do_forward(md_in, PTL_GET, &args, sizeof(args), &ret, sizeof(ret)); - - return (rc != PTL_OK) ? 
rc : ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + + return (nal->nal_get)(nal, &md_in, + &target_in, portal_in, ac_in, + match_bits_in, offset_in); } + diff --git a/lnet/lnet/autoMakefile.am b/lnet/lnet/autoMakefile.am index bf7a107..285f8fe 100644 --- a/lnet/lnet/autoMakefile.am +++ b/lnet/lnet/autoMakefile.am @@ -3,8 +3,8 @@ # This code is issued under the GNU General Public License. # See the file COPYING in this distribution -my_sources = api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c \ - lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c \ +my_sources = api-errno.c api-ni.c api-wrap.c \ + lib-init.c lib-me.c lib-msg.c lib-eq.c \ lib-md.c lib-move.c lib-ni.c lib-pid.c if !CRAY_PORTALS diff --git a/lnet/lnet/lib-dispatch.c b/lnet/lnet/lib-dispatch.c deleted file mode 100644 index 798e117..0000000 --- a/lnet/lnet/lib-dispatch.c +++ /dev/null @@ -1,79 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-dispatch.c - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include -#include - -typedef struct { - int (*fun) (nal_cb_t * nal, void *private, void *in, void *out); - char *name; -} dispatch_table_t; - -static dispatch_table_t dispatch_table[] = { - [PTL_GETID] {do_PtlGetId, "PtlGetId"}, - [PTL_NISTATUS] {do_PtlNIStatus, "PtlNIStatus"}, - [PTL_NIDIST] {do_PtlNIDist, "PtlNIDist"}, - [PTL_MEATTACH] {do_PtlMEAttach, "PtlMEAttach"}, - [PTL_MEINSERT] {do_PtlMEInsert, "PtlMEInsert"}, - [PTL_MEUNLINK] {do_PtlMEUnlink, "PtlMEUnlink"}, - [PTL_TBLDUMP] {do_PtlTblDump, "PtlTblDump"}, - [PTL_MEDUMP] {do_PtlMEDump, "PtlMEDump"}, - [PTL_MDATTACH] {do_PtlMDAttach, "PtlMDAttach"}, - [PTL_MDBIND] {do_PtlMDBind, "PtlMDBind"}, - [PTL_MDUPDATE] {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"}, - [PTL_MDUNLINK] {do_PtlMDUnlink, "PtlMDUnlink"}, - [PTL_EQALLOC] {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"}, - [PTL_EQFREE] {do_PtlEQFree_internal, "PtlEQFree_internal"}, - [PTL_PUT] {do_PtlPut, "PtlPut"}, - [PTL_GET] {do_PtlGet, "PtlGet"}, - [PTL_FAILNID] {do_PtlFailNid, "PtlFailNid"}, - /* */ {0, ""} -}; - -/* - * This really should be elsewhere, but lib-p30/dispatch.c is - * an automatically generated file. 
- */ -void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block, - void *ret_block) -{ - lib_ni_t *ni = &nal->ni; - - if (index < 0 || index > LIB_MAX_DISPATCH || - !dispatch_table[index].fun) { - CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index); - return; - } - - CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid, - dispatch_table[index].name, index); - - dispatch_table[index].fun(nal, private, arg_block, ret_block); -} - -char *dispatch_name(int index) -{ - return dispatch_table[index].name; -} diff --git a/lnet/lnet/lib-eq.c b/lnet/lnet/lib-eq.c index 8a91860..8ea6fdd 100644 --- a/lnet/lnet/lib-eq.c +++ b/lnet/lnet/lib-eq.c @@ -25,104 +25,241 @@ #define DEBUG_SUBSYSTEM S_PORTALS #include -#include -int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args, - void *v_ret) +int +lib_api_eq_alloc (nal_t *apinal, ptl_size_t count, + ptl_eq_handler_t callback, + ptl_handle_eq_t *handle) { - /* - * Incoming: - * ptl_handle_ni_t ni_in - * ptl_size_t count_in - * void * base_in - * - * Outgoing: - * ptl_handle_eq_t * handle_out - */ - - PtlEQAlloc_in *args = v_args; - PtlEQAlloc_out *ret = v_ret; - - lib_eq_t *eq; - unsigned long flags; - - /* api should have rounded up */ - if (args->count_in != LOWEST_BIT_SET (args->count_in)) - return ret->rc = PTL_VAL_FAILED; + lib_nal_t *nal = apinal->nal_data; + lib_eq_t *eq; + unsigned long flags; + int rc; + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq + * overflow, they don't skip entries, so the queue has the same + * apparant capacity at all times */ + + if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ + do { /* knock off all but the top bit... 
*/ + count &= ~LOWEST_BIT_SET (count); + } while (count != LOWEST_BIT_SET(count)); + + count <<= 1; /* ...and round up */ + } + + if (count == 0) /* catch bad parameter / overflow on roundup */ + return (PTL_VAL_FAILED); + eq = lib_eq_alloc (nal); if (eq == NULL) - return (ret->rc = PTL_NO_SPACE); + return (PTL_NO_SPACE); - state_lock(nal, &flags); + PORTAL_ALLOC(eq->eq_events, count * sizeof(ptl_event_t)); + if (eq->eq_events == NULL) { + LIB_LOCK(nal, flags); + lib_eq_free (nal, eq); + LIB_UNLOCK(nal, flags); + } - if (nal->cb_map != NULL) { + if (nal->libnal_map != NULL) { struct iovec iov = { - .iov_base = args->base_in, - .iov_len = args->count_in * sizeof (ptl_event_t) }; + .iov_base = eq->eq_events, + .iov_len = count * sizeof(ptl_event_t)}; - ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey); - if (ret->rc != PTL_OK) { + rc = nal->libnal_map(nal, 1, &iov, &eq->eq_addrkey); + if (rc != PTL_OK) { + LIB_LOCK(nal, flags); lib_eq_free (nal, eq); - - state_unlock (nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + return (rc); } } - eq->sequence = 1; - eq->base = args->base_in; - eq->size = args->count_in; + /* NB this resets all event sequence numbers to 0, to be earlier + * than eq_deq_seq */ + memset(eq->eq_events, 0, count * sizeof(ptl_event_t)); + + eq->eq_deq_seq = 1; + eq->eq_enq_seq = 1; + eq->eq_size = count; eq->eq_refcount = 0; - eq->event_callback = args->callback_in; + eq->eq_callback = callback; + + LIB_LOCK(nal, flags); lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ); - list_add (&eq->eq_list, &nal->ni.ni_active_eqs); + list_add (&eq->eq_list, &nal->libnal_ni.ni_active_eqs); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - ptl_eq2handle(&ret->handle_out, eq); - return (ret->rc = PTL_OK); + ptl_eq2handle(handle, nal, eq); + return (PTL_OK); } -int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args, - void *v_ret) +int +lib_api_eq_free(nal_t *apinal, ptl_handle_eq_t *eqh) { - /* - * Incoming: - * 
ptl_handle_eq_t eventq_in - * - * Outgoing: - */ - - PtlEQFree_in *args = v_args; - PtlEQFree_out *ret = v_ret; - lib_eq_t *eq; - long flags; + lib_nal_t *nal = apinal->nal_data; + lib_eq_t *eq; + int size; + ptl_event_t *events; + void *addrkey; + unsigned long flags; - state_lock (nal, &flags); + LIB_LOCK(nal, flags); - eq = ptl_handle2eq(&args->eventq_in, nal); + eq = ptl_handle2eq(eqh, nal); if (eq == NULL) { - ret->rc = PTL_EQ_INVALID; - } else if (eq->eq_refcount != 0) { - ret->rc = PTL_EQ_IN_USE; + LIB_UNLOCK(nal, flags); + return (PTL_EQ_INVALID); + } + + if (eq->eq_refcount != 0) { + LIB_UNLOCK(nal, flags); + return (PTL_EQ_IN_USE); + } + + /* stash for free after lock dropped */ + events = eq->eq_events; + size = eq->eq_size; + addrkey = eq->eq_addrkey; + + lib_invalidate_handle (nal, &eq->eq_lh); + list_del (&eq->eq_list); + lib_eq_free (nal, eq); + + LIB_UNLOCK(nal, flags); + + if (nal->libnal_unmap != NULL) { + struct iovec iov = { + .iov_base = events, + .iov_len = size * sizeof(ptl_event_t)}; + + nal->libnal_unmap(nal, 1, &iov, &addrkey); + } + + PORTAL_FREE(events, size * sizeof (ptl_event_t)); + + return (PTL_OK); +} + +int +lib_get_event (lib_eq_t *eq, ptl_event_t *ev) +{ + int new_index = eq->eq_deq_seq & (eq->eq_size - 1); + ptl_event_t *new_event = &eq->eq_events[new_index]; + int rc; + ENTRY; + + CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->eq_deq_seq, eq->eq_size); + + if (PTL_SEQ_GT (eq->eq_deq_seq, new_event->sequence)) { + RETURN(PTL_EQ_EMPTY); + } + + /* We've got a new event... */ + *ev = *new_event; + + /* ...but did it overwrite an event we've not seen yet? 
*/ + if (eq->eq_deq_seq == new_event->sequence) { + rc = PTL_OK; } else { - if (nal->cb_unmap != NULL) { - struct iovec iov = { - .iov_base = eq->base, - .iov_len = eq->size * sizeof (ptl_event_t) }; - - nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey); + CERROR("Event Queue Overflow: eq seq %lu ev seq %lu\n", + eq->eq_deq_seq, new_event->sequence); + rc = PTL_EQ_DROPPED; + } + + eq->eq_deq_seq = new_event->sequence + 1; + RETURN(rc); +} + + +int +lib_api_eq_poll (nal_t *apinal, + ptl_handle_eq_t *eventqs, int neq, int timeout_ms, + ptl_event_t *event, int *which) +{ + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; + unsigned long flags; + int i; + int rc; +#ifdef __KERNEL__ + wait_queue_t wq; + unsigned long now; +#else + struct timeval then; + struct timeval now; + struct timespec ts; +#endif + ENTRY; + + LIB_LOCK(nal, flags); + + for (;;) { + for (i = 0; i < neq; i++) { + lib_eq_t *eq = ptl_handle2eq(&eventqs[i], nal); + + rc = lib_get_event (eq, event); + if (rc != PTL_EQ_EMPTY) { + LIB_UNLOCK(nal, flags); + *which = i; + RETURN(rc); + } + } + + if (timeout_ms == 0) { + LIB_UNLOCK (nal, flags); + RETURN (PTL_EQ_EMPTY); } - lib_invalidate_handle (nal, &eq->eq_lh); - list_del (&eq->eq_list); - lib_eq_free (nal, eq); - ret->rc = PTL_OK; - } + /* Some architectures force us to do spin locking/unlocking + * in the same stack frame, means we can abstract the + * locking here */ +#ifdef __KERNEL__ + init_waitqueue_entry(&wq, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ni->ni_waitq, &wq); - state_unlock (nal, &flags); + LIB_UNLOCK(nal, flags); - return (ret->rc); + if (timeout_ms < 0) { + schedule (); + } else { + now = jiffies; + schedule_timeout((timeout_ms * HZ)/1000); + timeout_ms -= ((jiffies - now) * 1000)/HZ; + if (timeout_ms < 0) + timeout_ms = 0; + } + + LIB_LOCK(nal, flags); +#else + if (timeout_ms < 0) { + pthread_cond_wait(&ni->ni_cond, &ni->ni_mutex); + } else { + gettimeofday(&then, NULL); + + ts.tv_sec = 
then.tv_sec + timeout_ms/1000; + ts.tv_nsec = then.tv_usec * 1000 + + (timeout_ms%1000) * 1000000; + if (ts.tv_nsec >= 1000000000) { + ts.tv_sec++; + ts.tv_nsec -= 1000000000; + } + + pthread_cond_timedwait(&ni->ni_cond, + &ni->ni_mutex, &ts); + + gettimeofday(&now, NULL); + timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 + + (now.tv_usec - then.tv_usec) / 1000; + + if (timeout_ms < 0) + timeout_ms = 0; + } +#endif + } } diff --git a/lnet/lnet/lib-init.c b/lnet/lnet/lib-init.c index c62dbc2..9d97bc1 100644 --- a/lnet/lnet/lib-init.c +++ b/lnet/lnet/lib-init.c @@ -41,7 +41,7 @@ #ifndef PTL_USE_LIB_FREELIST int -kportal_descriptor_setup (nal_cb_t *nal, +kportal_descriptor_setup (lib_nal_t *nal, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { @@ -54,13 +54,13 @@ kportal_descriptor_setup (nal_cb_t *nal, } void -kportal_descriptor_cleanup (nal_cb_t *nal) +kportal_descriptor_cleanup (lib_nal_t *nal) { } #else int -lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) +lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int n, int size) { char *space; @@ -68,7 +68,7 @@ lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) size += offsetof (lib_freeobj_t, fo_contents); - space = nal->cb_malloc (nal, n * size); + PORTAL_ALLOC(space, n * size); if (space == NULL) return (PTL_NO_SPACE); @@ -88,7 +88,7 @@ lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) } void -lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl) +lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl) { struct list_head *el; int count; @@ -102,23 +102,24 @@ lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl) LASSERT (count == fl->fl_nobjs); - nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); + PORTAL_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); memset (fl, 0, sizeof (fl)); } int -kportal_descriptor_setup (nal_cb_t *nal, +kportal_descriptor_setup (lib_nal_t *nal, ptl_ni_limits_t *requested_limits, 
ptl_ni_limits_t *actual_limits) { /* NB on failure caller must still call kportal_descriptor_cleanup */ /* ****** */ - int rc; + lib_ni_t *ni = &nal->libnal_ni; + int rc; - memset (&nal->ni.ni_free_mes, 0, sizeof (nal->ni.ni_free_mes)); - memset (&nal->ni.ni_free_msgs, 0, sizeof (nal->ni.ni_free_msgs)); - memset (&nal->ni.ni_free_mds, 0, sizeof (nal->ni.ni_free_mds)); - memset (&nal->ni.ni_free_eqs, 0, sizeof (nal->ni.ni_free_eqs)); + memset (&ni->ni_free_mes, 0, sizeof (ni->ni_free_mes)); + memset (&ni->ni_free_msgs, 0, sizeof (ni->ni_free_msgs)); + memset (&ni->ni_free_mds, 0, sizeof (ni->ni_free_mds)); + memset (&ni->ni_free_eqs, 0, sizeof (ni->ni_free_eqs)); /* Ignore requested limits! */ actual_limits->max_mes = MAX_MES; @@ -127,39 +128,41 @@ kportal_descriptor_setup (nal_cb_t *nal, /* Hahahah what a load of bollocks. There's nowhere to * specify the max # messages in-flight */ - rc = lib_freelist_init (nal, &nal->ni.ni_free_mes, + rc = lib_freelist_init (nal, &ni->ni_free_mes, MAX_MES, sizeof (lib_me_t)); if (rc != PTL_OK) return (rc); - rc = lib_freelist_init (nal, &nal->ni.ni_free_msgs, + rc = lib_freelist_init (nal, &ni->ni_free_msgs, MAX_MSGS, sizeof (lib_msg_t)); if (rc != PTL_OK) return (rc); - rc = lib_freelist_init (nal, &nal->ni.ni_free_mds, + rc = lib_freelist_init (nal, &ni->ni_free_mds, MAX_MDS, sizeof (lib_md_t)); if (rc != PTL_OK) return (rc); - rc = lib_freelist_init (nal, &nal->ni.ni_free_eqs, + rc = lib_freelist_init (nal, &ni->ni_free_eqs, MAX_EQS, sizeof (lib_eq_t)); return (rc); } void -kportal_descriptor_cleanup (nal_cb_t *nal) +kportal_descriptor_cleanup (lib_nal_t *nal) { - lib_freelist_fini (nal, &nal->ni.ni_free_mes); - lib_freelist_fini (nal, &nal->ni.ni_free_msgs); - lib_freelist_fini (nal, &nal->ni.ni_free_mds); - lib_freelist_fini (nal, &nal->ni.ni_free_eqs); + lib_ni_t *ni = &nal->libnal_ni; + + lib_freelist_fini (nal, &ni->ni_free_mes); + lib_freelist_fini (nal, &ni->ni_free_msgs); + lib_freelist_fini (nal, &ni->ni_free_mds); + 
lib_freelist_fini (nal, &ni->ni_free_eqs); } #endif __u64 -lib_create_interface_cookie (nal_cb_t *nal) +lib_create_interface_cookie (lib_nal_t *nal) { /* NB the interface cookie in wire handles guards against delayed * replies and ACKs appearing valid in a new instance of the same @@ -180,9 +183,9 @@ lib_create_interface_cookie (nal_cb_t *nal) } int -lib_setup_handle_hash (nal_cb_t *nal) +lib_setup_handle_hash (lib_nal_t *nal) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; int i; /* Arbitrary choice of hash table size */ @@ -191,9 +194,8 @@ lib_setup_handle_hash (nal_cb_t *nal) #else ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4; #endif - ni->ni_lh_hash_table = - (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size - * sizeof (struct list_head)); + PORTAL_ALLOC(ni->ni_lh_hash_table, + ni->ni_lh_hash_size * sizeof (struct list_head)); if (ni->ni_lh_hash_table == NULL) return (PTL_NO_SPACE); @@ -206,22 +208,22 @@ lib_setup_handle_hash (nal_cb_t *nal) } void -lib_cleanup_handle_hash (nal_cb_t *nal) +lib_cleanup_handle_hash (lib_nal_t *nal) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; if (ni->ni_lh_hash_table == NULL) return; - nal->cb_free (nal, ni->ni_lh_hash_table, - ni->ni_lh_hash_size * sizeof (struct list_head)); + PORTAL_FREE(ni->ni_lh_hash_table, + ni->ni_lh_hash_size * sizeof (struct list_head)); } lib_handle_t * -lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) +lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type) { /* ALWAYS called with statelock held */ - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; struct list_head *list; struct list_head *el; unsigned int hash; @@ -243,10 +245,10 @@ lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) } void -lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) +lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type) { /* ALWAYS called with statelock held */ - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = 
&nal->libnal_ni; unsigned int hash; LASSERT (type >= 0 && type < PTL_COOKIE_TYPES); @@ -258,95 +260,120 @@ lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) } void -lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh) +lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh) { list_del (&lh->lh_hash_chain); } int -lib_init(nal_cb_t *nal, ptl_process_id_t process_id, +lib_init(lib_nal_t *libnal, nal_t *apinal, + ptl_process_id_t process_id, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { int rc = PTL_OK; - lib_ni_t *ni = &nal->ni; - int ptl_size; - int i; + lib_ni_t *ni = &libnal->libnal_ni; + int ptl_size; + int i; ENTRY; /* NB serialised in PtlNIInit() */ lib_assert_wire_constants (); - - /* - * Allocate the portal table for this interface - * and all per-interface objects. - */ - memset(&ni->counters, 0, sizeof(lib_counters_t)); - rc = kportal_descriptor_setup (nal, requested_limits, - &ni->actual_limits); + /* Setup the API nal with the lib API handling functions */ + apinal->nal_get_id = lib_api_get_id; + apinal->nal_ni_status = lib_api_ni_status; + apinal->nal_ni_dist = lib_api_ni_dist; + apinal->nal_fail_nid = lib_api_fail_nid; + apinal->nal_me_attach = lib_api_me_attach; + apinal->nal_me_insert = lib_api_me_insert; + apinal->nal_me_unlink = lib_api_me_unlink; + apinal->nal_md_attach = lib_api_md_attach; + apinal->nal_md_bind = lib_api_md_bind; + apinal->nal_md_unlink = lib_api_md_unlink; + apinal->nal_md_update = lib_api_md_update; + apinal->nal_eq_alloc = lib_api_eq_alloc; + apinal->nal_eq_free = lib_api_eq_free; + apinal->nal_eq_poll = lib_api_eq_poll; + apinal->nal_put = lib_api_put; + apinal->nal_get = lib_api_get; + + apinal->nal_data = libnal; + ni->ni_api = apinal; + + rc = kportal_descriptor_setup (libnal, requested_limits, + &ni->ni_actual_limits); if (rc != PTL_OK) goto out; + memset(&ni->ni_counters, 0, sizeof(lib_counters_t)); + INIT_LIST_HEAD (&ni->ni_active_msgs); INIT_LIST_HEAD (&ni->ni_active_mds); 
INIT_LIST_HEAD (&ni->ni_active_eqs); - INIT_LIST_HEAD (&ni->ni_test_peers); - ni->ni_interface_cookie = lib_create_interface_cookie (nal); +#ifdef __KERNEL__ + spin_lock_init (&ni->ni_lock); + init_waitqueue_head (&ni->ni_waitq); +#else + pthread_mutex_init(&ni->ni_mutex, NULL); + pthread_cond_init(&ni->ni_cond, NULL); +#endif + + ni->ni_interface_cookie = lib_create_interface_cookie (libnal); ni->ni_next_object_cookie = 0; - rc = lib_setup_handle_hash (nal); + rc = lib_setup_handle_hash (libnal); if (rc != PTL_OK) goto out; - ni->nid = process_id.nid; - ni->pid = process_id.pid; + ni->ni_pid = process_id; if (requested_limits != NULL) ptl_size = requested_limits->max_pt_index + 1; else ptl_size = 64; - ni->tbl.size = ptl_size; - ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size); - if (ni->tbl.tbl == NULL) { + ni->ni_portals.size = ptl_size; + PORTAL_ALLOC(ni->ni_portals.tbl, + ptl_size * sizeof(struct list_head)); + if (ni->ni_portals.tbl == NULL) { rc = PTL_NO_SPACE; goto out; } for (i = 0; i < ptl_size; i++) - INIT_LIST_HEAD(&(ni->tbl.tbl[i])); + INIT_LIST_HEAD(&(ni->ni_portals.tbl[i])); /* max_{mes,mds,eqs} set in kportal_descriptor_setup */ /* We don't have an access control table! */ - ni->actual_limits.max_ac_index = -1; + ni->ni_actual_limits.max_ac_index = -1; - ni->actual_limits.max_pt_index = ptl_size - 1; - ni->actual_limits.max_md_iovecs = PTL_MD_MAX_IOV; - ni->actual_limits.max_me_list = INT_MAX; + ni->ni_actual_limits.max_pt_index = ptl_size - 1; + ni->ni_actual_limits.max_md_iovecs = PTL_MD_MAX_IOV; + ni->ni_actual_limits.max_me_list = INT_MAX; /* We don't support PtlGetPut! 
*/ - ni->actual_limits.max_getput_md = 0; + ni->ni_actual_limits.max_getput_md = 0; if (actual_limits != NULL) - *actual_limits = ni->actual_limits; + *actual_limits = ni->ni_actual_limits; out: if (rc != PTL_OK) { - lib_cleanup_handle_hash (nal); - kportal_descriptor_cleanup (nal); + lib_cleanup_handle_hash (libnal); + kportal_descriptor_cleanup (libnal); } RETURN (rc); } int -lib_fini(nal_cb_t * nal) +lib_fini(lib_nal_t *nal) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; int idx; /* NB no state_lock() since this is the last reference. The NAL @@ -355,9 +382,9 @@ lib_fini(nal_cb_t * nal) * network op (eg MD with non-zero pending count) */ - for (idx = 0; idx < ni->tbl.size; idx++) - while (!list_empty (&ni->tbl.tbl[idx])) { - lib_me_t *me = list_entry (ni->tbl.tbl[idx].next, + for (idx = 0; idx < ni->ni_portals.size; idx++) + while (!list_empty (&ni->ni_portals.tbl[idx])) { + lib_me_t *me = list_entry (ni->ni_portals.tbl[idx].next, lib_me_t, me_list); CERROR ("Active me %p on exit\n", me); @@ -392,10 +419,16 @@ lib_fini(nal_cb_t * nal) lib_msg_free (nal, msg); } - nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size); + PORTAL_FREE(ni->ni_portals.tbl, + ni->ni_portals.size * sizeof(struct list_head)); lib_cleanup_handle_hash (nal); kportal_descriptor_cleanup (nal); +#ifndef __KERNEL__ + pthread_mutex_destroy(&ni->ni_mutex); + pthread_cond_destroy(&ni->ni_cond); +#endif + return (PTL_OK); } diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index 64a55b9..a4df791 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ -31,10 +31,10 @@ #endif #include -#include /* must be called with state lock held */ -void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) +void +lib_md_unlink(lib_nal_t *nal, lib_md_t *md) { if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) == 0) { /* first unlink attempt... 
*/ @@ -62,12 +62,15 @@ void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) CDEBUG(D_NET, "Unlinking md %p\n", md); if ((md->options & PTL_MD_KIOV) != 0) { - if (nal->cb_unmap_pages != NULL) - nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov, - &md->md_addrkey); - } else if (nal->cb_unmap != NULL) { - nal->cb_unmap (nal, md->md_niov, md->md_iov.iov, - &md->md_addrkey); + if (nal->libnal_unmap_pages != NULL) + nal->libnal_unmap_pages (nal, + md->md_niov, + md->md_iov.kiov, + &md->md_addrkey); + } else if (nal->libnal_unmap != NULL) { + nal->libnal_unmap (nal, + md->md_niov, md->md_iov.iov, + &md->md_addrkey); } if (md->eq != NULL) { @@ -80,124 +83,124 @@ void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) } /* must be called with state lock held */ -static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, - ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink) +static int +lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink) { lib_eq_t *eq = NULL; int rc; int i; int niov; + int total_length = 0; /* NB we are passed an allocated, but uninitialised/active md. * if we return success, caller may lib_md_unlink() it. * otherwise caller may only lib_md_free() it. */ - if (!PtlHandleIsEqual (*eqh, PTL_EQ_NONE)) { - eq = ptl_handle2eq(eqh, nal); + if (!PtlHandleIsEqual (umd->eventq, PTL_EQ_NONE)) { + eq = ptl_handle2eq(&umd->eventq, nal); if (eq == NULL) return PTL_EQ_INVALID; } - /* Must check this _before_ allocation. Also, note that non-iov - * MDs must set md_niov to 0. */ - LASSERT((md->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0 || - md->length <= PTL_MD_MAX_IOV); - /* This implementation doesn't know how to create START events or * disable END events. Best to LASSERT our caller is compliant so * we find out quickly... 
*/ - LASSERT (PtlHandleIsEqual (*eqh, PTL_EQ_NONE) || - ((md->options & PTL_MD_EVENT_START_DISABLE) != 0 && - (md->options & PTL_MD_EVENT_END_DISABLE) == 0)); - - if ((md->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ - (md->max_size < 0 || md->max_size > md->length)) // illegal max_size - return PTL_MD_INVALID; - - new->me = NULL; - new->start = md->start; - new->offset = 0; - new->max_size = md->max_size; - new->options = md->options; - new->user_ptr = md->user_ptr; - new->eq = eq; - new->threshold = md->threshold; - new->pending = 0; - new->md_flags = (unlink == PTL_UNLINK) ? PTL_MD_FLAG_AUTO_UNLINK : 0; - - if ((md->options & PTL_MD_IOVEC) != 0) { - int total_length = 0; - - if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */ - return PTL_MD_INVALID; - - new->md_niov = niov = md->length; - - if (nal->cb_read (nal, private, new->md_iov.iov, md->start, - niov * sizeof (new->md_iov.iov[0]))) - return PTL_SEGV; + LASSERT (eq == NULL || + ((umd->options & PTL_MD_EVENT_START_DISABLE) != 0 && + (umd->options & PTL_MD_EVENT_END_DISABLE) == 0)); + + lmd->me = NULL; + lmd->start = umd->start; + lmd->offset = 0; + lmd->max_size = umd->max_size; + lmd->options = umd->options; + lmd->user_ptr = umd->user_ptr; + lmd->eq = eq; + lmd->threshold = umd->threshold; + lmd->pending = 0; + lmd->md_flags = (unlink == PTL_UNLINK) ? 
PTL_MD_FLAG_AUTO_UNLINK : 0; + + if ((umd->options & PTL_MD_IOVEC) != 0) { + + if ((umd->options & PTL_MD_KIOV) != 0) /* Can't specify both */ + return PTL_MD_ILLEGAL; + + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.iov, umd->start, + niov * sizeof (lmd->md_iov.iov[0])); for (i = 0; i < niov; i++) { /* We take the base address on trust */ - if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */ - return PTL_VAL_FAILED; + if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return PTL_MD_ILLEGAL; - total_length += new->md_iov.iov[i].iov_len; + total_length += lmd->md_iov.iov[i].iov_len; } - new->length = total_length; + lmd->length = total_length; - if (nal->cb_map != NULL) { - rc = nal->cb_map (nal, niov, new->md_iov.iov, - &new->md_addrkey); + if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return PTL_MD_ILLEGAL; + + if (nal->libnal_map != NULL) { + rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, + &lmd->md_addrkey); if (rc != PTL_OK) return (rc); } - } else if ((md->options & PTL_MD_KIOV) != 0) { + } else if ((umd->options & PTL_MD_KIOV) != 0) { #ifndef __KERNEL__ - return PTL_MD_INVALID; -#else - int total_length = 0; - + return PTL_MD_ILLEGAL; +#else /* Trap attempt to use paged I/O if unsupported early. 
*/ - if (nal->cb_send_pages == NULL || - nal->cb_recv_pages == NULL) + if (nal->libnal_send_pages == NULL || + nal->libnal_recv_pages == NULL) return PTL_MD_INVALID; - new->md_niov = niov = md->length; + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.kiov, umd->start, + niov * sizeof (lmd->md_iov.kiov[0])); - if (nal->cb_read (nal, private, new->md_iov.kiov, md->start, - niov * sizeof (new->md_iov.kiov[0]))) - return PTL_SEGV; - for (i = 0; i < niov; i++) { /* We take the page pointer on trust */ - if (new->md_iov.kiov[i].kiov_offset + - new->md_iov.kiov[i].kiov_len > PAGE_SIZE ) + if (lmd->md_iov.kiov[i].kiov_offset + + lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE ) return PTL_VAL_FAILED; /* invalid length */ - total_length += new->md_iov.kiov[i].kiov_len; + total_length += lmd->md_iov.kiov[i].kiov_len; } - new->length = total_length; + lmd->length = total_length; + + if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return PTL_MD_ILLEGAL; - if (nal->cb_map_pages != NULL) { - rc = nal->cb_map_pages (nal, niov, new->md_iov.kiov, - &new->md_addrkey); + if (nal->libnal_map_pages != NULL) { + rc = nal->libnal_map_pages (nal, niov, lmd->md_iov.kiov, + &lmd->md_addrkey); if (rc != PTL_OK) return (rc); } #endif } else { /* contiguous */ - new->length = md->length; - new->md_niov = niov = 1; - new->md_iov.iov[0].iov_base = md->start; - new->md_iov.iov[0].iov_len = md->length; - - if (nal->cb_map != NULL) { - rc = nal->cb_map (nal, niov, new->md_iov.iov, - &new->md_addrkey); + lmd->length = umd->length; + lmd->md_niov = niov = 1; + lmd->md_iov.iov[0].iov_base = umd->start; + lmd->md_iov.iov[0].iov_len = umd->length; + + if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > umd->length)) // illegal max_size + return PTL_MD_ILLEGAL; + + if (nal->libnal_map != NULL) { + rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, + 
&lmd->md_addrkey); if (rc != PTL_OK) return (rc); } @@ -207,140 +210,125 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, eq->eq_refcount++; /* It's good; let handle2md succeed and add to active mds */ - lib_initialise_handle (nal, &new->md_lh, PTL_COOKIE_TYPE_MD); - list_add (&new->md_list, &nal->ni.ni_active_mds); + lib_initialise_handle (nal, &lmd->md_lh, PTL_COOKIE_TYPE_MD); + list_add (&lmd->md_list, &nal->libnal_ni.ni_active_mds); return PTL_OK; } /* must be called with state lock held */ -void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new) +void +lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd) { /* NB this doesn't copy out all the iov entries so when a * discontiguous MD is copied out, the target gets to know the * original iov pointer (in start) and the number of entries it had * and that's all. */ - new->start = md->start; - new->length = ((md->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0) ? - md->length : md->md_niov; - new->threshold = md->threshold; - new->max_size = md->max_size; - new->options = md->options; - new->user_ptr = md->user_ptr; - ptl_eq2handle(&new->eventq, md->eq); + umd->start = lmd->start; + umd->length = ((lmd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0) ? 
+ lmd->length : lmd->md_niov; + umd->threshold = lmd->threshold; + umd->max_size = lmd->max_size; + umd->options = lmd->options; + umd->user_ptr = lmd->user_ptr; + ptl_eq2handle(&umd->eventq, nal, lmd->eq); } -int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_md_attach(nal_t *apinal, ptl_handle_me_t *meh, + ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle) { - /* - * Incoming: - * ptl_handle_me_t current_in - * ptl_md_t md_in - * ptl_unlink_t unlink_in - * - * Outgoing: - * ptl_handle_md_t * handle_out - */ - - PtlMDAttach_in *args = v_args; - PtlMDAttach_out *ret = v_ret; - lib_me_t *me; - lib_md_t *md; + lib_nal_t *nal = apinal->nal_data; + lib_me_t *me; + lib_md_t *md; unsigned long flags; + int rc; - if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && - args->md_in.length > PTL_MD_MAX_IOV) /* too many fragments */ - return (ret->rc = PTL_IOV_INVALID); + if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && + umd->length > PTL_MD_MAX_IOV) /* too many fragments */ + return PTL_IOV_INVALID; - md = lib_md_alloc(nal, &args->md_in); + md = lib_md_alloc(nal, umd); if (md == NULL) - return (ret->rc = PTL_NO_SPACE); + return PTL_NO_SPACE; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me = ptl_handle2me(&args->me_in, nal); + me = ptl_handle2me(meh, nal); if (me == NULL) { - ret->rc = PTL_ME_INVALID; + rc = PTL_ME_INVALID; } else if (me->md != NULL) { - ret->rc = PTL_ME_IN_USE; + rc = PTL_ME_IN_USE; } else { - ret->rc = lib_md_build(nal, md, private, &args->md_in, - &args->eq_in, args->unlink_in); - - if (ret->rc == PTL_OK) { + rc = lib_md_build(nal, md, umd, unlink); + if (rc == PTL_OK) { me->md = md; md->me = me; - ptl_md2handle(&ret->handle_out, md); + ptl_md2handle(handle, nal, md); - state_unlock (nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_OK); } } lib_md_free (nal, md); - state_unlock (nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + return (rc); } -int 
do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_md_bind(nal_t *apinal, + ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle) { - /* - * Incoming: - * ptl_handle_ni_t ni_in - * ptl_md_t md_in - * - * Outgoing: - * ptl_handle_md_t * handle_out - */ - - PtlMDBind_in *args = v_args; - PtlMDBind_out *ret = v_ret; - lib_md_t *md; + lib_nal_t *nal = apinal->nal_data; + lib_md_t *md; unsigned long flags; + int rc; - if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && - args->md_in.length > PTL_MD_MAX_IOV) /* too many fragments */ - return (ret->rc = PTL_IOV_INVALID); + if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && + umd->length > PTL_MD_MAX_IOV) /* too many fragments */ + return PTL_IOV_INVALID; - md = lib_md_alloc(nal, &args->md_in); + md = lib_md_alloc(nal, umd); if (md == NULL) - return (ret->rc = PTL_NO_SPACE); + return PTL_NO_SPACE; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - ret->rc = lib_md_build(nal, md, private, &args->md_in, - &args->eq_in, args->unlink_in); + rc = lib_md_build(nal, md, umd, unlink); - if (ret->rc == PTL_OK) { - ptl_md2handle(&ret->handle_out, md); + if (rc == PTL_OK) { + ptl_md2handle(handle, nal, md); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_OK); } lib_md_free (nal, md); - state_unlock(nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + return (rc); } -int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_md_unlink (nal_t *apinal, ptl_handle_md_t *mdh) { - PtlMDUnlink_in *args = v_args; - PtlMDUnlink_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; ptl_event_t ev; lib_md_t *md; unsigned long flags; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL) { - state_unlock(nal, &flags); - return (ret->rc = PTL_MD_INVALID); + LIB_UNLOCK(nal, flags); + return PTL_MD_INVALID; } /* If the MD is busy, 
lib_md_unlink just marks it for deletion, and @@ -356,95 +344,82 @@ int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) ev.unlinked = 1; lib_md_deconstruct(nal, md, &ev.mem_desc); - lib_enq_event_locked(nal, private, md->eq, &ev); + lib_enq_event_locked(nal, NULL, md->eq, &ev); } - lib_md_deconstruct(nal, md, &ret->status_out); lib_md_unlink(nal, md); - ret->rc = PTL_OK; - state_unlock(nal, &flags); - - return (PTL_OK); + LIB_UNLOCK(nal, flags); + return PTL_OK; } -int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, - void *v_ret) +int +lib_api_md_update (nal_t *apinal, + ptl_handle_md_t *mdh, + ptl_md_t *oldumd, ptl_md_t *newumd, + ptl_handle_eq_t *testqh) { - /* - * Incoming: - * ptl_handle_md_t md_in - * ptl_md_t * old_inout - * ptl_md_t * new_inout - * ptl_handle_eq_t testq_in - * ptl_seq_t sequence_in - * - * Outgoing: - * ptl_md_t * old_inout - * ptl_md_t * new_inout - */ - PtlMDUpdate_internal_in *args = v_args; - PtlMDUpdate_internal_out *ret = v_ret; - lib_md_t *md; - lib_eq_t *test_eq = NULL; - ptl_md_t *new = &args->new_inout; + lib_nal_t *nal = apinal->nal_data; + lib_md_t *md; + lib_eq_t *test_eq = NULL; unsigned long flags; + int rc; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL) { - ret->rc = PTL_MD_INVALID; + rc = PTL_MD_INVALID; goto out; } - if (args->old_inout_valid) - lib_md_deconstruct(nal, md, &ret->old_inout); + if (oldumd != NULL) + lib_md_deconstruct(nal, md, oldumd); - if (!args->new_inout_valid) { - ret->rc = PTL_OK; + if (newumd == NULL) { + rc = PTL_OK; goto out; } /* XXX fttb, the new MD must be the same "shape" wrt fragmentation, * since we simply overwrite the old lib-md */ - if ((((new->options ^ md->options) & + if ((((newumd->options ^ md->options) & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0) || - ((new->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0 && - new->length != md->md_niov)) { - ret->rc = 
PTL_IOV_INVALID; + ((newumd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0 && + newumd->length != md->md_niov)) { + rc = PTL_IOV_INVALID; goto out; } - if (!PtlHandleIsEqual (args->testq_in, PTL_EQ_NONE)) { - test_eq = ptl_handle2eq(&args->testq_in, nal); + if (!PtlHandleIsEqual (*testqh, PTL_EQ_NONE)) { + test_eq = ptl_handle2eq(testqh, nal); if (test_eq == NULL) { - ret->rc = PTL_EQ_INVALID; + rc = PTL_EQ_INVALID; goto out; } } if (md->pending != 0) { - ret->rc = PTL_MD_NO_UPDATE; - goto out; + rc = PTL_MD_NO_UPDATE; + goto out; } if (test_eq == NULL || - test_eq->sequence == args->sequence_in) { + test_eq->eq_deq_seq == test_eq->eq_enq_seq) { lib_me_t *me = md->me; int unlink = (md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) ? PTL_UNLINK : PTL_RETAIN; // #warning this does not track eq refcounts properly - ret->rc = lib_md_build(nal, md, private, - new, &new->eventq, unlink); + rc = lib_md_build(nal, md, newumd, unlink); md->me = me; } else { - ret->rc = PTL_MD_NO_UPDATE; + rc = PTL_MD_NO_UPDATE; } out: - state_unlock(nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + + return rc; } diff --git a/lnet/lnet/lib-me.c b/lnet/lnet/lib-me.c index 271fc82..9665b4f 100644 --- a/lnet/lnet/lib-me.c +++ b/lnet/lnet/lib-me.c @@ -31,120 +31,129 @@ #endif #include -#include -static void lib_me_dump(nal_cb_t * nal, lib_me_t * me); - -int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_me_attach(nal_t *apinal, + ptl_pt_index_t portal, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle) { - PtlMEAttach_in *args = v_args; - PtlMEAttach_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; - lib_ptl_t *tbl = &ni->tbl; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; + lib_ptl_t *tbl = &ni->ni_portals; + lib_me_t *me; unsigned long flags; - lib_me_t *me; - if (args->index_in >= tbl->size) - return ret->rc = 
PTL_PT_INDEX_INVALID; + if (portal >= tbl->size) + return PTL_PT_INDEX_INVALID; /* Should check for valid matchid, but not yet */ - if (0) - return ret->rc = PTL_PROCESS_INVALID; me = lib_me_alloc (nal); if (me == NULL) - return (ret->rc = PTL_NO_SPACE); + return PTL_NO_SPACE; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me->match_id = args->match_id_in; - me->match_bits = args->match_bits_in; - me->ignore_bits = args->ignore_bits_in; - me->unlink = args->unlink_in; + me->match_id = match_id; + me->match_bits = match_bits; + me->ignore_bits = ignore_bits; + me->unlink = unlink; me->md = NULL; lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME); - if (args->position_in == PTL_INS_AFTER) - list_add_tail(&me->me_list, &(tbl->tbl[args->index_in])); + if (pos == PTL_INS_AFTER) + list_add_tail(&me->me_list, &(tbl->tbl[portal])); else - list_add(&me->me_list, &(tbl->tbl[args->index_in])); + list_add(&me->me_list, &(tbl->tbl[portal])); - ptl_me2handle(&ret->handle_out, me); + ptl_me2handle(handle, nal, me); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return ret->rc = PTL_OK; + return PTL_OK; } -int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_me_insert(nal_t *apinal, + ptl_handle_me_t *current_meh, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle) { - PtlMEInsert_in *args = v_args; - PtlMEInsert_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; + lib_me_t *current_me; + lib_me_t *new_me; unsigned long flags; - lib_me_t *me; - lib_me_t *new; - new = lib_me_alloc (nal); - if (new == NULL) - return (ret->rc = PTL_NO_SPACE); + new_me = lib_me_alloc (nal); + if (new_me == NULL) + return PTL_NO_SPACE; /* Should check for valid matchid, but not yet */ - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me = ptl_handle2me(&args->current_in, nal); - if (me == NULL) { - lib_me_free (nal, new); + 
current_me = ptl_handle2me(current_meh, nal); + if (current_me == NULL) { + lib_me_free (nal, new_me); - state_unlock (nal, &flags); - return (ret->rc = PTL_ME_INVALID); + LIB_UNLOCK(nal, flags); + return PTL_ME_INVALID; } - new->match_id = args->match_id_in; - new->match_bits = args->match_bits_in; - new->ignore_bits = args->ignore_bits_in; - new->unlink = args->unlink_in; - new->md = NULL; + new_me->match_id = match_id; + new_me->match_bits = match_bits; + new_me->ignore_bits = ignore_bits; + new_me->unlink = unlink; + new_me->md = NULL; - lib_initialise_handle (nal, &new->me_lh, PTL_COOKIE_TYPE_ME); + lib_initialise_handle (nal, &new_me->me_lh, PTL_COOKIE_TYPE_ME); - if (args->position_in == PTL_INS_AFTER) - list_add_tail(&new->me_list, &me->me_list); + if (pos == PTL_INS_AFTER) + list_add_tail(&new_me->me_list, &current_me->me_list); else - list_add(&new->me_list, &me->me_list); + list_add(&new_me->me_list, &current_me->me_list); - ptl_me2handle(&ret->handle_out, new); + ptl_me2handle(handle, nal, new_me); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return ret->rc = PTL_OK; + return PTL_OK; } -int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_me_unlink (nal_t *apinal, ptl_handle_me_t *meh) { - PtlMEUnlink_in *args = v_args; - PtlMEUnlink_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; unsigned long flags; - lib_me_t *me; + lib_me_t *me; + int rc; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me = ptl_handle2me(&args->current_in, nal); + me = ptl_handle2me(meh, nal); if (me == NULL) { - ret->rc = PTL_ME_INVALID; + rc = PTL_ME_INVALID; } else { lib_me_unlink(nal, me); - ret->rc = PTL_OK; + rc = PTL_OK; } - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return (ret->rc); + return (rc); } /* call with state_lock please */ -void lib_me_unlink(nal_cb_t *nal, lib_me_t *me) +void +lib_me_unlink(lib_nal_t *nal, lib_me_t *me) { list_del (&me->me_list); @@ -157,64 +166,20 @@ void lib_me_unlink(nal_cb_t *nal, 
lib_me_t *me) lib_me_free(nal, me); } -int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +#if 0 +static void +lib_me_dump(lib_nal_t *nal, lib_me_t * me) { - PtlTblDump_in *args = v_args; - PtlTblDump_out *ret = v_ret; - lib_ptl_t *tbl = &nal->ni.tbl; - ptl_handle_any_t handle; - struct list_head *tmp; - unsigned long flags; + CWARN("Match Entry %p ("LPX64")\n", me, + me->me_lh.lh_cookie); - if (args->index_in < 0 || args->index_in >= tbl->size) - return ret->rc = PTL_PT_INDEX_INVALID; - - nal->cb_printf(nal, "Portal table index %d\n", args->index_in); - - state_lock(nal, &flags); - list_for_each(tmp, &(tbl->tbl[args->index_in])) { - lib_me_t *me = list_entry(tmp, lib_me_t, me_list); - ptl_me2handle(&handle, me); - lib_me_dump(nal, me); - } - state_unlock(nal, &flags); + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->match_bits, me->ignore_bits); - return ret->rc = PTL_OK; -} - -int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) -{ - PtlMEDump_in *args = v_args; - PtlMEDump_out *ret = v_ret; - lib_me_t *me; - unsigned long flags; - - state_lock(nal, &flags); - - me = ptl_handle2me(&args->current_in, nal); - if (me == NULL) { - ret->rc = PTL_ME_INVALID; - } else { - lib_me_dump(nal, me); - ret->rc = PTL_OK; - } - - state_unlock(nal, &flags); - - return ret->rc; -} - -static void lib_me_dump(nal_cb_t * nal, lib_me_t * me) -{ - nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, - me->me_lh.lh_cookie); - - nal->cb_printf(nal, "\tMatch/Ignore\t= %016lx / %016lx\n", - me->match_bits, me->ignore_bits); - - nal->cb_printf(nal, "\tMD\t= %p\n", me->md); - nal->cb_printf(nal, "\tprev\t= %p\n", - list_entry(me->me_list.prev, lib_me_t, me_list)); - nal->cb_printf(nal, "\tnext\t= %p\n", - list_entry(me->me_list.next, lib_me_t, me_list)); + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, lib_me_t, me_list)); + CWARN("\tnext\t= %p\n", + list_entry(me->me_list.next, lib_me_t, me_list)); 
} +#endif diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 477ddf8..9dcc06e 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -31,20 +31,19 @@ #endif #include #include -#include /* forward ref */ -static void lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg); +static void lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg); static lib_md_t * -lib_match_md(nal_cb_t *nal, int index, int op_mask, +lib_match_md(lib_nal_t *nal, int index, int op_mask, ptl_nid_t src_nid, ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset, ptl_match_bits_t match_bits, lib_msg_t *msg, ptl_size_t *mlength_out, ptl_size_t *offset_out) { - lib_ni_t *ni = &nal->ni; - struct list_head *match_list = &ni->tbl.tbl[index]; + lib_ni_t *ni = &nal->libnal_ni; + struct list_head *match_list = &ni->ni_portals.tbl[index]; struct list_head *tmp; lib_me_t *me; lib_md_t *md; @@ -55,9 +54,9 @@ lib_match_md(nal_cb_t *nal, int index, int op_mask, CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d " "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits); - if (index < 0 || index >= ni->tbl.size) { + if (index < 0 || index >= ni->ni_portals.size) { CERROR("Invalid portal %d not in [0-%d]\n", - index, ni->tbl.size); + index, ni->ni_portals.size); goto failed; } @@ -153,66 +152,65 @@ lib_match_md(nal_cb_t *nal, int index, int op_mask, failed: CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64 " offset %d length %d: no match\n", - ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT", + ni->ni_pid.nid, (op_mask == PTL_MD_OP_GET) ? 
"GET" : "PUT", src_nid, src_pid, index, match_bits, roffset, rlength); RETURN(NULL); } -int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret) +int lib_api_fail_nid (nal_t *apinal, ptl_nid_t nid, unsigned int threshold) { - PtlFailNid_in *args = v_args; - PtlFailNid_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; lib_test_peer_t *tp; unsigned long flags; struct list_head *el; struct list_head *next; struct list_head cull; - if (args->threshold != 0) { + if (threshold != 0) { /* Adding a new entry */ - tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp)); + PORTAL_ALLOC(tp, sizeof(*tp)); if (tp == NULL) - return (ret->rc = PTL_FAIL); + return PTL_NO_SPACE; - tp->tp_nid = args->nid; - tp->tp_threshold = args->threshold; + tp->tp_nid = nid; + tp->tp_threshold = threshold; - state_lock (nal, &flags); - list_add (&tp->tp_list, &nal->ni.ni_test_peers); - state_unlock (nal, &flags); - return (ret->rc = PTL_OK); + LIB_LOCK(nal, flags); + list_add_tail (&tp->tp_list, &nal->libnal_ni.ni_test_peers); + LIB_UNLOCK(nal, flags); + return PTL_OK; } /* removing entries */ INIT_LIST_HEAD (&cull); - state_lock (nal, &flags); + LIB_LOCK(nal, flags); - list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) { tp = list_entry (el, lib_test_peer_t, tp_list); if (tp->tp_threshold == 0 || /* needs culling anyway */ - args->nid == PTL_NID_ANY || /* removing all entries */ - tp->tp_nid == args->nid) /* matched this one */ + nid == PTL_NID_ANY || /* removing all entries */ + tp->tp_nid == nid) /* matched this one */ { list_del (&tp->tp_list); list_add (&tp->tp_list, &cull); } } - state_unlock (nal, &flags); + LIB_UNLOCK(nal, flags); while (!list_empty (&cull)) { tp = list_entry (cull.next, lib_test_peer_t, tp_list); list_del (&tp->tp_list); - nal->cb_free (nal, tp, sizeof (*tp)); + PORTAL_FREE(tp, sizeof (*tp)); } - return (ret->rc = PTL_OK); + return PTL_OK; } static int -fail_peer (nal_cb_t 
*nal, ptl_nid_t nid, int outgoing) +fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing) { lib_test_peer_t *tp; struct list_head *el; @@ -223,9 +221,9 @@ fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) INIT_LIST_HEAD (&cull); - state_lock (nal, &flags); + LIB_LOCK (nal, flags); - list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) { tp = list_entry (el, lib_test_peer_t, tp_list); if (tp->tp_threshold == 0) { @@ -257,13 +255,13 @@ fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) } } - state_unlock (nal, &flags); + LIB_UNLOCK (nal, flags); while (!list_empty (&cull)) { tp = list_entry (cull.next, lib_test_peer_t, tp_list); list_del (&tp->tp_list); - nal->cb_free (nal, tp, sizeof (*tp)); + PORTAL_FREE(tp, sizeof (*tp)); } return (fail); @@ -554,52 +552,52 @@ lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, #endif ptl_err_t -lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, +lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) { if (mlen == 0) - return (nal->cb_recv(nal, private, msg, - 0, NULL, - offset, mlen, rlen)); + return (nal->libnal_recv(nal, private, msg, + 0, NULL, + offset, mlen, rlen)); if ((md->options & PTL_MD_KIOV) == 0) - return (nal->cb_recv(nal, private, msg, - md->md_niov, md->md_iov.iov, - offset, mlen, rlen)); + return (nal->libnal_recv(nal, private, msg, + md->md_niov, md->md_iov.iov, + offset, mlen, rlen)); - return (nal->cb_recv_pages(nal, private, msg, - md->md_niov, md->md_iov.kiov, - offset, mlen, rlen)); + return (nal->libnal_recv_pages(nal, private, msg, + md->md_niov, md->md_iov.kiov, + offset, mlen, rlen)); } ptl_err_t -lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, +lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, lib_md_t *md, ptl_size_t offset, ptl_size_t len) { if (len == 0) - return 
(nal->cb_send(nal, private, msg, - hdr, type, nid, pid, - 0, NULL, - offset, len)); + return (nal->libnal_send(nal, private, msg, + hdr, type, nid, pid, + 0, NULL, + offset, len)); if ((md->options & PTL_MD_KIOV) == 0) - return (nal->cb_send(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.iov, - offset, len)); - - return (nal->cb_send_pages(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.kiov, - offset, len)); + return (nal->libnal_send(nal, private, msg, + hdr, type, nid, pid, + md->md_niov, md->md_iov.iov, + offset, len)); + + return (nal->libnal_send_pages(nal, private, msg, + hdr, type, nid, pid, + md->md_niov, md->md_iov.kiov, + offset, len)); } static void -lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg) +lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg) { - /* ALWAYS called holding the state_lock */ - lib_counters_t *counters = &nal->ni.counters; + /* ALWAYS called holding the LIB_LOCK */ + lib_counters_t *counters = &nal->libnal_ni.ni_counters; /* Here, we commit the MD to a network OP by marking it busy and * decrementing its threshold. Come what may, the network "owns" @@ -616,11 +614,11 @@ lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg) if (counters->msgs_alloc > counters->msgs_max) counters->msgs_max = counters->msgs_alloc; - list_add (&msg->msg_list, &nal->ni.ni_active_msgs); + list_add (&msg->msg_list, &nal->libnal_ni.ni_active_msgs); } static void -lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) +lib_drop_message (lib_nal_t *nal, void *private, ptl_hdr_t *hdr) { unsigned long flags; @@ -628,10 +626,10 @@ lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) * to receive (init_msg() not called) and therefore can't cause an * event. 
*/ - state_lock(nal, &flags); - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += hdr->payload_length; - state_unlock(nal, &flags); + LIB_LOCK(nal, flags); + nal->libnal_ni.ni_counters.drop_count++; + nal->libnal_ni.ni_counters.drop_length += hdr->payload_length; + LIB_UNLOCK(nal, flags); /* NULL msg => if NAL calls lib_finalize it will be a noop */ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); @@ -645,9 +643,9 @@ lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) * */ static ptl_err_t -parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) +parse_put(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; ptl_size_t mlength = 0; ptl_size_t offset = 0; ptl_err_t rc; @@ -659,7 +657,7 @@ parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index); hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset); - state_lock(nal, &flags); + LIB_LOCK(nal, flags); md = lib_match_md(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT, hdr->src_nid, hdr->src_pid, @@ -667,7 +665,7 @@ parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->msg.put.match_bits, msg, &mlength, &offset); if (md == NULL) { - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } @@ -679,24 +677,24 @@ parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) msg->ack_wmd = hdr->msg.put.ack_wmd; } - ni->counters.recv_count++; - ni->counters.recv_length += mlength; + ni->ni_counters.recv_count++; + ni->ni_counters.recv_length += mlength; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); rc = lib_recv(nal, private, msg, md, offset, mlength, hdr->payload_length); if (rc != PTL_OK) CERROR(LPU64": error on receiving PUT from "LPU64": %d\n", - ni->nid, hdr->src_nid, rc); + ni->ni_pid.nid, hdr->src_nid, rc); return (rc); } static ptl_err_t 
-parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) +parse_get(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; ptl_size_t mlength = 0; ptl_size_t offset = 0; lib_md_t *md; @@ -710,7 +708,7 @@ parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length); hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset); - state_lock(nal, &flags); + LIB_LOCK(nal, flags); md = lib_match_md(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET, hdr->src_nid, hdr->src_pid, @@ -718,24 +716,24 @@ parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->msg.get.match_bits, msg, &mlength, &offset); if (md == NULL) { - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } msg->ev.type = PTL_EVENT_GET_END; msg->ev.hdr_data = 0; - ni->counters.send_count++; - ni->counters.send_length += mlength; + ni->ni_counters.send_count++; + ni->ni_counters.send_length += mlength; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); memset (&reply, 0, sizeof (reply)); reply.type = HTON__u32 (PTL_MSG_REPLY); reply.dest_nid = HTON__u64 (hdr->src_nid); - reply.src_nid = HTON__u64 (ni->nid); reply.dest_pid = HTON__u32 (hdr->src_pid); - reply.src_pid = HTON__u32 (ni->pid); + reply.src_nid = HTON__u64 (ni->ni_pid.nid); + reply.src_pid = HTON__u32 (ni->ni_pid.pid); reply.payload_length = HTON__u32 (mlength); reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd; @@ -747,7 +745,7 @@ parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->src_nid, hdr->src_pid, md, offset, mlength); if (rc != PTL_OK) CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n", - ni->nid, hdr->src_nid, rc); + ni->ni_pid.nid, hdr->src_nid, rc); /* Discard any junk after the hdr */ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); @@ -756,27 +754,27 @@ parse_get(nal_cb_t *nal, 
ptl_hdr_t *hdr, void *private, lib_msg_t *msg) } static ptl_err_t -parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) +parse_reply(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; lib_md_t *md; int rlength; int length; unsigned long flags; ptl_err_t rc; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); /* NB handles only looked up by creator (no flips) */ md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal); if (md == NULL || md->threshold == 0) { CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n", - ni->nid, hdr->src_nid, + ni->ni_pid.nid, hdr->src_nid, md == NULL ? "invalid" : "inactive", hdr->msg.reply.dst_wmd.wh_interface_cookie, hdr->msg.reply.dst_wmd.wh_object_cookie); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } @@ -788,10 +786,10 @@ parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) if ((md->options & PTL_MD_TRUNCATE) == 0) { CERROR (LPU64": Dropping REPLY from "LPU64 " length %d for MD "LPX64" would overflow (%d)\n", - ni->nid, hdr->src_nid, length, + ni->ni_pid.nid, hdr->src_nid, length, hdr->msg.reply.dst_wmd.wh_object_cookie, md->length); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } length = md->length; @@ -812,23 +810,23 @@ parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - ni->counters.recv_count++; - ni->counters.recv_length += length; + ni->ni_counters.recv_count++; + ni->ni_counters.recv_length += length; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); rc = lib_recv(nal, private, msg, md, 0, length, rlength); if (rc != PTL_OK) CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n", - ni->nid, hdr->src_nid, rc); + ni->ni_pid.nid, hdr->src_nid, rc); return (rc); } static ptl_err_t -parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) 
+parse_ack(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; lib_md_t *md; unsigned long flags; @@ -836,23 +834,23 @@ parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits); hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength); - state_lock(nal, &flags); + LIB_LOCK(nal, flags); /* NB handles only looked up by creator (no flips) */ md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal); if (md == NULL || md->threshold == 0) { CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD " - LPX64"."LPX64"\n", ni->nid, hdr->src_nid, + LPX64"."LPX64"\n", ni->ni_pid.nid, hdr->src_nid, (md == NULL) ? "invalid" : "inactive", hdr->msg.ack.dst_wmd.wh_interface_cookie, hdr->msg.ack.dst_wmd.wh_object_cookie); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", - ni->nid, hdr->src_nid, + ni->ni_pid.nid, hdr->src_nid, hdr->msg.ack.dst_wmd.wh_object_cookie); lib_commit_md(nal, md, msg); @@ -865,9 +863,9 @@ parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - ni->counters.recv_count++; + ni->ni_counters.recv_count++; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); /* We have received and matched up the ack OK, create the * completion event now... 
*/ @@ -898,125 +896,152 @@ hdr_type_string (ptl_hdr_t *hdr) } } -void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr) +void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr) { char *type_str = hdr_type_string (hdr); - nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str); - nal->cb_printf(nal, " From nid/pid %Lu/%Lu", hdr->src_nid, - hdr->src_pid); - nal->cb_printf(nal, " To nid/pid %Lu/%Lu\n", hdr->dest_nid, - hdr->dest_pid); + CWARN("P3 Header at %p of type %s\n", hdr, type_str); + CWARN(" From nid/pid "LPX64"/%u", hdr->src_nid, hdr->src_pid); + CWARN(" To nid/pid "LPX64"/%u\n", hdr->dest_nid, hdr->dest_pid); switch (hdr->type) { default: break; case PTL_MSG_PUT: - nal->cb_printf(nal, - " Ptl index %d, ack md "LPX64"."LPX64", " - "match bits "LPX64"\n", - hdr->msg.put.ptl_index, - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - hdr->msg.put.match_bits); - nal->cb_printf(nal, - " Length %d, offset %d, hdr data "LPX64"\n", - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.hdr_data); + CWARN(" Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPX64"\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + CWARN(" Length %d, offset %d, hdr data "LPX64"\n", + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.hdr_data); break; case PTL_MSG_GET: - nal->cb_printf(nal, - " Ptl index %d, return md "LPX64"."LPX64", " - "match bits "LPX64"\n", hdr->msg.get.ptl_index, - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - nal->cb_printf(nal, - " Length %d, src offset %d\n", - hdr->msg.get.sink_length, - hdr->msg.get.src_offset); + CWARN(" Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPX64"\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CWARN(" Length %d, 
src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); break; case PTL_MSG_ACK: - nal->cb_printf(nal, " dst md "LPX64"."LPX64", " - "manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - hdr->msg.ack.mlength); + CWARN(" dst md "LPX64"."LPX64", " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); break; case PTL_MSG_REPLY: - nal->cb_printf(nal, " dst md "LPX64"."LPX64", " - "length %d\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie, - hdr->payload_length); + CWARN(" dst md "LPX64"."LPX64", " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + hdr->payload_length); } } /* end of print_hdr() */ -void -lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) +ptl_err_t +lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, void *private) { unsigned long flags; ptl_err_t rc; lib_msg_t *msg; + + /* NB we return PTL_OK if we manage to parse the header and believe + * it looks OK. 
Anything that goes wrong with receiving the + * message after that point is the responsibility of the NAL */ /* convert common fields to host byte order */ - hdr->dest_nid = NTOH__u64 (hdr->dest_nid); + hdr->type = NTOH__u32 (hdr->type); hdr->src_nid = NTOH__u64 (hdr->src_nid); - hdr->dest_pid = NTOH__u32 (hdr->dest_pid); hdr->src_pid = NTOH__u32 (hdr->src_pid); - hdr->type = NTOH__u32 (hdr->type); + hdr->dest_pid = NTOH__u32 (hdr->dest_pid); hdr->payload_length = NTOH__u32(hdr->payload_length); -#if 0 - nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n", - nal->ni.nid, nal, hdr, hdr->type); - print_hdr(nal, hdr); -#endif - if (hdr->type == PTL_MSG_HELLO) { + + switch (hdr->type) { + case PTL_MSG_HELLO: { /* dest_nid is really ptl_magicversion_t */ ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid; - CERROR (LPU64": Dropping unexpected HELLO message: " + mv->magic = NTOH__u32(mv->magic); + mv->version_major = NTOH__u16(mv->version_major); + mv->version_minor = NTOH__u16(mv->version_minor); + + if (mv->magic == PORTALS_PROTO_MAGIC && + mv->version_major == PORTALS_PROTO_VERSION_MAJOR && + mv->version_minor == PORTALS_PROTO_VERSION_MINOR) { + CWARN (LPU64": Dropping unexpected HELLO message: " + "magic %d, version %d.%d from "LPD64"\n", + nal->libnal_ni.ni_pid.nid, mv->magic, + mv->version_major, mv->version_minor, + hdr->src_nid); + + /* it's good but we don't want it */ + lib_drop_message(nal, private, hdr); + return PTL_OK; + } + + /* we got garbage */ + CERROR (LPU64": Bad HELLO message: " "magic %d, version %d.%d from "LPD64"\n", - nal->ni.nid, mv->magic, + nal->libnal_ni.ni_pid.nid, mv->magic, mv->version_major, mv->version_minor, hdr->src_nid); - lib_drop_message(nal, private, hdr); - return; + return PTL_FAIL; } - - if (hdr->dest_nid != nal->ni.nid) { - CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64 - " (not me)\n", nal->ni.nid, hdr_type_string (hdr), - hdr->src_nid, hdr->dest_nid); - lib_drop_message(nal, private, hdr); - 
return; + + case PTL_MSG_ACK: + case PTL_MSG_PUT: + case PTL_MSG_GET: + case PTL_MSG_REPLY: + hdr->dest_nid = NTOH__u64 (hdr->dest_nid); + if (hdr->dest_nid != nal->libnal_ni.ni_pid.nid) { + CERROR(LPU64": BAD dest NID in %s message from" + LPU64" to "LPU64" (not me)\n", + nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), + hdr->src_nid, hdr->dest_nid); + return PTL_FAIL; + } + break; + + default: + CERROR(LPU64": Bad message type 0x%x from "LPU64"\n", + nal->libnal_ni.ni_pid.nid, hdr->type, hdr->src_nid); + return PTL_FAIL; } - if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + /* We've decided we're not receiving garbage since we can parse the + * header. We will return PTL_OK come what may... */ + + if (!list_empty (&nal->libnal_ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, hdr->src_nid, 0)) /* shall we now? */ { CERROR(LPU64": Dropping incoming %s from "LPU64 ": simulated failure\n", - nal->ni.nid, hdr_type_string (hdr), + nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), hdr->src_nid); lib_drop_message(nal, private, hdr); - return; + return PTL_OK; } msg = lib_msg_alloc(nal); if (msg == NULL) { CERROR(LPU64": Dropping incoming %s from "LPU64 ": can't allocate a lib_msg_t\n", - nal->ni.nid, hdr_type_string (hdr), + nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), hdr->src_nid); lib_drop_message(nal, private, hdr); - return; + return PTL_OK; } switch (hdr->type) { @@ -1033,10 +1058,8 @@ lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) rc = parse_reply(nal, hdr, private, msg); break; default: - CERROR(LPU64": Dropping message from "LPU64 - ": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid, - hdr->type); - rc = PTL_FAIL; + LASSERT(0); + rc = PTL_FAIL; /* no compiler warning please */ break; } @@ -1045,123 +1068,114 @@ lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) /* committed... 
*/ lib_finalize(nal, private, msg, rc); } else { - state_lock(nal, &flags); - lib_msg_free(nal, msg); /* expects state_lock held */ - state_unlock(nal, &flags); + LIB_LOCK(nal, flags); + lib_msg_free(nal, msg); /* expects LIB_LOCK held */ + LIB_UNLOCK(nal, flags); lib_drop_message(nal, private, hdr); } } + + return PTL_OK; + /* That's "OK I can parse it", not "OK I like it" :) */ } int -do_PtlPut(nal_cb_t *nal, void *private, void *v_args, void *v_ret) +lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_ack_req_t ack, ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, + ptl_size_t offset, ptl_hdr_data_t hdr_data) { - /* - * Incoming: - * ptl_handle_md_t md_in - * ptl_ack_req_t ack_req_in - * ptl_process_id_t target_in - * ptl_pt_index_t portal_in - * ptl_ac_index_t cookie_in - * ptl_match_bits_t match_bits_in - * ptl_size_t offset_in - * - * Outgoing: - */ - - PtlPut_in *args = v_args; - ptl_process_id_t *id = &args->target_in; - PtlPut_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; lib_msg_t *msg; ptl_hdr_t hdr; lib_md_t *md; unsigned long flags; int rc; - if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + if (!list_empty (&ni->ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? 
*/ { - CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", - nal->ni.nid, id->nid); - return (ret->rc = PTL_PROCESS_INVALID); + CERROR("Dropping PUT to "LPU64": simulated failure\n", + id->nid); + return PTL_PROCESS_INVALID; } msg = lib_msg_alloc(nal); if (msg == NULL) { CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n", - ni->nid, id->nid); - return (ret->rc = PTL_NO_SPACE); + ni->ni_pid.nid, id->nid); + return PTL_NO_SPACE; } - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL || md->threshold == 0) { lib_msg_free(nal, msg); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return (ret->rc = PTL_MD_INVALID); + return PTL_MD_INVALID; } - CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid, - (unsigned long)id->pid); + CDEBUG(D_NET, "PtlPut -> "LPX64"\n", id->nid); memset (&hdr, 0, sizeof (hdr)); hdr.type = HTON__u32 (PTL_MSG_PUT); hdr.dest_nid = HTON__u64 (id->nid); - hdr.src_nid = HTON__u64 (ni->nid); hdr.dest_pid = HTON__u32 (id->pid); - hdr.src_pid = HTON__u32 (ni->pid); + hdr.src_nid = HTON__u64 (ni->ni_pid.nid); + hdr.src_pid = HTON__u32 (ni->ni_pid.pid); hdr.payload_length = HTON__u32 (md->length); /* NB handles only looked up by creator (no flips) */ - if (args->ack_req_in == PTL_ACK_REQ) { + if (ack == PTL_ACK_REQ) { hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie; hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie; } else { hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE; } - hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in); - hdr.msg.put.ptl_index = HTON__u32 (args->portal_in); - hdr.msg.put.offset = HTON__u32 (args->offset_in); - hdr.msg.put.hdr_data = args->hdr_data_in; + hdr.msg.put.match_bits = HTON__u64 (match_bits); + hdr.msg.put.ptl_index = HTON__u32 (portal); + hdr.msg.put.offset = HTON__u32 (offset); + hdr.msg.put.hdr_data = hdr_data; lib_commit_md(nal, md, msg); msg->ev.type = 
PTL_EVENT_SEND_END; - msg->ev.initiator.nid = ni->nid; - msg->ev.initiator.pid = ni->pid; - msg->ev.portal = args->portal_in; - msg->ev.match_bits = args->match_bits_in; + msg->ev.initiator.nid = ni->ni_pid.nid; + msg->ev.initiator.pid = ni->ni_pid.pid; + msg->ev.portal = portal; + msg->ev.match_bits = match_bits; msg->ev.rlength = md->length; msg->ev.mlength = md->length; - msg->ev.offset = args->offset_in; - msg->ev.hdr_data = args->hdr_data_in; + msg->ev.offset = offset; + msg->ev.hdr_data = hdr_data; lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - ni->counters.send_count++; - ni->counters.send_length += md->length; + ni->ni_counters.send_count++; + ni->ni_counters.send_length += md->length; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, + rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_PUT, id->nid, id->pid, md, 0, md->length); if (rc != PTL_OK) { - CERROR(LPU64": error sending PUT to "LPU64": %d\n", - ni->nid, id->nid, rc); - lib_finalize (nal, private, msg, rc); + CERROR("Error sending PUT to "LPX64": %d\n", + id->nid, rc); + lib_finalize (nal, NULL, msg, rc); } /* completion will be signalled by an event */ - return ret->rc = PTL_OK; + return PTL_OK; } lib_msg_t * -lib_create_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) +lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) { /* The NAL can DMA direct to the GET md (i.e. no REPLY msg). 
This * returns a msg for the NAL to pass to lib_finalize() when the sink @@ -1170,12 +1184,12 @@ lib_create_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when * lib_finalize() is called on it, so the NAL must call this first */ - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; lib_msg_t *msg = lib_msg_alloc(nal); lib_md_t *getmd = getmsg->md; unsigned long flags; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); LASSERT (getmd->pending > 0); @@ -1205,72 +1219,60 @@ lib_create_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc); - ni->counters.recv_count++; - ni->counters.recv_length += getmd->length; + ni->ni_counters.recv_count++; + ni->ni_counters.recv_length += getmd->length; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return msg; drop_msg: lib_msg_free(nal, msg); drop: - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += getmd->length; + nal->libnal_ni.ni_counters.drop_count++; + nal->libnal_ni.ni_counters.drop_length += getmd->length; - state_unlock (nal, &flags); + LIB_UNLOCK (nal, flags); return NULL; } int -do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret) +lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, ptl_size_t offset) { - /* - * Incoming: - * ptl_handle_md_t md_in - * ptl_process_id_t target_in - * ptl_pt_index_t portal_in - * ptl_ac_index_t cookie_in - * ptl_match_bits_t match_bits_in - * ptl_size_t offset_in - * - * Outgoing: - */ - - PtlGet_in *args = v_args; - ptl_process_id_t *id = &args->target_in; - PtlGet_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; lib_msg_t *msg; ptl_hdr_t hdr; lib_md_t *md; unsigned long flags; int rc; - if (!list_empty (&nal->ni.ni_test_peers) && /* normally we 
don't */ + if (!list_empty (&ni->ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? */ { - CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", - nal->ni.nid, id->nid); - return (ret->rc = PTL_PROCESS_INVALID); + CERROR("Dropping PUT to "LPX64": simulated failure\n", + id->nid); + return PTL_PROCESS_INVALID; } msg = lib_msg_alloc(nal); if (msg == NULL) { - CERROR(LPU64": Dropping GET to "LPU64": ENOMEM on lib_msg_t\n", - ni->nid, id->nid); - return (ret->rc = PTL_NO_SPACE); + CERROR("Dropping GET to "LPU64": ENOMEM on lib_msg_t\n", + id->nid); + return PTL_NO_SPACE; } - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL || !md->threshold) { lib_msg_free(nal, msg); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return ret->rc = PTL_MD_INVALID; + return PTL_MD_INVALID; } CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, @@ -1279,48 +1281,47 @@ do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret) memset (&hdr, 0, sizeof (hdr)); hdr.type = HTON__u32 (PTL_MSG_GET); hdr.dest_nid = HTON__u64 (id->nid); - hdr.src_nid = HTON__u64 (ni->nid); hdr.dest_pid = HTON__u32 (id->pid); - hdr.src_pid = HTON__u32 (ni->pid); + hdr.src_nid = HTON__u64 (ni->ni_pid.nid); + hdr.src_pid = HTON__u32 (ni->ni_pid.pid); hdr.payload_length = 0; /* NB handles only looked up by creator (no flips) */ hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie; hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie; - hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in); - hdr.msg.get.ptl_index = HTON__u32 (args->portal_in); - hdr.msg.get.src_offset = HTON__u32 (args->offset_in); + hdr.msg.get.match_bits = HTON__u64 (match_bits); + hdr.msg.get.ptl_index = HTON__u32 (portal); + hdr.msg.get.src_offset = HTON__u32 (offset); hdr.msg.get.sink_length = HTON__u32 (md->length); lib_commit_md(nal, md, msg); msg->ev.type 
= PTL_EVENT_SEND_END; - msg->ev.initiator.nid = ni->nid; - msg->ev.initiator.pid = ni->pid; - msg->ev.portal = args->portal_in; - msg->ev.match_bits = args->match_bits_in; + msg->ev.initiator = ni->ni_pid; + msg->ev.portal = portal; + msg->ev.match_bits = match_bits; msg->ev.rlength = md->length; msg->ev.mlength = md->length; - msg->ev.offset = args->offset_in; + msg->ev.offset = offset; msg->ev.hdr_data = 0; lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - ni->counters.send_count++; + ni->ni_counters.send_count++; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET, + rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_GET, id->nid, id->pid, NULL, 0, 0); if (rc != PTL_OK) { CERROR(LPU64": error sending GET to "LPU64": %d\n", - ni->nid, id->nid, rc); - lib_finalize (nal, private, msg, rc); + ni->ni_pid.nid, id->nid, rc); + lib_finalize (nal, NULL, msg, rc); } /* completion will be signalled by an event */ - return ret->rc = PTL_OK; + return PTL_OK; } void lib_assert_wire_constants (void) diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 1b69533..328b8d8 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -33,55 +33,39 @@ #include void -lib_enq_event_locked (nal_cb_t *nal, void *private, +lib_enq_event_locked (lib_nal_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev) { ptl_event_t *eq_slot; - int rc; - ev->sequence = eq->sequence++; /* Allocate the next queue slot */ - - /* size must be a power of 2 to handle a wrapped sequence # */ - LASSERT (eq->size != 0 && - eq->size == LOWEST_BIT_SET (eq->size)); - eq_slot = eq->base + (ev->sequence & (eq->size - 1)); + ev->sequence = eq->eq_enq_seq++; /* Allocate the next queue slot */ - /* Copy the event into the allocated slot, ensuring all the rest of - * the event's contents have been copied _before_ the sequence - * number gets updated. A processes 'getting' an event waits on - * the next queue slot's sequence to be 'new'. 
When it is, _all_ - * other event fields had better be consistent. I assert - * 'sequence' is the last member, so I only need a 2 stage copy. */ + /* size must be a power of 2 to handle sequence # overflow */ + LASSERT (eq->eq_size != 0 && + eq->eq_size == LOWEST_BIT_SET (eq->eq_size)); + eq_slot = eq->eq_events + (ev->sequence & (eq->eq_size - 1)); - LASSERT(sizeof (ptl_event_t) == - offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); + /* There is no race since both event consumers and event producers + * take the LIB_LOCK(), so we don't screw around with memory + * barriers, setting the sequence number last or wierd structure + * layout assertions. */ + *eq_slot = *ev; - rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, - offsetof (ptl_event_t, sequence)); - LASSERT (rc == PTL_OK); + /* Call the callback handler (if any) */ + if (eq->eq_callback != NULL) + eq->eq_callback (eq_slot); + /* Wake anyone sleeping for an event (see lib-eq.c) */ #ifdef __KERNEL__ - barrier(); -#endif - /* Updating the sequence number is what makes the event 'new' NB if - * the cb_write below isn't atomic, this could cause a race with - * PtlEQGet */ - rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, - (void *)&ev->sequence,sizeof (ev->sequence)); - LASSERT (rc == PTL_OK); - -#ifdef __KERNEL__ - barrier(); + if (waitqueue_active(&nal->libnal_ni.ni_waitq)) + wake_up_all(&nal->libnal_ni.ni_waitq); +#else + pthread_cond_broadcast(&nal->libnal_ni.ni_cond); #endif - - if (nal->cb_callback != NULL) - nal->cb_callback(nal, private, eq, ev); - else if (eq->event_callback != NULL) - eq->event_callback(ev); } void -lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) +lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) { lib_md_t *md; int unlink; @@ -101,9 +85,9 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) memset (&ack, 0, sizeof (ack)); ack.type = HTON__u32 (PTL_MSG_ACK); ack.dest_nid 
= HTON__u64 (msg->ev.initiator.nid); - ack.src_nid = HTON__u64 (nal->ni.nid); ack.dest_pid = HTON__u32 (msg->ev.initiator.pid); - ack.src_pid = HTON__u32 (nal->ni.pid); + ack.src_nid = HTON__u64 (nal->libnal_ni.ni_pid.nid); + ack.src_pid = HTON__u32 (nal->libnal_ni.ni_pid.pid); ack.payload_length = 0; ack.msg.ack.dst_wmd = msg->ack_wmd; @@ -122,7 +106,7 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) md = msg->md; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); /* Now it's safe to drop my caller's ref */ md->pending--; @@ -148,8 +132,8 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) lib_md_unlink(nal, md); list_del (&msg->msg_list); - nal->ni.counters.msgs_alloc--; + nal->libnal_ni.ni_counters.msgs_alloc--; lib_msg_free(nal, msg); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); } diff --git a/lnet/lnet/lib-ni.c b/lnet/lnet/lib-ni.c index aa959fc..0f298a0 100644 --- a/lnet/lnet/lib-ni.c +++ b/lnet/lnet/lib-ni.c @@ -25,92 +25,48 @@ #define DEBUG_SUBSYSTEM S_PORTALS #include -#include #define MAX_DIST 18446744073709551615ULL -int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int lib_api_ni_status (nal_t *apinal, ptl_sr_index_t sr_idx, + ptl_sr_value_t *status) { - /* - * Incoming: - * ptl_handle_ni_t interface_in - * ptl_sr_index_t register_in - * - * Outgoing: - * ptl_sr_value_t * status_out - */ - - PtlNIStatus_in *args = v_args; - PtlNIStatus_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; - lib_counters_t *count = &ni->counters; - - if (!args) - return ret->rc = PTL_SEGV; - - ret->rc = PTL_OK; - ret->status_out = 0; - - /* - * I hate this sort of code.... Hash tables, offset lists? - * Treat the counters as an array of ints? 
- */ - if (args->register_in == PTL_SR_DROP_COUNT) - ret->status_out = count->drop_count; - - else if (args->register_in == PTL_SR_DROP_LENGTH) - ret->status_out = count->drop_length; - - else if (args->register_in == PTL_SR_RECV_COUNT) - ret->status_out = count->recv_count; - - else if (args->register_in == PTL_SR_RECV_LENGTH) - ret->status_out = count->recv_length; - - else if (args->register_in == PTL_SR_SEND_COUNT) - ret->status_out = count->send_count; - - else if (args->register_in == PTL_SR_SEND_LENGTH) - ret->status_out = count->send_length; - - else if (args->register_in == PTL_SR_MSGS_MAX) - ret->status_out = count->msgs_max; - else - ret->rc = PTL_SR_INDEX_INVALID; - - return ret->rc; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; + lib_counters_t *count = &ni->ni_counters; + + switch (sr_idx) { + case PTL_SR_DROP_COUNT: + *status = count->drop_count; + return PTL_OK; + case PTL_SR_DROP_LENGTH: + *status = count->drop_length; + return PTL_OK; + case PTL_SR_RECV_COUNT: + *status = count->recv_count; + return PTL_OK; + case PTL_SR_RECV_LENGTH: + *status = count->recv_length; + return PTL_OK; + case PTL_SR_SEND_COUNT: + *status = count->send_count; + return PTL_OK; + case PTL_SR_SEND_LENGTH: + *status = count->send_length; + return PTL_OK; + case PTL_SR_MSGS_MAX: + *status = count->msgs_max; + return PTL_OK; + default: + *status = 0; + return PTL_SR_INDEX_INVALID; + } } -int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int lib_api_ni_dist (nal_t *apinal, ptl_process_id_t *pid, unsigned long *dist) { - /* - * Incoming: - * ptl_handle_ni_t interface_in - * ptl_process_id_t process_in - - * - * Outgoing: - * unsigned long * distance_out - - */ - - PtlNIDist_in *args = v_args; - PtlNIDist_out *ret = v_ret; - - unsigned long dist; - ptl_process_id_t id_in = args->process_in; - ptl_nid_t nid; - int rc; - - nid = id_in.nid; - - if ((rc = nal->cb_dist(nal, nid, &dist)) != 0) { - ret->distance_out = (unsigned long) 
MAX_DIST; - return PTL_PROCESS_INVALID; - } - - ret->distance_out = dist; + lib_nal_t *nal = apinal->nal_data; - return ret->rc = PTL_OK; + return (nal->libnal_dist(nal, pid->nid, dist)); } diff --git a/lnet/lnet/lib-pid.c b/lnet/lnet/lib-pid.c index 12eebb5..ff2a601 100644 --- a/lnet/lnet/lib-pid.c +++ b/lnet/lnet/lib-pid.c @@ -35,24 +35,12 @@ extern int getpid(void); # include #endif #include -#include -int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_get_id(nal_t *apinal, ptl_process_id_t *pid) { - /* - * Incoming: - * ptl_handle_ni_t handle_in - * - * Outgoing: - * ptl_process_id_t * id_out - * ptl_id_t * gsize_out - */ - - PtlGetId_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; - - ret->id_out.nid = ni->nid; - ret->id_out.pid = ni->pid; - - return ret->rc = PTL_OK; + lib_nal_t *nal = apinal->nal_data; + + *pid = nal->libnal_ni.ni_pid; + return PTL_OK; } diff --git a/lnet/lnet/module.c b/lnet/lnet/module.c index 40e9da4..5615a724 100644 --- a/lnet/lnet/module.c +++ b/lnet/lnet/module.c @@ -160,7 +160,6 @@ EXPORT_SYMBOL(ptl_register_nal); EXPORT_SYMBOL(ptl_unregister_nal); EXPORT_SYMBOL(ptl_err_str); -EXPORT_SYMBOL(lib_dispatch); EXPORT_SYMBOL(PtlMEAttach); EXPORT_SYMBOL(PtlMEInsert); EXPORT_SYMBOL(PtlMEUnlink); @@ -192,7 +191,6 @@ EXPORT_SYMBOL(lib_parse); EXPORT_SYMBOL(lib_create_reply_msg); EXPORT_SYMBOL(lib_init); EXPORT_SYMBOL(lib_fini); -EXPORT_SYMBOL(dispatch_name); MODULE_AUTHOR("Peter J. 
Braam "); MODULE_DESCRIPTION("Portals v3.1"); diff --git a/lnet/ulnds/address.c b/lnet/ulnds/address.c index 6507924..f329e2a 100644 --- a/lnet/ulnds/address.c +++ b/lnet/ulnds/address.c @@ -91,8 +91,8 @@ void set_address(bridge t,ptl_pid_t pidrequest) int port; if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; else port=pidrequest; - t->nal_cb->ni.nid=get_node_id(); - t->nal_cb->ni.pid=port; + t->lib_nal->libnal_ni.ni_pid.nid=get_node_id(); + t->lib_nal->libnal_ni.ni_pid.pid=port; } #else @@ -120,10 +120,9 @@ void set_address(bridge t,ptl_pid_t pidrequest) in_addr = get_node_id(); t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ - t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) - << PNAL_VNODE_SHIFT) - + virtnode; - + t->lib_nal->libnal_ni.ni_pid.nid = ((in_addr & PNAL_HOSTID_MASK) + << PNAL_VNODE_SHIFT) + + virtnode; pid=pidrequest; /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ #ifdef notyet @@ -141,6 +140,6 @@ void set_address(bridge t,ptl_pid_t pidrequest) return; } else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; - t->nal_cb->ni.pid=pid; + t->lib_nal->libnal_ni.ni_pid.pid=pid; } #endif diff --git a/lnet/ulnds/bridge.h b/lnet/ulnds/bridge.h index 90ce324..d2f0f2c 100644 --- a/lnet/ulnds/bridge.h +++ b/lnet/ulnds/bridge.h @@ -19,7 +19,7 @@ typedef struct bridge { int alive; - nal_cb_t *nal_cb; + lib_nal_t *lib_nal; void *lower; void *local; void (*shutdown)(struct bridge *); diff --git a/lnet/ulnds/procapi.c b/lnet/ulnds/procapi.c index e40c4b9..f3843d7 100644 --- a/lnet/ulnds/procapi.c +++ b/lnet/ulnds/procapi.c @@ -60,34 +60,6 @@ void procbridge_wakeup_nal(procbridge p) syscall(SYS_write, p->notifier[0], buf, sizeof(buf)); } -/* Function: forward - * Arguments: nal_t *nal: pointer to my top-side nal structure - * id: the command to pass to the lower layer - * args, args_len:pointer to and length of the request - * ret, ret_len: pointer to and size of the result - * Returns: a 
portals status code - * - * forwards a packaged api call from the 'api' side to the 'library' - * side, and collects the result - */ -static int procbridge_forward(nal_t *n, int id, void *args, size_t args_len, - void *ret, size_t ret_len) -{ - bridge b = (bridge) n->nal_data; - - if (id == PTL_FINI) { - lib_fini(b->nal_cb); - - if (b->shutdown) - (*b->shutdown)(b); - } - - lib_dispatch(b->nal_cb, NULL, id, args, ret); - - return (PTL_OK); -} - - /* Function: shutdown * Arguments: nal: a pointer to my top side nal structure * ni: my network interface index @@ -97,7 +69,8 @@ static int procbridge_forward(nal_t *n, int id, void *args, size_t args_len, */ static void procbridge_shutdown(nal_t *n) { - bridge b=(bridge)n->nal_data; + lib_nal_t *nal = n->nal_data; + bridge b=(bridge)nal->libnal_data; procbridge p=(procbridge)b->local; p->nal_flags |= NAL_FLAG_STOPPING; @@ -117,83 +90,19 @@ static void procbridge_shutdown(nal_t *n) } -static void procbridge_lock(nal_t * n, unsigned long *flags) -{ - bridge b=(bridge)n->nal_data; - procbridge p=(procbridge)b->local; - - pthread_mutex_lock(&p->mutex); -} - -static void procbridge_unlock(nal_t * n, unsigned long *flags) -{ - bridge b=(bridge)n->nal_data; - procbridge p=(procbridge)b->local; - - pthread_mutex_unlock(&p->mutex); -} - -/* Function: yield - * Arguments: pid: - * - * this function was originally intended to allow the - * lower half thread to be scheduled to allow progress. we - * overload it to explicitly block until signalled by the - * lower half. 
- */ -static int procbridge_yield(nal_t *n, unsigned long *flags, int milliseconds) -{ - bridge b=(bridge)n->nal_data; - procbridge p=(procbridge)b->local; - - if (milliseconds == 0) - return 0; - - if (milliseconds < 0) { - pthread_cond_wait(&p->cond,&p->mutex); - } else { - struct timeval then; - struct timeval now; - struct timespec timeout; - - gettimeofday(&then, NULL); - timeout.tv_sec = then.tv_sec + milliseconds/1000; - timeout.tv_nsec = then.tv_usec * 1000 + milliseconds % 1000 * 1000000; - if (timeout.tv_nsec >= 1000000000) { - timeout.tv_sec++; - timeout.tv_nsec -= 1000000000; - } - - pthread_cond_timedwait(&p->cond, &p->mutex, &timeout); - - gettimeofday(&now, NULL); - milliseconds -= (now.tv_sec - then.tv_sec) * 1000 + - (now.tv_usec - then.tv_usec) / 1000; - - if (milliseconds < 0) - milliseconds = 0; - } - - return (milliseconds); -} - /* forward decl */ extern int procbridge_startup (nal_t *, ptl_pid_t, ptl_ni_limits_t *, ptl_ni_limits_t *); /* api_nal * the interface vector to allow the generic code to access - * this nal. this is seperate from the library side nal_cb. + * this nal. this is seperate from the library side lib_nal. 
* TODO: should be dyanmically allocated */ nal_t procapi_nal = { nal_data: NULL, - startup: procbridge_startup, - shutdown: procbridge_shutdown, - forward: procbridge_forward, - yield: procbridge_yield, - lock: procbridge_lock, - unlock: procbridge_unlock + nal_ni_init: procbridge_startup, + nal_ni_fini: procbridge_shutdown, }; ptl_nid_t tcpnal_mynid; @@ -228,7 +137,6 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, b=(bridge)malloc(sizeof(struct bridge)); p=(procbridge)malloc(sizeof(struct procbridge)); - nal->nal_data=b; b->local=p; args.nia_requested_pid = requested_pid; @@ -236,6 +144,7 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, args.nia_actual_limits = actual_limits; args.nia_nal_type = nal_type; args.nia_bridge = b; + args.nia_apinal = nal; /* init procbridge */ pthread_mutex_init(&p->mutex,0); @@ -273,7 +182,7 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, if (p->nal_flags & NAL_FLAG_STOPPED) return PTL_FAIL; - b->nal_cb->ni.nid = tcpnal_mynid; + b->lib_nal->libnal_ni.ni_pid.nid = tcpnal_mynid; return PTL_OK; } diff --git a/lnet/ulnds/procbridge.h b/lnet/ulnds/procbridge.h index 1c8e7dd..1f91ced 100644 --- a/lnet/ulnds/procbridge.h +++ b/lnet/ulnds/procbridge.h @@ -30,7 +30,6 @@ typedef struct procbridge { int nal_flags; - pthread_mutex_t nal_cb_lock; } *procbridge; typedef struct nal_init_args { @@ -39,6 +38,7 @@ typedef struct nal_init_args { ptl_ni_limits_t *nia_actual_limits; int nia_nal_type; bridge nia_bridge; + nal_t *nia_apinal; } nal_init_args_t; extern void *nal_thread(void *); diff --git a/lnet/ulnds/proclib.c b/lnet/ulnds/proclib.c index af0745b..7ee7c71 100644 --- a/lnet/ulnds/proclib.c +++ b/lnet/ulnds/proclib.c @@ -43,85 +43,7 @@ /* the following functions are stubs to satisfy the nal definition without doing anything particularily useful*/ -static ptl_err_t nal_write(nal_cb_t *nal, - void *private, - user_ptr dst_addr, - void *src_addr, - size_t len) -{ - memcpy(dst_addr, src_addr, len); - 
return PTL_OK; -} - -static ptl_err_t nal_read(nal_cb_t * nal, - void *private, - void *dst_addr, - user_ptr src_addr, - size_t len) -{ - memcpy(dst_addr, src_addr, len); - return PTL_OK; -} - -static void *nal_malloc(nal_cb_t *nal, - size_t len) -{ - void *buf = malloc(len); - return buf; -} - -static void nal_free(nal_cb_t *nal, - void *buf, - size_t len) -{ - free(buf); -} - -static void nal_printf(nal_cb_t *nal, - const char *fmt, - ...) -{ - va_list ap; - - va_start(ap, fmt); - vprintf(fmt, ap); - va_end(ap); -} - - -static void nal_cli(nal_cb_t *nal, - unsigned long *flags) -{ - bridge b = (bridge) nal->nal_data; - procbridge p = (procbridge) b->local; - - pthread_mutex_lock(&p->mutex); -} - - -static void nal_sti(nal_cb_t *nal, - unsigned long *flags) -{ - bridge b = (bridge)nal->nal_data; - procbridge p = (procbridge) b->local; - - pthread_mutex_unlock(&p->mutex); -} - -static void nal_callback(nal_cb_t *nal, void *private, - lib_eq_t *eq, ptl_event_t *ev) -{ - bridge b = (bridge)nal->nal_data; - procbridge p = (procbridge) b->local; - - /* holding p->mutex */ - if (eq->event_callback != NULL) - eq->event_callback(ev); - - pthread_cond_broadcast(&p->cond); -} - -static int nal_dist(nal_cb_t *nal, +static int nal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { @@ -170,33 +92,25 @@ void *nal_thread(void *z) ptl_process_id_t process_id; int nal_type; - b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t)); - b->nal_cb->nal_data=b; - b->nal_cb->cb_read=nal_read; - b->nal_cb->cb_write=nal_write; - b->nal_cb->cb_malloc=nal_malloc; - b->nal_cb->cb_free=nal_free; - b->nal_cb->cb_map=NULL; - b->nal_cb->cb_unmap=NULL; - b->nal_cb->cb_printf=nal_printf; - b->nal_cb->cb_cli=nal_cli; - b->nal_cb->cb_sti=nal_sti; - b->nal_cb->cb_callback=nal_callback; - b->nal_cb->cb_dist=nal_dist; + b->lib_nal=(lib_nal_t *)malloc(sizeof(lib_nal_t)); + b->lib_nal->libnal_data=b; + b->lib_nal->libnal_map=NULL; + b->lib_nal->libnal_unmap=NULL; + b->lib_nal->libnal_dist=nal_dist; 
nal_type = args->nia_nal_type; - /* Wierd, but this sets b->nal_cb->ni.{nid,pid}, which lib_init() is - * about to do from the process_id passed to it...*/ + /* Wierd, but this sets b->lib_nal->libnal_ni.ni_pid.{nid,pid}, which + * lib_init() is about to do from the process_id passed to it...*/ set_address(b,args->nia_requested_pid); - process_id.pid = b->nal_cb->ni.pid; - process_id.nid = b->nal_cb->ni.nid; + process_id = b->lib_nal->libnal_ni.ni_pid; if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); /* initialize the generic 'library' level code */ - rc = lib_init(b->nal_cb, process_id, + rc = lib_init(b->lib_nal, args->nia_apinal, + process_id, args->nia_requested_limits, args->nia_actual_limits); diff --git a/lnet/ulnds/socklnd/address.c b/lnet/ulnds/socklnd/address.c index 6507924..f329e2a 100644 --- a/lnet/ulnds/socklnd/address.c +++ b/lnet/ulnds/socklnd/address.c @@ -91,8 +91,8 @@ void set_address(bridge t,ptl_pid_t pidrequest) int port; if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; else port=pidrequest; - t->nal_cb->ni.nid=get_node_id(); - t->nal_cb->ni.pid=port; + t->lib_nal->libnal_ni.ni_pid.nid=get_node_id(); + t->lib_nal->libnal_ni.ni_pid.pid=port; } #else @@ -120,10 +120,9 @@ void set_address(bridge t,ptl_pid_t pidrequest) in_addr = get_node_id(); t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ - t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) - << PNAL_VNODE_SHIFT) - + virtnode; - + t->lib_nal->libnal_ni.ni_pid.nid = ((in_addr & PNAL_HOSTID_MASK) + << PNAL_VNODE_SHIFT) + + virtnode; pid=pidrequest; /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. 
*/ #ifdef notyet @@ -141,6 +140,6 @@ void set_address(bridge t,ptl_pid_t pidrequest) return; } else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; - t->nal_cb->ni.pid=pid; + t->lib_nal->libnal_ni.ni_pid.pid=pid; } #endif diff --git a/lnet/ulnds/socklnd/bridge.h b/lnet/ulnds/socklnd/bridge.h index 90ce324..d2f0f2c 100644 --- a/lnet/ulnds/socklnd/bridge.h +++ b/lnet/ulnds/socklnd/bridge.h @@ -19,7 +19,7 @@ typedef struct bridge { int alive; - nal_cb_t *nal_cb; + lib_nal_t *lib_nal; void *lower; void *local; void (*shutdown)(struct bridge *); diff --git a/lnet/ulnds/socklnd/procapi.c b/lnet/ulnds/socklnd/procapi.c index e40c4b9..f3843d7 100644 --- a/lnet/ulnds/socklnd/procapi.c +++ b/lnet/ulnds/socklnd/procapi.c @@ -60,34 +60,6 @@ void procbridge_wakeup_nal(procbridge p) syscall(SYS_write, p->notifier[0], buf, sizeof(buf)); } -/* Function: forward - * Arguments: nal_t *nal: pointer to my top-side nal structure - * id: the command to pass to the lower layer - * args, args_len:pointer to and length of the request - * ret, ret_len: pointer to and size of the result - * Returns: a portals status code - * - * forwards a packaged api call from the 'api' side to the 'library' - * side, and collects the result - */ -static int procbridge_forward(nal_t *n, int id, void *args, size_t args_len, - void *ret, size_t ret_len) -{ - bridge b = (bridge) n->nal_data; - - if (id == PTL_FINI) { - lib_fini(b->nal_cb); - - if (b->shutdown) - (*b->shutdown)(b); - } - - lib_dispatch(b->nal_cb, NULL, id, args, ret); - - return (PTL_OK); -} - - /* Function: shutdown * Arguments: nal: a pointer to my top side nal structure * ni: my network interface index @@ -97,7 +69,8 @@ static int procbridge_forward(nal_t *n, int id, void *args, size_t args_len, */ static void procbridge_shutdown(nal_t *n) { - bridge b=(bridge)n->nal_data; + lib_nal_t *nal = n->nal_data; + bridge b=(bridge)nal->libnal_data; procbridge p=(procbridge)b->local; p->nal_flags |= NAL_FLAG_STOPPING; @@ -117,83 
+90,19 @@ static void procbridge_shutdown(nal_t *n) } -static void procbridge_lock(nal_t * n, unsigned long *flags) -{ - bridge b=(bridge)n->nal_data; - procbridge p=(procbridge)b->local; - - pthread_mutex_lock(&p->mutex); -} - -static void procbridge_unlock(nal_t * n, unsigned long *flags) -{ - bridge b=(bridge)n->nal_data; - procbridge p=(procbridge)b->local; - - pthread_mutex_unlock(&p->mutex); -} - -/* Function: yield - * Arguments: pid: - * - * this function was originally intended to allow the - * lower half thread to be scheduled to allow progress. we - * overload it to explicitly block until signalled by the - * lower half. - */ -static int procbridge_yield(nal_t *n, unsigned long *flags, int milliseconds) -{ - bridge b=(bridge)n->nal_data; - procbridge p=(procbridge)b->local; - - if (milliseconds == 0) - return 0; - - if (milliseconds < 0) { - pthread_cond_wait(&p->cond,&p->mutex); - } else { - struct timeval then; - struct timeval now; - struct timespec timeout; - - gettimeofday(&then, NULL); - timeout.tv_sec = then.tv_sec + milliseconds/1000; - timeout.tv_nsec = then.tv_usec * 1000 + milliseconds % 1000 * 1000000; - if (timeout.tv_nsec >= 1000000000) { - timeout.tv_sec++; - timeout.tv_nsec -= 1000000000; - } - - pthread_cond_timedwait(&p->cond, &p->mutex, &timeout); - - gettimeofday(&now, NULL); - milliseconds -= (now.tv_sec - then.tv_sec) * 1000 + - (now.tv_usec - then.tv_usec) / 1000; - - if (milliseconds < 0) - milliseconds = 0; - } - - return (milliseconds); -} - /* forward decl */ extern int procbridge_startup (nal_t *, ptl_pid_t, ptl_ni_limits_t *, ptl_ni_limits_t *); /* api_nal * the interface vector to allow the generic code to access - * this nal. this is seperate from the library side nal_cb. + * this nal. this is seperate from the library side lib_nal. 
* TODO: should be dyanmically allocated */ nal_t procapi_nal = { nal_data: NULL, - startup: procbridge_startup, - shutdown: procbridge_shutdown, - forward: procbridge_forward, - yield: procbridge_yield, - lock: procbridge_lock, - unlock: procbridge_unlock + nal_ni_init: procbridge_startup, + nal_ni_fini: procbridge_shutdown, }; ptl_nid_t tcpnal_mynid; @@ -228,7 +137,6 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, b=(bridge)malloc(sizeof(struct bridge)); p=(procbridge)malloc(sizeof(struct procbridge)); - nal->nal_data=b; b->local=p; args.nia_requested_pid = requested_pid; @@ -236,6 +144,7 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, args.nia_actual_limits = actual_limits; args.nia_nal_type = nal_type; args.nia_bridge = b; + args.nia_apinal = nal; /* init procbridge */ pthread_mutex_init(&p->mutex,0); @@ -273,7 +182,7 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, if (p->nal_flags & NAL_FLAG_STOPPED) return PTL_FAIL; - b->nal_cb->ni.nid = tcpnal_mynid; + b->lib_nal->libnal_ni.ni_pid.nid = tcpnal_mynid; return PTL_OK; } diff --git a/lnet/ulnds/socklnd/procbridge.h b/lnet/ulnds/socklnd/procbridge.h index 1c8e7dd..1f91ced 100644 --- a/lnet/ulnds/socklnd/procbridge.h +++ b/lnet/ulnds/socklnd/procbridge.h @@ -30,7 +30,6 @@ typedef struct procbridge { int nal_flags; - pthread_mutex_t nal_cb_lock; } *procbridge; typedef struct nal_init_args { @@ -39,6 +38,7 @@ typedef struct nal_init_args { ptl_ni_limits_t *nia_actual_limits; int nia_nal_type; bridge nia_bridge; + nal_t *nia_apinal; } nal_init_args_t; extern void *nal_thread(void *); diff --git a/lnet/ulnds/socklnd/proclib.c b/lnet/ulnds/socklnd/proclib.c index af0745b..7ee7c71 100644 --- a/lnet/ulnds/socklnd/proclib.c +++ b/lnet/ulnds/socklnd/proclib.c @@ -43,85 +43,7 @@ /* the following functions are stubs to satisfy the nal definition without doing anything particularily useful*/ -static ptl_err_t nal_write(nal_cb_t *nal, - void *private, - user_ptr dst_addr, - void 
*src_addr, - size_t len) -{ - memcpy(dst_addr, src_addr, len); - return PTL_OK; -} - -static ptl_err_t nal_read(nal_cb_t * nal, - void *private, - void *dst_addr, - user_ptr src_addr, - size_t len) -{ - memcpy(dst_addr, src_addr, len); - return PTL_OK; -} - -static void *nal_malloc(nal_cb_t *nal, - size_t len) -{ - void *buf = malloc(len); - return buf; -} - -static void nal_free(nal_cb_t *nal, - void *buf, - size_t len) -{ - free(buf); -} - -static void nal_printf(nal_cb_t *nal, - const char *fmt, - ...) -{ - va_list ap; - - va_start(ap, fmt); - vprintf(fmt, ap); - va_end(ap); -} - - -static void nal_cli(nal_cb_t *nal, - unsigned long *flags) -{ - bridge b = (bridge) nal->nal_data; - procbridge p = (procbridge) b->local; - - pthread_mutex_lock(&p->mutex); -} - - -static void nal_sti(nal_cb_t *nal, - unsigned long *flags) -{ - bridge b = (bridge)nal->nal_data; - procbridge p = (procbridge) b->local; - - pthread_mutex_unlock(&p->mutex); -} - -static void nal_callback(nal_cb_t *nal, void *private, - lib_eq_t *eq, ptl_event_t *ev) -{ - bridge b = (bridge)nal->nal_data; - procbridge p = (procbridge) b->local; - - /* holding p->mutex */ - if (eq->event_callback != NULL) - eq->event_callback(ev); - - pthread_cond_broadcast(&p->cond); -} - -static int nal_dist(nal_cb_t *nal, +static int nal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { @@ -170,33 +92,25 @@ void *nal_thread(void *z) ptl_process_id_t process_id; int nal_type; - b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t)); - b->nal_cb->nal_data=b; - b->nal_cb->cb_read=nal_read; - b->nal_cb->cb_write=nal_write; - b->nal_cb->cb_malloc=nal_malloc; - b->nal_cb->cb_free=nal_free; - b->nal_cb->cb_map=NULL; - b->nal_cb->cb_unmap=NULL; - b->nal_cb->cb_printf=nal_printf; - b->nal_cb->cb_cli=nal_cli; - b->nal_cb->cb_sti=nal_sti; - b->nal_cb->cb_callback=nal_callback; - b->nal_cb->cb_dist=nal_dist; + b->lib_nal=(lib_nal_t *)malloc(sizeof(lib_nal_t)); + b->lib_nal->libnal_data=b; + b->lib_nal->libnal_map=NULL; + 
b->lib_nal->libnal_unmap=NULL; + b->lib_nal->libnal_dist=nal_dist; nal_type = args->nia_nal_type; - /* Wierd, but this sets b->nal_cb->ni.{nid,pid}, which lib_init() is - * about to do from the process_id passed to it...*/ + /* Wierd, but this sets b->lib_nal->libnal_ni.ni_pid.{nid,pid}, which + * lib_init() is about to do from the process_id passed to it...*/ set_address(b,args->nia_requested_pid); - process_id.pid = b->nal_cb->ni.pid; - process_id.nid = b->nal_cb->ni.nid; + process_id = b->lib_nal->libnal_ni.ni_pid; if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); /* initialize the generic 'library' level code */ - rc = lib_init(b->nal_cb, process_id, + rc = lib_init(b->lib_nal, args->nia_apinal, + process_id, args->nia_requested_limits, args->nia_actual_limits); diff --git a/lnet/ulnds/socklnd/tcplnd.c b/lnet/ulnds/socklnd/tcplnd.c index 0c47f42..34a9c9d 100644 --- a/lnet/ulnds/socklnd/tcplnd.c +++ b/lnet/ulnds/socklnd/tcplnd.c @@ -55,7 +55,7 @@ * * sends a packet to the peer, after insuring that a connection exists */ -ptl_err_t tcpnal_send(nal_cb_t *n, +ptl_err_t tcpnal_send(lib_nal_t *n, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, @@ -68,7 +68,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, size_t len) { connection c; - bridge b=(bridge)n->nal_data; + bridge b=(bridge)n->libnal_data; struct iovec tiov[257]; static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER; ptl_err_t rc = PTL_OK; @@ -142,7 +142,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, /* Function: tcpnal_recv - * Arguments: nal_cb_t *nal: pointer to my nal control block + * Arguments: lib_nal_t *nal: pointer to my nal control block * void *private: connection pointer passed through * lib_parse() * lib_msg_t *cookie: passed back to portals library @@ -154,7 +154,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, * blocking read of the requested data. 
must drain out the * difference of mainpulated and requested lengths from the network */ -ptl_err_t tcpnal_recv(nal_cb_t *n, +ptl_err_t tcpnal_recv(lib_nal_t *n, void *private, lib_msg_t *cookie, unsigned int niov, @@ -217,7 +217,8 @@ static int from_connection(void *a, void *d) ptl_hdr_t hdr; if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ - lib_parse(b->nal_cb, &hdr, c); + lib_parse(b->lib_nal, &hdr, c); + /*TODO: check error status*/ return(1); } return(0); @@ -239,19 +240,19 @@ int tcpnal_init(bridge b) { manager m; - b->nal_cb->cb_send=tcpnal_send; - b->nal_cb->cb_recv=tcpnal_recv; + b->lib_nal->libnal_send=tcpnal_send; + b->lib_nal->libnal_recv=tcpnal_recv; b->shutdown=tcpnal_shutdown; - if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid, - b->nal_cb->ni.pid), + if (!(m=init_connections(PNAL_PORT(b->lib_nal->libnal_ni.ni_pid.nid, + b->lib_nal->libnal_ni.ni_pid.pid), from_connection,b))){ /* TODO: this needs to shut down the newly created junk */ return(PTL_NAL_FAILED); } /* XXX cfs hack */ - b->nal_cb->ni.pid=0; + b->lib_nal->libnal_ni.ni_pid.pid=0; b->lower=m; return(PTL_OK); } diff --git a/lnet/ulnds/tcplnd.c b/lnet/ulnds/tcplnd.c index 0c47f42..34a9c9d 100644 --- a/lnet/ulnds/tcplnd.c +++ b/lnet/ulnds/tcplnd.c @@ -55,7 +55,7 @@ * * sends a packet to the peer, after insuring that a connection exists */ -ptl_err_t tcpnal_send(nal_cb_t *n, +ptl_err_t tcpnal_send(lib_nal_t *n, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, @@ -68,7 +68,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, size_t len) { connection c; - bridge b=(bridge)n->nal_data; + bridge b=(bridge)n->libnal_data; struct iovec tiov[257]; static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER; ptl_err_t rc = PTL_OK; @@ -142,7 +142,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, /* Function: tcpnal_recv - * Arguments: nal_cb_t *nal: pointer to my nal control block + * Arguments: lib_nal_t *nal: pointer to my nal control block * void *private: connection pointer passed through * lib_parse() * 
lib_msg_t *cookie: passed back to portals library @@ -154,7 +154,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, * blocking read of the requested data. must drain out the * difference of mainpulated and requested lengths from the network */ -ptl_err_t tcpnal_recv(nal_cb_t *n, +ptl_err_t tcpnal_recv(lib_nal_t *n, void *private, lib_msg_t *cookie, unsigned int niov, @@ -217,7 +217,8 @@ static int from_connection(void *a, void *d) ptl_hdr_t hdr; if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ - lib_parse(b->nal_cb, &hdr, c); + lib_parse(b->lib_nal, &hdr, c); + /*TODO: check error status*/ return(1); } return(0); @@ -239,19 +240,19 @@ int tcpnal_init(bridge b) { manager m; - b->nal_cb->cb_send=tcpnal_send; - b->nal_cb->cb_recv=tcpnal_recv; + b->lib_nal->libnal_send=tcpnal_send; + b->lib_nal->libnal_recv=tcpnal_recv; b->shutdown=tcpnal_shutdown; - if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid, - b->nal_cb->ni.pid), + if (!(m=init_connections(PNAL_PORT(b->lib_nal->libnal_ni.ni_pid.nid, + b->lib_nal->libnal_ni.ni_pid.pid), from_connection,b))){ /* TODO: this needs to shut down the newly created junk */ return(PTL_NAL_FAILED); } /* XXX cfs hack */ - b->nal_cb->ni.pid=0; + b->lib_nal->libnal_ni.ni_pid.pid=0; b->lower=m; return(PTL_OK); } diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 1b957a3..a563e0d 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,7 +1,6 @@ tbd Cluster File Systems, Inc. * version 1.2.x * bug fixes - - clear page cache after eviction (2766) - don't dereference NULL peer_ni in ldlm_handle_ast_error (3258) - don't allow unlinking open directory if it isn't empty (2904) - handle partial page writes in filter; fix 512b direct IO (3138) @@ -12,7 +11,7 @@ tbd Cluster File Systems, Inc. * miscellania - drop scimac NAL (unmaintained) -tbd Cluster File Systems, Inc. +2004-05-27 Cluster File Systems, Inc. 
* version 1.2.2 * bug fixes - don't copy lvb into (possibly NULL) reply on error (2983) @@ -47,11 +46,20 @@ tbd Cluster File Systems, Inc. - update iopen-2.6 patch with fixes from 2399,2517,2904 (3301) - don't leak open file on MDS after open resend (3325) - serialize filter_precreate and filter_destroy_precreated (3329) + - loop device shouldn't call sync_dev() for nul device (3092) + - clear page cache after eviction (2766) + - resynchronize MDS->OST in background (2824) + - refuse to mount the same filesystem twice on same mountpoint (3394) + - allow llmount to create routes for mounting behind routers (3320) + - push lock cancellation to blocking thread for glimpse ASTs (3409) + - don't call osc_set_data_with_check() for TEST_LOCK matches (3159) + - fix rare problem with rename on htree directories (3417) * miscellania - allow default OST striping configuration per directory (1414) - fix compilation for qswnal for 2.6 kernels (3125) - increase maximum number of MDS request buffers for large systems - change liblustreapi to be useful for external progs like lfsck (3098) + - increase local configuration timeout for slow disks (3353) 2004-03-22 Cluster File Systems, Inc. 
* version 1.2.1 diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index c5d668d..04e6356 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -56,6 +56,10 @@ lvfs-sources: modules: lustre_build_version $(DEP) $(LDISKFS) lvfs-sources $(MAKE) $(ARCH_UM) -C $(LINUX) -f $(PWD)/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$(LINUX_CONFIG) SUBDIRS=$(PWD) -o tmp_include_depends -o scripts -o include/config/MARKER $@ +endif # MODULES + +all-recursive: lustre_build_version + lustre_build_version: perl $(top_builddir)/scripts/version_tag.pl $(top_srcdir) $(top_builddir) > tmpver echo "#define LUSTRE_RELEASE @RELEASE@" >> tmpver @@ -64,8 +68,6 @@ lustre_build_version: $(RM) tmpver || \ mv tmpver $(top_builddir)/include/linux/lustre_build_version.h -endif # MODULES - dist-hook: find $(distdir) -name .deps | xargs rm -rf find $(distdir) -name CVS | xargs rm -rf diff --git a/lustre/autogen.sh b/lustre/autogen.sh index 004852e..e1c2c6c 100644 --- a/lustre/autogen.sh +++ b/lustre/autogen.sh @@ -48,7 +48,7 @@ or for RH9 systems you can use: ftp://fr2.rpmfind.net/linux/redhat/9/en/os/i386/RedHat/RPMS/autoconf-2.57-3.noarch.rpm EOF - [ "$cmd" = "automake" -a "$required" = "1.7.8" ] && cat >&2 <&2 </dev/null ; then error_msg "missing" fi - version=$($cmd --version | awk "BEGIN { IGNORECASE=1 } /$cmd \(GNU $cmd\)/ { print \$4 }") + version=$($cmd --version | awk "BEGIN { IGNORECASE=1 } /$tool \(GNU $tool\)/ { print \$4 }") echo "found $version" if ! compare_versions "$required" "$version" ; then error_msg "too old" fi } -check_version automake "1.7.8" -check_version autoconf "2.57" +check_version automake automake-1.7 "1.7.8" +check_version autoconf autoconf "2.57" echo "Running aclocal..." -aclocal +aclocal-1.7 echo "Running autoheader..." autoheader echo "Running automake..." -automake -a -c +automake-1.7 -a -c echo "Running autoconf..." 
autoconf diff --git a/lustre/configure.in b/lustre/configure.in index 3373fd0..d667270 100644 --- a/lustre/configure.in +++ b/lustre/configure.in @@ -5,7 +5,7 @@ AC_INIT AC_CANONICAL_SYSTEM -AM_INIT_AUTOMAKE(lustre, HEAD) +AM_INIT_AUTOMAKE(lustre, b1_4) # AM_MAINTAINER_MODE # Four main targets: lustre kernel modules, utilities, tests, and liblustre diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 120e996..13363bd 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -77,12 +77,12 @@ static inline void lustre_daemonize_helper(void) { LASSERT(current->signal != NULL); - current->session = 1; + current->signal->session = 1; if (current->group_leader) - current->group_leader->__pgrp = 1; + current->group_leader->signal->pgrp = 1; else CERROR("we aren't group leader\n"); - current->tty = NULL; + current->signal->tty = NULL; } static inline int cleanup_group_info(void) diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 9b89859..09fd52e 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -652,6 +652,8 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, char *name); int ptlrpc_unregister_service(struct ptlrpc_service *service); int liblustre_check_services (void *arg); +void ptlrpc_daemonize(void); + struct ptlrpc_svc_data { char *name; diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch index 3de6a8f..f6b2f43 100644 --- a/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch +++ b/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch @@ -1395,7 +1395,7 @@ + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 
*)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1416,9 +1416,9 @@ + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); + } + de = next; + } @@ -1642,8 +1642,8 @@ + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; -+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len); ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; ++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-suse-2.4.19.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-suse-2.4.19.patch index e937932..9730921 100644 --- a/lustre/kernel_patches/patches/ext-2.4-patch-1-suse-2.4.19.patch +++ b/lustre/kernel_patches/patches/ext-2.4-patch-1-suse-2.4.19.patch @@ -1405,7 +1405,7 @@ Index: linux-2.4.19/fs/ext3/namei.c + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1426,7 +1426,7 @@ Index: linux-2.4.19/fs/ext3/namei.c + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; + to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); + } @@ -1652,8 +1652,8 @@ Index: linux-2.4.19/fs/ext3/namei.c + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents 
out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; -+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len); ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; ++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-suse.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-suse.patch index 748671f..28a1ad6 100644 --- a/lustre/kernel_patches/patches/ext-2.4-patch-1-suse.patch +++ b/lustre/kernel_patches/patches/ext-2.4-patch-1-suse.patch @@ -1395,7 +1395,7 @@ + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1416,9 +1416,9 @@ + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); + } + de = next; + } @@ -1642,8 +1642,8 @@ + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; -+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len); ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; ++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1.patch index 748671f..28a1ad6 100644 --- a/lustre/kernel_patches/patches/ext-2.4-patch-1.patch 
+++ b/lustre/kernel_patches/patches/ext-2.4-patch-1.patch @@ -1395,7 +1395,7 @@ + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1416,9 +1416,9 @@ + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); + } + de = next; + } @@ -1642,8 +1642,8 @@ + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; -+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len); ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; ++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-4.patch b/lustre/kernel_patches/patches/ext-2.4-patch-4.patch index 67f5afa..4c8d4fa 100644 --- a/lustre/kernel_patches/patches/ext-2.4-patch-4.patch +++ b/lustre/kernel_patches/patches/ext-2.4-patch-4.patch @@ -18,19 +18,23 @@ diff -Nru a/fs/ext3/namei.c b/fs/ext3/namei.c --- a/fs/ext3/namei.c Thu Nov 7 10:57:49 2002 +++ b/fs/ext3/namei.c Thu Nov 7 10:57:49 2002 -@@ -2173,7 +2173,26 @@ +@@ -2173,7 +2173,30 @@ /* * ok, that's it */ - ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ if (retval == -ENOENT) { -+ /* -+ * old_de could have moved out from under us. 
-+ */ ++ if (le32_to_cpu(old_de->inode) != old_inode->i_ino || ++ old_de->name_len != old_dentry->d_name.len || ++ strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || ++ (retval = ext3_delete_entry(handle, old_dir, ++ old_de, old_bh)) == -ENOENT) { ++ /* old_de could have moved from under us during htree split, so ++ * make sure that we are deleting the right entry. We might ++ * also be pointing to a stale entry in the unused part of ++ * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; -+ ++ + old_bh2 = ext3_find_entry(old_dentry, &old_de2); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, diff --git a/lustre/kernel_patches/patches/ext3-htree-2.4.19-pre1.patch b/lustre/kernel_patches/patches/ext3-htree-2.4.19-pre1.patch index 7865c63..0806c38 100644 --- a/lustre/kernel_patches/patches/ext3-htree-2.4.19-pre1.patch +++ b/lustre/kernel_patches/patches/ext3-htree-2.4.19-pre1.patch @@ -1420,7 +1420,7 @@ Index: linux-2.4.19-pre1/fs/ext3/namei.c + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1441,9 +1441,9 @@ Index: linux-2.4.19-pre1/fs/ext3/namei.c + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); + } + de = next; + } @@ -2258,19 +2258,23 @@ Index: linux-2.4.19-pre1/fs/ext3/namei.c if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) handle->h_sync = 1; -@@ -1070,14 +2174,33 @@ +@@ -1070,14 +2174,37 @@ /* * ok, that's it */ - ext3_delete_entry(handle, old_dir, old_de, 
old_bh); -+ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ if (retval == -ENOENT) { -+ /* -+ * old_de could have moved out from under us. -+ */ ++ if (le32_to_cpu(old_de->inode) != old_inode->i_ino || ++ old_de->name_len != old_dentry->d_name.len || ++ strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || ++ (retval = ext3_delete_entry(handle, old_dir, ++ old_de, old_bh)) == -ENOENT) { ++ /* old_de could have moved from under us during htree split, so ++ * make sure that we are deleting the right entry. We might ++ * also be pointing to a stale entry in the unused part of ++ * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; -+ ++ + old_bh2 = ext3_find_entry(old_dentry, &old_de2); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, diff --git a/lustre/kernel_patches/patches/ext3-htree-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-htree-2.4.21-chaos.patch index 3a9719b..4b445f5 100644 --- a/lustre/kernel_patches/patches/ext3-htree-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-htree-2.4.21-chaos.patch @@ -1420,7 +1420,7 @@ Index: linux-2.4.21-chaos/fs/ext3/namei.c + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1441,9 +1441,9 @@ Index: linux-2.4.21-chaos/fs/ext3/namei.c + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *) to + rec_len); + } + de = next; + } @@ -2263,19 +2263,23 @@ Index: linux-2.4.21-chaos/fs/ext3/namei.c if 
(IS_SYNC(old_dir) || IS_SYNC(new_dir)) handle->h_sync = 1; -@@ -1070,14 +2174,33 @@ +@@ -1070,14 +2174,37 @@ /* * ok, that's it */ - ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ if (retval == -ENOENT) { -+ /* -+ * old_de could have moved out from under us. -+ */ ++ if (le32_to_cpu(old_de->inode) != old_inode->i_ino || ++ old_de->name_len != old_dentry->d_name.len || ++ strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || ++ (retval = ext3_delete_entry(handle, old_dir, ++ old_de, old_bh)) == -ENOENT) { ++ /* old_de could have moved from under us during htree split, so ++ * make sure that we are deleting the right entry. We might ++ * also be pointing to a stale entry in the unused part of ++ * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; -+ ++ + old_bh2 = ext3_find_entry(old_dentry, &old_de2); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, diff --git a/lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch b/lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch index 436bd34..ca2cacf 100644 --- a/lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch +++ b/lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch @@ -1410,7 +1410,7 @@ + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1431,9 +1431,9 @@ + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); + } + de = next; + } @@ 
-2253,19 +2253,23 @@ if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) handle->h_sync = 1; -@@ -1070,14 +2174,33 @@ static int ext3_rename (struct inode * o +@@ -1070,14 +2174,37 @@ static int ext3_rename (struct inode * o /* * ok, that's it */ - ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ if (retval == -ENOENT) { -+ /* -+ * old_de could have moved out from under us. -+ */ ++ if (le32_to_cpu(old_de->inode) != old_inode->i_ino || ++ old_de->name_len != old_dentry->d_name.len || ++ strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || ++ (retval = ext3_delete_entry(handle, old_dir, ++ old_de, old_bh)) == -ENOENT) { ++ /* old_de could have moved from under us during htree split, so ++ * make sure that we are deleting the right entry. We might ++ * also be pointing to a stale entry in the unused part of ++ * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; -+ ++ + old_bh2 = ext3_find_entry(old_dentry, &old_de2); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, diff --git a/lustre/kernel_patches/patches/ext3-htree-rename_fix.patch b/lustre/kernel_patches/patches/ext3-htree-rename_fix.patch new file mode 100644 index 0000000..75bf288 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-htree-rename_fix.patch @@ -0,0 +1,24 @@ +===== fs/ext3/namei.c 1.52 vs edited ===== +--- 1.52/fs/ext3/namei.c Mon May 10 05:25:34 2004 ++++ edited/fs/ext3/namei.c Thu May 20 19:57:10 2004 +@@ -2264,11 +2264,15 @@ + /* + * ok, that's it + */ +- retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); +- if (retval == -ENOENT) { +- /* +- * old_de could have moved out from under us. 
+- */ ++ if (le32_to_cpu(old_de->inode) != old_inode->i_ino || ++ old_de->name_len != old_dentry->d_name.len || ++ strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || ++ (retval = ext3_delete_entry(handle, old_dir, ++ old_de, old_bh)) == -ENOENT) { ++ /* old_de could have moved from under us during htree split, so ++ * make sure that we are deleting the right entry. We might ++ * also be pointing to a stale entry in the unused part of ++ * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + diff --git a/lustre/kernel_patches/patches/ext3-htree-suse.patch b/lustre/kernel_patches/patches/ext3-htree-suse.patch index a6e96f0..3e5148e 100644 --- a/lustre/kernel_patches/patches/ext3-htree-suse.patch +++ b/lustre/kernel_patches/patches/ext3-htree-suse.patch @@ -1420,7 +1420,7 @@ Index: linux-2.4.21-suse/fs/ext3/namei.c + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1441,9 +1441,9 @@ Index: linux-2.4.21-suse/fs/ext3/namei.c + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); + } + de = next; + } @@ -2227,19 +2227,23 @@ Index: linux-2.4.21-suse/fs/ext3/namei.c if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) handle->h_sync = 1; -@@ -1069,14 +2172,33 @@ +@@ -1069,14 +2172,37 @@ /* * ok, that's it */ - ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ if (retval == -ENOENT) { -+ /* -+ * old_de could have moved out from 
under us. -+ */ ++ if (le32_to_cpu(old_de->inode) != old_inode->i_ino || ++ old_de->name_len != old_dentry->d_name.len || ++ strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || ++ (retval = ext3_delete_entry(handle, old_dir, ++ old_de, old_bh)) == -ENOENT) { ++ /* old_de could have moved from under us during htree split, so ++ * make sure that we are deleting the right entry. We might ++ * also be pointing to a stale entry in the unused part of ++ * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; -+ ++ + old_bh2 = ext3_find_entry(old_dentry, &old_de2); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, diff --git a/lustre/kernel_patches/patches/ext3-htree.patch b/lustre/kernel_patches/patches/ext3-htree.patch index 903118b..31f2ae3 100644 --- a/lustre/kernel_patches/patches/ext3-htree.patch +++ b/lustre/kernel_patches/patches/ext3-htree.patch @@ -1410,7 +1410,7 @@ + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1431,9 +1431,9 @@ + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); + } + de = next; + } @@ -2254,19 +2254,23 @@ if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) handle->h_sync = 1; -@@ -1071,14 +2174,33 @@ static int ext3_rename (struct inode * o +@@ -1071,14 +2174,37 @@ static int ext3_rename (struct inode * o /* * ok, that's it */ - ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ if 
(retval == -ENOENT) { -+ /* -+ * old_de could have moved out from under us. -+ */ ++ if (le32_to_cpu(old_de->inode) != old_inode->i_ino || ++ old_de->name_len != old_dentry->d_name.len || ++ strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || ++ (retval = ext3_delete_entry(handle, old_dir, ++ old_de, old_bh)) == -ENOENT) { ++ /* old_de could have moved from under us during htree split, so ++ * make sure that we are deleting the right entry. We might ++ * also be pointing to a stale entry in the unused part of ++ * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; -+ ++ + old_bh2 = ext3_find_entry(old_dentry, &old_de2); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, diff --git a/lustre/kernel_patches/patches/ext3-pdirops-2.4.24-chaos.patch b/lustre/kernel_patches/patches/ext3-pdirops-2.4.24-chaos.patch index 9602b80..c472368 100644 --- a/lustre/kernel_patches/patches/ext3-pdirops-2.4.24-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-pdirops-2.4.24-chaos.patch @@ -6,10 +6,10 @@ include/linux/ext3_fs_i.h | 6 6 files changed, 500 insertions(+), 109 deletions(-) -Index: linux-2.4.24/fs/ext3/namei.c +Index: lum/fs/ext3/namei.c =================================================================== ---- linux-2.4.24.orig/fs/ext3/namei.c 2004-05-22 12:08:41.000000000 +0800 -+++ linux-2.4.24/fs/ext3/namei.c 2004-05-22 12:11:40.000000000 +0800 +--- lum.orig/fs/ext3/namei.c 2004-06-03 16:32:28.000000000 -0400 ++++ lum/fs/ext3/namei.c 2004-06-03 16:45:45.000000000 -0400 @@ -51,6 +51,9 @@ { struct buffer_head *bh; @@ -545,7 +545,7 @@ Index: linux-2.4.24/fs/ext3/namei.c + (struct ext3_dir_entry_2 *) (from + map->offs); rec_len = EXT3_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); - ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; + ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); @@ -987,7 +1150,8 @@ static struct ext3_dir_entry_2* 
dx_pack_dirents(char *base, int size) @@ -977,7 +977,7 @@ Index: linux-2.4.24/fs/ext3/namei.c if (bh) brelse(bh); dx_release(frames); -@@ -1901,6 +2220,7 @@ +@@ -1905,6 +2224,7 @@ struct buffer_head * bh; struct ext3_dir_entry_2 * de; handle_t *handle; @@ -985,7 +985,7 @@ Index: linux-2.4.24/fs/ext3/namei.c handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); if (IS_ERR(handle)) { -@@ -1908,7 +2228,7 @@ +@@ -1912,7 +2232,7 @@ } retval = -ENOENT; @@ -994,7 +994,7 @@ Index: linux-2.4.24/fs/ext3/namei.c if (!bh) goto end_rmdir; -@@ -1919,14 +2239,19 @@ +@@ -1923,14 +2243,19 @@ DQUOT_INIT(inode); retval = -EIO; @@ -1016,7 +1016,7 @@ Index: linux-2.4.24/fs/ext3/namei.c if (retval) goto end_rmdir; if (inode->i_nlink != 2) -@@ -1985,6 +2310,7 @@ +@@ -1989,6 +2314,7 @@ struct buffer_head * bh; struct ext3_dir_entry_2 * de; handle_t *handle; @@ -1024,7 +1024,7 @@ Index: linux-2.4.24/fs/ext3/namei.c handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); if (IS_ERR(handle)) { -@@ -1995,7 +2321,7 @@ +@@ -1999,7 +2325,7 @@ handle->h_sync = 1; retval = -ENOENT; @@ -1033,7 +1033,7 @@ Index: linux-2.4.24/fs/ext3/namei.c if (!bh) goto end_unlink; -@@ -2003,8 +2329,10 @@ +@@ -2007,8 +2333,10 @@ DQUOT_INIT(inode); retval = -EIO; @@ -1045,7 +1045,7 @@ Index: linux-2.4.24/fs/ext3/namei.c if (!inode->i_nlink) { ext3_warning (inode->i_sb, "ext3_unlink", -@@ -2013,6 +2341,7 @@ +@@ -2017,6 +2345,7 @@ inode->i_nlink = 1; } retval = ext3_delete_entry(handle, dir, de, bh); @@ -1053,7 +1053,7 @@ Index: linux-2.4.24/fs/ext3/namei.c if (retval) goto end_unlink; dir->i_ctime = dir->i_mtime = CURRENT_TIME; -@@ -2151,6 +2480,7 @@ +@@ -2155,6 +2484,7 @@ struct buffer_head * old_bh, * new_bh, * dir_bh; struct ext3_dir_entry_2 * old_de, * new_de; int retval; @@ -1061,7 +1061,7 @@ Index: linux-2.4.24/fs/ext3/namei.c old_bh = new_bh = dir_bh = NULL; -@@ -2163,7 +2493,10 @@ +@@ -2167,7 +2497,10 @@ if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) handle->h_sync = 1; @@ -1073,7 +1073,7 @@ Index: 
linux-2.4.24/fs/ext3/namei.c /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process -@@ -2176,7 +2509,7 @@ +@@ -2180,7 +2513,7 @@ goto end_rename; new_inode = new_dentry->d_inode; @@ -1082,16 +1082,16 @@ Index: linux-2.4.24/fs/ext3/namei.c if (new_bh) { if (!new_inode) { brelse (new_bh); -@@ -2239,7 +2572,7 @@ +@@ -2247,7 +2580,7 @@ struct buffer_head *old_bh2; struct ext3_dir_entry_2 *old_de2; - + - old_bh2 = ext3_find_entry(old_dentry, &old_de2); + old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */); if (old_bh2) { retval = ext3_delete_entry(handle, old_dir, old_de2, old_bh2); -@@ -2282,6 +2615,14 @@ +@@ -2290,6 +2623,14 @@ retval = 0; end_rename: @@ -1106,7 +1106,7 @@ Index: linux-2.4.24/fs/ext3/namei.c brelse (dir_bh); brelse (old_bh); brelse (new_bh); -@@ -2290,6 +2631,29 @@ +@@ -2298,6 +2639,29 @@ } /* @@ -1136,10 +1136,10 @@ Index: linux-2.4.24/fs/ext3/namei.c * directories can handle most operations... 
*/ struct inode_operations ext3_dir_inode_operations = { -Index: linux-2.4.24/fs/ext3/super.c +Index: lum/fs/ext3/super.c =================================================================== ---- linux-2.4.24.orig/fs/ext3/super.c 2004-05-22 12:09:38.000000000 +0800 -+++ linux-2.4.24/fs/ext3/super.c 2004-05-22 12:11:40.000000000 +0800 +--- lum.orig/fs/ext3/super.c 2004-06-03 16:32:28.000000000 -0400 ++++ lum/fs/ext3/super.c 2004-06-03 16:37:15.000000000 -0400 @@ -733,6 +733,9 @@ if (want_numeric(value, "sb", sb_block)) return 0; @@ -1173,10 +1173,10 @@ Index: linux-2.4.24/fs/ext3/super.c return sb; failed_mount3: -Index: linux-2.4.24/fs/ext3/inode.c +Index: lum/fs/ext3/inode.c =================================================================== ---- linux-2.4.24.orig/fs/ext3/inode.c 2004-05-22 12:09:48.000000000 +0800 -+++ linux-2.4.24/fs/ext3/inode.c 2004-05-22 12:11:40.000000000 +0800 +--- lum.orig/fs/ext3/inode.c 2004-06-03 16:32:29.000000000 -0400 ++++ lum/fs/ext3/inode.c 2004-06-03 16:37:15.000000000 -0400 @@ -2251,6 +2251,9 @@ } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext3_dir_inode_operations; @@ -1187,10 +1187,10 @@ Index: linux-2.4.24/fs/ext3/inode.c } else if (S_ISLNK(inode->i_mode)) { if (ext3_inode_is_fast_symlink(inode)) inode->i_op = &ext3_fast_symlink_inode_operations; -Index: linux-2.4.24/fs/ext3/ialloc.c +Index: lum/fs/ext3/ialloc.c =================================================================== ---- linux-2.4.24.orig/fs/ext3/ialloc.c 2004-05-22 12:09:38.000000000 +0800 -+++ linux-2.4.24/fs/ext3/ialloc.c 2004-05-22 12:11:40.000000000 +0800 +--- lum.orig/fs/ext3/ialloc.c 2004-06-03 16:32:28.000000000 -0400 ++++ lum/fs/ext3/ialloc.c 2004-06-03 16:37:15.000000000 -0400 @@ -609,6 +609,9 @@ return ERR_PTR(-EDQUOT); } @@ -1201,10 +1201,10 @@ Index: linux-2.4.24/fs/ext3/ialloc.c return inode; fail: -Index: linux-2.4.24/include/linux/ext3_fs.h +Index: lum/include/linux/ext3_fs.h =================================================================== 
---- linux-2.4.24.orig/include/linux/ext3_fs.h 2004-05-22 12:09:42.000000000 +0800 -+++ linux-2.4.24/include/linux/ext3_fs.h 2004-05-22 12:11:41.000000000 +0800 +--- lum.orig/include/linux/ext3_fs.h 2004-06-03 16:32:28.000000000 -0400 ++++ lum/include/linux/ext3_fs.h 2004-06-03 16:37:15.000000000 -0400 @@ -320,6 +320,7 @@ /* * Mount flags @@ -1213,10 +1213,10 @@ Index: linux-2.4.24/include/linux/ext3_fs.h #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */ #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */ #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */ -Index: linux-2.4.24/include/linux/ext3_fs_i.h +Index: lum/include/linux/ext3_fs_i.h =================================================================== ---- linux-2.4.24.orig/include/linux/ext3_fs_i.h 2004-05-22 12:09:38.000000000 +0800 -+++ linux-2.4.24/include/linux/ext3_fs_i.h 2004-05-22 12:13:54.000000000 +0800 +--- lum.orig/include/linux/ext3_fs_i.h 2004-06-03 16:32:28.000000000 -0400 ++++ lum/include/linux/ext3_fs_i.h 2004-06-03 16:37:15.000000000 -0400 @@ -17,6 +17,7 @@ #define _LINUX_EXT3_FS_I diff --git a/lustre/kernel_patches/patches/htree-ext3-2.4.18.patch b/lustre/kernel_patches/patches/htree-ext3-2.4.18.patch index a8bce7c..2100f53 100644 --- a/lustre/kernel_patches/patches/htree-ext3-2.4.18.patch +++ b/lustre/kernel_patches/patches/htree-ext3-2.4.18.patch @@ -595,7 +595,7 @@ + ext3_dirent *de = (ext3_dirent *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((ext3_dirent *) to)->rec_len = rec_len; ++ ((ext3_dirent *)to)->rec_len = cpu_to_le16(rec_len); + to += rec_len; + map++; + } @@ -644,9 +644,9 @@ + + /* Fancy dance to stay within two buffers */ + de2 = dx_copy_dirents (data1, data2, map + split, count - split); -+ data3 = (char *) de2 + de2->rec_len; ++ data3 = (char *) de2 + le16_to_cpu(de2->rec_len); + de = dx_copy_dirents (data1, data3, map, split); -+ memcpy(data1, data3, (char *) de + de->rec_len - 
data3); ++ memcpy(data1, data3, (char *) de + le16_to_cpu(de->rec_len) - data3); + de = (ext3_dirent *) ((char *) de - data3 + data1); // relocate de + de->rec_len = cpu_to_le16(data1 + dir->i_sb->s_blocksize - (char *)de); + de2->rec_len = cpu_to_le16(data2 + dir->i_sb->s_blocksize-(char *)de2); @@ -1136,7 +1136,38 @@ if (IS_ERR(handle)) return PTR_ERR(handle); -@@ -1077,7 +1844,7 @@ +@@ -1069,14 +1837,37 @@ + /* + * ok, that's it + */ +- ext3_delete_entry(handle, old_dir, old_de, old_bh); ++ if (le32_to_cpu(old_de->inode) != old_inode->i_ino || ++ old_de->name_len != old_dentry->d_name.len || ++ strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || ++ (retval = ext3_delete_entry(handle, old_dir, ++ old_de, old_bh)) == -ENOENT) { ++ /* old_de could have moved from under us during htree split, so ++ * make sure that we are deleting the right entry. We might ++ * also be pointing to a stale entry in the unused part of ++ * old_bh so just checking inum and the name isn't enough. 
*/ ++ struct buffer_head *old_bh2; ++ struct ext3_dir_entry_2 *old_de2; ++ ++ old_bh2 = ext3_find_entry(old_dentry, &old_de2); ++ if (old_bh2) { ++ retval = ext3_delete_entry(handle, old_dir, ++ old_de2, old_bh2); ++ brelse(old_bh2); ++ } ++ } ++ if (retval) { ++ ext3_warning(old_dir->i_sb, "ext3_rename", ++ "Deleting old file (%lu), %d, error=%d", ++ old_dir->i_ino, old_dir->i_nlink, retval); ++ } + + if (new_inode) { + new_inode->i_nlink--; new_inode->i_ctime = CURRENT_TIME; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; diff --git a/lustre/kernel_patches/patches/iopen-2.6-suse.patch b/lustre/kernel_patches/patches/iopen-2.6-suse.patch index 2133355..8a8d115 100644 --- a/lustre/kernel_patches/patches/iopen-2.6-suse.patch +++ b/lustre/kernel_patches/patches/iopen-2.6-suse.patch @@ -8,8 +8,8 @@ Index: linux-stage/fs/ext3/Makefile =================================================================== ---- linux-stage.orig/fs/ext3/Makefile 2004-05-07 16:00:16.000000000 -0400 -+++ linux-stage/fs/ext3/Makefile 2004-05-07 16:00:17.000000000 -0400 +--- linux-stage.orig/fs/ext3/Makefile 2004-05-11 17:21:20.000000000 -0400 ++++ linux-stage/fs/ext3/Makefile 2004-05-11 17:21:21.000000000 -0400 @@ -4,7 +4,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o @@ -21,8 +21,8 @@ Index: linux-stage/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o Index: linux-stage/fs/ext3/inode.c =================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2004-05-07 16:00:16.000000000 -0400 -+++ linux-stage/fs/ext3/inode.c 2004-05-07 17:21:59.000000000 -0400 +--- linux-stage.orig/fs/ext3/inode.c 2004-05-11 17:21:21.000000000 -0400 ++++ linux-stage/fs/ext3/inode.c 2004-05-11 17:21:21.000000000 -0400 @@ -37,6 +37,7 @@ #include #include @@ -43,8 +43,8 @@ Index: linux-stage/fs/ext3/inode.c bh = iloc.bh; Index: linux-stage/fs/ext3/iopen.c =================================================================== ---- 
linux-stage.orig/fs/ext3/iopen.c 2004-05-07 16:00:17.000000000 -0400 -+++ linux-stage/fs/ext3/iopen.c 2004-05-07 17:22:37.000000000 -0400 +--- linux-stage.orig/fs/ext3/iopen.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-stage/fs/ext3/iopen.c 2004-05-11 17:21:21.000000000 -0400 @@ -0,0 +1,272 @@ +/* + * linux/fs/ext3/iopen.c @@ -320,8 +320,8 @@ Index: linux-stage/fs/ext3/iopen.c +} Index: linux-stage/fs/ext3/iopen.h =================================================================== ---- linux-stage.orig/fs/ext3/iopen.h 2004-05-07 16:00:17.000000000 -0400 -+++ linux-stage/fs/ext3/iopen.h 2004-05-07 16:00:17.000000000 -0400 +--- linux-stage.orig/fs/ext3/iopen.h 1969-12-31 19:00:00.000000000 -0500 ++++ linux-stage/fs/ext3/iopen.h 2004-05-11 17:21:21.000000000 -0400 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -340,8 +340,8 @@ Index: linux-stage/fs/ext3/iopen.h + struct inode *inode, int rehash); Index: linux-stage/fs/ext3/namei.c =================================================================== ---- linux-stage.orig/fs/ext3/namei.c 2004-05-07 16:00:16.000000000 -0400 -+++ linux-stage/fs/ext3/namei.c 2004-05-07 16:00:17.000000000 -0400 +--- linux-stage.orig/fs/ext3/namei.c 2004-05-11 17:21:20.000000000 -0400 ++++ linux-stage/fs/ext3/namei.c 2004-05-11 17:21:21.000000000 -0400 @@ -37,6 +37,7 @@ #include #include @@ -420,30 +420,30 @@ Index: linux-stage/fs/ext3/namei.c } Index: linux-stage/fs/ext3/super.c =================================================================== ---- linux-stage.orig/fs/ext3/super.c 2004-05-07 16:00:16.000000000 -0400 -+++ linux-stage/fs/ext3/super.c 2004-05-07 17:21:59.000000000 -0400 +--- linux-stage.orig/fs/ext3/super.c 2004-05-11 17:21:21.000000000 -0400 ++++ linux-stage/fs/ext3/super.c 2004-05-11 17:44:53.000000000 -0400 @@ -536,7 +536,7 @@ Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, -- Opt_ignore, Opt_err, 
-+ Opt_ignore, Opt_err, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_ignore, Opt_barrier, ++ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_err, }; - static match_table_t tokens = { -@@ -575,6 +575,9 @@ - {Opt_ignore, "noquota"}, +@@ -577,6 +577,9 @@ {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, -+ {Opt_iopen, "iopen"}, -+ {Opt_noiopen, "noiopen"}, -+ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_err, NULL} }; -@@ -762,6 +765,18 @@ - case Opt_abort: - set_opt(sbi->s_mount_opt, ABORT); +@@ -772,6 +775,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); break; + case Opt_iopen: + set_opt (sbi->s_mount_opt, IOPEN); @@ -462,14 +462,14 @@ Index: linux-stage/fs/ext3/super.c default: Index: linux-stage/include/linux/ext3_fs.h =================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2004-05-07 16:00:16.000000000 -0400 -+++ linux-stage/include/linux/ext3_fs.h 2004-05-07 16:00:17.000000000 -0400 -@@ -325,6 +325,8 @@ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ +--- linux-stage.orig/include/linux/ext3_fs.h 2004-05-11 17:21:20.000000000 -0400 ++++ linux-stage/include/linux/ext3_fs.h 2004-05-11 17:21:21.000000000 -0400 +@@ -326,6 +326,8 @@ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ -+#define EXT3_MOUNT_IOPEN 0x10000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x20000 /* Make iopen world-readable */ + #define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */ ++#define EXT3_MOUNT_IOPEN 0x20000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x40000 /* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git 
a/lustre/kernel_patches/patches/loop-sync-2.4.21-suse.patch b/lustre/kernel_patches/patches/loop-sync-2.4.21-suse.patch new file mode 100644 index 0000000..73372b9 --- /dev/null +++ b/lustre/kernel_patches/patches/loop-sync-2.4.21-suse.patch @@ -0,0 +1,11 @@ +--- drivers/block/loop.c.bu 2004-05-11 16:27:23.000000000 -0700 ++++ drivers/block/loop.c 2004-05-11 16:28:50.000000000 -0700 +@@ -978,7 +978,7 @@ static int lo_release(struct inode *inod + + lo = &loop_dev[dev]; + +- if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) { ++ if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && lo->lo_device != 0) { + fsync_dev(lo->lo_device); + invalidate_buffers(lo->lo_device); + } diff --git a/lustre/kernel_patches/patches/lustre_version.patch b/lustre/kernel_patches/patches/lustre_version.patch index 6d2b7e6..1c5f97e 100644 --- a/lustre/kernel_patches/patches/lustre_version.patch +++ b/lustre/kernel_patches/patches/lustre_version.patch @@ -1,3 +1,4 @@ +Version 37: fix htree rename-within-same-dir (b=3417), endianness (b=2447) Version 36: don't dput dentry after error (b=2350), zero page->private (3119) Version 35: pass intent to real_lookup after revalidate failure (b=3285) Version 34: fix ext3 iopen assertion failure (b=2517, b=2399) @@ -8,6 +9,6 @@ Version 34: fix ext3 iopen assertion failure (b=2517, b=2399) --- /dev/null Fri Aug 30 17:31:37 2002 +++ linux-2.4.18-18.8.0-l12-braam/include/linux/lustre_version.h Thu Feb 13 07:58:33 2003 @@ -0,0 +1 @@ -+#define LUSTRE_KERNEL_VERSION 36 ++#define LUSTRE_KERNEL_VERSION 37 _ diff --git a/lustre/kernel_patches/patches/md_path_lookup-2.6-suse.patch b/lustre/kernel_patches/patches/md_path_lookup-2.6-suse.patch new file mode 100644 index 0000000..4e2b66d --- /dev/null +++ b/lustre/kernel_patches/patches/md_path_lookup-2.6-suse.patch @@ -0,0 +1,25 @@ +Index: linux-2.6.4-51.0/drivers/md/dm-path-selector.c +=================================================================== +--- linux-2.6.4-51.0.orig/drivers/md/dm-path-selector.c 2004-04-18 
20:10:21.000000000 -0400 ++++ linux-2.6.4-51.0/drivers/md/dm-path-selector.c 2004-04-18 20:10:59.000000000 -0400 +@@ -129,7 +129,7 @@ + struct path *path; + }; + +-static struct path_info *path_lookup(struct list_head *head, struct path *p) ++static struct path_info *md_path_lookup(struct list_head *head, struct path *p) + { + struct path_info *pi; + +@@ -235,9 +235,9 @@ + * mind the expense of these searches. + */ + spin_lock_irqsave(&s->lock, flags); +- pi = path_lookup(&s->valid_paths, p); ++ pi = md_path_lookup(&s->valid_paths, p); + if (!pi) +- pi = path_lookup(&s->invalid_paths, p); ++ pi = md_path_lookup(&s->invalid_paths, p); + + if (!pi) + DMWARN("asked to change the state of an unknown path"); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch index c678b4e..12436a7 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch @@ -1,42 +1,42 @@ -Index: linux-2.6.4-51.0/fs/exec.c +Index: linux-2.6.5-12.1/fs/exec.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/exec.c 2004-04-05 12:41:59.000000000 -0400 -+++ linux-2.6.4-51.0/fs/exec.c 2004-04-05 17:36:42.000000000 -0400 -@@ -122,8 +122,11 @@ - struct file * file; +--- linux-2.6.5-12.1.orig/fs/exec.c 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/fs/exec.c 2004-05-25 17:32:14.038494200 +0300 +@@ -125,9 +125,10 @@ struct nameidata nd; int error; -+ intent_init(&nd.intent, IT_OPEN); - nd.intent.open.flags = FMODE_READ; -+ error = user_path_walk_it(library, &nd); -+ -+ nd.intent.it_flags = O_RDONLY; - error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); - if (error) ++ intent_init(&nd.intent, IT_OPEN); + +- FSHOOK_BEGIN_USER_WALK(open, ++ nd.intent.it_flags = FMODE_READ; ++ FSHOOK_BEGIN_USER_WALK_IT(open, + error, + library, + LOOKUP_FOLLOW|LOOKUP_OPEN, +@@ -144,7 +145,7 @@ goto out; -@@ -136,7 
+139,7 @@ - if (error) - goto exit; + } - file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); + file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent); error = PTR_ERR(file); if (IS_ERR(file)) goto out; -@@ -485,8 +488,9 @@ - int err; - struct file *file; +@@ -495,8 +496,9 @@ + + FSHOOK_BEGIN(open, err, .filename = name, .flags = O_RDONLY) - nd.intent.open.flags = FMODE_READ; - err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); + intent_init(&nd.intent, IT_OPEN); -+ nd.intent.it_flags = O_RDONLY; ++ nd.intent.it_flags = FMODE_READ; + err = path_lookup(name, LOOKUP_FOLLOW, &nd); file = ERR_PTR(err); if (!err) { -@@ -499,7 +503,7 @@ +@@ -509,7 +511,7 @@ err = -EACCES; file = ERR_PTR(err); if (!err) { @@ -45,11 +45,11 @@ Index: linux-2.6.4-51.0/fs/exec.c if (!IS_ERR(file)) { err = deny_write_access(file); if (err) { -Index: linux-2.6.4-51.0/fs/namei.c +Index: linux-2.6.5-12.1/fs/namei.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/namei.c 2004-04-05 12:41:59.000000000 -0400 -+++ linux-2.6.4-51.0/fs/namei.c 2004-04-05 17:36:42.000000000 -0400 -@@ -269,8 +269,19 @@ +--- linux-2.6.5-12.1.orig/fs/namei.c 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/fs/namei.c 2004-05-25 17:32:14.040493896 +0300 +@@ -270,8 +270,19 @@ return 0; } @@ -69,7 +69,7 @@ Index: linux-2.6.4-51.0/fs/namei.c dput(nd->dentry); mntput(nd->mnt); } -@@ -347,7 +358,10 @@ +@@ -348,7 +359,10 @@ { struct dentry * result; struct inode *dir = parent->d_inode; @@ -80,7 +80,7 @@ Index: linux-2.6.4-51.0/fs/namei.c down(&dir->i_sem); /* * First re-do the cached lookup just in case it was created -@@ -386,7 +400,10 @@ +@@ -387,7 +401,10 @@ if (result->d_op && result->d_op->d_revalidate) { if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { dput(result); @@ -92,7 +92,7 @@ Index: linux-2.6.4-51.0/fs/namei.c } } return result; -@@ -563,6 +580,33 @@ +@@ -564,6 +581,33 @@ return PTR_ERR(dentry); } @@ -126,7 +126,7 @@ Index: 
linux-2.6.4-51.0/fs/namei.c /* * Name resolution. * -@@ -663,7 +705,9 @@ +@@ -664,7 +708,9 @@ if (inode->i_op->follow_link) { mntget(next.mnt); @@ -136,7 +136,7 @@ Index: linux-2.6.4-51.0/fs/namei.c dput(next.dentry); mntput(next.mnt); if (err) -@@ -702,14 +746,29 @@ +@@ -703,14 +749,29 @@ inode = nd->dentry->d_inode; /* fallthrough */ case 1: @@ -166,7 +166,7 @@ Index: linux-2.6.4-51.0/fs/namei.c if (err) break; follow_mount(&next.mnt, &next.dentry); -@@ -935,7 +994,7 @@ +@@ -936,7 +997,7 @@ } /* SMP-safe */ @@ -175,7 +175,7 @@ Index: linux-2.6.4-51.0/fs/namei.c { unsigned long hash; struct qstr this; -@@ -955,11 +1014,16 @@ +@@ -956,11 +1017,16 @@ } this.hash = end_name_hash(hash); @@ -193,29 +193,31 @@ Index: linux-2.6.4-51.0/fs/namei.c /* * namei() * -@@ -971,7 +1035,7 @@ +@@ -972,7 +1038,8 @@ * that namei follows links, while lnamei does not. * SMP-safe */ --int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) -+int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd) +-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd, const char **pname) ++int fastcall __user_walk_it(const char __user *name, unsigned flags, ++ struct nameidata *nd, const char **pname) { char *tmp = getname(name); int err = PTR_ERR(tmp); -@@ -983,6 +1047,12 @@ +@@ -987,6 +1054,13 @@ return err; } -+int __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++int __user_walk(const char __user *name, unsigned flags, ++ struct nameidata *nd, const char **pname) +{ + intent_init(&nd->intent, IT_LOOKUP); -+ return __user_walk_it(name, flags, nd); ++ return __user_walk_it(name, flags, nd, pname); +} + /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. 
-@@ -1255,8 +1325,8 @@ +@@ -1259,8 +1333,8 @@ acc_mode |= MAY_APPEND; /* Fill in the open() intent data */ @@ -226,7 +228,7 @@ Index: linux-2.6.4-51.0/fs/namei.c /* * The simplest case - just a plain lookup. -@@ -1271,6 +1341,7 @@ +@@ -1275,6 +1349,7 @@ /* * Create - we need to know the parent. */ @@ -234,7 +236,7 @@ Index: linux-2.6.4-51.0/fs/namei.c error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); if (error) return error; -@@ -1287,7 +1358,9 @@ +@@ -1291,7 +1366,9 @@ dir = nd->dentry; nd->flags &= ~LOOKUP_PARENT; down(&dir->d_inode->i_sem); @@ -244,7 +246,7 @@ Index: linux-2.6.4-51.0/fs/namei.c do_last: error = PTR_ERR(dentry); -@@ -1392,7 +1465,9 @@ +@@ -1396,7 +1473,9 @@ } dir = nd->dentry; down(&dir->d_inode->i_sem); @@ -254,7 +256,7 @@ Index: linux-2.6.4-51.0/fs/namei.c putname(nd->last.name); goto do_last; } -@@ -2154,7 +2229,9 @@ +@@ -2196,7 +2275,9 @@ __vfs_follow_link(struct nameidata *nd, const char *link) { int res = 0; @@ -264,7 +266,7 @@ Index: linux-2.6.4-51.0/fs/namei.c if (IS_ERR(link)) goto fail; -@@ -2164,6 +2241,10 @@ +@@ -2206,6 +2287,10 @@ /* weird __emul_prefix() stuff did it */ goto out; } @@ -275,11 +277,11 @@ Index: linux-2.6.4-51.0/fs/namei.c res = link_path_walk(link, nd); out: if (current->link_count || res || nd->last_type!=LAST_NORM) -Index: linux-2.6.4-51.0/fs/namespace.c +Index: linux-2.6.5-12.1/fs/namespace.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/namespace.c 2004-04-05 12:41:59.000000000 -0400 -+++ linux-2.6.4-51.0/fs/namespace.c 2004-04-07 13:28:23.000000000 -0400 -@@ -107,6 +107,7 @@ +--- linux-2.6.5-12.1.orig/fs/namespace.c 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/fs/namespace.c 2004-05-25 17:33:44.385759328 +0300 +@@ -108,6 +108,7 @@ static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) { @@ -287,7 +289,24 @@ Index: linux-2.6.4-51.0/fs/namespace.c old_nd->dentry = mnt->mnt_mountpoint; old_nd->mnt = 
mnt->mnt_parent; mnt->mnt_parent = mnt; -@@ -748,6 +749,7 @@ +@@ -533,6 +534,8 @@ + return err; + if (!old_name || !*old_name) + return -EINVAL; ++ ++ intent_init(&old_nd.intent, IT_LOOKUP); + err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + if (err) + return err; +@@ -601,6 +604,7 @@ + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; ++ intent_init(&old_nd.intent, IT_LOOKUP); + err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + if (err) + return err; +@@ -750,6 +754,7 @@ int retval = 0; int mnt_flags = 0; @@ -295,11 +314,11 @@ Index: linux-2.6.4-51.0/fs/namespace.c /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; -Index: linux-2.6.4-51.0/fs/open.c +Index: linux-2.6.5-12.1/fs/open.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/open.c 2004-04-05 12:41:59.000000000 -0400 -+++ linux-2.6.4-51.0/fs/open.c 2004-04-05 17:36:42.000000000 -0400 -@@ -211,7 +211,7 @@ +--- linux-2.6.5-12.1.orig/fs/open.c 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/fs/open.c 2004-05-25 17:32:14.042493592 +0300 +@@ -227,12 +227,12 @@ struct nameidata nd; struct inode * inode; int error; @@ -308,7 +327,13 @@ Index: linux-2.6.4-51.0/fs/open.c error = -EINVAL; if (length < 0) /* sorry, but loff_t says... */ goto out; -@@ -470,6 +470,7 @@ + +- FSHOOK_BEGIN_USER_PATH_WALK(truncate, error, path, nd, filename, .length = length) ++ FSHOOK_BEGIN_USER_PATH_WALK_IT(truncate, error, path, nd, filename, .length = length) + + inode = nd.dentry->d_inode; + +@@ -466,6 +466,7 @@ int old_fsuid, old_fsgid; kernel_cap_t old_cap; int res; @@ -316,31 +341,49 @@ Index: linux-2.6.4-51.0/fs/open.c if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ return -EINVAL; -@@ -501,6 +502,7 @@ +@@ -490,7 +491,7 @@ + else + current->cap_effective = current->cap_permitted; + +- FSHOOK_BEGIN_USER_WALK(access, ++ FSHOOK_BEGIN_USER_WALK_IT(access, + res, + filename, + LOOKUP_FOLLOW|LOOKUP_ACCESS, +@@ -506,6 +507,7 @@ if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) && !special_file(nd.dentry->d_inode->i_mode)) res = -EROFS; + path_release(&nd); - } -@@ -515,6 +517,7 @@ + FSHOOK_END_USER_WALK(access, res, path) +@@ -545,11 +547,13 @@ + + asmlinkage long sys_fchdir(unsigned int fd) { - struct nameidata nd; ++ struct nameidata nd; + struct file *file; + struct dentry *dentry; + struct inode *inode; + struct vfsmount *mnt; int error; + intent_init(&nd.intent, IT_GETATTR); - error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); - if (error) -@@ -566,6 +569,7 @@ + FSHOOK_BEGIN(fchdir, error, .fd = fd) + +@@ -582,8 +586,9 @@ { struct nameidata nd; int error; + intent_init(&nd.intent, IT_GETATTR); - error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); - if (error) -@@ -638,7 +642,7 @@ +- FSHOOK_BEGIN_USER_WALK(chroot, ++ FSHOOK_BEGIN_USER_WALK_IT(chroot, + error, + filename, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, +@@ -670,7 +675,7 @@ error = -EROFS; if (IS_RDONLY(inode)) goto dput_and_out; @@ -349,7 +392,7 @@ Index: linux-2.6.4-51.0/fs/open.c error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto dput_and_out; -@@ -746,27 +750,8 @@ +@@ -804,27 +809,8 @@ * for the internal routines (ie open_namei()/follow_link() etc). 00 is * used by symlinks. 
*/ @@ -379,7 +422,7 @@ Index: linux-2.6.4-51.0/fs/open.c { struct file * f; struct inode *inode; -@@ -778,6 +763,7 @@ +@@ -836,6 +822,7 @@ goto cleanup_dentry; f->f_flags = flags; f->f_mode = (flags+1) & O_ACCMODE; @@ -387,7 +430,7 @@ Index: linux-2.6.4-51.0/fs/open.c inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = get_write_access(inode); -@@ -797,6 +783,7 @@ +@@ -855,6 +842,7 @@ error = f->f_op->open(inode,f); if (error) goto cleanup_all; @@ -395,7 +438,7 @@ Index: linux-2.6.4-51.0/fs/open.c } f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); -@@ -821,6 +808,7 @@ +@@ -879,6 +867,7 @@ cleanup_file: put_filp(f); cleanup_dentry: @@ -403,7 +446,7 @@ Index: linux-2.6.4-51.0/fs/open.c dput(dentry); mntput(mnt); return ERR_PTR(error); -@@ -828,6 +816,36 @@ +@@ -886,6 +875,36 @@ EXPORT_SYMBOL(dentry_open); @@ -440,11 +483,11 @@ Index: linux-2.6.4-51.0/fs/open.c /* * Find an empty file descriptor entry, and mark it busy. */ -Index: linux-2.6.4-51.0/fs/stat.c +Index: linux-2.6.5-12.1/fs/stat.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/stat.c 2004-04-05 12:41:59.000000000 -0400 -+++ linux-2.6.4-51.0/fs/stat.c 2004-04-05 17:36:42.000000000 -0400 -@@ -36,7 +36,7 @@ +--- linux-2.6.5-12.1.orig/fs/stat.c 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/fs/stat.c 2004-05-25 17:32:14.042493592 +0300 +@@ -37,7 +37,7 @@ EXPORT_SYMBOL(generic_fillattr); @@ -453,7 +496,7 @@ Index: linux-2.6.4-51.0/fs/stat.c { struct inode *inode = dentry->d_inode; int retval; -@@ -45,6 +45,8 @@ +@@ -46,6 +46,8 @@ if (retval) return retval; @@ -462,7 +505,7 @@ Index: linux-2.6.4-51.0/fs/stat.c if (inode->i_op->getattr) return inode->i_op->getattr(mnt, dentry, stat); -@@ -61,14 +63,20 @@ +@@ -62,14 +64,20 @@ EXPORT_SYMBOL(vfs_getattr); @@ -477,46 +520,51 @@ Index: linux-2.6.4-51.0/fs/stat.c int error; + intent_init(&nd.intent, IT_GETATTR); -- error = user_path_walk(name, &nd); -+ error = user_path_walk_it(name, &nd); 
- if (!error) { +- FSHOOK_BEGIN_USER_PATH_WALK(stat, error, name, nd, path, .link = false) ++ FSHOOK_BEGIN_USER_PATH_WALK_IT(stat, error, name, nd, path, .link = false) + - error = vfs_getattr(nd.mnt, nd.dentry, stat); + error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat); path_release(&nd); - } - return error; -@@ -80,10 +88,11 @@ + + FSHOOK_END_USER_WALK(stat, error, path) +@@ -83,10 +91,11 @@ { struct nameidata nd; int error; + intent_init(&nd.intent, IT_GETATTR); -- error = user_path_walk_link(name, &nd); -+ error = user_path_walk_link_it(name, &nd); - if (!error) { +- FSHOOK_BEGIN_USER_PATH_WALK_LINK(stat, error, name, nd, path, .link = true) ++ FSHOOK_BEGIN_USER_PATH_WALK_LINK_IT(stat, error, name, nd, path, .link = true) + - error = vfs_getattr(nd.mnt, nd.dentry, stat); + error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat); path_release(&nd); - } - return error; -@@ -95,9 +104,12 @@ + + FSHOOK_END_USER_WALK(stat, error, path) +@@ -99,6 +108,8 @@ + int vfs_fstat(unsigned int fd, struct kstat *stat) { - struct file *f = fget(fd); - int error = -EBADF; + int error; + struct nameidata nd; + intent_init(&nd.intent, IT_GETATTR); + FSHOOK_BEGIN(fstat, error, .fd = fd) + +@@ -106,7 +117,8 @@ + + error = -EBADF; if (f) { - error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat); + error = vfs_getattr_it(f->f_vfsmnt, f->f_dentry, &nd.intent, stat); + intent_release(&nd.intent); fput(f); } - return error; -Index: linux-2.6.4-51.0/fs/nfs/dir.c + +Index: linux-2.6.5-12.1/fs/nfs/dir.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/nfs/dir.c 2004-04-05 12:41:59.000000000 -0400 -+++ linux-2.6.4-51.0/fs/nfs/dir.c 2004-04-07 13:27:47.000000000 -0400 +--- linux-2.6.5-12.1.orig/fs/nfs/dir.c 2004-05-10 19:21:53.000000000 +0300 ++++ linux-2.6.5-12.1/fs/nfs/dir.c 2004-05-25 17:32:14.043493440 +0300 @@ -709,7 +709,7 @@ return 0; if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE)) @@ -535,10 +583,10 
@@ Index: linux-2.6.4-51.0/fs/nfs/dir.c /* * The 0 argument passed into the create function should one day -Index: linux-2.6.4-51.0/fs/inode.c +Index: linux-2.6.5-12.1/fs/inode.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/inode.c 2004-04-05 12:41:59.000000000 -0400 -+++ linux-2.6.4-51.0/fs/inode.c 2004-04-05 17:36:43.000000000 -0400 +--- linux-2.6.5-12.1.orig/fs/inode.c 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/fs/inode.c 2004-05-25 17:32:14.044493288 +0300 @@ -221,6 +221,7 @@ inodes_stat.nr_unused--; } @@ -547,11 +595,11 @@ Index: linux-2.6.4-51.0/fs/inode.c /** * clear_inode - clear an inode * @inode: inode to clear -Index: linux-2.6.4-51.0/fs/super.c +Index: linux-2.6.5-12.1/fs/super.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/super.c 2004-04-05 12:41:59.000000000 -0400 -+++ linux-2.6.4-51.0/fs/super.c 2004-04-05 17:36:43.000000000 -0400 -@@ -787,6 +787,8 @@ +--- linux-2.6.5-12.1.orig/fs/super.c 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/fs/super.c 2004-05-25 17:32:14.045493136 +0300 +@@ -789,6 +789,8 @@ return (struct vfsmount *)sb; } @@ -560,10 +608,10 @@ Index: linux-2.6.4-51.0/fs/super.c struct vfsmount *kern_mount(struct file_system_type *type) { return do_kern_mount(type->name, 0, type->name, NULL); -Index: linux-2.6.4-51.0/include/linux/dcache.h +Index: linux-2.6.5-12.1/include/linux/dcache.h =================================================================== ---- linux-2.6.4-51.0.orig/include/linux/dcache.h 2004-04-05 12:42:07.000000000 -0400 -+++ linux-2.6.4-51.0/include/linux/dcache.h 2004-04-05 17:36:43.000000000 -0400 +--- linux-2.6.5-12.1.orig/include/linux/dcache.h 2004-04-04 06:38:24.000000000 +0300 ++++ linux-2.6.5-12.1/include/linux/dcache.h 2004-05-25 17:32:14.045493136 +0300 @@ -4,6 +4,7 @@ #ifdef __KERNEL__ @@ -581,11 +629,11 @@ Index: linux-2.6.4-51.0/include/linux/dcache.h struct dentry_stat_t { int 
nr_dentry; int nr_unused; -Index: linux-2.6.4-51.0/include/linux/fs.h +Index: linux-2.6.5-12.1/include/linux/fs.h =================================================================== ---- linux-2.6.4-51.0.orig/include/linux/fs.h 2004-04-05 12:42:07.000000000 -0400 -+++ linux-2.6.4-51.0/include/linux/fs.h 2004-04-05 17:36:43.000000000 -0400 -@@ -249,6 +249,8 @@ +--- linux-2.6.5-12.1.orig/include/linux/fs.h 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/include/linux/fs.h 2004-05-25 17:32:14.046492984 +0300 +@@ -250,6 +250,8 @@ #define ATTR_ATTR_FLAG 1024 #define ATTR_KILL_SUID 2048 #define ATTR_KILL_SGID 4096 @@ -594,7 +642,7 @@ Index: linux-2.6.4-51.0/include/linux/fs.h /* * This is the Inode Attributes structure, used for notify_change(). It -@@ -422,6 +424,7 @@ +@@ -423,6 +425,7 @@ struct block_device *i_bdev; struct cdev *i_cdev; int i_cindex; @@ -602,7 +650,7 @@ Index: linux-2.6.4-51.0/include/linux/fs.h unsigned long i_dnotify_mask; /* Directory notify events */ struct dnotify_struct *i_dnotify; /* for directory notifications */ -@@ -554,6 +557,7 @@ +@@ -556,6 +559,7 @@ spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; @@ -610,7 +658,7 @@ Index: linux-2.6.4-51.0/include/linux/fs.h }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); -@@ -874,7 +878,9 @@ +@@ -886,7 +890,9 @@ void (*truncate) (struct inode *); int (*permission) (struct inode *, int, struct nameidata *); int (*setattr) (struct dentry *, struct iattr *); @@ -620,7 +668,7 @@ Index: linux-2.6.4-51.0/include/linux/fs.h int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); -@@ -1101,6 +1107,7 @@ +@@ -1114,6 +1120,7 @@ extern int unregister_filesystem(struct file_system_type *); extern struct vfsmount *kern_mount(struct file_system_type *); extern int may_umount(struct vfsmount *); @@ 
-628,7 +676,7 @@ Index: linux-2.6.4-51.0/include/linux/fs.h extern long do_mount(char *, char *, char *, unsigned long, void *); extern int vfs_statfs(struct super_block *, struct kstatfs *); -@@ -1165,6 +1172,7 @@ +@@ -1178,6 +1185,7 @@ extern int do_truncate(struct dentry *, loff_t start); extern struct file *filp_open(const char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); @@ -636,10 +684,10 @@ Index: linux-2.6.4-51.0/include/linux/fs.h extern int filp_close(struct file *, fl_owner_t id); extern char * getname(const char __user *); -Index: linux-2.6.4-51.0/include/linux/namei.h +Index: linux-2.6.5-12.1/include/linux/namei.h =================================================================== ---- linux-2.6.4-51.0.orig/include/linux/namei.h 2004-04-05 12:42:07.000000000 -0400 -+++ linux-2.6.4-51.0/include/linux/namei.h 2004-04-05 17:36:43.000000000 -0400 +--- linux-2.6.5-12.1.orig/include/linux/namei.h 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/include/linux/namei.h 2004-05-25 17:32:14.047492832 +0300 @@ -2,25 +2,55 @@ #define _LINUX_NAMEI_H @@ -717,15 +765,15 @@ Index: linux-2.6.4-51.0/include/linux/namei.h @@ -49,6 +82,12 @@ #define LOOKUP_ACCESS (0x0400) - extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); -+extern int FASTCALL(__user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)); + extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *, const char **)); ++extern int FASTCALL(__user_walk_it(const char __user *, unsigned, struct nameidata *, const char **)); +#define user_path_walk_it(name,nd) \ -+ __user_walk_it(name, LOOKUP_FOLLOW, nd) ++ __user_walk_it(name, LOOKUP_FOLLOW, nd, 0) +#define user_path_walk_link_it(name,nd) \ -+ __user_walk_it(name, 0, nd) ++ __user_walk_it(name, 0, nd, 0) +extern void intent_release(struct lookup_intent *); #define user_path_walk(name,nd) \ - __user_walk(name, LOOKUP_FOLLOW, nd) + 
__user_walk(name, LOOKUP_FOLLOW, nd, 0) #define user_path_walk_link(name,nd) \ @@ -60,7 +99,6 @@ @@ -735,11 +783,11 @@ Index: linux-2.6.4-51.0/include/linux/namei.h extern int follow_down(struct vfsmount **, struct dentry **); extern int follow_up(struct vfsmount **, struct dentry **); -Index: linux-2.6.4-51.0/kernel/exit.c +Index: linux-2.6.5-12.1/kernel/exit.c =================================================================== ---- linux-2.6.4-51.0.orig/kernel/exit.c 2004-04-05 12:42:08.000000000 -0400 -+++ linux-2.6.4-51.0/kernel/exit.c 2004-04-05 17:36:43.000000000 -0400 -@@ -259,6 +259,8 @@ +--- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/kernel/exit.c 2004-05-25 17:32:14.047492832 +0300 +@@ -260,6 +260,8 @@ write_unlock_irq(&tasklist_lock); } @@ -748,7 +796,7 @@ Index: linux-2.6.4-51.0/kernel/exit.c void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current; -@@ -428,6 +430,8 @@ +@@ -429,6 +431,8 @@ __exit_files(tsk); } @@ -757,3 +805,57 @@ Index: linux-2.6.4-51.0/kernel/exit.c static inline void __put_fs_struct(struct fs_struct *fs) { /* No need to hold fs->lock if we are killing it */ +Index: linux-2.6.5-12.1/include/linux/fshooks.h +=================================================================== +--- linux-2.6.5-12.1.orig/include/linux/fshooks.h 2004-05-10 19:21:56.000000000 +0300 ++++ linux-2.6.5-12.1/include/linux/fshooks.h 2004-05-25 17:32:14.048492680 +0300 +@@ -90,12 +90,18 @@ + + #define FSHOOK_BEGIN_USER_WALK(type, err, path, flags, nd, field, args...) \ + FSHOOK_BEGIN_USER_WALK_COMMON(type, err, __user_walk(path, flags, &nd, &info.field), nd, args) ++#define FSHOOK_BEGIN_USER_WALK_IT(type, err, path, flags, nd, field, args...) \ ++ FSHOOK_BEGIN_USER_WALK_COMMON(type, err, __user_walk_it(path, flags, &nd, &info.field), nd, args) + + #define FSHOOK_BEGIN_USER_PATH_WALK(type, err, path, nd, field, args...) 
\ + FSHOOK_BEGIN_USER_WALK_COMMON(type, err, __user_walk(path, LOOKUP_FOLLOW, &nd, &info.field), nd, args) ++#define FSHOOK_BEGIN_USER_PATH_WALK_IT(type, err, path, nd, field, args...) \ ++ FSHOOK_BEGIN_USER_WALK_COMMON(type, err, __user_walk_it(path, LOOKUP_FOLLOW, &nd, &info.field), nd, args) + + #define FSHOOK_BEGIN_USER_PATH_WALK_LINK(type, err, path, nd, field, args...) \ + FSHOOK_BEGIN_USER_WALK_COMMON(type, err, __user_walk(path, 0, &nd, &info.field), nd, args) ++#define FSHOOK_BEGIN_USER_PATH_WALK_LINK_IT(type, err, path, nd, field, args...) \ ++ FSHOOK_BEGIN_USER_WALK_COMMON(type, err, __user_walk_it(path, 0, &nd, &info.field), nd, args) + + #define FSHOOK_END_USER_WALK(type, err, field) \ + (void)(&info != (struct fshook_##type##_info *)-1L); \ +@@ -126,12 +132,18 @@ + + #define FSHOOK_BEGIN_USER_WALK(type, err, path, flags, nd, field, args...) \ + if (!(err = __user_walk(path, flags, &nd, 0))) { ++#define FSHOOK_BEGIN_USER_WALK_IT(type, err, path, flags, nd, field, args...) \ ++ if (!(err = __user_walk_it(path, flags, &nd, 0))) { + + #define FSHOOK_BEGIN_USER_PATH_WALK(type, err, path, nd, field, args...) \ + if (!(err = user_path_walk(path, &nd))) { ++#define FSHOOK_BEGIN_USER_PATH_WALK_IT(type, err, path, nd, field, args...) \ ++ if (!(err = user_path_walk_it(path, &nd))) { + + #define FSHOOK_BEGIN_USER_PATH_WALK_LINK(type, err, path, nd, field, args...) \ + if (!(err = user_path_walk_link(path, &nd))) { ++#define FSHOOK_BEGIN_USER_PATH_WALK_LINK_IT(type, err, path, nd, field, args...) 
\ ++ if (!(err = user_path_walk_link_it(path, &nd))) { + + #define FSHOOK_END_USER_WALK(type, err, field) ((void)0);} + +Index: linux-2.6.5-12.1/fs/block_dev.c +=================================================================== +--- linux-2.6.5-12.1.orig/fs/block_dev.c 2004-05-10 19:21:55.000000000 +0300 ++++ linux-2.6.5-12.1/fs/block_dev.c 2004-05-25 17:32:39.517620784 +0300 +@@ -834,6 +834,7 @@ + if (!path || !*path) + return ERR_PTR(-EINVAL); + ++ intent_init(&nd.intent, IT_LOOKUP); + error = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (error) + return ERR_PTR(error); diff --git a/lustre/kernel_patches/patches/vfs_nointent-2.6-suse.patch b/lustre/kernel_patches/patches/vfs_nointent-2.6-suse.patch index 2bd3c6d..934dd77 100644 --- a/lustre/kernel_patches/patches/vfs_nointent-2.6-suse.patch +++ b/lustre/kernel_patches/patches/vfs_nointent-2.6-suse.patch @@ -2,11 +2,11 @@ .old..........pc/vfs_nointent_2.6.0-suse/fs/namei.c .new.........fs/namei.c -Index: linux-2.6.4-51.0/fs/namei.c +Index: linux-2.6.5-12.1/fs/namei.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/namei.c 2004-04-05 17:36:42.000000000 -0400 -+++ linux-2.6.4-51.0/fs/namei.c 2004-04-05 17:36:43.000000000 -0400 -@@ -1276,7 +1276,7 @@ +--- linux-2.6.5-12.1.orig/fs/namei.c 2004-05-11 15:41:54.000000000 -0400 ++++ linux-2.6.5-12.1/fs/namei.c 2004-05-11 15:42:00.000000000 -0400 +@@ -1292,7 +1292,7 @@ if (!error) { DQUOT_INIT(inode); @@ -15,7 +15,7 @@ Index: linux-2.6.4-51.0/fs/namei.c } put_write_access(inode); if (error) -@@ -1526,6 +1526,7 @@ +@@ -1542,6 +1542,7 @@ char * tmp; struct dentry * dentry; struct nameidata nd; @@ -23,7 +23,7 @@ Index: linux-2.6.4-51.0/fs/namei.c if (S_ISDIR(mode)) return -EPERM; -@@ -1536,6 +1537,15 @@ +@@ -1554,6 +1555,15 @@ error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; @@ -39,7 +39,7 @@ Index: linux-2.6.4-51.0/fs/namei.c dentry = lookup_create(&nd, 0); error = PTR_ERR(dentry); -@@ -1562,6 +1572,7 @@ +@@ 
-1580,6 +1590,7 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); @@ -47,8 +47,8 @@ Index: linux-2.6.4-51.0/fs/namei.c path_release(&nd); out: putname(tmp); -@@ -1603,10 +1614,18 @@ - if (!IS_ERR(tmp)) { +@@ -1626,10 +1637,18 @@ + struct dentry *dentry; struct nameidata nd; + intent_init(&nd.intent, IT_LOOKUP); @@ -66,15 +66,15 @@ Index: linux-2.6.4-51.0/fs/namei.c dentry = lookup_create(&nd, 1); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { -@@ -1616,6 +1635,7 @@ +@@ -1639,6 +1658,7 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); +out2: path_release(&nd); out: - putname(tmp); -@@ -1696,6 +1716,7 @@ + +@@ -1722,6 +1742,7 @@ char * name; struct dentry *dentry; struct nameidata nd; @@ -82,7 +82,7 @@ Index: linux-2.6.4-51.0/fs/namei.c name = getname(pathname); if(IS_ERR(name)) -@@ -1716,6 +1737,14 @@ +@@ -1744,6 +1765,14 @@ error = -EBUSY; goto exit1; } @@ -97,7 +97,7 @@ Index: linux-2.6.4-51.0/fs/namei.c down(&nd.dentry->d_inode->i_sem); dentry = lookup_hash(&nd.last, nd.dentry); error = PTR_ERR(dentry); -@@ -1774,6 +1805,7 @@ +@@ -1805,6 +1834,7 @@ struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; @@ -105,7 +105,7 @@ Index: linux-2.6.4-51.0/fs/namei.c name = getname(pathname); if(IS_ERR(name)) -@@ -1785,6 +1817,13 @@ +@@ -1818,6 +1848,13 @@ error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; @@ -119,8 +119,8 @@ Index: linux-2.6.4-51.0/fs/namei.c down(&nd.dentry->d_inode->i_sem); dentry = lookup_hash(&nd.last, nd.dentry); error = PTR_ERR(dentry); -@@ -1852,10 +1891,18 @@ - if (!IS_ERR(to)) { +@@ -1891,10 +1928,18 @@ + struct dentry *dentry; struct nameidata nd; + intent_init(&nd.intent, IT_LOOKUP); @@ -138,15 +138,15 @@ Index: linux-2.6.4-51.0/fs/namei.c dentry = lookup_create(&nd, 0); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { -@@ -1863,6 +1910,7 @@ +@@ -1902,6 +1947,7 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); +out2: path_release(&nd); out: - putname(to); -@@ -1926,6 +1974,8 @@ + +@@ -1968,6 +2014,8 @@ struct 
nameidata nd, old_nd; int error; char * to; @@ -155,7 +155,7 @@ Index: linux-2.6.4-51.0/fs/namei.c to = getname(newname); if (IS_ERR(to)) -@@ -1940,6 +1990,13 @@ +@@ -1986,6 +2034,13 @@ error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; @@ -169,7 +169,7 @@ Index: linux-2.6.4-51.0/fs/namei.c new_dentry = lookup_create(&nd, 0); error = PTR_ERR(new_dentry); if (!IS_ERR(new_dentry)) { -@@ -1990,7 +2047,7 @@ +@@ -2038,7 +2093,7 @@ * locking]. */ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -178,7 +178,7 @@ Index: linux-2.6.4-51.0/fs/namei.c { int error = 0; struct inode *target; -@@ -2035,7 +2092,7 @@ +@@ -2083,7 +2138,7 @@ } int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, @@ -187,7 +187,7 @@ Index: linux-2.6.4-51.0/fs/namei.c { struct inode *target; int error; -@@ -2112,6 +2169,8 @@ +@@ -2160,6 +2215,8 @@ struct dentry * old_dentry, *new_dentry; struct dentry * trap; struct nameidata oldnd, newnd; @@ -196,7 +196,7 @@ Index: linux-2.6.4-51.0/fs/namei.c error = path_lookup(oldname, LOOKUP_PARENT, &oldnd); if (error) -@@ -2134,6 +2193,13 @@ +@@ -2182,6 +2239,13 @@ if (newnd.last_type != LAST_NORM) goto exit2; @@ -210,7 +210,7 @@ Index: linux-2.6.4-51.0/fs/namei.c trap = lock_rename(new_dir, old_dir); old_dentry = lookup_hash(&oldnd.last, old_dir); -@@ -2165,8 +2231,7 @@ +@@ -2213,8 +2277,7 @@ if (new_dentry == trap) goto exit5; @@ -220,11 +220,11 @@ Index: linux-2.6.4-51.0/fs/namei.c exit5: dput(new_dentry); exit4: -Index: linux-2.6.4-51.0/fs/open.c +Index: linux-2.6.5-12.1/fs/open.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/open.c 2004-04-05 17:36:42.000000000 -0400 -+++ linux-2.6.4-51.0/fs/open.c 2004-04-06 01:37:39.000000000 -0400 -@@ -187,9 +187,10 @@ +--- linux-2.6.5-12.1.orig/fs/open.c 2004-05-11 15:41:54.000000000 -0400 ++++ linux-2.6.5-12.1/fs/open.c 2004-05-11 16:07:02.000000000 -0400 +@@ -203,9 +203,10 @@ return error; } @@ -236,7 +236,7 @@ Index: 
linux-2.6.4-51.0/fs/open.c struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ -@@ -200,7 +201,14 @@ +@@ -216,7 +217,14 @@ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; down(&dentry->d_inode->i_sem); down_write(&dentry->d_inode->i_alloc_sem); @@ -252,7 +252,7 @@ Index: linux-2.6.4-51.0/fs/open.c up_write(&dentry->d_inode->i_alloc_sem); up(&dentry->d_inode->i_sem); return err; -@@ -256,7 +264,7 @@ +@@ -271,7 +279,7 @@ error = locks_verify_truncate(inode, NULL, length); if (!error) { DQUOT_INIT(inode); @@ -261,7 +261,7 @@ Index: linux-2.6.4-51.0/fs/open.c } put_write_access(inode); -@@ -308,7 +316,7 @@ +@@ -328,7 +336,7 @@ error = locks_verify_truncate(inode, file, length); if (!error) @@ -270,30 +270,7 @@ Index: linux-2.6.4-51.0/fs/open.c out_putf: fput(file); out: -@@ -387,9 +395,19 @@ - (error = permission(inode,MAY_WRITE,&nd)) != 0) - goto dput_and_out; - } -- down(&inode->i_sem); -- error = notify_change(nd.dentry, &newattrs); -- up(&inode->i_sem); -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto dput_and_out; -+ } else { -+ down(&inode->i_sem); -+ error = notify_change(nd.dentry, &newattrs); -+ up(&inode->i_sem); -+ } - dput_and_out: - path_release(&nd); - out: -@@ -440,9 +458,19 @@ +@@ -402,9 +410,19 @@ (error = permission(inode,MAY_WRITE,&nd)) != 0) goto dput_and_out; } @@ -315,8 +292,8 @@ Index: linux-2.6.4-51.0/fs/open.c + } dput_and_out: path_release(&nd); - out: -@@ -592,36 +620,52 @@ + +@@ -613,39 +631,55 @@ return error; } @@ -326,11 +303,14 @@ Index: linux-2.6.4-51.0/fs/open.c - struct inode * inode; - struct dentry * dentry; - struct file * file; -- int err = -EBADF; +- int err; + struct inode * inode = dentry->d_inode; struct iattr newattrs; + int error = -EROFS; +- FSHOOK_BEGIN(fchmod, 
err, .fd = fd, .mode = mode) +- +- err = -EBADF; - file = fget(fd); - if (!file) + if (IS_RDONLY(inode)) @@ -338,18 +318,19 @@ Index: linux-2.6.4-51.0/fs/open.c + + if (inode->i_op->setattr_raw) { + struct inode_operations *op = dentry->d_inode->i_op; - -- dentry = file->f_dentry; -- inode = dentry->d_inode; ++ + newattrs.ia_mode = mode; + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + newattrs.ia_valid |= ATTR_RAW; + error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ ++ /* the file system wants to use the normal vfs path now */ + if (error != -EOPNOTSUPP) + goto out; + } +- dentry = file->f_dentry; +- inode = dentry->d_inode; +- - err = -EROFS; - if (IS_RDONLY(inode)) - goto out_putf; @@ -367,16 +348,19 @@ Index: linux-2.6.4-51.0/fs/open.c - err = notify_change(dentry, &newattrs); + error = notify_change(dentry, &newattrs); up(&inode->i_sem); + +-out_putf: +out: + return error; +} - --out_putf: ++ +asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) +{ + struct file * file; + int err = -EBADF; + ++ FSHOOK_BEGIN(fchmod, err, .fd = fd, .mode = mode) ++ + file = fget(fd); + if (!file) + goto out; @@ -384,8 +368,8 @@ Index: linux-2.6.4-51.0/fs/open.c + err = chmod_common(file->f_dentry, mode); fput(file); out: - return err; -@@ -630,32 +674,13 @@ + +@@ -657,9 +691,7 @@ asmlinkage long sys_chmod(const char __user * filename, mode_t mode) { struct nameidata nd; @@ -393,9 +377,12 @@ Index: linux-2.6.4-51.0/fs/open.c int error; - struct iattr newattrs; - error = user_path_walk(filename, &nd); - if (error) - goto out; + FSHOOK_BEGIN_USER_PATH_WALK(chmod, + error, +@@ -669,25 +701,7 @@ + .mode = mode, + .link = false) + - inode = nd.dentry->d_inode; - - error = -EROFS; @@ -413,13 +400,13 @@ Index: linux-2.6.4-51.0/fs/open.c - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(nd.dentry, &newattrs); - up(&inode->i_sem); - +- -dput_and_out: + error = chmod_common(nd.dentry, mode); path_release(&nd); - out: 
- return error; -@@ -676,6 +701,18 @@ + + FSHOOK_END_USER_WALK(chmod, error, path) +@@ -710,6 +724,18 @@ if (IS_RDONLY(inode)) goto out; error = -EPERM; @@ -438,7 +425,7 @@ Index: linux-2.6.4-51.0/fs/open.c if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto out; newattrs.ia_valid = ATTR_CTIME; -@@ -689,6 +726,7 @@ +@@ -723,6 +749,7 @@ } if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID; @@ -446,11 +433,11 @@ Index: linux-2.6.4-51.0/fs/open.c down(&inode->i_sem); error = notify_change(dentry, &newattrs); up(&inode->i_sem); -Index: linux-2.6.4-51.0/fs/exec.c +Index: linux-2.6.5-12.1/fs/exec.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/exec.c 2004-04-05 17:36:42.000000000 -0400 -+++ linux-2.6.4-51.0/fs/exec.c 2004-04-05 17:36:43.000000000 -0400 -@@ -1418,7 +1418,7 @@ +--- linux-2.6.5-12.1.orig/fs/exec.c 2004-05-11 15:41:54.000000000 -0400 ++++ linux-2.6.5-12.1/fs/exec.c 2004-05-11 15:42:00.000000000 -0400 +@@ -1435,7 +1435,7 @@ goto close_fail; if (!file->f_op->write) goto close_fail; @@ -459,11 +446,11 @@ Index: linux-2.6.4-51.0/fs/exec.c goto close_fail; retval = binfmt->core_dump(signr, regs, file); -Index: linux-2.6.4-51.0/include/linux/fs.h +Index: linux-2.6.5-12.1/include/linux/fs.h =================================================================== ---- linux-2.6.4-51.0.orig/include/linux/fs.h 2004-04-05 17:36:43.000000000 -0400 -+++ linux-2.6.4-51.0/include/linux/fs.h 2004-04-05 17:36:43.000000000 -0400 -@@ -866,13 +866,20 @@ +--- linux-2.6.5-12.1.orig/include/linux/fs.h 2004-05-11 15:41:54.000000000 -0400 ++++ linux-2.6.5-12.1/include/linux/fs.h 2004-05-11 15:42:00.000000000 -0400 +@@ -878,13 +878,20 @@ int (*create) (struct inode *,struct dentry *,int, struct nameidata *); struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); int (*link) (struct dentry *,struct inode *,struct dentry *); @@ -484,7 +471,7 @@ Index: linux-2.6.4-51.0/include/linux/fs.h 
int (*readlink) (struct dentry *, char __user *,int); int (*follow_link) (struct dentry *, struct nameidata *); void (*truncate) (struct inode *); -@@ -1169,7 +1176,7 @@ +@@ -1182,7 +1189,7 @@ /* fs/open.c */ @@ -493,10 +480,10 @@ Index: linux-2.6.4-51.0/include/linux/fs.h extern struct file *filp_open(const char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *); -Index: linux-2.6.4-51.0/net/unix/af_unix.c +Index: linux-2.6.5-12.1/net/unix/af_unix.c =================================================================== ---- linux-2.6.4-51.0.orig/net/unix/af_unix.c 2004-04-05 12:42:07.000000000 -0400 -+++ linux-2.6.4-51.0/net/unix/af_unix.c 2004-04-05 17:36:43.000000000 -0400 +--- linux-2.6.5-12.1.orig/net/unix/af_unix.c 2004-04-03 22:37:36.000000000 -0500 ++++ linux-2.6.5-12.1/net/unix/af_unix.c 2004-05-11 15:42:00.000000000 -0400 @@ -676,6 +676,7 @@ int err = 0; diff --git a/lustre/kernel_patches/series/2.6-suse.series b/lustre/kernel_patches/series/2.6-suse.series index 15af341..c2c88f6 100644 --- a/lustre/kernel_patches/series/2.6-suse.series +++ b/lustre/kernel_patches/series/2.6-suse.series @@ -11,3 +11,4 @@ removepage-2.6-suse.patch dev_read_only-2.6-suse.patch export-2.6-suse.patch header-guards-2.6-suse.patch +md_path_lookup-2.6-suse.patch diff --git a/lustre/kernel_patches/series/ldiskfs-2.6-suse.series b/lustre/kernel_patches/series/ldiskfs-2.6-suse.series index cff99dd..d27088e 100644 --- a/lustre/kernel_patches/series/ldiskfs-2.6-suse.series +++ b/lustre/kernel_patches/series/ldiskfs-2.6-suse.series @@ -7,3 +7,4 @@ ext3-init-generation-2.6-suse.patch ext3-ea-in-inode-2.6-suse.patch export-ext3-2.6-suse.patch ext3-include-fixes-2.6-suse.patch +ext3-htree-rename_fix.patch diff --git a/lustre/kernel_patches/series/suse-2.4.21-2 b/lustre/kernel_patches/series/suse-2.4.21-2 index f4e7175..03e0db2 100644 --- 
a/lustre/kernel_patches/series/suse-2.4.21-2 +++ b/lustre/kernel_patches/series/suse-2.4.21-2 @@ -30,3 +30,4 @@ ext3-xattr-ptr-arith-fix.patch kernel_text_address-2.4.20-vanilla.patch procfs-ndynamic-2.4.21-suse2.patch ext3-truncate-buffer-head.patch +loop-sync-2.4.21-suse.patch diff --git a/lustre/kernel_patches/targets/2.6-suse.target b/lustre/kernel_patches/targets/2.6-suse.target index ef3b1ae..d8b192b 100644 --- a/lustre/kernel_patches/targets/2.6-suse.target +++ b/lustre/kernel_patches/targets/2.6-suse.target @@ -1,7 +1,7 @@ -KERNEL=linux-2.6.4-51.8.tar.gz +KERNEL=linux-2.6.5-12.1.tar.gz SERIES=2.6-suse -VERSION=2.6.4 -EXTRA_VERSION=51.8_lustre +VERSION=2.6.5 +EXTRA_VERSION=12.1_lustre RHBUILD=0 BASE_ARCHS="" diff --git a/lustre/ldiskfs/autoMakefile.am b/lustre/ldiskfs/autoMakefile.am index b24081e..11838d6 100644 --- a/lustre/ldiskfs/autoMakefile.am +++ b/lustre/ldiskfs/autoMakefile.am @@ -1,6 +1,8 @@ +if MODULES if LDISKFS modulefs_DATA = ldiskfs$(KMODEXT) endif +endif ldiskfs_linux_headers := $(addprefix linux/,$(subst ext3,ldiskfs,$(notdir $(linux_headers)))) diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index cfd1c8c..cfaefc5 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -892,8 +892,13 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, if (lock->l_granted_mode == LCK_PW && !lock->l_readers && !lock->l_writers && time_after(jiffies, lock->l_last_used + 10 * HZ)) { +#ifdef __KERNEL__ + ldlm_bl_to_thread(ns, NULL, lock); + l_unlock(&ns->ns_lock); +#else l_unlock(&ns->ns_lock); ldlm_handle_bl_callback(ns, NULL, lock); +#endif EXIT; return; } diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 7e75089..787d921 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -284,6 +284,9 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, lock = list_entry(tmp, struct ldlm_lock, l_res_link); LDLM_LOCK_GET(lock); + /* Set CBPENDING so 
nothing in the cancellation path + * can match this lock */ + lock->l_flags |= LDLM_FL_CBPENDING; lock->l_flags |= LDLM_FL_FAILED; lock->l_flags |= flags; @@ -292,7 +295,6 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, * alternative: pretend that we got a blocking AST from * the server, so that when the lock is decref'd, it * will go away ... */ - lock->l_flags |= LDLM_FL_CBPENDING; /* ... without sending a CANCEL message. */ lock->l_flags |= LDLM_FL_LOCAL_ONLY; LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY"); diff --git a/lustre/liblustre/tests/Makefile.am b/lustre/liblustre/tests/Makefile.am index ff73edf..278567e 100644 --- a/lustre/liblustre/tests/Makefile.am +++ b/lustre/liblustre/tests/Makefile.am @@ -46,8 +46,8 @@ replay_ost_single_DEPENDENCIES = $(top_builddir)/liblustre/liblustre.a if MPITESTS test_lock_cancel_SOURCES = test_lock_cancel.c -test_lock_cancel_CFLAGS = $(LL_CFLAGS) -I/opt/lam/include -L/opt/lam/lib -test_lock_cancel_LDADD := $(LLIB_EXEC) -lmpi -llam +test_lock_cancel_CFLAGS = $(LL_CFLAGS) -I/opt/lam/include +test_lock_cancel_LDADD := $(LLIB_EXEC) -L/opt/lam/lib -lmpi -llam endif diff --git a/lustre/llite/file.c b/lustre/llite/file.c index aa00caf..84b8f2f 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -426,11 +426,11 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, tmpex.l_extent.end = tmpex.l_extent.start + PAGE_CACHE_SIZE - 1; /* check to see if another DLM lock covers this page */ - ldlm_lock2handle(lock, &lockh); - rc2 = ldlm_lock_match(NULL, + rc2 = ldlm_lock_match(lock->l_resource->lr_namespace, LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK, - NULL, 0, &tmpex, 0, &lockh); + &lock->l_resource->lr_name, LDLM_EXTENT, + &tmpex, LCK_PR | LCK_PW, &lockh); if (rc2 == 0 && page->mapping != NULL) { // checking again to account for writeback's lock_page() LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n"); diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c 
index 2c29286..1190e4c 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -338,24 +338,20 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched, } uuid = &watched->u.cli.cl_import->imp_target_uuid; - /* - * Must notify (MDS) before we mark the OSC as active, so that - * the orphan deletion happens without interference from racing - * creates. + /* Set OSC as active before notifying the observer, so the + * observer can use the OSC normally. */ - if (obd->obd_observer) { - /* Pass the notification up the chain. */ - rc = obd_notify(obd->obd_observer, watched, active); - if (rc) - RETURN(rc); - } - rc = lov_set_osc_active(&obd->u.lov, uuid, active); - if (rc) { CERROR("%sactivation of %s failed: %d\n", active ? "" : "de", uuid->uuid, rc); + RETURN(rc); } + + if (obd->obd_observer) + /* Pass the notification up the chain. */ + rc = obd_notify(obd->obd_observer, watched, active); + RETURN(rc); } @@ -936,7 +932,7 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa, err = obd_destroy(lov->tgts[loi->loi_ost_idx].ltd_exp, &tmp, NULL, oti); if (err && lov->tgts[loi->loi_ost_idx].active) { - CERROR("error: destroying objid "LPX64" subobj " + CDEBUG(D_INODE, "error: destroying objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", oa->o_id, loi->loi_id, loi->loi_ost_idx, err); if (!rc) diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 045daa3..f552198 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -2110,33 +2110,59 @@ static int mds_postsetup(struct obd_device *obd) err_cleanup: mds_lov_clean(obd); err_llog: - obd_llog_cleanup(llog_get_context(&obd->obd_llogs, LLOG_CONFIG_ORIG_CTXT)); + obd_llog_cleanup(llog_get_context(&obd->obd_llogs, + LLOG_CONFIG_ORIG_CTXT)); RETURN(rc); } -static int mds_postrecov(struct obd_device *obd) - +int mds_postrecov(struct obd_device *obd) { + struct mds_obd *mds = &obd->u.mds; struct llog_ctxt *ctxt; - int rc, rc2; + int rc, item = 0; ENTRY; 
LASSERT(!obd->obd_recovering); ctxt = llog_get_context(&obd->obd_llogs, LLOG_UNLINK_ORIG_CTXT); LASSERT(ctxt != NULL); + /* set nextid first, so we are sure it happens */ + rc = mds_lov_set_nextid(obd); + if (rc) { + CERROR("%s: mds_lov_set_nextid failed\n", obd->obd_name); + GOTO(out, rc); + } + + /* clean PENDING dir */ + rc = mds_cleanup_orphans(obd); + if (rc < 0) + GOTO(out, rc); + item = rc; + rc = llog_connect(ctxt, obd->u.mds.mds_lov_desc.ld_tgt_count, NULL, NULL, NULL); - if (rc != 0) { - CERROR("faild at llog_origin_connect: %d\n", rc); + if (rc) { + CERROR("%s: failed at llog_origin_connect: %d\n", + obd->obd_name, rc); + GOTO(out, rc); } - rc = mds_cleanup_orphans(obd); + /* remove the orphaned precreated objects */ + rc = mds_lov_clearorphans(mds, NULL /* all OSTs */); + if (rc) + GOTO(err_llog, rc); + +out: + RETURN(rc < 0 ? rc : item); - rc2 = mds_lov_set_nextid(obd); - if (rc2 == 0) - rc2 = rc; - RETURN(rc2); +err_llog: + /* cleanup all llogging subsystems */ + rc = obd_llog_finish(obd, &obd->obd_llogs, + mds->mds_lov_desc.ld_tgt_count); + if (rc) + CERROR("%s: failed to cleanup llogging subsystems\n", + obd->obd_name); + goto out; } int mds_lov_clean(struct obd_device *obd) diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index 253ab59..dddd484 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -77,6 +77,7 @@ int mds_lov_write_objids(struct obd_device *obd); void mds_lov_update_objids(struct obd_device *obd, obd_id *ids); int mds_lov_set_growth(struct mds_obd *mds, int count); int mds_lov_set_nextid(struct obd_device *obd); +int mds_lov_clearorphans(struct mds_obd *mds, struct obd_uuid *ost_uuid); int mds_post_mds_lovconf(struct obd_device *obd); int mds_notify(struct obd_device *obd, struct obd_device *watched, int active); int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode, @@ -108,6 +109,7 @@ int mds_lov_clean(struct obd_device *obd); extern int mds_iocontrol(unsigned int cmd, struct 
obd_export *exp, int len, void *karg, void *uarg); extern int mds_lock_mode_for_dir(struct obd_device *, struct dentry *, int); +int mds_postrecov(struct obd_device *obd); #ifdef __KERNEL__ int mds_get_md(struct obd_device *, struct inode *, void *md, int *size, diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index e4ab36d..82e1b05 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -117,7 +117,7 @@ int mds_lov_write_objids(struct obd_device *obd) RETURN(rc); } -static int mds_lov_clearorphans(struct mds_obd *mds, struct obd_uuid *ost_uuid) +int mds_lov_clearorphans(struct mds_obd *mds, struct obd_uuid *ost_uuid) { int rc; struct obdo oa; @@ -156,12 +156,6 @@ int mds_lov_set_nextid(struct obd_device *obd) rc = obd_set_info(mds->mds_osc_exp, strlen("next_id"), "next_id", mds->mds_lov_desc.ld_tgt_count, mds->mds_lov_objids); - if (rc < 0) - GOTO(out, rc); - - rc = mds_lov_clearorphans(mds, NULL /* all OSTs */); - -out: RETURN(rc); } @@ -272,30 +266,10 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) * set_nextid(). The class driver can help us here, because * it can use the obd_recovering flag to determine when the * the OBD is full available. 
*/ - if (!obd->obd_recovering) { - struct llog_ctxt *ctxt; - ctxt = llog_get_context(&obd->obd_llogs, LLOG_UNLINK_ORIG_CTXT); - rc = llog_connect(ctxt, obd->u.mds.mds_lov_desc.ld_tgt_count, - NULL, NULL, NULL); - if (rc != 0) - CERROR("faild at llog_origin_connect: %d\n", rc); - - rc = mds_cleanup_orphans(obd); - if (rc > 0) - CERROR("Cleanup %d orphans while MDS isn't recovering\n", rc); - - rc = mds_lov_set_nextid(obd); - if (rc) - GOTO(err_llog, rc); - } + if (!obd->obd_recovering) + rc = mds_postrecov(obd); RETURN(rc); -err_llog: - /* cleanup all llogging subsystems */ - rc = obd_llog_finish(obd, &obd->obd_llogs, - mds->mds_lov_desc.ld_tgt_count); - if (rc) - CERROR("failed to cleanup llogging subsystems\n"); err_reg: obd_register_observer(mds->mds_osc_obd, NULL); err_discon: @@ -520,6 +494,92 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, RETURN(-EINVAL); } RETURN(0); + +} + +struct mds_lov_sync_info { + struct obd_device *mlsi_obd; /* the lov device to sync */ + struct obd_uuid *mlsi_uuid; /* target to sync */ +}; + +int mds_lov_synchronize(void *data) +{ + struct mds_lov_sync_info *mlsi = data; + struct llog_ctxt *ctxt; + struct obd_device *obd; + struct obd_uuid *uuid; + unsigned long flags; + int rc; + + lock_kernel(); + ptlrpc_daemonize(); + + SIGNAL_MASK_LOCK(current, flags); + sigfillset(&current->blocked); + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); + + obd = mlsi->mlsi_obd; + uuid = mlsi->mlsi_uuid; + + OBD_FREE(mlsi, sizeof(*mlsi)); + + LASSERT(obd != NULL); + LASSERT(uuid != NULL); + + rc = obd_set_info(obd->u.mds.mds_osc_exp, strlen("mds_conn"), + "mds_conn", 0, uuid); + if (rc != 0) + RETURN(rc); + + ctxt = llog_get_context(&obd->obd_llogs, LLOG_UNLINK_ORIG_CTXT); + LASSERT(ctxt != NULL); + + rc = llog_connect(ctxt, obd->u.mds.mds_lov_desc.ld_tgt_count, + NULL, NULL, uuid); + if (rc != 0) { + CERROR("%s: failed at llog_origin_connect: %d\n", + obd->obd_name, rc); + RETURN(rc); + } + + CWARN("MDS %s: %s now active,
resetting orphans\n", + obd->obd_name, uuid->uuid); + rc = mds_lov_clearorphans(&obd->u.mds, uuid); + if (rc != 0) { + CERROR("%s: failed at mds_lov_clearorphans: %d\n", + obd->obd_name, rc); + RETURN(rc); + } + + RETURN(0); +} + +int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid) +{ + struct mds_lov_sync_info *mlsi; + int rc; + + ENTRY; + + OBD_ALLOC(mlsi, sizeof(*mlsi)); + if (mlsi == NULL) + RETURN(-ENOMEM); + + mlsi->mlsi_obd = obd; + mlsi->mlsi_uuid = uuid; + + rc = kernel_thread(mds_lov_synchronize, mlsi, CLONE_VM | CLONE_FILES); + if (rc < 0) + CERROR("%s: error starting mds_lov_synchronize: %d\n", + obd->obd_name, rc); + else { + CDEBUG(D_HA, "%s: mds_lov_synchronize thread: %d\n", + obd->obd_name, rc); + rc = 0; + } + + RETURN(rc); } int mds_notify(struct obd_device *obd, struct obd_device *watched, int active) @@ -542,27 +602,7 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched, int active) CWARN("MDS %s: in recovery, not resetting orphans on %s\n", obd->obd_name, uuid->uuid); } else { - struct llog_ctxt *ctxt; - - ctxt = llog_get_context(&obd->obd_llogs, LLOG_UNLINK_ORIG_CTXT); - LASSERT(ctxt != NULL); - - rc = obd_set_info(obd->u.mds.mds_osc_exp, strlen("mds_conn"), - "mds_conn", 0, uuid); - if (rc != 0) - RETURN(rc); - - ctxt = llog_get_context(&obd->obd_llogs, LLOG_UNLINK_ORIG_CTXT); - rc = llog_connect(ctxt, obd->u.mds.mds_lov_desc.ld_tgt_count, - NULL, NULL, uuid); - if (rc != 0) { - CERROR("faild at llog_origin_connect: %d\n", rc); - RETURN(rc); - } - - CWARN("MDS %s: %s now active, resetting orphans\n", - obd->obd_name, uuid->uuid); - rc = mds_lov_clearorphans(&obd->u.mds, uuid); + rc = mds_lov_start_synchronize(obd, uuid); } RETURN(rc); } diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index 709f67c..bc82a93 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -1178,6 +1178,12 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, int stripe_count = 0; 
LASSERT(rc == 0); /* mds_put_write_access must have succeeded */ + if (obd->obd_recovering) { + CDEBUG(D_HA, "not remove orphan %s until recovery" + " is over\n", fidname); + GOTO(out, rc); + } + CDEBUG(D_HA, "destroying orphan object %s\n", fidname); /* Sadly, there is no easy way to save pending_child from diff --git a/lustre/mds/mds_unlink_open.c b/lustre/mds/mds_unlink_open.c index b0cc7ec..951c09f 100644 --- a/lustre/mds/mds_unlink_open.c +++ b/lustre/mds/mds_unlink_open.c @@ -149,7 +149,7 @@ static int mds_osc_destroy_orphan(struct mds_obd *mds, rc = obd_destroy(mds->mds_osc_exp, oa, lsm, &oti); obdo_free(oa); if (rc) - CERROR("destroy orphan objid 0x"LPX64" on ost error " + CDEBUG(D_INODE, "destroy orphan objid 0x"LPX64" on ost error " "%d\n", lsm->lsm_object_id, rc); out_free_memmd: obd_free_memmd(mds->mds_osc_exp, &lsm); @@ -320,7 +320,7 @@ int mds_cleanup_orphans(struct obd_device *obd) item ++; CWARN("removed orphan %s from MDS and OST\n", d_name); } else { - CERROR("removed orphan %s from MDS and OST failed," + CDEBUG(D_INODE, "removed orphan %s from MDS/OST failed," " rc = %d\n", d_name, rc); rc = 0; } diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index bb2b2c4..f6a0667 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -659,7 +659,7 @@ static void cleanup_obdclass(void) * kernel patch */ #include #define LUSTRE_MIN_VERSION 32 -#define LUSTRE_MAX_VERSION 36 +#define LUSTRE_MAX_VERSION 37 #if (LUSTRE_KERNEL_VERSION < LUSTRE_MIN_VERSION) # error Cannot continue: Your Lustre kernel patch is older than the sources #elif (LUSTRE_KERNEL_VERSION > LUSTRE_MAX_VERSION) diff --git a/lustre/obdclass/simple.c b/lustre/obdclass/simple.c deleted file mode 100644 index 48cf4d23..0000000 --- a/lustre/obdclass/simple.c +++ /dev/null @@ -1,266 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002, 2003 Cluster File Systems, Inc. 
- * Author: Peter Braam - * Aurhot: Andreas Dilger - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_FILTER - -#include -#include -#include - -/* Debugging check only needed during development */ -#ifdef OBD_CTXT_DEBUG -# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC) -# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERT(!segment_eq(get_fs(), get_ds())) -# define ASSERT_KERNEL_CTXT(msg) LASSERT(segment_eq(get_fs(), get_ds())) -#else -# define ASSERT_CTXT_MAGIC(magic) do {} while(0) -# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0) -# define ASSERT_KERNEL_CTXT(msg) do {} while(0) -#endif - -/* push / pop to root of obd store */ -void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx, - struct lvfs_ucred *uc) -{ - //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n"); - ASSERT_CTXT_MAGIC(new_ctx->magic); - OBD_SET_CTXT_MAGIC(save); - - /* - CDEBUG(D_INFO, - "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%*s), pwdmnt %p:%d\n", - save, current, current->fs, current->fs->pwd, - atomic_read(&current->fs->pwd->d_count), - atomic_read(&current->fs->pwd->d_inode->i_count), - current->fs->pwd->d_name.len, current->fs->pwd->d_name.name, - current->fs->pwdmnt, - atomic_read(&current->fs->pwdmnt->mnt_count));
- */ - - save->fs = get_fs(); - LASSERT(atomic_read(&current->fs->pwd->d_count)); - LASSERT(atomic_read(&new_ctx->pwd->d_count)); - save->pwd = dget(current->fs->pwd); - save->pwdmnt = mntget(current->fs->pwdmnt); - save->ngroups = current->ngroups; - - LASSERT(save->pwd); - LASSERT(save->pwdmnt); - LASSERT(new_ctx->pwd); - LASSERT(new_ctx->pwdmnt); - - if (uc) { - save->luc.luc_fsuid = current->fsuid; - save->luc.luc_fsgid = current->fsgid; - save->luc.luc_cap = current->cap_effective; - save->luc.luc_suppgid1 = current->groups[0]; - save->luc.luc_suppgid2 = current->groups[1]; - - current->fsuid = uc->luc_fsuid; - current->fsgid = uc->luc_fsgid; - current->cap_effective = uc->luc_cap; - current->ngroups = 0; - - if (uc->luc_suppgid1 != -1) - current->groups[current->ngroups++] = uc->luc_suppgid1; - if (uc->luc_suppgid2 != -1) - current->groups[current->ngroups++] = uc->luc_suppgid2; - } - set_fs(new_ctx->fs); - set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd); - - /* - CDEBUG(D_INFO, - "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%*s), pwdmnt %p:%d\n", - new_ctx, current, current->fs, current->fs->pwd, - atomic_read(&current->fs->pwd->d_count), - atomic_read(&current->fs->pwd->d_inode->i_count), - current->fs->pwd->d_name.len, current->fs->pwd->d_name.name, - current->fs->pwdmnt, - atomic_read(&current->fs->pwdmnt->mnt_count)); - */ -} -EXPORT_SYMBOL(push_ctxt); - -void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx, - struct lvfs_ucred *uc) -{ - //printk("pc0"); - ASSERT_CTXT_MAGIC(saved->magic); - //printk("pc1"); - ASSERT_KERNEL_CTXT("popping non-kernel context!\n"); - - /* - CDEBUG(D_INFO, - " = pop %p==%p = cur %p pwd %p:d%d:i%d (%*s), pwdmnt %p:%d\n", - new_ctx, current, current->fs, current->fs->pwd, - atomic_read(&current->fs->pwd->d_count), - atomic_read(&current->fs->pwd->d_inode->i_count), - current->fs->pwd->d_name.len, current->fs->pwd->d_name.name, - current->fs->pwdmnt, - atomic_read(&current->fs->pwdmnt->mnt_count)); - */ - - LASSERT(current->fs->pwd ==
new_ctx->pwd); - LASSERT(current->fs->pwdmnt == new_ctx->pwdmnt); - - set_fs(saved->fs); - set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd); - - dput(saved->pwd); - mntput(saved->pwdmnt); - if (uc) { - current->fsuid = saved->luc.luc_fsuid; - current->fsgid = saved->luc.luc_fsgid; - current->cap_effective = saved->luc.luc_cap; - current->ngroups = saved->ngroups; - current->groups[0] = saved->luc.luc_suppgid1; - current->groups[1] = saved->luc.luc_suppgid2; - } - - /* - CDEBUG(D_INFO, - "= pop %p->%p = cur fs %p pwd %p:d%d:i%d (%*s), pwdmnt %p:%d\n", - saved, current, current->fs, current->fs->pwd, - atomic_read(&current->fs->pwd->d_count), - atomic_read(&current->fs->pwd->d_inode->i_count), - current->fs->pwd->d_name.len, current->fs->pwd->d_name.name, - current->fs->pwdmnt, - atomic_read(&current->fs->pwdmnt->mnt_count)); - */ -} -EXPORT_SYMBOL(pop_ctxt); - -/* utility to make a file */ -struct dentry *simple_mknod(struct dentry *dir, char *name, int mode) -{ - struct dentry *dchild; - int err = 0; - ENTRY; - - ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n"); - CDEBUG(D_INODE, "creating file %*s\n", (int)strlen(name), name); - - dchild = ll_lookup_one_len(name, dir, strlen(name)); - if (IS_ERR(dchild)) - GOTO(out_up, dchild); - - if (dchild->d_inode) { - if (!S_ISREG(dchild->d_inode->i_mode)) - GOTO(out_err, err = -EEXIST); - - GOTO(out_up, dchild); - } - - err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG, NULL); - if (err) - GOTO(out_err, err); - - RETURN(dchild); - -out_err: - dput(dchild); - dchild = ERR_PTR(err); -out_up: - return dchild; -} -EXPORT_SYMBOL(simple_mknod); - -/* utility to make a directory */ -struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode) -{ - struct dentry *dchild; - int err = 0; - ENTRY; - - ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n"); - CDEBUG(D_INODE, "creating directory %*s\n", (int)strlen(name), name); - dchild = ll_lookup_one_len(name, dir, strlen(name)); - if
(IS_ERR(dchild)) - GOTO(out_up, dchild); - - if (dchild->d_inode) { - if (!S_ISDIR(dchild->d_inode->i_mode)) - GOTO(out_err, err = -ENOTDIR); - - GOTO(out_up, dchild); - } - - err = vfs_mkdir(dir->d_inode, dchild, mode); - if (err) - GOTO(out_err, err); - - RETURN(dchild); - -out_err: - dput(dchild); - dchild = ERR_PTR(err); -out_up: - return dchild; -} -EXPORT_SYMBOL(simple_mkdir); - -/* - * Read a file from within kernel context. Prior to calling this - * function we should already have done a push_ctxt(). - */ -int lustre_fread(struct file *file, void *buf, int len, loff_t *off) -{ - ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n"); - if (!file || !file->f_op || !file->f_op->read || !off) - RETURN(-ENOSYS); - - return file->f_op->read(file, buf, len, off); -} -EXPORT_SYMBOL(lustre_fread); - -/* - * Write a file from within kernel context. Prior to calling this - * function we should already have done a push_ctxt(). - */ -int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off) -{ - ENTRY; - ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n"); - if (!file) - RETURN(-ENOENT); - if (!file->f_op) - RETURN(-ENOSYS); - if (!off) - RETURN(-EINVAL); - - if (!file->f_op->write) - RETURN(-EROFS); - - RETURN(file->f_op->write(file, buf, len, off)); -} -EXPORT_SYMBOL(lustre_fwrite); - diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 8014526..95f8263 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -143,6 +143,8 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); cleanup_phase = 2; + generic_osync_inode(inode, inode->i_mapping, OSYNC_DATA|OSYNC_METADATA); + oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res, oti); @@ -185,7 +187,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, offs = k * inode->i_sb->s_blocksize; if (!bio || !can_be_merged(bio, 
sector) || - !bio_add_page(bio, lnb->page, lnb->len, offs)) { + !bio_add_page(bio, lnb->page, PAGE_SIZE, offs)) { if (bio) { atomic_inc(&dreq->numreqs); submit_bio(WRITE, bio); @@ -198,7 +200,8 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, bio->bi_end_io = dio_complete_routine; bio->bi_private = dreq; - if (!bio_add_page(bio, lnb->page, lnb->len, 0)) + if (!bio_add_page(bio, lnb->page, PAGE_SIZE, + offs)) LBUG(); } } @@ -210,7 +213,6 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, iattr.ia_size = this_size; } -#warning This probably needs filemap_fdatasync() like filter_io_24 (bug 2366) if (bio) { atomic_inc(&dreq->numreqs); fsfilt_send_bio(obd, inode, bio); diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index ed8ae35..a3ebcc5 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -238,30 +238,43 @@ int osc_create(struct obd_export *exp, struct obdo *oa, /* this is the special case where create removes orphans */ if ((oa->o_valid & OBD_MD_FLFLAGS) && oa->o_flags == OBD_FL_DELORPHAN) { - CDEBUG(D_HA, "%p: oscc recovery started\n", oscc); + CDEBUG(D_HA, "%s; oscc recovery started\n", + exp->exp_obd->obd_name); + LASSERT(oscc->oscc_flags & OSCC_FLAG_RECOVERING); + /* delete from next_id on up */ oa->o_valid |= OBD_MD_FLID; oa->o_id = oscc->oscc_next_id - 1; CDEBUG(D_HA, "%s: deleting to next_id: "LPU64"\n", - oscc->oscc_obd->u.cli.cl_import->imp_target_uuid.uuid, - oa->o_id); + exp->exp_obd->obd_name, oa->o_id); rc = osc_real_create(exp, oa, ea, NULL); + if (oscc->oscc_obd == NULL) { + CWARN("the obd for oscc %p has been freed\n", oscc); + RETURN(rc); + } spin_lock(&oscc->oscc_lock); - if (rc == -ENOSPC) - oscc->oscc_flags |= OSCC_FLAG_NOSPC; - oscc->oscc_flags &= ~OSCC_FLAG_RECOVERING; - oscc->oscc_last_id = oa->o_id; - wake_up(&oscc->oscc_waitq); + if (rc == 0 || rc == -ENOSPC) { + if (rc == -ENOSPC) + oscc->oscc_flags |= OSCC_FLAG_NOSPC; + oscc->oscc_flags &= ~OSCC_FLAG_RECOVERING; + 
oscc->oscc_last_id = oa->o_id; + + /* recover happen in mds_setup, before cobd_setup, so + * reset oscc_gr = 0 here, it sould be no harm to CMD */ + oscc->oscc_gr = 0; + + CDEBUG(D_HA, "%s: oscc recovery finished: %d\n", + exp->exp_obd->obd_name, rc); + wake_up(&oscc->oscc_waitq); + + } else { + CDEBUG(D_ERROR, "%s: oscc recovery failed: %d\n", + exp->exp_obd->obd_name, rc); + } spin_unlock(&oscc->oscc_lock); - - /*recover happen in mds_setup, before cobd_setup, so - *reset oscc_gr = 0 here, it sould be no harm to CMD - */ - oscc->oscc_gr = 0; - CDEBUG(D_HA, "%p: oscc recovery finished\n", oscc); RETURN(rc); } @@ -272,20 +285,20 @@ int osc_create(struct obd_export *exp, struct obdo *oa, if (oscc_recovering(oscc)) { struct l_wait_info lwi; - CDEBUG(D_HA,"%p: oscc recovery in progress, waiting\n", - oscc); + CDEBUG(D_HA,"%s: oscc sync in progress, waiting\n", + exp->exp_obd->obd_name); lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL); rc = l_wait_event(oscc->oscc_waitq, !oscc_recovering(oscc), &lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); if (rc == -ETIMEDOUT) { - CDEBUG(D_HA, "%p: timed out waiting for " - "recovery\n", oscc); + CDEBUG(D_HA, "%s: timed out waiting for sync\n", + exp->exp_obd->obd_name); RETURN(rc); } - CDEBUG(D_HA, "%p: oscc recovery over, waking up\n", - oscc); + CDEBUG(D_HA, "%s: oscc sync over, waking up\n", + exp->exp_obd->obd_name); } spin_lock(&oscc->oscc_lock); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 3a8aefe..6010c07 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -317,6 +317,8 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa, oa->o_flags == OBD_FL_DELORPHAN); DEBUG_REQ(D_HA, request, "delorphan from OST integration"); + /* Don't resend the delorphan request */ + request->rq_no_resend = request->rq_no_delay = 1; } rc = ptlrpc_queue_wait(request); @@ -2481,7 +2483,8 @@ static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm, rc = 
ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type, policy, mode, lockh); if (rc) { - osc_set_data_with_check(lockh, data); + if (!(*flags & LDLM_FL_TEST_LOCK)) + osc_set_data_with_check(lockh, data); RETURN(rc); } /* If we're trying to read, we also search for an existing PW lock. The diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 index 636ee1d..cb6e0a2 100644 --- a/lustre/portals/archdep.m4 +++ b/lustre/portals/archdep.m4 @@ -218,12 +218,13 @@ if test x$enable_modules != xno ; then fi LUSTRE_MODULE_TRY_MAKE( [#include ], - [LINUXRELEASE=UTS_RELEASE], + [char *LINUXRELEASE; + LINUXRELEASE=UTS_RELEASE;], [$makerule LUSTRE_KERNEL_TEST=conftest.i], [test -s kernel-tests/conftest.i], [ # LINUXRELEASE="UTS_RELEASE" - eval $(grep LINUXRELEASE kernel-tests/conftest.i) + eval $(grep "LINUXRELEASE=" kernel-tests/conftest.i) ],[ AC_MSG_RESULT([unknown]) AC_MSG_ERROR([Could not preprocess test program. Consult config.log for details.]) diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h index c55dd37..6ef28a8 100644 --- a/lustre/portals/include/linux/kp30.h +++ b/lustre/portals/include/linux/kp30.h @@ -7,12 +7,6 @@ #include #define PORTAL_DEBUG -#ifndef offsetof -# define offsetof(typ,memb) ((unsigned long)((char *)&(((typ *)0)->memb))) -#endif - -#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) - #ifdef __KERNEL__ # include # include @@ -647,7 +641,6 @@ enum { TCPNAL = 5, ROUTER = 6, IBNAL = 7, - CRAY_KB_ERNAL = 8, NAL_ENUM_END_MARKER }; diff --git a/lustre/portals/include/linux/kpr.h b/lustre/portals/include/linux/kpr.h index 51d2d2f..1127698 100644 --- a/lustre/portals/include/linux/kpr.h +++ b/lustre/portals/include/linux/kpr.h @@ -4,7 +4,7 @@ #ifndef _KPR_H #define _KPR_H -# include /* for ptl_hdr_t */ +# include /* for ptl_hdr_t */ /******************************************************************************/ /* Kernel Portals Router interface */ diff --git a/lustre/portals/include/linux/libcfs.h 
b/lustre/portals/include/linux/libcfs.h index c2a15f4..a205163 100644 --- a/lustre/portals/include/linux/libcfs.h +++ b/lustre/portals/include/linux/libcfs.h @@ -79,9 +79,11 @@ extern unsigned int portal_cerror; #define S_PTLROUTER 0x00100000 #define S_COBD 0x00200000 #define S_IBNAL 0x00400000 -#define S_LMV 0x00800000 -#define S_SM 0x01000000 -#define S_CMOBD 0x02000000 +#define S_SM 0x00800000 +#define S_ASOBD 0x01000000 +#define S_LMV 0x02000000 +#define S_CMOBD 0x04000000 + /* If you change these values, please keep portals/utils/debug.c * up to date! */ diff --git a/lustre/portals/include/linux/portals_lib.h b/lustre/portals/include/linux/portals_lib.h index 609290d..b4741cc 100644 --- a/lustre/portals/include/linux/portals_lib.h +++ b/lustre/portals/include/linux/portals_lib.h @@ -77,8 +77,10 @@ static inline char *strdup(const char *str) #endif #ifdef __KERNEL__ +# define NTOH__u16(var) le16_to_cpu(var) # define NTOH__u32(var) le32_to_cpu(var) # define NTOH__u64(var) le64_to_cpu(var) +# define HTON__u16(var) cpu_to_le16(var) # define HTON__u32(var) cpu_to_le32(var) # define HTON__u64(var) cpu_to_le64(var) #else @@ -92,8 +94,10 @@ static inline char *strdup(const char *str) }; \ (ret); \ }) +# define NTOH__u16(var) (var) # define NTOH__u32(var) (var) # define NTOH__u64(var) (expansion_u64(var)) +# define HTON__u16(var) (var) # define HTON__u32(var) (var) # define HTON__u64(var) (expansion_u64(var)) #endif diff --git a/lustre/portals/include/portals/api-support.h b/lustre/portals/include/portals/api-support.h index cfae78c..c5994c6 100644 --- a/lustre/portals/include/portals/api-support.h +++ b/lustre/portals/include/portals/api-support.h @@ -19,9 +19,4 @@ #include #include -#include -/* Hack for 2.4.18 macro name collision */ -#ifdef yield -#undef yield -#endif diff --git a/lustre/portals/include/portals/api.h b/lustre/portals/include/portals/api.h index 6d382bb..c7aaced 100644 --- a/lustre/portals/include/portals/api.h +++ 
b/lustre/portals/include/portals/api.h @@ -5,7 +5,6 @@ #include -#ifndef PTL_NO_WRAP int PtlInit(int *); void PtlFini(void); @@ -17,8 +16,6 @@ int PtlNIInitialized(ptl_interface_t); int PtlNIFini(ptl_handle_ni_t interface_in); -#endif - int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id); @@ -32,9 +29,7 @@ int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, unsigned long *distance_out); -#ifndef PTL_NO_WRAP int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out); -#endif /* @@ -74,16 +69,12 @@ int PtlMEUnlink(ptl_handle_me_t current_in); int PtlMEUnlinkList(ptl_handle_me_t current_in); -int PtlTblDump(ptl_handle_ni_t ni, int index_in); -int PtlMEDump(ptl_handle_me_t current_in); - /* * Memory descriptors */ -#ifndef PTL_NO_WRAP int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in, ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); @@ -95,7 +86,6 @@ int PtlMDUnlink(ptl_handle_md_t md_in); int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout, ptl_md_t * new_inout, ptl_handle_eq_t testq_in); -#endif /* These should not be called by users */ int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, @@ -108,16 +98,11 @@ int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, /* * Event queues */ -#ifndef PTL_NO_WRAP - -/* These should be called by users */ int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in, ptl_eq_handler_t handler, ptl_handle_eq_t *handle_out); int PtlEQFree(ptl_handle_eq_t eventq_in); -int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out); - int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); @@ -125,7 +110,6 @@ int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, ptl_event_t *event_out, int *which_out); -#endif /* * Access Control Table diff --git 
a/lustre/portals/include/portals/arg-blocks.h b/lustre/portals/include/portals/arg-blocks.h deleted file mode 100644 index 21e30d5..0000000 --- a/lustre/portals/include/portals/arg-blocks.h +++ /dev/null @@ -1,268 +0,0 @@ -#ifndef PTL_BLOCKS_H -#define PTL_BLOCKS_H - -#include "build_check.h" - -/* - * blocks.h - * - * Argument block types for the Portals 3.0 library - * Generated by idl - * - */ - -#include - -/* put LIB_MAX_DISPATCH last here -- these must match the - assignements to the dispatch table in lib-p30/dispatch.c */ -#define PTL_GETID 1 -#define PTL_NISTATUS 2 -#define PTL_NIDIST 3 -// #define PTL_NIDEBUG 4 -#define PTL_MEATTACH 5 -#define PTL_MEINSERT 6 -// #define PTL_MEPREPEND 7 -#define PTL_MEUNLINK 8 -#define PTL_TBLDUMP 9 -#define PTL_MEDUMP 10 -#define PTL_MDATTACH 11 -// #define PTL_MDINSERT 12 -#define PTL_MDBIND 13 -#define PTL_MDUPDATE 14 -#define PTL_MDUNLINK 15 -#define PTL_EQALLOC 16 -#define PTL_EQFREE 17 -#define PTL_ACENTRY 18 -#define PTL_PUT 19 -#define PTL_GET 20 -#define PTL_FAILNID 21 -#define LIB_MAX_DISPATCH 21 - -typedef struct PtlFailNid_in { - ptl_handle_ni_t interface; - ptl_nid_t nid; - unsigned int threshold; -} PtlFailNid_in; - -typedef struct PtlFailNid_out { - int rc; -} PtlFailNid_out; - -typedef struct PtlGetId_in { - ptl_handle_ni_t handle_in; -} PtlGetId_in; - -typedef struct PtlGetId_out { - int rc; - ptl_process_id_t id_out; -} PtlGetId_out; - -typedef struct PtlNIStatus_in { - ptl_handle_ni_t interface_in; - ptl_sr_index_t register_in; -} PtlNIStatus_in; - -typedef struct PtlNIStatus_out { - int rc; - ptl_sr_value_t status_out; -} PtlNIStatus_out; - - -typedef struct PtlNIDist_in { - ptl_handle_ni_t interface_in; - ptl_process_id_t process_in; -} PtlNIDist_in; - -typedef struct PtlNIDist_out { - int rc; - unsigned long distance_out; -} PtlNIDist_out; - - -typedef struct PtlNIDebug_in { - unsigned int mask_in; -} PtlNIDebug_in; - -typedef struct PtlNIDebug_out { - unsigned int rc; -} PtlNIDebug_out; - - -typedef 
struct PtlMEAttach_in { - ptl_handle_ni_t interface_in; - ptl_pt_index_t index_in; - ptl_ins_pos_t position_in; - ptl_process_id_t match_id_in; - ptl_match_bits_t match_bits_in; - ptl_match_bits_t ignore_bits_in; - ptl_unlink_t unlink_in; -} PtlMEAttach_in; - -typedef struct PtlMEAttach_out { - int rc; - ptl_handle_me_t handle_out; -} PtlMEAttach_out; - - -typedef struct PtlMEInsert_in { - ptl_handle_me_t current_in; - ptl_process_id_t match_id_in; - ptl_match_bits_t match_bits_in; - ptl_match_bits_t ignore_bits_in; - ptl_unlink_t unlink_in; - ptl_ins_pos_t position_in; -} PtlMEInsert_in; - -typedef struct PtlMEInsert_out { - int rc; - ptl_handle_me_t handle_out; -} PtlMEInsert_out; - -typedef struct PtlMEUnlink_in { - ptl_handle_me_t current_in; - ptl_unlink_t unlink_in; -} PtlMEUnlink_in; - -typedef struct PtlMEUnlink_out { - int rc; -} PtlMEUnlink_out; - - -typedef struct PtlTblDump_in { - int index_in; -} PtlTblDump_in; - -typedef struct PtlTblDump_out { - int rc; -} PtlTblDump_out; - - -typedef struct PtlMEDump_in { - ptl_handle_me_t current_in; -} PtlMEDump_in; - -typedef struct PtlMEDump_out { - int rc; -} PtlMEDump_out; - - -typedef struct PtlMDAttach_in { - ptl_handle_me_t me_in; - ptl_handle_eq_t eq_in; - ptl_md_t md_in; - ptl_unlink_t unlink_in; -} PtlMDAttach_in; - -typedef struct PtlMDAttach_out { - int rc; - ptl_handle_md_t handle_out; -} PtlMDAttach_out; - - -typedef struct PtlMDBind_in { - ptl_handle_ni_t ni_in; - ptl_handle_eq_t eq_in; - ptl_md_t md_in; - ptl_unlink_t unlink_in; -} PtlMDBind_in; - -typedef struct PtlMDBind_out { - int rc; - ptl_handle_md_t handle_out; -} PtlMDBind_out; - - -typedef struct PtlMDUpdate_internal_in { - ptl_handle_md_t md_in; - ptl_handle_eq_t testq_in; - ptl_seq_t sequence_in; - - ptl_md_t old_inout; - int old_inout_valid; - ptl_md_t new_inout; - int new_inout_valid; -} PtlMDUpdate_internal_in; - -typedef struct PtlMDUpdate_internal_out { - int rc; - ptl_md_t old_inout; - ptl_md_t new_inout; -} 
PtlMDUpdate_internal_out; - - -typedef struct PtlMDUnlink_in { - ptl_handle_md_t md_in; -} PtlMDUnlink_in; - -typedef struct PtlMDUnlink_out { - int rc; - ptl_md_t status_out; -} PtlMDUnlink_out; - - -typedef struct PtlEQAlloc_in { - ptl_handle_ni_t ni_in; - ptl_size_t count_in; - void *base_in; - int len_in; - ptl_eq_handler_t callback_in; -} PtlEQAlloc_in; - -typedef struct PtlEQAlloc_out { - int rc; - ptl_handle_eq_t handle_out; -} PtlEQAlloc_out; - - -typedef struct PtlEQFree_in { - ptl_handle_eq_t eventq_in; -} PtlEQFree_in; - -typedef struct PtlEQFree_out { - int rc; -} PtlEQFree_out; - - -typedef struct PtlACEntry_in { - ptl_handle_ni_t ni_in; - ptl_ac_index_t index_in; - ptl_process_id_t match_id_in; - ptl_pt_index_t portal_in; -} PtlACEntry_in; - -typedef struct PtlACEntry_out { - int rc; -} PtlACEntry_out; - - -typedef struct PtlPut_in { - ptl_handle_md_t md_in; - ptl_ack_req_t ack_req_in; - ptl_process_id_t target_in; - ptl_pt_index_t portal_in; - ptl_ac_index_t cookie_in; - ptl_match_bits_t match_bits_in; - ptl_size_t offset_in; - ptl_hdr_data_t hdr_data_in; -} PtlPut_in; - -typedef struct PtlPut_out { - int rc; -} PtlPut_out; - - -typedef struct PtlGet_in { - ptl_handle_md_t md_in; - ptl_process_id_t target_in; - ptl_pt_index_t portal_in; - ptl_ac_index_t cookie_in; - ptl_match_bits_t match_bits_in; - ptl_size_t offset_in; -} PtlGet_in; - -typedef struct PtlGet_out { - int rc; -} PtlGet_out; - - -#endif diff --git a/lustre/portals/include/portals/errno.h b/lustre/portals/include/portals/errno.h index a98bfd9..42f2626 100644 --- a/lustre/portals/include/portals/errno.h +++ b/lustre/portals/include/portals/errno.h @@ -41,7 +41,10 @@ typedef enum { PTL_EQ_IN_USE = 21, - PTL_MAX_ERRNO = 22 + PTL_NI_INVALID = 22, + PTL_MD_ILLEGAL = 23, + + PTL_MAX_ERRNO = 24 } ptl_err_t; /* If you change these, you must update the string table in api-errno.c */ diff --git a/lustre/portals/include/portals/lib-dispatch.h b/lustre/portals/include/portals/lib-dispatch.h deleted 
file mode 100644 index 610c776..0000000 --- a/lustre/portals/include/portals/lib-dispatch.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef PTL_DISPATCH_H -#define PTL_DISPATCH_H - -#include "build_check.h" -/* - * include/dispatch.h - * - * Dispatch table header and externs for remote side - * operations - * - * Generated by idl - * - */ - -#include -#include - -extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args, - void *ret); -extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret); -extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret); - -extern char *dispatch_name(int index); -#endif diff --git a/lustre/portals/include/portals/lib-nal.h b/lustre/portals/include/portals/lib-nal.h 
deleted file mode 100644 index d1d0495..0000000 --- a/lustre/portals/include/portals/lib-nal.h +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef _LIB_NAL_H_ -#define _LIB_NAL_H_ - -#include "build_check.h" -/* - * nal.h - * - * Library side headers that define the abstraction layer's - * responsibilities and interfaces - */ - -#include - -struct nal_cb_t { - /* - * Per interface portal table, access control table - * and NAL private data field; - */ - lib_ni_t ni; - void *nal_data; - /* - * send: Sends a preformatted header and payload data to a - * specified remote process. The payload is scattered over 'niov' - * fragments described by iov, starting at 'offset' for 'mlen' - * bytes. - * NB the NAL may NOT overwrite iov. - * PTL_OK on success => NAL has committed to send and will call - * lib_finalize on completion - */ - ptl_err_t (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen); - - /* as send, but with a set of page fragments (NULL if not supported) */ - ptl_err_t (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, ptl_kiov_t *iov, - size_t offset, size_t mlen); - /* - * recv: Receives an incoming message from a remote process. The - * payload is to be received into the scattered buffer of 'niov' - * fragments described by iov, starting at 'offset' for 'mlen' - * bytes. Payload bytes after 'mlen' up to 'rlen' are to be - * discarded. - * NB the NAL may NOT overwrite iov. 
- * PTL_OK on success => NAL has committed to receive and will call - * lib_finalize on completion - */ - ptl_err_t (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen); - - /* as recv, but with a set of page fragments (NULL if not supported) */ - ptl_err_t (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - unsigned int niov, ptl_kiov_t *iov, - size_t offset, size_t mlen, size_t rlen); - /* - * read: Reads a block of data from a specified user address - */ - ptl_err_t (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, - user_ptr src_addr, size_t len); - - /* - * write: Writes a block of data into a specified user address - */ - ptl_err_t (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr, - void *src_addr, size_t len); - - /* - * callback: Calls an event callback - * NULL => lib calls eq's callback (if any) directly. - */ - void (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, - ptl_event_t *ev); - - /* - * malloc: Acquire a block of memory in a system independent - * fashion. - */ - void *(*cb_malloc) (nal_cb_t * nal, size_t len); - - void (*cb_free) (nal_cb_t * nal, void *buf, size_t len); - - /* - * (un)map: Tell the NAL about some memory it will access. - * *addrkey passed to cb_unmap() is what cb_map() set it to. - * type of *iov depends on options. - * Set to NULL if not required. 
- */ - ptl_err_t (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, - void **addrkey); - void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, - void **addrkey); - - /* as (un)map, but with a set of page fragments */ - ptl_err_t (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); - void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); - - void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...); - - /* Turn interrupts off (begin of protected area) */ - void (*cb_cli) (nal_cb_t * nal, unsigned long *flags); - - /* Turn interrupts on (end of protected area) */ - void (*cb_sti) (nal_cb_t * nal, unsigned long *flags); - - /* - * Calculate a network "distance" to given node - */ - int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist); -}; - -#endif diff --git a/lustre/portals/include/portals/lib-p30.h b/lustre/portals/include/portals/lib-p30.h index efa929c..4daf219 100644 --- a/lustre/portals/include/portals/lib-p30.h +++ b/lustre/portals/include/portals/lib-p30.h @@ -17,13 +17,13 @@ #else # include # include +# include #endif #include #include #include +#include #include -#include -#include static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) { @@ -31,17 +31,18 @@ static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); } -#define state_lock(nal,flagsp) \ -do { \ - CDEBUG(D_PORTALS, "taking state lock\n"); \ - nal->cb_cli(nal, flagsp); \ -} while (0) +#ifdef __KERNEL__ +#define LIB_LOCK(nal,flags) \ + spin_lock_irqsave(&(nal)->libnal_ni.ni_lock, flags) +#define LIB_UNLOCK(nal,flags) \ + spin_unlock_irqrestore(&(nal)->libnal_ni.ni_lock, flags) +#else +#define LIB_LOCK(nal,flags) \ + (pthread_mutex_lock(&(nal)->libnal_ni.ni_mutex), (flags) = 0) +#define LIB_UNLOCK(nal,flags) \ + pthread_mutex_unlock(&(nal)->libnal_ni.ni_mutex) +#endif -#define 
state_unlock(nal,flagsp) \ -{ \ - CDEBUG(D_PORTALS, "releasing state lock\n"); \ - nal->cb_sti(nal, flagsp); \ -} #ifdef PTL_USE_LIB_FREELIST @@ -50,13 +51,13 @@ do { \ #define MAX_MSGS 2048 /* Outstanding messages */ #define MAX_EQS 512 -extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize); -extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl); +extern int lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int nobj, int objsize); +extern void lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl); static inline void * lib_freelist_alloc (lib_freelist_t *fl) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_freeobj_t *o; if (list_empty (&fl->fl_list)) @@ -70,7 +71,7 @@ lib_freelist_alloc (lib_freelist_t *fl) static inline void lib_freelist_free (lib_freelist_t *fl, void *obj) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); list_add (&o->fo_list, &fl->fl_list); @@ -78,78 +79,78 @@ lib_freelist_free (lib_freelist_t *fl, void *obj) static inline lib_eq_t * -lib_eq_alloc (nal_cb_t *nal) +lib_eq_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_eq_t *eq; - state_lock (nal, &flags); - eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + eq = (lib_eq_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_eqs); + LIB_UNLOCK (nal, flags); return (eq); } static inline void -lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_eqs, eq); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_eqs, eq); } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) +lib_md_alloc (lib_nal_t *nal, ptl_md_t 
*umd) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_md_t *md; - state_lock (nal, &flags); - md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + md = (lib_md_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mds); + LIB_UNLOCK (nal, flags); return (md); } static inline void -lib_md_free (nal_cb_t *nal, lib_md_t *md) +lib_md_free (lib_nal_t *nal, lib_md_t *md) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_mds, md); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_mds, md); } static inline lib_me_t * -lib_me_alloc (nal_cb_t *nal) +lib_me_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_me_t *me; - state_lock (nal, &flags); - me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + me = (lib_me_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mes); + LIB_UNLOCK (nal, flags); return (me); } static inline void -lib_me_free (nal_cb_t *nal, lib_me_t *me) +lib_me_free (lib_nal_t *nal, lib_me_t *me) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_mes, me); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_mes, me); } static inline lib_msg_t * -lib_msg_alloc (nal_cb_t *nal) +lib_msg_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ unsigned long flags; lib_msg_t *msg; - state_lock (nal, &flags); - msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs); - state_unlock (nal, &flags); + LIB_LOCK (nal, flags); + msg = (lib_msg_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_msgs); + LIB_UNLOCK (nal, flags); if (msg != NULL) { /* NULL pointers, clear flags etc */ @@ -160,18 +161,18 @@ lib_msg_alloc (nal_cb_t *nal) } static 
inline void -lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) +lib_msg_free (lib_nal_t *nal, lib_msg_t *msg) { - /* ALWAYS called with statelock held */ - lib_freelist_free (&nal->ni.ni_free_msgs, msg); + /* ALWAYS called with liblock held */ + lib_freelist_free (&nal->libnal_ni.ni_free_msgs, msg); } #else static inline lib_eq_t * -lib_eq_alloc (nal_cb_t *nal) +lib_eq_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_eq_t *eq; PORTAL_ALLOC(eq, sizeof(*eq)); @@ -179,16 +180,16 @@ lib_eq_alloc (nal_cb_t *nal) } static inline void -lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(eq, sizeof(*eq)); } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) +lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_md_t *md; int size; int niov; @@ -214,9 +215,9 @@ lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) } static inline void -lib_md_free (nal_cb_t *nal, lib_md_t *md) +lib_md_free (lib_nal_t *nal, lib_md_t *md) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ int size; if ((md->options & PTL_MD_KIOV) != 0) @@ -228,9 +229,9 @@ lib_md_free (nal_cb_t *nal, lib_md_t *md) } static inline lib_me_t * -lib_me_alloc (nal_cb_t *nal) +lib_me_alloc (lib_nal_t *nal) { - /* NEVER called with statelock held */ + /* NEVER called with liblock held */ lib_me_t *me; PORTAL_ALLOC(me, sizeof(*me)); @@ -238,16 +239,16 @@ lib_me_alloc (nal_cb_t *nal) } static inline void -lib_me_free(nal_cb_t *nal, lib_me_t *me) +lib_me_free(lib_nal_t *nal, lib_me_t *me) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(me, sizeof(*me)); } static inline lib_msg_t * -lib_msg_alloc(nal_cb_t *nal) +lib_msg_alloc(lib_nal_t *nal) { - /* NEVER called with statelock 
held; may be in interrupt... */ + /* NEVER called with liblock held; may be in interrupt... */ lib_msg_t *msg; if (in_interrupt()) @@ -264,27 +265,28 @@ lib_msg_alloc(nal_cb_t *nal) } static inline void -lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) +lib_msg_free(lib_nal_t *nal, lib_msg_t *msg) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ PORTAL_FREE(msg, sizeof(*msg)); } #endif -extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type); -extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type); -extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh); +extern lib_handle_t *lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type); +extern void lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type); +extern void lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh); static inline void -ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq) +ptl_eq2handle (ptl_handle_eq_t *handle, lib_nal_t *nal, lib_eq_t *eq) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = eq->eq_lh.lh_cookie; } static inline lib_eq_t * -ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) +ptl_handle2eq (ptl_handle_eq_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_EQ); if (lh == NULL) @@ -294,15 +296,16 @@ ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) } static inline void -ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md) +ptl_md2handle (ptl_handle_md_t *handle, lib_nal_t *nal, lib_md_t *md) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = md->md_lh.lh_cookie; } static inline lib_md_t * -ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) +ptl_handle2md (ptl_handle_md_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held 
*/ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_MD); if (lh == NULL) @@ -312,12 +315,12 @@ ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) } static inline lib_md_t * -ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) +ptl_wire_handle2md (ptl_handle_wire_t *wh, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh; - if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie) + if (wh->wh_interface_cookie != nal->libnal_ni.ni_interface_cookie) return (NULL); lh = lib_lookup_cookie (nal, wh->wh_object_cookie, @@ -329,15 +332,16 @@ ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) } static inline void -ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me) +ptl_me2handle (ptl_handle_me_t *handle, lib_nal_t *nal, lib_me_t *me) { + handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = me->me_lh.lh_cookie; } static inline lib_me_t * -ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) +ptl_handle2me (ptl_handle_me_t *handle, lib_nal_t *nal) { - /* ALWAYS called with statelock held */ + /* ALWAYS called with liblock held */ lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, PTL_COOKIE_TYPE_ME); if (lh == NULL) @@ -346,35 +350,30 @@ ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) return (lh_entry (lh, lib_me_t, me_lh)); } -extern int lib_init(nal_cb_t *cb, ptl_process_id_t pid, +extern int lib_init(lib_nal_t *libnal, nal_t *apinal, + ptl_process_id_t pid, ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits); -extern int lib_fini(nal_cb_t * cb); -extern void lib_dispatch(nal_cb_t * cb, void *private, int index, - void *arg_block, void *ret_block); -extern char *dispatch_name(int index); +extern int lib_fini(lib_nal_t *libnal); /* - * When the NAL detects an incoming message, it should call - * lib_parse() decode it. 
The NAL callbacks will be handed - * the private cookie as a way for the NAL to maintain state - * about which transaction is being processed. An extra parameter, - * lib_cookie will contain the necessary information for - * finalizing the message. - * - * After it has finished the handling the message, it should - * call lib_finalize() with the lib_cookie parameter. - * Call backs will be made to write events, send acks or - * replies and so on. + * When the NAL detects an incoming message header, it should call + * lib_parse() decode it. If the message header is garbage, lib_parse() + * returns immediately with failure, otherwise the NAL callbacks will be + * called to receive the message body. They are handed the private cookie + * as a way for the NAL to maintain state about which transaction is being + * processed. An extra parameter, lib_msg contains the lib-level message + * state for passing to lib_finalize() when the message body has been + * received. */ -extern void lib_enq_event_locked (nal_cb_t *nal, void *private, +extern void lib_enq_event_locked (lib_nal_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev); -extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, +extern void lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_ni_fail_t ni_fail_type); -extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private); -extern lib_msg_t *lib_create_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, +extern ptl_err_t lib_parse (lib_nal_t *nal, ptl_hdr_t *hdr, void *private); +extern lib_msg_t *lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, lib_msg_t *get_msg); -extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr); +extern void print_hdr (lib_nal_t * nal, ptl_hdr_t * hdr); extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); @@ -397,14 +396,65 @@ extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, extern void lib_assert_wire_constants (void); -extern ptl_err_t lib_recv (nal_cb_t *nal, void 
*private, lib_msg_t *msg, lib_md_t *md, +extern ptl_err_t lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); -extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, +extern ptl_err_t lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, lib_md_t *md, ptl_size_t offset, ptl_size_t len); -extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, - ptl_md_t * md_out); -extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in); -extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in); +extern int lib_api_ni_status (nal_t *nal, ptl_sr_index_t sr_idx, + ptl_sr_value_t *status); +extern int lib_api_ni_dist (nal_t *nal, ptl_process_id_t *pid, + unsigned long *dist); + +extern int lib_api_eq_alloc (nal_t *nal, ptl_size_t count, + ptl_eq_handler_t callback, + ptl_handle_eq_t *handle); +extern int lib_api_eq_free(nal_t *nal, ptl_handle_eq_t *eqh); +extern int lib_api_eq_poll (nal_t *nal, + ptl_handle_eq_t *eventqs, int neq, int timeout_ms, + ptl_event_t *event, int *which); + +extern int lib_api_me_attach(nal_t *nal, + ptl_pt_index_t portal, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); +extern int lib_api_me_insert(nal_t *nal, + ptl_handle_me_t *current_meh, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); +extern int lib_api_me_unlink (nal_t *nal, ptl_handle_me_t *meh); +extern void lib_me_unlink(lib_nal_t *nal, lib_me_t *me); + +extern int lib_api_get_id(nal_t *nal, ptl_process_id_t *pid); + +extern void lib_md_unlink(lib_nal_t *nal, lib_md_t *md); +extern void lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd); +extern int lib_api_md_attach(nal_t *nal, ptl_handle_me_t 
*meh, + ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle); +extern int lib_api_md_bind(nal_t *nal, ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle); +extern int lib_api_md_unlink (nal_t *nal, ptl_handle_md_t *mdh); +extern int lib_api_md_update (nal_t *nal, ptl_handle_md_t *mdh, + ptl_md_t *oldumd, ptl_md_t *newumd, + ptl_handle_eq_t *testqh); + +extern int lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, ptl_size_t offset); +extern int lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_ack_req_t ack, ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, + ptl_size_t offset, ptl_hdr_data_t hdr_data); +extern int lib_api_fail_nid(nal_t *apinal, ptl_nid_t nid, unsigned int threshold); + #endif diff --git a/lustre/portals/include/portals/lib-types.h b/lustre/portals/include/portals/lib-types.h index ef618c7..6549988 100644 --- a/lustre/portals/include/portals/lib-types.h +++ b/lustre/portals/include/portals/lib-types.h @@ -13,6 +13,7 @@ #include "build_check.h" #include +#include #ifdef __KERNEL__ # include # include @@ -22,9 +23,6 @@ # include #endif -/* struct nal_cb_t is defined in lib-nal.h */ -typedef struct nal_cb_t nal_cb_t; - typedef char *user_ptr; typedef struct lib_msg_t lib_msg_t; typedef struct lib_ptl_t lib_ptl_t; @@ -165,11 +163,12 @@ typedef struct { struct lib_eq_t { struct list_head eq_list; lib_handle_t eq_lh; - ptl_seq_t sequence; - ptl_size_t size; - ptl_event_t *base; + ptl_seq_t eq_enq_seq; + ptl_seq_t eq_deq_seq; + ptl_size_t eq_size; + ptl_event_t *eq_events; int eq_refcount; - ptl_eq_handler_t event_callback; + ptl_eq_handler_t eq_callback; void *eq_addrkey; }; @@ -244,29 +243,117 @@ typedef struct { /* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be * extracted by masking with (PTL_COOKIE_TYPES - 1) */ -typedef struct { - ptl_nid_t nid; - ptl_pid_t pid; - 
lib_ptl_t tbl; - lib_counters_t counters; - ptl_ni_limits_t actual_limits; +typedef struct lib_ni +{ + nal_t *ni_api; + ptl_process_id_t ni_pid; + lib_ptl_t ni_portals; + lib_counters_t ni_counters; + ptl_ni_limits_t ni_actual_limits; int ni_lh_hash_size; /* size of lib handle hash table */ struct list_head *ni_lh_hash_table; /* all extant lib handles, this interface */ __u64 ni_next_object_cookie; /* cookie generator */ __u64 ni_interface_cookie; /* uniquely identifies this ni in this epoch */ - struct list_head ni_test_peers; + struct list_head ni_test_peers; #ifdef PTL_USE_LIB_FREELIST - lib_freelist_t ni_free_mes; - lib_freelist_t ni_free_msgs; - lib_freelist_t ni_free_mds; - lib_freelist_t ni_free_eqs; + lib_freelist_t ni_free_mes; + lib_freelist_t ni_free_msgs; + lib_freelist_t ni_free_mds; + lib_freelist_t ni_free_eqs; +#endif + + struct list_head ni_active_msgs; + struct list_head ni_active_mds; + struct list_head ni_active_eqs; + +#ifdef __KERNEL__ + spinlock_t ni_lock; + wait_queue_head_t ni_waitq; +#else + pthread_mutex_t ni_mutex; + pthread_cond_t ni_cond; #endif - struct list_head ni_active_msgs; - struct list_head ni_active_mds; - struct list_head ni_active_eqs; } lib_ni_t; + +typedef struct lib_nal +{ + /* lib-level interface state */ + lib_ni_t libnal_ni; + + /* NAL-private data */ + void *libnal_data; + + /* + * send: Sends a preformatted header and payload data to a + * specified remote process. The payload is scattered over 'niov' + * fragments described by iov, starting at 'offset' for 'mlen' + * bytes. + * NB the NAL may NOT overwrite iov. 
+ * PTL_OK on success => NAL has committed to send and will call + * lib_finalize on completion + */ + ptl_err_t (*libnal_send) + (struct lib_nal *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen); + + /* as send, but with a set of page fragments (NULL if not supported) */ + ptl_err_t (*libnal_send_pages) + (struct lib_nal *nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, ptl_kiov_t *iov, + size_t offset, size_t mlen); + /* + * recv: Receives an incoming message from a remote process. The + * payload is to be received into the scattered buffer of 'niov' + * fragments described by iov, starting at 'offset' for 'mlen' + * bytes. Payload bytes after 'mlen' up to 'rlen' are to be + * discarded. + * NB the NAL may NOT overwrite iov. + * PTL_OK on success => NAL has committed to receive and will call + * lib_finalize on completion + */ + ptl_err_t (*libnal_recv) + (struct lib_nal *nal, void *private, lib_msg_t * cookie, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen); + + /* as recv, but with a set of page fragments (NULL if not supported) */ + ptl_err_t (*libnal_recv_pages) + (struct lib_nal *nal, void *private, lib_msg_t * cookie, + unsigned int niov, ptl_kiov_t *iov, + size_t offset, size_t mlen, size_t rlen); + + /* + * (un)map: Tell the NAL about some memory it will access. + * *addrkey passed to libnal_unmap() is what libnal_map() set it to. + * type of *iov depends on options. + * Set to NULL if not required. 
+ */ + ptl_err_t (*libnal_map) + (struct lib_nal *nal, unsigned int niov, struct iovec *iov, + void **addrkey); + void (*libnal_unmap) + (struct lib_nal *nal, unsigned int niov, struct iovec *iov, + void **addrkey); + + /* as (un)map, but with a set of page fragments */ + ptl_err_t (*libnal_map_pages) + (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + void (*libnal_unmap_pages) + (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + + void (*libnal_printf)(struct lib_nal *nal, const char *fmt, ...); + + /* Calculate a network "distance" to given node */ + int (*libnal_dist) (struct lib_nal *nal, ptl_nid_t nid, unsigned long *dist); +} lib_nal_t; + #endif diff --git a/lustre/portals/include/portals/nal.h b/lustre/portals/include/portals/nal.h index 1f925c1..bf86569 100644 --- a/lustre/portals/include/portals/nal.h +++ b/lustre/portals/include/portals/nal.h @@ -11,32 +11,73 @@ #include -#ifdef yield -#undef yield -#endif - typedef struct nal_t nal_t; struct nal_t { + /* common interface state */ int nal_refct; + ptl_handle_ni_t nal_handle; + + /* NAL-private data */ void *nal_data; - int (*startup) (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *req, ptl_ni_limits_t *actual); + /* NAL API implementation + * NB only nal_ni_init needs to be set when the NAL registers itself */ + int (*nal_ni_init) (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *req, ptl_ni_limits_t *actual); - void (*shutdown) (nal_t *nal); + void (*nal_ni_fini) (nal_t *nal); - int (*forward) (nal_t *nal, int index, /* Function ID */ - void *args, size_t arg_len, void *ret, size_t ret_len); + int (*nal_get_id) (nal_t *nal, ptl_process_id_t *id); + int (*nal_ni_status) (nal_t *nal, ptl_sr_index_t register, ptl_sr_value_t *status); + int (*nal_ni_dist) (nal_t *nal, ptl_process_id_t *id, unsigned long *distance); + int (*nal_fail_nid) (nal_t *nal, ptl_nid_t nid, unsigned int threshold); - int (*yield) (nal_t *nal, unsigned long 
*flags, int milliseconds); + int (*nal_me_attach) (nal_t *nal, ptl_pt_index_t portal, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); + int (*nal_me_insert) (nal_t *nal, ptl_handle_me_t *me, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle); + int (*nal_me_unlink) (nal_t *nal, ptl_handle_me_t *me); + + int (*nal_md_attach) (nal_t *nal, ptl_handle_me_t *me, + ptl_md_t *md, ptl_unlink_t unlink, + ptl_handle_md_t *handle); + int (*nal_md_bind) (nal_t *nal, + ptl_md_t *md, ptl_unlink_t unlink, + ptl_handle_md_t *handle); + int (*nal_md_unlink) (nal_t *nal, ptl_handle_md_t *md); + int (*nal_md_update) (nal_t *nal, ptl_handle_md_t *md, + ptl_md_t *old_md, ptl_md_t *new_md, + ptl_handle_eq_t *testq); - void (*lock) (nal_t *nal, unsigned long *flags); + int (*nal_eq_alloc) (nal_t *nal, ptl_size_t count, + ptl_eq_handler_t handler, + ptl_handle_eq_t *handle); + int (*nal_eq_free) (nal_t *nal, ptl_handle_eq_t *eq); + int (*nal_eq_poll) (nal_t *nal, + ptl_handle_eq_t *eqs, int neqs, int timeout, + ptl_event_t *event, int *which); - void (*unlock) (nal_t *nal, unsigned long *flags); + int (*nal_ace_entry) (nal_t *nal, ptl_ac_index_t index, + ptl_process_id_t match_id, ptl_pt_index_t portal); + + int (*nal_put) (nal_t *nal, ptl_handle_md_t *md, ptl_ack_req_t ack, + ptl_process_id_t *target, ptl_pt_index_t portal, + ptl_ac_index_t ac, ptl_match_bits_t match, + ptl_size_t offset, ptl_hdr_data_t hdr_data); + int (*nal_get) (nal_t *nal, ptl_handle_md_t *md, + ptl_process_id_t *target, ptl_pt_index_t portal, + ptl_ac_index_t ac, ptl_match_bits_t match, + ptl_size_t offset); }; -extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any); +extern nal_t *ptl_hndl2nal(ptl_handle_any_t *any); #ifdef __KERNEL__ extern int ptl_register_nal(ptl_interface_t interface, nal_t *nal); 
diff --git a/lustre/portals/include/portals/types.h b/lustre/portals/include/portals/types.h index ef2712b..250b954 100644 --- a/lustre/portals/include/portals/types.h +++ b/lustre/portals/include/portals/types.h @@ -153,17 +153,6 @@ typedef void (*ptl_eq_handler_t)(ptl_event_t *event); #define PTL_EQ_HANDLER_NONE NULL typedef struct { - volatile ptl_seq_t sequence; - ptl_size_t size; - ptl_event_t *base; - ptl_handle_any_t cb_eq_handle; -} ptl_eq_t; - -typedef struct { - ptl_eq_t *eq; -} ptl_ni_t; - -typedef struct { int max_mes; int max_mds; int max_eqs; diff --git a/lustre/portals/knals/gmnal/gmnal.h b/lustre/portals/knals/gmnal/gmnal.h index e48552e..ca98f84 100644 --- a/lustre/portals/knals/gmnal/gmnal.h +++ b/lustre/portals/knals/gmnal/gmnal.h @@ -190,7 +190,6 @@ typedef struct _gmnal_rxtwe { #define NRXTHREADS 10 /* max number of receiver threads */ typedef struct _gmnal_data_t { - spinlock_t cb_lock; spinlock_t stxd_lock; struct semaphore stxd_token; gmnal_stxd_t *stxd; @@ -205,7 +204,7 @@ typedef struct _gmnal_data_t { gmnal_srxd_t *srxd; struct gm_hash *srxd_hash; nal_t *nal; - nal_cb_t *nal_cb; + lib_nal_t *libnal; struct gm_port *gm_port; unsigned int gm_local_nid; unsigned int gm_global_nid; @@ -298,7 +297,6 @@ extern gmnal_data_t *global_nal_data; #define GMNAL_GM_LOCK_INIT(a) spin_lock_init(&a->gm_lock); #define GMNAL_GM_LOCK(a) spin_lock(&a->gm_lock); #define GMNAL_GM_UNLOCK(a) spin_unlock(&a->gm_lock); -#define GMNAL_CB_LOCK_INIT(a) spin_lock_init(&a->cb_lock); /* @@ -340,39 +338,19 @@ void gmnal_api_unlock(nal_t *, unsigned long *); * CB NAL */ -int gmnal_cb_send(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, +int gmnal_cb_send(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t); -int gmnal_cb_send_pages(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, +int gmnal_cb_send_pages(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t); -int 
gmnal_cb_recv(nal_cb_t *, void *, lib_msg_t *, +int gmnal_cb_recv(lib_nal_t *, void *, lib_msg_t *, unsigned int, struct iovec *, size_t, size_t); -int gmnal_cb_recv_pages(nal_cb_t *, void *, lib_msg_t *, +int gmnal_cb_recv_pages(lib_nal_t *, void *, lib_msg_t *, unsigned int, ptl_kiov_t *, size_t, size_t); -int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t); - -int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t); - -int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *); - -void *gmnal_cb_malloc(nal_cb_t *, size_t); - -void gmnal_cb_free(nal_cb_t *, void *, size_t); - -void gmnal_cb_unmap(nal_cb_t *, unsigned int, struct iovec*, void **); - -int gmnal_cb_map(nal_cb_t *, unsigned int, struct iovec*, void **); - -void gmnal_cb_printf(nal_cb_t *, const char *fmt, ...); - -void gmnal_cb_cli(nal_cb_t *, unsigned long *); - -void gmnal_cb_sti(nal_cb_t *, unsigned long *); - -int gmnal_cb_dist(nal_cb_t *, ptl_nid_t, unsigned long *); +int gmnal_cb_dist(lib_nal_t *, ptl_nid_t, unsigned long *); int gmnal_init(void); @@ -381,22 +359,14 @@ void gmnal_fini(void); #define GMNAL_INIT_NAL_CB(a) do { \ - a->cb_send = gmnal_cb_send; \ - a->cb_send_pages = gmnal_cb_send_pages; \ - a->cb_recv = gmnal_cb_recv; \ - a->cb_recv_pages = gmnal_cb_recv_pages; \ - a->cb_read = gmnal_cb_read; \ - a->cb_write = gmnal_cb_write; \ - a->cb_callback = gmnal_cb_callback; \ - a->cb_malloc = gmnal_cb_malloc; \ - a->cb_free = gmnal_cb_free; \ - a->cb_map = NULL; \ - a->cb_unmap = NULL; \ - a->cb_printf = gmnal_cb_printf; \ - a->cb_cli = gmnal_cb_cli; \ - a->cb_sti = gmnal_cb_sti; \ - a->cb_dist = gmnal_cb_dist; \ - a->nal_data = NULL; \ + a->libnal_send = gmnal_cb_send; \ + a->libnal_send_pages = gmnal_cb_send_pages; \ + a->libnal_recv = gmnal_cb_recv; \ + a->libnal_recv_pages = gmnal_cb_recv_pages; \ + a->libnal_map = NULL; \ + a->libnal_unmap = NULL; \ + a->libnal_dist = gmnal_cb_dist; \ + a->libnal_data = NULL; \ } while (0) @@ -451,9 +421,9 
@@ void gmnal_remove_rxtwe(gmnal_data_t *); /* * Small messages */ -int gmnal_small_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int, +int gmnal_small_rx(lib_nal_t *, void *, lib_msg_t *, unsigned int, struct iovec *, size_t, size_t); -int gmnal_small_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, +int gmnal_small_tx(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec*, int); void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); @@ -463,10 +433,10 @@ void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); /* * Large messages */ -int gmnal_large_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int, +int gmnal_large_rx(lib_nal_t *, void *, lib_msg_t *, unsigned int, struct iovec *, size_t, size_t); -int gmnal_large_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, +int gmnal_large_tx(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec*, int); diff --git a/lustre/portals/knals/gmnal/gmnal_api.c b/lustre/portals/knals/gmnal/gmnal_api.c index 7c94f93..002587d 100644 --- a/lustre/portals/knals/gmnal/gmnal_api.c +++ b/lustre/portals/knals/gmnal/gmnal_api.c @@ -50,77 +50,6 @@ static ctl_table gmnalnal_top_sysctl_table[] = { { 0 } }; - - - - - -/* - * gmnal_api_forward - * This function takes a pack block of arguments from the NAL API - * module and passes them to the NAL CB module. The CB module unpacks - * the args and calls the appropriate function indicated by index. - * Typically this function is used to pass args between kernel and use - * space. 
- * As lgmanl exists entirely in kernel, just pass the arg block directly - * to the NAL CB, buy passing the args to lib_dispatch - * Arguments are - * nal_t nal Our nal - * int index the api function that initiated this call - * void *args packed block of function args - * size_t arg_len length of args block - * void *ret A return value for the API NAL - * size_t ret_len Size of the return value - * - */ - -int -gmnal_api_forward(nal_t *nal, int index, void *args, size_t arg_len, - void *ret, size_t ret_len) -{ - - nal_cb_t *nal_cb = NULL; - gmnal_data_t *nal_data = NULL; - - - - - - if (!nal || !args || (index < 0) || (arg_len < 0)) { - CDEBUG(D_ERROR, "Bad args to gmnal_api_forward\n"); - return (PTL_FAIL); - } - - if (ret && (ret_len <= 0)) { - CDEBUG(D_ERROR, "Bad args to gmnal_api_forward\n"); - return (PTL_FAIL); - } - - - if (!nal->nal_data) { - CDEBUG(D_ERROR, "bad nal, no nal data\n"); - return (PTL_FAIL); - } - - nal_data = nal->nal_data; - CDEBUG(D_INFO, "nal_data is [%p]\n", nal_data); - - if (!nal_data->nal_cb) { - CDEBUG(D_ERROR, "bad nal_data, no nal_cb\n"); - return (PTL_FAIL); - } - - nal_cb = nal_data->nal_cb; - CDEBUG(D_INFO, "nal_cb is [%p]\n", nal_cb); - - CDEBUG(D_PORTALS, "gmnal_api_forward calling lib_dispatch\n"); - lib_dispatch(nal_cb, NULL, index, args, ret); - CDEBUG(D_PORTALS, "gmnal_api_forward returns from lib_dispatch\n"); - - return(PTL_OK); -} - - /* * gmnal_api_shutdown * nal_refct == 0 => called on last matching PtlNIFini() @@ -131,7 +60,7 @@ void gmnal_api_shutdown(nal_t *nal, int interface) { gmnal_data_t *nal_data; - nal_cb_t *nal_cb; + lib_nal_t *libnal; if (nal->nal_refct != 0) return; @@ -139,9 +68,9 @@ gmnal_api_shutdown(nal_t *nal, int interface) CDEBUG(D_TRACE, "gmnal_api_shutdown: nal_data [%p]\n", nal_data); LASSERT(nal == global_nal_data->nal); - nal_data = nal->nal_data; + libnal = (lib_nal_t *)nal->nal_data; + nal_data = (gmnal_data_t *)libnal->libnal_data; LASSERT(nal_data == global_nal_data); - nal_cb = 
nal_data->nal_cb; /* Stop portals calling our ioctl handler */ libcfs_nal_cmd_unregister(GMNAL); @@ -150,7 +79,7 @@ gmnal_api_shutdown(nal_t *nal, int interface) * flag so when lib calls us we fail immediately and dont queue any * more work but our threads can still call into lib OK. THEN * shutdown our threads, THEN lib_fini() */ - lib_fini(nal_cb); + lib_fini(libnal); gmnal_stop_rxthread(nal_data); gmnal_stop_ctthread(nal_data); @@ -162,94 +91,22 @@ gmnal_api_shutdown(nal_t *nal, int interface) GMNAL_GM_UNLOCK(nal_data); if (nal_data->sysctl) unregister_sysctl_table (nal_data->sysctl); - PORTAL_FREE(nal, sizeof(nal_t)); + /* Don't free 'nal'; it's a static struct */ PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); global_nal_data = NULL; PORTAL_MODULE_UNUSE; } -/* - * gmnal_api_validate - * validate a user address for use in communications - * There's nothing to be done here - */ -int -gmnal_api_validate(nal_t *nal, void *base, size_t extent) -{ - - return(PTL_OK); -} - - - -/* - * gmnal_api_yield - * Give up the processor - */ -void -gmnal_api_yield(nal_t *nal, unsigned long *flags, int milliseconds) -{ - CDEBUG(D_TRACE, "gmnal_api_yield : nal [%p]\n", nal); - - if (milliseconds != 0) { - CERROR("Blocking yield not implemented yet\n"); - LBUG(); - } - - our_cond_resched(); - return; -} - - - -/* - * gmnal_api_lock - * Take a threadsafe lock - */ -void -gmnal_api_lock(nal_t *nal, unsigned long *flags) -{ - - gmnal_data_t *nal_data; - nal_cb_t *nal_cb; - - nal_data = nal->nal_data; - nal_cb = nal_data->nal_cb; - - nal_cb->cb_cli(nal_cb, flags); - - return; -} - -/* - * gmnal_api_unlock - * Release a threadsafe lock - */ -void -gmnal_api_unlock(nal_t *nal, unsigned long *flags) -{ - gmnal_data_t *nal_data; - nal_cb_t *nal_cb; - - nal_data = nal->nal_data; - nal_cb = nal_data->nal_cb; - - nal_cb->cb_sti(nal_cb, flags); - - return; -} - - int gmnal_api_startup(nal_t *nal, ptl_pid_t 
requested_pid, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { - nal_cb_t *nal_cb = NULL; + lib_nal_t *libnal = NULL; gmnal_data_t *nal_data = NULL; gmnal_srxd_t *srxd = NULL; gm_status_t gm_status; @@ -258,9 +115,8 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, if (nal->nal_refct != 0) { if (actual_limits != NULL) { - nal_data = (gmnal_data_t *)nal->nal_data; - nal_cb = nal_data->nal_cb; - *actual_limits = nal->_cb->ni.actual_limits; + libnal = (lib_nal_t *)nal->nal_data; + *actual_limits = nal->libnal_ni.ni_actual_limits; return (PTL_OK); } @@ -283,24 +139,22 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, CDEBUG(D_INFO, "Allocd and reset nal_data[%p]\n", nal_data); CDEBUG(D_INFO, "small_msg_size is [%d]\n", nal_data->small_msg_size); - PORTAL_ALLOC(nal_cb, sizeof(nal_cb_t)); - if (!nal_cb) { + PORTAL_ALLOC(libnal, sizeof(lib_nal_t)); + if (!libnal) { PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); return(PTL_NO_SPACE); } - memset(nal_cb, 0, sizeof(nal_cb_t)); - CDEBUG(D_INFO, "Allocd and reset nal_cb[%p]\n", nal_cb); + memset(libnal, 0, sizeof(lib_nal_t)); + CDEBUG(D_INFO, "Allocd and reset libnal[%p]\n", libnal); - GMNAL_INIT_NAL_CB(nal_cb); + GMNAL_INIT_NAL_CB(libnal); /* * String them all together */ - nal->nal_data = (void*)nal_data; - nal_cb->nal_data = (void*)nal_data; + libnal->libnal_data = (void*)nal_data; nal_data->nal = nal; - nal_data->nal_cb = nal_cb; + nal_data->libnal = libnal; - GMNAL_CB_LOCK_INIT(nal_data); GMNAL_GM_LOCK_INIT(nal_data); @@ -311,7 +165,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, if (gm_init() != GM_SUCCESS) { CDEBUG(D_ERROR, "call to gm_init failed\n"); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -356,7 +210,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - 
PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -373,7 +227,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -402,7 +256,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -434,7 +288,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } nal_data->gm_local_nid = local_nid; @@ -454,7 +308,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } CDEBUG(D_INFO, "Global node id is [%u]\n", global_nid); @@ -471,7 +325,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, CDEBUG(D_INFO, "portals_nid is ["LPU64"]\n", process_id.nid); CDEBUG(D_PORTALS, "calling lib_init\n"); - if (lib_init(nal_cb, process_id, + if (lib_init(libnal, nal, process_id, requested_limits, actual_limits) != PTL_OK) { CDEBUG(D_ERROR, "lib_init failed\n"); gmnal_stop_rxthread(nal_data); @@ -483,7 +337,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -493,7 +347,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, /* XXX these cleanup cases should be 
restructured to * minimise duplication... */ - lib_fini(nal_cb); + lib_fini(libnal); gmnal_stop_rxthread(nal_data); gmnal_stop_ctthread(nal_data); @@ -504,7 +358,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_finalize(); GMNAL_GM_UNLOCK(nal_data); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); + PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); } @@ -550,10 +404,6 @@ int gmnal_init(void) */ void gmnal_fini() { - gmnal_data_t *nal_data = global_nal_data; - nal_t *nal = nal_data->nal; - nal_cb_t *nal_cb = nal_data->nal_cb; - CDEBUG(D_TRACE, "gmnal_fini\n"); LASSERT(global_nal_data == NULL); diff --git a/lustre/portals/knals/gmnal/gmnal_cb.c b/lustre/portals/knals/gmnal/gmnal_cb.c index ece1380..e99d3ec 100644 --- a/lustre/portals/knals/gmnal/gmnal_cb.c +++ b/lustre/portals/knals/gmnal/gmnal_cb.c @@ -27,7 +27,7 @@ #include "gmnal.h" -int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +int gmnal_cb_recv(lib_nal_t *libnal, void *private, lib_msg_t *cookie, unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) { @@ -35,19 +35,19 @@ int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int status = PTL_OK; - CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p], " + CDEBUG(D_TRACE, "gmnal_cb_recv libnal [%p], private[%p], cookie[%p], " "niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", - nal_cb, private, cookie, niov, iov, mlen, rlen); + libnal, private, cookie, niov, iov, mlen, rlen); switch(srxd->type) { case(GMNAL_SMALL_MESSAGE): CDEBUG(D_INFO, "gmnal_cb_recv got small message\n"); - status = gmnal_small_rx(nal_cb, private, cookie, niov, + status = gmnal_small_rx(libnal, private, cookie, niov, iov, mlen, rlen); break; case(GMNAL_LARGE_MESSAGE_INIT): CDEBUG(D_INFO, "gmnal_cb_recv got large message init\n"); - status = gmnal_large_rx(nal_cb, private, cookie, niov, + status = gmnal_large_rx(libnal, private, cookie, niov, iov, mlen, rlen); } @@ -56,7 
+56,7 @@ int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, return(status); } -int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +int gmnal_cb_recv_pages(lib_nal_t *libnal, void *private, lib_msg_t *cookie, unsigned int kniov, ptl_kiov_t *kiov, size_t mlen, size_t rlen) { @@ -67,9 +67,9 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, ptl_kiov_t *kiov_dup = kiov;; - CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p], " + CDEBUG(D_TRACE, "gmnal_cb_recv_pages libnal [%p],private[%p], " "cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", - nal_cb, private, cookie, kniov, kiov, mlen, rlen); + libnal, private, cookie, kniov, kiov, mlen, rlen); if (srxd->type == GMNAL_SMALL_MESSAGE) { PORTAL_ALLOC(iovec, sizeof(struct iovec)*kniov); @@ -98,7 +98,7 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, kiov++; } CDEBUG(D_INFO, "calling gmnal_small_rx\n"); - status = gmnal_small_rx(nal_cb, private, cookie, kniov, + status = gmnal_small_rx(libnal, private, cookie, kniov, iovec_dup, mlen, rlen); for (i=0; ikiov_page); @@ -113,7 +113,7 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, } -int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +int gmnal_cb_send(lib_nal_t *libnal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int niov, struct iovec *iov, size_t len) { @@ -123,24 +123,25 @@ int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] len["LPSZ"] nid["LPU64"]\n", niov, len, nid); - nal_data = nal_cb->nal_data; + nal_data = libnal->libnal_data; if (GMNAL_IS_SMALL_MESSAGE(nal_data, niov, iov, len)) { CDEBUG(D_INFO, "This is a small message send\n"); - gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, pid, + gmnal_small_tx(libnal, private, cookie, hdr, type, nid, pid, niov, iov, len); } else { 
CDEBUG(D_ERROR, "Large message send it is not supported\n"); - lib_finalize(nal_cb, private, cookie, PTL_FAIL); + lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); - gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, + gmnal_large_tx(libnal, private, cookie, hdr, type, nid, pid, niov, iov, len); } return(PTL_OK); } -int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int kniov, ptl_kiov_t *kiov, size_t len) +int gmnal_cb_send_pages(lib_nal_t *libnal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int kniov, ptl_kiov_t *kiov, size_t len) { int i = 0; @@ -149,7 +150,7 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, ptl_kiov_t *kiov_dup = kiov; CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] len["LPSZ"]\n", nid, kniov, len); - nal_data = nal_cb->nal_data; + nal_data = libnal->libnal_data; PORTAL_ALLOC(iovec, kniov*sizeof(struct iovec)); iovec_dup = iovec; if (GMNAL_IS_SMALL_MESSAGE(nal_data, 0, NULL, len)) { @@ -168,7 +169,7 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, iovec++; kiov++; } - gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, + gmnal_small_tx(libnal, private, cookie, hdr, type, nid, pid, kniov, iovec_dup, len); } else { CDEBUG(D_ERROR, "Large message send it is not supported yet\n"); @@ -185,7 +186,7 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, iovec++; kiov++; } - gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, + gmnal_large_tx(libnal, private, cookie, hdr, type, nid, pid, kniov, iovec, len); } for (i=0; ievent_callback != NULL) { - CDEBUG(D_INFO, "found callback\n"); - eq->event_callback(ev); - } - - return(PTL_OK); -} - -void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len) -{ - void *ptr = NULL; - CDEBUG(D_TRACE, "gmnal_cb_malloc len["LPSZ"]\n", len); - 
PORTAL_ALLOC(ptr, len); - return(ptr); -} - -void gmnal_cb_free(nal_cb_t *nal_cb, void *buf, size_t len) -{ - CDEBUG(D_TRACE, "gmnal_cb_free :: buf[%p] len["LPSZ"]\n", buf, len); - PORTAL_FREE(buf, len); - return; -} - -void gmnal_cb_unmap(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov, - void **addrkey) -{ - return; -} - -int gmnal_cb_map(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov, - void**addrkey) -{ - return(PTL_OK); -} - -void gmnal_cb_printf(nal_cb_t *nal_cb, const char *fmt, ...) -{ - CDEBUG(D_TRACE, "gmnal_cb_printf\n"); - printk(fmt); - return; -} - -void gmnal_cb_cli(nal_cb_t *nal_cb, unsigned long *flags) -{ - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; - - spin_lock_irqsave(&nal_data->cb_lock, *flags); - return; -} - -void gmnal_cb_sti(nal_cb_t *nal_cb, unsigned long *flags) -{ - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; - - spin_unlock_irqrestore(&nal_data->cb_lock, *flags); - return; -} - -void gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, ptl_event_t *ev) -{ - /* holding cb_lock */ - - if (eq->event_callback != NULL) - eq->event_callback(ev); - - /* We will wake theads sleeping in yield() here, AFTER the - * callback, when we implement blocking yield */ -} - -int gmnal_cb_dist(nal_cb_t *nal_cb, ptl_nid_t nid, unsigned long *dist) +int gmnal_cb_dist(lib_nal_t *libnal, ptl_nid_t nid, unsigned long *dist) { CDEBUG(D_TRACE, "gmnal_cb_dist\n"); if (dist) diff --git a/lustre/portals/knals/gmnal/gmnal_comm.c b/lustre/portals/knals/gmnal/gmnal_comm.c index 1bcd9bd..4af7186 100644 --- a/lustre/portals/knals/gmnal/gmnal_comm.c +++ b/lustre/portals/knals/gmnal/gmnal_comm.c @@ -189,6 +189,7 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) unsigned int snode, sport, type, length; gmnal_msghdr_t *gmnal_msghdr; ptl_hdr_t *portals_hdr; + int rc; CDEBUG(D_INFO, "nal_data [%p], we[%p] type [%d]\n", nal_data, we, gmnal_type); @@ -219,10 +220,12 @@ 
gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) */ srxd = gmnal_rxbuffer_to_srxd(nal_data, buffer); CDEBUG(D_INFO, "Back from gmnal_rxbuffer_to_srxd\n"); - srxd->nal_data = nal_data; if (!srxd) { CDEBUG(D_ERROR, "Failed to get receive descriptor\n"); - lib_parse(nal_data->nal_cb, portals_hdr, srxd); + /* I think passing a NULL srxd to lib_parse will crash + * gmnal_recv() */ + LBUG(); + lib_parse(nal_data->libnal, portals_hdr, srxd); return(GMNAL_STATUS_FAIL); } @@ -234,6 +237,7 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) return(GMNAL_STATUS_OK); } + srxd->nal_data = nal_data; srxd->type = gmnal_type; srxd->nsiov = gmnal_msghdr->niov; srxd->gm_source_node = gmnal_msghdr->sender_node_id; @@ -245,7 +249,12 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) * cb_recv is responsible for returning the buffer * for future receive */ - lib_parse(nal_data->nal_cb, portals_hdr, srxd); + rc = lib_parse(nal_data->libnal, portals_hdr, srxd); + + if (rc != PTL_OK) { + /* I just received garbage; take appropriate action... 
*/ + LBUG(); + } return(GMNAL_STATUS_OK); } @@ -309,19 +318,19 @@ gmnal_rx_bad(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, gmnal_srxd_t *srxd) * Call lib_finalize */ int -gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +gmnal_small_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) { gmnal_srxd_t *srxd = NULL; void *buffer = NULL; - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; + gmnal_data_t *nal_data = (gmnal_data_t*)libnal->nal_data; CDEBUG(D_TRACE, "niov [%d] mlen["LPSZ"]\n", niov, mlen); if (!private) { CDEBUG(D_ERROR, "gmnal_small_rx no context\n"); - lib_finalize(nal_cb, private, cookie, PTL_FAIL); + lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -343,7 +352,7 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, * let portals library know receive is complete */ CDEBUG(D_PORTALS, "calling lib_finalize\n"); - lib_finalize(nal_cb, private, cookie, PTL_OK); + lib_finalize(libnal, private, cookie, PTL_OK); /* * return buffer so it can be used again */ @@ -365,11 +374,11 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, * The callback function informs when the send is complete. 
*/ int -gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +gmnal_small_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid, unsigned int niov, struct iovec *iov, int size) { - gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data; + gmnal_data_t *nal_data = (gmnal_data_t*)libnal->nal_data; gmnal_stxd_t *stxd = NULL; void *buffer = NULL; gmnal_msghdr_t *msghdr = NULL; @@ -377,9 +386,9 @@ gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, unsigned int local_nid; gm_status_t gm_status = GM_SUCCESS; - CDEBUG(D_TRACE, "gmnal_small_tx nal_cb [%p] private [%p] cookie [%p] " + CDEBUG(D_TRACE, "gmnal_small_tx libnal [%p] private [%p] cookie [%p] " "hdr [%p] type [%d] global_nid ["LPU64"] pid [%d] niov [%d] " - "iov [%p] size [%d]\n", nal_cb, private, cookie, hdr, type, + "iov [%p] size [%d]\n", libnal, private, cookie, hdr, type, global_nid, pid, niov, iov, size); CDEBUG(D_INFO, "portals_hdr:: dest_nid ["LPU64"], src_nid ["LPU64"]\n", @@ -472,7 +481,7 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) gmnal_stxd_t *stxd = (gmnal_stxd_t*)context; lib_msg_t *cookie = stxd->cookie; gmnal_data_t *nal_data = (gmnal_data_t*)stxd->nal_data; - nal_cb_t *nal_cb = nal_data->nal_cb; + lib_nal_t *libnal = nal_data->libnal; if (!stxd) { CDEBUG(D_TRACE, "send completion event for unknown stxd\n"); @@ -592,7 +601,7 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) return; } gmnal_return_stxd(nal_data, stxd); - lib_finalize(nal_cb, stxd, cookie, PTL_OK); + lib_finalize(libnal, stxd, cookie, PTL_OK); return; } @@ -645,7 +654,7 @@ void gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, * this ack, deregister the memory. Only 1 send token is required here. 
*/ int -gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +gmnal_large_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid, unsigned int niov, struct iovec *iov, int size) { @@ -661,15 +670,15 @@ gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int niov_dup; - CDEBUG(D_TRACE, "gmnal_large_tx nal_cb [%p] private [%p], cookie [%p] " + CDEBUG(D_TRACE, "gmnal_large_tx libnal [%p] private [%p], cookie [%p] " "hdr [%p], type [%d] global_nid ["LPU64"], pid [%d], niov [%d], " - "iov [%p], size [%d]\n", nal_cb, private, cookie, hdr, type, + "iov [%p], size [%d]\n", libnal, private, cookie, hdr, type, global_nid, pid, niov, iov, size); - if (nal_cb) - nal_data = (gmnal_data_t*)nal_cb->nal_data; + if (libnal) + nal_data = (gmnal_data_t*)libnal->nal_data; else { - CDEBUG(D_ERROR, "no nal_cb.\n"); + CDEBUG(D_ERROR, "no libnal.\n"); return(GMNAL_STATUS_FAIL); } @@ -811,11 +820,11 @@ gmnal_large_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) * data from the sender. 
*/ int -gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, +gmnal_large_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, unsigned int nriov, struct iovec *riov, size_t mlen, size_t rlen) { - gmnal_data_t *nal_data = nal_cb->nal_data; + gmnal_data_t *nal_data = libnal->nal_data; gmnal_srxd_t *srxd = (gmnal_srxd_t*)private; void *buffer = NULL; struct iovec *riov_dup; @@ -823,13 +832,13 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, gmnal_msghdr_t *msghdr = NULL; gm_status_t gm_status; - CDEBUG(D_TRACE, "gmnal_large_rx :: nal_cb[%p], private[%p], " + CDEBUG(D_TRACE, "gmnal_large_rx :: libnal[%p], private[%p], " "cookie[%p], niov[%d], iov[%p], mlen["LPSZ"], rlen["LPSZ"]\n", - nal_cb, private, cookie, nriov, riov, mlen, rlen); + libnal, private, cookie, nriov, riov, mlen, rlen); if (!srxd) { CDEBUG(D_ERROR, "gmnal_large_rx no context\n"); - lib_finalize(nal_cb, private, cookie, PTL_FAIL); + lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -1092,7 +1101,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, gmnal_ltxd_t *ltxd = (gmnal_ltxd_t*)context; gmnal_srxd_t *srxd = ltxd->srxd; - nal_cb_t *nal_cb = srxd->nal_data->nal_cb; + lib_nal_t *libnal = srxd->nal_data->libnal; int lastone; struct iovec *riov; int nriov; @@ -1126,7 +1135,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, * Let our client application proceed */ CDEBUG(D_ERROR, "final callback context[%p]\n", srxd); - lib_finalize(nal_cb, srxd, srxd->cookie, PTL_OK); + lib_finalize(libnal, srxd, srxd->cookie, PTL_OK); /* * send an ack to the sender to let him know we got the data @@ -1276,7 +1285,7 @@ gmnal_large_tx_ack_callback(gm_port_t *gm_port, void *context, void gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) { - nal_cb_t *nal_cb = nal_data->nal_cb; + lib_nal_t *libnal = nal_data->libnal; gmnal_stxd_t *stxd = NULL; gmnal_msghdr_t *msghdr = NULL; void *buffer = NULL; @@ -1291,7 +1300,7 @@ 
gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd); - lib_finalize(nal_cb, stxd, stxd->cookie, PTL_OK); + lib_finalize(libnal, stxd, stxd->cookie, PTL_OK); /* * extract the iovec from the stxd, deregister the memory. diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c index f4005de..c595450 100644 --- a/lustre/portals/knals/qswnal/qswnal.c +++ b/lustre/portals/knals/qswnal/qswnal.c @@ -43,6 +43,9 @@ kpr_nal_interface_t kqswnal_router_interface = { #define QSWNAL_SYSCTL_COPY_SMALL_FWD 2 static ctl_table kqswnal_ctl_table[] = { + {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_puts", + &kqswnal_tunables.kqn_optimized_puts, sizeof (int), + 0644, NULL, &proc_dointvec}, {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets", &kqswnal_tunables.kqn_optimized_gets, sizeof (int), 0644, NULL, &proc_dointvec}, @@ -55,88 +58,6 @@ static ctl_table kqswnal_top_ctl_table[] = { }; #endif -static int -kqswnal_forward(nal_t *nal, - int id, - void *args, size_t args_len, - void *ret, size_t ret_len) -{ - kqswnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kqn_cb; - - LASSERT (nal == &kqswnal_api); - LASSERT (k == &kqswnal_data); - LASSERT (nal_cb == &kqswnal_lib); - - lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */ - return (PTL_OK); -} - -static void -kqswnal_lock (nal_t *nal, unsigned long *flags) -{ - kqswnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kqn_cb; - - LASSERT (nal == &kqswnal_api); - LASSERT (k == &kqswnal_data); - LASSERT (nal_cb == &kqswnal_lib); - - nal_cb->cb_cli(nal_cb,flags); -} - -static void -kqswnal_unlock(nal_t *nal, unsigned long *flags) -{ - kqswnal_data_t *k = nal->nal_data; - nal_cb_t *nal_cb = k->kqn_cb; - - LASSERT (nal == &kqswnal_api); - LASSERT (k == &kqswnal_data); - LASSERT (nal_cb == &kqswnal_lib); - - nal_cb->cb_sti(nal_cb,flags); -} - -static int -kqswnal_yield(nal_t *nal, unsigned long *flags, int milliseconds) 
-{ - /* NB called holding statelock */ - wait_queue_t wait; - unsigned long now = jiffies; - - CDEBUG (D_NET, "yield\n"); - - if (milliseconds == 0) { - if (need_resched()) - schedule(); - return 0; - } - - init_waitqueue_entry(&wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kqswnal_data.kqn_yield_waitq, &wait); - - kqswnal_unlock(nal, flags); - - if (milliseconds < 0) - schedule (); - else - schedule_timeout((milliseconds * HZ) / 1000); - - kqswnal_lock(nal, flags); - - remove_wait_queue(&kqswnal_data.kqn_yield_waitq, &wait); - - if (milliseconds > 0) { - milliseconds -= ((jiffies - now) * 1000) / HZ; - if (milliseconds < 0) - milliseconds = 0; - } - - return (milliseconds); -} - int kqswnal_get_tx_desc (struct portals_cfg *pcfg) { @@ -186,7 +107,7 @@ kqswnal_cmd (struct portals_cfg *pcfg, void *private) kqswnal_data.kqn_nid_offset); kqswnal_data.kqn_nid_offset = pcfg->pcfg_nid - kqswnal_data.kqn_elanid; - kqswnal_lib.ni.nid = pcfg->pcfg_nid; + kqswnal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid; return (0); default: @@ -469,9 +390,11 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, ptl_process_id_t my_process_id; int pkmem = atomic_read(&portal_kmemory); + LASSERT (nal == &kqswnal_api); + if (nal->nal_refct != 0) { if (actual_limits != NULL) - *actual_limits = kqswnal_lib.ni.actual_limits; + *actual_limits = kqswnal_lib.libnal_ni.ni_actual_limits; /* This module got the first ref */ PORTAL_MODULE_USE; return (PTL_OK); @@ -481,18 +404,9 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory)); - memset(&kqswnal_rpc_success, 0, sizeof(kqswnal_rpc_success)); - memset(&kqswnal_rpc_failed, 0, sizeof(kqswnal_rpc_failed)); -#if MULTIRAIL_EKC - kqswnal_rpc_failed.Data[0] = -ECONNREFUSED; -#else - kqswnal_rpc_failed.Status = -ECONNREFUSED; -#endif /* ensure all pointers NULL etc */ memset (&kqswnal_data, 0, sizeof (kqswnal_data)); - kqswnal_data.kqn_cb = &kqswnal_lib; - 
INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds); INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds); INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds); @@ -507,8 +421,12 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, spin_lock_init (&kqswnal_data.kqn_sched_lock); init_waitqueue_head (&kqswnal_data.kqn_sched_waitq); - spin_lock_init (&kqswnal_data.kqn_statelock); - init_waitqueue_head (&kqswnal_data.kqn_yield_waitq); + /* Leave kqn_rpc_success zeroed */ +#if MULTIRAIL_EKC + kqswnal_data.kqn_rpc_failed.Data[0] = -ECONNREFUSED; +#else + kqswnal_data.kqn_rpc_failed.Status = -ECONNREFUSED; +#endif /* pointers/lists/locks initialised */ kqswnal_data.kqn_init = KQN_INIT_DATA; @@ -517,13 +435,13 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, kqswnal_data.kqn_ep = ep_system(); if (kqswnal_data.kqn_ep == NULL) { CERROR("Can't initialise EKC\n"); - kqswnal_shutdown(&kqswnal_api); + kqswnal_shutdown(nal); return (PTL_IFACE_INVALID); } if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) { CERROR("Can't get elan ID\n"); - kqswnal_shutdown(&kqswnal_api); + kqswnal_shutdown(nal); return (PTL_IFACE_INVALID); } #else @@ -534,7 +452,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (kqswnal_data.kqn_ep == NULL) { CERROR ("Can't get elan device 0\n"); - kqswnal_shutdown(&kqswnal_api); + kqswnal_shutdown(nal); return (PTL_IFACE_INVALID); } #endif @@ -550,7 +468,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (kqswnal_data.kqn_eptx == NULL) { CERROR ("Can't allocate transmitter\n"); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -563,7 +481,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (kqswnal_data.kqn_eprx_small == NULL) { CERROR ("Can't install small msg receiver\n"); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -573,7 +491,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (kqswnal_data.kqn_eprx_large == NULL) { CERROR 
("Can't install large msg receiver\n"); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -588,7 +506,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, EP_PERM_WRITE); if (kqswnal_data.kqn_ep_tx_nmh == NULL) { CERROR("Can't reserve tx dma space\n"); - kqswnal_shutdown(&kqswnal_api); + kqswnal_shutdown(nal); return (PTL_NO_SPACE); } #else @@ -603,7 +521,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != DDI_SUCCESS) { CERROR ("Can't reserve rx dma space\n"); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } #endif @@ -617,7 +535,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, EP_PERM_WRITE); if (kqswnal_data.kqn_ep_tx_nmh == NULL) { CERROR("Can't reserve rx dma space\n"); - kqswnal_shutdown(&kqswnal_api); + kqswnal_shutdown(nal); return (PTL_NO_SPACE); } #else @@ -633,7 +551,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != DDI_SUCCESS) { CERROR ("Can't reserve rx dma space\n"); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } #endif @@ -644,7 +562,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS)); if (kqswnal_data.kqn_txds == NULL) { - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -660,7 +578,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); if (ktx->ktx_buffer == NULL) { - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -697,7 +615,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE)); if (kqswnal_data.kqn_rxds == NULL) { - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -732,7 +650,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, struct page *page = alloc_page(GFP_KERNEL); 
if (page == NULL) { - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_NO_SPACE); } @@ -780,12 +698,12 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, my_process_id.nid = kqswnal_elanid2nid(kqswnal_data.kqn_elanid); my_process_id.pid = 0; - rc = lib_init(&kqswnal_lib, my_process_id, + rc = lib_init(&kqswnal_lib, nal, my_process_id, requested_limits, actual_limits); if (rc != PTL_OK) { CERROR ("lib_init failed %d\n", rc); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (rc); } @@ -799,6 +717,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; /* NB this enqueue can allocate/sleep (attr == 0) */ + krx->krx_state = KRX_POSTED; #if MULTIRAIL_EKC rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, &krx->krx_elanbuffer, 0); @@ -810,7 +729,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != EP_SUCCESS) { CERROR ("failed ep_queue_receive %d\n", rc); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_FAIL); } } @@ -822,7 +741,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != 0) { CERROR ("failed to spawn scheduling thread: %d\n", rc); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_FAIL); } } @@ -835,7 +754,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, rc = libcfs_nal_cmd_register (QSWNAL, &kqswnal_cmd, NULL); if (rc != 0) { CERROR ("Can't initialise command interface (rc = %d)\n", rc); - kqswnal_shutdown (&kqswnal_api); + kqswnal_shutdown (nal); return (PTL_FAIL); } @@ -867,17 +786,11 @@ kqswnal_initialise (void) { int rc; - kqswnal_api.startup = kqswnal_startup; - kqswnal_api.shutdown = kqswnal_shutdown; - kqswnal_api.forward = kqswnal_forward; - kqswnal_api.yield = kqswnal_yield; - kqswnal_api.lock = kqswnal_lock; - kqswnal_api.unlock = kqswnal_unlock; - kqswnal_api.nal_data = &kqswnal_data; - - kqswnal_lib.nal_data = &kqswnal_data; + kqswnal_api.nal_ni_init = 
kqswnal_startup; + kqswnal_api.nal_ni_fini = kqswnal_shutdown; /* Initialise dynamic tunables to defaults once only */ + kqswnal_tunables.kqn_optimized_puts = KQSW_OPTIMIZED_PUTS; kqswnal_tunables.kqn_optimized_gets = KQSW_OPTIMIZED_GETS; rc = ptl_register_nal(QSWNAL, &kqswnal_api); diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h index 6978aa0..b085caa 100644 --- a/lustre/portals/knals/qswnal/qswnal.h +++ b/lustre/portals/knals/qswnal/qswnal.h @@ -109,7 +109,8 @@ typedef unsigned long kqsw_csum_t; #define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ -#define KQSW_OPTIMIZED_GETS 1 /* optimized gets? */ +#define KQSW_OPTIMIZED_GETS 1 /* optimize gets >= this size */ +#define KQSW_OPTIMIZED_PUTS (32<<10) /* optimize puts >= this size */ #define KQSW_COPY_SMALL_FWD 0 /* copy small fwd messages to pre-mapped buffer? */ /* @@ -156,12 +157,18 @@ typedef struct int krx_npages; /* # pages in receive buffer */ int krx_nob; /* Number Of Bytes received into buffer */ int krx_rpc_reply_needed; /* peer waiting for EKC RPC reply */ - int krx_rpc_reply_sent; /* rpc reply sent */ + int krx_rpc_reply_status; /* what status to send */ + int krx_state; /* what this RX is doing */ atomic_t krx_refcount; /* how to tell when rpc is done */ kpr_fwd_desc_t krx_fwd; /* embedded forwarding descriptor */ ptl_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */ } kqswnal_rx_t; +#define KRX_POSTED 1 /* receiving */ +#define KRX_PARSE 2 /* ready to be parsed */ +#define KRX_COMPLETING 3 /* waiting to be completed */ + + typedef struct { struct list_head ktx_list; /* enqueue idle/active */ @@ -174,7 +181,7 @@ typedef struct int ktx_nmappedpages; /* # pages mapped for current message */ int ktx_port; /* destination ep port */ ptl_nid_t ktx_nid; /* destination node */ - void *ktx_args[2]; /* completion passthru */ + void *ktx_args[3]; /* completion passthru */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small 
payloads */ unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */ @@ -193,13 +200,16 @@ typedef struct } kqswnal_tx_t; #define KTX_IDLE 0 /* on kqn_(nblk_)idletxds */ -#define KTX_SENDING 1 /* local send */ -#define KTX_FORWARDING 2 /* routing a packet */ -#define KTX_GETTING 3 /* local optimised get */ +#define KTX_FORWARDING 1 /* sending a forwarded packet */ +#define KTX_SENDING 2 /* normal send */ +#define KTX_GETTING 3 /* sending optimised get */ +#define KTX_PUTTING 4 /* sending optimised put */ +#define KTX_RDMAING 5 /* handling optimised put/get */ typedef struct { /* dynamic tunables... */ + int kqn_optimized_puts; /* optimized PUTs? */ int kqn_optimized_gets; /* optimized GETs? */ #if CONFIG_SYSCTL struct ctl_table_header *kqn_sysctl; /* sysctl interface */ @@ -230,9 +240,6 @@ typedef struct struct list_head kqn_delayedfwds; /* delayed forwards */ struct list_head kqn_delayedtxds; /* delayed transmits */ - spinlock_t kqn_statelock; /* cb_cli/cb_sti */ - wait_queue_head_t kqn_yield_waitq; /* where yield waits */ - nal_cb_t *kqn_cb; /* -> kqswnal_lib */ #if MULTIRAIL_EKC EP_SYS *kqn_ep; /* elan system */ EP_NMH *kqn_ep_tx_nmh; /* elan reserved tx vaddrs */ @@ -250,6 +257,9 @@ typedef struct ptl_nid_t kqn_nid_offset; /* this cluster's NID offset */ int kqn_nnodes; /* this cluster's size */ int kqn_elanid; /* this nodes's elan ID */ + + EP_STATUSBLK kqn_rpc_success; /* preset RPC reply status blocks */ + EP_STATUSBLK kqn_rpc_failed; } kqswnal_data_t; /* kqn_init state */ @@ -258,21 +268,16 @@ typedef struct #define KQN_INIT_LIB 2 #define KQN_INIT_ALL 3 -extern nal_cb_t kqswnal_lib; +extern lib_nal_t kqswnal_lib; extern nal_t kqswnal_api; extern kqswnal_tunables_t kqswnal_tunables; extern kqswnal_data_t kqswnal_data; -/* global pre-prepared replies to keep off the stack */ -extern EP_STATUSBLK kqswnal_rpc_success; -extern EP_STATUSBLK kqswnal_rpc_failed; - extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg); extern void 
kqswnal_rxhandler(EP_RXD *rxd); extern int kqswnal_scheduler (void *); extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); -extern void kqswnal_dma_reply_complete (EP_RXD *rxd); -extern void kqswnal_requeue_rx (kqswnal_rx_t *krx); +extern void kqswnal_rx_done (kqswnal_rx_t *krx); static inline ptl_nid_t kqswnal_elanid2nid (int elanid) @@ -291,6 +296,12 @@ kqswnal_nid2elanid (ptl_nid_t nid) return (nid - kqswnal_data.kqn_nid_offset); } +static inline ptl_nid_t +kqswnal_rx_nid(kqswnal_rx_t *krx) +{ + return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd))); +} + static inline int kqswnal_pages_spanned (void *base, int nob) { @@ -313,11 +324,11 @@ static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob) } #endif -static inline void kqswnal_rx_done (kqswnal_rx_t *krx) +static inline void kqswnal_rx_decref (kqswnal_rx_t *krx) { LASSERT (atomic_read (&krx->krx_refcount) > 0); if (atomic_dec_and_test (&krx->krx_refcount)) - kqswnal_requeue_rx(krx); + kqswnal_rx_done(krx); } #if MULTIRAIL_EKC diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c index 2bcb853..e1237a8 100644 --- a/lustre/portals/knals/qswnal/qswnal_cb.c +++ b/lustre/portals/knals/qswnal/qswnal_cb.c @@ -26,102 +26,14 @@ #include "qswnal.h" -EP_STATUSBLK kqswnal_rpc_success; -EP_STATUSBLK kqswnal_rpc_failed; - /* * LIB functions follow * */ -static ptl_err_t -kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, - size_t len) -{ - CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n", - nal->ni.nid, len, src_addr, dst_addr ); - memcpy( dst_addr, src_addr, len ); - - return (PTL_OK); -} - -static ptl_err_t -kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, - size_t len) -{ - CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n", - nal->ni.nid, len, src_addr, dst_addr ); - memcpy( dst_addr, src_addr, len ); - - return (PTL_OK); -} - -static void * -kqswnal_malloc(nal_cb_t 
*nal, size_t len) -{ - void *buf; - - PORTAL_ALLOC(buf, len); - return (buf); -} - -static void -kqswnal_free(nal_cb_t *nal, void *buf, size_t len) -{ - PORTAL_FREE(buf, len); -} - -static void -kqswnal_printf (nal_cb_t * nal, const char *fmt, ...) -{ - va_list ap; - char msg[256]; - - va_start (ap, fmt); - vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ - va_end (ap); - - msg[sizeof (msg) - 1] = 0; /* ensure terminated */ - - CDEBUG (D_NET, "%s", msg); -} - -#if (defined(CONFIG_SPARC32) || defined(CONFIG_SPARC64)) -# error "Can't save/restore irq contexts in different procedures" -#endif - -static void -kqswnal_cli(nal_cb_t *nal, unsigned long *flags) -{ - kqswnal_data_t *data= nal->nal_data; - - spin_lock_irqsave(&data->kqn_statelock, *flags); -} - - -static void -kqswnal_sti(nal_cb_t *nal, unsigned long *flags) -{ - kqswnal_data_t *data= nal->nal_data; - - spin_unlock_irqrestore(&data->kqn_statelock, *flags); -} - -static void -kqswnal_callback(nal_cb_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev) -{ - /* holding kqn_statelock */ - - if (eq->event_callback != NULL) - eq->event_callback(ev); - - if (waitqueue_active(&kqswnal_data.kqn_yield_waitq)) - wake_up_all(&kqswnal_data.kqn_yield_waitq); -} - static int -kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +kqswnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { - if (nid == nal->ni.nid) + if (nid == nal->libnal_ni.ni_pid.nid) *dist = 0; /* it's me */ else if (kqswnal_nid2elanid (nid) >= 0) *dist = 1; /* it's my peer */ @@ -212,11 +124,12 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_ do { int fraglen = kiov->kiov_len - offset; - /* nob exactly spans the iovs */ - LASSERT (fraglen <= nob); - /* each frag fits in a page */ + /* each page frag is contained in one page */ LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE); + if (fraglen > nob) + fraglen = nob; + nmapped++; if (nmapped > maxmapped) { CERROR("Can't map message in 
%d pages (max %d)\n", @@ -328,11 +241,12 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, do { int fraglen = iov->iov_len - offset; - long npages = kqswnal_pages_spanned (iov->iov_base, fraglen); - - /* nob exactly spans the iovs */ - LASSERT (fraglen <= nob); + long npages; + if (fraglen > nob) + fraglen = nob; + npages = kqswnal_pages_spanned (iov->iov_base, fraglen); + nmapped += npages; if (nmapped > maxmapped) { CERROR("Can't map message in %d pages (max %d)\n", @@ -519,40 +433,29 @@ kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) void kqswnal_tx_done (kqswnal_tx_t *ktx, int error) { - lib_msg_t *msg; - lib_msg_t *repmsg = NULL; - switch (ktx->ktx_state) { case KTX_FORWARDING: /* router asked me to forward this packet */ kpr_fwd_done (&kqswnal_data.kqn_router, (kpr_fwd_desc_t *)ktx->ktx_args[0], error); break; - case KTX_SENDING: /* packet sourced locally */ - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], + case KTX_RDMAING: /* optimized GET/PUT handled */ + case KTX_PUTTING: /* optimized PUT sent */ + case KTX_SENDING: /* normal send */ + lib_finalize (&kqswnal_lib, NULL, (lib_msg_t *)ktx->ktx_args[1], - (error == 0) ? PTL_OK : - (error == -ENOMEM) ? PTL_NO_SPACE : PTL_FAIL); + (error == 0) ? PTL_OK : PTL_FAIL); break; - case KTX_GETTING: /* Peer has DMA-ed direct? */ - msg = (lib_msg_t *)ktx->ktx_args[1]; - - if (error == 0) { - repmsg = lib_create_reply_msg (&kqswnal_lib, - ktx->ktx_nid, msg); - if (repmsg == NULL) - error = -ENOMEM; - } - - if (error == 0) { - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], - msg, PTL_OK); - lib_finalize (&kqswnal_lib, NULL, repmsg, PTL_OK); - } else { - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg, - (error == -ENOMEM) ? 
PTL_NO_SPACE : PTL_FAIL); - } + case KTX_GETTING: /* optimized GET sent & REPLY received */ + /* Complete the GET with success since we can't avoid + * delivering a REPLY event; we committed to it when we + * launched the GET */ + lib_finalize (&kqswnal_lib, NULL, + (lib_msg_t *)ktx->ktx_args[1], PTL_OK); + lib_finalize (&kqswnal_lib, NULL, + (lib_msg_t *)ktx->ktx_args[2], + (error == 0) ? PTL_OK : PTL_FAIL); break; default: @@ -580,16 +483,27 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) kqswnal_notify_peer_down(ktx); status = -EHOSTDOWN; - } else if (ktx->ktx_state == KTX_GETTING) { - /* RPC completed OK; what did our peer put in the status + } else switch (ktx->ktx_state) { + + case KTX_GETTING: + case KTX_PUTTING: + /* RPC completed OK; but what did our peer put in the status * block? */ #if MULTIRAIL_EKC status = ep_txd_statusblk(txd)->Data[0]; #else status = ep_txd_statusblk(txd)->Status; #endif - } else { + break; + + case KTX_FORWARDING: + case KTX_SENDING: status = 0; + break; + + default: + LBUG(); + break; } kqswnal_tx_done (ktx, status); @@ -610,21 +524,20 @@ kqswnal_launch (kqswnal_tx_t *ktx) return (-ESHUTDOWN); LASSERT (dest >= 0); /* must be a peer */ - if (ktx->ktx_state == KTX_GETTING) { - /* NB ktx_frag[0] is the GET hdr + kqswnal_remotemd_t. The - * other frags are the GET sink which we obviously don't - * send here :) */ -#if MULTIRAIL_EKC + + switch (ktx->ktx_state) { + case KTX_GETTING: + case KTX_PUTTING: + /* NB ktx_frag[0] is the GET/PUT hdr + kqswnal_remotemd_t. 
+ * The other frags are the payload, awaiting RDMA */ rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest, ktx->ktx_port, attr, kqswnal_txhandler, ktx, NULL, ktx->ktx_frags, 1); -#else - rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest, - ktx->ktx_port, attr, kqswnal_txhandler, - ktx, NULL, ktx->ktx_frags, 1); -#endif - } else { + break; + + case KTX_FORWARDING: + case KTX_SENDING: #if MULTIRAIL_EKC rc = ep_transmit_message(kqswnal_data.kqn_eptx, dest, ktx->ktx_port, attr, @@ -636,6 +549,12 @@ kqswnal_launch (kqswnal_tx_t *ktx) kqswnal_txhandler, ktx, ktx->ktx_frags, ktx->ktx_nfrag); #endif + break; + + default: + LBUG(); + rc = -EINVAL; /* no compiler warning please */ + break; } switch (rc) { @@ -658,6 +577,7 @@ kqswnal_launch (kqswnal_tx_t *ktx) } } +#if 0 static char * hdr_type_string (ptl_hdr_t *hdr) { @@ -726,6 +646,7 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr) } } /* end of print_hdr() */ +#endif #if !MULTIRAIL_EKC void @@ -787,114 +708,291 @@ kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv, CERROR ("DATAVEC too small\n"); return (-E2BIG); } +#else +int +kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag, + int nrfrag, EP_NMD *rfrag) +{ + int i; + + if (nlfrag != nrfrag) { + CERROR("Can't cope with unequal # frags: %d local %d remote\n", + nlfrag, nrfrag); + return (-EINVAL); + } + + for (i = 0; i < nlfrag; i++) + if (lfrag[i].nmd_len != rfrag[i].nmd_len) { + CERROR("Can't cope with unequal frags %d(%d):" + " %d local %d remote\n", + i, nlfrag, lfrag[i].nmd_len, rfrag[i].nmd_len); + return (-EINVAL); + } + + return (0); +} #endif -int -kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, - struct iovec *iov, ptl_kiov_t *kiov, - int offset, int nob) +kqswnal_remotemd_t * +kqswnal_parse_rmd (kqswnal_rx_t *krx, int type, ptl_nid_t expected_nid) { - kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page); + ptl_hdr_t *hdr = (ptl_hdr_t *)buffer; kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE); - 
int rc; -#if MULTIRAIL_EKC - int i; -#else - EP_DATAVEC datav[EP_MAXFRAG]; - int ndatav; -#endif - LASSERT (krx->krx_rpc_reply_needed); - LASSERT ((iov == NULL) != (kiov == NULL)); + ptl_nid_t nid = kqswnal_rx_nid(krx); + + /* Note (1) lib_parse has already flipped hdr. + * (2) RDMA addresses are sent in native endian-ness. When + * EKC copes with different endian nodes, I'll fix this (and + * eat my hat :) */ + + LASSERT (krx->krx_nob >= sizeof(*hdr)); + + if (hdr->type != type) { + CERROR ("Unexpected optimized get/put type %d (%d expected)" + "from "LPX64"\n", hdr->type, type, nid); + return (NULL); + } + + if (hdr->src_nid != nid) { + CERROR ("Unexpected optimized get/put source NID " + LPX64" from "LPX64"\n", hdr->src_nid, nid); + return (NULL); + } + + LASSERT (nid == expected_nid); - /* see kqswnal_sendmsg comment regarding endian-ness */ if (buffer + krx->krx_nob < (char *)(rmd + 1)) { /* msg too small to discover rmd size */ CERROR ("Incoming message [%d] too small for RMD (%d needed)\n", krx->krx_nob, (int)(((char *)(rmd + 1)) - buffer)); - return (-EINVAL); + return (NULL); } - + if (buffer + krx->krx_nob < (char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) { /* rmd doesn't fit in the incoming message */ CERROR ("Incoming message [%d] too small for RMD[%d] (%d needed)\n", krx->krx_nob, rmd->kqrmd_nfrag, (int)(((char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) - buffer)); - return (-EINVAL); + return (NULL); } - /* Map the source data... */ + return (rmd); +} + +void +kqswnal_rdma_store_complete (EP_RXD *rxd) +{ + int status = ep_rxd_status(rxd); + kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); + kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; + + CDEBUG((status == EP_SUCCESS) ? 
D_NET : D_ERROR, + "rxd %p, ktx %p, status %d\n", rxd, ktx, status); + + LASSERT (ktx->ktx_state == KTX_RDMAING); + LASSERT (krx->krx_rxd == rxd); + LASSERT (krx->krx_rpc_reply_needed); + + krx->krx_rpc_reply_needed = 0; + kqswnal_rx_decref (krx); + + /* free ktx & finalize() its lib_msg_t */ + kqswnal_tx_done(ktx, (status == EP_SUCCESS) ? 0 : -ECONNABORTED); +} + +void +kqswnal_rdma_fetch_complete (EP_RXD *rxd) +{ + /* Completed fetching the PUT data */ + int status = ep_rxd_status(rxd); + kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); + kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; + unsigned long flags; + + CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR, + "rxd %p, ktx %p, status %d\n", rxd, ktx, status); + + LASSERT (ktx->ktx_state == KTX_RDMAING); + LASSERT (krx->krx_rxd == rxd); + LASSERT (krx->krx_rpc_reply_needed); + + /* Set the RPC completion status */ + status = (status == EP_SUCCESS) ? 0 : -ECONNABORTED; + krx->krx_rpc_reply_status = status; + + /* free ktx & finalize() its lib_msg_t */ + kqswnal_tx_done(ktx, status); + + if (!in_interrupt()) { + /* OK to complete the RPC now (iff I had the last ref) */ + kqswnal_rx_decref (krx); + return; + } + + LASSERT (krx->krx_state == KRX_PARSE); + krx->krx_state = KRX_COMPLETING; + + /* Complete the RPC in thread context */ + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds); + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); +} + +int +kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type, + int niov, struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t len) +{ + kqswnal_remotemd_t *rmd; + kqswnal_tx_t *ktx; + int eprc; + int rc; +#if !MULTIRAIL_EKC + EP_DATAVEC datav[EP_MAXFRAG]; + int ndatav; +#endif + + LASSERT (type == PTL_MSG_GET || type == PTL_MSG_PUT); + /* Not both mapped and paged payload */ + LASSERT (iov == NULL || kiov == NULL); + /* RPC 
completes with failure by default */ + LASSERT (krx->krx_rpc_reply_needed); + LASSERT (krx->krx_rpc_reply_status != 0); + + rmd = kqswnal_parse_rmd(krx, type, libmsg->ev.initiator.nid); + if (rmd == NULL) + return (-EPROTO); + + if (len == 0) { + /* data got truncated to nothing. */ + lib_finalize(&kqswnal_lib, krx, libmsg, PTL_OK); + /* Let kqswnal_rx_done() complete the RPC with success */ + krx->krx_rpc_reply_status = 0; + return (0); + } + + /* NB I'm using 'ktx' just to map the local RDMA buffers; I'm not + actually sending a portals message with it */ + ktx = kqswnal_get_idle_tx(NULL, 0); + if (ktx == NULL) { + CERROR ("Can't get txd for RDMA with "LPX64"\n", + libmsg->ev.initiator.nid); + return (-ENOMEM); + } + + ktx->ktx_state = KTX_RDMAING; + ktx->ktx_nid = libmsg->ev.initiator.nid; + ktx->ktx_args[0] = krx; + ktx->ktx_args[1] = libmsg; + + /* Start mapping at offset 0 (we're not mapping any headers) */ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; + if (kiov != NULL) - rc = kqswnal_map_tx_kiov (ktx, offset, nob, nfrag, kiov); + rc = kqswnal_map_tx_kiov(ktx, offset, len, niov, kiov); else - rc = kqswnal_map_tx_iov (ktx, offset, nob, nfrag, iov); + rc = kqswnal_map_tx_iov(ktx, offset, len, niov, iov); if (rc != 0) { - CERROR ("Can't map source data: %d\n", rc); - return (rc); + CERROR ("Can't map local RDMA data: %d\n", rc); + goto out; } #if MULTIRAIL_EKC - if (ktx->ktx_nfrag != rmd->kqrmd_nfrag) { - CERROR("Can't cope with unequal # frags: %d local %d remote\n", - ktx->ktx_nfrag, rmd->kqrmd_nfrag); - return (-EINVAL); + rc = kqswnal_check_rdma (ktx->ktx_nfrag, ktx->ktx_frags, + rmd->kqrmd_nfrag, rmd->kqrmd_frag); + if (rc != 0) { + CERROR ("Incompatible RDMA descriptors\n"); + goto out; } - - for (i = 0; i < rmd->kqrmd_nfrag; i++) - if (ktx->ktx_frags[i].nmd_len != rmd->kqrmd_frag[i].nmd_len) { - CERROR("Can't cope with unequal frags %d(%d):" - " %d local %d remote\n", - i, rmd->kqrmd_nfrag, - ktx->ktx_frags[i].nmd_len, - rmd->kqrmd_frag[i].nmd_len); - 
return (-EINVAL); - } #else - ndatav = kqswnal_eiovs2datav (EP_MAXFRAG, datav, - ktx->ktx_nfrag, ktx->ktx_frags, - rmd->kqrmd_nfrag, rmd->kqrmd_frag); + switch (type) { + default: + LBUG(); + + case PTL_MSG_GET: + ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, + ktx->ktx_nfrag, ktx->ktx_frags, + rmd->kqrmd_nfrag, rmd->kqrmd_frag); + break; + + case PTL_MSG_PUT: + ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, + rmd->kqrmd_nfrag, rmd->kqrmd_frag, + ktx->ktx_nfrag, ktx->ktx_frags); + break; + } + if (ndatav < 0) { CERROR ("Can't create datavec: %d\n", ndatav); - return (ndatav); + rc = ndatav; + goto out; } #endif - /* Our caller will start to race with kqswnal_dma_reply_complete... */ - LASSERT (atomic_read (&krx->krx_refcount) == 1); - atomic_set (&krx->krx_refcount, 2); + LASSERT (atomic_read(&krx->krx_refcount) > 0); + /* Take an extra ref for the completion callback */ + atomic_inc(&krx->krx_refcount); -#if MULTIRAIL_EKC - rc = ep_complete_rpc(krx->krx_rxd, kqswnal_dma_reply_complete, ktx, - &kqswnal_rpc_success, - ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag); - if (rc == EP_SUCCESS) - return (0); + switch (type) { + default: + LBUG(); - /* Well we tried... 
*/ - krx->krx_rpc_reply_needed = 0; + case PTL_MSG_GET: +#if MULTIRAIL_EKC + eprc = ep_complete_rpc(krx->krx_rxd, + kqswnal_rdma_store_complete, ktx, + &kqswnal_data.kqn_rpc_success, + ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag); #else - rc = ep_complete_rpc (krx->krx_rxd, kqswnal_dma_reply_complete, ktx, - &kqswnal_rpc_success, datav, ndatav); - if (rc == EP_SUCCESS) - return (0); - - /* "old" EKC destroys rxd on failed completion */ - krx->krx_rxd = NULL; + eprc = ep_complete_rpc (krx->krx_rxd, + kqswnal_rdma_store_complete, ktx, + &kqswnal_data.kqn_rpc_success, + datav, ndatav); + if (eprc != EP_SUCCESS) /* "old" EKC destroys rxd on failed completion */ + krx->krx_rxd = NULL; #endif + if (eprc != EP_SUCCESS) { + CERROR("can't complete RPC: %d\n", eprc); + /* don't re-attempt RPC completion */ + krx->krx_rpc_reply_needed = 0; + rc = -ECONNABORTED; + } + break; + + case PTL_MSG_PUT: +#if MULTIRAIL_EKC + eprc = ep_rpc_get (krx->krx_rxd, + kqswnal_rdma_fetch_complete, ktx, + rmd->kqrmd_frag, ktx->ktx_frags, ktx->ktx_nfrag); +#else + eprc = ep_rpc_get (krx->krx_rxd, + kqswnal_rdma_fetch_complete, ktx, + datav, ndatav); +#endif + if (eprc != EP_SUCCESS) { + CERROR("ep_rpc_get failed: %d\n", eprc); + rc = -ECONNABORTED; + } + break; + } - CERROR("can't complete RPC: %d\n", rc); - - /* reset refcount back to 1: we're not going to be racing with - * kqswnal_dma_reply_complete. */ - atomic_set (&krx->krx_refcount, 1); + out: + if (rc != 0) { + kqswnal_rx_decref(krx); /* drop callback's ref */ + kqswnal_put_idle_tx (ktx); + } - return (-ECONNABORTED); + atomic_dec(&kqswnal_data.kqn_pending_txs); + return (rc); } static ptl_err_t -kqswnal_sendmsg (nal_cb_t *nal, +kqswnal_sendmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -916,6 +1014,8 @@ kqswnal_sendmsg (nal_cb_t *nal, int sumoff; int sumnob; #endif + /* NB 1. hdr is in network byte order */ + /* 2. 
'private' depends on the message type */ CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 " pid %u\n", payload_nob, payload_niov, nid, pid); @@ -934,6 +1034,15 @@ kqswnal_sendmsg (nal_cb_t *nal, return (PTL_FAIL); } + if (type == PTL_MSG_REPLY && /* can I look in 'private' */ + ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { /* is it an RPC */ + /* Must be a REPLY for an optimized GET */ + rc = kqswnal_rdma ((kqswnal_rx_t *)private, libmsg, PTL_MSG_GET, + payload_niov, payload_iov, payload_kiov, + payload_offset, payload_nob); + return ((rc == 0) ? PTL_OK : PTL_FAIL); + } + targetnid = nid; if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? */ rc = kpr_lookup (&kqswnal_data.kqn_router, nid, @@ -956,35 +1065,16 @@ kqswnal_sendmsg (nal_cb_t *nal, type == PTL_MSG_REPLY || in_interrupt())); if (ktx == NULL) { - kqswnal_cerror_hdr (hdr); + CERROR ("Can't get txd for msg type %d for "LPX64"\n", + type, libmsg->ev.initiator.nid); return (PTL_NO_SPACE); } + ktx->ktx_state = KTX_SENDING; ktx->ktx_nid = targetnid; ktx->ktx_args[0] = private; ktx->ktx_args[1] = libmsg; - - if (type == PTL_MSG_REPLY && - ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { - if (nid != targetnid || - kqswnal_nid2elanid(nid) != - ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)) { - CERROR("Optimized reply nid conflict: " - "nid "LPX64" via "LPX64" elanID %d\n", - nid, targetnid, - ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)); - rc = -EINVAL; - goto out; - } - - /* peer expects RPC completion with GET data */ - rc = kqswnal_dma_reply (ktx, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); - if (rc != 0) - CERROR ("Can't DMA reply to "LPX64": %d\n", nid, rc); - goto out; - } + ktx->ktx_args[2] = NULL; /* set when a GET commits to REPLY */ memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; @@ -1027,28 +1117,31 @@ kqswnal_sendmsg (nal_cb_t *nal, 
memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum)); #endif - if (kqswnal_tunables.kqn_optimized_gets && - type == PTL_MSG_GET && /* doing a GET */ - nid == targetnid) { /* not forwarding */ + /* The first frag will be the pre-mapped buffer for (at least) the + * portals header. */ + ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; + + if (nid == targetnid && /* not forwarding */ + ((type == PTL_MSG_GET && /* optimize GET? */ + kqswnal_tunables.kqn_optimized_gets != 0 && + NTOH__u32(hdr->msg.get.sink_length) >= kqswnal_tunables.kqn_optimized_gets) || + (type == PTL_MSG_PUT && /* optimize PUT? */ + kqswnal_tunables.kqn_optimized_puts != 0 && + payload_nob >= kqswnal_tunables.kqn_optimized_puts))) { lib_md_t *md = libmsg->md; kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(ktx->ktx_buffer + KQSW_HDR_SIZE); - /* Optimised path: I send over the Elan vaddrs of the get - * sink buffers, and my peer DMAs directly into them. + /* Optimised path: I send over the Elan vaddrs of the local + * buffers, and my peer DMAs directly to/from them. * * First I set up ktx as if it was going to send this * payload, (it needs to map it anyway). This fills * ktx_frags[1] and onward with the network addresses * of the GET sink frags. I copy these into ktx_buffer, - * immediately after the header, and send that as my GET - * message. - * - * Note that the addresses are sent in native endian-ness. - * When EKC copes with different endian nodes, I'll fix - * this (and eat my hat :) */ + * immediately after the header, and send that as my + * message. */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_state = KTX_GETTING; + ktx->ktx_state = (type == PTL_MSG_PUT) ? 
KTX_PUTTING : KTX_GETTING; if ((libmsg->md->options & PTL_MD_KIOV) != 0) rc = kqswnal_map_tx_kiov (ktx, 0, md->length, @@ -1078,12 +1171,21 @@ kqswnal_sendmsg (nal_cb_t *nal, ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob; #endif + if (type == PTL_MSG_GET) { + /* Allocate reply message now while I'm in thread context */ + ktx->ktx_args[2] = lib_create_reply_msg (&kqswnal_lib, + nid, libmsg); + if (ktx->ktx_args[2] == NULL) + goto out; + + /* NB finalizing the REPLY message is my + * responsibility now, whatever happens. */ + } + } else if (payload_nob <= KQSW_TX_MAXCONTIG) { /* small message: single frag copied into the pre-mapped buffer */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_state = KTX_SENDING; #if MULTIRAIL_EKC ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, KQSW_HDR_SIZE + payload_nob); @@ -1105,8 +1207,6 @@ kqswnal_sendmsg (nal_cb_t *nal, /* large message: multiple frags: first is hdr in pre-mapped buffer */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_state = KTX_SENDING; #if MULTIRAIL_EKC ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, KQSW_HDR_SIZE); @@ -1135,15 +1235,29 @@ kqswnal_sendmsg (nal_cb_t *nal, rc == 0 ? "Sent" : "Failed to send", payload_nob, nid, targetnid, rc); - if (rc != 0) + if (rc != 0) { + if (ktx->ktx_state == KTX_GETTING && + ktx->ktx_args[2] != NULL) { + /* We committed to reply, but there was a problem + * launching the GET. We can't avoid delivering a + * REPLY event since we committed above, so we + * pretend the GET succeeded but the REPLY + * failed. */ + rc = 0; + lib_finalize (&kqswnal_lib, private, libmsg, PTL_OK); + lib_finalize (&kqswnal_lib, private, + (lib_msg_t *)ktx->ktx_args[2], PTL_FAIL); + } + kqswnal_put_idle_tx (ktx); - + } + atomic_dec(&kqswnal_data.kqn_pending_txs); return (rc == 0 ? 
PTL_OK : PTL_FAIL); } static ptl_err_t -kqswnal_send (nal_cb_t *nal, +kqswnal_send (lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -1161,7 +1275,7 @@ kqswnal_send (nal_cb_t *nal, } static ptl_err_t -kqswnal_send_pages (nal_cb_t *nal, +kqswnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -1200,7 +1314,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) if (ktx == NULL) /* can't get txd right now */ return; /* fwd will be scheduled when tx desc freed */ - if (nid == kqswnal_lib.ni.nid) /* gateway is me */ + if (nid == kqswnal_lib.libnal_ni.ni_pid.nid) /* gateway is me */ nid = fwd->kprfd_target_nid; /* target is final dest */ if (kqswnal_nid2elanid (nid) < 0) { @@ -1254,9 +1368,8 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) if (rc != 0) { CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc); - kqswnal_put_idle_tx (ktx); /* complete now (with failure) */ - kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc); + kqswnal_tx_done (ktx, rc); } atomic_dec(&kqswnal_data.kqn_pending_txs); @@ -1277,29 +1390,48 @@ kqswnal_fwd_callback (void *arg, int error) NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error); } - kqswnal_requeue_rx (krx); + LASSERT (atomic_read(&krx->krx_refcount) == 1); + kqswnal_rx_decref (krx); } void -kqswnal_dma_reply_complete (EP_RXD *rxd) +kqswnal_requeue_rx (kqswnal_rx_t *krx) { - int status = ep_rxd_status(rxd); - kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - lib_msg_t *msg = (lib_msg_t *)ktx->ktx_args[1]; - - CDEBUG((status == EP_SUCCESS) ? 
D_NET : D_ERROR, - "rxd %p, ktx %p, status %d\n", rxd, ktx, status); + LASSERT (atomic_read(&krx->krx_refcount) == 0); + LASSERT (!krx->krx_rpc_reply_needed); - LASSERT (krx->krx_rxd == rxd); - LASSERT (krx->krx_rpc_reply_needed); + krx->krx_state = KRX_POSTED; - krx->krx_rpc_reply_needed = 0; - kqswnal_rx_done (krx); +#if MULTIRAIL_EKC + if (kqswnal_data.kqn_shuttingdown) { + /* free EKC rxd on shutdown */ + ep_complete_receive(krx->krx_rxd); + } else { + /* repost receive */ + ep_requeue_receive(krx->krx_rxd, + kqswnal_rxhandler, krx, + &krx->krx_elanbuffer, 0); + } +#else + if (kqswnal_data.kqn_shuttingdown) + return; - lib_finalize (&kqswnal_lib, NULL, msg, - (status == EP_SUCCESS) ? PTL_OK : PTL_FAIL); - kqswnal_put_idle_tx (ktx); + if (krx->krx_rxd == NULL) { + /* We had a failed ep_complete_rpc() which nukes the + * descriptor in "old" EKC */ + int eprc = ep_queue_receive(krx->krx_eprx, + kqswnal_rxhandler, krx, + krx->krx_elanbuffer, + krx->krx_npages * PAGE_SIZE, 0); + LASSERT (eprc == EP_SUCCESS); + /* We don't handle failure here; it's incredibly rare + * (never reported?) and only happens with "old" EKC */ + } else { + ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, + krx->krx_elanbuffer, + krx->krx_npages * PAGE_SIZE); + } +#endif } void @@ -1319,71 +1451,45 @@ kqswnal_rpc_complete (EP_RXD *rxd) } void -kqswnal_requeue_rx (kqswnal_rx_t *krx) +kqswnal_rx_done (kqswnal_rx_t *krx) { - int rc; + int rc; + EP_STATUSBLK *sblk; LASSERT (atomic_read(&krx->krx_refcount) == 0); if (krx->krx_rpc_reply_needed) { + /* We've not completed the peer's RPC yet... */ + sblk = (krx->krx_rpc_reply_status == 0) ? + &kqswnal_data.kqn_rpc_success : + &kqswnal_data.kqn_rpc_failed; - /* We failed to complete the peer's optimized GET (e.g. we - * couldn't map the source buffers). We complete the - * peer's EKC rpc now with failure. 
*/ + LASSERT (!in_interrupt()); #if MULTIRAIL_EKC - rc = ep_complete_rpc(krx->krx_rxd, kqswnal_rpc_complete, krx, - &kqswnal_rpc_failed, NULL, NULL, 0); + rc = ep_complete_rpc(krx->krx_rxd, + kqswnal_rpc_complete, krx, + sblk, NULL, NULL, 0); if (rc == EP_SUCCESS) return; - - CERROR("can't complete RPC: %d\n", rc); #else - if (krx->krx_rxd != NULL) { - /* We didn't try (and fail) to complete earlier... */ - rc = ep_complete_rpc(krx->krx_rxd, - kqswnal_rpc_complete, krx, - &kqswnal_rpc_failed, NULL, 0); - if (rc == EP_SUCCESS) - return; - - CERROR("can't complete RPC: %d\n", rc); - } - - /* NB the old ep_complete_rpc() frees rxd on failure, so we - * have to requeue from scratch here, unless we're shutting - * down */ - if (kqswnal_data.kqn_shuttingdown) + rc = ep_complete_rpc(krx->krx_rxd, + kqswnal_rpc_complete, krx, + sblk, NULL, 0); + if (rc == EP_SUCCESS) return; - rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, - krx->krx_elanbuffer, - krx->krx_npages * PAGE_SIZE, 0); - LASSERT (rc == EP_SUCCESS); - /* We don't handle failure here; it's incredibly rare - * (never reported?) 
and only happens with "old" EKC */ - return; + /* "old" EKC destroys rxd on failed completion */ + krx->krx_rxd = NULL; #endif + CERROR("can't complete RPC: %d\n", rc); + krx->krx_rpc_reply_needed = 0; } -#if MULTIRAIL_EKC - if (kqswnal_data.kqn_shuttingdown) { - /* free EKC rxd on shutdown */ - ep_complete_receive(krx->krx_rxd); - } else { - /* repost receive */ - ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, - &krx->krx_elanbuffer, 0); - } -#else - /* don't actually requeue on shutdown */ - if (!kqswnal_data.kqn_shuttingdown) - ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, - krx->krx_elanbuffer, krx->krx_npages * PAGE_SIZE); -#endif + kqswnal_requeue_rx(krx); } void -kqswnal_rx (kqswnal_rx_t *krx) +kqswnal_parse (kqswnal_rx_t *krx) { ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(krx->krx_kiov[0].kiov_page); ptl_nid_t dest_nid = NTOH__u64 (hdr->dest_nid); @@ -1391,25 +1497,28 @@ kqswnal_rx (kqswnal_rx_t *krx) int nob; int niov; - LASSERT (atomic_read(&krx->krx_refcount) == 0); + LASSERT (atomic_read(&krx->krx_refcount) == 1); + + if (dest_nid == kqswnal_lib.libnal_ni.ni_pid.nid) { /* It's for me :) */ + /* I ignore parse errors since I'm not consuming a byte + * stream */ + (void)lib_parse (&kqswnal_lib, hdr, krx); - if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */ - atomic_set(&krx->krx_refcount, 1); - lib_parse (&kqswnal_lib, hdr, krx); - kqswnal_rx_done(krx); + /* Drop my ref; any RDMA activity takes an additional ref */ + kqswnal_rx_decref(krx); return; } #if KQSW_CHECKSUM - CERROR ("checksums for forwarded packets not implemented\n"); - LBUG (); + LASSERTF (0, "checksums for forwarded packets not implemented\n"); #endif + if (kqswnal_nid2elanid (dest_nid) >= 0) /* should have gone direct to peer */ { CERROR("dropping packet from "LPX64" for "LPX64 ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid); - kqswnal_requeue_rx (krx); + kqswnal_rx_decref (krx); return; } @@ -1451,7 +1560,9 @@ kqswnal_rxhandler(EP_RXD *rxd) rxd, krx, 
nob, status); LASSERT (krx != NULL); - + LASSERT (krx->krx_state == KRX_POSTED); + + krx->krx_state = KRX_PARSE; krx->krx_rxd = rxd; krx->krx_nob = nob; #if MULTIRAIL_EKC @@ -1459,7 +1570,10 @@ kqswnal_rxhandler(EP_RXD *rxd) #else krx->krx_rpc_reply_needed = ep_rxd_isrpc(rxd); #endif - + /* Default to failure if an RPC reply is requested but not handled */ + krx->krx_rpc_reply_status = -EPROTO; + atomic_set (&krx->krx_refcount, 1); + /* must receive a whole header to be able to parse */ if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t)) { @@ -1475,12 +1589,12 @@ kqswnal_rxhandler(EP_RXD *rxd) CERROR("receive status failed with status %d nob %d\n", ep_rxd_status(rxd), nob); #endif - kqswnal_requeue_rx (krx); + kqswnal_rx_decref(krx); return; } if (!in_interrupt()) { - kqswnal_rx (krx); + kqswnal_parse(krx); return; } @@ -1540,7 +1654,7 @@ kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) #endif static ptl_err_t -kqswnal_recvmsg (nal_cb_t *nal, +kqswnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, @@ -1552,16 +1666,18 @@ kqswnal_recvmsg (nal_cb_t *nal, { kqswnal_rx_t *krx = (kqswnal_rx_t *)private; char *buffer = page_address(krx->krx_kiov[0].kiov_page); + ptl_hdr_t *hdr = (ptl_hdr_t *)buffer; int page; char *page_ptr; int page_nob; char *iov_ptr; int iov_nob; int frag; + int rc; #if KQSW_CHECKSUM kqsw_csum_t senders_csum; kqsw_csum_t payload_csum = 0; - kqsw_csum_t hdr_csum = kqsw_csum(0, buffer, sizeof(ptl_hdr_t)); + kqsw_csum_t hdr_csum = kqsw_csum(0, hdr, sizeof(*hdr)); size_t csum_len = mlen; int csum_frags = 0; int csum_nob = 0; @@ -1574,8 +1690,18 @@ kqswnal_recvmsg (nal_cb_t *nal, if (senders_csum != hdr_csum) kqswnal_csum_error (krx, 1); #endif + /* NB lib_parse() has already flipped *hdr */ + CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen); + if (krx->krx_rpc_reply_needed && + hdr->type == PTL_MSG_PUT) { + /* This must be an optimized PUT */ + rc = kqswnal_rdma (krx, libmsg, PTL_MSG_PUT, + niov, iov, 
kiov, offset, mlen); + return (rc == 0 ? PTL_OK : PTL_FAIL); + } + /* What was actually received must be >= payload. */ LASSERT (mlen <= rlen); if (krx->krx_nob < KQSW_HDR_SIZE + mlen) { @@ -1691,7 +1817,7 @@ kqswnal_recvmsg (nal_cb_t *nal, } static ptl_err_t -kqswnal_recv(nal_cb_t *nal, +kqswnal_recv(lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, @@ -1706,7 +1832,7 @@ kqswnal_recv(nal_cb_t *nal, } static ptl_err_t -kqswnal_recv_pages (nal_cb_t *nal, +kqswnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, @@ -1766,7 +1892,18 @@ kqswnal_scheduler (void *arg) spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); - kqswnal_rx (krx); + switch (krx->krx_state) { + case KRX_PARSE: + kqswnal_parse (krx); + break; + case KRX_COMPLETING: + /* Drop last ref to reply to RPC and requeue */ + LASSERT (krx->krx_rpc_reply_needed); + kqswnal_rx_decref (krx); + break; + default: + LBUG(); + } did_something = 1; spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); @@ -1835,20 +1972,12 @@ kqswnal_scheduler (void *arg) return (0); } -nal_cb_t kqswnal_lib = +lib_nal_t kqswnal_lib = { - nal_data: &kqswnal_data, /* NAL private data */ - cb_send: kqswnal_send, - cb_send_pages: kqswnal_send_pages, - cb_recv: kqswnal_recv, - cb_recv_pages: kqswnal_recv_pages, - cb_read: kqswnal_read, - cb_write: kqswnal_write, - cb_malloc: kqswnal_malloc, - cb_free: kqswnal_free, - cb_printf: kqswnal_printf, - cb_cli: kqswnal_cli, - cb_sti: kqswnal_sti, - cb_callback: kqswnal_callback, - cb_dist: kqswnal_dist + libnal_data: &kqswnal_data, /* NAL private data */ + libnal_send: kqswnal_send, + libnal_send_pages: kqswnal_send_pages, + libnal_recv: kqswnal_recv, + libnal_recv_pages: kqswnal_recv_pages, + libnal_dist: kqswnal_dist }; diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c index 32bbbec..9d39cb1b 100644 --- a/lustre/portals/knals/socknal/socknal.c +++ b/lustre/portals/knals/socknal/socknal.c @@ 
-74,83 +74,9 @@ static ctl_table ksocknal_top_ctl_table[] = { #endif int -ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, - void *ret, size_t ret_len) -{ - ksock_nal_data_t *k; - nal_cb_t *nal_cb; - - k = nal->nal_data; - nal_cb = k->ksnd_nal_cb; - - lib_dispatch(nal_cb, k, id, args, ret); /* ksocknal_send needs k */ - return PTL_OK; -} - -void -ksocknal_api_lock(nal_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *k; - nal_cb_t *nal_cb; - - k = nal->nal_data; - nal_cb = k->ksnd_nal_cb; - nal_cb->cb_cli(nal_cb,flags); -} - -void -ksocknal_api_unlock(nal_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *k; - nal_cb_t *nal_cb; - - k = nal->nal_data; - nal_cb = k->ksnd_nal_cb; - nal_cb->cb_sti(nal_cb,flags); -} - -int -ksocknal_api_yield(nal_t *nal, unsigned long *flags, int milliseconds) -{ - /* NB called holding statelock */ - wait_queue_t wait; - unsigned long now = jiffies; - - CDEBUG (D_NET, "yield\n"); - - if (milliseconds == 0) { - our_cond_resched(); - return 0; - } - - init_waitqueue_entry(&wait, current); - set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&ksocknal_data.ksnd_yield_waitq, &wait); - - ksocknal_api_unlock(nal, flags); - - if (milliseconds < 0) - schedule (); - else - schedule_timeout((milliseconds * HZ) / 1000); - - ksocknal_api_lock(nal, flags); - - remove_wait_queue (&ksocknal_data.ksnd_yield_waitq, &wait); - - if (milliseconds > 0) { - milliseconds -= ((jiffies - now) * 1000) / HZ; - if (milliseconds < 0) - milliseconds = 0; - } - - return (milliseconds); -} - -int ksocknal_set_mynid(ptl_nid_t nid) { - lib_ni_t *ni = &ksocknal_lib.ni; + lib_ni_t *ni = &ksocknal_lib.libnal_ni; /* FIXME: we have to do this because we call lib_init() at module * insertion time, which is before we have 'mynid' available. lib_init @@ -159,9 +85,9 @@ ksocknal_set_mynid(ptl_nid_t nid) * problem. 
*/ CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->nid); + nid, ni->ni_pid.nid); - ni->nid = nid; + ni->ni_pid.nid = nid; return (0); } @@ -1527,14 +1453,18 @@ ksocknal_api_shutdown (nal_t *nal) /* flag threads to terminate; wake and wait for them to die */ ksocknal_data.ksnd_shuttingdown = 1; + mb(); wake_up_all (&ksocknal_data.ksnd_autoconnectd_waitq); wake_up_all (&ksocknal_data.ksnd_reaper_waitq); for (i = 0; i < SOCKNAL_N_SCHED; i++) wake_up_all(&ksocknal_data.ksnd_schedulers[i].kss_waitq); + i = 4; while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) { - CDEBUG (D_NET, "waitinf for %d threads to terminate\n", + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d threads to terminate\n", atomic_read (&ksocknal_data.ksnd_nthreads)); set_current_state (TASK_UNINTERRUPTIBLE); schedule_timeout (HZ); @@ -1590,7 +1520,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, if (nal->nal_refct != 0) { if (actual_limits != NULL) - *actual_limits = ksocknal_lib.ni.actual_limits; + *actual_limits = ksocknal_lib.libnal_ni.ni_actual_limits; /* This module got the first ref */ PORTAL_MODULE_USE; return (PTL_OK); @@ -1613,10 +1543,6 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, rwlock_init(&ksocknal_data.ksnd_global_lock); - ksocknal_data.ksnd_nal_cb = &ksocknal_lib; - spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock); - init_waitqueue_head(&ksocknal_data.ksnd_yield_waitq); - spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); @@ -1646,7 +1572,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, sizeof(ksock_sched_t) * SOCKNAL_N_SCHED); if (ksocknal_data.ksnd_schedulers == NULL) { - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (-ENOMEM); } @@ -1666,11 +1592,11 @@ 
ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, process_id.pid = 0; process_id.nid = 0; - rc = lib_init(&ksocknal_lib, process_id, + rc = lib_init(&ksocknal_lib, nal, process_id, requested_limits, actual_limits); if (rc != PTL_OK) { CERROR("lib_init failed: error %d\n", rc); - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (rc); } @@ -1682,7 +1608,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != 0) { CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc); - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (rc); } } @@ -1691,7 +1617,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i)); if (rc != 0) { CERROR("Can't spawn socknal autoconnectd: %d\n", rc); - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (rc); } } @@ -1699,7 +1625,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, rc = ksocknal_thread_start (ksocknal_reaper, NULL); if (rc != 0) { CERROR ("Can't spawn socknal reaper: %d\n", rc); - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (rc); } @@ -1725,7 +1651,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t, fmb_kiov[pool->fmp_buff_pages])); if (fmb == NULL) { - ksocknal_api_shutdown(&ksocknal_api); + ksocknal_api_shutdown(nal); return (-ENOMEM); } @@ -1735,7 +1661,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, fmb->fmb_kiov[j].kiov_page = alloc_page(GFP_KERNEL); if (fmb->fmb_kiov[j].kiov_page == NULL) { - ksocknal_api_shutdown (&ksocknal_api); + ksocknal_api_shutdown (nal); return (-ENOMEM); } @@ -1749,7 +1675,7 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, rc = libcfs_nal_cmd_register(SOCKNAL, &ksocknal_cmd, NULL); if (rc != 0) { CERROR ("Can't initialise command interface (rc = %d)\n", rc); - ksocknal_api_shutdown 
(&ksocknal_api); + ksocknal_api_shutdown (nal); return (rc); } @@ -1794,14 +1720,8 @@ ksocknal_module_init (void) /* check ksnr_connected/connecting field large enough */ LASSERT(SOCKNAL_CONN_NTYPES <= 4); - ksocknal_api.startup = ksocknal_api_startup; - ksocknal_api.forward = ksocknal_api_forward; - ksocknal_api.shutdown = ksocknal_api_shutdown; - ksocknal_api.lock = ksocknal_api_lock; - ksocknal_api.unlock = ksocknal_api_unlock; - ksocknal_api.nal_data = &ksocknal_data; - - ksocknal_lib.nal_data = &ksocknal_data; + ksocknal_api.nal_ni_init = ksocknal_api_startup; + ksocknal_api.nal_ni_fini = ksocknal_api_shutdown; /* Initialise dynamic tunables to defaults once only */ ksocknal_tunables.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h index 87b23dc..ff73f71 100644 --- a/lustre/portals/knals/socknal/socknal.h +++ b/lustre/portals/knals/socknal/socknal.h @@ -160,10 +160,6 @@ typedef struct { struct list_head *ksnd_peers; /* hash table of all my known peers */ int ksnd_peer_hash_size; /* size of ksnd_peers */ - nal_cb_t *ksnd_nal_cb; - spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ - wait_queue_head_t ksnd_yield_waitq; /* where yield waits */ - atomic_t ksnd_nthreads; /* # live threads */ int ksnd_shuttingdown; /* tell threads to exit */ ksock_sched_t *ksnd_schedulers; /* scheduler state */ @@ -364,7 +360,7 @@ typedef struct ksock_peer } ksock_peer_t; -extern nal_cb_t ksocknal_lib; +extern lib_nal_t ksocknal_lib; extern ksock_nal_data_t ksocknal_data; extern ksock_tunables_t ksocknal_tunables; diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c index 21e0abe..5815d16 100644 --- a/lustre/portals/knals/socknal/socknal_cb.c +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -32,101 +32,12 @@ * LIB functions follow * */ -ptl_err_t -ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, - user_ptr src_addr, size_t len) -{ - 
CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", - nal->ni.nid, (long)len, src_addr, dst_addr); - - memcpy( dst_addr, src_addr, len ); - return PTL_OK; -} - -ptl_err_t -ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, - void *src_addr, size_t len) -{ - CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", - nal->ni.nid, (long)len, src_addr, dst_addr); - - memcpy( dst_addr, src_addr, len ); - return PTL_OK; -} - -void * -ksocknal_malloc(nal_cb_t *nal, size_t len) -{ - void *buf; - - PORTAL_ALLOC(buf, len); - - if (buf != NULL) - memset(buf, 0, len); - - return (buf); -} - -void -ksocknal_free(nal_cb_t *nal, void *buf, size_t len) -{ - PORTAL_FREE(buf, len); -} - -void -ksocknal_printf(nal_cb_t *nal, const char *fmt, ...) -{ - va_list ap; - char msg[256]; - - va_start (ap, fmt); - vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ - va_end (ap); - - msg[sizeof (msg) - 1] = 0; /* ensure terminated */ - - CDEBUG (D_NET, "%s", msg); -} - -void -ksocknal_cli(nal_cb_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *data = nal->nal_data; - - /* OK to ignore 'flags'; we're only ever serialise threads and - * never need to lock out interrupts */ - spin_lock(&data->ksnd_nal_cb_lock); -} - -void -ksocknal_sti(nal_cb_t *nal, unsigned long *flags) -{ - ksock_nal_data_t *data; - data = nal->nal_data; - - /* OK to ignore 'flags'; we're only ever serialise threads and - * never need to lock out interrupts */ - spin_unlock(&data->ksnd_nal_cb_lock); -} - -void -ksocknal_callback(nal_cb_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev) -{ - /* holding ksnd_nal_cb_lock */ - - if (eq->event_callback != NULL) - eq->event_callback(ev); - - if (waitqueue_active(&ksocknal_data.ksnd_yield_waitq)) - wake_up_all(&ksocknal_data.ksnd_yield_waitq); -} - int -ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +ksocknal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { /* I would guess that if ksocknal_get_peer (nid) == NULL, and 
we're not routing, then 'nid' is very distant :) */ - if ( nal->ni.nid == nid ) { + if (nal->libnal_ni.ni_pid.nid == nid) { *dist = 0; } else { *dist = 1; @@ -882,8 +793,8 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) { struct list_head *tmp; ksock_route_t *route; - ksock_route_t *candidate = NULL; - int found = 0; + ksock_route_t *first_lazy = NULL; + int found_connecting_or_connected = 0; int bits; list_for_each (tmp, &peer->ksnp_routes) { @@ -896,7 +807,7 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) /* All typed connections have been established, or * an untyped connection has been established, or * connections are currently being established */ - found = 1; + found_connecting_or_connected = 1; continue; } @@ -904,20 +815,24 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) if (!time_after_eq (jiffies, route->ksnr_timeout)) continue; - /* always do eager routes */ + /* eager routes always want to be connected */ if (route->ksnr_eager) return (route); - if (candidate == NULL) { - /* If we don't find any other route that is fully - * connected or connecting, the first connectable - * route is returned. If it fails to connect, it - * will get placed at the end of the list */ - candidate = route; - } + if (first_lazy == NULL) + first_lazy = route; } - - return (found ? NULL : candidate); + + /* No eager routes need to be connected. If some connection has + * already been established, or is being established there's nothing to + * do. Otherwise we return the first lazy route we found. If it fails + * to connect, it will go to the end of the list. 
*/ + + if (!list_empty (&peer->ksnp_conns) || + found_connecting_or_connected) + return (NULL); + + return (first_lazy); } ksock_route_t * @@ -1028,7 +943,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) } ptl_err_t -ksocknal_sendmsg(nal_cb_t *nal, +ksocknal_sendmsg(lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, @@ -1125,7 +1040,7 @@ ksocknal_sendmsg(nal_cb_t *nal, } ptl_err_t -ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, +ksocknal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, size_t payload_offset, size_t payload_len) @@ -1137,7 +1052,7 @@ ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, } ptl_err_t -ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, +ksocknal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, size_t payload_offset, size_t payload_len) @@ -1159,7 +1074,7 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); /* I'm the gateway; must be the last hop */ - if (nid == ksocknal_lib.ni.nid) + if (nid == ksocknal_lib.libnal_ni.ni_pid.nid) nid = fwd->kprfd_target_nid; /* setup iov for hdr */ @@ -1544,7 +1459,8 @@ ksocknal_process_receive (ksock_conn_t *conn) switch (conn->ksnc_rx_state) { case SOCKNAL_RX_HEADER: if (conn->ksnc_hdr.type != HTON__u32(PTL_MSG_HELLO) && - NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) { + NTOH__u64(conn->ksnc_hdr.dest_nid) != + ksocknal_lib.libnal_ni.ni_pid.nid) { /* This packet isn't for me */ ksocknal_fwd_parse (conn); switch (conn->ksnc_rx_state) { @@ -1561,7 +1477,13 @@ ksocknal_process_receive (ksock_conn_t *conn) } /* sets wanted_len, iovs etc */ - lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); + rc = lib_parse(&ksocknal_lib, 
&conn->ksnc_hdr, conn); + + if (rc != PTL_OK) { + /* I just received garbage: give up on this conn */ + ksocknal_close_conn_and_siblings (conn, rc); + return (-EPROTO); + } if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ conn->ksnc_rx_state = SOCKNAL_RX_BODY; @@ -1608,7 +1530,7 @@ ksocknal_process_receive (ksock_conn_t *conn) } ptl_err_t -ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, +ksocknal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, struct iovec *iov, size_t offset, size_t mlen, size_t rlen) { @@ -1636,7 +1558,7 @@ ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, } ptl_err_t -ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, +ksocknal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, ptl_kiov_t *kiov, size_t offset, size_t mlen, size_t rlen) { @@ -2029,7 +1951,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); - hdr.src_nid = __cpu_to_le64 (ksocknal_lib.ni.nid); + hdr.src_nid = __cpu_to_le64 (ksocknal_lib.libnal_ni.ni_pid.nid); hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); hdr.msg.hello.type = __cpu_to_le32 (*type); @@ -2698,19 +2620,11 @@ ksocknal_reaper (void *arg) return (0); } -nal_cb_t ksocknal_lib = { - nal_data: &ksocknal_data, /* NAL private data */ - cb_send: ksocknal_send, - cb_send_pages: ksocknal_send_pages, - cb_recv: ksocknal_recv, - cb_recv_pages: ksocknal_recv_pages, - cb_read: ksocknal_read, - cb_write: ksocknal_write, - cb_malloc: ksocknal_malloc, - cb_free: ksocknal_free, - cb_printf: ksocknal_printf, - cb_cli: ksocknal_cli, - cb_sti: ksocknal_sti, - cb_callback: ksocknal_callback, - cb_dist: ksocknal_dist +lib_nal_t ksocknal_lib = { + libnal_data: &ksocknal_data, /* NAL private data */ + libnal_send: ksocknal_send, + libnal_send_pages: ksocknal_send_pages, + libnal_recv: 
ksocknal_recv, + libnal_recv_pages: ksocknal_recv_pages, + libnal_dist: ksocknal_dist }; diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c index 4e63c86..06f1578 100644 --- a/lustre/portals/libcfs/module.c +++ b/lustre/portals/libcfs/module.c @@ -52,11 +52,12 @@ #define PORTAL_MINOR 240 struct nal_cmd_handler { + int nch_number; nal_cmd_handler_fn *nch_handler; void *nch_private; }; -static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1]; +static struct nal_cmd_handler nal_cmd[16]; static DECLARE_MUTEX(nal_cmd_sem); #ifdef PORTAL_DEBUG @@ -245,23 +246,53 @@ static inline void freedata(void *data, int len) PORTAL_FREE(data, len); } +struct nal_cmd_handler * +libcfs_find_nal_cmd_handler(int nal) +{ + int i; + + for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++) + if (nal_cmd[i].nch_handler != NULL && + nal_cmd[i].nch_number == nal) + return (&nal_cmd[i]); + + return (NULL); +} + int libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *private) { - int rc = 0; + struct nal_cmd_handler *cmd; + int i; + int rc; CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler); - if (nal > 0 && nal <= NAL_MAX_NR) { - down(&nal_cmd_sem); - if (nal_cmd[nal].nch_handler != NULL) - rc = -EBUSY; - else { - nal_cmd[nal].nch_handler = handler; - nal_cmd[nal].nch_private = private; + down(&nal_cmd_sem); + + if (libcfs_find_nal_cmd_handler(nal) != NULL) { + up (&nal_cmd_sem); + return (-EBUSY); + } + + cmd = NULL; + for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++) + if (nal_cmd[i].nch_handler == NULL) { + cmd = &nal_cmd[i]; + break; } - up(&nal_cmd_sem); + + if (cmd == NULL) { + rc = -EBUSY; + } else { + rc = 0; + cmd->nch_number = nal; + cmd->nch_handler = handler; + cmd->nch_private = private; } + + up(&nal_cmd_sem); + return rc; } EXPORT_SYMBOL(libcfs_nal_cmd_register); @@ -269,14 +300,15 @@ EXPORT_SYMBOL(libcfs_nal_cmd_register); void libcfs_nal_cmd_unregister(int nal) { - CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal); + 
struct nal_cmd_handler *cmd; - LASSERT(nal > 0 && nal <= NAL_MAX_NR); - LASSERT(nal_cmd[nal].nch_handler != NULL); + CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal); down(&nal_cmd_sem); - nal_cmd[nal].nch_handler = NULL; - nal_cmd[nal].nch_private = NULL; + cmd = libcfs_find_nal_cmd_handler(nal); + LASSERT (cmd != NULL); + cmd->nch_handler = NULL; + cmd->nch_private = NULL; up(&nal_cmd_sem); } EXPORT_SYMBOL(libcfs_nal_cmd_unregister); @@ -284,16 +316,17 @@ EXPORT_SYMBOL(libcfs_nal_cmd_unregister); int libcfs_nal_cmd(struct portals_cfg *pcfg) { + struct nal_cmd_handler *cmd; __u32 nal = pcfg->pcfg_nal; int rc = -EINVAL; ENTRY; down(&nal_cmd_sem); - if (nal > 0 && nal <= NAL_MAX_NR && - nal_cmd[nal].nch_handler != NULL) { + cmd = libcfs_find_nal_cmd_handler(nal); + if (cmd != NULL) { CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, pcfg->pcfg_command); - rc = nal_cmd[nal].nch_handler(pcfg, nal_cmd[nal].nch_private); + rc = cmd->nch_handler(pcfg, cmd->nch_private); } up(&nal_cmd_sem); diff --git a/lustre/portals/portals/Makefile.in b/lustre/portals/portals/Makefile.in index 6ce334b..c0f2e71 100644 --- a/lustre/portals/portals/Makefile.in +++ b/lustre/portals/portals/Makefile.in @@ -1,6 +1,6 @@ MODULES := portals -portals-objs := api-eq.o api-init.o api-me.o api-errno.o api-ni.o api-wrap.o -portals-objs += lib-dispatch.o lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o +portals-objs := api-errno.o api-ni.o api-wrap.o +portals-objs += lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o portals-objs += lib-move.o lib-ni.o lib-pid.o module.o @INCLUDE_RULES@ diff --git a/lustre/portals/portals/Makefile.mk b/lustre/portals/portals/Makefile.mk index de01765..088902a 100644 --- a/lustre/portals/portals/Makefile.mk +++ b/lustre/portals/portals/Makefile.mk @@ -6,7 +6,7 @@ include $(src)/../Kernelenv obj-y += portals.o -portals-objs := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o \ +portals-objs := lib-eq.o lib-init.o lib-md.o lib-me.o \ lib-move.o lib-msg.o lib-ni.o 
lib-pid.o \ - api-eq.o api-errno.o api-init.o api-me.o api-ni.o \ - api-wrap.o module.o + api-errno.o api-ni.o api-wrap.o \ + module.o diff --git a/lustre/portals/portals/api-eq.c b/lustre/portals/portals/api-eq.c deleted file mode 100644 index 0306043..0000000 --- a/lustre/portals/portals/api-eq.c +++ /dev/null @@ -1,120 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-eq.c - * User-level event queue management routines - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - -int ptl_get_event (ptl_eq_t *eq, ptl_event_t *ev) -{ - int new_index = eq->sequence & (eq->size - 1); - ptl_event_t *new_event = &eq->base[new_index]; - ENTRY; - - CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n", - new_event, eq->sequence, eq->size); - - if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) { - RETURN(PTL_EQ_EMPTY); - } - - *ev = *new_event; - - /* ensure event is delivered correctly despite possible - races with lib_finalize */ - if (eq->sequence != new_event->sequence) { - CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n", - eq->sequence, new_event->sequence); - RETURN(PTL_EQ_DROPPED); - } - - eq->sequence = new_event->sequence + 1; - RETURN(PTL_OK); -} - -int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev) -{ - int which; - - return (PtlEQPoll (&eventq, 1, 0, ev, &which)); -} - -int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) -{ - int which; - - return (PtlEQPoll (&eventq_in, 1, PTL_TIME_FOREVER, - event_out, &which)); -} - -int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, - ptl_event_t *event_out, int *which_out) -{ - nal_t *nal; - int i; - int rc; - unsigned long flags; - - if (!ptl_init) - RETURN(PTL_NO_INIT); - - if (neq_in < 1) - RETURN(PTL_EQ_INVALID); - - nal = ptl_hndl2nal(&eventqs_in[0]); - if (nal == NULL) - RETURN(PTL_EQ_INVALID); - - nal->lock(nal, &flags); - - for (;;) { - for (i = 0; i < neq_in; i++) { - ptl_eq_t *eq = ptl_handle2usereq(&eventqs_in[i]); - - if (i > 0 && - ptl_hndl2nal(&eventqs_in[i]) != nal) { - nal->unlock(nal, &flags); - RETURN (PTL_EQ_INVALID); - } - - /* size must be a power of 2 to handle a wrapped sequence # */ - LASSERT (eq->size != 0 && - eq->size == LOWEST_BIT_SET (eq->size)); - - rc = ptl_get_event (eq, event_out); - if (rc != PTL_EQ_EMPTY) { - nal->unlock(nal, &flags); - *which_out = i; - RETURN(rc); - } - } - - if (timeout == 0) { - nal->unlock(nal, &flags); - RETURN (PTL_EQ_EMPTY); - } - - timeout = 
nal->yield(nal, &flags, timeout); - } -} diff --git a/lustre/portals/portals/api-errno.c b/lustre/portals/portals/api-errno.c index 1c01c88..9a4e5ac 100644 --- a/lustre/portals/portals/api-errno.c +++ b/lustre/portals/portals/api-errno.c @@ -40,6 +40,9 @@ const char *ptl_err_str[] = { "PTL_EQ_IN_USE", + "PTL_NI_INVALID", + "PTL_MD_ILLEGAL", + "PTL_MAX_ERRNO" }; /* If you change these, you must update the number table in portals/errno.h */ diff --git a/lustre/portals/portals/api-init.c b/lustre/portals/portals/api-init.c deleted file mode 100644 index 9a98714..0000000 --- a/lustre/portals/portals/api-init.c +++ /dev/null @@ -1,49 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-init.c - * Initialization and global data for the p30 user side library - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - -int PtlInit(int *max_interfaces) -{ - if (max_interfaces != NULL) - *max_interfaces = NAL_MAX_NR; - - LASSERT(!strcmp(ptl_err_str[PTL_MAX_ERRNO], "PTL_MAX_ERRNO")); - - return ptl_ni_init(); -} - - -void PtlFini(void) -{ - ptl_ni_fini(); -} - - -void PtlSnprintHandle(char *str, int len, ptl_handle_any_t h) -{ - snprintf(str, len, "0x%lx."LPX64, h.nal_idx, h.cookie); -} diff --git a/lustre/portals/portals/api-me.c b/lustre/portals/portals/api-me.c deleted file mode 100644 index 37f0150..0000000 --- a/lustre/portals/portals/api-me.c +++ /dev/null @@ -1,28 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-me.c - * Match Entry local operations. - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - diff --git a/lustre/portals/portals/api-ni.c b/lustre/portals/portals/api-ni.c index 4f37d13..56afd45 100644 --- a/lustre/portals/portals/api-ni.c +++ b/lustre/portals/portals/api-ni.c @@ -66,6 +66,8 @@ nal_t *ptl_hndl2nal(ptl_handle_any_t *handle) * invalidated out from under her (or worse, swapped for a * completely different interface!) 
*/ + LASSERT (ptl_init); + if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0) return NULL; @@ -112,12 +114,17 @@ void ptl_unregister_nal (ptl_interface_t interface) ptl_mutex_exit(); } -int ptl_ni_init(void) +int PtlInit(int *max_interfaces) { + LASSERT(!strcmp(ptl_err_str[PTL_MAX_ERRNO], "PTL_MAX_ERRNO")); + /* If this assertion fails, we need more bits in NI_HANDLE_MASK and * to shift NI_HANDLE_MAGIC left appropriately */ LASSERT (NAL_MAX_NR <= (NI_HANDLE_MASK + 1)); + if (max_interfaces != NULL) + *max_interfaces = NAL_MAX_NR; + ptl_mutex_enter(); if (!ptl_init) { @@ -143,7 +150,7 @@ int ptl_ni_init(void) return PTL_OK; } -void ptl_ni_fini(void) +void PtlFini(void) { nal_t *nal; int i; @@ -160,7 +167,7 @@ void ptl_ni_fini(void) if (nal->nal_refct != 0) { CWARN("NAL %d has outstanding refcount %d\n", i, nal->nal_refct); - nal->shutdown(nal); + nal->nal_ni_fini(nal); } ptl_nal_table[i] = NULL; @@ -202,9 +209,11 @@ int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid, } nal = ptl_nal_table[interface]; - + nal->nal_handle.nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | interface; + nal->nal_handle.cookie = 0; + CDEBUG(D_OTHER, "Starting up NAL (%d) refs %d\n", interface, nal->nal_refct); - rc = nal->startup(nal, requested_pid, desired_limits, actual_limits); + rc = nal->nal_ni_init(nal, requested_pid, desired_limits, actual_limits); if (rc != PTL_OK) { CERROR("Error %d starting up NAL %d, refs %d\n", rc, @@ -218,10 +227,11 @@ int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid, } nal->nal_refct++; - handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | interface; + *handle = nal->nal_handle; out: ptl_mutex_exit (); + return rc; } @@ -248,15 +258,8 @@ int PtlNIFini(ptl_handle_ni_t ni) nal->nal_refct--; /* nal_refct == 0 tells nal->shutdown to really shut down */ - nal->shutdown(nal); + nal->nal_ni_fini(nal); ptl_mutex_exit (); return PTL_OK; } - -int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * ni_out) -{ - *ni_out = 
handle_in; - - return PTL_OK; -} diff --git a/lustre/portals/portals/api-wrap.c b/lustre/portals/portals/api-wrap.c index 3e6f9ce..d7ff020 100644 --- a/lustre/portals/portals/api-wrap.c +++ b/lustre/portals/portals/api-wrap.c @@ -26,133 +26,98 @@ # define DEBUG_SUBSYSTEM S_PORTALS #include -static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf, - int argsize, void *retbuf, int retsize) +void PtlSnprintHandle(char *str, int len, ptl_handle_any_t h) { - nal_t *nal; + snprintf(str, len, "0x%lx."LPX64, h.nal_idx, h.cookie); +} - if (!ptl_init) { - CERROR("Not initialized\n"); +int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t *ni_out) +{ + if (!ptl_init) return PTL_NO_INIT; - } - - nal = ptl_hndl2nal(&any_h); - if (!nal) + + if (ptl_hndl2nal(&handle_in) == NULL) return PTL_HANDLE_INVALID; - - nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize); - + + *ni_out = handle_in; return PTL_OK; } int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id) { - PtlGetId_in args; - PtlGetId_out ret; - int rc; - - args.handle_in = ni_handle; + nal_t *nal; - rc = do_forward(ni_handle, PTL_GETID, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - return rc; + if (!ptl_init) + return PTL_NO_INIT; - if (id) - *id = ret.id_out; + nal = ptl_hndl2nal(&ni_handle); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return nal->nal_get_id(nal, id); } int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) { - PtlFailNid_in args; - PtlFailNid_out ret; - int rc; - - args.interface = interface; - args.nid = nid; - args.threshold = threshold; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; - rc = do_forward (interface, PTL_FAILNID, - &args, sizeof(args), &ret, sizeof (ret)); + nal = ptl_hndl2nal(&interface); + if (nal == NULL) + return PTL_NI_INVALID; - return ((rc != PTL_OK) ? 
rc : ret.rc); + return nal->nal_fail_nid(nal, nid, threshold); } int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, - ptl_sr_value_t * status_out) + ptl_sr_value_t *status_out) { - PtlNIStatus_in args; - PtlNIStatus_out ret; - int rc; + nal_t *nal; - args.interface_in = interface_in; - args.register_in = register_in; - - rc = do_forward(interface_in, PTL_NISTATUS, &args, sizeof(args), &ret, - sizeof(ret)); - - if (rc != PTL_OK) - return rc; - - if (status_out) - *status_out = ret.status_out; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface_in); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return nal->nal_ni_status(nal, register_in, status_out); } int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, unsigned long *distance_out) { - PtlNIDist_in args; - PtlNIDist_out ret; - int rc; - - args.interface_in = interface_in; - args.process_in = process_in; - - rc = do_forward(interface_in, PTL_NIDIST, &args, sizeof(args), &ret, - sizeof(ret)); + nal_t *nal; - if (rc != PTL_OK) - return rc; - - if (distance_out) - *distance_out = ret.distance_out; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface_in); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return nal->nal_ni_dist(nal, &process_in, distance_out); } int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, - ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out) + ptl_ins_pos_t pos_in, ptl_handle_me_t *handle_out) { - PtlMEAttach_in args; - PtlMEAttach_out ret; - int rc; - - args.interface_in = interface_in; - args.index_in = index_in; - args.match_id_in = match_id_in; - args.match_bits_in = match_bits_in; - args.ignore_bits_in = ignore_bits_in; - args.unlink_in = unlink_in; - args.position_in = pos_in; - - rc = do_forward(interface_in, PTL_MEATTACH, &args, 
sizeof(args), &ret, - sizeof(ret)); - - if (rc != PTL_OK) - return rc; - - if (handle_out) { - handle_out->nal_idx = interface_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - - return ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface_in); + if (nal == NULL) + return PTL_NI_INVALID; + + return nal->nal_me_attach(nal, index_in, match_id_in, + match_bits_in, ignore_bits_in, + unlink_in, pos_in, handle_out); } int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, @@ -160,367 +125,226 @@ int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, ptl_handle_me_t * handle_out) { - PtlMEInsert_in args; - PtlMEInsert_out ret; - int rc; - - args.current_in = current_in; - args.match_id_in = match_id_in; - args.match_bits_in = match_bits_in; - args.ignore_bits_in = ignore_bits_in; - args.unlink_in = unlink_in; - args.position_in = position_in; - - rc = do_forward(current_in, PTL_MEINSERT, &args, sizeof(args), &ret, - sizeof(ret)); - - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? PTL_ME_INVALID : rc; - - if (handle_out) { - handle_out->nal_idx = current_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - return ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&current_in); + if (nal == NULL) + return PTL_ME_INVALID; + + return nal->nal_me_insert(nal, &current_in, match_id_in, + match_bits_in, ignore_bits_in, + unlink_in, position_in, handle_out); } int PtlMEUnlink(ptl_handle_me_t current_in) { - PtlMEUnlink_in args; - PtlMEUnlink_out ret; - int rc; + nal_t *nal; - args.current_in = current_in; - args.unlink_in = PTL_RETAIN; - - rc = do_forward(current_in, PTL_MEUNLINK, &args, sizeof(args), &ret, - sizeof(ret)); - - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? 
PTL_ME_INVALID : rc; + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&current_in); + if (nal == NULL) + return PTL_ME_INVALID; - return ret.rc; + return nal->nal_me_unlink(nal, &current_in); } -int PtlTblDump(ptl_handle_ni_t ni, int index_in) +int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) { - PtlTblDump_in args; - PtlTblDump_out ret; - int rc; + nal_t *nal; - args.index_in = index_in; - - rc = do_forward(ni, PTL_TBLDUMP, &args, sizeof(args), &ret, - sizeof(ret)); + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&me_in); + if (nal == NULL) + return PTL_ME_INVALID; - if (rc != PTL_OK) - return rc; + if (!PtlHandleIsEqual(md_in.eventq, PTL_EQ_NONE) && + ptl_hndl2nal(&md_in.eventq) != nal) + return PTL_MD_ILLEGAL; - return ret.rc; + return (nal->nal_md_attach)(nal, &me_in, &md_in, + unlink_in, handle_out); } -int PtlMEDump(ptl_handle_me_t current_in) +int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t *handle_out) { - PtlMEDump_in args; - PtlMEDump_out ret; - int rc; + nal_t *nal; - args.current_in = current_in; - - rc = do_forward(current_in, PTL_MEDUMP, &args, sizeof(args), &ret, - sizeof(ret)); + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&ni_in); + if (nal == NULL) + return PTL_NI_INVALID; - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? 
PTL_ME_INVALID : rc; + if (!PtlHandleIsEqual(md_in.eventq, PTL_EQ_NONE) && + ptl_hndl2nal(&md_in.eventq) != nal) + return PTL_MD_ILLEGAL; - return ret.rc; + return (nal->nal_md_bind)(nal, &md_in, unlink_in, handle_out); } -static ptl_handle_eq_t md2eq (ptl_md_t *md) +int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout, + ptl_md_t *new_inout, ptl_handle_eq_t testq_in) { - if (PtlHandleIsEqual (md->eventq, PTL_EQ_NONE)) - return (PTL_EQ_NONE); + nal_t *nal; - return (ptl_handle2usereq (&md->eventq)->cb_eq_handle); -} - - -int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) -{ - PtlMDAttach_in args; - PtlMDAttach_out ret; - int rc; - - args.eq_in = md2eq(&md_in); - args.me_in = me_in; - args.md_in = md_in; - args.unlink_in = unlink_in; - - rc = do_forward(me_in, PTL_MDATTACH, - &args, sizeof(args), &ret, sizeof(ret)); - - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? PTL_ME_INVALID : rc; - - if (handle_out) { - handle_out->nal_idx = me_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - return ret.rc; -} - + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + if (!PtlHandleIsEqual(testq_in, PTL_EQ_NONE) && + ptl_hndl2nal(&testq_in) != nal) + return PTL_EQ_INVALID; -int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) -{ - PtlMDBind_in args; - PtlMDBind_out ret; - int rc; - - args.eq_in = md2eq(&md_in); - args.ni_in = ni_in; - args.md_in = md_in; - args.unlink_in = unlink_in; - - rc = do_forward(ni_in, PTL_MDBIND, - &args, sizeof(args), &ret, sizeof(ret)); - - if (rc != PTL_OK) - return rc; - - if (handle_out) { - handle_out->nal_idx = ni_in.nal_idx; - handle_out->cookie = ret.handle_out.cookie; - } - return ret.rc; + return (nal->nal_md_update)(nal, &md_in, + old_inout, new_inout, &testq_in); } -int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout, - ptl_md_t 
*new_inout, ptl_handle_eq_t testq_in) +int PtlMDUnlink(ptl_handle_md_t md_in) { - PtlMDUpdate_internal_in args; - PtlMDUpdate_internal_out ret; - int rc; - - args.md_in = md_in; - - if (old_inout) { - args.old_inout = *old_inout; - args.old_inout_valid = 1; - } else - args.old_inout_valid = 0; - - if (new_inout) { - args.new_inout = *new_inout; - args.new_inout_valid = 1; - } else - args.new_inout_valid = 0; - - if (PtlHandleIsEqual (testq_in, PTL_EQ_NONE)) { - args.testq_in = PTL_EQ_NONE; - args.sequence_in = -1; - } else { - ptl_eq_t *eq = ptl_handle2usereq (&testq_in); - - args.testq_in = eq->cb_eq_handle; - args.sequence_in = eq->sequence; - } - - rc = do_forward(md_in, PTL_MDUPDATE, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? PTL_MD_INVALID : rc; - - if (old_inout) - *old_inout = ret.old_inout; - - return ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + + return (nal->nal_md_unlink)(nal, &md_in); } -int PtlMDUnlink(ptl_handle_md_t md_in) +int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, + ptl_eq_handler_t callback, + ptl_handle_eq_t *handle_out) { - PtlMDUnlink_in args; - PtlMDUnlink_out ret; - int rc; - - args.md_in = md_in; - rc = do_forward(md_in, PTL_MDUNLINK, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - return (rc == PTL_HANDLE_INVALID) ? 
PTL_MD_INVALID : rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&interface); + if (nal == NULL) + return PTL_NI_INVALID; - return ret.rc; + return (nal->nal_eq_alloc)(nal, count, callback, handle_out); } -int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, - ptl_eq_handler_t callback, - ptl_handle_eq_t * handle_out) +int PtlEQFree(ptl_handle_eq_t eventq) { - ptl_eq_t *eq = NULL; - ptl_event_t *ev = NULL; - PtlEQAlloc_in args; - PtlEQAlloc_out ret; - int rc, i; - nal_t *nal; + nal_t *nal; if (!ptl_init) return PTL_NO_INIT; - nal = ptl_hndl2nal (&interface); + nal = ptl_hndl2nal(&eventq); if (nal == NULL) - return PTL_HANDLE_INVALID; + return PTL_EQ_INVALID; - if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ - do { /* knock off all but the top bit... */ - count &= ~LOWEST_BIT_SET (count); - } while (count != LOWEST_BIT_SET(count)); - - count <<= 1; /* ...and round up */ - } - - if (count == 0) /* catch bad parameter / overflow on roundup */ - return (PTL_VAL_FAILED); - - PORTAL_ALLOC(ev, count * sizeof(ptl_event_t)); - if (!ev) - return PTL_NO_SPACE; - - for (i = 0; i < count; i++) - ev[i].sequence = 0; - - args.ni_in = interface; - args.count_in = count; - args.base_in = ev; - args.len_in = count * sizeof(*ev); - args.callback_in = callback; - - rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret, - sizeof(ret)); - if (rc != PTL_OK) - goto fail; - if (ret.rc) - GOTO(fail, rc = ret.rc); - - PORTAL_ALLOC(eq, sizeof(*eq)); - if (!eq) { - rc = PTL_NO_SPACE; - goto fail; - } - - eq->sequence = 1; - eq->size = count; - eq->base = ev; - - /* EQ handles are a little wierd. PtlEQGet() just looks at the - * queued events in shared memory. It doesn't want to do_forward() - * at all, so the cookie in the EQ handle we pass out of here is - * simply a pointer to the event queue we just set up. We stash - * the handle returned by do_forward(), so we can pass it back via - * do_forward() when we need to. 
*/ - - eq->cb_eq_handle.nal_idx = interface.nal_idx; - eq->cb_eq_handle.cookie = ret.handle_out.cookie; - - handle_out->nal_idx = interface.nal_idx; - handle_out->cookie = (__u64)((unsigned long)eq); - return PTL_OK; + return (nal->nal_eq_free)(nal, &eventq); +} -fail: - PORTAL_FREE(ev, count * sizeof(ptl_event_t)); - return rc; +int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t *ev) +{ + int which; + + return (PtlEQPoll (&eventq, 1, 0, ev, &which)); } -int PtlEQFree(ptl_handle_eq_t eventq) +int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) { - PtlEQFree_in args; - PtlEQFree_out ret; - ptl_eq_t *eq; - int rc; + int which; + + return (PtlEQPoll (&eventq_in, 1, PTL_TIME_FOREVER, + event_out, &which)); +} - eq = ptl_handle2usereq (&eventq); - args.eventq_in = eq->cb_eq_handle; +int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, + ptl_event_t *event_out, int *which_out) +{ + int i; + nal_t *nal; - rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args, - sizeof(args), &ret, sizeof(ret)); + if (!ptl_init) + return PTL_NO_INIT; + + if (neq_in < 1) + return PTL_EQ_INVALID; + + nal = ptl_hndl2nal(&eventqs_in[0]); + if (nal == NULL) + return PTL_EQ_INVALID; - /* XXX we're betting rc == PTL_OK here */ - PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t)); - PORTAL_FREE(eq, sizeof(*eq)); + for (i = 1; i < neq_in; i++) + if (ptl_hndl2nal(&eventqs_in[i]) != nal) + return PTL_EQ_INVALID; - return rc; + return (nal->nal_eq_poll)(nal, eventqs_in, neq_in, timeout, + event_out, which_out); } + int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, ptl_process_id_t match_id_in, ptl_pt_index_t portal_in) { - PtlACEntry_in args; - PtlACEntry_out ret; - int rc; - - /* - * Copy arguments into the argument block to - * hand to the forwarding object - */ - args.ni_in = ni_in; - args.index_in = index_in; - args.match_id_in = match_id_in; - args.portal_in = portal_in; - - rc = do_forward(ni_in, PTL_ACENTRY, &args, sizeof(args), &ret, - sizeof(ret)); - - 
return (rc != PTL_OK) ? rc : ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&ni_in); + if (nal == NULL) + return PTL_NI_INVALID; + + return (nal->nal_ace_entry)(nal, index_in, match_id_in, portal_in); } int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, ptl_process_id_t target_in, ptl_pt_index_t portal_in, - ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_ac_index_t ac_in, ptl_match_bits_t match_bits_in, ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in) { - PtlPut_in args; - PtlPut_out ret; - int rc; - - /* - * Copy arguments into the argument block to - * hand to the forwarding object - */ - args.md_in = md_in; - args.ack_req_in = ack_req_in; - args.target_in = target_in; - args.portal_in = portal_in; - args.cookie_in = cookie_in; - args.match_bits_in = match_bits_in; - args.offset_in = offset_in; - args.hdr_data_in = hdr_data_in; - - rc = do_forward(md_in, PTL_PUT, &args, sizeof(args), &ret, sizeof(ret)); - - return (rc != PTL_OK) ? rc : ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + + return (nal->nal_put)(nal, &md_in, ack_req_in, + &target_in, portal_in, ac_in, + match_bits_in, offset_in, hdr_data_in); } int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, - ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_pt_index_t portal_in, ptl_ac_index_t ac_in, ptl_match_bits_t match_bits_in, ptl_size_t offset_in) { - PtlGet_in args; - PtlGet_out ret; - int rc; - - /* - * Copy arguments into the argument block to - * hand to the forwarding object - */ - args.md_in = md_in; - args.target_in = target_in; - args.portal_in = portal_in; - args.cookie_in = cookie_in; - args.match_bits_in = match_bits_in; - args.offset_in = offset_in; - - rc = do_forward(md_in, PTL_GET, &args, sizeof(args), &ret, sizeof(ret)); - - return (rc != PTL_OK) ? 
rc : ret.rc; + nal_t *nal; + + if (!ptl_init) + return PTL_NO_INIT; + + nal = ptl_hndl2nal(&md_in); + if (nal == NULL) + return PTL_MD_INVALID; + + return (nal->nal_get)(nal, &md_in, + &target_in, portal_in, ac_in, + match_bits_in, offset_in); } + diff --git a/lustre/portals/portals/autoMakefile.am b/lustre/portals/portals/autoMakefile.am index bf7a107..285f8fe 100644 --- a/lustre/portals/portals/autoMakefile.am +++ b/lustre/portals/portals/autoMakefile.am @@ -3,8 +3,8 @@ # This code is issued under the GNU General Public License. # See the file COPYING in this distribution -my_sources = api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c \ - lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c \ +my_sources = api-errno.c api-ni.c api-wrap.c \ + lib-init.c lib-me.c lib-msg.c lib-eq.c \ lib-md.c lib-move.c lib-ni.c lib-pid.c if !CRAY_PORTALS diff --git a/lustre/portals/portals/lib-dispatch.c b/lustre/portals/portals/lib-dispatch.c deleted file mode 100644 index 798e117..0000000 --- a/lustre/portals/portals/lib-dispatch.c +++ /dev/null @@ -1,79 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-dispatch.c - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include -#include - -typedef struct { - int (*fun) (nal_cb_t * nal, void *private, void *in, void *out); - char *name; -} dispatch_table_t; - -static dispatch_table_t dispatch_table[] = { - [PTL_GETID] {do_PtlGetId, "PtlGetId"}, - [PTL_NISTATUS] {do_PtlNIStatus, "PtlNIStatus"}, - [PTL_NIDIST] {do_PtlNIDist, "PtlNIDist"}, - [PTL_MEATTACH] {do_PtlMEAttach, "PtlMEAttach"}, - [PTL_MEINSERT] {do_PtlMEInsert, "PtlMEInsert"}, - [PTL_MEUNLINK] {do_PtlMEUnlink, "PtlMEUnlink"}, - [PTL_TBLDUMP] {do_PtlTblDump, "PtlTblDump"}, - [PTL_MEDUMP] {do_PtlMEDump, "PtlMEDump"}, - [PTL_MDATTACH] {do_PtlMDAttach, "PtlMDAttach"}, - [PTL_MDBIND] {do_PtlMDBind, "PtlMDBind"}, - [PTL_MDUPDATE] {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"}, - [PTL_MDUNLINK] {do_PtlMDUnlink, "PtlMDUnlink"}, - [PTL_EQALLOC] {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"}, - [PTL_EQFREE] {do_PtlEQFree_internal, "PtlEQFree_internal"}, - [PTL_PUT] {do_PtlPut, "PtlPut"}, - [PTL_GET] {do_PtlGet, "PtlGet"}, - [PTL_FAILNID] {do_PtlFailNid, "PtlFailNid"}, - /* */ {0, ""} -}; - -/* - * This really should be elsewhere, but lib-p30/dispatch.c is - * an automatically generated file. 
- */ -void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block, - void *ret_block) -{ - lib_ni_t *ni = &nal->ni; - - if (index < 0 || index > LIB_MAX_DISPATCH || - !dispatch_table[index].fun) { - CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index); - return; - } - - CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid, - dispatch_table[index].name, index); - - dispatch_table[index].fun(nal, private, arg_block, ret_block); -} - -char *dispatch_name(int index) -{ - return dispatch_table[index].name; -} diff --git a/lustre/portals/portals/lib-eq.c b/lustre/portals/portals/lib-eq.c index 8a91860..8ea6fdd 100644 --- a/lustre/portals/portals/lib-eq.c +++ b/lustre/portals/portals/lib-eq.c @@ -25,104 +25,241 @@ #define DEBUG_SUBSYSTEM S_PORTALS #include -#include -int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args, - void *v_ret) +int +lib_api_eq_alloc (nal_t *apinal, ptl_size_t count, + ptl_eq_handler_t callback, + ptl_handle_eq_t *handle) { - /* - * Incoming: - * ptl_handle_ni_t ni_in - * ptl_size_t count_in - * void * base_in - * - * Outgoing: - * ptl_handle_eq_t * handle_out - */ - - PtlEQAlloc_in *args = v_args; - PtlEQAlloc_out *ret = v_ret; - - lib_eq_t *eq; - unsigned long flags; - - /* api should have rounded up */ - if (args->count_in != LOWEST_BIT_SET (args->count_in)) - return ret->rc = PTL_VAL_FAILED; + lib_nal_t *nal = apinal->nal_data; + lib_eq_t *eq; + unsigned long flags; + int rc; + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq + * overflow, they don't skip entries, so the queue has the same + * apparant capacity at all times */ + + if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ + do { /* knock off all but the top bit... 
*/ + count &= ~LOWEST_BIT_SET (count); + } while (count != LOWEST_BIT_SET(count)); + + count <<= 1; /* ...and round up */ + } + + if (count == 0) /* catch bad parameter / overflow on roundup */ + return (PTL_VAL_FAILED); + eq = lib_eq_alloc (nal); if (eq == NULL) - return (ret->rc = PTL_NO_SPACE); + return (PTL_NO_SPACE); - state_lock(nal, &flags); + PORTAL_ALLOC(eq->eq_events, count * sizeof(ptl_event_t)); + if (eq->eq_events == NULL) { + LIB_LOCK(nal, flags); + lib_eq_free (nal, eq); + LIB_UNLOCK(nal, flags); + } - if (nal->cb_map != NULL) { + if (nal->libnal_map != NULL) { struct iovec iov = { - .iov_base = args->base_in, - .iov_len = args->count_in * sizeof (ptl_event_t) }; + .iov_base = eq->eq_events, + .iov_len = count * sizeof(ptl_event_t)}; - ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey); - if (ret->rc != PTL_OK) { + rc = nal->libnal_map(nal, 1, &iov, &eq->eq_addrkey); + if (rc != PTL_OK) { + LIB_LOCK(nal, flags); lib_eq_free (nal, eq); - - state_unlock (nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + return (rc); } } - eq->sequence = 1; - eq->base = args->base_in; - eq->size = args->count_in; + /* NB this resets all event sequence numbers to 0, to be earlier + * than eq_deq_seq */ + memset(eq->eq_events, 0, count * sizeof(ptl_event_t)); + + eq->eq_deq_seq = 1; + eq->eq_enq_seq = 1; + eq->eq_size = count; eq->eq_refcount = 0; - eq->event_callback = args->callback_in; + eq->eq_callback = callback; + + LIB_LOCK(nal, flags); lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ); - list_add (&eq->eq_list, &nal->ni.ni_active_eqs); + list_add (&eq->eq_list, &nal->libnal_ni.ni_active_eqs); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - ptl_eq2handle(&ret->handle_out, eq); - return (ret->rc = PTL_OK); + ptl_eq2handle(handle, nal, eq); + return (PTL_OK); } -int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args, - void *v_ret) +int +lib_api_eq_free(nal_t *apinal, ptl_handle_eq_t *eqh) { - /* - * Incoming: - * 
ptl_handle_eq_t eventq_in - * - * Outgoing: - */ - - PtlEQFree_in *args = v_args; - PtlEQFree_out *ret = v_ret; - lib_eq_t *eq; - long flags; + lib_nal_t *nal = apinal->nal_data; + lib_eq_t *eq; + int size; + ptl_event_t *events; + void *addrkey; + unsigned long flags; - state_lock (nal, &flags); + LIB_LOCK(nal, flags); - eq = ptl_handle2eq(&args->eventq_in, nal); + eq = ptl_handle2eq(eqh, nal); if (eq == NULL) { - ret->rc = PTL_EQ_INVALID; - } else if (eq->eq_refcount != 0) { - ret->rc = PTL_EQ_IN_USE; + LIB_UNLOCK(nal, flags); + return (PTL_EQ_INVALID); + } + + if (eq->eq_refcount != 0) { + LIB_UNLOCK(nal, flags); + return (PTL_EQ_IN_USE); + } + + /* stash for free after lock dropped */ + events = eq->eq_events; + size = eq->eq_size; + addrkey = eq->eq_addrkey; + + lib_invalidate_handle (nal, &eq->eq_lh); + list_del (&eq->eq_list); + lib_eq_free (nal, eq); + + LIB_UNLOCK(nal, flags); + + if (nal->libnal_unmap != NULL) { + struct iovec iov = { + .iov_base = events, + .iov_len = size * sizeof(ptl_event_t)}; + + nal->libnal_unmap(nal, 1, &iov, &addrkey); + } + + PORTAL_FREE(events, size * sizeof (ptl_event_t)); + + return (PTL_OK); +} + +int +lib_get_event (lib_eq_t *eq, ptl_event_t *ev) +{ + int new_index = eq->eq_deq_seq & (eq->eq_size - 1); + ptl_event_t *new_event = &eq->eq_events[new_index]; + int rc; + ENTRY; + + CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->eq_deq_seq, eq->eq_size); + + if (PTL_SEQ_GT (eq->eq_deq_seq, new_event->sequence)) { + RETURN(PTL_EQ_EMPTY); + } + + /* We've got a new event... */ + *ev = *new_event; + + /* ...but did it overwrite an event we've not seen yet? 
*/ + if (eq->eq_deq_seq == new_event->sequence) { + rc = PTL_OK; } else { - if (nal->cb_unmap != NULL) { - struct iovec iov = { - .iov_base = eq->base, - .iov_len = eq->size * sizeof (ptl_event_t) }; - - nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey); + CERROR("Event Queue Overflow: eq seq %lu ev seq %lu\n", + eq->eq_deq_seq, new_event->sequence); + rc = PTL_EQ_DROPPED; + } + + eq->eq_deq_seq = new_event->sequence + 1; + RETURN(rc); +} + + +int +lib_api_eq_poll (nal_t *apinal, + ptl_handle_eq_t *eventqs, int neq, int timeout_ms, + ptl_event_t *event, int *which) +{ + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; + unsigned long flags; + int i; + int rc; +#ifdef __KERNEL__ + wait_queue_t wq; + unsigned long now; +#else + struct timeval then; + struct timeval now; + struct timespec ts; +#endif + ENTRY; + + LIB_LOCK(nal, flags); + + for (;;) { + for (i = 0; i < neq; i++) { + lib_eq_t *eq = ptl_handle2eq(&eventqs[i], nal); + + rc = lib_get_event (eq, event); + if (rc != PTL_EQ_EMPTY) { + LIB_UNLOCK(nal, flags); + *which = i; + RETURN(rc); + } + } + + if (timeout_ms == 0) { + LIB_UNLOCK (nal, flags); + RETURN (PTL_EQ_EMPTY); } - lib_invalidate_handle (nal, &eq->eq_lh); - list_del (&eq->eq_list); - lib_eq_free (nal, eq); - ret->rc = PTL_OK; - } + /* Some architectures force us to do spin locking/unlocking + * in the same stack frame, means we can abstract the + * locking here */ +#ifdef __KERNEL__ + init_waitqueue_entry(&wq, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ni->ni_waitq, &wq); - state_unlock (nal, &flags); + LIB_UNLOCK(nal, flags); - return (ret->rc); + if (timeout_ms < 0) { + schedule (); + } else { + now = jiffies; + schedule_timeout((timeout_ms * HZ)/1000); + timeout_ms -= ((jiffies - now) * 1000)/HZ; + if (timeout_ms < 0) + timeout_ms = 0; + } + + LIB_LOCK(nal, flags); +#else + if (timeout_ms < 0) { + pthread_cond_wait(&ni->ni_cond, &ni->ni_mutex); + } else { + gettimeofday(&then, NULL); + + ts.tv_sec = 
then.tv_sec + timeout_ms/1000; + ts.tv_nsec = then.tv_usec * 1000 + + (timeout_ms%1000) * 1000000; + if (ts.tv_nsec >= 1000000000) { + ts.tv_sec++; + ts.tv_nsec -= 1000000000; + } + + pthread_cond_timedwait(&ni->ni_cond, + &ni->ni_mutex, &ts); + + gettimeofday(&now, NULL); + timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 + + (now.tv_usec - then.tv_usec) / 1000; + + if (timeout_ms < 0) + timeout_ms = 0; + } +#endif + } } diff --git a/lustre/portals/portals/lib-init.c b/lustre/portals/portals/lib-init.c index c62dbc2..9d97bc1 100644 --- a/lustre/portals/portals/lib-init.c +++ b/lustre/portals/portals/lib-init.c @@ -41,7 +41,7 @@ #ifndef PTL_USE_LIB_FREELIST int -kportal_descriptor_setup (nal_cb_t *nal, +kportal_descriptor_setup (lib_nal_t *nal, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { @@ -54,13 +54,13 @@ kportal_descriptor_setup (nal_cb_t *nal, } void -kportal_descriptor_cleanup (nal_cb_t *nal) +kportal_descriptor_cleanup (lib_nal_t *nal) { } #else int -lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) +lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int n, int size) { char *space; @@ -68,7 +68,7 @@ lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) size += offsetof (lib_freeobj_t, fo_contents); - space = nal->cb_malloc (nal, n * size); + PORTAL_ALLOC(space, n * size); if (space == NULL) return (PTL_NO_SPACE); @@ -88,7 +88,7 @@ lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) } void -lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl) +lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl) { struct list_head *el; int count; @@ -102,23 +102,24 @@ lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl) LASSERT (count == fl->fl_nobjs); - nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); + PORTAL_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); memset (fl, 0, sizeof (fl)); } int -kportal_descriptor_setup (nal_cb_t *nal, +kportal_descriptor_setup (lib_nal_t 
*nal, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { /* NB on failure caller must still call kportal_descriptor_cleanup */ /* ****** */ - int rc; + lib_ni_t *ni = &nal->libnal_ni; + int rc; - memset (&nal->ni.ni_free_mes, 0, sizeof (nal->ni.ni_free_mes)); - memset (&nal->ni.ni_free_msgs, 0, sizeof (nal->ni.ni_free_msgs)); - memset (&nal->ni.ni_free_mds, 0, sizeof (nal->ni.ni_free_mds)); - memset (&nal->ni.ni_free_eqs, 0, sizeof (nal->ni.ni_free_eqs)); + memset (&ni->ni_free_mes, 0, sizeof (ni->ni_free_mes)); + memset (&ni->ni_free_msgs, 0, sizeof (ni->ni_free_msgs)); + memset (&ni->ni_free_mds, 0, sizeof (ni->ni_free_mds)); + memset (&ni->ni_free_eqs, 0, sizeof (ni->ni_free_eqs)); /* Ignore requested limits! */ actual_limits->max_mes = MAX_MES; @@ -127,39 +128,41 @@ kportal_descriptor_setup (nal_cb_t *nal, /* Hahahah what a load of bollocks. There's nowhere to * specify the max # messages in-flight */ - rc = lib_freelist_init (nal, &nal->ni.ni_free_mes, + rc = lib_freelist_init (nal, &ni->ni_free_mes, MAX_MES, sizeof (lib_me_t)); if (rc != PTL_OK) return (rc); - rc = lib_freelist_init (nal, &nal->ni.ni_free_msgs, + rc = lib_freelist_init (nal, &ni->ni_free_msgs, MAX_MSGS, sizeof (lib_msg_t)); if (rc != PTL_OK) return (rc); - rc = lib_freelist_init (nal, &nal->ni.ni_free_mds, + rc = lib_freelist_init (nal, &ni->ni_free_mds, MAX_MDS, sizeof (lib_md_t)); if (rc != PTL_OK) return (rc); - rc = lib_freelist_init (nal, &nal->ni.ni_free_eqs, + rc = lib_freelist_init (nal, &ni->ni_free_eqs, MAX_EQS, sizeof (lib_eq_t)); return (rc); } void -kportal_descriptor_cleanup (nal_cb_t *nal) +kportal_descriptor_cleanup (lib_nal_t *nal) { - lib_freelist_fini (nal, &nal->ni.ni_free_mes); - lib_freelist_fini (nal, &nal->ni.ni_free_msgs); - lib_freelist_fini (nal, &nal->ni.ni_free_mds); - lib_freelist_fini (nal, &nal->ni.ni_free_eqs); + lib_ni_t *ni = &nal->libnal_ni; + + lib_freelist_fini (nal, &ni->ni_free_mes); + lib_freelist_fini (nal, &ni->ni_free_msgs); + 
lib_freelist_fini (nal, &ni->ni_free_mds); + lib_freelist_fini (nal, &ni->ni_free_eqs); } #endif __u64 -lib_create_interface_cookie (nal_cb_t *nal) +lib_create_interface_cookie (lib_nal_t *nal) { /* NB the interface cookie in wire handles guards against delayed * replies and ACKs appearing valid in a new instance of the same @@ -180,9 +183,9 @@ lib_create_interface_cookie (nal_cb_t *nal) } int -lib_setup_handle_hash (nal_cb_t *nal) +lib_setup_handle_hash (lib_nal_t *nal) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; int i; /* Arbitrary choice of hash table size */ @@ -191,9 +194,8 @@ lib_setup_handle_hash (nal_cb_t *nal) #else ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4; #endif - ni->ni_lh_hash_table = - (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size - * sizeof (struct list_head)); + PORTAL_ALLOC(ni->ni_lh_hash_table, + ni->ni_lh_hash_size * sizeof (struct list_head)); if (ni->ni_lh_hash_table == NULL) return (PTL_NO_SPACE); @@ -206,22 +208,22 @@ lib_setup_handle_hash (nal_cb_t *nal) } void -lib_cleanup_handle_hash (nal_cb_t *nal) +lib_cleanup_handle_hash (lib_nal_t *nal) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; if (ni->ni_lh_hash_table == NULL) return; - nal->cb_free (nal, ni->ni_lh_hash_table, - ni->ni_lh_hash_size * sizeof (struct list_head)); + PORTAL_FREE(ni->ni_lh_hash_table, + ni->ni_lh_hash_size * sizeof (struct list_head)); } lib_handle_t * -lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) +lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type) { /* ALWAYS called with statelock held */ - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; struct list_head *list; struct list_head *el; unsigned int hash; @@ -243,10 +245,10 @@ lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) } void -lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) +lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type) { /* ALWAYS called with statelock held */ - lib_ni_t 
*ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; unsigned int hash; LASSERT (type >= 0 && type < PTL_COOKIE_TYPES); @@ -258,95 +260,120 @@ lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) } void -lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh) +lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh) { list_del (&lh->lh_hash_chain); } int -lib_init(nal_cb_t *nal, ptl_process_id_t process_id, +lib_init(lib_nal_t *libnal, nal_t *apinal, + ptl_process_id_t process_id, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { int rc = PTL_OK; - lib_ni_t *ni = &nal->ni; - int ptl_size; - int i; + lib_ni_t *ni = &libnal->libnal_ni; + int ptl_size; + int i; ENTRY; /* NB serialised in PtlNIInit() */ lib_assert_wire_constants (); - - /* - * Allocate the portal table for this interface - * and all per-interface objects. - */ - memset(&ni->counters, 0, sizeof(lib_counters_t)); - rc = kportal_descriptor_setup (nal, requested_limits, - &ni->actual_limits); + /* Setup the API nal with the lib API handling functions */ + apinal->nal_get_id = lib_api_get_id; + apinal->nal_ni_status = lib_api_ni_status; + apinal->nal_ni_dist = lib_api_ni_dist; + apinal->nal_fail_nid = lib_api_fail_nid; + apinal->nal_me_attach = lib_api_me_attach; + apinal->nal_me_insert = lib_api_me_insert; + apinal->nal_me_unlink = lib_api_me_unlink; + apinal->nal_md_attach = lib_api_md_attach; + apinal->nal_md_bind = lib_api_md_bind; + apinal->nal_md_unlink = lib_api_md_unlink; + apinal->nal_md_update = lib_api_md_update; + apinal->nal_eq_alloc = lib_api_eq_alloc; + apinal->nal_eq_free = lib_api_eq_free; + apinal->nal_eq_poll = lib_api_eq_poll; + apinal->nal_put = lib_api_put; + apinal->nal_get = lib_api_get; + + apinal->nal_data = libnal; + ni->ni_api = apinal; + + rc = kportal_descriptor_setup (libnal, requested_limits, + &ni->ni_actual_limits); if (rc != PTL_OK) goto out; + memset(&ni->ni_counters, 0, sizeof(lib_counters_t)); + INIT_LIST_HEAD (&ni->ni_active_msgs); 
INIT_LIST_HEAD (&ni->ni_active_mds); INIT_LIST_HEAD (&ni->ni_active_eqs); - INIT_LIST_HEAD (&ni->ni_test_peers); - ni->ni_interface_cookie = lib_create_interface_cookie (nal); +#ifdef __KERNEL__ + spin_lock_init (&ni->ni_lock); + init_waitqueue_head (&ni->ni_waitq); +#else + pthread_mutex_init(&ni->ni_mutex, NULL); + pthread_cond_init(&ni->ni_cond, NULL); +#endif + + ni->ni_interface_cookie = lib_create_interface_cookie (libnal); ni->ni_next_object_cookie = 0; - rc = lib_setup_handle_hash (nal); + rc = lib_setup_handle_hash (libnal); if (rc != PTL_OK) goto out; - ni->nid = process_id.nid; - ni->pid = process_id.pid; + ni->ni_pid = process_id; if (requested_limits != NULL) ptl_size = requested_limits->max_pt_index + 1; else ptl_size = 64; - ni->tbl.size = ptl_size; - ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size); - if (ni->tbl.tbl == NULL) { + ni->ni_portals.size = ptl_size; + PORTAL_ALLOC(ni->ni_portals.tbl, + ptl_size * sizeof(struct list_head)); + if (ni->ni_portals.tbl == NULL) { rc = PTL_NO_SPACE; goto out; } for (i = 0; i < ptl_size; i++) - INIT_LIST_HEAD(&(ni->tbl.tbl[i])); + INIT_LIST_HEAD(&(ni->ni_portals.tbl[i])); /* max_{mes,mds,eqs} set in kportal_descriptor_setup */ /* We don't have an access control table! */ - ni->actual_limits.max_ac_index = -1; + ni->ni_actual_limits.max_ac_index = -1; - ni->actual_limits.max_pt_index = ptl_size - 1; - ni->actual_limits.max_md_iovecs = PTL_MD_MAX_IOV; - ni->actual_limits.max_me_list = INT_MAX; + ni->ni_actual_limits.max_pt_index = ptl_size - 1; + ni->ni_actual_limits.max_md_iovecs = PTL_MD_MAX_IOV; + ni->ni_actual_limits.max_me_list = INT_MAX; /* We don't support PtlGetPut! 
*/ - ni->actual_limits.max_getput_md = 0; + ni->ni_actual_limits.max_getput_md = 0; if (actual_limits != NULL) - *actual_limits = ni->actual_limits; + *actual_limits = ni->ni_actual_limits; out: if (rc != PTL_OK) { - lib_cleanup_handle_hash (nal); - kportal_descriptor_cleanup (nal); + lib_cleanup_handle_hash (libnal); + kportal_descriptor_cleanup (libnal); } RETURN (rc); } int -lib_fini(nal_cb_t * nal) +lib_fini(lib_nal_t *nal) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; int idx; /* NB no state_lock() since this is the last reference. The NAL @@ -355,9 +382,9 @@ lib_fini(nal_cb_t * nal) * network op (eg MD with non-zero pending count) */ - for (idx = 0; idx < ni->tbl.size; idx++) - while (!list_empty (&ni->tbl.tbl[idx])) { - lib_me_t *me = list_entry (ni->tbl.tbl[idx].next, + for (idx = 0; idx < ni->ni_portals.size; idx++) + while (!list_empty (&ni->ni_portals.tbl[idx])) { + lib_me_t *me = list_entry (ni->ni_portals.tbl[idx].next, lib_me_t, me_list); CERROR ("Active me %p on exit\n", me); @@ -392,10 +419,16 @@ lib_fini(nal_cb_t * nal) lib_msg_free (nal, msg); } - nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size); + PORTAL_FREE(ni->ni_portals.tbl, + ni->ni_portals.size * sizeof(struct list_head)); lib_cleanup_handle_hash (nal); kportal_descriptor_cleanup (nal); +#ifndef __KERNEL__ + pthread_mutex_destroy(&ni->ni_mutex); + pthread_cond_destroy(&ni->ni_cond); +#endif + return (PTL_OK); } diff --git a/lustre/portals/portals/lib-md.c b/lustre/portals/portals/lib-md.c index 64a55b9..a4df791 100644 --- a/lustre/portals/portals/lib-md.c +++ b/lustre/portals/portals/lib-md.c @@ -31,10 +31,10 @@ #endif #include -#include /* must be called with state lock held */ -void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) +void +lib_md_unlink(lib_nal_t *nal, lib_md_t *md) { if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) == 0) { /* first unlink attempt... 
*/ @@ -62,12 +62,15 @@ void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) CDEBUG(D_NET, "Unlinking md %p\n", md); if ((md->options & PTL_MD_KIOV) != 0) { - if (nal->cb_unmap_pages != NULL) - nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov, - &md->md_addrkey); - } else if (nal->cb_unmap != NULL) { - nal->cb_unmap (nal, md->md_niov, md->md_iov.iov, - &md->md_addrkey); + if (nal->libnal_unmap_pages != NULL) + nal->libnal_unmap_pages (nal, + md->md_niov, + md->md_iov.kiov, + &md->md_addrkey); + } else if (nal->libnal_unmap != NULL) { + nal->libnal_unmap (nal, + md->md_niov, md->md_iov.iov, + &md->md_addrkey); } if (md->eq != NULL) { @@ -80,124 +83,124 @@ void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) } /* must be called with state lock held */ -static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, - ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink) +static int +lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink) { lib_eq_t *eq = NULL; int rc; int i; int niov; + int total_length = 0; /* NB we are passed an allocated, but uninitialised/active md. * if we return success, caller may lib_md_unlink() it. * otherwise caller may only lib_md_free() it. */ - if (!PtlHandleIsEqual (*eqh, PTL_EQ_NONE)) { - eq = ptl_handle2eq(eqh, nal); + if (!PtlHandleIsEqual (umd->eventq, PTL_EQ_NONE)) { + eq = ptl_handle2eq(&umd->eventq, nal); if (eq == NULL) return PTL_EQ_INVALID; } - /* Must check this _before_ allocation. Also, note that non-iov - * MDs must set md_niov to 0. */ - LASSERT((md->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0 || - md->length <= PTL_MD_MAX_IOV); - /* This implementation doesn't know how to create START events or * disable END events. Best to LASSERT our caller is compliant so * we find out quickly... 
*/ - LASSERT (PtlHandleIsEqual (*eqh, PTL_EQ_NONE) || - ((md->options & PTL_MD_EVENT_START_DISABLE) != 0 && - (md->options & PTL_MD_EVENT_END_DISABLE) == 0)); - - if ((md->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ - (md->max_size < 0 || md->max_size > md->length)) // illegal max_size - return PTL_MD_INVALID; - - new->me = NULL; - new->start = md->start; - new->offset = 0; - new->max_size = md->max_size; - new->options = md->options; - new->user_ptr = md->user_ptr; - new->eq = eq; - new->threshold = md->threshold; - new->pending = 0; - new->md_flags = (unlink == PTL_UNLINK) ? PTL_MD_FLAG_AUTO_UNLINK : 0; - - if ((md->options & PTL_MD_IOVEC) != 0) { - int total_length = 0; - - if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */ - return PTL_MD_INVALID; - - new->md_niov = niov = md->length; - - if (nal->cb_read (nal, private, new->md_iov.iov, md->start, - niov * sizeof (new->md_iov.iov[0]))) - return PTL_SEGV; + LASSERT (eq == NULL || + ((umd->options & PTL_MD_EVENT_START_DISABLE) != 0 && + (umd->options & PTL_MD_EVENT_END_DISABLE) == 0)); + + lmd->me = NULL; + lmd->start = umd->start; + lmd->offset = 0; + lmd->max_size = umd->max_size; + lmd->options = umd->options; + lmd->user_ptr = umd->user_ptr; + lmd->eq = eq; + lmd->threshold = umd->threshold; + lmd->pending = 0; + lmd->md_flags = (unlink == PTL_UNLINK) ? 
PTL_MD_FLAG_AUTO_UNLINK : 0; + + if ((umd->options & PTL_MD_IOVEC) != 0) { + + if ((umd->options & PTL_MD_KIOV) != 0) /* Can't specify both */ + return PTL_MD_ILLEGAL; + + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.iov, umd->start, + niov * sizeof (lmd->md_iov.iov[0])); for (i = 0; i < niov; i++) { /* We take the base address on trust */ - if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */ - return PTL_VAL_FAILED; + if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return PTL_MD_ILLEGAL; - total_length += new->md_iov.iov[i].iov_len; + total_length += lmd->md_iov.iov[i].iov_len; } - new->length = total_length; + lmd->length = total_length; - if (nal->cb_map != NULL) { - rc = nal->cb_map (nal, niov, new->md_iov.iov, - &new->md_addrkey); + if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return PTL_MD_ILLEGAL; + + if (nal->libnal_map != NULL) { + rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, + &lmd->md_addrkey); if (rc != PTL_OK) return (rc); } - } else if ((md->options & PTL_MD_KIOV) != 0) { + } else if ((umd->options & PTL_MD_KIOV) != 0) { #ifndef __KERNEL__ - return PTL_MD_INVALID; -#else - int total_length = 0; - + return PTL_MD_ILLEGAL; +#else /* Trap attempt to use paged I/O if unsupported early. 
*/ - if (nal->cb_send_pages == NULL || - nal->cb_recv_pages == NULL) + if (nal->libnal_send_pages == NULL || + nal->libnal_recv_pages == NULL) return PTL_MD_INVALID; - new->md_niov = niov = md->length; + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.kiov, umd->start, + niov * sizeof (lmd->md_iov.kiov[0])); - if (nal->cb_read (nal, private, new->md_iov.kiov, md->start, - niov * sizeof (new->md_iov.kiov[0]))) - return PTL_SEGV; - for (i = 0; i < niov; i++) { /* We take the page pointer on trust */ - if (new->md_iov.kiov[i].kiov_offset + - new->md_iov.kiov[i].kiov_len > PAGE_SIZE ) + if (lmd->md_iov.kiov[i].kiov_offset + + lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE ) return PTL_VAL_FAILED; /* invalid length */ - total_length += new->md_iov.kiov[i].kiov_len; + total_length += lmd->md_iov.kiov[i].kiov_len; } - new->length = total_length; + lmd->length = total_length; + + if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return PTL_MD_ILLEGAL; - if (nal->cb_map_pages != NULL) { - rc = nal->cb_map_pages (nal, niov, new->md_iov.kiov, - &new->md_addrkey); + if (nal->libnal_map_pages != NULL) { + rc = nal->libnal_map_pages (nal, niov, lmd->md_iov.kiov, + &lmd->md_addrkey); if (rc != PTL_OK) return (rc); } #endif } else { /* contiguous */ - new->length = md->length; - new->md_niov = niov = 1; - new->md_iov.iov[0].iov_base = md->start; - new->md_iov.iov[0].iov_len = md->length; - - if (nal->cb_map != NULL) { - rc = nal->cb_map (nal, niov, new->md_iov.iov, - &new->md_addrkey); + lmd->length = umd->length; + lmd->md_niov = niov = 1; + lmd->md_iov.iov[0].iov_base = umd->start; + lmd->md_iov.iov[0].iov_len = umd->length; + + if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > umd->length)) // illegal max_size + return PTL_MD_ILLEGAL; + + if (nal->libnal_map != NULL) { + rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, + 
&lmd->md_addrkey); if (rc != PTL_OK) return (rc); } @@ -207,140 +210,125 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, eq->eq_refcount++; /* It's good; let handle2md succeed and add to active mds */ - lib_initialise_handle (nal, &new->md_lh, PTL_COOKIE_TYPE_MD); - list_add (&new->md_list, &nal->ni.ni_active_mds); + lib_initialise_handle (nal, &lmd->md_lh, PTL_COOKIE_TYPE_MD); + list_add (&lmd->md_list, &nal->libnal_ni.ni_active_mds); return PTL_OK; } /* must be called with state lock held */ -void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new) +void +lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd) { /* NB this doesn't copy out all the iov entries so when a * discontiguous MD is copied out, the target gets to know the * original iov pointer (in start) and the number of entries it had * and that's all. */ - new->start = md->start; - new->length = ((md->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0) ? - md->length : md->md_niov; - new->threshold = md->threshold; - new->max_size = md->max_size; - new->options = md->options; - new->user_ptr = md->user_ptr; - ptl_eq2handle(&new->eventq, md->eq); + umd->start = lmd->start; + umd->length = ((lmd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0) ? 
+ lmd->length : lmd->md_niov; + umd->threshold = lmd->threshold; + umd->max_size = lmd->max_size; + umd->options = lmd->options; + umd->user_ptr = lmd->user_ptr; + ptl_eq2handle(&umd->eventq, nal, lmd->eq); } -int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_md_attach(nal_t *apinal, ptl_handle_me_t *meh, + ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle) { - /* - * Incoming: - * ptl_handle_me_t current_in - * ptl_md_t md_in - * ptl_unlink_t unlink_in - * - * Outgoing: - * ptl_handle_md_t * handle_out - */ - - PtlMDAttach_in *args = v_args; - PtlMDAttach_out *ret = v_ret; - lib_me_t *me; - lib_md_t *md; + lib_nal_t *nal = apinal->nal_data; + lib_me_t *me; + lib_md_t *md; unsigned long flags; + int rc; - if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && - args->md_in.length > PTL_MD_MAX_IOV) /* too many fragments */ - return (ret->rc = PTL_IOV_INVALID); + if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && + umd->length > PTL_MD_MAX_IOV) /* too many fragments */ + return PTL_IOV_INVALID; - md = lib_md_alloc(nal, &args->md_in); + md = lib_md_alloc(nal, umd); if (md == NULL) - return (ret->rc = PTL_NO_SPACE); + return PTL_NO_SPACE; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me = ptl_handle2me(&args->me_in, nal); + me = ptl_handle2me(meh, nal); if (me == NULL) { - ret->rc = PTL_ME_INVALID; + rc = PTL_ME_INVALID; } else if (me->md != NULL) { - ret->rc = PTL_ME_IN_USE; + rc = PTL_ME_IN_USE; } else { - ret->rc = lib_md_build(nal, md, private, &args->md_in, - &args->eq_in, args->unlink_in); - - if (ret->rc == PTL_OK) { + rc = lib_md_build(nal, md, umd, unlink); + if (rc == PTL_OK) { me->md = md; md->me = me; - ptl_md2handle(&ret->handle_out, md); + ptl_md2handle(handle, nal, md); - state_unlock (nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_OK); } } lib_md_free (nal, md); - state_unlock (nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + return (rc); } -int 
do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_md_bind(nal_t *apinal, + ptl_md_t *umd, ptl_unlink_t unlink, + ptl_handle_md_t *handle) { - /* - * Incoming: - * ptl_handle_ni_t ni_in - * ptl_md_t md_in - * - * Outgoing: - * ptl_handle_md_t * handle_out - */ - - PtlMDBind_in *args = v_args; - PtlMDBind_out *ret = v_ret; - lib_md_t *md; + lib_nal_t *nal = apinal->nal_data; + lib_md_t *md; unsigned long flags; + int rc; - if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && - args->md_in.length > PTL_MD_MAX_IOV) /* too many fragments */ - return (ret->rc = PTL_IOV_INVALID); + if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && + umd->length > PTL_MD_MAX_IOV) /* too many fragments */ + return PTL_IOV_INVALID; - md = lib_md_alloc(nal, &args->md_in); + md = lib_md_alloc(nal, umd); if (md == NULL) - return (ret->rc = PTL_NO_SPACE); + return PTL_NO_SPACE; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - ret->rc = lib_md_build(nal, md, private, &args->md_in, - &args->eq_in, args->unlink_in); + rc = lib_md_build(nal, md, umd, unlink); - if (ret->rc == PTL_OK) { - ptl_md2handle(&ret->handle_out, md); + if (rc == PTL_OK) { + ptl_md2handle(handle, nal, md); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_OK); } lib_md_free (nal, md); - state_unlock(nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + return (rc); } -int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_md_unlink (nal_t *apinal, ptl_handle_md_t *mdh) { - PtlMDUnlink_in *args = v_args; - PtlMDUnlink_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; ptl_event_t ev; lib_md_t *md; unsigned long flags; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL) { - state_unlock(nal, &flags); - return (ret->rc = PTL_MD_INVALID); + LIB_UNLOCK(nal, flags); + return PTL_MD_INVALID; } /* If the MD is busy, 
lib_md_unlink just marks it for deletion, and @@ -356,95 +344,82 @@ int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) ev.unlinked = 1; lib_md_deconstruct(nal, md, &ev.mem_desc); - lib_enq_event_locked(nal, private, md->eq, &ev); + lib_enq_event_locked(nal, NULL, md->eq, &ev); } - lib_md_deconstruct(nal, md, &ret->status_out); lib_md_unlink(nal, md); - ret->rc = PTL_OK; - state_unlock(nal, &flags); - - return (PTL_OK); + LIB_UNLOCK(nal, flags); + return PTL_OK; } -int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, - void *v_ret) +int +lib_api_md_update (nal_t *apinal, + ptl_handle_md_t *mdh, + ptl_md_t *oldumd, ptl_md_t *newumd, + ptl_handle_eq_t *testqh) { - /* - * Incoming: - * ptl_handle_md_t md_in - * ptl_md_t * old_inout - * ptl_md_t * new_inout - * ptl_handle_eq_t testq_in - * ptl_seq_t sequence_in - * - * Outgoing: - * ptl_md_t * old_inout - * ptl_md_t * new_inout - */ - PtlMDUpdate_internal_in *args = v_args; - PtlMDUpdate_internal_out *ret = v_ret; - lib_md_t *md; - lib_eq_t *test_eq = NULL; - ptl_md_t *new = &args->new_inout; + lib_nal_t *nal = apinal->nal_data; + lib_md_t *md; + lib_eq_t *test_eq = NULL; unsigned long flags; + int rc; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL) { - ret->rc = PTL_MD_INVALID; + rc = PTL_MD_INVALID; goto out; } - if (args->old_inout_valid) - lib_md_deconstruct(nal, md, &ret->old_inout); + if (oldumd != NULL) + lib_md_deconstruct(nal, md, oldumd); - if (!args->new_inout_valid) { - ret->rc = PTL_OK; + if (newumd == NULL) { + rc = PTL_OK; goto out; } /* XXX fttb, the new MD must be the same "shape" wrt fragmentation, * since we simply overwrite the old lib-md */ - if ((((new->options ^ md->options) & + if ((((newumd->options ^ md->options) & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0) || - ((new->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0 && - new->length != md->md_niov)) { - ret->rc = 
PTL_IOV_INVALID; + ((newumd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0 && + newumd->length != md->md_niov)) { + rc = PTL_IOV_INVALID; goto out; } - if (!PtlHandleIsEqual (args->testq_in, PTL_EQ_NONE)) { - test_eq = ptl_handle2eq(&args->testq_in, nal); + if (!PtlHandleIsEqual (*testqh, PTL_EQ_NONE)) { + test_eq = ptl_handle2eq(testqh, nal); if (test_eq == NULL) { - ret->rc = PTL_EQ_INVALID; + rc = PTL_EQ_INVALID; goto out; } } if (md->pending != 0) { - ret->rc = PTL_MD_NO_UPDATE; - goto out; + rc = PTL_MD_NO_UPDATE; + goto out; } if (test_eq == NULL || - test_eq->sequence == args->sequence_in) { + test_eq->eq_deq_seq == test_eq->eq_enq_seq) { lib_me_t *me = md->me; int unlink = (md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) ? PTL_UNLINK : PTL_RETAIN; // #warning this does not track eq refcounts properly - ret->rc = lib_md_build(nal, md, private, - new, &new->eventq, unlink); + rc = lib_md_build(nal, md, newumd, unlink); md->me = me; } else { - ret->rc = PTL_MD_NO_UPDATE; + rc = PTL_MD_NO_UPDATE; } out: - state_unlock(nal, &flags); - return (ret->rc); + LIB_UNLOCK(nal, flags); + + return rc; } diff --git a/lustre/portals/portals/lib-me.c b/lustre/portals/portals/lib-me.c index 271fc82..9665b4f 100644 --- a/lustre/portals/portals/lib-me.c +++ b/lustre/portals/portals/lib-me.c @@ -31,120 +31,129 @@ #endif #include -#include -static void lib_me_dump(nal_cb_t * nal, lib_me_t * me); - -int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_me_attach(nal_t *apinal, + ptl_pt_index_t portal, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle) { - PtlMEAttach_in *args = v_args; - PtlMEAttach_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; - lib_ptl_t *tbl = &ni->tbl; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; + lib_ptl_t *tbl = &ni->ni_portals; + lib_me_t *me; unsigned long flags; - lib_me_t *me; - if 
(args->index_in >= tbl->size) - return ret->rc = PTL_PT_INDEX_INVALID; + if (portal >= tbl->size) + return PTL_PT_INDEX_INVALID; /* Should check for valid matchid, but not yet */ - if (0) - return ret->rc = PTL_PROCESS_INVALID; me = lib_me_alloc (nal); if (me == NULL) - return (ret->rc = PTL_NO_SPACE); + return PTL_NO_SPACE; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me->match_id = args->match_id_in; - me->match_bits = args->match_bits_in; - me->ignore_bits = args->ignore_bits_in; - me->unlink = args->unlink_in; + me->match_id = match_id; + me->match_bits = match_bits; + me->ignore_bits = ignore_bits; + me->unlink = unlink; me->md = NULL; lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME); - if (args->position_in == PTL_INS_AFTER) - list_add_tail(&me->me_list, &(tbl->tbl[args->index_in])); + if (pos == PTL_INS_AFTER) + list_add_tail(&me->me_list, &(tbl->tbl[portal])); else - list_add(&me->me_list, &(tbl->tbl[args->index_in])); + list_add(&me->me_list, &(tbl->tbl[portal])); - ptl_me2handle(&ret->handle_out, me); + ptl_me2handle(handle, nal, me); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return ret->rc = PTL_OK; + return PTL_OK; } -int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_me_insert(nal_t *apinal, + ptl_handle_me_t *current_meh, + ptl_process_id_t match_id, + ptl_match_bits_t match_bits, + ptl_match_bits_t ignore_bits, + ptl_unlink_t unlink, ptl_ins_pos_t pos, + ptl_handle_me_t *handle) { - PtlMEInsert_in *args = v_args; - PtlMEInsert_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; + lib_me_t *current_me; + lib_me_t *new_me; unsigned long flags; - lib_me_t *me; - lib_me_t *new; - new = lib_me_alloc (nal); - if (new == NULL) - return (ret->rc = PTL_NO_SPACE); + new_me = lib_me_alloc (nal); + if (new_me == NULL) + return PTL_NO_SPACE; /* Should check for valid matchid, but not yet */ - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me = ptl_handle2me(&args->current_in, nal); - 
if (me == NULL) { - lib_me_free (nal, new); + current_me = ptl_handle2me(current_meh, nal); + if (current_me == NULL) { + lib_me_free (nal, new_me); - state_unlock (nal, &flags); - return (ret->rc = PTL_ME_INVALID); + LIB_UNLOCK(nal, flags); + return PTL_ME_INVALID; } - new->match_id = args->match_id_in; - new->match_bits = args->match_bits_in; - new->ignore_bits = args->ignore_bits_in; - new->unlink = args->unlink_in; - new->md = NULL; + new_me->match_id = match_id; + new_me->match_bits = match_bits; + new_me->ignore_bits = ignore_bits; + new_me->unlink = unlink; + new_me->md = NULL; - lib_initialise_handle (nal, &new->me_lh, PTL_COOKIE_TYPE_ME); + lib_initialise_handle (nal, &new_me->me_lh, PTL_COOKIE_TYPE_ME); - if (args->position_in == PTL_INS_AFTER) - list_add_tail(&new->me_list, &me->me_list); + if (pos == PTL_INS_AFTER) + list_add_tail(&new_me->me_list, ¤t_me->me_list); else - list_add(&new->me_list, &me->me_list); + list_add(&new_me->me_list, ¤t_me->me_list); - ptl_me2handle(&ret->handle_out, new); + ptl_me2handle(handle, nal, new_me); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return ret->rc = PTL_OK; + return PTL_OK; } -int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_me_unlink (nal_t *apinal, ptl_handle_me_t *meh) { - PtlMEUnlink_in *args = v_args; - PtlMEUnlink_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; unsigned long flags; - lib_me_t *me; + lib_me_t *me; + int rc; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - me = ptl_handle2me(&args->current_in, nal); + me = ptl_handle2me(meh, nal); if (me == NULL) { - ret->rc = PTL_ME_INVALID; + rc = PTL_ME_INVALID; } else { lib_me_unlink(nal, me); - ret->rc = PTL_OK; + rc = PTL_OK; } - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return (ret->rc); + return (rc); } /* call with state_lock please */ -void lib_me_unlink(nal_cb_t *nal, lib_me_t *me) +void +lib_me_unlink(lib_nal_t *nal, lib_me_t *me) { list_del (&me->me_list); @@ -157,64 
+166,20 @@ void lib_me_unlink(nal_cb_t *nal, lib_me_t *me) lib_me_free(nal, me); } -int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +#if 0 +static void +lib_me_dump(lib_nal_t *nal, lib_me_t * me) { - PtlTblDump_in *args = v_args; - PtlTblDump_out *ret = v_ret; - lib_ptl_t *tbl = &nal->ni.tbl; - ptl_handle_any_t handle; - struct list_head *tmp; - unsigned long flags; + CWARN("Match Entry %p ("LPX64")\n", me, + me->me_lh.lh_cookie); - if (args->index_in < 0 || args->index_in >= tbl->size) - return ret->rc = PTL_PT_INDEX_INVALID; - - nal->cb_printf(nal, "Portal table index %d\n", args->index_in); - - state_lock(nal, &flags); - list_for_each(tmp, &(tbl->tbl[args->index_in])) { - lib_me_t *me = list_entry(tmp, lib_me_t, me_list); - ptl_me2handle(&handle, me); - lib_me_dump(nal, me); - } - state_unlock(nal, &flags); + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->match_bits, me->ignore_bits); - return ret->rc = PTL_OK; -} - -int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) -{ - PtlMEDump_in *args = v_args; - PtlMEDump_out *ret = v_ret; - lib_me_t *me; - unsigned long flags; - - state_lock(nal, &flags); - - me = ptl_handle2me(&args->current_in, nal); - if (me == NULL) { - ret->rc = PTL_ME_INVALID; - } else { - lib_me_dump(nal, me); - ret->rc = PTL_OK; - } - - state_unlock(nal, &flags); - - return ret->rc; -} - -static void lib_me_dump(nal_cb_t * nal, lib_me_t * me) -{ - nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, - me->me_lh.lh_cookie); - - nal->cb_printf(nal, "\tMatch/Ignore\t= %016lx / %016lx\n", - me->match_bits, me->ignore_bits); - - nal->cb_printf(nal, "\tMD\t= %p\n", me->md); - nal->cb_printf(nal, "\tprev\t= %p\n", - list_entry(me->me_list.prev, lib_me_t, me_list)); - nal->cb_printf(nal, "\tnext\t= %p\n", - list_entry(me->me_list.next, lib_me_t, me_list)); + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, lib_me_t, me_list)); + CWARN("\tnext\t= %p\n", + 
list_entry(me->me_list.next, lib_me_t, me_list)); } +#endif diff --git a/lustre/portals/portals/lib-move.c b/lustre/portals/portals/lib-move.c index 477ddf8..9dcc06e 100644 --- a/lustre/portals/portals/lib-move.c +++ b/lustre/portals/portals/lib-move.c @@ -31,20 +31,19 @@ #endif #include #include -#include /* forward ref */ -static void lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg); +static void lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg); static lib_md_t * -lib_match_md(nal_cb_t *nal, int index, int op_mask, +lib_match_md(lib_nal_t *nal, int index, int op_mask, ptl_nid_t src_nid, ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset, ptl_match_bits_t match_bits, lib_msg_t *msg, ptl_size_t *mlength_out, ptl_size_t *offset_out) { - lib_ni_t *ni = &nal->ni; - struct list_head *match_list = &ni->tbl.tbl[index]; + lib_ni_t *ni = &nal->libnal_ni; + struct list_head *match_list = &ni->ni_portals.tbl[index]; struct list_head *tmp; lib_me_t *me; lib_md_t *md; @@ -55,9 +54,9 @@ lib_match_md(nal_cb_t *nal, int index, int op_mask, CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d " "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits); - if (index < 0 || index >= ni->tbl.size) { + if (index < 0 || index >= ni->ni_portals.size) { CERROR("Invalid portal %d not in [0-%d]\n", - index, ni->tbl.size); + index, ni->ni_portals.size); goto failed; } @@ -153,66 +152,65 @@ lib_match_md(nal_cb_t *nal, int index, int op_mask, failed: CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64 " offset %d length %d: no match\n", - ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT", + ni->ni_pid.nid, (op_mask == PTL_MD_OP_GET) ? 
"GET" : "PUT", src_nid, src_pid, index, match_bits, roffset, rlength); RETURN(NULL); } -int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret) +int lib_api_fail_nid (nal_t *apinal, ptl_nid_t nid, unsigned int threshold) { - PtlFailNid_in *args = v_args; - PtlFailNid_out *ret = v_ret; + lib_nal_t *nal = apinal->nal_data; lib_test_peer_t *tp; unsigned long flags; struct list_head *el; struct list_head *next; struct list_head cull; - if (args->threshold != 0) { + if (threshold != 0) { /* Adding a new entry */ - tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp)); + PORTAL_ALLOC(tp, sizeof(*tp)); if (tp == NULL) - return (ret->rc = PTL_FAIL); + return PTL_NO_SPACE; - tp->tp_nid = args->nid; - tp->tp_threshold = args->threshold; + tp->tp_nid = nid; + tp->tp_threshold = threshold; - state_lock (nal, &flags); - list_add (&tp->tp_list, &nal->ni.ni_test_peers); - state_unlock (nal, &flags); - return (ret->rc = PTL_OK); + LIB_LOCK(nal, flags); + list_add_tail (&tp->tp_list, &nal->libnal_ni.ni_test_peers); + LIB_UNLOCK(nal, flags); + return PTL_OK; } /* removing entries */ INIT_LIST_HEAD (&cull); - state_lock (nal, &flags); + LIB_LOCK(nal, flags); - list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) { tp = list_entry (el, lib_test_peer_t, tp_list); if (tp->tp_threshold == 0 || /* needs culling anyway */ - args->nid == PTL_NID_ANY || /* removing all entries */ - tp->tp_nid == args->nid) /* matched this one */ + nid == PTL_NID_ANY || /* removing all entries */ + tp->tp_nid == nid) /* matched this one */ { list_del (&tp->tp_list); list_add (&tp->tp_list, &cull); } } - state_unlock (nal, &flags); + LIB_UNLOCK(nal, flags); while (!list_empty (&cull)) { tp = list_entry (cull.next, lib_test_peer_t, tp_list); list_del (&tp->tp_list); - nal->cb_free (nal, tp, sizeof (*tp)); + PORTAL_FREE(tp, sizeof (*tp)); } - return (ret->rc = PTL_OK); + return PTL_OK; } static int -fail_peer (nal_cb_t 
*nal, ptl_nid_t nid, int outgoing) +fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing) { lib_test_peer_t *tp; struct list_head *el; @@ -223,9 +221,9 @@ fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) INIT_LIST_HEAD (&cull); - state_lock (nal, &flags); + LIB_LOCK (nal, flags); - list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) { tp = list_entry (el, lib_test_peer_t, tp_list); if (tp->tp_threshold == 0) { @@ -257,13 +255,13 @@ fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) } } - state_unlock (nal, &flags); + LIB_UNLOCK (nal, flags); while (!list_empty (&cull)) { tp = list_entry (cull.next, lib_test_peer_t, tp_list); list_del (&tp->tp_list); - nal->cb_free (nal, tp, sizeof (*tp)); + PORTAL_FREE(tp, sizeof (*tp)); } return (fail); @@ -554,52 +552,52 @@ lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, #endif ptl_err_t -lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, +lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) { if (mlen == 0) - return (nal->cb_recv(nal, private, msg, - 0, NULL, - offset, mlen, rlen)); + return (nal->libnal_recv(nal, private, msg, + 0, NULL, + offset, mlen, rlen)); if ((md->options & PTL_MD_KIOV) == 0) - return (nal->cb_recv(nal, private, msg, - md->md_niov, md->md_iov.iov, - offset, mlen, rlen)); + return (nal->libnal_recv(nal, private, msg, + md->md_niov, md->md_iov.iov, + offset, mlen, rlen)); - return (nal->cb_recv_pages(nal, private, msg, - md->md_niov, md->md_iov.kiov, - offset, mlen, rlen)); + return (nal->libnal_recv_pages(nal, private, msg, + md->md_niov, md->md_iov.kiov, + offset, mlen, rlen)); } ptl_err_t -lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, +lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, lib_md_t *md, ptl_size_t offset, ptl_size_t len) { if (len == 0) - return 
(nal->cb_send(nal, private, msg, - hdr, type, nid, pid, - 0, NULL, - offset, len)); + return (nal->libnal_send(nal, private, msg, + hdr, type, nid, pid, + 0, NULL, + offset, len)); if ((md->options & PTL_MD_KIOV) == 0) - return (nal->cb_send(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.iov, - offset, len)); - - return (nal->cb_send_pages(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.kiov, - offset, len)); + return (nal->libnal_send(nal, private, msg, + hdr, type, nid, pid, + md->md_niov, md->md_iov.iov, + offset, len)); + + return (nal->libnal_send_pages(nal, private, msg, + hdr, type, nid, pid, + md->md_niov, md->md_iov.kiov, + offset, len)); } static void -lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg) +lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg) { - /* ALWAYS called holding the state_lock */ - lib_counters_t *counters = &nal->ni.counters; + /* ALWAYS called holding the LIB_LOCK */ + lib_counters_t *counters = &nal->libnal_ni.ni_counters; /* Here, we commit the MD to a network OP by marking it busy and * decrementing its threshold. Come what may, the network "owns" @@ -616,11 +614,11 @@ lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg) if (counters->msgs_alloc > counters->msgs_max) counters->msgs_max = counters->msgs_alloc; - list_add (&msg->msg_list, &nal->ni.ni_active_msgs); + list_add (&msg->msg_list, &nal->libnal_ni.ni_active_msgs); } static void -lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) +lib_drop_message (lib_nal_t *nal, void *private, ptl_hdr_t *hdr) { unsigned long flags; @@ -628,10 +626,10 @@ lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) * to receive (init_msg() not called) and therefore can't cause an * event. 
*/ - state_lock(nal, &flags); - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += hdr->payload_length; - state_unlock(nal, &flags); + LIB_LOCK(nal, flags); + nal->libnal_ni.ni_counters.drop_count++; + nal->libnal_ni.ni_counters.drop_length += hdr->payload_length; + LIB_UNLOCK(nal, flags); /* NULL msg => if NAL calls lib_finalize it will be a noop */ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); @@ -645,9 +643,9 @@ lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) * */ static ptl_err_t -parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) +parse_put(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; ptl_size_t mlength = 0; ptl_size_t offset = 0; ptl_err_t rc; @@ -659,7 +657,7 @@ parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index); hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset); - state_lock(nal, &flags); + LIB_LOCK(nal, flags); md = lib_match_md(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT, hdr->src_nid, hdr->src_pid, @@ -667,7 +665,7 @@ parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->msg.put.match_bits, msg, &mlength, &offset); if (md == NULL) { - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } @@ -679,24 +677,24 @@ parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) msg->ack_wmd = hdr->msg.put.ack_wmd; } - ni->counters.recv_count++; - ni->counters.recv_length += mlength; + ni->ni_counters.recv_count++; + ni->ni_counters.recv_length += mlength; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); rc = lib_recv(nal, private, msg, md, offset, mlength, hdr->payload_length); if (rc != PTL_OK) CERROR(LPU64": error on receiving PUT from "LPU64": %d\n", - ni->nid, hdr->src_nid, rc); + ni->ni_pid.nid, hdr->src_nid, rc); return (rc); } static ptl_err_t 
-parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) +parse_get(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; ptl_size_t mlength = 0; ptl_size_t offset = 0; lib_md_t *md; @@ -710,7 +708,7 @@ parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length); hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset); - state_lock(nal, &flags); + LIB_LOCK(nal, flags); md = lib_match_md(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET, hdr->src_nid, hdr->src_pid, @@ -718,24 +716,24 @@ parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->msg.get.match_bits, msg, &mlength, &offset); if (md == NULL) { - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } msg->ev.type = PTL_EVENT_GET_END; msg->ev.hdr_data = 0; - ni->counters.send_count++; - ni->counters.send_length += mlength; + ni->ni_counters.send_count++; + ni->ni_counters.send_length += mlength; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); memset (&reply, 0, sizeof (reply)); reply.type = HTON__u32 (PTL_MSG_REPLY); reply.dest_nid = HTON__u64 (hdr->src_nid); - reply.src_nid = HTON__u64 (ni->nid); reply.dest_pid = HTON__u32 (hdr->src_pid); - reply.src_pid = HTON__u32 (ni->pid); + reply.src_nid = HTON__u64 (ni->ni_pid.nid); + reply.src_pid = HTON__u32 (ni->ni_pid.pid); reply.payload_length = HTON__u32 (mlength); reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd; @@ -747,7 +745,7 @@ parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->src_nid, hdr->src_pid, md, offset, mlength); if (rc != PTL_OK) CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n", - ni->nid, hdr->src_nid, rc); + ni->ni_pid.nid, hdr->src_nid, rc); /* Discard any junk after the hdr */ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); @@ -756,27 +754,27 @@ parse_get(nal_cb_t *nal, 
ptl_hdr_t *hdr, void *private, lib_msg_t *msg) } static ptl_err_t -parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) +parse_reply(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; lib_md_t *md; int rlength; int length; unsigned long flags; ptl_err_t rc; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); /* NB handles only looked up by creator (no flips) */ md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal); if (md == NULL || md->threshold == 0) { CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n", - ni->nid, hdr->src_nid, + ni->ni_pid.nid, hdr->src_nid, md == NULL ? "invalid" : "inactive", hdr->msg.reply.dst_wmd.wh_interface_cookie, hdr->msg.reply.dst_wmd.wh_object_cookie); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } @@ -788,10 +786,10 @@ parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) if ((md->options & PTL_MD_TRUNCATE) == 0) { CERROR (LPU64": Dropping REPLY from "LPU64 " length %d for MD "LPX64" would overflow (%d)\n", - ni->nid, hdr->src_nid, length, + ni->ni_pid.nid, hdr->src_nid, length, hdr->msg.reply.dst_wmd.wh_object_cookie, md->length); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } length = md->length; @@ -812,23 +810,23 @@ parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - ni->counters.recv_count++; - ni->counters.recv_length += length; + ni->ni_counters.recv_count++; + ni->ni_counters.recv_length += length; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); rc = lib_recv(nal, private, msg, md, 0, length, rlength); if (rc != PTL_OK) CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n", - ni->nid, hdr->src_nid, rc); + ni->ni_pid.nid, hdr->src_nid, rc); return (rc); } static ptl_err_t -parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) 
+parse_ack(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; lib_md_t *md; unsigned long flags; @@ -836,23 +834,23 @@ parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits); hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength); - state_lock(nal, &flags); + LIB_LOCK(nal, flags); /* NB handles only looked up by creator (no flips) */ md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal); if (md == NULL || md->threshold == 0) { CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD " - LPX64"."LPX64"\n", ni->nid, hdr->src_nid, + LPX64"."LPX64"\n", ni->ni_pid.nid, hdr->src_nid, (md == NULL) ? "invalid" : "inactive", hdr->msg.ack.dst_wmd.wh_interface_cookie, hdr->msg.ack.dst_wmd.wh_object_cookie); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return (PTL_FAIL); } CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", - ni->nid, hdr->src_nid, + ni->ni_pid.nid, hdr->src_nid, hdr->msg.ack.dst_wmd.wh_object_cookie); lib_commit_md(nal, md, msg); @@ -865,9 +863,9 @@ parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - ni->counters.recv_count++; + ni->ni_counters.recv_count++; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); /* We have received and matched up the ack OK, create the * completion event now... 
*/ @@ -898,125 +896,152 @@ hdr_type_string (ptl_hdr_t *hdr) } } -void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr) +void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr) { char *type_str = hdr_type_string (hdr); - nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str); - nal->cb_printf(nal, " From nid/pid %Lu/%Lu", hdr->src_nid, - hdr->src_pid); - nal->cb_printf(nal, " To nid/pid %Lu/%Lu\n", hdr->dest_nid, - hdr->dest_pid); + CWARN("P3 Header at %p of type %s\n", hdr, type_str); + CWARN(" From nid/pid "LPX64"/%u", hdr->src_nid, hdr->src_pid); + CWARN(" To nid/pid "LPX64"/%u\n", hdr->dest_nid, hdr->dest_pid); switch (hdr->type) { default: break; case PTL_MSG_PUT: - nal->cb_printf(nal, - " Ptl index %d, ack md "LPX64"."LPX64", " - "match bits "LPX64"\n", - hdr->msg.put.ptl_index, - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - hdr->msg.put.match_bits); - nal->cb_printf(nal, - " Length %d, offset %d, hdr data "LPX64"\n", - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.hdr_data); + CWARN(" Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPX64"\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + CWARN(" Length %d, offset %d, hdr data "LPX64"\n", + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.hdr_data); break; case PTL_MSG_GET: - nal->cb_printf(nal, - " Ptl index %d, return md "LPX64"."LPX64", " - "match bits "LPX64"\n", hdr->msg.get.ptl_index, - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - nal->cb_printf(nal, - " Length %d, src offset %d\n", - hdr->msg.get.sink_length, - hdr->msg.get.src_offset); + CWARN(" Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPX64"\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CWARN(" Length %d, 
src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); break; case PTL_MSG_ACK: - nal->cb_printf(nal, " dst md "LPX64"."LPX64", " - "manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - hdr->msg.ack.mlength); + CWARN(" dst md "LPX64"."LPX64", " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); break; case PTL_MSG_REPLY: - nal->cb_printf(nal, " dst md "LPX64"."LPX64", " - "length %d\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie, - hdr->payload_length); + CWARN(" dst md "LPX64"."LPX64", " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + hdr->payload_length); } } /* end of print_hdr() */ -void -lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) +ptl_err_t +lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, void *private) { unsigned long flags; ptl_err_t rc; lib_msg_t *msg; + + /* NB we return PTL_OK if we manage to parse the header and believe + * it looks OK. 
Anything that goes wrong with receiving the + * message after that point is the responsibility of the NAL */ /* convert common fields to host byte order */ - hdr->dest_nid = NTOH__u64 (hdr->dest_nid); + hdr->type = NTOH__u32 (hdr->type); hdr->src_nid = NTOH__u64 (hdr->src_nid); - hdr->dest_pid = NTOH__u32 (hdr->dest_pid); hdr->src_pid = NTOH__u32 (hdr->src_pid); - hdr->type = NTOH__u32 (hdr->type); + hdr->dest_pid = NTOH__u32 (hdr->dest_pid); hdr->payload_length = NTOH__u32(hdr->payload_length); -#if 0 - nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n", - nal->ni.nid, nal, hdr, hdr->type); - print_hdr(nal, hdr); -#endif - if (hdr->type == PTL_MSG_HELLO) { + + switch (hdr->type) { + case PTL_MSG_HELLO: { /* dest_nid is really ptl_magicversion_t */ ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid; - CERROR (LPU64": Dropping unexpected HELLO message: " + mv->magic = NTOH__u32(mv->magic); + mv->version_major = NTOH__u16(mv->version_major); + mv->version_minor = NTOH__u16(mv->version_minor); + + if (mv->magic == PORTALS_PROTO_MAGIC && + mv->version_major == PORTALS_PROTO_VERSION_MAJOR && + mv->version_minor == PORTALS_PROTO_VERSION_MINOR) { + CWARN (LPU64": Dropping unexpected HELLO message: " + "magic %d, version %d.%d from "LPD64"\n", + nal->libnal_ni.ni_pid.nid, mv->magic, + mv->version_major, mv->version_minor, + hdr->src_nid); + + /* it's good but we don't want it */ + lib_drop_message(nal, private, hdr); + return PTL_OK; + } + + /* we got garbage */ + CERROR (LPU64": Bad HELLO message: " "magic %d, version %d.%d from "LPD64"\n", - nal->ni.nid, mv->magic, + nal->libnal_ni.ni_pid.nid, mv->magic, mv->version_major, mv->version_minor, hdr->src_nid); - lib_drop_message(nal, private, hdr); - return; + return PTL_FAIL; } - - if (hdr->dest_nid != nal->ni.nid) { - CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64 - " (not me)\n", nal->ni.nid, hdr_type_string (hdr), - hdr->src_nid, hdr->dest_nid); - lib_drop_message(nal, private, hdr); - 
return; + + case PTL_MSG_ACK: + case PTL_MSG_PUT: + case PTL_MSG_GET: + case PTL_MSG_REPLY: + hdr->dest_nid = NTOH__u64 (hdr->dest_nid); + if (hdr->dest_nid != nal->libnal_ni.ni_pid.nid) { + CERROR(LPU64": BAD dest NID in %s message from" + LPU64" to "LPU64" (not me)\n", + nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), + hdr->src_nid, hdr->dest_nid); + return PTL_FAIL; + } + break; + + default: + CERROR(LPU64": Bad message type 0x%x from "LPU64"\n", + nal->libnal_ni.ni_pid.nid, hdr->type, hdr->src_nid); + return PTL_FAIL; } - if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + /* We've decided we're not receiving garbage since we can parse the + * header. We will return PTL_OK come what may... */ + + if (!list_empty (&nal->libnal_ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, hdr->src_nid, 0)) /* shall we now? */ { CERROR(LPU64": Dropping incoming %s from "LPU64 ": simulated failure\n", - nal->ni.nid, hdr_type_string (hdr), + nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), hdr->src_nid); lib_drop_message(nal, private, hdr); - return; + return PTL_OK; } msg = lib_msg_alloc(nal); if (msg == NULL) { CERROR(LPU64": Dropping incoming %s from "LPU64 ": can't allocate a lib_msg_t\n", - nal->ni.nid, hdr_type_string (hdr), + nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), hdr->src_nid); lib_drop_message(nal, private, hdr); - return; + return PTL_OK; } switch (hdr->type) { @@ -1033,10 +1058,8 @@ lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) rc = parse_reply(nal, hdr, private, msg); break; default: - CERROR(LPU64": Dropping message from "LPU64 - ": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid, - hdr->type); - rc = PTL_FAIL; + LASSERT(0); + rc = PTL_FAIL; /* no compiler warning please */ break; } @@ -1045,123 +1068,114 @@ lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) /* committed... 
*/ lib_finalize(nal, private, msg, rc); } else { - state_lock(nal, &flags); - lib_msg_free(nal, msg); /* expects state_lock held */ - state_unlock(nal, &flags); + LIB_LOCK(nal, flags); + lib_msg_free(nal, msg); /* expects LIB_LOCK held */ + LIB_UNLOCK(nal, flags); lib_drop_message(nal, private, hdr); } } + + return PTL_OK; + /* That's "OK I can parse it", not "OK I like it" :) */ } int -do_PtlPut(nal_cb_t *nal, void *private, void *v_args, void *v_ret) +lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, + ptl_ack_req_t ack, ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, + ptl_size_t offset, ptl_hdr_data_t hdr_data) { - /* - * Incoming: - * ptl_handle_md_t md_in - * ptl_ack_req_t ack_req_in - * ptl_process_id_t target_in - * ptl_pt_index_t portal_in - * ptl_ac_index_t cookie_in - * ptl_match_bits_t match_bits_in - * ptl_size_t offset_in - * - * Outgoing: - */ - - PtlPut_in *args = v_args; - ptl_process_id_t *id = &args->target_in; - PtlPut_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; lib_msg_t *msg; ptl_hdr_t hdr; lib_md_t *md; unsigned long flags; int rc; - if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + if (!list_empty (&ni->ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? 
*/ { - CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", - nal->ni.nid, id->nid); - return (ret->rc = PTL_PROCESS_INVALID); + CERROR("Dropping PUT to "LPU64": simulated failure\n", + id->nid); + return PTL_PROCESS_INVALID; } msg = lib_msg_alloc(nal); if (msg == NULL) { CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n", - ni->nid, id->nid); - return (ret->rc = PTL_NO_SPACE); + ni->ni_pid.nid, id->nid); + return PTL_NO_SPACE; } - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL || md->threshold == 0) { lib_msg_free(nal, msg); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return (ret->rc = PTL_MD_INVALID); + return PTL_MD_INVALID; } - CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid, - (unsigned long)id->pid); + CDEBUG(D_NET, "PtlPut -> "LPX64"\n", id->nid); memset (&hdr, 0, sizeof (hdr)); hdr.type = HTON__u32 (PTL_MSG_PUT); hdr.dest_nid = HTON__u64 (id->nid); - hdr.src_nid = HTON__u64 (ni->nid); hdr.dest_pid = HTON__u32 (id->pid); - hdr.src_pid = HTON__u32 (ni->pid); + hdr.src_nid = HTON__u64 (ni->ni_pid.nid); + hdr.src_pid = HTON__u32 (ni->ni_pid.pid); hdr.payload_length = HTON__u32 (md->length); /* NB handles only looked up by creator (no flips) */ - if (args->ack_req_in == PTL_ACK_REQ) { + if (ack == PTL_ACK_REQ) { hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie; hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie; } else { hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE; } - hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in); - hdr.msg.put.ptl_index = HTON__u32 (args->portal_in); - hdr.msg.put.offset = HTON__u32 (args->offset_in); - hdr.msg.put.hdr_data = args->hdr_data_in; + hdr.msg.put.match_bits = HTON__u64 (match_bits); + hdr.msg.put.ptl_index = HTON__u32 (portal); + hdr.msg.put.offset = HTON__u32 (offset); + hdr.msg.put.hdr_data = hdr_data; lib_commit_md(nal, md, msg); msg->ev.type = 
PTL_EVENT_SEND_END; - msg->ev.initiator.nid = ni->nid; - msg->ev.initiator.pid = ni->pid; - msg->ev.portal = args->portal_in; - msg->ev.match_bits = args->match_bits_in; + msg->ev.initiator.nid = ni->ni_pid.nid; + msg->ev.initiator.pid = ni->ni_pid.pid; + msg->ev.portal = portal; + msg->ev.match_bits = match_bits; msg->ev.rlength = md->length; msg->ev.mlength = md->length; - msg->ev.offset = args->offset_in; - msg->ev.hdr_data = args->hdr_data_in; + msg->ev.offset = offset; + msg->ev.hdr_data = hdr_data; lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - ni->counters.send_count++; - ni->counters.send_length += md->length; + ni->ni_counters.send_count++; + ni->ni_counters.send_length += md->length; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, + rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_PUT, id->nid, id->pid, md, 0, md->length); if (rc != PTL_OK) { - CERROR(LPU64": error sending PUT to "LPU64": %d\n", - ni->nid, id->nid, rc); - lib_finalize (nal, private, msg, rc); + CERROR("Error sending PUT to "LPX64": %d\n", + id->nid, rc); + lib_finalize (nal, NULL, msg, rc); } /* completion will be signalled by an event */ - return ret->rc = PTL_OK; + return PTL_OK; } lib_msg_t * -lib_create_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) +lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) { /* The NAL can DMA direct to the GET md (i.e. no REPLY msg). 
This * returns a msg for the NAL to pass to lib_finalize() when the sink @@ -1170,12 +1184,12 @@ lib_create_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when * lib_finalize() is called on it, so the NAL must call this first */ - lib_ni_t *ni = &nal->ni; + lib_ni_t *ni = &nal->libnal_ni; lib_msg_t *msg = lib_msg_alloc(nal); lib_md_t *getmd = getmsg->md; unsigned long flags; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); LASSERT (getmd->pending > 0); @@ -1205,72 +1219,60 @@ lib_create_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc); - ni->counters.recv_count++; - ni->counters.recv_length += getmd->length; + ni->ni_counters.recv_count++; + ni->ni_counters.recv_length += getmd->length; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); return msg; drop_msg: lib_msg_free(nal, msg); drop: - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += getmd->length; + nal->libnal_ni.ni_counters.drop_count++; + nal->libnal_ni.ni_counters.drop_length += getmd->length; - state_unlock (nal, &flags); + LIB_UNLOCK (nal, flags); return NULL; } int -do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret) +lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, ptl_process_id_t *id, + ptl_pt_index_t portal, ptl_ac_index_t ac, + ptl_match_bits_t match_bits, ptl_size_t offset) { - /* - * Incoming: - * ptl_handle_md_t md_in - * ptl_process_id_t target_in - * ptl_pt_index_t portal_in - * ptl_ac_index_t cookie_in - * ptl_match_bits_t match_bits_in - * ptl_size_t offset_in - * - * Outgoing: - */ - - PtlGet_in *args = v_args; - ptl_process_id_t *id = &args->target_in; - PtlGet_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; lib_msg_t *msg; ptl_hdr_t hdr; lib_md_t *md; unsigned long flags; int rc; - if (!list_empty (&nal->ni.ni_test_peers) && /* normally we 
don't */ + if (!list_empty (&ni->ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? */ { - CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", - nal->ni.nid, id->nid); - return (ret->rc = PTL_PROCESS_INVALID); + CERROR("Dropping PUT to "LPX64": simulated failure\n", + id->nid); + return PTL_PROCESS_INVALID; } msg = lib_msg_alloc(nal); if (msg == NULL) { - CERROR(LPU64": Dropping GET to "LPU64": ENOMEM on lib_msg_t\n", - ni->nid, id->nid); - return (ret->rc = PTL_NO_SPACE); + CERROR("Dropping GET to "LPU64": ENOMEM on lib_msg_t\n", + id->nid); + return PTL_NO_SPACE; } - state_lock(nal, &flags); + LIB_LOCK(nal, flags); - md = ptl_handle2md(&args->md_in, nal); + md = ptl_handle2md(mdh, nal); if (md == NULL || !md->threshold) { lib_msg_free(nal, msg); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - return ret->rc = PTL_MD_INVALID; + return PTL_MD_INVALID; } CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, @@ -1279,48 +1281,47 @@ do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret) memset (&hdr, 0, sizeof (hdr)); hdr.type = HTON__u32 (PTL_MSG_GET); hdr.dest_nid = HTON__u64 (id->nid); - hdr.src_nid = HTON__u64 (ni->nid); hdr.dest_pid = HTON__u32 (id->pid); - hdr.src_pid = HTON__u32 (ni->pid); + hdr.src_nid = HTON__u64 (ni->ni_pid.nid); + hdr.src_pid = HTON__u32 (ni->ni_pid.pid); hdr.payload_length = 0; /* NB handles only looked up by creator (no flips) */ hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie; hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie; - hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in); - hdr.msg.get.ptl_index = HTON__u32 (args->portal_in); - hdr.msg.get.src_offset = HTON__u32 (args->offset_in); + hdr.msg.get.match_bits = HTON__u64 (match_bits); + hdr.msg.get.ptl_index = HTON__u32 (portal); + hdr.msg.get.src_offset = HTON__u32 (offset); hdr.msg.get.sink_length = HTON__u32 (md->length); lib_commit_md(nal, md, msg); msg->ev.type 
= PTL_EVENT_SEND_END; - msg->ev.initiator.nid = ni->nid; - msg->ev.initiator.pid = ni->pid; - msg->ev.portal = args->portal_in; - msg->ev.match_bits = args->match_bits_in; + msg->ev.initiator = ni->ni_pid; + msg->ev.portal = portal; + msg->ev.match_bits = match_bits; msg->ev.rlength = md->length; msg->ev.mlength = md->length; - msg->ev.offset = args->offset_in; + msg->ev.offset = offset; msg->ev.hdr_data = 0; lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - ni->counters.send_count++; + ni->ni_counters.send_count++; - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); - rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET, + rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_GET, id->nid, id->pid, NULL, 0, 0); if (rc != PTL_OK) { CERROR(LPU64": error sending GET to "LPU64": %d\n", - ni->nid, id->nid, rc); - lib_finalize (nal, private, msg, rc); + ni->ni_pid.nid, id->nid, rc); + lib_finalize (nal, NULL, msg, rc); } /* completion will be signalled by an event */ - return ret->rc = PTL_OK; + return PTL_OK; } void lib_assert_wire_constants (void) diff --git a/lustre/portals/portals/lib-msg.c b/lustre/portals/portals/lib-msg.c index 1b69533..328b8d8 100644 --- a/lustre/portals/portals/lib-msg.c +++ b/lustre/portals/portals/lib-msg.c @@ -33,55 +33,39 @@ #include void -lib_enq_event_locked (nal_cb_t *nal, void *private, +lib_enq_event_locked (lib_nal_t *nal, void *private, lib_eq_t *eq, ptl_event_t *ev) { ptl_event_t *eq_slot; - int rc; - ev->sequence = eq->sequence++; /* Allocate the next queue slot */ - - /* size must be a power of 2 to handle a wrapped sequence # */ - LASSERT (eq->size != 0 && - eq->size == LOWEST_BIT_SET (eq->size)); - eq_slot = eq->base + (ev->sequence & (eq->size - 1)); + ev->sequence = eq->eq_enq_seq++; /* Allocate the next queue slot */ - /* Copy the event into the allocated slot, ensuring all the rest of - * the event's contents have been copied _before_ the sequence - * number gets updated. 
A processes 'getting' an event waits on - * the next queue slot's sequence to be 'new'. When it is, _all_ - * other event fields had better be consistent. I assert - * 'sequence' is the last member, so I only need a 2 stage copy. */ + /* size must be a power of 2 to handle sequence # overflow */ + LASSERT (eq->eq_size != 0 && + eq->eq_size == LOWEST_BIT_SET (eq->eq_size)); + eq_slot = eq->eq_events + (ev->sequence & (eq->eq_size - 1)); - LASSERT(sizeof (ptl_event_t) == - offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); + /* There is no race since both event consumers and event producers + * take the LIB_LOCK(), so we don't screw around with memory + * barriers, setting the sequence number last or wierd structure + * layout assertions. */ + *eq_slot = *ev; - rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, - offsetof (ptl_event_t, sequence)); - LASSERT (rc == PTL_OK); + /* Call the callback handler (if any) */ + if (eq->eq_callback != NULL) + eq->eq_callback (eq_slot); + /* Wake anyone sleeping for an event (see lib-eq.c) */ #ifdef __KERNEL__ - barrier(); -#endif - /* Updating the sequence number is what makes the event 'new' NB if - * the cb_write below isn't atomic, this could cause a race with - * PtlEQGet */ - rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, - (void *)&ev->sequence,sizeof (ev->sequence)); - LASSERT (rc == PTL_OK); - -#ifdef __KERNEL__ - barrier(); + if (waitqueue_active(&nal->libnal_ni.ni_waitq)) + wake_up_all(&nal->libnal_ni.ni_waitq); +#else + pthread_cond_broadcast(&nal->libnal_ni.ni_cond); #endif - - if (nal->cb_callback != NULL) - nal->cb_callback(nal, private, eq, ev); - else if (eq->event_callback != NULL) - eq->event_callback(ev); } void -lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) +lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) { lib_md_t *md; int unlink; @@ -101,9 +85,9 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t 
status) memset (&ack, 0, sizeof (ack)); ack.type = HTON__u32 (PTL_MSG_ACK); ack.dest_nid = HTON__u64 (msg->ev.initiator.nid); - ack.src_nid = HTON__u64 (nal->ni.nid); ack.dest_pid = HTON__u32 (msg->ev.initiator.pid); - ack.src_pid = HTON__u32 (nal->ni.pid); + ack.src_nid = HTON__u64 (nal->libnal_ni.ni_pid.nid); + ack.src_pid = HTON__u32 (nal->libnal_ni.ni_pid.pid); ack.payload_length = 0; ack.msg.ack.dst_wmd = msg->ack_wmd; @@ -122,7 +106,7 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) md = msg->md; - state_lock(nal, &flags); + LIB_LOCK(nal, flags); /* Now it's safe to drop my caller's ref */ md->pending--; @@ -148,8 +132,8 @@ lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) lib_md_unlink(nal, md); list_del (&msg->msg_list); - nal->ni.counters.msgs_alloc--; + nal->libnal_ni.ni_counters.msgs_alloc--; lib_msg_free(nal, msg); - state_unlock(nal, &flags); + LIB_UNLOCK(nal, flags); } diff --git a/lustre/portals/portals/lib-ni.c b/lustre/portals/portals/lib-ni.c index aa959fc..0f298a0 100644 --- a/lustre/portals/portals/lib-ni.c +++ b/lustre/portals/portals/lib-ni.c @@ -25,92 +25,48 @@ #define DEBUG_SUBSYSTEM S_PORTALS #include -#include #define MAX_DIST 18446744073709551615ULL -int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int lib_api_ni_status (nal_t *apinal, ptl_sr_index_t sr_idx, + ptl_sr_value_t *status) { - /* - * Incoming: - * ptl_handle_ni_t interface_in - * ptl_sr_index_t register_in - * - * Outgoing: - * ptl_sr_value_t * status_out - */ - - PtlNIStatus_in *args = v_args; - PtlNIStatus_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; - lib_counters_t *count = &ni->counters; - - if (!args) - return ret->rc = PTL_SEGV; - - ret->rc = PTL_OK; - ret->status_out = 0; - - /* - * I hate this sort of code.... Hash tables, offset lists? - * Treat the counters as an array of ints? 
- */ - if (args->register_in == PTL_SR_DROP_COUNT) - ret->status_out = count->drop_count; - - else if (args->register_in == PTL_SR_DROP_LENGTH) - ret->status_out = count->drop_length; - - else if (args->register_in == PTL_SR_RECV_COUNT) - ret->status_out = count->recv_count; - - else if (args->register_in == PTL_SR_RECV_LENGTH) - ret->status_out = count->recv_length; - - else if (args->register_in == PTL_SR_SEND_COUNT) - ret->status_out = count->send_count; - - else if (args->register_in == PTL_SR_SEND_LENGTH) - ret->status_out = count->send_length; - - else if (args->register_in == PTL_SR_MSGS_MAX) - ret->status_out = count->msgs_max; - else - ret->rc = PTL_SR_INDEX_INVALID; - - return ret->rc; + lib_nal_t *nal = apinal->nal_data; + lib_ni_t *ni = &nal->libnal_ni; + lib_counters_t *count = &ni->ni_counters; + + switch (sr_idx) { + case PTL_SR_DROP_COUNT: + *status = count->drop_count; + return PTL_OK; + case PTL_SR_DROP_LENGTH: + *status = count->drop_length; + return PTL_OK; + case PTL_SR_RECV_COUNT: + *status = count->recv_count; + return PTL_OK; + case PTL_SR_RECV_LENGTH: + *status = count->recv_length; + return PTL_OK; + case PTL_SR_SEND_COUNT: + *status = count->send_count; + return PTL_OK; + case PTL_SR_SEND_LENGTH: + *status = count->send_length; + return PTL_OK; + case PTL_SR_MSGS_MAX: + *status = count->msgs_max; + return PTL_OK; + default: + *status = 0; + return PTL_SR_INDEX_INVALID; + } } -int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int lib_api_ni_dist (nal_t *apinal, ptl_process_id_t *pid, unsigned long *dist) { - /* - * Incoming: - * ptl_handle_ni_t interface_in - * ptl_process_id_t process_in - - * - * Outgoing: - * unsigned long * distance_out - - */ - - PtlNIDist_in *args = v_args; - PtlNIDist_out *ret = v_ret; - - unsigned long dist; - ptl_process_id_t id_in = args->process_in; - ptl_nid_t nid; - int rc; - - nid = id_in.nid; - - if ((rc = nal->cb_dist(nal, nid, &dist)) != 0) { - ret->distance_out = (unsigned long) 
MAX_DIST; - return PTL_PROCESS_INVALID; - } - - ret->distance_out = dist; + lib_nal_t *nal = apinal->nal_data; - return ret->rc = PTL_OK; + return (nal->libnal_dist(nal, pid->nid, dist)); } diff --git a/lustre/portals/portals/lib-pid.c b/lustre/portals/portals/lib-pid.c index 12eebb5..ff2a601 100644 --- a/lustre/portals/portals/lib-pid.c +++ b/lustre/portals/portals/lib-pid.c @@ -35,24 +35,12 @@ extern int getpid(void); # include #endif #include -#include -int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +lib_api_get_id(nal_t *apinal, ptl_process_id_t *pid) { - /* - * Incoming: - * ptl_handle_ni_t handle_in - * - * Outgoing: - * ptl_process_id_t * id_out - * ptl_id_t * gsize_out - */ - - PtlGetId_out *ret = v_ret; - lib_ni_t *ni = &nal->ni; - - ret->id_out.nid = ni->nid; - ret->id_out.pid = ni->pid; - - return ret->rc = PTL_OK; + lib_nal_t *nal = apinal->nal_data; + + *pid = nal->libnal_ni.ni_pid; + return PTL_OK; } diff --git a/lustre/portals/portals/module.c b/lustre/portals/portals/module.c index 40e9da4..5615a724 100644 --- a/lustre/portals/portals/module.c +++ b/lustre/portals/portals/module.c @@ -160,7 +160,6 @@ EXPORT_SYMBOL(ptl_register_nal); EXPORT_SYMBOL(ptl_unregister_nal); EXPORT_SYMBOL(ptl_err_str); -EXPORT_SYMBOL(lib_dispatch); EXPORT_SYMBOL(PtlMEAttach); EXPORT_SYMBOL(PtlMEInsert); EXPORT_SYMBOL(PtlMEUnlink); @@ -192,7 +191,6 @@ EXPORT_SYMBOL(lib_parse); EXPORT_SYMBOL(lib_create_reply_msg); EXPORT_SYMBOL(lib_init); EXPORT_SYMBOL(lib_fini); -EXPORT_SYMBOL(dispatch_name); MODULE_AUTHOR("Peter J. 
Braam "); MODULE_DESCRIPTION("Portals v3.1"); diff --git a/lustre/portals/unals/address.c b/lustre/portals/unals/address.c index 6507924..f329e2a 100644 --- a/lustre/portals/unals/address.c +++ b/lustre/portals/unals/address.c @@ -91,8 +91,8 @@ void set_address(bridge t,ptl_pid_t pidrequest) int port; if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; else port=pidrequest; - t->nal_cb->ni.nid=get_node_id(); - t->nal_cb->ni.pid=port; + t->lib_nal->libnal_ni.ni_pid.nid=get_node_id(); + t->lib_nal->libnal_ni.ni_pid.pid=port; } #else @@ -120,10 +120,9 @@ void set_address(bridge t,ptl_pid_t pidrequest) in_addr = get_node_id(); t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ - t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) - << PNAL_VNODE_SHIFT) - + virtnode; - + t->lib_nal->libnal_ni.ni_pid.nid = ((in_addr & PNAL_HOSTID_MASK) + << PNAL_VNODE_SHIFT) + + virtnode; pid=pidrequest; /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ #ifdef notyet @@ -141,6 +140,6 @@ void set_address(bridge t,ptl_pid_t pidrequest) return; } else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; - t->nal_cb->ni.pid=pid; + t->lib_nal->libnal_ni.ni_pid.pid=pid; } #endif diff --git a/lustre/portals/unals/bridge.h b/lustre/portals/unals/bridge.h index 90ce324..d2f0f2c 100644 --- a/lustre/portals/unals/bridge.h +++ b/lustre/portals/unals/bridge.h @@ -19,7 +19,7 @@ typedef struct bridge { int alive; - nal_cb_t *nal_cb; + lib_nal_t *lib_nal; void *lower; void *local; void (*shutdown)(struct bridge *); diff --git a/lustre/portals/unals/procapi.c b/lustre/portals/unals/procapi.c index e40c4b9..f3843d7 100644 --- a/lustre/portals/unals/procapi.c +++ b/lustre/portals/unals/procapi.c @@ -60,34 +60,6 @@ void procbridge_wakeup_nal(procbridge p) syscall(SYS_write, p->notifier[0], buf, sizeof(buf)); } -/* Function: forward - * Arguments: nal_t *nal: pointer to my top-side nal structure - * id: the command to pass to the lower layer - * 
args, args_len:pointer to and length of the request - * ret, ret_len: pointer to and size of the result - * Returns: a portals status code - * - * forwards a packaged api call from the 'api' side to the 'library' - * side, and collects the result - */ -static int procbridge_forward(nal_t *n, int id, void *args, size_t args_len, - void *ret, size_t ret_len) -{ - bridge b = (bridge) n->nal_data; - - if (id == PTL_FINI) { - lib_fini(b->nal_cb); - - if (b->shutdown) - (*b->shutdown)(b); - } - - lib_dispatch(b->nal_cb, NULL, id, args, ret); - - return (PTL_OK); -} - - /* Function: shutdown * Arguments: nal: a pointer to my top side nal structure * ni: my network interface index @@ -97,7 +69,8 @@ static int procbridge_forward(nal_t *n, int id, void *args, size_t args_len, */ static void procbridge_shutdown(nal_t *n) { - bridge b=(bridge)n->nal_data; + lib_nal_t *nal = n->nal_data; + bridge b=(bridge)nal->libnal_data; procbridge p=(procbridge)b->local; p->nal_flags |= NAL_FLAG_STOPPING; @@ -117,83 +90,19 @@ static void procbridge_shutdown(nal_t *n) } -static void procbridge_lock(nal_t * n, unsigned long *flags) -{ - bridge b=(bridge)n->nal_data; - procbridge p=(procbridge)b->local; - - pthread_mutex_lock(&p->mutex); -} - -static void procbridge_unlock(nal_t * n, unsigned long *flags) -{ - bridge b=(bridge)n->nal_data; - procbridge p=(procbridge)b->local; - - pthread_mutex_unlock(&p->mutex); -} - -/* Function: yield - * Arguments: pid: - * - * this function was originally intended to allow the - * lower half thread to be scheduled to allow progress. we - * overload it to explicitly block until signalled by the - * lower half. 
- */ -static int procbridge_yield(nal_t *n, unsigned long *flags, int milliseconds) -{ - bridge b=(bridge)n->nal_data; - procbridge p=(procbridge)b->local; - - if (milliseconds == 0) - return 0; - - if (milliseconds < 0) { - pthread_cond_wait(&p->cond,&p->mutex); - } else { - struct timeval then; - struct timeval now; - struct timespec timeout; - - gettimeofday(&then, NULL); - timeout.tv_sec = then.tv_sec + milliseconds/1000; - timeout.tv_nsec = then.tv_usec * 1000 + milliseconds % 1000 * 1000000; - if (timeout.tv_nsec >= 1000000000) { - timeout.tv_sec++; - timeout.tv_nsec -= 1000000000; - } - - pthread_cond_timedwait(&p->cond, &p->mutex, &timeout); - - gettimeofday(&now, NULL); - milliseconds -= (now.tv_sec - then.tv_sec) * 1000 + - (now.tv_usec - then.tv_usec) / 1000; - - if (milliseconds < 0) - milliseconds = 0; - } - - return (milliseconds); -} - /* forward decl */ extern int procbridge_startup (nal_t *, ptl_pid_t, ptl_ni_limits_t *, ptl_ni_limits_t *); /* api_nal * the interface vector to allow the generic code to access - * this nal. this is seperate from the library side nal_cb. + * this nal. this is seperate from the library side lib_nal. 
* TODO: should be dyanmically allocated */ nal_t procapi_nal = { nal_data: NULL, - startup: procbridge_startup, - shutdown: procbridge_shutdown, - forward: procbridge_forward, - yield: procbridge_yield, - lock: procbridge_lock, - unlock: procbridge_unlock + nal_ni_init: procbridge_startup, + nal_ni_fini: procbridge_shutdown, }; ptl_nid_t tcpnal_mynid; @@ -228,7 +137,6 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, b=(bridge)malloc(sizeof(struct bridge)); p=(procbridge)malloc(sizeof(struct procbridge)); - nal->nal_data=b; b->local=p; args.nia_requested_pid = requested_pid; @@ -236,6 +144,7 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, args.nia_actual_limits = actual_limits; args.nia_nal_type = nal_type; args.nia_bridge = b; + args.nia_apinal = nal; /* init procbridge */ pthread_mutex_init(&p->mutex,0); @@ -273,7 +182,7 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, if (p->nal_flags & NAL_FLAG_STOPPED) return PTL_FAIL; - b->nal_cb->ni.nid = tcpnal_mynid; + b->lib_nal->libnal_ni.ni_pid.nid = tcpnal_mynid; return PTL_OK; } diff --git a/lustre/portals/unals/procbridge.h b/lustre/portals/unals/procbridge.h index 1c8e7dd..1f91ced 100644 --- a/lustre/portals/unals/procbridge.h +++ b/lustre/portals/unals/procbridge.h @@ -30,7 +30,6 @@ typedef struct procbridge { int nal_flags; - pthread_mutex_t nal_cb_lock; } *procbridge; typedef struct nal_init_args { @@ -39,6 +38,7 @@ typedef struct nal_init_args { ptl_ni_limits_t *nia_actual_limits; int nia_nal_type; bridge nia_bridge; + nal_t *nia_apinal; } nal_init_args_t; extern void *nal_thread(void *); diff --git a/lustre/portals/unals/proclib.c b/lustre/portals/unals/proclib.c index af0745b..7ee7c71 100644 --- a/lustre/portals/unals/proclib.c +++ b/lustre/portals/unals/proclib.c @@ -43,85 +43,7 @@ /* the following functions are stubs to satisfy the nal definition without doing anything particularily useful*/ -static ptl_err_t nal_write(nal_cb_t *nal, - void *private, - user_ptr 
dst_addr, - void *src_addr, - size_t len) -{ - memcpy(dst_addr, src_addr, len); - return PTL_OK; -} - -static ptl_err_t nal_read(nal_cb_t * nal, - void *private, - void *dst_addr, - user_ptr src_addr, - size_t len) -{ - memcpy(dst_addr, src_addr, len); - return PTL_OK; -} - -static void *nal_malloc(nal_cb_t *nal, - size_t len) -{ - void *buf = malloc(len); - return buf; -} - -static void nal_free(nal_cb_t *nal, - void *buf, - size_t len) -{ - free(buf); -} - -static void nal_printf(nal_cb_t *nal, - const char *fmt, - ...) -{ - va_list ap; - - va_start(ap, fmt); - vprintf(fmt, ap); - va_end(ap); -} - - -static void nal_cli(nal_cb_t *nal, - unsigned long *flags) -{ - bridge b = (bridge) nal->nal_data; - procbridge p = (procbridge) b->local; - - pthread_mutex_lock(&p->mutex); -} - - -static void nal_sti(nal_cb_t *nal, - unsigned long *flags) -{ - bridge b = (bridge)nal->nal_data; - procbridge p = (procbridge) b->local; - - pthread_mutex_unlock(&p->mutex); -} - -static void nal_callback(nal_cb_t *nal, void *private, - lib_eq_t *eq, ptl_event_t *ev) -{ - bridge b = (bridge)nal->nal_data; - procbridge p = (procbridge) b->local; - - /* holding p->mutex */ - if (eq->event_callback != NULL) - eq->event_callback(ev); - - pthread_cond_broadcast(&p->cond); -} - -static int nal_dist(nal_cb_t *nal, +static int nal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { @@ -170,33 +92,25 @@ void *nal_thread(void *z) ptl_process_id_t process_id; int nal_type; - b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t)); - b->nal_cb->nal_data=b; - b->nal_cb->cb_read=nal_read; - b->nal_cb->cb_write=nal_write; - b->nal_cb->cb_malloc=nal_malloc; - b->nal_cb->cb_free=nal_free; - b->nal_cb->cb_map=NULL; - b->nal_cb->cb_unmap=NULL; - b->nal_cb->cb_printf=nal_printf; - b->nal_cb->cb_cli=nal_cli; - b->nal_cb->cb_sti=nal_sti; - b->nal_cb->cb_callback=nal_callback; - b->nal_cb->cb_dist=nal_dist; + b->lib_nal=(lib_nal_t *)malloc(sizeof(lib_nal_t)); + b->lib_nal->libnal_data=b; + 
b->lib_nal->libnal_map=NULL; + b->lib_nal->libnal_unmap=NULL; + b->lib_nal->libnal_dist=nal_dist; nal_type = args->nia_nal_type; - /* Wierd, but this sets b->nal_cb->ni.{nid,pid}, which lib_init() is - * about to do from the process_id passed to it...*/ + /* Wierd, but this sets b->lib_nal->libnal_ni.ni_pid.{nid,pid}, which + * lib_init() is about to do from the process_id passed to it...*/ set_address(b,args->nia_requested_pid); - process_id.pid = b->nal_cb->ni.pid; - process_id.nid = b->nal_cb->ni.nid; + process_id = b->lib_nal->libnal_ni.ni_pid; if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); /* initialize the generic 'library' level code */ - rc = lib_init(b->nal_cb, process_id, + rc = lib_init(b->lib_nal, args->nia_apinal, + process_id, args->nia_requested_limits, args->nia_actual_limits); diff --git a/lustre/portals/unals/tcpnal.c b/lustre/portals/unals/tcpnal.c index 0c47f42..34a9c9d 100644 --- a/lustre/portals/unals/tcpnal.c +++ b/lustre/portals/unals/tcpnal.c @@ -55,7 +55,7 @@ * * sends a packet to the peer, after insuring that a connection exists */ -ptl_err_t tcpnal_send(nal_cb_t *n, +ptl_err_t tcpnal_send(lib_nal_t *n, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, @@ -68,7 +68,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, size_t len) { connection c; - bridge b=(bridge)n->nal_data; + bridge b=(bridge)n->libnal_data; struct iovec tiov[257]; static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER; ptl_err_t rc = PTL_OK; @@ -142,7 +142,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, /* Function: tcpnal_recv - * Arguments: nal_cb_t *nal: pointer to my nal control block + * Arguments: lib_nal_t *nal: pointer to my nal control block * void *private: connection pointer passed through * lib_parse() * lib_msg_t *cookie: passed back to portals library @@ -154,7 +154,7 @@ ptl_err_t tcpnal_send(nal_cb_t *n, * blocking read of the requested data. 
must drain out the * difference of mainpulated and requested lengths from the network */ -ptl_err_t tcpnal_recv(nal_cb_t *n, +ptl_err_t tcpnal_recv(lib_nal_t *n, void *private, lib_msg_t *cookie, unsigned int niov, @@ -217,7 +217,8 @@ static int from_connection(void *a, void *d) ptl_hdr_t hdr; if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ - lib_parse(b->nal_cb, &hdr, c); + lib_parse(b->lib_nal, &hdr, c); + /*TODO: check error status*/ return(1); } return(0); @@ -239,19 +240,19 @@ int tcpnal_init(bridge b) { manager m; - b->nal_cb->cb_send=tcpnal_send; - b->nal_cb->cb_recv=tcpnal_recv; + b->lib_nal->libnal_send=tcpnal_send; + b->lib_nal->libnal_recv=tcpnal_recv; b->shutdown=tcpnal_shutdown; - if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid, - b->nal_cb->ni.pid), + if (!(m=init_connections(PNAL_PORT(b->lib_nal->libnal_ni.ni_pid.nid, + b->lib_nal->libnal_ni.ni_pid.pid), from_connection,b))){ /* TODO: this needs to shut down the newly created junk */ return(PTL_NAL_FAILED); } /* XXX cfs hack */ - b->nal_cb->ni.pid=0; + b->lib_nal->libnal_ni.ni_pid.pid=0; b->lower=m; return(PTL_OK); } diff --git a/lustre/ptlbd/autoMakefile.am b/lustre/ptlbd/autoMakefile.am index 0446dc8..6b76199 100644 --- a/lustre/ptlbd/autoMakefile.am +++ b/lustre/ptlbd/autoMakefile.am @@ -3,9 +3,11 @@ # This code is issued under the GNU General Public License. 
# See the file COPYING in this distribution +if MODULES if !LINUX25 modulefs_DATA = ptlbd$(KMODEXT) endif +endif MOSTLYCLEANFILES = *.o *.ko *.mod.c DIST_SOURCES = $(ptlbd-objs:%.o=%.c) diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index d865e1f..254ae30 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -37,7 +37,7 @@ static void cray_portals_callback(ptl_event_t *ev); #endif -struct ptlrpc_ni ptlrpc_interfaces[NAL_MAX_NR]; +struct ptlrpc_ni ptlrpc_interfaces[8]; int ptlrpc_ninterfaces; /* @@ -597,12 +597,16 @@ int ptlrpc_init_portals(void) int number; char *name; } ptl_nis[] = { +#ifndef CRAY_PORTALS {QSWNAL, "qswnal"}, {SOCKNAL, "socknal"}, {GMNAL, "gmnal"}, {IBNAL, "ibnal"}, {TCPNAL, "tcpnal"}, - {CRAY_KB_ERNAL, "cray_kb_ernal"}}; +#else + {CRAY_KB_ERNAL, "cray_kb_ernal"}, +#endif + }; int rc; int i; diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 81f46dc..3fca883 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -32,8 +32,6 @@ struct obd_import; struct ldlm_res_id; struct ptlrpc_request_set; -void ptlrpc_daemonize(void); - void ptlrpc_request_handle_notconn(struct ptlrpc_request *); void lustre_assert_wire_constants(void); int ptlrpc_import_in_recovery(struct obd_import *imp); diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 6f3ce27..26ae032 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -134,6 +134,7 @@ EXPORT_SYMBOL(ptlrpc_stop_all_threads); EXPORT_SYMBOL(ptlrpc_start_n_threads); EXPORT_SYMBOL(ptlrpc_start_thread); EXPORT_SYMBOL(ptlrpc_unregister_service); +EXPORT_SYMBOL(ptlrpc_daemonize); /* pack_generic.c */ EXPORT_SYMBOL(lustre_msg_swabbed); diff --git a/lustre/scripts/cvsdiffclient b/lustre/scripts/cvsdiffclient index dab1e90..45325c7 100755 --- a/lustre/scripts/cvsdiffclient +++ b/lustre/scripts/cvsdiffclient @@ -1,12 +1,27 @@ #!/bin/bash +# Put this script and cvs-modified-files.pl 
into your PATH (~bin is good) and +# +# export CVSEDITOR=cvsdiffclient +# +# in your .bashrc and you will get a nice bunch of CVS commit reminders: +# +# b= +# r= +# +# Remember to remove the leading "CVS: " part of the comment before saving +# your commit comment if you want those entries to be saved. [ -f .mergeinfo ] && . ./.mergeinfo FILES=`cvs-modified-files.pl $1` TMP=`mktemp /tmp/cvslog-XXXXXXXX` if [ -f $TMP ]; then - [ -f .mergeinfo ] && \ - echo "CVS: Update $child from $parent ($date)" >> $TMP + if [ -f .mergeinfo ]; then + . .mergeinfo + [ "$OPERATION" ] || OPERATION=Update + [ "$OPERWHERE" ] || OPERWHERE=from + echo "CVS: $OPERATION $child $OPERWHERE $parent ($date)" >> $TMP + fi echo "CVS: did you update the ChangeLog for a bug fix?" >> $TMP echo "CVS: b=" >> $TMP echo "CVS: r=" >> $TMP diff --git a/lustre/scripts/land1.sh b/lustre/scripts/land1.sh index c3a0468..08f559a 100755 --- a/lustre/scripts/land1.sh +++ b/lustre/scripts/land1.sh @@ -27,20 +27,32 @@ module=lustre case $parent in HEAD) : ;; - b_*|b1*) : ;; + b_*|b[1-4]*) : ;; *) parent="b_$parent" ;; esac case $child in HEAD) : ;; - b_*|b1*) : ;; + b_*|b[1-4]*) : ;; *) child="b_$child" esac if [ "$parent" != "HEAD" -a "`cat CVS/Tag 2> /dev/null`" != "T$parent" ]; then - echo "This script must be run within the $parent branch" + echo "$0: this script must be run within the $parent branch" exit 1 fi +TEST_FILE=${TEST_FILE:-ChangeLog} # does this need to be smarter? 
+check_tag() { + [ -z "$1" ] && echo "check_tag() missing arg" && exit3 + [ "$1" = "HEAD" ] && return + $CVS log $TEST_FILE | grep -q " $1: " && return + echo "$0: tag $1 not found in $TEST_FILE" + exit 2 +} + +check_tag $child +check_tag ${CHILD}_BASE + dir=$3 cat << EOF > .mergeinfo @@ -52,6 +64,8 @@ date=$date module=$module dir=$dir CONFLICTS=$CONFLICTS +OPERATION=Land +OPERWHERE=onto EOF echo PARENT $PARENT parent $parent CHILD $CHILD child $child date $date diff --git a/lustre/scripts/lmake b/lustre/scripts/lmake index 9f92230..2e93089 100755 --- a/lustre/scripts/lmake +++ b/lustre/scripts/lmake @@ -358,35 +358,40 @@ install_kernel() install -m 644 "$CONFIG_FILE" "$DESTDIR/boot/config-${FULL_VERSION}" mkdir -p "$DESTDIR/dev/shm" + mkdir -p "$DESTDIR/lib/modules/${FULL_VERSION}" + + make CC="$CC" INSTALL_MOD_PATH="$DESTDIR" KERNELRELEASE="$FULL_VERSION" \ + -s modules_install || \ + fatal 1 "Error installing modules." case "$TARGET_ARCH" in i386 | i586 | i686 | athlon) cp arch/i386/boot/bzImage "$DESTDIR/boot/vmlinuz-${FULL_VERSION}" - cp vmlinux "$DESTDIR/boot/vmlinux-${FULL_VERSION}" + cp vmlinux "$DESTDIR/lib/modules/${FULL_VERSION}/" + ln -sf "../lib/modules/${FULL_VERSION}/vmlinux" "$DESTDIR/boot/vmlinux-${FULL_VERSION}" ;; x86_64) cp arch/x86_64/boot/bzImage "$DESTDIR/boot/vmlinuz-${FULL_VERSION}" - cp vmlinux "$DESTDIR/boot/vmlinux-${FULL_VERSION}" + cp vmlinux "$DESTDIR/lib/modules/${FULL_VERSION}/" + ln -sf "../lib/modules/${FULL_VERSION}/vmlinux" "$DESTDIR/boot/vmlinux-${FULL_VERSION}" ;; ia64) gzip -cfv vmlinux > vmlinuz mkdir -p "$DESTDIR/boot/efi/redhat" - install -m 755 vmlinux "$DESTDIR/boot/efi/redhat/vmlinux-${FULL_VERSION}" + install -m 755 vmlinux "$DESTDIR/lib/modules/${FULL_VERSION}/" install -m 755 vmlinuz "$DESTDIR/boot/efi/redhat/vmlinuz-${FULL_VERSION}" + ln -sf "../lib/modules/${FULL_VERSION}/vmlinux" "$DESTDIR/boot/efi/redhat/vmlinux-${FULL_VERSION}" ln -sf "efi/redhat/vmlinux-${FULL_VERSION}" "$DESTDIR/boot/vmlinux-${FULL_VERSION}" 
ln -sf "efi/redhat/vmlinuz-${FULL_VERSION}" "$DESTDIR/boot/vmlinuz-${FULL_VERSION}" ;; *) cp vmlinuz "$DESTDIR/boot/vmlinuz-${FULL_VERSION}" - cp vmlinux "$DESTDIR/boot/vmlinux-${FULL_VERSION}" + cp vmlinux "$DESTDIR/lib/modules/${FULL_VERSION}/vmlinux-${FULL_VERSION}" + ln -sf "../lib/modules/${FULL_VERSION}/vmlinux-${FULL_VERSION}" "$DESTDIR/boot/vmlinux-${FULL_VERSION}" + ;; esac - mkdir -p "$DESTDIR/lib/modules/${FULL_VERSION}" - make CC="$CC" INSTALL_MOD_PATH="$DESTDIR" KERNELRELEASE="$FULL_VERSION" \ - -s modules_install || \ - fatal 1 "Error installing modules." - popd >/dev/null } diff --git a/lustre/scripts/lustre-kernel-2.4.spec.in b/lustre/scripts/lustre-kernel-2.4.spec.in index 9ae6368..f177c17 100644 --- a/lustre/scripts/lustre-kernel-2.4.spec.in +++ b/lustre/scripts/lustre-kernel-2.4.spec.in @@ -21,6 +21,7 @@ Summary: The Linux kernel (the core of the Linux operating system) %define nptlarchs %{all_x86} #define nptlarchs noarch %define rhbuild @RHBUILD@ +%define linux26 @LINUX26@ # disable build root strip policy %define __spec_install_post /usr/lib/rpm/brp-compress || : @@ -444,7 +445,8 @@ ln -sf linux-%{KVERREL} $RPM_BUILD_ROOT/usr/src/linux #clean up the destination make -s mrproper -C $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL} -rm -rf $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/configs/* +rm -rf $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/configs +mkdir -p $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/configs cp ../kernel_patches/kernel_configs/kernel-%{kversion}-@LUSTRE_TARGET@*.config $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/configs cp ../kernel_patches/kernel_configs/kernel-%{kversion}-@LUSTRE_TARGET@-%{_target_cpu}%{dashtargetboard}.config $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/.config if grep -q oldconfig_nonint $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/Makefile ; then @@ -453,7 +455,11 @@ else OLDCONFIG='oldconfig' fi make -s $OLDCONFIG -C $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL} +%if %{linux26} +make -s include/asm -C 
$RPM_BUILD_ROOT/usr/src/linux-%{KVERREL} +%else make -s symlinks -C $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL} +%endif make -s include/linux/version.h -C $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL} #this generates modversions info which we want to include and we may as @@ -753,7 +759,6 @@ exit 0 /usr/src/linux-%{KVERREL}/Makefile /usr/src/linux-%{KVERREL}/README /usr/src/linux-%{KVERREL}/REPORTING-BUGS -/usr/src/linux-%{KVERREL}/Rules.make /usr/src/linux-%{KVERREL}/arch %ifarch sparc /usr/src/linux-%{KVERREL}/arch/sparc64 @@ -795,6 +800,14 @@ exit 0 %ifarch alpha sparc /usr/src/linux-%{KVERREL}/include/math-emu %endif +%if %{linux26} +%dir /usr/src/linux-%{KVERREL}/crypto +%dir /usr/src/linux-%{KVERREL}/kdb +%dir /usr/src/linux-%{KVERREL}/rpmify +%dir /usr/src/linux-%{KVERREL}/security +%else +/usr/src/linux-%{KVERREL}/Rules.make +%endif %endif %files doc diff --git a/lustre/scripts/merge1.sh b/lustre/scripts/merge1.sh index ac074d7..5fefc71 100755 --- a/lustre/scripts/merge1.sh +++ b/lustre/scripts/merge1.sh @@ -27,12 +27,12 @@ module=lustre case $parent in HEAD) : ;; - b_*|b1*) : ;; + b_*|b[1-4]*) : ;; *) parent="b_$parent" ;; esac case $child in HEAD) : ;; - b_*|b1*) : ;; + b_*|b[1-4]*) : ;; *) child="b_$child" esac @@ -41,6 +41,18 @@ if [ "$child" != "HEAD" -a "`cat CVS/Tag 2> /dev/null`" != "T$child" ]; then exit 1 fi +TEST_FILE=${TEST_FILE:-ChangeLog} # does this need to be smarter? 
+check_tag() { + [ -z "$1" ] && echo "check_tag() missing arg" && exit3 + [ "$1" = "HEAD" ] && return + $CVS log $TEST_FILE | grep -q " $1: " && return + echo "$0: tag $1 not found in $TEST_FILE" + exit 2 +} + +check_tag $parent +check_tag ${CHILD}_BASE + cat << EOF > .mergeinfo parent=$parent PARENT=$PARENT @@ -49,6 +61,8 @@ CHILD=$CHILD date=$date module=$module CONFLICTS=$CONFLICTS +OPERATION=Merge +OPERWHERE=from EOF echo PARENT: $PARENT parent: $parent CHILD: $CHILD child: $child date: $date diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore index 0b00c70..bc148be 100644 --- a/lustre/tests/.cvsignore +++ b/lustre/tests/.cvsignore @@ -64,3 +64,4 @@ ostactive ll_dirstripe_verify openfilleddirunlink copy_attr +rename_many diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 1430099..a74483c 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -20,7 +20,7 @@ noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany fchdir_test checkstat noinst_PROGRAMS += wantedi statone runas openfile getdents mkdirdeep o_directory noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify cmknod noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify copy_attr -noinst_PROGRAMS += openfilleddirunlink +noinst_PROGRAMS += openfilleddirunlink rename_many # noinst_PROGRAMS += ldaptest bin_PROGRAMS = mcreate munlink mkdirmany iopentest1 iopentest2 endif # TESTS @@ -68,6 +68,7 @@ mkdirdeep_SOURCES = mkdirdeep.c mkdirdeep_LDADD=-L$(top_builddir)/portals/utils -lptlctl $(LIBREADLINE) small_write_SOURCES = small_write.c sleeptest_SOURCES = sleeptest.c +rename_many_SOURCES = rename_many.c #write_append_truncate_SOURCES=write_append_truncate.c #write_append_truncate_CC=mpicc #createmany_mpi_SOURCES=createmany_mpi.c diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh index 38effed..9d7ca4b 100644 --- a/lustre/tests/cfg/local.sh +++ b/lustre/tests/cfg/local.sh @@ -24,7 +24,7 @@ MDSSIZE=${MDSSIZE:-10000} 
OSTDEV=${OSTDEV:-$ROOT/tmp/ost1-`hostname`} OSTSIZE=${OSTSIZE:-50000} FSTYPE=${FSTYPE:-ext3} -TIMEOUT=${TIMEOUT:-10} +TIMEOUT=${TIMEOUT:-20} UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh} STRIPE_BYTES=${STRIPE_BYTES:-524288} diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 3c6763f..9c310e5 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -221,9 +221,22 @@ test_15() { } run_test 15 "failed open (-ENOMEM)" +stop_read_ahead() { + for f in /proc/fs/lustre/llite/*/read_ahead; do + echo 0 > $f + done +} + +start_read_ahead() { + for f in /proc/fs/lustre/llite/*/read_ahead; do + echo 1 > $f + done +} + test_16() { do_facet client cp /etc/termcap $MOUNT sync + stop_read_ahead #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 | OBD_FAIL_ONCE sysctl -w lustre.fail_loc=0x80000504 @@ -234,6 +247,7 @@ test_16() { # give recovery a chance to finish (shouldn't take long) sleep $TIMEOUT do_facet client "cmp /etc/termcap $MOUNT/termcap" || return 2 + start_read_ahead } run_test 16 "timeout bulk put, evict client (2732)" diff --git a/lustre/tests/rename_many.c b/lustre/tests/rename_many.c new file mode 100644 index 0000000..faf5085 --- /dev/null +++ b/lustre/tests/rename_many.c @@ -0,0 +1,263 @@ +#define PATH_LENGTH 35 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct names { + char from[PATH_LENGTH]; + char to[PATH_LENGTH]; +} *names; + +unsigned int loop_count = 500; +int file_count = 1000; +int seed; +int loops; +int stop; +long start; + +int opt_exit_on_err; +int opt_verbose; +int opt_create_only; +int opt_rename_only; +int creat_errors; +int rename_errors; +int unlink_errors; + +void usage(const char *progname) +{ + fprintf(stderr, "usage: %s [-n numfiles] [-s seed] [-v] [-x] [dir]\n" + "\t-c: only do the create step of first loop\n" + "\t-f: number of files to create/rename/unlink per loop\n" + "\t-n: number of test loops (0 to run forever)\n" + "\t-r: only 
do the rename step of first loop\n" + "\t-s: starting seed (equals loop number by default)\n" + "\t-v: verbose\n" + "\t-x: don't exit on error\n", progname); +} + +void handler(int sig) { + static long last_time; + long now = time(0); + + signal(SIGINT, handler); + signal(SIGALRM, handler); + printf("%6lds %8d iterations %d/%d/%d errors", + now - start, loops, creat_errors, rename_errors, unlink_errors); + if (sig != 0) + printf(" - use SIGQUIT (^\\) or ^C^C to kill\n"); + else + printf("\n"); + + if (sig == SIGQUIT) + stop = 1; + else if (sig == SIGINT) { + if (now - last_time < 2) + stop = 1; + last_time = now; + } + alarm(60); +} + +extern char *optarg; +extern int optind; + +int main(int argc, char *argv[]) +{ + unsigned long n; + char msg[100], c, *end = NULL; + int h1, h2; + int i; + + while ((c = getopt(argc, argv, "cf:n:rs:vx")) != EOF) { + switch(c) { + case 'c': + ++opt_create_only; + break; + case 'f': + i = strtoul(optarg, &end, 0); + if (i && end != NULL && *end == '\0') { + file_count = i; + } else { + fprintf(stderr, "bad file count '%s'\n",optarg); + usage(argv[0]); + return 1; + } + break; + case 'n': + i = strtoul(optarg, &end, 0); + if (i && end != NULL && *end == '\0') { + loop_count = i; + } else { + fprintf(stderr, "bad loop count '%s'\n",optarg); + usage(argv[0]); + return 1; + } + break; + case 'r': + ++opt_rename_only; + break; + case 's': + i = strtoul(optarg, &end, 0); + if (end && *end == '\0') { + seed = i; + } else { + fprintf(stderr, "bad seed '%s'\n", optarg); + usage(argv[0]); + return 1; + } + break; + case 'v': + ++opt_verbose; + break; + case 'x': + ++opt_exit_on_err; + break; + default: + usage(argv[0]); + return 1; + } + } + + names = malloc(sizeof(struct names) * file_count); + if (names == NULL) { + perror("calloc"); + return(1); + } + + h2 = sprintf(msg, "%x", file_count); /* just to figure length */ + h1 = (PATH_LENGTH - h2 - 2) / 4; + + n = (1ULL << h1 * 4) - 1; + + //printf("h1 = %d, h2 = %d n = %lu\n", h1, h2, n); + + 
start = time(0); + + signal(SIGQUIT, handler); + signal(SIGINT, handler); + signal(SIGALRM, handler); + signal(SIGUSR1, handler); + alarm(60); + + if (argc > optind + 1) { + fprintf(stderr, "too many extra args %d\n", argc - optind); + usage(argv[0]); + return 1; + } else if (argv[optind] != NULL) { + if (chdir(argv[optind]) < 0) { + sprintf(msg, "chdir '%s'\n", argv[optind]); + perror(msg); + return 2; + } + } + + while (!stop && loop_count != 0 && loops < loop_count) { + int j,k,l,m; + + srand(seed + loops); + if (mkdir("tmp", S_IRWXU) == -1) { + perror("mkdir tmp"); + return(1); + } + if (chdir("tmp") == -1) { + perror("chdir tmp"); + return(1); + } + + for (i = 0; i < file_count ; i++) { + j = random() & n; + k = random() & n; + l = random() & n; + m = random() & n; + sprintf(names[i].from, "%0*x%0*x%0*x%0*x0%0*x", + h1, j, h1, k, h1, l, h1, m, h2, i); + sprintf(names[i].to, "%0*x%0*x%0*x%0*x1%0*x", + h1, j, h1, k, h1, l, h1, m, h2, i); + + } + + for (i = 0; i < file_count; i++) { + if (mknod(names[i].from, S_IFREG | S_IRWXU, 0) == -1) { + sprintf(msg, "loop %d.%d: creat %s", + loops, i, names[i].from); + perror(msg); + creat_errors++; + if (!opt_exit_on_err) + return 4; + } + } + + if (opt_create_only) + return 0; + + for (i = 0; i < file_count; i++) { + if (rename(names[i].from, names[i].to) == -1) { + sprintf(msg, "loop %d.%d: rename %s to %s", + loops, i, names[i].from, names[i].to); + perror(msg); + rename_errors++; + if (!opt_exit_on_err) + return 4; + } + } + + if (opt_rename_only) + return 0; + + for (i = 0; i < file_count; i++) { + if (unlink(names[i].to) == -1) { + sprintf(msg, "loop %d.%d: unlink %s", + loops, i, names[i].to); + perror(msg); + unlink_errors++; + if (!opt_exit_on_err) + return 4; + } + } + + if (chdir("..") == -1) { + perror("chdir .."); + return(1); + } + + if (rmdir("tmp") == -1) { + if (chdir("tmp") == -1) { + perror("chdir tmp 2"); + return(1); + } + for (i = 0; i < file_count; i++) { + if (unlink(names[i].from) != -1) { + 
fprintf(stderr, "loop %d.%d: " + "unexpected file %s\n", + loops, i, names[i].to); + unlink_errors++; + if (!opt_exit_on_err) + return 4; + } + } + if (chdir("..") == -1) { + perror("chdir .. 2"); + return(1); + } + if (rmdir("tmp") == -1) { + perror("rmdir tmp"); + return(1); + } + } + + loops++; + if (opt_verbose) + handler(0); + } + + if (!opt_verbose) + handler(0); + return(0); +} diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 3c84d8d..c91837b 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -175,6 +175,27 @@ test_6() { } run_test 6 "open1, open2, unlink |X| close1 [fail mds] close2" +test_7() { + mcreate $MOUNT1/a + multiop $MOUNT2/a o_c & + pid1=$! + multiop $MOUNT1/a o_c & + pid2=$! + # give multiop a chance to open + sleep 1 + rm -f $MOUNT1/a + replay_barrier mds + kill -USR1 $pid2 + wait $pid2 || return 1 + + fail mds + kill -USR1 $pid1 + wait $pid1 || return 1 + [ -e $MOUNT2/a ] && return 2 + return 0 +} +run_test 7 "open1, open2, unlink |X| close2 [fail mds] close1" + if [ "$ONLY" != "setup" ]; then equals_msg test complete, cleaning up cleanup diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index c0d023b..a65ba63 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -876,7 +876,7 @@ test_42() { run_test 42 "recovery after ost failure" # b=2530 -# directory orphans can't be unlinked from PENDING directory +# timeout in MDS/OST recovery RPC will LBUG MDS test_43() { replay_barrier mds @@ -936,6 +936,31 @@ test_46() { } run_test 46 "Don't leak file handle after open resend (3325)" +# b=2824 +test_47() { + + # create some files to make sure precreate has been done on all + # OSTs. 
(just in case this test is run independently) + createmany -o $DIR/$tfile 20 || return 1 + + # OBD_FAIL_OST_CREATE_NET 0x204 + fail ost + do_facet ost "sysctl -w lustre.fail_loc=0x80000204" + df $MOUNT || return 2 + + # let the MDS discover the OST failure, attempt to recover, fail + # and recover again. + sleep $((3 * TIMEOUT)) + + # Without 2824, this createmany would hang + createmany -o $DIR/$tfile 20 || return 3 + unlinkmany $DIR/$tfile 20 || return 4 + + do_facet ost "sysctl -w lustre.fail_loc=0" + return 0 +} +run_test 47 "MDS->OSC failure during precreate cleanup (2824)" + equals_msg test complete, cleaning up $CLEANUP diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 451030a..073b1e5 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -10,6 +10,9 @@ ONLY=${ONLY:-"$*"} # bug number for skipped test: 2108 ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"24j 48c 48d 58"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! +case `uname -r` in +2.6.*) ALWAYS_EXCEPT="$ALWAYS_EXCEPT 54c 55" # bug 3117 +esac [ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT" @@ -17,6 +20,7 @@ SRCDIR=`dirname $0` export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH TMP=${TMP:-/tmp} +FSTYPE=${FSTYPE:-ext3} CHECKSTAT=${CHECKSTAT:-"checkstat -v"} CREATETEST=${CREATETEST:-createtest} @@ -190,11 +194,7 @@ build_test_filter echo preparing for tests involving mounts EXT2_DEV=${EXT2_DEV:-/tmp/SANITY.LOOP} touch $EXT2_DEV -mke2fs -F $EXT2_DEV 1000 > /dev/null - -EXT3_DEV=${EXT3_DEV:-/tmp/SANITY_EXT3_DEV.LOOP} -touch $EXT3_DEV -mkfs.ext3 -F $EXT3_DEV 10000 > /dev/null +mke2fs -j -F $EXT2_DEV 8000 > /dev/null test_0() { touch $DIR/f @@ -663,6 +663,12 @@ test_24n() { } run_test 24n "Statting the old file after renameing (Posix rename 2)" +test_24o() { + check_kernel_version 37 || return 0 + rename_many -s 3287 -v -n 10 $DIR +} +run_test 24o "rename of files during htree split ===============" + test_25a() { echo '== symlink 
sanity =============================================' mkdir $DIR/d25 @@ -1576,7 +1582,7 @@ test_48c() { # bug 2350 #set -vx mkdir -p $DIR/d48c/dir cd $DIR/d48c/dir - rmdir $DIR/d48c/dir || error "remove cwd $DIR/d48c/dir failed" + $TRACE rmdir $DIR/d48c/dir || error "remove cwd $DIR/d48c/dir failed" $TRACE touch foo && error "'touch foo' worked after removing cwd" $TRACE mkdir foo && error "'mkdir foo' worked after removing cwd" $TRACE ls . && error "'ls .' worked after removing cwd" @@ -1585,7 +1591,7 @@ test_48c() { # bug 2350 $TRACE mkdir . && error "'mkdir .' worked after removing cwd" $TRACE rmdir . && error "'rmdir .' worked after removing cwd" $TRACE ln -s . foo && error "'ln -s .' worked after removing cwd" ||true - $TRACE cd .. || error "'cd ..' failed after removing cwd" + $TRACE cd .. || echo "'cd ..' failed after removing cwd (`pwd)`" } run_test 48c "Access removed working subdir (should return errors)" @@ -1595,11 +1601,13 @@ test_48d() { # bug 2350 #set -vx mkdir -p $DIR/d48d/dir cd $DIR/d48d/dir - rm -r $DIR/d48d || error "remove cwd and parent $DIR/d48d failed" + pwd + ls . + $TRACE rm -vr $DIR/d48d || error "remove cwd+parent $DIR/d48d failed" $TRACE touch foo && error "'touch foo' worked after removing cwd" $TRACE mkdir foo && error "'mkdir foo' worked after removing cwd" $TRACE ls . && error "'ls .' worked after removing cwd" - $TRACE ls .. && error "'ls ..' worked after removing cwd" + $TRACE ls .. && echo "'ls ..' worked after removing cwd" # bug 3415 $TRACE cd . && error "'cd .' worked after recreate cwd" $TRACE mkdir . && error "'mkdir .' worked after removing cwd" $TRACE rmdir . && error "'rmdir .' 
worked after removing cwd" @@ -1735,11 +1743,11 @@ run_test 54d "fifo device works in lustre ======================" test_55() { rm -rf $DIR/d55 mkdir $DIR/d55 - mount -t ext3 -o loop,iopen $EXT3_DEV $DIR/d55 || error + mount -t $FSTYPE -o loop,iopen $EXT2_DEV $DIR/d55 || error touch $DIR/d55/foo $IOPENTEST1 $DIR/d55/foo $DIR/d55 || error $IOPENTEST2 $DIR/d55 || error - echo "check for $EXT3_DEV. Please wait..." + echo "check for $EXT2_DEV. Please wait..." rm -rf $DIR/d55/* umount $DIR/d55 || error } diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 687ea05..455ab96 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -2852,7 +2852,7 @@ def sys_optimize_elan (): "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"] for p in procfiles: if os.access(p, os.R_OK): - run ("echo 0 > " + p) + run ("echo 1 > " + p) def sys_set_ptldebug(ptldebug): if config.ptldebug: diff --git a/lustre/utils/llmount.c b/lustre/utils/llmount.c index 980f9fe..8e75f5c 100644 --- a/lustre/utils/llmount.c +++ b/lustre/utils/llmount.c @@ -29,24 +29,63 @@ #include #include #include +#define _GNU_SOURCE +#include #include "obdctl.h" #include -int debug = 0; -int verbose = 0; -int nomtab = 0; +int debug; +int verbose; +int nomtab; +int force; static char *progname = NULL; +typedef struct { + ptl_nid_t gw; + ptl_nid_t lo; + ptl_nid_t hi; +} llmount_route_t; + +#define MAX_ROUTES 1024 +int route_index; +ptl_nid_t lmd_cluster_id = 0; +llmount_route_t routes[MAX_ROUTES]; + +static int check_mtab_entry(char *spec, char *mtpt, char *type) +{ + FILE *fp; + struct mntent *mnt; + + if (!force) { + fp = setmntent(MOUNTED, "r"); + if (fp == NULL) + return(0); + + while ((mnt = getmntent(fp)) != NULL) { + if (strcmp(mnt->mnt_fsname, spec) == 0 && + strcmp(mnt->mnt_dir, mtpt) == 0 && + strcmp(mnt->mnt_type, type) == 0) { + fprintf(stderr, "%s: according to %s %s is " + "already mounted on %s\n", + progname, MOUNTED, spec, mtpt); + return(1); /* or should we return an error? 
*/ + } + } + endmntent(fp); + } + return(0); +} + static void -update_mtab_entry(char *spec, char *node, char *type, char *opts, - int flags, int freq, int pass) +update_mtab_entry(char *spec, char *mtpt, char *type, char *opts, + int flags, int freq, int pass) { FILE *fp; struct mntent mnt; mnt.mnt_fsname = spec; - mnt.mnt_dir = node; + mnt.mnt_dir = mtpt; mnt.mnt_type = type; mnt.mnt_opts = opts ? opts : ""; mnt.mnt_freq = freq; @@ -55,7 +94,7 @@ update_mtab_entry(char *spec, char *node, char *type, char *opts, if (!nomtab) { fp = setmntent(MOUNTED, "a+"); if (fp == NULL) { - fprintf(stderr, "%s: setmntent(%s): %s:", + fprintf(stderr, "%s: setmntent(%s): %s:", progname, MOUNTED, strerror (errno)); } else { if ((addmntent (fp, &mnt)) == 1) { @@ -82,6 +121,8 @@ init_options(struct lustre_mount_data *lmd) int print_options(struct lustre_mount_data *lmd) { + int i; + printf("mds: %s\n", lmd->lmd_mds); printf("profile: %s\n", lmd->lmd_profile); printf("server_nid: "LPX64"\n", lmd->lmd_server_nid); @@ -90,16 +131,77 @@ print_options(struct lustre_mount_data *lmd) printf("server_ipaddr: 0x%x\n", lmd->lmd_server_ipaddr); printf("port: %d\n", lmd->lmd_port); + for (i = 0; i < route_index; i++) + printf("route: 0x%llx : 0x%llx - 0x%llx\n", + routes[i].gw, routes[i].lo, routes[i].hi); + return 0; } -int -parse_options(char * options, struct lustre_mount_data *lmd) +static int parse_route(char *opteq, char *opttgts) { - ptl_nid_t nid = 0; + char *gw_lo_ptr, *gw_hi_ptr, *tgt_lo_ptr, *tgt_hi_ptr; + ptl_nid_t gw_lo, gw_hi, tgt_lo, tgt_hi; + + opttgts[0] = '\0'; + gw_lo_ptr = opteq + 1; + if (!(gw_hi_ptr = strchr(gw_lo_ptr, '-'))) { + gw_hi_ptr = gw_lo_ptr; + } else { + gw_hi_ptr[0] = '\0'; + gw_hi_ptr++; + } + + if (ptl_parse_nid(&gw_lo, gw_lo_ptr) != 0) { + fprintf(stderr, "%s: can't parse NID %s\n", progname,gw_lo_ptr); + return(-1); + } + + if (ptl_parse_nid(&gw_hi, gw_hi_ptr) != 0) { + fprintf(stderr, "%s: can't parse NID %s\n", progname,gw_hi_ptr); + return(-1); + } + + 
tgt_lo_ptr = opttgts + 1; + if (!(tgt_hi_ptr = strchr(tgt_lo_ptr, '-'))) { + tgt_hi_ptr = tgt_lo_ptr; + } else { + tgt_hi_ptr[0] = '\0'; + tgt_hi_ptr++; + } + + if (ptl_parse_nid(&tgt_lo, tgt_lo_ptr) != 0) { + fprintf(stderr, "%s: can't parse NID %s\n",progname,tgt_lo_ptr); + return(-1); + } + + if (ptl_parse_nid(&tgt_hi, tgt_hi_ptr) != 0) { + fprintf(stderr, "%s: can't parse NID %s\n",progname,tgt_hi_ptr); + return(-1); + } + + while (gw_lo <= gw_hi) { + if (route_index >= MAX_ROUTES) { + fprintf(stderr, "%s: to many routes %d\n", + progname, MAX_ROUTES); + return(-1); + } + + routes[route_index].gw = gw_lo; + routes[route_index].lo = tgt_lo; + routes[route_index].hi = tgt_hi; + route_index++; + gw_lo++; + } + + return(0); +} + +int parse_options(char * options, struct lustre_mount_data *lmd) +{ + ptl_nid_t nid = 0, cluster_id = 0; int val; - char *opt; - char * opteq; + char *opt, *opteq, *opttgts; /* parsing ideas here taken from util-linux/mount/nfsmount.c */ for (opt = strtok(options, ","); opt; opt = strtok(NULL, ",")) { @@ -107,9 +209,25 @@ parse_options(char * options, struct lustre_mount_data *lmd) val = atoi(opteq + 1); *opteq = '\0'; if (!strcmp(opt, "nettype")) { - lmd->lmd_nal = ptl_name2nal(opteq+1); - } else if(!strcmp(opt, "local_nid")) { - if (ptl_parse_nid(&nid, opteq+1) != 0) { + lmd->lmd_nal = ptl_name2nal(opteq + 1); + } else if(!strcmp(opt, "cluster_id")) { + if (ptl_parse_nid(&cluster_id, opteq+1) != 0) { + fprintf (stderr, "%s: can't parse NID " + "%s\n", progname, opteq+1); + return (-1); + } + lmd_cluster_id = cluster_id; + } else if(!strcmp(opt, "route")) { + if (!(opttgts = strchr(opteq + 1, ':'))) { + fprintf(stderr, "%s: Route must be " + "of the form: route=" + "[-]:[-]\n", + progname); + return(-1); + } + parse_route(opteq, opttgts); + } else if (!strcmp(opt, "local_nid")) { + if (ptl_parse_nid(&nid, opteq + 1) != 0) { fprintf (stderr, "%s: " "can't parse NID %s\n", progname, @@ -117,11 +235,11 @@ parse_options(char * options, struct 
lustre_mount_data *lmd) return (-1); } lmd->lmd_local_nid = nid; - } else if(!strcmp(opt, "server_nid")) { - if (ptl_parse_nid(&nid, opteq+1) != 0) { + } else if (!strcmp(opt, "server_nid")) { + if (ptl_parse_nid(&nid, opteq + 1) != 0) { fprintf (stderr, "%s: " "can't parse NID %s\n", - progname, opteq+1); + progname, opteq + 1); return (-1); } lmd->lmd_server_nid = nid; @@ -204,7 +322,7 @@ set_local(struct lustre_mount_data *lmd) return (-1); } - lmd->lmd_local_nid = nid; + lmd->lmd_local_nid = nid + lmd_cluster_id; return 0; } @@ -252,25 +370,21 @@ set_peer(char *hostname, struct lustre_mount_data *lmd) int build_data(char *source, char *options, struct lustre_mount_data *lmd) { - char target[1024]; - char *hostname = NULL; - char *mds = NULL; - char *profile = NULL; - char *s; + char buf[1024]; + char *hostname = NULL, *mds = NULL, *profile = NULL, *s; int rc; if (lmd_bad_magic(lmd)) return -EINVAL; - if (strlen(source) > sizeof(target) + 1) { - fprintf(stderr, "%s: " - "exessively long host:/mds/profile argument\n", + if (strlen(source) > sizeof(buf) + 1) { + fprintf(stderr, "%s: host:/mds/profile argument too long\n", progname); return -EINVAL; } - strcpy(target, source); - if ((s = strchr(target, ':'))) { - hostname = target; + strcpy(buf, source); + if ((s = strchr(buf, ':'))) { + hostname = buf; *s = '\0'; while (*++s == '/') @@ -280,8 +394,7 @@ build_data(char *source, char *options, struct lustre_mount_data *lmd) *s = '\0'; profile = s + 1; } else { - fprintf(stderr, "%s: " - "directory to mount not in " + fprintf(stderr, "%s: directory to mount not in " "host:/mds/profile format\n", progname); return(-1); @@ -292,9 +405,6 @@ build_data(char *source, char *options, struct lustre_mount_data *lmd) progname); return(-1); } - if (verbose) - printf("host: %s\nmds: %s\nprofile: %s\n", hostname, mds, - profile); rc = parse_options(options, lmd); if (rc) @@ -324,55 +434,143 @@ build_data(char *source, char *options, struct lustre_mount_data *lmd) return 0; } -int 
-main(int argc, char * const argv[]) +static int set_routes(struct lustre_mount_data *lmd) { + struct portals_cfg pcfg; + struct portal_ioctl_data data; + int i, j, route_exists, rc, err = 0; + + register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); + + for (i = 0; i < route_index; i++) { + + /* Check for existing routes so as not to add duplicates */ + for (j = 0; ; j++) { + PCFG_INIT(pcfg, NAL_CMD_GET_ROUTE); + pcfg.pcfg_nal = ROUTER; + pcfg.pcfg_count = j; + + PORTAL_IOC_INIT(data); + data.ioc_pbuf1 = (char*)&pcfg; + data.ioc_plen1 = sizeof(pcfg); + data.ioc_nid = pcfg.pcfg_nid; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc != 0) { + route_exists = 0; + break; + } + + if ((pcfg.pcfg_gw_nal == lmd->lmd_nal) && + (pcfg.pcfg_nid == routes[i].gw) && + (pcfg.pcfg_nid2 == routes[i].lo) && + (pcfg.pcfg_nid3 == routes[i].hi)) { + route_exists = 1; + break; + } + } + + if (route_exists) + continue; + + PCFG_INIT(pcfg, NAL_CMD_ADD_ROUTE); + pcfg.pcfg_nid = routes[i].gw; + pcfg.pcfg_nal = ROUTER; + pcfg.pcfg_gw_nal = lmd->lmd_nal; + pcfg.pcfg_nid2 = MIN(routes[i].lo, routes[i].hi); + pcfg.pcfg_nid3 = MAX(routes[i].lo, routes[i].hi); + + PORTAL_IOC_INIT(data); + data.ioc_pbuf1 = (char*)&pcfg; + data.ioc_plen1 = sizeof(pcfg); + data.ioc_nid = pcfg.pcfg_nid; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc != 0) { + fprintf(stderr, "%s: Unable to add route " + "0x%llx : 0x%llx - 0x%llx\n[%d] %s\n", + progname, routes[i].gw, routes[i].lo, + routes[i].hi, errno, strerror(errno)); + err = -1; + break; + } + } + + unregister_ioc_dev(PORTALS_DEV_ID); + return err; +} + +void usage(FILE *out) { - char * source = argv[1]; - char * target = argv[2]; - char * options = ""; - int opt; - int i = 3; - struct lustre_mount_data lmd; + fprintf(out, "usage: %s [-f] [-v] [-n] [-o mntopt]\n", + progname); + exit(out != stdout); +} - int rc; +int main(int argc, char *const argv[]) +{ + char *source, *target, *options = ""; + int i, nargs = 3, opt, rc; 
+ struct lustre_mount_data lmd; + static struct option long_opt[] = { + {"force", 0, 0, 'f'}, + {"help", 0, 0, 'h'}, + {"nomtab", 0, 0, 'n'}, + {"options", 1, 0, 'o'}, + {"verbose", 0, 0, 'v'}, + {0, 0, 0, 0} + }; progname = strrchr(argv[0], '/'); progname = progname ? progname + 1 : argv[0]; - while ((opt = getopt(argc, argv, "vno:")) != EOF) { + while ((opt = getopt_long(argc, argv, "fno:v", long_opt, NULL)) != EOF){ switch (opt) { - case 'v': - verbose = 1; - printf("verbose: %d\n", verbose); - i++; + case 'f': + ++force; + printf("force: %d\n", force); + nargs++; + break; + case 'h': + usage(stdout); break; case 'n': - nomtab = 1; + ++nomtab; printf("nomtab: %d\n", nomtab); - i++; + nargs++; break; case 'o': options = optarg; - i++; + nargs++; + break; + case 'v': + ++verbose; + printf("verbose: %d\n", verbose); + nargs++; break; default: - i++; + fprintf(stderr, "%s: unknown option '%c'\n", + progname, opt); + usage(stderr); break; } } - if (argc < i) { - fprintf(stderr, - "%s: too few arguments\n" - "Usage: %s [-v] [-n] [-o ...]\n", - progname, progname); - exit(1); + if (optind + 2 > argc) { + fprintf(stderr, "%s: too few arguments\n", progname); + usage(stderr); } - if (verbose) - for (i = 0; i < argc; i++) { + source = argv[optind]; + target = argv[optind + 1]; + + if (verbose) { + for (i = 0; i < argc; i++) printf("arg[%d] = %s\n", i, argv[i]); - } + printf("source = %s, target = %s\n", source, target); + } + + if (check_mtab_entry(source, target, "lustre")) + exit(32); init_options(&lmd); rc = build_data(source, options, &lmd); @@ -380,6 +578,11 @@ main(int argc, char * const argv[]) exit(1); } + rc = set_routes(&lmd); + if (rc) { + exit(1); + } + if (debug) { printf("%s: debug mode, not mounting\n", progname); exit(0); -- 1.8.3.1