#AC_SUBST(usrprefix)
AC_MSG_CHECKING(if kernel has CPU affinity support)
-if test "$target_cpu" != ia64 ; then
+SET_CPUS_ALLOW="`grep -c set_cpus_allowed $LINUX/kernel/softirq.c`"
+if test "$SET_CPUS_ALLOW" != 0 ; then
enable_affinity_temp="-DCPU_AFFINITY=1"
AC_MSG_RESULT(yes)
else
if test "${with_gm}" = yes; then
with_gm="-I/usr/local/gm/include"
else
- with_gm=-I"$with_gm/include"
+ with_gm="-I$with_gm/include -I$with_gm/drivers -I$with_gm/drivers/linux/gm"
fi
GMNAL="gmnal"
else
AC_SUBST(GMNAL)
+#fixme: where are the default IB includes?
+default_ib_include_dir=/usr/local/ib/include
+an_ib_include_file=vapi.h
+
+AC_ARG_WITH(ib, [ --with-ib=[yes/no/path] Path to IB includes], with_ib=$withval, with_ib=$default_ib)
+AC_MSG_CHECKING(if IB headers are present)
+if test "$with_ib" = yes; then
+ with_ib=$default_ib_include_dir
+fi
+if test "$with_ib" != no -a -f ${with_ib}/${an_ib_include_file}; then
+ AC_MSG_RESULT(yes)
+ IBNAL="ibnal"
+ with_ib="-I${with_ib}"
+else
+ AC_MSG_RESULT(no)
+ IBNAL=""
+ with_ib=""
+fi
+AC_SUBST(IBNAL)
+AC_SUBST(with_ib)
+
+
def_scamac=/opt/scali/include
AC_ARG_WITH(scamac, [ --with-scamac=[yes/no/path] Path to ScaMAC includes (default=/opt/scali/include)], with_scamac=$withval, with_scamac=$def_scamac)
AC_MSG_CHECKING(if ScaMAC headers are present)
AC_SUBST(SCIMACNAL)
CFLAGS="$KCFLAGS"
-CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac "
+CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac $with_ib"
AC_SUBST(MOD_LINK)
AC_SUBST(LINUX25)
sbindir='${exec_prefix}/usr/sbin'
includedir='${prefix}/usr/include'
+rootsbindir='${exec_prefix}/sbin'
+AC_SUBST(rootsbindir)
+
# Directories for documentation and demos.
docdir='${prefix}/usr/share/doc/$(PACKAGE)'
AC_SUBST(docdir)
AC_OUTPUT([Makefile Kernelenv libcfs/Makefile portals/Makefile \
unals/Makefile knals/Makefile router/Makefile \
knals/socknal/Makefile knals/gmnal/Makefile knals/qswnal/Makefile \
- knals/scimacnal/Makefile knals/toenal/Makefile \
+ knals/scimacnal/Makefile knals/toenal/Makefile knals/ibnal/Makefile\
utils/Makefile tests/Makefile doc/Makefile ])
--- /dev/null
+/* $Id: cygwin-ioctl.h,v 1.2 2003/12/03 03:14:43 phil Exp $
+ *
+ * linux/ioctl.h for Linux by H.H. Bergman.
+ */
+
+#ifndef _ASMI386_IOCTL_H
+#define _ASMI386_IOCTL_H
+
+/* ioctl command encoding: 32 bits total, command in lower 16 bits,
+ * size of the parameter structure in the lower 14 bits of the
+ * upper 16 bits.
+ * Encoding the size of the parameter structure in the ioctl request
+ * is useful for catching programs compiled with old versions
+ * and to avoid overwriting user space outside the user buffer area.
+ * The highest 2 bits are reserved for indicating the ``access mode''.
+ * NOTE: This limits the max parameter size to 16kB -1 !
+ */
+
+/*
+ * The following is for compatibility across the various Linux
+ * platforms. The i386 ioctl numbering scheme doesn't really enforce
+ * a type field. De facto, however, the top 8 bits of the lower 16
+ * bits are indeed used as a type field, so we might just as well make
+ * this explicit here. Please be sure to use the decoding macros
+ * below from now on.
+ */
+#undef _IO
+#undef _IOR
+#undef _IOW
+#undef _IOC
+#undef IOC_IN
+#undef IOC_OUT
+
+#define _IOC_NRBITS 8
+#define _IOC_TYPEBITS 8
+#define _IOC_SIZEBITS 14
+#define _IOC_DIRBITS 2
+
+#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT 0
+#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+/*
+ * Direction bits.
+ */
+#define _IOC_NONE 0U
+#define _IOC_WRITE 1U
+#define _IOC_READ 2U
+
+#define _IOC(dir,type,nr,size) \
+ (((dir) << _IOC_DIRSHIFT) | \
+ ((type) << _IOC_TYPESHIFT) | \
+ ((nr) << _IOC_NRSHIFT) | \
+ ((size) << _IOC_SIZESHIFT))
+
+/* used to create numbers */
+#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0)
+#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size))
+#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size))
+#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
+
+/* used to decode ioctl numbers.. */
+#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
+
+/* ...and for the drivers/sound files... */
+
+#define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT)
+#define IOC_OUT (_IOC_READ << _IOC_DIRSHIFT)
+#define IOC_INOUT ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT)
+#define IOCSIZE_MASK (_IOC_SIZEMASK << _IOC_SIZESHIFT)
+#define IOCSIZE_SHIFT (_IOC_SIZESHIFT)
+
+#endif /* _ASMI386_IOCTL_H */
extern unsigned int portal_stack;
extern unsigned int portal_debug;
extern unsigned int portal_printk;
+extern unsigned int portal_cerror;
/* Debugging subsystems (32 bits, non-overlapping) */
#define S_UNDEFINED (1 << 0)
#define S_MDC (1 << 1)
#define S_GMNAL (1 << 19)
#define S_PTLROUTER (1 << 20)
#define S_COBD (1 << 21)
+#define S_IBNAL (1 << 22)
/* If you change these values, please keep portals/utils/debug.c
* up to date! */
#define D_IOCTL (1 << 7) /* ioctl related information */
#define D_BLOCKS (1 << 8) /* ext2 block allocation */
#define D_NET (1 << 9) /* network communications */
-#define D_WARNING (1 << 10)
+#define D_WARNING (1 << 10) /* CWARN(...) == CDEBUG (D_WARNING, ...) */
#define D_BUFFS (1 << 11)
#define D_OTHER (1 << 12)
#define D_DENTRY (1 << 13)
#ifdef __KERNEL__
# include <linux/sched.h> /* THREAD_SIZE */
-#else
-# define THREAD_SIZE 8192
+#else
+# ifndef THREAD_SIZE /* x86_64 has THREAD_SIZE in userspace */
+# define THREAD_SIZE 8192
+# endif
#endif
+#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5)
+
#ifdef __KERNEL__
# ifdef __ia64__
# define CDEBUG_STACK (THREAD_SIZE - \
#if 1
#define CDEBUG(mask, format, a...) \
do { \
+ if (portal_cerror == 0) \
+ break; \
CHECK_STACK(CDEBUG_STACK); \
if (!(mask) || ((mask) & (D_ERROR | D_EMERG)) || \
(portal_debug & (mask) && \
if (current->need_resched)
schedule ();
}
+#define work_struct_t struct tq_struct
#else
{
cond_resched();
}
+#define work_struct_t struct work_struct
+
#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */
#ifdef PORTAL_DEBUG
const int line);
#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__, \
__FUNCTION__, __LINE__))
+/* it would be great to dump_stack() here, but some kernels
+ * export it as show_stack() and I can't be bothered to
+ * proprely engage in that dance right now */
+#define LASSERTF(cond, fmt...) \
+ do { \
+ if (unlikely(!(cond))) { \
+ portals_debug_msg(0, D_EMERG, __FILE__, __FUNCTION__,\
+ __LINE__, CDEBUG_STACK, \
+ "ASSERTION(" #cond ") failed:" fmt);\
+ LBUG(); \
+ } \
+ } while (0)
+
#else
#define LASSERT(e)
+#define LASSERTF(cond, fmt...) do { } while (0)
#endif
#ifdef __arch_um__
#define PORTAL_VMALLOC_SIZE 16384
+#ifndef GFP_MEMALLOC
+#define GFP_MEMALLOC 0
+#endif
+
#define PORTAL_ALLOC(ptr, size) \
do { \
LASSERT (!in_interrupt()); \
if ((size) > PORTAL_VMALLOC_SIZE) \
(ptr) = vmalloc(size); \
else \
- (ptr) = kmalloc((size), GFP_NOFS); \
+ (ptr) = kmalloc((size), (GFP_KERNEL | GFP_MEMALLOC)); \
if ((ptr) == NULL) \
CERROR("PORTALS: out of memory at %s:%d (tried to alloc '"\
#ptr "' = %d)\n", __FILE__, __LINE__, (int)(size));\
s, (ptr), atomic_read(&portal_kmemory)); \
} while (0)
+#ifndef SLAB_MEMALLOC
+#define SLAB_MEMALLOC 0
+#endif
+
#define PORTAL_SLAB_ALLOC(ptr, slab, size) \
do { \
LASSERT(!in_interrupt()); \
- (ptr) = kmem_cache_alloc((slab), SLAB_KERNEL); \
+ (ptr) = kmem_cache_alloc((slab), (SLAB_KERNEL | SLAB_MEMALLOC)); \
if ((ptr) == NULL) { \
CERROR("PORTALS: out of memory at %s:%d (tried to alloc" \
" '" #ptr "' from slab '" #slab "')\n", __FILE__, \
#endif /* PORTALS_PROFILING */
/* debug.c */
+extern spinlock_t stack_backtrace_lock;
+
+char *portals_debug_dumpstack(void);
void portals_run_upcall(char **argv);
void portals_run_lbug_upcall(char * file, const char *fn, const int line);
void portals_debug_dumplog(void);
#endif
void portals_debug_msg(int subsys, int mask, char *file, const char *fn,
const int line, unsigned long stack,
- const char *format, ...)
+ char *format, ...)
__attribute__ ((format (printf, 7, 8)));
#else
void portals_debug_msg(int subsys, int mask, char *file, const char *fn,
# include <stdlib.h>
#ifndef __CYGWIN__
# include <stdint.h>
+#else
+# include <cygwin-ioctl.h>
#endif
# include <unistd.h>
# include <time.h>
# undef NDEBUG
# include <assert.h>
# define LASSERT(e) assert(e)
+# define LASSERTF(cond, args...) assert(cond)
# else
# define LASSERT(e)
+# define LASSERTF(cond, args...) do { } while (0)
# endif
# define printk(format, args...) printf (format, ## args)
# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0);
getpid() , stack, ## a);
#endif
+/* support decl needed both by kernel and liblustre */
+char *portals_nid2str(int nal, ptl_nid_t nid, char *str);
+
#ifndef CURRENT_TIME
# define CURRENT_TIME time(0)
#endif
long lwte_p2;
long lwte_p3;
long lwte_p4;
+#if BITS_PER_LONG > 32
+ long lwte_pad;
+#endif
} lwt_event_t;
#if LWT_SUPPORT
* USER LEVEL STUFF BELOW
*/
+#define PORTALS_CFG_VERSION 0x00010001;
+
+struct portals_cfg {
+ __u32 pcfg_version;
+ __u32 pcfg_command;
+
+ __u32 pcfg_nal;
+ __u32 pcfg_flags;
+
+ __u32 pcfg_gw_nal;
+ __u64 pcfg_nid;
+ __u64 pcfg_nid2;
+ __u64 pcfg_nid3;
+ __u32 pcfg_id;
+ __u32 pcfg_misc;
+ __u32 pcfg_fd;
+ __u32 pcfg_count;
+ __u32 pcfg_size;
+ __u32 pcfg_wait;
+
+ __u32 pcfg_plen1; /* buffers in userspace */
+ char *pcfg_pbuf1;
+ __u32 pcfg_plen2; /* buffers in userspace */
+ char *pcfg_pbuf2;
+};
+
+#define PCFG_INIT(pcfg, cmd) \
+do { \
+ memset(&pcfg, 0, sizeof(pcfg)); \
+ pcfg.pcfg_version = PORTALS_CFG_VERSION; \
+ pcfg.pcfg_command = (cmd); \
+ \
+} while (0)
+
#define PORTAL_IOCTL_VERSION 0x00010007
#define PING_SYNC 0
#define PING_ASYNC 1
#define IOC_PORTAL_CLEAR_DEBUG _IOWR('e', 32, long)
#define IOC_PORTAL_MARK_DEBUG _IOWR('e', 33, long)
#define IOC_PORTAL_PANIC _IOWR('e', 34, long)
-#define IOC_PORTAL_ADD_ROUTE _IOWR('e', 35, long)
-#define IOC_PORTAL_DEL_ROUTE _IOWR('e', 36, long)
-#define IOC_PORTAL_GET_ROUTE _IOWR('e', 37, long)
-#define IOC_PORTAL_NAL_CMD _IOWR('e', 38, long)
-#define IOC_PORTAL_GET_NID _IOWR('e', 39, long)
-#define IOC_PORTAL_FAIL_NID _IOWR('e', 40, long)
-#define IOC_PORTAL_SET_DAEMON _IOWR('e', 41, long)
-#define IOC_PORTAL_NOTIFY_ROUTER _IOWR('e', 42, long)
-#define IOC_PORTAL_LWT_CONTROL _IOWR('e', 43, long)
-#define IOC_PORTAL_LWT_SNAPSHOT _IOWR('e', 44, long)
-#define IOC_PORTAL_LWT_LOOKUP_STRING _IOWR('e', 45, long)
-#define IOC_PORTAL_MAX_NR 45
+#define IOC_PORTAL_NAL_CMD _IOWR('e', 35, long)
+#define IOC_PORTAL_GET_NID _IOWR('e', 36, long)
+#define IOC_PORTAL_FAIL_NID _IOWR('e', 37, long)
+#define IOC_PORTAL_SET_DAEMON _IOWR('e', 38, long)
+#define IOC_PORTAL_LWT_CONTROL _IOWR('e', 39, long)
+#define IOC_PORTAL_LWT_SNAPSHOT _IOWR('e', 40, long)
+#define IOC_PORTAL_LWT_LOOKUP_STRING _IOWR('e', 41, long)
+#define IOC_PORTAL_MAX_NR 41
enum {
QSWNAL = 1,
TOENAL,
TCPNAL,
SCIMACNAL,
+ ROUTER,
+ IBNAL,
NAL_ENUM_END_MARKER
};
extern ptl_handle_ni_t ksocknal_ni;
extern ptl_handle_ni_t ktoenal_ni;
extern ptl_handle_ni_t kgmnal_ni;
+extern ptl_handle_ni_t kibnal_ni;
extern ptl_handle_ni_t kscimacnal_ni;
#endif
+#define PTL_NALFMT_SIZE 16
+
#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
#define NAL_CMD_REGISTER_PEER_FD 100
#define NAL_CMD_ADD_AUTOCONN 106
#define NAL_CMD_GET_AUTOCONN 107
#define NAL_CMD_GET_TXDESC 108
+#define NAL_CMD_ADD_ROUTE 109
+#define NAL_CMD_DEL_ROUTE 110
+#define NAL_CMD_GET_ROUTE 111
+#define NAL_CMD_NOTIFY_ROUTER 112
enum {
DEBUG_DAEMON_START = 1,
ptl_handle_ni_t peer_ni;
};
+
/* module.c */
-typedef int (*nal_cmd_handler_t)(struct portal_ioctl_data *, void * private);
+typedef int (*nal_cmd_handler_t)(struct portals_cfg *, void * private);
int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private);
int kportal_nal_unregister(int nal);
+enum cfg_record_type {
+ PORTALS_CFG_TYPE = 1,
+ LUSTRE_CFG_TYPE = 123,
+};
+
+typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data);
+int kportal_nal_cmd(struct portals_cfg *);
+
ptl_handle_ni_t *kportal_get_ni (int nal);
void kportal_put_ni (int nal);
# endif
#endif
-#if (BITS_PER_LONG == 32 || __WORDSIZE == 32)
+#if defined(__x86_64__)
+# define LPU64 "%Lu"
+# define LPD64 "%Ld"
+# define LPX64 "%#Lx"
+# define LPSZ "%lu"
+# define LPSSZ "%ld"
+#elif (BITS_PER_LONG == 32 || __WORDSIZE == 32)
# define LPU64 "%Lu"
# define LPD64 "%Ld"
# define LPX64 "%#Lx"
# define LPSZ "%u"
# define LPSSZ "%d"
-#endif
-#if (BITS_PER_LONG == 64 || __WORDSIZE == 64)
+#elif (BITS_PER_LONG == 64 || __WORDSIZE == 64)
# define LPU64 "%lu"
# define LPD64 "%ld"
# define LPX64 "%#lx"
return (val + 7) & (~0x7);
}
+static inline int size_round16(int val)
+{
+ return (val + 0xf) & (~0xf);
+}
+
+static inline int size_round32(int val)
+{
+ return (val + 0x1f) & (~0x1f);
+}
+
static inline int size_round0(int val)
{
if (!val)
wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
}
-#ifdef __KERNEL__
#define state_lock(nal,flagsp) \
do { \
CDEBUG(D_PORTALS, "taking state lock\n"); \
CDEBUG(D_PORTALS, "releasing state lock\n"); \
nal->cb_sti(nal, flagsp); \
}
-#else
-/* not needed in user space until we thread there */
-#define state_lock(nal,flagsp) \
-do { \
- CDEBUG(D_PORTALS, "taking state lock\n"); \
- CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \
-} while (0)
-
-#define state_unlock(nal,flagsp) \
-{ \
- CDEBUG(D_PORTALS, "releasing state lock\n"); \
- CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \
-}
-#endif /* __KERNEL__ */
#ifndef PTL_USE_SLAB_CACHE
*/
extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
+ lib_md_t *getmd);
extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
}
-#ifdef __KERNEL__
#define state_lock(nal,flagsp) \
do { \
CDEBUG(D_PORTALS, "taking state lock\n"); \
CDEBUG(D_PORTALS, "releasing state lock\n"); \
nal->cb_sti(nal, flagsp); \
}
-#else
-/* not needed in user space until we thread there */
-#define state_lock(nal,flagsp) \
-do { \
- CDEBUG(D_PORTALS, "taking state lock\n"); \
- CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \
-} while (0)
-
-#define state_unlock(nal,flagsp) \
-{ \
- CDEBUG(D_PORTALS, "releasing state lock\n"); \
- CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \
-}
-#endif /* __KERNEL__ */
#ifndef PTL_USE_SLAB_CACHE
*/
extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
+ lib_md_t *getmd);
extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
PTL_MSG_HELLO,
} ptl_msg_type_t;
-/* Each of these structs should start with an odd number of
- * __u32, or the compiler could add its own padding and confuse
- * everyone.
- *
- * Also, "length" needs to be at offset 28 of each struct.
- */
+/* The variant fields of the portals message header are aligned on an 8
+ * byte boundary in the message header. Note that all types used in these
+ * wire structs MUST be fixed size and the smaller types are placed at the
+ * end. */
typedef struct ptl_ack {
- ptl_size_t mlength;
- ptl_handle_wire_t dst_wmd;
- ptl_match_bits_t match_bits;
- ptl_size_t length; /* common length (0 for acks) moving out RSN */
+ ptl_handle_wire_t dst_wmd;
+ ptl_match_bits_t match_bits;
+ ptl_size_t mlength;
} WIRE_ATTR ptl_ack_t;
typedef struct ptl_put {
- ptl_pt_index_t ptl_index;
- ptl_handle_wire_t ack_wmd;
- ptl_match_bits_t match_bits;
- ptl_size_t length; /* common length moving out RSN */
- ptl_size_t offset;
- ptl_hdr_data_t hdr_data;
+ ptl_handle_wire_t ack_wmd;
+ ptl_match_bits_t match_bits;
+ ptl_hdr_data_t hdr_data;
+ ptl_pt_index_t ptl_index;
+ ptl_size_t offset;
} WIRE_ATTR ptl_put_t;
typedef struct ptl_get {
- ptl_pt_index_t ptl_index;
- ptl_handle_wire_t return_wmd;
- ptl_match_bits_t match_bits;
- ptl_size_t length; /* common length (0 for gets) moving out RSN */
- ptl_size_t src_offset;
- ptl_size_t return_offset; /* unused: going RSN */
- ptl_size_t sink_length;
+ ptl_handle_wire_t return_wmd;
+ ptl_match_bits_t match_bits;
+ ptl_pt_index_t ptl_index;
+ ptl_size_t src_offset;
+ ptl_size_t sink_length;
} WIRE_ATTR ptl_get_t;
typedef struct ptl_reply {
- __u32 unused1; /* unused fields going RSN */
- ptl_handle_wire_t dst_wmd;
- ptl_size_t dst_offset; /* unused: going RSN */
- __u32 unused2;
- ptl_size_t length; /* common length moving out RSN */
+ ptl_handle_wire_t dst_wmd;
} WIRE_ATTR ptl_reply_t;
+typedef struct ptl_hello {
+ __u64 incarnation;
+ __u32 type;
+} WIRE_ATTR ptl_hello_t;
+
typedef struct {
- ptl_nid_t dest_nid;
- ptl_nid_t src_nid;
- ptl_pid_t dest_pid;
- ptl_pid_t src_pid;
- __u32 type; /* ptl_msg_type_t */
+ ptl_nid_t dest_nid;
+ ptl_nid_t src_nid;
+ ptl_pid_t dest_pid;
+ ptl_pid_t src_pid;
+ __u32 type; /* ptl_msg_type_t */
+ __u32 payload_length; /* payload data to follow */
+ /*<------__u64 aligned------->*/
union {
- ptl_ack_t ack;
- ptl_put_t put;
- ptl_get_t get;
+ ptl_ack_t ack;
+ ptl_put_t put;
+ ptl_get_t get;
ptl_reply_t reply;
+ ptl_hello_t hello;
} msg;
} WIRE_ATTR ptl_hdr_t;
-/* All length fields in individual unions at same offset */
-/* LASSERT for same in lib-move.c */
-#define PTL_HDR_LENGTH(h) ((h)->msg.ack.length)
-
/* A HELLO message contains the portals magic number and protocol version
* code in the header's dest_nid, the peer's NID in the src_nid, and
- * PTL_MSG_HELLO in the type field. All other fields are zero (including
- * PTL_HDR_LENGTH; i.e. no payload).
+ * PTL_MSG_HELLO in the type field. All other common fields are zero
+ * (including payload_size; i.e. no payload).
* This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is
* running the same protocol and to find out its NID, so that hosts with
* multiple IP interfaces can have a single NID. These NALs should exchange
- * HELLO messages when a connection is first established. */
+ * HELLO messages when a connection is first established.
+ * Individual NALs can put whatever else they fancy in ptl_hdr_t::msg.
+ */
typedef struct {
__u32 magic; /* PORTALS_PROTO_MAGIC */
__u16 version_major; /* increment on incompatible change */
#define PORTALS_PROTO_MAGIC 0xeebc0ded
#define PORTALS_PROTO_VERSION_MAJOR 0
-#define PORTALS_PROTO_VERSION_MINOR 1
+#define PORTALS_PROTO_VERSION_MINOR 3
typedef struct {
long recv_count, recv_length, send_count, send_length, drop_count,
} lib_counters_t;
/* temporary expedient: limit number of entries in discontiguous MDs */
-#if PTL_LARGE_MTU
-# define PTL_MD_MAX_IOV 64
-#else
-# define PTL_MD_MAX_IOV 16
-#endif
+# define PTL_MTU (512<<10)
+# define PTL_MD_MAX_IOV 128
+# define PTL_MD_MAX_PAGES min_t(int, PTL_MD_MAX_IOV, PTL_MTU / PAGE_SIZE)
struct lib_msg_t {
struct list_head msg_list;
struct list_head *next, *prev;
};
+typedef struct list_head list_t;
+
#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define LIST_HEAD(name) \
#include <unistd.h>
#include <sys/time.h>
#include <portals/types.h>
-#include <portals/ptlctl.h>
#include <linux/kp30.h>
+#include <portals/ptlctl.h>
#include <linux/limits.h>
#include <asm/page.h>
#include <linux/version.h>
#define OBD_DEV_PATH "/dev/obd"
int ptl_name2nal(char *str);
+int ptl_parse_ipaddr (__u32 *ipaddrp, char *str);
int ptl_parse_nid (ptl_nid_t *nidp, char *str);
char * ptl_nid2str (char *buffer, ptl_nid_t nid);
int jt_dbg_modules(int argc, char **argv);
int jt_dbg_panic(int argc, char **argv);
+int ptl_set_cfg_record_cb(cfg_record_cb_t cb);
+
/* l_ioctl.c */
+typedef int (ioc_handler_t)(int dev_id, int opc, void *buf);
+void set_ioc_handler(ioc_handler_t *handler);
int register_ioc_dev(int dev_id, const char * dev_name);
void unregister_ioc_dev(int dev_id);
int set_ioctl_dump(char * file);
#define OBD_DEV_PATH "/dev/obd"
int ptl_name2nal(char *str);
+int ptl_parse_ipaddr (__u32 *ipaddrp, char *str);
int ptl_parse_nid (ptl_nid_t *nidp, char *str);
char * ptl_nid2str (char *buffer, ptl_nid_t nid);
int jt_dbg_modules(int argc, char **argv);
int jt_dbg_panic(int argc, char **argv);
+int ptl_set_cfg_record_cb(cfg_record_cb_t cb);
+
/* l_ioctl.c */
+typedef int (ioc_handler_t)(int dev_id, int opc, void *buf);
+void set_ioc_handler(ioc_handler_t *handler);
int register_ioc_dev(int dev_id, const char * dev_name);
void unregister_ioc_dev(int dev_id);
int set_ioctl_dump(char * file);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * <portals/socknal.h>
+ *
+ * #defines shared between socknal implementation and utilities
+ */
+
+#define SOCKNAL_CONN_NONE (-1)
+#define SOCKNAL_CONN_ANY 0
+#define SOCKNAL_CONN_CONTROL 1
+#define SOCKNAL_CONN_BULK_IN 2
+#define SOCKNAL_CONN_BULK_OUT 3
+#define SOCKNAL_CONN_NTYPES 4
typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
#define PTL_SEQ_GT(a,b) (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0)
+/* XXX
+ * cygwin need the pragma line, not clear if it's needed in other places.
+ * checking!!!
+ */
+#ifdef __CYGWIN__
+#pragma pack(push, 4)
+#endif
typedef struct {
ptl_event_kind_t type;
ptl_process_id_t initiator;
struct timeval arrival_time;
volatile ptl_seq_t sequence;
} ptl_event_t;
+#ifdef __CYGWIN__
+#pragma pop
+#endif
typedef enum {
PTL_ACK_REQ,
# This code is issued under the GNU General Public License.
# See the file COPYING in this distribution
-DIST_SUBDIRS= socknal toenal qswnal gmnal scimacnal
-SUBDIRS= socknal toenal @QSWNAL@ @GMNAL@ @SCIMACNAL@
+DIST_SUBDIRS= socknal toenal qswnal gmnal scimacnal ibnal
+SUBDIRS= socknal toenal @QSWNAL@ @GMNAL@ @SCIMACNAL@ @IBNAL@
modulenet_DATA = kgmnal.o
EXTRA_PROGRAMS = kgmnal
-DEFS =
-kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h
+DEFS = -DGM_KERNEL
+kgmnal_SOURCES = gmnal.h gmnal_api.c gmnal_cb.c gmnal_comm.c gmnal_utils.c gmnal_module.c
include ../../Kernelenv
-obj-y += lgmnal.o
-lgmnal-objs := lgmnal_api.o lgmnal_cb.o lgmnal_utils.o lgmnal_comm.o lgmnal_module.o
+obj-y += gmnal.o
+gmnal-objs := gmnal_api.o gmnal_cb.o gmnal_utils.o gmnal_comm.o gmnal_module.o
+++ /dev/null
-diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c
---- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c Mon Jul 1 10:35:09 2002
-+++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c Thu Sep 19 14:19:38 2002
-@@ -30,6 +30,8 @@
- *
- ************************************************************************/
-
-+#define EXPORT_SYMTAB
-+
- #include <linux/config.h>
- #include <linux/module.h>
-
-@@ -4075,6 +4077,28 @@
- return 0;
- }
-
-+EXPORT_SYMBOL(gm_blocking_receive_no_spin);
-+EXPORT_SYMBOL(gm_close);
-+EXPORT_SYMBOL(gm_dma_free);
-+EXPORT_SYMBOL(gm_dma_malloc);
-+EXPORT_SYMBOL(gm_drop_sends);
-+EXPORT_SYMBOL(gm_finalize);
-+EXPORT_SYMBOL(gm_get_node_id);
-+EXPORT_SYMBOL(gm_init);
-+EXPORT_SYMBOL(gm_initialize_alarm);
-+EXPORT_SYMBOL(gm_max_node_id_in_use);
-+EXPORT_SYMBOL(gm_min_size_for_length);
-+EXPORT_SYMBOL(gm_num_receive_tokens);
-+EXPORT_SYMBOL(gm_num_send_tokens);
-+EXPORT_SYMBOL(gm_open);
-+EXPORT_SYMBOL(gm_provide_receive_buffer);
-+EXPORT_SYMBOL(gm_resume_sending);
-+EXPORT_SYMBOL(gm_send_with_callback);
-+EXPORT_SYMBOL(gm_set_acceptable_sizes);
-+EXPORT_SYMBOL(gm_set_alarm);
-+EXPORT_SYMBOL(gm_unknown);
-+
-+
- /*
- This file uses GM standard indentation.
-
-Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~
-Only in gm-1.5.2.1_Linux-cfs/: trace
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2003 Los Alamos National Laboratory (LANL)
+ *
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#ifndef _GMNAL_H
-#define _GMNAL_H
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/locks.h>
-#include <linux/unistd.h>
-#include <linux/init.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/list.h>
-#include <asm/uaccess.h>
-#include <asm/segment.h>
+
+
+/*
+ * Portals GM kernel NAL header file
+ * This file makes all declaration and prototypes
+ * for the API side and CB side of the NAL
+ */
+#ifndef __INCLUDE_GMNAL_H__
+#define __INCLUDE_GMNAL_H__
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include "linux/config.h"
+#include "linux/module.h"
+#include "linux/tty.h"
+#include "linux/kernel.h"
+#include "linux/mm.h"
+#include "linux/string.h"
+#include "linux/stat.h"
+#include "linux/errno.h"
+#include "linux/locks.h"
+#include "linux/unistd.h"
+#include "linux/init.h"
+#include "linux/sem.h"
+#include "linux/vmalloc.h"
#define DEBUG_SUBSYSTEM S_GMNAL
-#include <linux/kp30.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
+#include "portals/nal.h"
+#include "portals/api.h"
+#include "portals/errno.h"
+#include "linux/kp30.h"
+#include "portals/p30.h"
+
+#include "portals/lib-nal.h"
+#include "portals/lib-p30.h"
+
+#define GM_STRONG_TYPES 1
+#include "gm.h"
+#include "gm_internal.h"
+
+
+
+/*
+ * Defines for the API NAL
+ */
+
+/*
+ * Small message size is configurable
+ * insmod can set small_msg_size
+ * which is used to populate nal_data.small_msg_size
+ */
+#define GMNAL_SMALL_MESSAGE 1078
+#define GMNAL_LARGE_MESSAGE_INIT 1079
+#define GMNAL_LARGE_MESSAGE_ACK 1080
+#define GMNAL_LARGE_MESSAGE_FINI 1081
+
+extern int gmnal_small_msg_size;
+extern int num_rx_threads;
+extern int num_stxds;
+#define GMNAL_SMALL_MSG_SIZE(a) a->small_msg_size
+#define GMNAL_IS_SMALL_MESSAGE(n,a,b,c) gmnal_is_small_msg(n, a, b, c)
+#define GMNAL_MAGIC 0x1234abcd
+
+
+/*
+ * Small Transmit Descriptor
+ * A structre to keep track of a small transmit operation
+ * This structure has a one-to-one relationship with a small
+ * transmit buffer (both create by gmnal_stxd_alloc).
+ * There are two free list of stxd. One for use by clients of the NAL
+ * and the other by the NAL rxthreads when doing sends.
+ * This helps prevent deadlock caused by stxd starvation.
+ */
+typedef struct _gmnal_stxd_t {
+ void *buffer;
+ int buffer_size;
+ gm_size_t gm_size;
+ int msg_size;
+ int gm_target_node;
+ int gm_priority;
+ int type;
+ struct _gmnal_data_t *nal_data;
+ lib_msg_t *cookie;
+ int niov;
+ struct iovec iov[PTL_MD_MAX_IOV];
+ struct _gmnal_stxd_t *next;
+ int rxt;
+ int kniov;
+ struct iovec *iovec_dup;
+} gmnal_stxd_t;
+
+/*
+ * keeps a transmit token for large transmit (gm_get)
+ * and a pointer to rxd that is used as context for large receive
+ */
+typedef struct _gmnal_ltxd_t {
+ struct _gmnal_ltxd_t *next;
+ struct _gmnal_srxd_t *srxd;
+} gmnal_ltxd_t;
+
+
+/*
+ * as for gmnal_stxd_t
+ * a hash table in nal_data find srxds from
+ * the rx buffer address. hash table populated at init time
+ */
+typedef struct _gmnal_srxd_t {
+ void *buffer;
+ int size;
+ gm_size_t gmsize;
+ unsigned int gm_source_node;
+ gmnal_stxd_t *source_stxd;
+ int type;
+ int nsiov;
+ int nriov;
+ struct iovec *riov;
+ int ncallbacks;
+ spinlock_t callback_lock;
+ int callback_status;
+ lib_msg_t *cookie;
+ struct _gmnal_srxd_t *next;
+ struct _gmnal_data_t *nal_data;
+} gmnal_srxd_t;
+
+/*
+ * Header which lmgnal puts at the start of each message
+ */
+typedef struct _gmnal_msghdr {
+ int magic;
+ int type;
+ unsigned int sender_node_id;
+ gmnal_stxd_t *stxd;
+ int niov;
+ } gmnal_msghdr_t;
+#define GMNAL_MSGHDR_SIZE sizeof(gmnal_msghdr_t)
+
+/*
+ * the caretaker thread (ct_thread) gets receive events
+ * (and other events) from the myrinet device via the GM2 API.
+ * caretaker thread populates one work entry for each receive event,
+ * puts it on a Q in nal_data and wakes a receive thread to
+ * process the receive.
+ * Processing a portals receive can involve a transmit operation.
+ * Because of this the caretaker thread cannot process receives
+ * as it may get deadlocked when supply of transmit descriptors
+ * is exhausted (as caretaker thread is responsible for replacing
+ * transmit descriptors on the free list)
+ */
+typedef struct _gmnal_rxtwe {
+ void *buffer;
+ unsigned snode;
+ unsigned sport;
+ unsigned type;
+ unsigned length;
+ struct _gmnal_rxtwe *next;
+} gmnal_rxtwe_t;
+
+/*
+ * 1 receive thread started on each CPU
+ */
+#define NRXTHREADS 10 /* max number of receiver threads */
+
+typedef struct _gmnal_data_t {
+ int refcnt;
+ spinlock_t cb_lock;
+ spinlock_t stxd_lock;
+ struct semaphore stxd_token;
+ gmnal_stxd_t *stxd;
+ spinlock_t rxt_stxd_lock;
+ struct semaphore rxt_stxd_token;
+ gmnal_stxd_t *rxt_stxd;
+ spinlock_t ltxd_lock;
+ struct semaphore ltxd_token;
+ gmnal_ltxd_t *ltxd;
+ spinlock_t srxd_lock;
+ struct semaphore srxd_token;
+ gmnal_srxd_t *srxd;
+ struct gm_hash *srxd_hash;
+ nal_t *nal;
+ nal_cb_t *nal_cb;
+ struct gm_port *gm_port;
+ unsigned int gm_local_nid;
+ unsigned int gm_global_nid;
+ spinlock_t gm_lock;
+ long rxthread_pid[NRXTHREADS];
+ int rxthread_stop_flag;
+ spinlock_t rxthread_flag_lock;
+ long rxthread_flag;
+ long ctthread_pid;
+ int ctthread_flag;
+ gm_alarm_t ctthread_alarm;
+ int small_msg_size;
+ int small_msg_gmsize;
+ gmnal_rxtwe_t *rxtwe_head;
+ gmnal_rxtwe_t *rxtwe_tail;
+ spinlock_t rxtwe_lock;
+ struct semaphore rxtwe_wait;
+} gmnal_data_t;
+
+/*
+ * Flags to start/stop and check status of threads
+ * each rxthread sets 1 bit (any bit) of the flag on startup
+ * and clears 1 bit when exiting
+ */
+#define GMNAL_THREAD_RESET 0
+#define GMNAL_THREAD_STOP 666
+#define GMNAL_CTTHREAD_STARTED 333
+#define GMNAL_RXTHREADS_STARTED ( (1<<num_rx_threads)-1)
+
+
+extern gmnal_data_t *global_nal_data;
+
+/*
+ * The gm_port to use for gmnal
+ */
+#define GMNAL_GM_PORT 4
+
+/*
+ * for ioctl get pid
+ */
+#define GMNAL_IOC_GET_GNID 1
+
+/*
+ * Return codes
+ */
+#define GMNAL_STATUS_OK 0
+#define GMNAL_STATUS_FAIL 1
+#define GMNAL_STATUS_NOMEM 2
+
+
+/*
+ * FUNCTION PROTOTYPES
+ */
+
+/*
+ * Locking macros
+ */
+
+/*
+ * For the Small tx and rx descriptor lists
+ */
+#define GMNAL_TXD_LOCK_INIT(a) spin_lock_init(&a->stxd_lock);
+#define GMNAL_TXD_LOCK(a) spin_lock(&a->stxd_lock);
+#define GMNAL_TXD_UNLOCK(a) spin_unlock(&a->stxd_lock);
+#define GMNAL_TXD_TOKEN_INIT(a, n) sema_init(&a->stxd_token, n);
+#define GMNAL_TXD_GETTOKEN(a) down(&a->stxd_token);
+#define GMNAL_TXD_TRYGETTOKEN(a) down_trylock(&a->stxd_token)
+#define GMNAL_TXD_RETURNTOKEN(a) up(&a->stxd_token);
+
+#define GMNAL_RXT_TXD_LOCK_INIT(a) spin_lock_init(&a->rxt_stxd_lock);
+#define GMNAL_RXT_TXD_LOCK(a) spin_lock(&a->rxt_stxd_lock);
+#define GMNAL_RXT_TXD_UNLOCK(a) spin_unlock(&a->rxt_stxd_lock);
+#define GMNAL_RXT_TXD_TOKEN_INIT(a, n) sema_init(&a->rxt_stxd_token, n);
+#define GMNAL_RXT_TXD_GETTOKEN(a) down(&a->rxt_stxd_token);
+#define GMNAL_RXT_TXD_TRYGETTOKEN(a) down_trylock(&a->rxt_stxd_token)
+#define GMNAL_RXT_TXD_RETURNTOKEN(a) up(&a->rxt_stxd_token);
+
+#define GMNAL_LTXD_LOCK_INIT(a) spin_lock_init(&a->ltxd_lock);
+#define GMNAL_LTXD_LOCK(a) spin_lock(&a->ltxd_lock);
+#define GMNAL_LTXD_UNLOCK(a) spin_unlock(&a->ltxd_lock);
+#define GMNAL_LTXD_TOKEN_INIT(a, n) sema_init(&a->ltxd_token, n);
+#define GMNAL_LTXD_GETTOKEN(a) down(&a->ltxd_token);
+#define GMNAL_LTXD_TRYGETTOKEN(a) down_trylock(&a->ltxd_token)
+#define GMNAL_LTXD_RETURNTOKEN(a) up(&a->ltxd_token);
+
+#define GMNAL_RXD_LOCK_INIT(a) spin_lock_init(&a->srxd_lock);
+#define GMNAL_RXD_LOCK(a) spin_lock(&a->srxd_lock);
+#define GMNAL_RXD_UNLOCK(a) spin_unlock(&a->srxd_lock);
+#define GMNAL_RXD_TOKEN_INIT(a, n) sema_init(&a->srxd_token, n);
+#define GMNAL_RXD_GETTOKEN(a) down(&a->srxd_token);
+#define GMNAL_RXD_TRYGETTOKEN(a) down_trylock(&a->srxd_token)
+#define GMNAL_RXD_RETURNTOKEN(a) up(&a->srxd_token);
+
+#define GMNAL_GM_LOCK_INIT(a) spin_lock_init(&a->gm_lock);
+#define GMNAL_GM_LOCK(a) spin_lock(&a->gm_lock);
+#define GMNAL_GM_UNLOCK(a) spin_unlock(&a->gm_lock);
+#define GMNAL_CB_LOCK_INIT(a) spin_lock_init(&a->cb_lock);
+
+
+/*
+ * Memory Allocator
+ */
+
+/*
+ * API NAL
+ */
+int gmnal_api_forward(nal_t *, int, void *, size_t, void *, size_t);
+
+int gmnal_api_shutdown(nal_t *, int);
-#include <gm.h>
+int gmnal_api_validate(nal_t *, void *, size_t);
+
+void gmnal_api_yield(nal_t *);
+
+void gmnal_api_lock(nal_t *, unsigned long *);
+
+void gmnal_api_unlock(nal_t *, unsigned long *);
+
+
+#define GMNAL_INIT_NAL(a) do { \
+ a->forward = gmnal_api_forward; \
+ a->shutdown = gmnal_api_shutdown; \
+ a->validate = NULL; \
+ a->yield = gmnal_api_yield; \
+ a->lock = gmnal_api_lock; \
+ a->unlock = gmnal_api_unlock; \
+ a->timeout = NULL; \
+ a->refct = 1; \
+ a->nal_data = NULL; \
+ } while (0)
/*
- * Myrinet GM NAL
+ * CB NAL
*/
-#define NPAGES_LARGE 16
-#define NPAGES_SMALL 1
-#define MSG_LEN_LARGE NPAGES_LARGE*PAGE_SIZE
-#define MSG_LEN_SMALL NPAGES_SMALL*PAGE_SIZE
-#define MSG_SIZE_LARGE (gm_min_size_for_length(MSG_LEN_LARGE))
-#define MSG_SIZE_SMALL (gm_min_size_for_length(MSG_LEN_SMALL))
-#define TXMSGS 64 /* Number of Transmit Messages */
-#define ENVELOPES 8 /* Number of outstanding receive msgs */
+int gmnal_cb_send(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t);
+
+int gmnal_cb_send_pages(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t);
+
+int gmnal_cb_recv(nal_cb_t *, void *, lib_msg_t *,
+ unsigned int, struct iovec *, size_t, size_t);
+
+int gmnal_cb_recv_pages(nal_cb_t *, void *, lib_msg_t *,
+ unsigned int, ptl_kiov_t *, size_t, size_t);
+
+int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t);
+
+int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t);
+
+int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *);
+
+void *gmnal_cb_malloc(nal_cb_t *, size_t);
+
+void gmnal_cb_free(nal_cb_t *, void *, size_t);
+
+void gmnal_cb_unmap(nal_cb_t *, unsigned int, struct iovec*, void **);
+
+int gmnal_cb_map(nal_cb_t *, unsigned int, struct iovec*, void **);
-#define KGM_PORT_NUM 3
-#define KGM_HOSTNAME "kgmnal"
+void gmnal_cb_printf(nal_cb_t *, const char *fmt, ...);
+void gmnal_cb_cli(nal_cb_t *, unsigned long *);
-typedef struct {
- char *krx_buffer;
- unsigned long krx_len;
- unsigned int krx_size;
- unsigned int krx_priority;
- struct list_head krx_item;
-} kgmnal_rx_t;
+void gmnal_cb_sti(nal_cb_t *, unsigned long *);
+int gmnal_cb_dist(nal_cb_t *, ptl_nid_t, unsigned long *);
+
+nal_t *gmnal_init(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t rpid);
+
+void gmnal_fini(void);
+
+
+
+#define GMNAL_INIT_NAL_CB(a) do { \
+ a->cb_send = gmnal_cb_send; \
+ a->cb_send_pages = gmnal_cb_send_pages; \
+ a->cb_recv = gmnal_cb_recv; \
+ a->cb_recv_pages = gmnal_cb_recv_pages; \
+ a->cb_read = gmnal_cb_read; \
+ a->cb_write = gmnal_cb_write; \
+ a->cb_callback = gmnal_cb_callback; \
+ a->cb_malloc = gmnal_cb_malloc; \
+ a->cb_free = gmnal_cb_free; \
+ a->cb_map = NULL; \
+ a->cb_unmap = NULL; \
+ a->cb_printf = gmnal_cb_printf; \
+ a->cb_cli = gmnal_cb_cli; \
+ a->cb_sti = gmnal_cb_sti; \
+ a->cb_dist = gmnal_cb_dist; \
+ a->nal_data = NULL; \
+ } while (0)
+
+
+/*
+ * Small and Large Transmit and Receive Descriptor Functions
+ */
+int gmnal_alloc_txd(gmnal_data_t *);
+void gmnal_free_txd(gmnal_data_t *);
+gmnal_stxd_t* gmnal_get_stxd(gmnal_data_t *, int);
+void gmnal_return_stxd(gmnal_data_t *, gmnal_stxd_t *);
+gmnal_ltxd_t* gmnal_get_ltxd(gmnal_data_t *);
+void gmnal_return_ltxd(gmnal_data_t *, gmnal_ltxd_t *);
+
+int gmnal_alloc_srxd(gmnal_data_t *);
+void gmnal_free_srxd(gmnal_data_t *);
+gmnal_srxd_t* gmnal_get_srxd(gmnal_data_t *, int);
+void gmnal_return_srxd(gmnal_data_t *, gmnal_srxd_t *);
+
+/*
+ * general utility functions
+ */
+gmnal_srxd_t *gmnal_rxbuffer_to_srxd(gmnal_data_t *, void*);
+void gmnal_stop_rxthread(gmnal_data_t *);
+void gmnal_stop_ctthread(gmnal_data_t *);
+void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t);
+void gmnal_drop_sends_callback(gm_port_t *, void *, gm_status_t);
+char *gmnal_gm_error(gm_status_t);
+char *gmnal_rxevent(gm_recv_event_t*);
+int gmnal_is_small_msg(gmnal_data_t*, int, struct iovec*, int);
+void gmnal_yield(int);
+int gmnal_start_kernel_threads(gmnal_data_t *);
+
+
+/*
+ * Communication functions
+ */
+
+/*
+ * Receive threads
+ */
+int gmnal_ct_thread(void *); /* caretaker thread */
+int gmnal_rx_thread(void *); /* receive thread */
+int gmnal_pre_receive(gmnal_data_t*, gmnal_rxtwe_t*, int);
+int gmnal_rx_bad(gmnal_data_t *, gmnal_rxtwe_t *, gmnal_srxd_t*);
+int gmnal_rx_requeue_buffer(gmnal_data_t *, gmnal_srxd_t *);
+int gmnal_add_rxtwe(gmnal_data_t *, gm_recv_t *);
+gmnal_rxtwe_t * gmnal_get_rxtwe(gmnal_data_t *);
+void gmnal_remove_rxtwe(gmnal_data_t *);
+
+
+/*
+ * Small messages
+ */
+int gmnal_small_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int,
+ struct iovec *, size_t, size_t);
+int gmnal_small_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t,
+ unsigned int, struct iovec*, int);
+void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t);
+
+
+
+/*
+ * Large messages
+ */
+int gmnal_large_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int,
+ struct iovec *, size_t, size_t);
-typedef struct {
- nal_cb_t *ktx_nal;
- void *ktx_private;
- lib_msg_t *ktx_cookie;
- char *ktx_buffer;
- size_t ktx_len;
- unsigned long ktx_size;
- int ktx_ndx;
- unsigned int ktx_priority;
- unsigned int ktx_tgt_node;
- unsigned int ktx_tgt_port_id;
-} kgmnal_tx_t;
+int gmnal_large_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t, unsigned int,
+ struct iovec*, int);
+void gmnal_large_tx_callback(gm_port_t *, void *, gm_status_t);
-typedef struct {
- char kgm_init;
- char kgm_shuttingdown;
- struct gm_port *kgm_port;
- struct list_head kgm_list;
- ptl_nid_t kgm_nid;
- nal_cb_t *kgm_cb;
- struct kgm_trans *kgm_trans;
- struct tq_struct kgm_ready_tq;
- spinlock_t kgm_dispatch_lock;
- spinlock_t kgm_update_lock;
- spinlock_t kgm_send_lock;
-} kgmnal_data_t;
+int gmnal_remote_get(gmnal_srxd_t *, int, struct iovec*, int,
+ struct iovec*);
-int kgm_init(kgmnal_data_t *kgm_data);
-int kgmnal_recv_thread(void *);
-int gm_return_mynid(void);
-void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+void gmnal_remote_get_callback(gm_port_t *, void *, gm_status_t);
-extern kgmnal_data_t kgmnal_data;
-extern nal_t kgmnal_api;
-extern nal_cb_t kgmnal_lib;
+int gmnal_copyiov(int, gmnal_srxd_t *, int, struct iovec*, int,
+ struct iovec*);
-#endif /* _GMNAL_H */
+void gmnal_large_tx_ack(gmnal_data_t *, gmnal_srxd_t *);
+void gmnal_large_tx_ack_callback(gm_port_t *, void *, gm_status_t);
+void gmnal_large_tx_ack_received(gmnal_data_t *, gmnal_srxd_t *);
+#endif /*__INCLUDE_GMNAL_H__*/
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2003 Los Alamos National Laboratory (LANL)
+ *
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Implements the API NAL functions
+ */
+
+#include "gmnal.h"
+
+gmnal_data_t *global_nal_data = NULL;
+/*
+ * gmnal_api_forward
+ * This function takes a packed block of arguments from the NAL API
+ * module and passes them to the NAL CB module. The CB module unpacks
+ * the args and calls the appropriate function indicated by index.
+ * Typically this function is used to pass args between kernel and user
+ * space.
+ * As gmnal exists entirely in kernel, just pass the arg block directly
+ * to the NAL CB, by passing the args to lib_dispatch
+ * Arguments are
+ * nal_t nal Our nal
+ * int index the api function that initiated this call
+ * void *args packed block of function args
+ * size_t arg_len length of args block
+ * void *ret A return value for the API NAL
+ * size_t ret_len Size of the return value
+ *
+ */
+
+int
+gmnal_api_forward(nal_t *nal, int index, void *args, size_t arg_len,
+ void *ret, size_t ret_len)
+{
+
+ nal_cb_t *nal_cb = NULL;
+ gmnal_data_t *nal_data = NULL;
+
+
+
+
+
+ if (!nal || !args || (index < 0) || (arg_len < 0)) {
+ CDEBUG(D_ERROR, "Bad args to gmnal_api_forward\n");
+ return (PTL_FAIL);
+ }
+
+ if (ret && (ret_len <= 0)) {
+ CDEBUG(D_ERROR, "Bad args to gmnal_api_forward\n");
+ return (PTL_FAIL);
+ }
+
+
+ if (!nal->nal_data) {
+ CDEBUG(D_ERROR, "bad nal, no nal data\n");
+ return (PTL_FAIL);
+ }
+
+ nal_data = nal->nal_data;
+ CDEBUG(D_INFO, "nal_data is [%p]\n", nal_data);
+
+ if (!nal_data->nal_cb) {
+ CDEBUG(D_ERROR, "bad nal_data, no nal_cb\n");
+ return (PTL_FAIL);
+ }
+
+ nal_cb = nal_data->nal_cb;
+ CDEBUG(D_INFO, "nal_cb is [%p]\n", nal_cb);
+
+ CDEBUG(D_PORTALS, "gmnal_api_forward calling lib_dispatch\n");
+ lib_dispatch(nal_cb, NULL, index, args, ret);
+ CDEBUG(D_PORTALS, "gmnal_api_forward returns from lib_dispatch\n");
+
+ return(PTL_OK);
+}
+
+
+/*
+ * gmnal_api_shutdown
+ * Close down this interface and free any resources associated with it
+ * nal_t nal our nal to shutdown
+ */
+int
+gmnal_api_shutdown(nal_t *nal, int interface)
+{
+
+ gmnal_data_t *nal_data = nal->nal_data;
+
+ CDEBUG(D_TRACE, "gmnal_api_shutdown: nal_data [%p]\n", nal_data);
+
+ return(PTL_OK);
+}
+
+
+/*
+ * gmnal_api_validate
+ * validate a user address for use in communications
+ * There's nothing to be done here
+ */
+int
+gmnal_api_validate(nal_t *nal, void *base, size_t extent)
+{
+
+ return(PTL_OK);
+}
+
+
+
+/*
+ * gmnal_api_yield
+ * Give up the processor
+ */
+void
+gmnal_api_yield(nal_t *nal)
+{
+ CDEBUG(D_TRACE, "gmnal_api_yield : nal [%p]\n", nal);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+
+ return;
+}
+
+
+
+/*
+ * gmnal_api_lock
+ * Take a threadsafe lock
+ */
+void
+gmnal_api_lock(nal_t *nal, unsigned long *flags)
+{
+
+ gmnal_data_t *nal_data;
+ nal_cb_t *nal_cb;
+
+ nal_data = nal->nal_data;
+ nal_cb = nal_data->nal_cb;
+
+ nal_cb->cb_cli(nal_cb, flags);
+
+ return;
+}
+
+/*
+ * gmnal_api_unlock
+ * Release a threadsafe lock
+ */
+void
+gmnal_api_unlock(nal_t *nal, unsigned long *flags)
+{
+ gmnal_data_t *nal_data;
+ nal_cb_t *nal_cb;
+
+ nal_data = nal->nal_data;
+ nal_cb = nal_data->nal_cb;
+
+ nal_cb->cb_sti(nal_cb, flags);
+
+ return;
+}
+
+
+nal_t *
+gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
+ ptl_pid_t rpid)
+{
+
+ nal_t *nal = NULL;
+ nal_cb_t *nal_cb = NULL;
+ gmnal_data_t *nal_data = NULL;
+ gmnal_srxd_t *srxd = NULL;
+ gm_status_t gm_status;
+ unsigned int local_nid = 0, global_nid = 0;
+ ptl_nid_t portals_nid;
+ ptl_pid_t portals_pid = 0;
+
+
+ CDEBUG(D_TRACE, "gmnal_init : interface [%d], ptl_size [%d],
+ ac_size[%d]\n", interface, ptl_size, ac_size);
+
+
+ PORTAL_ALLOC(nal_data, sizeof(gmnal_data_t));
+ if (!nal_data) {
+ CDEBUG(D_ERROR, "can't get memory\n");
+ return(NULL);
+ }
+ memset(nal_data, 0, sizeof(gmnal_data_t));
+ /*
+ * set the small message buffer size
+ */
+ nal_data->refcnt = 1;
+
+ CDEBUG(D_INFO, "Allocd and reset nal_data[%p]\n", nal_data);
+ CDEBUG(D_INFO, "small_msg_size is [%d]\n", nal_data->small_msg_size);
+
+ PORTAL_ALLOC(nal, sizeof(nal_t));
+ if (!nal) {
+ PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
+ return(NULL);
+ }
+ memset(nal, 0, sizeof(nal_t));
+ CDEBUG(D_INFO, "Allocd and reset nal[%p]\n", nal);
+
+ PORTAL_ALLOC(nal_cb, sizeof(nal_cb_t));
+ if (!nal_cb) {
+ PORTAL_FREE(nal, sizeof(nal_t));
+ PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
+ return(NULL);
+ }
+ memset(nal_cb, 0, sizeof(nal_cb_t));
+ CDEBUG(D_INFO, "Allocd and reset nal_cb[%p]\n", nal_cb);
+
+ GMNAL_INIT_NAL(nal);
+ GMNAL_INIT_NAL_CB(nal_cb);
+ /*
+ * String them all together
+ */
+ nal->nal_data = (void*)nal_data;
+ nal_cb->nal_data = (void*)nal_data;
+ nal_data->nal = nal;
+ nal_data->nal_cb = nal_cb;
+
+ GMNAL_CB_LOCK_INIT(nal_data);
+ GMNAL_GM_LOCK_INIT(nal_data);
+
+
+ /*
+ * initialise the interface,
+ */
+ CDEBUG(D_INFO, "Calling gm_init\n");
+ if (gm_init() != GM_SUCCESS) {
+ CDEBUG(D_ERROR, "call to gm_init failed\n");
+ PORTAL_FREE(nal, sizeof(nal_t));
+ PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
+ PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
+ return(NULL);
+ }
+
+
+ CDEBUG(D_NET, "Calling gm_open with interface [%d], port [%d],
+ name [%s], version [%d]\n", interface, GMNAL_GM_PORT,
+ "gmnal", GM_API_VERSION);
+
+ GMNAL_GM_LOCK(nal_data);
+ gm_status = gm_open(&nal_data->gm_port, 0, GMNAL_GM_PORT, "gmnal",
+ GM_API_VERSION);
+ GMNAL_GM_UNLOCK(nal_data);
+
+ CDEBUG(D_INFO, "gm_open returned [%d]\n", gm_status);
+ if (gm_status == GM_SUCCESS) {
+ CDEBUG(D_INFO, "gm_open succeeded port[%p]\n",
+ nal_data->gm_port);
+ } else {
+ switch(gm_status) {
+ case(GM_INVALID_PARAMETER):
+ CDEBUG(D_ERROR, "gm_open Failure. Invalid Parameter\n");
+ break;
+ case(GM_BUSY):
+ CDEBUG(D_ERROR, "gm_open Failure. GM Busy\n");
+ break;
+ case(GM_NO_SUCH_DEVICE):
+ CDEBUG(D_ERROR, "gm_open Failure. No such device\n");
+ break;
+ case(GM_INCOMPATIBLE_LIB_AND_DRIVER):
+ CDEBUG(D_ERROR, "gm_open Failure. Incompatible lib
+ and driver\n");
+ break;
+ case(GM_OUT_OF_MEMORY):
+ CDEBUG(D_ERROR, "gm_open Failure. Out of Memory\n");
+ break;
+ default:
+ CDEBUG(D_ERROR, "gm_open Failure. Unknown error
+ code [%d]\n", gm_status);
+ break;
+ }
+ GMNAL_GM_LOCK(nal_data);
+ gm_finalize();
+ GMNAL_GM_UNLOCK(nal_data);
+ PORTAL_FREE(nal, sizeof(nal_t));
+ PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
+ PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
+ return(NULL);
+ }
+
+
+ nal_data->small_msg_size = gmnal_small_msg_size;
+ nal_data->small_msg_gmsize =
+ gm_min_size_for_length(gmnal_small_msg_size);
+
+ if (gmnal_alloc_srxd(nal_data) != GMNAL_STATUS_OK) {
+ CDEBUG(D_ERROR, "Failed to allocate small rx descriptors\n");
+ gmnal_free_txd(nal_data);
+ GMNAL_GM_LOCK(nal_data);
+ gm_close(nal_data->gm_port);
+ gm_finalize();
+ GMNAL_GM_UNLOCK(nal_data);
+ PORTAL_FREE(nal, sizeof(nal_t));
+ PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
+ PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
+ return(NULL);
+ }
+
+
+ /*
+ * Hang out a bunch of small receive buffers
+ * In fact hang them all out
+ */
+ while((srxd = gmnal_get_srxd(nal_data, 0))) {
+ CDEBUG(D_NET, "giving [%p] to gm_provide_receive_buffer\n",
+ srxd->buffer);
+ GMNAL_GM_LOCK(nal_data);
+ gm_provide_receive_buffer_with_tag(nal_data->gm_port,
+ srxd->buffer, srxd->gmsize,
+ GM_LOW_PRIORITY, 0);
+ GMNAL_GM_UNLOCK(nal_data);
+ }
+
+ /*
+ * Allocate pools of small tx buffers and descriptors
+ */
+ if (gmnal_alloc_txd(nal_data) != GMNAL_STATUS_OK) {
+ CDEBUG(D_ERROR, "Failed to allocate small tx descriptors\n");
+ GMNAL_GM_LOCK(nal_data);
+ gm_close(nal_data->gm_port);
+ gm_finalize();
+ GMNAL_GM_UNLOCK(nal_data);
+ PORTAL_FREE(nal, sizeof(nal_t));
+ PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
+ PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
+ return(NULL);
+ }
+
+ gmnal_start_kernel_threads(nal_data);
+
+ while (nal_data->rxthread_flag != GMNAL_RXTHREADS_STARTED) {
+ gmnal_yield(1);
+ CDEBUG(D_INFO, "Waiting for receive thread signs of life\n");
+ }
+
+ CDEBUG(D_INFO, "receive thread seems to have started\n");
+
+
+ /*
+ * Initialise the portals library
+ */
+ CDEBUG(D_NET, "Getting node id\n");
+ GMNAL_GM_LOCK(nal_data);
+ gm_status = gm_get_node_id(nal_data->gm_port, &local_nid);
+ GMNAL_GM_UNLOCK(nal_data);
+ if (gm_status != GM_SUCCESS) {
+ gmnal_stop_rxthread(nal_data);
+ gmnal_stop_ctthread(nal_data);
+ CDEBUG(D_ERROR, "can't determine node id\n");
+ gmnal_free_txd(nal_data);
+ gmnal_free_srxd(nal_data);
+ GMNAL_GM_LOCK(nal_data);
+ gm_close(nal_data->gm_port);
+ gm_finalize();
+ GMNAL_GM_UNLOCK(nal_data);
+ PORTAL_FREE(nal, sizeof(nal_t));
+ PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
+ PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
+ return(NULL);
+ }
+ nal_data->gm_local_nid = local_nid;
+ CDEBUG(D_INFO, "Local node id is [%u]\n", local_nid);
+ GMNAL_GM_LOCK(nal_data);
+ gm_status = gm_node_id_to_global_id(nal_data->gm_port, local_nid,
+ &global_nid);
+ GMNAL_GM_UNLOCK(nal_data);
+ if (gm_status != GM_SUCCESS) {
+ CDEBUG(D_ERROR, "failed to obtain global id\n");
+ gmnal_stop_rxthread(nal_data);
+ gmnal_stop_ctthread(nal_data);
+ gmnal_free_txd(nal_data);
+ gmnal_free_srxd(nal_data);
+ GMNAL_GM_LOCK(nal_data);
+ gm_close(nal_data->gm_port);
+ gm_finalize();
+ GMNAL_GM_UNLOCK(nal_data);
+ PORTAL_FREE(nal, sizeof(nal_t));
+ PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
+ PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
+ return(NULL);
+ }
+ CDEBUG(D_INFO, "Global node id is [%u]\n", global_nid);
+ nal_data->gm_global_nid = global_nid;
+
+/*
+ pid = gm_getpid();
+*/
+ CDEBUG(D_INFO, "portals_pid is [%u]\n", portals_pid);
+ portals_nid = (unsigned long)global_nid;
+ CDEBUG(D_INFO, "portals_nid is ["LPU64"]\n", portals_nid);
+
+ CDEBUG(D_PORTALS, "calling lib_init\n");
+ if (lib_init(nal_cb, portals_nid, portals_pid, 1024, ptl_size,
+ ac_size) != PTL_OK) {
+ CDEBUG(D_ERROR, "lib_init failed\n");
+ gmnal_stop_rxthread(nal_data);
+ gmnal_stop_ctthread(nal_data);
+ gmnal_free_txd(nal_data);
+ gmnal_free_srxd(nal_data);
+ GMNAL_GM_LOCK(nal_data);
+ gm_close(nal_data->gm_port);
+ gm_finalize();
+ GMNAL_GM_UNLOCK(nal_data);
+ PORTAL_FREE(nal, sizeof(nal_t));
+ PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
+ PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
+ return(NULL);
+
+ }
+
+ CDEBUG(D_INFO, "gmnal_init finished\n");
+ global_nal_data = nal->nal_data;
+ return(nal);
+}
+
+
+
+/*
+ * Called when module removed
+ */
+void gmnal_fini()
+{
+ gmnal_data_t *nal_data = global_nal_data;
+ nal_t *nal = nal_data->nal;
+ nal_cb_t *nal_cb = nal_data->nal_cb;
+
+ CDEBUG(D_TRACE, "gmnal_fini\n");
+
+ PtlNIFini(kgmnal_ni);
+ lib_fini(nal_cb);
+
+ gmnal_stop_rxthread(nal_data);
+ gmnal_stop_ctthread(nal_data);
+ gmnal_free_txd(nal_data);
+ gmnal_free_srxd(nal_data);
+ GMNAL_GM_LOCK(nal_data);
+ gm_close(nal_data->gm_port);
+ gm_finalize();
+ GMNAL_GM_UNLOCK(nal_data);
+ PORTAL_FREE(nal, sizeof(nal_t));
+ PORTAL_FREE(nal_data, sizeof(gmnal_data_t));
+ PORTAL_FREE(nal_cb, sizeof(nal_cb_t));
+}
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * Based on ksocknal and qswnal
+ * Copyright (c) 2003 Los Alamos National Laboratory (LANL)
*
- * Copyright (C) 2002 Cluster File Systems, Inc.
- * Author: Robert Read <rread@datarithm.net>
+ * This file is part of Lustre, http://www.lustre.org/
*
- * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- *
- * Portals is free software; you can redistribute it and/or
+ * Lustre is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
- * Portals is distributed in the hope that it will be useful,
+ * Lustre is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with Portals; if not, write to the Free Software
+ * along with Lustre; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-/* TODO
- * preallocate send buffers, store on list
- * put receive buffers on queue, handle with receive threads
- * use routing
- */
-
-#include "gmnal.h"
-
-extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int);
-
-static kgmnal_tx_t *
-get_trans(void)
-{
- kgmnal_tx_t *t;
- PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t)));
- return t;
-}
-
-static void
-put_trans(kgmnal_tx_t *t)
-{
- PORTAL_FREE(t, sizeof(kgmnal_tx_t));
-}
-
-int
-kgmnal_ispeer (ptl_nid_t nid)
-{
- unsigned int gmnid = (unsigned int)nid;
- unsigned int nnids;
-
- gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
-
- return ((ptl_nid_t)gmnid == nid &&/* didn't lose high bits on conversion ? */
- gmnid < nnids); /* it's in this machine */
-}
/*
- * LIB functions follow
- *
+ * This file implements the nal cb functions
*/
-static int
-kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
- size_t len)
-{
- CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
- nal->ni.nid, (long)len, src_addr, dst_addr );
- memcpy( dst_addr, src_addr, len );
- return 0;
-}
-
-static int
-kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
- size_t len)
-{
- CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
- nal->ni.nid, (long)len, src_addr, dst_addr );
- memcpy( dst_addr, src_addr, len );
- return 0;
-}
-static void *
-kgmnal_malloc(nal_cb_t *nal, size_t len)
-{
- void *buf;
- PORTAL_ALLOC(buf, len);
- return buf;
-}
+#include "gmnal.h"
-static void
-kgmnal_free(nal_cb_t *nal, void *buf, size_t len)
+int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ unsigned int niov, struct iovec *iov, size_t mlen,
+ size_t rlen)
{
- PORTAL_FREE(buf, len);
+ gmnal_srxd_t *srxd = (gmnal_srxd_t*)private;
+ int status = PTL_OK;
+
+
+ CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p],
+ niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n",
+ nal_cb, private, cookie, niov, iov, mlen, rlen);
+
+ switch(srxd->type) {
+ case(GMNAL_SMALL_MESSAGE):
+ CDEBUG(D_INFO, "gmnal_cb_recv got small message\n");
+ status = gmnal_small_rx(nal_cb, private, cookie, niov,
+ iov, mlen, rlen);
+ break;
+ case(GMNAL_LARGE_MESSAGE_INIT):
+ CDEBUG(D_INFO, "gmnal_cb_recv got large message init\n");
+ status = gmnal_large_rx(nal_cb, private, cookie, niov,
+ iov, mlen, rlen);
+ }
+
+
+ CDEBUG(D_INFO, "gmnal_cb_recv gmnal_return status [%d]\n", status);
+ return(status);
}
-static void
-kgmnal_printf(nal_cb_t *nal, const char *fmt, ...)
+int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ unsigned int kniov, ptl_kiov_t *kiov, size_t mlen,
+ size_t rlen)
{
- va_list ap;
- char msg[256];
-
- if (portal_debug & D_NET) {
- va_start( ap, fmt );
- vsnprintf( msg, sizeof(msg), fmt, ap );
- va_end( ap );
-
- printk("CPUId: %d %s",smp_processor_id(), msg);
- }
+ gmnal_srxd_t *srxd = (gmnal_srxd_t*)private;
+ int status = PTL_OK;
+ struct iovec *iovec = NULL, *iovec_dup = NULL;
+ int i = 0;
+
+
+ CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p],
+ cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n",
+ nal_cb, private, cookie, kniov, kiov, mlen, rlen);
+
+ if (srxd->type == GMNAL_SMALL_MESSAGE) {
+ PORTAL_ALLOC(iovec, sizeof(struct iovec)*kniov);
+ if (!iovec) {
+ CDEBUG(D_ERROR, "Can't malloc\n");
+ return(GMNAL_STATUS_FAIL);
+ }
+ iovec_dup = iovec;
+
+ /*
+ * map each page and create an iovec for it
+ */
+ for (i=0; i<kniov; i++) {
+ CDEBUG(D_INFO, "processing kniov [%d] [%p]\n", i, kiov);
+ CDEBUG(D_INFO, "kniov page [%p] len [%d] offset[%d]\n",
+ kiov->kiov_page, kiov->kiov_len,
+ kiov->kiov_offset);
+ iovec->iov_len = kiov->kiov_len;
+ CDEBUG(D_INFO, "Calling kmap[%p]", kiov->kiov_page);
+
+ iovec->iov_base = kmap(kiov->kiov_page) +
+ kiov->kiov_offset;
+
+ CDEBUG(D_INFO, "iov_base is [%p]\n", iovec->iov_base);
+ iovec++;
+ kiov++;
+ }
+ CDEBUG(D_INFO, "calling gmnal_small_rx\n");
+ status = gmnal_small_rx(nal_cb, private, cookie, kniov,
+ iovec_dup, mlen, rlen);
+ PORTAL_FREE(iovec_dup, sizeof(struct iovec)*kniov);
+ }
+
+
+ CDEBUG(D_INFO, "gmnal_return status [%d]\n", status);
+ return(status);
}
-static void
-kgmnal_cli(nal_cb_t *nal, unsigned long *flags)
+int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int niov, struct iovec *iov, size_t len)
{
- kgmnal_data_t *data= nal->nal_data;
- spin_lock_irqsave(&data->kgm_dispatch_lock,*flags);
+ gmnal_data_t *nal_data;
+
+
+ CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] len["LPSZ"] nid["LPU64"]\n",
+ niov, len, nid);
+ nal_data = nal_cb->nal_data;
+
+ if (GMNAL_IS_SMALL_MESSAGE(nal_data, niov, iov, len)) {
+ CDEBUG(D_INFO, "This is a small message send\n");
+ gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, pid,
+ niov, iov, len);
+ } else {
+ CDEBUG(D_ERROR, "Large message send is not supported\n");
+ lib_finalize(nal_cb, private, cookie);
+ return(PTL_FAIL);
+ gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid,
+ niov, iov, len);
+ }
+ return(PTL_OK);
}
-
-static void
-kgmnal_sti(nal_cb_t *nal, unsigned long *flags)
+int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int kniov, ptl_kiov_t *kiov, size_t len)
{
- kgmnal_data_t *data= nal->nal_data;
-
- spin_unlock_irqrestore(&data->kgm_dispatch_lock,*flags);
-}
-
-static int
-kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
-{
- /* network distance doesn't mean much for this nal */
- if ( nal->ni.nid == nid ) {
- *dist = 0;
- } else {
- *dist = 1;
- }
-
- return 0;
+ int i = 0;
+ gmnal_data_t *nal_data;
+ struct iovec *iovec = NULL, *iovec_dup = NULL;
+
+ CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] len["LPSZ"]\n", nid, kniov, len);
+ nal_data = nal_cb->nal_data;
+ PORTAL_ALLOC(iovec, kniov*sizeof(struct iovec));
+ iovec_dup = iovec;
+ if (GMNAL_IS_SMALL_MESSAGE(nal_data, 0, NULL, len)) {
+ CDEBUG(D_INFO, "This is a small message send\n");
+
+ for (i=0; i<kniov; i++) {
+ CDEBUG(D_INFO, "processing kniov [%d] [%p]\n", i, kiov);
+ CDEBUG(D_INFO, "kniov page [%p] len [%d] offset[%d]\n",
+ kiov->kiov_page, kiov->kiov_len,
+ kiov->kiov_offset);
+
+ iovec->iov_base = kmap(kiov->kiov_page)
+ + kiov->kiov_offset;
+
+ iovec->iov_len = kiov->kiov_len;
+ iovec++;
+ kiov++;
+ }
+ gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid,
+ pid, kniov, iovec_dup, len);
+ } else {
+ CDEBUG(D_ERROR, "Large message send is not supported yet\n");
+ return(PTL_FAIL);
+ for (i=0; i<kniov; i++) {
+ CDEBUG(D_INFO, "processing kniov [%d] [%p]\n", i, kiov);
+ CDEBUG(D_INFO, "kniov page [%p] len [%d] offset[%d]\n",
+ kiov->kiov_page, kiov->kiov_len,
+ kiov->kiov_offset);
+
+ iovec->iov_base = kmap(kiov->kiov_page)
+ + kiov->kiov_offset;
+ iovec->iov_len = kiov->kiov_len;
+ iovec++;
+ kiov++;
+ }
+ gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid,
+ pid, kniov, iovec, len);
+ }
+ PORTAL_FREE(iovec_dup, kniov*sizeof(struct iovec));
+ return(PTL_OK);
}
-/* FIXME rmr: add rounting code here */
-static void
-kgmnal_tx_done(kgmnal_tx_t *trans, int error)
-{
- lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie);
-
- gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer);
-
- trans->ktx_buffer = NULL;
- trans->ktx_len = 0;
-
- put_trans(trans);
-}
-static char * gm_error_strings[GM_NUM_STATUS_CODES] = {
- [GM_SUCCESS] = "GM_SUCCESS",
- [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT",
- [GM_SEND_REJECTED] = "GM_SEND_REJECTED",
- [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED",
- [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE",
- [GM_SEND_DROPPED] = "GM_SEND_DROPPED",
- [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED",
-};
-
-inline char * get_error(int status)
+int gmnal_cb_read(nal_cb_t *nal_cb, void *private, void *dst,
+ user_ptr src, size_t len)
{
- if (gm_error_strings[status] != NULL)
- return gm_error_strings[status];
- else
- return "Unknown error";
+ gm_bcopy(src, dst, len);
+ return(PTL_OK);
}
-static void
-kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status)
+int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst,
+ void *src, size_t len)
{
- CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status);
+ gm_bcopy(src, dst, len);
+ return(PTL_OK);
}
-static void
-kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status)
+int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq,
+ ptl_event_t *ev)
{
- kgmnal_tx_t *ktx = (kgmnal_tx_t *)context;
- int err = 0;
-
- LASSERT (p != NULL);
- LASSERT (ktx != NULL);
-
- CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status,
- ktx->ktx_tgt_node, ktx->ktx_tgt_port_id);
-
- switch((int)status) {
- case GM_SUCCESS: /* normal */
- break;
- case GM_SEND_TIMED_OUT: /* application error */
- case GM_SEND_REJECTED: /* size of msg unacceptable */
- case GM_SEND_TARGET_PORT_CLOSED:
- CERROR("%s (%d):\n", get_error(status), status);
- gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority,
- ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
- kgmnal_errhandler, NULL);
- err = -EIO;
- break;
- case GM_SEND_TARGET_NODE_UNREACHABLE:
- case GM_SEND_PORT_CLOSED:
- CERROR("%s (%d):\n", get_error(status), status);
- gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority,
- ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
- kgmnal_errhandler, NULL);
- err = -EIO;
- break;
- case GM_SEND_DROPPED:
- CERROR("%s (%d):\n", get_error(status), status);
- err = -EIO;
- break;
- default:
- CERROR("Unknown status: %d\n", status);
- err = -EIO;
- break;
- }
-
- kgmnal_tx_done(ktx, err);
-}
-
-/*
- */
-static int
-kgmnal_send(nal_cb_t *nal,
- void *private,
- lib_msg_t *cookie,
- ptl_hdr_t *hdr,
- int type,
- ptl_nid_t nid,
- ptl_pid_t pid,
- int options,
- unsigned int niov,
- lib_md_iov_t *iov,
- size_t len)
-{
- /*
- * ipnal assumes that this is the private as passed to lib_dispatch..
- * so do we :/
- */
- kgmnal_tx_t *ktx=NULL;
- int rc=0;
- void * buf;
- int buf_len = sizeof(ptl_hdr_t) + len;
- int buf_size = 0;
-
- LASSERT ((options & PTL_MD_KIOV) == 0);
-
- PROF_START(gmnal_send);
-
-
- CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n",
- len, iov, nid, KGM_PORT_NUM);
-
- /* ensure there is an available tx handle */
-
- /* save transaction info to trans for later finalize and cleanup */
- ktx = get_trans();
- if (ktx == NULL) {
- rc = -ENOMEM;
- goto send_exit;
- }
-
- /* hmmm... GM doesn't support vectored write, so need to allocate buffer to coalesce
- header and data.
- Also, memory must be dma'able or registered with GM. */
-
- if (buf_len <= MSG_LEN_SMALL) {
- buf_size = MSG_SIZE_SMALL;
- } else if (buf_len <= MSG_LEN_LARGE) {
- buf_size = MSG_SIZE_LARGE;
- } else {
- printk("kgmnal:request exceeds TX MTU size (%d).\n",
- MSG_SIZE_LARGE);
- rc = -1;
- goto send_exit;
- }
-
- buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto send_exit;
- }
- memcpy(buf, hdr, sizeof(ptl_hdr_t));
-
- if (len != 0)
- lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t),
- options, niov, iov, len);
-
- ktx->ktx_nal = nal;
- ktx->ktx_private = private;
- ktx->ktx_cookie = cookie;
- ktx->ktx_len = buf_len;
- ktx->ktx_size = buf_size;
- ktx->ktx_buffer = buf;
- ktx->ktx_priority = GM_LOW_PRIORITY;
- ktx->ktx_tgt_node = nid;
- ktx->ktx_tgt_port_id = KGM_PORT_NUM;
-
- CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx "
- "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM,
- GM_LOW_PRIORITY);
-
- gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size,
- buf_len, GM_LOW_PRIORITY,
- nid, KGM_PORT_NUM,
- kgmnal_txhandler, ktx);
-
- PROF_FINISH(gmnal_send);
- send_exit:
- return rc;
-}
-void
-kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
-{
- CERROR ("forwarding not implemented\n");
+ if (eq->event_callback != NULL) {
+ CDEBUG(D_INFO, "found callback\n");
+ eq->event_callback(ev);
+ }
+
+ return(PTL_OK);
}
-void
-kqswnal_fwd_callback (void *arg, int error)
+void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len)
{
- CERROR ("forwarding not implemented\n");
+ void *ptr = NULL;
+ CDEBUG(D_TRACE, "gmnal_cb_malloc len["LPSZ"]\n", len);
+ PORTAL_ALLOC(ptr, len);
+ return(ptr);
}
-
-static inline void
-kgmnal_requeue_rx(kgmnal_rx_t *krx)
+void gmnal_cb_free(nal_cb_t *nal_cb, void *buf, size_t len)
{
- gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer,
- krx->krx_size, krx->krx_priority);
+ CDEBUG(D_TRACE, "gmnal_cb_free :: buf[%p] len["LPSZ"]\n", buf, len);
+ PORTAL_FREE(buf, len);
+ return;
}
-/* Process a received portals packet */
-
-/* Receive Interrupt Handler */
-static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size,
- void * buf, unsigned int pri)
+void gmnal_cb_unmap(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov,
+ void **addrkey)
{
- ptl_hdr_t *hdr = buf;
- kgmnal_rx_t krx;
-
- CDEBUG(D_NET,"buf %p, len %ld\n", buf, len);
-
- if ( len < sizeof( ptl_hdr_t ) ) {
- /* XXX what's this for? */
- if (kgm->kgm_shuttingdown)
- return;
- CERROR("kgmnal: did not receive complete portal header, "
- "len= %ld", len);
- gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri);
- return;
- }
-
- /* might want to use seperate threads to handle receive */
- krx.krx_buffer = buf;
- krx.krx_len = len;
- krx.krx_size = size;
- krx.krx_priority = pri;
-
- if ( hdr->dest_nid == kgmnal_lib.ni.nid ) {
- PROF_START(lib_parse);
- lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx);
- PROF_FINISH(lib_parse);
- } else if (kgmnal_ispeer(hdr->dest_nid)) {
- /* should have gone direct to peer */
- CERROR("dropping packet from 0x%llx to 0x%llx: target is "
- "a peer", hdr->src_nid, hdr->dest_nid);
- kgmnal_requeue_rx(&krx);
- } else {
- /* forward to gateway */
- CERROR("forwarding not implemented yet");
- kgmnal_requeue_rx(&krx);
- }
-
- return;
+ return;
}
-
-static int kgmnal_recv(nal_cb_t *nal,
- void *private,
- lib_msg_t *cookie,
- int options,
- unsigned int niov,
- lib_md_iov_t *iov,
- size_t mlen,
- size_t rlen)
+int gmnal_cb_map(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov,
+ void**addrkey)
{
- kgmnal_rx_t *krx = private;
-
- LASSERT ((options & PTL_MD_KIOV) == 0);
-
- CDEBUG(D_NET,"mlen=%d, rlen=%d\n", mlen, rlen);
-
- /* What was actually received must be >= what sender claims to
- * have sent. This is an LASSERT, since lib-move doesn't
- * check cb return code yet. */
- LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
- LASSERT (mlen <= rlen);
-
- PROF_START(gmnal_recv);
-
- if(mlen != 0) {
- PROF_START(memcpy);
- lib_copy_buf2iov (options, niov, iov,
- krx->krx_buffer + sizeof (ptl_hdr_t), mlen);
- PROF_FINISH(memcpy);
- }
-
- PROF_START(lib_finalize);
- lib_finalize(nal, private, cookie);
- PROF_FINISH(lib_finalize);
-
- kgmnal_requeue_rx(krx);
-
- PROF_FINISH(gmnal_recv);
-
- return rlen;
+ return(PTL_OK);
}
-
-static void kgmnal_shutdown(void * none)
+void gmnal_cb_printf(nal_cb_t *nal_cb, const char *fmt, ...)
{
-	CERROR("called\n");
-	return;
+	va_list	ap;
+	char	buf[256];
+
+	CDEBUG(D_TRACE, "gmnal_cb_printf\n");
+	/* Format with the variadic arguments before printing.  Passing
+	 * fmt straight to printk() would drop the arguments and treat
+	 * any '%' conversion in fmt as having no matching argument. */
+	va_start(ap, fmt);
+	vsnprintf(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+	printk("%s", buf);
+	return;
}
-/*
- * Set terminate and use alarm to wake up the recv thread.
- */
-static void recv_shutdown(kgmnal_data_t *kgm)
+void gmnal_cb_cli(nal_cb_t *nal_cb, unsigned long *flags)
{
- gm_alarm_t alarm;
+ gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data;
- kgm->kgm_shuttingdown = 1;
- gm_initialize_alarm(&alarm);
- gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL);
+ spin_lock_irqsave(&nal_data->cb_lock, *flags);
+ return;
}
-int kgmnal_end(kgmnal_data_t *kgm)
+void gmnal_cb_sti(nal_cb_t *nal_cb, unsigned long *flags)
{
+ gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data;
- /* wait for sends to finish ? */
- /* remove receive buffers */
- /* shutdown receive thread */
-
- recv_shutdown(kgm);
-
- return 0;
+ spin_unlock_irqrestore(&nal_data->cb_lock, *flags);
+ return;
}
-/* Used only for the spinner */
-int kgmnal_recv_thread(void *arg)
+int gmnal_cb_dist(nal_cb_t *nal_cb, ptl_nid_t nid, unsigned long *dist)
{
- kgmnal_data_t *kgm = arg;
-
- LASSERT(kgm != NULL);
-
- kportal_daemonize("kgmnal_rx");
-
- while(1) {
- gm_recv_event_t *e;
- int priority = GM_LOW_PRIORITY;
- if (kgm->kgm_shuttingdown)
- break;
-
- e = gm_blocking_receive_no_spin(kgm->kgm_port);
- if (e == NULL) {
- CERROR("gm_blocking_receive returned NULL\n");
- break;
- }
-
- switch(gm_ntohc(e->recv.type)) {
- case GM_HIGH_RECV_EVENT:
- priority = GM_HIGH_PRIORITY;
- /* fall through */
- case GM_RECV_EVENT:
- kgmnal_rx(kgm, gm_ntohl(e->recv.length),
- gm_ntohc(e->recv.size),
- gm_ntohp(e->recv.buffer), priority);
- break;
- case GM_ALARM_EVENT:
- CERROR("received alarm");
- gm_unknown(kgm->kgm_port, e);
- break;
- case GM_BAD_SEND_DETECTED_EVENT: /* ?? */
- CERROR("received bad send!\n");
- break;
- default:
- gm_unknown(kgm->kgm_port, e);
- }
- }
-
- CERROR("shuttting down.\n");
- return 0;
+ CDEBUG(D_TRACE, "gmnal_cb_dist\n");
+ if (dist)
+ *dist = 27;
+ return(PTL_OK);
}
-
-nal_cb_t kgmnal_lib = {
- nal_data: &kgmnal_data, /* NAL private data */
- cb_send: kgmnal_send,
- cb_recv: kgmnal_recv,
- cb_read: kgmnal_read,
- cb_write: kgmnal_write,
- cb_malloc: kgmnal_malloc,
- cb_free: kgmnal_free,
- cb_printf: kgmnal_printf,
- cb_cli: kgmnal_cli,
- cb_sti: kgmnal_sti,
- cb_dist: kgmnal_dist
-};
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2003 Los Alamos National Laboratory (LANL)
+ *
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * This file contains all gmnal send and receive functions
+ */
+
+#include "gmnal.h"
+
+/*
+ * The caretaker thread
+ * This is main thread of execution for the NAL side
+ * This guy waits in gm_blocking_recvive and gets
+ * woken up when the myrinet adaptor gets an interrupt.
+ * Hands off receive operations to the receive thread
+ * This thread Looks after gm_callbacks etc inline.
+ */
+int
+gmnal_ct_thread(void *arg)
+{
+ gmnal_data_t *nal_data;
+ gm_recv_event_t *rxevent = NULL;
+ gm_recv_t *recv = NULL;
+
+ if (!arg) {
+ CDEBUG(D_TRACE, "NO nal_data. Exiting\n");
+ return(-1);
+ }
+
+ nal_data = (gmnal_data_t*)arg;
+ CDEBUG(D_TRACE, "nal_data is [%p]\n", arg);
+
+ daemonize();
+
+ nal_data->ctthread_flag = GMNAL_CTTHREAD_STARTED;
+
+ GMNAL_GM_LOCK(nal_data);
+ while(nal_data->ctthread_flag == GMNAL_CTTHREAD_STARTED) {
+ CDEBUG(D_NET, "waiting\n");
+ rxevent = gm_blocking_receive_no_spin(nal_data->gm_port);
+ if (nal_data->ctthread_flag == GMNAL_THREAD_STOP) {
+ CDEBUG(D_INFO, "time to exit\n");
+ break;
+ }
+ CDEBUG(D_INFO, "got [%s]\n", gmnal_rxevent(rxevent));
+ switch (GM_RECV_EVENT_TYPE(rxevent)) {
+
+ case(GM_RECV_EVENT):
+ CDEBUG(D_NET, "CTTHREAD:: GM_RECV_EVENT\n");
+ recv = (gm_recv_t*)&rxevent->recv;
+ GMNAL_GM_UNLOCK(nal_data);
+ gmnal_add_rxtwe(nal_data, recv);
+ GMNAL_GM_LOCK(nal_data);
+ CDEBUG(D_NET, "CTTHREAD:: Added event to Q\n");
+ break;
+ case(_GM_SLEEP_EVENT):
+ /*
+ * Blocking receive above just returns
+ * immediatly with _GM_SLEEP_EVENT
+ * Don't know what this is
+ */
+ CDEBUG(D_NET, "Sleeping in gm_unknown\n");
+ GMNAL_GM_UNLOCK(nal_data);
+ gm_unknown(nal_data->gm_port, rxevent);
+ GMNAL_GM_LOCK(nal_data);
+ CDEBUG(D_INFO, "Awake from gm_unknown\n");
+ break;
+
+ default:
+ /*
+ * Don't know what this is
+ * gm_unknown will make sense of it
+ * Should be able to do something with
+ * FAST_RECV_EVENTS here.
+ */
+ CDEBUG(D_NET, "Passing event to gm_unknown\n");
+ GMNAL_GM_UNLOCK(nal_data);
+ gm_unknown(nal_data->gm_port, rxevent);
+ GMNAL_GM_LOCK(nal_data);
+ CDEBUG(D_INFO, "Processed unknown event\n");
+ }
+ }
+ GMNAL_GM_UNLOCK(nal_data);
+ nal_data->ctthread_flag = GMNAL_THREAD_RESET;
+ CDEBUG(D_INFO, "thread nal_data [%p] is exiting\n", nal_data);
+ return(GMNAL_STATUS_OK);
+}
+
+
+/*
+ * process a receive event
+ */
+int gmnal_rx_thread(void *arg)
+{
+ gmnal_data_t *nal_data;
+ void *buffer;
+ gmnal_rxtwe_t *we = NULL;
+
+ if (!arg) {
+ CDEBUG(D_TRACE, "NO nal_data. Exiting\n");
+ return(-1);
+ }
+
+ nal_data = (gmnal_data_t*)arg;
+ CDEBUG(D_TRACE, "nal_data is [%p]\n", arg);
+
+ daemonize();
+ /*
+ * set 1 bit for each thread started
+ * doesn't matter which bit
+ */
+ spin_lock(&nal_data->rxthread_flag_lock);
+ if (nal_data->rxthread_flag)
+ nal_data->rxthread_flag=nal_data->rxthread_flag*2 + 1;
+ else
+ nal_data->rxthread_flag = 1;
+ CDEBUG(D_INFO, "rxthread flag is [%ld]\n", nal_data->rxthread_flag);
+ spin_unlock(&nal_data->rxthread_flag_lock);
+
+ while(nal_data->rxthread_stop_flag != GMNAL_THREAD_STOP) {
+ CDEBUG(D_NET, "RXTHREAD:: Receive thread waiting\n");
+ we = gmnal_get_rxtwe(nal_data);
+ if (!we) {
+ CDEBUG(D_INFO, "Receive thread time to exit\n");
+ break;
+ }
+
+ buffer = we->buffer;
+ switch(((gmnal_msghdr_t*)buffer)->type) {
+ case(GMNAL_SMALL_MESSAGE):
+ gmnal_pre_receive(nal_data, we,
+ GMNAL_SMALL_MESSAGE);
+ break;
+ case(GMNAL_LARGE_MESSAGE_INIT):
+ gmnal_pre_receive(nal_data, we,
+ GMNAL_LARGE_MESSAGE_INIT);
+ break;
+ case(GMNAL_LARGE_MESSAGE_ACK):
+ gmnal_pre_receive(nal_data, we,
+ GMNAL_LARGE_MESSAGE_ACK);
+ break;
+ default:
+ CDEBUG(D_ERROR, "Unsupported message type\n");
+ gmnal_rx_bad(nal_data, we, NULL);
+ }
+ PORTAL_FREE(we, sizeof(gmnal_rxtwe_t));
+ }
+
+ spin_lock(&nal_data->rxthread_flag_lock);
+ nal_data->rxthread_flag/=2;
+ CDEBUG(D_INFO, "rxthread flag is [%ld]\n", nal_data->rxthread_flag);
+ spin_unlock(&nal_data->rxthread_flag_lock);
+ CDEBUG(D_INFO, "thread nal_data [%p] is exiting\n", nal_data);
+ return(GMNAL_STATUS_OK);
+}
+
+
+
+/*
+ * Start processing a small message receive
+ * Get here from gmnal_receive_thread
+ * Hand off to lib_parse, which calls cb_recv
+ * which hands back to gmnal_small_receive
+ * Deal with all endian stuff here.
+ */
+int
+gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type)
+{
+	gmnal_srxd_t	*srxd = NULL;
+	void	*buffer = NULL;
+	unsigned int snode, sport, type, length;
+	gmnal_msghdr_t	*gmnal_msghdr;
+	ptl_hdr_t	*portals_hdr;
+
+	CDEBUG(D_INFO, "nal_data [%p], we[%p] type [%d]\n",
+	       nal_data, we, gmnal_type);
+
+	buffer = we->buffer;
+	snode = we->snode;
+	sport = we->sport;
+	type = we->type;
+	length = we->length;
+
+	gmnal_msghdr = (gmnal_msghdr_t*)buffer;
+	portals_hdr = (ptl_hdr_t*)(buffer+GMNAL_MSGHDR_SIZE);
+
+	CDEBUG(D_INFO, "rx_event:: Sender node [%d], Sender Port [%d], "
+	       "type [%d], length [%d], buffer [%p]\n",
+	       snode, sport, type, length, buffer);
+	CDEBUG(D_INFO, "gmnal_msghdr:: Sender node [%u], magic [%d], "
+	       "gmnal_type [%d]\n", gmnal_msghdr->sender_node_id,
+	       gmnal_msghdr->magic, gmnal_msghdr->type);
+	CDEBUG(D_INFO, "portals_hdr:: Sender node ["LPD64"], "
+	       "dest_node ["LPD64"]\n", portals_hdr->src_nid,
+	       portals_hdr->dest_nid);
+
+
+	/*
+	 * Get a receive descriptor for this message
+	 */
+	srxd = gmnal_rxbuffer_to_srxd(nal_data, buffer);
+	CDEBUG(D_INFO, "Back from gmnal_rxbuffer_to_srxd\n");
+	if (!srxd) {
+		/* Check for NULL BEFORE touching the descriptor.
+		 * cb_recv copes with a NULL private pointer, so still
+		 * hand the header to the library for accounting. */
+		CDEBUG(D_ERROR, "Failed to get receive descriptor\n");
+		lib_parse(nal_data->nal_cb, portals_hdr, srxd);
+		return(GMNAL_STATUS_FAIL);
+	}
+	srxd->nal_data = nal_data;
+
+	/*
+	 * no need to bother portals library with this
+	 */
+	if (gmnal_type == GMNAL_LARGE_MESSAGE_ACK) {
+		gmnal_large_tx_ack_received(nal_data, srxd);
+		return(GMNAL_STATUS_OK);
+	}
+
+	srxd->type = gmnal_type;
+	srxd->nsiov = gmnal_msghdr->niov;
+	srxd->gm_source_node = gmnal_msghdr->sender_node_id;
+
+	CDEBUG(D_PORTALS, "Calling lib_parse buffer is [%p]\n",
+	       buffer+GMNAL_MSGHDR_SIZE);
+	/*
+	 * control passes to lib, which calls cb_recv
+	 * cb_recv is responsible for returning the buffer
+	 * for future receive
+	 */
+	lib_parse(nal_data->nal_cb, portals_hdr, srxd);
+
+	return(GMNAL_STATUS_OK);
+}
+
+
+
+/*
+ * After a receive has been processed,
+ * hang out the receive buffer again.
+ * This implicitly returns a receive token.
+ */
+int
+gmnal_rx_requeue_buffer(gmnal_data_t *nal_data, gmnal_srxd_t *srxd)
+{
+ CDEBUG(D_TRACE, "gmnal_rx_requeue_buffer\n");
+
+ CDEBUG(D_NET, "requeueing srxd[%p] nal_data[%p]\n", srxd, nal_data);
+
+ GMNAL_GM_LOCK(nal_data);
+ gm_provide_receive_buffer_with_tag(nal_data->gm_port, srxd->buffer,
+ srxd->gmsize, GM_LOW_PRIORITY, 0 );
+ GMNAL_GM_UNLOCK(nal_data);
+
+ return(GMNAL_STATUS_OK);
+}
+
+
+/*
+ * Handle a bad message
+ * A bad message is one we don't expect or can't interpret
+ */
+int
+gmnal_rx_bad(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, gmnal_srxd_t *srxd)
+{
+ CDEBUG(D_TRACE, "Can't handle message\n");
+
+ if (!srxd)
+ srxd = gmnal_rxbuffer_to_srxd(nal_data,
+ we->buffer);
+ if (srxd) {
+ gmnal_rx_requeue_buffer(nal_data, srxd);
+ } else {
+ CDEBUG(D_ERROR, "Can't find a descriptor for this buffer\n");
+ /*
+ * get rid of it ?
+ */
+ return(GMNAL_STATUS_FAIL);
+ }
+
+ return(GMNAL_STATUS_OK);
+}
+
+
+
+/*
+ * Process a small message receive.
+ * Get here from gmnal_receive_thread, gmnal_pre_receive
+ * lib_parse, cb_recv
+ * Put data from prewired receive buffer into users buffer(s)
+ * Hang out the receive buffer again for another receive
+ * Call lib_finalize
+ */
+int
+gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+{
+ gmnal_srxd_t *srxd = NULL;
+ void *buffer = NULL;
+ gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data;
+
+
+ CDEBUG(D_TRACE, "niov [%d] mlen["LPSZ"]\n", niov, mlen);
+
+ if (!private) {
+ CDEBUG(D_ERROR, "gmnal_small_rx no context\n");
+ lib_finalize(nal_cb, private, cookie);
+ return(PTL_FAIL);
+ }
+
+ srxd = (gmnal_srxd_t*)private;
+ buffer = srxd->buffer;
+ buffer += sizeof(gmnal_msghdr_t);
+ buffer += sizeof(ptl_hdr_t);
+
+ while(niov--) {
+ CDEBUG(D_INFO, "processing [%p] len ["LPSZ"]\n", iov,
+ iov->iov_len);
+ gm_bcopy(buffer, iov->iov_base, iov->iov_len);
+ buffer += iov->iov_len;
+ iov++;
+ }
+
+
+ /*
+ * let portals library know receive is complete
+ */
+ CDEBUG(D_PORTALS, "calling lib_finalize\n");
+ if (lib_finalize(nal_cb, private, cookie) != PTL_OK) {
+ /* TO DO what to do with failed lib_finalise? */
+ CDEBUG(D_INFO, "lib_finalize failed\n");
+ }
+ /*
+ * return buffer so it can be used again
+ */
+ CDEBUG(D_NET, "calling gm_provide_receive_buffer\n");
+ GMNAL_GM_LOCK(nal_data);
+ gm_provide_receive_buffer_with_tag(nal_data->gm_port, srxd->buffer,
+ srxd->gmsize, GM_LOW_PRIORITY, 0);
+ GMNAL_GM_UNLOCK(nal_data);
+
+ return(PTL_OK);
+}
+
+
+/*
+ * Start a small transmit.
+ * Get a send token (and wired transmit buffer).
+ * Copy data from senders buffer to wired buffer and
+ * initiate gm_send from the wired buffer.
+ * The callback function informs when the send is complete.
+ */
+int
+gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid,
+ unsigned int niov, struct iovec *iov, int size)
+{
+ gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data;
+ gmnal_stxd_t *stxd = NULL;
+ void *buffer = NULL;
+ gmnal_msghdr_t *msghdr = NULL;
+ int tot_size = 0;
+ unsigned int local_nid;
+ gm_status_t gm_status = GM_SUCCESS;
+
+ CDEBUG(D_TRACE, "gmnal_small_tx nal_cb [%p] private [%p] cookie [%p]
+ hdr [%p] type [%d] global_nid ["LPU64"] pid [%d] niov [%d]
+ iov [%p] size [%d]\n", nal_cb, private, cookie, hdr, type,
+ global_nid, pid, niov, iov, size);
+
+ CDEBUG(D_INFO, "portals_hdr:: dest_nid ["LPU64"], src_nid ["LPU64"]\n",
+ hdr->dest_nid, hdr->src_nid);
+
+ if (!nal_data) {
+ CDEBUG(D_ERROR, "no nal_data\n");
+ return(GMNAL_STATUS_FAIL);
+ } else {
+ CDEBUG(D_INFO, "nal_data [%p]\n", nal_data);
+ }
+
+ GMNAL_GM_LOCK(nal_data);
+ gm_status = gm_global_id_to_node_id(nal_data->gm_port, global_nid,
+ &local_nid);
+ GMNAL_GM_UNLOCK(nal_data);
+ if (gm_status != GM_SUCCESS) {
+ CDEBUG(D_ERROR, "Failed to obtain local id\n");
+ return(GMNAL_STATUS_FAIL);
+ }
+ CDEBUG(D_INFO, "Local Node_id is [%u][%x]\n", local_nid, local_nid);
+
+ stxd = gmnal_get_stxd(nal_data, 1);
+ CDEBUG(D_INFO, "stxd [%p]\n", stxd);
+
+ stxd->type = GMNAL_SMALL_MESSAGE;
+ stxd->cookie = cookie;
+
+ /*
+ * Copy gmnal_msg_hdr and portals header to the transmit buffer
+ * Then copy the data in
+ */
+ buffer = stxd->buffer;
+ msghdr = (gmnal_msghdr_t*)buffer;
+
+ msghdr->magic = GMNAL_MAGIC;
+ msghdr->type = GMNAL_SMALL_MESSAGE;
+ msghdr->sender_node_id = nal_data->gm_global_nid;
+ CDEBUG(D_INFO, "processing msghdr at [%p]\n", buffer);
+
+ buffer += sizeof(gmnal_msghdr_t);
+
+ CDEBUG(D_INFO, "processing portals hdr at [%p]\n", buffer);
+ gm_bcopy(hdr, buffer, sizeof(ptl_hdr_t));
+
+ buffer += sizeof(ptl_hdr_t);
+
+ while(niov--) {
+ CDEBUG(D_INFO, "processing iov [%p] len ["LPSZ"] to [%p]\n",
+ iov, iov->iov_len, buffer);
+ gm_bcopy(iov->iov_base, buffer, iov->iov_len);
+ buffer+= iov->iov_len;
+ iov++;
+ }
+
+ CDEBUG(D_INFO, "sending\n");
+ tot_size = size+sizeof(ptl_hdr_t)+sizeof(gmnal_msghdr_t);
+ stxd->msg_size = tot_size;
+
+
+ CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p]
+ gmsize [%lu] msize [%d] global_nid ["LPU64"] local_nid[%d]
+ stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size,
+ stxd->msg_size, global_nid, local_nid, stxd);
+
+ GMNAL_GM_LOCK(nal_data);
+ stxd->gm_priority = GM_LOW_PRIORITY;
+ stxd->gm_target_node = local_nid;
+ gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer,
+ stxd->gm_size, stxd->msg_size,
+ GM_LOW_PRIORITY, local_nid,
+ gmnal_small_tx_callback, (void*)stxd);
+ GMNAL_GM_UNLOCK(nal_data);
+ CDEBUG(D_INFO, "done\n");
+
+ return(PTL_OK);
+}
+
+
+/*
+ * A callback to indicate the small transmit operation is complete
+ * Check for errors and try to deal with them.
+ * Call lib_finalize to inform the client application that the send
+ * is complete and the memory can be reused.
+ * Return the stxd when finished with it (returns a send token)
+ */
+void
+gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status)
+{
+	gmnal_stxd_t	*stxd = (gmnal_stxd_t*)context;
+	lib_msg_t	*cookie;
+	gmnal_data_t	*nal_data;
+	nal_cb_t	*nal_cb;
+
+	/* NULL-check the context before dereferencing it */
+	if (!stxd) {
+		CDEBUG(D_TRACE, "send completion event for unknown stxd\n");
+		return;
+	}
+	cookie = stxd->cookie;
+	nal_data = (gmnal_data_t*)stxd->nal_data;
+	nal_cb = nal_data->nal_cb;
+
+	if (status != GM_SUCCESS) {
+		CDEBUG(D_ERROR, "Result of send stxd [%p] is [%s]\n",
+		       stxd, gmnal_gm_error(status));
+	}
+
+	switch(status) {
+		case(GM_SUCCESS):
+		break;
+
+
+
+		case(GM_SEND_DROPPED):
+		/*
+		 * do a resend on the dropped ones
+		 */
+		CDEBUG(D_ERROR, "send stxd [%p] was dropped, "
+		       "resending\n", context);
+		GMNAL_GM_LOCK(nal_data);
+		gm_send_to_peer_with_callback(nal_data->gm_port,
+					      stxd->buffer,
+					      stxd->gm_size,
+					      stxd->msg_size,
+					      stxd->gm_priority,
+					      stxd->gm_target_node,
+					      gmnal_small_tx_callback,
+					      context);
+		GMNAL_GM_UNLOCK(nal_data);
+
+		return;
+		case(GM_TIMED_OUT):
+		case(GM_SEND_TIMED_OUT):
+		/*
+		 * drop these ones
+		 */
+		CDEBUG(D_INFO, "calling gm_drop_sends\n");
+		GMNAL_GM_LOCK(nal_data);
+		gm_drop_sends(nal_data->gm_port, stxd->gm_priority,
+			      stxd->gm_target_node, GMNAL_GM_PORT,
+			      gmnal_drop_sends_callback, context);
+		GMNAL_GM_UNLOCK(nal_data);
+
+		return;
+
+
+		/*
+		 * abort on these ?
+		 */
+  		case(GM_TRY_AGAIN):
+  		case(GM_INTERRUPTED):
+  		case(GM_FAILURE):
+  		case(GM_INPUT_BUFFER_TOO_SMALL):
+  		case(GM_OUTPUT_BUFFER_TOO_SMALL):
+  		case(GM_BUSY):
+  		case(GM_MEMORY_FAULT):
+  		case(GM_INVALID_PARAMETER):
+  		case(GM_OUT_OF_MEMORY):
+  		case(GM_INVALID_COMMAND):
+  		case(GM_PERMISSION_DENIED):
+  		case(GM_INTERNAL_ERROR):
+  		case(GM_UNATTACHED):
+  		case(GM_UNSUPPORTED_DEVICE):
+  		case(GM_SEND_REJECTED):
+  		case(GM_SEND_TARGET_PORT_CLOSED):
+  		case(GM_SEND_TARGET_NODE_UNREACHABLE):
+  		case(GM_SEND_PORT_CLOSED):
+  		case(GM_NODE_ID_NOT_YET_SET):
+  		case(GM_STILL_SHUTTING_DOWN):
+  		case(GM_CLONE_BUSY):
+  		case(GM_NO_SUCH_DEVICE):
+  		case(GM_ABORTED):
+  		case(GM_INCOMPATIBLE_LIB_AND_DRIVER):
+  		case(GM_UNTRANSLATED_SYSTEM_ERROR):
+  		case(GM_ACCESS_DENIED):
+  		case(GM_NO_DRIVER_SUPPORT):
+  		case(GM_PTE_REF_CNT_OVERFLOW):
+  		case(GM_NOT_SUPPORTED_IN_KERNEL):
+  		case(GM_NOT_SUPPORTED_ON_ARCH):
+  		case(GM_NO_MATCH):
+  		case(GM_USER_ERROR):
+  		case(GM_DATA_CORRUPTED):
+  		case(GM_HARDWARE_FAULT):
+  		case(GM_SEND_ORPHANED):
+  		case(GM_MINOR_OVERFLOW):
+  		case(GM_PAGE_TABLE_FULL):
+  		case(GM_UC_ERROR):
+  		case(GM_INVALID_PORT_NUMBER):
+  		case(GM_DEV_NOT_FOUND):
+  		case(GM_FIRMWARE_NOT_RUNNING):
+  		case(GM_YP_NO_MATCH):
+		default:
+		CDEBUG(D_ERROR, "Unknown send error\n");
+	}
+
+	/*
+	 * TO DO
+	 * If this is a large message init,
+	 * we're not finished with the data yet,
+	 * so can't call lib_finalise.
+	 * However, we're also holding on to a
+	 * stxd here (to keep track of the source
+	 * iovec only). Should use another structure
+	 * to keep track of iovec and return stxd to
+	 * free list earlier.
+	 */
+	if (stxd->type == GMNAL_LARGE_MESSAGE_INIT) {
+		CDEBUG(D_INFO, "large transmit done\n");
+		return;
+	}
+	gmnal_return_stxd(nal_data, stxd);
+	if (lib_finalize(nal_cb, stxd, cookie) != PTL_OK) {
+		CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n",
+		       stxd);
+	}
+	return;
+}
+
+
+
+void gmnal_drop_sends_callback(struct gm_port *gm_port, void *context,
+			       gm_status_t status)
+{
+	gmnal_stxd_t	*stxd = (gmnal_stxd_t*)context;
+	gmnal_data_t	*nal_data = stxd->nal_data;
+
+	CDEBUG(D_TRACE, "status is [%d] context is [%p]\n", status, context);
+	if (status == GM_SUCCESS) {
+		GMNAL_GM_LOCK(nal_data);
+		gm_send_to_peer_with_callback(gm_port, stxd->buffer,
+					      stxd->gm_size, stxd->msg_size,
+					      stxd->gm_priority,
+					      stxd->gm_target_node,
+					      gmnal_small_tx_callback,
+					      context);
+		/* was GMNAL_GM_LOCK: taking the lock twice here would
+		 * self-deadlock; release it after the resend */
+		GMNAL_GM_UNLOCK(nal_data);
+	} else {
+		CDEBUG(D_ERROR, "send_to_peer status for stxd [%p] is "
+		       "[%d][%s]\n", stxd, status, gmnal_gm_error(status));
+	}
+
+
+	return;
+}
+
+
+/*
+ * Begine a large transmit.
+ * Do a gm_register of the memory pointed to by the iovec
+ * and send details to the receiver. The receiver does a gm_get
+ * to pull the data and sends and ack when finished. Upon receipt of
+ * this ack, deregister the memory. Only 1 send token is required here.
+ */
+int
+gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid,
+ unsigned int niov, struct iovec *iov, int size)
+{
+
+ gmnal_data_t *nal_data;
+ gmnal_stxd_t *stxd = NULL;
+ void *buffer = NULL;
+ gmnal_msghdr_t *msghdr = NULL;
+ unsigned int local_nid;
+ int mlen = 0; /* the size of the init message data */
+ struct iovec *iov_dup = NULL;
+ gm_status_t gm_status;
+ int niov_dup;
+
+
+ CDEBUG(D_TRACE, "gmnal_large_tx nal_cb [%p] private [%p], cookie [%p]
+ hdr [%p], type [%d] global_nid ["LPU64"], pid [%d], niov [%d],
+ iov [%p], size [%d]\n", nal_cb, private, cookie, hdr, type,
+ global_nid, pid, niov, iov, size);
+
+ if (nal_cb)
+ nal_data = (gmnal_data_t*)nal_cb->nal_data;
+ else {
+ CDEBUG(D_ERROR, "no nal_cb.\n");
+ return(GMNAL_STATUS_FAIL);
+ }
+
+
+ /*
+ * Get stxd and buffer. Put local address of data in buffer,
+ * send local addresses to target,
+ * wait for the target node to suck the data over.
+ * The stxd is used to ren
+ */
+ stxd = gmnal_get_stxd(nal_data, 1);
+ CDEBUG(D_INFO, "stxd [%p]\n", stxd);
+
+ stxd->type = GMNAL_LARGE_MESSAGE_INIT;
+ stxd->cookie = cookie;
+
+ /*
+ * Copy gmnal_msg_hdr and portals header to the transmit buffer
+ * Then copy the iov in
+ */
+ buffer = stxd->buffer;
+ msghdr = (gmnal_msghdr_t*)buffer;
+
+ CDEBUG(D_INFO, "processing msghdr at [%p]\n", buffer);
+
+ msghdr->magic = GMNAL_MAGIC;
+ msghdr->type = GMNAL_LARGE_MESSAGE_INIT;
+ msghdr->sender_node_id = nal_data->gm_global_nid;
+ msghdr->stxd = stxd;
+ msghdr->niov = niov ;
+ buffer += sizeof(gmnal_msghdr_t);
+ mlen = sizeof(gmnal_msghdr_t);
+ CDEBUG(D_INFO, "mlen is [%d]\n", mlen);
+
+
+ CDEBUG(D_INFO, "processing portals hdr at [%p]\n", buffer);
+
+ gm_bcopy(hdr, buffer, sizeof(ptl_hdr_t));
+ buffer += sizeof(ptl_hdr_t);
+ mlen += sizeof(ptl_hdr_t);
+ CDEBUG(D_INFO, "mlen is [%d]\n", mlen);
+
+ /*
+ * copy the iov to the buffer so target knows
+ * where to get the data from
+ */
+ CDEBUG(D_INFO, "processing iov to [%p]\n", buffer);
+ gm_bcopy(iov, buffer, niov*sizeof(struct iovec));
+ mlen += niov*(sizeof(struct iovec));
+ CDEBUG(D_INFO, "mlen is [%d]\n", mlen);
+
+
+ /*
+ * Store the iovs in the stxd for we can get
+ * them later if we need them
+ */
+ CDEBUG(D_NET, "Copying iov [%p] to [%p]\n", iov, stxd->iov);
+ gm_bcopy(iov, stxd->iov, niov*sizeof(struct iovec));
+ stxd->niov = niov;
+
+
+ /*
+ * register the memory so the NIC can get hold of the data
+ * This is a slow process. it'd be good to overlap it
+ * with something else.
+ */
+ iov_dup = iov;
+ niov_dup = niov;
+ while(niov--) {
+ CDEBUG(D_INFO, "Registering memory [%p] len ["LPSZ"] \n",
+ iov->iov_base, iov->iov_len);
+ GMNAL_GM_LOCK(nal_data);
+ gm_status = gm_register_memory(nal_data->gm_port,
+ iov->iov_base, iov->iov_len);
+ if (gm_status != GM_SUCCESS) {
+ GMNAL_GM_UNLOCK(nal_data);
+ CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s]
+ for memory [%p] len ["LPSZ"]\n",
+ gm_status, gmnal_gm_error(gm_status),
+ iov->iov_base, iov->iov_len);
+ GMNAL_GM_LOCK(nal_data);
+ while (iov_dup != iov) {
+ gm_deregister_memory(nal_data->gm_port,
+ iov_dup->iov_base,
+ iov_dup->iov_len);
+ iov_dup++;
+ }
+ GMNAL_GM_UNLOCK(nal_data);
+ gmnal_return_stxd(nal_data, stxd);
+ return(PTL_FAIL);
+ }
+
+ GMNAL_GM_UNLOCK(nal_data);
+ iov++;
+ }
+
+ /*
+ * Send the init message to the target
+ */
+ CDEBUG(D_INFO, "sending mlen [%d]\n", mlen);
+ GMNAL_GM_LOCK(nal_data);
+ gm_status = gm_global_id_to_node_id(nal_data->gm_port, global_nid,
+ &local_nid);
+ if (gm_status != GM_SUCCESS) {
+ GMNAL_GM_UNLOCK(nal_data);
+ CDEBUG(D_ERROR, "Failed to obtain local id\n");
+ gmnal_return_stxd(nal_data, stxd);
+ /* TO DO deregister memory on failure */
+ return(GMNAL_STATUS_FAIL);
+ }
+ CDEBUG(D_INFO, "Local Node_id is [%d]\n", local_nid);
+ gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer,
+ stxd->gm_size, mlen, GM_LOW_PRIORITY,
+ local_nid, gmnal_large_tx_callback,
+ (void*)stxd);
+ GMNAL_GM_UNLOCK(nal_data);
+
+ CDEBUG(D_INFO, "done\n");
+
+ return(PTL_OK);
+}
+
+/*
+ * Callback function indicates that send of buffer with
+ * large message iovec has completed (or failed).
+ */
+void
+gmnal_large_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status)
+{
+ gmnal_small_tx_callback(gm_port, context, status);
+
+}
+
+
+
+/*
+ * Have received a buffer that contains an iovec of the sender.
+ * Do a gm_register_memory of the receivers buffer and then do a get
+ * data from the sender.
+ */
+int
+gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+		unsigned int nriov, struct iovec *riov, size_t mlen,
+		size_t rlen)
+{
+	gmnal_data_t	*nal_data = nal_cb->nal_data;
+	gmnal_srxd_t	*srxd = (gmnal_srxd_t*)private;
+	void	*buffer = NULL;
+	struct	iovec	*riov_dup;
+	int	nriov_dup;
+	gmnal_msghdr_t	*msghdr = NULL;
+	gm_status_t	gm_status;
+
+	CDEBUG(D_TRACE, "gmnal_large_rx :: nal_cb[%p], private[%p], "
+	       "cookie[%p], niov[%d], iov[%p], mlen["LPSZ"], rlen["LPSZ"]\n",
+	       nal_cb, private, cookie, nriov, riov, mlen, rlen);
+
+	if (!srxd) {
+		CDEBUG(D_ERROR, "gmnal_large_rx no context\n");
+		lib_finalize(nal_cb, private, cookie);
+		return(PTL_FAIL);
+	}
+
+	buffer = srxd->buffer;
+	msghdr = (gmnal_msghdr_t*)buffer;
+	buffer += sizeof(gmnal_msghdr_t);
+	buffer += sizeof(ptl_hdr_t);
+
+	/*
+	 * Store the senders stxd address in the srxd for this message
+	 * The gmnal_large_message_ack needs it to notify the sender
+	 * the pull of data is complete
+	 */
+	srxd->source_stxd = msghdr->stxd;
+
+	/*
+	 * Register the receivers memory
+	 * get the data,
+	 * tell the sender that we got the data
+	 * then tell the receiver we got the data
+	 * TO DO
+	 * If the iovecs match, could interleave
+	 * gm_registers and gm_gets for each element
+	 */
+	nriov_dup = nriov;
+	riov_dup = riov;
+	while(nriov--) {
+		CDEBUG(D_INFO, "Registering memory [%p] len ["LPSZ"] \n",
+		       riov->iov_base, riov->iov_len);
+		GMNAL_GM_LOCK(nal_data);
+		gm_status = gm_register_memory(nal_data->gm_port,
+					       riov->iov_base, riov->iov_len);
+		if (gm_status != GM_SUCCESS) {
+			GMNAL_GM_UNLOCK(nal_data);
+			CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] "
+			       "for memory [%p] len ["LPSZ"]\n",
+			       gm_status, gmnal_gm_error(gm_status),
+			       riov->iov_base, riov->iov_len);
+			GMNAL_GM_LOCK(nal_data);
+			while (riov_dup != riov) {
+				gm_deregister_memory(nal_data->gm_port,
+						     riov_dup->iov_base,
+						     riov_dup->iov_len);
+				riov_dup++;
+			}
+			/* was GMNAL_GM_LOCK: must release the lock taken
+			 * for the deregister loop, not take it again */
+			GMNAL_GM_UNLOCK(nal_data);
+			/*
+			 * give back srxd and buffer. Send NACK to sender
+			 */
+			return(PTL_FAIL);
+		}
+		GMNAL_GM_UNLOCK(nal_data);
+		riov++;
+	}
+	/*
+	 * do this so the final gm_get callback can deregister the memory
+	 */
+	PORTAL_ALLOC(srxd->riov, nriov_dup*(sizeof(struct iovec)));
+	gm_bcopy(riov_dup, srxd->riov, nriov_dup*(sizeof(struct iovec)));
+	srxd->nriov = nriov_dup;
+
+	/*
+	 * now do gm_get to get the data
+	 */
+	srxd->cookie = cookie;
+	if (gmnal_remote_get(srxd, srxd->nsiov, (struct iovec*)buffer,
+			     nriov_dup, riov_dup) != GMNAL_STATUS_OK) {
+		CDEBUG(D_ERROR, "can't get the data");
+	}
+
+	CDEBUG(D_INFO, "gmnal_large_rx done\n");
+
+	return(PTL_OK);
+}
+
+
+/*
+ * Perform a number of remote gets as part of receiving
+ * a large message.
+ * The final one to complete (i.e. the last callback to get called)
+ * tidies up.
+ * gm_get requires a send token.
+ */
+int
+gmnal_remote_get(gmnal_srxd_t *srxd, int nsiov, struct iovec *siov,
+ int nriov, struct iovec *riov)
+{
+
+ int ncalls = 0;
+
+ CDEBUG(D_TRACE, "gmnal_remote_get srxd[%p], nriov[%d], riov[%p],
+ nsiov[%d], siov[%p]\n", srxd, nriov, riov, nsiov, siov);
+
+
+ ncalls = gmnal_copyiov(0, srxd, nsiov, siov, nriov, riov);
+ if (ncalls < 0) {
+ CDEBUG(D_ERROR, "there's something wrong with the iovecs\n");
+ return(GMNAL_STATUS_FAIL);
+ }
+ CDEBUG(D_INFO, "gmnal_remote_get ncalls [%d]\n", ncalls);
+ spin_lock_init(&srxd->callback_lock);
+ srxd->ncallbacks = ncalls;
+ srxd->callback_status = 0;
+
+ ncalls = gmnal_copyiov(1, srxd, nsiov, siov, nriov, riov);
+ if (ncalls < 0) {
+ CDEBUG(D_ERROR, "there's something wrong with the iovecs\n");
+ return(GMNAL_STATUS_FAIL);
+ }
+
+ return(GMNAL_STATUS_OK);
+
+}
+
+
+/*
+ * pull data from source node (source iovec) to a local iovec.
+ * The iovecs may not match which adds the complications below.
+ * Count the number of gm_gets that will be required to the callbacks
+ * can determine who is the last one.
+ */
+int
+gmnal_copyiov(int do_copy, gmnal_srxd_t *srxd, int nsiov,
+ struct iovec *siov, int nriov, struct iovec *riov)
+{
+
+ int ncalls = 0;
+ int slen = siov->iov_len, rlen = riov->iov_len;
+ char *sbuf = siov->iov_base, *rbuf = riov->iov_base;
+ unsigned long sbuf_long;
+ gm_remote_ptr_t remote_ptr = 0;
+ unsigned int source_node;
+ gmnal_ltxd_t *ltxd = NULL;
+ gmnal_data_t *nal_data = srxd->nal_data;
+
+ CDEBUG(D_TRACE, "copy[%d] nal_data[%p]\n", do_copy, nal_data);
+ if (do_copy) {
+ if (!nal_data) {
+ CDEBUG(D_ERROR, "Bad args No nal_data\n");
+ return(GMNAL_STATUS_FAIL);
+ }
+ GMNAL_GM_LOCK(nal_data);
+ if (gm_global_id_to_node_id(nal_data->gm_port,
+ srxd->gm_source_node,
+ &source_node) != GM_SUCCESS) {
+
+ CDEBUG(D_ERROR, "cannot resolve global_id [%u]
+ to local node_id\n", srxd->gm_source_node);
+ GMNAL_GM_UNLOCK(nal_data);
+ return(GMNAL_STATUS_FAIL);
+ }
+ GMNAL_GM_UNLOCK(nal_data);
+ /*
+ * We need a send token to use gm_get
+ * getting an stxd gets us a send token.
+ * the stxd is used as the context to the
+ * callback function (so stxd can be returned).
+ * Set pointer in stxd to srxd so callback count in srxd
+ * can be decremented to find last callback to complete
+ */
+ CDEBUG(D_INFO, "gmnal_copyiov source node is G[%u]L[%d]\n",
+ srxd->gm_source_node, source_node);
+ }
+
+ do {
+ CDEBUG(D_INFO, "sbuf[%p] slen[%d] rbuf[%p], rlen[%d]\n",
+ sbuf, slen, rbuf, rlen);
+ if (slen > rlen) {
+ ncalls++;
+ if (do_copy) {
+ CDEBUG(D_INFO, "slen>rlen\n");
+ ltxd = gmnal_get_ltxd(nal_data);
+ ltxd->srxd = srxd;
+ GMNAL_GM_LOCK(nal_data);
+ /*
+ * funny business to get rid
+ * of compiler warning
+ */
+ sbuf_long = (unsigned long) sbuf;
+ remote_ptr = (gm_remote_ptr_t)sbuf_long;
+ gm_get(nal_data->gm_port, remote_ptr, rbuf,
+ rlen, GM_LOW_PRIORITY, source_node,
+ GMNAL_GM_PORT,
+ gmnal_remote_get_callback, ltxd);
+ GMNAL_GM_UNLOCK(nal_data);
+ }
+ /*
+ * at the end of 1 iov element
+ */
+ sbuf+=rlen;
+ slen-=rlen;
+ riov++;
+ nriov--;
+ rbuf = riov->iov_base;
+ rlen = riov->iov_len;
+ } else if (rlen > slen) {
+ ncalls++;
+ if (do_copy) {
+ CDEBUG(D_INFO, "slen<rlen\n");
+ ltxd = gmnal_get_ltxd(nal_data);
+ ltxd->srxd = srxd;
+ GMNAL_GM_LOCK(nal_data);
+ sbuf_long = (unsigned long) sbuf;
+ remote_ptr = (gm_remote_ptr_t)sbuf_long;
+ gm_get(nal_data->gm_port, remote_ptr, rbuf,
+ slen, GM_LOW_PRIORITY, source_node,
+ GMNAL_GM_PORT,
+ gmnal_remote_get_callback, ltxd);
+ GMNAL_GM_UNLOCK(nal_data);
+ }
+ /*
+ * at end of siov element
+ */
+ &