Whamcloud - gitweb
Merge b_release_1_4_6 from b_hd_newconfig (20060124_1325)
author adilger <adilger>
Wed, 25 Jan 2006 00:33:39 +0000 (00:33 +0000)
committer adilger <adilger>
Wed, 25 Jan 2006 00:33:39 +0000 (00:33 +0000)
- cross-platform compiling for winnt (r=mattwu, original patch)
- improve use of compat macros for darwin (r=nikita, original patch)

Severity   : major
Frequency  : Cray XT3 only
Description: fix use of portals/lnet pid
Details    : incorrect use of portals/lnet pid caused them to get out of
     sync and would result in silent dropping of RPC messages
b=10074
r=eeb (original patch)

Severity   : major
Frequency  : Infiniband IB LND only
Description: iiblnd wasn't mapping all memory
Details    : iiblnd wasn't mapping all memory, resulting in comms errors
     on some architectures/memory configs
b=9776
r=eeb (original patch)

143 files changed:
lnet/ChangeLog
lnet/include/libcfs/curproc.h
lnet/include/libcfs/darwin/Makefile.am
lnet/include/libcfs/darwin/darwin-fs.h
lnet/include/libcfs/darwin/darwin-lock.h
lnet/include/libcfs/darwin/darwin-mem.h
lnet/include/libcfs/darwin/darwin-prim.h
lnet/include/libcfs/darwin/darwin-sync.h
lnet/include/libcfs/darwin/darwin-tcpip.h [new file with mode: 0644]
lnet/include/libcfs/darwin/darwin-time.h
lnet/include/libcfs/darwin/darwin-types.h
lnet/include/libcfs/darwin/darwin-utils.h
lnet/include/libcfs/darwin/kp30.h
lnet/include/libcfs/darwin/libcfs.h
lnet/include/libcfs/kp30.h
lnet/include/libcfs/libcfs.h
lnet/include/libcfs/linux/kp30.h
lnet/include/libcfs/linux/libcfs.h
lnet/include/libcfs/linux/linux-fs.h
lnet/include/libcfs/linux/linux-lock.h
lnet/include/libcfs/linux/linux-mem.h
lnet/include/libcfs/linux/linux-prim.h
lnet/include/libcfs/linux/linux-time.h
lnet/include/libcfs/list.h
lnet/include/libcfs/lltrace.h
lnet/include/libcfs/portals_lib.h
lnet/include/libcfs/portals_utils.h
lnet/include/libcfs/user-lock.h
lnet/include/libcfs/user-prim.h
lnet/include/libcfs/user-time.h
lnet/include/libcfs/winnt/kp30.h [new file with mode: 0644]
lnet/include/libcfs/winnt/libcfs.h [new file with mode: 0644]
lnet/include/libcfs/winnt/lltrace.h [new file with mode: 0644]
lnet/include/libcfs/winnt/portals_compat25.h [new file with mode: 0644]
lnet/include/libcfs/winnt/portals_lib.h [new file with mode: 0644]
lnet/include/libcfs/winnt/portals_utils.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-fs.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-lock.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-mem.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-prim.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-tcpip.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-time.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-types.h [new file with mode: 0644]
lnet/include/lnet/api-support.h
lnet/include/lnet/darwin/Makefile.am
lnet/include/lnet/darwin/api-support.h [new file with mode: 0644]
lnet/include/lnet/darwin/lib-lnet.h
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/include/lnet/linux/Makefile.am
lnet/include/lnet/linux/api-support.h [new file with mode: 0644]
lnet/include/lnet/linux/lib-lnet.h
lnet/include/lnet/lnet.h
lnet/include/lnet/winnt/api-support.h [new file with mode: 0644]
lnet/include/lnet/winnt/lib-lnet.h [new file with mode: 0644]
lnet/include/lnet/winnt/lib-types.h [new file with mode: 0644]
lnet/include/lnet/winnt/lnet.h [new file with mode: 0644]
lnet/klnds/iiblnd/iiblnd.c
lnet/klnds/iiblnd/iiblnd_cb.c
lnet/klnds/ptllnd/ptllnd.c
lnet/klnds/ptllnd/ptllnd.h
lnet/klnds/ptllnd/ptllnd_cb.c
lnet/klnds/ptllnd/ptllnd_modparams.c
lnet/klnds/ptllnd/ptllnd_peer.c
lnet/klnds/ptllnd/ptllnd_rx_buf.c
lnet/klnds/ptllnd/ptllnd_tx.c
lnet/klnds/socklnd/Info.plist
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/klnds/socklnd/socklnd_lib-darwin.c
lnet/klnds/socklnd/socklnd_lib-darwin.h
lnet/klnds/socklnd/socklnd_lib-linux.h
lnet/klnds/socklnd/socklnd_modparams.c
lnet/klnds/tdilnd/socklnd.c [new file with mode: 0644]
lnet/klnds/tdilnd/socklnd.h [new file with mode: 0644]
lnet/klnds/tdilnd/socklnd_cb.c [new file with mode: 0644]
lnet/klnds/tdilnd/socklnd_lib-winnt.c [new file with mode: 0644]
lnet/klnds/tdilnd/socklnd_lib-winnt.h [new file with mode: 0644]
lnet/klnds/tdilnd/socklnd_modparams.c [new file with mode: 0644]
lnet/klnds/tdilnd/tdilnd.h [new file with mode: 0644]
lnet/libcfs/autoMakefile.am
lnet/libcfs/darwin/Makefile.am
lnet/libcfs/darwin/darwin-curproc.c
lnet/libcfs/darwin/darwin-debug.c
lnet/libcfs/darwin/darwin-fs.c
lnet/libcfs/darwin/darwin-internal.h [new file with mode: 0644]
lnet/libcfs/darwin/darwin-mem.c
lnet/libcfs/darwin/darwin-module.c
lnet/libcfs/darwin/darwin-prim.c
lnet/libcfs/darwin/darwin-proc.c
lnet/libcfs/darwin/darwin-sync.c
lnet/libcfs/darwin/darwin-tcpip.c [new file with mode: 0644]
lnet/libcfs/darwin/darwin-tracefile.c
lnet/libcfs/darwin/darwin-utils.c
lnet/libcfs/debug.c
lnet/libcfs/linux/linux-debug.c
lnet/libcfs/linux/linux-fs.c
lnet/libcfs/linux/linux-mem.c
lnet/libcfs/linux/linux-module.c
lnet/libcfs/linux/linux-prim.c
lnet/libcfs/linux/linux-tcpip.c
lnet/libcfs/linux/linux-tracefile.c
lnet/libcfs/linux/linux-utils.c
lnet/libcfs/misc.c [new file with mode: 0644]
lnet/libcfs/module.c
lnet/libcfs/nidstrings.c
lnet/libcfs/tracefile.c
lnet/libcfs/tracefile.h
lnet/libcfs/user-lock.c
lnet/libcfs/user-prim.c
lnet/libcfs/watchdog.c
lnet/libcfs/winnt/winnt-curproc.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-debug.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-fs.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-lock.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-lwt.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-mem.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-module.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-prim.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-proc.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-sync.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-tcpip.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-tracefile.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-usr.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-utils.c [new file with mode: 0644]
lnet/lnet/Info.plist
lnet/lnet/acceptor.c
lnet/lnet/autoMakefile.am
lnet/lnet/config.c
lnet/lnet/lib-eq.c
lnet/lnet/lib-move.c
lnet/lnet/lo.c
lnet/lnet/router.c
lnet/lnet/router_proc.c
lnet/tests/ping_cli.c
lnet/tests/ping_cli/winnt-pingcli.c [new file with mode: 0644]
lnet/tests/ping_srv.c
lnet/tests/ping_srv/winnt-pingsrv.c [new file with mode: 0644]
lnet/ulnds/ptllnd/ptllnd.c
lnet/ulnds/ptllnd/ptllnd.h
lnet/ulnds/ptllnd/ptllnd_cb.c
lnet/utils/debug.c

index 44c216f..cb78e99 100644 (file)
@@ -1,6 +1,24 @@
+2006-01-24  Cluster File Systems, Inc. <info@clusterfs.com>
+       * version 1.4.6
+       * bug fixes
+
+Severity   : major
+Frequency  : Cray XT3 only
+Bugzilla   : 10074
+Description: fix use of portals/lnet pid
+Details    : incorrect use of portals/lnet pid caused them to get out of
+            sync and would result in silent dropping of RPC messages
+
+Severity   : major
+Frequency  : Infiniband IB LND only
+Bugzilla   : 9776
+Description: iiblnd wasn't mapping all memory
+Details    : iiblnd wasn't mapping all memory, resulting in comms errors
+            on some architectures/memory configs
+
 2005-10-10  Cluster File Systems, Inc. <info@clusterfs.com>
        * Configuration change for the XT3
-             The PTLLND is now used to run Lustre over Portals on the XT3.
+            The PTLLND is now used to run Lustre over Portals on the XT3.
             The configure option(s) --with-cray-portals are no longer
             used.  Rather --with-portals=<path-to-portals-includes> is
             used to enable building on the XT3.  In addition to enable
@@ -9,7 +27,7 @@
 
 2005-10-10  Cluster File Systems, Inc. <info@clusterfs.com>
        * Portals has been removed, replaced by LNET.
-          LNET is new networking infrastructure for Lustre, it includes a
+          LNET is new networking infrastructure for Lustre, it includes a
           reorganized network configuration mode (see the user
           documentation for full details) as well as support for routing
           between different network fabrics.  Lustre Networking Devices
index 630912d..6495c66 100644 (file)
@@ -20,6 +20,7 @@
 #ifndef __LIBCFS_CURPROC_H__
 #define __LIBCFS_CURPROC_H__
 
+#ifdef __KERNEL__
 /*
  * Portable API to access common characteristics of "current" UNIX process.
  *
@@ -48,6 +49,7 @@ char  *cfs_curproc_comm(void);
  */
 cfs_kernel_cap_t cfs_curproc_cap_get(void);
 void cfs_curproc_cap_set(cfs_kernel_cap_t cap);
+#endif
 
 /* __LIBCFS_CURPROC_H__ */
 #endif
index 4ff2072..9d94305 100644 (file)
@@ -1,3 +1,3 @@
 EXTRA_DIST := darwin-mem.h darwin-types.h libcfs.h portals_utils.h     \
        darwin-fs.h darwin-prim.h darwin-utils.h lltrace.h              \
-       darwin-lock.h darwin-sync.h kp30.h portals_lib.h
+       darwin-lock.h darwin-sync.h darwin-tcpip.h kp30.h portals_lib.h
index 32244e7..52d5e48 100644 (file)
@@ -1,5 +1,24 @@
-#ifndef __LIBCFS_DARWIN_CFS_FS_H__
-#define __LIBCFS_DARWIN_CFS_FS_H__
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Implementation of standard file system interfaces for XNU kernel.
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+#ifndef __LIBCFS_DARWIN_FS_H__
+#define __LIBCFS_DARWIN_FS_H__
 
 #ifndef __LIBCFS_LIBCFS_H__
 #error Do not #include this file directly. #include <libcfs/libcfs.h> instead
 
 #include <sys/types.h>
 #include <sys/systm.h>
-/*
- * __APPLE_API_PRIVATE is defined before include user.h
- * Doing this way to get the define of uthread, it's not good
- * but I do need to know what's inside uthread.
- */
-#ifndef __APPLE_API_PRIVATE
-#define __APPLE_API_PRIVATE
-#include <sys/vnode.h>
-#undef __APPLE_API_PRIVATE
-#else
-#include <sys/vnode.h>
-#endif
 
 #include <sys/kernel.h>
 #include <sys/file.h>
 #include <sys/time.h>
 #include <sys/filedesc.h>
-#include <sys/stat.h>
 #include <sys/mount.h>
+#include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/ubc.h>
 #include <sys/mbuf.h>
@@ -37,7 +44,6 @@
 #include <stdarg.h>
 
 #include <mach/mach_types.h>
-#include <mach/mach_traps.h>
 #include <mach/time_value.h>
 #include <kern/clock.h>
 #include <sys/param.h>
 /*
  * File operating APIs in kernel
  */
+#ifdef __DARWIN8__
+/*
+ * Kernel file descriptor
+ */
+typedef struct cfs_kern_file {
+        int             f_flags;
+        vnode_t         f_vp;
+        vfs_context_t   f_ctxt;
+} cfs_file_t;
+
+#else
+
 typedef struct file cfs_file_t;
 
-int    filp_node_size(cfs_file_t *fp, off_t    *size);
+#endif
+
+int    kern_file_size(cfs_file_t *fp, off_t    *size);
 #define cfs_filp_size(fp)                      \
        ({                                      \
                off_t           __size;         \
-               filp_node_size((fp), &__size);  \
+               kern_file_size((fp), &__size);  \
                __size;                         \
         })
 #define cfs_filp_poff(fp)               (NULL)
 
-cfs_file_t *filp_open(const char *name, int flags, int mode, int *err);
-int filp_close(cfs_file_t *fp);
-int filp_read(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos);
-int filp_write(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos);
-int filp_fsync(cfs_file_t *fp);
+cfs_file_t *kern_file_open(const char *name, int flags, int mode, int *err);
+int kern_file_close(cfs_file_t *fp);
+int kern_file_read(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos);
+int kern_file_write(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos);
+int kern_file_sync(cfs_file_t *fp);
 
-#define cfs_filp_open(n, f, m, e)      filp_open(n, f, m, e)
-#define cfs_filp_close(f)              filp_close(f)
-#define cfs_filp_read(f, b, n, p)      filp_read(f, b, n, p)
-#define cfs_filp_write(f, b, n, p)     filp_write(f, b, n, p)
-#define cfs_filp_fsync(f)              filp_fsync(f)
+#define cfs_filp_open(n, f, m, e)      kern_file_open(n, f, m, e)
+#define cfs_filp_close(f)              kern_file_close(f)
+#define cfs_filp_read(f, b, n, p)      kern_file_read(f, b, n, p)
+#define cfs_filp_write(f, b, n, p)     kern_file_write(f, b, n, p)
+#define cfs_filp_fsync(f)              kern_file_sync(f)
 
 int ref_file(cfs_file_t *fp);
 int rele_file(cfs_file_t *fp);
@@ -85,25 +105,25 @@ int file_count(cfs_file_t *fp);
 #define CFS_OFFSET_MAX                 CFS_INT_LIMIT(loff_t)
 
 typedef struct flock                   cfs_flock_t;
-#define CFS_FLOCK_TYPE(fl)             ((fl)->l_type)
-#define CFS_FLOCK_SET_TYPE(fl, type)   do { (fl)->l_type = (type); } while(0)
-#define CFS_FLOCK_PID(fl)              ((fl)->l_pid)
-#define CFS_FLOCK_SET_PID(fl, pid)     do { (fl)->l_pid = (pid); } while(0)
-#define CFS_FLOCK_START(fl)            ((fl)->l_start)
-#define CFS_FLOCK_SET_START(fl, start) do { (fl)->l_start = (start); } while(0)
-#define CFS_FLOCK_END(fl)              ((fl)->l_len == 0? CFS_OFFSET_MAX: ((fl)->l_start + (fl)->l_en))
-#define CFS_FLOCK_SET_END(fl, end)             \
-       do {                                    \
-               if (end == CFS_OFFSET_MAX)      \
-                       (fl)->l_len = 0;        \
-               else                            \
-                       (fl)->l_len = (end) - (fl)->l_start;\
-       } while(0)
-
-typedef struct {
-       void    *d;
-} cfs_dentry_t;
-typedef unsigned short umode_t;
+#define cfs_flock_type(fl)             ((fl)->l_type)
+#define cfs_flock_set_type(fl, type)   do { (fl)->l_type = (type); } while(0)
+#define cfs_flock_pid(fl)              ((fl)->l_pid)
+#define cfs_flock_set_pid(fl, pid)     do { (fl)->l_pid = (pid); } while(0)
+#define cfs_flock_start(fl)            ((fl)->l_start)
+#define cfs_flock_set_start(fl, start) do { (fl)->l_start = (start); } while(0)
+
+static inline loff_t cfs_flock_end(cfs_flock_t *fl)
+{
+        return (fl->l_len == 0 ? CFS_OFFSET_MAX: (fl->l_start + fl->l_len));
+}
+
+static inline void cfs_flock_set_end(cfs_flock_t *fl, loff_t end)
+{
+        if (end == CFS_OFFSET_MAX)
+                fl->l_len = 0;
+        else
+                fl->l_len = end - fl->l_start;
+}
 
 #define ATTR_MODE       0x0001
 #define ATTR_UID        0x0002
@@ -122,10 +142,55 @@ typedef unsigned short umode_t;
 
 #define in_group_p(x)  (0)
 
-#endif
+struct posix_acl_entry {
+        short                   e_tag;
+        unsigned short          e_perm;
+        unsigned int            e_id;
+};
+
+struct posix_acl {
+        atomic_t                a_refcount;
+        unsigned int            a_count;
+        struct posix_acl_entry  a_entries[0];
+};
+
+struct posix_acl *posix_acl_alloc(int count, int flags);
+static inline struct posix_acl *posix_acl_from_xattr(const void *value, 
+                                                     size_t size)
+{ 
+        return posix_acl_alloc(0, 0);
+}
+static inline void posix_acl_release(struct posix_acl *acl) {};
+static inline int posix_acl_valid(const struct posix_acl *acl) { return 0; }
+static inline struct posix_acl * posix_acl_dup(struct posix_acl *acl) 
+{ 
+        return acl;
+}
+
+/*
+ * portable UNIX device file identification.
+ */
+
+typedef dev_t cfs_rdev_t;
+
+#else  /* !__KERNEL__ */
+
+typedef struct file cfs_file_t;
 
+#endif /* END __KERNEL__ */
+
+typedef struct {
+       void    *d;
+} cfs_dentry_t;
+
+#ifndef O_SYNC
 #define O_SYNC                                 0
+#endif
+#ifndef O_DIRECTORY
 #define O_DIRECTORY                            0
+#endif
+#ifndef O_LARGEFILE
 #define O_LARGEFILE                            0
+#endif
 
 #endif
index da16418..e2d2016 100644 (file)
@@ -9,10 +9,6 @@
 #include <mach/sync_policy.h>
 #include <mach/task.h>
 #include <mach/semaphore.h>
-#include <mach/mach_traps.h>
-
-/* spin lock types and operations */
-#include <kern/simple_lock.h>
 #include <kern/assert.h>
 #include <kern/thread.h>
 
@@ -56,12 +52,17 @@ static inline int spin_trylock(spinlock_t *lock)
        return kspin_trylock(&lock->spin);
 }
 
+static inline void spin_lock_done(spinlock_t *lock)
+{
+       kspin_done(&lock->spin);
+}
+
 #define spin_lock_bh(x)                spin_lock(x)
 #define spin_unlock_bh(x)      spin_unlock(x)
 #define spin_lock_bh_init(x)   spin_lock_init(x)
 
 extern boolean_t ml_set_interrupts_enabled(boolean_t enable);
-#define __disable_irq()         (spl_t) ml_set_interrupts_enabled(FALSE)
+#define __disable_irq()         ml_set_interrupts_enabled(FALSE)
 #define __enable_irq(x)         (void) ml_set_interrupts_enabled(x)
 
 #define spin_lock_irqsave(s, f)                do{                     \
@@ -165,6 +166,11 @@ static inline void init_rwsem(struct rw_semaphore *s)
        krw_sem_init(&s->s);
 }
 
+static inline void fini_rwsem(struct rw_semaphore *s)
+{
+       krw_sem_done(&s->s);
+}
+
 static inline void down_read(struct rw_semaphore *s)
 {
        krw_sem_down_r(&s->s);
@@ -173,7 +179,7 @@ static inline void down_read(struct rw_semaphore *s)
 static inline int down_read_trylock(struct rw_semaphore *s)
 {
        int ret = krw_sem_down_r_try(&s->s);
-       return ret == 0? 1: 0;
+       return ret == 0;
 }
 
 static inline void down_write(struct rw_semaphore *s)
@@ -184,7 +190,7 @@ static inline void down_write(struct rw_semaphore *s)
 static inline int down_write_trylock(struct rw_semaphore *s)
 {
        int ret = krw_sem_down_w_try(&s->s);
-       return ret == 0? 1: 0;
+       return ret == 0;
 }
 
 static inline void up_read(struct rw_semaphore *s)
@@ -199,7 +205,6 @@ static inline void up_write(struct rw_semaphore *s)
 
 /* 
  * read-write lock : Need to be investigated more!!
- * XXX nikita: for now, let rwlock_t to be identical to rw_semaphore
  *
  * - DECLARE_RWLOCK(l)
  * - rwlock_init(x)
@@ -208,14 +213,14 @@ static inline void up_write(struct rw_semaphore *s)
  * - write_lock(x)
  * - write_unlock(x)
  */
-typedef struct rw_semaphore rwlock_t;
+typedef struct krw_spin rwlock_t;
 
-#define rwlock_init(pl)                init_rwsem(pl)
+#define rwlock_init(pl)                        krw_spin_init(pl)
 
-#define read_lock(l)           down_read(l)
-#define read_unlock(l)         up_read(l)
-#define write_lock(l)          down_write(l)
-#define write_unlock(l)                up_write(l)
+#define read_lock(l)                   krw_spin_down_r(l)
+#define read_unlock(l)                 krw_spin_up_r(l)
+#define write_lock(l)                  krw_spin_down_w(l)
+#define write_unlock(l)                        krw_spin_up_w(l)
 
 #define write_lock_irqsave(l, f)       do{                     \
                                        f = __disable_irq();    \
@@ -232,12 +237,23 @@ typedef struct rw_semaphore rwlock_t;
 #define read_unlock_irqrestore(l, f)   do{                     \
                                        read_unlock(l);         \
                                        __enable_irq(f);}while(0)
-
 /*
  * Funnel: 
  *
  * Safe funnel in/out
  */
+#ifdef __DARWIN8__
+
+#define CFS_DECL_FUNNEL_DATA
+#define CFS_DECL_CONE_DATA              DECLARE_FUNNEL_DATA
+#define CFS_DECL_NET_DATA               DECLARE_FUNNEL_DATA
+#define CFS_CONE_IN                     do {} while(0)
+#define CFS_CONE_EX                     do {} while(0)
+
+#define CFS_NET_IN                      do {} while(0)
+#define CFS_NET_EX                      do {} while(0)
+
+#else
 
 #define CFS_DECL_FUNNEL_DATA                   \
         boolean_t    __funnel_state = FALSE;   \
@@ -257,8 +273,11 @@ void lustre_net_ex(boolean_t state, funnel_t *cone);
 #define CFS_NET_IN  lustre_net_in(&__funnel_state, &__funnel)
 #define CFS_NET_EX  lustre_net_ex(__funnel_state, __funnel)
 
-/* __KERNEL__ */
 #endif
 
+#else
+#include <libcfs/user-lock.h>
+#endif /* __KERNEL__ */
+
 /* __XNU_CFS_LOCK_H */
 #endif
index ce4a42b..3a19af0 100644 (file)
 #include <libcfs/list.h>
 
 /*
- * Page of OSX
- *
- * There is no page in OSX, however, we need page in lustre.
- */
-#define PAGE_MASK                              (~(PAGE_SIZE-1))
-#define _ALIGN_UP(addr,size)                   (((addr)+((size)-1))&(~((size)-1)))
-#define _ALIGN(addr,size)                      _ALIGN_UP(addr,size)
-#define PAGE_ALIGN(addr)                       _ALIGN(addr, PAGE_SIZE)
-
-/*
  * Basic xnu_page struct, should be binary compatibility with
  * all page types in xnu (we have only xnu_raw_page, xll_page now)
  */
 
 /* Variable sized pages are not supported */
 
+#ifdef PAGE_SHIFT
+#define CFS_PAGE_SHIFT PAGE_SHIFT
+#else
 #define CFS_PAGE_SHIFT 12
+#endif
+
+#ifdef PAGE_SIZE
+#define CFS_PAGE_SIZE  PAGE_SIZE
+#else
 #define CFS_PAGE_SIZE  (1 << CFS_PAGE_SHIFT)
+#endif
+
 #define PAGE_CACHE_SIZE CFS_PAGE_SIZE
+
+#ifdef PAGE_MASK
+#define CFS_PAGE_MASK  PAGE_MASK
+#else
 #define CFS_PAGE_MASK  (~(CFS_PAGE_SIZE - 1))
+#endif
 
 enum {
        XNU_PAGE_RAW,
@@ -98,20 +103,16 @@ void xnu_page_ops_unregister(int type);
  * raw page, no cache object, just like buffer
  */
 struct xnu_raw_page {
-       struct xnu_page header;
-       vm_address_t    virtual;
-       upl_t           upl;
-       int             order;
-       atomic_t        count;
-       void           *private;
+       struct xnu_page  header;
+       void            *virtual;
+       atomic_t         count;
+       struct list_head link;
 };
 
 /*
  * Public interface to lustre
  *
- * - cfs_alloc_pages(f, o)
  * - cfs_alloc_page(f)
- * - cfs_free_pages(p, o)
  * - cfs_free_page(p)
  * - cfs_kmap(p)
  * - cfs_kunmap(p)
@@ -124,14 +125,13 @@ struct xnu_raw_page {
  * pages only.
  */
 
-cfs_page_t *cfs_alloc_pages(u_int32_t flags, u_int32_t order);
 cfs_page_t *cfs_alloc_page(u_int32_t flags);
-void cfs_free_pages(cfs_page_t *pages, int order);
 void cfs_free_page(cfs_page_t *page);
 void cfs_get_page(cfs_page_t *page);
 int cfs_put_page_testzero(cfs_page_t *page);
 int cfs_page_count(cfs_page_t *page);
 void cfs_set_page_count(cfs_page_t *page, int v);
+#define cfs_page_index(pg)     (0)
 
 void *cfs_page_address(cfs_page_t *pg);
 void *cfs_kmap(cfs_page_t *pg);
@@ -141,51 +141,84 @@ void cfs_kunmap(cfs_page_t *pg);
  * Memory allocator
  */
 
-extern void *cfs_alloc(size_t nr_bytes, u_int32_t flags);
-extern void  cfs_free(void *addr);
+void *cfs_alloc(size_t nr_bytes, u_int32_t flags);
+void  cfs_free(void *addr);
+
+void *cfs_alloc_large(size_t nr_bytes);
+void  cfs_free_large(void *addr);
 
-extern void *cfs_alloc_large(size_t nr_bytes);
-extern void  cfs_free_large(void *addr);
+extern int get_preemption_level(void);
+
+#define CFS_ALLOC_ATOMIC_TRY                                    \
+       (get_preemption_level() != 0 ? CFS_ALLOC_ATOMIC : 0)
 
 /*
  * Slab:
  *
- * No slab in OSX, use zone allocator to fake slab
+ * No slab in OSX, use zone allocator to simulate slab
  */
 #define SLAB_HWCACHE_ALIGN             0
 
+#ifdef __DARWIN8__
+/* 
+ * In Darwin8, we cannot use zalloc_noblock(not exported by kernel),
+ * also, direct using of zone allocator is not recommended.
+ */
+#define CFS_INDIVIDUAL_ZONE     (0)
+
+#if !CFS_INDIVIDUAL_ZONE
+#include <libkern/OSMalloc.h>
+typedef        OSMallocTag     mem_cache_t;
+#else
+typedef                void*           zone_t;
+typedef                zone_t          mem_cache_t;
+#endif
+
+#else /* !__DARWIN8__ */
+
+#define CFS_INDIVIDUAL_ZONE     (1)
+
+typedef        zone_t          mem_cache_t;
+
+#endif /* !__DARWIN8__ */
+
+#define MC_NAME_MAX_LEN                64
+
 typedef struct cfs_mem_cache {
-       struct list_head        link;
-       zone_t                  zone;
-       int                     size;
-       char                    name [ZONE_NAME_MAX_LEN];
+       int                     mc_size;
+       mem_cache_t             mc_cache;
+       struct list_head        mc_link;
+       char                    mc_name [MC_NAME_MAX_LEN];
 } cfs_mem_cache_t;
 
 #define KMEM_CACHE_MAX_COUNT   64
 #define KMEM_MAX_ZONE          8192
 
-extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long,
-                                              void (*)(void *, cfs_mem_cache_t *, unsigned long),
-                                              void (*)(void *, cfs_mem_cache_t *, unsigned long));
-extern int cfs_mem_cache_destroy ( cfs_mem_cache_t * );
-extern void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int);
-extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
+cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long);
+int cfs_mem_cache_destroy ( cfs_mem_cache_t * );
+void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int);
+void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
 
 /*
  * Misc
  */
-/* XXX fix me */
+/* XXX Liang: num_physpages... fix me */
 #define num_physpages                  (64 * 1024)
 
 #define CFS_DECL_MMSPACE               
 #define CFS_MMSPACE_OPEN               do {} while(0)
 #define CFS_MMSPACE_CLOSE              do {} while(0)
 
-#define copy_from_user(kaddr, uaddr, size)     copyin((caddr_t)uaddr, (caddr_t)kaddr, size)
-#define copy_to_user(uaddr, kaddr, size)       copyout((caddr_t)kaddr, (caddr_t)uaddr, size)
+#define copy_from_user(kaddr, uaddr, size)     copyin(CAST_USER_ADDR_T(uaddr), (caddr_t)kaddr, size)
+#define copy_to_user(uaddr, kaddr, size)       copyout((caddr_t)kaddr, CAST_USER_ADDR_T(uaddr), size)
 
-#error "need this define"
-#define strncpy_from_user(kaddr, uaddr, size) "something"
+#if 0
+static inline int strncpy_from_user(char *kaddr, char *uaddr, int size)
+{
+       size_t count;
+       return copyinstr((const user_addr_t)uaddr, (void *)kaddr, size, &count);
+}
+#endif
 
 #if defined (__ppc__)
 #define mb()  __asm__ __volatile__ ("sync" : : : "memory")
@@ -201,9 +234,10 @@ extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
 
 #else  /* !__KERNEL__ */
 
-typedef struct cfs_page{
-       void    *foo;
-} cfs_page_t;
+#define CFS_CACHE_SHIFT 12
+#define PAGE_CACHE_SIZE (1 << CFS_CACHE_SHIFT)
+#include <libcfs/user-prim.h>
+
 #endif /* __KERNEL__ */
 
 #endif /* __XNU_CFS_MEM_H__ */
index 63dbad3..d3f3c3b 100644 (file)
@@ -9,25 +9,29 @@
 #include <sys/types.h>
 #include <sys/systm.h>
 
-#ifndef __APPLE_API_PRIVATE
-#define __APPLE_API_PRIVATE
-#include <sys/user.h>
-#undef __APPLE_API_PRIVATE
-#else
-#include <sys/user.h>
-#endif
+#ifndef __DARWIN8__
+# ifndef __APPLE_API_PRIVATE
+#  define __APPLE_API_PRIVATE
+#  include <sys/user.h>
+#  undef __APPLE_API_PRIVATE
+# else
+#  include <sys/user.h>
+# endif
+# include <mach/mach_traps.h>
+# include <mach/thread_switch.h>
+# include <machine/cpu_number.h>
+#endif /* !__DARWIN8__ */
 
 #include <sys/kernel.h>
 
 #include <mach/thread_act.h>
 #include <mach/mach_types.h>
-#include <mach/mach_traps.h>
-#include <mach/thread_switch.h>
 #include <mach/time_value.h>
 #include <kern/sched_prim.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <mach/machine/vm_param.h>
+#include <machine/machine_routines.h>
 #include <kern/clock.h>
 #include <kern/thread_call.h>
 #include <sys/param.h>
@@ -63,10 +67,12 @@ extern kern_return_t            cfs_symbol_put(const char *);
  * User can register/unregister a list of sysctl_oids
  * sysctl_oid is data struct of osx's sysctl-entry
  */
+#define        CONFIG_SYSCTL   1
+
 typedef struct sysctl_oid *     cfs_sysctl_table_t;
 typedef cfs_sysctl_table_t      cfs_sysctl_table_header_t;
-cfs_sysctl_table_header_t      *register_cfs_sysctl_table (cfs_sysctl_table_t *table, int arg);
-void unregister_cfs_sysctl_table (cfs_sysctl_table_header_t *table);
+cfs_sysctl_table_header_t      *cfs_register_sysctl_table (cfs_sysctl_table_t *table, int arg);
+void cfs_unregister_sysctl_table (cfs_sysctl_table_header_t *table);
 
 /*
  * Proc file system APIs, no /proc fs support in OSX
@@ -111,10 +117,22 @@ extern kern_return_t            cfs_psdev_deregister(cfs_psdev_t *);
 extern boolean_t        assert_wait_possible(void);
 extern void             *get_bsdtask_info(task_t);
 
+#ifdef __DARWIN8__
+
+typedef struct {}              cfs_task_t;
+#define cfs_current()          ((cfs_task_t *)current_thread())
+#else  /* !__DARWIN8__ */
+
 typedef struct uthread         cfs_task_t;
+
 #define current_uthread()       ((struct uthread *)get_bsdthread_info(current_act()))
 #define cfs_current()          current_uthread()
 
+#endif /* !__DARWIN8__ */
+
+#define cfs_task_lock(t)       do {;} while (0)
+#define cfs_task_unlock(t)     do {;} while (0)
+
 #define set_current_state(s)   do {;} while (0)
 #define reparent_to_init()     do {;} while (0)
 
@@ -128,109 +146,12 @@ typedef struct uthread           cfs_task_t;
  *
  * OSX kernel thread can not be created with args,
  * so we have to implement new APIs to create thread with args
- *
- * All requests to create kernel thread will create a new
- * thread instance of cfs_thread_agent, one by one.
- * cfs_thread_agent will call the caller's thread function
- * with argument supplied by caller.
  */
 
 typedef int (*cfs_thread_t)(void *);
 
 extern task_t  kernel_task;
 
-struct kernel_thread_arg
-{
-       spinlock_t      lock;
-       atomic_t        inuse;
-       cfs_thread_t    func;
-       void            *arg;
-};
-
-extern struct kernel_thread_arg cfs_thread_arg;
-extern void cfs_thread_agent(void);
-
-#define THREAD_ARG_FREE                        0
-#define THREAD_ARG_HOLD                        1
-#define THREAD_ARG_RECV                        2
-
-#define set_targ_stat(a, v)            atomic_set(&(a)->inuse, v)
-#define get_targ_stat(a)               atomic_read(&(a)->inuse)
-
-/*
- * Hold the thread argument and set the status of thread_status
- * to THREAD_ARG_HOLD, if the thread argument is held by other
- * threads (It's THREAD_ARG_HOLD already), current-thread has to wait.
- */
-#define thread_arg_hold(pta, _func, _arg)                      \
-       do {                                                    \
-               spin_lock(&(pta)->lock);                        \
-               if (get_targ_stat(pta) == THREAD_ARG_FREE) {    \
-                       set_targ_stat((pta), THREAD_ARG_HOLD);  \
-                       (pta)->arg = (void *)_arg;              \
-                       (pta)->func = _func;                    \
-                       spin_unlock(&(pta)->lock);              \
-                       break;                                  \
-               }                                               \
-               spin_unlock(&(pta)->lock);                      \
-               schedule();                                     \
-       } while(1);                                             \
-
-/*
- * Release the thread argument if the thread argument has been
- * received by the child-thread (Status of thread_args is
- * THREAD_ARG_RECV), otherwise current-thread has to wait.
- * After release, the thread_args' status will be set to
- * THREAD_ARG_FREE, and others can re-use the thread_args to
- * create new kernel_thread.
- */
-#define thread_arg_release(pta)                                        \
-       do {                                                    \
-               spin_lock(&(pta)->lock);                        \
-               if (get_targ_stat(pta) == THREAD_ARG_RECV) {    \
-                       (pta)->arg = NULL;                      \
-                       (pta)->func = NULL;                     \
-                       set_targ_stat(pta, THREAD_ARG_FREE);    \
-                       spin_unlock(&(pta)->lock);              \
-                       break;                                  \
-               }                                               \
-               spin_unlock(&(pta)->lock);                      \
-               schedule();                                     \
-       } while(1)
-
-/*
- * Receive thread argument (Used in child thread), set the status
- * of thread_args to THREAD_ARG_RECV.
- */
-#define __thread_arg_recv_fin(pta, _func, _arg, fin)           \
-       do {                                                    \
-               spin_lock(&(pta)->lock);                        \
-               if (get_targ_stat(pta) == THREAD_ARG_HOLD) {    \
-                       if (fin)                                \
-                           set_targ_stat(pta, THREAD_ARG_RECV);\
-                       _arg = (pta)->arg;                      \
-                       _func = (pta)->func;                    \
-                       spin_unlock(&(pta)->lock);              \
-                       break;                                  \
-               }                                               \
-               spin_unlock(&(pta)->lock);                      \
-               schedule();                                     \
-       } while (1);                                            \
-
-/*
- * Just set the thread_args' status to THREAD_ARG_RECV
- */
-#define thread_arg_fin(pta)                                    \
-       do {                                                    \
-               spin_lock(&(pta)->lock);                        \
-               assert( get_targ_stat(pta) == THREAD_ARG_HOLD); \
-               set_targ_stat(pta, THREAD_ARG_RECV);            \
-               spin_unlock(&(pta)->lock);                      \
-       } while(0)
-
-#define thread_arg_recv(pta, f, a)     __thread_arg_recv_fin(pta, f, a, 1)
-#define thread_arg_keep(pta, f, a)     __thread_arg_recv_fin(pta, f, a, 0)
-
 /*
  * cloning flags, no use in OSX, just copy them from Linux
  */
@@ -265,11 +186,16 @@ typedef struct cfs_waitlink {
        struct ksleep_link  wl_ksleep_link;
 } cfs_waitlink_t;
 
+typedef int cfs_task_state_t;
+
+#define CFS_TASK_INTERRUPTIBLE THREAD_ABORTSAFE
+#define CFS_TASK_UNINT         THREAD_UNINT
+
 void cfs_waitq_init(struct cfs_waitq *waitq);
 void cfs_waitlink_init(struct cfs_waitlink *link);
 
 void cfs_waitq_add(struct cfs_waitq *waitq, struct cfs_waitlink *link);
-void cfs_waitq_add_exclusive(struct cfs_waitq *waitq, 
+void cfs_waitq_add_exclusive(struct cfs_waitq *waitq,
                             struct cfs_waitlink *link);
 void cfs_waitq_forward(struct cfs_waitlink *link, struct cfs_waitq *waitq);
 void cfs_waitq_del(struct cfs_waitq *waitq, struct cfs_waitlink *link);
@@ -279,29 +205,37 @@ void cfs_waitq_signal(struct cfs_waitq *waitq);
 void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr);
 void cfs_waitq_broadcast(struct cfs_waitq *waitq);
 
-void cfs_waitq_wait(struct cfs_waitlink *link);
-cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link, 
+void cfs_waitq_wait(struct cfs_waitlink *link, cfs_task_state_t state);
+cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link,
+                                  cfs_task_state_t state, 
                                   cfs_duration_t timeout);
 
 /*
  * Thread schedule APIs.
  */
 #define MAX_SCHEDULE_TIMEOUT    ((long)(~0UL>>12))
+extern void thread_set_timer_deadline(uint64_t deadline);
+extern void thread_cancel_timer(void);
 
-static inline int schedule_timeout(int64_t timeout)
+static inline int cfs_schedule_timeout(int state, int64_t timeout)
 {
        int          result;
        
-       AbsoluteTime clock_current;
-       AbsoluteTime clock_delay;
-       result = assert_wait((event_t)current_uthread(), THREAD_UNINT);
-       clock_get_uptime(&clock_current);
-       nanoseconds_to_absolutetime(timeout, &clock_delay);
-       ADD_ABSOLUTETIME(&clock_current, &clock_delay);
-       thread_set_timer_deadline(clock_current);
+#ifdef __DARWIN8__
+       result = assert_wait((event_t)current_thread(), state);
+#else
+       result = assert_wait((event_t)current_uthread(), state);
+#endif
+       if (timeout > 0) {
+               uint64_t expire;
+               nanoseconds_to_absolutetime(timeout, &expire);
+               clock_absolutetime_interval_to_deadline(expire, &expire);
+               thread_set_timer_deadline(expire);
+       }
        if (result == THREAD_WAITING)
                result = thread_block(THREAD_CONTINUE_NULL);
-       thread_cancel_timer();
+       if (timeout > 0)
+               thread_cancel_timer();
        if (result == THREAD_TIMED_OUT)
                result = 0;
        else
@@ -309,47 +243,80 @@ static inline int schedule_timeout(int64_t timeout)
        return result;
 }
 
-#define schedule()                              \
-       do {                                    \
-               if (assert_wait_possible())     \
-                       schedule_timeout(1);    \
-               else                            \
-                       schedule_timeout(0);    \
-       } while (0)
+#define cfs_schedule() cfs_schedule_timeout(CFS_TASK_UNINT, CFS_JIFFY)
+#define cfs_pause(tick)        cfs_schedule_timeout(CFS_TASK_UNINT, tick)
+
+#define __wait_event(wq, condition)                            \
+do {                                                           \
+       struct cfs_waitlink __wait;                             \
+                                                               \
+       cfs_waitlink_init(&__wait);                             \
+       for (;;) {                                              \
+               cfs_waitq_add(&wq, &__wait);                    \
+               if (condition)                                  \
+                       break;                                  \
+               cfs_waitq_wait(&__wait, CFS_TASK_UNINT);        \
+               cfs_waitq_del(&wq, &__wait);                    \
+       }                                                       \
+       cfs_waitq_del(&wq, &__wait);                            \
+} while (0)
 
-#define __wait_event(wq, condition)            \
-do {                                           \
-       struct cfs_waitlink __wait;             \
-                                               \
-       cfs_waitlink_init(&__wait);             \
-       for (;;) {                              \
-               cfs_waitq_add(&wq, &__wait);    \
-               if (condition)                  \
-                       break;                  \
-               cfs_waitq_wait(&__wait);        \
-               cfs_waitq_del(&wq, &__wait);    \
-       }                                       \
-       cfs_waitq_del(&wq, &__wait);            \
+#define wait_event(wq, condition)                              \
+do {                                                           \
+       if (condition)                                          \
+               break;                                          \
+       __wait_event(wq, condition);                            \
 } while (0)
 
-#define wait_event(wq, condition)                                      \
-do {                                                                   \
-       if (condition)                                                  \
-               break;                                                  \
-       __wait_event(wq, condition);                                    \
+#define __wait_event_interruptible(wq, condition, ex, ret)     \
+do {                                                           \
+       struct cfs_waitlink __wait;                             \
+                                                               \
+       cfs_waitlink_init(&__wait);                             \
+       for (;;) {                                              \
+               if (ex == 0)                                    \
+                       cfs_waitq_add(&wq, &__wait);            \
+               else                                            \
+                       cfs_waitq_add_exclusive(&wq, &__wait);  \
+               if (condition)                                  \
+                       break;                                  \
+               if (!cfs_signal_pending()) {                    \
+                       cfs_waitq_wait(&__wait,                 \
+                                      CFS_TASK_INTERRUPTIBLE); \
+                       cfs_waitq_del(&wq, &__wait);            \
+                       continue;                               \
+               }                                               \
+               ret = -ERESTARTSYS;                             \
+               break;                                          \
+       }                                                       \
+       cfs_waitq_del(&wq, &__wait);                            \
 } while (0)
 
-#define wait_event_interruptible(wq, condition)        \
-({                                             \
-       wait_event(wq, condition);              \
-       0;                                      \
+#define wait_event_interruptible(wq, condition)                        \
+({                                                             \
+       int __ret = 0;                                          \
+       if (!condition)                                         \
+               __wait_event_interruptible(wq, condition,       \
+                                          0, __ret);           \
+       __ret;                                                  \
 })
 
+#define wait_event_interruptible_exclusive(wq, condition)      \
+({                                                             \
+       int __ret = 0;                                          \
+       if (!condition)                                         \
+               __wait_event_interruptible(wq, condition,       \
+                                          1, __ret);           \
+       __ret;                                                  \
+})
+
+#ifndef __DARWIN8__
 extern void    wakeup_one __P((void * chan));
+#endif
 /* only used in tests */
-#define wake_up_process(p)                     \
-       do {                                    \
-               wakeup_one(p);                  \
+#define wake_up_process(p)                                     \
+       do {                                                    \
+               wakeup_one((caddr_t)p);                         \
        } while (0)
        
 /* used in couple of places */
@@ -359,41 +326,29 @@ static inline void sleep_on(cfs_waitq_t *waitq)
        
        cfs_waitlink_init(&link);
        cfs_waitq_add(waitq, &link);
-       cfs_waitq_wait(&link);
+       cfs_waitq_wait(&link, CFS_TASK_UNINT);
        cfs_waitq_del(waitq, &link);
 }
 
 /*
- * XXX
  * Signal
+ * We don't use signal_lock/signal_unlock in cfs_sigmask_lock()
+ * and cfs_sigmask_unlock() because they will be called in 
+ * signal kernel APIs by xnu.
  */
-#define cfs_sigmask_lock(t, f)         do { f = 0; } while(0)
-#define cfs_sigmask_unlock(t, f)       do { f = 0; } while(0)
-#define cfs_signal_pending(t)          (0)
-
-#define cfs_siginitset(pmask, sigs)                            \
-       do {                                                    \
-               sigset_t __sigs = sigs & (~sigcantmask);        \
-               *(pmask) = __sigs;                              \
-       } while(0)
-
-#define cfs_siginitsetinv(pmask, sigs)                          \
-       do {                                                    \
-               sigset_t __sigs = ~(sigs | sigcantmask);        \
-               *(pmask) = __sigs;                              \
-       } while(0)
-
-#define cfs_recalc_sigpending(ut)                              \
-        do {                                                   \
-               (ut)->uu_siglist = (ut)->uu_siglist & ~(ut)->uu_sigmask;\
-       } while (0)
-#define cfs_sigfillset(s)                                      \
-       do {                                                    \
-               memset((s), -1, sizeof(sigset_t));              \
-       } while(0)
-
-#define cfs_set_sig_blocked(ut, b)             do {(ut)->uu_sigmask = b;} while(0)
-#define cfs_get_sig_blocked(ut)                        (&(ut)->uu_sigmask)
+typedef sigset_t       cfs_sigset_t;
+#define cfs_sigmask_lock(f)            do { f = 0; } while (0)
+#define cfs_sigmask_unlock(f)          do { f = 0; } while (0)
+int cfs_signal_pending(void);
+/*
+ * We don't need to recalc_sigpending because xnu always
+ * calls SHOULDissignal to check whether there are pending signals.
+ */
+#define cfs_recalc_sigpending()                do {} while (0)
+/*
+ * Clear all pending signals.
+ */
+#define cfs_clear_sigpending()         clear_procsiglist(current_proc(), -1)
 
 #define SIGNAL_MASK_ASSERT()
 
@@ -434,20 +389,28 @@ cfs_time_t cfs_timer_deadline(struct cfs_timer *t);
 /*
  * CPU
  */
-#include <machine/cpu_number.h>
 /* Run in PowerG5 who is PPC64 */
 #define SMP_CACHE_BYTES                         128
 #define __cacheline_aligned                     __attribute__((__aligned__(SMP_CACHE_BYTES)))
-/* XXX How to get the value of NCPUS from xnu ? */
 #define NR_CPUS                                        2
-#define smp_processor_id()                     cpu_number()
-#define smp_num_cpus                           NR_CPUS
+
+extern unsigned int cpu_number(void);
+#define smp_num_cpus                           cpu_number()
+/* 
+ * XXX Liang: patch xnu and export current_processor()?
+ *
+ * #define smp_processor_id()                  current_processor()
+ */
+#define smp_processor_id()                     0
 /* XXX smp_call_function is not supported in xnu */
 #define smp_call_function(f, a, n, w)          do {} while(0)
+int cfs_online_cpus(void);
 
 /*
  * Misc
  */
+extern int is_suser(void);
+
 #ifndef likely
 #define likely(exp) (exp)
 #endif
@@ -499,6 +462,11 @@ struct __dummy_ ## name ## _struct {}
 #define inter_module_get(n)                    cfs_symbol_get(n)
 #define inter_module_put(n)                    cfs_symbol_put(n)
 
+static inline int request_module(char *name)
+{
+       return (-EINVAL);
+}
+
 #ifndef __exit
 #define __exit
 #endif
@@ -514,10 +482,10 @@ struct __dummy_ ## name ## _struct {}
 #define MODULE_PARM_DESC(a, b)
 
 #define KERNEL_VERSION(a,b,c) ((a)*100+(b)*10+c)
-#define LINUX_VERSION_CODE (2*200+5*10+0)
+#define LINUX_VERSION_CODE KERNEL_VERSION(2,5,0)
 
-#define NR_IRQS                         512
-#define in_interrupt()                  (0)
+#define NR_IRQS                                512
+#define in_interrupt()                 ml_at_interrupt_context()
 
 #define KERN_EMERG      "<0>"   /* system is unusable                   */
 #define KERN_ALERT      "<1>"   /* action must be taken immediately     */
@@ -538,17 +506,45 @@ static inline long PTR_ERR(const void *ptr)
 /* XXX */
 #define IS_ERR(p)      (0)
 
+#else  /* !__KERNEL__ */
+
+typedef struct cfs_proc_dir_entry{
+       void            *data;
+}cfs_proc_dir_entry_t;
+
+#include <libcfs/user-prim.h>
+#define __WORDSIZE     32
+
+#endif /* END __KERNEL__ */
 /*
  * Error number
  */
+#ifndef EPROTO
+#define EPROTO          EPROTOTYPE
+#endif
+#ifndef EBADR
 #define EBADR          EBADRPC
-#define ERESTARTSYS    ERESTART
+#endif
+#ifndef ERESTARTSYS
+#define ERESTARTSYS    512
+#endif
+#ifndef EDEADLOCK
 #define EDEADLOCK      EDEADLK
+#endif
+#ifndef ECOMM
 #define ECOMM          EINVAL
+#endif
+#ifndef ENODATA
 #define ENODATA                EINVAL
+#endif
+#ifndef ENOTSUPP
+#define ENOTSUPP       EINVAL
+#endif
 
+#if BYTE_ORDER == BIG_ENDIAN
+# define __BIG_ENDIAN
 #else
-#define __WORDSIZE     32
-#endif /* __KERNEL__ */
+# define __LITTLE_ENDIAN
+#endif
 
-#endif /* __LINUX__ */
+#endif /* __LIBCFS_DARWIN_CFS_PRIM_H__ */
index b97c5ba..fbce7bf 100644 (file)
@@ -1,7 +1,8 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- * Lustre Light Super operations
+ * Implementation of standard libcfs synchronization primitives for XNU
+ * kernel.
  *
  *  Copyright (c) 2004 Cluster File Systems, Inc.
  *
@@ -33,7 +34,7 @@
 #error Do not #include this file directly. #include <libcfs/libcfs.h> instead
 #endif
 
-#define XNU_SYNC_DEBUG (0)
+#define XNU_SYNC_DEBUG (1)
 
 #if XNU_SYNC_DEBUG
 #define ON_SYNC_DEBUG(e) e
@@ -48,6 +49,7 @@ enum {
        KCOND_MAGIC = 0xb01dface,
        KRW_MAGIC   = 0xdabb1edd,
        KSPIN_MAGIC = 0xca11ab1e,
+        KRW_SPIN_MAGIC    = 0xbabeface,
        KSLEEP_CHAN_MAGIC = 0x0debac1e,
        KSLEEP_LINK_MAGIC = 0xacc01ade,
        KTIMER_MAGIC      = 0xbefadd1e
@@ -60,25 +62,55 @@ enum {
  */
 #define SMP (1)
 
+#include <libcfs/list.h>
+
+#ifdef __DARWIN8__
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <kern/locks.h>
+
+/*
+ * hw_lock is not available in Darwin8 (hw_lock_* are not exported at all), 
+ * so use lck_spin_t. we can hack out lck_spin_t easily:
+ *
+ * typedef struct {
+ *      unsigned int             opaque[3];
+ * } lck_spin_t;
+ *
+ * But it's not really necessary.
+ */
+typedef lck_spin_t      *xnu_spin_t;
+/* 
+ * wait_queue is not available in Darwin8 (wait_queue_* are not exported), 
+ * use assert_wait/wakeup/wake_one (wait_queue in kernel hash).
+ */
+typedef void * xnu_wait_queue_t;
+
+/* DARWIN8 */
+#else
+
+#include <mach/mach_types.h>
+#include <sys/types.h>
 #include <kern/simple_lock.h>
 
-#include <libcfs/list.h>
+typedef hw_lock_data_t  xnu_spin_t;
+typedef struct wait_queue       xnu_wait_queue_t;
+
+/* DARWIN8 */
+#endif
 
 struct kspin {
 #if SMP
-       hw_lock_data_t lock;
+       xnu_spin_t      lock;
 #endif
 #if XNU_SYNC_DEBUG
-       unsigned magic;
-       thread_t owner;
+       unsigned        magic;
+       thread_t        owner;
 #endif
 };
 
-/*
- * XXX nikita: we cannot use simple_* functions, because bsd/sys/lock.h
- * redefines them to nothing. Use low-level hw_lock_* instead.
- */
-
 void kspin_init(struct kspin *spin);
 void kspin_done(struct kspin *spin);
 void kspin_lock(struct kspin *spin);
@@ -98,11 +130,27 @@ int kspin_isnotlocked(struct kspin *spin);
 #define kspin_isnotlocked(s) (1)
 #endif
 
+/* ------------------------- rw spinlock ----------------------- */
+struct krw_spin {
+        struct kspin      guard;
+        int               count;
+#if XNU_SYNC_DEBUG
+        unsigned          magic;
+#endif
+};
+
+void krw_spin_init(struct krw_spin *sem);
+void krw_spin_done(struct krw_spin *sem);
+void krw_spin_down_r(struct krw_spin *sem);
+void krw_spin_down_w(struct krw_spin *sem);
+void krw_spin_up_r(struct krw_spin *sem);
+void krw_spin_up_w(struct krw_spin *sem);
+
 /* ------------------------- semaphore ------------------------- */
 
 struct ksem {
         struct kspin      guard;
-        struct wait_queue q;
+        xnu_wait_queue_t  q;
         int               value;
 #if XNU_SYNC_DEBUG
         unsigned          magic;
@@ -225,20 +273,20 @@ void ksleep_link_done(struct ksleep_link *link);
 void ksleep_add(struct ksleep_chan *chan, struct ksleep_link *link);
 void ksleep_del(struct ksleep_chan *chan, struct ksleep_link *link);
 
-void ksleep_wait(struct ksleep_chan *chan);
-int64_t  ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout);
+void ksleep_wait(struct ksleep_chan *chan, int state);
+int64_t  ksleep_timedwait(struct ksleep_chan *chan, int state, uint64_t timeout);
 
 void ksleep_wake(struct ksleep_chan *chan);
 void ksleep_wake_all(struct ksleep_chan *chan);
 void ksleep_wake_nr(struct ksleep_chan *chan, int nr);
 
-#define KSLEEP_LINK_DECLARE(name)                      \
-{                                                      \
-       .flags   = 0,                                   \
-       .event   = 0,                                   \
-       .hits    = 0,                                   \
-       .linkage = CFS_LIST_HEAD_INIT(name.linkage),    \
-       .magic   = KSLEEP_LINK_MAGIC                    \
+#define KSLEEP_LINK_DECLARE(name)               \
+{                                               \
+       .flags   = 0,                           \
+       .event   = 0,                           \
+       .hits    = 0,                           \
+       .linkage = CFS_LIST_HEAD(name.linkage), \
+       .magic   = KSLEEP_LINK_MAGIC            \
 }
 
 /* ------------------------- timer ------------------------- */
diff --git a/lnet/include/libcfs/darwin/darwin-tcpip.h b/lnet/include/libcfs/darwin/darwin-tcpip.h
new file mode 100644 (file)
index 0000000..079dc4d
--- /dev/null
@@ -0,0 +1,88 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef __LIBCFS_DARWIN_TCPIP_H__
+#define __LIBCFS_DARWIN_TCPIP_H__
+
+#ifdef __KERNEL__
+#include <sys/socket.h>
+
+#ifdef __DARWIN8__
+
+struct socket;
+
+typedef void    (*so_upcall)(socket_t sock, void* arg, int waitf);
+
+#define CFS_SOCK_UPCALL         0x1
+
+typedef struct cfs_socket {
+        socket_t        s_so;
+        int             s_flags;
+        so_upcall       s_upcall;
+        void           *s_upcallarg;
+} cfs_socket_t;
+
+#ifndef container_of
+#define container_of(ptr, type, member) \
+                ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+#endif
+
+/* cfs_socket_t to bsd socket */
+#define C2B_SOCK(s)             ((s)->s_so)     
+/* bsd socket to cfs_socket_t */
+#define B2C_SOCK(s)             container_of((s), cfs_socket_t, s_so)
+
+static inline int get_sock_intopt(socket_t so, int opt)
+{
+        int     val;
+        int     len = sizeof(val); /* in/out arg: must hold buffer size */
+        int     rc;
+
+        /*
+         * sock_getsockopt takes the socket lock (a mutex), so this
+         * call can block; do not use it from atomic context.
+         */
+        rc = sock_getsockopt(so, SOL_SOCKET, opt, &val, &len);
+        assert(rc == 0);
+        return val; /* return the fetched value, not the option id */
+}
+
+#define SOCK_ERROR(s)           get_sock_intopt(C2B_SOCK(s), SO_ERROR)        
+/* #define SOCK_WMEM_QUEUED(s)     (0) */
+#define SOCK_WMEM_QUEUED(s)     get_sock_intopt(C2B_SOCK(s), SO_NWRITE)
+/* XXX Liang: no reliable way to get it in Darwin8.x */
+#define SOCK_TEST_NOSPACE(s)    (0)
+
+#else /* !__DARWIN8__ */
+
+#define SOCK_WMEM_QUEUED(so)    ((so)->so_snd.sb_cc)
+#define SOCK_ERROR(so)          ((so)->so_error)
+
+#define SOCK_TEST_NOSPACE(so)   (sbspace(&(so)->so_snd) < (so)->so_snd.sb_lowat)
+
+#endif /* !__DARWIN8__ */
+
+#endif /* __KERNEL END */
+
+#endif  /* __XNU_CFS_TYPES_H__ */
index d6230ad..82f5e9f 100644 (file)
@@ -64,8 +64,6 @@
  *  int            cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *);
  *  int            cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *);
  *
- *  cfs_duration_t cfs_time_minimal_timeout(void)
- *
  *  CFS_TIME_FORMAT
  *  CFS_DURATION_FORMAT
  *
 #include <sys/types.h>
 #include <sys/systm.h>
 
-#ifndef __APPLE_API_PRIVATE
-#define __APPLE_API_PRIVATE
-#include <sys/user.h>
-#undef __APPLE_API_PRIVATE
-#else
-#include <sys/user.h>
-#endif
-
 #include <sys/kernel.h>
 
-#include <mach/thread_act.h>
 #include <mach/mach_types.h>
-#include <mach/mach_traps.h>
-#include <mach/thread_switch.h>
 #include <mach/time_value.h>
-#include <kern/sched_prim.h>
-#include <vm/pmap.h>
-#include <vm/vm_kern.h>
-#include <mach/machine/vm_param.h>
 #include <kern/clock.h>
-#include <kern/thread_call.h>
 #include <sys/param.h>
-#include <sys/vm.h>
 
 #include <libcfs/darwin/darwin-types.h>
 #include <libcfs/darwin/darwin-utils.h>
 #include <libcfs/darwin/darwin-lock.h>
 
+/*
+ * There are three ways to measure time in OS X:
+ * 1. nanoseconds
+ * 2. absolute time (abstime unit equal to the length of one bus cycle),
+ *    thread/timer scheduling is counted in absolute time, but the
+ *    abstime unit can differ between machines, so we don't use it.
+ * 3. clock interval (1sec = 100hz). But clock interval only taken by KPI
+ *    like tsleep().
+ *
+ * We use nanoseconds (uptime, not calendar time)
+ *
+ * clock_get_uptime()   :get absolute time since bootup.
+ * nanouptime()         :get nanoseconds since bootup
+ * microuptime()        :get microseconds since bootup
+ * nanotime()           :get nanoseconds since epoch
+ * microtime()          :get microseconds since epoch
+ */
 typedef u_int64_t cfs_time_t; /* nanoseconds */
 typedef int64_t cfs_duration_t;
 
@@ -118,15 +116,15 @@ static inline cfs_time_t cfs_time_current(void)
 {
         struct timespec instant;
 
-        nanotime(&instant);
-        return ((u_int64_t)instant.tv_sec) * ONE_BILLION + instant.tv_nsec;
+        nanouptime(&instant);
+        return ((u_int64_t)instant.tv_sec) * NSEC_PER_SEC + instant.tv_nsec;
 }
 
 static inline time_t cfs_time_current_sec(void)
 {
         struct timespec instant;
 
-        nanotime(&instant);
+        nanouptime(&instant);
        return instant.tv_sec;
 }
 
@@ -152,7 +150,7 @@ static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2)
 
 static inline void cfs_fs_time_current(cfs_fs_time_t *t)
 {
-        *t = time;
+        microtime((struct timeval *)t);
 }
 
 static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t)
@@ -160,12 +158,6 @@ static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t)
         return t->tv_sec;
 }
 
-static inline cfs_duration_t cfs_duration_build(int64_t nano)
-{
-        return nano;
-}
-
-
 static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
 {
         *v = *t;
@@ -174,17 +166,12 @@ static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
 static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
 {
         s->tv_sec  = t->tv_sec;
-        s->tv_nsec = t->tv_usec * 1000;
+        s->tv_nsec = t->tv_usec * NSEC_PER_USEC;
 }
 
 static inline cfs_duration_t cfs_time_seconds(int seconds)
 {
-       return cfs_duration_build(ONE_BILLION * (int64_t)seconds);
-}
-
-static inline cfs_time_t cfs_time_shift(int seconds)
-{
-       return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds));
+       return (NSEC_PER_SEC * (int64_t)seconds);
 }
 
 /*
@@ -192,7 +179,7 @@ static inline cfs_time_t cfs_time_shift(int seconds)
  */
 static inline int64_t __cfs_fs_time_flat(cfs_fs_time_t *t)
 {
-        return ((int64_t)t->tv_sec) * ONE_BILLION + t->tv_usec;
+        return ((int64_t)t->tv_sec)*NSEC_PER_SEC + t->tv_usec*NSEC_PER_USEC;
 }
 
 static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
@@ -207,29 +194,28 @@ static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
 
 static inline time_t cfs_duration_sec(cfs_duration_t d)
 {
-        return d / ONE_BILLION;
+        return d / NSEC_PER_SEC;
 }
 
 static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s)
 {
-        s->tv_sec = d / ONE_BILLION;
-        s->tv_usec = (d - s->tv_sec * ONE_BILLION) / 1000;
+        s->tv_sec = d / NSEC_PER_SEC;
+        s->tv_usec = (d - s->tv_sec * NSEC_PER_SEC) / NSEC_PER_USEC;
 }
 
 static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s)
 {
-        s->tv_sec = d / ONE_BILLION;
-        s->tv_nsec = d - ((int64_t)s->tv_sec) * ONE_BILLION;
+        s->tv_sec = d / NSEC_PER_SEC;
+        s->tv_nsec = d - ((int64_t)s->tv_sec) * NSEC_PER_SEC;
 }
 
-static inline cfs_duration_t cfs_time_minimal_timeout(void)
-{
-        return ONE_BILLION / (u_int64_t)hz;
-}
-
-/* inline function cfs_time_minimal_timeout() can not be used to
- * initiallize static variable */
-#define CFS_MIN_DELAY          (ONE_BILLION / (u_int64_t)100)
+/* 
+ * One jiffy (in nanoseconds)
+ *
+ * osfmk/kern/sched_prim.c
+ * #define DEFAULT_PREEMPTION_RATE      100
+ */
+#define CFS_JIFFY              (NSEC_PER_SEC / (u_int64_t)100)
 
 #define LTIME_S(t)             (t)
 
index b2762c0..bb9c5de 100644 (file)
 #include <mach/mach_types.h>
 #include <sys/types.h>
 
+#ifndef _BLKID_TYPES_H
+#define _BLKID_TYPES_H
+#endif
+
 typedef u_int8_t  __u8;
 typedef u_int16_t __u16;
 typedef u_int32_t __u32;
@@ -61,21 +65,28 @@ typedef struct { volatile uint32_t counter; }       atomic_t;
 #define ATOMIC_INIT(i)                 { (i) }
 #define atomic_read(a)                 ((a)->counter)
 #define atomic_set(a, v)               (((a)->counter) = (v))
+#ifdef __DARWIN8__
+#define atomic_add(v, a)               OSAddAtomic(v, (SInt32 *)&((a)->counter))
+#define atomic_sub(v, a)               OSAddAtomic(-(v), (SInt32 *)&((a)->counter))
+#define atomic_inc(a)                  OSIncrementAtomic((SInt32 *)&((a)->counter))
+#define atomic_dec(a)                  OSDecrementAtomic((SInt32 *)&((a)->counter))
+#else /* !__DARWIN8__ */
 #define atomic_add(v, a)               hw_atomic_add((uint32_t *)&((a)->counter), v)
 #define atomic_sub(v, a)               hw_atomic_sub((uint32_t *)&((a)->counter), v)
 #define atomic_inc(a)                  atomic_add(1, a)
 #define atomic_dec(a)                  atomic_sub(1, a)
+#endif /* !__DARWIN8__ */
 #define atomic_sub_and_test(v, a)      ( atomic_sub(v, a) == 0 )
 #define atomic_dec_and_test(a)         ( atomic_dec(a) == 0 )
 
 #include <libsa/mach/mach.h>
-typedef uint64_t                       loff_t;
+typedef off_t                          loff_t;
 
 #else  /* !__KERNEL__ */
 
 #include <stdint.h>
 
-typedef uint64_t                       loff_t;
+typedef off_t                          loff_t;
 
 #endif /* __KERNEL END */
 
index 4e91db9..0f808a2 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef __LIBCFS_DARWIN_XNU_UTILS_H__
-#define __LIBCFS_DARWIN_XNU_UTILS_H__
+#ifndef __LIBCFS_DARWIN_UTILS_H__
+#define __LIBCFS_DARWIN_UTILS_H__
 
 #ifndef __LIBCFS_LIBCFS_H__
 #error Do not #include this file directly. #include <libcfs/libcfs.h> instead
@@ -57,4 +57,11 @@ char * ul2dstr(unsigned long address, char *buf, int len);
 
 #define HIPQUAD NIPQUAD
 
+#ifndef LIST_CIRCLE
+#define LIST_CIRCLE(elm, field)                                 \
+       do {                                                    \
+               (elm)->field.le_prev = &(elm)->field.le_next;   \
+       } while (0)
+#endif
+
 #endif /* __XNU_UTILS_H__ */
index 4b2e94f..aba1a82 100644 (file)
@@ -24,7 +24,7 @@
 #include <libcfs/darwin/darwin-prim.h>
 #include <lnet/lnet.h>
 
-#define our_cond_resched()              schedule_timeout(1);
+#define our_cond_resched() cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, 1)
 
 #ifdef CONFIG_SMP
 #define LASSERT_SPIN_LOCKED(lock) do {} while(0) /* XXX */
@@ -33,8 +33,6 @@
 #endif
 #define LASSERT_SEM_LOCKED(sem) do {} while(0) /* XXX */
 
-#define LBUG_WITH_LOC(file, func, line)    do {libcfs_catastrophe = 1;} while(0)
-
 /* --------------------------------------------------------------------- */
 
 #define PORTAL_SYMBOL_REGISTER(x)               cfs_symbol_register(#x, &x)
 #define PORTAL_MODULE_USE                       do{int i = 0; i++;}while(0)
 #define PORTAL_MODULE_UNUSE                     do{int i = 0; i--;}while(0)
 
-#define printk(format, args...)                 printf(format, ## args)
+#define num_online_cpus()                       cfs_online_cpus()
 
 /******************************************************************************/
-/* Module parameter support */
-#define CFS_MODULE_PARM(name, t, type, perm, desc) \
-        this should force a syntax error
+/* XXX Liang: There is no module parameter support in OSX */
+#define CFS_MODULE_PARM(name, t, type, perm, desc) 
 
 #define CFS_SYSFS_MODULE_PARM    0 /* no sysfs access to module parameters */
 /******************************************************************************/
index 3e96748..5a05d9c 100644 (file)
@@ -9,6 +9,7 @@
 #endif
 
 #include <mach/mach_types.h>
+#include <sys/errno.h>
 #include <string.h>
 #include <libcfs/darwin/darwin-types.h>
 #include <libcfs/darwin/darwin-time.h>
@@ -16,6 +17,7 @@
 #include <libcfs/darwin/darwin-mem.h>
 #include <libcfs/darwin/darwin-lock.h>
 #include <libcfs/darwin/darwin-fs.h>
+#include <libcfs/darwin/darwin-tcpip.h>
 
 #ifdef __KERNEL__
 # include <sys/types.h>
@@ -164,10 +166,28 @@ __entry_nesting(&__cdd);
  *
  * Implementation is in darwin-curproc.c
  */
-#define CFS_CURPROC_COMM_MAX (sizeof ((struct proc *)0)->p_comm)
+#define CFS_CURPROC_COMM_MAX    MAXCOMLEN
 /*
  * XNU has no capabilities
  */
 typedef int cfs_kernel_cap_t;
 
+#ifdef __KERNEL__
+enum {
+        /* if you change this, update darwin-util.c:cfs_stack_trace_fill() */
+        CFS_STACK_TRACE_DEPTH = 16
+};
+
+struct cfs_stack_trace {
+        void *frame[CFS_STACK_TRACE_DEPTH];
+};
+
+#define printk(format, args...)                 printf(format, ## args)
+
+#ifdef WITH_WATCHDOG
+#undef WITH_WATCHDOG
+#endif
+
+#endif /* __KERNEL__ */
+
 #endif /* _XNU_LIBCFS_H */
index 58f6cfd..3dd22de 100644 (file)
@@ -6,11 +6,14 @@
 
 #define LIBCFS_DEBUG
 #include <libcfs/libcfs.h>
+#include <lnet/types.h>
 
 #if defined(__linux__)
 #include <libcfs/linux/kp30.h>
 #elif defined(__APPLE__)
 #include <libcfs/darwin/kp30.h>
+#elif defined(__WINNT__)
+#include <libcfs/winnt/kp30.h>
 #else
 #error Unsupported operating system
 #endif
@@ -192,7 +195,7 @@ do {                                                                           \
 # else
 #  define LASSERT(e)
 #  define LASSERTF(cond, args...) do { } while (0)
-# endif
+# endif /* LIBCFS_DEBUG */
 # define LBUG()   assert(0)
 # define printk(format, args...) printf (format, ## args)
 # define LIBCFS_ALLOC(ptr, size) do { (ptr) = calloc(1,size); } while (0);
@@ -364,7 +367,7 @@ static inline int libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data)
                 CERROR ("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n");
                 return 1;
         }
-        if (libcfs_ioctl_packlen(data) != data->ioc_len ) {
+        if ((__u32)libcfs_ioctl_packlen(data) != data->ioc_len ) {
                 CERROR ("LIBCFS ioctl: packlen != ioc_len\n");
                 return 1;
         }
index 898af0b..fc5a4de 100644 (file)
@@ -12,6 +12,8 @@
 #include <libcfs/linux/libcfs.h>
 #elif defined(__APPLE__)
 #include <libcfs/darwin/libcfs.h>
+#elif defined(__WINNT__)
+#include <libcfs/winnt/libcfs.h>
 #else
 #error Unsupported operating system.
 #endif
 # define offsetof(typ,memb)     ((unsigned long)((char *)&(((typ *)0)->memb)))
 #endif
 
+/* cardinality of array */
+#define sizeof_array(a) ((sizeof (a)) / (sizeof ((a)[0])))
+
+#if !defined(container_of)
+/* given a pointer @ptr to the field @member embedded into type (usually
+ * struct) @type, return pointer to the embedding instance of @type. */
+#define container_of(ptr, type, member) \
+        ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+#endif
+
+/*
+ * true iff @i is power-of-2
+ */
+#define IS_PO2(i)                               \
+({                                              \
+        typeof(i) __i;                          \
+                                                \
+        __i = (i);                              \
+        !(__i & (__i - 1));                     \
+})
+
 #define LOWEST_BIT_SET(x)       ((x) & ~((x) - 1))
 
 /*
@@ -37,6 +60,7 @@ extern unsigned int libcfs_subsystem_debug;
 extern unsigned int libcfs_stack;
 extern unsigned int libcfs_debug;
 extern unsigned int libcfs_printk;
+extern unsigned int libcfs_debug_binary;
 
 /* Has there been an LBUG? */
 extern unsigned int libcfs_catastrophe;
@@ -172,17 +196,7 @@ do {                                                                          \
         }                                                                     \
 } while (0)
 
-#elif defined(LUSTRE_UTILS)
-
-#define CDEBUG(mask, format, a...)                                      \
-do {                                                                    \
-        if ((mask) & (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE))       \
-                fprintf(stderr, "(%s:%d:%s()) " format,                 \
-                        __FILE__, __LINE__, __FUNCTION__, ## a);        \
-} while (0)
-#define CDEBUG_LIMIT CDEBUG
-
-#else  /* !__KERNEL__ && !LUSTRE_UTILS*/
+#elif defined(__arch_lib__) && !defined(LUSTRE_UTILS)
 
 #define CDEBUG(mask, format, a...)                                      \
 do {                                                                    \
@@ -195,6 +209,16 @@ do {                                                                    \
 } while (0)
 #define CDEBUG_LIMIT CDEBUG
 
+#else
+
+#define CDEBUG(mask, format, a...)                                      \
+do {                                                                    \
+        if ((mask) & (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE))       \
+                fprintf(stderr, "(%s:%d:%s()) " format,                 \
+                        __FILE__, __LINE__, __FUNCTION__, ## a);        \
+} while (0)
+#define CDEBUG_LIMIT CDEBUG
+
 #endif /* !__KERNEL__ */
 
 #define CWARN(format, a...)          CDEBUG_LIMIT(D_WARNING, format, ## a)
@@ -216,7 +240,7 @@ do {                                                                    \
         goto label;                                                     \
 } while (0)
 
-#define CDEBUG_ENTRY_EXIT 1
+#define CDEBUG_ENTRY_EXIT (1)
 #if CDEBUG_ENTRY_EXIT
 
 /*
@@ -251,6 +275,28 @@ do {                                                                    \
 
 #endif /* !CDEBUG_ENTRY_EXIT */
 
+/*
+ * Some (nomina odiosa sunt) platforms define NULL as naked 0. This confuses
+ * Lustre RETURN(NULL) macro.
+ */
+#if defined(NULL)
+#undef NULL
+#endif
+
+/*
+ * Define lbug_with_loc for your own platform.
+ */
+void lbug_with_loc(char *file,
+                   const char *func,
+                   const int line) __attribute__((noreturn));
+
+#define LBUG_WITH_LOC(file, func, line)                                 \
+do {                                                                    \
+        libcfs_catastrophe = 1;                                         \
+        lbug_with_loc(file, func, line);                                \
+} while (0)
+
+#define NULL ((void *)0)
 
 #define LUSTRE_SRV_LNET_PID      LUSTRE_LNET_PID
 
@@ -267,8 +313,8 @@ struct libcfs_ioctl_handler {
 
 #define DECLARE_IOCTL_HANDLER(ident, func)              \
         struct libcfs_ioctl_handler ident = {           \
-                .item = CFS_LIST_HEAD_INIT(ident.item), \
-                .handle_ioctl = func                    \
+                /* .item = */ CFS_LIST_HEAD_INIT(ident.item),     \
+                /* .handle_ioctl = */ func                    \
         }
 
 int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand);
@@ -304,7 +350,7 @@ struct lc_watchdog;
  * touch it once to enable it. */
 struct lc_watchdog *lc_watchdog_add(int time,
                                     void (*cb)(struct lc_watchdog *,
-                                               struct task_struct *,
+                                               cfs_task_t *,
                                                void *),
                                     void *data);
 
@@ -319,7 +365,7 @@ void lc_watchdog_delete(struct lc_watchdog *lcw);
 
 /* Dump a debug log */
 void lc_watchdog_dumplog(struct lc_watchdog *lcw,
-                         struct task_struct *tsk,
+                         cfs_task_t *tsk,
                          void *data);
 
 /* __KERNEL__ */
@@ -373,7 +419,25 @@ static inline time_t cfs_unix_seconds(void)
         cfs_fs_time_t t;
 
         cfs_fs_time_current(&t);
-        return cfs_fs_time_sec(&t);
+        return (time_t)cfs_fs_time_sec(&t);
+}
+
+static inline cfs_time_t cfs_time_shift(int seconds)
+{
+        return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds));
+}
+
+static inline long cfs_timeval_sub(struct timeval *large, struct timeval *small,
+                                   struct timeval *result)
+{
+        long r = (long) (
+                (large->tv_sec - small->tv_sec) * ONE_MILLION +
+                (large->tv_usec - small->tv_usec));
+        if (result != NULL) {
+                result->tv_usec = do_div(r, ONE_MILLION);
+                result->tv_sec = r;
+        }
+        return r;
 }
 
 #define CFS_RATELIMIT(seconds)                                  \
@@ -432,7 +496,7 @@ static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout)
 }
 
 /*
- * Portable memory allocator API (draft)
+ * Universal memory allocator API
  */
 enum cfs_alloc_flags {
         /* allocation is not allowed to block */
@@ -451,22 +515,106 @@ enum cfs_alloc_flags {
         CFS_ALLOC_USER   = CFS_ALLOC_WAIT | CFS_ALLOC_FS | CFS_ALLOC_IO,
 };
 
-#define CFS_SLAB_ATOMIC         CFS_ALLOC_ATOMIC
-#define CFS_SLAB_WAIT           CFS_ALLOC_WAIT
-#define CFS_SLAB_ZERO           CFS_ALLOC_ZERO
-#define CFS_SLAB_FS             CFS_ALLOC_FS
-#define CFS_SLAB_IO             CFS_ALLOC_IO
-#define CFS_SLAB_STD            CFS_ALLOC_STD
-#define CFS_SLAB_USER           CFS_ALLOC_USER
-
 /* flags for cfs_page_alloc() in addition to enum cfs_alloc_flags */
-enum cfs_page_alloc_flags {
+enum cfs_alloc_page_flags {
         /* allow to return page beyond KVM. It has to be mapped into KVM by
          * cfs_page_map(); */
         CFS_ALLOC_HIGH   = (1 << 5),
         CFS_ALLOC_HIGHUSER = CFS_ALLOC_WAIT | CFS_ALLOC_FS | CFS_ALLOC_IO | CFS_ALLOC_HIGH,
 };
 
+/*
+ * portable UNIX device file identification. (This is not _very_
+ * portable. Probably makes no sense for Windows.)
+ */
+/*
+ * Platform defines
+ *
+ * cfs_rdev_t
+ */
+
+typedef unsigned int cfs_major_nr_t;
+typedef unsigned int cfs_minor_nr_t;
+
+/*
+ * Defined by platform.
+ */
+cfs_rdev_t     cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor);
+cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev);
+cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev);
+
+/*
+ * Generic on-wire rdev format.
+ */
+
+typedef __u32 cfs_wire_rdev_t;
+
+cfs_wire_rdev_t cfs_wire_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor);
+cfs_major_nr_t  cfs_wire_rdev_major(cfs_wire_rdev_t rdev);
+cfs_minor_nr_t  cfs_wire_rdev_minor(cfs_wire_rdev_t rdev);
+
+/*
+ * Drop into debugger, if possible. Implementation is provided by platform.
+ */
+
+void cfs_enter_debugger(void);
+
+/*
+ * Defined by platform
+ */
+void cfs_daemonize(char *str);
+#ifdef __KERNEL__
+void cfs_block_allsigs(void);
+void cfs_block_sigs(cfs_sigset_t bits);
+cfs_sigset_t cfs_get_blocked_sigs(void);
+#endif
+
+int convert_server_error(__u64 ecode);
+int convert_client_oflag(int cflag, int *result);
+
+/*
+ * Stack-tracing filling.
+ */
+
+/*
+ * Platform-dependent data-type to hold stack frames.
+ */
+struct cfs_stack_trace;
+
+/*
+ * Fill @trace with current back-trace.
+ */
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace);
+
+/*
+ * Return instruction pointer for frame @frame_no. NULL if @frame_no is
+ * invalid.
+ */
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no);
+
+/*
+ * Universal open flags.
+ */
+#define CFS_O_ACCMODE           0003
+#define CFS_O_CREAT             0100
+#define CFS_O_EXCL              0200
+#define CFS_O_NOCTTY            0400
+#define CFS_O_TRUNC             01000
+#define CFS_O_APPEND            02000
+#define CFS_O_NONBLOCK          04000
+#define CFS_O_NDELAY            CFS_O_NONBLOCK
+#define CFS_O_SYNC              010000
+#define CFS_O_ASYNC             020000
+#define CFS_O_DIRECT            040000
+#define CFS_O_LARGEFILE         0100000
+#define CFS_O_DIRECTORY         0200000
+#define CFS_O_NOFOLLOW          0400000
+#define CFS_O_NOATIME           01000000
+
+/* convert local open flags to universal open flags */
+int cfs_oflags2univ(int flags);
+/* convert universal open flags to local open flags */
+int cfs_univ2oflags(int flags);
 
 #define _LIBCFS_H
 
index 017ca73..ac5e82e 100644 (file)
@@ -91,28 +91,6 @@ static inline void our_cond_resched(void)
 #endif
 #define LASSERT_SEM_LOCKED(sem) LASSERT(down_trylock(sem) != 0)
 
-#ifdef __arch_um__
-#define LBUG_WITH_LOC(file, func, line)                                 \
-do {                                                                    \
-        CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n");       \
-        libcfs_catastrophe = 1;                                        \
-        libcfs_debug_dumplog();                                        \
-        libcfs_run_lbug_upcall(file, func, line);                      \
-        panic("LBUG");                                                  \
-} while (0)
-#else
-#define LBUG_WITH_LOC(file, func, line)                                 \
-do {                                                                    \
-        CEMERG("LBUG\n");                                               \
-        libcfs_catastrophe = 1;                                        \
-        libcfs_debug_dumpstack(NULL);                                  \
-        libcfs_debug_dumplog();                                        \
-        libcfs_run_lbug_upcall(file, func, line);                      \
-        set_task_state(current, TASK_UNINTERRUPTIBLE);                  \
-        schedule();                                                     \
-} while (0)
-#endif /* __arch_um__ */
-
 /* ------------------------------------------------------------------- */
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
@@ -120,7 +98,7 @@ do {                                                                    \
 #define PORTAL_SYMBOL_REGISTER(x) inter_module_register(#x, THIS_MODULE, &x)
 #define PORTAL_SYMBOL_UNREGISTER(x) inter_module_unregister(#x)
 
-#define PORTAL_SYMBOL_GET(x) (void *)inter_module_get(#x)
+#define PORTAL_SYMBOL_GET(x) ((typeof(&x))inter_module_get(#x))
 #define PORTAL_SYMBOL_PUT(x) inter_module_put(#x)
 
 #define PORTAL_MODULE_USE       MOD_INC_USE_COUNT
index c69058f..b2dc2e2 100644 (file)
@@ -8,6 +8,7 @@
 #error Do not #include this file directly. #include <libcfs/libcfs.h> instead
 #endif
 
+#include <stdarg.h>
 #include <libcfs/linux/linux-mem.h>
 #include <libcfs/linux/linux-time.h>
 #include <libcfs/linux/linux-prim.h>
@@ -117,7 +118,7 @@ struct ptldebug_header {
 /* initial pid  */
 #define LUSTRE_LNET_PID          12345
 
-#define ENTRY_NESTING_SUPPORT (0)
+#define ENTRY_NESTING_SUPPORT (1)
 #define ENTRY_NESTING   do {;} while (0)
 #define EXIT_NESTING   do {;} while (0)
 #define __current_nesting_level() (0)
@@ -136,4 +137,17 @@ typedef kernel_cap_t cfs_kernel_cap_t;
 typedef __u32 cfs_kernel_cap_t;
 #endif
 
+#if defined(__KERNEL__)
+/*
+ * No stack-back-tracing in Linux for now.
+ */
+struct cfs_stack_trace {
+};
+
+#ifndef WITH_WATCHDOG
+#define WITH_WATCHDOG
+#endif
+
+#endif
+
 #endif /* _LINUX_LIBCFS_H */
index b046999..3ba5461 100644 (file)
 #include <linux/fs.h>
 #include <linux/stat.h>
 #include <linux/mount.h>
-#endif
+#else /* !__KERNEL__ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <mntent.h>
+#endif  /* __KERNEL__ */
 
 typedef struct file cfs_file_t;
 typedef struct dentry cfs_dentry_t;
@@ -55,17 +64,24 @@ cfs_file_t *cfs_filp_open (const char *name, int flags, int mode, int *err);
 #define cfs_put_file(f)                     fput(f)
 #define cfs_file_count(f)                   file_count(f)
 
-typedef struct file_lock cfs_flock_t; 
-#define CFS_FLOCK_TYPE(fl)                  ((fl)->fl_type)
-#define CFS_FLOCK_SET_TYPE(fl, type)        do { (fl)->fl_type = (type); } while(0)
-#define CFS_FLOCK_PID(fl)                   ((fl)->fl_pid)
-#define CFS_FLOCK_SET_PID(fl, pid)          do { (fl)->fl_pid = (pid); } while(0)
-#define CFS_FLOCK_START(fl)                 ((fl)->fl_start)
-#define CFS_FLOCK_SET_START(fl, start)      do { (fl)->fl_start = (start); } while(0)
-#define CFS_FLOCK_END(fl)                   ((fl)->fl_end)
-#define CFS_FLOCK_SET_END(fl, end)          do { (fl)->fl_end = (end); } while(0)
+typedef struct file_lock cfs_flock_t;
+#define cfs_flock_type(fl)                  ((fl)->fl_type)
+#define cfs_flock_set_type(fl, type)        do { (fl)->fl_type = (type); } while(0)
+#define cfs_flock_pid(fl)                   ((fl)->fl_pid)
+#define cfs_flock_set_pid(fl, pid)          do { (fl)->fl_pid = (pid); } while(0)
+#define cfs_flock_start(fl)                 ((fl)->fl_start)
+#define cfs_flock_set_start(fl, start)      do { (fl)->fl_start = (start); } while(0)
+#define cfs_flock_end(fl)                   ((fl)->fl_end)
+#define cfs_flock_set_end(fl, end)          do { (fl)->fl_end = (end); } while(0)
 
 ssize_t cfs_user_write (cfs_file_t *filp, const char *buf, size_t count, loff_t *offset);
+
+/*
+ * portable UNIX device file identification.
+ */
+
+typedef dev_t cfs_rdev_t;
+
 #endif
 
 #endif
index cb92b3a..f419c9b 100644 (file)
  * - wait_for_completion(c)
  */
 
-/*
- * OSX funnels:
- *
- * No funnels needed in Linux
- */
-#define CFS_DECL_FUNNEL_DATA
-#define CFS_DECL_CONE_DATA             DECLARE_FUNNEL_DATA
-#define CFS_DECL_NET_DATA               DECLARE_FUNNEL_DATA
-#define CFS_CONE_IN                    do {} while(0)
-#define CFS_CONE_EX                    do {} while(0)
-
-#define CFS_NET_IN                      do {} while(0)
-#define CFS_NET_EX                      do {} while(0)
-
 /* __KERNEL__ */
 #else
 
-//#include "../user-lock.h"
+#include "../user-lock.h"
 
 /* __KERNEL__ */
 #endif
index 94b764f..6927daf 100644 (file)
@@ -43,13 +43,15 @@ typedef struct page                     cfs_page_t;
 #define CFS_PAGE_SHIFT                  PAGE_CACHE_SHIFT
 #define CFS_PAGE_MASK                   PAGE_CACHE_MASK
 
-cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order);
-#define cfs_alloc_page(f)              cfs_alloc_pages(f, 0)
-#define cfs_free_pages(p, o)           __free_pages(p, o)
-#define cfs_free_page(p)               __free_pages(p, 0)
+cfs_page_t *cfs_alloc_page(unsigned int flags);
+#define cfs_free_page(p)                __free_pages(p, 0)
 
 static inline void *cfs_page_address(cfs_page_t *page)
 {
+        /*
+         * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
         return page_address(page);
 }
 
@@ -78,8 +80,11 @@ static inline void cfs_set_page_count(cfs_page_t *page, int v)
         set_page_count(page, v);
 }
 
+#define cfs_page_index(p)       ((p)->index)
+
 /*
  * Memory allocator
+ * XXX Liang: move these declarations to a public file
  */
 extern void *cfs_alloc(size_t nr_bytes, u_int32_t flags);
 extern void  cfs_free(void *addr);
@@ -88,12 +93,17 @@ extern void *cfs_alloc_large(size_t nr_bytes);
 extern void  cfs_free_large(void *addr);
 
 /*
+ * In Linux there is no way to determine whether current execution context is
+ * blockable.
+ */
+#define CFS_ALLOC_ATOMIC_TRY   CFS_ALLOC_ATOMIC
+
+/*
  * SLAB allocator
+ * XXX Liang: move these declarations to a public file
  */
 typedef kmem_cache_t    cfs_mem_cache_t;
-extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long,
-                                               void (*)(void *, cfs_mem_cache_t *, unsigned long),
-                                               void (*)(void *, cfs_mem_cache_t *, unsigned long));
+extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long);
 extern int cfs_mem_cache_destroy ( cfs_mem_cache_t * );
 extern void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int);
 extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
@@ -104,6 +114,13 @@ extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
 #define CFS_MMSPACE_OPEN                do { __oldfs = get_fs(); set_fs(get_ds());} while(0)
 #define CFS_MMSPACE_CLOSE               set_fs(__oldfs)
 
+#else   /* !__KERNEL__ */
+#ifdef HAVE_ASM_PAGE_H
+#include <asm/page.h>           /* needed for PAGE_SIZE - rread */
+#endif
+
+#define PAGE_CACHE_SIZE         PAGE_SIZE
+#include <libcfs/user-prim.h>
 /* __KERNEL__ */
 #endif
 
index 0d080d9..c85110f 100644 (file)
 /*
  * Pseudo device register
  */
-typedef struct miscdevice              cfs_psdev_t;
-#define cfs_psdev_register(dev)                misc_register(dev)
-#define cfs_psdev_deregister(dev)      misc_deregister(dev)
+typedef struct miscdevice               cfs_psdev_t;
+#define cfs_psdev_register(dev)         misc_register(dev)
+#define cfs_psdev_deregister(dev)       misc_deregister(dev)
 
 /*
  * Sysctl register
  */
-typedef struct ctl_table               cfs_sysctl_table_t;
-typedef struct ctl_table_header                cfs_sysctl_table_header_t;
+typedef struct ctl_table                cfs_sysctl_table_t;
+typedef struct ctl_table_header         cfs_sysctl_table_header_t;
 
-#define register_cfs_sysctl_table(t, a)        register_sysctl_table(t, a)
-#define unregister_cfs_sysctl_table(t) unregister_sysctl_table(t, a)
+#define cfs_register_sysctl_table(t, a) register_sysctl_table(t, a)
+#define cfs_unregister_sysctl_table(t)  unregister_sysctl_table(t, a)
+
+/*
+ * Symbol register
+ */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define cfs_symbol_register(s, p)       inter_module_register(s, THIS_MODULE, p)
+#define cfs_symbol_unregister(s)        inter_module_unregister(s)
+#define cfs_symbol_get(s)               inter_module_get(s)
+#define cfs_symbol_put(s)               inter_module_put(s)
+#define cfs_module_get()                MOD_INC_USE_COUNT
+#define cfs_module_put()                MOD_DEC_USE_COUNT
+#else
+#define cfs_symbol_register(s, p)       do {} while(0)
+#define cfs_symbol_unregister(s)        do {} while(0)
+#define cfs_symbol_get(s)               symbol_get(s)
+#define cfs_symbol_put(s)               symbol_put(s)
+#define cfs_module_get()                try_module_get(THIS_MODULE)
+#define cfs_module_put()                module_put(THIS_MODULE)
+#endif
 
 /*
  * Proc file system APIs
@@ -73,21 +92,31 @@ typedef struct proc_dir_entry           cfs_proc_dir_entry_t;
 /*
  * Wait Queue
  */
-typedef wait_queue_t                   cfs_waitlink_t;
-typedef wait_queue_head_t              cfs_waitq_t;
+#define CFS_TASK_INTERRUPTIBLE          TASK_INTERRUPTIBLE
+#define CFS_TASK_UNINT                  TASK_UNINTERRUPTIBLE
+
+typedef wait_queue_t                    cfs_waitlink_t;
+typedef wait_queue_head_t               cfs_waitq_t;
 
-#define cfs_waitq_init(w)              init_waitqueue_head(w)
-#define cfs_waitlink_init(l)           init_waitqueue_entry(l, current)
-#define cfs_waitq_add(w, l)            add_wait_queue(w, l)
-#define cfs_waitq_add_exclusive(w, l)  add_wait_queue_exclusive(w, l)
+typedef long                            cfs_task_state_t;
+
+#define CFS_TASK_INTERRUPTIBLE          TASK_INTERRUPTIBLE
+#define CFS_TASK_UNINT                  TASK_UNINTERRUPTIBLE
+
+#define cfs_waitq_init(w)               init_waitqueue_head(w)
+#define cfs_waitlink_init(l)            init_waitqueue_entry(l, current)
+#define cfs_waitq_add(w, l)             add_wait_queue(w, l)
+#define cfs_waitq_add_exclusive(w, l)   add_wait_queue_exclusive(w, l)
 #define cfs_waitq_forward(l, w)         do {} while(0)
-#define cfs_waitq_del(w, l)            remove_wait_queue(w, l)
-#define cfs_waitq_active(w)            waitqueue_active(w)
-#define cfs_waitq_signal(w)            wake_up(w)
-#define cfs_waitq_signal_nr(w,n)       wake_up_nr(w, n)
-#define cfs_waitq_broadcast(w)         wake_up_all(w)
-#define cfs_waitq_wait(l)              schedule()
-#define cfs_waitq_timedwait(l, t)      schedule_timeout(t)
+#define cfs_waitq_del(w, l)             remove_wait_queue(w, l)
+#define cfs_waitq_active(w)             waitqueue_active(w)
+#define cfs_waitq_signal(w)             wake_up(w)
+#define cfs_waitq_signal_nr(w,n)        wake_up_nr(w, n)
+#define cfs_waitq_broadcast(w)          wake_up_all(w)
+#define cfs_waitq_wait(l, s)            schedule()
+#define cfs_waitq_timedwait(l, s, t)    schedule_timeout(t)
+#define cfs_schedule_timeout(s, t)      schedule_timeout(t)
+#define cfs_schedule()                  schedule()
 
 /* Kernel thread */
 typedef int (*cfs_thread_t)(void *);
@@ -98,6 +127,8 @@ typedef int (*cfs_thread_t)(void *);
  */
 typedef struct task_struct              cfs_task_t;
 #define cfs_current()                   current
+#define cfs_task_lock(t)                task_lock(t)
+#define cfs_task_unlock(t)              task_unlock(t)
 #define CFS_DECL_JOURNAL_DATA           void *journal_info
 #define CFS_PUSH_JOURNAL                do {    \
         journal_info = current->journal_info;   \
@@ -115,14 +146,12 @@ module_exit(fini)
 /*
  * Signal
  */
+typedef sigset_t                        cfs_sigset_t;
 #define cfs_sigmask_lock(t, f)          SIGNAL_MASK_LOCK(t, f)
 #define cfs_sigmask_unlock(t, f)        SIGNAL_MASK_UNLOCK(t, f)
 #define cfs_recalc_sigpending(t)        RECALC_SIGPENDING
+#define cfs_clear_sigpending(t)         CLEAR_SIGPENDING
 #define cfs_signal_pending(t)           signal_pending(t)
-#define cfs_sigfillset(s)               sigfillset(s)
-
-#define cfs_set_sig_blocked(t, b)       do { (t)->blocked = b; } while(0)
-#define cfs_get_sig_blocked(t)          (&(t)->blocked)
 
 /*
  * Timer
@@ -174,6 +203,7 @@ static inline void cfs_pause(cfs_duration_t ticks)
 
 #else   /* !__KERNEL__ */
 
+typedef struct proc_dir_entry           cfs_proc_dir_entry_t;
 #include "../user-prim.h"
 
 #endif /* __KERNEL__ */
index f18e7d9..ac3e837 100644 (file)
@@ -227,11 +227,6 @@ static inline cfs_duration_t cfs_time_seconds(int seconds)
         return seconds * HZ;
 }
 
-static inline cfs_time_t cfs_time_shift(int seconds)
-{
-        return jiffies + seconds * HZ;
-}
-
 static inline time_t cfs_duration_sec(cfs_duration_t d)
 {
         return d / HZ;
index 7367d14..efa55db 100644 (file)
  * using the generic single-entry routines.
  */
 
+#ifndef __WINNT__
 #define prefetch(a) ((void)a)
+#else
+#define prefetch(a) ((void *)a)
+#endif
 
 struct list_head {
        struct list_head *next, *prev;
index 0240459..dbeae91 100644 (file)
@@ -11,6 +11,8 @@
 #include <libcfs/linux/lltrace.h>
 #elif defined(__APPLE__)
 #include <libcfs/darwin/lltrace.h>
+#elif defined(__WINNT__)
+#include <libcfs/winnt/lltrace.h>
 #else
 #error Unsupported Operating System
 #endif
index 8be849b..2c74825 100644 (file)
@@ -29,6 +29,8 @@
 #include <libcfs/linux/portals_lib.h>
 #elif defined(__APPLE__)
 #include <libcfs/darwin/portals_lib.h>
+#elif defined(__WINNT__)
+#include <libcfs/winnt/portals_lib.h>
 #else
 #error Unsupported Operating System
 #endif
@@ -68,7 +70,7 @@ static inline int size_round0(int val)
 
 static inline size_t round_strlen(char *fset)
 {
-        return size_round(strlen(fset) + 1);
+        return (size_t)size_round((int)strlen(fset) + 1);
 }
 
 #define LOGL(var,len,ptr)                                       \
index 932caaf..b79eb7e 100644 (file)
@@ -12,6 +12,8 @@
 #include <libcfs/linux/portals_utils.h>
 #elif defined(__APPLE__)
 #include <libcfs/darwin/portals_utils.h>
+#elif defined(__WINNT__)
+#include <libcfs/winnt/portals_utils.h>
 #else
 #error Unsupported Operating System
 #endif
index a4ced3d..cea7a6d 100644 (file)
 
 /*
  * liblustre is single-threaded, so most "synchronization" APIs are trivial.
+ *
+ * XXX Liang: There are several branches share lnet with b_hd_newconfig,
+ * if we define lock APIs at here, there will be conflict with liblustre
+ * in other branches.
  */
 
 #ifndef __KERNEL__
+#include <stdio.h>
+#include <stdlib.h>
 
+#if 0
 /*
  * Optional debugging (magic stamping and checking ownership) can be added.
  */
  *
  * No-op implementation.
  */
-struct spin_lock {};
+struct spin_lock {int foo;};
 
 typedef struct spin_lock spinlock_t;
 
+#define SPIN_LOCK_UNLOCKED (spinlock_t) { }
+#define LASSERT_SPIN_LOCKED(lock) do {} while(0)
+
 void spin_lock_init(spinlock_t *lock);
 void spin_lock(spinlock_t *lock);
 void spin_unlock(spinlock_t *lock);
@@ -66,11 +76,10 @@ int spin_trylock(spinlock_t *lock);
 void spin_lock_bh_init(spinlock_t *lock);
 void spin_lock_bh(spinlock_t *lock);
 void spin_unlock_bh(spinlock_t *lock);
+static inline int spin_is_locked(spinlock_t *l) {return 1;}
 
-static inline void 
-spin_lock_irqsave(spinlock_t *l, unsigned long f) { spin_lock(l); }
-static inline void 
-spin_unlock_irqrestore(spinlock_t *l, unsigned long f) { spin_unlock(l); }
+static inline void spin_lock_irqsave(spinlock_t *l, unsigned long f){}
+static inline void spin_unlock_irqrestore(spinlock_t *l, unsigned long f){}
 
 /*
  * Semaphore
@@ -79,7 +88,9 @@ spin_unlock_irqrestore(spinlock_t *l, unsigned long f) { spin_unlock(l); }
  * - __down(x)
  * - __up(x)
  */
-struct semaphore {};
+typedef struct semaphore {
+    int foo;
+} mutex_t;
 
 void sema_init(struct semaphore *s, int val);
 void __down(struct semaphore *s);
@@ -106,11 +117,13 @@ void __up(struct semaphore *s);
  * - complete(c)
  * - wait_for_completion(c)
  */
+#if 0
 struct completion {};
 
 void init_completion(struct completion *c);
 void complete(struct completion *c);
 void wait_for_completion(struct completion *c);
+#endif
 
 /*
  * rw_semaphore:
@@ -161,6 +174,23 @@ read_lock_irqsave(rwlock_t *l, unsigned long f) { read_lock(l); }
 static inline void
 read_unlock_irqrestore(rwlock_t *l, unsigned long f) { read_unlock(l); }
 
+/*
+ * Atomic for user-space
+ * Copied from liblustre
+ */
+typedef struct { volatile int counter; } atomic_t;
+
+#define ATOMIC_INIT(i) { (i) }
+#define atomic_read(a) ((a)->counter)
+#define atomic_set(a,b) do {(a)->counter = b; } while (0)
+#define atomic_dec_and_test(a) ((--((a)->counter)) == 0)
+#define atomic_inc(a)  (((a)->counter)++)
+#define atomic_dec(a)  do { (a)->counter--; } while (0)
+#define atomic_add(b,a)  do {(a)->counter += b;} while (0)
+#define atomic_sub(b,a)  do {(a)->counter -= b;} while (0)
+
+#endif
+
 /* !__KERNEL__ */
 #endif
 
index 40b15b5..a8ecca8 100644 (file)
 
 #ifndef __KERNEL__
 
+#include <stdlib.h>
+#include <string.h>
+#include <sys/signal.h>
+#include <sys/mman.h>
 #include <libcfs/list.h>
+#include <libcfs/user-time.h>
 
+typedef sigset_t        cfs_sigset_t;
 /*
  * Wait Queue. No-op implementation.
  */
 
-typedef struct cfs_waitlink {} cfs_waitlink_t;
-typedef struct cfs_waitq {} cfs_waitq_t;
+typedef struct cfs_waitlink {
+        struct list_head sleeping;
+        void *process;
+} cfs_waitlink_t;
+
+typedef struct cfs_waitq {
+        struct list_head sleepers;
+} cfs_waitq_t;
 
 void cfs_waitq_init(struct cfs_waitq *waitq);
 void cfs_waitlink_init(struct cfs_waitlink *link);
@@ -57,13 +69,17 @@ void cfs_waitq_del(struct cfs_waitq *waitq, struct cfs_waitlink *link);
 int  cfs_waitq_active(struct cfs_waitq *waitq);
 void cfs_waitq_signal(struct cfs_waitq *waitq);
 void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr);
-void cfs_waitq_broadcast(struct cfs_waitq *waitq);
+void cfs_waitq_broadcast(struct cfs_waitq *waitq, int state);
 void cfs_waitq_wait(struct cfs_waitlink *link);
-int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int64_t timeout);
+int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int state, int64_t timeout);
+#define cfs_schedule_timeout(s, t)              \
+        do {                                    \
+                cfs_waitlink_t    l;            \
+                cfs_waitq_timedwait(&l, s, t);  \
+        } while (0)
 
-/*
- * Allocator
- */
+#define CFS_TASK_INTERRUPTIBLE  (0)
+#define CFS_TASK_UNINT          (0)
 
 /* 2.4 defines */
 
@@ -92,11 +108,8 @@ typedef struct page cfs_page_t;
 #define CFS_PAGE_SHIFT                  PAGE_CACHE_SHIFT
 #define CFS_PAGE_MASK                   PAGE_CACHE_MASK
 
-cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order);
-void cfs_free_pages(struct page *pg, int what);
-
 cfs_page_t *cfs_alloc_page(unsigned int flags);
-void cfs_free_page(cfs_page_t *pg, int what);
+void cfs_free_page(cfs_page_t *pg);
 void *cfs_page_address(cfs_page_t *pg);
 void *cfs_kmap(cfs_page_t *pg);
 void cfs_kunmap(cfs_page_t *pg);
@@ -104,15 +117,28 @@ void cfs_kunmap(cfs_page_t *pg);
 #define cfs_get_page(p)                        __I_should_not_be_called__(at_all)
 #define cfs_page_count(p)              __I_should_not_be_called__(at_all)
 #define cfs_set_page_count(p, v)       __I_should_not_be_called__(at_all)
+#define cfs_page_index(p)               ((p)->index)
 
 /*
  * Memory allocator
+ * Inline functions, so utils can use them without linking libcfs
  */
-void *cfs_alloc(size_t nr_bytes, u_int32_t flags);
-void cfs_free(void *addr);
-void *cfs_alloc_large(size_t nr_bytes);
-void  cfs_free_large(void *addr);
+#define __ALLOC_ZERO    (1 << 2)
+static inline void *cfs_alloc(size_t nr_bytes, u_int32_t flags)
+{
+        void *result;
 
+        result = malloc(nr_bytes);
+        if (result != NULL && (flags & __ALLOC_ZERO))
+                memset(result, 0, nr_bytes);
+        return result;
+}
+
+#define cfs_free(addr)  free(addr)
+#define cfs_alloc_large(nr_bytes) cfs_alloc(nr_bytes, 0)
+#define cfs_free_large(addr) cfs_free(addr)
+
+#define CFS_ALLOC_ATOMIC_TRY   (0)
 /*
  * SLAB allocator
  */
@@ -123,9 +149,7 @@ typedef struct {
 #define SLAB_HWCACHE_ALIGN 0
 
 cfs_mem_cache_t *
-cfs_mem_cache_create(const char *, size_t, size_t, unsigned long,
-                     void (*)(void *, cfs_mem_cache_t *, unsigned long),
-                     void (*)(void *, cfs_mem_cache_t *, unsigned long));
+cfs_mem_cache_create(const char *, size_t, size_t, unsigned long);
 int cfs_mem_cache_destroy(cfs_mem_cache_t *c);
 void *cfs_mem_cache_alloc(cfs_mem_cache_t *c, int gfp);
 void cfs_mem_cache_free(cfs_mem_cache_t *c, void *addr);
@@ -140,8 +164,54 @@ typedef int (cfs_write_proc_t)(struct file *file, const char *buffer,
 /*
  * Timer
  */
+#include <sys/time.h>
+
+typedef struct {
+        struct list_head tl_list;
+        void (*function)(unsigned long unused);
+        unsigned long data;
+        long expires;
+} cfs_timer_t;
+
+#define cfs_init_timer(t)       do {} while(0)
+#define cfs_jiffies                             \
+({                                              \
+        unsigned long _ret = 0;                 \
+        struct timeval tv;                      \
+        if (gettimeofday(&tv, NULL) == 0)       \
+                _ret = tv.tv_sec;               \
+        _ret;                                   \
+})
+
+static inline int cfs_timer_init(cfs_timer_t *l, void (* func)(unsigned long), void *arg)
+{
+        CFS_INIT_LIST_HEAD(&l->tl_list);
+        l->function = func;
+        l->data = (unsigned long)arg;
+        return 0;
+}
+
+static inline int cfs_timer_is_armed(cfs_timer_t *l)
+{
+        if (cfs_time_before(cfs_jiffies, l->expires))
+                return 1;
+        else
+                return 0;
+}
 
-typedef struct cfs_timer {} cfs_timer_t;
+static inline void cfs_timer_arm(cfs_timer_t *l, int thetime)
+{
+        l->expires = thetime;
+}
+
+static inline void cfs_timer_disarm(cfs_timer_t *l)
+{
+}
+
+static inline long cfs_timer_deadline(cfs_timer_t *l)
+{
+        return l->expires;
+}
 
 #if 0
 #define cfs_init_timer(t)      do {} while(0)
@@ -176,6 +246,42 @@ static inline int cfs_psdev_deregister(cfs_psdev_t *foo)
         return 0;
 }
 
+/*
+ * portable UNIX device file identification.
+ */
+
+typedef unsigned int cfs_rdev_t;
+// typedef unsigned long long kdev_t;
+/*
+ */
+#define cfs_lock_kernel()               do {} while (0)
+#define cfs_sigfillset(l) do {}         while (0)
+#define cfs_recalc_sigpending(l)        do {} while (0)
+#define cfs_kernel_thread(l,m,n)        LBUG()
+
+// static inline void local_irq_save(unsigned long flag) {return;}
+// static inline void local_irq_restore(unsigned long flag) {return;}
+
+enum {
+        CFS_STACK_TRACE_DEPTH = 16
+};
+
+struct cfs_stack_trace {
+        void *frame[CFS_STACK_TRACE_DEPTH];
+};
+
+/*
+ * arithmetic
+ */
+#define do_div(a,b)                     \
+        ({                              \
+                unsigned long remainder;\
+                remainder = (a) % (b);  \
+                (a) = (a) / (b);        \
+                (remainder);            \
+        })
+
+
 /* !__KERNEL__ */
 #endif
 
index 1a5807f..59dfb7b 100644 (file)
@@ -115,7 +115,7 @@ static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2)
 
 static inline cfs_duration_t cfs_duration_build(int64_t nano)
 {
-        return nano / ONE_BILLION;
+        return (cfs_duration_t) (nano / ONE_BILLION);
 }
 
 static inline time_t cfs_duration_sec(cfs_duration_t d)
diff --git a/lnet/include/libcfs/winnt/kp30.h b/lnet/include/libcfs/winnt/kp30.h
new file mode 100644 (file)
index 0000000..6c7da07
--- /dev/null
@@ -0,0 +1,152 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under the
+ * terms of version 2 of the GNU General Public License as published by the
+ * Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass
+ * Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_KP30_H__
+#define __LIBCFS_WINNT_KP30_H__
+
+#ifndef __LIBCFS_KP30_H__
+#error Do not #include this file directly. #include <libcfs/kp30.h> instead
+#endif
+
+#ifdef __KERNEL__
+
+#include <libcfs/winnt/portals_compat25.h>
+#include <lnet/types.h>
+
+/* Module parameter support */
+#define CFS_MODULE_PARM(name, t, type, perm, desc) 
+
+#define CFS_SYSFS_MODULE_PARM    0 /* no sysfs access to module parameters */
+
+
+static inline void our_cond_resched()
+{
+    schedule_timeout(1i64);
+}
+                                                                                                                                                                            
+#ifdef CONFIG_SMP
+#define LASSERT_SPIN_LOCKED(lock) do {} while(0) /* XXX */
+#else
+#define LASSERT_SPIN_LOCKED(lock) do {} while(0)
+#endif
+
+
+#define cfs_work_struct_t WORK_QUEUE_ITEM
+#define cfs_prepare_work(tq, routine, contex)
+#define cfs_schedule_work(tq)
+                                                                                                                                                                            
+/* ------------------------------------------------------------------- */
+                                                                                                                                                                            
+#define PORTAL_SYMBOL_REGISTER(x)               cfs_symbol_register(#x, &x)
+#define PORTAL_SYMBOL_UNREGISTER(x)             cfs_symbol_unregister(#x)
+                                                                                                                                                                            
+#define PORTAL_SYMBOL_GET(x)                    (cfs_symbol_get(#x))
+#define PORTAL_SYMBOL_PUT(x)                    cfs_symbol_put(#x)
+                                                                                                                                                                            
+#define PORTAL_MODULE_USE                       do{}while(0)
+#define PORTAL_MODULE_UNUSE                     do{}while(0)
+
+#define printk                                  DbgPrint
+#define ptintf                                  DbgPrint
+                                                                                                                                                                            
+#else  /* !__KERNEL__ */
+
+# include <stdio.h>
+# include <stdlib.h>
+#ifdef __CYGWIN__
+# include <cygwin-ioctl.h>
+#endif
+# include <time.h>
+
+#endif /* End of !__KERNEL__ */
+
+/******************************************************************************/
+/* Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect. */
+#define LWT_SUPPORT  0
+
+/* kernel hasn't defined this? */
+typedef struct {
+        __s64      lwte_when;
+        char       *lwte_where;
+        void       *lwte_task;
+        long_ptr        lwte_p1;
+        long_ptr        lwte_p2;
+        long_ptr        lwte_p3;
+        long_ptr        lwte_p4;
+# if BITS_PER_LONG > 32
+        long_ptr        lwte_pad;
+# endif
+} lwt_event_t;
+
+
+# define LWT_EVENT(p1,p2,p3,p4)
+
+
+/* ------------------------------------------------------------------ */
+
+#define IOCTL_LIBCFS_TYPE long_ptr
+
+#ifdef __CYGWIN__
+# ifndef BITS_PER_LONG
+#  if (~0UL) == 0xffffffffUL
+#   define BITS_PER_LONG 32
+#  else
+#   define BITS_PER_LONG 64
+#  endif
+# endif
+#endif
+
+#if BITS_PER_LONG > 32
+# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
+# define LL_POISON ((long_ptr)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((char *)(long_ptr)0x5a5a5a5a5a5a5a5a)
+#else
+# define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long_ptr)0x5a5a5a5a)
+# define LP_POISON ((char *)(long_ptr)0x5a5a5a5a)
+#endif
+
+#if defined(__x86_64__)
+# define LPU64 "%I64u"
+# define LPD64 "%I64d"
+# define LPX64 "%I64x"
+# define LPSZ  "%lu"
+# define LPSSZ "%ld"
+#elif (BITS_PER_LONG == 32 || __WORDSIZE == 32)
+# define LPU64 "%I64u"
+# define LPD64 "%I64d"
+# define LPX64 "%I64x"
+# define LPSZ  "%u"
+# define LPSSZ "%d"
+#elif (BITS_PER_LONG == 64 || __WORDSIZE == 64)
+# define LPU64 "%I64u"
+# define LPD64 "%I64d"
+# define LPX64 "%I64x"
+# define LPSZ  "%u"
+# define LPSSZ "%d"
+#endif
+#ifndef LPU64
+# error "No word size defined"
+#endif
+
+#endif
diff --git a/lnet/include/libcfs/winnt/libcfs.h b/lnet/include/libcfs/winnt/libcfs.h
new file mode 100644 (file)
index 0000000..5f68ee0
--- /dev/null
@@ -0,0 +1,131 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under the
+ * terms of version 2 of the GNU General Public License as published by the
+ * Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass
+ * Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_LIBCFS_H__
+#define __LIBCFS_WINNT_LIBCFS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+/* workaround for the VC compiler */
+#ifndef __FUNCTION__
+#define __FUNCTION__ "generic"
+#endif
+
+#include <libcfs/winnt/winnt-types.h>
+#include <libcfs/portals_utils.h>
+#include <libcfs/winnt/winnt-time.h>
+#include <libcfs/winnt/winnt-lock.h>
+#include <libcfs/winnt/winnt-mem.h>
+#include <libcfs/winnt/winnt-prim.h>
+#include <libcfs/winnt/winnt-fs.h>
+#include <libcfs/winnt/winnt-tcpip.h>
+
+struct ptldebug_header {
+        __u32 ph_len;
+        __u32 ph_flags;
+        __u32 ph_subsys;
+        __u32 ph_mask;
+        __u32 ph_cpu_id;
+        __u32 ph_sec;
+        __u64 ph_usec;
+        __u32 ph_stack;
+        __u32 ph_pid;
+        __u32 ph_extern_pid;
+        __u32 ph_line_num;
+} __attribute__((packed));
+
+#ifdef __KERNEL__
+
+enum {
+       /* NOTE(review): copied from darwin header — confirm which winnt file fills this */
+       CFS_STACK_TRACE_DEPTH = 16
+};
+
+struct cfs_stack_trace {
+       void *frame[CFS_STACK_TRACE_DEPTH];
+};
+
+static inline __u32 query_stack_size()
+{
+    ULONG   LowLimit, HighLimit;
+
+    IoGetStackLimits(&LowLimit, &HighLimit);
+    ASSERT(HighLimit > LowLimit);
+
+    return (__u32) (HighLimit - LowLimit);
+}
+#else
+static inline __u32 query_stack_size()
+{
+   return 4096;
+}
+#endif
+
+
+#ifndef THREAD_SIZE
+# define THREAD_SIZE query_stack_size()
+#endif
+
+#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5)
+
+#ifdef __KERNEL__
+# ifdef  __ia64__
+#  define CDEBUG_STACK (THREAD_SIZE -                                      \
+                        ((ulong_ptr)__builtin_dwarf_cfa() &            \
+                         (THREAD_SIZE - 1)))
+# else
+#  define CDEBUG_STACK (IoGetRemainingStackSize())
+# endif /* __ia64__ */
+
+#define CHECK_STACK(stack)                                                 \
+        do {                                                                  \
+                if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack) {    \
+                        portals_debug_msg(DEBUG_SUBSYSTEM, D_WARNING,         \
+                                          __FILE__, NULL, __LINE__,   \
+                                          (stack),"maximum lustre stack %u\n",\
+                                          portal_stack = (stack));            \
+                }                                                             \
+        } while (0)
+#else /* !__KERNEL__ */
+#define CHECK_STACK(stack) do { } while(0)
+#define CDEBUG_STACK (0L)
+#endif /* __KERNEL__ */
+
+/* initial pid  */
+#define LUSTRE_LNET_PID          12345
+
+#define ENTRY_NESTING_SUPPORT (0)
+#define ENTRY_NESTING   do {;} while (0)
+#define EXIT_NESTING   do {;} while (0)
+#define __current_nesting_level() (0)
+
+
+#define LBUG_WITH_LOC(_FILE, _FUNC, _LINE)  \
+do {                                        \
+    CEMERG("LBUG: pid: %u thread: %#x\n",   \
+              (unsigned)cfs_curproc_pid(),     \
+           (unsigned)PsGetCurrentThread()); \
+} while(0)
+
+#endif /* __LIBCFS_WINNT_LIBCFS_H__ */
diff --git a/lnet/include/libcfs/winnt/lltrace.h b/lnet/include/libcfs/winnt/lltrace.h
new file mode 100644 (file)
index 0000000..9615e94
--- /dev/null
@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_LLTRACE_H__
+#define __LIBCFS_WINNT_LLTRACE_H__
+
+#ifndef __LIBCFS_LLTRACE_H__
+#error Do not #include this file directly. #include <libcfs/lltrace.h> instead
+#endif
+
+
+#endif
diff --git a/lnet/include/libcfs/winnt/portals_compat25.h b/lnet/include/libcfs/winnt/portals_compat25.h
new file mode 100644 (file)
index 0000000..579b795
--- /dev/null
@@ -0,0 +1,28 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_PORTALS_COMPAT_H__
+#define __LIBCFS_WINNT_PORTALS_COMPAT_H__
+
+
+
+#endif /* __LIBCFS_WINNT_PORTALS_COMPAT_H__ */
diff --git a/lnet/include/libcfs/winnt/portals_lib.h b/lnet/include/libcfs/winnt/portals_lib.h
new file mode 100644 (file)
index 0000000..05fa613
--- /dev/null
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_PORTALS_LIB_H__
+#define __LIBCFS_WINNT_PORTALS_LIB_H__
+
+#ifndef __LIBCFS_PORTALS_LIB_H__
+#error Do not #include this file directly. #include <libcfs/portals_lib.h> instead
+#endif
+
+#endif
diff --git a/lnet/include/libcfs/winnt/portals_utils.h b/lnet/include/libcfs/winnt/portals_utils.h
new file mode 100644 (file)
index 0000000..ec80692
--- /dev/null
@@ -0,0 +1,168 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_PORTALS_UTILS_H__
+#define __LIBCFS_WINNT_PORTALS_UTILS_H__
+
+#ifndef __LIBCFS_PORTALS_UTILS_H__
+#error Do not #include this file directly. #include <libcfs/portals_utils.h> instead
+#endif
+
+#ifndef cfs_is_flag_set
+#define cfs_is_flag_set(x,f) (((x)&(f))==(f))
+#endif
+
+#ifndef cfs_set_flag
+#define cfs_set_flag(x,f)    ((x) |= (f))
+#endif
+
+#ifndef cfs_clear_flag
+#define cfs_clear_flag(x,f)  ((x) &= ~(f))
+#endif
+
+
+static inline __u32 __do_div(__u32 * n, __u32 b) 
+{
+    __u32   mod;
+
+    mod = *n % b;
+    *n  = *n / b;
+    return mod;
+} 
+
+#define do_div(n,base)  __do_div((__u32 *)&(n), (__u32) (base))
+
+#ifdef __KERNEL__
+
+#include <stdlib.h>
+#include <libcfs/winnt/winnt-types.h>
+
+char * strsep(char **s, const char *ct);
+static inline size_t strnlen(const char * s, size_t count) {
+    size_t len = 0;
+    while(len < count && s[len++]);
+    return len;
+}
+char * ul2dstr(ulong_ptr address, char *buf, int len);
+
+#define simple_strtol(a1, a2, a3)               strtol(a1, a2, a3)
+#define simple_strtoll(a1, a2, a3)              (__s64)strtoull(a1, a2, a3)
+#define simple_strtoull(a1, a2, a3)             strtoull(a1, a2, a3)
+
+unsigned long simple_strtoul(const char *cp,char **endp, unsigned int base);
+
+static inline int test_bit(int nr, void * addr)
+{
+    return ((1UL << (nr & 31)) & (((volatile ULONG *) addr)[nr >> 5])) != 0;
+}
+
+static inline void clear_bit(int nr, void * addr)
+{
+    (((volatile ULONG *) addr)[nr >> 5]) &= (~(1UL << (nr & 31)));
+}
+
+
+static inline void set_bit(int nr, void * addr)
+{
+    (((volatile ULONG *) addr)[nr >> 5]) |= (1UL << (nr & 31));
+}
+
+static inline void read_random(char *buf, int len)
+{
+    ULONG   Seed = (ULONG) buf;
+    Seed = RtlRandom(&Seed);
+    while (len >0) {
+        if (len > sizeof(ULONG)) {
+            memcpy(buf, &Seed, sizeof(ULONG));
+            len -= sizeof(ULONG);
+            buf += sizeof(ULONG);
+        } else {
+            memcpy(buf, &Seed, len);
+            len = 0;
+            break;
+        } 
+    }
+}
+#define get_random_bytes(buf, len)  read_random(buf, len)
+
+/* do NOT use function or expression as parameters ... */
+
+#ifndef min_t
+#define min_t(type,x,y) (type)(x) < (type)(y) ? (x): (y)
+#endif
+
+#ifndef max_t
+#define max_t(type,x,y) (type)(x) < (type)(y) ? (y): (x)
+#endif
+
+
+#define NIPQUAD(addr)                      \
+       ((unsigned char *)&addr)[0],    \
+       ((unsigned char *)&addr)[1],    \
+       ((unsigned char *)&addr)[2],    \
+       ((unsigned char *)&addr)[3]
+
+#define HIPQUAD(addr)                      \
+       ((unsigned char *)&addr)[3],    \
+       ((unsigned char *)&addr)[2],    \
+       ((unsigned char *)&addr)[1],    \
+       ((unsigned char *)&addr)[0]
+
+static int copy_from_user(void *to, void *from, int c) 
+{
+    memcpy(to, from, c);
+    return 0;
+}
+
+static int copy_to_user(void *to, void *from, int c) 
+{
+    memcpy(to, from, c);
+    return 0;
+}
+
+
+#define put_user(x, ptr)        \
+(                               \
+    *(ptr) = x,                 \
+    0                           \
+)
+
+
+#define get_user(x,ptr)         \
+(                               \
+    x = *(ptr),                 \
+    0                           \
+)
+
+#define num_physpages                  (64 * 1024)
+
+#define snprintf  _snprintf
+#define vsnprintf _vsnprintf
+
+
+#endif /* !__KERNEL__ */
+
+int cfs_error_code(NTSTATUS);
+
+#endif
diff --git a/lnet/include/libcfs/winnt/winnt-fs.h b/lnet/include/libcfs/winnt/winnt-fs.h
new file mode 100644 (file)
index 0000000..39fb9d0
--- /dev/null
@@ -0,0 +1,279 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * File operations & routines.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_CFS_FS_H__
+#define __LIBCFS_WINNT_CFS_FS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+
+/*
+ * Platform defines
+ *
+ * cfs_rdev_t
+ */
+
+typedef unsigned short cfs_rdev_t;
+
+typedef unsigned int cfs_major_nr_t;
+typedef unsigned int cfs_minor_nr_t;
+
+
+#define MINORBITS      8
+#define MINORMASK      ((1U << MINORBITS) - 1)
+
+#define MAJOR(dev)     ((unsigned int) ((dev) >> MINORBITS))
+#define MINOR(dev)     ((unsigned int) ((dev) & MINORMASK))
+#define NODEV          0
+#define MKDEV(ma,mi)   (((ma) << MINORBITS) | (mi))
+
+
+static inline cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor)
+{
+    return MKDEV(major, minor);
+}
+
+static inline cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev)
+{
+    return MAJOR(rdev);
+}
+
+static inline cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev)
+{
+    return MINOR(rdev);
+}
+
+
+#ifdef __KERNEL__
+
+struct file_operations
+{
+    loff_t (*lseek)(struct file * file, loff_t offset, int origin);
+    ssize_t (*read) (struct file * file, char * buf, size_t nbytes, loff_t *ppos);
+    ssize_t (*write)(struct file * file, const char * buffer,
+        size_t count, loff_t *ppos);
+    int (*ioctl) (struct file *, unsigned int, ulong_ptr);
+    int (*open) (struct file *);
+    int (*release) (struct file *);
+};
+
+struct file {
+
+    cfs_handle_t            f_handle;
+    unsigned int            f_flags;
+    mode_t                  f_mode;
+    ulong_ptr           f_count;
+
+    //struct list_head      f_list;
+    //struct dentry *       f_dentry;
+
+    cfs_proc_entry_t *      proc_dentry;
+    cfs_file_operations_t * f_op;
+
+    size_t                  f_size;
+    loff_t                  f_pos;
+    unsigned int            f_uid, f_gid;
+    int                     f_error;
+
+    ulong_ptr           f_version;
+
+    void *                  private_data;
+
+    char                    f_name[1];
+
+};
+
+#define cfs_filp_size(f)               ((f)->f_size)
+#define cfs_filp_poff(f)                (&(f)->f_pos)
+
+cfs_file_t *cfs_filp_open(const char *name, int flags, int mode, int *err);
+int cfs_filp_close(cfs_file_t *fp);
+int cfs_filp_read(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos);
+int cfs_filp_write(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos);
+int cfs_filp_fsync(cfs_file_t *fp);
+int cfs_get_file(cfs_file_t *fp);
+int cfs_put_file(cfs_file_t *fp);
+int cfs_file_count(cfs_file_t *fp);
+
+
+
+/*
+ * CFS_FLOCK routines
+ */
+
+typedef struct file_lock{
+    int         fl_type;
+    pid_t       fl_pid;
+    size_t      fl_len;
+    off_t       fl_start;
+    off_t       fl_end;
+} cfs_flock_t; 
+
+#define CFS_INT_LIMIT(x)               (~((x)1 << (sizeof(x)*8 - 1)))
+#define CFS_OFFSET_MAX                 CFS_INT_LIMIT(loff_t)
+
+#define cfs_flock_type(fl)                  ((fl)->fl_type)
+#define cfs_flock_set_type(fl, type)        do { (fl)->fl_type = (type); } while(0)
+#define cfs_flock_pid(fl)                   ((fl)->fl_pid)
+#define cfs_flock_set_pid(fl, pid)          do { (fl)->fl_pid = (pid); } while(0)
+#define cfs_flock_start(fl)                 ((fl)->fl_start)
+#define cfs_flock_set_start(fl, start)      do { (fl)->fl_start = (start); } while(0)
+#define cfs_flock_end(fl)                   ((fl)->fl_end)
+#define cfs_flock_set_end(fl, end)          do { (fl)->fl_end = (end); } while(0)
+
+#define ATTR_MODE       0x0001
+#define ATTR_UID        0x0002
+#define ATTR_GID        0x0004
+#define ATTR_SIZE       0x0008
+#define ATTR_ATIME      0x0010
+#define ATTR_MTIME      0x0020
+#define ATTR_CTIME      0x0040
+#define ATTR_ATIME_SET  0x0080
+#define ATTR_MTIME_SET  0x0100
+#define ATTR_FORCE      0x0200  /* Not a change, but a change it */
+#define ATTR_ATTR_FLAG  0x0400
+#define ATTR_RAW        0x0800  /* file system, not vfs will massage attrs */
+#define ATTR_FROM_OPEN  0x1000  /* called from open path, ie O_TRUNC */
+//#define ATTR_CTIME_SET  0x2000
+
+#define in_group_p(x)  (0)
+
+/*
+ * proc fs routines
+ */
+
+int proc_init_fs();
+void proc_destroy_fs();
+
+
+/*
+ *  misc
+ */
+
+static inline void *ERR_PTR(long_ptr error)
+{
+       return (void *) error;
+}
+
+static inline long_ptr PTR_ERR(const void *ptr)
+{
+       return (long_ptr) ptr;
+}
+
+static inline long_ptr IS_ERR(const void *ptr)
+{
+       return (ulong_ptr)ptr > (ulong_ptr)-1000L;
+}
+
+#else  /* !__KERNEL__ */
+
+#define CREATE_NEW          1
+#define CREATE_ALWAYS       2
+#define OPEN_EXISTING       3
+#define OPEN_ALWAYS         4
+#define TRUNCATE_EXISTING   5
+
+#define SECTION_QUERY       0x0001
+#define SECTION_MAP_WRITE   0x0002
+#define SECTION_MAP_READ    0x0004
+#define SECTION_MAP_EXECUTE 0x0008
+#define SECTION_EXTEND_SIZE 0x0010
+
+#define FILE_MAP_COPY       SECTION_QUERY
+#define FILE_MAP_WRITE      SECTION_MAP_WRITE
+#define FILE_MAP_READ       SECTION_MAP_READ
+#define FILE_MAP_ALL_ACCESS SECTION_ALL_ACCESS
+
+
+NTSYSAPI
+HANDLE
+NTAPI
+CreateFileA(
+    IN LPCSTR lpFileName,
+    IN DWORD dwDesiredAccess,
+    IN DWORD dwShareMode,
+    IN PVOID lpSecurityAttributes,
+    IN DWORD dwCreationDisposition,
+    IN DWORD dwFlagsAndAttributes,
+    IN HANDLE hTemplateFile
+    );
+
+#define CreateFile  CreateFileA
+
+NTSYSAPI
+BOOL
+NTAPI
+CloseHandle(
+    IN OUT HANDLE hObject
+    );
+
+NTSYSAPI
+HANDLE
+NTAPI
+CreateFileMappingA(
+    IN HANDLE hFile,
+    IN PVOID lpFileMappingAttributes,
+    IN DWORD flProtect,
+    IN DWORD dwMaximumSizeHigh,
+    IN DWORD dwMaximumSizeLow,
+    IN LPCSTR lpName
+    );
+#define CreateFileMapping  CreateFileMappingA
+
+NTSYSAPI
+DWORD
+NTAPI
+GetFileSize(
+    IN HANDLE hFile,
+    OUT DWORD * lpFileSizeHigh
+    );
+
+NTSYSAPI
+PVOID
+NTAPI
+MapViewOfFile(
+    IN HANDLE hFileMappingObject,
+    IN DWORD dwDesiredAccess,
+    IN DWORD dwFileOffsetHigh,
+    IN DWORD dwFileOffsetLow,
+    IN SIZE_T dwNumberOfBytesToMap
+    );
+
+NTSYSAPI
+BOOL
+NTAPI
+UnmapViewOfFile(
+    IN PVOID lpBaseAddress
+    );
+
+#endif /* __KERNEL__ */
+
+typedef struct {
+       void    *d;
+} cfs_dentry_t;
+
+
+#endif /* __LIBCFS_WINNT_CFS_FS_H__*/
diff --git a/lnet/include/libcfs/winnt/winnt-lock.h b/lnet/include/libcfs/winnt/winnt-lock.h
new file mode 100644 (file)
index 0000000..5a94c1e
--- /dev/null
@@ -0,0 +1,683 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_CFS_LOCK_H__
+#define __LIBCFS_WINNT_CFS_LOCK_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+#ifdef __KERNEL__
+
+
+/*
+ *  nt specific part ...
+ */
+
+
+/* atomic */
+
+typedef struct { volatile int counter; } atomic_t;
+
+#define ATOMIC_INIT(i) { i }
+
+#define atomic_read(v) ((v)->counter)
+#define atomic_set(v,i)                (((v)->counter) = (i))
+
+void FASTCALL atomic_add(int i, atomic_t *v);
+void FASTCALL atomic_sub(int i, atomic_t *v);
+
+int FASTCALL atomic_sub_and_test(int i, atomic_t *v);
+
+void FASTCALL atomic_inc(atomic_t *v);
+void FASTCALL atomic_dec(atomic_t *v);
+
+int FASTCALL atomic_dec_and_test(atomic_t *v);
+int FASTCALL atomic_inc_and_test(atomic_t *v);
+
+
+/* event */
+
+typedef KEVENT          event_t;
+
+/*
+ * cfs_init_event
+ *   To initialize the event object
+ *
+ * Arguments:
+ *   event:  pointer to the event object
+ *   type:   Non Zero: SynchronizationEvent
+ *           Zero: NotificationEvent
+ *   status: the initial stats of the event
+ *           Non Zero: signaled
+ *           Zero: un-signaled
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+static inline void
+cfs_init_event(event_t *event, int type, int status)
+{
+    /* Map the (type, status) flags onto the native NT event kinds:
+     * a non-zero type selects an auto-reset (synchronization) event,
+     * a non-zero status arms the event as initially signaled. */
+    EVENT_TYPE kind = type ? SynchronizationEvent : NotificationEvent;
+    BOOLEAN    set  = status ? TRUE : FALSE;
+
+    KeInitializeEvent(event, kind, set);
+}
+
+/*
+ * cfs_wait_event
+ *   To wait on an event to synchronize with other contexts
+ *
+ * Arguments:
+ *   event:  pointer to the event object
+ *   timeout: the timeout for waiting, or 0 for an infinite wait.
+ *
+ * Return Value:
+ *   Zero:   waiting timeouts
+ *   Non Zero: event signaled ...
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline int64_t
+cfs_wait_event(event_t * event, int64_t timeout)
+{
+    NTSTATUS        Status;
+    LARGE_INTEGER   TimeOut;
+
+    /* NT relative timeouts are negative values in 100ns units; the
+     * 'timeout' argument is in HZ ticks, hence the 10000000/HZ scale. */
+    TimeOut.QuadPart = -1 * (10000000/HZ) * timeout;
+
+    /* timeout == 0 requests an infinite wait (NULL timeout below). */
+    Status = KeWaitForSingleObject(
+                event,
+                Executive,
+                KernelMode,
+                FALSE,
+                (timeout != 0) ? (&TimeOut) : (NULL)
+                );
+
+    if (Status == STATUS_TIMEOUT)  {
+        return 0;
+    }
+
+    return TRUE; // signaled case
+}
+
+/*
+ * cfs_wake_event
+ *   To signal the event object
+ *
+ * Arguments:
+ *   event:  pointer to the event object
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline int
+cfs_wake_event(event_t *event)
+{
+    /* KeSetEvent returns the event's previous signaled state; report
+     * it as a non-zero/zero int, exactly as the original did. */
+    LONG previous = KeSetEvent(event, 0, FALSE);
+
+    return previous != 0;
+}
+
+/*
+ * cfs_clear_event
+ *   To clear/reset the status of the event object
+ *
+ * Arguments:
+ *   event:  pointer to the event object
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void
+cfs_clear_event(event_t *event)
+{
+    /* Force the event back to the non-signaled state; the previous
+     * state reported by KeResetEvent is deliberately ignored. */
+    (void) KeResetEvent(event);
+}
+
+
+/*
+ * IMPORTANT !!!!!!!!
+ *
+ * Lock declarations are not guaranteed to be initialized, although
+ * some of them are auto-initialized on Linux.  All locks declared
+ * through CFS_DECL_* must be initialized explicitly.
+ */
+
+
+/*
+ * spin lock defintions / routines
+ */
+
+/*
+ * Warning:
+ *
+ * for spinlock operations, try to grab nesting acquisition of
+ * spinlock will cause dead-lock in MP system and current irql 
+ * overwritten for UP system. (UP system could allow nesting spin
+ * acqisition, because it's not spin at all just raising the irql.)
+ *
+ */
+
+/* NT spinlock plus the IRQL saved at acquisition time.  Because the
+ * saved irql lives inside the lock itself, a lock must be released
+ * from the same acquisition it recorded (see the nesting warning
+ * above). */
+typedef struct spin_lock {
+
+    KSPIN_LOCK lock;
+    KIRQL      irql;
+
+} spinlock_t;
+
+
+#define CFS_DECL_SPIN(name)  spinlock_t name;
+#define CFS_DECL_SPIN_EXTERN(name)  extern spinlock_t name;
+
+
+static inline void spin_lock_init(spinlock_t *lock)
+{
+    /* Only the native KSPIN_LOCK needs initialization; the saved
+     * irql field is (re)written on every acquisition. */
+    KSPIN_LOCK *native = &lock->lock;
+
+    KeInitializeSpinLock(native);
+}
+
+
+/* Acquire: raises to DISPATCH_LEVEL and records the previous IRQL in
+ * the lock itself. */
+static inline void spin_lock(spinlock_t *lock)
+{
+    KeAcquireSpinLock(&(lock->lock), &(lock->irql));
+}
+
+/* Release: restores the IRQL saved by the matching spin_lock(). */
+static inline void spin_unlock(spinlock_t *lock)
+{
+    KeReleaseSpinLock(&(lock->lock), lock->irql);
+}
+
+
+#define spin_lock_irqsave(lock, flags)         do {(flags) = 0; spin_lock(lock);} while(0)
+#define spin_unlock_irqrestore(lock, flags)    do {spin_unlock(lock);} while(0)
+
+
+/* There's no  corresponding routine in windows kernel.
+   We must realize a light one of our own.  But there's
+   no way to identify the system is MP build or UP build
+   on the runtime. We just uses a workaround for it. */
+
+extern int MPSystem;
+
+/* Try to take the lock without spinning.  Returns non-zero on success
+ * (lock held, IRQL raised and saved), zero on failure (IRQL restored).
+ * See the MPSystem note above: NT offers no native trylock. */
+static int spin_trylock(spinlock_t *lock)
+{
+    KIRQL   Irql;
+    int     rc = 0;
+#if _X86_
+    /* Address of the lock word for the inline asm below.  The original
+     * asm read the argument from the hard-coded frame offset [ebp+8],
+     * which only holds for a non-inlined cdecl build with frame
+     * pointers enabled; referencing a C variable symbolically is safe
+     * regardless of codegen. */
+    void *  slot = &lock->lock;
+#endif
+
+    ASSERT(lock != NULL);
+
+    /* Raise first so that, on success, we leave in exactly the state
+     * spin_lock() would: at DISPATCH_LEVEL with the old IRQL saved. */
+    KeRaiseIrql(DISPATCH_LEVEL, &Irql);
+
+    if (MPSystem) {
+        if (0 == (ulong_ptr)lock->lock) {
+#if _X86_
+            __asm {
+                mov  edx, slot
+                lock bts dword ptr [edx], 0
+                jb   lock_failed
+                mov  rc, TRUE
+            lock_failed:
+            }
+#else
+        KdBreakPoint();
+#endif
+
+        }
+    } else {
+        /* UP: raising to DISPATCH_LEVEL alone provides the exclusion. */
+        rc = TRUE;
+    }
+
+    if (rc) {
+        lock->irql = Irql;
+    } else {
+        KeLowerIrql(Irql);
+    }
+
+    return rc;
+}
+
+#define spin_lock_bh(x)                    spin_lock(x)
+#define spin_unlock_bh(x)          spin_unlock(x)
+#define spin_lock_bh_init(x)   spin_lock_init(x)
+
+/*
+ * rw_semaphore (using ERESOURCE)
+ */
+
+
+typedef struct rw_semaphore {
+    ERESOURCE   rwsem;
+} rw_semaphore_t;
+
+
+#define CFS_DECL_RWSEM(name) rw_semaphore_t name
+#define CFS_DECL_RWSEM_EXTERN(name) extern rw_semaphore_t name
+
+
+/*
+ * init_rwsem
+ *   To initialize the rw_semaphore_t structure
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void init_rwsem(rw_semaphore_t *s)
+{
+    /* Back the rw-semaphore with a native executive resource; the
+     * NTSTATUS result is ignored, as the original did implicitly. */
+    (void) ExInitializeResourceLite(&s->rwsem);
+}
+
+
+/*
+ * fini_rwsem
+ *   To finalize/destroy the rw_semaphore_t structure
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   For winnt system, we need this routine to delete the ERESOURCE.
+ *   Just define it NULL for other systems.
+ */
+
+static inline void fini_rwsem(rw_semaphore_t *s)
+{
+    /* ERESOURCEs must be torn down explicitly on NT (see the note
+     * above); the NTSTATUS result is ignored as before. */
+    (void) ExDeleteResourceLite(&s->rwsem);
+}
+
+/*
+ * down_read
+ *   To acquire read-lock of the rw_semahore
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/* Blocking shared acquire.  NOTE(review): per the WDK docs,
+ * ExAcquireResourceSharedLite requires normal kernel APCs to be
+ * disabled (KeEnterCriticalRegion) before the call -- confirm the
+ * callers run in such a context. */
+static inline void down_read(struct rw_semaphore *s)
+{
+       ExAcquireResourceSharedLite(&s->rwsem, TRUE);
+}
+
+
+/*
+ * down_read_trylock
+ *   To acquire read-lock of the rw_semahore without blocking
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   Zero: failed to acquire the read lock
+ *   Non-Zero: succeeded to acquire the read lock
+ *
+ * Notes: 
+ *   This routine will return immediately without waiting.
+ */
+
+/* Non-blocking shared acquire: forwards the BOOLEAN result of the NT
+ * API, so non-zero means the shared lock was obtained. */
+static inline int down_read_trylock(struct rw_semaphore *s)
+{
+       return ExAcquireResourceSharedLite(&s->rwsem, FALSE);
+}
+
+
+/*
+ * down_write
+ *   To acquire write-lock of the rw_semahore
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/* Blocking exclusive acquire.  NOTE(review): per the WDK docs,
+ * ExAcquireResourceExclusiveLite requires normal kernel APCs to be
+ * disabled before the call -- confirm the callers' context. */
+static inline void down_write(struct rw_semaphore *s)
+{
+       ExAcquireResourceExclusiveLite(&(s->rwsem), TRUE);
+}
+
+
+/*
+ * down_write_trylock
+ *   To acquire write-lock of the rw_semahore without blocking
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   Zero: failed to acquire the write lock
+ *   Non-Zero: succeeded to acquire the write lock
+ *
+ * Notes: 
+ *   This routine will return immediately without waiting.
+ */
+
+/* Non-blocking exclusive acquire: forwards the BOOLEAN result of the
+ * NT API, so non-zero means the write lock was obtained. */
+static inline int down_write_trylock(struct rw_semaphore *s)
+{
+    return ExAcquireResourceExclusiveLite(&(s->rwsem), FALSE);
+}
+
+
+/*
+ * up_read
+ *   To release read-lock of the rw_semahore
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/* Release a shared hold taken by down_read()/down_read_trylock().
+ * Must run on the same thread that acquired the resource. */
+static inline void up_read(struct rw_semaphore *s)
+{
+    ExReleaseResourceForThreadLite(
+            &(s->rwsem),
+            ExGetCurrentResourceThread());
+}
+
+
+/*
+ * up_write
+ *   To release write-lock of the rw_semahore
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/* Release an exclusive hold taken by down_write()/down_write_trylock().
+ * Must run on the same thread that acquired the resource. */
+static inline void up_write(struct rw_semaphore *s)
+{
+    ExReleaseResourceForThreadLite(
+                &(s->rwsem),
+                ExGetCurrentResourceThread());
+}
+
+/*
+ * rwlock_t (using sempahore)
+ *
+ * - rwlock_init(x)
+ * - read_lock(x)
+ * - read_unlock(x)
+ * - write_lock(x)
+ * - write_unlock(x)
+ */
+
+/* Reader/writer lock emulated with a guard spinlock plus a counter;
+ * the lock/unlock operations below are implemented out of line. */
+typedef struct {
+    spinlock_t guard;
+    int        count;
+} rwlock_t;
+
+void rwlock_init(rwlock_t * rwlock);
+void rwlock_fini(rwlock_t * rwlock);
+
+void read_lock(rwlock_t * rwlock);
+void read_unlock(rwlock_t * rwlock);
+void write_lock(rwlock_t * rwlock);
+void write_unlock(rwlock_t * rwlock);
+
+#define write_lock_irqsave(l, f)        do {f = 0; write_lock(l);} while(0)
+#define write_unlock_irqrestore(l, f)   do {write_unlock(l);} while(0)
+#define read_lock_irqsave(l, f)                do {f=0; read_lock(l);} while(0)
+#define read_unlock_irqrestore(l, f)    do {read_unlock(l);} while(0)
+
+
+/*
+ * Semaphore
+ *
+ * - sema_init(x, v)
+ * - __down(x)
+ * - __up(x)
+ */
+
+/* Counting semaphore backed by a native KSEMAPHORE.  Note the typedef
+ * name: 'mutex_t' is declared here because the mutex emulation below
+ * is built directly on a semaphore initialized with a count of 1. */
+typedef struct semaphore {
+       KSEMAPHORE sem;
+} mutex_t;
+
+/* Initialize with current count == limit == val. */
+static inline void sema_init(struct semaphore *s, int val)
+{
+       KeInitializeSemaphore(&s->sem, val, val);
+}
+
+/* P operation: uninterruptible, untimed wait for one unit. */
+static inline void __down(struct semaphore *s)
+{
+   KeWaitForSingleObject( &(s->sem), Executive,
+                          KernelMode, FALSE, NULL );
+
+}
+
+/* V operation: release one unit, no priority boost. */
+static inline void __up(struct semaphore *s)
+{
+       KeReleaseSemaphore(&s->sem, 0, 1, FALSE);
+}
+
+/*
+ * mutex_t:
+ *
+ * - init_mutex(x)
+ * - init_mutex_locked(x)
+ * - mutex_up(x)
+ * - mutex_down(x)
+ */
+
+
+/*
+ * init_mutex
+ *   To initialize a mutex_t structure
+ *
+ * Arguments:
+ *   mutex:  pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void init_mutex(mutex_t *mutex)
+{
+    sema_init(mutex, 1);    /* binary semaphore == unlocked mutex */
+}
+
+
+/*
+ * mutex_down
+ *   To acquire the mutex lock
+ *
+ * Arguments:
+ *   mutex:  pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void mutex_down(mutex_t *mutex)
+{
+    __down(mutex);      /* uninterruptible acquire */
+}
+
+
+/*
+ * mutex_up
+ *   To release the mutex lock (acquired already)
+ *
+ * Arguments:
+ *   mutex:  pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void mutex_up(mutex_t *mutex)
+{
+    __up(mutex);        /* release one unit */
+}
+
+
+/*
+ * init_mutex_locked
+ *   To initialize the mutex as acquired state
+ *
+ * Arguments:
+ *   mutex:  pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/* Initialize 'mutex' already held by the caller.  The original
+ * declaration omitted the return type (implicit int -- invalid since
+ * C99); it is void, like the other mutex helpers. */
+static inline void init_mutex_locked(mutex_t *mutex)
+{
+    init_mutex(mutex);
+    mutex_down(mutex);
+}
+
+/*
+ * completion
+ *
+ * - init_complition(c)
+ * - complete(c)
+ * - wait_for_completion(c)
+ */
+
+/* Completion emulated with a single auto-reset event (see
+ * init_completion).  NOTE(review): unlike Linux completions this does
+ * not count signals -- multiple complete() calls before any wait
+ * collapse into one -- confirm callers tolerate that. */
+struct completion {
+       event_t  event;
+};
+
+
+/*
+ * init_completion
+ *   To initialize the completion object
+ *
+ * Arguments:
+ *   c:  pointer to the completion structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void init_completion(struct completion *c)
+{
+    /* Auto-reset (synchronization) event, initially non-signaled:
+     * each complete() releases at most one waiter. */
+    cfs_init_event(&c->event, 1, FALSE);
+}
+
+
+/*
+ * complete
+ *   To complete/signal the completion object
+ *
+ * Arguments:
+ *   c:  pointer to the completion structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void complete(struct completion *c)
+{
+    /* Signal the underlying event; the int result of cfs_wake_event
+     * carries no information callers need, so it is discarded. */
+    (void) cfs_wake_event(&c->event);
+}
+
+/*
+ * wait_for_completion
+ *   To wait on the completion object. If the event is signaled,
+ *   this function will return to the caller with the event un-signaled.
+ *
+ * Arguments:
+ *   c:  pointer to the completion structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void wait_for_completion(struct completion *c)
+{
+    /* Block until complete() fires; timeout 0 means an infinite wait
+     * for cfs_wait_event. */
+    (void) cfs_wait_event(&c->event, 0);
+}
+
+/* __KERNEL__ */
+#else
+
+#include "../user-lock.h"
+
+/* __KERNEL__ */
+#endif
+#endif
diff --git a/lnet/include/libcfs/winnt/winnt-mem.h b/lnet/include/libcfs/winnt/winnt-mem.h
new file mode 100644 (file)
index 0000000..ef468ba
--- /dev/null
@@ -0,0 +1,138 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines of memory manipulation routines .
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_CFS_MEM_H__
+#define __LIBCFS_WINNT_CFS_MEM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+#ifdef __KERNEL__
+
+#define CFS_PAGE_SIZE                   PAGE_SIZE
+#define CFS_PAGE_SHIFT                  PAGE_SHIFT
+#define CFS_PAGE_MASK                   (~(PAGE_SIZE - 1))
+
+/* Minimal page emulation: a kernel buffer plus a reference count.
+ * There is no page cache behind this -- 'addr' is the backing
+ * allocation itself. */
+typedef struct cfs_page {
+    void *      addr;
+    atomic_t    count;
+} cfs_page_t;
+
+
+cfs_page_t *cfs_alloc_page(int flags);
+void cfs_free_page(cfs_page_t *pg);
+
+/* Kernel address of the page's backing buffer. */
+static inline void *cfs_page_address(cfs_page_t *page)
+{
+    return page->addr;
+}
+
+/* kmap/kunmap are no-ops here: the buffer is permanently mapped. */
+static inline void *cfs_kmap(cfs_page_t *page)
+{
+    return page->addr;
+}
+
+static inline void cfs_kunmap(cfs_page_t *page)
+{
+    return;
+}
+
+/* Reference counting.  Note that nothing here frees the page when the
+ * count drops to zero -- that remains the caller's responsibility. */
+static inline void cfs_get_page(cfs_page_t *page)
+{
+    atomic_inc(&page->count);
+}
+
+static inline void cfs_put_page(cfs_page_t *page)
+{
+    atomic_dec(&page->count);
+}
+
+static inline int cfs_page_count(cfs_page_t *page)
+{
+    return atomic_read(&page->count);
+}
+
+static inline void cfs_set_page_count(cfs_page_t *page, int v)
+{
+    atomic_set(&page->count, v);
+}
+
+/*
+ * Memory allocator
+ */
+
+#define CFS_ALLOC_ATOMIC_TRY   (0)
+
+extern void *cfs_alloc(size_t nr_bytes, u_int32_t flags);
+extern void  cfs_free(void *addr);
+
+extern void *cfs_alloc_large(size_t nr_bytes);
+extern void  cfs_free_large(void *addr);
+
+/*
+ * SLAB allocator
+ */
+
+#define SLAB_HWCACHE_ALIGN             0
+
+/* The cache name is limited to 20 chars */
+
+/* SLAB-style object cache backed by an NT non-paged lookaside list.
+ * The name buffer is 20 bytes, matching the limit noted above. */
+typedef struct cfs_mem_cache {
+
+    char                    name[20];
+    ulong_ptr           flags;
+    NPAGED_LOOKASIDE_LIST   npll;
+
+} cfs_mem_cache_t;
+
+
+extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, ulong_ptr);
+extern int cfs_mem_cache_destroy ( cfs_mem_cache_t * );
+extern void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int);
+extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
+
+
+/*
+ * Page allocator slabs 
+ */
+
+extern cfs_mem_cache_t *cfs_page_t_slab;
+extern cfs_mem_cache_t *cfs_page_p_slab;
+
+
+#define CFS_DECL_MMSPACE
+#define CFS_MMSPACE_OPEN    do {} while(0)
+#define CFS_MMSPACE_CLOSE   do {} while(0)
+
+
+#define mb()    do {} while(0)
+#define rmb()   mb()
+#define wmb()   mb()
+
+
+/* __KERNEL__ */
+#endif
+
+#endif /* __WINNT_CFS_MEM_H__ */
diff --git a/lnet/include/libcfs/winnt/winnt-prim.h b/lnet/include/libcfs/winnt/winnt-prim.h
new file mode 100644 (file)
index 0000000..e22fe02
--- /dev/null
@@ -0,0 +1,1108 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_CFS_PRIM_H__
+#define __LIBCFS_WINNT_CFS_PRIM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+
+/*
+ * libcfs proc device object
+ */
+
+
+#define LUSTRE_PROC_DEVICE  L"\\Device\\lproc"      /* proc fs emulator device object */
+#define LUSTRE_PROC_SYMLNK  L"\\DosDevices\\lproc"  /* proc fs user-visible device */
+
+
+/*
+ * Device IO Control Code Definitions
+ */
+
+/* Device type for libcfs ioctl codes ('LC'); the original defined this
+ * macro twice back to back -- keep a single definition. */
+#define FILE_DEVICE_LIBCFS      ('LC')
+
+#define FUNC_LIBCFS_VERSION     0x101  // get version of current libcfs
+#define FUNC_LIBCFS_IOCTL       0x102  // Device i/o control to proc fs
+
+
+#define IOCTL_LIBCFS_VERSION \
+     CTL_CODE (FILE_DEVICE_LIBCFS, FUNC_LIBCFS_VERSION, METHOD_BUFFERED, FILE_ANY_ACCESS)
+#define IOCTL_LIBCFS_ENTRY   \
+     CTL_CODE(FILE_DEVICE_LIBCFS, FUNC_LIBCFS_IOCTL,   METHOD_BUFFERED, FILE_ANY_ACCESS)
+
+#pragma pack(4)
+
+typedef struct _CFS_PROC_IOCTL {
+
+    ULONG           cmd;    // ioctl command identifier
+    ULONG           len;    // length of data
+
+    // UCHAR        data[]; // content of the real ioctl
+
+} CFS_PROC_IOCTL, *PCFS_PROC_IOCTL;
+
+#pragma pack()
+
+#ifdef __KERNEL__
+
+#include <libcfs/list.h>
+
+/*
+ * Symbol functions for libcfs
+ *
+ * The Windows NT kernel has no facility for a module to register
+ * symbols for other modules to look up, so we implement one here.
+ */
+#define CFS_SYMBOL_LEN     64
+
+struct  cfs_symbol {
+       char    name[CFS_SYMBOL_LEN];
+       void    *value;
+       int     ref;
+       struct  list_head sym_list;
+};
+
+extern int      cfs_symbol_register(const char *, const void *);
+extern void     cfs_symbol_unregister(const char *);
+extern void *   cfs_symbol_get(const char *);
+extern void     cfs_symbol_put(const char *);
+/* (void): the original "()" left the argument list unspecified in C */
+extern void     cfs_symbol_clean(void);
+
+
+
+typedef struct file_operations cfs_file_operations_t;
+typedef struct file cfs_file_t;
+
+/*
+ * Pseudo device register
+ */
+
+typedef struct
+{
+    int                     minor;
+    const char *            name;
+    cfs_file_operations_t * fops;
+} cfs_psdev_t;
+
+int cfs_psdev_register(cfs_psdev_t * psdev);
+int cfs_psdev_deregister(cfs_psdev_t * psdev);
+
+
+/*
+ * Proc emulator file system APIs
+ */
+
+typedef int cfs_read_proc_t(char *page, char **start, off_t off,
+                         int count, int *eof, void *data);
+typedef int cfs_write_proc_t(struct file *file, const char *buffer,
+                          ulong_ptr count, void *data);
+
+#define CFS_PROC_ENTRY_MAGIC 'CPEM'
+
+#define CFS_PROC_FLAG_DIRECTORY    0x00000001 // directory node
+#define CFS_PROC_FLAG_ATTACHED     0x00000002 // node is attached to proc
+#define CFS_PROC_FLAG_MISCDEV      0x00000004 // miscellaneous device
+
+/* In-memory node of the emulated /proc tree: either a directory
+ * (splay-tree root of children) or a leaf with read/write callbacks.
+ * NOTE(review): the tagged-but-anonymous member structs below rely on
+ * a Microsoft C extension -- confirm that is intended. */
+typedef struct cfs_proc_entry
+{
+    ULONG                   magic;      // Magic
+    ULONG                   flags;      // Flags
+
+    struct _dir_entry {                 // proc directory entry
+        PRTL_SPLAY_LINKS    root;
+    };
+
+    struct _file_entry {                // proc file / leaf entry
+        cfs_read_proc_t  *  read_proc;
+        cfs_write_proc_t *  write_proc;
+    };
+
+    mode_t                  mode;
+    unsigned short          nlink;
+
+    struct file_operations * proc_fops;
+    void *                   data;
+
+    // proc_dir_entry ended.
+
+    RTL_SPLAY_LINKS         s_link;       // splay link
+
+    //
+    // Maximum length of proc entry name is 0x20
+    //
+
+    char                    name[0x20];
+
+} cfs_proc_entry_t;
+
+/* Alias kept as a single typedef: the original declared
+ * cfs_proc_dir_entry_t both in the struct declarator list above and in
+ * this typedef, a typedef redefinition. */
+typedef cfs_proc_entry_t cfs_proc_dir_entry_t;
+
+#define PROC_BLOCK_SIZE    PAGE_SIZE
+
+/*
+ * Sysctl register
+ */
+
+typedef struct ctl_table                   cfs_sysctl_table_t;
+typedef struct ctl_table_header                cfs_sysctl_table_header_t;
+
+
+typedef int ctl_handler (
+            cfs_sysctl_table_t *table,
+            int *name,    int nlen,
+                       void *oldval, size_t *oldlenp,
+                       void *newval, size_t newlen, 
+                       void **context );
+
+typedef int proc_handler (
+            cfs_sysctl_table_t *ctl,
+            int write, struct file * filp,
+                       void *buffer, size_t *lenp );
+
+
+int proc_dointvec(cfs_sysctl_table_t *table, int write, struct file *filp,
+                    void *buffer, size_t *lenp);
+
+int proc_dostring(cfs_sysctl_table_t *table, int write, struct file *filp,
+                 void *buffer, size_t *lenp);
+
+int sysctl_string(cfs_sysctl_table_t *table, int *name, int nlen,
+                 void *oldval, size_t *oldlenp,
+                 void *newval, size_t newlen, void **context);
+
+
+/*
+ *  System io control definitions
+ */
+
+#define CTL_MAXNAME 10
+
+#define CTL_ANY     -1  /* Matches any name */
+#define CTL_NONE    0
+
+enum
+{
+    CTL_KERN=1,     /* General kernel info and control */
+    CTL_VM=2,       /* VM management */
+    CTL_NET=3,      /* Networking */
+    CTL_PROC=4,     /* Process info */
+    CTL_FS=5,       /* Filesystems */
+    CTL_DEBUG=6,        /* Debugging */
+    CTL_DEV=7,      /* Devices */
+    CTL_BUS=8,      /* Busses */
+    CTL_ABI=9,      /* Binary emulation */
+    CTL_CPU=10      /* CPU stuff (speed scaling, etc) */
+};
+
+/* sysctl table definitions (modeled on the Linux ctl_table) */
+struct ctl_table 
+{
+       int ctl_name;
+       char *procname;
+       void *data;
+       int maxlen;
+       mode_t mode;
+       cfs_sysctl_table_t *child;
+       proc_handler *proc_handler;     /* text formatting callback */
+       ctl_handler *strategy;          /* read / write callback functions */
+       cfs_proc_entry_t *de;   /* proc entry block */
+       void *extra1;
+       void *extra2;
+};
+
+
+/* the maintainer of the cfs_sysctl_table trees */
+struct ctl_table_header
+{
+       cfs_sysctl_table_t *    ctl_table;
+       struct list_head        ctl_entry;
+};
+
+
+cfs_proc_entry_t * create_proc_entry(char *name, mode_t mod,
+                                         cfs_proc_entry_t *parent);
+void proc_free_entry(cfs_proc_entry_t *de);
+void remove_proc_entry(char *name, cfs_proc_entry_t *entry);
+cfs_proc_entry_t * search_proc_entry(char * name,
+                        cfs_proc_entry_t *  root );
+
+#define cfs_create_proc_entry create_proc_entry
+#define cfs_free_proc_entry   proc_free_entry
+#define cfs_remove_proc_entry remove_proc_entry
+
+#define register_cfs_sysctl_table(t, a)        register_sysctl_table(t, a)
+/* the original expansion passed a stray 'a' that is not a macro
+ * parameter and would fail to compile at every use site */
+#define unregister_cfs_sysctl_table(t) unregister_sysctl_table(t)
+
+
+/*
+ *  declaration of proc kernel process routines
+ */
+
+cfs_file_t *
+lustre_open_file(char * filename);
+
+int
+lustre_close_file(cfs_file_t * fh);
+
+int
+lustre_do_ioctl( cfs_file_t * fh,
+                 unsigned long cmd,
+                 ulong_ptr arg );
+
+int
+lustre_ioctl_file( cfs_file_t * fh,
+                   PCFS_PROC_IOCTL devctl);
+
+size_t
+lustre_read_file( cfs_file_t *    fh,
+                  loff_t          off,
+                  size_t          size,
+                  char *          buf
+                  );
+
+size_t
+lustre_write_file( cfs_file_t *    fh,
+                   loff_t          off,
+                   size_t          size,
+                   char *          buf
+                   );
+
+/*
+ * Wait Queue
+ */
+
+
+typedef int cfs_task_state_t;
+
+#define CFS_TASK_INTERRUPTIBLE 0x00000001
+#define CFS_TASK_UNINT         0x00000002
+
+
+
+#define CFS_WAITQ_MAGIC     'CWQM'
+#define CFS_WAITLINK_MAGIC  'CWLM'
+
+/* Wait queue: a spinlock-guarded list of waiters. */
+typedef struct cfs_waitq {
+
+    unsigned int        magic;
+    unsigned int        flags;
+    
+    spinlock_t          guard;
+    struct list_head    waiters;
+
+} cfs_waitq_t;
+
+
+typedef struct cfs_waitlink cfs_waitlink_t;
+
+/* A waitlink can be queued on up to two waitqs at once; the second
+ * channel is presumably for cfs_waitq_forward() -- see below. */
+#define CFS_WAITQ_CHANNELS     (2)
+
+#define CFS_WAITQ_CHAN_NORMAL  (0)
+#define CFS_WAITQ_CHAN_FORWARD (1)
+
+
+
+/* Per-queue linkage record tying one waiter to one waitq. */
+typedef struct cfs_waitlink_channel {
+    struct list_head        link;
+    cfs_waitq_t *           waitq;
+    cfs_waitlink_t *        waitl;
+} cfs_waitlink_channel_t;
+
+/* One waiter: an event to block on, a hit counter, and one channel
+ * slot per supported queue. */
+struct cfs_waitlink {
+
+    unsigned int            magic;
+    int                     flags;
+    event_t  *              event;
+    atomic_t *              hits;
+
+    cfs_waitlink_channel_t  waitq[CFS_WAITQ_CHANNELS];
+};
+
+enum {
+       CFS_WAITQ_EXCLUSIVE = 1
+};
+
+#define CFS_DECL_WAITQ(name) cfs_waitq_t name
+
+
+void cfs_waitq_init(struct cfs_waitq *waitq);
+void cfs_waitlink_init(struct cfs_waitlink *link);
+
+void cfs_waitq_add(struct cfs_waitq *waitq, struct cfs_waitlink *link);
+void cfs_waitq_add_exclusive(struct cfs_waitq *waitq, 
+                            struct cfs_waitlink *link);
+void cfs_waitq_forward(struct cfs_waitlink *link, struct cfs_waitq *waitq);
+void cfs_waitq_del(struct cfs_waitq *waitq, struct cfs_waitlink *link);
+int  cfs_waitq_active(struct cfs_waitq *waitq);
+
+void cfs_waitq_signal(struct cfs_waitq *waitq);
+void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr);
+void cfs_waitq_broadcast(struct cfs_waitq *waitq);
+
+void cfs_waitq_wait(struct cfs_waitlink *link, cfs_task_state_t state);
+cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link, 
+                                  cfs_task_state_t state, cfs_duration_t timeout);
+
+
+
+/* Kernel thread */
+
+typedef int (*cfs_thread_t) (void *arg);
+
+typedef struct _cfs_thread_context {
+    cfs_thread_t        func;
+    void *              arg;
+} cfs_thread_context_t;
+
+int cfs_kernel_thread(int (*func)(void *), void *arg, int flag);
+
+/*
+ * thread creation flags from Linux, not used in winnt
+ */
+#define CSIGNAL         0x000000ff      /* signal mask to be sent at exit */
+#define CLONE_VM        0x00000100      /* set if VM shared between processes */
+#define CLONE_FS        0x00000200      /* set if fs info shared between processes */
+#define CLONE_FILES     0x00000400      /* set if open files shared between processes */
+#define CLONE_SIGHAND   0x00000800      /* set if signal handlers and blocked signals shared */
+#define CLONE_PID       0x00001000      /* set if pid shared */
+#define CLONE_PTRACE    0x00002000      /* set if we want to let tracing continue on the child too */
+#define CLONE_VFORK     0x00004000      /* set if the parent wants the child to wake it up on mm_release */
+#define CLONE_PARENT    0x00008000      /* set if we want to have the same parent as the cloner */
+#define CLONE_THREAD    0x00010000      /* Same thread group? */
+#define CLONE_NEWNS     0x00020000      /* New namespace group? */
+
+#define CLONE_SIGNAL    (CLONE_SIGHAND | CLONE_THREAD)
+
+
+/*
+ * sigset ...
+ */
+
+typedef sigset_t cfs_sigset_t;
+
+/*
+ * Task struct
+ */
+
+#define MAX_SCHEDULE_TIMEOUT    ((long_ptr)(~0UL>>12))
+
+
+#define NGROUPS 1
+#define CFS_CURPROC_COMM_MAX (16)
+
+/* Minimal emulation of the Linux task_struct: just enough identity,
+ * credential and bookkeeping state for the libcfs compatibility layer.
+ * (The original tag was misspelled "task_sruct"; only the cfs_task_t
+ * typedef is used elsewhere, so the tag is corrected here.) */
+typedef struct task_struct {
+    mode_t umask;                       /* file-creation mask */
+
+    pid_t pid;                          /* process id */
+    pid_t pgrp;                         /* process group */
+
+    uid_t uid, euid, suid, fsuid;       /* user ids */
+    gid_t gid, egid, sgid, fsgid;       /* group ids */
+
+    int ngroups;                        /* valid entries in groups[] */
+    gid_t groups[NGROUPS];
+    cfs_kernel_cap_t cap_effective,
+                     cap_inheritable,
+                     cap_permitted;
+
+    char comm[CFS_CURPROC_COMM_MAX];    /* command name */
+    void *journal_info;
+} cfs_task_t;
+
+
+/*
+ *  linux task struct emulator ...
+ */
+
+#define TASKMAN_MAGIC  'TMAN'   /* Task Manager */
+#define TASKSLT_MAGIC  'TSLT'   /* Task Slot */
+
+/* Global registry of emulated Linux tasks, one slot per NT thread. */
+typedef struct _TASK_MAN {
+
+    ULONG       Magic;      /* Magic and Flags */
+    ULONG       Flags;
+
+    spinlock_t  Lock;       /* Protection lock */
+
+    cfs_mem_cache_t * slab; /* Memory slab for task slot */
+
+    ULONG       NumOfTasks; /* Total tasks (threads) */
+    LIST_ENTRY  TaskList;   /* List of task slots */
+
+} TASK_MAN, *PTASK_MAN;
+
+/* Per-thread slot: binds an NT thread to its emulated cfs_task_t. */
+typedef struct _TASK_SLOT {
+
+    ULONG       Magic;      /* Magic and Flags */
+    ULONG       Flags;
+
+    LIST_ENTRY  Link;       /* To be linked to TaskMan */
+
+    event_t     Event;      /* Schedule event */
+
+    HANDLE      Pid;        /* Process id */
+    HANDLE      Tid;        /* Thread id */
+    PETHREAD    Tet;        /* Pointer to ethread */
+
+    atomic_t    count;      /* refer count */
+    atomic_t    hits;       /* times the wakeup event was signaled */
+
+    KIRQL       irql;       /* irql for rwlock ... */
+
+    cfs_task_t  task;       /* linux task part */
+
+} TASK_SLOT, *PTASK_SLOT;
+
+
+#define current                 cfs_current()
+#define set_current_state(s)   do {;} while (0)
+#define reparent_to_init()         do {;} while (0)
+
+#define wait_event(wq, condition)                           \
+do {                                                        \
+    cfs_waitlink_t __wait;                                     \
+                                                            \
+    cfs_waitlink_init(&__wait);                                    \
+       while (TRUE) {                                          \
+               cfs_waitq_add(&wq, &__wait);                        \
+               if (condition)  {                                           \
+                       break;                                                  \
+        }                                                   \
+               cfs_waitq_wait(&__wait, CFS_TASK_INTERRUPTIBLE);        \
+               cfs_waitq_del(&wq, &__wait);                        \
+       }                                                                           \
+       cfs_waitq_del(&wq, &__wait);                                \
+} while(0)
+
+/*
+ * wait_event_interruptible - like wait_event(), but also yields a result
+ * in @__ret.  Signal delivery is stubbed out on winnt (cfs_signal_pending
+ * is hardwired to 0 below), so @__ret is always set to 0 here; the
+ * three-argument form is kept only for Linux source compatibility.
+ */
+#define wait_event_interruptible(wq, condition, __ret)      \
+do {                                                        \
+    cfs_waitlink_t __wait;                                     \
+                                                            \
+    __ret = 0;                                              \
+    cfs_waitlink_init(&__wait);                                    \
+       while (TRUE) {                                          \
+               cfs_waitq_add(&wq, &__wait);                        \
+               if (condition)  {                                           \
+                       break;                                                  \
+        }                                                   \
+               cfs_waitq_wait(&__wait, CFS_TASK_INTERRUPTIBLE);    \
+               cfs_waitq_del(&wq, &__wait);                        \
+       }                                                                           \
+       cfs_waitq_del(&wq, &__wait);                                \
+} while(0)
+
+
+int     init_task_manager();
+void    cleanup_task_manager();
+cfs_task_t * cfs_current();
+int     schedule_timeout(int64_t time);
+int     schedule();
+int     wake_up_process(cfs_task_t * task);
+#define cfs_schedule_timeout(state, time)  schedule_timeout(time)
+void sleep_on(cfs_waitq_t *waitq);
+
+#define CFS_DECL_JOURNAL_DATA  
+#define CFS_PUSH_JOURNAL           do {;} while(0)
+#define CFS_POP_JOURNAL                    do {;} while(0)
+
+
+/* module related definitions */
+
+#ifndef __exit
+#define __exit
+#endif
+#ifndef __init
+#define __init
+#endif
+
+#define request_module(x) (0)
+
+#define EXPORT_SYMBOL(s)
+#define MODULE_AUTHOR(s)
+#define MODULE_DESCRIPTION(s)
+#define MODULE_LICENSE(s)
+#define MODULE_PARM(a, b)
+#define MODULE_PARM_DESC(a, b)
+
+#define module_init(X) int  __init module_##X() {return X();}
+#define module_exit(X) void __exit module_##X() {X();}
+
+#define DECLARE_INIT(X) extern int  __init  module_##X(void)
+#define DECLARE_EXIT(X) extern void __exit  module_##X(void)
+
+#define MODULE_INIT(X) do { int rc = module_##X(); \
+                            if (rc) goto errorout; \
+                          } while(0)
+
+#define MODULE_EXIT(X) do { module_##X(); } while(0)
+
+
+/* Module interfaces */
+#define cfs_module(name, version, init, fini) \
+module_init(init);                            \
+module_exit(fini)
+
+
+/*
+ *  Linux kernel version definition
+ */
+
+/*
+ * Encode (major, minor, sublevel) exactly like the Linux kernel's
+ * <linux/version.h> does, so version comparisons stay correctly ordered
+ * even when a component reaches 10 or more.  The previous decimal
+ * encoding ((a)*100+(b)*10+c) broke ordering, e.g. it made
+ * KERNEL_VERSION(2,6,12) compare greater than KERNEL_VERSION(2,7,0).
+ */
+#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
+#define LINUX_VERSION_CODE KERNEL_VERSION(2, 6, 7)
+
+
+/*
+ * Signal
+ */
+
+#define cfs_sigmask_lock(t, f)         do { f = 0; } while(0)
+#define cfs_sigmask_unlock(t, f)       do { f = 0; } while(0)
+#define cfs_signal_pending(t)          (0)
+
+#define cfs_recalc_sigpending(t)    do { } while(0)
+
+#define cfs_siginitset(m, s)        do { } while(0)
+#define cfs_sigfillset(s)           do { } while(0)
+#define cfs_siginitsetinv(m,f)      do { } while(0)
+
+#define cfs_set_sig_blocked(t, b)   do { } while(0)
+#define cfs_get_sig_blocked(t)      (0)
+
+#define SIGNAL_MASK_ASSERT()
+
+cfs_sigset_t cfs_get_blocked_sigs(cfs_task_t *t);
+void cfs_block_allsigs(cfs_task_t *t);
+void cfs_block_sigs(cfs_task_t *t, sigset_t bit);
+
+/*
+ * Clear all pending signals.
+ */
+#define cfs_clear_sigpending(ut)    do {} while (0)
+
+
+/*
+ * Timer
+ */
+
+#define CFS_TIMER_FLAG_INITED   0x00000001  // Initialized already
+#define CFS_TIMER_FLAG_TIMERED  0x00000002  // KeSetTimer is called
+
+/*
+ * cfs_timer_t - portable one-shot timer wrapper around an NT KTIMER
+ * with a KDPC callback.  proc/arg hold the user callback and its
+ * argument; deadline presumably caches the expiry passed to
+ * cfs_timer_arm() for cfs_timer_deadline() -- confirm against the
+ * implementation.  Flags tracks the CFS_TIMER_FLAG_* bits above.
+ */
+typedef struct cfs_timer {
+
+    KSPIN_LOCK      Lock;
+
+    ULONG           Flags;
+
+    KDPC            Dpc;
+    KTIMER          Timer;
+
+    cfs_time_t      deadline;
+
+    void (*proc)(ulong_ptr);
+    void *          arg;
+
+} cfs_timer_t;
+
+
+typedef  void (*timer_func_t)(ulong_ptr);
+
+#define cfs_init_timer(t)
+
+void cfs_timer_init(cfs_timer_t *timer, void (*func)(ulong_ptr), void *arg);
+void cfs_timer_done(cfs_timer_t *t);
+void cfs_timer_arm(cfs_timer_t *t, cfs_time_t deadline);
+void cfs_timer_disarm(cfs_timer_t *t);
+int  cfs_timer_is_armed(cfs_timer_t *t);
+cfs_time_t cfs_timer_deadline(cfs_timer_t *t);
+
+
+/* deschedule the current thread for @ticks.  Note that on winnt
+ * cfs_schedule_timeout() discards its state argument (see the macro
+ * above), so TASK_UNINTERRUPTIBLE here is documentation only. */
+static inline void cfs_pause(cfs_duration_t ticks)
+{
+    cfs_schedule_timeout(TASK_UNINTERRUPTIBLE, ticks);
+}
+
+
+/* trap into the attached kernel debugger: inline "int 3" on x86,
+ * KdBreakPoint() on other architectures */
+static inline void cfs_enter_debugger(void)
+{
+#if _X86_
+    __asm int 3;
+#else
+    KdBreakPoint();
+#endif
+}
+
+/*
+ *  libcfs globals initialization/cleanup
+ */
+
+int
+libcfs_arch_init(void);
+
+void
+libcfs_arch_cleanup(void);
+
+/*
+ * SMP ...
+ */
+
+#define SMP_CACHE_BYTES             128
+#define __cacheline_aligned
+#define NR_CPUS                                            (2)
+#define smp_processor_id()                 KeGetCurrentProcessorNumber()
+#define smp_num_cpus                NR_CPUS
+#define num_online_cpus() smp_num_cpus
+#define smp_call_function(f, a, n, w)          do {} while(0)
+
+/*
+ *  Irp related
+ */
+
+#define NR_IRQS                                    512
+#define in_interrupt()                 (0)
+
+/*
+ *  printk flags
+ */
+
+#define KERN_EMERG      "<0>"   /* system is unusable                   */
+#define KERN_ALERT      "<1>"   /* action must be taken immediately     */
+#define KERN_CRIT       "<2>"   /* critical conditions                  */
+#define KERN_ERR        "<3>"   /* error conditions                     */
+#define KERN_WARNING    "<4>"   /* warning conditions                   */
+#define KERN_NOTICE     "<5>"   /* normal but significant condition     */
+#define KERN_INFO       "<6>"   /* informational                        */
+#define KERN_DEBUG      "<7>"   /* debug-level messages                 */
+
+/*
+ * Misc
+ */
+
+
+#define inter_module_get(n)                    cfs_symbol_get(n)
+#define inter_module_put(n)                    cfs_symbol_put(n)
+
+#ifndef likely
+#define likely(exp) (exp)
+#endif
+#ifndef unlikely
+#define unlikely(exp) (exp)
+#endif
+
+#define lock_kernel()               do {} while(0)
+#define unlock_kernel()             do {} while(0)
+
+#define exit_mm(t)                  do {} while(0)
+#define exit_files(t)               do {} while(0)
+
+#define CAP_SYS_ADMIN                    0
+#define CAP_SYS_ROOT                     1
+
+#define capable(a)                             (TRUE)
+
+#define USERMODEHELPER(path, argv, envp)       (0)
+
+
+#define local_irq_save(x)
+#define local_irq_restore(x)
+
+#define cfs_assert                      ASSERT
+
+#define THREAD_NAME
+
+#else   /* !__KERNEL__ */
+
+#define PAGE_CACHE_SIZE PAGE_SIZE
+#define PAGE_CACHE_MASK PAGE_MASK
+
+#define getpagesize()   (PAGE_SIZE)
+
+
+typedef struct {
+    int foo;
+} pthread_mutex_t;
+
+typedef struct {
+    int foo;
+} pthread_cond_t;
+
+#define pthread_mutex_init(x, y)    do {} while(0)
+#define pthread_cond_init(x, y)     do {} while(0)
+
+#define pthread_mutex_lock(x)       do {} while(0)
+#define pthread_mutex_unlock(x)     do {} while(0)
+
+#define pthread_cond_wait(x,y)      do {} while(0)
+#define pthread_cond_broadcast(x)   do {} while(0)
+
+typedef struct file {
+    int foo;
+} cfs_file_t;
+
+typedef struct cfs_proc_dir_entry{
+       void            *data;
+}cfs_proc_dir_entry_t;
+
+
+
+#include "../user-prim.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+/* Map POSIX names onto their MS CRT equivalents.  strcasecmp and
+ * strncasecmp must be case-INsensitive: use _stricmp/_strnicmp.
+ * (Mapping them to strcmp/strncmp silently changed comparison
+ * semantics for any mixed-case input.) */
+#define strcasecmp  _stricmp
+#define strncasecmp _strnicmp
+#define snprintf   _snprintf
+#define getpid()   (0)
+
+
+#define getpwuid(x) (NULL)
+#define getgrgid(x) (NULL)
+
+int gethostname(char * name, int namelen);
+
+#define setlinebuf(x) do {} while(0)
+
+
+NTSYSAPI VOID NTAPI DebugBreak();
+
+
+/* trap into an attached user-mode debugger: inline "int 3" on x86,
+ * DebugBreak() on other architectures */
+static inline void cfs_enter_debugger(void)
+{
+#if _X86_
+    __asm int 3;
+#else
+    DebugBreak();
+#endif
+}
+
+/* Maximum EA Information Length */
+#define EA_MAX_LENGTH  (sizeof(FILE_FULL_EA_INFORMATION) + 15)
+
+
+/*
+ *  proc user mode routines
+ */
+
+HANDLE cfs_proc_open (char * filename, int oflag);
+int cfs_proc_close(HANDLE handle);
+int cfs_proc_read(HANDLE handle, void *buffer, unsigned int count);
+int cfs_proc_write(HANDLE handle, void *buffer, unsigned int count);
+int cfs_proc_ioctl(HANDLE handle, int cmd, void *buffer);
+
+
+/*
+ * Native API definitions
+ */
+
+//
+//  Disk I/O Routines
+//
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtReadFile(HANDLE FileHandle,
+    HANDLE Event OPTIONAL,
+    PIO_APC_ROUTINE ApcRoutine OPTIONAL,
+    PVOID ApcContext OPTIONAL,
+    PIO_STATUS_BLOCK IoStatusBlock,
+    PVOID Buffer,
+    ULONG Length,
+    PLARGE_INTEGER ByteOffset OPTIONAL,
+    PULONG Key OPTIONAL);
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtWriteFile(HANDLE FileHandle,
+    HANDLE Event OPTIONAL,
+    PIO_APC_ROUTINE ApcRoutine OPTIONAL,
+    PVOID ApcContext OPTIONAL,
+    PIO_STATUS_BLOCK IoStatusBlock,
+    PVOID Buffer,
+    ULONG Length,
+    PLARGE_INTEGER ByteOffset OPTIONAL,
+    PULONG Key OPTIONAL);
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtClose(HANDLE Handle);
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtCreateFile(PHANDLE FileHandle,
+    ACCESS_MASK DesiredAccess,
+    POBJECT_ATTRIBUTES ObjectAttributes,
+    PIO_STATUS_BLOCK IoStatusBlock,
+    PLARGE_INTEGER AllocationSize OPTIONAL,
+    ULONG FileAttributes,
+    ULONG ShareAccess,
+    ULONG CreateDisposition,
+    ULONG CreateOptions,
+    PVOID EaBuffer OPTIONAL,
+    ULONG EaLength);
+
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtDeviceIoControlFile(
+    IN HANDLE  FileHandle,
+    IN HANDLE  Event,
+    IN PIO_APC_ROUTINE  ApcRoutine,
+    IN PVOID  ApcContext,
+    OUT PIO_STATUS_BLOCK  IoStatusBlock,
+    IN ULONG  IoControlCode,
+    IN PVOID  InputBuffer,
+    IN ULONG  InputBufferLength,
+    OUT PVOID  OutputBuffer,
+    OUT ULONG  OutputBufferLength
+    ); 
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtFsControlFile(
+    IN HANDLE FileHandle,
+    IN HANDLE Event OPTIONAL,
+    IN PIO_APC_ROUTINE ApcRoutine OPTIONAL,
+    IN PVOID ApcContext OPTIONAL,
+    OUT PIO_STATUS_BLOCK IoStatusBlock,
+    IN ULONG FsControlCode,
+    IN PVOID InputBuffer OPTIONAL,
+    IN ULONG InputBufferLength,
+    OUT PVOID OutputBuffer OPTIONAL,
+    IN ULONG OutputBufferLength
+);
+
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtQueryInformationFile(
+    IN HANDLE  FileHandle,
+    OUT PIO_STATUS_BLOCK  IoStatusBlock,
+    OUT PVOID  FileInformation,
+    IN ULONG  Length,
+    IN FILE_INFORMATION_CLASS  FileInformationClass
+    );
+
+//
+// Random routines ...
+//
+
+NTSYSAPI
+ULONG
+NTAPI
+RtlRandom(
+    IN OUT PULONG  Seed
+    ); 
+
+#endif /* __KERNEL__ */
+
+
+//
+// Inode flags (Linux uses octad number, but why ? strange!!!)
+//
+
+#undef S_IFMT
+#undef S_IFDIR
+#undef S_IFCHR
+#undef S_IFREG
+#undef S_IREAD
+#undef S_IWRITE
+#undef S_IEXEC
+
+#define S_IFMT   0x0F000            /* 017 0000 */
+#define S_IFSOCK 0x0C000            /* 014 0000 */
+#define S_IFLNK  0x0A000            /* 012 0000 */
+#define S_IFREG  0x08000            /* 010 0000 */
+#define S_IFBLK  0x06000            /* 006 0000 */
+#define S_IFDIR  0x04000            /* 004 0000 */
+#define S_IFCHR  0x02000            /* 002 0000 */
+#define S_IFIFO  0x01000            /* 001 0000 */
+#define S_ISUID  0x00800            /* 000 4000 */
+#define S_ISGID  0x00400            /* 000 2000 */
+#define S_ISVTX  0x00200            /* 000 1000 */
+
+#define S_ISREG(m)      (((m) & S_IFMT) == S_IFREG)
+#define S_ISSOCK(m)     (((m) & S_IFMT) == S_IFSOCK)
+#define S_ISLNK(m)      (((m) & S_IFMT) == S_IFLNK)
+#define S_ISFIL(m)      (((m) & S_IFMT) == S_IFFIL)
+#define S_ISBLK(m)      (((m) & S_IFMT) == S_IFBLK)
+#define S_ISDIR(m)      (((m) & S_IFMT) == S_IFDIR)
+#define S_ISCHR(m)      (((m) & S_IFMT) == S_IFCHR)
+#define S_ISFIFO(m)     (((m) & S_IFMT) == S_IFIFO)
+
+#define S_IPERMISSION_MASK 0x1FF /*  */
+
+#define S_IRWXU  0x1C0              /* 0 0700 */
+#define S_IRUSR  0x100              /* 0 0400 */
+#define S_IWUSR  0x080              /* 0 0200 */
+#define S_IXUSR  0x040              /* 0 0100 */
+
+#define S_IRWXG  0x038              /* 0 0070 */
+#define S_IRGRP  0x020              /* 0 0040 */
+#define S_IWGRP  0x010              /* 0 0020 */
+#define S_IXGRP  0x008              /* 0 0010 */
+
+#define S_IRWXO  0x007              /* 0 0007 */
+#define S_IROTH  0x004              /* 0 0004 */
+#define S_IWOTH  0x002              /* 0 0002 */
+#define S_IXOTH  0x001              /* 0 0001 */
+
+#define S_IRWXUGO   (S_IRWXU|S_IRWXG|S_IRWXO)
+#define S_IALLUGO   (S_ISUID|S_ISGID|S_ISVTX|S_IRWXUGO)
+#define S_IRUGO     (S_IRUSR|S_IRGRP|S_IROTH)
+#define S_IWUGO     (S_IWUSR|S_IWGRP|S_IWOTH)
+#define S_IXUGO     (S_IXUSR|S_IXGRP|S_IXOTH)
+
+/*
+ *  linux ioctl coding definitions
+ */
+#define _IOC_NRBITS 8
+#define _IOC_TYPEBITS   8
+#define _IOC_SIZEBITS   14
+#define _IOC_DIRBITS    2
+
+#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK   ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK   ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK    ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT    0
+#define _IOC_TYPESHIFT  (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT  (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT   (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+/*
+ * Direction bits.
+ */
+#define _IOC_NONE   0U
+#define _IOC_WRITE  1U
+#define _IOC_READ   2U
+
+#define _IOC(dir,type,nr,size) \
+    (((dir)  << _IOC_DIRSHIFT) | \
+     ((type) << _IOC_TYPESHIFT) | \
+     ((nr)   << _IOC_NRSHIFT) | \
+     ((size) << _IOC_SIZESHIFT))
+
+/* used to create numbers */
+#define _IO(type,nr)      _IOC(_IOC_NONE,(type),(nr),0)
+#define _IOR(type,nr,size)    _IOC(_IOC_READ,(type),(nr),sizeof(size))
+#define _IOW(type,nr,size)    _IOC(_IOC_WRITE,(type),(nr),sizeof(size))
+#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
+
+/* used to decode ioctl numbers.. */
+#define _IOC_DIR(nr)        (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYPE(nr)       (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr)         (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr)       (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
+
+/*
+ * Io vector ...  
+ */
+
+struct iovec
+{
+    void *iov_base;
+    size_t iov_len;
+};
+
+
+#define ULONG_LONG_MAX ((__u64)(0xFFFFFFFFFFFFFFFF))
+/*
+ * Convert a string to an unsigned long long integer.
+ *
+ * Ignores `locale' stuff.  Assumes that the upper and lower case
+ * alphabets and digits are each contiguous.
+ */
+static inline __u64
+strtoull(
+       char *nptr,
+       char **endptr,
+       int base)
+{
+       char *s = nptr;
+       __u64 acc, cutoff;
+       int c, neg = 0, any, cutlim;
+
+       /*
+        * See strtol for comments as to the logic used.
+        * BSD-derived: unlike C99 strtoull, errno is NOT set on
+        * overflow; the result simply saturates (see below).
+        */
+       do {
+               c = *s++;
+       } while (isspace(c));
+       if (c == '-') {
+               neg = 1;
+               c = *s++;
+       } else if (c == '+')
+               c = *s++;
+       /* a leading "0x"/"0X" selects (or confirms) base 16 */
+       if ((base == 0 || base == 16) &&
+           c == '0' && (*s == 'x' || *s == 'X')) {
+               c = s[1];
+               s += 2;
+               base = 16;
+       }
+       /* base 0: infer octal from a leading '0', otherwise decimal */
+       if (base == 0)
+               base = c == '0' ? 8 : 10;
+       /* cutoff/cutlim let overflow be detected one digit ahead */
+       cutoff = (__u64)ULONG_LONG_MAX / (__u64)base;
+       cutlim = (int)((__u64)ULONG_LONG_MAX % (__u64)base);
+       for (acc = 0, any = 0;; c = *s++) {
+               if (isdigit(c))
+                       c -= '0';
+               else if (isalpha(c))
+                       c -= isupper(c) ? 'A' - 10 : 'a' - 10;
+               else
+                       break;
+               if (c >= base)
+                       break;
+               if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
+                       any = -1;
+               else {
+                       any = 1;
+                       acc *= base;
+                       acc += c;
+               }
+       }
+       /* on overflow (any < 0) saturate to ULONG_LONG_MAX */
+       if (any < 0) {
+               acc = ULONG_LONG_MAX;
+       } else if (neg)
+               acc = 0 - acc;
+       /* endptr: first unconsumed char, or nptr if no digits parsed */
+       if (endptr != 0)
+               *endptr = (char *) (any ? s - 1 : nptr);
+       return (acc);
+}
+
+#endif
diff --git a/lnet/include/libcfs/winnt/winnt-tcpip.h b/lnet/include/libcfs/winnt/winnt-tcpip.h
new file mode 100644 (file)
index 0000000..13f4008
--- /dev/null
@@ -0,0 +1,636 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under the
+ * terms of version 2 of the GNU General Public License as published by the
+ * Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass
+ * Ave, Cambridge, MA 02139, USA.
+ *
+ * Implementation of portable time API for Winnt (kernel and user-level).
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_TCPIP_H__
+#define __LIBCFS_WINNT_TCPIP_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+
+#ifdef __KERNEL__
+
+//
+//  ksocknal definitions
+//
+
+// iovec is defined in libcfs: winnt_prim.h 
+// lnetkiov_t is defined in lnet/types.h
+
+typedef struct socket ksock_tconn_t;
+
+// completion notification callback routine
+
+typedef VOID (*ksock_schedule_cb)(struct socket*, int, void *, ulong_ptr);
+
+/* completion routine to update tx structure for async sending */
+typedef PVOID (*ksock_update_tx)(struct socket*, PVOID tx, ulong_ptr);
+
+//
+// tdinal definitions
+//
+
+
+#if TDINAL_DBG
+#define KsPrint(X)     KsPrintf X
+#else
+#define KsPrint(X)
+#endif
+
+
+//
+// Socket Addresses Related ...
+//
+
+#define            INADDR_ANY              (ULONG)0x00000000
+#define     INADDR_LOOPBACK     (ULONG)0x7f000001
+#define            INADDR_BROADCAST    (ULONG)0xffffffff
+#define            INADDR_NONE             (ULONG)0xffffffff
+
+/*
+ *  TCP / IP options
+ */
+
+#define     SOL_TCP             6
+#define     SOL_UDP                    17
+
+
+#define TL_INSTANCE             0
+
+#define TCP_SOCKET_NODELAY      1 //  disabling "Nagle"
+#define TCP_SOCKET_KEEPALIVE    2
+#define TCP_SOCKET_OOBINLINE    3
+#define TCP_SOCKET_BSDURGENT    4
+#define TCP_SOCKET_ATMARK       5
+#define TCP_SOCKET_WINDOW       6
+
+
+/* Flags usable with send() and recv().
+   Those added for 1003.1g are not all supported yet.
+ */
+#define MSG_OOB            1
+#define MSG_PEEK        2
+#define MSG_DONTROUTE   4
+#define MSG_TRYHARD     4       /* Synonym for MSG_DONTROUTE for DECnet */
+#define MSG_CTRUNC      8
+#define MSG_PROBE       0x10   /* Do not send. Only probe path f.e. for MTU */
+#define MSG_TRUNC       0x20
+#define MSG_DONTWAIT    0x40   /* Nonblocking io                */
+#define MSG_EOR         0x80   /* End of record */
+#define MSG_WAITALL     0x100  /* Wait for a full request */
+#define MSG_FIN         0x200
+#define MSG_SYN                0x400
+#define MSG_CONFIRM     0x800  /* Confirm path validity */
+#define MSG_RST         0x1000
+#define MSG_ERRQUEUE    0x2000 /* Fetch message from error queue */
+#define MSG_NOSIGNAL    0x4000 /* Do not generate SIGPIPE */
+#define MSG_MORE        0x8000 /* Sender will send more */
+
+#define MSG_EOF         MSG_FIN
+
+
+//
+// Maximum TRANSPORT_ADDRESS Length
+//
+// it must >= FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address)
+//            + TDI_ADDRESS_LENGTH_IP
+//
+// I define it a little large and 16 bytes aligned to avoid possible overflow.
+//
+
+#define MAX_ADDRESS_LENGTH              (0x30)
+
+
+//
+// Maximum Listers Children Sockets
+//
+
+#define MAX_CHILD_LISTENERS             (4)
+
+//
+// Maximum EA Information Length
+//
+
+#define EA_MAX_LENGTH                   ( sizeof(FILE_FULL_EA_INFORMATION) - 1 + \
+                                          TDI_TRANSPORT_ADDRESS_LENGTH + 1 + \
+                                          MAX_ADDRESS_LENGTH )
+
+
+#define UDP_DEVICE_NAME L"\\Device\\Udp"
+#define TCP_DEVICE_NAME L"\\Device\\Tcp"
+
+
+/*
+ * TSDU definitions
+ */
+
+#define TDINAL_TSDU_DEFAULT_SIZE  (0x10000)
+
+#define KS_TSDU_MAGIC       'KSTD'
+
+#define KS_TSDU_ATTACHED    0x00000001  // Attached to the socket receive tsdu list
+
+typedef struct _KS_TSDU {
+
+    ULONG               Magic;
+    ULONG               Flags;
+
+    struct list_head    Link;
+
+    ULONG               TotalLength;    // Total size of KS_TSDU
+
+    ULONG               StartOffset;    // Start offset of the first Tsdu unit
+    ULONG               LastOffset;     // End offset of the last Tsdu unit
+
+/*
+    union {
+        KS_TSDU_DAT[];
+        KS_TSDU_BUF[];
+        KS_TSDU_MDL[];
+    }
+*/
+
+} KS_TSDU, *PKS_TSDU;
+
+#define TSDU_TYPE_BUF   ((USHORT)0x5401)
+#define TSDU_TYPE_DAT   ((USHORT)0x5402)
+#define TSDU_TYPE_MDL   ((USHORT)0x5403)
+
+#define KS_TSDU_BUF_RECEIVING       0x0001
+typedef struct _KS_TSDU_BUF {
+
+    USHORT              TsduType;
+    USHORT              TsduFlags;
+
+    ULONG               DataLength;
+    ULONG               StartOffset;
+
+    PVOID               UserBuffer;
+
+} KS_TSDU_BUF, *PKS_TSDU_BUF;
+
+#define KS_TSDU_DAT_RECEIVING       0x0001
+
+typedef struct _KS_TSDU_DAT {
+
+    USHORT              TsduType;
+    USHORT              TsduFlags;
+
+    ULONG               DataLength;
+    ULONG               StartOffset;
+
+    ULONG               TotalLength;
+
+    UCHAR               Data[1];
+
+} KS_TSDU_DAT, *PKS_TSDU_DAT;
+
+#define KS_DWORD_ALIGN(x)      (((x) + 0x03) & (~(0x03)))
+#define KS_TSDU_STRU_SIZE(Len) (KS_DWORD_ALIGN((Len) + FIELD_OFFSET(KS_TSDU_DAT, Data)))
+
+typedef struct _KS_TSDU_MDL {
+
+    USHORT              TsduType;
+    USHORT              TsduFlags;
+
+    ULONG               DataLength;
+    ULONG               StartOffset;    
+
+    PMDL                Mdl;
+    PVOID               Descriptor;
+
+} KS_TSDU_MDL, *PKS_TSDU_MDL;
+
+
+typedef struct _KS_TSDUMGR {
+
+    struct list_head    TsduList;
+    ULONG               NumOfTsdu;
+    ULONG               TotalBytes;
+    KEVENT              Event;
+
+} KS_TSDUMGR, *PKS_TSDUMGR;
+
+
+typedef struct _KS_CHAIN {
+
+    KS_TSDUMGR          Normal;
+    KS_TSDUMGR          Expedited;
+
+} KS_CHAIN, *PKS_CHAIN;
+
+
+#define TDINAL_SCHED_FACTOR (1)
+#define CAN_BE_SCHED(Len, Limit) (Len >= ((Limit) >> TDINAL_SCHED_FACTOR))
+
+//
+// Handler Settings Indictor 
+//
+
+#define TDI_EVENT_MAXIMUM_HANDLER (TDI_EVENT_ERROR_EX + 1)
+
+
+typedef struct _KS_EVENT_HANDLERS {
+    BOOLEAN     IsActive[TDI_EVENT_MAXIMUM_HANDLER];
+    PVOID       Handler [TDI_EVENT_MAXIMUM_HANDLER];
+} KS_EVENT_HANDLERS, *PKS_EVENT_HANDLERS;
+
+#define SetEventHandler(ha, ht, hr) do {        \
+            ha.IsActive[ht] = TRUE;             \
+            ha.Handler[ht] = (PVOID) (hr);      \
+        } while(0)
+
+//
+// KSock Internal Structures
+//
+
+typedef struct _KS_ADDRESS {
+
+    union {
+        TRANSPORT_ADDRESS   Tdi;
+        UCHAR               Pading[MAX_ADDRESS_LENGTH];
+    };
+
+    HANDLE                  Handle;
+    PFILE_OBJECT            FileObject;
+
+} KS_ADDRESS, *PKS_ADDRESS;
+
+//
+// Structures for Disconnect Workitem
+//
+
+typedef struct _KS_DISCONNECT_WORKITEM {
+
+    WORK_QUEUE_ITEM         WorkItem;       // Workitem to perform disconnection
+    ksock_tconn_t *         tconn;          // tdi connection
+    ULONG                   Flags;          // connection broken / disconnection flags
+    KEVENT                  Event;          // sync event
+
+} KS_DISCONNECT_WORKITEM, *PKS_DISCONNECT_WORKITEM;
+
+
+typedef struct _KS_CONNECTION {
+
+    HANDLE                      Handle;     // Handle of the tdi connection
+    PFILE_OBJECT                FileObject; // FileObject if the conn object
+
+    PTRANSPORT_ADDRESS          Remote;     // the ConnectionInfo of this connection
+    PTDI_CONNECTION_INFORMATION ConnectionInfo;
+
+    ULONG                       nagle;      // Tcp options 
+
+} KS_CONNECTION, *PKS_CONNECTION;
+
+
+//
+// type definitions
+//
+
+typedef MDL                         ksock_mdl_t;
+typedef UNICODE_STRING              ksock_unicode_name_t;
+typedef WORK_QUEUE_ITEM             ksock_workitem_t;
+
+
+typedef KS_CHAIN                    ksock_chain_t;
+typedef KS_ADDRESS                  ksock_tdi_addr_t;
+typedef KS_CONNECTION               ksock_tconn_info_t;
+typedef KS_DISCONNECT_WORKITEM      ksock_disconnect_workitem_t;
+
+
+//
+// Structures for transmission done Workitem
+//
+
+typedef struct _KS_TCPX_FINILIZE {
+    ksock_workitem_t        item;
+    void *                  tx;
+} ksock_tcpx_fini_t;
+
+
+typedef struct ksock_backlogs {
+
+        struct list_head    list;   /* list to link the backlog connections */
+        int                 num;    /* number of backlogs in the list */
+
+} ksock_backlogs_t;
+
+
+/* per-port listener daemon state: owns the listener tconn and its backlog */
+typedef struct ksock_daemon {
+
+    ksock_tconn_t *         tconn;         /* the listener connection object */
+    unsigned short          nbacklogs;     /* number of listening backlog conns */
+    unsigned short          port;          /* listening port number */ 
+    int                     shutdown;      /* set when the daemon thread is to exit */
+    struct list_head        list;          /* to be attached into ksock_nal_data_t*/
+
+} ksock_daemon_t ;
+
+
+/* role of a tconn: active sender, listener daemon, or accepted child */
+typedef enum {
+
+    kstt_sender = 0,    // normal sending connection type; this is the active
+                        // connection, while a child tconn is the passive one.
+
+    kstt_listener,      // listener daemon type: it just acts as a daemon and has
+                        // no real connection itself.  It manages child tconns to
+                        // accept or refuse connecting requests from remote peers.
+
+    kstt_child,         // accepted child connection type; its parent must be a listener
+    kstt_lasttype
+} ksock_tconn_type;
+
+/* lifecycle states of a tconn, roughly in the order they occur */
+typedef enum {
+
+    ksts_uninited = 0, // tconn is just allocated (zero values), not initialized yet
+
+    ksts_inited,        // tconn structure initialized: so it now can be identified as
+                        // a sender, listener or a child
+
+    ksts_bind,          // tconn is bound: the local address object (ip/port) is created.
+                        // after being bound, we must call ksocknal_put_tconn to release
+                        // the tconn objects; it is not safe just to free the memory of tconn.
+
+    ksts_associated,    // the connection object is created and associated with the address
+                        // object, so it is ready for connection.  only for child and sender.
+
+    ksts_connecting,    // only used by child tconn: in the ConnectEvent handler routine,
+                        // it indicates the child tconn is busy being connected to the peer.
+
+    ksts_connected,     // the connection is built already: for sender and child
+
+    ksts_listening,     // listener daemon is working, only for listener tconn
+
+    ksts_disconnected,  // disconnected by user
+    ksts_aborted,       // unexpected broken status
+
+    ksts_last           // total number of tconn statuses
+} ksock_tconn_state;
+
+#define KS_TCONN_MAGIC              'KSTM'
+
+#define KS_TCONN_HANDLERS_SET       0x00000001  // Connection handlers are set.
+#define KS_TCONN_DISCONNECT_BUSY    0x00010000  // Disconnect Workitem is queued ...
+#define KS_TCONN_DESTROY_BUSY       0x00020000  // Destroy Workitem is queued ...
+
+#define KS_TCONN_DAEMON_STARTED     0x00100000  // indicates the daemon is started,
+                                                // only valid for listener
+
+/*
+ * struct socket - TDI connection object ("tconn") emulating a Linux
+ * socket on winnt.  The anonymous union holds role-specific state
+ * selected by kstc_type: listener daemon, accepted child, or active
+ * sender (see ksock_tconn_type above).
+ */
+struct socket {
+
+        ulong_ptr                   kstc_magic;      /* Magic & Flags */
+        ulong_ptr                   kstc_flags;
+
+        spinlock_t                  kstc_lock;       /* serialise lock*/
+        void *                      kstc_conn;       /* ksock_conn_t */
+
+        ksock_tconn_type            kstc_type;          /* tdi connection Type */
+        ksock_tconn_state           kstc_state;      /* tdi connection state flag */
+
+        ksock_unicode_name_t        kstc_dev;        /* tcp transport device name */
+
+        ksock_tdi_addr_t            kstc_addr;       /* local address handlers / Objects */
+
+        atomic_t                    kstc_refcount;   /* reference count of ksock_tconn */
+
+        struct list_head            kstc_list;       /* linked to global ksocknal_data */
+
+        union {
+
+            struct {
+                int                 nbacklog;         /* total number of backlog tdi connections */
+                ksock_backlogs_t    kstc_listening;   /* listening backlog child connections */
+                ksock_backlogs_t    kstc_accepted;    /* connected backlog child connections */
+                event_t             kstc_accept_event;   /* Signaled by AcceptedHandler, 
+                                                            ksocknal_wait_accpeted_conns waits on */
+                event_t             kstc_destroy_event;  /* Signaled when accepted child is released */
+            } listener; 
+
+            struct  {
+                ksock_tconn_info_t  kstc_info;      /* Connection Info if Connected */
+                ksock_chain_t       kstc_recv;      /* tsdu engine for data receiving */
+                ksock_chain_t       kstc_send;      /* tsdu engine for data sending */
+
+                int                 kstc_queued;    /* Attached to Parent->ChildList ... */
+                int                 kstc_queueno;   /* 0: Attached to Listening list 
+                                                       1: Attached to Accepted list */
+
+                int                 kstc_busy;      /* referred by ConnectEventCallback ? */
+                int                 kstc_accepted;  /* the connection is built ready ? */
+
+                struct list_head    kstc_link;      /* linked to parent tdi connection */
+                ksock_tconn_t   *   kstc_parent;    /* pointer to its listener parent */
+            } child;
+
+            struct {
+                ksock_tconn_info_t  kstc_info;      /* Connection Info if Connected */
+                ksock_chain_t       kstc_recv;      /* tsdu engine for data receiving */
+                ksock_chain_t       kstc_send;      /* tsdu engine for data sending */
+            } sender; 
+        };
+
+        ulong_ptr                   kstc_snd_wnd;   /* Sending window size */
+        ulong_ptr                   kstc_rcv_wnd;   /* Receiving window size */
+
+        ksock_workitem_t            kstc_destroy;    /* tconn destruction workitem */
+        ksock_disconnect_workitem_t kstc_disconnect; /* connection disconnect workitem */
+
+        ksock_schedule_cb           kstc_sched_cb;   /* notification callback routine of completion */
+        ksock_update_tx             kstc_update_tx;  /* async sending callback to update tx */
+};
+
+#define TDINAL_WINDOW_DEFAULT_SIZE  (0x100000)
+
+
+struct _KS_UDP_COMPLETION_CONTEXT;
+struct _KS_TCP_COMPLETION_CONTEXT;
+
+
+typedef
+NTSTATUS
+(*PKS_UDP_COMPLETION_ROUTINE) (
+    IN PIRP     Irp,
+    IN struct _KS_UDP_COMPLETION_CONTEXT
+                *UdpContext
+    );
+
+
+typedef
+NTSTATUS
+(*PKS_TCP_COMPLETION_ROUTINE) (
+    IN PIRP     Irp,
+    IN struct _KS_TCP_COMPLETION_CONTEXT
+                *TcpContext
+    );
+
+//
+// Udp Irp Completion Context
+//
+
+typedef struct _KS_UDP_COMPLETION_CONTEXT {
+
+    PKEVENT                             Event;
+    union {
+        PFILE_OBJECT                    AddressObject;
+        ksock_tconn_t *                 tconn;
+    };
+
+    PKS_UDP_COMPLETION_ROUTINE          CompletionRoutine;
+    PVOID                               CompletionContext;
+
+} KS_UDP_COMPLETION_CONTEXT, *PKS_UDP_COMPLETION_CONTEXT;
+
+
+//
+// Tcp Irp Completion Context (used by tcp data recv/send)
+//
+
+typedef struct _KS_TCP_COMPLETION_CONTEXT {
+
+    PKEVENT                             Event;      // Event to be waited on by Irp caller ...
+
+    ksock_tconn_t *                     tconn;      // the tdi connection
+
+    PKS_TCP_COMPLETION_ROUTINE          CompletionRoutine;
+    PVOID                               CompletionContext;
+    PVOID                               CompletionContext2;
+
+    PKS_TSDUMGR                         KsTsduMgr;  // Tsdu buffer manager
+
+    //
+    // These two members are for NON_BLOCKING transmission
+    //
+
+    BOOLEAN                                                        bCounted;    // indicates a refcount is needed
+                                                     // to execute CompletionRoutine
+    ULONG                               ReferCount;  // Reference count of this structure
+
+} KS_TCP_COMPLETION_CONTEXT, *PKS_TCP_COMPLETION_CONTEXT;
+
+typedef KS_TCP_COMPLETION_CONTEXT  ksock_tdi_tx_t, ksock_tdi_rx_t;
+
+
+/*
+ * tdi extensions
+ */
+
+#define IOCTL_TCP_QUERY_INFORMATION_EX        \
+                        CTL_CODE(FILE_DEVICE_NETWORK, 0, METHOD_NEITHER, FILE_ANY_ACCESS)
+#define IOCTL_TCP_SET_INFORMATION_EX        \
+                        CTL_CODE(FILE_DEVICE_NETWORK, 1, METHOD_BUFFERED, FILE_WRITE_ACCESS)
+
+
+#define TcpBuildSetInformationEx(Irp, DevObj, FileObj, CompRoutine, Contxt, Buffer, BufferLen)\
+    {                                                                        \
+        PIO_STACK_LOCATION _IRPSP;                                           \
+        if ( CompRoutine != NULL) {                                          \
+            IoSetCompletionRoutine( Irp, CompRoutine, Contxt, TRUE, TRUE, TRUE);\
+        } else {                                                             \
+            IoSetCompletionRoutine( Irp, NULL, NULL, FALSE, FALSE, FALSE);   \
+        }                                                                    \
+        _IRPSP = IoGetNextIrpStackLocation (Irp);                            \
+        _IRPSP->MajorFunction = IRP_MJ_DEVICE_CONTROL;                       \
+        _IRPSP->DeviceObject = DevObj;                                       \
+        _IRPSP->FileObject = FileObj;                                        \
+        _IRPSP->Parameters.DeviceIoControl.OutputBufferLength = 0;           \
+        _IRPSP->Parameters.DeviceIoControl.InputBufferLength = BufferLen;    \
+        _IRPSP->Parameters.DeviceIoControl.IoControlCode = IOCTL_TCP_SET_INFORMATION_EX;  \
+        Irp->AssociatedIrp.SystemBuffer = Buffer;                            \
+    }
+
+
+#define TcpBuildQueryInformationEx(Irp, DevObj, FileObj, CompRoutine, Contxt, InBuffer, InLength, OutBuffer, OutLength)\
+    {                                                                        \
+        PIO_STACK_LOCATION _IRPSP;                                           \
+        if ( CompRoutine != NULL) {                                          \
+            IoSetCompletionRoutine( Irp, CompRoutine, Contxt, TRUE, TRUE, TRUE);\
+        } else {                                                             \
+            IoSetCompletionRoutine( Irp, NULL, NULL, FALSE, FALSE, FALSE);   \
+        }                                                                    \
+        _IRPSP = IoGetNextIrpStackLocation (Irp);                            \
+        _IRPSP->MajorFunction = IRP_MJ_DEVICE_CONTROL;                       \
+        _IRPSP->DeviceObject = DevObj;                                       \
+        _IRPSP->FileObject = FileObj;                                        \
+        _IRPSP->Parameters.DeviceIoControl.OutputBufferLength = OutLength;           \
+        _IRPSP->Parameters.DeviceIoControl.InputBufferLength = InLength;    \
+        _IRPSP->Parameters.DeviceIoControl.IoControlCode = IOCTL_TCP_QUERY_INFORMATION_EX;  \
+        _IRPSP->Parameters.DeviceIoControl.Type3InputBuffer = InBuffer;    \
+        Irp->UserBuffer = OutBuffer;                            \
+    }
+
+
+/* Module-global state of the winnt TDI socket layer: slab caches for
+ * tconn/tsdu allocation, the lists of live connections and listening
+ * daemons, and the locks/events serialising access to them. */
+typedef struct
+{
+        /*
+         *  Tdinal internal definitions
+         */
+
+        int               ksnd_init;            /* initialisation state */
+
+        TDI_PROVIDER_INFO ksnd_provider;    /* tdi tcp/ip provider's information */
+
+        spinlock_t        ksnd_tconn_lock;      /* tdi connections access serialise */
+
+        int               ksnd_ntconns;         /* number of tconns attached in list */
+        struct list_head  ksnd_tconns;          /* tdi connections list */
+        cfs_mem_cache_t * ksnd_tconn_slab;      /* slabs for ksock_tconn_t allocations */
+        event_t           ksnd_tconn_exit;      /* exit event to be signaled by the last tconn */
+
+        spinlock_t        ksnd_tsdu_lock;       /* tsdu access serialise */
+        
+        int               ksnd_ntsdus;          /* number of tsdu buffers allocated */
+        ulong_ptr     ksnd_tsdu_size;       /* the size of a single tsdu buffer */
+        cfs_mem_cache_t * ksnd_tsdu_slab;       /* slab cache for tsdu buffer allocation */
+
+        int               ksnd_nfreetsdus;      /* number of tsdu buffers in the freed list */
+        struct list_head  ksnd_freetsdus;          /* List of the freed Tsdu buffer. */
+
+        spinlock_t        ksnd_daemon_lock;     /* stabilize daemon ops */
+        int               ksnd_ndaemons;        /* number of listening daemons */
+        struct list_head  ksnd_daemons;         /* listening daemon list */
+        event_t           ksnd_daemon_exit;     /* the last daemon quitting should signal it */
+
+} ks_data_t;
+
+/* Initialise the module-global TDI state (ks_data_t). */
+int
+ksocknal_init_tdi_data(void);
+
+/* Tear down everything set up by ksocknal_init_tdi_data(). */
+void
+ksocknal_fini_tdi_data(void);
+
+
+#endif /* __KERNEL__ */
+#endif /* __LIBCFS_WINNT_TCPIP_H__ */
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/lnet/include/libcfs/winnt/winnt-time.h b/lnet/include/libcfs/winnt/winnt-time.h
new file mode 100644 (file)
index 0000000..096edab
--- /dev/null
@@ -0,0 +1,322 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under the
+ * terms of version 2 of the GNU General Public License as published by the
+ * Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass
+ * Ave, Cambridge, MA 02139, USA.
+ *
+ * Implementation of portable time API for Winnt (kernel and user-level).
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_LINUX_TIME_H__
+#define __LIBCFS_WINNT_LINUX_TIME_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+/* Portable time API */
+
+/*
+ * Platform provides three opaque data-types:
+ *
+ *  cfs_time_t        represents point in time. This is internal kernel
+ *                    time rather than "wall clock". This time bears no
+ *                    relation to gettimeofday().
+ *
+ *  cfs_duration_t    represents time interval with resolution of internal
+ *                    platform clock
+ *
+ *  cfs_fs_time_t     represents instance in world-visible time. This is
+ *                    used in file-system time-stamps
+ *
+ *  cfs_time_t     cfs_time_current(void);
+ *  cfs_time_t     cfs_time_add    (cfs_time_t, cfs_duration_t);
+ *  cfs_duration_t cfs_time_sub    (cfs_time_t, cfs_time_t);
+ *  int            cfs_time_before (cfs_time_t, cfs_time_t);
+ *  int            cfs_time_beforeq(cfs_time_t, cfs_time_t);
+ *
+ *  cfs_duration_t cfs_duration_build(int64_t);
+ *
+ *  time_t         cfs_duration_sec (cfs_duration_t);
+ *  void           cfs_duration_usec(cfs_duration_t, struct timeval *);
+ *  void           cfs_duration_nsec(cfs_duration_t, struct timespec *);
+ *
+ *  void           cfs_fs_time_current(cfs_fs_time_t *);
+ *  time_t         cfs_fs_time_sec    (cfs_fs_time_t *);
+ *  void           cfs_fs_time_usec   (cfs_fs_time_t *, struct timeval *);
+ *  void           cfs_fs_time_nsec   (cfs_fs_time_t *, struct timespec *);
+ *  int            cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *);
+ *  int            cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *);
+ *
+ *  cfs_duration_t cfs_time_minimal_timeout(void)
+ *
+ *  CFS_TIME_FORMAT
+ *  CFS_DURATION_FORMAT
+ *
+ */
+
+#define ONE_BILLION ((u_int64_t)1000000000)
+#define ONE_MILLION ((u_int64_t)   1000000)
+
+#define HZ (100)
+
+/* Minimal timeval/timespec definitions for winnt (the DDK headers do
+ * not provide them).  NOTE(review): timespec uses unsigned,
+ * pointer-sized fields (ulong_ptr) rather than signed time_t/long --
+ * confirm no caller relies on signed arithmetic on tv_sec/tv_nsec. */
+struct timeval {
+       time_t          tv_sec;         /* seconds */
+       suseconds_t     tv_usec;        /* microseconds */
+};
+
+struct timespec {
+    ulong_ptr tv_sec;   /* seconds */
+    ulong_ptr tv_nsec;  /* nanoseconds */
+};
+
+#ifdef __KERNEL__
+
+#include <libcfs/winnt/portals_compat25.h>
+
+/*
+ * Generic kernel stuff
+ */
+
+typedef struct timeval cfs_fs_time_t;
+
+typedef u_int64_t cfs_time_t;
+typedef int64_t cfs_duration_t;
+
+/* Fill *tv from the system clock.  KeQuerySystemTime() returns 100ns
+ * units since 1601-01-01, so tv_sec here counts seconds since 1601,
+ * NOT the Unix epoch -- NOTE(review): confirm callers only use this
+ * value for deltas, or convert before comparing with Unix times. */
+static inline void do_gettimeofday(struct timeval *tv)
+{
+    LARGE_INTEGER Time;
+
+    KeQuerySystemTime(&Time);
+
+    tv->tv_sec  = (long_ptr) (Time.QuadPart / 10000000);
+    tv->tv_usec = (long_ptr) (Time.QuadPart % 10000000) / 10;
+}
+
+/* Ticks elapsed since boot, rescaled to HZ resolution: the raw tick
+ * count times the tick period (in 100ns units), divided by the number
+ * of 100ns units per jiffy (10000000 / HZ). */
+static inline cfs_time_t JIFFIES()
+{
+    LARGE_INTEGER Tick;
+    LARGE_INTEGER Elapse;
+
+    KeQueryTickCount(&Tick);
+
+    Elapse.QuadPart  = Tick.QuadPart * KeQueryTimeIncrement();
+    Elapse.QuadPart /= (10000000 / HZ);
+
+    return Elapse.QuadPart;
+}
+
+/* Current kernel time, expressed in jiffies. */
+static inline cfs_time_t cfs_time_current(void)
+{
+    cfs_time_t now = JIFFIES();
+    return now;
+}
+
+/* Current kernel time, expressed in whole seconds. */
+static inline cfs_time_t cfs_time_current_sec(void)
+{
+    cfs_time_t ticks = JIFFIES();
+    return ticks / HZ;
+}
+
+/* Advance the time point 't' by the duration 'd' (both in jiffies). */
+static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d)
+{
+    cfs_time_t later = t;
+    later += d;
+    return later;
+}
+
+/* Duration (in jiffies) from time point 't2' up to time point 't1'. */
+static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2)
+{
+    cfs_duration_t delta = (cfs_duration_t)(t1 - t2);
+    return delta;
+}
+
+/* Non-zero iff 't1' is strictly earlier than 't2' (signed-difference
+ * comparison, so it stays correct across counter wrap). */
+static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2)
+{
+    int64_t diff = (int64_t)t1 - (int64_t)t2;
+    return diff < 0;
+}
+
+/* Non-zero iff 't1' is earlier than or equal to 't2' (signed-difference
+ * comparison, so it stays correct across counter wrap). */
+static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2)
+{
+    int64_t diff = (int64_t)t1 - (int64_t)t2;
+    return diff <= 0;
+}
+
+/* Read the current wall-clock time into *t: seconds since the Unix
+ * epoch (via RtlTimeToSecondsSince1970) plus microseconds. */
+static inline void cfs_fs_time_current(cfs_fs_time_t *t)
+{
+    ULONG         Linux;
+    LARGE_INTEGER Sys;
+
+    KeQuerySystemTime(&Sys);
+
+    RtlTimeToSecondsSince1970(&Sys, &Linux);
+
+    t->tv_sec  = Linux;
+    /* Take the remainder of the full 64-bit 100ns count: 2^32 is not a
+     * multiple of 10^7, so (LowPart % 10000000) differs from
+     * (QuadPart % 10000000) and the old LowPart form yielded a wrong
+     * sub-second value. */
+    t->tv_usec = (suseconds_t)((Sys.QuadPart % 10000000) / 10);
+}
+
+/* Seconds component of a file-system timestamp. */
+static inline cfs_time_t cfs_fs_time_sec(cfs_fs_time_t *t)
+{
+    return (cfs_time_t)t->tv_sec;
+}
+
+/* Flatten a (sec, usec) timestamp into one microsecond count, used by
+ * the cfs_fs_time_before/beforeq comparisons. */
+static inline u_int64_t __cfs_fs_time_flat(cfs_fs_time_t *t)
+{
+    u_int64_t usec = (u_int64_t)t->tv_sec;
+    usec *= ONE_MILLION;
+    usec += t->tv_usec;
+    return usec;
+}
+
+/* Non-zero iff timestamp 't1' precedes 't2'. */
+static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+    u_int64_t a = __cfs_fs_time_flat(t1);
+    u_int64_t b = __cfs_fs_time_flat(t2);
+    return a < b;
+}
+
+/* Non-zero iff timestamp 't1' precedes or equals 't2'. */
+static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+    u_int64_t a = __cfs_fs_time_flat(t1);
+    u_int64_t b = __cfs_fs_time_flat(t2);
+    return a <= b;
+}
+
+/* Convert a count of seconds into a duration in jiffies. */
+static inline cfs_duration_t cfs_time_seconds(int seconds)
+{
+    cfs_duration_t ticks = (cfs_duration_t)seconds;
+    return ticks * HZ;
+}
+
+#if 0 // defined in libcfs/libcfs.h
+static inline cfs_time_t cfs_time_shift(int seconds)
+{
+        return (JIFFIES() + seconds * HZ);
+}
+#endif
+
+/* Whole seconds contained in the jiffies duration 'd' (truncating). */
+static inline cfs_time_t cfs_duration_sec(cfs_duration_t d)
+{
+        cfs_duration_t secs = d / HZ;
+        return secs;
+}
+
+/* Split the jiffies duration 'd' into *s as seconds + microseconds.
+ * tv_sec must be assigned before it is used in the tv_usec expression:
+ * the original computed tv_usec first, reading the still-uninitialised
+ * tv_sec (undefined behavior), and also had the two casts swapped. */
+static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s)
+{
+        s->tv_sec  = (time_t)(d / HZ);
+        s->tv_usec = (suseconds_t)((d - (cfs_duration_t)s->tv_sec * HZ) * ONE_MILLION / HZ);
+}
+
+/* Split the jiffies duration 'd' into *s as seconds + nanoseconds.
+ * tv_sec must be assigned before it is used in the tv_nsec expression:
+ * the original computed tv_nsec first, reading the still-uninitialised
+ * tv_sec (undefined behavior).  Casts use the timespec field type
+ * (ulong_ptr) rather than the unrelated time_t/suseconds_t. */
+static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s)
+{
+        s->tv_sec  = (ulong_ptr)(d / HZ);
+        s->tv_nsec = (ulong_ptr)((d - (cfs_duration_t)s->tv_sec * HZ) * ONE_BILLION / HZ);
+}
+
+/* Smallest usable timeout: a single jiffy (see also CFS_MIN_DELAY). */
+static inline cfs_duration_t cfs_time_minimal_timeout(void)
+{
+        return (cfs_duration_t)1;
+}
+
+/* Copy a file-system timestamp into a struct timeval (cfs_fs_time_t is
+ * itself a struct timeval here, so this is a plain struct copy). */
+static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
+{
+        *v = *t;
+}
+
+/* Convert a (sec, usec) timestamp into a (sec, nsec) timespec. */
+static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
+{
+        s->tv_nsec = 1000 * t->tv_usec;
+        s->tv_sec  = t->tv_sec;
+}
+
+
+/* inline function cfs_time_minimal_timeout() can not be used
+ * to initiallize static variable */
+#define CFS_MIN_DELAY           (1)
+
+#define LTIME_S(t)                     (t)
+
+#define CFS_TIME_T              "%I64u"
+#define CFS_DURATION_T          "%I64d"
+
+#else   /* !__KERNEL__ */
+
+/*
+ * Liblustre. time(2) based implementation.
+ */
+#include <libcfs/user-time.h>
+
+
+//
+// Time routines ...
+//
+
+/*
+ * User-level declarations of the native NT/Win32 time routines.
+ * NOTE(review): NtQuerySystemTime is declared here as returning CCHAR;
+ * the native prototype returns NTSTATUS -- confirm this matches the
+ * import library actually linked against.
+ */
+NTSYSAPI
+CCHAR
+NTAPI
+NtQuerySystemTime(
+    OUT PLARGE_INTEGER  CurrentTime
+    );
+
+
+NTSYSAPI
+BOOLEAN
+NTAPI
+RtlTimeToSecondsSince1970(
+    IN PLARGE_INTEGER  Time,
+    OUT PULONG  ElapsedSeconds
+    );
+
+
+NTSYSAPI
+VOID
+NTAPI
+RtlSecondsSince1970ToTime(
+    IN ULONG  ElapsedSeconds,
+    OUT PLARGE_INTEGER  Time
+    );
+
+NTSYSAPI
+VOID
+NTAPI
+Sleep(
+  DWORD dwMilliseconds   // sleep time in milliseconds
+);
+
+
+/* POSIX-style sleep(): 'time' is in seconds; Win32 Sleep() takes ms. */
+static inline void sleep(int time)
+{
+    Sleep((DWORD)(1000 * time));
+}
+
+
+/* Fill *tv from the system clock (user-level variant using
+ * NtQuerySystemTime).  The raw value is 100ns units since 1601-01-01,
+ * so tv_sec counts seconds since 1601, NOT the Unix epoch --
+ * NOTE(review): confirm callers only use this value for deltas. */
+static inline void do_gettimeofday(struct timeval *tv)
+{
+    LARGE_INTEGER Time;
+
+    NtQuerySystemTime(&Time);
+
+    tv->tv_sec  = (long_ptr) (Time.QuadPart / 10000000);
+    tv->tv_usec = (long_ptr) (Time.QuadPart % 10000000) / 10;
+}
+
+/* POSIX-style gettimeofday(); the timezone argument 'tz' is ignored.
+ * Always returns 0. */
+static inline int gettimeofday(struct timeval *tv, void * tz)
+{
+    do_gettimeofday(tv);
+    return 0;
+}
+
+#endif /* __KERNEL__ */
+
+/* __LIBCFS_WINNT_LINUX_TIME_H__ */
+#endif
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/lnet/include/libcfs/winnt/winnt-types.h b/lnet/include/libcfs/winnt/winnt-types.h
new file mode 100644 (file)
index 0000000..6478730
--- /dev/null
@@ -0,0 +1,647 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic types definitions
+ *
+ */
+
+#ifndef _WINNT_TYPE_H
+#define _WINNT_TYPE_H
+
+#ifdef __KERNEL__
+
+#include <ntifs.h>
+#include <windef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#include <tdi.h>
+#include <tdikrnl.h>
+#include <tdiinfo.h>
+
+#else
+
+#include <ntddk.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <time.h>
+#include <io.h>
+#include <string.h>
+#include <assert.h>
+
+#endif
+
+
+#define __LITTLE_ENDIAN
+
+#define inline     __inline
+#define __inline__ __inline
+
+typedef unsigned __int8     __u8;
+typedef signed   __int8     __s8;
+
+typedef signed   __int64    __s64;
+typedef unsigned __int64    __u64;
+
+typedef        signed   __int16        __s16;
+typedef        unsigned __int16        __u16;
+
+typedef        signed   __int32        __s32;
+typedef        unsigned __int32        __u32;
+
+typedef        signed   __int64        __s64;
+typedef        unsigned __int64        __u64;
+
+typedef unsigned long       ULONG;
+
+
+#if defined(_WIN64)
+    #define long_ptr        __int64
+    #define ulong_ptr       unsigned __int64
+    #define BITS_PER_LONG   (64)
+#else
+    #define long_ptr        long
+    #define ulong_ptr       unsigned long
+    #define BITS_PER_LONG   (32)
+
+#endif
+
+/* bsd */
+typedef unsigned char          u_char;
+typedef unsigned short         u_short;
+typedef unsigned int           u_int;
+typedef unsigned long          u_long;
+
+/* sysv */
+typedef unsigned char          unchar;
+typedef unsigned short         ushort;
+typedef unsigned int           uint;
+typedef unsigned long          ulong;
+
+#ifndef __BIT_TYPES_DEFINED__
+#define __BIT_TYPES_DEFINED__
+
+typedef                __u8            u_int8_t;
+typedef                __s8            int8_t;
+typedef                __u16           u_int16_t;
+typedef                __s16           int16_t;
+typedef                __u32           u_int32_t;
+typedef                __s32           int32_t;
+
+#endif /* !(__BIT_TYPES_DEFINED__) */
+
+typedef                __u8            uint8_t;
+typedef                __u16           uint16_t;
+typedef                __u32           uint32_t;
+
+typedef                __u64           uint64_t;
+typedef                __u64           u_int64_t;
+typedef                __s64           int64_t;
+
+typedef long            ssize_t;
+
+typedef __u32           suseconds_t;
+
+typedef __u32           pid_t, tid_t;
+
+typedef __u16           uid_t, gid_t;
+
+typedef __u16           mode_t;
+typedef __u16           umode_t;
+
+typedef ulong_ptr       sigset_t;
+
+typedef uint64_t        loff_t;
+typedef HANDLE          cfs_handle_t;
+typedef uint64_t        cycles_t;
+
+#ifndef INVALID_HANDLE_VALUE
+#define INVALID_HANDLE_VALUE ((HANDLE)-1)
+#endif
+
+
+#ifdef __KERNEL__ /* kernel */
+
+typedef __u32           off_t;
+typedef __u32           time_t;
+
+typedef unsigned short  kdev_t;
+
+#else  /* !__KERNEL__ */
+
+typedef int             BOOL;
+typedef __u8            BYTE;
+typedef __u16           WORD;
+typedef __u32           DWORD;
+
+#endif /* __KERNEL__ */
+
+/*
+ * Constants suffix
+ */
+
+#define ULL i64
+#define ull i64
+
+/*
+ * Winnt kernel has no capabilities.
+ */
+
+typedef __u32 cfs_kernel_cap_t;
+
+#define INT_MAX         ((int)(~0U>>1))
+#define INT_MIN         (-INT_MAX - 1)
+#define UINT_MAX        (~0U)
+
+#endif /* _WINNT_TYPE_H */
+
+/* NOTE(review): everything below this #endif is outside the
+ * _WINNT_TYPE_H include guard and will be re-processed on repeated
+ * inclusion -- confirm this is intentional. */
+
+
+/*
+ *  Bytes order 
+ */
+
+//
+// Byte order swapping routines
+//
+
+
+#define ___swab16(x) RtlUshortByteSwap(x)
+#define ___swab32(x) RtlUlongByteSwap(x)
+#define ___swab64(x) RtlUlonglongByteSwap(x)
+
+#define ___constant_swab16(x) \
+       ((__u16)( \
+               (((__u16)(x) & (__u16)0x00ffU) << 8) | \
+               (((__u16)(x) & (__u16)0xff00U) >> 8) ))
+
+#define ___constant_swab32(x) \
+       ((__u32)( \
+               (((__u32)(x) & (__u32)0x000000ffUL) << 24) | \
+               (((__u32)(x) & (__u32)0x0000ff00UL) <<  8) | \
+               (((__u32)(x) & (__u32)0x00ff0000UL) >>  8) | \
+               (((__u32)(x) & (__u32)0xff000000UL) >> 24) ))
+
+#define ___constant_swab64(x) \
+       ((__u64)( \
+               (__u64)(((__u64)(x) & (__u64)0x00000000000000ffUL) << 56) | \
+               (__u64)(((__u64)(x) & (__u64)0x000000000000ff00UL) << 40) | \
+               (__u64)(((__u64)(x) & (__u64)0x0000000000ff0000UL) << 24) | \
+               (__u64)(((__u64)(x) & (__u64)0x00000000ff000000UL) <<  8) | \
+               (__u64)(((__u64)(x) & (__u64)0x000000ff00000000UL) >>  8) | \
+               (__u64)(((__u64)(x) & (__u64)0x0000ff0000000000UL) >> 24) | \
+               (__u64)(((__u64)(x) & (__u64)0x00ff000000000000UL) >> 40) | \
+               (__u64)(((__u64)(x) & (__u64)0xff00000000000000UL) >> 56) ))
+
+
+#define __swab16(x)  ___constant_swab16(x)
+#define __swab32(x)  ___constant_swab32(x)
+#define __swab64(x)  ___constant_swab64(x)
+
+#define __swab16s(x) do { *(x) = __swab16((USHORT)(*(x)));} while(0)
+#define __swab32s(x) do { *(x) = __swab32((ULONG)(*(x)));} while(0)
+#define __swab64s(x) do { *(x) = __swab64((ULONGLONG)(*(x)));} while(0)
+
+#define __constant_htonl(x) ___constant_swab32((x))
+#define __constant_ntohl(x) ___constant_swab32((x))
+#define __constant_htons(x) ___constant_swab16((x))
+#define __constant_ntohs(x) ___constant_swab16((x))
+#define __constant_cpu_to_le64(x) ((__u64)(x))
+#define __constant_le64_to_cpu(x) ((__u64)(x))
+#define __constant_cpu_to_le32(x) ((__u32)(x))
+#define __constant_le32_to_cpu(x) ((__u32)(x))
+#define __constant_cpu_to_le16(x) ((__u16)(x))
+#define __constant_le16_to_cpu(x) ((__u16)(x))
+#define __constant_cpu_to_be64(x) ___constant_swab64((x))
+#define __constant_be64_to_cpu(x) ___constant_swab64((x))
+#define __constant_cpu_to_be32(x) ___constant_swab32((x))
+#define __constant_be32_to_cpu(x) ___constant_swab32((x))
+#define __constant_cpu_to_be16(x) ___constant_swab16((x))
+#define __constant_be16_to_cpu(x) ___constant_swab16((x))
+#define __cpu_to_le64(x) ((__u64)(x))
+#define __le64_to_cpu(x) ((__u64)(x))
+#define __cpu_to_le32(x) ((__u32)(x))
+#define __le32_to_cpu(x) ((__u32)(x))
+#define __cpu_to_le16(x) ((__u16)(x))
+#define __le16_to_cpu(x) ((__u16)(x))
+#define __cpu_to_be64(x) __swab64((x))
+#define __be64_to_cpu(x) __swab64((x))
+#define __cpu_to_be32(x) __swab32((x))
+#define __be32_to_cpu(x) __swab32((x))
+#define __cpu_to_be16(x) __swab16((x))
+#define __be16_to_cpu(x) __swab16((x))
+#define __cpu_to_le64p(x) (*(__u64*)(x))
+#define __le64_to_cpup(x) (*(__u64*)(x))
+#define __cpu_to_le32p(x) (*(__u32*)(x))
+#define __le32_to_cpup(x) (*(__u32*)(x))
+#define __cpu_to_le16p(x) (*(__u16*)(x))
+#define __le16_to_cpup(x) (*(__u16*)(x))
+#define __cpu_to_be64p(x) __swab64p((x))
+#define __be64_to_cpup(x) __swab64p((x))
+#define __cpu_to_be32p(x) __swab32p((x))
+#define __be32_to_cpup(x) __swab32p((x))
+#define __cpu_to_be16p(x) __swab16p((x))
+#define __be16_to_cpup(x) __swab16p((x))
+#define __cpu_to_le64s(x) do {} while (0)
+#define __le64_to_cpus(x) do {} while (0)
+#define __cpu_to_le32s(x) do {} while (0)
+#define __le32_to_cpus(x) do {} while (0)
+#define __cpu_to_le16s(x) do {} while (0)
+#define __le16_to_cpus(x) do {} while (0)
+#define __cpu_to_be64s(x) __swab64s((x))
+#define __be64_to_cpus(x) __swab64s((x))
+#define __cpu_to_be32s(x) __swab32s((x))
+#define __be32_to_cpus(x) __swab32s((x))
+#define __cpu_to_be16s(x) __swab16s((x))
+#define __be16_to_cpus(x) __swab16s((x))
+
+#ifndef cpu_to_le64
+#define cpu_to_le64 __cpu_to_le64
+#define le64_to_cpu __le64_to_cpu
+#define cpu_to_le32 __cpu_to_le32
+#define le32_to_cpu __le32_to_cpu
+#define cpu_to_le16 __cpu_to_le16
+#define le16_to_cpu __le16_to_cpu
+#endif
+
+#define cpu_to_be64 __cpu_to_be64
+#define be64_to_cpu __be64_to_cpu
+#define cpu_to_be32 __cpu_to_be32
+#define be32_to_cpu __be32_to_cpu
+#define cpu_to_be16 __cpu_to_be16
+#define be16_to_cpu __be16_to_cpu
+#define cpu_to_le64p __cpu_to_le64p
+#define le64_to_cpup __le64_to_cpup
+#define cpu_to_le32p __cpu_to_le32p
+#define le32_to_cpup __le32_to_cpup
+#define cpu_to_le16p __cpu_to_le16p
+#define le16_to_cpup __le16_to_cpup
+#define cpu_to_be64p __cpu_to_be64p
+#define be64_to_cpup __be64_to_cpup
+#define cpu_to_be32p __cpu_to_be32p
+#define be32_to_cpup __be32_to_cpup
+#define cpu_to_be16p __cpu_to_be16p
+#define be16_to_cpup __be16_to_cpup
+#define cpu_to_le64s __cpu_to_le64s
+#define le64_to_cpus __le64_to_cpus
+#define cpu_to_le32s __cpu_to_le32s
+#define le32_to_cpus __le32_to_cpus
+#define cpu_to_le16s __cpu_to_le16s
+#define le16_to_cpus __le16_to_cpus
+#define cpu_to_be64s __cpu_to_be64s
+#define be64_to_cpus __be64_to_cpus
+#define cpu_to_be32s __cpu_to_be32s
+#define be32_to_cpus __be32_to_cpus
+#define cpu_to_be16s __cpu_to_be16s
+#define be16_to_cpus __be16_to_cpus
+
+
+//
+// Network to host byte swap functions
+//
+
+#define ntohl(x)           ( ( ( ( x ) & 0x000000ff ) << 24 ) | \
+                             ( ( ( x ) & 0x0000ff00 ) << 8 ) | \
+                             ( ( ( x ) & 0x00ff0000 ) >> 8 ) | \
+                             ( ( ( x ) & 0xff000000 ) >> 24 )   )
+
+#define ntohs(x)           ( ( ( ( x ) & 0xff00 ) >> 8 ) | \
+                             ( ( ( x ) & 0x00ff ) << 8 ) )
+
+
+#define htonl(x)           ntohl(x)
+#define htons(x)           ntohs(x)
+
+
+
+#ifndef _I386_ERRNO_H
+#define _I386_ERRNO_H
+
+#define        EPERM            1      /* Operation not permitted */
+#define        ENOENT           2      /* No such file or directory */
+#define        ESRCH            3      /* No such process */
+#define        EINTR            4      /* Interrupted system call */
+#define        EIO                  5  /* I/O error */
+#define        ENXIO            6      /* No such device or address */
+#define        E2BIG            7      /* Arg list too long */
+#define        ENOEXEC          8      /* Exec format error */
+#define        EBADF            9      /* Bad file number */
+#define        ECHILD          10      /* No child processes */
+#define        EAGAIN          11      /* Try again */
+#define        ENOMEM          12      /* Out of memory */
+#define        EACCES          13      /* Permission denied */
+#define        EFAULT          14      /* Bad address */
+#define        ENOTBLK         15      /* Block device required */
+#define        EBUSY           16      /* Device or resource busy */
+#define        EEXIST          17      /* File exists */
+#define        EXDEV           18      /* Cross-device link */
+#define        ENODEV          19      /* No such device */
+#define        ENOTDIR         20      /* Not a directory */
+#define        EISDIR          21      /* Is a directory */
+#define        EINVAL          22      /* Invalid argument */
+#define        ENFILE          23      /* File table overflow */
+#define        EMFILE          24      /* Too many open files */
+#define        ENOTTY          25      /* Not a typewriter */
+#define        ETXTBSY         26      /* Text file busy */
+#define        EFBIG           27      /* File too large */
+#define        ENOSPC          28      /* No space left on device */
+#define        ESPIPE          29      /* Illegal seek */
+#define        EROFS           30      /* Read-only file system */
+#define        EMLINK          31      /* Too many links */
+#define        EPIPE           32      /* Broken pipe */
+#define        EDOM            33      /* Math argument out of domain of func */
+#define        ERANGE          34      /* Math result not representable */
+#undef EDEADLK
+#define        EDEADLK         35      /* Resource deadlock would occur */
+#undef ENAMETOOLONG
+#define        ENAMETOOLONG    36      /* File name too long */
+#undef ENOLCK
+#define        ENOLCK          37      /* No record locks available */
+#undef ENOSYS
+#define        ENOSYS          38      /* Function not implemented */
+#undef ENOTEMPTY
+#define        ENOTEMPTY       39      /* Directory not empty */
+#define        ELOOP           40      /* Too many symbolic links encountered */
+#define        EWOULDBLOCK     EAGAIN  /* Operation would block */
+#define        ENOMSG          42      /* No message of desired type */
+#define        EIDRM           43      /* Identifier removed */
+#define        ECHRNG          44      /* Channel number out of range */
+#define        EL2NSYNC        45      /* Level 2 not synchronized */
+#define        EL3HLT          46      /* Level 3 halted */
+#define        EL3RST          47      /* Level 3 reset */
+#define        ELNRNG          48      /* Link number out of range */
+#define        EUNATCH         49      /* Protocol driver not attached */
+#define        ENOCSI          50      /* No CSI structure available */
+#define        EL2HLT          51      /* Level 2 halted */
+#define        EBADE           52      /* Invalid exchange */
+#define        EBADR           53      /* Invalid request descriptor */
+#define        EXFULL          54      /* Exchange full */
+#define        ENOANO          55      /* No anode */
+#define        EBADRQC         56      /* Invalid request code */
+#define        EBADSLT         57      /* Invalid slot */
+
+#define        EDEADLOCK       EDEADLK
+
+#define        EBFONT          59      /* Bad font file format */
+#define        ENOSTR          60      /* Device not a stream */
+#define        ENODATA         61      /* No data available */
+#define        ETIME           62      /* Timer expired */
+#define        ENOSR           63      /* Out of streams resources */
+#define        ENONET          64      /* Machine is not on the network */
+#define        ENOPKG          65      /* Package not installed */
+#define        EREMOTE         66      /* Object is remote */
+#define        ENOLINK         67      /* Link has been severed */
+#define        EADV            68      /* Advertise error */
+#define        ESRMNT          69      /* Srmount error */
+#define        ECOMM           70      /* Communication error on send */
+#define        EPROTO          71      /* Protocol error */
+#define        EMULTIHOP       72      /* Multihop attempted */
+#define        EDOTDOT         73      /* RFS specific error */
+#define        EBADMSG         74      /* Not a data message */
+#define        EOVERFLOW       75      /* Value too large for defined data type */
+#define        ENOTUNIQ        76      /* Name not unique on network */
+#define        EBADFD          77      /* File descriptor in bad state */
+#define        EREMCHG         78      /* Remote address changed */
+#define        ELIBACC         79      /* Can not access a needed shared library */
+#define        ELIBBAD         80      /* Accessing a corrupted shared library */
+#define        ELIBSCN         81      /* .lib section in a.out corrupted */
+#define        ELIBMAX         82      /* Attempting to link in too many shared libraries */
+#define        ELIBEXEC        83      /* Cannot exec a shared library directly */
+#undef EILSEQ
+#define        EILSEQ          84      /* Illegal byte sequence */
+#define        ERESTART        85      /* Interrupted system call should be restarted */
+#define        ESTRPIPE        86      /* Streams pipe error */
+#define        EUSERS          87      /* Too many users */
+#define        ENOTSOCK        88      /* Socket operation on non-socket */
+#define        EDESTADDRREQ    89      /* Destination address required */
+#define        EMSGSIZE        90      /* Message too long */
+#define        EPROTOTYPE      91      /* Protocol wrong type for socket */
+#define        ENOPROTOOPT     92      /* Protocol not available */
+#define        EPROTONOSUPPORT 93      /* Protocol not supported */
+#define        ESOCKTNOSUPPORT 94      /* Socket type not supported */
+#define        EOPNOTSUPP      95      /* Operation not supported on transport endpoint */
+#define        EPFNOSUPPORT    96      /* Protocol family not supported */
+#define        EAFNOSUPPORT    97      /* Address family not supported by protocol */
+#define        EADDRINUSE      98      /* Address already in use */
+#define        EADDRNOTAVAIL   99      /* Cannot assign requested address */
+#define        ENETDOWN        100     /* Network is down */
+#define        ENETUNREACH     101     /* Network is unreachable */
+#define        ENETRESET       102     /* Network dropped connection because of reset */
+#define        ECONNABORTED    103     /* Software caused connection abort */
+#define        ECONNRESET      104     /* Connection reset by peer */
+#define        ENOBUFS         105     /* No buffer space available */
+#define        EISCONN         106     /* Transport endpoint is already connected */
+#define        ENOTCONN        107     /* Transport endpoint is not connected */
+#define        ESHUTDOWN       108     /* Cannot send after transport endpoint shutdown */
+#define        ETOOMANYREFS    109     /* Too many references: cannot splice */
+#define        ETIMEDOUT       110     /* Connection timed out */
+#define        ECONNREFUSED    111     /* Connection refused */
+#define        EHOSTDOWN       112     /* Host is down */
+#define        EHOSTUNREACH    113     /* No route to host */
+#define        EALREADY        114     /* Operation already in progress */
+#define        EINPROGRESS     115     /* Operation now in progress */
+#define        ESTALE          116     /* Stale NFS file handle */
+#define        EUCLEAN         117     /* Structure needs cleaning */
+#define        ENOTNAM         118     /* Not a XENIX named type file */
+#define        ENAVAIL         119     /* No XENIX semaphores available */
+#define        EISNAM          120     /* Is a named type file */
+#define        EREMOTEIO       121     /* Remote I/O error */
+#define        EDQUOT          122     /* Quota exceeded */
+
+#define        ENOMEDIUM       123     /* No medium found */
+#define        EMEDIUMTYPE     124     /* Wrong medium type */
+
+/* Should never be seen by user programs */
+#define ERESTARTSYS    512
+#define ERESTARTNOINTR 513
+#define ERESTARTNOHAND 514     /* restart if no handler.. */
+#define ENOIOCTLCMD    515     /* No ioctl command */
+
+/* Defined for the NFSv3 protocol */
+#define EBADHANDLE     521     /* Illegal NFS file handle */
+#define ENOTSYNC       522     /* Update synchronization mismatch */
+#define EBADCOOKIE     523     /* Cookie is stale */
+#define ENOTSUPP       524     /* Operation is not supported */
+#define ETOOSMALL      525     /* Buffer or request is too small */
+#define ESERVERFAULT   526     /* An untranslatable error occurred */
+#define EBADTYPE       527     /* Type not supported by server */
+#define EJUKEBOX       528     /* Request initiated, but will not complete before timeout */
+
+
+
+/* open/fcntl - O_SYNC is only implemented on blocks devices and on files
+   located on an ext2 file system */
+#define O_ACCMODE         0003
+#define O_RDONLY            00
+#define O_WRONLY            01
+#define O_RDWR              02
+#define O_CREAT                   0100 /* not fcntl */
+#define O_EXCL            0200 /* not fcntl */
+#define O_NOCTTY          0400 /* not fcntl */
+#define O_TRUNC                  01000 /* not fcntl */
+#define O_APPEND         02000
+#define O_NONBLOCK       04000
+#define O_NDELAY       O_NONBLOCK
+#define O_SYNC          010000
+#define FASYNC          020000 /* fcntl, for BSD compatibility */
+#define O_DIRECT        040000 /* direct disk access hint */
+#define O_LARGEFILE    0100000
+#define O_DIRECTORY    0200000 /* must be a directory */
+#define O_NOFOLLOW     0400000 /* don't follow links */
+
+#define F_DUPFD                0       /* dup */
+#define F_GETFD                1       /* get close_on_exec */
+#define F_SETFD                2       /* set/clear close_on_exec */
+#define F_GETFL                3       /* get file->f_flags */
+#define F_SETFL                4       /* set file->f_flags */
+#define F_GETLK                5
+#define F_SETLK                6
+#define F_SETLKW       7
+
+#define F_SETOWN       8       /*  for sockets. */
+#define F_GETOWN       9       /*  for sockets. */
+#define F_SETSIG       10      /*  for sockets. */
+#define F_GETSIG       11      /*  for sockets. */
+
+#define F_GETLK64      12      /*  using 'struct flock64' */
+#define F_SETLK64      13
+#define F_SETLKW64     14
+
+/* for F_[GET|SET]FL */
+#define FD_CLOEXEC     1       /* actually anything with low bit set goes */
+
+/* for posix fcntl() and lockf() */
+#define F_RDLCK                0
+#define F_WRLCK                1
+#define F_UNLCK                2
+
+/* for old implementation of bsd flock () */
+#define F_EXLCK                4       /* or 3 */
+#define F_SHLCK                8       /* or 4 */
+
+/* for leases */
+#define F_INPROGRESS   16
+
+/* operations for bsd flock(), also used by the kernel implementation */
+#define LOCK_SH                1       /* shared lock */
+#define LOCK_EX                2       /* exclusive lock */
+#define LOCK_NB                4       /* or'd with one of the above to prevent
+                                  blocking */
+#define LOCK_UN                8       /* remove lock */
+
+#define LOCK_MAND      32      /* This is a mandatory flock */
+#define LOCK_READ      64      /* ... Which allows concurrent read operations */
+#define LOCK_WRITE     128     /* ... Which allows concurrent write operations */
+#define LOCK_RW                192     /* ... Which allows concurrent read & write ops */
+
+#endif
+
+
+#ifndef LIBCFS_SIGNAL_H
+#define LIBCFS_SIGNAL_H
+
+/*
+ *  signal values ...
+ */
+
+#define SIGHUP          1
+#define SIGINT          2
+#define SIGQUIT                 3
+#define SIGILL          4
+#define SIGTRAP                 5
+#define SIGABRT                 6
+#define SIGIOT          6
+#define SIGBUS          7
+#define SIGFPE          8
+#define SIGKILL                 9
+#define SIGUSR1                10
+#define SIGSEGV                11
+#define SIGUSR2                12
+#define SIGPIPE                13
+#define SIGALRM                14
+#define SIGTERM                15
+#define SIGSTKFLT      16
+#define SIGCHLD                17
+#define SIGCONT                18
+#define SIGSTOP                19
+#define SIGTSTP                20
+#define SIGTTIN                21
+#define SIGTTOU                22
+#define SIGURG         23
+#define SIGXCPU                24
+#define SIGXFSZ                25
+#define SIGVTALRM      26
+#define SIGPROF                27
+#define SIGWINCH       28
+#define SIGIO          29
+#define SIGPOLL                SIGIO
+/*
+#define SIGLOST                29
+*/
+#define SIGPWR         30
+#define SIGSYS         31
+#define        SIGUNUSED       31
+
+/* These should not be considered constants from userland.  */
+#define SIGRTMIN       32
+#define SIGRTMAX       (_NSIG-1)
+
+/*
+ * SA_FLAGS values:
+ *
+ * SA_ONSTACK indicates that a registered stack_t will be used.
+ * SA_INTERRUPT is a no-op, but left due to historical reasons. Use the
+ * SA_RESTART flag to get restarting signals (which were the default long ago)
+ * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
+ * SA_RESETHAND clears the handler when the signal is delivered.
+ * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
+ * SA_NODEFER prevents the current signal from being masked in the handler.
+ *
+ * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
+ * Unix names RESETHAND and NODEFER respectively.
+ */
+#define SA_NOCLDSTOP   0x00000001
+#define SA_NOCLDWAIT   0x00000002 /* not supported yet */
+#define SA_SIGINFO     0x00000004
+#define SA_ONSTACK     0x08000000
+#define SA_RESTART     0x10000000
+#define SA_NODEFER     0x40000000
+#define SA_RESETHAND   0x80000000
+
+#define SA_NOMASK      SA_NODEFER
+#define SA_ONESHOT     SA_RESETHAND
+#define SA_INTERRUPT   0x20000000 /* dummy -- ignored */
+
+#define SA_RESTORER    0x04000000
+
+/* 
+ * sigaltstack controls
+ */
+#define SS_ONSTACK     1
+#define SS_DISABLE     2
+
+#define MINSIGSTKSZ    2048
+#define SIGSTKSZ       8192
+
+
+#define sigmask(sig)   ((__u32)1 << ((sig) - 1))
+
+#endif // LIBCFS_SIGNAL_H
\ No newline at end of file
index 5d09875..717559f 100644 (file)
@@ -1,11 +1,14 @@
 #ifndef __LNET_API_SUPPORT_H__
 #define __LNET_API_SUPPORT_H__
 
-#ifndef __KERNEL__
-# include <stdio.h>
-# include <stdlib.h>
-# include <unistd.h>
-# include <time.h>
+#if defined(__linux__)
+#include <lnet/linux/api-support.h>
+#elif defined(__APPLE__)
+#include <lnet/darwin/api-support.h>
+#elif defined(__WINNT__)
+#include <lnet/winnt/api-support.h>
+#else
+#error Unsupported Operating System
 #endif
 
 #include <lnet/types.h>
index c3b2e2c..409e159 100644 (file)
@@ -1 +1 @@
-EXTRA_DIST := lib-lnet.h  lib-types.h  lnet.h
+EXTRA_DIST := lib-lnet.h  lib-types.h  lnet.h api-support.h
diff --git a/lnet/include/lnet/darwin/api-support.h b/lnet/include/lnet/darwin/api-support.h
new file mode 100644 (file)
index 0000000..f587255
--- /dev/null
@@ -0,0 +1,21 @@
+#ifndef __DARWIN_API_SUPPORT_H__
+#define __DARWIN_API_SUPPORT_H__
+
+#ifndef __LNET_API_SUPPORT_H__
+#error Do not #include this file directly. #include <lnet/api-support.h> instead
+#endif
+
+#ifndef __KERNEL__
+# include <stdio.h>
+# include <stdlib.h>
+# include <unistd.h>
+# include <time.h>
+
+/* Lots of POSIX dependencies to support PtlEQWait_timeout */
+# include <signal.h>
+# include <setjmp.h>
+# include <time.h>
+#endif
+
+
+#endif
index a017d6f..af4bc5d 100644 (file)
@@ -11,4 +11,6 @@
 #include <string.h>
 #include <libcfs/libcfs.h>
 
+#undef LNET_ROUTER
+
 #endif
index 0d93071..dca0d8b 100644 (file)
@@ -13,6 +13,8 @@
 #include <lnet/linux/lib-lnet.h>
 #elif defined(__APPLE__)
 #include <lnet/darwin/lib-lnet.h>
+#elif defined(__WINNT__)
+#include <lnet/winnt/lib-lnet.h>
 #else
 #error Unsupported Operating System
 #endif
@@ -562,7 +564,7 @@ lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset,
                    unsigned int nsiov, struct iovec *siov, unsigned int soffset,
                    unsigned int nob)
 {
-        struct iovec diov = {.iov_base = dest, .iov_len = dlen};
+        struct iovec diov = {/*.iov_base = */ dest, /*.iov_len = */ dlen};
 
         lnet_copy_iov2iov(1, &diov, doffset,
                           nsiov, siov, soffset, nob);
@@ -573,7 +575,7 @@ lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset,
                     unsigned int nsiov, lnet_kiov_t *skiov, unsigned int soffset,
                     unsigned int nob)
 {
-        struct iovec diov = {.iov_base = dest, .iov_len = dlen};
+        struct iovec diov = {/* .iov_base = */ dest, /* .iov_len = */ dlen};
 
         lnet_copy_kiov2iov(1, &diov, doffset,
                            nsiov, skiov, soffset, nob);
@@ -583,7 +585,7 @@ static inline void
 lnet_copy_flat2iov(unsigned int ndiov, struct iovec *diov, unsigned int doffset,
                    int slen, void *src, unsigned int soffset, unsigned int nob)
 {
-        struct iovec siov = {.iov_base = src, .iov_len = slen};
+        struct iovec siov = {/*.iov_base = */ src, /*.iov_len = */slen};
         lnet_copy_iov2iov(ndiov, diov, doffset,
                           1, &siov, soffset, nob);
 }
@@ -592,7 +594,7 @@ static inline void
 lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, unsigned int doffset,
                     int slen, void *src, unsigned int soffset, unsigned int nob)
 {
-        struct iovec siov = {.iov_base = src, .iov_len = slen};
+        struct iovec siov = {/* .iov_base = */ src, /* .iov_len = */ slen};
         lnet_copy_iov2kiov(ndiov, dkiov, doffset,
                            1, &siov, soffset, nob);
 }
index b17b4f4..9959e9c 100644 (file)
@@ -14,6 +14,8 @@
 #include <lnet/linux/lib-types.h>
 #elif defined(__APPLE__)
 #include <lnet/darwin/lib-types.h>
+#elif defined(__WINNT__)
+#include <lnet/winnt/lib-types.h>
 #else
 #error Unsupported Operating System
 #endif
index c3b2e2c..409e159 100644 (file)
@@ -1 +1 @@
-EXTRA_DIST := lib-lnet.h  lib-types.h  lnet.h
+EXTRA_DIST := lib-lnet.h  lib-types.h  lnet.h api-support.h
diff --git a/lnet/include/lnet/linux/api-support.h b/lnet/include/lnet/linux/api-support.h
new file mode 100644 (file)
index 0000000..15e9244
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef __LINUX_API_SUPPORT_H__
+#define __LINUX_API_SUPPORT_H__
+
+#ifndef __LNET_API_SUPPORT_H__
+#error Do not #include this file directly. #include <lnet/api-support.h> instead
+#endif
+
+#ifndef __KERNEL__
+# include <stdio.h>
+# include <stdlib.h>
+# include <unistd.h>
+# include <time.h>
+
+/* Lots of POSIX dependencies to support PtlEQWait_timeout */
+# include <signal.h>
+# include <setjmp.h>
+# include <time.h>
+#endif
+
+#endif
index 8494198..9c38fd3 100644 (file)
@@ -44,4 +44,6 @@ lnet_page2phys (struct page *p)
 # endif
 #endif
 
+#define LNET_ROUTER
+
 #endif /* __LNET_LINUX_LIB_LNET_H__ */
index 9f99f2b..819c524 100644 (file)
@@ -13,6 +13,8 @@
 #include <lnet/linux/lnet.h>
 #elif defined(__APPLE__)
 #include <lnet/darwin/lnet.h>
+#elif defined(__WINNT__)
+#include <lnet/winnt/lnet.h>
 #else
 #error Unsupported Operating System
 #endif
diff --git a/lnet/include/lnet/winnt/api-support.h b/lnet/include/lnet/winnt/api-support.h
new file mode 100644 (file)
index 0000000..dd5b5e8
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef __WINNT_API_SUPPORT_H__
+#define __WINNT_API_SUPPORT_H__
+
+#ifndef __LNET_API_SUPPORT_H__
+#error Do not #include this file directly. #include <lnet/api-support.h> instead
+#endif
+
+
+#endif
diff --git a/lnet/include/lnet/winnt/lib-lnet.h b/lnet/include/lnet/winnt/lib-lnet.h
new file mode 100644 (file)
index 0000000..bb3e5af
--- /dev/null
@@ -0,0 +1,25 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef __LNET_WINNT_LIB_LNET_H__
+#define __LNET_WINNT_LIB_LNET_H__
+
+#ifndef __LNET_LIB_LNET_H__
+#error Do not #include this file directly. #include <lnet/lib-lnet.h> instead
+#endif
+
+#ifdef __KERNEL__
+# include <libcfs/libcfs.h>
+# include <libcfs/kp30.h>
+
+static inline __u64
+lnet_page2phys (struct page *p)
+{
+    return 0;
+}
+
+#else  /* __KERNEL__ */
+
+#endif
+
+#endif /* __LNET_WINNT_LIB_LNET_H__ */
diff --git a/lnet/include/lnet/winnt/lib-types.h b/lnet/include/lnet/winnt/lib-types.h
new file mode 100644 (file)
index 0000000..33a3134
--- /dev/null
@@ -0,0 +1,55 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LNET_WINNT_LIB_TYPES_H__
+#define __LNET_WINNT_LIB_TYPES_H__
+
+#ifndef __LNET_LIB_TYPES_H__
+#error Do not #include this file directly. #include <lnet/lib-types.h> instead
+#endif
+
+#include <libcfs/libcfs.h>
+
+typedef struct {
+    spinlock_t lock;
+} lib_ni_lock_t;
+
+static inline void lib_ni_lock_init(lib_ni_lock_t *l)
+{
+        spin_lock_init(&l->lock);
+}
+
+static inline void lib_ni_lock_fini(lib_ni_lock_t *l)
+{}
+
+static inline void lib_ni_lock(lib_ni_lock_t *l)
+{
+        int     flags;
+        spin_lock_irqsave(&l->lock, flags);
+}
+
+static inline void lib_ni_unlock(lib_ni_lock_t *l)
+{
+        spin_unlock_irqrestore(&l->lock, 0); /* FIXME: restores 0, not the flags saved in lib_ni_lock() */
+}
+
+#endif
diff --git a/lnet/include/lnet/winnt/lnet.h b/lnet/include/lnet/winnt/lnet.h
new file mode 100644 (file)
index 0000000..7a98836
--- /dev/null
@@ -0,0 +1,533 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef __LNET_WINNT_LNET_H__
+#define __LNET_WINNT_LNET_H__
+
+#ifndef __LNET_H__
+#error Do not #include this file directly. #include <lnet/lnet.h> instead
+#endif
+
+#ifdef __KERNEL__
+
+#include <libcfs/libcfs.h>
+#include <lnet/lib-lnet.h>
+
+/*
+ * tdilnd routines
+ */
+
+//
+// debug.c
+//
+
+
+PUCHAR
+KsNtStatusToString (IN NTSTATUS Status);
+
+
+VOID
+KsPrintf(
+    IN LONG  DebugPrintLevel,
+    IN PCHAR DebugMessage,
+    IN ...
+    );
+
+
+//
+// tconn.c
+//
+
+
+ksock_mdl_t *
+ksocknal_lock_iovs(
+    IN struct iovec  *iov,
+    IN int            niov,
+    IN int            recv,
+    IN int *          len
+    );
+
+ksock_mdl_t *
+ksocknal_lock_kiovs(
+    IN lnet_kiov_t *   kiov,
+    IN int            nkiov,
+    IN int            recv,
+    IN int *          len
+    );
+
+int
+ksocknal_send_mdl(
+    ksock_tconn_t * tconn,
+    void *          tx,
+    ksock_mdl_t *   mdl,
+    int             len,
+    int             flags
+    );
+
+int
+ksocknal_query_data(
+    ksock_tconn_t * tconn,
+    size_t *        size,
+    int             bIsExpedited);
+
+int
+ksocknal_recv_mdl(
+    ksock_tconn_t * tconn,
+    ksock_mdl_t *   mdl,
+    int             size,
+    int             flags
+    );
+
+int
+ksocknal_get_tcp_option (
+    ksock_tconn_t *     tconn,
+    ULONG               ID,
+    PVOID               OptionValue,
+    PULONG              Length
+    );
+
+NTSTATUS
+ksocknal_set_tcp_option (
+    ksock_tconn_t * tconn,
+    ULONG           ID,
+    PVOID           OptionValue,
+    ULONG           Length
+    );
+
+int
+ksocknal_bind_tconn (
+    ksock_tconn_t * tconn,
+    ksock_tconn_t * parent,
+    ulong_ptr   addr,
+    unsigned short  port
+    );
+
+int
+ksocknal_build_tconn(
+    ksock_tconn_t *                 tconn,
+    ulong_ptr                   addr,
+    unsigned short                  port
+    );
+
+int
+ksocknal_disconnect_tconn(
+    ksock_tconn_t *     tconn,
+    ulong_ptr       flags
+    );
+
+void
+ksocknal_abort_tconn(
+    ksock_tconn_t *     tconn
+    );
+
+int
+ksocknal_query_local_ipaddr(
+    ksock_tconn_t *     tconn
+    );
+
+int
+ksocknal_tconn_write (ksock_tconn_t *tconn, void *buffer, int nob);
+
+int
+ksocknal_tconn_read (ksock_tconn_t * tconn, void *buffer, int nob);
+
+//
+// tcp.c
+//
+
+NTSTATUS
+KsTcpCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    );
+
+NTSTATUS
+KsDisconectCompletionRoutine (
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    );
+
+NTSTATUS
+KsTcpReceiveCompletionRoutine(
+    IN PIRP                         Irp,
+    IN PKS_TCP_COMPLETION_CONTEXT   Context
+    );
+
+NTSTATUS
+KsTcpSendCompletionRoutine(
+    IN PIRP                         Irp,
+    IN PKS_TCP_COMPLETION_CONTEXT   Context
+    );
+
+NTSTATUS
+KsAcceptCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    );
+
+
+NTSTATUS
+KsConnectEventHandler(
+    IN PVOID                    TdiEventContext,
+    IN LONG                     RemoteAddressLength,
+    IN PVOID                    RemoteAddress,
+    IN LONG                     UserDataLength,
+    IN PVOID                    UserData,
+    IN LONG                     OptionsLength,
+    IN PVOID                    Options,
+    OUT CONNECTION_CONTEXT *    ConnectionContext,
+    OUT PIRP *                  AcceptIrp
+    );
+
+NTSTATUS 
+KsDisconnectEventHandler(
+    IN PVOID                TdiEventContext,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN LONG                 DisconnectDataLength,
+    IN PVOID                DisconnectData,
+    IN LONG                 DisconnectInformationLength,
+    IN PVOID                DisconnectInformation,
+    IN ULONG                DisconnectFlags
+    );
+
+NTSTATUS
+KsTcpReceiveEventHandler(
+    IN PVOID                TdiEventContext, 
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+   );
+
+NTSTATUS
+KsTcpReceiveExpeditedEventHandler(
+    IN PVOID                TdiEventContext, 
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+    );
+
+NTSTATUS
+KsTcpChainedReceiveEventHandler (
+    IN PVOID TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT ConnectionContext,
+    IN ULONG ReceiveFlags, 
+    IN ULONG ReceiveLength,
+    IN ULONG StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL  Tsdu,                  // TSDU data chain
+    IN PVOID TsduDescriptor         // for call to TdiReturnChainedReceives
+    );
+
+NTSTATUS
+KsTcpChainedReceiveExpeditedEventHandler (
+    IN PVOID                TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags, 
+    IN ULONG                ReceiveLength,
+    IN ULONG                StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL                 Tsdu,                  // TSDU data chain
+    IN PVOID                TsduDescriptor         // for call to TdiReturnChainedReceives
+    );
+
+
+
+VOID
+KsDisconnectHelper(PKS_DISCONNECT_WORKITEM WorkItem);
+
+
+//
+// tdi.c
+//
+
+ULONG
+ksocknal_tdi_send_flags(ULONG SockFlags);
+
+PIRP
+KsBuildTdiIrp(
+    IN PDEVICE_OBJECT    DeviceObject
+    );
+
+NTSTATUS
+KsSubmitTdiIrp(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN BOOLEAN          bSynchronous,
+    OUT PULONG          Information
+    );
+
+NTSTATUS
+KsOpenControl(
+    IN PUNICODE_STRING      DeviceName,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   );
+
+NTSTATUS
+KsCloseControl(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+   );
+
+NTSTATUS
+KsOpenAddress(
+    IN PUNICODE_STRING      DeviceName,
+    IN PTRANSPORT_ADDRESS   pAddress,
+    IN ULONG                AddressLength,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   );
+
+NTSTATUS
+KsCloseAddress(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+    );
+
+NTSTATUS
+KsOpenConnection(
+    IN PUNICODE_STRING      DeviceName,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   );
+
+NTSTATUS
+KsCloseConnection(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+    );
+
+NTSTATUS
+KsAssociateAddress(
+    IN HANDLE           AddressHandle,
+    IN PFILE_OBJECT     ConnectionObject
+    );
+
+
+NTSTATUS
+KsDisassociateAddress(
+    IN PFILE_OBJECT     ConnectionObject
+    );
+
+
+NTSTATUS
+KsSetEventHandlers(
+    IN PFILE_OBJECT         AddressObject,
+    IN PVOID                EventContext,
+    IN PKS_EVENT_HANDLERS   Handlers
+   );
+
+
+NTSTATUS
+KsQueryProviderInfo(
+    PWSTR               TdiDeviceName,
+    PTDI_PROVIDER_INFO  ProviderInfo
+   );
+
+NTSTATUS
+KsQueryAddressInfo(
+    IN PFILE_OBJECT         FileObject,
+    OUT PTDI_ADDRESS_INFO   AddressInfo,
+    OUT PULONG              AddressSize
+   );
+
+NTSTATUS
+KsQueryConnectionInfo(
+    IN PFILE_OBJECT            ConnectionObject,
+    OUT PTDI_CONNECTION_INFO   ConnectionInfo,
+    OUT PULONG                 ConnectionSize
+   );
+
+ULONG
+KsInitializeTdiAddress(
+    IN OUT PTA_IP_ADDRESS   pTransportAddress,
+    IN ULONG                IpAddress,
+    IN USHORT               IpPort
+    );
+
+ULONG
+KsQueryMdlsSize (IN PMDL Mdl);
+
+
+ULONG
+KsQueryTdiAddressLength(
+    OUT PTRANSPORT_ADDRESS   pTransportAddress
+    );
+
+NTSTATUS
+KsQueryIpAddress(
+    IN PFILE_OBJECT     FileObject,
+    OUT PVOID           TdiAddress,
+    OUT ULONG*          AddressLength
+    );
+
+
+NTSTATUS
+KsErrorEventHandler(
+    IN PVOID            TdiEventContext,
+    IN NTSTATUS         Status
+   );
+
+int
+ksocknal_set_handlers(
+    ksock_tconn_t *     tconn
+    );
+
+
+
+//
+// Strusup.c
+//
+
+VOID
+KsPrintProviderInfo(
+   PWSTR DeviceName,
+   PTDI_PROVIDER_INFO ProviderInfo
+   );
+
+ksock_tconn_t *
+ksocknal_create_tconn(void);
+
+void
+ksocknal_free_tconn(
+    ksock_tconn_t * tconn
+    );
+
+void
+ksocknal_init_listener(
+    ksock_tconn_t * tconn
+    );
+
+void
+ksocknal_init_sender(
+    ksock_tconn_t * tconn
+    );
+
+void
+ksocknal_init_child(
+    ksock_tconn_t * tconn
+    );
+
+void
+ksocknal_get_tconn(
+    ksock_tconn_t * tconn
+    );
+
+void
+ksocknal_put_tconn(
+    ksock_tconn_t * tconn
+    );
+
+int
+ksocknal_reset_handlers(
+    ksock_tconn_t *     tconn
+    );
+
+void
+ksocknal_destroy_tconn(
+    ksock_tconn_t *     tconn
+    );
+
+
+PKS_TSDU
+KsAllocateKsTsdu(void);
+
+VOID
+KsPutKsTsdu(
+    PKS_TSDU  KsTsdu
+    );
+
+VOID
+KsFreeKsTsdu(
+    PKS_TSDU  KsTsdu
+    );
+
+VOID
+KsInitializeKsTsdu(
+    PKS_TSDU    KsTsdu,
+    ULONG       Length
+    );
+
+
+VOID
+KsInitializeKsTsduMgr(
+    PKS_TSDUMGR     TsduMgr
+    );
+
+VOID
+KsInitializeKsChain(
+    PKS_CHAIN       KsChain
+    );
+
+NTSTATUS
+KsCleanupTsduMgr(
+    PKS_TSDUMGR     KsTsduMgr
+    );
+
+NTSTATUS
+KsCleanupKsChain(
+    PKS_CHAIN   KsChain
+    );
+
+NTSTATUS
+KsCleanupTsdu(
+    ksock_tconn_t * tconn
+    );
+
+NTSTATUS
+KsCopyMdlChainToMdlChain(
+    IN PMDL     SourceMdlChain,
+    IN ULONG    SourceOffset,
+    IN PMDL     DestinationMdlChain,
+    IN ULONG    DestinationOffset,
+    IN ULONG    BytesTobecopied,
+    OUT PULONG  BytesCopied
+    );
+
+ULONG
+KsQueryMdlsSize (PMDL Mdl);
+
+NTSTATUS
+KsLockUserBuffer (
+    IN PVOID            UserBuffer,
+    IN BOOLEAN          bPaged,
+    IN ULONG            Length,
+    IN LOCK_OPERATION   Operation,
+    OUT PMDL *          pMdl
+    );
+
+PVOID
+KsMapMdlBuffer (PMDL    Mdl);
+
+VOID
+KsReleaseMdl ( IN PMDL   Mdl,
+               IN int    Paged );
+
+int
+ksocknal_lock_buffer (
+    void *            buffer,
+    int               paged,
+    int               length,
+    LOCK_OPERATION    access,
+    ksock_mdl_t **    kmdl
+    );
+
+void *
+ksocknal_map_mdl (ksock_mdl_t * mdl);
+
+void
+ksocknal_release_mdl (ksock_mdl_t *mdl, int paged);
+
+#endif /* __KERNEL__ */
+
+#endif
index 3a947c5..f98bd4e 100644 (file)
@@ -1556,6 +1556,7 @@ kibnal_register_all_memory(void)
          * chunk starting at 0 */
         struct sysinfo     si;
         __u64              total;
+        __u64              total2;
         __u64              roundup = (128<<20);     /* round up in big chunks */
         IB_MR_PHYS_BUFFER  phys;
         IB_ACCESS_CONTROL  access;
@@ -1575,13 +1576,20 @@ kibnal_register_all_memory(void)
         }
 
         si_meminfo(&si);
+
+        CDEBUG(D_NET, "si_meminfo: %lu/%u, num_physpages %lu/%lu\n",
+               si.totalram, si.mem_unit, num_physpages, PAGE_SIZE);
+
         total = ((__u64)si.totalram) * si.mem_unit;
+        total2 = num_physpages * PAGE_SIZE;
+        if (total < total2)
+                total = total2;
 
         if (total == 0) {
                 CERROR("Can't determine memory size\n");
                 return -ENOMEM;
         }
-        
+                 
         roundup = (128<<20);
         total = (total + (roundup - 1)) & ~(roundup - 1);
 
@@ -1600,8 +1608,8 @@ kibnal_register_all_memory(void)
                 return -EIO;
         }
 
-        CDEBUG(D_NET, "registered phys mem from "LPX64" for "LPU64"\n", 
-               phys.PhysAddr, phys.Length);
+        CDEBUG(D_WARNING, "registered phys mem from 0("LPX64") for "LPU64"("LPU64") -> "LPX64"\n",
+               phys.PhysAddr, total, phys.Length, kibnal_data.kib_whole_mem.md_addr);
 
         return 0;
 }
index 23a9577..ab1798b 100644 (file)
@@ -972,13 +972,93 @@ kibnal_tx_complete (IB_WORK_COMPLETION *wc)
 
         if (failed &&
             tx->tx_status == 0 &&
-            conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+            conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+#if KIBLND_DETAILED_DEBUG
+                int                   i;
+                IB_WORK_REQ2         *wrq = &tx->tx_wrq[0];
+                IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[0];
+                lnet_msg_t           *lntmsg = tx->tx_lntmsg[0];
+#endif
                 CERROR("tx -> %s type %x cookie "LPX64
-                       "sending %d waiting %d: failed %d\n", 
+                       " sending %d waiting %d failed %d nwrk %d\n", 
                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
                        tx->tx_msg->ibm_type, tx->tx_cookie,
-                       tx->tx_sending, tx->tx_waiting, wc->Status);
+                       tx->tx_sending, tx->tx_waiting, wc->Status,
+                       tx->tx_nwrq);
+#if KIBLND_DETAILED_DEBUG
+                for (i = 0; i < tx->tx_nwrq; i++, wrq++, gl++) {
+                        switch (wrq->Operation) {
+                        default:
+                                CDEBUG(D_ERROR, "    [%3d] Addr %p Next %p OP %d "
+                                       "DSList %p(%p)/%d: "LPX64"/%d K %x\n",
+                                       i, wrq, wrq->Next, wrq->Operation,
+                                       wrq->DSList, gl, wrq->DSListDepth,
+                                       gl->Address, gl->Length, gl->Lkey);
+                                break;
+                        case WROpSend:
+                                CDEBUG(D_ERROR, "    [%3d] Addr %p Next %p SEND "
+                                       "DSList %p(%p)/%d: "LPX64"/%d K %x\n",
+                                       i, wrq, wrq->Next, 
+                                       wrq->DSList, gl, wrq->DSListDepth,
+                                       gl->Address, gl->Length, gl->Lkey);
+                                break;
+                        case WROpRdmaWrite:
+                                CDEBUG(D_ERROR, "    [%3d] Addr %p Next %p DMA "
+                                       "DSList: %p(%p)/%d "LPX64"/%d K %x -> "
+                                       LPX64" K %x\n",
+                                       i, wrq, wrq->Next, 
+                                       wrq->DSList, gl, wrq->DSListDepth,
+                                       gl->Address, gl->Length, gl->Lkey,
+                                       wrq->Req.SendRC.RemoteDS.Address,
+                                       wrq->Req.SendRC.RemoteDS.Rkey);
+                                break;
+                        }
+                }
+                
+                switch (tx->tx_msg->ibm_type) {
+                default:
+                        CERROR("  msg type %x %p/%d, No RDMA\n", 
+                               tx->tx_msg->ibm_type, 
+                               tx->tx_msg, tx->tx_msg->ibm_nob);
+                        break;
 
+                case IBNAL_MSG_PUT_DONE:
+                case IBNAL_MSG_GET_DONE:
+                        CERROR("  msg type %x %p/%d, RDMA key %x frags %d...\n", 
+                               tx->tx_msg->ibm_type, 
+                               tx->tx_msg, tx->tx_msg->ibm_nob,
+                               tx->tx_rd->rd_key, tx->tx_rd->rd_nfrag);
+                        for (i = 0; i < tx->tx_rd->rd_nfrag; i++)
+                                CDEBUG(D_ERROR, "    [%d] "LPX64"/%d\n", i,
+                                       tx->tx_rd->rd_frags[i].rf_addr,
+                                       tx->tx_rd->rd_frags[i].rf_nob);
+                        if (lntmsg == NULL) {
+                                CERROR("  No lntmsg\n");
+                        } else if (lntmsg->msg_iov != NULL) {
+                                CERROR("  lntmsg in %d VIRT frags...\n", 
+                                       lntmsg->msg_niov);
+                                for (i = 0; i < lntmsg->msg_niov; i++)
+                                        CDEBUG(D_ERROR, "    [%d] %p/%d\n", i,
+                                               lntmsg->msg_iov[i].iov_base,
+                                               lntmsg->msg_iov[i].iov_len);
+                        } else if (lntmsg->msg_kiov != NULL) {
+                                CERROR("  lntmsg in %d PAGE frags...\n", 
+                                       lntmsg->msg_niov);
+                                for (i = 0; i < lntmsg->msg_niov; i++)
+                                        CDEBUG(D_ERROR, "    [%d] %p+%d/%d\n", i,
+                                               lntmsg->msg_kiov[i].kiov_page,
+                                               lntmsg->msg_kiov[i].kiov_offset,
+                                               lntmsg->msg_kiov[i].kiov_len);
+                        } else {
+                                CERROR("  lntmsg in %d frags\n", 
+                                       lntmsg->msg_niov);
+                        }
+                        
+                        break;
+                }
+#endif
+        }
+        
         spin_lock(&conn->ibc_lock);
 
         /* I could be racing with rdma completion.  Whoever makes 'tx' idle
index 1d5144f..29812b6 100755 (executable)
@@ -194,8 +194,8 @@ kptllnd_msg_unpack(kptl_msg_t *msg, int nob,kptl_data_t *kptllnd_data)
         /*
          * Src nid can not be ANY
          */
-        if (msg->ptlm_srcnid == PTL_NID_ANY) {
-                CERROR("Bad src nid: "LPX64"\n", msg->ptlm_srcnid);
+        if (msg->ptlm_srcnid == LNET_NID_ANY) {
+                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ptlm_srcnid));
                 return -EPROTO;
         }
 
@@ -211,7 +211,7 @@ kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
         int          rc = -EINVAL;
         kptl_data_t *kptllnd_data = ni->ni_data;
 
-        PJK_UT_MSG(">>> kptllnd_ctl cmd=%u arg=%p\n",cmd,arg);
+        CDEBUG(D_NET, ">>> kptllnd_ctl cmd=%u arg=%p\n",cmd,arg);
 
         /*
          * Validate that the context block is actually
@@ -221,7 +221,7 @@ kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 
         switch(cmd) {
         case IOC_LIBCFS_DEL_PEER: {
-                rc = kptllnd_peer_del (kptllnd_data,data->ioc_nid);
+                rc = kptllnd_peer_del (kptllnd_data, data->ioc_nid);
                 break;
         }
         /*
@@ -237,7 +237,7 @@ kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
                 rc=-EINVAL;
                 break;
         }
-        PJK_UT_MSG("<<< kptllnd_ctl rc=%d\n",rc);
+        CDEBUG(D_NET, "<<< kptllnd_ctl rc=%d\n",rc);
         return rc;
 }
 
@@ -268,7 +268,7 @@ kptllnd_startup (lnet_ni_t *ni)
         ptl_err_t       ptl_rc;
 
 
-        PJK_UT_MSG(">>>\n");
+        CDEBUG(D_NET, ">>>\n");
 
         LASSERT (ni->ni_lnd == &kptllnd_lnd);
 
@@ -316,7 +316,7 @@ kptllnd_startup (lnet_ni_t *ni)
 #else /* _USING_CRAY_PORTALS_ */
                 CRAY_KERN_NAL,
 #endif
-                PTLLND_PID, NULL, NULL,
+                *kptllnd_tunables.kptl_pid, NULL, NULL,
                 &kptllnd_data->kptl_nih);
 
         /*
@@ -335,7 +335,7 @@ kptllnd_startup (lnet_ni_t *ni)
                 8,                      /* We use callback - no need for max */
                 kptllnd_eq_callback,    /* handler callback */
                 &kptllnd_data->kptl_eqh);   /* output handle */
-        if(ptl_rc != 0) {
+        if(ptl_rc != PTL_OK) {
                 CERROR("PtlEQAlloc failed %d\n",ptl_rc);
                 rc = -ENOMEM;
                 goto failed;
@@ -344,33 +344,41 @@ kptllnd_startup (lnet_ni_t *ni)
         /*
          * Fetch the lower NID
          */
-        if(ptl_rc != PtlGetId(kptllnd_data->kptl_nih,&kptllnd_data->kptl_portals_id)){
+        ptl_rc = PtlGetId(kptllnd_data->kptl_nih, &kptllnd_data->kptl_portals_id);
+        if (ptl_rc != PTL_OK) {
                 CERROR ("PtlGetID: error %d\n", ptl_rc);
                 rc = -EINVAL;
                 goto failed;
         }
 
-        PJK_UT_MSG("lnet nid=" LPX64 " (passed in)\n",ni->ni_nid);
+        if (kptllnd_data->kptl_portals_id.pid !=
+                *kptllnd_tunables.kptl_pid) {
+                /* The kernel ptllnd must have the expected PID */
+                CERROR("Unexpected PID: %u (%u expected)\n",
+                       kptllnd_data->kptl_portals_id.pid,
+                       *kptllnd_tunables.kptl_pid);
+                rc = -EINVAL;
+                goto failed;
+        }
+        
+        CDEBUG(D_NET, "lnet nid=" LPX64 " (passed in)\n",ni->ni_nid);
 
         /*
          * Create the new NID.  Based on the LND network type
          * and the lower ni's address data.
          */
-        ni->ni_nid = ptl2lnetnid(kptllnd_data,kptllnd_data->kptl_portals_id.nid);
-
-        PJK_UT_MSG("ptl  nid=" FMT_NID "\n",kptllnd_data->kptl_portals_id.nid);
-        PJK_UT_MSG("lnet nid=" LPX64 " (passed back)\n",ni->ni_nid);
+        ni->ni_nid = ptl2lnetnid(kptllnd_data, kptllnd_data->kptl_portals_id.nid);
 
-        CDEBUG(D_INFO,"ptl  nid=" FMT_NID "\n",kptllnd_data->kptl_portals_id.nid);
-        CDEBUG(D_INFO,"lnet nid=" LPX64 "\n",ni->ni_nid);
+        CDEBUG(D_NET, "ptl  nid=" FMT_NID "\n",kptllnd_data->kptl_portals_id.nid);
+        CDEBUG(D_NET, "ptl  pid= %d\n", kptllnd_data->kptl_portals_id.pid);
+        CDEBUG(D_NET, "lnet nid=" LPX64 " (passed back)\n",ni->ni_nid);
 
         /*
          * Initialized the incarnation
          */
         do_gettimeofday(&tv);
         kptllnd_data->kptl_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
-        PJK_UT_MSG("Incarnation=" LPX64 "\n",kptllnd_data->kptl_incarnation);
-        CDEBUG(D_INFO,"Incarnation=" LPX64 "\n",kptllnd_data->kptl_incarnation);
+        CDEBUG(D_NET, "Incarnation=" LPX64 "\n",kptllnd_data->kptl_incarnation);
 
         /*
          * Setup the sched locks/lists/waitq
@@ -390,7 +398,7 @@ kptllnd_startup (lnet_ni_t *ni)
         /*
          * Allocate and setup the peer hash table
          */
-        PJK_UT_MSG("Allocate Peer Hash Table\n");
+        CDEBUG(D_NET, "Allocate Peer Hash Table\n");
         rwlock_init(&kptllnd_data->kptl_peer_rw_lock);
         kptllnd_data->kptl_peer_hash_size = *kptllnd_tunables.kptl_peer_hash_table_size;
         INIT_LIST_HEAD(&kptllnd_data->kptl_canceled_peers);
@@ -416,7 +424,7 @@ kptllnd_startup (lnet_ni_t *ni)
          * this will be automatically cleaned up now that PTLNAT_INIT_DATA
          * state has been entered
          */
-        PJK_UT_MSG("starting %d scheduler threads\n",PTLLND_N_SCHED);
+        CDEBUG(D_NET, "starting %d scheduler threads\n",PTLLND_N_SCHED);
         for (i = 0; i < PTLLND_N_SCHED; i++) {
                 rc = kptllnd_thread_start (
                         kptllnd_scheduler,
@@ -443,7 +451,7 @@ kptllnd_startup (lnet_ni_t *ni)
          * because we'll use the pointer being NULL as a sentry
          * to know that we have to clean this up
          */
-        PJK_UT_MSG("Allocate TX Descriptor array\n");
+        CDEBUG(D_NET, "Allocate TX Descriptor array\n");
         LIBCFS_ALLOC (kptllnd_data->kptl_tx_descs,
                       (*kptllnd_tunables.kptl_ntx) * sizeof(kptl_tx_t));
         if (kptllnd_data->kptl_tx_descs == NULL){
@@ -501,13 +509,13 @@ kptllnd_startup (lnet_ni_t *ni)
 
         /*****************************************************/
 
-        PJK_UT_MSG("<<< kptllnd_startup SUCCESS\n");
+        CDEBUG(D_NET, "<<< kptllnd_startup SUCCESS\n");
         return 0;
 
  failed:
         CDEBUG(D_NET, "kptllnd_startup failed rc=%d\n",rc);
         kptllnd_shutdown (ni);
-        PJK_UT_MSG("<<< kptllnd_startup rc=%d\n",rc);
+        CDEBUG(D_NET, "<<< kptllnd_startup rc=%d\n",rc);
         return rc;
 }
 
@@ -517,7 +525,7 @@ kptllnd_shutdown (lnet_ni_t *ni)
         int             i;
         kptl_data_t    *kptllnd_data = ni->ni_data;
 
-        PJK_UT_MSG(">>> kptllnd_shutdown\n");
+        CDEBUG(D_NET, ">>> kptllnd_shutdown\n");
 
         /*
          * Validate that the context block is actually
@@ -536,7 +544,7 @@ kptllnd_shutdown (lnet_ni_t *ni)
 
         case PTLLND_INIT_ALL:
         case PTLLND_INIT_RXD:
-                PJK_UT_MSG("PTLLND_INIT_RXD\n");
+                CDEBUG(D_NET, "PTLLND_INIT_RXD\n");
 
                 kptllnd_rx_buffer_pool_fini(
                         &kptllnd_data->kptl_rx_buffer_pool);
@@ -546,17 +554,17 @@ kptllnd_shutdown (lnet_ni_t *ni)
 
                 /* fall through */
         case PTLLND_INIT_TXD:
-                PJK_UT_MSG("PTLLND_INIT_TXD\n");
+                CDEBUG(D_NET, "PTLLND_INIT_TXD\n");
 
                 /*
                  * If there were peers started up then
                  * clean them up.
                  */
                 if( atomic_read(&kptllnd_data->kptl_npeers) != 0) {
-                        PJK_UT_MSG("Deleting %d peers\n",atomic_read(&kptllnd_data->kptl_npeers));
+                        CDEBUG(D_NET, "Deleting %d peers\n",atomic_read(&kptllnd_data->kptl_npeers));
 
                         /* nuke all peers */
-                        kptllnd_peer_del(kptllnd_data,PTL_NID_ANY);
+                        kptllnd_peer_del(kptllnd_data, LNET_NID_ANY);
 
                         i = 2;
                         while (atomic_read (&kptllnd_data->kptl_npeers) != 0) {
@@ -565,14 +573,14 @@ kptllnd_shutdown (lnet_ni_t *ni)
                                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                        "Waiting for %d peers to terminate\n",
                                        atomic_read (&kptllnd_data->kptl_npeers));
-                                PJK_UT_MSG("Waiting for %d peers to terminate\n",
+                                CDEBUG(D_NET, "Waiting for %d peers to terminate\n",
                                         atomic_read (&kptllnd_data->kptl_npeers));
                                 cfs_pause(cfs_time_seconds(1));
                         }
                 }
 
                 LASSERT(list_empty(&kptllnd_data->kptl_canceled_peers));
-                PJK_UT_MSG("All peers deleted\n");
+                CDEBUG(D_NET, "All peers deleted\n");
 
                 /*
                  * Set the shutdown flag
@@ -589,7 +597,7 @@ kptllnd_shutdown (lnet_ni_t *ni)
                  * if we are not in the right state.
                  */
                 if(atomic_read (&kptllnd_data->kptl_nthreads) != 0){
-                        PJK_UT_MSG("Stopping %d threads\n",atomic_read(&kptllnd_data->kptl_nthreads));
+                        CDEBUG(D_NET, "Stopping %d threads\n",atomic_read(&kptllnd_data->kptl_nthreads));
                         /*
                          * Wake up all the schedulers
                          */
@@ -601,13 +609,13 @@ kptllnd_shutdown (lnet_ni_t *ni)
                                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                        "Waiting for %d threads to terminate\n",
                                        atomic_read (&kptllnd_data->kptl_nthreads));
-                                PJK_UT_MSG("Waiting for %d threads to terminate\n",
+                                CDEBUG(D_NET, "Waiting for %d threads to terminate\n",
                                         atomic_read (&kptllnd_data->kptl_nthreads));
                                 cfs_pause(cfs_time_seconds(1));
                         }
 
                 }
-                PJK_UT_MSG("All Threads stopped\n");
+                CDEBUG(D_NET, "All Threads stopped\n");
 
 
                 LASSERT(list_empty(&kptllnd_data->kptl_sched_txq));
@@ -617,7 +625,7 @@ kptllnd_shutdown (lnet_ni_t *ni)
                 /* fall through */
         case PTLLND_INIT_DATA:
 
-                PJK_UT_MSG("PTLLND_INIT_DATA\n");
+                CDEBUG(D_NET, "PTLLND_INIT_DATA\n");
 
                 LASSERT (atomic_read(&kptllnd_data->kptl_npeers) == 0);
                 LASSERT (kptllnd_data->kptl_peers != NULL);
@@ -633,7 +641,7 @@ kptllnd_shutdown (lnet_ni_t *ni)
                 /* fall through */
 
         case PTLLND_INIT_NOTHING:
-                PJK_UT_MSG("PTLLND_INIT_NOTHING\n");
+                CDEBUG(D_NET, "PTLLND_INIT_NOTHING\n");
                 break;
         }
 
@@ -688,7 +696,7 @@ kptllnd_shutdown (lnet_ni_t *ni)
                atomic_read (&libcfs_kmemory));
 
         PORTAL_MODULE_UNUSE;
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
 }
 
 int __init
@@ -696,7 +704,7 @@ kptllnd_module_init (void)
 {
         int    rc;
 
-        PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__);
+        CDEBUG(D_NET, ">>> %s %s\n",__DATE__,__TIME__);
 
         /*
          * Display the module parameters
@@ -730,7 +738,7 @@ kptllnd_module_init (void)
         kptllnd_proc_init();
         lnet_register_lnd(&kptllnd_lnd);
 
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
         return 0;
 }
 
@@ -738,12 +746,12 @@ void __exit
 kptllnd_module_fini (void)
 {
 
-        PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__);
+        CDEBUG(D_NET, ">>> %s %s\n",__DATE__,__TIME__);
         lnet_unregister_lnd(&kptllnd_lnd);
         kptllnd_proc_fini();
         kptllnd_tunables_fini();
-        kpttllnd_get_stats();
-        PJK_UT_MSG("<<<\n");
+        // kpttllnd_get_stats();
+        CDEBUG(D_NET, "<<<\n");
 }
 
 #define DO_TYPE(x) case x: return #x;
@@ -782,7 +790,7 @@ const char *get_msg_type_string(int type)
         }
 }
 
-#define LOGSTAT(x) PJK_UT_MSG_ALWAYS("%30.30s %d\n",#x,kptllnd_stats.x);
+#define LOGSTAT(x) CDEBUG(D_NET, "%30.30s %d\n",#x,kptllnd_stats.x);
 
 kptl_stats_t* kpttllnd_get_stats(void)
 {
index 2559f8b..4aa7285 100755 (executable)
@@ -91,6 +91,7 @@ typedef struct
         int             *kptl_cksum;            /* checksum kptl_msg_t? */
         int             *kptl_timeout;          /* comms timeout (seconds) */
         int             *kptl_portal;           /* portal number */
+        int             *kptl_pid;              /* portals PID (self + kernel peers) */
         int             *kptl_rxb_npages;       /* number of pages for rx buffer */
         int             *kptl_credits;          /* number of credits */
         int             *kptl_peercredits;      /* number of credits */
@@ -154,14 +155,14 @@ typedef struct kptl_rx_buffer_pool
         int                     rxbp_reserved;  /* the number currently reserved        */
         int                     rxbp_shutdown;  /* the shutdown flag for the pool       */
         int                     rxbp_posted;    /* the number of elements posted        */
-}kptl_rx_buffer_pool_t;
+} kptl_rx_buffer_pool_t;
 
-typedef enum
+enum kptl_rxb_state
 {
         RXB_STATE_UNINITIALIZED  = 0,
         RXB_STATE_IDLE           = 1,
         RXB_STATE_POSTED         = 2,
-}kptl_rxb_state_t;
+};
 
 struct kptl_rx_buffer
 {
@@ -173,23 +174,23 @@ struct kptl_rx_buffer
         kptl_rx_buffer_pool_t  *rxb_pool;
         struct list_head        rxb_list;       /* for the rxb_pool list */
         struct list_head        rxb_repost_list;/* for the kptl_sched_rxbq list*/
-        kptl_rxb_state_t        rxb_state;      /* the state of this rx buffer*/
+        enum kptl_rxb_state     rxb_state;      /* the state of this rx buffer*/
         atomic_t                rxb_refcount;   /* outstanding rx */
         ptl_handle_md_t         rxb_mdh;        /* the portals memory descriptor (MD) handle */
         void                   *rxb_buffer;     /* the buffer */
 
 };
 
-typedef enum
+enum kptl_tx_state
 {
         TX_STATE_UNINITIALIZED          = 0,
         TX_STATE_ON_IDLE_QUEUE          = 1,
         TX_STATE_ALLOCATED              = 2,
         TX_STATE_WAITING_CREDITS        = 3,
         TX_STATE_WAITING_RESPONSE       = 4
-}kptl_tx_state_t;
+};
 
-typedef enum
+enum kptl_tx_type
 {
         TX_TYPE_RESERVED                = 0,
         TX_TYPE_SMALL_MESSAGE           = 1,
@@ -197,7 +198,17 @@ typedef enum
         TX_TYPE_LARGE_GET               = 3,
         TX_TYPE_LARGE_PUT_RESPONSE      = 4,
         TX_TYPE_LARGE_GET_RESPONSE      = 5,
-}kptl_tx_type_t;
+};
+
+/* Union of fragment descriptors used to map an RDMA payload into a
+ * portals memory descriptor (replaces the old on-stack tempiov_t) */
+typedef union {
+#ifdef _USING_LUSTRE_PORTALS_
+        struct iovec iov[PTL_MD_MAX_IOV];
+        ptl_kiov_t kiov[PTL_MD_MAX_IOV];
+#else /* _USING_CRAY_PORTALS_ */
+        ptl_md_iovec_t iov[PTL_MD_MAX_IOV];
+#endif
+} kptl_fragvec_t;
 
 typedef struct kptl_tx                           /* transmit message */
 {
@@ -209,10 +220,10 @@ typedef struct kptl_tx                           /* transmit message */
         struct list_head        tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
         struct list_head        tx_schedlist; /* queue on idle_txs ibc_tx_queue etc. */
         atomic_t                tx_refcount;  /* Posted Buffer refrences count*/
-        kptl_tx_state_t         tx_state;     /* the state of this tx descriptor */
+        enum kptl_tx_state      tx_state;     /* the state of this tx descriptor */
         int                     tx_seen_send_end; /* if we've seen a SEND_END event */
         int                     tx_seen_reply_end; /* if we've seen a REPLY_END event */
-        kptl_tx_type_t          tx_type;      /* type of transfer */
+        enum kptl_tx_type       tx_type;      /* type of transfer */
         int                     tx_status;    /* the status of this tx descriptor */
         ptl_handle_md_t         tx_mdh;       /* the portals memory descriptor (MD) handle */
         ptl_handle_md_t         tx_mdh_msg;   /* the portals MD handle for the initial message */
@@ -222,6 +233,7 @@ typedef struct kptl_tx                           /* transmit message */
         kptl_peer_t            *tx_peer;      /* the peer this is waiting on */
         unsigned long           tx_deadline;  /* deadline */
         kptl_rx_t              *tx_associated_rx; /* Associated RX for Bulk RDMA */
+        kptl_fragvec_t         *tx_frags;     /* buffer fragments for bulk RDMA */
 
         unsigned int            tx_payload_niov;
         struct iovec           *tx_payload_iov;
@@ -232,27 +244,27 @@ typedef struct kptl_tx                           /* transmit message */
 } kptl_tx_t;
 
 
-typedef enum
+enum kptllnd_peer_state
 {
         PEER_STATE_UNINITIALIZED        = 0,
         PEER_STATE_ALLOCATED            = 1,    //QQQ
         PEER_STATE_WAITING_HELLO        = 2,
         PEER_STATE_ACTIVE               = 3,
         PEER_STATE_CANCELED             = 4,
-}kptllnd_peer_state_t;
+};
 
 struct kptl_peer
 {
         struct list_head        peer_list;
         atomic_t                peer_refcount;          /* The current refrences */
-        kptllnd_peer_state_t    peer_state;
+        enum kptllnd_peer_state peer_state;
         kptl_data_t            *peer_kptllnd_data;      /* LND Instance Data */
         spinlock_t              peer_lock;              /* serialize */
         struct list_head        peer_pending_txs;       /* queue of pending txs */
         struct list_head        peer_active_txs;        /* queue of activce txs */
         int                     peer_active_txs_change_counter;/* updated when peer_active_txs changes*/
-        lnet_nid_t              peer_nid;               /* who's on the other end(s) */
-        int                     peer_pid;               /* the pid on the other end */
+        lnet_nid_t              peer_nid;               /* Peer's LNET NID */
+        ptl_process_id_t        peer_ptlid;             /* Peer's portals id */
         __u64                   peer_incarnation;       /* peer's incarnation */
         __u64                   peer_tx_seqnum;         /* next seq# to send with*/
         int                     peer_credits;           /* number of send credits */
@@ -293,7 +305,6 @@ struct kptl_data
         int                     kptl_canceled_peers_counter; /* updated when canceled_peers is modified*/
         int                     kptl_peer_hash_size;   /* size of kptl_peers */
         atomic_t                kptl_npeers;           /* # peers extant */
-
 };
 
 typedef struct kptl_stats
@@ -536,18 +547,18 @@ kptllnd_tx_launch (
         lnet_msg_t *ptlmsg );
 
 kptl_peer_t *
-kptllnd_peer_find (
-        kptl_data_t *kptllnd_data,
-        lnet_process_id_t target);
+kptllnd_nid2peer (kptl_data_t *kptllnd_data, lnet_nid_t nid);
 
 kptl_peer_t *
-kptllnd_peer_handle_hello (
-        kptl_data_t *kptllnd_data,
-        lnet_process_id_t initiator,
-        kptl_msg_t *msg);
+kptllnd_ptlnid2peer (kptl_data_t *kptllnd_data, ptl_nid_t ptlnid);
+
+kptl_peer_t *
+kptllnd_peer_handle_hello (kptl_data_t      *kptllnd_data,
+                           ptl_process_id_t  initiator,
+                           kptl_msg_t       *msg);
 
 static inline struct list_head *
-kptllnd_nid2peerlist (kptl_data_t *kptllnd_data,lnet_nid_t nid)
+kptllnd_ptlnid2peerlist (kptl_data_t *kptllnd_data, ptl_nid_t nid)
 {
         unsigned int hash = ((unsigned int)nid) % kptllnd_data->kptl_peer_hash_size;
 
@@ -580,7 +591,7 @@ kptllnd_tx_done (
 kptl_tx_t *
 kptllnd_get_idle_tx(
         kptl_data_t *kptllnd_data,
-        kptl_tx_type_t purpose);
+        enum kptl_tx_type purpose);
 
 void
 kptllnd_tx_callback(
@@ -614,16 +625,6 @@ kptllnd_msg_unpack(
  * MISC SUPPORT FUNCTIONS
  */
 
-
-typedef union {
-#ifdef _USING_LUSTRE_PORTALS_
-        struct iovec iov[PTL_MD_MAX_IOV];
-        ptl_kiov_t kiov[PTL_MD_MAX_IOV];
-#else /* _USING_CRAY_PORTALS_ */
-        ptl_md_iovec_t iov[PTL_MD_MAX_IOV];
-#endif
-}tempiov_t;
-
 void
 kptllnd_setup_md(
         kptl_data_t     *kptllnd_data,
@@ -634,8 +635,7 @@ kptllnd_setup_md(
         struct iovec    *payload_iov,
         lnet_kiov_t     *payload_kiov,
         unsigned int     payload_offset,
-        int              payload_nob,
-        tempiov_t       *tempiov);
+        int              payload_nob);
 
 static inline lnet_nid_t ptl2lnetnid(kptl_data_t *kptllnd_data,ptl_nid_t portals_nid)
 {
@@ -656,30 +656,6 @@ static inline ptl_nid_t lnet2ptlnid(kptl_data_t *kptllnd_data,lnet_nid_t lnet_ni
 }
 
 #ifdef PJK_DEBUGGING
-
-#define PJK_UT_MSG_ALWAYS(fmt, a...)                    \
-do{                                                     \
-        printk("<1>ptllnd:%-30s:%u:",__FUNCTION__,cfs_curproc_pid());       \
-        printk(fmt,## a);                               \
-        CDEBUG(D_TRACE,fmt,## a);                       \
-}while(0)
-
-#define PJK_UT_MSG_SIMULATION(fmt, a...)        PJK_UT_MSG_ALWAYS(fmt, ## a )
-
-
-#if 1
-#define PJK_UT_MSG_DATA(fmt, a...)              PJK_UT_MSG_ALWAYS(fmt, ## a )
-#else
-#define PJK_UT_MSG_DATA(fmt, a...)              do{}while(0)
-#endif
-
-#if 1
-#define PJK_UT_MSG(fmt, a...)                   PJK_UT_MSG_ALWAYS(fmt, ## a )
-#else
-#define PJK_UT_MSG(fmt, a...)                   do{}while(0)
-#endif
-
-
 #define SIMULATION_FAIL_BLOCKING_TX_PUT_ALLOC   0       /* 0x00000001 */
 #define SIMULATION_FAIL_BLOCKING_TX_GET_ALLOC   1       /* 0x00000002 */
 #define SIMULATION_FAIL_BLOCKING_TX             2       /* 0x00000004 */
@@ -687,17 +663,7 @@ do{                                                     \
 
 #define IS_SIMULATION_ENABLED(x) \
         (((*kptllnd_tunables.kptl_simulation_bitmap) & 1<< SIMULATION_##x) != 0)
-
-
 #else
-
-
-#define PJK_UT_MSG_ALWAYS(fmt, a...)            do{}while(0)
-#define PJK_UT_MSG_SIMULATION(fmt, a...)        do{}while(0)
-#define PJK_UT_MSG_DATA(fmt, a...)              do{}while(0)
-#define PJK_UT_MSG(fmt, a...)                   do{}while(0)
-
 #define IS_SIMULATION_ENABLED(x)                0
-
 #endif
 
index e5093e9..1ba8940 100644 (file)
@@ -30,14 +30,14 @@ kptllnd_setup_md(
         struct iovec    *payload_iov,
         lnet_kiov_t     *payload_kiov,
         unsigned int     payload_offset,
-        int              payload_nob,
-        tempiov_t       *tempiov)
+        int              payload_nob)
 {
-        unsigned int niov = 0;
+        kptl_fragvec_t *frags = tx->tx_frags;
+        unsigned int    niov = 0;
 
-        PJK_UT_MSG_DATA("%s nob=%d offset=%d niov=%d\n",
-                op == PTL_MD_OP_GET ? "GET" : "PUT",
-                payload_nob,payload_offset,payload_niov);
+        CDEBUG(D_NET, "%s nob=%d offset=%d niov=%d\n",
+               op == PTL_MD_OP_GET ? "GET" : "PUT",
+               payload_nob,payload_offset,payload_niov);
 
         /* One but not both of iov or kiov must be NULL (XOR) */
         LASSERT( (payload_iov != NULL && payload_kiov == NULL) ||
@@ -85,23 +85,23 @@ kptllnd_setup_md(
                 while(payload_nob){
                         LASSERT( payload_offset < payload_iov->iov_len);
                         LASSERT (payload_niov > 0);
-                        LASSERT (niov < sizeof(tempiov->iov)/sizeof(tempiov->iov[0]));
+                        LASSERT (niov < sizeof(frags->iov)/sizeof(frags->iov[0]));
 
-                        tempiov->iov[niov].iov_base = payload_iov->iov_base + payload_offset;
-                        tempiov->iov[niov].iov_len  = min((int)(payload_iov->iov_len - payload_offset),
+                        frags->iov[niov].iov_base = payload_iov->iov_base + payload_offset;
+                        frags->iov[niov].iov_len  = min((int)(payload_iov->iov_len - payload_offset),
                                                 (int)payload_nob);
 
-                        PJK_UT_MSG("iov_base[%d]=%p\n",niov,tempiov->iov[niov].iov_base);
-                        PJK_UT_MSG("iov_len[%d] =%d\n",niov,tempiov->iov[niov].iov_len);
+                        CDEBUG(D_NET, "iov_base[%d]=%p\n",niov,frags->iov[niov].iov_base);
+                        CDEBUG(D_NET, "iov_len[%d] =%d\n",niov,(int)frags->iov[niov].iov_len);
 
                         payload_offset = 0;
-                        payload_nob -= tempiov->iov[niov].iov_len;
+                        payload_nob -= frags->iov[niov].iov_len;
                         payload_iov++;
                         payload_niov--;
                         niov++;
                 }
 
-                md->start = tempiov->iov;
+                md->start = frags->iov;
                 md->options |= PTL_MD_IOVEC;
         }else{
 
@@ -117,21 +117,21 @@ kptllnd_setup_md(
                 while(payload_nob){
                         LASSERT( payload_offset < payload_kiov->kiov_len);
                         LASSERT (payload_niov > 0);
-                        LASSERT (niov < sizeof(tempiov->kiov)/sizeof(tempiov->kiov[0]));
+                        LASSERT (niov < sizeof(frags->kiov)/sizeof(frags->kiov[0]));
 
-                        tempiov->kiov[niov].kiov_page   = payload_kiov->kiov_page;
-                        tempiov->kiov[niov].kiov_offset = payload_kiov->kiov_offset + payload_offset;
-                        tempiov->kiov[niov].kiov_len    = min((int)(payload_kiov->kiov_len - payload_offset),
+                        frags->kiov[niov].kiov_page   = payload_kiov->kiov_page;
+                        frags->kiov[niov].kiov_offset = payload_kiov->kiov_offset + payload_offset;
+                        frags->kiov[niov].kiov_len    = min((int)(payload_kiov->kiov_len - payload_offset),
                                                         (int)payload_nob);
 
                         payload_offset = 0;
-                        payload_nob -=  tempiov->kiov[niov].kiov_len;
+                        payload_nob -=  frags->kiov[niov].kiov_len;
                         payload_kiov++;
                         payload_niov--;
                         niov++;
                 }
 
-                md->start = tempiov->kiov;
+                md->start = frags->kiov;
                 md->options |= PTL_MD_KIOV;
 
 #else /* _USING_CRAY_PORTALS_ */
@@ -144,9 +144,9 @@ kptllnd_setup_md(
 #error "Conflicting compilation directives"
 #endif
 
-                PJK_UT_MSG("payload_offset %d\n",payload_offset);
-                PJK_UT_MSG("payload_niov   %d\n",payload_niov);
-                PJK_UT_MSG("payload_nob    %d\n",payload_nob);
+                CDEBUG(D_NET, "payload_offset %d\n",payload_offset);
+                CDEBUG(D_NET, "payload_niov   %d\n",payload_niov);
+                CDEBUG(D_NET, "payload_nob    %d\n",payload_nob);
 
                 while (payload_offset >= payload_kiov->kiov_len) {
                         payload_offset -= payload_kiov->kiov_len;
@@ -165,29 +165,29 @@ kptllnd_setup_md(
                         
                         LASSERT (payload_offset < payload_kiov->kiov_len);
                         LASSERT (payload_niov > 0);
-                        LASSERT (niov < sizeof(tempiov->iov)/sizeof(tempiov->iov[0]));
+                        LASSERT (niov < sizeof(frags->iov)/sizeof(frags->iov[0]));
                         LASSERT (sizeof(void *) > 4 || 
                                  (phys <= 0xffffffffULL &&
                                   phys + (nob - 1) <= 0xffffffffULL));
 
-                        PJK_UT_MSG("kiov_page  [%d]="LPX64" (phys)\n",niov,phys_page);
-                        PJK_UT_MSG("kiov_offset[%d]=%d (phys)\n",niov,payload_kiov->kiov_offset);
-                        PJK_UT_MSG("kiov_len   [%d]=%d (phys)\n",niov,payload_kiov->kiov_len);
+                        CDEBUG(D_NET, "kiov_page  [%d]="LPX64" (phys)\n",niov,phys_page);
+                        CDEBUG(D_NET, "kiov_offset[%d]=%d (phys)\n",niov,payload_kiov->kiov_offset);
+                        CDEBUG(D_NET, "kiov_len   [%d]=%d (phys)\n",niov,payload_kiov->kiov_len);
 
-                        tempiov->iov[niov].iov_base = (void *)((unsigned long)phys);
-                        tempiov->iov[niov].iov_len = nob;
+                        frags->iov[niov].iov_base = (void *)((unsigned long)phys);
+                        frags->iov[niov].iov_len = nob;
 
-                        PJK_UT_MSG("iov_base[%d]=%p\n",niov,tempiov->iov[niov].iov_base);
-                        PJK_UT_MSG("iov_len [%d]=%d\n",niov,tempiov->iov[niov].iov_len);
+                        CDEBUG(D_NET, "iov_base[%d]=%p\n",niov,frags->iov[niov].iov_base);
+                        CDEBUG(D_NET, "iov_len [%d]=%d\n",niov,(int)frags->iov[niov].iov_len);
 
                         payload_offset = 0;
-                        payload_nob -= tempiov->iov[niov].iov_len;
+                        payload_nob -= frags->iov[niov].iov_len;
                         payload_kiov++;
                         payload_niov--;
                         niov++;
                 }
 
-                md->start = tempiov->iov;
+                md->start = frags->iov;
                 md->options |= PTL_MD_IOVEC | PTL_MD_PHYS;
 #endif
 
@@ -199,8 +199,8 @@ kptllnd_setup_md(
          */
         md->length = niov;
 
-        PJK_UT_MSG("md->options=%x\n",md->options);
-        PJK_UT_MSG("md->length=%d\n",md->length);
+        CDEBUG(D_NET, "md->options=%x\n",md->options);
+        CDEBUG(D_NET, "md->length=%u\n",(unsigned)md->length);
 }
 
 int
@@ -220,7 +220,6 @@ kptllnd_start_bulk_rdma(
         ptl_err_t        ptl_rc;
         ptl_err_t        ptl_rc2;
         int              rc;
-        tempiov_t        tempiov;
         kptl_msg_t      *rxmsg = rx->rx_msg;
         kptl_peer_t     *peer = rx->rx_peer;
         unsigned long    flags;
@@ -246,28 +245,24 @@ kptllnd_start_bulk_rdma(
         tx->tx_associated_rx = rx;
         kptllnd_rx_addref(rx,"tx");
 
-        PJK_UT_MSG_DATA(">>> %s rx=%p associated with tx=%p\n",
+        CDEBUG(D_NET, ">>> %s rx=%p associated with tx=%p\n",
                 op == PTL_MD_OP_GET ? "GET" : "PUT",
                 rx,tx);
-        PJK_UT_MSG_DATA("matchibts=" LPX64 "\n",
+        CDEBUG(D_NET, "matchibts=" LPX64 "\n",
                 rxmsg->ptlm_u.req.kptlrm_matchbits);
 
         /*
          * Setup the MD
          */
-        kptllnd_setup_md(kptllnd_data,&md,op,tx,
-                payload_niov,payload_iov,payload_kiov,
-                payload_offset,payload_nob,&tempiov);
+        kptllnd_setup_md(kptllnd_data, &md, op, tx,
+                         payload_niov, payload_iov, payload_kiov,
+                         payload_offset, payload_nob);
 
         /*
          * Attach the MD
          */
-        ptl_rc = PtlMDBind(
-                kptllnd_data->kptl_nih,
-                md,
-                PTL_UNLINK,
-                &mdh);
-        if(ptl_rc != PTL_OK){
+        ptl_rc = PtlMDBind(kptllnd_data->kptl_nih, md, PTL_UNLINK, &mdh);
+        if (ptl_rc != PTL_OK) {
                 CERROR("PtlMDBind failed %d\n",ptl_rc);
                 rc = -ENOMEM;
                 goto end;
@@ -370,7 +365,7 @@ end:
          */
         kptllnd_tx_decref(tx);
 
-        PJK_UT_MSG("<<< rc=%d\n",rc);
+        CDEBUG(D_NET, "<<< rc=%d\n",rc);
         return rc;
 }
 
@@ -413,12 +408,12 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
         kptl_data_t      *kptllnd_data = ni->ni_data;
         int               nob;
 
-        PJK_UT_MSG_DATA(">>> SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
-        PJK_UT_MSG_DATA("nob=%d nov=%d offset=%d to %s\n",
+        CDEBUG(D_NET, ">>> SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+        CDEBUG(D_NET, "nob=%d nov=%d offset=%d to %s\n",
                payload_nob, payload_niov, payload_offset,
                libcfs_id2str(target));
-        PJK_UT_MSG_DATA("routing=%d target_is_router=%d\n",
-                routing,target_is_router);
+        CDEBUG(D_NET, "routing=%d target_is_router=%d\n",
+               routing,target_is_router);
 
         if(routing)
                 STAT_UPDATE(kps_send_routing);
@@ -453,7 +448,7 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 
         case LNET_MSG_REPLY:
         case LNET_MSG_PUT:
-                PJK_UT_MSG_DATA("LNET_MSG_PUT/REPLY\n");
+                CDEBUG(D_NET, "LNET_MSG_PUT/REPLY\n");
 
                 /*
                  * Get an idle tx descriptor
@@ -477,12 +472,12 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 
                 kptllnd_do_put(tx,lntmsg,kptllnd_data);
 
-                PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+                CDEBUG(D_NET, "<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
                 return 0;
 
         case LNET_MSG_GET:
 
-                PJK_UT_MSG_DATA("LNET_MSG_GET\n");
+                CDEBUG(D_NET, "LNET_MSG_GET\n");
 
                 /*
                  * Get an idle tx descriptor
@@ -500,7 +495,7 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
                 if(target_is_router || routing)
                         break;
 
-                PJK_UT_MSG_DATA("nob=%d\n",lntmsg->msg_md->md_length);
+                CDEBUG(D_NET, "nob=%d\n",lntmsg->msg_md->md_length);
 
                 /* Is the payload small enough not to need RDMA? */
                 nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[lntmsg->msg_md->md_length]);
@@ -533,14 +528,14 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
                 goto launch;
 
         case LNET_MSG_ACK:
-                PJK_UT_MSG_DATA("LNET_MSG_ACK\n");
+                CDEBUG(D_NET, "LNET_MSG_ACK\n");
                 LASSERT (payload_nob == 0);
                 break;
         }
 
 
         if(tx == NULL){
-                PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_IMMEDIATE\n");
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n");
 
                 /*
                  * Get an idle tx descriptor
@@ -552,7 +547,7 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
                         return -ENOMEM;
                 }
         }else{
-                PJK_UT_MSG_DATA("Using PTLLND_MSG_TYPE_IMMEDIATE\n");
+                CDEBUG(D_NET, "Using PTLLND_MSG_TYPE_IMMEDIATE\n");
                 /*
                  * Repurpose this TX
                  */
@@ -593,7 +588,7 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 
 launch:
         kptllnd_tx_launch(tx, target, lntmsg);
-        PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+        CDEBUG(D_NET, "<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
         return 0;
 }
 
@@ -606,7 +601,7 @@ int kptllnd_eager_recv(
         //kptl_data_t    *kptllnd_data = ni->ni_data;
         kptl_rx_t    *rx = private;
 
-        PJK_UT_MSG_DATA("Eager RX=%p RXB=%p\n",rx,rx->rx_rxb);
+        CDEBUG(D_NET, "Eager RX=%p RXB=%p\n",rx,rx->rx_rxb);
 
         LASSERT(rx->rx_nob < *kptllnd_tunables.kptl_max_msg_size);
 
@@ -645,8 +640,8 @@ int kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
         int           nob;
         int           rc;
 
-        PJK_UT_MSG_DATA(">>> RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR\n");
-        PJK_UT_MSG_DATA("niov=%d offset=%d mlen=%d rlen=%d\n",
+        CDEBUG(D_NET, ">>> RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR\n");
+        CDEBUG(D_NET, "niov=%d offset=%d mlen=%d rlen=%d\n",
                 niov,offset,mlen,rlen);
 
         LASSERT (mlen <= rlen);
@@ -682,7 +677,7 @@ int kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
                 break;
 
         case PTLLND_MSG_TYPE_IMMEDIATE:
-                PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_IMMEDIATE\n");
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n");
 
                 nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[rlen]);
                 if (nob > *kptllnd_tunables.kptl_max_msg_size) {
@@ -712,7 +707,7 @@ int kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
                 break;
 
         case PTLLND_MSG_TYPE_GET:
-                PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_GET\n");
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_GET\n");
 
                 if (lntmsg == NULL) {
                         /* No match for the GET request */
@@ -730,12 +725,12 @@ int kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
                                 lntmsg->msg_kiov,
                                 lntmsg->msg_offset,
                                 lntmsg->msg_len);
-                        PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS rc=%d\n",rc);
+                        CDEBUG(D_NET, "<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS rc=%d\n",rc);
                 }
                 break;
 
         case PTLLND_MSG_TYPE_PUT:
-                PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_PUT\n");
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_PUT\n");
 
                 if (mlen == 0) { /* No payload */
                         lnet_finalize(ni, lntmsg, 0);
@@ -760,7 +755,7 @@ int kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
          */
         kptllnd_rx_decref(rx,"lnet_parse",kptllnd_data);
 
-        PJK_UT_MSG_DATA("<<< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR rc=%d\n",rc);
+        CDEBUG(D_NET, "<<< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR rc=%d\n",rc);
         return rc;
 }
 
@@ -845,7 +840,7 @@ kptllnd_watchdog(void *arg)
         int                timeout;
         int                i;
 
-        PJK_UT_MSG(">>>\n");
+        CDEBUG(D_NET, ">>>\n");
 
         /*
          * Daemonize
@@ -907,7 +902,7 @@ kptllnd_watchdog(void *arg)
         }
 
         kptllnd_thread_fini(thread_data);
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
         return (0);
 };
 
@@ -924,7 +919,7 @@ kptllnd_scheduler(void *arg)
         kptl_rx_buffer_t        *rxb = NULL;
         kptl_tx_t               *tx = NULL;
 
-        PJK_UT_MSG(">>>\n");
+        CDEBUG(D_NET, ">>>\n");
 
         /*
          * Daemonize
@@ -997,9 +992,9 @@ kptllnd_scheduler(void *arg)
                         if(rxb)
                                 kptllnd_rx_buffer_post_handle_error(rxb);
                         if(tx){
-                                PJK_UT_MSG(">>> tx=%p\n",tx);
+                                CDEBUG(D_NET, ">>> tx=%p\n",tx);
                                 kptllnd_tx_done(tx);
-                                PJK_UT_MSG("<<<\n");
+                                CDEBUG(D_NET, "<<<\n");
                         }
 
                         /*
@@ -1010,7 +1005,7 @@ kptllnd_scheduler(void *arg)
         }
 
         kptllnd_thread_fini(thread_data);
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
         return (0);
 }
 
@@ -1025,7 +1020,7 @@ void kptllnd_clean_canceled_peers(kptl_data_t *kptllnd_data)
 
 
         if(!list_empty(&kptllnd_data->kptl_canceled_peers)){
-                PJK_UT_MSG("Cleaning Canceled Peers\n");
+                CDEBUG(D_NET, "Cleaning Canceled Peers\n");
                 STAT_UPDATE(kps_cleaning_caneled_peers);
         }
 
index bf06d44..2a99b86 100644 (file)
@@ -39,6 +39,10 @@ static int portal = PTLLND_PORTAL;
 CFS_MODULE_PARM(portal, "i", int, 0444,
                "portal id");
 
+static int pid = PTLLND_PID;
+CFS_MODULE_PARM(pid, "i", int, 0444,
+               "portals pid");
+
 static int rxb_npages = PTLLND_RXB_NPAGES;
 CFS_MODULE_PARM(rxb_npages, "i", int, 0444,
                "# of pages for rx buffers");
@@ -71,6 +75,7 @@ kptl_tunables_t kptllnd_tunables = {
         .kptl_concurrent_peers       = &concurrent_peers,
         .kptl_cksum                  = &cksum,
         .kptl_portal                 = &portal,
+        .kptl_pid                    = &pid,
         .kptl_timeout                = &timeout,
         .kptl_rxb_npages             = &rxb_npages,
         .kptl_credits                = &credits,
@@ -96,19 +101,21 @@ static ctl_table kptllnd_ctl_table[] = {
         sizeof(int), 0644, NULL, &proc_dointvec},
        {5, "portal", &portal,
         sizeof(int), 0444, NULL, &proc_dointvec},
-       {6, "rxb_npages", &rxb_npages,
+       {6, "pid", &pid,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {7, "rxb_npages", &rxb_npages,
         sizeof(int), 0444, NULL, &proc_dointvec},
-       {7, "credits", &credits,
+       {8, "credits", &credits,
         sizeof(int), 0444, NULL, &proc_dointvec},
-       {8, "peercredits", &peercredits,
+       {9, "peercredits", &peercredits,
         sizeof(int), 0444, NULL, &proc_dointvec},
-       {9, "max_msg_size", &max_msg_size,
+       {10,"max_msg_size", &max_msg_size,
         sizeof(int), 0444, NULL, &proc_dointvec},
-       {10, "peer_hash_table_size,", &peer_hash_table_size,
+       {11,"peer_hash_table_size", &peer_hash_table_size,
         sizeof(int), 0444, NULL, &proc_dointvec},
 
 #ifdef PJK_DEBUGGING
-       {11, "simulation_bitmap,", &simulation_bitmap,
+       {12, "simulation_bitmap", &simulation_bitmap,
         sizeof(int), 0444, NULL, &proc_dointvec},
 #endif
 
index fe62903..9d96e0f 100644 (file)
@@ -23,12 +23,6 @@ void
 kptllnd_peer_destroy (
         kptl_peer_t *peer);
 
-kptl_peer_t *
-kptllnd_peer_find_holding_list_lock (
-        kptl_data_t *kptllnd_data,
-        lnet_process_id_t target);
-
-
 int
 kptllnd_peer_add_to_list_locked (
         kptl_data_t *kptllnd_data,
@@ -62,7 +56,7 @@ kptllnd_peer_add_to_list_locked (
         /* And add this to the list */
         LASSERT(list_empty(&peer->peer_list));
         list_add_tail (&peer->peer_list,
-                       kptllnd_nid2peerlist (kptllnd_data,peer->peer_nid));
+                       kptllnd_ptlnid2peerlist(kptllnd_data,peer->peer_ptlid.nid));
 
         STAT_UPDATE(kps_peers_created);
 
@@ -70,17 +64,16 @@ kptllnd_peer_add_to_list_locked (
 }
 
 int
-kptllnd_peer_allocate (
-        kptl_data_t *kptllnd_data,
-        kptl_peer_t **peerp,
-        lnet_process_id_t target)
+kptllnd_peer_allocate (kptl_data_t       *kptllnd_data,
+                       kptl_peer_t      **peerp,
+                       ptl_process_id_t   ptlid) 
 {
         kptl_peer_t     *peer;
         int             rc;
 
-        PJK_UT_MSG(">>> id=%s\n",libcfs_id2str(target));
+        CDEBUG(D_NET, ">>> "FMT_NID"/%d\n", ptlid.nid, ptlid.pid);
 
-        LASSERT (target.nid != PTL_NID_ANY);
+        LASSERT (ptlid.nid != PTL_NID_ANY);
 
         LIBCFS_ALLOC(peer, sizeof (*peer));
         if (peer == NULL) {
@@ -98,8 +91,10 @@ kptllnd_peer_allocate (
 
         peer->peer_state = PEER_STATE_ALLOCATED;
         peer->peer_kptllnd_data = kptllnd_data;
-        peer->peer_nid = target.nid;
-        peer->peer_pid = target.pid;
+
+        peer->peer_nid = ptl2lnetnid(kptllnd_data, ptlid.nid);
+        peer->peer_ptlid = ptlid;
+
         //peer->peer_incarnation = 0;
         //peer->peer_tx_seqnum = 0;
 
@@ -136,7 +131,8 @@ kptllnd_peer_allocate (
          */
         atomic_set (&peer->peer_refcount, 1);
 
-        PJK_UT_MSG("<<< Peer=%p id=%s\n",peer,libcfs_id2str(target));
+        CDEBUG(D_NET, "<<< Peer=%p nid=%s\n", 
+               peer, libcfs_nid2str(peer->peer_nid));
         *peerp = peer;
         return 0;
 }
@@ -148,7 +144,7 @@ kptllnd_peer_destroy (
 {
         kptl_data_t *kptllnd_data = peer->peer_kptllnd_data;
 
-        PJK_UT_MSG("Peer=%p\n",peer);
+        CDEBUG(D_NET, "Peer=%p\n",peer);
 
         LASSERT (atomic_read (&peer->peer_refcount) == 0);
         /* Not on the peer list */
@@ -181,14 +177,6 @@ kptllnd_peer_addref (
         const char *owner)
 {
         atomic_inc(&peer->peer_refcount);
-
-        /*
-         * The below message could actually be out of sync
-         * with the real ref count, and is for informational purposes
-         * only
-         */
-        PJK_UT_MSG("peer=%p owner=%s count=%d\n",peer,owner,
-                atomic_read(&peer->peer_refcount));
 }
 
 void
@@ -199,19 +187,10 @@ kptllnd_peer_decref (
         unsigned long    flags;
         kptl_data_t     *kptllnd_data = peer->peer_kptllnd_data;
 
-        if( !atomic_dec_and_test(&peer->peer_refcount)){
-
-                /*
-                 * The below message could actually be out of sync
-                 * with the real ref count, and is for informational purposes
-                 * only
-                 */
-                PJK_UT_MSG("peer=%p owner=%s count=%d\n",peer,owner,
-                        atomic_read(&peer->peer_refcount));
+        if( !atomic_dec_and_test(&peer->peer_refcount))
                 return;
-        }
 
-        PJK_UT_MSG("peer=%p owner=%s LAST REF\n",peer,owner);
+        CDEBUG(D_NET, "peer=%p owner=%s LAST REF\n",peer,owner);
 
         write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
         list_del_init (&peer->peer_list);
@@ -243,7 +222,7 @@ kptllnd_peer_cancel_pending_txs(
         spin_lock_irqsave(&peer->peer_lock, flags);
 
         if(!list_empty(&peer->peer_pending_txs))
-                PJK_UT_MSG("Clearing Pending TXs\n");
+                CDEBUG(D_NET, "Clearing Pending TXs\n");
 
         list_for_each_safe (tx_temp, tx_next, &peer->peer_pending_txs) {
                 tx = list_entry (tx_temp, kptl_tx_t, tx_list);
@@ -277,7 +256,7 @@ kptllnd_peer_cancel_active_txs(
         spin_lock_irqsave(&peer->peer_lock, flags);
 
         if(!list_empty(&peer->peer_active_txs))
-                PJK_UT_MSG("Clearing Active TXs\n");
+                CDEBUG(D_NET, "Clearing Active TXs\n");
 
 again:
 
@@ -303,7 +282,7 @@ again:
                  */
 
                 if(!PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE)){
-                        PJK_UT_MSG("Unlink mhd_msg\n");
+                        CDEBUG(D_NET, "Unlink mhd_msg\n");
                         LASSERT(atomic_read(&tx->tx_refcount)>1);
                         ptl_rc = PtlMDUnlink(tx->tx_mdh_msg);
 #ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
@@ -315,7 +294,7 @@ again:
                 }
 
                 if(!PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)){
-                        PJK_UT_MSG("Unlink mdh\n");
+                        CDEBUG(D_NET, "Unlink mdh\n");
                         LASSERT(atomic_read(&tx->tx_refcount)>1);
                         ptl_rc = PtlMDUnlink(tx->tx_mdh);
 #ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
@@ -349,7 +328,7 @@ kptllnd_peer_cancel(
         unsigned long      flags;
         int                list_owns_ref=0;
 
-        PJK_UT_MSG(">>> Peer=%p\n",peer);
+        CDEBUG(D_NET, ">>> Peer=%p\n",peer);
 
         write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
         if(peer->peer_state != PEER_STATE_CANCELED){
@@ -374,13 +353,11 @@ kptllnd_peer_cancel(
         if(list_owns_ref)
                 kptllnd_peer_decref(peer,"list");
 
-        PJK_UT_MSG("<<< Peer=%p\n",peer);
+        CDEBUG(D_NET, "<<< Peer=%p\n",peer);
 }
 
 int
-kptllnd_peer_del (
-        kptl_data_t *kptllnd_data,
-        lnet_nid_t nid)
+kptllnd_peer_del (kptl_data_t *kptllnd_data, lnet_nid_t nid)
 {
         struct list_head  *ptmp;
         struct list_head  *pnxt;
@@ -391,16 +368,18 @@ kptllnd_peer_del (
         unsigned long      flags;
         int                rc = -ENOENT;
 
-
-        PJK_UT_MSG(">>> NID="LPX64"\n",nid);
+        CDEBUG(D_NET, ">>> NID="LPX64"\n",nid);
 
         /*
-         * Find the single bucket we are supposed to look at
-         * or if nid = PTL_NID_ANY then look at all of the buckets
+         * Find the single bucket we are supposed to look at or if nid is a
+         * wildcard (LNET_NID_ANY) then look at all of the buckets
          */
-        if (nid != PTL_NID_ANY)
-                lo = hi = kptllnd_nid2peerlist(kptllnd_data,nid) - kptllnd_data->kptl_peers;
-        else {
+        if (nid != LNET_NID_ANY) {
+                ptl_nid_t         ptlnid = lnet2ptlnid(kptllnd_data, nid);
+                struct list_head *l = kptllnd_ptlnid2peerlist(kptllnd_data, ptlnid);
+                
+                lo = hi =  l - kptllnd_data->kptl_peers;
+        } else {
                 lo = 0;
                 hi = kptllnd_data->kptl_peer_hash_size - 1;
         }
@@ -415,7 +394,7 @@ again:
                         /*
                          * Is this the right one?
                          */
-                        if (!(nid == PTL_NID_ANY || peer->peer_nid == nid))
+                        if (!(nid == LNET_NID_ANY || peer->peer_nid == nid))
                                 continue;
 
                         kptllnd_peer_addref(peer,"temp"); /* 1 ref for me... */
@@ -435,7 +414,7 @@ again:
 
         read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
 
-        PJK_UT_MSG("<<< rc=%d\n",rc);
+        CDEBUG(D_NET, "<<< rc=%d\n",rc);
         return (rc);
 }
 
@@ -444,7 +423,7 @@ kptllnd_peer_queue_tx_locked (
         kptl_peer_t *peer,
         kptl_tx_t *tx)
 {
-        PJK_UT_MSG("Peer=%p TX=%p\n",peer,tx);
+        CDEBUG(D_NET, "Peer=%p TX=%p\n",peer,tx);
 
         LASSERT(peer->peer_state != PEER_STATE_CANCELED);
         LASSERT(tx->tx_state == TX_STATE_ALLOCATED);
@@ -463,7 +442,7 @@ kptllnd_peer_queue_bulk_rdma_tx_locked(
         kptl_peer_t *peer,
         kptl_tx_t *tx)
 {
-        PJK_UT_MSG("Peer=%p TX=%p\n",peer,tx);
+        CDEBUG(D_NET, "Peer=%p TX=%p\n",peer,tx);
 
         LASSERT(peer->peer_state != PEER_STATE_CANCELED);
         LASSERT(tx->tx_state == TX_STATE_ALLOCATED);
@@ -511,7 +490,7 @@ kptllnd_peer_dequeue_tx(
 
 void
 kptllnd_peer_check_sends (
-        kptl_peer_t *peer )
+        kptl_peer_t *peer)
 {
 
         kptl_tx_t       *tx;
@@ -521,7 +500,6 @@ kptllnd_peer_check_sends (
         ptl_handle_me_t  meh;
         ptl_handle_md_t  mdh;
         ptl_handle_md_t  mdh_msg;
-        ptl_process_id_t target;
         unsigned long    flags;
 
         LASSERT(!in_interrupt());
@@ -532,7 +510,7 @@ kptllnd_peer_check_sends (
          */
         spin_lock_irqsave(&peer->peer_lock, flags);
 
-        PJK_UT_MSG_DATA(">>>Peer=%p Credits=%d Outstanding=%d\n",
+        CDEBUG(D_NET, ">>>Peer=%p Credits=%d Outstanding=%d\n",
                 peer,peer->peer_credits,peer->peer_outstanding_credits);
 
         if(list_empty(&peer->peer_pending_txs) &&
@@ -543,8 +521,8 @@ kptllnd_peer_check_sends (
                  */
                 tx = kptllnd_get_idle_tx(kptllnd_data,TX_TYPE_SMALL_MESSAGE);
                 if( tx == NULL ) {
-                        CERROR ("Can't return credits to "LPX64": tx descs exhausted\n",
-                                peer->peer_nid);
+                        CERROR("Can't return credits to %s: tx descs exhausted\n",
+                               libcfs_nid2str(peer->peer_nid));
                 }else{
                         kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_NOOP,0);
                         kptllnd_peer_queue_tx_locked(peer,tx);
@@ -568,7 +546,8 @@ kptllnd_peer_check_sends (
                  */
                 if (peer->peer_credits == 0) {
                         STAT_UPDATE(kps_no_credits);
-                        CDEBUG(D_NET, LPX64": no credits\n",peer->peer_nid);
+                        CDEBUG(D_NET, "%s: no credits\n",
+                               libcfs_nid2str(peer->peer_nid));
                         break;
                 }
 
@@ -580,8 +559,8 @@ kptllnd_peer_check_sends (
                 if (peer->peer_credits == 1 &&
                     peer->peer_outstanding_credits == 0) {
                         STAT_UPDATE(kps_saving_last_credit);
-                        CDEBUG(D_NET, LPX64": not using last credit\n",
-                               peer->peer_nid);
+                        CDEBUG(D_NET, "%s: not using last credit\n",
+                               libcfs_nid2str(peer->peer_nid));
                         break;
                 }
 
@@ -612,15 +591,17 @@ kptllnd_peer_check_sends (
                         spin_unlock_irqrestore(&peer->peer_lock, flags);
                         /* redundant NOOP */
                         kptllnd_tx_decref(tx);
-                        CDEBUG(D_NET, LPX64": redundant noop\n",
-                               peer->peer_nid);
+                        CDEBUG(D_NET, "%s: redundant noop\n",
+                               libcfs_nid2str(peer->peer_nid));
                         spin_lock_irqsave(&peer->peer_lock, flags);
                         continue;
                 }
 
-                PJK_UT_MSG_DATA("--- TXTXTXTXTXTXTXTXTXTXTXTXTXTX\n");
-                PJK_UT_MSG_DATA("Sending TX=%p Size=%d\n",tx,tx->tx_msg->ptlm_nob);
-                PJK_UT_MSG_DATA("Target nid="LPX64" pid=%d\n",peer->peer_nid,peer->peer_pid);
+                CDEBUG(D_NET, "--- TXTXTXTXTXTXTXTXTXTXTXTXTXTX\n");
+                CDEBUG(D_NET, "Sending TX=%p Size=%d\n",tx,tx->tx_msg->ptlm_nob);
+                CDEBUG(D_NET, "Target nid=%s ptl "FMT_NID"/%d\n",
+                       libcfs_nid2str(peer->peer_nid), 
+                       peer->peer_ptlid.nid, peer->peer_ptlid.pid);
 
                 mdh = PTL_INVALID_HANDLE;
                 mdh_msg =PTL_INVALID_HANDLE;
@@ -628,10 +609,10 @@ kptllnd_peer_check_sends (
                 /*
                  * Assign matchbits for a put/get
                  */
-                if(tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_PUT ||
-                   tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET){
+                if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_PUT ||
+                    tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET) {
 
-                        PJK_UT_MSG_DATA("next matchbits="LPX64" (before)\n",
+                        CDEBUG(D_NET, "next matchbits="LPX64" (before)\n",
                                 peer->peer_next_matchbits);
 
 
@@ -643,11 +624,10 @@ kptllnd_peer_check_sends (
                          * not use them.  Just skip over them.  This check protects us
                          * even in the case of 64-bit rollover.
                          */
-                        if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS){
-                                CDEBUG(D_INFO,"Match Bits Rollover for "LPX64"\n",
-                                        peer->peer_nid);
+                        if (peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS) {
+                                CDEBUG(D_INFO,"Match Bits Rollover for %s\n",
+                                       libcfs_nid2str(peer->peer_nid));
                                 peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
-
                         }
 
                         /*
@@ -656,7 +636,7 @@ kptllnd_peer_check_sends (
                         tx->tx_msg->ptlm_u.req.kptlrm_matchbits =
                                 peer->peer_next_matchbits ++;
 
-                        PJK_UT_MSG_DATA("next matchbits="LPX64" (after)\n",
+                        CDEBUG(D_NET, "next matchbits="LPX64" (after)\n",
                                 peer->peer_next_matchbits);
                 }
 
@@ -664,14 +644,12 @@ kptllnd_peer_check_sends (
                  * Complete the message fill in all the rest
                  * of the header
                  */
-                kptllnd_msg_pack(
-                        tx->tx_msg,
-                        peer->peer_outstanding_credits,
-                        peer->peer_nid,
-                        peer->peer_incarnation,
-                        peer->peer_tx_seqnum,
-                        kptllnd_data);
-
+                kptllnd_msg_pack(tx->tx_msg,
+                                 peer->peer_outstanding_credits,
+                                 peer->peer_nid,
+                                 peer->peer_incarnation,
+                                 peer->peer_tx_seqnum,
+                                 kptllnd_data);
                 /*
                  * We just sent a packet
                  */
@@ -702,51 +680,47 @@ kptllnd_peer_check_sends (
                  * Construct an address that Portals needs from the NID
                  */
 
-                target.nid = lnet2ptlnid(kptllnd_data,peer->peer_nid);
-                target.pid = peer->peer_pid;
+                CDEBUG(D_NET, "Msg NOB = %d\n",tx->tx_msg->ptlm_nob);
+                CDEBUG(D_NET, "Giving %d credits back to peer\n",
+                       tx->tx_msg->ptlm_credits);
+                CDEBUG(D_NET, "Seq # = "LPX64"\n",tx->tx_msg->ptlm_seq);
 
-                PJK_UT_MSG_DATA("Msg NOB = %d\n",tx->tx_msg->ptlm_nob);
-                PJK_UT_MSG_DATA("Giving %d credits back to peer\n",tx->tx_msg->ptlm_credits);
-                PJK_UT_MSG_DATA("Seq # = "LPX64"\n",tx->tx_msg->ptlm_seq);
+                CDEBUG(D_NET, "lnet TX %s\n", libcfs_nid2str(peer->peer_nid));
+                CDEBUG(D_NET, "ptl  TX "FMT_NID"/%d\n",
+                       peer->peer_ptlid.nid, peer->peer_ptlid.pid);
 
-                PJK_UT_MSG("lnet TX nid=" LPX64 " pid=%d\n",peer->peer_nid,peer->peer_pid);
-                PJK_UT_MSG("ptl  TX nid=" FMT_NID " pid=%d\n",target.nid,target.pid);
-
-                if(tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET ||
-                   tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_PUT){
-                        tempiov_t tempiov;
+                if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET ||
+                    tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_PUT) {
+                        int       op;
+                        
+                        if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_PUT)
+                                op = PTL_MD_OP_GET;
+                        else
+                                op = PTL_MD_OP_PUT;
 
-                        PJK_UT_MSG_DATA("matchibts=" LPX64 "\n",
+                        CDEBUG(D_NET, "matchbits=" LPX64 "\n",
                                 tx->tx_msg->ptlm_u.req.kptlrm_matchbits);
 
-                        /*
-                         * Attach the ME
-                         */
-                        rc = PtlMEAttach(
-                            kptllnd_data->kptl_nih,
-                            *kptllnd_tunables.kptl_portal,
-                            target,
-                            tx->tx_msg->ptlm_u.req.kptlrm_matchbits,
-                            0, /* all matchbits are valid - ignore none*/
-                            PTL_UNLINK,
-                            PTL_INS_BEFORE,
-                            &meh);
-                        if(rc != 0) {
+                        rc = PtlMEAttach(kptllnd_data->kptl_nih,
+                                         *kptllnd_tunables.kptl_portal,
+                                         peer->peer_ptlid,
+                                         tx->tx_msg->ptlm_u.req.kptlrm_matchbits,
+                                         0, /* ignore none */
+                                         PTL_UNLINK,
+                                         PTL_INS_BEFORE,
+                                         &meh);
+                        if (rc != PTL_OK) {
                                 CERROR("PtlMeAttach failed %d\n",rc);
                                 goto failed_without_lock;
                         }
 
                         /* Setup the MD */
-                        kptllnd_setup_md(kptllnd_data,&md,
-                                tx->tx_msg->ptlm_type == LNET_MSG_GET ? PTL_MD_OP_PUT :
-                                        PTL_MD_OP_GET,
-                                tx,
-                                tx->tx_payload_niov,
-                                tx->tx_payload_iov,
-                                tx->tx_payload_kiov,
-                                tx->tx_payload_offset,
-                                tx->tx_payload_nob,
-                                &tempiov);
+                        kptllnd_setup_md(kptllnd_data, &md, op, tx,
+                                         tx->tx_payload_niov,
+                                         tx->tx_payload_iov,
+                                         tx->tx_payload_kiov,
+                                         tx->tx_payload_offset,
+                                         tx->tx_payload_nob);
 
                         /*
                          * Add a ref for this MD, because unlink
@@ -798,13 +772,10 @@ kptllnd_peer_check_sends (
                 /*
                  * Bind the MD
                  */
-                rc = PtlMDBind (
-                        kptllnd_data->kptl_nih,
-                        md,
-                        PTL_UNLINK,
-                        &mdh_msg);
-                if(rc != 0){
-                        if(!PtlHandleIsEqual(mdh,PTL_INVALID_HANDLE)){
+                rc = PtlMDBind(kptllnd_data->kptl_nih, md,
+                               PTL_UNLINK, &mdh_msg);
+                if (rc != PTL_OK) {
+                        if (!PtlHandleIsEqual(mdh,PTL_INVALID_HANDLE)) {
                                 rc2 = PtlMDUnlink(mdh);
                                 /*
                                  * The unlink should succeed
@@ -824,19 +795,19 @@ kptllnd_peer_check_sends (
                 LASSERT(PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE));
                 LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
 #ifdef _USING_LUSTRE_PORTALS_
-                PJK_UT_MSG("tx_mdh     = " LPX64 "\n",mdh.cookie);
-                PJK_UT_MSG("tx_mdh_msg = " LPX64 "\n",mdh_msg.cookie);
+                CDEBUG(D_NET, "tx_mdh     = " LPX64 "\n",mdh.cookie);
+                CDEBUG(D_NET, "tx_mdh_msg = " LPX64 "\n",mdh_msg.cookie);
 #endif
                 tx->tx_mdh = mdh;
                 tx->tx_mdh_msg = mdh_msg;
 
-                if(tx->tx_type == TX_TYPE_SMALL_MESSAGE)
-                        LASSERT(PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE));
+                LASSERT (tx->tx_type != TX_TYPE_SMALL_MESSAGE ||
+                         PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE));
 
                 list_add_tail(&tx->tx_list, &peer->peer_active_txs);
                 peer->peer_active_txs_change_counter++;
 
-                LASSERT(tx->tx_peer == peer);
+                LASSERT (tx->tx_peer == peer);
 
                 /*
                  * Grab a ref so the TX doesn't go away
@@ -846,18 +817,16 @@ kptllnd_peer_check_sends (
 
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
 
-                rc = PtlPut (
-                            tx->tx_mdh_msg,
-                            PTL_NOACK_REQ,     /* we dont need an ack */
-                            target,            /* peer "address" */
-                            *kptllnd_tunables.kptl_portal,     /* portal */
-                            0,                 /* cookie */
-                            LNET_MSG_MATCHBITS, /* match bits */
-                            0,                 /* offset */
-                            0);                /* header data */
-                if(rc != 0){
+                rc = PtlPut (tx->tx_mdh_msg,
+                             PTL_NOACK_REQ,     /* we dont need an ack */
+                             peer->peer_ptlid,  /* peer "address" */
+                             *kptllnd_tunables.kptl_portal,     /* portal */
+                             0,                 /* cookie */
+                             LNET_MSG_MATCHBITS, /* match bits */
+                             0,                 /* offset */
+                             0);                /* header data */
+                if (rc != PTL_OK) {
                         CERROR("PtlPut error %d\n",rc);
-
                         /*
                          * Do the unlink which should succeed
                          */
@@ -865,7 +834,6 @@ kptllnd_peer_check_sends (
                         rc2 = PtlMDUnlink(tx->tx_mdh_msg);
                         LASSERT( rc2 == 0);
 
-
 #ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
                         tx->tx_mdh_msg = PTL_INVALID_HANDLE;
                         kptllnd_tx_decref(tx);
@@ -882,10 +850,9 @@ kptllnd_peer_check_sends (
 
         }
 
-
         spin_unlock_irqrestore(&peer->peer_lock, flags);
 
-        PJK_UT_MSG_DATA("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
         return;
 
 failed_without_lock:
@@ -916,7 +883,7 @@ failed_without_lock:
          */
         kptllnd_tx_decref(tx);
 
-        PJK_UT_MSG("<<< FAILED\n");
+        CDEBUG(D_NET, "<<< FAILED\n");
 }
 
 int
@@ -936,7 +903,7 @@ kptllnd_peer_timedout(kptl_peer_t *peer)
         if(!list_empty(&peer->peer_pending_txs)){
                 tx = list_entry(peer->peer_pending_txs.next,kptl_tx_t,tx_list);
                 if(time_after_eq(jiffies,tx->tx_deadline)){
-                        PJK_UT_MSG("Peer=%p PENDING tx=%p time=%lu sec\n",
+                        CDEBUG(D_NET, "Peer=%p PENDING tx=%p time=%lu sec\n",
                                 peer,tx,(jiffies - tx->tx_deadline)/HZ);
                         rc = 1;
                 }
@@ -948,7 +915,7 @@ kptllnd_peer_timedout(kptl_peer_t *peer)
         if(!list_empty(&peer->peer_active_txs)){
                 tx = list_entry(peer->peer_active_txs.next,kptl_tx_t,tx_list);
                 if(time_after_eq(jiffies,tx->tx_deadline)){
-                        PJK_UT_MSG("Peer=%p ACTIVE tx=%p time=%lu sec\n",
+                        CDEBUG(D_NET, "Peer=%p ACTIVE tx=%p time=%lu sec\n",
                                 peer,tx,(jiffies - tx->tx_deadline)/HZ);
                         rc = 1;
                 }
@@ -968,7 +935,7 @@ kptllnd_peer_check_bucket (int idx, kptl_data_t *kptllnd_data)
         unsigned long      flags;
 
 
-        /*PJK_UT_MSG("Bucket=%d\n",idx);*/
+        CDEBUG(D_INFO, "Bucket=%d\n",idx);
 
  again:
         /* NB. We expect to have a look at all the peers and not find any
@@ -979,8 +946,8 @@ kptllnd_peer_check_bucket (int idx, kptl_data_t *kptllnd_data)
         list_for_each (ptmp, peers) {
                 peer = list_entry (ptmp, kptl_peer_t, peer_list);
 
-                PJK_UT_MSG("Peer=%p Credits=%d Outstanding=%d\n",
-                        peer,peer->peer_credits,peer->peer_outstanding_credits);
+                CDEBUG(D_NET, "Peer=%p Credits=%d Outstanding=%d\n",
+                       peer,peer->peer_credits,peer->peer_outstanding_credits);
 
                 /* In case we have enough credits to return via a
                  * NOOP, but there were no non-blocking tx descs
@@ -995,7 +962,8 @@ kptllnd_peer_check_bucket (int idx, kptl_data_t *kptllnd_data)
                 read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock,
                                        flags);
 
-                CERROR("Timed out RDMA with "LPX64"\n",peer->peer_nid);
+                CERROR("Timed out communications with %s\n",
+                       libcfs_nid2str(peer->peer_nid));
 
                 kptllnd_peer_cancel(peer);
                 kptllnd_peer_decref(peer,"temp"); /* ...until here */
@@ -1008,28 +976,14 @@ kptllnd_peer_check_bucket (int idx, kptl_data_t *kptllnd_data)
 }
 
 kptl_peer_t *
-kptllnd_peer_find (
-        kptl_data_t *kptllnd_data,
-        lnet_process_id_t target)
-{
-        kptl_peer_t *peer;
-        unsigned long flags;
-        read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
-        peer = kptllnd_peer_find_holding_list_lock(kptllnd_data,target);
-        read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
-        return peer;
-}
-
-kptl_peer_t *
-kptllnd_peer_find_holding_list_lock (
-        kptl_data_t *kptllnd_data,
-        lnet_process_id_t target)
+kptllnd_ptlnid2peer_locked (kptl_data_t  *kptllnd_data,
+                            ptl_nid_t     nid)
 {
-        struct list_head *peer_list = kptllnd_nid2peerlist (kptllnd_data,target.nid);
+        struct list_head *peer_list = kptllnd_ptlnid2peerlist(kptllnd_data, nid);
         struct list_head *tmp;
         kptl_peer_t      *peer;
 
-        PJK_UT_MSG(">>> id=%s\n",libcfs_id2str(target));
+        CDEBUG(D_NET, ">>> id="FMT_NID"\n", nid);
 
         list_for_each (tmp, peer_list) {
 
@@ -1037,44 +991,64 @@ kptllnd_peer_find_holding_list_lock (
 
                 LASSERT(peer->peer_state != PEER_STATE_CANCELED);
                 
-                PJK_UT_MSG("NID: peer="LPX64" target="LPX64"\n",
-                        peer->peer_nid,target.nid);
-                PJK_UT_MSG("PID: peer=%d target=%d\n",
-                        peer->peer_pid,target.pid);
-                        
-                if (! (peer->peer_nid == target.nid && 
-                       peer->peer_pid == target.pid))
+                if (peer->peer_ptlid.nid != nid)
                         continue;
 
-                CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
-                       peer, libcfs_id2str(target), atomic_read (&peer->peer_refcount));
-
                 kptllnd_peer_addref(peer,"find");
-                PJK_UT_MSG("<<< Peer=%p\n",peer);
+
+                CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+                       peer, libcfs_nid2str(peer->peer_nid), 
+                       atomic_read (&peer->peer_refcount));
                 return peer;
         }
 
-        PJK_UT_MSG("<<< NOTFOUND\n");
+        CDEBUG(D_NET, "<<< NOTFOUND\n");
         return NULL;
 }
 
 kptl_peer_t *
-kptllnd_peer_handle_hello (
-        kptl_data_t *kptllnd_data,
-        lnet_process_id_t initiator,
-        kptl_msg_t *msg)
+kptllnd_ptlnid2peer (kptl_data_t *kptllnd_data, ptl_nid_t nid)
+{
+        kptl_peer_t   *peer;
+        unsigned long  flags;
+
+        read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+        peer = kptllnd_ptlnid2peer_locked(kptllnd_data, nid);
+        read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+        return peer;
+}
+
+kptl_peer_t *
+kptllnd_nid2peer_locked (kptl_data_t  *kptllnd_data,
+                         lnet_nid_t    nid)
+{
+        return kptllnd_ptlnid2peer_locked(kptllnd_data,
+                                          lnet2ptlnid(kptllnd_data, nid));
+}
+
+kptl_peer_t *
+kptllnd_nid2peer (kptl_data_t *kptllnd_data, lnet_nid_t nid)
+{
+        return kptllnd_ptlnid2peer(kptllnd_data,
+                                   lnet2ptlnid(kptllnd_data, nid));
+}
+
+kptl_peer_t *
+kptllnd_peer_handle_hello (kptl_data_t      *kptllnd_data,
+                           ptl_process_id_t  initiator,
+                           kptl_msg_t       *msg)
 {
         kptl_peer_t    *peer           = NULL;
-        kptl_peer_t    *peer_allocated = NULL;
+        kptl_peer_t    *new_peer       = NULL;
         kptl_peer_t    *peer_to_cancel = NULL;
         unsigned long   flags;
-        kptl_tx_t      *tx_hello = NULL;
+        kptl_tx_t      *hello_tx = NULL;
         int             rc;
         __u64           safe_matchbits_from_peer;
         __u64           safe_matchbits_to_peer = 0;
 
-
-        PJK_UT_MSG(">>>\n");
+        CDEBUG(D_NET, ">>> "FMT_NID"/%d\n", initiator.nid, initiator.pid);
 
         safe_matchbits_from_peer = msg->ptlm_u.hello.kptlhm_matchbits +
                         *kptllnd_tunables.kptl_peercredits;
@@ -1082,12 +1056,11 @@ kptllnd_peer_handle_hello (
         /*
          * Immediate message sizes MUST be equal
          */
-        if(  msg->ptlm_u.hello.kptlhm_max_msg_size !=
-                *kptllnd_tunables.kptl_max_msg_size){
+        if (msg->ptlm_u.hello.kptlhm_max_msg_size !=
+            *kptllnd_tunables.kptl_max_msg_size) {
                 CERROR("IMMD message size MUST be equal for all peers got %d expected %d\n",
-                        msg->ptlm_u.hello.kptlhm_max_msg_size,
-                        *kptllnd_tunables.kptl_max_msg_size);
-
+                       msg->ptlm_u.hello.kptlhm_max_msg_size,
+                       *kptllnd_tunables.kptl_max_msg_size);
                 return 0;
         }
 
@@ -1095,26 +1068,26 @@ kptllnd_peer_handle_hello (
          * Setup a connect HELLO message.  We ultimately might not
          * use it but likely we will.
          */
-        tx_hello = kptllnd_get_idle_tx(kptllnd_data,TX_TYPE_SMALL_MESSAGE);
-        if( tx_hello == NULL) {
-                CERROR("Unable to allocate connect message for %s\n",libcfs_id2str(initiator));
+        hello_tx = kptllnd_get_idle_tx(kptllnd_data,TX_TYPE_SMALL_MESSAGE);
+        if (hello_tx == NULL) {
+                CERROR("Unable to allocate connect message for "FMT_NID"/%d\n",
+                       initiator.nid, initiator.pid);
                 return 0;
         }
 
-        kptllnd_init_msg(
-                tx_hello->tx_msg,
-                PTLLND_MSG_TYPE_HELLO,
-                sizeof(kptl_hello_msg_t));
+        kptllnd_init_msg(hello_tx->tx_msg, PTLLND_MSG_TYPE_HELLO,
+                         sizeof(kptl_hello_msg_t));
 
         /*
-         * Allocate a peer, even though we might not ultimatly use it
-         * however we want to avoid doing this while holidng
+         * Allocate a peer, even though we might not ultimately use it
+         * however we want to avoid doing this while holding
          * the peer_rw_lock and be forced into atomic context
          */
-        rc = kptllnd_peer_allocate ( kptllnd_data, &peer_allocated, initiator);
-        if(rc != 0){
-                kptllnd_tx_decref(tx_hello);
-                CERROR("Failed to create peer (id=%s)\n",libcfs_id2str(initiator));
+        rc = kptllnd_peer_allocate(kptllnd_data, &new_peer, initiator);
+        if (rc != 0){
+                kptllnd_tx_decref(hello_tx);
+                CERROR("Failed to create peer for "FMT_NID"/%d\n",
+                       initiator.nid, initiator.pid);
                 return 0;
         }
 
@@ -1123,14 +1096,13 @@ kptllnd_peer_handle_hello (
         /*
          * Look for peer because it could have been previously here
          */
-        peer = kptllnd_peer_find_holding_list_lock(kptllnd_data,initiator);
+        peer = kptllnd_ptlnid2peer_locked(kptllnd_data, initiator.nid);
 
         /*
          * If peer is already here
          */
-        if(peer != NULL){
-
-                if(peer->peer_incarnation == 0) {
+        if (peer != NULL) {
+                if (peer->peer_incarnation == 0) {
                         /*
                          * Update the peer state
                          */
@@ -1145,22 +1117,24 @@ kptllnd_peer_handle_hello (
                         /*
                          * Save the match bits
                          */
-                        PJK_UT_MSG_DATA(" **** Updating Matchbits="LPX64" ****\n",
-                                safe_matchbits_from_peer);
+                        CDEBUG(D_NET, " **** Updating Matchbits="LPX64" ****\n",
+                               safe_matchbits_from_peer);
 
                         peer->peer_next_matchbits = safe_matchbits_from_peer;
-                        if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS)
+                        if (peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS)
                                 peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
-                }
 
-                /*
-                 * If the incarnation has changed then we need to
-                 * resend the hello.
-                 */
-                else if( peer->peer_incarnation != msg->ptlm_srcnid ) {
+                } else if (peer->peer_incarnation != msg->ptlm_srcstamp ||
+                           peer->peer_ptlid.pid != initiator.pid) {
 
+                        CDEBUG(D_NET, "Peer %s reconnecting with pid,stamp: "
+                               "%d,"LPX64" (old %d,"LPX64"\n",
+                               libcfs_nid2str(peer->peer_nid),
+                               initiator.pid, msg->ptlm_srcstamp,
+                               peer->peer_ptlid.pid, peer->peer_incarnation);
                         /*
-                         * Put the match bits into the hello message
+                         * If the incarnation or PID have changed, assume the
+                         * peer has rebooted and resend the hello 
                          */
                         safe_matchbits_to_peer =
                                 peer->peer_last_matchbits_seen + 1 +
@@ -1172,35 +1146,33 @@ kptllnd_peer_handle_hello (
                         peer_to_cancel = peer;
                         peer = NULL;
 
-                }else{
+                } else {
                         CERROR("Receiving HELLO message on already connected peer %s\n",
-                                libcfs_id2str(initiator));
+                               libcfs_nid2str(peer->peer_nid));
                 }
         }
 
-        if( peer == NULL) {
-
+        if (peer == NULL) {
                 /*
                  * Put the match bits into the hello message
                  */
-                tx_hello->tx_msg->ptlm_u.hello.kptlhm_matchbits =
+                hello_tx->tx_msg->ptlm_u.hello.kptlhm_matchbits =
                         safe_matchbits_to_peer;
-                tx_hello->tx_msg->ptlm_u.hello.kptlhm_max_msg_size =
+                hello_tx->tx_msg->ptlm_u.hello.kptlhm_max_msg_size =
                         *kptllnd_tunables.kptl_max_msg_size;
 
                 /*
                  * Try and attach this peer to the list
                  */
-                rc = kptllnd_peer_add_to_list_locked ( kptllnd_data, peer_allocated);
-                if(rc != 0){
-                        CERROR("Failed to create peer (id=%s)\n",
-                                libcfs_id2str(initiator));
+                rc = kptllnd_peer_add_to_list_locked(kptllnd_data, new_peer);
+                if (rc != 0) {
+                        CERROR("Failed to create peer for "FMT_NID"/%d\n",
+                               initiator.nid, initiator.pid);
                         goto failed;
                 }
 
-                peer = peer_allocated;
-                peer_allocated = NULL;
-
+                peer = new_peer;
+                new_peer = NULL;
 
                 LASSERT(peer->peer_state == PEER_STATE_WAITING_HELLO);
                 peer->peer_state = PEER_STATE_ACTIVE;
@@ -1220,13 +1192,12 @@ kptllnd_peer_handle_hello (
                 /*
                  * Save the match bits
                  */
-                PJK_UT_MSG_DATA("**** Setting Matchbits="LPX64" ****\n",
-                        safe_matchbits_from_peer);
+                CDEBUG(D_NET, "**** Setting Matchbits="LPX64" ****\n",
+                       safe_matchbits_from_peer);
                 peer->peer_next_matchbits = safe_matchbits_from_peer;
                 if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS)
                         peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
 
-
                 /*
                  * And save them from a previous incarnation
                  */
@@ -1235,60 +1206,54 @@ kptllnd_peer_handle_hello (
                 /*
                  * Queue the message
                  */
-                kptllnd_peer_queue_tx_locked(peer,tx_hello);
+                kptllnd_peer_queue_tx_locked(peer,hello_tx);
 
                 /*
                  * And don't free it because it's queued
                  */
-                tx_hello = NULL;
-
+                hello_tx = NULL;
         }
 
 failed:
         write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock,flags);
 
-        if(tx_hello)
-                kptllnd_tx_decref(tx_hello);
+        if (hello_tx != NULL)
+                kptllnd_tx_decref(hello_tx);
 
-        /*
-         *
-         */
-        if(peer){
+        if (peer != NULL)
                 kptllnd_peer_check_sends(peer);
-        }
 
-        if(peer_to_cancel) {
+        if (peer_to_cancel != NULL) {
                 kptllnd_peer_cancel(peer_to_cancel);
-                kptllnd_peer_decref(peer_to_cancel,"find");
+                kptllnd_peer_decref(peer_to_cancel, "find");
         }
 
-        if(peer_allocated)
-                kptllnd_peer_decref(peer_allocated,"alloc");
-
-        PJK_UT_MSG("<<< Peer=%p\n",peer);
+        if (new_peer != NULL)
+                kptllnd_peer_decref(new_peer, "alloc");
 
+        CDEBUG(D_NET, "<<< Peer=%p\n", peer);
         return peer;
 }
 
 void
-kptllnd_tx_launch (
-        kptl_tx_t *tx,
-        lnet_process_id_t target,
-        lnet_msg_t *ptlmsg )
+kptllnd_tx_launch (kptl_tx_t         *tx,
+                   lnet_process_id_t  target,
+                   lnet_msg_t        *ptlmsg)
 {
         kptl_data_t     *kptllnd_data = tx->tx_po.po_kptllnd_data;
         kptl_peer_t     *peer = NULL;
-        kptl_peer_t     *peer_allocated = NULL;
+        kptl_peer_t     *new_peer = NULL;
         unsigned long    flags;
         rwlock_t        *g_lock = &kptllnd_data->kptl_peer_rw_lock;
         int              rc;
-        kptl_tx_t       *tx_hello = NULL;
+        ptl_process_id_t ptlid;
+        kptl_tx_t       *hello_tx = NULL;
 
 
         /* If I get here, I've committed to send, so I complete the tx with
          * failure on any problems */
 
-        PJK_UT_MSG(">>> TX=%p target=%s\n",tx,libcfs_id2str(target));
+        CDEBUG(D_NET, ">>> TX=%p target=%s\n",tx,libcfs_id2str(target));
 
         LASSERT (tx->tx_ptlmsg == NULL);
         tx->tx_ptlmsg = ptlmsg;              /* finalize ptlmsg on completion */
@@ -1300,7 +1265,7 @@ kptllnd_tx_launch (
          * First try to find the peer (this will grab the
          * read lock
          */
-        peer = kptllnd_peer_find (kptllnd_data,target);
+        peer = kptllnd_nid2peer(kptllnd_data, target.nid);
 
         /*
          * If we find the peer
@@ -1313,7 +1278,7 @@ kptllnd_tx_launch (
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
                 kptllnd_peer_check_sends(peer);
                 kptllnd_peer_decref(peer,"find");
-                PJK_UT_MSG("<<< FOUND\n");
+                CDEBUG(D_NET, "<<< FOUND\n");
                 return;
         }
 
@@ -1324,15 +1289,16 @@ kptllnd_tx_launch (
          * (in the case that the peer is racing to connect with us)
          * but more than likely we will.
          */
-        tx_hello = kptllnd_get_idle_tx(kptllnd_data,TX_TYPE_SMALL_MESSAGE);
-        if( tx_hello == NULL) {
-                CERROR("Unable to allocate connect message for %s\n",libcfs_id2str(target));
+        hello_tx = kptllnd_get_idle_tx(kptllnd_data,TX_TYPE_SMALL_MESSAGE);
+        if( hello_tx == NULL) {
+                CERROR("Unable to allocate connect message for %s\n",
+                       libcfs_id2str(target));
                 kptllnd_tx_decref (tx);
                 return;
         }
 
         kptllnd_init_msg(
-                tx_hello->tx_msg,
+                hello_tx->tx_msg,
                 PTLLND_MSG_TYPE_HELLO,
                 sizeof(kptl_hello_msg_t));
 
@@ -1340,20 +1306,25 @@ kptllnd_tx_launch (
          * We've never seen this peer before.  So setup
          * a default message.
          */
-        tx_hello->tx_msg->ptlm_u.hello.kptlhm_matchbits = 0;
-        tx_hello->tx_msg->ptlm_u.hello.kptlhm_max_msg_size =
+        hello_tx->tx_msg->ptlm_u.hello.kptlhm_matchbits = 0;
+        hello_tx->tx_msg->ptlm_u.hello.kptlhm_max_msg_size =
                 *kptllnd_tunables.kptl_max_msg_size;
 
         /*
          * Allocate a new peer
          * (it's not active until its on the list)
          */
-        PJK_UT_MSG("TX %p creating NEW PEER %s\n",tx,libcfs_id2str(target));
-        rc = kptllnd_peer_allocate ( kptllnd_data, &peer_allocated, target);
-        if(rc != 0){
-                CERROR("Failed to create peer %s\n",libcfs_id2str(target));
-                kptllnd_tx_decref (tx);
-                kptllnd_tx_decref (tx_hello);
+        CDEBUG(D_NET, "TX %p creating NEW PEER %s\n", 
+               tx, libcfs_id2str(target));
+        ptlid.nid = lnet2ptlnid(kptllnd_data, target.nid);
+        ptlid.pid = kptllnd_data->kptl_portals_id.pid;
+
+        rc = kptllnd_peer_allocate(kptllnd_data, &new_peer, ptlid);
+
+        if (rc != 0) {
+                CERROR("Failed to create peer %s\n", libcfs_id2str(target));
+                kptllnd_tx_decref(tx);
+                kptllnd_tx_decref(hello_tx);
                 return;
         }
 
@@ -1364,7 +1335,7 @@ kptllnd_tx_launch (
          */
         write_lock_irqsave(g_lock, flags);
 
-        peer = kptllnd_peer_find_holding_list_lock (kptllnd_data,target);
+        peer = kptllnd_nid2peer_locked(kptllnd_data, target.nid);
 
         /*
          * If we find the peer
@@ -1374,7 +1345,8 @@ kptllnd_tx_launch (
         if (peer != NULL) {
                 write_unlock_irqrestore(g_lock, flags);
 
-                CDEBUG(D_TRACE,"HELLO message race occurred for %s\n",libcfs_id2str(target));
+                CDEBUG(D_TRACE,"HELLO message race occurred for %s\n",
+                       libcfs_id2str(target));
 
                 spin_lock_irqsave(&peer->peer_lock, flags);
                 kptllnd_peer_queue_tx_locked ( peer, tx );
@@ -1382,31 +1354,32 @@ kptllnd_tx_launch (
 
                 kptllnd_peer_check_sends(peer);
                 kptllnd_peer_decref(peer,"find");
-                kptllnd_peer_decref(peer_allocated,"alloc");
+                kptllnd_peer_decref(new_peer,"alloc");
 
                 /* and we don't need the connection tx*/
-                kptllnd_tx_decref(tx_hello);
+                kptllnd_tx_decref(hello_tx);
 
-                PJK_UT_MSG("<<< FOUND2\n");
+                CDEBUG(D_NET, "<<< FOUND2\n");
                 return;
         }
 
 
-        rc = kptllnd_peer_add_to_list_locked ( kptllnd_data, peer_allocated);
+        rc = kptllnd_peer_add_to_list_locked ( kptllnd_data, new_peer);
         if(rc != 0){
                 write_unlock_irqrestore(g_lock, flags);
 
-                CERROR("Failed to add peer to list for %s\n",libcfs_id2str(target));
+                CERROR("Failed to add peer to list for %s\n",
+                       libcfs_id2str(target));
 
                 /* Drop these TXs tx*/
-                kptllnd_tx_decref (tx);
-                kptllnd_tx_decref (tx_hello);
-                kptllnd_peer_decref(peer_allocated,"create");
+                kptllnd_tx_decref(tx);
+                kptllnd_tx_decref(hello_tx);
+                kptllnd_peer_decref(new_peer,"create");
                 return;
         }
 
-        peer = peer_allocated;
-        peer_allocated = NULL;
+        peer = new_peer;
+        new_peer = NULL;
 
         write_unlock_irqrestore(g_lock,flags);
 
@@ -1417,16 +1390,16 @@ kptllnd_tx_launch (
          * the connection request will go out, and
          * the tx will wait for a reply.
          */
-        PJK_UT_MSG("TXHello=%p\n",tx_hello);
+        CDEBUG(D_NET, "TXHello=%p\n", hello_tx);
 
 
         spin_lock_irqsave(&peer->peer_lock, flags);
-        kptllnd_peer_queue_tx_locked(peer,tx_hello);
-        kptllnd_peer_queue_tx_locked(peer,tx);
+        kptllnd_peer_queue_tx_locked(peer, hello_tx);
+        kptllnd_peer_queue_tx_locked(peer, tx);
         spin_unlock_irqrestore(&peer->peer_lock, flags);
 
         kptllnd_peer_check_sends(peer);
         kptllnd_peer_decref(peer,"find");
 
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
 }
index 241be98..3430cda 100644 (file)
@@ -41,7 +41,7 @@ void
 kptllnd_rx_buffer_pool_init(
         kptl_rx_buffer_pool_t *rxbp)
 {
-        PJK_UT_MSG("kptllnd_rx_buffer_pool_init\n");
+        CDEBUG(D_NET, "kptllnd_rx_buffer_pool_init\n");
         memset(rxbp,0,sizeof(*rxbp));
 
         spin_lock_init (&rxbp->rxbp_lock);
@@ -58,7 +58,7 @@ kptllnd_rx_buffer_pool_fini(
         int                     i;
         unsigned long           flags;
 
-        PJK_UT_MSG("kptllnd_rx_buffer_pool_fini\n");
+        CDEBUG(D_NET, "kptllnd_rx_buffer_pool_fini\n");
 
         spin_lock_irqsave(&rxbp->rxbp_lock, flags);
 
@@ -130,7 +130,7 @@ kptllnd_rx_buffer_pool_fini(
 
                                 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
                         }else{
-                                PJK_UT_MSG("PtlMDUnlink(%p) rc=%d\n",rxb,rc);
+                                CDEBUG(D_NET, "PtlMDUnlink(%p) rc=%d\n",rxb,rc);
                                 /*
                                  * The unlinked failed so put this back
                                  * on the list for later
@@ -153,7 +153,7 @@ kptllnd_rx_buffer_pool_fini(
                  */
                 if(!list_empty(&rxbp->rxbp_list)){
                         i++;
-                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                        CDEBUG(((i & (-i)) == i) ? D_NET : D_NET, /* power of 2? */
                                "Waiting for %d Busy RX Buffers\n",
                                rxbp->rxbp_count);
                         spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
@@ -165,12 +165,12 @@ kptllnd_rx_buffer_pool_fini(
         CDEBUG(D_TRACE,"|rxbp_list|=EMPTY\n");
 
         if(rxbp->rxbp_count != 0){
-                PJK_UT_MSG("Waiting for %d RX Buffers to unlink\n",rxbp->rxbp_count);
+                CDEBUG(D_NET, "Waiting for %d RX Buffers to unlink\n",rxbp->rxbp_count);
 
                 i = 2;
                 while (rxbp->rxbp_count != 0) {
                         i++;
-                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                        CDEBUG(((i & (-i)) == i) ? D_NET : D_NET, /* power of 2? */
                                "Waiting for %d RX Buffers to unlink\n",
                                rxbp->rxbp_count);
                         spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
@@ -200,7 +200,7 @@ kptllnd_rx_buffer_pool_reserve(
 
         spin_lock_irqsave(&rxbp->rxbp_lock, flags);
 
-        PJK_UT_MSG("kptllnd_rx_buffer_pool_reserve(%d)\n",count);
+        CDEBUG(D_NET, "kptllnd_rx_buffer_pool_reserve(%d)\n",count);
 
         /*
          * Prevent reservation of anymore while we are shutting down
@@ -224,12 +224,12 @@ kptllnd_rx_buffer_pool_reserve(
                 (PAGE_SIZE * (*kptllnd_tunables.kptl_rxb_npages));
         ++nbuffers ;
 
-        PJK_UT_MSG("nbuffers=%d rxbp_count=%d\n",nbuffers,rxbp->rxbp_count);
+        CDEBUG(D_NET, "nbuffers=%d rxbp_count=%d\n",nbuffers,rxbp->rxbp_count);
 
         if(rxbp->rxbp_count < nbuffers)
                 add = nbuffers - rxbp->rxbp_count;
 
-        PJK_UT_MSG("adding=%d\n",add);
+        CDEBUG(D_NET, "adding=%d\n",add);
 
         /*
          * Under the same lock assume they are added
@@ -305,7 +305,7 @@ kptllnd_rx_buffer_pool_unreserve(
 {
         unsigned long flags;
         spin_lock_irqsave(&rxbp->rxbp_lock, flags);
-        PJK_UT_MSG("kptllnd_rx_buffer_pool_unreserve(%d)\n",count);
+        CDEBUG(D_NET, "kptllnd_rx_buffer_pool_unreserve(%d)\n",count);
         rxbp->rxbp_reserved -= count;
         spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
 }
@@ -317,7 +317,7 @@ kptllnd_rx_buffer_scheduled_post(
         kptl_data_t     *kptllnd_data = rxb->rxb_po.po_kptllnd_data;
         unsigned long    flags;
 
-        PJK_UT_MSG("rxb=%p\n",rxb);
+        CDEBUG(D_NET, "rxb=%p\n",rxb);
 
         spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
         LASSERT(list_empty(&rxb->rxb_repost_list));
@@ -343,7 +343,7 @@ kptllnd_rx_buffer_post(
         any.nid = PTL_NID_ANY;
         any.pid = PTL_PID_ANY;
 
-        /*PJK_UT_MSG("rxb=%p\n",rxb);*/
+        //CDEBUG(D_NET, "rxb=%p\n",rxb);
 
         spin_lock_irqsave(&rxbp->rxbp_lock, flags);
 
@@ -517,9 +517,9 @@ kptllnd_rx_buffer_callback(ptl_event_t *ev)
                 STAT_UPDATE(kps_rx_unlink_event);
 
         if(!rxbp->rxbp_shutdown){
-                PJK_UT_MSG("RXB Callback %s(%d) rxb=%p id="FMT_NID" unlink=%d\n",
-                        get_ev_type_string(ev->type),ev->type,
-                        rxb,ev->initiator.nid,unlinked);
+                CDEBUG(D_NET, "RXB Callback %s(%d) rxb=%p id="FMT_NID" unlink=%d\n",
+                       get_ev_type_string(ev->type),ev->type,
+                       rxb,ev->initiator.nid,unlinked);
         }
 
         LASSERT( ev->md.start == rxb->rxb_buffer);
@@ -527,7 +527,7 @@ kptllnd_rx_buffer_callback(ptl_event_t *ev)
         LASSERT( ev->type == PTL_EVENT_PUT_END || ev->type == PTL_EVENT_UNLINK);
         LASSERT( ev->match_bits == LNET_MSG_MATCHBITS);
 
-        CDEBUG((ev->ni_fail_type == PTL_OK) ? D_NET : D_ERROR,
+        CDEBUG((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR,
                "event type %d, status %d from "FMT_NID"\n",
                ev->type, ev->ni_fail_type,ev->initiator.nid);
 
@@ -575,7 +575,7 @@ kptllnd_rx_buffer_callback(ptl_event_t *ev)
                 return;
         }
 
-        PJK_UT_MSG_DATA("New RX=%p\n",rx);
+        CDEBUG(D_NET, "New RX=%p\n",rx);
 
         /*
          * If we are unlinked we can just transfer the ref
@@ -595,7 +595,7 @@ kptllnd_rx_buffer_callback(ptl_event_t *ev)
         kptllnd_rx_schedule(rx);
 
         if(!rxbp->rxbp_shutdown){
-                PJK_UT_MSG("<<< rx=%p rxb=%p\n",rx,rxb);
+                CDEBUG(D_NET, "<<< rx=%p rxb=%p\n",rx,rxb);
         }
 }
 
@@ -606,9 +606,7 @@ kptllnd_rx_schedule (kptl_rx_t *rx)
         unsigned long    flags;
         kptl_data_t  *kptllnd_data = rx->rx_rxb->rxb_po.po_kptllnd_data;
 
-        CDEBUG(D_NET, "rx\n");
-
-        PJK_UT_MSG("RX Schedule %p\n",rx);
+        CDEBUG(D_NET, "RX Schedule %p\n",rx);
 
         spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
         list_add_tail(&rx->rx_list,&kptllnd_data->kptl_sched_rxq);
@@ -626,99 +624,85 @@ kptllnd_rx_scheduler_handler(kptl_rx_t *rx)
         kptl_data_t            *kptllnd_data = rxb->rxb_po.po_kptllnd_data;
         kptl_peer_t            *peer = NULL;
         int                     returned_credits = 0;
-        int                     type = msg->ptlm_type;
-        lnet_process_id_t       lnet_initiator;
         unsigned long           flags;
 
+        CDEBUG(D_NET, ">>> RXRXRXRXRX rx=%p nob=%d "FMT_NID"/%d\n",
+               rx, rx->rx_nob, rx->rx_initiator.nid, rx->rx_initiator.pid);
 
-        PJK_UT_MSG_DATA(">>> RXRXRXRXRXRXRXRXRXRXRXRX\n");
-        PJK_UT_MSG_DATA("rx=%p nob=%d\n",rx,rx->rx_nob);
-
-        /*
-         * Setup the intiator for LNET
-         */        
-        lnet_initiator.nid = ptl2lnetnid(kptllnd_data,rx->rx_initiator.nid);
-        lnet_initiator.pid = rx->rx_initiator.pid;
-
-        /*
-         * If the nob==0 then silently discard this message
-         */
-        if(rx->rx_nob == 0)
-                goto exit;
-
+        if (rx->rx_nob == 0) {
+                /* discard silently!!! */
+                goto out;
+        }
+        
         rc = kptllnd_msg_unpack(msg, rx->rx_nob, kptllnd_data);
         if (rc != 0) {
-                CERROR ("Error %d unpacking rx from "FMT_NID"\n",
-                        rc, rx->rx_initiator.nid);
-                goto exit;
+                CERROR ("Error %d unpacking rx from "FMT_NID"/%d\n",
+                        rc, rx->rx_initiator.nid, rx->rx_initiator.pid);
+                goto out;
         }
 
-        PJK_UT_MSG_DATA("RX=%p Type=%s(%d)\n",rx,
-                get_msg_type_string(type),type);
-        PJK_UT_MSG_DATA("Msg NOB = %d\n",msg->ptlm_nob);
-        PJK_UT_MSG_DATA("Credits back from peer=%d\n",msg->ptlm_credits);
-        PJK_UT_MSG_DATA("Seq # ="LPX64"\n",msg->ptlm_seq);
-        PJK_UT_MSG_DATA("lnet RX nid=" LPX64 "\n",lnet_initiator.nid);
-        PJK_UT_MSG("ptl  RX nid=" FMT_NID " pid=%d\n",rx->rx_initiator.nid,rx->rx_initiator.pid);
-
-        if(type == PTLLND_MSG_TYPE_HELLO)
-        {
-                peer = kptllnd_peer_handle_hello(
-                        kptllnd_data,
-                        lnet_initiator,
-                        msg);
-                if( peer == NULL){
-                        CERROR ("Failed to create peer for %s\n",
-                                libcfs_id2str(lnet_initiator));
-                        goto exit;
+        CDEBUG(D_NET, "RX=%p Type=%s(%d)\n",
+               rx, get_msg_type_string(msg->ptlm_type), msg->ptlm_type);
+        CDEBUG(D_NET, "Msg NOB = %d\n", msg->ptlm_nob);
+        CDEBUG(D_NET, "Credits back from peer=%d\n", msg->ptlm_credits);
+        CDEBUG(D_NET, "Seq # ="LPX64"\n",msg->ptlm_seq);
+        CDEBUG(D_NET, "ptl  RX id="FMT_NID"/%d\n",
+               rx->rx_initiator.nid, rx->rx_initiator.pid);
+
+        if (msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) {
+                peer = kptllnd_peer_handle_hello(kptllnd_data,
+                                                 rx->rx_initiator, 
+                                                 msg);
+                if (peer == NULL) {
+                        CERROR ("Failed to create peer for "FMT_NID"/%d\n",
+                                rx->rx_initiator.nid, rx->rx_initiator.pid);
+                        goto out;
                 }
 
-                if (!( msg->ptlm_dststamp == kptllnd_data->kptl_incarnation ||
-                       msg->ptlm_dststamp == 0)) {
-                        CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n",
-                                peer->peer_nid,
-                                msg->ptlm_dststamp,
-                                kptllnd_data->kptl_incarnation );
-                        goto exit;
+                if (!(msg->ptlm_dststamp == kptllnd_data->kptl_incarnation ||
+                      msg->ptlm_dststamp == 0)) {
+                        CERROR("Stale rx from %s dststamp "LPX64" expected "LPX64"\n",
+                               libcfs_nid2str(peer->peer_nid),
+                               msg->ptlm_dststamp,
+                               kptllnd_data->kptl_incarnation);
+                        goto out;
                 }
-        }
-        else
-        {
-                peer = kptllnd_peer_find(kptllnd_data,lnet_initiator);
+        } else {
+                peer = kptllnd_ptlnid2peer(kptllnd_data, rx->rx_initiator.nid);
                 if( peer == NULL){
-                        CERROR ("No connection with %s\n",
-                                libcfs_id2str(lnet_initiator));
-                        goto exit;
+                        CERROR("No connection with "FMT_NID"/%d\n",
+                               rx->rx_initiator.nid, rx->rx_initiator.pid);
+                        goto out;
                 }
 
                 if (msg->ptlm_dststamp != kptllnd_data->kptl_incarnation) {
-                        CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n",
-                                peer->peer_nid,
-                                msg->ptlm_dststamp,
-                                kptllnd_data->kptl_incarnation );
-                        goto exit;
+                        CERROR("Stale rx from %s dststamp "LPX64" expected "LPX64"\n",
+                               libcfs_nid2str(peer->peer_nid),
+                               msg->ptlm_dststamp,
+                               kptllnd_data->kptl_incarnation );
+                        goto out;
                 }
         }
 
-        if( msg->ptlm_srcnid != peer->peer_nid){
-                CERROR ("Stale rx srcnid "LPX64" expected "LPX64"\n",
-                        msg->ptlm_srcnid,
-                        peer->peer_nid );
-                goto exit;
+        if (msg->ptlm_srcnid != peer->peer_nid) {
+                CERROR("Bad rx srcnid %s expected %s\n",
+                       libcfs_nid2str(msg->ptlm_srcnid),
+                       libcfs_nid2str(peer->peer_nid));
+                goto out;
         }
-        if( msg->ptlm_srcstamp != peer->peer_incarnation){
-                CERROR ("Stale rx from "LPX64" srcstamp"LPX64" expected "LPX64"\n",
-                        peer->peer_nid,
+        if (msg->ptlm_srcstamp != peer->peer_incarnation) {
+                CERROR ("Stale rx from %s srcstamp "LPX64" expected "LPX64"\n",
+                        libcfs_nid2str(peer->peer_nid),
                         msg->ptlm_srcstamp,
-                        peer->peer_incarnation );
-                goto exit;
+                        peer->peer_incarnation);
+                goto out;
         }
-        if( msg->ptlm_dstnid != kptllnd_data->kptl_ni->ni_nid){
-                CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n",
-                        peer->peer_nid,
-                        msg->ptlm_dstnid,
-                        kptllnd_data->kptl_ni->ni_nid );
-                goto exit;
+        if (msg->ptlm_dstnid != kptllnd_data->kptl_ni->ni_nid) {
+                CERROR ("Bad rx from %s dstnid %s expected %s\n",
+                        libcfs_nid2str(peer->peer_nid),
+                        libcfs_nid2str(msg->ptlm_dstnid),
+                        libcfs_nid2str(kptllnd_data->kptl_ni->ni_nid));
+                goto out;
         }
 
         /*
@@ -735,89 +719,81 @@ kptllnd_rx_scheduler_handler(kptl_rx_t *rx)
                         *kptllnd_tunables.kptl_peercredits);
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
 
-                PJK_UT_MSG("Peer=%p Credits=%d Outstanding=%d\n",
+                CDEBUG(D_NET, "Peer=%p Credits=%d Outstanding=%d\n",
                         peer,peer->peer_credits,peer->peer_outstanding_credits);
-                PJK_UT_MSG_DATA("Getting %d credits back rx=%p\n",returned_credits,rx);
+                CDEBUG(D_NET, "Getting %d credits back rx=%p\n",returned_credits,rx);
 
                 kptllnd_peer_check_sends(peer);
         }
 
-        /*
-         * Attach the peer to the RX
-         * it now is responsibly for releaseing the refrence
-         */
+        /* Attach the peer to the RX (it takes over my reference) */
         rx->rx_peer = peer;
-        peer = 0;
+        peer = NULL;
+
+        /* NB msg->ptlm_seq is ignored; it's only a debugging aid */
 
-        /*
-         * Note: We are explicitly ignore sequence #
-         * It is informational only
-         */
         switch (msg->ptlm_type) {
         default:
-                CERROR("Bad PTL message type %x from "LPX64"\n",
-                       msg->ptlm_type, rx->rx_peer->peer_nid);
+                CERROR("Bad PTL message type %x from %s\n",
+                       msg->ptlm_type, libcfs_nid2str(rx->rx_peer->peer_nid));
                 break;
 
         case PTLLND_MSG_TYPE_HELLO:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_HELLO\n");
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_HELLO\n");
                 break;
 
         case PTLLND_MSG_TYPE_NOOP:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_NOOP\n");
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_NOOP\n");
                 break;
 
         case PTLLND_MSG_TYPE_IMMEDIATE:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_IMMEDIATE\n");
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n");
                 rc = lnet_parse(kptllnd_data->kptl_ni,
-                        &msg->ptlm_u.immediate.kptlim_hdr,
-                        msg->ptlm_srcnid,
-                        rx, 0);
+                                &msg->ptlm_u.immediate.kptlim_hdr,
+                                msg->ptlm_srcnid,
+                                rx, 0);
                 /* RX Completing asynchronously */
-                if( rc >= 0)
-                        rx = 0;
+                if ( rc >= 0)
+                        rx = NULL;
                 break;
 
         case PTLLND_MSG_TYPE_PUT:
         case PTLLND_MSG_TYPE_GET:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_%s\n",
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_%s\n",
                         msg->ptlm_type == PTLLND_MSG_TYPE_PUT ?
                         "PUT" : "GET");
-
                 /*
                  * Save the last match bits used
                  */
                 spin_lock_irqsave(&rx->rx_peer->peer_lock, flags);
-                if(msg->ptlm_u.req.kptlrm_matchbits > rx->rx_peer->peer_last_matchbits_seen)
-                        rx->rx_peer->peer_last_matchbits_seen = msg->ptlm_u.req.kptlrm_matchbits;
+                if (msg->ptlm_u.req.kptlrm_matchbits >
+                    rx->rx_peer->peer_last_matchbits_seen)
+                        rx->rx_peer->peer_last_matchbits_seen =
+                                msg->ptlm_u.req.kptlrm_matchbits;
                 spin_unlock_irqrestore(&rx->rx_peer->peer_lock, flags);
 
                 rc = lnet_parse(kptllnd_data->kptl_ni,
-                        &msg->ptlm_u.req.kptlrm_hdr,
-                        msg->ptlm_srcnid,
-                        rx, 1);
+                                &msg->ptlm_u.req.kptlrm_hdr,
+                                msg->ptlm_srcnid,
+                                rx, 1);
 
                 /* RX Completing asynchronously */
                 if( rc >= 0)
-                        rx = 0;
+                        rx = NULL;
                 break;
          }
 
-
-        CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n",
-                type, returned_credits, peer->peer_nid);
-
-exit:
+out:
         /* PEER == NULL if it is not yet assigned or already
          * been attached to RX */
-        if(peer)
-                kptllnd_peer_decref(peer,"lookup");
+        if (peer != NULL)
+                kptllnd_peer_decref(peer, "lookup");
 
         /* RX == NULL if it is completing asynchronously */
-        if(rx)
-                kptllnd_rx_decref(rx,"sched",kptllnd_data);
+        if (rx != NULL)
+                kptllnd_rx_decref(rx, "sched", kptllnd_data);
 
-        PJK_UT_MSG_DATA("<<< RXRXRXRXRXRXRXRXRXRXRXRX rx=%p\n",rx);
+        CDEBUG(D_NET, "<<< RXRXRXRXRXRXRXRXRXRXRXRX rx=%p\n",rx);
         return;
 }
 
@@ -834,7 +810,7 @@ kptllnd_rx_buffer_addref(
          * with the real ref count, and is for informational purposes
          * only
          */
-        PJK_UT_MSG("rxb=%p owner=%s count=%d\n",rxb,owner,
+        CDEBUG(D_NET, "rxb=%p owner=%s count=%d\n",rxb,owner,
                 atomic_read(&rxb->rxb_refcount));
 #endif
 }
@@ -844,24 +820,10 @@ kptllnd_rx_buffer_decref(
         kptl_rx_buffer_t *rxb,
         const char *owner)
 {
-        if( !atomic_dec_and_test (&rxb->rxb_refcount)){
-
-#if 0
-                /*
-                 * The below message could actually be out of sync
-                 * with the real ref count, and is for informational purposes
-                 * only
-                 */
-                PJK_UT_MSG("rxb=%p owner=%s count=%d\n",rxb,owner,
-                        atomic_read(&rxb->rxb_refcount));
-#endif
+        if (!atomic_dec_and_test (&rxb->rxb_refcount))
                 return;
-        }
-
-#if 0
-        PJK_UT_MSG("rxb=%p owner=%s LAST REF reposting\n",rxb,owner);
-#endif
 
+        CDEBUG(D_NET, "rxb=%p owner=%s LAST REF reposting\n",rxb,owner);
         kptllnd_rx_buffer_post_handle_error(rxb);
 }
 
@@ -872,7 +834,6 @@ kptllnd_rx_alloc(
         kptl_rx_t* rx;
 
         if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_RX_ALLOC )){
-                PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_RX_ALLOC SIMULATION triggered\n");
                 CERROR ("FAIL_BLOCKING_RX_ALLOC SIMULATION triggered\n");
                 STAT_UPDATE(kps_rx_allocation_failed);
                 return 0;
@@ -902,18 +863,18 @@ kptllnd_rx_destroy(kptl_rx_t *rx,kptl_data_t *kptllnd_data)
         kptl_peer_t    *peer = rx->rx_peer;
         unsigned long   flags;
 
-        PJK_UT_MSG(">>> rx=%p\n",rx);
+        CDEBUG(D_NET, ">>> rx=%p\n",rx);
 
         STAT_UPDATE(kps_rx_released);
 
         LASSERT(atomic_read(&rx->rx_refcount)==0);
 
         if(rx->rx_rxb){
-                PJK_UT_MSG("Release rxb=%p\n",rx->rx_rxb);
+                CDEBUG(D_NET, "Release rxb=%p\n",rx->rx_rxb);
                 kptllnd_rx_buffer_decref(rx->rx_rxb,"rx");
                 rx->rx_rxb = 0;
         }else{
-                PJK_UT_MSG("rxb already released\n");
+                CDEBUG(D_NET, "rxb already released\n");
         }
 
         if(peer){
@@ -928,8 +889,8 @@ kptllnd_rx_destroy(kptl_rx_t *rx,kptl_data_t *kptllnd_data)
                         *kptllnd_tunables.kptl_peercredits);
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
 
-                PJK_UT_MSG("Peer=%p Credits=%d Outstanding=%d\n",
-                        peer,peer->peer_credits,peer->peer_outstanding_credits);
+                CDEBUG(D_NET, "Peer=%p Credits=%d Outstanding=%d\n",
+                       peer,peer->peer_credits,peer->peer_outstanding_credits);
 
                 /* Have I received credits that will let me send? */
                 kptllnd_peer_check_sends(peer);
@@ -939,7 +900,7 @@ kptllnd_rx_destroy(kptl_rx_t *rx,kptl_data_t *kptllnd_data)
 
         cfs_mem_cache_free(kptllnd_data->kptl_rx_cache,rx);
 
-        PJK_UT_MSG("<<< rx=%p\n",rx);
+        CDEBUG(D_NET, "<<< rx=%p\n",rx);
 }
 
 void
@@ -952,26 +913,17 @@ kptllnd_rx_addref(kptl_rx_t *rx,const char *owner)
          * with the real ref count, and is for informational purposes
          * only
          */
-        PJK_UT_MSG("rx=%p owner=%s count=%d\n",rx,owner,
-                atomic_read(&rx->rx_refcount));
+        CDEBUG(D_NET, "rx=%p owner=%s count=%d\n",rx,owner,
+               atomic_read(&rx->rx_refcount));
 }
 
 void
 kptllnd_rx_decref(kptl_rx_t *rx,const char *owner,kptl_data_t *kptllnd_data)
 {
-        if( !atomic_dec_and_test (&rx->rx_refcount)){
-                /*
-                 * The below message could actually be out of sync
-                 * with the real ref count, and is for informational purposes
-                 * only
-                 */
-                PJK_UT_MSG("rx=%p owner=%s count=%d\n",rx,owner,
-                        atomic_read(&rx->rx_refcount));
+        if (!atomic_dec_and_test (&rx->rx_refcount))
                 return;
-        }
-
-        PJK_UT_MSG("rx=%p owner=%s LAST REF destroying\n",rx,owner);
 
-        kptllnd_rx_destroy(rx,kptllnd_data);
+        CDEBUG(D_NET, "rx=%p owner=%s LAST REF destroying\n",rx,owner);
+        kptllnd_rx_destroy(rx, kptllnd_data);
 }
 
index 5c8d8c7..a0a7e86 100644 (file)
@@ -30,7 +30,7 @@ kptllnd_setup_tx_descs (kptl_data_t *kptllnd_data)
         kptl_tx_t       *tx;
         int             i;
 
-        PJK_UT_MSG("\n");
+        CDEBUG(D_NET, "\n");
 
         /*
          * First initialize the tx descriptors
@@ -54,20 +54,29 @@ kptllnd_setup_tx_descs (kptl_data_t *kptllnd_data)
                  */
                 tx->tx_state = TX_STATE_ON_IDLE_QUEUE;
 
-                LIBCFS_ALLOC( tx->tx_msg, *kptllnd_tunables.kptl_max_msg_size );
-                if(tx->tx_msg == NULL){
+                LIBCFS_ALLOC(tx->tx_msg, *kptllnd_tunables.kptl_max_msg_size);
+                if (tx->tx_msg == NULL) {
                         CERROR("Failed to allocate TX payload\n");
-                        kptllnd_cleanup_tx_descs(kptllnd_data);
+                        goto failed;
                 }
 
-
+                LIBCFS_ALLOC(tx->tx_frags, sizeof(*tx->tx_frags));
+                if (tx->tx_frags == NULL) {
+                        CERROR("Failed to allocate TX frags\n");
+                        goto failed;
+                }
+                
                 /*
                  * Add this to the queue
                  */
                 list_add (&tx->tx_list,&kptllnd_data->kptl_idle_txs);
         }
 
-        return (0);
+        return 0;
+
+ failed:
+        kptllnd_cleanup_tx_descs(kptllnd_data);
+        return -ENOMEM;
 }
 
 void
@@ -76,50 +85,43 @@ kptllnd_cleanup_tx_descs(kptl_data_t *kptllnd_data)
         kptl_tx_t       *tx;
         int             i;
 
-        PJK_UT_MSG("\n");
+        CDEBUG(D_NET, "\n");
 
         for (i = 0; i < (*kptllnd_tunables.kptl_ntx); i++) {
                 tx = &kptllnd_data->kptl_tx_descs[i];
 
-
-                /*
-                 * Handle partial initization by stopping
-                 * when we hit one that is not fully initialized
-                 */
-                if( tx->tx_msg == NULL )
-                        break;
+                if (tx->tx_msg != NULL)
+                        LIBCFS_FREE(tx->tx_msg, 
+                                    *kptllnd_tunables.kptl_max_msg_size);
+                        
+                if (tx->tx_frags != NULL)
+                        LIBCFS_FREE(tx->tx_frags, sizeof(*tx->tx_frags));
 
                 LASSERT( tx->tx_state == TX_STATE_ON_IDLE_QUEUE );
-
-                LIBCFS_FREE(tx->tx_msg,*kptllnd_tunables.kptl_max_msg_size);
         }
 }
 
 kptl_tx_t *
-kptllnd_get_idle_tx(
-        kptl_data_t *kptllnd_data,
-        kptl_tx_type_t purpose)
+kptllnd_get_idle_tx(kptl_data_t *kptllnd_data,
+                    enum kptl_tx_type purpose)
 {
         kptl_tx_t      *tx = NULL;
 
-        PJK_UT_MSG(">>> purpose=%d\n",purpose);
+        CDEBUG(D_NET, ">>> purpose=%d\n",purpose);
 
         if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX_PUT_ALLOC ) && purpose == TX_TYPE_LARGE_PUT){
-                PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX_PUT_ALLOC SIMULATION triggered\n");
                 CERROR ("FAIL_BLOCKING_TX_PUT_ALLOC SIMULATION triggered\n");
                 tx = NULL;
                 STAT_UPDATE(kps_tx_allocation_failed);
                 goto exit;
         }
         if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX_GET_ALLOC ) && purpose == TX_TYPE_LARGE_GET){
-                PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX_GET_ALLOC SIMULATION triggered\n");
                 CERROR ("FAIL_BLOCKING_TX_GET_ALLOC SIMULATION triggered\n");
                 tx = NULL;
                 STAT_UPDATE(kps_tx_allocation_failed);
                 goto exit;
         }
         if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX )){
-                PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX SIMULATION triggered\n");
                 CERROR ("FAIL_BLOCKING_TX SIMULATION triggered\n");
                 tx = NULL;
                 STAT_UPDATE(kps_tx_allocation_failed);
@@ -190,7 +192,7 @@ kptllnd_get_idle_tx(
 
 
 exit:
-        PJK_UT_MSG("<<< tx=%p\n",tx);
+        CDEBUG(D_NET, "<<< tx=%p\n",tx);
         return tx;
 }
 
@@ -203,7 +205,7 @@ kptllnd_tx_done (kptl_tx_t *tx)
 
         LASSERT (!in_interrupt());
 
-        PJK_UT_MSG(">>> tx=%p\n",tx);
+        CDEBUG(D_NET, ">>> tx=%p\n",tx);
 
         LASSERT(tx->tx_state != TX_STATE_ON_IDLE_QUEUE);
         LASSERT(PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE));
@@ -219,7 +221,7 @@ kptllnd_tx_done (kptl_tx_t *tx)
          * Release the associated RX if there is one
          */
         if(tx->tx_associated_rx){
-                PJK_UT_MSG("tx=%p destroy associated rx %p\n",tx,tx->tx_associated_rx);
+                CDEBUG(D_NET, "tx=%p destroy associated rx %p\n",tx,tx->tx_associated_rx);
                 kptllnd_rx_decref(tx->tx_associated_rx,"tx",kptllnd_data);
                 tx->tx_associated_rx = NULL;
         }
@@ -228,7 +230,7 @@ kptllnd_tx_done (kptl_tx_t *tx)
          * Cleanup resources associate with the peer
          */
         if(tx->tx_peer){
-                PJK_UT_MSG("tx=%p detach from peer=%p\n",tx,tx->tx_peer);
+                CDEBUG(D_NET, "tx=%p detach from peer=%p\n",tx,tx->tx_peer);
                 kptllnd_peer_dequeue_tx(tx->tx_peer,tx);
                 kptllnd_peer_decref(tx->tx_peer,"tx");
                 tx->tx_peer = NULL;
@@ -255,7 +257,7 @@ kptllnd_tx_done (kptl_tx_t *tx)
         if (lnetmsg[1] != NULL)
                 lnet_finalize(kptllnd_data->kptl_ni, lnetmsg[1], status);
 
-        PJK_UT_MSG("<<< tx=%p\n",tx);
+        CDEBUG(D_NET, "<<< tx=%p\n",tx);
 }
 
 void
@@ -264,7 +266,7 @@ kptllnd_tx_schedule (kptl_tx_t *tx)
         kptl_data_t *kptllnd_data = tx->tx_po.po_kptllnd_data;
         unsigned long    flags;
 
-        PJK_UT_MSG("tx=%p\n",tx);
+        CDEBUG(D_NET, "tx=%p\n",tx);
 
         spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
         LASSERT(list_empty(&tx->tx_schedlist));
@@ -282,13 +284,13 @@ kptllnd_tx_callback(ptl_event_t *ev)
         int              do_decref = 0;
         unsigned long    flags;
 
-        PJK_UT_MSG(">>> %s(%d) tx=%p fail=%d\n",
+        CDEBUG(D_NET, ">>> %s(%d) tx=%p fail=%d\n",
                 get_ev_type_string(ev->type),ev->type,tx,ev->ni_fail_type);
 
         STAT_UPDATE(kps_tx_event);
 
 #ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
-        PJK_UT_MSG("ev->unlinked=%d\n",ev->unlinked);
+        CDEBUG(D_NET, "ev->unlinked=%d\n",ev->unlinked);
         if(ev->unlinked)
                 STAT_UPDATE(kps_tx_unlink_event);
 #endif
@@ -303,7 +305,7 @@ kptllnd_tx_callback(ptl_event_t *ev)
                  * event's and we've already cleaned up in
                  * those cases.
                  */
-                PJK_UT_MSG("<<<\n");
+                CDEBUG(D_NET, "<<<\n");
                 return;
 #else
                 /*
@@ -316,7 +318,7 @@ kptllnd_tx_callback(ptl_event_t *ev)
 
                 tx->tx_status = -EINVAL;
                 kptllnd_tx_scheduled_decref(tx);
-                PJK_UT_MSG("<<<\n");
+                CDEBUG(D_NET, "<<<\n");
                 return;
 #endif
         }
@@ -347,7 +349,7 @@ kptllnd_tx_callback(ptl_event_t *ev)
                         break;
 
                 case TX_TYPE_SMALL_MESSAGE:
-                        PJK_UT_MSG("TX_TYPE_SMALL_MESSAGE\n");
+                        CDEBUG(D_NET, "TX_TYPE_SMALL_MESSAGE\n");
                         LASSERT(PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE));
 
                         /*
@@ -359,7 +361,7 @@ kptllnd_tx_callback(ptl_event_t *ev)
 
                 case TX_TYPE_LARGE_PUT:
                 case TX_TYPE_LARGE_GET:
-                        PJK_UT_MSG("TX_TYPE_LARGE_%s\n",
+                        CDEBUG(D_NET, "TX_TYPE_LARGE_%s\n",
                                 tx->tx_type == TX_TYPE_LARGE_PUT ?
                                 "PUT" : "GET");
                         /*
@@ -399,7 +401,7 @@ kptllnd_tx_callback(ptl_event_t *ev)
                         break;
 
                 case TX_TYPE_LARGE_PUT_RESPONSE:
-                        PJK_UT_MSG("TX_TYPE_LARGE_PUT_RESPONSE\n");
+                        CDEBUG(D_NET, "TX_TYPE_LARGE_PUT_RESPONSE\n");
                         LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
 
                         /*
@@ -414,7 +416,7 @@ kptllnd_tx_callback(ptl_event_t *ev)
                         break;
 
                 case TX_TYPE_LARGE_GET_RESPONSE:
-                        PJK_UT_MSG("TX_TYPE_LARGE_GET_RESPONSE\n");
+                        CDEBUG(D_NET, "TX_TYPE_LARGE_GET_RESPONSE\n");
                         LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
 
                         /*
@@ -452,7 +454,7 @@ kptllnd_tx_callback(ptl_event_t *ev)
 
         if(do_decref)
                 kptllnd_tx_scheduled_decref(tx);
-        PJK_UT_MSG("<<< decref=%d\n",do_decref);
+        CDEBUG(D_NET, "<<< decref=%d\n",do_decref);
 }
 
 void
@@ -470,7 +472,7 @@ kptllnd_tx_decref(
                 return;
         }
 
-        PJK_UT_MSG("tx=%p LAST REF\n",tx);
+        CDEBUG(D_NET, "tx=%p LAST REF\n",tx);
         kptllnd_tx_done(tx);
 }
 
@@ -484,11 +486,11 @@ kptllnd_tx_scheduled_decref(
                  * with the real ref count, and is for informational purposes
                  * only
                  */
-                PJK_UT_MSG("tx=%p count=%d\n",tx,
+                CDEBUG(D_NET, "tx=%p count=%d\n",tx,
                         atomic_read(&tx->tx_refcount));
                 return;
         }
 
-        PJK_UT_MSG("tx=%p LAST REF\n",tx);
+        CDEBUG(D_NET, "tx=%p LAST REF\n",tx);
         kptllnd_tx_schedule(tx);
 }
index 11be93d..d45724a 100644 (file)
@@ -5,11 +5,11 @@
        <key>CFBundleDevelopmentRegion</key>
        <string>English</string>
        <key>CFBundleExecutable</key>
-       <string>ksocknal</string>
+       <string>ksocklnd</string>
        <key>CFBundleIconFile</key>
        <string></string>
        <key>CFBundleIdentifier</key>
-       <string>com.clusterfs.lustre.ksocknal</string>
+       <string>com.clusterfs.lustre.ksocklnd</string>
        <key>CFBundleInfoDictionaryVersion</key>
        <string>6.0</string>
        <key>CFBundlePackageType</key>
@@ -30,7 +30,7 @@
                <string>1.0.0b1</string> 
                <key>com.clusterfs.lustre.libcfs</key> 
                <string>1.0.0</string>
-               <key>com.clusterfs.lustre.portals</key> 
+               <key>com.clusterfs.lustre.lnet</key> 
                <string>1.0.0</string>
        </dict>
 </dict>
index 71cc574..4ee2f37 100644 (file)
@@ -943,7 +943,7 @@ ksocknal_accept (lnet_ni_t *ni, struct socket *sock)
         spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags);
 
         list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
-        wake_up(&ksocknal_data.ksnd_connd_waitq);
+        cfs_waitq_signal(&ksocknal_data.ksnd_connd_waitq);
                         
         spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock, flags);
         return 0;
index 7e46090..3b9ec30 100644 (file)
@@ -52,7 +52,7 @@
 #define SOCKNAL_TYPED_CONNS      1              /* unidirectional large, bidirectional small? */
 #define SOCKNAL_ZC_MIN_FRAG     (2<<10)         /* default smallest zerocopy fragment */
 #define SOCKNAL_MIN_BULK        (1<<10)         /* smallest "large" message */
-#define SOCKNAL_BUFFER_SIZE     (8<<20)         /* default socket buffer size */
+#define SOCKNAL_BUFFER_SIZE      SOCK_BUFFER_SIZE /* default socket buffer size */
 #define SOCKNAL_NAGLE            0              /* enable/disable NAGLE? */
 #define SOCKNAL_IRQ_AFFINITY     1              /* enable/disable IRQ affinity? */
 #define SOCKNAL_KEEPALIVE_IDLE   35             /* # seconds idle before 1st probe */
index f6539fb..68d068d 100644 (file)
@@ -1145,8 +1145,8 @@ int ksocknal_scheduler (void *arg)
         char               name[16];
 
         snprintf (name, sizeof (name),"socknal_sd%02d", id);
-        libcfs_daemonize (name);
-        libcfs_blockallsigs ();
+        cfs_daemonize (name);
+        cfs_block_allsigs ();
 
 #if (CONFIG_SMP && CPU_AFFINITY)
         id = ksocknal_sched2cpu(id);
@@ -1753,8 +1753,8 @@ ksocknal_connd (void *arg)
         int                did_something;
 
         snprintf (name, sizeof (name), "socknal_cd%02ld", id);
-        libcfs_daemonize (name);
-        libcfs_blockallsigs ();
+        cfs_daemonize (name);
+        cfs_block_allsigs ();
 
         spin_lock_irqsave (&ksocknal_data.ksnd_connd_lock, flags);
 
@@ -1954,8 +1954,8 @@ ksocknal_reaper (void *arg)
         int                peer_index = 0;
         cfs_time_t         deadline = cfs_time_current();
 
-        libcfs_daemonize ("socknal_reaper");
-        libcfs_blockallsigs ();
+        cfs_daemonize ("socknal_reaper");
+        cfs_block_allsigs ();
 
         CFS_INIT_LIST_HEAD(&enomem_conns);
         cfs_waitlink_init (&wait);
@@ -2062,7 +2062,7 @@ ksocknal_reaper (void *arg)
                 if (!ksocknal_data.ksnd_shuttingdown &&
                     list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
                     list_empty (&ksocknal_data.ksnd_zombie_conns))
-                        cfs_waitq_timedwait (&wait, timeout);
+                        cfs_waitq_timedwait (&wait, CFS_TASK_INTERRUPTIBLE, timeout);
 
                 set_current_state (TASK_RUNNING);
                 cfs_waitq_del (&ksocknal_data.ksnd_reaper_waitq, &wait);
index 22257dc..868cd5e 100644 (file)
 
 #include "socklnd.h"
 
-#if 0
-#undef SOCKNAL_SINGLE_FRAG_TX
-#define SOCKNAL_SINGLE_FRAG_TX  1
-#undef SOCKNAL_SINGLE_FRAG_RX
-#define SOCKNAL_SINGLE_FRAG_RX  1
-#endif
-
-#if !CFS_SYSFS_MODULE_PARM
-#error "this can't use ksocknal_tunables to get the addresses of the tuning vars"
+# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
 
-SYSCTL_DECL(_portals);
+SYSCTL_DECL(_lnet);
 
-SYSCTL_NODE (_portals,           OID_AUTO,       ksocknal,        CTLFLAG_RW, 
-             0,                 "ksocknal_sysctl");
+SYSCTL_NODE (_lnet,           OID_AUTO,         ksocknal,        CTLFLAG_RW, 
+             0,                                 "ksocknal_sysctl");
 
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       timeout, 
-           CTLTYPE_INT | CTLFLAG_RW ,            ksocknal_tunables.ksnd_timeout, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         timeout, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_timeout, 
            0,                                   "timeout");
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       eager_ack, 
-           CTLTYPE_INT | CTLFLAG_RW ,            ksocknal_tunables.ksnd_eager_ack, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         credits, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_credits, 
+           0,                                   "credits");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         peer_credits, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_peercredits, 
+           0,                                   "peer_credits");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         nconnds, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_nconnds, 
+           0,                                   "nconnds");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         min_reconnectms, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_min_reconnectms, 
+           0,                                   "min_reconnectms");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         max_reconnectms, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_max_reconnectms, 
+           0,                                   "max_reconnectms");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         eager_ack, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_eager_ack, 
            0,                                   "eager_ack");
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       typed, 
-           CTLTYPE_INT | CTLFLAG_RW ,            ksocknal_tunables.ksnd_typed_conns, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         typed, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_typed_conns, 
            0,                                   "typed");
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       min_bulk, 
-           CTLTYPE_INT | CTLFLAG_RW ,            ksocknal_tunables.ksnd_min_bulk, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         min_bulk, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_min_bulk, 
            0,                                   "min_bulk");
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       buffer_size, 
-           CTLTYPE_INT | CTLFLAG_RW ,            ksocknal_tunables.ksnd_buffer_size, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         buffer_size, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_buffer_size, 
            0,                                   "buffer_size");
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       nagle, 
-           CTLTYPE_INT | CTLFLAG_RW ,            ksocknal_tunables.ksnd_nagle, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         nagle, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_nagle, 
            0,                                   "nagle");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         keepalive_idle, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_keepalive_idle, 
+           0,                                   "keepalive_idle");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         keepalive_count, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_keepalive_count, 
+           0,                                   "keepalive_count");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         keepalive_intvl, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_keepalive_intvl, 
+           0,                                   "keepalive_intvl");
 
 cfs_sysctl_table_t      ksocknal_top_ctl_table [] = {
-        &sysctl__portals_ksocknal,
-        &sysctl__portals_ksocknal_timeout,
-        &sysctl__portals_ksocknal_eager_ack,
-        &sysctl__portals_ksocknal_typed,
-        &sysctl__portals_ksocknal_min_bulk,
-        &sysctl__portals_ksocknal_buffer_size,
-        &sysctl__portals_ksocknal_nagle,
+        &sysctl__lnet_ksocknal,
+        &sysctl__lnet_ksocknal_timeout,
+        &sysctl__lnet_ksocknal_credits,
+        &sysctl__lnet_ksocknal_peer_credits,
+        &sysctl__lnet_ksocknal_nconnds,
+        &sysctl__lnet_ksocknal_min_reconnectms,
+        &sysctl__lnet_ksocknal_max_reconnectms,
+        &sysctl__lnet_ksocknal_eager_ack,
+        &sysctl__lnet_ksocknal_typed,
+        &sysctl__lnet_ksocknal_min_bulk,
+        &sysctl__lnet_ksocknal_buffer_size,
+        &sysctl__lnet_ksocknal_nagle,
+        &sysctl__lnet_ksocknal_keepalive_idle,
+        &sysctl__lnet_ksocknal_keepalive_count,
+        &sysctl__lnet_ksocknal_keepalive_intvl,
         NULL
 };
 
@@ -79,7 +103,7 @@ int
 ksocknal_lib_tunables_init ()
 {
         ksocknal_tunables.ksnd_sysctl =
-                register_sysctl_table (ksocknal_top_ctl_table, 0);
+                cfs_register_sysctl_table (ksocknal_top_ctl_table, 0);
 
         if (ksocknal_tunables.ksnd_sysctl == NULL)
                return -ENOMEM;
@@ -87,11 +111,11 @@ ksocknal_lib_tunables_init ()
        return 0;
 }
 
-int
+void
 ksocknal_lib_tunables_fini ()
 {
         if (ksocknal_tunables.ksnd_sysctl != NULL)
-                unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl);       
+                cfs_unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl);   
 }
 #else
 int
@@ -100,25 +124,22 @@ ksocknal_lib_tunables_init ()
        return 0;
 }
 
-int
+void
 ksocknal_lib_tunables_fini ()
 {
 }
 #endif
 
-static unsigned long  ksocknal_mbuf_size = (u_quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES);
-
-extern struct fileops socketops;
-
-void
-ksocknal_lib_release_sock(struct socket *so)
-{
-        CFS_DECL_FUNNEL_DATA;
-
-        CFS_NET_IN;
-       soshutdown(so, 0);
-        CFS_NET_EX;
-}
+/*
+ * To use bigger buffer for socket:
+ * 1. Increase nmbclusters (cannot be increased by sysctl because it's read-only, so
+ *    we must patch the kernel).
+ * 2. Increase net.inet.tcp.reass.maxsegments
+ * 3. Increase net.inet.tcp.sendspace
+ * 4. Increase net.inet.tcp.recvspace
+ * 5. Increase kern.ipc.maxsockbuf
+ */
+#define KSOCKNAL_MAX_BUF        (1152*1024)
 
 void
 ksocknal_lib_bind_irq (unsigned int irq)
@@ -127,7 +148,7 @@ ksocknal_lib_bind_irq (unsigned int irq)
 }
 
 unsigned int
-ksocknal_lib_sock_irq (struct socket *sock)
+ksocknal_lib_sock_irq (cfs_socket_t *sock)
 {
         return 0;
 }
@@ -135,36 +156,395 @@ ksocknal_lib_sock_irq (struct socket *sock)
 int
 ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
 { 
-        struct sockaddr_in *sin; 
-        struct sockaddr    *sa; 
-        int                rc; 
-        CFS_DECL_NET_DATA;
+        int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+                                     &conn->ksnc_ipaddr,
+                                     &conn->ksnc_port);
 
-        CFS_NET_IN; 
-        rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_peeraddr(conn->ksnc_sock, &sa); 
-        LASSERT (!conn->ksnc_closing); 
-        if (rc != 0) { 
-                CFS_NET_EX; 
-                if (sa) FREE(sa, M_SONAME); 
-                CERROR ("Error %d getting sock peer IP\n", rc); 
-                return rc; 
+        /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+        LASSERT (!conn->ksnc_closing);
+
+        if (rc != 0) {
+                CERROR ("Error %d getting sock peer IP\n", rc);
+                return rc;
+        }
+
+        rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+                                 &conn->ksnc_myipaddr, NULL);
+        if (rc != 0) {
+                CERROR ("Error %d getting sock local IP\n", rc);
+                return rc;
+        }
+
+        return 0;
+}
+
+static int
+ksocknal_lib_buffersize (int current_sz, int tunable_sz)
+{
+        /* ensure >= SOCKNAL_MIN_BUFFER */
+        if (current_sz < SOCKNAL_MIN_BUFFER)
+                return MAX(SOCKNAL_MIN_BUFFER, tunable_sz);
+
+        if (tunable_sz > SOCKNAL_MIN_BUFFER)
+                return tunable_sz;
+
+        /* leave alone */
+        return 0;
+}
+
+#ifdef __DARWIN8__
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        socket_t        sock = C2B_SOCK(conn->ksnc_sock);
+        size_t          sndlen;
+        int             nob;
+        int             rc;
+
+#if SOCKNAL_SINGLE_FRAG_TX
+        struct iovec    scratch;
+        struct iovec   *scratchiov = &scratch;
+        unsigned int    niov = 1;
+#else
+        struct iovec   *scratchiov = conn->ksnc_tx_scratch_iov;
+        unsigned int    niov = tx->tx_niov;
+#endif
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = scratchiov,
+                .msg_iovlen     = niov,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = MSG_DONTWAIT
+        };
+        
+        int  i;
+        
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i] = tx->tx_iov[i];
+                nob += scratchiov[i].iov_len;
         } 
-        sin = (struct sockaddr_in *)sa; 
-        conn->ksnc_ipaddr = ntohl (sin->sin_addr.s_addr); 
-        conn->ksnc_port = ntohs (sin->sin_port); 
-        if (sa) FREE(sa, M_SONAME); 
-        rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_sockaddr(conn->ksnc_sock, &sa); 
-        CFS_NET_EX; 
-        if (rc != 0) { 
-                if (sa) FREE(sa, M_SONAME); 
-                CERROR ("Error %d getting sock local IP\n", rc); 
-                return rc; 
+        
+        /* 
+         * XXX Liang:
+         * Linux has MSG_MORE; do we have anything to
+         * reduce the number of partial TCP segments sent?
+         */
+        rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen);
+        if (rc == 0)
+                rc = sndlen;
+        return rc;
+}
+
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        socket_t       sock = C2B_SOCK(conn->ksnc_sock);
+        lnet_kiov_t   *kiov = tx->tx_kiov;
+        int            rc;
+        int            nob;
+        size_t         sndlen;
+
+#if SOCKNAL_SINGLE_FRAG_TX
+        struct iovec  scratch;
+        struct iovec *scratchiov = &scratch;
+        unsigned int  niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+        unsigned int  niov = tx->tx_nkiov;
+#endif
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = scratchiov,
+                .msg_iovlen     = niov,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = MSG_DONTWAIT
+        };
+        
+        int           i;
+        
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) +
+                                         kiov[i].kiov_offset;
+                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+        }
+
+        /* 
+         * XXX Liang:
+         * Linux has MSG_MORE; do we have anything to
+         * reduce the number of partial TCP segments sent?
+         */
+        rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen);
+        for (i = 0; i < niov; i++)
+                cfs_kunmap(kiov[i].kiov_page);
+        if (rc == 0)
+                rc = sndlen;
+        return rc;
+}
+
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+        struct iovec  scratch;
+        struct iovec *scratchiov = &scratch;
+        unsigned int  niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        unsigned int  niov = conn->ksnc_rx_niov;
+#endif
+        struct iovec *iov = conn->ksnc_rx_iov;
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = scratchiov,
+                .msg_iovlen     = niov,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = 0
+        };
+        size_t       rcvlen;
+        int          nob;
+        int          i;
+        int          rc;
+
+        LASSERT (niov > 0);
+
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i] = iov[i];
+                nob += scratchiov[i].iov_len;
+        }
+        LASSERT (nob <= conn->ksnc_rx_nob_wanted); 
+        rc = -sock_receive (C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen);
+        if (rc == 0)
+                rc = rcvlen;
+
+        return rc;
+}
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+        struct iovec  scratch;
+        struct iovec *scratchiov = &scratch;
+        unsigned int  niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        unsigned int  niov = conn->ksnc_rx_nkiov;
+#endif
+        lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = scratchiov,
+                .msg_iovlen     = niov,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = 0
+        };
+        int          nob;
+        int          i;
+        size_t       rcvlen;
+        int          rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone. */
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + \
+                                         kiov[i].kiov_offset;
+                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+        }
+        LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+        rc = sock_receive(C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen); 
+        for (i = 0; i < niov; i++)
+                cfs_kunmap(kiov[i].kiov_page); 
+        if (rc == 0)
+                rc = rcvlen;
+        return (rc);
+}
+
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+        /* XXX Liang: */
+}
+
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+        socket_t       sock = C2B_SOCK(conn->ksnc_sock);
+        int            len;
+        int            rc;
+
+        rc = ksocknal_connsock_addref(conn);
+        if (rc != 0) {
+                LASSERT (conn->ksnc_closing);
+                *txmem = *rxmem = *nagle = 0;
+                return (-ESHUTDOWN);
+        }
+        rc = libcfs_sock_getbuf(B2C_SOCK(sock), txmem, rxmem);
+        if (rc == 0) {
+                len = sizeof(*nagle);
+                rc = -sock_getsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+                                      nagle, &len);
+        }
+        ksocknal_connsock_decref(conn);
+
+        if (rc == 0)
+                *nagle = !*nagle;
+        else
+                *txmem = *rxmem = *nagle = 0;
+
+        return (rc);
+}
+
+int
+ksocknal_lib_setup_sock (cfs_socket_t *sock)
+{
+        int             rc; 
+        int             option; 
+        int             sndbuf;
+        int             rcvbuf;
+        int             keep_idle; 
+        int             keep_intvl; 
+        int             keep_count; 
+        int             do_keepalive; 
+        socket_t        so = C2B_SOCK(sock);
+        struct linger   linger;
+
+        /* Ensure this socket aborts active sends immediately when we close
+         * it. */
+        linger.l_onoff = 0;
+        linger.l_linger = 0;
+        rc = -sock_setsockopt(so, SOL_SOCKET, SO_LINGER, &linger, sizeof(linger));
+        if (rc != 0) {
+                CERROR ("Can't set SO_LINGER: %d\n", rc);
+                return (rc);
+        }
+
+        if (!*ksocknal_tunables.ksnd_nagle) { 
+                option = 1; 
+                rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &option, sizeof(option));
+                if (rc != 0) { 
+                        CERROR ("Can't disable nagle: %d\n", rc); 
+                        return (rc);
+                } 
         } 
-        conn->ksnc_myipaddr = ntohl (sin->sin_addr.s_addr);
 
-        return 0;
+        rc = libcfs_sock_getbuf(sock, &sndbuf, &rcvbuf);
+        if (rc != 0) {
+                CERROR("Can't get buffer sizes: %d\n", rc);
+                return (rc);
+        }
+
+        sndbuf = ksocknal_lib_buffersize(sndbuf,
+                                         *ksocknal_tunables.ksnd_buffer_size);
+        rcvbuf = ksocknal_lib_buffersize(rcvbuf,
+                                         *ksocknal_tunables.ksnd_buffer_size);
+        rc = libcfs_sock_setbuf(sock, sndbuf, rcvbuf);
+        if (rc != 0) {
+                CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+                        sndbuf, rcvbuf, rc);
+                return (rc);
+        }
+
+        /* snapshot tunables */ 
+        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle; 
+        keep_count = *ksocknal_tunables.ksnd_keepalive_count; 
+        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); 
+        option = (do_keepalive ? 1 : 0); 
+
+        rc = -sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &option, sizeof(option)); 
+        if (rc != 0) { 
+                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); 
+                return (rc);
+        }
+        
+        if (!do_keepalive)
+                return (rc);
+        rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_KEEPALIVE, 
+                              &keep_idle, sizeof(keep_idle));
+        
+        return (rc);
+}
+
+void
+ksocknal_lib_push_conn(ksock_conn_t *conn)
+{ 
+        socket_t        sock; 
+        int             val = 1; 
+        int             rc; 
+        
+        rc = ksocknal_connsock_addref(conn); 
+        if (rc != 0)            /* being shut down */ 
+                return; 
+        sock = C2B_SOCK(conn->ksnc_sock); 
+
+        rc = -sock_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)); 
+        LASSERT(rc == 0);
+
+        ksocknal_connsock_decref(conn);
+        return;
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+
+static void
+ksocknal_upcall(socket_t so, void *arg, int waitf)
+{
+        ksock_conn_t  *conn = (ksock_conn_t *)arg;
+        ENTRY;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+        if (conn == NULL)
+                goto out;
+
+        ksocknal_read_callback (conn);
+out:
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+        EXIT;
+}
+
+void
+ksocknal_lib_save_callback(cfs_socket_t *sock, ksock_conn_t *conn)
+{ 
+        /* No callback need to save in osx */
+        return;
+}
+
+void
+ksocknal_lib_set_callback(cfs_socket_t *sock, ksock_conn_t *conn)
+{ 
+        sock->s_upcallarg = (void *)conn;
+        sock->s_upcall = ksocknal_upcall; 
+        sock->s_flags |= CFS_SOCK_UPCALL; 
+        return;
+}
+
+void
+ksocknal_lib_act_callback(cfs_socket_t *sock, ksock_conn_t *conn)
+{
+        ksocknal_upcall (C2B_SOCK(sock), (void *)conn, 0);
 }
 
+void 
+ksocknal_lib_reset_callback(cfs_socket_t *sock, ksock_conn_t *conn)
+{ 
+        sock->s_flags &= ~CFS_SOCK_UPCALL; 
+        sock->s_upcall = NULL; 
+        sock->s_upcallarg = NULL; 
+}
+
+#else /* !__DARWIN8__ */
+
 int
 ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
 { 
@@ -343,6 +723,10 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn)
         CFS_NET_IN;
         s = splnet();
 
+        /*
+         * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo
+         * to send immediate ACK. 
+         */
         if (tp && tp->t_flags & TF_DELACK){
                 tp->t_flags &= ~TF_DELACK;
                 tp->t_flags |= TF_ACKNOW;
@@ -350,14 +734,6 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn)
         }
         splx(s);
 
-        /*
-         * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo
-         * to send immediate ACK. It's not the best resolution because
-         * tcp_fasttimo will send out ACK for all delayed-ack tcp socket.
-         * Anyway, it's working now. 
-         * extern void tcp_fasttimo(); 
-         * tcp_fasttimo();
-         */
         CFS_NET_EX;
 
         return;
@@ -476,129 +852,35 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
 }
 
 int
-ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        CFS_DECL_NET_DATA;
-
-        while (nob > 0) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct  uio suio = {
-                        .uio_iov        = &iov,
-                        .uio_iovcnt     = 1,
-                        .uio_offset     = 0,
-                        .uio_resid      = nob,
-                        .uio_segflg     = UIO_SYSSPACE,
-                        .uio_rw         = UIO_WRITE,
-                        .uio_procp      = NULL
-                };
-
-                CFS_NET_IN;
-                rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, 0);
-                CFS_NET_EX;
-
-                if (rc != 0) {
-                        if ( suio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\
-                                                rc == EWOULDBLOCK))
-                                rc = 0;
-                        if ( rc != 0 )
-                                return -rc;
-                        rc = nob - suio.uio_resid;
-                        buffer = ((char *)buffer) + rc;
-                        nob = suio.uio_resid;
-                        continue;
-                }
-                break;
-        }
-
-        return (0);
-}
-
-int
-ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        CFS_DECL_NET_DATA;
-
-        while (nob > 0) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct uio  ruio = {
-                        .uio_iov        = &iov,
-                        .uio_iovcnt     = 1,
-                        .uio_offset     = 0,
-                        .uio_resid      = nob,
-                        .uio_segflg     = UIO_SYSSPACE,
-                        .uio_rw         = UIO_READ,
-                        .uio_procp      = NULL
-                };
-
-                CFS_NET_IN;
-                rc = soreceive(sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, (int *)0);
-                CFS_NET_EX;
-
-                if (rc != 0) {
-                        if ( ruio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\
-                                                rc == EWOULDBLOCK))
-                                rc = 0;
-                        if (rc != 0)
-                                return -rc;
-                        rc = nob - ruio.uio_resid;
-                        buffer = ((char *)buffer) + rc;
-                        nob = ruio.uio_resid;
-                        continue;
-                }
-                break;
-        }
-
-        return (0);
-}
-
-int
 ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
 {
-        struct sockopt  sopt;
         struct socket *sock = conn->ksnc_sock;
-        int            len;
         int            rc;
-        CFS_DECL_NET_DATA;
 
         rc = ksocknal_connsock_addref(conn);
         if (rc != 0) {
                 LASSERT (conn->ksnc_closing);
                 *txmem = *rxmem = *nagle = 0;
-                rc = -ESHUTDOWN;
-                goto out;
-        }
-        len = sizeof(*txmem);
-        bzero(&sopt, sizeof sopt);
-        sopt.sopt_dir = SOPT_GET; 
-        sopt.sopt_level = SOL_SOCKET; 
-        sopt.sopt_name = SO_SNDBUF; 
-        sopt.sopt_val = txmem; 
-        sopt.sopt_valsize = len;
-
-        CFS_NET_IN;
-        rc = sogetopt(sock, &sopt);
-        if (rc == 0) {
-                len = sizeof(*rxmem);
-                sopt.sopt_name = SO_RCVBUF;
-                sopt.sopt_val = rxmem;
-                rc = sogetopt(sock, &sopt);
+                return -ESHUTDOWN;
         }
+        rc = libcfs_sock_getbuf(sock, txmem, rxmem);
         if (rc == 0) {
+                struct sockopt  sopt;
+                int            len;
+                CFS_DECL_NET_DATA;
+
                 len = sizeof(*nagle);
+                bzero(&sopt, sizeof sopt);
+                sopt.sopt_dir = SOPT_GET; 
                 sopt.sopt_level = IPPROTO_TCP;
                 sopt.sopt_name = TCP_NODELAY;
                 sopt.sopt_val = nagle;
-                rc = sogetopt(sock, &sopt);
+                sopt.sopt_valsize = len;
+
+                CFS_NET_IN;
+                rc = -sogetopt(sock, &sopt);
+                CFS_NET_EX;
         }
-        CFS_NET_EX;
 
         ksocknal_connsock_decref(conn);
 
@@ -606,8 +888,7 @@ ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int
                 *nagle = !*nagle;
         else
                 *txmem = *rxmem = *nagle = 0;
-out:
-        return (-rc);
+        return (rc);
 }
 
 int
@@ -616,6 +897,8 @@ ksocknal_lib_setup_sock (struct socket *so)
         struct sockopt  sopt;
         int             rc; 
         int             option; 
+        int             sndbuf;
+        int             rcvbuf;
         int             keep_idle; 
         int             keep_intvl; 
         int             keep_count; 
@@ -623,9 +906,25 @@ ksocknal_lib_setup_sock (struct socket *so)
         struct linger   linger;
         CFS_DECL_NET_DATA;
 
+        rc = libcfs_sock_getbuf(so, &sndbuf, &rcvbuf);
+        if (rc != 0) {
+                CERROR("Can't get buffer sizes: %d\n", rc);
+                return rc;
+        }
+
+        sndbuf = ksocknal_lib_buffersize(sndbuf,
+                                         *ksocknal_tunables.ksnd_buffer_size);
+        rcvbuf = ksocknal_lib_buffersize(rcvbuf,
+                                         *ksocknal_tunables.ksnd_buffer_size);
+        rc = libcfs_sock_setbuf(so, sndbuf, rcvbuf);
+        if (rc != 0) {
+                CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+                        sndbuf, rcvbuf, rc);
+                return (rc);
+        }
+
         /* Ensure this socket aborts active sends immediately when we close
          * it. */
-
         bzero(&sopt, sizeof sopt);
 
         linger.l_onoff = 0;
@@ -637,13 +936,12 @@ ksocknal_lib_setup_sock (struct socket *so)
         sopt.sopt_valsize = sizeof(linger);
 
         CFS_NET_IN;
-        rc = sosetopt(so, &sopt);
+        rc = -sosetopt(so, &sopt);
         if (rc != 0) {
                 CERROR ("Can't set SO_LINGER: %d\n", rc);
                 goto out;
         }
 
-
         if (!*ksocknal_tunables.ksnd_nagle) { 
                 option = 1; 
                 bzero(&sopt, sizeof sopt);
@@ -652,37 +950,13 @@ ksocknal_lib_setup_sock (struct socket *so)
                 sopt.sopt_name = TCP_NODELAY; 
                 sopt.sopt_val = &option; 
                 sopt.sopt_valsize = sizeof(option);
-                rc = sosetopt(so, &sopt);
+                rc = -sosetopt(so, &sopt);
                 if (rc != 0) { 
                         CERROR ("Can't disable nagle: %d\n", rc); 
                         goto out;
                 } 
         } 
-        if (*ksocknal_tunables.ksnd_buffer_size > 0) { 
-                option = *ksocknal_tunables.ksnd_buffer_size; 
-                if (option > ksocknal_mbuf_size) 
-                        option = ksocknal_mbuf_size; 
-                                                
-                sopt.sopt_dir = SOPT_SET; 
-                sopt.sopt_level = SOL_SOCKET; 
-                sopt.sopt_name = SO_SNDBUF; 
-                sopt.sopt_val = &option; 
-                sopt.sopt_valsize = sizeof(option); 
-                rc = sosetopt(so, &sopt); 
-                if (rc != 0) { 
-                        CERROR ("Can't set send buffer %d: %d\n", 
-                                        option, rc); 
-                        goto out;
-                } 
-                
-                sopt.sopt_name = SO_RCVBUF; 
-                rc = sosetopt(so, &sopt); 
-                if (rc != 0) { 
-                        CERROR ("Can't set receive buffer %d: %d\n", 
-                                        option, rc); 
-                        goto out;
-                }
-        } 
+
         /* snapshot tunables */ 
         keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle; 
         keep_count = *ksocknal_tunables.ksnd_keepalive_count; 
@@ -696,7 +970,7 @@ ksocknal_lib_setup_sock (struct socket *so)
         sopt.sopt_name = SO_KEEPALIVE; 
         sopt.sopt_val = &option; 
         sopt.sopt_valsize = sizeof(option); 
-        rc = sosetopt(so, &sopt); 
+        rc = -sosetopt(so, &sopt); 
         if (rc != 0) { 
                 CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); 
                 goto out; 
@@ -714,148 +988,14 @@ ksocknal_lib_setup_sock (struct socket *so)
         sopt.sopt_name = TCP_KEEPALIVE; 
         sopt.sopt_val = &keep_idle; 
         sopt.sopt_valsize = sizeof(keep_idle); 
-        rc = sosetopt(so, &sopt); 
+        rc = -sosetopt(so, &sopt); 
         if (rc != 0) { 
                 CERROR ("Can't set TCP_KEEPALIVE : %d\n", rc); 
                 goto out; 
         }
 out:
         CFS_NET_EX;
-        return (-rc);
-}
-
-int
-ksocknal_lib_connect_sock (struct socket **sockp, int *fatal, 
-                           ksock_route_t *route, int local_port)
-{
-        struct sockaddr_in  locaddr;
-        struct sockaddr_in  srvaddr;
-        struct timeval      tv;
-        int                 fd;
-        struct socket      *so;
-        struct sockopt      sopt;
-        int                 option;
-        int                 rc;
-        int                 s;
-        CFS_DECL_FUNNEL_DATA;
-
-        ENTRY; 
-        bzero (&locaddr, sizeof (locaddr)); 
-        locaddr.sin_len = sizeof(struct sockaddr_in); 
-        locaddr.sin_family = AF_INET; 
-        locaddr.sin_port = htons (local_port);
-        locaddr.sin_addr.s_addr = 
-                (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr)
-                                            : INADDR_ANY;
-        bzero(&srvaddr, sizeof(srvaddr));
-        srvaddr.sin_len = sizeof(struct sockaddr_in);
-        srvaddr.sin_family = AF_INET;
-        srvaddr.sin_port = htons (route->ksnr_port);
-        srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
-
-        *fatal = 1;
-
-        CFS_NET_IN;
-        rc = socreate(PF_INET, &so, SOCK_STREAM, 0); 
-        CFS_NET_EX;
-        *sockp = so;
-        if (rc != 0) {
-                CERROR ("Can't create autoconnect socket: %d\n", rc);
-                return (-rc);
-        }
-
-        /* Set the socket timeouts, so our connection attempt completes in
-         * finite time */
-        tv.tv_sec = *ksocknal_tunables.ksnd_timeout;
-        tv.tv_usec = 0;
-        bzero(&sopt, sizeof sopt);
-        sopt.sopt_dir = SOPT_SET;
-        sopt.sopt_level = SOL_SOCKET;
-        sopt.sopt_name = SO_SNDTIMEO;
-        sopt.sopt_val = &tv;
-        sopt.sopt_valsize = sizeof(tv);
-
-        CFS_NET_IN;
-        rc = sosetopt(so, &sopt);
-        if (rc != 0) { 
-                CFS_NET_EX;
-                CERROR ("Can't set send timeout %d: %d\n",
-                        *ksocknal_tunables.ksnd_timeout, rc);
-                goto out;
-        }
-        sopt.sopt_level = SOL_SOCKET;
-        sopt.sopt_name = SO_RCVTIMEO;
-        rc = sosetopt(so, &sopt);
-        if (rc != 0) {
-                CFS_NET_EX;
-                CERROR ("Can't set receive timeout %d: %d\n",
-                        *ksocknal_tunables.ksnd_timeout, rc);
-                goto out;
-        } 
-        option = 1;
-        sopt.sopt_level = SOL_SOCKET;
-        sopt.sopt_name = SO_REUSEADDR;
-        sopt.sopt_val = &option;
-        sopt.sopt_valsize = sizeof(option);
-        rc = sosetopt(so, &sopt);
-        if (rc != 0) {
-                CFS_NET_EX;
-                CERROR ("Can't set sock reuse address: %d\n", rc);
-                goto out;
-        } 
-        rc = sobind(so, (struct sockaddr *)&locaddr); 
-        if (rc == EADDRINUSE) { 
-                CFS_NET_EX; 
-                CDEBUG(D_NET, "Port %d already in use\n", local_port); 
-                *fatal = 0; 
-                goto out;
-        }
-        if (rc != 0) { 
-                CFS_NET_EX; 
-                CERROR ("Can't bind to local IP Address %u.%u.%u.%u: %d\n", 
-                        HIPQUAD(route->ksnr_myipaddr), rc); 
-                goto out; 
-        }
-        rc = soconnect(so, (struct sockaddr *)&srvaddr);
-        *fatal = !(rc == EADDRNOTAVAIL || rc == EADDRINUSE);
-        if (rc != 0) { 
-                CFS_NET_EX;
-                if (rc != EADDRNOTAVAIL && rc != EADDRINUSE)
-                        CERROR ("Can't connect to %s"
-                                " local IP: %u.%u.%u.%u," 
-                                " remote IP: %u.%u.%u.%u/%d: %d\n", 
-                                libcfs_id2str(route->ksnr_peer->ksnp_id, 
-                                HIPQUAD(route->ksnr_myipaddr), 
-                                HIPQUAD(route->ksnr_ipaddr), 
-                                route->ksnr_port, rc); 
-                goto out;
-        }
-
-        s = splnet();
-        while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
-                CDEBUG(D_NET, "ksocknal sleep for waiting auto_connect.\n");
-                (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz);
-        }
-        LASSERT((so->so_state & SS_ISCONNECTED));
-        splx(s);
-        CFS_NET_EX;
-
-        rc = so->so_error; 
-        if (rc != 0) { 
-                CERROR ("Error %d waiting for connection to %s" 
-                        " local IP: %u.%u.%u.%u," 
-                        " remote IP: %u.%u.%u.%u/%d: %d\n", rc,
-                        libcfs_id2str(route->ksnr_peer->ksnp_id), 
-                        HIPQUAD(route->ksnr_myipaddr), 
-                        HIPQUAD(route->ksnr_ipaddr), 
-                        route->ksnr_port, rc); 
-                goto out; 
-        }
-        return (-rc);
-
- out:
-        ksocknal_lib_release_sock(so);
-        return (-rc);
+        return (rc);
 }
 
 void
@@ -886,43 +1026,32 @@ ksocknal_lib_push_conn(ksock_conn_t *conn)
         return;
 }
 
+
 extern void ksocknal_read_callback (ksock_conn_t *conn);
 extern void ksocknal_write_callback (ksock_conn_t *conn);
 
 static void
 ksocknal_upcall(struct socket *so, caddr_t arg, int waitf)
 {
-        ksock_conn_t  *conn;
-        CFS_DECL_NET_DATA;
+        ksock_conn_t  *conn = (ksock_conn_t *)arg;
         ENTRY;
 
         read_lock (&ksocknal_data.ksnd_global_lock);
-        conn = so->reserved3;
-
-        if (conn == NULL){
-                /* More processing is needed?  */
+        if (conn == NULL)
                 goto out;
-        }
-        if ((so->so_rcv.sb_flags & SB_UPCALL) || !arg ) {
+
+        if (so->so_rcv.sb_flags & SB_UPCALL) {
                 extern int soreadable(struct socket *so);
-                CFS_NET_IN;
-                if (conn->ksnc_rx_nob_wanted && soreadable(so)){
+                if (conn->ksnc_rx_nob_wanted && soreadable(so))
                         /* To verify whether the upcall is for receive */
-                        CFS_NET_EX;
                         ksocknal_read_callback (conn);
-                }else
-                        CFS_NET_EX;
         }
         /* go forward? */
-        if ((so->so_snd.sb_flags & SB_UPCALL) || !arg){
+        if (so->so_snd.sb_flags & SB_UPCALL){
                 extern int sowriteable(struct socket *so);
-                CFS_NET_IN;
-                if (sowriteable(so)){
+                if (sowriteable(so))
                         /* socket is writable */
-                        CFS_NET_EX;
                         ksocknal_write_callback(conn);
-                } else 
-                        CFS_NET_EX;
         }
 out:
         read_unlock (&ksocknal_data.ksnd_global_lock);
@@ -943,22 +1072,24 @@ ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
         CFS_DECL_NET_DATA;
 
         CFS_NET_IN;
-        sock->so_upcallarg = (void *)sock;  /* anything not NULL */ 
+        sock->so_upcallarg = (void *)conn;
         sock->so_upcall = ksocknal_upcall; 
         sock->so_snd.sb_timeo = 0; 
-        sock->so_rcv.sb_timeo = 2 * HZ; 
+        sock->so_rcv.sb_timeo = cfs_time_seconds(2);
         sock->so_rcv.sb_flags |= SB_UPCALL; 
         sock->so_snd.sb_flags |= SB_UPCALL; 
-        sock->reserved3 = conn;
         CFS_NET_EX;
         return;
 }
 
 void
-ksocknal_lib_act_callback(struct socket *sock)
+ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn)
 {
-        /* upcall will take the network funnel */
-        ksocknal_upcall (sock, 0, 0);
+        CFS_DECL_NET_DATA;
+
+        CFS_NET_IN;
+        ksocknal_upcall (sock, (void *)conn, 0);
+        CFS_NET_EX;
 }
 
 void 
@@ -967,11 +1098,11 @@ ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
         CFS_DECL_NET_DATA;
 
         CFS_NET_IN;
-        sock->so_upcall = NULL; 
-        sock->so_upcallarg = NULL; 
         sock->so_rcv.sb_flags &= ~SB_UPCALL; 
         sock->so_snd.sb_flags &= ~SB_UPCALL;
+        sock->so_upcall = NULL; 
+        sock->so_upcallarg = NULL; 
         CFS_NET_EX;
 }
 
-
+#endif  /* !__DARWIN8__ */
index 69a20dd..24c7305 100644 (file)
 #include <mach/mach_types.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
-#include <netat/sysglue.h>
 #include <stdarg.h>
 
 #include <libcfs/libcfs.h>
 
+#ifdef __DARWIN8__
+
+#define SOCKNAL_ARCH_EAGER_ACK         0
+
+#else /* !__DARWIN8__ */
+
 #define SOCKNAL_ARCH_EAGER_ACK         1
 
-#define SOCK_WMEM_QUEUED(so)           ((so)->so_snd.sb_cc)
-#define SOCK_ERROR(so)                 ((so)->so_error)
+#endif
 
-#define SOCK_TEST_NOSPACE(so)          (sbspace(&(so)->so_snd) < (so)->so_snd.sb_lowat)
+#define SOCK_BUFFER_SIZE                (1152 * 1024)
 
 static inline
 int ksocknal_nsched(void)
 { 
+       /* XXX Liang: fix it */
        return 1;
 }
 
index 827e625..8c8ff1f 100644 (file)
@@ -46,6 +46,8 @@
 
 #define SOCKNAL_ARCH_EAGER_ACK 0
 
+#define SOCK_BUFFER_SIZE       (8<<20)
+
 #ifndef CONFIG_SMP
 static inline
 int ksocknal_nsched(void)
index e9f0011..f3d0acd 100644 (file)
@@ -20,8 +20,8 @@
 
 #include "socklnd.h"
 
-static int timeout = SOCKNAL_TIMEOUT;
-CFS_MODULE_PARM(timeout, "i", int, 0644,
+static int sock_timeout = SOCKNAL_TIMEOUT;
+CFS_MODULE_PARM(sock_timeout, "i", int, 0644,
                 "dead socket timeout (seconds)");
 
 static int credits = SOCKNAL_CREDITS;
@@ -89,7 +89,7 @@ CFS_MODULE_PARM(zc_min_frag, "i", int, 0644,
 #endif
 
 ksock_tunables_t ksocknal_tunables = {
-        .ksnd_timeout         = &timeout,
+        .ksnd_timeout         = &sock_timeout,
        .ksnd_credits         = &credits,
        .ksnd_peercredits     = &peer_credits,
        .ksnd_nconnds         = &nconnds,
diff --git a/lnet/klnds/tdilnd/socklnd.c b/lnet/klnds/tdilnd/socklnd.c
new file mode 100644 (file)
index 0000000..c828fa4
--- /dev/null
@@ -0,0 +1,2377 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/* Entry points the socket LND registers with LNet.  Positional comments
+ * stand in for C99 designated initializers. */
+lnd_t the_ksocklnd = {
+        /* .lnd_list       = */ { &the_ksocklnd.lnd_list, 
+                                  &the_ksocklnd.lnd_list },
+        /* .lnd_refcount   = */ 0,
+        /* .lnd_type       = */ SOCKLND,
+        /* .lnd_startup    = */ ksocknal_startup,
+        /* .lnd_shutdown   = */ ksocknal_shutdown,
+        /* .lnd_ctl        = */ ksocknal_ctl,
+        /* .lnd_send       = */ ksocknal_send,
+        /* .lnd_recv       = */ ksocknal_recv,
+        /* .lnd_eager_recv = */ NULL,
+        /* .lnd_notify     = */ ksocknal_notify,
+        /* .lnd_accept     = */ ksocknal_accept
+};
+
+/* Global state shared by all socket-LND instances. */
+ksock_nal_data_t        ksocknal_data;
+
+
+/* Return the ksock_interface_t on @ni whose address equals @ip, or NULL
+ * if no configured interface matches.
+ * NOTE(review): callers appear to rely on ksnd_global_lock for stable
+ * access to ksnn_interfaces -- confirm locking rule. */
+ksock_interface_t *
+ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip)
+{
+        ksock_net_t       *net = ni->ni_data;
+        int                i;
+        ksock_interface_t *iface;
+
+        for (i = 0; i < net->ksnn_ninterfaces; i++) {
+                LASSERT(i < LNET_MAX_INTERFACES);
+                iface = &net->ksnn_interfaces[i];
+
+                if (iface->ksni_ipaddr == ip)
+                        return (iface);
+        }
+
+        return (NULL);
+}
+
+/* Allocate and initialise a route to @ipaddr:@port.  Returns the new
+ * route (refcount 1, owned by the caller, not yet attached to a peer)
+ * or NULL on allocation failure. */
+ksock_route_t *
+ksocknal_create_route (__u32 ipaddr, int port)
+{
+        ksock_route_t *route;
+
+        LIBCFS_ALLOC (route, sizeof (*route));
+        if (route == NULL)
+                return (NULL);
+
+        atomic_set (&route->ksnr_refcount, 1);
+        route->ksnr_peer = NULL;
+        route->ksnr_retry_interval = 0;         /* OK to connect at any time */
+        route->ksnr_ipaddr = ipaddr;
+        route->ksnr_port = port;
+        route->ksnr_connecting = 0;
+        route->ksnr_connected = 0;
+        route->ksnr_deleted = 0;
+        route->ksnr_conn_count = 0;
+        route->ksnr_share_count = 0;
+
+        return (route);
+}
+
+/* Free @route once its refcount has reached zero, dropping its
+ * reference on the owning peer (if it was ever attached to one). */
+void
+ksocknal_destroy_route (ksock_route_t *route)
+{
+        LASSERT (atomic_read(&route->ksnr_refcount) == 0);
+
+        if (route->ksnr_peer != NULL)
+                ksocknal_peer_decref(route->ksnr_peer);
+
+        LIBCFS_FREE (route, sizeof (*route));
+}
+
+/* Allocate a peer for @id on network @ni and account it in the net's
+ * peer count (under ksnn_lock).  On success sets *peerp (caller owns
+ * 1 ref) and returns 0; returns -ENOMEM on allocation failure or
+ * -ESHUTDOWN if the net is shutting down. */
+int
+ksocknal_create_peer (ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
+{
+        ksock_net_t   *net = ni->ni_data;
+        ksock_peer_t  *peer;
+        unsigned long  flags;
+
+        LASSERT (id.nid != LNET_NID_ANY);
+        LASSERT (id.pid != LNET_PID_ANY);
+        LASSERT (!in_interrupt());
+
+        LIBCFS_ALLOC (peer, sizeof (*peer));
+        if (peer == NULL)
+                return -ENOMEM;
+
+        memset (peer, 0, sizeof (*peer));       /* NULL pointers/clear flags etc */
+
+        peer->ksnp_ni = ni;
+        peer->ksnp_id = id;
+        atomic_set (&peer->ksnp_refcount, 1);   /* 1 ref for caller */
+        peer->ksnp_closing = 0;
+        peer->ksnp_accepting = 0;
+        CFS_INIT_LIST_HEAD (&peer->ksnp_conns);
+        CFS_INIT_LIST_HEAD (&peer->ksnp_routes);
+        CFS_INIT_LIST_HEAD (&peer->ksnp_tx_queue);
+
+        spin_lock_irqsave(&net->ksnn_lock, flags);
+
+        if (net->ksnn_shutdown) {
+                /* raced with shutdown: undo the allocation and bail */
+                spin_unlock_irqrestore(&net->ksnn_lock, flags);
+                
+                LIBCFS_FREE(peer, sizeof(*peer));
+                CERROR("Can't create peer: network shutdown\n");
+                return -ESHUTDOWN;
+        }
+
+        net->ksnn_npeers++;
+
+        spin_unlock_irqrestore(&net->ksnn_lock, flags);
+
+        *peerp = peer;
+        return 0;
+}
+
+/* Final destruction of @peer when its refcount reaches zero: frees the
+ * structure and, last of all, decrements the net's peer count. */
+void
+ksocknal_destroy_peer (ksock_peer_t *peer)
+{
+        ksock_net_t    *net = peer->ksnp_ni->ni_data;
+        unsigned long   flags;
+
+        CDEBUG (D_NET, "peer %s %p deleted\n", 
+                libcfs_id2str(peer->ksnp_id), peer);
+
+        LASSERT (atomic_read (&peer->ksnp_refcount) == 0);
+        LASSERT (peer->ksnp_accepting == 0);
+        LASSERT (list_empty (&peer->ksnp_conns));
+        LASSERT (list_empty (&peer->ksnp_routes));
+        LASSERT (list_empty (&peer->ksnp_tx_queue));
+
+        LIBCFS_FREE (peer, sizeof (*peer));
+
+        /* NB a peer's connections and routes keep a reference on their peer
+         * until they are destroyed, so we can be assured that _all_ state to
+         * do with this peer has been cleaned up when its refcount drops to
+         * zero. */
+        spin_lock_irqsave(&net->ksnn_lock, flags);
+        net->ksnn_npeers--;
+        spin_unlock_irqrestore(&net->ksnn_lock, flags);
+}
+
+/* Find the peer matching @id exactly (both nid and pid) on network @ni.
+ * Returns the peer without taking a reference, or NULL.  Caller must
+ * hold ksnd_global_lock. */
+ksock_peer_t *
+ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id)
+{
+        struct list_head *peer_list = ksocknal_nid2peerlist(id.nid);
+        struct list_head *tmp;
+        ksock_peer_t     *peer;
+
+        list_for_each (tmp, peer_list) {
+
+                peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+                LASSERT (!peer->ksnp_closing);
+
+                if (peer->ksnp_ni != ni)
+                        continue;
+
+                if (peer->ksnp_id.nid != id.nid ||
+                    peer->ksnp_id.pid != id.pid)
+                        continue;
+
+                CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+                       peer, libcfs_id2str(id), 
+                       atomic_read(&peer->ksnp_refcount));
+                return (peer);
+        }
+        return (NULL);
+}
+
+/* Find the peer matching @id on @ni, taking ksnd_global_lock itself.
+ * Returns the peer with +1 reference for the caller, or NULL. */
+ksock_peer_t *
+ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id)
+{
+        ksock_peer_t     *peer;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+        peer = ksocknal_find_peer_locked (ni, id);
+        if (peer != NULL)                       /* +1 ref for caller */
+                ksocknal_peer_addref(peer);
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+
+        return (peer);
+}
+
+/* Remove @peer from the global peer hash and drop the table's ref,
+ * releasing its usage counts on the passive-IP interfaces first.
+ * The peer must already have no conns or routes; caller holds
+ * ksnd_global_lock for writing. */
+void
+ksocknal_unlink_peer_locked (ksock_peer_t *peer)
+{
+        int                i;
+        __u32              ip;
+
+        for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
+                LASSERT (i < LNET_MAX_INTERFACES);
+                ip = peer->ksnp_passive_ips[i];
+
+                ksocknal_ip2iface(peer->ksnp_ni, ip)->ksni_npeers--;
+        }
+
+        LASSERT (list_empty(&peer->ksnp_conns));
+        LASSERT (list_empty(&peer->ksnp_routes));
+        LASSERT (!peer->ksnp_closing);
+        peer->ksnp_closing = 1;
+        list_del (&peer->ksnp_list);
+        /* lose peerlist's ref */
+        ksocknal_peer_decref(peer);
+}
+
+/* Enumerator for the ioctl interface: @index selects one entry from the
+ * flattened sequence of (peer, passive-IP) and (peer, route) tuples over
+ * the whole peer table.  Fills the out-parameters and returns 0, or
+ * -ENOENT when @index is past the last entry.  Peers with neither
+ * passive IPs nor routes yield a single entry with zeroed details. */
+int
+ksocknal_get_peer_info (lnet_ni_t *ni, int index, 
+                        lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip, int *port,
+                        int *conn_count, int *share_count)
+{
+        ksock_peer_t      *peer;
+        struct list_head  *ptmp;
+        ksock_route_t     *route;
+        struct list_head  *rtmp;
+        int                i;
+        int                j;
+        int                rc = -ENOENT;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+
+                list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
+                        peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                        if (peer->ksnp_ni != ni)
+                                continue;
+
+                        if (peer->ksnp_n_passive_ips == 0 &&
+                            list_empty(&peer->ksnp_routes)) {
+                                if (index-- > 0)
+                                        continue;
+
+                                *id = peer->ksnp_id;
+                                *myip = 0;
+                                *peer_ip = 0;
+                                *port = 0;
+                                *conn_count = 0;
+                                *share_count = 0;
+                                rc = 0;
+                                goto out;
+                        }
+
+                        for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
+                                if (index-- > 0)
+                                        continue;
+
+                                *id = peer->ksnp_id;
+                                *myip = peer->ksnp_passive_ips[j];
+                                *peer_ip = 0;
+                                *port = 0;
+                                *conn_count = 0;
+                                *share_count = 0;
+                                rc = 0;
+                                goto out;
+                        }
+
+                        list_for_each (rtmp, &peer->ksnp_routes) {
+                                if (index-- > 0)
+                                        continue;
+
+                                route = list_entry(rtmp, ksock_route_t,
+                                                   ksnr_list);
+
+                                *id = peer->ksnp_id;
+                                *myip = route->ksnr_myipaddr;
+                                *peer_ip = route->ksnr_ipaddr;
+                                *port = route->ksnr_port;
+                                *conn_count = route->ksnr_conn_count;
+                                *share_count = route->ksnr_share_count;
+                                rc = 0;
+                                goto out;
+                        }
+                }
+        }
+ out:
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+        return (rc);
+}
+
+/* Bind @conn to @route: the conn takes a ref on the route, the route
+ * adopts the conn's local address as its bound address (adjusting the
+ * per-interface route counts), the conn's type is marked connected,
+ * and the retry back-off is reset so further attempts may proceed
+ * immediately. */
+void
+ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
+{
+        ksock_peer_t      *peer = route->ksnr_peer;
+        int                type = conn->ksnc_type;
+        ksock_interface_t *iface;
+
+        conn->ksnc_route = route;
+        ksocknal_route_addref(route);
+
+        if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
+                if (route->ksnr_myipaddr == 0) {
+                        /* route wasn't bound locally yet (the initial route) */
+                        CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(route->ksnr_ipaddr),
+                               HIPQUAD(conn->ksnc_myipaddr));
+                } else {
+                        CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from "
+                               "%u.%u.%u.%u to %u.%u.%u.%u\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(route->ksnr_ipaddr),
+                               HIPQUAD(route->ksnr_myipaddr),
+                               HIPQUAD(conn->ksnc_myipaddr));
+
+                        /* old bound interface loses this route */
+                        iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                                  route->ksnr_myipaddr);
+                        if (iface != NULL)
+                                iface->ksni_nroutes--;
+                }
+                route->ksnr_myipaddr = conn->ksnc_myipaddr;
+                iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                          route->ksnr_myipaddr);
+                if (iface != NULL)
+                        iface->ksni_nroutes++;
+        }
+
+        route->ksnr_connected |= (1<<type);
+        route->ksnr_connecting &= ~(1<<type);
+        route->ksnr_conn_count++;
+
+        /* Successful connection => further attempts can
+         * proceed immediately */
+        route->ksnr_retry_interval = 0;
+}
+
+/* Attach a fresh @route to @peer: the peer's route list takes over the
+ * caller's ref on the route, and the route is associated with every
+ * existing conn to the same IP.  A duplicate route (same IP already on
+ * the peer) is a fatal bug (LBUG). */
+void
+ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
+{
+        struct list_head  *tmp;
+        ksock_conn_t      *conn;
+        int                type;
+        ksock_route_t     *route2;
+
+        LASSERT (route->ksnr_peer == NULL);
+        LASSERT (route->ksnr_connecting == 0);
+        LASSERT (route->ksnr_connected == 0);
+
+        /* LASSERT(unique) */
+        list_for_each(tmp, &peer->ksnp_routes) {
+                route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+                if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
+                        CERROR ("Duplicate route %s %u.%u.%u.%u\n",
+                                libcfs_id2str(peer->ksnp_id), 
+                                HIPQUAD(route->ksnr_ipaddr));
+                        LBUG();
+                }
+        }
+
+        route->ksnr_peer = peer;
+        ksocknal_peer_addref(peer);
+        /* peer's routelist takes over my ref on 'route' */
+        list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+
+        list_for_each(tmp, &peer->ksnp_conns) {
+                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                type = conn->ksnc_type;
+
+                if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
+                        continue;
+
+                ksocknal_associate_route_conn_locked(route, conn);
+                /* keep going (typed routes) */
+        }
+}
+
+/* Delete @route from its peer: close any conns that use it, drop the
+ * bound interface's route count, unlink it (dropping the peer's ref),
+ * and unlink the peer itself if this was its last route and it has no
+ * remaining conns. */
+void
+ksocknal_del_route_locked (ksock_route_t *route)
+{
+        ksock_peer_t      *peer = route->ksnr_peer;
+        ksock_interface_t *iface;
+        ksock_conn_t      *conn;
+        struct list_head  *ctmp;
+        struct list_head  *cnxt;
+
+        LASSERT (!route->ksnr_deleted);
+
+        /* Close associated conns */
+        list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+                conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_route != route)
+                        continue;
+
+                ksocknal_close_conn_locked (conn, 0);
+        }
+
+        if (route->ksnr_myipaddr != 0) {
+                iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                          route->ksnr_myipaddr);
+                if (iface != NULL)
+                        iface->ksni_nroutes--;
+        }
+
+        route->ksnr_deleted = 1;
+        list_del (&route->ksnr_list);
+        ksocknal_route_decref(route);             /* drop peer's ref */
+
+        if (list_empty (&peer->ksnp_routes) &&
+            list_empty (&peer->ksnp_conns)) {
+                /* I've just removed the last route to a peer with no active
+                 * connections */
+                ksocknal_unlink_peer_locked (peer);
+        }
+}
+
+/* Add an explicit route from @ni to peer @id at @ipaddr:@port, creating
+ * the peer if it doesn't exist yet.  If an identical route is already
+ * present, its share count is bumped instead of duplicating it.
+ * Returns 0 on success, -EINVAL for wildcard ids, -ENOMEM, or an error
+ * from ksocknal_create_peer(). */
+int
+ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
+{
+        unsigned long      flags;
+        struct list_head  *tmp;
+        ksock_peer_t      *peer;
+        ksock_peer_t      *peer2;
+        ksock_route_t     *route;
+        ksock_route_t     *route2;
+        int                rc;
+
+        if (id.nid == LNET_NID_ANY ||
+            id.pid == LNET_PID_ANY)
+                return (-EINVAL);
+
+        /* Have a brand new peer ready... */
+        rc = ksocknal_create_peer(&peer, ni, id);
+        if (rc != 0)
+                return rc;
+
+        route = ksocknal_create_route (ipaddr, port);
+        if (route == NULL) {
+                ksocknal_peer_decref(peer);
+                return (-ENOMEM);
+        }
+
+        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+
+        peer2 = ksocknal_find_peer_locked (ni, id);
+        if (peer2 != NULL) {
+                /* lost the race: use the existing peer, drop ours */
+                ksocknal_peer_decref(peer);
+                peer = peer2;
+        } else {
+                /* peer table takes my ref on peer */
+                list_add_tail (&peer->ksnp_list,
+                               ksocknal_nid2peerlist (id.nid));
+        }
+
+        route2 = NULL;
+        list_for_each (tmp, &peer->ksnp_routes) {
+                route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+                if (route2->ksnr_ipaddr == ipaddr)
+                        break;
+
+                route2 = NULL;
+        }
+        if (route2 == NULL) {
+                ksocknal_add_route_locked(peer, route);
+                route->ksnr_share_count++;
+        } else {
+                /* duplicate: drop the new route, share the old one */
+                ksocknal_route_decref(route);
+                route2->ksnr_share_count++;
+        }
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+
+        return (0);
+}
+
+/* Remove explicitly-shared routes to @ip (0 == all addresses) from
+ * @peer.  If no explicitly-shared routes remain afterwards, the
+ * automatic routes and all conns are removed too.  Caller holds
+ * ksnd_global_lock for writing. */
+void
+ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
+{
+        ksock_conn_t     *conn;
+        ksock_route_t    *route;
+        struct list_head *tmp;
+        struct list_head *nxt;
+        int               nshared;
+
+        LASSERT (!peer->ksnp_closing);
+
+        /* Extra ref prevents peer disappearing until I'm done with it */
+        ksocknal_peer_addref(peer);
+
+        list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+                route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+                /* no match */
+                if (!(ip == 0 || route->ksnr_ipaddr == ip))
+                        continue;
+
+                route->ksnr_share_count = 0;
+                /* This deletes associated conns too */
+                ksocknal_del_route_locked (route);
+        }
+
+        nshared = 0;
+        list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+                route = list_entry(tmp, ksock_route_t, ksnr_list);
+                nshared += route->ksnr_share_count;
+        }
+
+        if (nshared == 0) {
+                /* remove everything else if there are no explicit entries
+                 * left */
+
+                list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+                        route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+                        /* we should only be removing auto-entries */
+                        LASSERT(route->ksnr_share_count == 0);
+                        ksocknal_del_route_locked (route);
+                }
+
+                list_for_each_safe (tmp, nxt, &peer->ksnp_conns) {
+                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                        ksocknal_close_conn_locked(conn, 0);
+                }
+        }
+
+        ksocknal_peer_decref(peer);
+        /* NB peer unlinks itself when last conn/route is removed */
+}
+
+/* ioctl handler: delete routes/conns for peers on @ni matching @id
+ * (LNET_NID_ANY / LNET_PID_ANY act as wildcards), restricted to address
+ * @ip (0 == all).  Returns 0 if anything matched, else -ENOENT. */
+int
+ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
+{
+        unsigned long      flags;
+        struct list_head  *ptmp;
+        struct list_head  *pnxt;
+        ksock_peer_t      *peer;
+        int                lo;
+        int                hi;
+        int                i;
+        int                rc = -ENOENT;
+
+        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+
+        if (id.nid != LNET_NID_ANY)
+                /* known nid: search only its hash chain */
+                lo = hi = ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers;
+        else {
+                lo = 0;
+                hi = ksocknal_data.ksnd_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) {
+                        peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                        if (peer->ksnp_ni != ni)
+                                continue;
+
+                        if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) &&
+                              (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid)))
+                                continue;
+
+                        ksocknal_del_peer_locked (peer, ip);
+                        rc = 0;                 /* matched! */
+                }
+        }
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+
+        return (rc);
+}
+
+/* Return the index'th conn on network @ni (enumerated over the whole
+ * peer table) with +1 ref for the caller, or NULL if @index is out of
+ * range. */
+ksock_conn_t *
+ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+        ksock_peer_t      *peer;
+        struct list_head  *ptmp;
+        ksock_conn_t      *conn;
+        struct list_head  *ctmp;
+        int                i;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
+                        peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                        LASSERT (!peer->ksnp_closing);
+
+                        if (peer->ksnp_ni != ni)
+                                continue;
+
+                        list_for_each (ctmp, &peer->ksnp_conns) {
+                                if (index-- > 0)
+                                        continue;
+
+                                conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+                                ksocknal_conn_addref(conn);
+                                read_unlock (&ksocknal_data.ksnd_global_lock);
+                                return (conn);
+                        }
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+        return (NULL);
+}
+
+/* Choose the scheduler for a conn whose NIC interrupt line is @irq.
+ * Hardware NICs (irq != 0) are sticky: the first choice is cached in
+ * ksnd_irqinfo and reused for later conns on the same irq.  Otherwise
+ * pick the scheduler currently serving the fewest conns. */
+ksock_sched_t *
+ksocknal_choose_scheduler_locked (unsigned int irq)
+{
+        ksock_sched_t    *sched;
+        ksock_irqinfo_t  *info;
+        int               i;
+
+        LASSERT (irq < NR_IRQS);
+        info = &ksocknal_data.ksnd_irqinfo[irq];
+
+        if (irq != 0 &&                         /* hardware NIC */
+            info->ksni_valid) {                 /* already set up */
+                return (&ksocknal_data.ksnd_schedulers[info->ksni_sched]);
+        }
+
+        /* software NIC (irq == 0) || not associated with a scheduler yet.
+         * Choose the CPU with the fewest connections... */
+        sched = &ksocknal_data.ksnd_schedulers[0];
+        for (i = 1; i < ksocknal_data.ksnd_nschedulers; i++)
+                if (sched->kss_nconns >
+                    ksocknal_data.ksnd_schedulers[i].kss_nconns)
+                        sched = &ksocknal_data.ksnd_schedulers[i];
+
+        if (irq != 0) {                         /* Hardware NIC */
+                info->ksni_valid = 1;
+                info->ksni_sched = sched - ksocknal_data.ksnd_schedulers;
+
+                /* no overflow... */
+                LASSERT (info->ksni_sched == sched - ksocknal_data.ksnd_schedulers);
+        }
+
+        return (sched);
+}
+
+int
+ksocknal_local_ipvec (lnet_ni_t *ni, __u32 *ipaddrs)
+{
+        /* Copy this NI's configured interface IP addresses into ipaddrs
+         * (which must have room for LNET_MAX_INTERFACES entries) and
+         * return how many were copied. */
+        ksock_net_t       *net = ni->ni_data;
+        int                i;
+        int                nip;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        nip = net->ksnn_ninterfaces;
+        /* '<=' (not '<'): a node may legitimately be configured with
+         * exactly LNET_MAX_INTERFACES interfaces, which callers' buffers
+         * can hold; matches the checks in ksocknal_select_ips() and
+         * ksocknal_create_routes() */
+        LASSERT (nip <= LNET_MAX_INTERFACES);
+
+        for (i = 0; i < nip; i++) {
+                ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr;
+                LASSERT (ipaddrs[i] != 0);
+        }
+
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+        return (nip);
+}
+
+int
+ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips)
+{
+        /* Return the index of the peer IP that best matches iface:
+         * prefer a same-subnet address, break ties on the smallest XOR
+         * distance.  Entries already consumed are zeroed by the caller.
+         * At least one non-zero entry must remain. */
+        int   best          = -1;
+        int   best_netmatch = 0;
+        int   best_xor      = 0;
+        int   xor;
+        int   netmatch;
+        int   i;
+
+        for (i = 0; i < nips; i++) {
+                if (ips[i] == 0)                /* already claimed */
+                        continue;
+
+                xor = (ips[i] ^ iface->ksni_ipaddr);
+                netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+                /* keep the incumbent unless this candidate is strictly
+                 * better (or we have none yet) */
+                if (best >= 0 &&
+                    netmatch <= best_netmatch &&
+                    (netmatch != best_netmatch || xor >= best_xor))
+                        continue;
+
+                best          = i;
+                best_netmatch = netmatch;
+                best_xor      = xor;
+        }
+
+        LASSERT (best >= 0);
+        return (best);
+}
+
+int
+ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
+{
+        /* Pair my local interfaces with the peer's advertised IPs,
+         * recording the choices in peer->ksnp_passive_ips[] and rewriting
+         * peerips[] with the addresses chosen; returns how many pairings
+         * were made. */
+        rwlock_t           *global_lock = &ksocknal_data.ksnd_global_lock;
+        ksock_net_t        *net = peer->ksnp_ni->ni_data;
+        unsigned long       flags;
+        ksock_interface_t  *iface;
+        ksock_interface_t  *best_iface;
+        int                 n_ips;
+        int                 i;
+        int                 j;
+        int                 k;
+        __u32               ip;
+        __u32               xor;
+        int                 this_netmatch;
+        int                 best_netmatch;
+        int                 best_npeers;
+
+        /* CAVEAT EMPTOR: We do all our interface matching with an
+         * exclusive hold of global lock at IRQ priority.  We're only
+         * expecting to be dealing with small numbers of interfaces, so the
+         * O(n**3)-ness shouldn't matter */
+
+        /* Also note that I'm not going to return more than n_peerips
+         * interfaces, even if I have more myself */
+
+        write_lock_irqsave(global_lock, flags);
+
+        LASSERT (n_peerips <= LNET_MAX_INTERFACES);
+        LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+        /* pair at most min(my interfaces, peer's IPs) addresses */
+        n_ips = MIN(n_peerips, net->ksnn_ninterfaces);
+
+        for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
+                /*              ^ yes really... */
+
+                /* If we have any new interfaces, first tick off all the
+                 * peer IPs that match old interfaces, then choose new
+                 * interfaces to match the remaining peer IPS.
+                 * We don't forget interfaces we've stopped using; we might
+                 * start using them again... */
+
+                if (i < peer->ksnp_n_passive_ips) {
+                        /* Old interface. */
+                        ip = peer->ksnp_passive_ips[i];
+                        best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+
+                        /* peer passive ips are kept up to date */
+                        LASSERT(best_iface != NULL);
+                } else {
+                        /* choose a new interface */
+                        LASSERT (i == peer->ksnp_n_passive_ips);
+
+                        best_iface = NULL;
+                        best_netmatch = 0;
+                        best_npeers = 0;
+
+                        for (j = 0; j < net->ksnn_ninterfaces; j++) {
+                                iface = &net->ksnn_interfaces[j];
+                                ip = iface->ksni_ipaddr;
+
+                                /* skip interfaces already paired with this
+                                 * peer */
+                                for (k = 0; k < peer->ksnp_n_passive_ips; k++)
+                                        if (peer->ksnp_passive_ips[k] == ip)
+                                                break;
+
+                                if (k < peer->ksnp_n_passive_ips) /* using it already */
+                                        continue;
+
+                                /* rank by subnet match with this iface's best
+                                 * peer IP, then by fewest peers sharing it */
+                                k = ksocknal_match_peerip(iface, peerips, n_peerips);
+                                xor = (ip ^ peerips[k]);
+                                this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+                                if (!(best_iface == NULL ||
+                                      best_netmatch < this_netmatch ||
+                                      (best_netmatch == this_netmatch &&
+                                       best_npeers > iface->ksni_npeers)))
+                                        continue;
+
+                                best_iface = iface;
+                                best_netmatch = this_netmatch;
+                                best_npeers = iface->ksni_npeers;
+                        }
+
+                        /* NOTE(review): relies on an unused interface always
+                         * existing here; the loop bound
+                         * ksnp_n_passive_ips < n_ips <= ksnn_ninterfaces
+                         * appears to guarantee it -- confirm */
+                        best_iface->ksni_npeers++;
+                        ip = best_iface->ksni_ipaddr;
+                        peer->ksnp_passive_ips[i] = ip;
+                        peer->ksnp_n_passive_ips = i+1;
+                }
+
+                LASSERT (best_iface != NULL);
+
+                /* mark the best matching peer IP used */
+                j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
+                peerips[j] = 0;
+        }
+
+        /* Overwrite input peer IP addresses */
+        memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
+
+        write_unlock_irqrestore(global_lock, flags);
+
+        return (n_ips);
+}
+
+void
+ksocknal_create_routes(ksock_peer_t *peer, int port,
+                       __u32 *peer_ipaddrs, int npeer_ipaddrs)
+{
+        /* Create a route to each of the peer's advertised IPs that we
+         * don't already have one for, choosing the best local interface
+         * for each (subnet match first, then fewest existing routes). */
+        ksock_route_t      *newroute = NULL;
+        rwlock_t           *global_lock = &ksocknal_data.ksnd_global_lock;
+        lnet_ni_t          *ni = peer->ksnp_ni;
+        ksock_net_t        *net = ni->ni_data;
+        unsigned long       flags;
+        struct list_head   *rtmp;
+        ksock_route_t      *route;
+        ksock_interface_t  *iface;
+        ksock_interface_t  *best_iface;
+        int                 best_netmatch;
+        int                 this_netmatch;
+        int                 best_nroutes;
+        int                 i;
+        int                 j;
+
+        /* CAVEAT EMPTOR: We do all our interface matching with an
+         * exclusive hold of global lock at IRQ priority.  We're only
+         * expecting to be dealing with small numbers of interfaces, so the
+         * O(n**3)-ness here shouldn't matter */
+
+        write_lock_irqsave(global_lock, flags);
+
+        LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES);
+
+        for (i = 0; i < npeer_ipaddrs; i++) {
+                if (newroute != NULL) {
+                        /* recycle the route left over from the previous
+                         * iteration (it matched an existing route) */
+                        newroute->ksnr_ipaddr = peer_ipaddrs[i];
+                } else {
+                        /* drop the IRQ-priority lock to allocate; peer
+                         * state may change while it's released */
+                        write_unlock_irqrestore(global_lock, flags);
+
+                        newroute = ksocknal_create_route(peer_ipaddrs[i], port);
+                        if (newroute == NULL)
+                                return;
+
+                        write_lock_irqsave(global_lock, flags);
+                }
+
+                /* Already got a route? */
+                route = NULL;
+                list_for_each(rtmp, &peer->ksnp_routes) {
+                        route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+                        if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
+                                break;
+
+                        route = NULL;
+                }
+                if (route != NULL)
+                        continue;
+
+                best_iface = NULL;
+                best_nroutes = 0;
+                best_netmatch = 0;
+
+                LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+                /* Select interface to connect from */
+                for (j = 0; j < net->ksnn_ninterfaces; j++) {
+                        iface = &net->ksnn_interfaces[j];
+
+                        /* Using this interface already? */
+                        list_for_each(rtmp, &peer->ksnp_routes) {
+                                route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+                                if (route->ksnr_myipaddr == iface->ksni_ipaddr)
+                                        break;
+
+                                route = NULL;
+                        }
+                        if (route != NULL)
+                                continue;
+
+                        /* prefer an interface on the peer IP's subnet,
+                         * then the one carrying fewest routes */
+                        this_netmatch = (((iface->ksni_ipaddr ^
+                                           newroute->ksnr_ipaddr) &
+                                           iface->ksni_netmask) == 0) ? 1 : 0;
+
+                        if (!(best_iface == NULL ||
+                              best_netmatch < this_netmatch ||
+                              (best_netmatch == this_netmatch &&
+                               best_nroutes > iface->ksni_nroutes)))
+                                continue;
+
+                        best_iface = iface;
+                        best_netmatch = this_netmatch;
+                        best_nroutes = iface->ksni_nroutes;
+                }
+
+                if (best_iface == NULL)
+                        continue;
+
+                newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
+                best_iface->ksni_nroutes++;
+
+                /* list takes over my ref on newroute */
+                ksocknal_add_route_locked(peer, newroute);
+                newroute = NULL;
+        }
+
+        write_unlock_irqrestore(global_lock, flags);
+        if (newroute != NULL)
+                ksocknal_route_decref(newroute);
+}
+
+int
+ksocknal_accept (lnet_ni_t *ni, struct socket *sock)
+{
+        /* Queue an incoming connection request for the connd threads to
+         * perform the handshake; takes a ref on ni which the connd drops
+         * when the request is consumed. */
+        ksock_connreq_t    *cr;
+        unsigned long       flags;
+        __u32               peer_ip;
+        int                 peer_port;
+        int                 rc;
+
+        rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+        LASSERT (rc == 0);                      /* we succeeded before */
+
+        LIBCFS_ALLOC(cr, sizeof(*cr));
+        if (cr == NULL) {
+                LCONSOLE_ERROR("Dropping connection request from "
+                               "%u.%u.%u.%u: memory exhausted\n",
+                               HIPQUAD(peer_ip));
+                return -ENOMEM;
+        }
+
+        lnet_ni_addref(ni);
+        cr->ksncr_ni   = ni;
+        cr->ksncr_sock = sock;
+
+        spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags);
+        list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
+        cfs_waitq_signal(&ksocknal_data.ksnd_connd_waitq);
+        spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock, flags);
+
+        return 0;
+}
+
+int
+ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, 
+                      struct socket *sock, int type)
+{
+        /* Establish a new conn over 'sock'.  Active connections (we
+         * dialled out) pass the originating route and conn type; passive
+         * (accepted) connections pass route == NULL / SOCKLND_CONN_NONE
+         * and learn the peer's identity from its HELLO.  Returns 0 on
+         * success, +ve if a connection race was lost (not a failure),
+         * -ve errno otherwise. */
+        rwlock_t          *global_lock = &ksocknal_data.ksnd_global_lock;
+        CFS_LIST_HEAD     (zombies);
+        __u32              ipaddrs[LNET_MAX_INTERFACES];
+        int                nipaddrs;
+        lnet_process_id_t   peerid;
+        struct list_head  *tmp;
+        __u64              incarnation;
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        ksock_conn_t      *conn2;
+        ksock_peer_t      *peer = NULL;
+        ksock_peer_t      *peer2;
+        ksock_sched_t     *sched;
+        unsigned int       irq;
+        ksock_tx_t        *tx;
+        int                bits;
+        int                rc;
+        char              *warn = NULL;
+
+        /* parses as (route == NULL) == (type == SOCKLND_CONN_NONE):
+         * passive conns have no route and no type yet */
+        LASSERT (route == NULL == (type == SOCKLND_CONN_NONE));
+
+        rc = ksocknal_lib_setup_sock (sock);
+        if (rc != 0)
+                return (rc);
+
+        irq = ksocknal_lib_sock_irq (sock);
+
+        rc = -ENOMEM;
+        LIBCFS_ALLOC(conn, sizeof(*conn));
+        if (conn == NULL)
+                goto failed_0;
+
+        memset (conn, 0, sizeof (*conn));
+        conn->ksnc_peer = NULL;
+        conn->ksnc_route = NULL;
+        conn->ksnc_sock = sock;
+        atomic_set (&conn->ksnc_sock_refcount, 1); /* 1 ref for conn */
+        conn->ksnc_type = type;
+        ksocknal_lib_save_callback(sock, conn);
+        atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
+
+        conn->ksnc_rx_ready = 0;
+        conn->ksnc_rx_scheduled = 0;
+        ksocknal_new_packet (conn, 0);
+
+        CFS_INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+        conn->ksnc_tx_ready = 0;
+        conn->ksnc_tx_scheduled = 0;
+        atomic_set (&conn->ksnc_tx_nob, 0);
+
+        /* stash conn's local and remote addrs */
+        rc = ksocknal_lib_get_conn_addrs (conn);
+        if (rc != 0)
+                goto failed_1;
+
+        /* Find out/confirm peer's NID and connection type and get the
+         * vector of interfaces she's willing to let me connect to.
+         * Passive connections use the listener timeout since the peer sends
+         * eagerly */
+
+        if (route != NULL) {
+                LASSERT(ni == route->ksnr_peer->ksnp_ni);
+
+                /* Active connection sends HELLO eagerly */
+                nipaddrs = ksocknal_local_ipvec(ni, ipaddrs);
+                peerid = route->ksnr_peer->ksnp_id;
+
+                rc = ksocknal_send_hello (ni, conn, peerid.nid,
+                                          ipaddrs, nipaddrs);
+                if (rc != 0)
+                        goto failed_1;
+        } else {
+                /* passive: identity filled in by recv_hello below */
+                peerid.nid = LNET_NID_ANY;
+                peerid.pid = LNET_PID_ANY;
+        }
+
+        rc = ksocknal_recv_hello (ni, conn, &peerid, &incarnation, ipaddrs);
+        if (rc < 0) {
+                if (rc == -EALREADY) {
+                        CDEBUG(D_NET, "Lost connection race with %s\n", 
+                               libcfs_id2str(peerid));
+                        /* Not an actual failure: return +ve RC so active
+                         * connector can back off */
+                        rc = EALREADY;
+                }
+                goto failed_1;
+        }
+        
+        /* recv_hello returns the number of peer interfaces on success */
+        nipaddrs = rc;
+        LASSERT (peerid.nid != LNET_NID_ANY);
+
+        if (route != NULL) {
+                peer = route->ksnr_peer;
+                ksocknal_peer_addref(peer);
+
+                /* additional routes after interface exchange? */
+                ksocknal_create_routes(peer, conn->ksnc_port,
+                                       ipaddrs, nipaddrs);
+                rc = 0;
+                write_lock_irqsave (global_lock, flags);
+        } else {
+                rc = ksocknal_create_peer(&peer, ni, peerid);
+                if (rc != 0)
+                        goto failed_1;
+
+                write_lock_irqsave(global_lock, flags);
+
+                peer2 = ksocknal_find_peer_locked(ni, peerid);
+                if (peer2 == NULL) {
+                        /* NB this puts an "empty" peer in the peer
+                         * table (which takes my ref) */
+                        list_add_tail(&peer->ksnp_list,
+                                      ksocknal_nid2peerlist(peerid.nid));
+                } else {
+                        ksocknal_peer_decref(peer);
+                        peer = peer2;
+                }
+
+                /* +1 ref for me */
+                ksocknal_peer_addref(peer);
+                /* stop ksocknal_peer_failed() notifying LNET while this
+                 * accept is in progress */
+                peer->ksnp_accepting++;
+                
+                /* Am I already connecting/connected to this guy?  Resolve in
+                 * favour of higher NID... */
+                rc = 0;
+                if (peerid.nid < ni->ni_nid) {
+                        bits = (1 << conn->ksnc_type);
+
+                        list_for_each(tmp, &peer->ksnp_routes) {
+                                route = list_entry(tmp, ksock_route_t, 
+                                                   ksnr_list);
+                        
+                                if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+                                        continue;
+                                
+                                if ((route->ksnr_connecting & bits) == 0)
+                                        continue;
+
+                                rc = EALREADY;  /* not a failure */
+                                warn = "connection race";
+                                break;
+                        }
+                }
+                
+                /* drop the lock to send HELLO (may block) */
+                write_unlock_irqrestore(global_lock, flags);
+
+                if (rc != 0) {
+                        /* set CONN_NONE makes returned HELLO acknowledge I
+                         * lost a connection race */
+                        conn->ksnc_type = SOCKLND_CONN_NONE;
+                        ksocknal_send_hello (ni, conn, peerid.nid,
+                                             ipaddrs, 0);
+                } else {
+                        nipaddrs = ksocknal_select_ips(peer, ipaddrs, nipaddrs);
+                        rc = ksocknal_send_hello (ni, conn, peerid.nid,
+                                                  ipaddrs, nipaddrs);
+                }
+                
+                write_lock_irqsave(global_lock, flags);
+                peer->ksnp_accepting--;
+                
+                if (rc != 0)
+                        goto failed_2;
+        }
+
+        /* NB: global_lock is held exclusively from here down to the
+         * write_unlock after close_stale_conns */
+        if (peer->ksnp_closing ||
+            (route != NULL && route->ksnr_deleted)) {
+                /* route/peer got closed under me */
+                rc = -ESTALE;
+                warn = "peer/route removed";
+                goto failed_2;
+        }
+
+        /* Refuse to duplicate an existing connection, unless this is a
+         * loopback connection */
+        if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
+                list_for_each(tmp, &peer->ksnp_conns) {
+                        conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                        if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
+                            conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
+                            conn2->ksnc_type != conn->ksnc_type ||
+                            conn2->ksnc_incarnation != incarnation)
+                                continue;
+
+                        rc = 0;    /* more of a NOOP than a failure */
+                        warn = "duplicate";
+                        goto failed_2;
+                }
+        }
+
+        /* If the connection created by this route didn't bind to the IP
+         * address the route connected to, the connection/route matching
+         * code below probably isn't going to work. */
+        if (route != NULL &&
+            route->ksnr_ipaddr != conn->ksnc_ipaddr) {
+                CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n",
+                       libcfs_id2str(peer->ksnp_id),
+                       HIPQUAD(route->ksnr_ipaddr),
+                       HIPQUAD(conn->ksnc_ipaddr));
+        }
+
+        /* Search for a route corresponding to the new connection and
+         * create an association.  This allows incoming connections created
+         * by routes in my peer to match my own route entries so I don't
+         * continually create duplicate routes. */
+        list_for_each (tmp, &peer->ksnp_routes) {
+                route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+                if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+                        continue;
+
+                ksocknal_associate_route_conn_locked(route, conn);
+                break;
+        }
+
+        conn->ksnc_peer = peer;                 /* conn takes my ref on peer */
+        conn->ksnc_incarnation = incarnation;
+        peer->ksnp_last_alive = cfs_time_current();
+        peer->ksnp_error = 0;
+
+        sched = ksocknal_choose_scheduler_locked (irq);
+        sched->kss_nconns++;
+        conn->ksnc_scheduler = sched;
+
+        /* Set the deadline for the outgoing HELLO to drain */
+        conn->ksnc_tx_bufnob = 0;
+        conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+        mb();       /* order with adding to peer's conn list */
+
+        list_add (&conn->ksnc_list, &peer->ksnp_conns);
+        ksocknal_conn_addref(conn);
+
+        /* NB my callbacks block while I hold ksnd_global_lock */
+        ksocknal_lib_set_callback(sock, conn);
+
+        /* Take all the packets blocking for a connection.
+         * NB, it might be nicer to share these blocked packets among any
+         * other connections that are becoming established. */
+        while (!list_empty (&peer->ksnp_tx_queue)) {
+                tx = list_entry (peer->ksnp_tx_queue.next,
+                                 ksock_tx_t, tx_list);
+
+                list_del (&tx->tx_list);
+                ksocknal_queue_tx_locked (tx, conn);
+        }
+
+        rc = ksocknal_close_stale_conns_locked(peer, incarnation);
+        write_unlock_irqrestore (global_lock, flags);
+
+        if (rc != 0)
+                CDEBUG(D_HA, "Closed %d stale conns to %s ip %d.%d.%d.%d\n",
+                        rc, libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                        HIPQUAD(conn->ksnc_ipaddr));
+
+        ksocknal_lib_bind_irq (irq);
+
+        /* Call the callbacks right now to get things going. */
+        if (ksocknal_connsock_addref(conn) == 0) {
+                ksocknal_lib_act_callback(sock, conn);
+                ksocknal_connsock_decref(conn);
+        }
+
+        CDEBUG(D_NET, "New conn %s %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+               " incarnation:"LPD64" sched[%d]/%d\n",
+               libcfs_id2str(peerid), HIPQUAD(conn->ksnc_myipaddr),
+               HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation,
+               (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
+
+        ksocknal_conn_decref(conn);
+        return (0);
+
+ failed_2:
+        /* NB: arrived here with global_lock held exclusively */
+        if (!peer->ksnp_closing &&
+            list_empty (&peer->ksnp_conns) &&
+            list_empty (&peer->ksnp_routes)) {
+                /* splice peer's blocked tx queue onto zombies, then drop
+                 * the now-empty peer from the table */
+                list_add(&zombies, &peer->ksnp_tx_queue);
+                list_del_init(&peer->ksnp_tx_queue);
+                ksocknal_unlink_peer_locked(peer);
+        }
+        
+        write_unlock_irqrestore(global_lock, flags);
+
+        if (warn != NULL) {
+                if (rc < 0)
+                        CERROR("Not creating conn %s type %d: %s\n",
+                               libcfs_id2str(peerid), conn->ksnc_type, warn);
+                else
+                        CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
+                              libcfs_id2str(peerid), conn->ksnc_type, warn);
+        }
+
+        ksocknal_txlist_done(ni, &zombies);
+        ksocknal_peer_decref(peer);
+
+ failed_1:
+        /* conn never made it onto any list; just free it */
+        LIBCFS_FREE (conn, sizeof(*conn));
+
+ failed_0:
+        libcfs_sock_release(sock);
+        return rc;
+}
+
+void
+ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
+{
+        /* This just does the immmediate housekeeping, and queues the
+         * connection for the reaper to terminate.
+         * Caller holds ksnd_global_lock exclusively in irq context */
+        ksock_peer_t      *peer = conn->ksnc_peer;
+        ksock_route_t     *route;
+        ksock_conn_t      *conn2;
+        struct list_head  *tmp;
+
+        LASSERT (peer->ksnp_error == 0);
+        LASSERT (!conn->ksnc_closing);
+        conn->ksnc_closing = 1;
+
+        /* ksnd_deathrow_conns takes over peer's ref */
+        list_del (&conn->ksnc_list);
+
+        route = conn->ksnc_route;
+        if (route != NULL) {
+                /* dissociate conn from route... */
+                LASSERT (!route->ksnr_deleted);
+                LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
+
+                /* does the route still have another live conn of this
+                 * type?  If not, clear its 'connected' bit */
+                conn2 = NULL;
+                list_for_each(tmp, &peer->ksnp_conns) {
+                        conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                        if (conn2->ksnc_route == route &&
+                            conn2->ksnc_type == conn->ksnc_type)
+                                break;
+
+                        conn2 = NULL;
+                }
+                if (conn2 == NULL)
+                        route->ksnr_connected &= ~(1 << conn->ksnc_type);
+
+                conn->ksnc_route = NULL;
+
+#if 0           /* irrelevent with only eager routes */
+                list_del (&route->ksnr_list);   /* make route least favourite */
+                list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
+#endif
+                ksocknal_route_decref(route);     /* drop conn's ref on route */
+        }
+
+        if (list_empty (&peer->ksnp_conns)) {
+                /* No more connections to this peer */
+
+                peer->ksnp_error = error;       /* stash last conn close reason */
+
+                if (list_empty (&peer->ksnp_routes)) {
+                        /* I've just closed last conn belonging to a
+                         * peer with no routes to it */
+                        ksocknal_unlink_peer_locked (peer);
+                }
+        }
+
+        /* hand the conn to the reaper for termination */
+        spin_lock (&ksocknal_data.ksnd_reaper_lock);
+
+        list_add_tail (&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns);
+        cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq);
+
+        spin_unlock (&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_peer_failed (ksock_peer_t *peer)
+{
+        /* There has been a connection failure or comms error; but I'll only
+         * tell LNET I think the peer is dead if it's to another kernel and
+         * there are no connections or connection attempts in existance. */
+        time_t    last_alive = 0;
+        int       notify;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        notify = ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
+                  list_empty(&peer->ksnp_conns) &&
+                  peer->ksnp_accepting == 0 &&
+                  ksocknal_find_connecting_route_locked(peer) == NULL);
+
+        if (notify)
+                last_alive = cfs_time_current_sec() -
+                             cfs_duration_sec(cfs_time_current() -
+                                              peer->ksnp_last_alive);
+
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+
+        /* notify outside the lock; lnet_notify may block */
+        if (notify)
+                lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0,
+                             last_alive);
+}
+
+void
+ksocknal_terminate_conn (ksock_conn_t *conn)
+{
+        /* This gets called by the reaper (guaranteed thread context) to
+         * disengage the socket from its callbacks and close it.
+         * ksnc_refcount will eventually hit zero, and then the reaper will
+         * destroy it. */
+        unsigned long   flags;
+        ksock_peer_t   *peer = conn->ksnc_peer;
+        ksock_sched_t  *sched = conn->ksnc_scheduler;
+        int             failed = 0;
+
+        LASSERT(conn->ksnc_closing);
+
+        /* wake up the scheduler to "send" all remaining packets to /dev/null */
+        spin_lock_irqsave(&sched->kss_lock, flags);
+
+        if (!conn->ksnc_tx_scheduled &&
+            !list_empty(&conn->ksnc_tx_queue)){
+                list_add_tail (&conn->ksnc_tx_list,
+                               &sched->kss_tx_conns);
+                /* a closing conn is always ready to tx */
+                conn->ksnc_tx_ready = 1;
+                conn->ksnc_tx_scheduled = 1;
+                /* extra ref for scheduler */
+                ksocknal_conn_addref(conn);
+
+                cfs_waitq_signal (&sched->kss_waitq);
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+        /* serialise with callbacks */
+        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+
+        ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
+
+        /* OK, so this conn may not be completely disengaged from its
+         * scheduler yet, but it _has_ committed to terminate... */
+        conn->ksnc_scheduler->kss_nconns--;
+
+        if (peer->ksnp_error != 0) {
+                /* peer's last conn closed in error */
+                LASSERT (list_empty (&peer->ksnp_conns));
+                failed = 1;
+        }
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+
+        /* The socket is closed on the final put; either here, or in
+         * ksocknal_{send,recv}msg().  Since we set up the linger2 option
+         * when the connection was established, this will close the socket
+         * immediately, aborting anything buffered in it. Any hung
+         * zero-copy transmits will therefore complete in finite time. */
+        ksocknal_connsock_decref(conn);
+
+        /* tell LNET the peer may be dead (checked outside any lock) */
+        if (failed)
+                ksocknal_peer_failed(peer);
+}
+
+void
+ksocknal_queue_zombie_conn (ksock_conn_t *conn)
+{
+        /* Hand a refcount-zero conn to the reaper for final destruction */
+        unsigned long flags;
+
+        LASSERT (atomic_read(&conn->ksnc_conn_refcount) == 0);
+
+        spin_lock_irqsave(&ksocknal_data.ksnd_reaper_lock, flags);
+        list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
+        cfs_waitq_signal(&ksocknal_data.ksnd_reaper_waitq);
+        spin_unlock_irqrestore(&ksocknal_data.ksnd_reaper_lock, flags);
+}
+
+void
+ksocknal_destroy_conn (ksock_conn_t *conn)
+{
+        /* Final coup-de-grace of the reaper: the conn must be fully
+         * disengaged (no refs, no socket, no route, nothing scheduled)
+         * before its memory is returned. */
+        CDEBUG (D_NET, "connection %p\n", conn);
+
+        LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0);
+        LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0);
+        LASSERT (conn->ksnc_sock == NULL);
+        LASSERT (conn->ksnc_route == NULL);
+        LASSERT (!conn->ksnc_tx_scheduled);
+        LASSERT (!conn->ksnc_rx_scheduled);
+        LASSERT (list_empty(&conn->ksnc_tx_queue));
+
+        /* complete current receive if any */
+        if (conn->ksnc_rx_state == SOCKNAL_RX_BODY) {
+                /* a message payload was in flight; fail it back to LNET */
+                CERROR("Completing partial receive from %s"
+                       ", ip %d.%d.%d.%d:%d, with error\n",
+                       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+                lnet_finalize (conn->ksnc_peer->ksnp_ni,
+                               conn->ksnc_cookie, -EIO);
+        } else if (conn->ksnc_rx_state != SOCKNAL_RX_HEADER &&
+                   conn->ksnc_rx_state != SOCKNAL_RX_SLOP) {
+                LBUG ();                        /* unexpected rx state */
+        }
+
+        ksocknal_peer_decref(conn->ksnc_peer);
+
+        LIBCFS_FREE (conn, sizeof (*conn));
+}
+
+int
+ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why)
+{
+        /* Close every connection of 'peer' bound to local IP 'ipaddr'
+         * (all of the peer's connections when ipaddr == 0) with error
+         * 'why'.  Caller holds ksnd_global_lock for writing.  Returns
+         * the number of connections closed. */
+        struct list_head   *pos;
+        struct list_head   *nxt;
+        int                 nclosed = 0;
+
+        list_for_each_safe (pos, nxt, &peer->ksnp_conns) {
+                ksock_conn_t *conn = list_entry (pos, ksock_conn_t, ksnc_list);
+
+                if (ipaddr != 0 && conn->ksnc_ipaddr != ipaddr)
+                        continue;
+
+                ksocknal_close_conn_locked (conn, why);
+                nclosed++;
+        }
+
+        return nclosed;
+}
+
+/* Close all of 'peer's connections whose incarnation differs from
+ * 'incarnation' (i.e. created before the peer last rebooted).  Caller
+ * holds ksnd_global_lock for writing.  Returns the number closed. */
+int
+ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation)
+{
+        ksock_conn_t       *conn;
+        struct list_head   *ctmp;
+        struct list_head   *cnxt;
+        int                 count = 0;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+                conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+                /* current incarnation: still valid */
+                if (conn->ksnc_incarnation == incarnation)
+                        continue;
+
+                CDEBUG(D_NET, "Closing stale conn %s ip:%08x/%d "
+                       "incarnation:"LPD64"("LPD64")\n",
+                       libcfs_id2str(peer->ksnp_id), 
+                       conn->ksnc_ipaddr, conn->ksnc_port,
+                       conn->ksnc_incarnation, incarnation);
+
+                count++;
+                ksocknal_close_conn_locked (conn, -ESTALE);
+        }
+
+        return (count);
+}
+
+/* Close 'conn' and all of its peer's connections on the same local IP.
+ * Takes ksnd_global_lock itself; returns the number of connections
+ * closed. */
+int
+ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
+{
+        ksock_peer_t     *peer = conn->ksnc_peer;
+        __u32             ipaddr = conn->ksnc_ipaddr;
+        unsigned long     flags;
+        int               count;
+
+        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+
+        count = ksocknal_close_peer_conns_locked (peer, ipaddr, why);
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+
+        return (count);
+}
+
+/* Close connections of all peers matching 'id' (LNET_NID_ANY /
+ * LNET_PID_ANY act as wildcards) on local IP 'ipaddr' (0 == any).
+ * Returns 0 on success (wildcards always "succeed"), -ENOENT if a
+ * fully-specified request matched nothing. */
+int
+ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr)
+{
+        unsigned long       flags;
+        ksock_peer_t       *peer;
+        struct list_head   *ptmp;
+        struct list_head   *pnxt;
+        int                 lo;
+        int                 hi;
+        int                 i;
+        int                 count = 0;
+
+        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+
+        /* a specific nid hashes to a single bucket; otherwise scan all */
+        if (id.nid != LNET_NID_ANY)
+                lo = hi = ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers;
+        else {
+                lo = 0;
+                hi = ksocknal_data.ksnd_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) {
+
+                        peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                        if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) &&
+                              (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid)))
+                                continue;
+
+                        count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0);
+                }
+        }
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+
+        /* wildcards always succeed */
+        if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
+                return (0);
+
+        return (count == 0 ? -ENOENT : 0);
+}
+
+/* LND notify callback: the router reports a change in gateway liveness. */
+void
+ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive)
+{
+        /* The router is telling me she's been notified of a change in
+         * gateway state.... */
+        lnet_process_id_t  id = {/* .nid = */ gw_nid, /* .pid = */ LNET_PID_ANY};
+
+        CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), 
+                alive ? "up" : "down");
+
+        if (!alive) {
+                /* If the gateway crashed, close all open connections... */
+                ksocknal_close_matching_conns (id, 0);
+                return;
+        }
+
+        /* ...otherwise do nothing.  We can only establish new connections
+         * if we have autoconnect routes, and these connect on demand. */
+}
+
+/* Push every connection of 'peer'.  Connections are addressed by index
+ * so the global lock can be dropped while each one is pushed; a conn
+ * ref is taken under the lock to keep it alive meanwhile. */
+void
+ksocknal_push_peer (ksock_peer_t *peer)
+{
+        int               index;
+        int               i;
+        struct list_head *tmp;
+        ksock_conn_t     *conn;
+
+        for (index = 0; ; index++) {
+                read_lock (&ksocknal_data.ksnd_global_lock);
+
+                i = 0;
+                conn = NULL;
+
+                /* find the index'th conn (list may change between passes) */
+                list_for_each (tmp, &peer->ksnp_conns) {
+                        if (i++ == index) {
+                                conn = list_entry (tmp, ksock_conn_t, ksnc_list);
+                                ksocknal_conn_addref(conn);
+                                break;
+                        }
+                }
+
+                read_unlock (&ksocknal_data.ksnd_global_lock);
+
+                /* ran off the end of the conn list: done */
+                if (conn == NULL)
+                        break;
+
+                ksocknal_push_conn (conn);
+                ksocknal_conn_decref(conn);
+        }
+}
+
+/* Ioctl handler: push every connection of every peer matching 'id'
+ * (LNET_NID_ANY / LNET_PID_ANY act as wildcards).  Returns 0 if at
+ * least one peer matched, -ENOENT otherwise.
+ *
+ * Peers are addressed by (hash bucket, index) so ksnd_global_lock can
+ * be dropped while each peer is pushed; a peer ref is taken under the
+ * lock to keep the peer alive across the unlocked push. */
+int
+ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id)
+{
+        ksock_peer_t      *peer;
+        struct list_head  *tmp;
+        int                index;
+        int                i;
+        int                j;
+        int                rc = -ENOENT;
+
+        for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                for (j = 0; ; j++) {
+                        read_lock (&ksocknal_data.ksnd_global_lock);
+
+                        index = 0;
+                        peer = NULL;
+
+                        list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+                                ksock_peer_t *p =
+                                        list_entry(tmp, ksock_peer_t,
+                                                   ksnp_list);
+
+                                if (!((id.nid == LNET_NID_ANY ||
+                                       id.nid == p->ksnp_id.nid) &&
+                                      (id.pid == LNET_PID_ANY ||
+                                       id.pid == p->ksnp_id.pid)))
+                                        continue;
+
+                                /* BUG FIX: only set 'peer' once a ref has
+                                 * been taken; previously a trailing match
+                                 * could leave 'peer' set without a ref */
+                                if (index++ == j) {
+                                        peer = p;
+                                        ksocknal_peer_addref(peer);
+                                        break;
+                                }
+                        }
+
+                        read_unlock (&ksocknal_data.ksnd_global_lock);
+
+                        /* BUG FIX: no peer at index j means this bucket is
+                         * exhausted; the loop previously never terminated */
+                        if (peer == NULL)
+                                break;
+
+                        rc = 0;
+                        ksocknal_push_peer (peer);
+                        ksocknal_peer_decref(peer);
+                }
+        }
+
+        return (rc);
+}
+
+/* Ioctl handler: register local interface 'ipaddress'/'netmask' with NI
+ * 'ni' so new connections can use it.  Returns 0 on success (duplicate
+ * registrations are silently ignored), -EINVAL for a zero address or
+ * mask, -ENOSPC when the interface table is full.  Existing peers and
+ * routes already using this IP are counted into the new entry. */
+int
+ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
+{
+        ksock_net_t       *net = ni->ni_data;
+        unsigned long      flags;
+        ksock_interface_t *iface;
+        int                rc;
+        int                i;
+        int                j;
+        struct list_head  *ptmp;
+        ksock_peer_t      *peer;
+        struct list_head  *rtmp;
+        ksock_route_t     *route;
+
+        if (ipaddress == 0 ||
+            netmask == 0)
+                return (-EINVAL);
+
+        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+
+        iface = ksocknal_ip2iface(ni, ipaddress);
+        if (iface != NULL) {
+                /* silently ignore dups */
+                rc = 0;
+        } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
+                rc = -ENOSPC;
+        } else {
+                iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
+
+                iface->ksni_ipaddr = ipaddress;
+                iface->ksni_netmask = netmask;
+                iface->ksni_nroutes = 0;
+                iface->ksni_npeers = 0;
+
+                /* count peers/routes already bound to this IP */
+                for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                        list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
+                                peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
+
+                                /* BUG FIX: loop condition previously tested
+                                 * 'i' instead of 'j', mis-bounding the scan
+                                 * of the passive IP array */
+                                for (j = 0; j < peer->ksnp_n_passive_ips; j++)
+                                        if (peer->ksnp_passive_ips[j] == ipaddress)
+                                                iface->ksni_npeers++;
+
+                                list_for_each(rtmp, &peer->ksnp_routes) {
+                                        route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+                                        if (route->ksnr_myipaddr == ipaddress)
+                                                iface->ksni_nroutes++;
+                                }
+                        }
+                }
+
+                rc = 0;
+                /* NB only new connections will pay attention to the new interface! */
+        }
+
+        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+
+        return (rc);
+}
+
+/* Remove all of 'peer's use of local IP 'ipaddr': drop it from the
+ * passive IP list, unbind or delete routes using it, and close conns
+ * bound to it.  Caller holds ksnd_global_lock for writing. */
+void
+ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
+{
+        struct list_head   *tmp;
+        struct list_head   *nxt;
+        ksock_route_t      *route;
+        ksock_conn_t       *conn;
+        int                 i;
+        int                 j;
+
+        /* remove ipaddr from the passive IP array, shifting the rest down */
+        for (i = 0; i < peer->ksnp_n_passive_ips; i++)
+                if (peer->ksnp_passive_ips[i] == ipaddr) {
+                        for (j = i+1; j < peer->ksnp_n_passive_ips; j++)
+                                peer->ksnp_passive_ips[j-1] =
+                                        peer->ksnp_passive_ips[j];
+                        peer->ksnp_n_passive_ips--;
+                        break;
+                }
+
+        list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+                route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+                if (route->ksnr_myipaddr != ipaddr)
+                        continue;
+
+                if (route->ksnr_share_count != 0) {
+                        /* Manually created; keep, but unbind */
+                        route->ksnr_myipaddr = 0;
+                } else {
+                        ksocknal_del_route_locked(route);
+                }
+        }
+
+        list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
+                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_myipaddr == ipaddr)
+                        ksocknal_close_conn_locked (conn, 0);
+        }
+}
+
+/* Ioctl handler: unregister local interface 'ipaddress' (0 == all) from
+ * NI 'ni' and strip every peer's use of it.  Returns 0 if at least one
+ * interface matched, -ENOENT otherwise. */
+int
+ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
+{
+        ksock_net_t       *net = ni->ni_data;
+        int                rc = -ENOENT;
+        unsigned long      flags;
+        struct list_head  *tmp;
+        struct list_head  *nxt;
+        ksock_peer_t      *peer;
+        __u32              this_ip;
+        int                i;
+        int                j;
+
+        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+
+        for (i = 0; i < net->ksnn_ninterfaces; i++) {
+                this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
+
+                if (!(ipaddress == 0 ||
+                      ipaddress == this_ip))
+                        continue;
+
+                rc = 0;
+
+                /* compact the interface table over the removed entry */
+                for (j = i+1; j < net->ksnn_ninterfaces; j++)
+                        net->ksnn_interfaces[j-1] =
+                                net->ksnn_interfaces[j];
+
+                net->ksnn_ninterfaces--;
+
+                /* purge this IP from every peer on this NI */
+                for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
+                        list_for_each_safe(tmp, nxt, &ksocknal_data.ksnd_peers[j]) {
+                                peer = list_entry(tmp, ksock_peer_t, ksnp_list);
+
+                                if (peer->ksnp_ni != ni)
+                                        continue;
+
+                                ksocknal_peer_del_interface_locked(peer, this_ip);
+                        }
+                }
+        }
+
+        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+
+        return (rc);
+}
+
+/* LND ioctl dispatcher: handle IOC_LIBCFS_* commands for this NI.
+ * Returns 0 on success or a negative errno. */
+int
+ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+        struct libcfs_ioctl_data *data = arg;
+        int rc;
+
+        switch(cmd) {
+        case IOC_LIBCFS_GET_INTERFACE: {
+                /* return the ioc_count'th local interface */
+                ksock_net_t       *net = ni->ni_data;
+                ksock_interface_t *iface;
+
+                read_lock (&ksocknal_data.ksnd_global_lock);
+
+                if (data->ioc_count < 0 ||
+                    data->ioc_count >= net->ksnn_ninterfaces) {
+                        rc = -ENOENT;
+                } else {
+                        rc = 0;
+                        iface = &net->ksnn_interfaces[data->ioc_count];
+
+                        data->ioc_u32[0] = iface->ksni_ipaddr;
+                        data->ioc_u32[1] = iface->ksni_netmask;
+                        data->ioc_u32[2] = iface->ksni_npeers;
+                        data->ioc_u32[3] = iface->ksni_nroutes;
+                }
+
+                read_unlock (&ksocknal_data.ksnd_global_lock);
+                return rc;
+        }
+
+        case IOC_LIBCFS_ADD_INTERFACE:
+                return ksocknal_add_interface(ni,
+                                              data->ioc_u32[0], /* IP address */
+                                              data->ioc_u32[1]); /* net mask */
+
+        case IOC_LIBCFS_DEL_INTERFACE:
+                return ksocknal_del_interface(ni, 
+                                              data->ioc_u32[0]); /* IP address */
+
+        case IOC_LIBCFS_GET_PEER: {
+                /* return info for the ioc_count'th peer */
+                lnet_process_id_t id = {0,};
+                __u32            myip = 0;
+                __u32            ip = 0;
+                int              port = 0;
+                int              conn_count = 0;
+                int              share_count = 0;
+
+                rc = ksocknal_get_peer_info(ni, data->ioc_count,
+                                            &id, &myip, &ip, &port,
+                                            &conn_count,  &share_count);
+                if (rc != 0)
+                        return rc;
+                        
+                data->ioc_nid    = id.nid;
+                data->ioc_count  = share_count;
+                data->ioc_u32[0] = ip;
+                data->ioc_u32[1] = port;
+                data->ioc_u32[2] = myip;
+                data->ioc_u32[3] = conn_count;
+                data->ioc_u32[4] = id.pid;
+                return 0;
+        }
+
+        case IOC_LIBCFS_ADD_PEER: {
+                lnet_process_id_t  id = {/* .nid = */ data->ioc_nid,
+                                         /* .pid = */ LUSTRE_SRV_LNET_PID};
+                return ksocknal_add_peer (ni, id,
+                                          data->ioc_u32[0], /* IP */
+                                          data->ioc_u32[1]); /* port */
+        }
+        case IOC_LIBCFS_DEL_PEER: {
+                lnet_process_id_t  id = {/* .nid = */ data->ioc_nid,
+                                         /* .pid = */ LNET_PID_ANY};
+                return ksocknal_del_peer (ni, id,
+                                          data->ioc_u32[0]); /* IP */
+        }
+        case IOC_LIBCFS_GET_CONN: {
+                /* return info for the ioc_count'th connection */
+                int           txmem;
+                int           rxmem;
+                int           nagle;
+                ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count);
+
+                if (conn == NULL)
+                        return -ENOENT;
+
+                ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
+                data->ioc_count  = txmem;
+                data->ioc_nid    = conn->ksnc_peer->ksnp_id.nid;
+                data->ioc_flags  = nagle;
+                data->ioc_u32[0] = conn->ksnc_ipaddr;
+                data->ioc_u32[1] = conn->ksnc_port;
+                data->ioc_u32[2] = conn->ksnc_myipaddr;
+                data->ioc_u32[3] = conn->ksnc_type;
+                data->ioc_u32[4] = conn->ksnc_scheduler -
+                                   ksocknal_data.ksnd_schedulers;
+                data->ioc_u32[5] = rxmem;
+                data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
+                ksocknal_conn_decref(conn);
+                return 0;
+        }
+
+        case IOC_LIBCFS_CLOSE_CONNECTION: {
+                lnet_process_id_t  id = {/* .nid = */ data->ioc_nid,
+                                         /* .pid = */ LNET_PID_ANY};
+
+                return ksocknal_close_matching_conns (id,
+                                                      data->ioc_u32[0]);
+        }
+        case IOC_LIBCFS_REGISTER_MYNID:
+                /* Ignore if this is a noop */
+               if (data->ioc_nid == ni->ni_nid)
+                       return 0;
+
+               CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                      libcfs_nid2str(data->ioc_nid),
+                      libcfs_nid2str(ni->ni_nid));
+               return -EINVAL;
+
+        case IOC_LIBCFS_PUSH_CONNECTION: {
+                lnet_process_id_t  id = {/* .nid = */ data->ioc_nid,
+                                         /* .pid = */ LNET_PID_ANY};
+                
+                return ksocknal_push(ni, id);
+        }
+        default:
+                return -EINVAL;
+        }
+        /* not reached */
+}
+
+/* Free the global scheduler and peer hash table allocations made by
+ * ksocknal_base_startup().  All transmits must already be complete. */
+void
+ksocknal_free_buffers (void)
+{
+        LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
+
+        if (ksocknal_data.ksnd_schedulers != NULL)
+                LIBCFS_FREE (ksocknal_data.ksnd_schedulers,
+                             sizeof (ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
+
+        LIBCFS_FREE (ksocknal_data.ksnd_peers,
+                     sizeof (struct list_head) *
+                     ksocknal_data.ksnd_peer_hash_size);
+}
+
+/* Tear down the module-global socknal state: wait out queued connreqs,
+ * flag all threads to exit, wait for them to die, then free the global
+ * buffers.  Runs when the last NI shuts down (or startup fails); the
+ * switch falls through from the current init stage to full cleanup. */
+void
+ksocknal_base_shutdown (void)
+{
+        ksock_sched_t *sched;
+        int            i;
+        unsigned long  flags;
+
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&libcfs_kmemory));
+        LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+        switch (ksocknal_data.ksnd_init) {
+        default:
+                LASSERT (0);
+
+        case SOCKNAL_INIT_ALL:
+                /* Wait for queued connreqs to clean up */
+                i = 2;
+                spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags);
+                while (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+                        spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock,
+                                               flags);
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "waiting for connreqs to clean up\n");
+                        cfs_pause(cfs_time_seconds(1));
+
+                        spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags);
+                }
+                spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock, 
+                                       flags);
+
+                /* fall through */
+
+        case SOCKNAL_INIT_DATA:
+                /* all peers/conns must already be gone */
+                LASSERT (ksocknal_data.ksnd_peers != NULL);
+                for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                        LASSERT (list_empty (&ksocknal_data.ksnd_peers[i]));
+                }
+                LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns));
+                LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns));
+                LASSERT (list_empty (&ksocknal_data.ksnd_connd_connreqs));
+                LASSERT (list_empty (&ksocknal_data.ksnd_connd_routes));
+
+                if (ksocknal_data.ksnd_schedulers != NULL)
+                        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
+                                ksock_sched_t *kss =
+                                        &ksocknal_data.ksnd_schedulers[i];
+
+                                LASSERT (list_empty (&kss->kss_tx_conns));
+                                LASSERT (list_empty (&kss->kss_rx_conns));
+                                LASSERT (kss->kss_nconns == 0);
+                        }
+
+                /* flag threads to terminate; wake and wait for them to die */
+                ksocknal_data.ksnd_shuttingdown = 1;
+                cfs_waitq_broadcast (&ksocknal_data.ksnd_connd_waitq);
+                cfs_waitq_broadcast (&ksocknal_data.ksnd_reaper_waitq);
+
+                if (ksocknal_data.ksnd_schedulers != NULL)
+                        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
+                                sched = &ksocknal_data.ksnd_schedulers[i];
+                                cfs_waitq_broadcast(&sched->kss_waitq);
+                        }
+
+                /* poll ksnd_nthreads until every thread has exited */
+                i = 4;
+                read_lock(&ksocknal_data.ksnd_global_lock);
+                while (ksocknal_data.ksnd_nthreads != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "waiting for %d threads to terminate\n",
+                                ksocknal_data.ksnd_nthreads);
+                        read_unlock(&ksocknal_data.ksnd_global_lock);
+                        cfs_pause(cfs_time_seconds(1));
+                        read_lock(&ksocknal_data.ksnd_global_lock);
+                }
+                read_unlock(&ksocknal_data.ksnd_global_lock);
+
+                ksocknal_free_buffers();
+
+                ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
+                break;
+        }
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&libcfs_kmemory));
+
+        PORTAL_MODULE_UNUSE;
+}
+
+
+__u64
+ksocknal_new_incarnation (void)
+{
+        struct timeval tv;
+
+        /* The incarnation number is the time this module loaded and it
+         * identifies this particular instance of the socknal.  Hopefully
+         * we won't be able to reboot more frequently than 1MHz for the
+         * foreseeable future :) */
+
+        do_gettimeofday(&tv);
+
+        /* microseconds since the epoch */
+        return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+}
+
+/* One-time module-global initialisation: allocate the peer hash table
+ * and schedulers, then spawn scheduler, connd and reaper threads.
+ * Returns 0 on success or -ENOMEM/-ENETDOWN; any failure unwinds via
+ * ksocknal_base_shutdown(). */
+int
+ksocknal_base_startup (void)
+{
+        int               rc;
+        int               i;
+
+        LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+        LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+        memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+        ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
+        LIBCFS_ALLOC (ksocknal_data.ksnd_peers,
+                      sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size);
+        if (ksocknal_data.ksnd_peers == NULL)
+                return -ENOMEM;
+
+        for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
+                CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
+
+        rwlock_init(&ksocknal_data.ksnd_global_lock);
+
+        /* reaper state: conn death row and zombie lists */
+        spin_lock_init (&ksocknal_data.ksnd_reaper_lock);
+        CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
+        CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns);
+        CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns);
+        cfs_waitq_init(&ksocknal_data.ksnd_reaper_waitq);
+
+        /* connection daemon state */
+        spin_lock_init (&ksocknal_data.ksnd_connd_lock);
+        CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs);
+        CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes);
+        cfs_waitq_init(&ksocknal_data.ksnd_connd_waitq);
+
+        /* NB memset above zeros whole of ksocknal_data, including
+         * ksocknal_data.ksnd_irqinfo[all].ksni_valid */
+
+        /* flag lists/ptrs/locks initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+        PORTAL_MODULE_USE;
+
+        ksocknal_data.ksnd_nschedulers = ksocknal_nsched();
+        LIBCFS_ALLOC(ksocknal_data.ksnd_schedulers,
+                     sizeof(ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
+        if (ksocknal_data.ksnd_schedulers == NULL)
+                goto failed;
+
+        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
+                ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i];
+
+                spin_lock_init (&kss->kss_lock);
+                CFS_INIT_LIST_HEAD (&kss->kss_rx_conns);
+                CFS_INIT_LIST_HEAD (&kss->kss_tx_conns);
+#if SOCKNAL_ZC
+                CFS_INIT_LIST_HEAD (&kss->kss_zctxdone_list);
+#endif
+                cfs_waitq_init (&kss->kss_waitq);
+        }
+
+        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
+                rc = ksocknal_thread_start (ksocknal_scheduler,
+                                            &ksocknal_data.ksnd_schedulers[i]);
+                if (rc != 0) {
+                        CERROR("Can't spawn socknal scheduler[%d]: %d\n",
+                               i, rc);
+                        goto failed;
+                }
+        }
+
+        for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
+                rc = ksocknal_thread_start (ksocknal_connd, (void *)((long)i));
+                if (rc != 0) {
+                        CERROR("Can't spawn socknal connd: %d\n", rc);
+                        goto failed;
+                }
+        }
+
+        rc = ksocknal_thread_start (ksocknal_reaper, NULL);
+        if (rc != 0) {
+                CERROR ("Can't spawn socknal reaper: %d\n", rc);
+                goto failed;
+        }
+
+        /* flag everything initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+        return 0;
+
+ failed:
+        ksocknal_base_shutdown();
+        return -ENETDOWN;
+}
+
+/* Shut down NI 'ni': delete all its peers, wait for peer state to drain,
+ * free its net struct, and tear down the module globals when the last
+ * net goes away. */
+void
+ksocknal_shutdown (lnet_ni_t *ni)
+{
+        ksock_net_t      *net = ni->ni_data;
+        int               i;
+        unsigned long     flags;
+        lnet_process_id_t  anyid = { /* .nid = */ LNET_NID_ANY,
+                                     /* .pid = */ LNET_PID_ANY};
+
+        LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
+        LASSERT(ksocknal_data.ksnd_nnets > 0);
+
+        spin_lock_irqsave(&net->ksnn_lock, flags);
+        net->ksnn_shutdown = 1;                 /* prevent new peers */
+        spin_unlock_irqrestore(&net->ksnn_lock, flags);
+
+        /* Delete all peers */
+        ksocknal_del_peer(ni, anyid, 0);
+
+        /* Wait for all peer state to clean up */
+        i = 2;
+        spin_lock_irqsave(&net->ksnn_lock, flags);
+        while (net->ksnn_npeers != 0) {
+                spin_unlock_irqrestore(&net->ksnn_lock, flags);
+
+                i++;
+                CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                       "waiting for %d peers to disconnect\n",
+                       net->ksnn_npeers);
+                cfs_pause(cfs_time_seconds(1));
+
+                spin_lock_irqsave(&net->ksnn_lock, flags);
+        }
+        spin_unlock_irqrestore(&net->ksnn_lock, flags);
+
+        /* no interface may still have peers or routes */
+        for (i = 0; i < net->ksnn_ninterfaces; i++) {
+                LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0);
+                LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0);
+        }
+
+        LIBCFS_FREE(net, sizeof(*net));
+        
+        ksocknal_data.ksnd_nnets--;
+        if (ksocknal_data.ksnd_nnets == 0)
+                ksocknal_base_shutdown();
+}
+
+/* Auto-configure 'net' from the system's IP interfaces: record every
+ * usable (up, non-loopback) interface, up to LNET_MAX_INTERFACES.
+ * Returns the number recorded, or the (non-positive) enumeration
+ * error. */
+int
+ksocknal_enumerate_interfaces(ksock_net_t *net)
+{
+        char      **names;
+        int         i;
+        int         j;
+        int         rc;
+        int         n;
+                
+        n = libcfs_ipif_enumerate(&names);
+        if (n <= 0) {
+                CERROR("Can't enumerate interfaces: %d\n", n);
+                return n;
+        }
+
+        for (i = j = 0; i < n; i++) {
+                int        up;
+                __u32      ip;
+                __u32      mask;
+
+                if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+                        continue;
+
+                rc = libcfs_ipif_query(names[i], &up, &ip, &mask);
+                if (rc != 0) {
+                        CWARN("Can't get interface %s info: %d\n",
+                              names[i], rc);
+                        continue;
+                }
+                
+                if (!up) {
+                        CWARN("Ignoring interface %s (down)\n",
+                              names[i]);
+                        continue;
+                }
+
+                if (j == LNET_MAX_INTERFACES) {
+                        CWARN("Ignoring interface %s (too many interfaces)\n",
+                              names[i]);
+                        continue;
+                }
+
+                net->ksnn_interfaces[j].ksni_ipaddr = ip;
+                net->ksnn_interfaces[j].ksni_netmask = mask;
+                j++;
+        }
+
+        libcfs_ipif_free_enumeration(names, n);
+        
+        if (j == 0)
+                CERROR("Can't find any usable interfaces\n");
+        
+        return j;
+}
+
+/* LND startup callback for NI 'ni': bring up the module globals on
+ * first use, allocate and configure the per-net state (interfaces from
+ * modparams or auto-enumeration), and derive the NI's NID from its
+ * first interface.  Returns 0 or -ENETDOWN. */
+int
+ksocknal_startup (lnet_ni_t *ni)
+{
+        ksock_net_t  *net;
+        int           rc;
+        int           i;
+
+        LASSERT (ni->ni_lnd == &the_ksocklnd);
+
+        /* first net brings up the module-global state */
+        if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
+                rc = ksocknal_base_startup();
+                if (rc != 0)
+                        return rc;
+        }
+        
+        LIBCFS_ALLOC(net, sizeof(*net));
+        if (net == NULL)
+                goto fail_0;
+                
+        memset(net, 0, sizeof(*net));
+        spin_lock_init(&net->ksnn_lock);
+        net->ksnn_incarnation = ksocknal_new_incarnation();
+        ni->ni_data = net;
+        ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits;
+        ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peercredits;
+        
+        if (ni->ni_interfaces[0] == NULL) {
+                /* no interfaces specified: use all usable ones */
+                rc = ksocknal_enumerate_interfaces(net);
+                if (rc <= 0)
+                        goto fail_1;
+
+                net->ksnn_ninterfaces = rc;
+        } else {
+                /* use exactly the interfaces named in the NI config */
+                for (i = 0; i < LNET_MAX_INTERFACES; i++) {
+                        int    up;
+
+                        if (ni->ni_interfaces[i] == NULL)
+                                break;
+
+                        rc = libcfs_ipif_query(
+                                ni->ni_interfaces[i], &up,
+                                &net->ksnn_interfaces[i].ksni_ipaddr,
+                                &net->ksnn_interfaces[i].ksni_netmask);
+                        
+                        if (rc != 0) {
+                                CERROR("Can't get interface %s info: %d\n",
+                                       ni->ni_interfaces[i], rc);
+                                goto fail_1;
+                        }
+                        
+                        if (!up) {
+                                CERROR("Interface %s is down\n",
+                                       ni->ni_interfaces[i]);
+                                goto fail_1;
+                        }
+                }
+                net->ksnn_ninterfaces = i;
+        }
+
+        /* NID address comes from the first interface */
+        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+                                net->ksnn_interfaces[0].ksni_ipaddr);
+
+        ksocknal_data.ksnd_nnets++;
+
+        return 0;
+        
+ fail_1:
+        LIBCFS_FREE(net, sizeof(*net));
+ fail_0:
+        if (ksocknal_data.ksnd_nnets == 0)
+                ksocknal_base_shutdown();
+
+        return -ENETDOWN;
+}
+
+
+/* Module unload: deregister the LND and undo tunables setup */
+void __exit
+ksocknal_module_fini (void)
+{
+        lnet_unregister_lnd(&the_ksocklnd);
+        ksocknal_lib_tunables_fini();
+}
+
+/* Module load: set up tunables and register the LND with LNET.
+ * Returns 0 or the tunables-init error. */
+int __init
+ksocknal_module_init (void)
+{
+        int    rc;
+
+        /* check ksnr_connected/connecting field large enough */
+        CLASSERT(SOCKLND_CONN_NTYPES <= 4);
+        
+        rc = ksocknal_lib_tunables_init();
+        if (rc != 0)
+                return rc;
+
+        lnet_register_lnd(&the_ksocklnd);
+
+        return 0;
+}
+
+/* Module metadata and entry/exit point registration */
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel TCP Socket LND v1.0.0");
+MODULE_LICENSE("GPL");
+
+cfs_module(ksocknal, "1.0.0", ksocknal_module_init, ksocknal_module_fini);
diff --git a/lnet/klnds/tdilnd/socklnd.h b/lnet/klnds/tdilnd/socklnd.h
new file mode 100644 (file)
index 0000000..e7ebefc
--- /dev/null
@@ -0,0 +1,498 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#if defined(__linux__)
+#include "socklnd_lib-linux.h"
+#elif defined(__APPLE__)
+#include "socklnd_lib-darwin.h"
+#elif defined(__WINNT__)
+#include "socklnd_lib-winnt.h"
+#else
+#error Unsupported Operating System
+#endif
+
+#include <libcfs/kp30.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
+#include <lnet/socklnd.h>
+
+/* default vals for tunables/modparams */
+#define SOCKNAL_TIMEOUT          50             /* default comms timeout (seconds) */
+#define SOCKNAL_NCONND           4              /* # socknal connection daemons */
+#define SOCKNAL_MIN_RECONNECTMS  1000           /* first connection retry after (mS)... */
+#define SOCKNAL_MAX_RECONNECTMS  60000          /* ...exponentially increasing to this */
+#define SOCKNAL_EAGER_ACK        SOCKNAL_ARCH_EAGER_ACK  /* default eager ack (boolean) */
+#define SOCKNAL_TYPED_CONNS      1              /* unidirectional large, bidirectional small? */
+#define SOCKNAL_ZC_MIN_FRAG     (2<<10)         /* default smallest zerocopy fragment */
+#define SOCKNAL_MIN_BULK        (1<<10)         /* smallest "large" message */
+#define SOCKNAL_BUFFER_SIZE     (8<<20)         /* default socket buffer size */
+#define SOCKNAL_NAGLE            0              /* enable/disable NAGLE? */
+#define SOCKNAL_IRQ_AFFINITY     1              /* enable/disable IRQ affinity? */
+#define SOCKNAL_KEEPALIVE_IDLE   35             /* # seconds idle before 1st probe */
+#define SOCKNAL_KEEPALIVE_COUNT  5              /* # unanswered probes to determine peer death */
+#define SOCKNAL_KEEPALIVE_INTVL  5              /* seconds between probes */
+#define SOCKNAL_CREDITS          256            /* # concurrent sends */
+#define SOCKNAL_PEERCREDITS      8              /* # concurrent sends to 1 peer */
+
+#define SOCKNAL_PEER_HASH_SIZE   101            /* # peer lists */
+
+#define SOCKNAL_SMALL_FWD_NMSGS        128             /* # small messages I can be forwarding at any time */
+#define SOCKNAL_LARGE_FWD_NMSGS 64              /* # large messages I can be forwarding at any time */
+
+#define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
+#define SOCKNAL_ENOMEM_RETRY    CFS_MIN_DELAY   /* jiffies between retries */
+
+#define SOCKNAL_ROUND_ROBIN     0               /* round robin / load balance */
+
+#define SOCKNAL_SINGLE_FRAG_TX      0           /* disable multi-fragment sends */
+#define SOCKNAL_SINGLE_FRAG_RX      0           /* disable multi-fragment receives */
+
+/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled).
+ * no risk if we're not running on a CONFIG_HIGHMEM platform. */
+#ifdef CONFIG_HIGHMEM
+# define SOCKNAL_RISK_KMAP_DEADLOCK  0
+#else
+# define SOCKNAL_RISK_KMAP_DEADLOCK  1
+#endif
+
+/* minimum socket buffer required for connection handshake */
+#define SOCKNAL_MIN_BUFFER   (2*(sizeof(lnet_hdr_t) +                   \
+                                 LNET_MAX_INTERFACES * sizeof(__u32)))
+
+typedef struct                                  /* per scheduler state */
+{
+        spinlock_t        kss_lock;             /* serialise */
+        struct list_head  kss_rx_conns;         /* conn waiting to be read */
+        struct list_head  kss_tx_conns;         /* conn waiting to be written */
+#if SOCKNAL_ZC
+        struct list_head  kss_zctxdone_list;    /* completed ZC transmits */
+#endif
+        cfs_waitq_t       kss_waitq;            /* where scheduler sleeps */
+        int               kss_nconns;           /* # connections assigned to this scheduler */
+} ksock_sched_t;
+
+typedef struct
+{
+        int               ksni_valid:1;         /* been set yet? */
+        int               ksni_bound:1;         /* bound to a cpu yet? */
+        int               ksni_sched:6;         /* which scheduler (assumes < 64) */
+} ksock_irqinfo_t;
+
+typedef struct                                  /* in-use interface */
+{
+        __u32             ksni_ipaddr;          /* interface's IP address */
+        __u32             ksni_netmask;         /* interface's network mask */
+        int               ksni_nroutes;         /* # routes using (active) */
+        int               ksni_npeers;          /* # peers using (passive) */
+        char              ksni_name[16];        /* interface name */
+} ksock_interface_t;
+
+typedef struct
+{
+        int              *ksnd_timeout;         /* "stuck" socket timeout (seconds) */
+        int              *ksnd_nconnds;         /* # connection daemons */
+        int              *ksnd_min_reconnectms; /* first connection retry after (ms)... */
+        int              *ksnd_max_reconnectms; /* ...exponentially increasing to this */
+        int              *ksnd_eager_ack;       /* make TCP ack eagerly? */
+        int              *ksnd_typed_conns;     /* drive sockets by type? */
+        int              *ksnd_min_bulk;        /* smallest "large" message */
+        int              *ksnd_buffer_size;     /* socket buffer size */
+        int              *ksnd_nagle;           /* enable NAGLE? */
+        int              *ksnd_keepalive_idle;  /* # idle secs before 1st probe */
+        int              *ksnd_keepalive_count; /* # probes */
+        int              *ksnd_keepalive_intvl; /* time between probes */
+        int              *ksnd_credits;         /* # concurrent sends */
+        int              *ksnd_peercredits;     /* # concurrent sends to 1 peer */
+#if SOCKNAL_ZC
+        unsigned int     *ksnd_zc_min_frag;     /* minimum zero copy frag size */
+#endif
+#if CPU_AFFINITY
+        int              *ksnd_irq_affinity;    /* enable IRQ affinity? */
+#endif
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+        cfs_sysctl_table_header_t *ksnd_sysctl;   /* sysctl interface */
+#endif
+} ksock_tunables_t;
+
+typedef struct
+{
+        __u64             ksnn_incarnation;     /* my epoch */
+        spinlock_t        ksnn_lock;            /* serialise */
+        int               ksnn_npeers;          /* # peers */
+        int               ksnn_shutdown;        /* shutting down? */
+        int               ksnn_ninterfaces;     /* IP interfaces */
+        ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES];
+} ksock_net_t;
+
+typedef struct
+{
+        int               ksnd_init;            /* initialisation state */
+        int               ksnd_nnets;           /* # networks set up */
+
+        rwlock_t          ksnd_global_lock;     /* stabilize peer/conn ops */
+        struct list_head *ksnd_peers;           /* hash table of all my known peers */
+        int               ksnd_peer_hash_size;  /* size of ksnd_peers */
+
+        int               ksnd_nthreads;        /* # live threads */
+        int               ksnd_shuttingdown;    /* tell threads to exit */
+        int               ksnd_nschedulers;     /* # schedulers */
+        ksock_sched_t    *ksnd_schedulers;      /* their state */
+
+        atomic_t          ksnd_nactive_txs;     /* #active txs */
+
+        struct list_head  ksnd_deathrow_conns;  /* conns to close: reaper_lock*/
+        struct list_head  ksnd_zombie_conns;    /* conns to free: reaper_lock */
+        struct list_head  ksnd_enomem_conns;    /* conns to retry: reaper_lock*/
+        cfs_waitq_t       ksnd_reaper_waitq;    /* reaper sleeps here */
+        cfs_time_t        ksnd_reaper_waketime; /* when reaper will wake */
+        spinlock_t        ksnd_reaper_lock;     /* serialise */
+
+        int               ksnd_enomem_tx;       /* test ENOMEM sender */
+        int               ksnd_stall_tx;        /* test sluggish sender */
+        int               ksnd_stall_rx;        /* test sluggish receiver */
+
+        struct list_head  ksnd_connd_connreqs;  /* incoming connection requests */
+        struct list_head  ksnd_connd_routes;    /* routes waiting to be connected */
+        cfs_waitq_t       ksnd_connd_waitq;     /* connds sleep here */
+        spinlock_t        ksnd_connd_lock;      /* serialise */
+
+        ksock_irqinfo_t   ksnd_irqinfo[NR_IRQS];/* irq->scheduler lookup */
+
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_ALL        2
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments (the first frag contains the portals header),
+ * followed by 0 or more lnet_kiov_t fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header).  Once the header has been received, the payload is
+ * received into either struct iovec or lnet_kiov_t fragments, depending on
+ * what the header matched or whether the message needs forwarding. */
+
+struct ksock_conn;                              /* forward ref */
+struct ksock_peer;                              /* forward ref */
+struct ksock_route;                             /* forward ref */
+
+typedef struct                                  /* transmit packet */
+{
+        struct list_head        tx_list;        /* queue on conn for transmission etc */
+        int                     tx_nob;         /* # packet bytes */
+        int                     tx_resid;       /* residual bytes */
+        int                     tx_niov;        /* # packet iovec frags */
+        struct iovec           *tx_iov;         /* packet iovec frags */
+        int                     tx_nkiov;       /* # packet page frags */
+        lnet_kiov_t            *tx_kiov;        /* packet page frags */
+        struct ksock_conn      *tx_conn;        /* owning conn */
+        lnet_msg_t             *tx_lnetmsg;     /* lnet message for lnet_finalize() */
+#if SOCKNAL_ZC        
+        zccd_t                  tx_zccd;        /* zero copy callback descriptor */
+#endif
+        int                     tx_desc_size;   /* size of this descriptor */
+        union {
+                struct {
+                        struct iovec iov;       /* virt hdr */
+                        lnet_kiov_t  kiov[0];   /* paged payload */
+                }                  paged;
+                struct {
+                        struct iovec iov[1];    /* virt hdr + payload */
+                }                  virt;
+        }                       tx_frags;
+} ksock_tx_t;
+
+#define KSOCK_ZCCD_2_TX(ptr)   list_entry (ptr, ksock_tx_t, tx_zccd)
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or up to LNET_MAX_IOV frags of payload of either type. */
+typedef union {
+        struct iovec     iov[LNET_MAX_IOV];
+        lnet_kiov_t      kiov[LNET_MAX_IOV];
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_HEADER       1               /* reading header */
+#define SOCKNAL_RX_PARSE        2               /* Calling lnet_parse() */
+#define SOCKNAL_RX_PARSE_WAIT   3               /* waiting to be told to read the body */
+#define SOCKNAL_RX_BODY         4               /* reading body (to deliver here) */
+#define SOCKNAL_RX_SLOP         5               /* skipping body */
+
+typedef struct ksock_conn
+{ 
+        struct ksock_peer  *ksnc_peer;          /* owning peer */
+        struct ksock_route *ksnc_route;         /* owning route */
+        struct list_head    ksnc_list;          /* stash on peer's conn list */
+        struct socket      *ksnc_sock;          /* actual socket */
+        void               *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+        void               *ksnc_saved_write_space; /* socket's original write_space() callback */
+        atomic_t            ksnc_conn_refcount; /* conn refcount */
+        atomic_t            ksnc_sock_refcount; /* sock refcount */
+        ksock_sched_t     *ksnc_scheduler;     /* who schedules this connection */
+        __u32               ksnc_myipaddr;      /* my IP */
+        __u32               ksnc_ipaddr;        /* peer's IP */
+        int                 ksnc_port;          /* peer's port */
+        int                 ksnc_closing;       /* being shut down */
+        int                 ksnc_type;          /* type of connection */
+        __u64               ksnc_incarnation;   /* peer's incarnation */
+        
+        /* reader */
+        struct list_head    ksnc_rx_list;       /* where I enq waiting input or a forwarding descriptor */
+        cfs_time_t          ksnc_rx_deadline;   /* when (in jiffies) receive times out */
+        int                 ksnc_rx_started;    /* started receiving a message */
+        int                 ksnc_rx_ready;      /* data ready to read */
+        int                 ksnc_rx_scheduled;  /* being progressed */
+        int                 ksnc_rx_state;      /* what is being read */
+        int                 ksnc_rx_nob_left;   /* # bytes to next hdr/body  */
+        int                 ksnc_rx_nob_wanted; /* bytes actually wanted */
+        int                 ksnc_rx_niov;       /* # iovec frags */
+        struct iovec       *ksnc_rx_iov;        /* the iovec frags */
+        int                 ksnc_rx_nkiov;      /* # page frags */
+        lnet_kiov_t        *ksnc_rx_kiov;       /* the page frags */
+        ksock_rxiovspace_t  ksnc_rx_iov_space;  /* space for frag descriptors */
+        void               *ksnc_cookie;        /* rx lnet_finalize passthru arg */
+        lnet_hdr_t           ksnc_hdr;           /* where I read headers into */
+
+        /* WRITER */
+        struct list_head    ksnc_tx_list;       /* where I enq waiting for output space */
+        struct list_head    ksnc_tx_queue;      /* packets waiting to be sent */
+        cfs_time_t          ksnc_tx_deadline;   /* when (in jiffies) tx times out */
+        int                 ksnc_tx_bufnob;     /* send buffer marker */
+        atomic_t            ksnc_tx_nob;        /* # bytes queued */
+        int                 ksnc_tx_ready;      /* write space */
+        int                 ksnc_tx_scheduled;  /* being progressed */
+
+#if !SOCKNAL_SINGLE_FRAG_RX
+        struct iovec        ksnc_rx_scratch_iov[LNET_MAX_IOV];
+#endif
+#if !SOCKNAL_SINGLE_FRAG_TX
+        struct iovec        ksnc_tx_scratch_iov[LNET_MAX_IOV];
+#endif
+} ksock_conn_t;
+
+#define KSNR_TYPED_ROUTES   ((1 << SOCKLND_CONN_CONTROL) |      \
+                             (1 << SOCKLND_CONN_BULK_IN) |      \
+                             (1 << SOCKLND_CONN_BULK_OUT))
+
+typedef struct ksock_route
+{
+        struct list_head    ksnr_list;          /* chain on peer route list */
+        struct list_head    ksnr_connd_list;    /* chain on ksnr_connd_routes */
+        struct ksock_peer  *ksnr_peer;          /* owning peer */
+        atomic_t            ksnr_refcount;      /* # users */
+        cfs_time_t          ksnr_timeout;       /* when (in jiffies) reconnection can happen next */
+        cfs_duration_t      ksnr_retry_interval; /* how long between retries */
+        __u32               ksnr_myipaddr;      /* my IP */
+        __u32               ksnr_ipaddr;        /* IP address to connect to */
+        int                 ksnr_port;          /* port to connect to */
+        unsigned int        ksnr_connecting:4;  /* autoconnect in progress by type */
+        unsigned int        ksnr_connected:4;   /* connections established by type */
+        unsigned int        ksnr_deleted:1;     /* been removed from peer? */
+        unsigned int        ksnr_share_count;   /* created explicitly? */
+        int                 ksnr_conn_count;    /* # conns established by this route */
+} ksock_route_t;
+
+typedef struct ksock_peer
+{
+        struct list_head    ksnp_list;          /* stash on global peer list */
+        lnet_process_id_t   ksnp_id;            /* who's on the other end(s) */
+        atomic_t            ksnp_refcount;      /* # users */
+        int                 ksnp_sharecount;    /* lconf usage counter */
+        int                 ksnp_closing;       /* being closed */
+        int                 ksnp_accepting;     /* # passive connections pending */
+        int                 ksnp_error;         /* errno on closing last conn */
+        struct list_head    ksnp_conns;         /* all active connections */
+        struct list_head    ksnp_routes;        /* routes */
+        struct list_head    ksnp_tx_queue;      /* waiting packets */
+        cfs_time_t          ksnp_last_alive;    /* when (in jiffies) I was last alive */
+        lnet_ni_t          *ksnp_ni;            /* which network */
+        int                 ksnp_n_passive_ips; /* # of... */
+        __u32               ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */
+} ksock_peer_t;
+
+typedef struct ksock_connreq
+{
+        struct list_head    ksncr_list;         /* stash on ksnd_connd_connreqs */
+        lnet_ni_t           *ksncr_ni;           /* chosen NI */
+        struct socket      *ksncr_sock;         /* accepted socket */
+} ksock_connreq_t;
+
+extern ksock_nal_data_t ksocknal_data;
+extern ksock_tunables_t ksocknal_tunables;
+
+static inline struct list_head *
+ksocknal_nid2peerlist (lnet_nid_t nid)
+{
+        unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size;
+
+        return (&ksocknal_data.ksnd_peers [hash]);
+}
+
+static inline void
+ksocknal_conn_addref (ksock_conn_t *conn)
+{
+        LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+        atomic_inc(&conn->ksnc_conn_refcount);
+}
+
+extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn);
+
+static inline void
+ksocknal_conn_decref (ksock_conn_t *conn)
+{
+        LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+        if (atomic_dec_and_test(&conn->ksnc_conn_refcount))
+                ksocknal_queue_zombie_conn(conn);
+}
+
+static inline int
+ksocknal_connsock_addref (ksock_conn_t *conn)
+{
+        int   rc = -ESHUTDOWN;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+        if (!conn->ksnc_closing) {
+                LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0);
+                atomic_inc(&conn->ksnc_sock_refcount);
+                rc = 0;
+        }
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+
+        return (rc);
+}
+
+static inline void
+ksocknal_connsock_decref (ksock_conn_t *conn)
+{
+        LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0);
+        if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) {
+                LASSERT (conn->ksnc_closing);
+                libcfs_sock_release(conn->ksnc_sock);
+                conn->ksnc_sock = NULL;
+        }
+}
+
+static inline void
+ksocknal_route_addref (ksock_route_t *route)
+{
+        LASSERT (atomic_read(&route->ksnr_refcount) > 0);
+        atomic_inc(&route->ksnr_refcount);
+}
+
+extern void ksocknal_destroy_route (ksock_route_t *route);
+
+static inline void
+ksocknal_route_decref (ksock_route_t *route)
+{
+        LASSERT (atomic_read (&route->ksnr_refcount) > 0);
+        if (atomic_dec_and_test(&route->ksnr_refcount))
+                ksocknal_destroy_route (route);
+}
+
+static inline void
+ksocknal_peer_addref (ksock_peer_t *peer)
+{
+        LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+        atomic_inc(&peer->ksnp_refcount);
+}
+
+extern void ksocknal_destroy_peer (ksock_peer_t *peer);
+
+static inline void
+ksocknal_peer_decref (ksock_peer_t *peer)
+{
+        LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+        if (atomic_dec_and_test(&peer->ksnp_refcount))
+                ksocknal_destroy_peer (peer);
+}
+
+int ksocknal_startup (lnet_ni_t *ni);
+void ksocknal_shutdown (lnet_ni_t *ni);
+int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int ksocknal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, 
+                  int delayed, unsigned int niov, 
+                  struct iovec *iov, lnet_kiov_t *kiov,
+                  unsigned int offset, unsigned int mlen, unsigned int rlen);
+int ksocknal_accept(lnet_ni_t *ni, struct socket *sock);
+
+extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
+extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id);
+extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id);
+extern void ksocknal_peer_failed (ksock_peer_t *peer);
+extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+                                 struct socket *sock, int type);
+extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
+extern void ksocknal_terminate_conn (ksock_conn_t *conn);
+extern void ksocknal_destroy_conn (ksock_conn_t *conn);
+extern int ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation);
+extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why);
+extern int ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr);
+
+extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
+extern void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx, int asynch);
+extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist);
+extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
+extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
+extern void ksocknal_thread_fini (void);
+extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_connd (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern int ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                                lnet_nid_t peer_nid, 
+                                __u32 *ipaddrs, int nipaddrs);
+extern int ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, 
+                                lnet_process_id_t *id, 
+                                __u64 *incarnation, __u32 *ipaddrs);
+
+extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn);
+extern void ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_push_conn (ksock_conn_t *conn);
+extern void ksocknal_lib_bind_irq (unsigned int irq);
+extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn);
+extern unsigned int ksocknal_lib_sock_irq (struct socket *sock);
+extern int ksocknal_lib_setup_sock (struct socket *so);
+extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern void ksocknal_lib_eager_ack (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_iov (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, 
+                                           int *rxmem, int *nagle);
+
+extern int ksocknal_lib_tunables_init(void);
+extern void ksocknal_lib_tunables_fini(void);
diff --git a/lnet/klnds/tdilnd/socklnd_cb.c b/lnet/klnds/tdilnd/socklnd_cb.c
new file mode 100644 (file)
index 0000000..5b1c8f2
--- /dev/null
@@ -0,0 +1,2108 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+void
+ksocknal_free_tx (ksock_tx_t *tx)
+{
+        atomic_dec(&ksocknal_data.ksnd_nactive_txs);
+        LIBCFS_FREE(tx, tx->tx_desc_size);
+}
+
+
+int
+ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        struct iovec  *iov = tx->tx_iov;
+        int            fragsize = iov->iov_len;
+        ulong_ptr  vaddr = (ulong_ptr)iov->iov_base;
+        int            more = (tx->tx_nkiov > 0) ||
+                              (!list_empty (&conn->ksnc_tx_queue));
+
+        int            rc;
+
+        int            len;
+        ksock_mdl_t *  mdl;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only send 1 frag at a time. */
+        LASSERT (fragsize <= tx->tx_resid);
+        LASSERT (tx->tx_niov > 0);
+        
+        /* lock the whole tx iovs into a single mdl chain */
+        mdl = ksocknal_lock_iovs(tx->tx_iov, tx->tx_niov, FALSE, &len);
+
+        if (mdl) {
+                /* send the total mdl chain */
+                rc = ksocknal_send_mdl(
+                            conn->ksnc_sock, tx, mdl, len, 
+                            more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT);
+        } else {
+                rc = -ENOMEM;
+        }
+
+        if (rc <= 0) {
+            goto errorout;
+        }
+
+        tx->tx_resid -= rc;
+
+        len = rc;
+
+        while (len > 0) {
+
+                if ((unsigned int)len < iov->iov_len) {
+                        /* didn't send whole iov entry... */
+                        iov->iov_base = (char *)(iov->iov_base) + len;
+                        iov->iov_len -= len;
+                        len = 0;
+                } else {
+                        len -= iov->iov_len;
+                        tx->tx_iov++;
+                        tx->tx_niov--;
+                        iov = tx->tx_iov;
+                }
+        }
+
+errorout:
+        
+        return (rc);
+}
+
+
+int
+ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        lnet_kiov_t   *kiov = tx->tx_kiov;
+        int            fragsize = kiov->kiov_len;
+        cfs_page_t    *page = kiov->kiov_page;
+        int            offset = kiov->kiov_offset;
+        int            more = (!list_empty (&conn->ksnc_tx_queue));
+
+        int            rc;
+        __u32          len;
+        PMDL           mdl;
+
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only send 1 frag at a time. */
+        LASSERT (fragsize <= tx->tx_resid);
+        LASSERT (offset + fragsize <= PAGE_SIZE);
+        LASSERT (tx->tx_niov == 0);
+        LASSERT (tx->tx_nkiov > 0);
+
+        /* lock the whole tx kiovs into a single mdl chain */
+        mdl = ksocknal_lock_kiovs(tx->tx_kiov, tx->tx_nkiov, FALSE, &len);
+
+        if (mdl) {
+                /* send the total mdl chain */
+                rc = ksocknal_send_mdl(
+                            conn->ksnc_sock, tx, mdl, len,
+                            more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT);
+        } else {
+                rc = -ENOMEM;
+        }
+
+        if (rc <= 0) {
+                goto errorout;
+        }
+
+        tx->tx_resid -= rc;
+
+        len = rc;
+
+        while (len >0) {
+                if (len < kiov->kiov_len) {
+                        kiov->kiov_offset +=  len;
+                        kiov->kiov_len -= len;
+                        len = 0;
+                } else {
+                        len -= kiov->kiov_len;
+                        tx->tx_kiov++;
+                        tx->tx_nkiov--;
+                        kiov = tx->tx_kiov;
+                }
+        }
+
+errorout:
+
+        return (rc);
+}
+
+
+int
+ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        int      rc;
+        int      bufnob;
+        
+        if (ksocknal_data.ksnd_stall_tx != 0) {
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
+        }
+
+        LASSERT (tx->tx_resid != 0);
+
+        ksocknal_get_tconn (conn->ksnc_sock);
+
+        do {
+                if (ksocknal_data.ksnd_enomem_tx > 0) {
+                        /* testing... */
+                        ksocknal_data.ksnd_enomem_tx--;
+                        rc = -EAGAIN;
+                } else if (tx->tx_niov != 0) {
+                        rc = ksocknal_send_iov (conn, tx);
+                } else {
+                        rc = ksocknal_send_kiov (conn, tx);
+                }
+
+                bufnob = 0; // conn->ksnc_sock->sk->sk_wmem_queued;
+                if (rc > 0)                     /* sent something? */
+                        conn->ksnc_tx_bufnob += rc; /* account it */
+                
+                if (bufnob < conn->ksnc_tx_bufnob) {
+                        /* allocated send buffer bytes < computed; infer
+                         * something got ACKed */
+                        conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+                        conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+                        conn->ksnc_tx_bufnob = bufnob;
+                        mb();
+                }
+
+                if (rc <= 0) {
+                        /* Didn't write anything.
+                         *
+                         * NB: rc == 0 and rc == -EAGAIN both mean try
+                         * again later (linux stack returns -EAGAIN for
+                         * this, but Adaptech TOE returns 0).
+                         *
+                         * Also, sends never fail with -ENOMEM, just
+                         * -EAGAIN, but with the added bonus that we can't
+                         * expect write_space() to call us back to tell us
+                         * when to try sending again.  We use the
+                         * SOCK_NOSPACE flag to diagnose...  */
+
+                        LASSERT(rc != -ENOMEM);
+
+                        break;
+                }
+
+                /* socket's wmem_queued now includes 'rc' bytes */
+                atomic_sub (rc, &conn->ksnc_tx_nob);
+                rc = 0;
+
+        } while (tx->tx_resid != 0);
+
+        ksocknal_put_tconn (conn->ksnc_sock);
+        return (rc);
+}
+
+/* Receive into the connection's current mapped (struct iovec) rx fragments.
+ * Returns 1 when some bytes were received, 0 on EOF, or a negative errno.
+ * On success the rx iov cursor, byte accounting, liveness stamp and rx
+ * deadline are all advanced. */
+int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+        struct iovec *iov = conn->ksnc_rx_iov;
+        int          size;
+        int          rc;
+
+        ksock_mdl_t *   mdl;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only receive 1 frag at a time. */
+        LASSERT (conn->ksnc_rx_niov > 0);
+
+        /* lock the whole tx iovs into a single mdl chain */
+        mdl = ksocknal_lock_iovs(iov, conn->ksnc_rx_niov, TRUE, &size);
+
+        if (!mdl) {
+            rc = -ENOMEM;
+            return (rc);
+        }
+        
+        LASSERT (size <= conn->ksnc_rx_nob_wanted);
+
+        /* try to request data for the whole mdl chain */
+        /* NOTE(review): this assumes ksocknal_recv_mdl() takes ownership of
+         * the MDL chain on every path; if it does not, the early return
+         * below leaks the MDL -- confirm against its implementation. */
+        rc = ksocknal_recv_mdl (conn->ksnc_sock, mdl, size, MSG_DONTWAIT);
+
+        if (rc <= 0)
+                return (rc);
+
+        /* received something... update peer liveness and push out the
+         * rx deadline */
+        conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+        conn->ksnc_rx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+        conn->ksnc_rx_started = 1;      /* mid-message: EOF now is a protocol error */
+
+        conn->ksnc_rx_nob_wanted -= rc;
+        conn->ksnc_rx_nob_left -= rc;
+
+        /* advance the iov cursor past the 'rc' bytes just received */
+        while (rc > 0) {
+
+                if (rc < (int)iov->iov_len) {
+                        /* partial fragment: shrink it in place */
+                        iov->iov_base = (char *)(iov->iov_base) + rc;
+                        iov->iov_len -= rc;
+                        rc = 0;
+                } else {
+                        /* fragment fully consumed: step to the next one */
+                        rc -= iov->iov_len;
+                        conn->ksnc_rx_iov++;
+                        conn->ksnc_rx_niov--;
+                        iov = conn->ksnc_rx_iov;
+                }
+        }
+
+        return (1);
+}
+
+
+/* Receive into the connection's current paged (lnet_kiov_t) rx fragments.
+ * Mirror of ksocknal_recv_iov() for page-based buffers.  Returns 1 when
+ * some bytes were received, 0 on EOF, or a negative errno. */
+int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+        lnet_kiov_t  *kiov = conn->ksnc_rx_kiov;
+        int           size;
+        int           rc;
+
+        ksock_mdl_t *   mdl;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only receive 1 frag at a time. */
+        LASSERT (conn->ksnc_rx_nkiov > 0);
+
+        /* lock the whole tx kiovs into a single mdl chain */
+        mdl = ksocknal_lock_kiovs(kiov, conn->ksnc_rx_nkiov, TRUE, &size);
+
+        if (!mdl) {
+            rc = -ENOMEM;
+            return (rc);
+        }
+        
+        LASSERT (size <= conn->ksnc_rx_nob_wanted);
+
+        /* try to request data for the whole mdl chain */
+        /* NOTE(review): as in ksocknal_recv_iov(), assumes the MDL chain is
+         * consumed by ksocknal_recv_mdl() on all paths -- confirm, else the
+         * early return below leaks it. */
+        rc = ksocknal_recv_mdl (conn->ksnc_sock, mdl, size, MSG_DONTWAIT);
+
+        if (rc <= 0)
+                return (rc);
+
+       
+        /* received something... update peer liveness and push out the
+         * rx deadline */
+        conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+        conn->ksnc_rx_deadline = cfs_time_shift (*ksocknal_tunables.ksnd_timeout);
+
+        conn->ksnc_rx_started = 1;      /* mid-message: EOF now is a protocol error */
+
+        conn->ksnc_rx_nob_wanted -= rc;
+        conn->ksnc_rx_nob_left -= rc;
+
+        /* advance the kiov cursor past the 'rc' bytes just received */
+        while (rc > 0) {
+
+                if (rc < (int) kiov->kiov_len) {
+                        /* partial fragment: bump offset, shrink length */
+                        kiov->kiov_offset +=  rc;
+                        kiov->kiov_len -= rc;
+                        rc = 0;
+                } else {
+                        /* fragment fully consumed: step to the next one */
+                        rc -=  kiov->kiov_len;
+                        kiov->kiov_len = 0;
+                        conn->ksnc_rx_kiov++;
+                        conn->ksnc_rx_nkiov--;
+                        kiov = conn->ksnc_rx_kiov;
+                }
+        }
+
+        return (1);
+}
+
+
+/* Top-level receive for one connection: pulls bytes into the current rx
+ * fragments until either the socket drains, the wanted byte count is
+ * satisfied, or an error/EOF occurs. */
+int
+ksocknal_receive (ksock_conn_t *conn) 
+{
+        /* Return 1 on success, 0 on EOF, < 0 on error.
+         * Caller checks ksnc_rx_nob_wanted to determine
+         * progress/completion. */
+        int     rc;
+        size_t  size;
+
+        int count = 0;
+
+        ENTRY;
+        
+        if (ksocknal_data.ksnd_stall_rx != 0) {
+                /* debug tunable: artificially stall the receive path */
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (cfs_time_seconds (ksocknal_data.ksnd_stall_rx));
+        }
+
+        rc = ksocknal_connsock_addref(conn);
+        if (rc != 0) {
+                /* can't pin the socket: conn must be closing */
+                LASSERT (conn->ksnc_closing);
+                return (-ESHUTDOWN);
+        }
+
+        /* ask the stack how much data is already buffered */
+        rc = ksocknal_query_data(conn->ksnc_sock, &size, FALSE);
+        if (rc != 0) {
+                KsPrint((1, "ksocknal_receive: error querying data length ...\n"));
+                goto errorout;
+        }
+
+        if (!CAN_BE_SCHED(size, (ULONG)conn->ksnc_rx_nob_wanted)) {
+                /* not enough buffered to make progress yet: report
+                 * "success, no progress" so the caller retries later.
+                 * NOTE(review): 'size' is size_t but printed with %xh --
+                 * confirm KsPrint format handling on 64-bit builds. */
+                KsPrint((1, "ksocknal_receive: queried data length = %xh rx_nob_wanted/left = %xh/%xh\n",
+                        size, conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left ));
+                rc =1;
+                goto errorout;
+        }
+
+        for (;;) {
+
+                count++;
+
+                /* mapped iov fragments are consumed before paged kiovs */
+                if (conn->ksnc_rx_niov != 0)
+                        rc = ksocknal_recv_iov (conn);
+                else
+                        rc = ksocknal_recv_kiov (conn);
+
+                if (rc <= 0) {
+                        /* error/EOF or partial receive */
+                        if (rc == -EAGAIN) {
+                                /* socket drained: not an error to the caller */
+                                rc = 1;
+                        } else if (rc == 0 && conn->ksnc_rx_started) {
+                                /* EOF in the middle of a message */
+                                rc = -EPROTO;
+                        }
+                        break;
+                }
+
+                /* Completed a fragment */
+
+                if (conn->ksnc_rx_nob_wanted == 0) {
+                        /* Completed a message segment (header or payload) */
+                        if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0 &&
+                            (conn->ksnc_rx_state ==  SOCKNAL_RX_BODY)) {
+                                /* Remind the socket to ack eagerly... */
+                                ksocknal_eager_ack(conn);
+                        }
+                        rc = 1;
+                        break;
+                }
+        }
+
+errorout:
+
+        ksocknal_connsock_decref(conn);
+        RETURN(rc);
+}
+
+
+#if SOCKNAL_ZC
+/* Zero-copy send completion.  We can't clean the tx up in this context
+ * (lock conflicts), so hand it to the connection's scheduler thread. */
+void
+ksocknal_zc_callback (zccd_t *zcd)
+{
+        ksock_tx_t    *tx = KSOCK_ZCCD_2_TX(zcd);
+        ksock_sched_t *sched = tx->tx_conn->ksnc_scheduler;
+        unsigned long  irqflags;
+        ENTRY;
+
+        spin_lock_irqsave (&sched->kss_lock, irqflags);
+
+        /* queue tx on the scheduler's zero-copy-done list... */
+        list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list);
+        /* ...and wake the scheduler to do the cleanup */
+        cfs_waitq_signal (&sched->kss_waitq);
+
+        spin_unlock_irqrestore (&sched->kss_lock, irqflags);
+        EXIT;
+}
+#endif
+
+/* Complete a transmit: report its status to LNET and free the descriptor.
+ * 'asynch' is nonzero when called from zero-copy completion rather than
+ * directly from process_transmit(). */
+void
+ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx, int asynch)
+{
+        int status = (tx->tx_resid == 0) ? 0 : -EIO;
+        ENTRY;
+
+        if (tx->tx_conn != NULL) {
+#if SOCKNAL_ZC
+                /* zero copy completion isn't always from process_transmit()
+                 * so it keeps its own ref on tx_conn; drop it here */
+                if (asynch)
+                        ksocknal_conn_decref(tx->tx_conn);
+#else
+                LASSERT (!asynch);
+#endif
+        }
+
+        /* anything left unsent means the send failed */
+        lnet_finalize (ni, tx->tx_lnetmsg, status);
+        ksocknal_free_tx (tx);
+        EXIT;
+}
+
+/* Drain 'txlist', completing (and thereby freeing) every queued tx.
+ * Used when a peer/connection is torn down with sends still pending;
+ * each tx is finalized with an error since tx_resid != 0. */
+void
+ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist)
+{
+        ksock_tx_t *tx;
+
+        while (!list_empty (txlist)) {
+                tx = list_entry (txlist->next, ksock_tx_t, tx_list);
+
+                /* log what is being thrown away */
+                CERROR ("Deleting packet type %d len %d %s->%s\n",
+                        le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type),
+                        le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length),
+                        libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
+                        libcfs_nid2str(le64_to_cpu (tx->tx_lnetmsg->msg_hdr.dest_nid)));
+
+                list_del (&tx->tx_list);
+                ksocknal_tx_done (ni, tx, 0);   /* synchronous completion */
+        }
+}
+
+/* Called once 'tx' has been handed to the network stack.  If zero-copy
+ * fragments are still in flight, defer completion to the ZC callback;
+ * otherwise complete the tx immediately. */
+void
+ksocknal_tx_launched (ksock_tx_t *tx)
+{
+#if SOCKNAL_ZC
+        if (atomic_read (&tx->tx_zccd.zccd_count) != 1) {
+                ksock_conn_t  *conn = tx->tx_conn;
+
+                /* zccd skbufs still in flight: pin conn so it survives
+                 * until ksocknal_tx_done runs... */
+                ksocknal_conn_addref(conn);
+
+                /* ...then release the initial zccd ref so the zero-copy
+                 * callback is free to fire */
+                zccd_put (&tx->tx_zccd);
+                return;
+        }
+#endif
+        /* no zero-copy outstanding: complete now and save a reschedule */
+        ksocknal_tx_done (tx->tx_conn->ksnc_peer->ksnp_ni, tx, 0);
+}
+
+/* Push 'tx' out on 'conn'.  Returns 0 when the tx is fully sent (it is
+ * then completed via ksocknal_tx_launched), -EAGAIN/-ENOMEM for retryable
+ * conditions, or a fatal negative errno -- in which case the connection
+ * (and its siblings) are closed and the tx completed with error. */
+int
+ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        unsigned long  flags;
+        int            rc;
+       
+        rc = ksocknal_transmit (conn, tx);
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc);
+
+        if (tx->tx_resid == 0) {
+                /* Sent everything OK */
+                LASSERT (rc == 0);
+
+                ksocknal_tx_launched (tx);
+                return (0);
+        }
+
+        if (rc == -EAGAIN)
+                return (rc);
+
+        if (rc == -ENOMEM) {
+                static int counter;
+
+                counter++;   /* exponential backoff warnings */
+                /* (counter & -counter) == counter is true iff counter is a
+                 * power of two, so we warn at 1, 2, 4, 8, ... occurrences */
+                if ((counter & (-counter)) == counter)
+                        CWARN("%d ENOMEM tx %p (%u allocated)\n",
+                              counter, conn, atomic_read(&libcfs_kmemory));
+
+                /* Queue on ksnd_enomem_conns for retry after a timeout */
+                spin_lock_irqsave(&ksocknal_data.ksnd_reaper_lock, flags);
+
+                /* enomem list takes over scheduler's ref... */
+                LASSERT (conn->ksnc_tx_scheduled);
+                list_add_tail(&conn->ksnc_tx_list,
+                              &ksocknal_data.ksnd_enomem_conns);
+                /* wake the reaper only if it would otherwise sleep past the
+                 * desired retry time */
+                if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(),
+                                                   SOCKNAL_ENOMEM_RETRY),
+                                   ksocknal_data.ksnd_reaper_waketime))
+                        cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq);
+                
+                spin_unlock_irqrestore(&ksocknal_data.ksnd_reaper_lock, flags);
+                return (rc);
+        }
+
+        /* Actual error */
+        LASSERT (rc < 0);
+
+        if (!conn->ksnc_closing) {
+                /* first failure on this conn: tell the administrator why */
+                switch (rc) {
+                case -ECONNRESET:
+                        LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection "
+                                      "while we were sending data; it may have "
+                                      "rebooted.\n",
+                                      HIPQUAD(conn->ksnc_ipaddr));
+                        break;
+                default:
+                        LCONSOLE_WARN("There was an unexpected network error "
+                                      "while writing to %u.%u.%u.%u: %d.\n",
+                                      HIPQUAD(conn->ksnc_ipaddr), rc);
+                        break;
+                }
+                CDEBUG(D_HA, "[%p] Error %d on write to %s"
+                       " ip %d.%d.%d.%d:%d\n", conn, rc,
+                       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                       HIPQUAD(conn->ksnc_ipaddr),
+                       conn->ksnc_port);
+        }
+
+        ksocknal_close_conn_and_siblings (conn, rc);
+        ksocknal_tx_launched (tx);
+
+        return (rc);
+}
+
+/* Hand 'route' to a connection daemon so it establishes whatever
+ * connection types the route still lacks.  Caller holds the global
+ * lock for writing. */
+void
+ksocknal_launch_connection_locked (ksock_route_t *route)
+{
+        unsigned long     flags;
+        int               bits;
+
+        LASSERT (route->ksnr_connecting == 0);
+
+        /* connection types wanted: every typed route, or just ANY */
+        if (*ksocknal_tunables.ksnd_typed_conns)
+                bits = KSNR_TYPED_ROUTES;
+        else
+                bits = (1 << SOCKLND_CONN_ANY);
+
+        bits &= ~route->ksnr_connected;         /* drop types already up */
+
+        LASSERT (bits != 0);
+
+        route->ksnr_connecting = bits;          /* scheduling conn for connd */
+        ksocknal_route_addref(route);           /* extra ref for connd */
+
+        spin_lock_irqsave (&ksocknal_data.ksnd_connd_lock, flags);
+
+        list_add_tail (&route->ksnr_connd_list,
+                       &ksocknal_data.ksnd_connd_routes);
+        cfs_waitq_signal (&ksocknal_data.ksnd_connd_waitq);
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_connd_lock, flags);
+}
+
+/* Pick the best established connection to peer for sending 'tx': the
+ * least-backlogged connection whose type suits the message size, falling
+ * back to the least-backlogged connection of any type.  Caller holds the
+ * global lock. */
+ksock_conn_t *
+ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer)
+{
+        struct list_head *tmp;
+        ksock_conn_t     *typed = NULL;
+        int               tnob  = 0;
+        ksock_conn_t     *fallback = NULL;
+        int               fnob     = 0;
+        ksock_conn_t     *conn;
+
+        list_for_each (tmp, &peer->ksnp_conns) {
+                ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list);
+#if SOCKNAL_ROUND_ROBIN
+                /* backlog ignored: selection degenerates to list order */
+                const int     nob = 0;
+#else
+                int           nob = atomic_read(&c->ksnc_tx_nob);
+#endif
+                LASSERT (!c->ksnc_closing);
+
+                /* track least-backlogged connection of any type */
+                if (fallback == NULL || nob < fnob) {
+                        fallback = c;
+                        fnob     = nob;
+                }
+
+                if (!*ksocknal_tunables.ksnd_typed_conns)
+                        continue;
+
+                /* does this connection's type suit the message? */
+                switch (c->ksnc_type) {
+                default:
+                        /* LBUG() does not return; the fall into CONN_ANY
+                         * below is never taken */
+                        CERROR("ksnc_type bad: %u\n", c->ksnc_type);
+                        LBUG();
+                case SOCKLND_CONN_ANY:
+                        break;
+                case SOCKLND_CONN_BULK_IN:
+                        continue;
+                case SOCKLND_CONN_BULK_OUT:
+                        /* bulk-out is reserved for large messages */
+                        if (tx->tx_nob < *ksocknal_tunables.ksnd_min_bulk)
+                                continue;
+                        break;
+                case SOCKLND_CONN_CONTROL:
+                        /* control is reserved for small messages */
+                        if (tx->tx_nob >= *ksocknal_tunables.ksnd_min_bulk)
+                                continue;
+                        break;
+                }
+
+                /* track least-backlogged suitable typed connection */
+                if (typed == NULL || nob < tnob) {
+                        typed = c;
+                        tnob  = nob;
+                }
+        }
+
+        /* prefer the typed selection */
+        conn = (typed != NULL) ? typed : fallback;
+
+#if SOCKNAL_ROUND_ROBIN
+        if (conn != NULL) {
+                /* round-robin all else being equal */
+                list_del (&conn->ksnc_list);
+                list_add_tail (&conn->ksnc_list, &peer->ksnp_conns);
+        }
+#endif
+        return conn;
+}
+
+/* Queue 'tx' for transmission on 'conn', and (if the connection is ready
+ * to send but not yet scheduled) put the connection on its scheduler's
+ * tx list and wake the scheduler. */
+void
+ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
+{
+        unsigned long  flags;
+        ksock_sched_t *sched = conn->ksnc_scheduler;
+
+        /* called holding global lock (read or irq-write) and caller may
+         * not have dropped this lock between finding conn and calling me,
+         * so we don't need the {get,put}connsock dance to deref
+         * ksnc_sock... */
+        LASSERT(!conn->ksnc_closing);
+        LASSERT(tx->tx_resid == tx->tx_nob);    /* nothing sent yet */
+        
+        CDEBUG (D_NET, "Sending to %s ip %d.%d.%d.%d:%d\n", 
+                libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                HIPQUAD(conn->ksnc_ipaddr),
+                conn->ksnc_port);
+
+        /* account the queued bytes against this connection's backlog */
+        atomic_add (tx->tx_nob, &conn->ksnc_tx_nob);
+        tx->tx_conn = conn;
+
+#if SOCKNAL_ZC
+        zccd_init (&tx->tx_zccd, ksocknal_zc_callback);
+        /* NB this sets 1 ref on zccd, so the callback can only occur after
+         * I've released this ref. */
+#endif
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        if (list_empty(&conn->ksnc_tx_queue)) {
+                /* First packet starts the timeout */
+                conn->ksnc_tx_deadline = 
+                        cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+                conn->ksnc_tx_bufnob = 0;
+                mb();    /* order with adding to tx_queue */
+        }
+
+        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+                
+        if (conn->ksnc_tx_ready &&      /* able to send */
+            !conn->ksnc_tx_scheduled) { /* not scheduled to send */
+                /* +1 ref for scheduler */
+                ksocknal_conn_addref(conn);
+                list_add_tail (&conn->ksnc_tx_list, 
+                               &sched->kss_tx_conns);
+                conn->ksnc_tx_scheduled = 1;
+                cfs_waitq_signal (&sched->kss_waitq);
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+}
+
+/* Return the first route to 'peer' that still needs a connection
+ * established and is past its retry backoff; NULL if there is none.
+ * Caller holds the global lock. */
+ksock_route_t *
+ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+{
+        struct list_head  *pos;
+        ksock_route_t     *route;
+        int                bits;
+
+        list_for_each (pos, &peer->ksnp_routes) {
+                route = list_entry (pos, ksock_route_t, ksnr_list);
+                bits  = route->ksnr_connected | route->ksnr_connecting;
+
+                if (*ksocknal_tunables.ksnd_typed_conns) {
+                        /* all typed connections (being) established? */
+                        if ((bits & KSNR_TYPED_ROUTES) == KSNR_TYPED_ROUTES)
+                                continue;
+                } else if ((bits & (1 << SOCKLND_CONN_ANY)) != 0) {
+                        /* untyped connection already (being) established */
+                        continue;
+                }
+
+                /* skip if it's been tried before and the retry time
+                 * hasn't arrived yet */
+                if (route->ksnr_retry_interval != 0 &&
+                    !cfs_time_aftereq (cfs_time_current(),
+                                       route->ksnr_timeout))
+                        continue;
+
+                return (route);
+        }
+
+        return (NULL);
+}
+
+/* Return any route to 'peer' with a connection attempt currently
+ * scheduled or in progress; NULL if none.  Caller holds the global
+ * lock. */
+ksock_route_t *
+ksocknal_find_connecting_route_locked (ksock_peer_t *peer)
+{
+        struct list_head  *pos;
+        ksock_route_t     *route;
+
+        list_for_each (pos, &peer->ksnp_routes) {
+                route = list_entry (pos, ksock_route_t, ksnr_list);
+
+                if (route->ksnr_connecting != 0)
+                        return (route);
+        }
+
+        return (NULL);
+}
+
+/* Route 'tx' to peer 'id': find (or create) the peer, kick off any
+ * connections that still need establishing, then either queue the tx on
+ * an existing connection or park it on the peer until a connection comes
+ * up.  Returns 0 when tx ownership has been handed on, or a negative
+ * errno (caller still owns and must free tx). */
+int
+ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
+{
+        unsigned long     flags;
+        ksock_peer_t     *peer;
+        ksock_conn_t     *conn;
+        ksock_route_t    *route;
+        rwlock_t         *g_lock;
+        int               retry;
+        int               rc;
+        
+        /* Ensure the frags we've been given EXACTLY match the number of
+         * bytes we want to send.  Many TCP/IP stacks disregard any total
+         * size parameters passed to them and just look at the frags. 
+         *
+         * We always expect at least 1 mapped fragment containing the
+         * complete portals header. */
+        LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) +
+                 lnet_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob);
+        LASSERT (tx->tx_niov >= 1);
+        LASSERT (tx->tx_iov[0].iov_len >= sizeof (lnet_hdr_t));
+        LASSERT (tx->tx_conn == NULL);
+        LASSERT (tx->tx_resid == tx->tx_nob);
+
+        CDEBUG (D_NET, "packet %p type %d, nob %d niov %d nkiov %d\n",
+                tx, ((lnet_hdr_t *)tx->tx_iov[0].iov_base)->type, 
+                tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+
+        g_lock = &ksocknal_data.ksnd_global_lock;
+        
+        /* loop: first pass may auto-add the peer, second pass must find it */
+        for (retry = 0;; retry = 1) {
+#if !SOCKNAL_ROUND_ROBIN
+                /* fast path: try under the read lock first */
+                read_lock (g_lock);
+                peer = ksocknal_find_peer_locked(ni, id);
+                if (peer != NULL) {
+                        if (ksocknal_find_connectable_route_locked(peer) == NULL) {
+                                conn = ksocknal_find_conn_locked (tx, peer);
+                                if (conn != NULL) {
+                                        /* I've got no routes that need to be
+                                         * connecting and I do have an actual
+                                         * connection... */
+                                        ksocknal_queue_tx_locked (tx, conn);
+                                        read_unlock (g_lock);
+                                        return (0);
+                                }
+                        }
+                }
+                /* I'll need a write lock... */
+                read_unlock (g_lock);
+#endif
+                write_lock_irqsave(g_lock, flags);
+
+                /* NB the peer may have vanished between dropping the read
+                 * lock and taking the write lock: look it up again */
+                peer = ksocknal_find_peer_locked(ni, id);
+                if (peer != NULL) 
+                        break;
+                
+                write_unlock_irqrestore(g_lock, flags);
+
+                if ((id.pid & LNET_PID_USERFLAG) != 0) {
+                        /* connections to userspace peers are only accepted,
+                         * never initiated */
+                        CERROR("Refusing to create a connection to "
+                               "userspace process %s\n", libcfs_id2str(id));
+                        return -EHOSTUNREACH;
+                }
+                
+                if (retry) {
+                        /* already added the peer once: give up */
+                        CERROR("Can't find peer %s\n", libcfs_id2str(id));
+                        return -EHOSTUNREACH;
+                }
+                
+                rc = ksocknal_add_peer(ni, id, 
+                                       LNET_NIDADDR(id.nid),
+                                       lnet_acceptor_port());
+                if (rc != 0) {
+                        CERROR("Can't add peer %s: %d\n",
+                               libcfs_id2str(id), rc);
+                        return rc;
+                }
+        }
+
+        /* holding write lock; peer != NULL here */
+        for (;;) {
+                /* launch any/all connections that need it */
+                route = ksocknal_find_connectable_route_locked (peer);
+                if (route == NULL)
+                        break;
+
+                ksocknal_launch_connection_locked (route);
+        }
+
+        conn = ksocknal_find_conn_locked (tx, peer);
+        if (conn != NULL) {
+                /* Connection exists; queue message on it */
+                ksocknal_queue_tx_locked (tx, conn);
+                write_unlock_irqrestore (g_lock, flags);
+                return (0);
+        }
+
+        if (peer->ksnp_accepting > 0 ||
+            ksocknal_find_connecting_route_locked (peer) != NULL) {
+                /* Queue the message until a connection is established */
+                list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
+                write_unlock_irqrestore (g_lock, flags);
+                return 0;
+        }
+        
+        write_unlock_irqrestore (g_lock, flags);
+
+        /* NB Routes may be ignored if connections to them failed recently */
+        CERROR("No usable routes to %s\n", libcfs_id2str(id));
+        return (-EHOSTUNREACH);
+}
+
+/* LND send entry point: build a tx descriptor around 'lntmsg' (header in
+ * frag 0, payload in the remaining iov/kiov frags) and launch it towards
+ * the target.  Returns 0 on successful handoff, -ENOMEM/-EIO on failure. */
+int
+ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+        lnet_hdr_t       *hdr = &lntmsg->msg_hdr; 
+        int               type = lntmsg->msg_type; 
+        lnet_process_id_t target = lntmsg->msg_target;
+        unsigned int      payload_niov = lntmsg->msg_niov; 
+        struct iovec     *payload_iov = lntmsg->msg_iov; 
+        lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+        unsigned int      payload_offset = lntmsg->msg_offset;
+        unsigned int      payload_nob = lntmsg->msg_len;
+        ksock_tx_t       *tx;
+        int               desc_size;
+        int               rc;
+
+        /* NB 'private' is different depending on what we're sending.
+         * Just ignore it... */
+
+        CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
+               payload_nob, payload_niov, libcfs_id2str(target));
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= LNET_MAX_IOV);
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+        LASSERT (!in_interrupt ());
+        
+        /* size the descriptor for its trailing frag array; iov case
+         * reserves slot 0 for the header */
+        if (payload_iov != NULL)
+                desc_size = offsetof(ksock_tx_t, 
+                                     tx_frags.virt.iov[1 + payload_niov]);
+        else
+                desc_size = offsetof(ksock_tx_t, 
+                                     tx_frags.paged.kiov[payload_niov]);
+        
+        LIBCFS_ALLOC(tx, desc_size);
+        if (tx == NULL) {
+                CERROR("Can't allocate tx desc type %d size %d\n",
+                       type, desc_size);
+                return (-ENOMEM);
+        }
+
+        atomic_inc(&ksocknal_data.ksnd_nactive_txs);
+        
+        tx->tx_conn = NULL;                     /* set when assigned a conn */
+        tx->tx_desc_size = desc_size;
+        tx->tx_lnetmsg = lntmsg;
+
+        if (payload_iov != NULL) {
+                /* mapped payload: header + payload share the iov array */
+                tx->tx_kiov = NULL;
+                tx->tx_nkiov = 0;
+                tx->tx_iov = tx->tx_frags.virt.iov;
+                tx->tx_niov = 1 + 
+                              lnet_extract_iov(payload_niov, &tx->tx_iov[1],
+                                               payload_niov, payload_iov,
+                                               payload_offset, payload_nob);
+        } else {
+                /* paged payload: header in a single iov, payload in kiovs */
+                tx->tx_niov = 1;
+                tx->tx_iov = &tx->tx_frags.paged.iov;
+                tx->tx_kiov = tx->tx_frags.paged.kiov;
+                tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
+                                                 payload_niov, payload_kiov,
+                                                 payload_offset, payload_nob);
+        }
+
+        /* first frag is the header */
+        tx->tx_iov[0].iov_base = (void *)hdr;
+        tx->tx_iov[0].iov_len = sizeof(*hdr);
+        tx->tx_resid = tx->tx_nob = sizeof (*hdr) + payload_nob;
+
+        rc = ksocknal_launch_packet(ni, tx, target);
+        if (rc == 0)
+                return (0);
+        
+        /* launch failed: we still own tx; free it and report -EIO to LNET
+         * (the specific launch errno is deliberately not propagated) */
+        ksocknal_free_tx(tx);
+        return (-EIO);
+}
+
+/* Spawn a kernel thread running fn(arg) and bump the live-thread count
+ * (under the global lock) so shutdown can rendezvous with it.  Returns 0
+ * on success or the negative error from thread creation. */
+int
+ksocknal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        unsigned long flags;
+        long          pid = cfs_kernel_thread (fn, arg, 0);
+
+        if (pid < 0)                    /* creation failed */
+                return ((int)pid);
+
+        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+        ksocknal_data.ksnd_nthreads++;
+        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+
+        return (0);
+}
+
+/* A socknal thread is exiting: decrement the live-thread count under the
+ * global lock so shutdown's thread accounting stays accurate. */
+void
+ksocknal_thread_fini (void)
+{
+        unsigned long flags;
+
+        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+        ksocknal_data.ksnd_nthreads--;
+        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+}
+
+/* Set 'conn' up to read the next packet.  nob_to_skip == 0 means start
+ * reading a fresh header; nonzero means discard that many bytes of the
+ * current packet into the slop buffer first.  Returns 1 when ready for
+ * the next header, 0 when still skipping slop (caller comes back). */
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+        /* shared scratch sink: received bytes land here and are discarded,
+         * so concurrent use by several connections is deliberate */
+        static char ksocknal_slop_buffer[4096];
+
+        int            nob;
+        unsigned int   niov;
+        int            skipped;
+
+        if (nob_to_skip == 0) {         /* right at next packet boundary now */
+                conn->ksnc_rx_started = 0;
+                mb ();                          /* racing with timeout thread */
+                
+                /* aim the single rx iov at the connection's header buffer */
+                conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
+                conn->ksnc_rx_nob_wanted = sizeof (lnet_hdr_t);
+                conn->ksnc_rx_nob_left = sizeof (lnet_hdr_t);
+
+                conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr;
+                conn->ksnc_rx_iov[0].iov_len  = sizeof (lnet_hdr_t);
+                conn->ksnc_rx_niov = 1;
+
+                conn->ksnc_rx_kiov = NULL;
+                conn->ksnc_rx_nkiov = 0;
+                return (1);
+        }
+
+        /* Set up to skip as much a possible now.  If there's more left
+         * (ran out of iov entries) we'll get called again */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+        conn->ksnc_rx_nob_left = nob_to_skip;
+        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        skipped = 0;
+        niov = 0;
+
+        /* point every available iov at the slop buffer */
+        do {
+                nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+                conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
+                conn->ksnc_rx_iov[niov].iov_len  = nob;
+                niov++;
+                skipped += nob;
+                nob_to_skip -=nob;
+
+        } while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+                 niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+        conn->ksnc_rx_kiov = NULL;
+        conn->ksnc_rx_nkiov = 0;
+        conn->ksnc_rx_nob_wanted = skipped;
+        return (0);
+}
+
+/* Receive state machine for one connection: pull bytes for the current
+ * state (HEADER/BODY/SLOP), then act on the completed segment -- parse a
+ * finished header via lnet_parse(), finalize a finished body, or set up
+ * for the next packet.  Returns 0 when this packet is dealt with (or the
+ * scheduler should come back later), -EAGAIN on a short read, or a
+ * negative errno after closing the connection. */
+int
+ksocknal_process_receive (ksock_conn_t *conn)
+{
+        int           rc;
+        
+        LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+
+        /* NB: sched lock NOT held */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+ again:
+        if (conn->ksnc_rx_nob_wanted != 0) {
+                rc = ksocknal_receive(conn);
+
+                if (rc <= 0) {
+                        /* ksocknal_receive maps EAGAIN to 1, so it can't
+                         * surface here */
+                        LASSERT (rc != -EAGAIN);
+
+                        if (rc == 0)
+                                CDEBUG (D_NET, "[%p] EOF from %s"
+                                        " ip %d.%d.%d.%d:%d\n", conn, 
+                                        libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                        HIPQUAD(conn->ksnc_ipaddr),
+                                        conn->ksnc_port);
+                        else if (!conn->ksnc_closing)
+                                CERROR ("[%p] Error %d on read from %s"
+                                        " ip %d.%d.%d.%d:%d\n",
+                                        conn, rc, 
+                                        libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                        HIPQUAD(conn->ksnc_ipaddr),
+                                        conn->ksnc_port);
+
+                        ksocknal_close_conn_and_siblings (conn, rc);
+                        return (rc == 0 ? -ESHUTDOWN : rc);
+                }
+                
+                if (conn->ksnc_rx_nob_wanted != 0) {
+                        /* short read */
+                        return (-EAGAIN);
+                }
+        }
+        
+        /* the current segment is complete: act on it */
+        switch (conn->ksnc_rx_state) {
+        case SOCKNAL_RX_HEADER:
+                if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { 
+                        /* Userspace peer */
+                        lnet_process_id_t *id = &conn->ksnc_peer->ksnp_id;
+                        
+                        /* Substitute process ID assigned at connection time */
+                        conn->ksnc_hdr.src_pid = cpu_to_le32(id->pid);
+                        conn->ksnc_hdr.src_nid = cpu_to_le64(id->nid);
+                }
+
+                conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
+                ksocknal_conn_addref(conn);     /* ++ref while parsing */
+                
+                rc = lnet_parse(conn->ksnc_peer->ksnp_ni, &conn->ksnc_hdr, 
+                                conn->ksnc_peer->ksnp_id.nid, conn);
+                if (rc < 0) {
+                        /* I just received garbage: give up on this conn */
+                        ksocknal_new_packet(conn, 0);
+                        ksocknal_close_conn_and_siblings (conn, rc);
+                        ksocknal_conn_decref(conn);
+                        return (-EPROTO);
+                }
+
+                /* I'm racing with ksocknal_recv() */
+                LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
+                         conn->ksnc_rx_state == SOCKNAL_RX_BODY);
+                
+                if (conn->ksnc_rx_state != SOCKNAL_RX_BODY)
+                        return 0;
+                
+                /* ksocknal_recv() got called */
+                goto again;
+
+        case SOCKNAL_RX_BODY:
+                /* payload all received */
+                lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, 0);
+                /* Fall through */
+
+        case SOCKNAL_RX_SLOP:
+                /* starting new packet? */
+                if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+                        return 0;       /* come back later */
+                goto again;             /* try to finish reading slop now */
+
+        default:
+                break;
+        }
+
+        /* Not Reached */
+        LBUG ();
+        return (-EINVAL);                       /* keep gcc happy */
+}
+
+/* LND receive callback: lnet_parse() calls back here to tell the socklnd
+ * where to put the payload of the message whose header was just parsed.
+ * 'private' is the ksock_conn_t the header arrived on.  Sets up the rx
+ * iov/kiov descriptors for 'mlen' bytes wanted of 'rlen' on the wire,
+ * then lets the scheduler resume reading the body.  Always returns 0. */
+int
+ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+               unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+               unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+        ksock_conn_t  *conn = (ksock_conn_t *)private;
+        ksock_sched_t *sched = conn->ksnc_scheduler;
+        unsigned long  flags;
+
+        LASSERT (mlen <= rlen);
+        LASSERT (niov <= LNET_MAX_IOV);
+        
+        conn->ksnc_cookie = msg;
+        conn->ksnc_rx_nob_wanted = mlen;
+        conn->ksnc_rx_nob_left   = rlen;
+
+        /* iov and kiov are mutually exclusive; copy whichever was given
+         * into the connection's own descriptor space */
+        if (mlen == 0 || iov != NULL) {
+                conn->ksnc_rx_nkiov = 0;
+                conn->ksnc_rx_kiov = NULL;
+                conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+                conn->ksnc_rx_niov =
+                        lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov,
+                                         niov, iov, offset, mlen);
+        } else {
+                conn->ksnc_rx_niov = 0;
+                conn->ksnc_rx_iov  = NULL;
+                conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+                conn->ksnc_rx_nkiov = 
+                        lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov,
+                                          niov, kiov, offset, mlen);
+        }
+        
+        /* the extracted fragments must cover exactly the bytes wanted */
+        LASSERT (mlen == 
+                 lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+                 lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        spin_lock_irqsave(&sched->kss_lock, flags);
+
+        /* NOTE(review): no default case — the conn is expected to be in
+         * one of these two parse states when this callback runs */
+        switch (conn->ksnc_rx_state) {
+        case SOCKNAL_RX_PARSE_WAIT:
+                /* scheduler already noticed I was parsing and dropped the
+                 * conn; requeue it and wake a scheduler thread */
+                list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+                cfs_waitq_signal (&sched->kss_waitq);
+                LASSERT (conn->ksnc_rx_ready);
+                break;
+                
+        case SOCKNAL_RX_PARSE:
+                /* scheduler hasn't noticed I'm parsing yet */
+                break;
+        }
+
+        conn->ksnc_rx_state = SOCKNAL_RX_BODY;
+        
+        spin_unlock_irqrestore(&sched->kss_lock, flags);
+        /* drop the ref taken before lnet_parse() in ksocknal_process_receive */
+        ksocknal_conn_decref(conn);
+        return (0);
+}
+
+/* Return non-zero when this scheduler thread has no pending work and
+ * may block on its waitqueue.  Samples the work queues under kss_lock. */
+static inline int
+ksocknal_sched_cansleep(ksock_sched_t *sched)
+{
+        unsigned long flags;
+        int           nothing_to_do;
+
+        spin_lock_irqsave(&sched->kss_lock, flags);
+
+        nothing_to_do = (!ksocknal_data.ksnd_shuttingdown &&
+#if SOCKNAL_ZC
+                         list_empty(&sched->kss_zctxdone_list) &&
+#endif
+                         list_empty(&sched->kss_rx_conns) &&
+                         list_empty(&sched->kss_tx_conns));
+
+        spin_unlock_irqrestore(&sched->kss_lock, flags);
+        return nothing_to_do;
+}
+
+/* Scheduler thread body: daemonizes, optionally binds itself to a CPU,
+ * then loops progressing rx and tx work queued on this scheduler until
+ * shutdown.  Each pass handles at most one rx event and one tx event so
+ * neither direction can starve the other, and the thread yields after
+ * SOCKNAL_RESCHED consecutive busy loops to avoid hogging the CPU.
+ * kss_lock is held except around the actual socket I/O calls. */
+int ksocknal_scheduler (void *arg)
+{
+        ksock_sched_t     *sched = (ksock_sched_t *)arg;
+        ksock_conn_t      *conn;
+        ksock_tx_t        *tx;
+        unsigned long      flags;
+        int                rc;
+        int                nloops = 0;
+        int                id = sched - ksocknal_data.ksnd_schedulers;
+        char               name[16];
+
+        snprintf (name, sizeof (name),"socknal_sd%02d", id);
+        libcfs_daemonize (name);
+        libcfs_blockallsigs ();
+
+        /* bind this scheduler to its own CPU when affinity is configured */
+#if (CONFIG_SMP && CPU_AFFINITY)
+        id = ksocknal_sched2cpu(id);
+        if (cpu_online(id)) {
+                cpumask_t m;
+                cpu_set(id, m);
+                set_cpus_allowed(current, m);
+        } else {
+                CERROR ("Can't set CPU affinity for %s to %d\n", name, id);
+        }
+#endif /* CONFIG_SMP && CPU_AFFINITY */
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        while (!ksocknal_data.ksnd_shuttingdown) {
+                int did_something = 0;
+
+                /* Ensure I progress everything semi-fairly */
+
+                if (!list_empty (&sched->kss_rx_conns)) {
+                        conn = list_entry(sched->kss_rx_conns.next,
+                                          ksock_conn_t, ksnc_rx_list);
+                        list_del(&conn->ksnc_rx_list);
+
+                        LASSERT(conn->ksnc_rx_scheduled);
+                        LASSERT(conn->ksnc_rx_ready);
+
+                        /* clear rx_ready in case receive isn't complete.
+                         * Do it BEFORE we call process_recv, since
+                         * data_ready can set it any time after we release
+                         * kss_lock. */
+                        conn->ksnc_rx_ready = 0;
+                        spin_unlock_irqrestore(&sched->kss_lock, flags);
+
+                        rc = ksocknal_process_receive(conn);
+
+                        spin_lock_irqsave(&sched->kss_lock, flags);
+
+                        /* I'm the only one that can clear this flag */
+                        LASSERT(conn->ksnc_rx_scheduled);
+
+                        /* Did process_receive get everything it wanted? */
+                        if (rc == 0)
+                                conn->ksnc_rx_ready = 1;
+
+                        if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
+                                /* Conn blocked waiting for ksocknal_recv()
+                                 * I change its state (under lock) to signal
+                                 * it can be rescheduled */
+                                conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
+                        } else if (conn->ksnc_rx_ready) {
+                                /* reschedule for rx */
+                                list_add_tail (&conn->ksnc_rx_list,
+                                               &sched->kss_rx_conns);
+                        } else {
+                                conn->ksnc_rx_scheduled = 0;
+                                /* drop my ref */
+                                ksocknal_conn_decref(conn);
+                        }
+
+                        did_something = 1;
+                }
+
+                if (!list_empty (&sched->kss_tx_conns)) {
+                        conn = list_entry(sched->kss_tx_conns.next,
+                                          ksock_conn_t, ksnc_tx_list);
+                        list_del (&conn->ksnc_tx_list);
+                        
+                        LASSERT(conn->ksnc_tx_scheduled);
+                        LASSERT(conn->ksnc_tx_ready);
+                        LASSERT(!list_empty(&conn->ksnc_tx_queue));
+                        
+                        tx = list_entry(conn->ksnc_tx_queue.next,
+                                        ksock_tx_t, tx_list);
+                        /* dequeue now so empty list => more to send */
+                        list_del(&tx->tx_list);
+                        
+                        /* Clear tx_ready in case send isn't complete.  Do
+                         * it BEFORE we call process_transmit, since
+                         * write_space can set it any time after we release
+                         * kss_lock. */
+                        conn->ksnc_tx_ready = 0;
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        rc = ksocknal_process_transmit(conn, tx);
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+
+                        if (rc == -ENOMEM || rc == -EAGAIN) {
+                                /* Incomplete send: replace tx on HEAD of tx_queue */
+                                list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+                        } else {
+                                /* Complete send; assume space for more */
+                                conn->ksnc_tx_ready = 1;
+                        }
+
+                        if (rc == -ENOMEM) {
+                                /* Do nothing; after a short timeout, this
+                                 * conn will be reposted on kss_tx_conns. */
+                        } else if (conn->ksnc_tx_ready &&
+                                   !list_empty (&conn->ksnc_tx_queue)) {
+                                /* reschedule for tx */
+                                list_add_tail (&conn->ksnc_tx_list, 
+                                               &sched->kss_tx_conns);
+                        } else {
+                                conn->ksnc_tx_scheduled = 0;
+                                /* drop my ref */
+                                ksocknal_conn_decref(conn);
+                        }
+                                
+                        did_something = 1;
+                }
+#if SOCKNAL_ZC
+                /* finalize any zero-copy sends completed by the socket layer */
+                if (!list_empty (&sched->kss_zctxdone_list)) {
+                        ksock_tx_t *tx =
+                                list_entry(sched->kss_zctxdone_list.next,
+                                           ksock_tx_t, tx_list);
+                        did_something = 1;
+
+                        list_del (&tx->tx_list);
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        ksocknal_tx_done (tx->tx_conn->ksnc_peer->ksnp_ni,
+                                          tx, 1);
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+                }
+#endif
+                if (!did_something ||           /* nothing to do */
+                    ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        nloops = 0;
+
+                        if (!did_something) {   /* wait for something to do */
+                                /* NOTE(review): this libcfs macro returns its
+                                 * status via 'rc' (3rd argument), unlike the
+                                 * kernel's 2-argument wait_event form */
+                                wait_event_interruptible_exclusive(
+                                        sched->kss_waitq,
+                                        !ksocknal_sched_cansleep(sched), rc);
+                                LASSERT (rc == 0);
+                        } else
+                               our_cond_resched();
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        ksocknal_thread_fini ();
+        return (0);
+}
+
+/*
+ * Socket data-ready callback: mark the connection as having data to
+ * read and, when no scheduler thread is already progressing it, queue
+ * it on kss_rx_conns (taking a ref for the scheduler) and wake the
+ * scheduler.
+ */
+void ksocknal_read_callback (ksock_conn_t *conn)
+{
+        ksock_sched_t *sched = conn->ksnc_scheduler;
+        unsigned long  flags;
+        ENTRY;
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        conn->ksnc_rx_ready = 1;
+
+        if (!conn->ksnc_rx_scheduled) {
+                /* not currently being progressed: schedule it */
+                conn->ksnc_rx_scheduled = 1;
+                list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+                ksocknal_conn_addref(conn);     /* extra ref for scheduler */
+                cfs_waitq_signal (&sched->kss_waitq);
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+        EXIT;
+}
+
+/*
+ * Socket write-space callback: mark the connection as writable and,
+ * when it has packets queued and no scheduler thread is already
+ * progressing it, queue it on kss_tx_conns (taking a ref for the
+ * scheduler) and wake the scheduler.
+ */
+void ksocknal_write_callback (ksock_conn_t *conn)
+{
+        ksock_sched_t *sched = conn->ksnc_scheduler;
+        unsigned long  flags;
+        ENTRY;
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        conn->ksnc_tx_ready = 1;
+
+        if (!conn->ksnc_tx_scheduled &&          /* not being progressed */
+            !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */
+                conn->ksnc_tx_scheduled = 1;
+                list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
+                ksocknal_conn_addref(conn);     /* extra ref for scheduler */
+                cfs_waitq_signal (&sched->kss_waitq);
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+        EXIT;
+}
+
+/* Send the socklnd HELLO handshake on a newly-established connection.
+ * Protocol V1 overlays magic/version on hdr.dest_nid, and the payload
+ * carries this node's interface IP addresses.  Returns 0 on success or
+ * the (negative) error from the socket write.
+ * CAVEAT EMPTOR: this byte flips 'ipaddrs' in place. */
+int
+ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn, lnet_nid_t peer_nid,
+                     __u32 *ipaddrs, int nipaddrs)
+{
+        ksock_net_t         *net = (ksock_net_t *)ni->ni_data;
+        struct socket       *sock = conn->ksnc_sock;
+        lnet_hdr_t           hdr;
+        lnet_magicversion_t *hmv = (lnet_magicversion_t *)&hdr.dest_nid;
+        int                  i;
+        int                  rc;
+        lnet_nid_t           srcnid;
+
+        LASSERT (0 <= nipaddrs && nipaddrs <= LNET_MAX_INTERFACES);
+
+        /* No need for getconnsock/putconnsock */
+        LASSERT (!conn->ksnc_closing);
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+        hmv->magic         = cpu_to_le32 (LNET_PROTO_TCP_MAGIC);
+        hmv->version_major = cpu_to_le16 (LNET_PROTO_TCP_VERSION_MAJOR);
+        hmv->version_minor = cpu_to_le16 (LNET_PROTO_TCP_VERSION_MINOR);
+
+        srcnid = lnet_ptlcompat_srcnid(ni->ni_nid, peer_nid);
+        
+        hdr.src_nid        = cpu_to_le64 (srcnid);
+        /* src_pid is a 32-bit wire field (peers read it back with
+         * le32_to_cpu()); cpu_to_le64 here would leave the pid in the
+         * wrong bytes on big-endian machines */
+        hdr.src_pid        = cpu_to_le32 (the_lnet.ln_pid);
+        hdr.type           = cpu_to_le32 (LNET_MSG_HELLO);
+        hdr.payload_length = cpu_to_le32 (nipaddrs * sizeof(*ipaddrs));
+
+        hdr.msg.hello.type = cpu_to_le32 (conn->ksnc_type);
+        hdr.msg.hello.incarnation = cpu_to_le64 (net->ksnn_incarnation);
+
+        /* flip the payload IPs into wire (little-endian) byte order */
+        for (i = 0; i < nipaddrs; i++) {
+                ipaddrs[i] = __cpu_to_le32 (ipaddrs[i]);
+        }
+
+        /* socket buffer should have been set large enough not to block
+         * (timeout == 0) */
+        rc = libcfs_sock_write(sock, &hdr, sizeof(hdr), 0);
+        if (rc != 0) {
+                CERROR ("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+                return (rc);
+        }
+        
+        if (nipaddrs == 0)
+                return (0);
+        
+        rc = libcfs_sock_write(sock, ipaddrs, nipaddrs * sizeof(*ipaddrs), 0);
+        if (rc != 0)
+                CERROR ("Error %d sending HELLO payload (%d)"
+                        " to %u.%u.%u.%u/%d\n", rc, nipaddrs, 
+                        HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+        return (rc);
+}
+
+/* Map a connection type to the type the peer should use at its end:
+ * ANY and CONTROL are symmetric, BULK_IN and BULK_OUT swap, and any
+ * other value yields SOCKLND_CONN_NONE (invalid). */
+int
+ksocknal_invert_type(int type)
+{
+        if (type == SOCKLND_CONN_ANY || type == SOCKLND_CONN_CONTROL)
+                return type;
+
+        if (type == SOCKLND_CONN_BULK_IN)
+                return SOCKLND_CONN_BULK_OUT;
+
+        if (type == SOCKLND_CONN_BULK_OUT)
+                return SOCKLND_CONN_BULK_IN;
+
+        return SOCKLND_CONN_NONE;
+}
+
+int
+ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, 
+                     lnet_process_id_t *peerid, 
+                     __u64 *incarnation, __u32 *ipaddrs)
+{
+        struct socket       *sock = conn->ksnc_sock;
+        int                  active;
+        int                  timeout;
+        int                  rc;
+        int                  nips;
+        int                  i;
+        int                  type;
+        lnet_hdr_t           hdr;
+        lnet_process_id_t    recv_id;
+        lnet_magicversion_t *hmv;
+
+        active = (peerid->nid != LNET_NID_ANY);
+        timeout = active ? *ksocknal_tunables.ksnd_timeout :
+                            lnet_acceptor_timeout();
+
+        hmv = (lnet_magicversion_t *)&hdr.dest_nid;
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+
+        rc = libcfs_sock_read(sock, &hmv->magic, sizeof (hmv->magic), timeout);
+        if (rc != 0) {
+                CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
+                LASSERT (rc < 0 && rc != -EALREADY);
+                return (rc);
+        }
+
+        if (!active && 
+            hmv->magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+                /* Is this a generic acceptor connection request? */
+                rc = lnet_accept(ni, sock, hmv->magic);
+                if (rc != 0)
+                        return -EPROTO;
+
+                /* Yes it is! Start over again now I've skipping the generic
+                 * request */
+                rc = libcfs_sock_read(sock, &hmv->magic, 
+                                      sizeof (hmv->magic), timeout);
+                if (rc != 0) {
+                        CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                                rc, HIPQUAD(conn->ksnc_ipaddr));
+                        LASSERT (rc < 0 && rc != -EALREADY);
+                        return (rc);
+                }
+        }
+        
+        if (hmv->magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+                CERROR ("Bad magic %#08x (%#08x expected) from %u.%u.%u.%u\n",
+                        __cpu_to_le32 (hmv->magic), LNET_PROTO_TCP_MAGIC,
+                        HIPQUAD(conn->ksnc_ipaddr));
+                return (-EPROTO);
+        }
+
+        rc = libcfs_sock_read(sock, &hmv->magic + 1,
+                              sizeof(*hmv) - sizeof(hmv->magic), timeout);
+        if (rc != 0) {
+                CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
+                LASSERT (rc < 0 && rc != -EALREADY);
+                return (rc);
+        }
+        
+        if (hmv->version_major != cpu_to_le16 (LNET_PROTO_TCP_VERSION_MAJOR) ||
+            hmv->version_minor != cpu_to_le16 (LNET_PROTO_TCP_VERSION_MINOR)) {
+                CERROR ("Incompatible protocol version %d.%d (%d.%d expected)"
+                        " from %u.%u.%u.%u\n",
+                        le16_to_cpu (hmv->version_major),
+                        le16_to_cpu (hmv->version_minor),
+                        LNET_PROTO_TCP_VERSION_MAJOR,
+                        LNET_PROTO_TCP_VERSION_MINOR,
+                        HIPQUAD(conn->ksnc_ipaddr));
+                return (-EPROTO);
+        }
+
+#if (LNET_PROTO_TCP_VERSION_MAJOR != 1)
+# error "This code only understands protocol version 1.x"
+#endif
+        /* version 1 sends magic/version as the dest_nid of a 'hello'
+         * header, followed by payload full of interface IP addresses.
+         * Read the rest of it in now... */
+
+        rc = libcfs_sock_read(sock, hmv + 1, sizeof (hdr) - sizeof (*hmv), 
+                              timeout);
+        if (rc != 0) {
+                CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
+                LASSERT (rc < 0 && rc != -EALREADY);
+                return (rc);
+        }
+
+        /* ...and check we got what we expected */
+        if (hdr.type != cpu_to_le32 (LNET_MSG_HELLO)) {
+                CERROR ("Expecting a HELLO hdr,"
+                        " but got type %d from %u.%u.%u.%u\n",
+                        le32_to_cpu (hdr.type),
+                        HIPQUAD(conn->ksnc_ipaddr));
+                return (-EPROTO);
+        }
+
+        if (le64_to_cpu(hdr.src_nid) == LNET_NID_ANY) {
+                CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY"
+                       "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
+                return (-EPROTO);
+        }
+
+        if (conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {          
+                /* Userspace NAL assigns peer process ID from socket */
+                recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+                recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+        } else {
+                recv_id.nid = le64_to_cpu(hdr.src_nid);
+
+                if (the_lnet.ln_ptlcompat > 1 && /* portals peers may exist */
+                    LNET_NIDNET(recv_id.nid) == 0) /* this is one */
+                        recv_id.pid = the_lnet.ln_pid; /* give it a sensible pid */
+                else
+                        recv_id.pid = le32_to_cpu(hdr.src_pid);
+
+        }
+        
+        if (!active) {                          /* don't know peer's nid yet */
+                *peerid = recv_id;
+        } else if (peerid->pid != recv_id.pid ||
+                   !lnet_ptlcompat_matchnid(peerid->nid, recv_id.nid)) {
+                LCONSOLE_ERROR("Connected successfully to %s on host "
+                               "%u.%u.%u.%u, but they claimed they were "
+                               "%s; please check your Lustre "
+                               "configuration.\n",
+                               libcfs_id2str(*peerid),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               libcfs_id2str(recv_id));
+                return (-EPROTO);
+        }
+
+        type = __le32_to_cpu(hdr.msg.hello.type);
+
+        if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+                /* I've accepted this connection; peer determines type */
+                conn->ksnc_type = ksocknal_invert_type(type);
+                if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+                        CERROR ("Unexpected type %d from %s ip %u.%u.%u.%u\n",
+                                type, libcfs_id2str(*peerid), 
+                                HIPQUAD(conn->ksnc_ipaddr));
+                        return (-EPROTO);
+                }
+        } else if (type == SOCKLND_CONN_NONE) {
+                /* lost a connection race */
+                return -EALREADY;
+        } else if (ksocknal_invert_type(type) != conn->ksnc_type) {
+                CERROR ("Mismatched types: me %d, %s ip %u.%u.%u.%u %d\n",
+                        conn->ksnc_type, libcfs_id2str(*peerid), 
+                        HIPQUAD(conn->ksnc_ipaddr),
+                        le32_to_cpu(hdr.msg.hello.type));
+                return (-EPROTO);
+        }
+
+        *incarnation = le64_to_cpu(hdr.msg.hello.incarnation);
+
+        nips = __le32_to_cpu (hdr.payload_length) / sizeof (__u32);
+
+        if (nips > LNET_MAX_INTERFACES ||
+            nips * sizeof(__u32) != __le32_to_cpu (hdr.payload_length)) {
+                CERROR("Bad payload length %d from %s ip %u.%u.%u.%u\n",
+                       __le32_to_cpu (hdr.payload_length),
+                       libcfs_id2str(*peerid), HIPQUAD(conn->ksnc_ipaddr));
+        }
+
+        if (nips == 0)
+                return (0);
+        
+        rc = libcfs_sock_read(sock, ipaddrs, nips * sizeof(*ipaddrs), timeout);
+        if (rc != 0) {
+                CERROR ("Error %d reading IPs from %s ip %u.%u.%u.%u\n",
+                        rc, libcfs_id2str(*peerid), HIPQUAD(conn->ksnc_ipaddr));
+                return (rc);
+        }
+
+        for (i = 0; i < nips; i++) {
+                ipaddrs[i] = __le32_to_cpu(ipaddrs[i]);
+                
+                if (ipaddrs[i] == 0) {
+                        CERROR("Zero IP[%d] from %s ip %u.%u.%u.%u\n",
+                               i, libcfs_id2str(*peerid),
+                               HIPQUAD(conn->ksnc_ipaddr));
+                        return (-EPROTO);
+                }
+        }
+
+        return (nips);
+}
+
+/* Establish connection(s) for 'route': loop creating each missing
+ * connection type (or a single CONN_ANY if typed connections are
+ * disabled) until everything is connected, the configured timeout
+ * passes, or an error occurs.  On failure, double the route's retry
+ * interval (clamped between the min/max reconnect tunables) and
+ * complete any stranded queued tx's with an error. */
+void
+ksocknal_connect (ksock_route_t *route)
+{
+        CFS_LIST_HEAD    (zombies);     /* tx's to fail if we give up */
+        ksock_peer_t     *peer = route->ksnr_peer;
+        unsigned long     flags;
+        int               type;
+        struct socket    *sock;
+        cfs_time_t        deadline;
+        int               rc = 0;
+
+        deadline = cfs_time_add(cfs_time_current(), 
+                                cfs_time_seconds(*ksocknal_tunables.ksnd_timeout));
+
+        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+
+        for (;;) {
+                /* pick the next connection type still missing; ksnr_connected
+                 * is a bitmask of established types */
+                if (!*ksocknal_tunables.ksnd_typed_conns) {
+                        if ((route->ksnr_connected & (1<<SOCKLND_CONN_ANY)) == 0)
+                                type = SOCKLND_CONN_ANY;
+                        else
+                                break;  /* got connected while route queued */
+                } else {
+                        if ((route->ksnr_connected & (1<<SOCKLND_CONN_CONTROL)) == 0)
+                                type = SOCKLND_CONN_CONTROL;
+                        else if ((route->ksnr_connected & (1<<SOCKLND_CONN_BULK_IN)) == 0)
+                                type = SOCKLND_CONN_BULK_IN;
+                        else if ((route->ksnr_connected & (1<<SOCKLND_CONN_BULK_OUT)) == 0)
+                                type = SOCKLND_CONN_BULK_OUT;
+                        else
+                                break;  /* got connected while route queued */
+                }
+
+                /* drop the lock around the (blocking) connection attempt */
+                write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+
+                if (cfs_time_aftereq(cfs_time_current(), deadline)) {
+                        lnet_connect_console_error(-ETIMEDOUT, peer->ksnp_id.nid,
+                                                   route->ksnr_ipaddr,
+                                                   route->ksnr_port);
+                        goto failed;
+                }
+                
+                rc = lnet_connect(&sock, peer->ksnp_id.nid,
+                                  route->ksnr_myipaddr, 
+                                  route->ksnr_ipaddr, route->ksnr_port);
+                if (rc != 0)
+                        goto failed;
+
+                /* rc < 0: error; rc > 0: lost a connection race (see below) */
+                rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
+                if (rc < 0) {
+                        lnet_connect_console_error(rc, peer->ksnp_id.nid,
+                                                   route->ksnr_ipaddr, 
+                                                   route->ksnr_port);
+                        goto failed;
+                }
+
+                if (rc != 0) {
+                        /* lost connection race; peer is connecting to me, so
+                         * give her some time... */
+                        cfs_pause(cfs_time_seconds(1));
+                }
+                
+                write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+        }
+
+        LASSERT (route->ksnr_connecting == 0);
+        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+        return;
+
+ failed:
+        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+
+        route->ksnr_connecting = 0;
+
+        /* This is a retry rather than a new connection */
+        route->ksnr_retry_interval *= 2;
+        route->ksnr_retry_interval = 
+                MAX(route->ksnr_retry_interval,
+                    cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000);
+        route->ksnr_retry_interval = 
+                MIN(route->ksnr_retry_interval,
+                    cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000);
+        
+        LASSERT (route->ksnr_retry_interval != 0);
+        route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+                                           route->ksnr_retry_interval);
+
+        if (!list_empty(&peer->ksnp_tx_queue) &&
+            peer->ksnp_accepting == 0 &&
+            ksocknal_find_connecting_route_locked(peer) == NULL) {
+                /* ksnp_tx_queue is queued on a conn on successful
+                 * connection */
+                LASSERT (list_empty (&peer->ksnp_conns));
+
+                /* take all the blocked packets while I've got the lock and
+                 * complete below... */
+                list_add(&zombies, &peer->ksnp_tx_queue);
+                list_del_init(&peer->ksnp_tx_queue);
+        }
+
+#if 0           /* irrelevent with only eager routes */
+        if (!route->ksnr_deleted) {
+                /* make this route least-favourite for re-selection */
+                list_del(&route->ksnr_list);
+                list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+        }
+#endif
+        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+
+        /* notify upper layers and fail the stranded tx's */
+        ksocknal_peer_failed(peer);
+        ksocknal_txlist_done(peer->ksnp_ni, &zombies);
+}
+
+/* Connection daemon thread: services the global queue of accepted
+ * socket connection requests (ksnd_connd_connreqs) and the queue of
+ * routes needing outgoing connections (ksnd_connd_routes) until
+ * shutdown.  ksnd_connd_lock is dropped around the (blocking)
+ * connection work itself. */
+int
+ksocknal_connd (void *arg)
+{
+        long               id = (long)arg;
+        char               name[16];
+        unsigned long      flags;
+        ksock_connreq_t   *cr;
+        ksock_route_t     *route;
+        int                rc;
+        int                did_something;
+
+        snprintf (name, sizeof (name), "socknal_cd%02ld", id);
+        libcfs_daemonize (name);
+        libcfs_blockallsigs ();
+
+        spin_lock_irqsave (&ksocknal_data.ksnd_connd_lock, flags);
+
+        while (!ksocknal_data.ksnd_shuttingdown) {
+
+                did_something = 0;
+
+                if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+                        /* Connection accepted by the listener */
+                        cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next,
+                                        ksock_connreq_t, ksncr_list);
+                        
+                        list_del(&cr->ksncr_list);
+                        spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock, 
+                                               flags);
+                        
+                        /* CONN_NONE: the peer's HELLO determines the type */
+                        ksocknal_create_conn(cr->ksncr_ni, NULL, 
+                                             cr->ksncr_sock, SOCKLND_CONN_NONE);
+                        lnet_ni_decref(cr->ksncr_ni);
+                        LIBCFS_FREE(cr, sizeof(*cr));
+                        
+                        spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock,
+                                          flags);
+                        did_something = 1;
+                }
+
+                if (!list_empty (&ksocknal_data.ksnd_connd_routes)) {
+                        /* Connection request */
+                        route = list_entry (ksocknal_data.ksnd_connd_routes.next,
+                                            ksock_route_t, ksnr_connd_list);
+
+                        list_del (&route->ksnr_connd_list);
+                        spin_unlock_irqrestore (&ksocknal_data.ksnd_connd_lock, flags);
+
+                        ksocknal_connect (route);
+                        /* drop the ref taken when the route was queued */
+                        ksocknal_route_decref(route);
+
+                        spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock,
+                                          flags);
+                        did_something = 1;
+                }
+
+                if (did_something)
+                        continue;
+
+                spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock,
+                                       flags);
+
+                /* NOTE(review): libcfs macro — returns its status via 'rc' */
+                wait_event_interruptible(ksocknal_data.ksnd_connd_waitq,
+                                         ksocknal_data.ksnd_shuttingdown ||
+                                         !list_empty(&ksocknal_data.ksnd_connd_connreqs) ||
+                                         !list_empty(&ksocknal_data.ksnd_connd_routes), rc);
+
+                spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags);
+        }
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_connd_lock, flags);
+
+        ksocknal_thread_fini ();
+        return (0);
+}
+
+/* Return the first connection of 'peer' whose incoming or outgoing
+ * traffic has passed its deadline, with a reference added for the
+ * caller, or NULL if none has timed out.  Caller holds a shared
+ * (read) lock on ksnd_global_lock. */
+ksock_conn_t *
+ksocknal_find_timed_out_conn (ksock_peer_t *peer) 
+{
+        /* We're called with a shared lock on ksnd_global_lock */
+        ksock_conn_t      *conn;
+        struct list_head  *ctmp;
+
+        list_for_each (ctmp, &peer->ksnp_conns) {
+                conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+                /* Don't need the {get,put}connsock dance to deref ksnc_sock... */
+                LASSERT (!conn->ksnc_closing);
+
+                if (conn->ksnc_rx_started &&
+                    cfs_time_aftereq (cfs_time_current(), conn->ksnc_rx_deadline)) {
+                        /* Timed out incomplete incoming message */
+                        ksocknal_conn_addref(conn);
+                        LCONSOLE_ERROR("A timeout occurred receiving data from "
+                                       "%u.%u.%u.%u; the network or that node "
+                                       "may be down.\n",
+                                       HIPQUAD(conn->ksnc_ipaddr));
+                        /* %u.%u.%u.%u for consistency with every other
+                         * HIPQUAD message in this file */
+                        CERROR ("Timed out RX from %s %p %u.%u.%u.%u\n",
+                                libcfs_id2str(peer->ksnp_id),
+                                conn, HIPQUAD(conn->ksnc_ipaddr));
+                        return (conn);
+                }
+
+                if ((!list_empty (&conn->ksnc_tx_queue)) &&
+                    cfs_time_aftereq (cfs_time_current(), conn->ksnc_tx_deadline)) {
+                        /* Timed out messages queued for sending or
+                         * buffered in the socket's send buffer */
+                        ksocknal_conn_addref(conn);
+                        LCONSOLE_ERROR("A timeout occurred sending data to "
+                                       "%u.%u.%u.%u; the network or that node "
+                                       "may be down.\n",
+                                       HIPQUAD(conn->ksnc_ipaddr));
+                        return (conn);
+                }
+        }
+
+        return (NULL);
+}
+
+/* Check every peer on hash chain 'idx' for timed-out connections and
+ * close any found.  Restarts the scan from the top after each close,
+ * because closing requires dropping the global lock (see NB below). */
+void
+ksocknal_check_peer_timeouts (int idx)
+{
+        struct list_head *peers = &ksocknal_data.ksnd_peers[idx];
+        struct list_head *ptmp;
+        ksock_peer_t     *peer;
+        ksock_conn_t     *conn;
+
+ again:
+        /* NB. We expect to have a look at all the peers and not find any
+         * connections to time out, so we just use a shared lock while we
+         * take a look... */
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        list_for_each (ptmp, peers) {
+                peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+                conn = ksocknal_find_timed_out_conn (peer);
+                
+                if (conn != NULL) {
+                        /* drop the shared lock before closing; conn holds a
+                         * ref from ksocknal_find_timed_out_conn() */
+                        read_unlock (&ksocknal_data.ksnd_global_lock);
+
+                        CERROR ("Timeout out conn->%s ip %d.%d.%d.%d:%d\n",
+                                libcfs_id2str(peer->ksnp_id),
+                                HIPQUAD(conn->ksnc_ipaddr),
+                                conn->ksnc_port);
+                        ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+                        
+                        /* NB we won't find this one again, but we can't
+                         * just proceed with the next peer, since we dropped
+                         * ksnd_global_lock and it might be dead already! */
+                        ksocknal_conn_decref(conn);
+                        goto again;
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+}
+
+/* Reaper daemon: terminates deathrow conns, destroys zombie conns,
+ * reschedules conns that stalled with ENOMEM, and periodically sweeps
+ * the peer hash table for connection timeouts.  Runs until
+ * ksnd_shuttingdown is set, then exits via ksocknal_thread_fini(). */
+int
+ksocknal_reaper (void *arg)
+{
+        cfs_waitlink_t     wait;
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        ksock_sched_t     *sched;
+        struct list_head   enomem_conns;
+        int                nenomem_conns;
+        cfs_duration_t     timeout;
+        int                i;
+        int                peer_index = 0;
+        cfs_time_t         deadline = cfs_time_current();
+
+        libcfs_daemonize ("socknal_reaper");
+        libcfs_blockallsigs ();
+
+        CFS_INIT_LIST_HEAD(&enomem_conns);
+        cfs_waitlink_init (&wait);
+
+        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+
+        while (!ksocknal_data.ksnd_shuttingdown) {
+
+                /* conns awaiting termination: drop the lock around the
+                 * (potentially blocking) terminate, then re-check */
+                if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) {
+                        conn = list_entry (ksocknal_data.ksnd_deathrow_conns.next,
+                                           ksock_conn_t, ksnc_list);
+                        list_del (&conn->ksnc_list);
+                        
+                        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                        ksocknal_terminate_conn (conn);
+                        ksocknal_conn_decref(conn);
+
+                        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+                        continue;
+                }
+
+                /* fully dead conns: final destruction */
+                if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) {
+                        conn = list_entry (ksocknal_data.ksnd_zombie_conns.next,
+                                           ksock_conn_t, ksnc_list);
+                        list_del (&conn->ksnc_list);
+                        
+                        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                        ksocknal_destroy_conn (conn);
+
+                        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+                        continue;
+                }
+
+                /* grab the whole ENOMEM list in one splice while locked */
+                if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) {
+                        list_add(&enomem_conns, &ksocknal_data.ksnd_enomem_conns);
+                        list_del_init(&ksocknal_data.ksnd_enomem_conns);
+                }
+
+                spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                /* reschedule all the connections that stalled with ENOMEM... */
+                nenomem_conns = 0;
+                while (!list_empty (&enomem_conns)) {
+                        conn = list_entry (enomem_conns.next,
+                                           ksock_conn_t, ksnc_tx_list);
+                        list_del (&conn->ksnc_tx_list);
+
+                        sched = conn->ksnc_scheduler;
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+
+                        LASSERT (conn->ksnc_tx_scheduled);
+                        conn->ksnc_tx_ready = 1;
+                        list_add_tail(&conn->ksnc_tx_list,&sched->kss_tx_conns);
+                        cfs_waitq_signal (&sched->kss_waitq);
+
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+                        nenomem_conns++;
+                }
+
+                /* careful with the jiffy wrap... */
+                while ((timeout = cfs_time_sub(deadline,
+                                               cfs_time_current())) <= 0) {
+                        const int n = 4;
+                        const int p = 1;
+                        int       chunk = ksocknal_data.ksnd_peer_hash_size;
+
+                        /* Time to check for timeouts on a few more peers: I do
+                         * checks every 'p' seconds on a proportion of the peer
+                         * table and I need to check every connection 'n' times
+                         * within a timeout interval, to ensure I detect a
+                         * timeout on any connection within (n+1)/n times the
+                         * timeout interval. */
+
+                        if (*ksocknal_tunables.ksnd_timeout > n * p)
+                                chunk = (chunk * n * p) /
+                                        *ksocknal_tunables.ksnd_timeout;
+                        if (chunk == 0)
+                                chunk = 1;
+
+                        for (i = 0; i < chunk; i++) {
+                                ksocknal_check_peer_timeouts (peer_index);
+                                peer_index = (peer_index + 1) %
+                                             ksocknal_data.ksnd_peer_hash_size;
+                        }
+
+                        deadline = cfs_time_add(deadline, cfs_time_seconds(p));
+                }
+
+                if (nenomem_conns != 0) {
+                        /* Reduce my timeout if I rescheduled ENOMEM conns.
+                         * This also prevents me getting woken immediately
+                         * if any go back on my enomem list. */
+                        timeout = SOCKNAL_ENOMEM_RETRY;
+                }
+                ksocknal_data.ksnd_reaper_waketime =
+                        cfs_time_add(cfs_time_current(), timeout);
+
+                /* sleep (interruptibly) unless new work arrived meanwhile */
+                set_current_state (TASK_INTERRUPTIBLE);
+                cfs_waitq_add (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+                if (!ksocknal_data.ksnd_shuttingdown &&
+                    list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
+                    list_empty (&ksocknal_data.ksnd_zombie_conns))
+                        cfs_waitq_timedwait (&wait, CFS_TASK_INTERRUPTIBLE, timeout);
+
+                set_current_state (TASK_RUNNING);
+                cfs_waitq_del (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+                spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+        }
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+
+        ksocknal_thread_fini ();
+        return (0);
+}
diff --git a/lnet/klnds/tdilnd/socklnd_lib-winnt.c b/lnet/klnds/tdilnd/socklnd_lib-winnt.c
new file mode 100644 (file)
index 0000000..e05aee2
--- /dev/null
@@ -0,0 +1,595 @@
+#include "socklnd.h"
+
+/* sysctl plumbing for the socknal tunables: registers a "socknal"
+ * directory of /proc entries when CONFIG_SYSCTL is available and
+ * module params aren't exported via sysfs; otherwise no-op stubs. */
+# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+static ctl_table ksocknal_ctl_table[18];
+
+ctl_table ksocknal_top_ctl_table[] = {
+        {200, "socknal", NULL, 0, 0555, ksocknal_ctl_table},
+        { 0 }
+};
+
+/* Fill ksocknal_ctl_table and register it.  Returns 0 even if
+ * registration fails (only warns) -- tunables then lack /proc entries. */
+int
+ksocknal_lib_tunables_init () 
+{
+       int    i = 0;
+       int    j = 1;
+       
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "timeout", ksocknal_tunables.ksnd_timeout, 
+                sizeof (int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "credits", ksocknal_tunables.ksnd_credits, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "peer_credits", ksocknal_tunables.ksnd_peercredits, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "nconnds", ksocknal_tunables.ksnd_nconnds, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "min_reconnectms", ksocknal_tunables.ksnd_min_reconnectms, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "max_reconnectms", ksocknal_tunables.ksnd_max_reconnectms, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "eager_ack", ksocknal_tunables.ksnd_eager_ack, 
+                sizeof (int), 0644, NULL, &proc_dointvec};
+#if SOCKNAL_ZC
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "zero_copy", ksocknal_tunables.ksnd_zc_min_frag, 
+                sizeof (int), 0644, NULL, &proc_dointvec};
+#endif
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "typed", ksocknal_tunables.ksnd_typed_conns, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "min_bulk", ksocknal_tunables.ksnd_min_bulk, 
+                sizeof (int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "buffer_size", ksocknal_tunables.ksnd_buffer_size, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "nagle", ksocknal_tunables.ksnd_nagle, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+#if CPU_AFFINITY
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "irq_affinity", ksocknal_tunables.ksnd_irq_affinity, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+#endif
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "keepalive_idle", ksocknal_tunables.ksnd_keepalive_idle, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "keepalive_count", ksocknal_tunables.ksnd_keepalive_count, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+       ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "keepalive_intvl", ksocknal_tunables.ksnd_keepalive_intvl, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+
+       /* i and j advance in lock-step from (0,1), so j == i+1 here;
+        * the second assert guards against overflowing the fixed table */
+       LASSERT (j == i+1);
+       LASSERT (i < sizeof(ksocknal_ctl_table)/sizeof(ksocknal_ctl_table[0]));
+
+        ksocknal_tunables.ksnd_sysctl =
+                register_sysctl_table(ksocknal_top_ctl_table, 0);
+
+        if (ksocknal_tunables.ksnd_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+       return 0;
+}
+
+/* Unregister the sysctl table if it was successfully registered. */
+void
+ksocknal_lib_tunables_fini () 
+{
+        if (ksocknal_tunables.ksnd_sysctl != NULL)
+                unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);        
+}
+#else
+/* No sysctl support configured: nothing to register/unregister. */
+int
+ksocknal_lib_tunables_init () 
+{
+       return 0;
+}
+
+void 
+ksocknal_lib_tunables_fini ()
+{
+}
+#endif
+
+/* IRQ affinity binding: no-op on this (winnt) platform. */
+void
+ksocknal_lib_bind_irq (unsigned int irq)
+{
+}
+
+/* Fill in the conn's peer address/port and local address from its
+ * socket.  Returns 0 on success or the libcfs_sock_getaddr() error. */
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+        /* remote = 1: peer's IP and port */
+        int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+                                    &conn->ksnc_ipaddr,
+                                    &conn->ksnc_port);
+
+        /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+        LASSERT (!conn->ksnc_closing);
+
+        if (rc != 0) {
+                CERROR ("Error %d getting sock peer IP\n", rc);
+                return rc;
+        }
+
+        /* remote = 0: our own IP (port not needed) */
+        rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+                                &conn->ksnc_myipaddr, NULL);
+        if (rc != 0) {
+                CERROR ("Error %d getting sock local IP\n", rc);
+                return rc;
+        }
+
+        return 0;
+}
+
+/* IRQ lookup for a socket: always 0 on this platform (no IRQ affinity). */
+unsigned int
+ksocknal_lib_sock_irq (struct socket *sock)
+{
+        int                irq = 0;
+        return irq;
+}
+
+#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
+/* Map a kernel virtual address to its struct page, handling vmalloc,
+ * (optionally) pkmap, and directly-mapped regions.  Returns NULL if
+ * the resulting page is invalid.  Only compiled for zero-copy support. */
+static struct page *
+ksocknal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END)
+                page = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+        else if (vaddr >= PKMAP_BASE &&
+                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+                page = vmalloc_to_page ((void *)vaddr);
+                /* in 2.4 ^ just walks the page tables */
+#endif
+        else
+                page = virt_to_page (vaddr);
+
+        if (page == NULL ||
+            !VALID_PAGE (page))
+                return (NULL);
+
+        return (page);
+}
+#endif
+
+
+/* Request an eager TCP ACK: no-op on this platform
+ * (SOCKNAL_ARCH_EAGER_ACK is 0 in socklnd_lib-winnt.h). */
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+}
+
+/* Report the conn's socket tunables.  On this platform only the nagle
+ * setting is read (inverted from TCP_SOCKET_NODELAY); txmem/rxmem are
+ * always reported as 0.  On error all three outputs are zeroed. */
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+        /* NOTE(review): ksnc_sock assigned straight to ksock_tconn_t * --
+         * presumably 'struct socket' is the tdi tconn on winnt; confirm */
+        ksock_tconn_t * tconn = conn->ksnc_sock;
+        int             len;
+        int             rc;
+
+        ksocknal_get_tconn (tconn);
+        
+        *txmem = *rxmem = 0;
+
+        len = sizeof(*nagle);
+
+        rc = ksocknal_get_tcp_option(
+                    tconn, TCP_SOCKET_NODELAY,
+                    (__u32 *)nagle, &len);
+
+        ksocknal_put_tconn (tconn);
+
+        /* NOTE(review): looks like leftover debug output -- consider CDEBUG */
+        printk("ksocknal_get_conn_tunables: nodelay = %d rc = %d\n", *nagle, rc);
+
+        if (rc == 0)
+                *nagle = !*nagle;
+        else
+                *txmem = *rxmem = *nagle = 0;
+                
+        return (rc);
+}
+
+/* Choose a socket buffer size given the current size and the tunable.
+ * Returns the new size to set, or 0 to leave the socket's size alone. */
+int
+ksocknal_lib_buffersize (int current_sz, int tunable_sz)
+{
+       /* ensure >= SOCKNAL_MIN_BUFFER */
+       if (current_sz < SOCKNAL_MIN_BUFFER)
+               return MAX(SOCKNAL_MIN_BUFFER, tunable_sz);
+
+       /* current is already >= the minimum; grow only if the tunable asks */
+       if (tunable_sz > SOCKNAL_MIN_BUFFER)
+               return tunable_sz;
+       
+       /* leave alone */
+       return 0;
+}
+
+int
+ksocknal_lib_setup_sock (struct socket *sock)
+{
+        int             rc;
+
+        int             keep_idle;
+        int             keep_count;
+        int             keep_intvl;
+        int             keep_alive;
+
+        __u32           option;
+
+        /* set the window size */
+
+#if 0
+        tconn->kstc_snd_wnd = ksocknal_tunables.ksnd_buffer_size;
+        tconn->kstc_rcv_wnd = ksocknal_tunables.ksnd_buffer_size;
+#endif
+
+        /* disable nagle */
+        if (!ksocknal_tunables.ksnd_nagle) {
+                option = 1;
+                
+                rc = ksocknal_set_tcp_option(
+                            sock, TCP_SOCKET_NODELAY,
+                            &option, sizeof (option));
+                if (rc != 0) {
+                        printk ("Can't disable nagle: %d\n", rc);
+                        return (rc);
+                }
+        }
+
+        /* snapshot tunables */
+        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
+        keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+        
+        keep_alive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+
+        option = (__u32)(keep_alive ? 1 : 0);
+
+        rc = ksocknal_set_tcp_option(
+                    sock, TCP_SOCKET_KEEPALIVE,
+                    &option, sizeof (option));
+        if (rc != 0) {
+                CERROR (("Can't disable nagle: %d\n", rc));
+                return (rc);
+        }
+
+        return (0);
+}
+
+/* Force buffered data out of a conn's socket by momentarily enabling
+ * TCP_NODELAY, then restore the connection's saved nagle setting.
+ * The saved value lives in the sender/child union depending on
+ * tconn type; kstc_lock protects it. */
+void
+ksocknal_push_conn (ksock_conn_t *conn)
+{
+        ksock_tconn_t * tconn;
+        __u32           nagle;
+        __u32           val = 1;
+        int             rc;
+
+        tconn = conn->ksnc_sock;
+
+        ksocknal_get_tconn(tconn);
+
+        /* stash the current nagle setting and clear it under the lock */
+        spin_lock(&tconn->kstc_lock);
+        if (tconn->kstc_type == kstt_sender) {
+            nagle = tconn->sender.kstc_info.nagle;
+            tconn->sender.kstc_info.nagle = 0;
+        } else {
+            LASSERT(tconn->kstc_type == kstt_child);
+            nagle = tconn->child.kstc_info.nagle;
+            tconn->child.kstc_info.nagle = 0;
+        }
+
+        spin_unlock(&tconn->kstc_lock);
+
+        /* NODELAY=1 flushes anything Nagle was holding back */
+        val = 1;
+        rc = ksocknal_set_tcp_option(
+                    tconn,
+                    TCP_SOCKET_NODELAY,
+                    &(val),
+                    sizeof(__u32)
+                    );
+
+        LASSERT (rc == 0);
+
+        /* restore the saved nagle setting */
+        spin_lock(&tconn->kstc_lock);
+
+        if (tconn->kstc_type == kstt_sender) {
+            tconn->sender.kstc_info.nagle = nagle;
+        } else {
+            LASSERT(tconn->kstc_type == kstt_child);
+            tconn->child.kstc_info.nagle = nagle;
+        }
+        spin_unlock(&tconn->kstc_lock);
+
+        ksocknal_put_tconn(tconn);
+}
+
+/* @mode: 0: receiving mode / 1: sending mode */
+/* Mark a conn ready for TX (mode=1, optionally re-queueing a partially
+ * sent tx at the HEAD of its queue) or RX (mode=0), and put it on the
+ * scheduler's work list if it isn't already scheduled.  The scheduler
+ * gets its own conn reference while the conn is on its list. */
+void
+ksocknal_sched_conn (ksock_conn_t *conn, int mode, ksock_tx_t *tx)
+{
+        int             flags;
+        ksock_sched_t * sched;
+        ENTRY;
+
+        /* interleave correctly with closing sockets... */
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        sched = conn->ksnc_scheduler;
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        if (mode) { /* transmission can continue ... */ 
+
+                conn->ksnc_tx_ready = 1;
+
+                if (tx) {
+                    /* Incomplete send: place tx on HEAD of tx_queue */
+                    list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+                }
+
+                if ( !conn->ksnc_tx_scheduled && 
+                     !list_empty(&conn->ksnc_tx_queue)) {  //packets to send
+                        list_add_tail (&conn->ksnc_tx_list,
+                                       &sched->kss_tx_conns);
+                        conn->ksnc_tx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_conn_refcount);
+
+                        cfs_waitq_signal (&sched->kss_waitq);
+                }
+        } else {    /* receiving can continue ... */
+
+                conn->ksnc_rx_ready = 1;
+
+                if ( !conn->ksnc_rx_scheduled) {  /* not being progressed */
+                        list_add_tail(&conn->ksnc_rx_list,
+                                      &sched->kss_rx_conns);
+                        conn->ksnc_rx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_conn_refcount);
+
+                        cfs_waitq_signal (&sched->kss_waitq);
+                }
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+
+        EXIT;
+}
+
+/* Socket-layer callback (installed via kstc_sched_cb): wake the conn's
+ * scheduler.  For RX (mode=0) only schedule when enough bytes arrived
+ * relative to ksnc_rx_nob_wanted (CAN_BE_SCHED); TX always schedules. */
+void ksocknal_schedule_callback(struct socket*sock, int mode, void * tx, ulong_ptr bytes)
+{
+    ksock_conn_t * conn = (ksock_conn_t *) sock->kstc_conn;
+
+    if (mode) {
+        ksocknal_sched_conn(conn, mode, tx);
+    } else {
+        if ( CAN_BE_SCHED(bytes, (ulong_ptr)conn->ksnc_rx_nob_wanted )) {
+            ksocknal_sched_conn(conn, mode, tx);
+        }
+    }
+}
+
+
+/* Work-item body (see ksocknal_update_tx): finish a completed tx at
+ * passive IRQL and free the work-item wrapper itself. */
+void
+ksocknal_fini_sending(ksock_tcpx_fini_t *tcpx)
+{
+    ksocknal_tx_launched(tcpx->tx);
+    cfs_free(tcpx);
+}
+
+/* Advance a tx by 'rc' bytes just sent (installed as kstc_update_tx).
+ * If data remains, consume 'rc' from the iov (preferred) or kiov
+ * fragments and return the tx for rescheduling; if the tx is complete,
+ * finish it (deferred to a work item when allocation succeeds) and
+ * return NULL. */
+PVOID
+ksocknal_update_tx(
+    struct socket*  tconn,
+    PVOID           txp,
+    ulong_ptr       rc
+    )
+{
+    ksock_tx_t *    tx = (ksock_tx_t *)txp;
+
+    /*
+     *  the transmission was done, we need update the tx
+     */
+
+    LASSERT(tx->tx_resid >= (int)rc);
+    tx->tx_resid -= (int)rc;
+
+    /*
+     *  just partial of tx is sent out, we need update
+     *  the fields of tx and schedule later transmission.
+     */
+
+    if (tx->tx_resid) {
+
+        if (tx->tx_niov > 0) {
+
+            /* if there's iov, we need process iov first */
+            while (rc > 0 ) {
+                if (rc < tx->tx_iov->iov_len) {
+                    /* didn't send whole iov entry... */
+                    tx->tx_iov->iov_base = 
+                        (char *)(tx->tx_iov->iov_base) + rc;
+                    tx->tx_iov->iov_len -= rc;
+                    rc = 0;
+                 } else {
+                    /* the whole of iov was sent out */
+                    rc -= tx->tx_iov->iov_len;
+                    tx->tx_iov++;
+                    tx->tx_niov--;
+                }
+            }
+
+        } else {
+
+            /* now we need process the kiov queues ... */
+
+            while (rc > 0 ) {
+
+                if (rc < tx->tx_kiov->kiov_len) {
+                    /* didn't send whole kiov entry... */
+                    tx->tx_kiov->kiov_offset += rc;
+                    tx->tx_kiov->kiov_len -= rc;
+                    rc = 0;
+                } else {
+                    /* whole kiov was sent out */
+                    rc -= tx->tx_kiov->kiov_len;
+                    tx->tx_kiov++;
+                    tx->tx_nkiov--;
+                }
+            }
+        }
+
+    } else {
+
+        /* tx fully sent: defer completion to a system work item so it
+         * runs at passive level; fall back to finishing inline if the
+         * small wrapper allocation fails */
+        ksock_tcpx_fini_t * tcpx = 
+                cfs_alloc(sizeof(ksock_tcpx_fini_t), CFS_ALLOC_ZERO);
+
+        ASSERT(tx->tx_resid == 0);
+
+        if (!tcpx) {
+
+            ksocknal_tx_launched (tx);
+
+        } else {
+
+            tcpx->tx = tx;
+            ExInitializeWorkItem(
+                    &(tcpx->item), 
+                    ksocknal_fini_sending,
+                    tcpx
+            );
+            ExQueueWorkItem(
+                    &(tcpx->item),
+                    CriticalWorkQueue
+                    );
+        }
+
+        tx = NULL;
+    }
+
+    return (PVOID)tx;
+}
+
+/* Save original socket callbacks: nothing to save on this platform. */
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+}
+
+/* Install the socknal scheduling and tx-update callbacks on the socket. */
+void
+ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn)
+{
+    sock->kstc_sched_cb  = ksocknal_schedule_callback;
+    sock->kstc_update_tx = ksocknal_update_tx;
+
+       return;
+}
+
+/* Kick the callbacks once each for TX (TRUE) and RX (FALSE) so any
+ * work pending before the callbacks were installed gets scheduled. */
+void
+ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn)
+{
+    sock->kstc_sched_cb(sock, TRUE,  NULL, 0);
+    sock->kstc_sched_cb(sock, FALSE, NULL, 0);
+
+       return;
+}
+
+/* Restore original socket callbacks: no-op (nothing was saved). */
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+       return ;
+}
+
+/*
+ * ksocknal_lock_kiovs
+ *   Lock the kiov pages into MDL structure
+ *
+ * Arguments:
+ *   kiov:  the array of kiov pages
+ *   niov:  number of kiov to be locked
+ *   len:   the real length of the kiov arrary
+ *
+ * Return Value:
+ *   PMDL: the Mdl of the locked buffers or NULL
+ *         pointer in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+ksock_mdl_t *
+ksocknal_lock_kiovs(
+    IN lnet_kiov_t *  kiov,
+    IN int            nkiov,
+    IN int            recving,
+    IN int *          len )
+{
+    int             rc = 0;
+    int             i = 0;
+    int             total = 0;
+    ksock_mdl_t *   mdl = NULL;
+    ksock_mdl_t *   tail = NULL;
+
+    LASSERT(kiov != NULL);
+    LASSERT(nkiov > 0);
+    LASSERT(len != NULL);
+
+    for (i=0; i < nkiov; i++) {
+
+        ksock_mdl_t *        Iovec = NULL;
+
+
+        //
+        //  Lock the kiov page into Iovec ...
+        //
+
+        /* write access when receiving into the page, read when sending */
+        rc = ksocknal_lock_buffer(
+                (PUCHAR)kiov[i].kiov_page->addr + 
+                     kiov[i].kiov_offset,
+                FALSE,
+                kiov[i].kiov_len,
+                recving ? IoWriteAccess : IoReadAccess,
+                &Iovec
+            );
+
+        if (rc < 0) {
+            break;
+        }
+
+        //
+        // Attach the Iovec to the mdl chain
+        //
+
+        if (tail) {
+            tail->Next = Iovec;
+        } else {
+            mdl = Iovec;
+        }
+
+        tail = Iovec;
+
+        total += kiov[i].kiov_len;
+
+    }
+
+    if (rc >= 0) {
+        *len = total;
+    } else {
+        /* failure: release any MDLs already chained and report NULL */
+        if (mdl) {
+            ksocknal_release_mdl(mdl, FALSE);
+            mdl = NULL;
+        }
+    }
+
+    return mdl;
+}
+
+/* Eager-ack hook: no-op on this platform.
+ * NOTE(review): ksocknal_lib_eager_ack above is also a no-op --
+ * presumably only one of the two is actually called; verify. */
+void
+ksocknal_eager_ack (ksock_conn_t *conn)
+{
+    return;
+}
\ No newline at end of file
diff --git a/lnet/klnds/tdilnd/socklnd_lib-winnt.h b/lnet/klnds/tdilnd/socklnd_lib-winnt.h
new file mode 100644 (file)
index 0000000..7572420
--- /dev/null
@@ -0,0 +1,44 @@
+#define DEBUG_PORTAL_ALLOC
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#ifndef __WINNT_SOCKNAL_LIB_H__
+#define __WINNT_SOCKNAL_LIB_H__
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+#define SOCKNAL_ARCH_EAGER_ACK 0
+
+#ifndef CONFIG_SMP
+
+/* Uniprocessor: a single scheduler thread. */
+static inline
+int ksocknal_nsched(void)
+{
+        return 1;
+}
+
+#else
+
+/* SMP: one scheduler per online CPU. */
+static inline int
+ksocknal_nsched(void)
+{
+        return num_online_cpus();
+}
+
+/* Identity mapping from scheduler index to CPU. */
+static inline int
+ksocknal_sched2cpu(int i)
+{
+        return i;
+}
+
+/* Identity mapping from IRQ scheduler index to CPU.
+ * NOTE(review): the !CONFIG_SMP branch defines neither sched2cpu nor
+ * irqsched2cpu -- confirm no UP caller needs them. */
+static inline int
+ksocknal_irqsched2cpu(int i)
+{
+        return i;
+}
+
+#endif
+
+#endif
diff --git a/lnet/klnds/tdilnd/socklnd_modparams.c b/lnet/klnds/tdilnd/socklnd_modparams.c
new file mode 100644 (file)
index 0000000..faefe78
--- /dev/null
@@ -0,0 +1,113 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+static int timeout = SOCKNAL_TIMEOUT;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+                "dead socket timeout (seconds)");
+
+static int credits = SOCKNAL_CREDITS;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+                "# concurrent sends");
+
+static int peer_credits = SOCKNAL_PEERCREDITS;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+                "# concurrent sends to 1 peer");
+
+static int nconnds = SOCKNAL_NCONND;
+CFS_MODULE_PARM(nconnds, "i", int, 0444,
+                "# connection daemons");
+
+static int min_reconnectms = SOCKNAL_MIN_RECONNECTMS;
+CFS_MODULE_PARM(min_reconnectms, "i", int, 0644,
+                "min connection retry interval (mS)");
+
+static int max_reconnectms = SOCKNAL_MAX_RECONNECTMS;
+CFS_MODULE_PARM(max_reconnectms, "i", int, 0644,
+                "max connection retry interval (mS)");
+
+static int eager_ack = SOCKNAL_EAGER_ACK;
+CFS_MODULE_PARM(eager_ack, "i", int, 0644,
+                "send tcp ack packets eagerly");
+
+static int typed_conns = SOCKNAL_TYPED_CONNS;
+CFS_MODULE_PARM(typed_conns, "i", int, 0444,
+                "use different sockets for bulk");
+
+static int min_bulk = SOCKNAL_MIN_BULK;
+CFS_MODULE_PARM(min_bulk, "i", int, 0644,
+                "smallest 'large' message");
+
+static int buffer_size = SOCKNAL_BUFFER_SIZE;
+CFS_MODULE_PARM(buffer_size, "i", int, 0644,
+                "socket buffer size");
+
+static int nagle = SOCKNAL_NAGLE;
+CFS_MODULE_PARM(nagle, "i", int, 0644,
+                "enable NAGLE?");
+
+static int keepalive_idle = SOCKNAL_KEEPALIVE_IDLE;
+CFS_MODULE_PARM(keepalive_idle, "i", int, 0644,
+                "# idle seconds before probe");
+
+static int keepalive_count = SOCKNAL_KEEPALIVE_COUNT;
+CFS_MODULE_PARM(keepalive_count, "i", int, 0644,
+                "# missed probes == dead");
+
+static int keepalive_intvl = SOCKNAL_KEEPALIVE_INTVL;
+CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644,
+                "seconds between probes");
+
+#if CPU_AFFINITY
+static int enable_irq_affinity = SOCKNAL_IRQ_AFFINITY;
+CFS_MODULE_PARM(enable_irq_affinity, "i", int, 0644,
+                "enable IRQ affinity");
+#endif
+
+#if SOCKNAL_ZC
+static unsigned int zc_min_frag = SOCKNAL_ZC_MIN_FRAG;
+CFS_MODULE_PARM(zc_min_frag, "i", int, 0644,
+                "minimum fragment to zero copy");
+#endif
+
+/* Global tunables table: each field points at the corresponding static
+ * module parameter above, so /proc (see socklnd_lib-winnt.c) and module
+ * load options share the same storage.  Members are pointers -- callers
+ * must dereference (e.g. *ksocknal_tunables.ksnd_timeout). */
+ksock_tunables_t ksocknal_tunables = {
+        /* .ksnd_timeout         = */ &timeout,
+        /* .ksnd_credits         = */ &credits,
+        /* .ksnd_peercredits     = */ &peer_credits,
+        /* .ksnd_nconnds         = */ &nconnds,
+        /* .ksnd_min_reconnectms = */ &min_reconnectms,
+        /* .ksnd_max_reconnectms = */ &max_reconnectms,
+        /* .ksnd_eager_ack       = */ &eager_ack,
+        /* .ksnd_typed_conns     = */ &typed_conns,
+        /* .ksnd_min_bulk        = */ &min_bulk,
+        /* .ksnd_buffer_size     = */ &buffer_size,
+        /* .ksnd_nagle           = */ &nagle,
+        /* .ksnd_keepalive_idle  = */ &keepalive_idle,
+        /* .ksnd_keepalive_count = */ &keepalive_count,
+        /* .ksnd_keepalive_intvl = */ &keepalive_intvl,
+#if SOCKNAL_ZC
+        /* .ksnd_zc_min_frag     = */ &zc_min_frag,
+#endif
+#if CPU_AFFINITY
+        /* .ksnd_irq_affinity    = */ &enable_irq_affinity,
+#endif
+};
+
diff --git a/lnet/klnds/tdilnd/tdilnd.h b/lnet/klnds/tdilnd/tdilnd.h
new file mode 100644 (file)
index 0000000..9cc3e57
--- /dev/null
@@ -0,0 +1,635 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _TDINAL_H_
+#define _TDINAL_H_
+
+/*
+ *  Included Headers 
+ */
+
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
+#include <lnet/socklnd.h>
+
+
+/*
+ * tdinal routines
+ */
+
+//
+// daemon.c
+//
+
+struct ksock_daemon *
+ksocknal_alloc_daemon(
+    unsigned short port,
+    int backlog
+    );
+
+void
+ksocknal_free_daemon(
+    struct ksock_daemon * daemon
+    );
+
+int
+ksocknal_daemon(
+    void * context
+    );
+
+void
+ksocknal_shut_daemon(
+    struct ksock_daemon *daemon
+    );
+
+int
+ksocknal_start_daemon(
+    unsigned short port,
+    int backlog
+    );
+
+void
+ksocknal_stop_daemon(
+    unsigned short port
+    );
+
+void
+ksocknal_stop_all_daemons();
+
+ksock_tconn_t *
+ksocknal_create_child_tconn(
+    ksock_tconn_t * parent
+    );
+
+void
+ksocknal_replenish_backlogs(
+    ksock_daemon_t * daemon
+    );
+
+int
+ksocknal_start_listen(
+  struct ksock_daemon * daemon
+    );
+
+int
+ksocknal_wait_child_tconn(
+    struct ksock_daemon * daemon,
+    ksock_tconn_t ** child
+    );
+
+ksock_tconn_t *
+ksocknal_get_vacancy_backlog(
+    ksock_tconn_t *  parent
+    );
+
+
+//
+// debug.c
+//
+
+
+PUCHAR
+KsNtStatusToString (IN NTSTATUS Status);
+
+
+VOID
+KsPrintf(
+    IN LONG  DebugPrintLevel,
+    IN PCHAR DebugMessage,
+    IN ...
+    );
+
+
+//
+// tconn.c
+//
+
+
+ksock_mdl_t *
+ksocknal_lock_iovs(
+    IN struct iovec  *iov,
+    IN int            niov,
+    IN int            recv,
+    IN int *          len
+    );
+
+ksock_mdl_t *
+ksocknal_lock_kiovs(
+    IN lnet_kiov_t *   kiov,
+    IN int            nkiov,
+    IN int            recv,
+    IN int *          len
+    );
+
+int
+ksocknal_send_mdl(
+    ksock_tconn_t * tconn,
+    ksock_tx_t *    tx,
+    ksock_mdl_t *   mdl,
+    int             len,
+    int             flags
+    );
+
+int
+ksocknal_query_data(
+    ksock_tconn_t * tconn,
+    size_t *        size,
+    int             bIsExpedited);
+
+int
+ksocknal_recv_mdl(
+    ksock_tconn_t * tconn,
+    ksock_mdl_t *   mdl,
+    int             size,
+    int             flags
+    );
+
+int
+ksocknal_get_tcp_option (
+    ksock_tconn_t *     tconn,
+    ULONG               ID,
+    PVOID               OptionValue,
+    PULONG              Length
+    );
+
+NTSTATUS
+ksocknal_set_tcp_option (
+    ksock_tconn_t * tconn,
+    ULONG           ID,
+    PVOID           OptionValue,
+    ULONG           Length
+    );
+
+int
+ksocknal_bind_tconn (
+    ksock_tconn_t * tconn,
+    ksock_tconn_t * parent,
+    ulong_ptr   addr,
+    unsigned short  port
+    );
+
+int
+ksocknal_build_tconn(
+    ksock_tconn_t *                 tconn,
+    ulong_ptr                   addr,
+    unsigned short                  port
+    );
+
+int
+ksocknal_disconnect_tconn(
+    ksock_tconn_t *     tconn,
+    ulong_ptr       flags
+    );
+
+void
+ksocknal_abort_tconn(
+    ksock_tconn_t *     tconn
+    );
+
+int
+ksocknal_query_local_ipaddr(
+    ksock_tconn_t *     tconn
+    );
+
+int
+ksocknal_init_tdi_data();
+
+void
+ksocknal_fini_tdi_data();
+
+int
+ksocknal_tconn_write (ksock_tconn_t *tconn, void *buffer, int nob);
+
+int
+ksocknal_tconn_read (ksock_tconn_t * tconn, void *buffer, int nob);
+
+int
+ksocknal_test_nagle(void * context);
+
+//
+// tcp.c
+//
+
+NTSTATUS
+KsTcpCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    );
+
+NTSTATUS
+KsDisconectCompletionRoutine (
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    );
+
+NTSTATUS
+KsTcpReceiveCompletionRoutine(
+    IN PIRP                         Irp,
+    IN PKS_TCP_COMPLETION_CONTEXT   Context
+    );
+
+NTSTATUS
+KsTcpSendCompletionRoutine(
+    IN PIRP                         Irp,
+    IN PKS_TCP_COMPLETION_CONTEXT   Context
+    );
+
+NTSTATUS
+KsAcceptCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    );
+
+
+NTSTATUS
+KsConnectEventHandler(
+    IN PVOID                    TdiEventContext,
+    IN LONG                     RemoteAddressLength,
+    IN PVOID                    RemoteAddress,
+    IN LONG                     UserDataLength,
+    IN PVOID                    UserData,
+    IN LONG                     OptionsLength,
+    IN PVOID                    Options,
+    OUT CONNECTION_CONTEXT *    ConnectionContext,
+    OUT PIRP *                  AcceptIrp
+    );
+
+NTSTATUS 
+KsDisconnectEventHandler(
+    IN PVOID                TdiEventContext,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN LONG                 DisconnectDataLength,
+    IN PVOID                DisconnectData,
+    IN LONG                 DisconnectInformationLength,
+    IN PVOID                DisconnectInformation,
+    IN ULONG                DisconnectFlags
+    );
+
+NTSTATUS
+KsTcpReceiveEventHandler(
+    IN PVOID                TdiEventContext, 
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+   );
+
+NTSTATUS
+KsTcpReceiveExpeditedEventHandler(
+    IN PVOID                TdiEventContext, 
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+    );
+
+NTSTATUS
+KsTcpChainedReceiveEventHandler (
+    IN PVOID TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT ConnectionContext,
+    IN ULONG ReceiveFlags, 
+    IN ULONG ReceiveLength,
+    IN ULONG StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL  Tsdu,                  // TSDU data chain
+    IN PVOID TsduDescriptor         // for call to TdiReturnChainedReceives
+    );
+
+NTSTATUS
+KsTcpChainedReceiveExpeditedEventHandler (
+    IN PVOID                TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags, 
+    IN ULONG                ReceiveLength,
+    IN ULONG                StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL                 Tsdu,                  // TSDU data chain
+    IN PVOID                TsduDescriptor         // for call to TdiReturnChainedReceives
+    );
+
+
+
+VOID
+KsDisconnectHelper(PKS_DISCONNECT_WORKITEM WorkItem);
+
+
+//
+// tdi.c
+//
+
+ULONG
+ksocknal_tdi_send_flags(ULONG SockFlags);
+
+PIRP
+KsBuildTdiIrp(
+    IN PDEVICE_OBJECT    DeviceObject
+    );
+
+NTSTATUS
+KsSubmitTdiIrp(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN BOOLEAN          bSynchronous,
+    OUT PULONG          Information
+    );
+
+NTSTATUS
+KsOpenControl(
+    IN PUNICODE_STRING      DeviceName,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   );
+
+NTSTATUS
+KsCloseControl(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+   );
+
+NTSTATUS
+KsOpenAddress(
+    IN PUNICODE_STRING      DeviceName,
+    IN PTRANSPORT_ADDRESS   pAddress,
+    IN ULONG                AddressLength,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   );
+
+NTSTATUS
+KsCloseAddress(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+    );
+
+NTSTATUS
+KsOpenConnection(
+    IN PUNICODE_STRING      DeviceName,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   );
+
+NTSTATUS
+KsCloseConnection(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+    );
+
+NTSTATUS
+KsAssociateAddress(
+    IN HANDLE           AddressHandle,
+    IN PFILE_OBJECT     ConnectionObject
+    );
+
+
+NTSTATUS
+KsDisassociateAddress(
+    IN PFILE_OBJECT     ConnectionObject
+    );
+
+
+NTSTATUS
+KsSetEventHandlers(
+    IN PFILE_OBJECT         AddressObject,
+    IN PVOID                EventContext,
+    IN PKS_EVENT_HANDLERS   Handlers
+   );
+
+
+NTSTATUS
+KsQueryProviderInfo(
+    PWSTR               TdiDeviceName,
+    PTDI_PROVIDER_INFO  ProviderInfo
+   );
+
+NTSTATUS
+KsQueryAddressInfo(
+    IN PFILE_OBJECT         FileObject,
+    OUT PTDI_ADDRESS_INFO   AddressInfo,
+    OUT PULONG              AddressSize
+   );
+
+NTSTATUS
+KsQueryConnectionInfo(
+    IN PFILE_OBJECT            ConnectionObject,
+    OUT PTDI_CONNECTION_INFO   ConnectionInfo,
+    OUT PULONG                 ConnectionSize
+   );
+
+ULONG
+KsInitializeTdiAddress(
+    IN OUT PTA_IP_ADDRESS   pTransportAddress,
+    IN ULONG                IpAddress,
+    IN USHORT               IpPort
+    );
+
+ULONG
+KsQueryMdlsSize (IN PMDL Mdl);
+
+
+ULONG
+KsQueryTdiAddressLength(
+    OUT PTRANSPORT_ADDRESS   pTransportAddress
+    );
+
+NTSTATUS
+KsQueryIpAddress(
+    IN PFILE_OBJECT     FileObject,
+    OUT PVOID           TdiAddress,
+    OUT ULONG*          AddressLength
+    );
+
+
+NTSTATUS
+KsErrorEventHandler(
+    IN PVOID            TdiEventContext,
+    IN NTSTATUS         Status
+   );
+
+int
+ksocknal_set_handlers(
+    ksock_tconn_t *     tconn
+    );
+
+
+
+//
+// Strusup.c
+//
+
+VOID
+KsPrintProviderInfo(
+   PWSTR DeviceName,
+   PTDI_PROVIDER_INFO ProviderInfo
+   );
+
+VOID
+KsInitialize(VOID);
+
+VOID
+KsUninitialize(VOID);
+
+
+ksock_tconn_t *
+ksocknal_create_tconn();
+
+void
+ksocknal_free_tconn(
+    ksock_tconn_t * tconn
+    );
+
+void
+ksocknal_init_listener(
+    ksock_tconn_t * tconn
+    );
+
+void
+ksocknal_init_sender(
+    ksock_tconn_t * tconn
+    );
+
+void
+ksocknal_init_child(
+    ksock_tconn_t * tconn
+    );
+
+void
+ksocknal_get_tconn(
+    ksock_tconn_t * tconn
+    );
+
+void
+ksocknal_put_tconn(
+    ksock_tconn_t * tconn
+    );
+
+int
+ksocknal_reset_handlers(
+    ksock_tconn_t *     tconn
+    );
+
+void
+ksocknal_destroy_tconn(
+    ksock_tconn_t *     tconn
+    );
+
+
+PKS_TSDU
+KsAllocateKsTsdu();
+
+VOID
+KsPutKsTsdu(
+    PKS_TSDU  KsTsdu
+    );
+
+VOID
+KsFreeKsTsdu(
+    PKS_TSDU  KsTsdu
+    );
+
+VOID
+KsInitializeKsTsdu(
+    PKS_TSDU    KsTsdu,
+    ULONG       Length
+    );
+
+
+VOID
+KsInitializeKsTsduMgr(
+    PKS_TSDUMGR     TsduMgr
+    );
+
+VOID
+KsInitializeKsChain(
+    PKS_CHAIN       KsChain
+    );
+
+NTSTATUS
+KsCleanupTsduMgr(
+    PKS_TSDUMGR     KsTsduMgr
+    );
+
+NTSTATUS
+KsCleanupKsChain(
+    PKS_CHAIN   KsChain
+    );
+
+NTSTATUS
+KsCleanupTsdu(
+    ksock_tconn_t * tconn
+    );
+
+NTSTATUS
+KsCopyMdlChainToMdlChain(
+    IN PMDL     SourceMdlChain,
+    IN ULONG    SourceOffset,
+    IN PMDL     DestinationMdlChain,
+    IN ULONG    DestinationOffset,
+    IN ULONG    BytesTobecopied,
+    OUT PULONG  BytesCopied
+    );
+
+ULONG
+KsQueryMdlsSize (PMDL Mdl);
+
+NTSTATUS
+KsLockUserBuffer (
+    IN PVOID            UserBuffer,
+    IN BOOLEAN          bPaged,
+    IN ULONG            Length,
+    IN LOCK_OPERATION   Operation,
+    OUT PMDL *          pMdl
+    );
+
+PVOID
+KsMapMdlBuffer (PMDL    Mdl);
+
+VOID
+KsReleaseMdl ( IN PMDL   Mdl,
+               IN int    Paged );
+
+int
+ksocknal_lock_buffer (
+    void *            buffer,
+    int               paged,
+    int               length,
+    LOCK_OPERATION    access,
+    ksock_mdl_t **    kmdl
+    );
+
+void *
+ksocknal_map_mdl (ksock_mdl_t * mdl);
+
+void
+ksocknal_release_mdl (ksock_mdl_t *mdl, int paged);
+
+
+#endif //_TDINAL_H_
\ No newline at end of file
index 21f5548..61d4b8a 100644 (file)
@@ -11,7 +11,7 @@ DIST_SUBDIRS := $(SUBDIRS)
 
 if LIBLUSTRE
 noinst_LIBRARIES= libcfs.a
-libcfs_a_SOURCES= debug.c
+libcfs_a_SOURCES= debug.c user-prim.c user-lock.c
 libcfs_a_CPPFLAGS = $(LLCPPFLAGS)
 libcfs_a_CFLAGS = $(LLCFLAGS)
 endif
@@ -25,12 +25,12 @@ endif
 if DARWIN
 macos_PROGRAMS := libcfs
 
-nodist_libcfs_SOURCES := debug.c module.c tracefile.c nidstrings.c   \
-       darwin/darwin-debug.c darwin/darwin-fs.c darwin/darwin-mem.c \
-       darwin/darwin-module.c darwin/darwin-prim.c                  \
-       darwin/darwin-proc.c darwin/darwin-tracefile.c               \
-       darwin/darwin-utils.c darwin/darwin-sync.c                   \
-       darwin/darwin-curproc.c user-prim.c user-lock.c
+nodist_libcfs_SOURCES := darwin/darwin-sync.c darwin/darwin-mem.c      \
+       darwin/darwin-prim.c darwin/darwin-fs.c darwin/darwin-curproc.c \
+       darwin/darwin-tcpip.c darwin/darwin-utils.c                     \
+       darwin/darwin-debug.c darwin/darwin-proc.c                      \
+       darwin/darwin-tracefile.c darwin/darwin-module.c                \
+       debug.c module.c tracefile.c nidstrings.c   
 
 libcfs_CFLAGS := $(EXTRA_KCFLAGS)
 libcfs_LDFLAGS := $(EXTRA_KLDFLAGS)
@@ -49,4 +49,4 @@ install-data-hook: $(install_data_hook)
 EXTRA_DIST := Info.plist
 
 MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ linux-*.c linux/*.o darwin/*.o libcfs
-DIST_SOURCES := $(libcfs-all-objs:%.o=%.c) tracefile.h
+DIST_SOURCES := $(libcfs-all-objs:%.o=%.c) tracefile.h user-prim.c user-lock.c
index 8e77294..3f2077b 100644 (file)
@@ -8,4 +8,5 @@ EXTRA_DIST := \
        darwin-fs.c \
        darwin-prim.c \
        darwin-tracefile.c \
-       darwin-curproc.c
+       darwin-curproc.c \
+       darwin-tcpip.c
index 4832669..cf8a722 100644 (file)
 #include <libcfs/kp30.h>
 
 /*
- * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h)
+ * Implementation of cfs_curproc API (see lnet/include/libcfs/curproc.h)
  * for XNU kernel.
  */
 
 static inline struct ucred *curproc_ucred(void)
 {
+#ifdef __DARWIN8__
+        return proc_ucred(current_proc());
+#else
         return current_proc()->p_cred->pc_ucred;
+#endif
 }
 
 uid_t  cfs_curproc_uid(void)
@@ -46,17 +50,29 @@ gid_t  cfs_curproc_gid(void)
 
 uid_t  cfs_curproc_fsuid(void)
 {
+#ifdef __DARWIN8__
+        return curproc_ucred()->cr_ruid;
+#else
         return current_proc()->p_cred->p_ruid;
+#endif
 }
 
 gid_t  cfs_curproc_fsgid(void)
 {
+#ifdef __DARWIN8__
+        return curproc_ucred()->cr_rgid;
+#else
         return current_proc()->p_cred->p_rgid;
+#endif
 }
 
 pid_t  cfs_curproc_pid(void)
 {
+#ifdef __DARWIN8__
+        return proc_pid(current_proc());
+#else
         return current_proc()->p_pid;
+#endif
 }
 
 int    cfs_curproc_groups_nr(void)
@@ -94,17 +110,39 @@ void   cfs_curproc_groups_dump(gid_t *array, int size)
 
 mode_t cfs_curproc_umask(void)
 {
+#ifdef __DARWIN8__
+        /*
+         * XXX Liang:
+         *
+         * fd_cmask is not available in kexts, so we just assume
+         * everything is permitted.
+         */
+        return -1;
+#else
         return current_proc()->p_fd->fd_cmask;
+#endif
 }
 
 char  *cfs_curproc_comm(void)
 {
+#ifdef __DARWIN8__
+        /*
+         * Writing to proc->p_comm is not permitted in Darwin8,
+         * because proc_selfname() only return a copy of proc->p_comm,
+         * so this function is not really working.
+         */
+        static char     pcomm[MAXCOMLEN+1];
+
+        proc_selfname(pcomm, MAXCOMLEN+1);
+        return pcomm;
+#else
         return current_proc()->p_comm;
+#endif
 }
 
 cfs_kernel_cap_t cfs_curproc_cap_get(void)
 {
-        return 0;
+        return -1;
 }
 
 void cfs_curproc_cap_set(cfs_kernel_cap_t cap)
index 6fdaa94..8336a21 100644 (file)
@@ -9,17 +9,68 @@ void libcfs_debug_dumpstack(cfs_task_t *tsk)
        return;
 }
 
-cfs_task_t *libcfs_current(void)
-{ 
-       return cfs_current();
+void libcfs_run_lbug_upcall(char *file, const char *fn, const int line)
+{
+}
+
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+        CEMERG("LBUG: pid: %u thread: %#x\n",
+              (unsigned)cfs_curproc_pid(), (unsigned)current_thread());
+        libcfs_debug_dumplog();
+        libcfs_run_lbug_upcall(file, func, line);
+        while (1)
+                cfs_schedule();
+
+       /* panic("lbug_with_loc(%s, %s, %d)", file, func, line) */
 }
 
-int portals_arch_debug_init(unsigned long bufsize)
+#if ENTRY_NESTING_SUPPORT
+
+static inline struct cfs_debug_data *__current_cdd(void)
 {
-       return 0;
+       struct cfs_debug_data *cdd;
+
+       cdd = (struct cfs_debug_data *)current_uthread()->uu_nlminfo;
+       if (cdd != NULL &&
+           cdd->magic1 == CDD_MAGIC1 && cdd->magic2 == CDD_MAGIC2 &&
+           cdd->nesting_level < 1000)
+               return cdd;
+       else
+               return NULL;
 }
 
-int portals_arch_debug_cleanup(void)
+static inline void __current_cdd_set(struct cfs_debug_data *cdd)
 {
-       return 0;
+       current_uthread()->uu_nlminfo = (void *)cdd;
+}
+
+void __entry_nesting(struct cfs_debug_data *child)
+{
+       struct cfs_debug_data *parent;
+
+       parent = __current_cdd();
+       if (parent != NULL) {
+               child->parent        = parent;
+               child->nesting_level = parent->nesting_level + 1;
+       }
+       __current_cdd_set(child);
+}
+
+void __exit_nesting(struct cfs_debug_data *child)
+{
+       __current_cdd_set(child->parent);
+}
+
+unsigned int __current_nesting_level(void)
+{
+       struct cfs_debug_data *cdd;
+
+       cdd = __current_cdd();
+       if (cdd != NULL)
+               return cdd->nesting_level;
+       else
+               return 0;
 }
+/* ENTRY_NESTING_SUPPORT */
+#endif
index 0e2d5bf..45f37df 100644 (file)
@@ -27,7 +27,6 @@
 #include <sys/file.h>
 #include <sys/malloc.h>
 #include <sys/conf.h>
-#include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/uio.h>
 #include <sys/filedesc.h>
  *
  * Public functions
  */
+
+#ifdef __DARWIN8__
+#include <sys/vnode.h>
+
+extern int vn_rdwr(enum uio_rw, vnode_t, caddr_t, int, off_t, enum uio_seg, int, kauth_cred_t, int *, proc_t);
+
+/* vnode_size() is not exported */
+static errno_t
+vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
+{
+        struct vnode_attr       va;
+        int                     error; 
+        
+        VATTR_INIT(&va);
+        VATTR_WANTED(&va, va_data_size);
+        error = vnode_getattr(vp, &va, ctx);
+        if (!error)
+                *sizep = va.va_data_size;
+        return(error);
+}
+
+/*
+ * XXX Liang:
+ *
+ * kern_file_*() are not safe for multi-threads now,
+ * however, we need them only for tracefiled, so it's
+ * not so important to implement for MT.
+ */
+int
+kern_file_size(struct cfs_kern_file *fp, off_t *psize) 
+{
+        int     error;
+        off_t   size;
+
+        error = vnode_size(fp->f_vp, &size, fp->f_ctxt);
+        if (error) 
+                return error;
+
+        if (psize)
+                *psize = size;
+        return 0;
+}
+
+struct cfs_kern_file *
+kern_file_open(const char * filename, int uflags, int mode, int *err)
+{
+        struct cfs_kern_file    *fp;
+        vnode_t         vp;
+        int             error;
+
+        fp = (struct cfs_kern_file *)_MALLOC(sizeof(struct cfs_kern_file), M_TEMP, M_WAITOK);
+        if (fp == NULL) {
+                if (err != NULL)
+                        *err = -ENOMEM;
+                return NULL;
+        }
+        fp->f_flags = FFLAGS(uflags);
+        fp->f_ctxt = vfs_context_create(NULL);
+
+        if ((error = vnode_open(filename, fp->f_flags, 
+                                mode, 0, &vp, fp->f_ctxt))){
+                if (err != NULL)
+                        *err = -error;
+                _FREE(fp, M_TEMP);
+        } else {
+                if (err != NULL)
+                        *err = 0;
+                fp->f_vp = vp;
+        }
+
+        return fp;
+}
+
+int
+kern_file_close(struct cfs_kern_file *fp)
+{
+        vnode_close(fp->f_vp, fp->f_flags, fp->f_ctxt);
+        vfs_context_rele(fp->f_ctxt);
+        _FREE(fp, M_TEMP);
+
+        return 0;
+}
+
+int
+kern_file_read(struct cfs_kern_file *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+        struct proc *p = current_proc();
+        int     resid;
+        int     error;
+
+        assert(buf != NULL);
+        assert(fp != NULL && fp->f_vp != NULL);
+
+        error = vn_rdwr(UIO_READ, fp->f_vp, buf, nbytes, *pos, 
+                        UIO_SYSSPACE32, 0, vfs_context_ucred(fp->f_ctxt), &resid, p);
+        if ((error) || (nbytes == resid)) {
+                if (!error)
+                        error = -EINVAL;
+                return error;
+        }
+        *pos += nbytes - resid;
+
+        return (int)(nbytes - resid);
+}
+
 int
-filp_node_size(struct file *fp, off_t *size)
+kern_file_write(struct cfs_kern_file *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+        struct proc *p = current_proc();
+        int     resid;
+        int     error;
+
+        assert(buf != NULL);
+        assert(fp != NULL && fp->f_vp != NULL);
+
+        error = vn_rdwr(UIO_WRITE, fp->f_vp, buf, nbytes, *pos, 
+                        UIO_SYSSPACE32, 0, vfs_context_ucred(fp->f_ctxt), &resid, p);
+        if ((error) || (nbytes == resid)) {
+                if (!error)
+                        error = -EINVAL;
+                return error;
+        }
+        *pos += nbytes - resid;
+
+        return (int)(nbytes - resid);
+
+}
+
+int
+kern_file_sync (struct cfs_kern_file *fp)
+{
+        return VNOP_FSYNC(fp->f_vp, MNT_WAIT, fp->f_ctxt);
+}
+
+#else  /* !__DARWIN8__ */
+
+int
+kern_file_size(struct file *fp, off_t *size)
 {
         struct vnode *vp = (struct vnode *)fp->f_data;
         struct stat sb;
@@ -60,12 +195,11 @@ filp_node_size(struct file *fp, off_t *size)
 }
 
 cfs_file_t *
-filp_open(const char * filename, int flags, int mode, int *err)
+kern_file_open(const char * filename, int flags, int mode, int *err)
 {
        struct nameidata nd;
-       register cfs_file_t     *fp;
+       cfs_file_t      *fp;
        register struct vnode   *vp;
-       cfs_file_t              *nfp;
        int                     rc;
        extern struct fileops   vnops;
        extern int nfiles;
@@ -73,16 +207,16 @@ filp_open(const char * filename, int flags, int mode, int *err)
 
         CFS_CONE_IN;
        nfiles++;
-       MALLOC_ZONE(nfp, cfs_file_t *, sizeof(cfs_file_t), M_FILE, M_WAITOK|M_ZERO);
-       bzero(nfp, sizeof(cfs_file_t));
-       nfp->f_count = 1;
-       fp = nfp;
+       MALLOC_ZONE(fp, cfs_file_t *, sizeof(cfs_file_t), M_FILE, M_WAITOK|M_ZERO);
+       bzero(fp, sizeof(cfs_file_t));
+       fp->f_count = 1;
+        LIST_CIRCLE(fp, f_list);
        NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, (char *)filename, current_proc());
        if ((rc = vn_open(&nd, flags, mode)) != 0){
                 printf("filp_open failed at (%d)\n", rc);
                 if (err != NULL)
                         *err = rc;
-               ffree(fp);
+                FREE_ZONE(fp, sizeof *fp, M_FILE);
                 CFS_CONE_EX;
                return NULL;
        }
@@ -117,7 +251,7 @@ frele_internal(cfs_file_t *fp)
 }
 
 int
-filp_close (cfs_file_t *fp)
+kern_file_close (cfs_file_t *fp)
 {
        struct vnode    *vp;
         CFS_DECL_CONE_DATA;
@@ -159,21 +293,28 @@ extern void bwillwrite(void);
  * Write buffer to filp inside kernel
  */
 int
-filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
+kern_file_write (cfs_file_t *fp, void *buf, size_t nbyte, loff_t *pos)
 {
        struct uio auio;
        struct iovec aiov;
        struct proc *p = current_proc();
        long cnt, error = 0;
+        int flags = 0;
         CFS_DECL_CONE_DATA;
 
        aiov.iov_base = (void *)(uintptr_t)buf;
        aiov.iov_len = nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
-        if (pos != NULL)
+        if (pos != NULL) {
                auio.uio_offset = *pos;
-        else
+                /* 
+                 * Liang: If we don't set FOF_OFFSET, vn_write()
+                 * will use fp->f_offset as the real offset.
+                 * Same in vn_read()
+                 */
+                flags |= FOF_OFFSET;
+        } else
                 auio.uio_offset = (off_t)-1;
        if (nbyte > INT_MAX)
                return (EINVAL);
@@ -186,7 +327,7 @@ filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
         CFS_CONE_IN;
        if (fp->f_type == DTYPE_VNODE)
                bwillwrite();   /* empty stuff now */
-       if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
+       if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
                if (auio.uio_resid != cnt && (error == ERESTART ||\
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
@@ -200,7 +341,7 @@ filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
        else
                cnt -= auio.uio_resid;
         if (pos != NULL)
-                *pos = auio.uio_offset;
+                *pos += cnt;
        return cnt;
 }
 
@@ -208,21 +349,23 @@ filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
  * Read from filp inside kernel
  */
 int
-filp_read (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
+kern_file_read (cfs_file_t *fp, void *buf, size_t nbyte, loff_t *pos)
 {
        struct uio auio;
        struct iovec aiov;
        struct proc *p = current_proc();
        long cnt, error = 0;
+        int  flags = 0;
         CFS_DECL_CONE_DATA;
 
        aiov.iov_base = (caddr_t)buf;
        aiov.iov_len = nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
-        if (pos != NULL)
+        if (pos != NULL) {
                auio.uio_offset = *pos;
-        else
+                flags |= FOF_OFFSET;
+        } else
                 auio.uio_offset = (off_t)-1;
        if (nbyte > INT_MAX)
                return (EINVAL);
@@ -233,7 +376,7 @@ filp_read (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
 
        cnt = nbyte;
         CFS_CONE_IN;
-       if ((error = fo_read(fp, &auio, fp->f_cred, 0, p)) != 0) {
+       if ((error = fo_read(fp, &auio, fp->f_cred, flags, p)) != 0) {
                if (auio.uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
@@ -244,13 +387,13 @@ filp_read (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
        else
                cnt -= auio.uio_resid;
         if (pos != NULL)
-                *pos = auio.uio_offset;
+                *pos += cnt;
 
        return cnt;
 }
 
 int
-filp_fsync (cfs_file_t *fp)
+kern_file_sync (cfs_file_t *fp)
 {
        struct vnode *vp = (struct vnode *)fp->f_data;
        struct proc *p = current_proc();
@@ -271,60 +414,53 @@ filp_fsync (cfs_file_t *fp)
        return error;
 }
 
-int
-ref_file(cfs_file_t *fp)
+#endif /* !__DARWIN8__ */
+
+cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor)
 {
-        CFS_DECL_CONE_DATA;
+        return makedev(major, minor);
+}
 
-        CFS_CONE_IN;
-        fref(fp);
-        CFS_CONE_EX;
-        return 0;
+cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev)
+{
+        return major(rdev);
 }
 
-int 
-rele_file(cfs_file_t *fp)
+cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev)
 {
-        CFS_DECL_CONE_DATA;
+        return minor(rdev);
+}
 
-        CFS_CONE_IN;
-        frele(fp);
-        CFS_CONE_EX;
-        return 0;
+struct posix_acl *posix_acl_alloc(int count, int flags)
+{
+        static struct posix_acl acl;
+        return &acl;
 }
 
 /*
- * Private functions
+ * XXX Liang: I have not converted all of them;
+ * is more conversion needed?
  */
-void vrele_safe(struct vnode *nd)
-{ 
-        CFS_DECL_CONE_DATA; 
-        
-        CFS_CONE_IN; 
-        vrele(nd); 
-        CFS_CONE_EX;
-}
-
-int
-path_lookup(const char *path, unsigned int flags, struct nameidata *nd)
+int cfs_oflags2univ(int flags) 
 {
-       int ret = 0;
-        CFS_DECL_CONE_DATA;
+        int f;
 
-        CFS_CONE_IN;
-       NDINIT(nd, LOOKUP, FOLLOW, UIO_SYSSPACE, (char *)path, current_proc());
-       if ((ret = namei(nd)) != 0){
-               CERROR("path_lookup fail!\n");
-       }
-        CFS_CONE_EX;
-
-       return ret;
+        f = flags & O_ACCMODE;
+        f |= (flags & O_CREAT) ? CFS_O_CREAT: 0;
+        f |= (flags & O_TRUNC) ? CFS_O_TRUNC: 0;
+        f |= (flags & O_EXCL) ? CFS_O_EXCL: 0;
+        f |= (flags & O_NONBLOCK) ? CFS_O_NONBLOCK: 0;
+        f |= (flags & O_APPEND) ? CFS_O_APPEND: 0;
+        f |= (flags & O_NOFOLLOW) ? CFS_O_NOFOLLOW: 0;
+        f |= (flags & O_SYNC)? CFS_O_SYNC: 0;
+        return f;
 }
 
-int 
-file_count(struct file *fp)
+/*
+ * XXX Liang: we don't need it in OSX.
+ * But it should be implemented anyway.
+ */
+int cfs_univ2oflags(int flags)
 {
-        return fcount(fp);
+        return flags;
 }
-
-
diff --git a/lnet/libcfs/darwin/darwin-internal.h b/lnet/libcfs/darwin/darwin-internal.h
new file mode 100644 (file)
index 0000000..6c83577
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef __LIBCFS_DARWIN_INTERNAL_H__
+#define __LIBCFS_DARWIN_INTERNAL_H__
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+
+int cfs_sysctl_isvalid(void);
+struct sysctl_oid *cfs_alloc_sysctl_node(struct sysctl_oid_list *parent, int nbr, int access,
+                                        const char *name, int (*handler) SYSCTL_HANDLER_ARGS);
+struct sysctl_oid *cfs_alloc_sysctl_int(struct sysctl_oid_list *parent, int n,
+                                       const char *name, int *ptr, int val);
+struct sysctl_oid * cfs_alloc_sysctl_long(struct sysctl_oid_list *parent, int nbr, int access,
+                                         const char *name, int *ptr, int val);
+struct sysctl_oid * cfs_alloc_sysctl_string(struct sysctl_oid_list *parent, int nbr, int access,
+                                           const char *name, char *ptr, int len);
+struct sysctl_oid * cfs_alloc_sysctl_struct(struct sysctl_oid_list *parent, int nbr, int access,
+                                           const char *name, void *ptr, int size);
+
+#endif
index 8f7654f..57452a2 100644 (file)
@@ -2,7 +2,8 @@
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
  * Copyright (C) 2002 Cluster File Systems, Inc.
- * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ *         Nikita Danilov <nikita@clusterfs.com>
  *
  * This file is part of Lustre, http://www.lustre.org.
  *
 
 #include <mach/mach_types.h>
 #include <string.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <sys/file.h>
-#include <sys/conf.h>
-#include <sys/vnode.h>
-#include <sys/uio.h>
-#include <sys/filedesc.h>
-#include <sys/namei.h>
-#include <miscfs/devfs/devfs.h>
-#include <kern/kalloc.h>
-#include <kern/zalloc.h>
-#include <kern/thread.h>
+#include <sys/malloc.h>
 
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
+#include "darwin-internal.h"
 
-/*
- * Definition of struct zone, copied from osfmk/kern/zalloc.h.
- */
-struct zone_hack {
-       int             count;          /* Number of elements used now */
-       vm_offset_t     free_elements;
-       vm_size_t       cur_size;       /* current memory utilization */
-       vm_size_t       max_size;       /* how large can this zone grow */
-       vm_size_t       elem_size;      /* size of an element */
-       vm_size_t       alloc_size;     /* size used for more memory */
-       char            *zone_name;     /* a name for the zone */
-       unsigned int
-       /* boolean_t */ exhaustible :1, /* (F) merely return if empty? */
-       /* boolean_t */ collectable :1, /* (F) garbage collect empty pages */
-       /* boolean_t */ expandable :1,  /* (T) expand zone (with message)? */
-       /* boolean_t */ allows_foreign :1,/* (F) allow non-zalloc space */
-       /* boolean_t */ doing_alloc :1, /* is zone expanding now? */
-       /* boolean_t */ waiting :1,     /* is thread waiting for expansion? */
-       /* boolean_t */ async_pending :1;       /* asynchronous allocation pending? */
-       struct zone_hack *      next_zone;      /* Link for all-zones list */
-       /*
-        * more fields follow, but we don't need them. We only need
-        * offset from the beginning of struct zone to ->next_zone
-        * field: it allows us to scan the list of all zones.
-        */
+#if CFS_INDIVIDUAL_ZONE
+extern zone_t zinit( vm_size_t, vm_size_t, vm_size_t, const char *);
+extern void * zalloc(zone_t zone);
+extern void *zalloc_noblock(zone_t zone);
+extern void zfree(zone_t zone, void *addr);
+
+struct cfs_zone_nob {
+        struct list_head       *z_nob;  /* Pointer to z_link */
+        struct list_head        z_link; /* Do NOT access it directly */       
 };
 
-decl_simple_lock_data(extern, all_zones_lock)
+static struct cfs_zone_nob      cfs_zone_nob;
+static spinlock_t               cfs_zone_guard;
 
-/*
- * returns true iff zone with name @name already exists.
- *
- * XXX nikita: this function is defined in this file only because there is no
- * better place to put it in.
- */
-zone_t cfs_find_zone(const char *name)
+cfs_mem_cache_t *mem_cache_find(const char *name, size_t objsize)
 {
-       struct zone_hack *scan;
+        cfs_mem_cache_t         *walker = NULL;
 
-       /* from osfmk/kern/zalloc.c */
-       extern zone_t first_zone;
+        LASSERT(cfs_zone_nob.z_nob != NULL);
 
-       LASSERT(name != NULL);
+        spin_lock(&cfs_zone_guard);
+        list_for_each_entry(walker, cfs_zone_nob.z_nob, mc_link) {
+                if (!strcmp(walker->mc_name, name) && \
+                    walker->mc_size == objsize)
+                        break;
+        }
+        spin_unlock(&cfs_zone_guard);
 
-       simple_lock(&all_zones_lock);
-       for (scan = (struct zone_hack *)first_zone;
-            scan != NULL; scan = scan->next_zone) {
-               if (!strcmp(scan->zone_name, name))
-                       break;
-       }
-       simple_unlock(&all_zones_lock);
-       return((zone_t)scan);
+        return walker;
 }
 
 /*
@@ -103,59 +71,120 @@ zone_t cfs_find_zone(const char *name)
  * survives kext unloading, so that @name cannot be just static string
  * embedded into kext image.
  */
-zone_t cfs_zinit(vm_size_t size, vm_size_t max, int alloc, const char *name)
+cfs_mem_cache_t *mem_cache_create(vm_size_t objsize, const char *name)
 {
+       cfs_mem_cache_t *mc = NULL;
         char *cname;
 
+       MALLOC(mc, cfs_mem_cache_t *, sizeof(cfs_mem_cache_t), M_TEMP, M_WAITOK|M_ZERO);
+       if (mc == NULL){
+               CERROR("cfs_mem_cache created fail!\n");
+               return NULL;
+       }
+
         cname = _MALLOC(strlen(name) + 1, M_TEMP, M_WAITOK);
         LASSERT(cname != NULL);
-        return zinit(size, max, alloc, strcpy(cname, name));
+        mc->mc_cache = zinit(objsize, (KMEM_MAX_ZONE * objsize), 0, strcpy(cname, name));
+        mc->mc_size = objsize;
+        CFS_INIT_LIST_HEAD(&mc->mc_link);
+        strncpy(mc->mc_name, name, 1 + strlen(name));
+        return mc;
+}
+
+void mem_cache_destroy(cfs_mem_cache_t *mc)
+{
+        /*
+         * zone can NOT be destroyed after creating, 
+         * so just keep it in list.
+         *
+         * We will not lost a zone after we unload
+         * libcfs, it can be found by from libcfs.zone
+         */
+        return;
 }
 
+#define mem_cache_alloc(mc)     zalloc((mc)->mc_cache)
+#ifdef __DARWIN8__
+# define mem_cache_alloc_nb(mc) zalloc((mc)->mc_cache)
+#else
+/* XXX Liang: Tiger doesn't export zalloc_noblock() */
+# define mem_cache_alloc_nb(mc) zalloc_noblock((mc)->mc_cache)
+#endif
+#define mem_cache_free(mc, p)   zfree((mc)->mc_cache, p)
+
+#else  /* !CFS_INDIVIDUAL_ZONE */
+
 cfs_mem_cache_t *
-cfs_mem_cache_create (const char *name, size_t objsize, size_t off, unsigned long arg1,
-               void (*arg2)(void *, cfs_mem_cache_t *, unsigned long),
-               void (*arg3)(void *, cfs_mem_cache_t *, unsigned long))
+mem_cache_find(const char *name, size_t objsize)
 {
-       cfs_mem_cache_t *new = NULL;
+        return NULL;
+}
 
-       MALLOC(new, cfs_mem_cache_t *, objsize, M_TEMP, M_WAITOK|M_ZERO);
-       if (new == NULL){
+cfs_mem_cache_t *mem_cache_create(vm_size_t size, const char *name)
+{
+        cfs_mem_cache_t *mc = NULL;
+
+       MALLOC(mc, cfs_mem_cache_t *, sizeof(cfs_mem_cache_t), M_TEMP, M_WAITOK|M_ZERO);
+       if (mc == NULL){
                CERROR("cfs_mem_cache created fail!\n");
                return NULL;
        }
-       new->size = objsize;
-        CFS_INIT_LIST_HEAD(&new->link);
-        strncpy(new->name, name, 1 + strlen(name));
-        new->zone = cfs_find_zone(name);
-        if (new->zone == NULL) {
-                new->zone = cfs_zinit (objsize, KMEM_MAX_ZONE * objsize, 0, name);
-                if (new->zone == NULL) {
-                        CERROR("zone create fault!\n");
-                        FREE (new, M_TEMP);
-                        return NULL;
-                }
-        }
-       return new;
+        mc->mc_cache = OSMalloc_Tagalloc(name, OSMT_DEFAULT);
+        mc->mc_size = size;
+        return mc;
 }
 
-int
-cfs_mem_cache_destroy (cfs_mem_cache_t *cachep)
+void mem_cache_destroy(cfs_mem_cache_t *mc)
 {
-        FREE (cachep, M_TEMP);
-       return 0;
+        OSMalloc_Tagfree(mc->mc_cache);
+        FREE(mc, M_TEMP);
 }
 
-void *
-cfs_mem_cache_alloc (cfs_mem_cache_t *cachep, int flags)
+#define mem_cache_alloc(mc)     OSMalloc((mc)->mc_size, (mc)->mc_cache)
+#define mem_cache_alloc_nb(mc)  OSMalloc_noblock((mc)->mc_size, (mc)->mc_cache)
+#define mem_cache_free(mc, p)   OSFree(p, (mc)->mc_size, (mc)->mc_cache)
+
+#endif /* !CFS_INDIVIDUAL_ZONE */
+
+cfs_mem_cache_t *
+cfs_mem_cache_create (const char *name,
+                      size_t objsize, size_t off, unsigned long arg1)
+{
+        cfs_mem_cache_t *mc;
+
+        mc = mem_cache_find(name, objsize);
+        if (mc)
+                return mc;
+        mc = mem_cache_create(objsize, name);
+       return mc;
+}
+
+int cfs_mem_cache_destroy (cfs_mem_cache_t *cachep)
 {
-        return (void *)zalloc(cachep->zone);
+        mem_cache_destroy(cachep);
+        return 0;
+}
+
+void *cfs_mem_cache_alloc (cfs_mem_cache_t *cachep, int flags)
+{
+        void *result;
+
+        /* zalloc_canblock() is not exported... Emulate it. */
+        if (flags & CFS_ALLOC_ATOMIC) {
+                result = (void *)mem_cache_alloc_nb(cachep);
+        } else {
+                LASSERT(get_preemption_level() == 0);
+                result = (void *)mem_cache_alloc(cachep);
+        }
+        if (result != NULL && (flags & CFS_ALLOC_ZERO))
+                memset(result, 0, cachep->mc_size);
+
+        return result;
 }
 
-void
-cfs_mem_cache_free (cfs_mem_cache_t *cachep, void *objp)
+void cfs_mem_cache_free (cfs_mem_cache_t *cachep, void *objp)
 {
-        zfree (cachep->zone, (vm_address_t)objp);
+        mem_cache_free(cachep, objp);
 }
 
 /* ---------------------------------------------------------------------------
@@ -167,38 +196,15 @@ cfs_mem_cache_free (cfs_mem_cache_t *cachep, void *objp)
  * "Raw" pages
  */
 
-extern vm_map_t zone_map;
-static inline vm_map_t page_map(struct xnu_raw_page *pg)
-{
-        LASSERT(pg != NULL);
-
-        return pg->order == 0 ? zone_map : kernel_map;
-}
-
-static int raw_page_init(struct xnu_raw_page *pg)
-{
-       vm_size_t size = (1UL << pg->order) * PAGE_SIZE;
-       int upl_flags = UPL_SET_INTERNAL |
-                UPL_SET_LITE | UPL_SET_IO_WIRE | UPL_COPYOUT_FROM;
-        int     kr = 0;
-
-        /* XXX is it necessary? */
-       kr = vm_map_get_upl(page_map(pg),
-                            pg->virtual, &size, &pg->upl, 0, 0, &upl_flags, 0);
-        return kr;
-}
-
-static void raw_page_done(struct xnu_raw_page *pg)
-{
-       ubc_upl_abort(pg->upl, UPL_ABORT_FREE_ON_EMPTY);
-        return;
-}
+static unsigned int raw_pages = 0;
+static cfs_mem_cache_t  *raw_page_cache = NULL;
 
 static struct xnu_page_ops raw_page_ops;
 static struct xnu_page_ops *page_ops[XNU_PAGE_NTYPES] = {
         [XNU_PAGE_RAW] = &raw_page_ops
 };
 
+#if defined(LIBCFS_DEBUG)
 static int page_type_is_valid(cfs_page_t *page)
 {
         LASSERT(page != NULL);
@@ -209,6 +215,7 @@ static int page_is_raw(cfs_page_t *page)
 {
         return page->type == XNU_PAGE_RAW;
 }
+#endif
 
 static struct xnu_raw_page *as_raw(cfs_page_t *page)
 {
@@ -236,120 +243,81 @@ static struct xnu_page_ops raw_page_ops = {
         .page_address   = raw_page_address
 };
 
+extern int get_preemption_level(void);
 
-extern vm_size_t kalloc_max;
-extern vm_size_t kalloc_max_prerounded;
-extern int first_k_zone;
-extern struct zone *k_zone[16];
-extern vm_offset_t zalloc_canblock( register zone_t, boolean_t );
-extern vm_map_t zone_map;
-
-static inline vm_address_t
-page_zone_alloc(int flags, int order)
-{
-       register int zindex;
-       register vm_size_t allocsize;
-       vm_size_t size = (1UL << order) * PAGE_SIZE;
-       vm_address_t    addr;
-       kern_return_t   kr;
-
-       assert(order >= 0);
-       if (size > PAGE_SIZE){
-               /* XXX Liang:
-                * zalloc_canblock() call kernel_memory_allocate to allocate
-                * pages, kernel_memory_allocate cannot guarantee contig pages!
-                * So any request bigger then PAGE_SIZE should not call zalloc()
-                *
-                * NB. kmem_alloc_contig could be very slow!!!! Anyway, I dont
-                * know what will happen if order >= 1 :-(
-                * */
-               CDEBUG(D_MALLOC, "Allocate contig pages!\n");
-               kr = kmem_alloc_contig(kernel_map, &addr, size, 0, 0);
-               if (kr)
-                       return 0;
-               return addr;
-       }
-       allocsize = KALLOC_MINSIZE;
-       zindex = first_k_zone;
-       while (allocsize < size) {
-               allocsize <<= 1;
-               zindex++;
-       }
-       assert(allocsize < kalloc_max);
-       if (flags & M_NOWAIT != 0)
-               addr = zalloc_canblock(k_zone[zindex], FALSE);
-       else
-               addr = zalloc_canblock(k_zone[zindex], TRUE);
-       return addr;
-}
+struct list_head page_death_row;
+spinlock_t page_death_row_phylax;
 
-/* Allocate a "page", actually upl of darwin */
-struct xnu_raw_page *alloc_raw_pages(u_int32_t flags, u_int32_t order)
+static void raw_page_finish(struct xnu_raw_page *pg)
 {
-       kern_return_t   kr;
-       vm_size_t size = (1UL << order) * PAGE_SIZE;
-        u_int32_t mflags = 0;
-       struct xnu_raw_page *pg;
-
-        if (flags & CFS_ALLOC_ATOMIC != 0)
-                mflags |= M_NOWAIT;
-        else
-                mflags |= M_WAITOK;
-        if (flags & CFS_ALLOC_ZERO != 0)
-                mflags |= M_ZERO;
+        -- raw_pages;
+        if (pg->virtual != NULL)
+                cfs_mem_cache_free(pg->virtual, raw_page_cache);
+        cfs_free(pg);
+}
 
-       MALLOC (pg, struct xnu_raw_page *, sizeof *pg, M_TEMP, mflags);
-       if (pg == NULL)
-               return NULL;
-        pg->header.type = XNU_PAGE_RAW;
-        pg->order = order;
-       cfs_set_page_count(&pg->header, 1);
-       pg->virtual = page_zone_alloc(flags, order);
-       if (!pg->virtual)
-                /*
-                 * XXX nikita: Liang, shouldn't pg be freed here?
-                 */
-               return NULL;
+void raw_page_death_row_clean(void)
+{
+        struct xnu_raw_page *pg;
 
-        kr = raw_page_init(pg);
-       if (kr != 0) {
-               size = (1UL << order) * PAGE_SIZE;
-                kmem_free(page_map(pg), pg->virtual, size);
-               return NULL;
-       }
-       return pg;
+        spin_lock(&page_death_row_phylax);
+        while (!list_empty(&page_death_row)) {
+                pg = container_of(page_death_row.next,
+                                  struct xnu_raw_page, link);
+                list_del(&pg->link);
+                spin_unlock(&page_death_row_phylax);
+                raw_page_finish(pg);
+                spin_lock(&page_death_row_phylax);
+        }
+        spin_unlock(&page_death_row_phylax);
 }
 
 /* Free a "page" */
-void free_raw_pages(struct xnu_raw_page *pg, u_int32_t order)
+void free_raw_page(struct xnu_raw_page *pg)
 {
-       vm_size_t size = (1UL << order) * PAGE_SIZE;
-
        if (!atomic_dec_and_test(&pg->count))
                return;
-        raw_page_done(pg);
-        kmem_free(page_map(pg), pg->virtual, size);
-       FREE(pg, M_TEMP);
-}
-
-cfs_page_t *cfs_alloc_pages(u_int32_t flags, u_int32_t order)
-{
-        return &alloc_raw_pages(flags, order)->header;
+        /*
+         * kmem_free()->vm_map_remove()->vm_map_delete()->lock_write() may
+         * block. (raw_page_done()->upl_abort() can block too) On the other
+         * hand, cfs_free_page() may be called in non-blockable context. To
+         * work around this, park pages on global list when cannot block.
+         */
+        if (get_preemption_level() > 0) {
+                spin_lock(&page_death_row_phylax);
+                list_add(&pg->link, &page_death_row);
+                spin_unlock(&page_death_row_phylax);
+        } else {
+                raw_page_finish(pg);
+                raw_page_death_row_clean();
+        }
 }
 
 cfs_page_t *cfs_alloc_page(u_int32_t flags)
 {
-        return cfs_alloc_pages(flags, 0);
-}
-
-void cfs_free_pages(cfs_page_t *pages, int order)
-{
-        free_raw_pages(as_raw(pages), order);
+        struct xnu_raw_page *page;
+
+        /*
+         * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
+
+        page = cfs_alloc(sizeof *page, flags);
+        if (page != NULL) {
+                page->virtual = cfs_mem_cache_alloc(raw_page_cache, flags);
+                if (page->virtual != NULL) {
+                        ++ raw_pages;
+                        page->header.type = XNU_PAGE_RAW;
+                        atomic_set(&page->count, 1);
+                } else
+                        cfs_free(page);
+        }
+        return page != NULL ? &page->header : NULL;
 }
 
-void cfs_free_page(cfs_page_t *page)
+void cfs_free_page(cfs_page_t *pages)
 {
-        cfs_free_pages(page, 0);
+        free_raw_page(as_raw(pages));
 }
 
 void cfs_get_page(cfs_page_t *p)
@@ -378,6 +346,10 @@ void cfs_set_page_count(cfs_page_t *p, int v)
 
 void *cfs_page_address(cfs_page_t *pg)
 {
+        /*
+         * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
         LASSERT(page_type_is_valid(pg));
         return page_ops[pg->type]->page_address(pg);
 }
@@ -425,14 +397,14 @@ void *cfs_alloc(size_t nr_bytes, u_int32_t flags)
         int mflags;
 
         mflags = 0;
-        if (flags & CFS_ALLOC_ATOMIC != 0) {
-                mflags |= 0 /* M_NOWAIT */;
+        if (flags & CFS_ALLOC_ATOMIC) {
+                mflags |= M_NOWAIT;
         } else {
                 LASSERT(get_preemption_level() == 0);
                 mflags |= M_WAITOK;
         }
 
-        if (flags & CFS_ALLOC_ZERO != 0)
+        if (flags & CFS_ALLOC_ZERO)
                 mflags |= M_ZERO;
 
         return _MALLOC(nr_bytes, M_TEMP, mflags);
@@ -451,5 +423,57 @@ void *cfs_alloc_large(size_t nr_bytes)
 
 void  cfs_free_large(void *addr)
 {
+        LASSERT(get_preemption_level() == 0);
         return _FREE(addr, M_TEMP);
 }
+
+/*
+ * Lookup cfs_zone_nob by sysctl.zone, if it cannot be 
+ * found (first load of * libcfs since boot), allocate 
+ * sysctl libcfs.zone.
+ */
+int cfs_mem_cache_init(void)
+{
+#if     CFS_INDIVIDUAL_ZONE
+        int     rc;
+        size_t  len;
+
+        len = sizeof(struct cfs_zone_nob);
+        rc = sysctlbyname("libcfs.zone",
+                          (void *)&cfs_zone_nob, &len, NULL, 0);
+        if (rc == ENOENT) {
+                /* zone_nob is not register in libcfs_sysctl */
+                struct cfs_zone_nob  *nob;
+                struct sysctl_oid       *oid;
+
+                assert(cfs_sysctl_isvalid());
+
+                nob = _MALLOC(sizeof(struct cfs_zone_nob), 
+                              M_TEMP, M_WAITOK | M_ZERO);
+                CFS_INIT_LIST_HEAD(&nob->z_link);
+                nob->z_nob = &nob->z_link;
+                oid = cfs_alloc_sysctl_struct(NULL, OID_AUTO, CTLFLAG_RD | CTLFLAG_KERN, 
+                                              "zone", nob, sizeof(struct cfs_zone_nob));
+                if (oid == NULL) {
+                        _FREE(nob, M_TEMP);
+                        return -ENOMEM;
+                }
+                sysctl_register_oid(oid);
+
+                cfs_zone_nob.z_nob = nob->z_nob;
+        }
+        spin_lock_init(&cfs_zone_guard);
+#endif
+        raw_page_cache = cfs_mem_cache_create("raw-page", CFS_PAGE_SIZE, 0, 0);
+        return 0;
+}
+
+void cfs_mem_cache_fini(void)
+{
+        cfs_mem_cache_destroy(raw_page_cache);
+
+#if     CFS_INDIVIDUAL_ZONE
+        cfs_zone_nob.z_nob = NULL;
+        spin_lock_done(&cfs_zone_guard);
+#endif
+}
index 7405c06..fa953a9 100644 (file)
 
 int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
 {
-        struct libcfs_ioctl_hdr   *hdr;
-        struct libcfs_ioctl_data  *data;
+        struct libcfs_ioctl_hdr *hdr;
+        struct libcfs_ioctl_data *data;
         int err = 0;
         ENTRY;
 
-        hdr = (struct libcfs_ioctl_hdr *)buf; 
+        hdr = (struct libcfs_ioctl_hdr *)buf;
         data = (struct libcfs_ioctl_data *)buf;
-       /* portals_ioctl_data has been copied in by ioctl of osx */
+       /* libcfs_ioctl_data has been copied in by ioctl of osx */
        memcpy(buf, arg, sizeof(struct libcfs_ioctl_data));
 
         if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
-                CERROR("PORTALS: version mismatch kernel vs application\n");
+                CERROR("LIBCFS: version mismatch kernel vs application\n");
                 RETURN(-EINVAL);
         }
 
         if (hdr->ioc_len + buf >= end) {
-                CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+                CERROR("LIBCFS: user buffer exceeds kernel buffer\n");
                 RETURN(-EINVAL);
         }
 
         if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
-                CERROR("PORTALS: user buffer too small for ioctl\n");
+                CERROR("LIBCFS: user buffer too small for ioctl\n");
                 RETURN(-EINVAL);
         }
        buf += size_round(sizeof(*data));
 
-        if (data->ioc_inllen1) { 
-                err = copy_from_user(buf, data->ioc_inlbuf1, size_round(data->ioc_inllen1)); 
+        if (data->ioc_inllen1) {
+                err = copy_from_user(buf, data->ioc_inlbuf1, size_round(data->ioc_inllen1));
                if (err)
                        RETURN(err);
-                data->ioc_inlbuf1 = buf; 
-                buf += size_round(data->ioc_inllen1); 
-        } 
-        
-        if (data->ioc_inllen2) { 
-                copy_from_user(buf, data->ioc_inlbuf2, size_round(data->ioc_inllen2)); 
+                data->ioc_inlbuf1 = buf;
+                buf += size_round(data->ioc_inllen1);
+        }
+
+        if (data->ioc_inllen2) {
+                copy_from_user(buf, data->ioc_inlbuf2, size_round(data->ioc_inllen2));
                if (err)
                        RETURN(err);
-                data->ioc_inlbuf2 = buf; 
-        } 
+                data->ioc_inlbuf2 = buf;
+        }
 
         RETURN(err);
 }
 
 extern struct cfs_psdev_ops            libcfs_psdev_ops;
-struct libcfs_device_userstate         *mdev_state[16];
+struct libcfs_device_userstate         *mdev_state[16];
 
-static int 
+static int
 libcfs_psdev_open(dev_t dev, int flags, int devtype, struct proc *p)
-{ 
+{
        struct  libcfs_device_userstate *mstat = NULL;
        int     rc = 0;
-       int     devid; 
-       devid = minor(dev);    
+       int     devid;
+       devid = minor(dev);
 
        if (devid > 16) return (-ENXIO);
 
@@ -71,17 +71,16 @@ libcfs_psdev_open(dev_t dev, int flags, int devtype, struct proc *p)
                rc = libcfs_psdev_ops.p_open(0, &mstat);
        else
                rc = -EPERM;
-       if (!rc)
-               return rc;
-       mdev_state[devid] = mstat;
+       if (rc == 0)
+               mdev_state[devid] = mstat;
        return rc;
 }
 
-static int 
+static int
 libcfs_psdev_close(dev_t dev, int flags, int mode, struct proc *p)
 {
-       int     devid; 
-       devid = minor(dev);    
+       int     devid;
+       devid = minor(dev);
        int     rc = 0;
 
        if (devid > 16) return (-ENXIO);
@@ -90,70 +89,111 @@ libcfs_psdev_close(dev_t dev, int flags, int mode, struct proc *p)
                rc = libcfs_psdev_ops.p_close(0, mdev_state[devid]);
        else
                rc = -EPERM;
-       if (rc)
-               return rc;
-       mdev_state[devid] = NULL;
+       if (rc == 0)
+               mdev_state[devid] = NULL;
        return rc;
 }
 
-static int 
+static int
 libcfs_ioctl (dev_t dev, u_long cmd, caddr_t arg, int flag, struct proc *p)
-{ 
-       int rc = 0; 
-        struct cfs_psdev_file    pfile; 
-       int     devid; 
-       devid = minor(dev); 
+{
+       int rc = 0;
+        struct cfs_psdev_file    pfile;
+       int     devid;
+       devid = minor(dev);
        
        if (devid > 16) return (-ENXIO);
 
-       if (suser(p->p_ucred, &p->p_acflag)) 
-               return (-EPERM); 
+       if (!is_suser())
+               return (-EPERM);
        
        pfile.off = 0;
        pfile.private_data = mdev_state[devid];
 
-       if (libcfs_psdev_ops.p_ioctl != NULL) 
+       if (libcfs_psdev_ops.p_ioctl != NULL)
                rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
-       else 
+       else
                rc = -EPERM;
        return rc;
 }
 
 static struct cdevsw libcfs_devsw =
-{ 
-       libcfs_psdev_open,            /* open */ 
-       libcfs_psdev_close,           /* close */ 
-       NULL,                   /* read */ 
-       NULL,                   /* write */ 
-       libcfs_ioctl,           /* ioctl */ 
-       NULL,                   /* stop */ 
-       NULL,                   /* reset */ 
-       NULL,                   /* tty's */ 
-       NULL,                   /* select */ 
-       NULL,                   /* mmap */ 
-       NULL,                   /* strategy */ 
-       NULL,                   /* getc */ 
-       NULL,                   /* putc */ 
-       0                       /* type */ 
+{
+       .d_open     = libcfs_psdev_open,
+       .d_close    = libcfs_psdev_close,
+       .d_read     = eno_rdwrt,
+       .d_write    = eno_rdwrt,
+       .d_ioctl    = libcfs_ioctl,
+       .d_stop     = eno_stop,
+       .d_reset    = eno_reset,
+       .d_ttys     = NULL,
+       .d_select   = eno_select,
+       .d_mmap     = eno_mmap,
+       .d_strategy = eno_strat,
+       .d_getc     = eno_getc,
+       .d_putc     = eno_putc,
+       .d_type     = 0
 };
 
-cfs_psdev_t libcfs_dev = { 
-       -1, 
-       NULL, 
-       "portals", 
-       &libcfs_devsw, 
+cfs_psdev_t libcfs_dev = {
+       -1,
+       NULL,
+       "lnet",
+       &libcfs_devsw,
        NULL
 };
 
-void
-libcfs_daemonize (char *str)
+extern void cfs_sync_init(void);
+extern void cfs_sync_fini(void);
+extern int cfs_sysctl_init(void);
+extern void cfs_sysctl_fini(void);
+extern int cfs_mem_cache_init(void);
+extern int cfs_mem_cache_fini(void);
+extern spinlock_t trace_cpu_serializer;
+extern struct list_head page_death_row;
+extern spinlock_t page_death_row_phylax;
+extern void raw_page_death_row_clean(void);
+extern void cfs_thread_agent_init(void);
+extern void cfs_thread_agent_fini(void);
+extern void cfs_symbol_clean(void);
+extern struct rw_semaphore cfs_symbol_lock;
+extern struct list_head cfs_symbol_list;
+
+int libcfs_arch_init(void)
 {
-       printf("Daemonize request: %s.\n", str);
-       return;
+       cfs_sync_init();
+
+       cfs_sysctl_init();
+       cfs_mem_cache_init();
+
+       init_rwsem(&cfs_symbol_lock);
+       CFS_INIT_LIST_HEAD(&cfs_symbol_list);
+
+       cfs_thread_agent_init();
+
+       spin_lock_init(&trace_cpu_serializer);
+
+       CFS_INIT_LIST_HEAD(&page_death_row);
+       spin_lock_init(&page_death_row_phylax);
+       return 0;
 }
 
-void 
-libcfs_blockallsigs(void)
+void libcfs_arch_cleanup(void)
 {
-       return;
+       cfs_symbol_clean();
+
+       spin_lock_done(&trace_cpu_serializer);
+
+       cfs_thread_agent_fini();
+
+       raw_page_death_row_clean();
+       spin_lock_done(&page_death_row_phylax);
+
+       fini_rwsem(&cfs_symbol_lock);
+
+       cfs_mem_cache_fini();
+       cfs_sysctl_fini();
+
+       cfs_sync_fini();
 }
+
index ba5b06f..3501e7c 100644 (file)
 
 #include <mach/mach_types.h>
 #include <string.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
 #include <sys/file.h>
 #include <sys/conf.h>
-#include <sys/vnode.h>
 #include <sys/uio.h>
 #include <sys/filedesc.h>
 #include <sys/namei.h>
 #include <miscfs/devfs/devfs.h>
-#include <kern/kalloc.h>
-#include <kern/zalloc.h>
 #include <kern/thread.h>
 
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
 
-void    *darwin_current_journal_info = NULL;
-int     darwin_current_cap_effective = -1;
-
-/* 
- * cfs pseudo device, actually pseudo char device in darwin 
+/*
+ * cfs pseudo device, actually pseudo char device in darwin
  */
-#define KPORTAL_MAJOR  -1
+#define KLNET_MAJOR  -1
 
 kern_return_t  cfs_psdev_register(cfs_psdev_t *dev) {
-       dev->index = cdevsw_add(KPORTAL_MAJOR, dev->devsw);
+       dev->index = cdevsw_add(KLNET_MAJOR, dev->devsw);
        if (dev->index < 0) {
-               printf("portal_init: failed to allocate a major number!\n");
+               printf("libcfs_init: failed to allocate a major number!\n");
                return KERN_FAILURE;
        }
-       dev->handle = devfs_make_node(makedev (dev->index, 0), 
-                                      DEVFS_CHAR, UID_ROOT, 
+       dev->handle = devfs_make_node(makedev (dev->index, 0),
+                                      DEVFS_CHAR, UID_ROOT,
                                       GID_WHEEL, 0666, (char *)dev->name, 0);
        return KERN_SUCCESS;
 }
@@ -68,11 +60,11 @@ kern_return_t  cfs_psdev_deregister(cfs_psdev_t *dev) {
        return KERN_SUCCESS;
 }
 
-/* 
- * KPortal symbol register / unregister support 
+/*
+ * KPortal symbol register / unregister support
  */
-static struct rw_semaphore cfs_symbol_lock;
-struct list_head           cfs_symbol_list;
+struct rw_semaphore             cfs_symbol_lock;
+struct list_head                cfs_symbol_list;
 
 void *
 cfs_symbol_get(const char *name)
@@ -87,9 +79,9 @@ cfs_symbol_get(const char *name)
                         sym->ref ++;
                         break;
                 }
-        } 
+        }
         up_read(&cfs_symbol_lock);
-        if (sym != NULL) 
+        if (sym != NULL)
                 return sym->value;
         return NULL;
 }
@@ -108,7 +100,7 @@ cfs_symbol_put(const char *name)
                         LASSERT(sym->ref >= 0);
                         break;
                 }
-        } 
+        }
         up_read(&cfs_symbol_lock);
         LASSERT(sym != NULL);
 
@@ -183,74 +175,202 @@ cfs_symbol_clean()
         return;
 }
 
-/* 
- * Register sysctl table
- */
-cfs_sysctl_table_header_t *
-register_cfs_sysctl_table (cfs_sysctl_table_t *table, int arg)
+struct kernel_thread_arg
 {
-       cfs_sysctl_table_t      item;
-       int i = 0;
+       spinlock_t      lock;
+       atomic_t        inuse;
+       cfs_thread_t    func;
+       void            *arg;
+};
 
-       while ((item = table[i++]) != NULL) {
-               sysctl_register_oid(item); 
-       }
-       return table;
-}
+struct kernel_thread_arg cfs_thread_arg;
+
+#define THREAD_ARG_FREE                        0
+#define THREAD_ARG_HOLD                        1
+#define THREAD_ARG_RECV                        2
+
+#define set_targ_stat(a, v)            atomic_set(&(a)->inuse, v)
+#define get_targ_stat(a)               atomic_read(&(a)->inuse)
 
 /*
- * Unregister sysctl table
+ * Hold the thread argument and set the status of thread_status
+ * to THREAD_ARG_HOLD, if the thread argument is held by other
+ * threads (It's THREAD_ARG_HOLD already), current-thread has to wait.
  */
-void
-unregister_cfs_sysctl_table (cfs_sysctl_table_header_t *table) {
-       int i = 0;
-       cfs_sysctl_table_t      item;
+#define thread_arg_hold(pta, _func, _arg)                      \
+       do {                                                    \
+               spin_lock(&(pta)->lock);                        \
+               if (get_targ_stat(pta) == THREAD_ARG_FREE) {    \
+                       set_targ_stat((pta), THREAD_ARG_HOLD);  \
+                       (pta)->arg = (void *)_arg;              \
+                       (pta)->func = _func;                    \
+                       spin_unlock(&(pta)->lock);              \
+                       break;                                  \
+               }                                               \
+               spin_unlock(&(pta)->lock);                      \
+               cfs_schedule();                                 \
+       } while (1)
 
-       while ((item = table[i++]) != NULL) {
-               sysctl_unregister_oid(item); 
-       }
-       return;
-}
+/*
+ * Release the thread argument if the thread argument has been
+ * received by the child-thread (Status of thread_args is
+ * THREAD_ARG_RECV), otherwise current-thread has to wait.
+ * After release, the thread_args' status will be set to
+ * THREAD_ARG_FREE, and others can re-use the thread_args to
+ * create new kernel_thread.
+ */
+#define thread_arg_release(pta)                                        \
+       do {                                                    \
+               spin_lock(&(pta)->lock);                        \
+               if (get_targ_stat(pta) == THREAD_ARG_RECV) {    \
+                       (pta)->arg = NULL;                      \
+                       (pta)->func = NULL;                     \
+                       set_targ_stat(pta, THREAD_ARG_FREE);    \
+                       spin_unlock(&(pta)->lock);              \
+                       break;                                  \
+               }                                               \
+               spin_unlock(&(pta)->lock);                      \
+               cfs_schedule();                                 \
+       } while(1)
 
-struct kernel_thread_arg cfs_thread_arg;
+/*
+ * Receive thread argument (Used in child thread), set the status
+ * of thread_args to THREAD_ARG_RECV.
+ */
+#define __thread_arg_recv_fin(pta, _func, _arg, fin)           \
+       do {                                                    \
+               spin_lock(&(pta)->lock);                        \
+               if (get_targ_stat(pta) == THREAD_ARG_HOLD) {    \
+                       if (fin)                                \
+                           set_targ_stat(pta, THREAD_ARG_RECV);\
+                       _arg = (pta)->arg;                      \
+                       _func = (pta)->func;                    \
+                       spin_unlock(&(pta)->lock);              \
+                       break;                                  \
+               }                                               \
+               spin_unlock(&(pta)->lock);                      \
+               cfs_schedule();                                 \
+       } while (1)
+
+/*
+ * Just set the thread_args' status to THREAD_ARG_RECV
+ */
+#define thread_arg_fin(pta)                                    \
+       do {                                                    \
+               spin_lock(&(pta)->lock);                        \
+               assert( get_targ_stat(pta) == THREAD_ARG_HOLD); \
+               set_targ_stat(pta, THREAD_ARG_RECV);            \
+               spin_unlock(&(pta)->lock);                      \
+       } while(0)
+
+#define thread_arg_recv(pta, f, a)     __thread_arg_recv_fin(pta, f, a, 1)
+#define thread_arg_keep(pta, f, a)     __thread_arg_recv_fin(pta, f, a, 0)
 
 void
-cfs_thread_agent_init()
-{ 
-        set_targ_stat(&cfs_thread_arg, THREAD_ARG_FREE); 
-        spin_lock_init(&cfs_thread_arg.lock);        
-        cfs_thread_arg.arg = NULL;                       
-        cfs_thread_arg.func = NULL;       
+cfs_thread_agent_init(void)
+{
+        set_targ_stat(&cfs_thread_arg, THREAD_ARG_FREE);
+        spin_lock_init(&cfs_thread_arg.lock);
+        cfs_thread_arg.arg = NULL;
+        cfs_thread_arg.func = NULL;
 }
 
 void
-cfs_thread_agent (void) 
+cfs_thread_agent_fini(void)
+{
+        assert(get_targ_stat(&cfs_thread_arg) == THREAD_ARG_FREE);
+
+        spin_lock_done(&cfs_thread_arg.lock);
+}
+
+/*
+ *
+ * All requests to create kernel thread will create a new
+ * thread instance of cfs_thread_agent, one by one.
+ * cfs_thread_agent will call the caller's thread function
+ * with argument supplied by caller.
+ */
+void
+cfs_thread_agent (void)
 {
         cfs_thread_t           func = NULL;
         void                   *arg = NULL;
 
         thread_arg_recv(&cfs_thread_arg, func, arg);
-        printf("entry of thread agent (func: %08lx).\n", (void *)func);
+        /* printf("entry of thread agent (func: %08lx).\n", (void *)func); */
         assert(func != NULL);
         func(arg);
-        printf("thread agent exit. (func: %08lx)\n", (void *)func);
-        (void) thread_terminate(current_act());
+        /* printf("thread agent exit. (func: %08lx)\n", (void *)func); */
+        (void) thread_terminate(current_thread());
 }
 
+extern thread_t kernel_thread(task_t task, void (*start)(void));
+
 int
 cfs_kernel_thread(cfs_thread_t  func, void *arg, int flag)
-{ 
-        int ret = 0;   
-        thread_t th = NULL;  
-                                                
-        thread_arg_hold(&cfs_thread_arg, func, arg); 
-        th = kernel_thread(kernel_task, cfs_thread_agent);  
-        thread_arg_release(&cfs_thread_arg);      
-        if (th == THREAD_NULL) 
-                ret = -1;  
+{
+        int ret = 0;
+        thread_t th = NULL;
+
+        thread_arg_hold(&cfs_thread_arg, func, arg);
+        th = kernel_thread(kernel_task, cfs_thread_agent);
+        thread_arg_release(&cfs_thread_arg);
+        if (th == THREAD_NULL)
+                ret = -1;
         return ret;
 }
 
+void cfs_daemonize(char *str)
+{
+        snprintf(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX, "%s", str);
+        return;
+}
+
+int cfs_signal_pending(void)
+{
+#ifdef __DARWIN8__
+        extern int thread_issignal(proc_t, thread_t, sigset_t);
+        return thread_issignal(current_proc(), current_thread(), (sigset_t)-1);
+#else
+        return SHOULDissignal(current_proc(), current_uthread());
+#endif
+}
+
+/*
+ * XXX Liang: kexts cannot access sigmask in Darwin8.
+ * it's almost impossible for us to get/set signal mask
+ * without patching kernel.
+ * Should we provide these functions in xnu?
+ *
+ * There are several functions/MACRO which are very 
+ * confusing for me:
+ *
+ * proc_pendingsignals()
+ * thread_issignal()
+ * SHOULDissignal()
+ */
+extern int block_procsigmask(struct proc *p,  int bit);
+
+void cfs_block_allsigs()
+{
+#ifdef __DARWIN8__
+#else
+        block_procsigmask(current_proc(), -1);
+#endif
+}
+
+void cfs_block_sigs(sigset_t bit)
+{
+#ifdef __DARWIN8__
+#else
+        block_procsigmask(current_proc(), bit);
+#endif
+}
+
+#ifdef __DARWIN8__
+
+#else /* !__DARWIN8__ */
+
 void lustre_cone_in(boolean_t *state, funnel_t **cone)
 {
         *cone = thread_funnel_get();
@@ -284,7 +404,7 @@ void lustre_net_ex(boolean_t state, funnel_t *cone)
         else if (cone == NULL)
                 (void) thread_funnel_set(network_flock, state);
 }
-
+#endif /* !__DARWIN8__ */
 
 void cfs_waitq_init(struct cfs_waitq *waitq)
 {
@@ -297,7 +417,7 @@ void cfs_waitlink_init(struct cfs_waitlink *link)
 }
 
 void cfs_waitq_add(struct cfs_waitq *waitq, struct cfs_waitlink *link)
-{ 
+{
         link->wl_waitq = waitq;
        ksleep_add(&waitq->wq_ksleep_chan, &link->wl_ksleep_link);
 }
@@ -329,6 +449,10 @@ int cfs_waitq_active(struct cfs_waitq *waitq)
 
 void cfs_waitq_signal(struct cfs_waitq *waitq)
 {
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
        ksleep_wake(&waitq->wq_ksleep_chan);
 }
 
@@ -342,61 +466,90 @@ void cfs_waitq_broadcast(struct cfs_waitq *waitq)
        ksleep_wake_all(&waitq->wq_ksleep_chan);
 }
 
-void cfs_waitq_wait(struct cfs_waitlink *link)
-{ 
-        ksleep_wait(&link->wl_waitq->wq_ksleep_chan);
+void cfs_waitq_wait(struct cfs_waitlink *link, cfs_task_state_t state)
+{
+        ksleep_wait(&link->wl_waitq->wq_ksleep_chan, state);
 }
 
-cfs_duration_t  cfs_waitq_timedwait(struct cfs_waitlink *link, 
+cfs_duration_t  cfs_waitq_timedwait(struct cfs_waitlink *link,
+                                    cfs_task_state_t state,
                                     cfs_duration_t timeout)
-{ 
-        CDEBUG(D_TRACE, "timeout: %llu\n", (long long unsigned)timeout); 
-        return ksleep_timedwait(&link->chan->c, timeout);
+{
+        CDEBUG(D_TRACE, "timeout: %llu\n", (long long unsigned)timeout);
+        return ksleep_timedwait(&link->wl_waitq->wq_ksleep_chan, 
+                                state, timeout);
 }
 
 typedef  void (*ktimer_func_t)(void *);
 void cfs_timer_init(cfs_timer_t *t, void (* func)(unsigned long), void *arg)
-{ 
+{
         ktimer_init(&t->t, (ktimer_func_t)func, arg);
 }
 
 void cfs_timer_done(struct cfs_timer *t)
-{ 
+{
         ktimer_done(&t->t);
 }
 
 void cfs_timer_arm(struct cfs_timer *t, cfs_time_t deadline)
-{ 
+{
         ktimer_arm(&t->t, deadline);
 }
 
 void cfs_timer_disarm(struct cfs_timer *t)
-{ 
+{
         ktimer_disarm(&t->t);
 }
 
 int  cfs_timer_is_armed(struct cfs_timer *t)
-{ 
+{
         return ktimer_is_armed(&t->t);
 }
 
 cfs_time_t cfs_timer_deadline(struct cfs_timer *t)
-{ 
+{
         return ktimer_deadline(&t->t);
 }
 
-int
-libcfs_arch_init(void)
+void cfs_enter_debugger(void)
 {
-       init_rwsem(&cfs_symbol_lock);
-        CFS_INIT_LIST_HEAD(&cfs_symbol_list);
-        cfs_thread_agent_init();
-       return 0;
+#ifdef __DARWIN8__
+        extern void Debugger(const char * reason);
+        Debugger("CFS");
+#else
+        extern void PE_enter_debugger(char *cause);
+        PE_enter_debugger("CFS");
+#endif
 }
 
-void
-libcfs_arch_cleanup(void)
+int cfs_online_cpus(void)
 {
-       cfs_symbol_clean();
-}
+        int     activecpu;
+        size_t  size;
+
+#ifdef __DARWIN8__ 
+        size = sizeof(int);
+        sysctlbyname("hw.activecpu", &activecpu, &size, NULL, 0);
+        return activecpu;
+#else
+        host_basic_info_data_t hinfo;
+        kern_return_t kret;
+        int count = HOST_BASIC_INFO_COUNT;
+#define BSD_HOST 1
+        kret = host_info(BSD_HOST, HOST_BASIC_INFO, &hinfo, &count);
+        if (kret == KERN_SUCCESS) 
+                return (hinfo.avail_cpus);
+        return(-EINVAL);
+#endif
+}
+
+int cfs_ncpus(void)
+{
+        int     ncpu;
+        size_t  size;
 
+        size = sizeof(int);
+
+        sysctlbyname("hw.ncpu", &ncpu, &size, NULL, 0);
+        return ncpu;
+}
index f88b825..b57a0a8 100644 (file)
 #include <mach/mach_types.h>
 
 #define DEBUG_SUBSYSTEM S_LNET
+
 #include <libcfs/libcfs.h>
 
-static cfs_sysctl_table_header_t *portals_table_header = NULL;
+#define LIBCFS_SYSCTL           "libcfs"
+#define LIBCFS_SYSCTL_SPRITE    "sprite"
+#define LIBCFS_SYSCTL_MAGIC     0xbabeface
+
+static struct libcfs_sysctl_sprite {
+        int                     ss_magic;
+        struct sysctl_oid_list  *ss_link;
+} libcfs_sysctl_sprite = { 0, NULL };
+
+static cfs_sysctl_table_header_t *libcfs_table_header = NULL;
 extern unsigned int libcfs_debug;
 extern char debug_file_path[1024];
 extern unsigned int libcfs_subsystem_debug;
@@ -43,46 +53,217 @@ extern long max_debug_mb;
 extern int cfs_trace_daemon SYSCTL_HANDLER_ARGS;
 extern int cfs_debug_mb SYSCTL_HANDLER_ARGS;
 /*
- * sysctl table for portals
+ * sysctl table for lnet
  */
-SYSCTL_NODE (,                 OID_AUTO,       portals,        CTLFLAG_RW,
-            0,                 "portals sysctl top");
 
-SYSCTL_INT(_portals,                   OID_AUTO,       debug,  
+SYSCTL_NODE (,                 OID_AUTO,       lnet,   CTLFLAG_RW,
+            0,                 "lnet sysctl top");
+
+SYSCTL_INT(_lnet,                      OID_AUTO,       debug,  
             CTLTYPE_INT | CTLFLAG_RW ,                 &libcfs_debug,  
             0,         "debug");
-SYSCTL_INT(_portals,                   OID_AUTO,       subsystem_debug,        
+SYSCTL_INT(_lnet,                      OID_AUTO,       subsystem_debug,        
             CTLTYPE_INT | CTLFLAG_RW,                  &libcfs_subsystem_debug,        
             0,         "subsystem debug");
-SYSCTL_INT(_portals,                   OID_AUTO,       printk, 
+SYSCTL_INT(_lnet,                      OID_AUTO,       printk, 
             CTLTYPE_INT | CTLFLAG_RW,                  &libcfs_printk, 
             0,         "printk");
-SYSCTL_STRING(_portals,                        OID_AUTO,       debug_path,     
+SYSCTL_STRING(_lnet,                   OID_AUTO,       debug_path,     
             CTLTYPE_STRING | CTLFLAG_RW,               debug_file_path,        
             1024,      "debug path");
-SYSCTL_INT(_portals,                   OID_AUTO,       memused,        
+SYSCTL_INT(_lnet,                      OID_AUTO,       memused,        
             CTLTYPE_INT | CTLFLAG_RW,                  (int *)&libcfs_kmemory.counter, 
             0,         "memused");
-SYSCTL_PROC(_portals,                  OID_AUTO,       trace_daemon,
+SYSCTL_INT(_lnet,                      OID_AUTO,       catastrophe,    
+            CTLTYPE_INT | CTLFLAG_RW,                  (int *)&libcfs_catastrophe,     
+            0,         "catastrophe");
+SYSCTL_PROC(_lnet,                     OID_AUTO,       trace_daemon,
             CTLTYPE_STRING | CTLFLAG_RW,               0,
             0,         &cfs_trace_daemon,              "A",    "trace daemon");
-SYSCTL_PROC(_portals,                  OID_AUTO,       debug_mb,
+SYSCTL_PROC(_lnet,                     OID_AUTO,       debug_mb,
             CTLTYPE_INT | CTLFLAG_RW,                  &max_debug_mb,
             0,         &cfs_debug_mb,                  "L",    "max debug size");
 
 
 static cfs_sysctl_table_t      top_table[] = {
-       &sysctl__portals,
-       &sysctl__portals_debug,
-       &sysctl__portals_subsystem_debug,
-       &sysctl__portals_printk,
-       &sysctl__portals_debug_path,
-       &sysctl__portals_memused,
-       &sysctl__portals_trace_daemon,
-       &sysctl__portals_debug_mb,
+       &sysctl__lnet,
+       &sysctl__lnet_debug,
+       &sysctl__lnet_subsystem_debug,
+       &sysctl__lnet_printk,
+       &sysctl__lnet_debug_path,
+       &sysctl__lnet_memused,
+       &sysctl__lnet_catastrophe,
+       &sysctl__lnet_trace_daemon,
+       &sysctl__lnet_debug_mb,
        NULL
 };
 
+/*
+ * Register sysctl table
+ */
+cfs_sysctl_table_header_t *
+cfs_register_sysctl_table (cfs_sysctl_table_t *table, int arg)
+{
+        cfs_sysctl_table_t      item;
+        int i = 0;
+
+        while ((item = table[i++]) != NULL) 
+                sysctl_register_oid(item);
+        return table;
+}
+
+/*
+ * Unregister sysctl table
+ */
+void
+cfs_unregister_sysctl_table (cfs_sysctl_table_header_t *table) {
+        int i = 0;
+        cfs_sysctl_table_t      item;
+
+        while ((item = table[i++]) != NULL) 
+                sysctl_unregister_oid(item);
+        return;
+}
+
+/*
+ * Allocate a sysctl oid. 
+ */
+static struct sysctl_oid *
+cfs_alloc_sysctl(struct sysctl_oid_list *parent, int nbr, int access,
+                 const char *name, void *arg1, int arg2, const char *fmt,
+                 int (*handler) SYSCTL_HANDLER_ARGS)
+{
+        struct sysctl_oid *oid;
+        char    *sname = NULL;
+        char    *sfmt = NULL;
+
+        if (strlen(name) + 1 > CTL_MAXNAME) {
+                printf("libcfs: sysctl name: %s is too long.\n", name);
+                return NULL;
+        }
+        oid = (struct sysctl_oid*)_MALLOC(sizeof(struct sysctl_oid), 
+                                          M_TEMP, M_WAITOK | M_ZERO);
+        if (oid == NULL) 
+                return NULL;
+
+        sname = (char *)_MALLOC(CTL_MAXNAME,
+                                M_TEMP, M_WAITOK | M_ZERO);
+        if (sname == NULL) 
+                goto error;
+        strcpy(sname, name);
+
+        sfmt = (char *)_MALLOC(4, M_TEMP, M_WAITOK | M_ZERO);
+        if (sfmt == NULL) 
+                goto error;
+        strcpy(sfmt, fmt);
+
+        if (parent == NULL)
+                oid->oid_parent = &sysctl__children;
+        else
+                oid->oid_parent = parent;
+        oid->oid_number = nbr;
+        oid->oid_kind = access;
+        oid->oid_name = sname;
+        oid->oid_handler = handler;
+        oid->oid_fmt = sfmt;
+
+        if ((access & CTLTYPE_NODE) != 0) {
+                /* It's a sysctl node */
+                struct sysctl_oid_list *link;
+
+                link = (struct sysctl_oid_list *)_MALLOC(sizeof(struct sysctl_oid_list), 
+                                                         M_TEMP, M_WAITOK | M_ZERO);
+                if (link == NULL)
+                        goto error;
+                oid->oid_arg1 = link;
+                oid->oid_arg2 = 0;
+        } else {
+                oid->oid_arg1 = arg1;
+                oid->oid_arg2 = arg2;
+        }
+
+        return oid;
+error:
+        if (sfmt != NULL)
+                _FREE(sfmt, M_TEMP);
+        if (sname != NULL)
+                _FREE(sname, M_TEMP);
+        if (oid != NULL)
+                _FREE(oid, M_TEMP);
+        return NULL;
+}
+
+void cfs_free_sysctl(struct sysctl_oid *oid)
+{
+        if (oid->oid_name != NULL)
+                _FREE((void *)oid->oid_name, M_TEMP);
+        if (oid->oid_fmt != NULL)
+                _FREE((void *)oid->oid_fmt, M_TEMP);
+        if (((oid->oid_kind & CTLTYPE_NODE) != 0) && oid->oid_arg1)
+                /* XXX Liang: need to assert the list is empty */
+                _FREE(oid->oid_arg1, M_TEMP);
+        _FREE(oid, M_TEMP);
+}
+
+#define CFS_SYSCTL_ISVALID ((libcfs_sysctl_sprite.ss_magic == LIBCFS_SYSCTL_MAGIC) && \
+                            (libcfs_sysctl_sprite.ss_link != NULL))       
+
+int
+cfs_sysctl_isvalid(void)
+{
+        return CFS_SYSCTL_ISVALID;
+}
+
+struct sysctl_oid *
+cfs_alloc_sysctl_node(struct sysctl_oid_list *parent, int nbr, int access,
+                      const char *name, int (*handler) SYSCTL_HANDLER_ARGS)
+{
+        if (parent == NULL && CFS_SYSCTL_ISVALID)
+                parent = libcfs_sysctl_sprite.ss_link;
+        return cfs_alloc_sysctl(parent, nbr, CTLTYPE_NODE | access, name,
+                                NULL, 0, "N", handler);
+}
+
+struct sysctl_oid *
+cfs_alloc_sysctl_int(struct sysctl_oid_list *parent, int nbr, int access,
+                     const char *name, int *ptr, int val)
+{
+        if (parent == NULL && CFS_SYSCTL_ISVALID)
+                parent = libcfs_sysctl_sprite.ss_link;
+        return cfs_alloc_sysctl(parent, nbr, CTLTYPE_INT | access, name, 
+                                ptr, val, "I", sysctl_handle_int);
+}
+
+struct sysctl_oid *
+cfs_alloc_sysctl_long(struct sysctl_oid_list *parent, int nbr, int access,
+                      const char *name, int *ptr, int val)
+{
+        if (parent == NULL && CFS_SYSCTL_ISVALID)
+                parent = libcfs_sysctl_sprite.ss_link;
+        return cfs_alloc_sysctl(parent, nbr, CTLTYPE_INT | access, name, 
+                                ptr, val, "L", sysctl_handle_long);
+}
+
+struct sysctl_oid *
+cfs_alloc_sysctl_string(struct sysctl_oid_list *parent, int nbr, int access,
+                        const char *name, char *ptr, int len)
+{
+        if (parent == NULL && CFS_SYSCTL_ISVALID)
+                parent = libcfs_sysctl_sprite.ss_link;
+        return cfs_alloc_sysctl(parent, nbr, CTLTYPE_STRING | access, name, 
+                                ptr, len, "A", sysctl_handle_string);
+}
+
+struct sysctl_oid *
+cfs_alloc_sysctl_struct(struct sysctl_oid_list *parent, int nbr, int access,
+                        const char *name, void *ptr, int size)
+{
+        if (parent == NULL && CFS_SYSCTL_ISVALID)
+                parent = libcfs_sysctl_sprite.ss_link;
+        return cfs_alloc_sysctl(parent, nbr, CTLTYPE_OPAQUE | access, name,
+                                ptr, size, "S", sysctl_handle_opaque);
+}
+
 /* no proc in osx */
 cfs_proc_dir_entry_t *
 cfs_create_proc_entry(char *name, int mod, cfs_proc_dir_entry_t *parent)
@@ -110,8 +291,8 @@ int
 insert_proc(void)
 {
 #if 1
-        if (!portals_table_header) 
-                portals_table_header = register_cfs_sysctl_table(top_table, 0);
+        if (!libcfs_table_header) 
+                libcfs_table_header = cfs_register_sysctl_table(top_table, 0);
 #endif
        return 0;
 }
@@ -120,11 +301,79 @@ void
 remove_proc(void)
 {
 #if 1
-        if (portals_table_header != NULL) 
-                unregister_cfs_sysctl_table(portals_table_header); 
-        portals_table_header = NULL;
+        if (libcfs_table_header != NULL) 
+                cfs_unregister_sysctl_table(libcfs_table_header); 
+        libcfs_table_header = NULL;
 #endif
        return;
 }
 
+int
+cfs_sysctl_init(void)
+{
+        struct sysctl_oid               *oid_root;
+        struct sysctl_oid               *oid_sprite;
+        struct libcfs_sysctl_sprite     *sprite;
+        size_t  len; 
+        int     rc;
+
+        len = sizeof(struct libcfs_sysctl_sprite);
+        rc = sysctlbyname("libcfs.sprite", 
+                          (void *)&libcfs_sysctl_sprite, &len, NULL, 0);
+        if (rc == 0) {
+                /* 
+                 * XXX Liang: assert (rc == 0 || rc == ENOENT)
+                 *
+                 * libcfs.sprite has been registered by previous 
+                 * loading of libcfs 
+                 */
+                if (libcfs_sysctl_sprite.ss_magic != LIBCFS_SYSCTL_MAGIC) {
+                        printf("libcfs: magic number of libcfs.sprite "
+                               "is not right (%lx, %lx)\n", 
+                               libcfs_sysctl_sprite.ss_magic,
+                               LIBCFS_SYSCTL_MAGIC);
+                        return -1;
+                }
+                printf("libcfs: registered libcfs.sprite found.\n");
+                return 0;
+        }
+        oid_root = cfs_alloc_sysctl_node(NULL, OID_AUTO, CTLFLAG_RD | CTLFLAG_KERN,
+                                         LIBCFS_SYSCTL, 0);
+        if (oid_root == NULL)
+                return -1;
+        sysctl_register_oid(oid_root);
+
+        sprite = (struct libcfs_sysctl_sprite *)_MALLOC(sizeof(struct libcfs_sysctl_sprite), 
+                                                        M_TEMP, M_WAITOK | M_ZERO);
+        if (sprite == NULL) {
+                sysctl_unregister_oid(oid_root);
+                cfs_free_sysctl(oid_root);
+                return -1;
+        }
+        sprite->ss_magic = LIBCFS_SYSCTL_MAGIC;
+        sprite->ss_link = (struct sysctl_oid_list *)oid_root->oid_arg1;
+        oid_sprite = cfs_alloc_sysctl_struct((struct sysctl_oid_list *)oid_root->oid_arg1, 
+                                             OID_AUTO, CTLFLAG_RD | CTLFLAG_KERN, 
+                                             LIBCFS_SYSCTL_SPRITE, sprite, 
+                                             sizeof(struct libcfs_sysctl_sprite));
+        if (oid_sprite == NULL) {
+                _FREE(sprite, M_TEMP);
+                sysctl_unregister_oid(oid_root);
+                cfs_free_sysctl(oid_root);
+                return -1;
+        }
+        sysctl_register_oid(oid_sprite);
+
+        libcfs_sysctl_sprite.ss_magic = sprite->ss_magic;
+        libcfs_sysctl_sprite.ss_link = sprite->ss_link;
+
+        return 0;
+}
+
+void
+cfs_sysctl_fini(void)
+{
+        libcfs_sysctl_sprite.ss_magic = 0;
+        libcfs_sysctl_sprite.ss_link = NULL;
+}
 
index ddb17d3..a576d19 100644 (file)
@@ -23,7 +23,7 @@
  *
  * Created by nikita on Sun Jul 18 2004.
  *
- * Prototypes of XNU synchronization primitives.
+ * XNU synchronization primitives.
  */
 
 /*
  * A lot can be optimized here.
  */
 
-#include <mach/mach_types.h>
-#include <sys/types.h>
-#include <kern/simple_lock.h>
-
 #define DEBUG_SUBSYSTEM S_LNET
 
+#ifdef __DARWIN8__
+# include <kern/locks.h>
+#else
+# include <mach/mach_types.h>
+# include <sys/types.h>
+# include <kern/simple_lock.h>
+#endif
+
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
 
@@ -62,14 +66,33 @@ extern int get_preemption_level(void);
 #define get_preemption_level() (0)
 #endif
 
-/*
- * Warning: low level portals debugging code (portals_debug_msg(), for
- * example), uses spin-locks, so debugging output here may lead to nasty
- * surprises.
- */
-
 #if SMP
+#ifdef __DARWIN8__
+
+static lck_grp_t       *cfs_lock_grp = NULL;
+
+/* hw_lock_* are not exported by Darwin8 */
+static inline void xnu_spin_init(xnu_spin_t *s)
+{
+        SLASSERT(cfs_lock_grp != NULL);
+        *s = lck_spin_alloc_init(cfs_lock_grp, LCK_ATTR_NULL);
+}
+
+static inline void xnu_spin_done(xnu_spin_t *s)
+{
+        SLASSERT(cfs_lock_grp != NULL);
+        /* free the lock allocated by xnu_spin_init(); do not reallocate here */
+        lck_spin_free(*s, cfs_lock_grp);
+        *s = NULL;
+}
+
+#define xnu_spin_lock(s)        lck_spin_lock(*(s))
+#define xnu_spin_unlock(s)      lck_spin_unlock(*(s))
+
+#warning "Darwin8 does not export lck_spin_try_lock"
+#define xnu_spin_try(s)         (1)
 
+#else /* DARWIN8 */
 extern void                    hw_lock_init(hw_lock_t);
 extern void                    hw_lock_lock(hw_lock_t);
 extern void                    hw_lock_unlock(hw_lock_t);
@@ -77,10 +100,33 @@ extern unsigned int                hw_lock_to(hw_lock_t, unsigned int);
 extern unsigned int            hw_lock_try(hw_lock_t);
 extern unsigned int            hw_lock_held(hw_lock_t);
 
+#define xnu_spin_init(s)        hw_lock_init(s)
+#define xnu_spin_done(s)        do {} while (0)
+#define xnu_spin_lock(s)        hw_lock_lock(s)
+#define xnu_spin_unlock(s)      hw_lock_unlock(s)
+#define xnu_spin_try(s)         hw_lock_try(s)
+#endif /* DARWIN8 */
+
+#else /* SMP */
+#define xnu_spin_init(s)        do {} while (0)
+#define xnu_spin_done(s)        do {} while (0)
+#define xnu_spin_lock(s)        do {} while (0)
+#define xnu_spin_unlock(s)      do {} while (0)
+#define xnu_spin_try(s)         (1)
+#endif /* SMP */
+
+/*
+ * Warning: low level libcfs debugging code (libcfs_debug_msg(), for
+ * example), uses spin-locks, so debugging output here may lead to nasty
+ * surprises.
+ *
+ * In uniprocessor version of spin-lock. Only checks.
+ */
+
 void kspin_init(struct kspin *spin)
 {
        SLASSERT(spin != NULL);
-       hw_lock_init(&spin->lock);
+       xnu_spin_init(&spin->lock);
        ON_SYNC_DEBUG(spin->magic = KSPIN_MAGIC);
        ON_SYNC_DEBUG(spin->owner = NULL);
 }
@@ -90,26 +136,37 @@ void kspin_done(struct kspin *spin)
        SLASSERT(spin != NULL);
        SLASSERT(spin->magic == KSPIN_MAGIC);
        SLASSERT(spin->owner == NULL);
+        xnu_spin_done(&spin->lock);
 }
 
 void kspin_lock(struct kspin *spin)
 {
        SLASSERT(spin != NULL);
        SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner != current_thread);
+       SLASSERT(spin->owner != current_thread());
+
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
 
-       hw_lock_lock(&spin->lock);
+       xnu_spin_lock(&spin->lock);
        SLASSERT(spin->owner == NULL);
-       ON_SYNC_DEBUG(spin->owner = current_thread);
+       ON_SYNC_DEBUG(spin->owner = current_thread());
 }
 
 void kspin_unlock(struct kspin *spin)
 {
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
        SLASSERT(spin != NULL);
        SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner == current_thread);
+       SLASSERT(spin->owner == current_thread());
        ON_SYNC_DEBUG(spin->owner = NULL);
-       hw_lock_unlock(&spin->lock);
+       xnu_spin_unlock(&spin->lock);
 }
 
 int  kspin_trylock(struct kspin *spin)
@@ -117,84 +174,127 @@ int  kspin_trylock(struct kspin *spin)
        SLASSERT(spin != NULL);
        SLASSERT(spin->magic == KSPIN_MAGIC);
 
-       if (hw_lock_try(&spin->lock)) {
+       if (xnu_spin_try(&spin->lock)) {
                SLASSERT(spin->owner == NULL);
-               ON_SYNC_DEBUG(spin->owner = current_thread);
+               ON_SYNC_DEBUG(spin->owner = current_thread());
                return 1;
        } else
                return 0;
 }
 
-/* SMP */
-#else
-
-/*
- * uniprocessor version of spin-lock. Only checks.
- */
-
-void kspin_init(struct kspin *spin)
+#if XNU_SYNC_DEBUG
+int kspin_islocked(struct kspin *spin)
 {
        SLASSERT(spin != NULL);
-       ON_SYNC_DEBUG(spin->magic = KSPIN_MAGIC);
-       ON_SYNC_DEBUG(spin->owner = NULL);
+       SLASSERT(spin->magic == KSPIN_MAGIC);
+       return spin->owner == current_thread();
 }
 
-void kspin_done(struct kspin *spin)
+int kspin_isnotlocked(struct kspin *spin)
 {
        SLASSERT(spin != NULL);
        SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner == NULL);
+       return spin->owner != current_thread();
 }
+#endif
 
-void kspin_lock(struct kspin *spin)
+/*
+ * read/write spin-lock
+ */
+void krw_spin_init(struct krw_spin *rwspin)
 {
-       SLASSERT(spin != NULL);
-       SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner == NULL);
-       ON_SYNC_DEBUG(spin->owner = current_thread);
+       SLASSERT(rwspin != NULL);
+
+       kspin_init(&rwspin->guard);
+       rwspin->count = 0;
+       ON_SYNC_DEBUG(rwspin->magic = KRW_SPIN_MAGIC);
 }
 
-void kspin_unlock(struct kspin *spin)
+void krw_spin_done(struct krw_spin *rwspin)
 {
-       SLASSERT(spin != NULL);
-       SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner == current_thread);
-       ON_SYNC_DEBUG(spin->owner = NULL);
+       SLASSERT(rwspin != NULL);
+       SLASSERT(rwspin->magic == KRW_SPIN_MAGIC);
+       SLASSERT(rwspin->count == 0);
+       kspin_done(&rwspin->guard);
 }
 
-int kspin_trylock(struct kspin *spin)
+void krw_spin_down_r(struct krw_spin *rwspin)
 {
-       SLASSERT(spin != NULL);
-       SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner == NULL);
-       ON_SYNC_DEBUG(spin->owner = current_thread);
-       return 1;
+       SLASSERT(rwspin != NULL);
+       SLASSERT(rwspin->magic == KRW_SPIN_MAGIC);
+
+        while(1) {
+               kspin_lock(&rwspin->guard);
+                if (rwspin->count >= 0)
+                        break;
+                kspin_unlock(&rwspin->guard);
+        }
+       ++ rwspin->count;
+       kspin_unlock(&rwspin->guard);
 }
 
-/* SMP */
-#endif
+void krw_spin_down_w(struct krw_spin *rwspin)
+{
+       SLASSERT(rwspin != NULL);
+       SLASSERT(rwspin->magic == KRW_SPIN_MAGIC);
 
-#if XNU_SYNC_DEBUG
-int kspin_islocked(struct kspin *spin)
+        while (1) {
+               kspin_lock(&rwspin->guard);
+                if (rwspin->count == 0)
+                        break;
+               kspin_unlock(&rwspin->guard);
+        }
+       rwspin->count = -1;
+       kspin_unlock(&rwspin->guard);
+}
+
+void krw_spin_up_r(struct krw_spin *rwspin)
 {
-       SLASSERT(spin != NULL);
-       SLASSERT(spin->magic == KSPIN_MAGIC);
-       return spin->owner == current_thread;
+       SLASSERT(rwspin != NULL);
+       SLASSERT(rwspin->magic == KRW_SPIN_MAGIC);
+       SLASSERT(rwspin->count > 0);
+
+       kspin_lock(&rwspin->guard);
+       -- rwspin->count;
+       kspin_unlock(&rwspin->guard);
 }
 
-int kspin_isnotlocked(struct kspin *spin)
+void krw_spin_up_w(struct krw_spin *rwspin)
 {
-       SLASSERT(spin != NULL);
-       SLASSERT(spin->magic == KSPIN_MAGIC);
-       return spin->owner != current_thread;
+       SLASSERT(rwspin != NULL);
+       SLASSERT(rwspin->magic == KRW_SPIN_MAGIC);
+       SLASSERT(rwspin->count == -1);
+
+       kspin_lock(&rwspin->guard);
+       rwspin->count = 0;
+       kspin_unlock(&rwspin->guard);
 }
-#endif
 
+/*
+ * semaphore 
+ */
+#ifdef __DARWIN8__
+
+#define xnu_waitq_init(q, a)            do {} while (0)
+#define xnu_waitq_done(q)               do {} while (0)
+#define xnu_waitq_wakeup_one(q, e, s)   ({wakeup_one((void *)(e)); KERN_SUCCESS;})
+#define xnu_waitq_wakeup_all(q, e, s)   ({wakeup((void *)(e)); KERN_SUCCESS;})
+#define xnu_waitq_assert_wait(q, e, s)  assert_wait((e), s)
+
+#else /* DARWIN8 */
+
+#define xnu_waitq_init(q, a)            wait_queue_init((q), a)
+#define xnu_waitq_done(q)               do {} while (0)
+#define xnu_waitq_wakeup_one(q, e, s)   wait_queue_wakeup_one((q), (event_t)(e), s)
+#define xnu_waitq_wakeup_all(q, e, s)   wait_queue_wakeup_all((q), (event_t)(e), s)
+#define xnu_waitq_assert_wait(q, e, s)  wait_queue_assert_wait((q), (event_t)(e), s)
+
+#endif /* DARWIN8 */
 void ksem_init(struct ksem *sem, int value)
 {
        SLASSERT(sem != NULL);
        kspin_init(&sem->guard);
-       wait_queue_init(&sem->q, SYNC_POLICY_FIFO);
+       xnu_waitq_init(&sem->q, SYNC_POLICY_FIFO);
        sem->value = value;
        ON_SYNC_DEBUG(sem->magic = KSEM_MAGIC);
 }
@@ -221,11 +321,11 @@ int ksem_up(struct ksem *sem, int value)
        kspin_lock(&sem->guard);
        sem->value += value;
        if (sem->value == 0)
-               result = wait_queue_wakeup_one(&sem->q, (event_t)sem,
-                                              THREAD_AWAKENED);
+               result = xnu_waitq_wakeup_one(&sem->q, sem,
+                                             THREAD_AWAKENED);
        else
-               result = wait_queue_wakeup_all(&sem->q, (event_t)sem,
-                                              THREAD_AWAKENED);
+               result = xnu_waitq_wakeup_all(&sem->q, sem,
+                                             THREAD_AWAKENED);
        kspin_unlock(&sem->guard);
        SLASSERT(result == KERN_SUCCESS || result == KERN_NOT_WAITING);
        return (result == KERN_SUCCESS) ? 0 : 1;
@@ -242,8 +342,8 @@ void ksem_down(struct ksem *sem, int value)
 
        kspin_lock(&sem->guard);
        while (sem->value < value) {
-               result = wait_queue_assert_wait(&sem->q, (event_t)sem,
-                                               THREAD_UNINT);
+               result = xnu_waitq_assert_wait(&sem->q, sem,
+                                              THREAD_UNINT);
                SLASSERT(result == THREAD_AWAKENED || result == THREAD_WAITING);
                kspin_unlock(&sem->guard);
                if (result == THREAD_WAITING)
@@ -292,18 +392,18 @@ void kmut_lock(struct kmut *mut)
 {
        SLASSERT(mut != NULL);
        SLASSERT(mut->magic == KMUT_MAGIC);
-       SLASSERT(mut->owner != current_thread);
+       SLASSERT(mut->owner != current_thread());
        SLASSERT(get_preemption_level() == 0);
 
        ksem_down(&mut->s, 1);
-       ON_SYNC_DEBUG(mut->owner = current_thread);
+       ON_SYNC_DEBUG(mut->owner = current_thread());
 }
 
 void kmut_unlock(struct kmut *mut)
 {
        SLASSERT(mut != NULL);
        SLASSERT(mut->magic == KMUT_MAGIC);
-       SLASSERT(mut->owner == current_thread);
+       SLASSERT(mut->owner == current_thread());
 
        ON_SYNC_DEBUG(mut->owner = NULL);
        ksem_up(&mut->s, 1);
@@ -321,14 +421,14 @@ int kmut_islocked(struct kmut *mut)
 {
        SLASSERT(mut != NULL);
        SLASSERT(mut->magic == KMUT_MAGIC);
-       return mut->owner == current_thread;
+       return mut->owner == current_thread();
 }
 
 int kmut_isnotlocked(struct kmut *mut)
 {
        SLASSERT(mut != NULL);
        SLASSERT(mut->magic == KMUT_MAGIC);
-       return mut->owner != current_thread;
+       return mut->owner != current_thread();
 }
 #endif
 
@@ -560,7 +660,7 @@ void ksleep_link_init(struct ksleep_link *link)
 
        CFS_INIT_LIST_HEAD(&link->linkage);
        link->flags = 0;
-       link->event = current_thread;
+       link->event = current_thread();
        link->hits  = 0;
        link->forward = NULL;
        ON_SYNC_DEBUG(link->magic = KSLEEP_LINK_MAGIC);
@@ -620,6 +720,11 @@ static void add_hit(struct ksleep_chan *chan, event_t event)
 {
        struct ksleep_link *scan;
 
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
        SLASSERT(kspin_islocked(&chan->guard));
        list_for_each_entry(scan, &chan->waiters, linkage) {
                if (scan->event == event) {
@@ -629,7 +734,7 @@ static void add_hit(struct ksleep_chan *chan, event_t event)
        }
 }
 
-void ksleep_wait(struct ksleep_chan *chan)
+void ksleep_wait(struct ksleep_chan *chan, cfs_task_state_t state)
 {
        event_t event;
        int     result;
@@ -640,10 +745,10 @@ void ksleep_wait(struct ksleep_chan *chan)
        SLASSERT(chan->magic == KSLEEP_CHAN_MAGIC);
        SLASSERT(get_preemption_level() == 0);
 
-       event = current_thread;
+       event = current_thread();
        kspin_lock(&chan->guard);
        if (!has_hits(chan, event)) {
-               result = assert_wait(event, THREAD_UNINT);
+               result = assert_wait(event, state);
                kspin_unlock(&chan->guard);
                SLASSERT(result == THREAD_AWAKENED || result == THREAD_WAITING);
                if (result == THREAD_WAITING)
@@ -653,12 +758,16 @@ void ksleep_wait(struct ksleep_chan *chan)
        EXIT;
 }
 
-int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout)
+/*
+ * Sleep on @chan for no longer than @timeout nano-seconds. Return remaining
+ * sleep time (non-zero only if thread was waken by a signal (not currently
+ * implemented), or waitq was already in the "signalled" state).
+ */
+int64_t ksleep_timedwait(struct ksleep_chan *chan, 
+                         cfs_task_state_t state,
+                         uint64_t timeout)
 {
        event_t event;
-       int64_t     result; 
-        AbsoluteTime clock_current; 
-        AbsoluteTime clock_delay;
 
        ENTRY;
 
@@ -668,20 +777,20 @@ int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout)
 
        CDEBUG(D_TRACE, "timeout: %llu\n", (long long unsigned)timeout);
 
-       event = current_thread;
-       result = 0;
+       event = current_thread();
        kspin_lock(&chan->guard);
        if (!has_hits(chan, event)) {
-               result = assert_wait(event, THREAD_UNINT);
+                int      result;
+                uint64_t expire;
+               result = assert_wait(event, state);
                if (timeout > 0) {
                        /*
                         * arm a timer. thread_set_timer()'s first argument is
                         * uint32_t, so we have to cook deadline ourselves.
                         */
-                       clock_get_uptime(&clock_current);
-                       nanoseconds_to_absolutetime(timeout, &clock_delay);
-                       ADD_ABSOLUTETIME(&clock_current, &clock_delay);
-                       thread_set_timer_deadline(clock_current);
+                       nanoseconds_to_absolutetime(timeout, &expire);
+                        clock_absolutetime_interval_to_deadline(expire, &expire);
+                       thread_set_timer_deadline(expire);
                }
                kspin_unlock(&chan->guard);
                SLASSERT(result == THREAD_AWAKENED || result == THREAD_WAITING);
@@ -689,19 +798,22 @@ int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout)
                        result = thread_block(THREAD_CONTINUE_NULL);
                thread_cancel_timer();
 
-                clock_get_uptime(&clock_delay);
-                SUB_ABSOLUTETIME(&clock_delay, &clock_current);
-                if (result == THREAD_TIMED_OUT)
-                        result = 0;
-                else {
-                        absolutetime_to_nanoseconds(clock_delay, &result);
-                        if (result < 0)
-                                result = 0;
-                }
-       } else
+               if (result == THREAD_TIMED_OUT)
+                        timeout = 0;
+               else {
+                        uint64_t now;
+                        clock_get_uptime(&now);
+                        if (expire > now)
+                               absolutetime_to_nanoseconds(expire - now, &timeout);
+                        else
+                                timeout = 0;
+               }
+       } else  {
+                timeout = 0;
                kspin_unlock(&chan->guard);
+        }
 
-        RETURN(result);
+        RETURN(timeout);
 }
 
 /*
@@ -710,9 +822,11 @@ int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout)
  */
/*
 * Wake a single waiter on @chan.  Do NOT add CDEBUG/ENTRY/EXIT here:
 * the debug path takes spin-locks and would recurse infinitely.
 */
void ksleep_wake(struct ksleep_chan *chan)
{
        ksleep_wake_nr(chan, 1);
}
 
 /*
@@ -734,7 +848,10 @@ void ksleep_wake_nr(struct ksleep_chan *chan, int nr)
        struct ksleep_link *scan;
        int result;
 
-       ENTRY;
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
 
        SLASSERT(chan != NULL);
        SLASSERT(chan->magic == KSLEEP_CHAN_MAGIC);
@@ -747,8 +864,6 @@ void ksleep_wake_nr(struct ksleep_chan *chan, int nr)
                if (forward != NULL)
                        kspin_lock(&forward->guard);
                result = thread_wakeup(scan->event);
-               CDEBUG(D_INFO, "waking 0x%x: %d\n",
-                      (unsigned int)scan->event, result);
                SLASSERT(result == KERN_SUCCESS || result == KERN_NOT_WAITING);
                if (result == KERN_NOT_WAITING) {
                        ++ scan->hits;
@@ -761,7 +876,6 @@ void ksleep_wake_nr(struct ksleep_chan *chan, int nr)
                        break;
        }
        kspin_unlock(&chan->guard);
-       EXIT;
 }
 
 void ktimer_init(struct ktimer *t, void (*func)(void *), void *arg)
@@ -807,6 +921,9 @@ static void ktimer_actor(void *arg0, void *arg1)
                t->func(t->arg);
 }
 
+extern boolean_t thread_call_func_cancel(thread_call_func_t, thread_call_param_t, boolean_t);
+extern void thread_call_func_delayed(thread_call_func_t, thread_call_param_t, uint64_t);
+
 static void ktimer_disarm_locked(struct ktimer *t)
 {
        SLASSERT(t != NULL);
@@ -815,15 +932,29 @@ static void ktimer_disarm_locked(struct ktimer *t)
        thread_call_func_cancel(ktimer_actor, t, FALSE);
 }
 
+/*
+ * Received deadline is nanoseconds, but time checked by 
+ * thread_call is absolute time (The abstime unit is equal to 
+ * the length of one bus cycle, so the duration is dependent 
+ * on the bus speed of the computer), so we need to convert
+ * nanotime to abstime by nanoseconds_to_absolutetime().
+ *
+ * Refer to _delayed_call_timer(...)
+ *
+ * if thread_call_func_delayed is not exported in the future,
+ * we can use timeout() or bsd_timeout() to replace it.
+ */
 void ktimer_arm(struct ktimer *t, u_int64_t deadline)
 {
+        cfs_time_t    abstime;
        SLASSERT(t != NULL);
        SLASSERT(t->magic == KTIMER_MAGIC);
 
        kspin_lock(&t->guard);
        ktimer_disarm_locked(t);
        t->armed = 1;
-       thread_call_func_delayed(ktimer_actor, t, *(AbsoluteTime *)&deadline);
+        nanoseconds_to_absolutetime(deadline, &abstime);
+       thread_call_func_delayed(ktimer_actor, t, deadline);
        kspin_unlock(&t->guard);
 }
 
@@ -857,6 +988,23 @@ u_int64_t ktimer_deadline(struct ktimer *t)
        return t->deadline;
 }
 
/*
 * Module-init hook for the sync code: on Darwin 8 allocate the global
 * lock group (cfs_lock_grp) under which libcfs locks are created.
 */
void cfs_sync_init(void) 
{
#ifdef __DARWIN8__
        /* Initialize lock group */
        cfs_lock_grp = lck_grp_alloc_init("libcfs sync", LCK_GRP_ATTR_NULL);
#endif
}
+
/*
 * Module-fini counterpart of cfs_sync_init(): on Darwin 8 release the
 * global lock group.  All locks must have been destroyed by now.
 */
void cfs_sync_fini(void)
{
#ifdef __DARWIN8__
        /* destroy lock group */
        lck_grp_free(cfs_lock_grp);
        /* XXX Liang: check reference count of lock group */
        cfs_lock_grp = NULL;
#endif
}
 /*
  * Local variables:
  * c-indentation-style: "K&R"
diff --git a/lnet/libcfs/darwin/darwin-tcpip.c b/lnet/libcfs/darwin/darwin-tcpip.c
new file mode 100644 (file)
index 0000000..cafd824
--- /dev/null
@@ -0,0 +1,1335 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ * 
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * 
+ * This file is part of Lustre, http://www.lustre.org.
+ * 
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ * 
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * 
+ * Darwin porting library
+ * Make things easy to port
+ */ 
+
+#include <mach/mach_types.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sockio.h>
+#include <sys/protosw.h>
+#include <net/if.h>
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+static __inline__ struct sockaddr_in
+blank_sin()
+{
+        struct sockaddr_in  blank = { sizeof(struct sockaddr_in), AF_INET };
+        return (blank);
+}
+
+void
+libcfs_ipif_free_enumeration (char **names, int n)
+{
+        int      i;
+
+        LASSERT (n > 0);
+
+        for (i = 0; i < n && names[i] != NULL; i++)
+                LIBCFS_FREE(names[i], IFNAMSIZ);
+                
+        LIBCFS_FREE(names, n * sizeof(*names));
+}
+
+#ifdef __DARWIN8__
+/*
+ * Darwin 8.x 
+ *
+ * No hack kernel structre, all using KPI.
+ */
+
/*
 * Query interface @name via ioctls on a throw-away socket: report in
 * *up whether it is IFF_UP and, if so, its IP address and netmask in
 * *ip/*mask (host byte order).  Returns 0 or a negative errno.
 */
int
libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
{
        struct ifreq    ifr;
        socket_t        so;
        __u32           val;
        int             nob;
        int             rc;

        /* sock_* KPI returns positive errno values; negate for libcfs
         * convention (cf. rc == -EADDRINUSE checks elsewhere) */
        rc = -sock_socket(PF_INET, SOCK_STREAM, 0, 
                          NULL, NULL, &so);
        if (rc != 0) {
                CERROR ("Can't create socket: %d\n", rc);
                return rc;
        }

        nob = strnlen(name, IFNAMSIZ);
        if (nob == IFNAMSIZ) {
                CERROR("Interface name %s too long\n", name);
                rc = -EINVAL;
                goto out;
        }

        CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
        bzero(&ifr, sizeof(ifr));
        strcpy(ifr.ifr_name, name);
        rc = -sock_ioctl (so, SIOCGIFFLAGS, &ifr);

        if (rc != 0) {
                CERROR("Can't get flags for interface %s\n", name);
                goto out;
        }

        if ((ifr.ifr_flags & IFF_UP) == 0) {
                /* interface down: addresses reported as zero */
                CDEBUG(D_NET, "Interface %s down\n", name);
                *up = 0;
                *ip = *mask = 0;
                goto out;
        }

        *up = 1;

        bzero(&ifr, sizeof(ifr));
        strcpy(ifr.ifr_name, name);
        *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin();
        rc = -sock_ioctl(so, SIOCGIFADDR, &ifr);

        if (rc != 0) {
                CERROR("Can't get IP address for interface %s\n", name);
                goto out;
        }

        val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
        *ip = ntohl(val);

        bzero(&ifr, sizeof(ifr));
        strcpy(ifr.ifr_name, name);
        *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin();
        rc = -sock_ioctl(so, SIOCGIFNETMASK, &ifr);

        if (rc != 0) {
                CERROR("Can't get netmask for interface %s\n", name);
                goto out;
        }

        val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
        *mask = ntohl(val);
out:
        sock_close(so);
        return rc;
}
+
/*
 * Enumerate the names of all network interfaces.  On success, set
 * *namesp to a freshly allocated vector and return its length; the
 * caller must release it with libcfs_ipif_free_enumeration().  On
 * failure return a negative errno.
 */
int
libcfs_ipif_enumerate (char ***namesp)
{
        /* Allocate and fill in 'names', returning # interfaces/error */
        char           **names;
        int             toobig;
        int             nalloc;
        int             nfound;
        socket_t        so;
        struct ifreq   *ifr;
        struct ifconf   ifc;
        int             rc;
        int             nob;
        int             i;

        rc = -sock_socket(PF_INET, SOCK_STREAM, 0, 
                          NULL, NULL, &so);
        if (rc != 0) {
                CERROR ("Can't create socket: %d\n", rc);
                return (rc);
        }

        nalloc = 16;    /* first guess at max interfaces */
        toobig = 0;
        for (;;) {
                /* never request more than one page worth of ifreqs */
                if (nalloc * sizeof(*ifr) > CFS_PAGE_SIZE) {
                        toobig = 1;
                        nalloc = CFS_PAGE_SIZE/sizeof(*ifr);
                        CWARN("Too many interfaces: only enumerating first %d\n",
                              nalloc);
                }

                LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
                if (ifr == NULL) {
                        CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
                        rc = -ENOMEM;
                        goto out0;
                }

                ifc.ifc_buf = (char *)ifr;
                ifc.ifc_len = nalloc * sizeof(*ifr);

#if 1
                /*
                 * XXX Liang:
                 * sock_ioctl(..., SIOCGIFCONF, ...) is not usable for calling in kernel,
                 * it always use copyout(...) to copy ifreq to userspace. 
                 * So we can't get interfaces name by sock_ioctl(...,SIOCGIFCONF,...).
                 */
                /* instead, probe "en0".."en15" individually with SIOCGIFFLAGS
                 * and record the names that respond */
                nfound = 0;
                for (i = 0; i < 16; i++) {
                        struct ifreq    en;
                        bzero(&en, sizeof(en));
                        snprintf(en.ifr_name, IFNAMSIZ, "en%d", i);
                        rc = -sock_ioctl (so, SIOCGIFFLAGS, &en);
                        if (rc != 0)
                                continue;
                        strcpy(ifr[nfound++].ifr_name, en.ifr_name);
                }

#else           /* NOT in using now */
                rc = -sock_ioctl(so, SIOCGIFCONF, (caddr_t)&ifc);

                if (rc < 0) {
                        CERROR ("Error %d enumerating interfaces\n", rc);
                        goto out1;
                }

                nfound = ifc.ifc_len/sizeof(*ifr);
                LASSERT (nfound <= nalloc);
#endif

                if (nfound < nalloc || toobig)
                        break;

                /* buffer may have been too small: double and retry */
                LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
                nalloc *= 2;
        }
        if (nfound == 0)
                goto out1;

        LIBCFS_ALLOC(names, nfound * sizeof(*names));
        if (names == NULL) {
                rc = -ENOMEM;
                goto out1;
        }
        /* NULL out all names[i] */
        memset (names, 0, nfound * sizeof(*names));

        for (i = 0; i < nfound; i++) {

                nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
                if (nob == IFNAMSIZ) {
                        /* no space for terminating NULL */
                        CERROR("interface name %.*s too long (%d max)\n",
                               nob, ifr[i].ifr_name, IFNAMSIZ);
                        rc = -ENAMETOOLONG;
                        goto out2;
                }

                LIBCFS_ALLOC(names[i], IFNAMSIZ);
                if (names[i] == NULL) {
                        rc = -ENOMEM;
                        goto out2;
                }

                memcpy(names[i], ifr[i].ifr_name, nob);
                names[i][nob] = 0;
        }

        *namesp = names;
        rc = nfound;

out2:
        /* partial vectors are released here on the error path */
        if (rc < 0)
                libcfs_ipif_free_enumeration(names, nfound);
out1:
        LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
out0:
        sock_close(so);
        return rc;

}
+
+/*
+ * Public entry of socket upcall.
+ *
+ * so_upcall can only be installed while create/accept of socket in 
+ * Darwin 8.0, so we setup libcfs_sock_upcall() as upcall for all 
+ * sockets in creat/accept, it will call upcall provided by user 
+ * which can be setup after create/accept of socket.
+ */
static void libcfs_sock_upcall(socket_t so, void* arg, int waitf)
{
        cfs_socket_t    *sock;

        sock = B2C_SOCK(so);
        /* forward to the user's upcall only after one has been installed
         * via libcfs_sock_set_cb() (CFS_SOCK_UPCALL set) */
        if ((sock->s_flags & CFS_SOCK_UPCALL) != 0 && sock->s_upcall != NULL)
                sock->s_upcall((struct socket *)so, sock->s_upcallarg, waitf);
        return;
}
+
+void libcfs_sock_set_cb(cfs_socket_t *sock, so_upcall callback, void *arg)
+{
+        sock->s_upcall = callback;
+        sock->s_upcallarg = arg;
+        sock->s_flags |= CFS_SOCK_UPCALL;
+        return;
+}
+
+void libcfs_sock_reset_cb(cfs_socket_t *sock)
+{
+        sock->s_flags &= ~CFS_SOCK_UPCALL;
+        sock->s_upcall = NULL;
+        sock->s_upcallarg = NULL;
+        return;
+}
+
+static int
+libcfs_sock_create (cfs_socket_t **sockp, int *fatal,
+                    __u32 local_ip, int local_port)
+{
+        struct sockaddr_in  locaddr;
+        cfs_socket_t    *sock;
+        int             option;
+        int             optlen;
+        int             rc;
+
+        /* All errors are fatal except bind failure if the port is in use */
+        *fatal = 1;
+
+        sock = _MALLOC(sizeof(cfs_socket_t), M_TEMP, M_WAITOK|M_ZERO);
+        if (!sock) {
+                CERROR("Can't allocate cfs_socket.\n");
+                return -ENOMEM;
+        }
+        *sockp = sock;
+
+        rc = -sock_socket(PF_INET, SOCK_STREAM, 0, 
+                          libcfs_sock_upcall, NULL, &C2B_SOCK(sock));
+        if (rc != 0) 
+                goto out;
+        option = 1;
+        optlen = sizeof(option);
+        rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, 
+                              SO_REUSEADDR, &option, optlen);
+        if (rc != 0)
+                goto out;
+
+        /* can't specify a local port without a local IP */
+        LASSERT (local_ip == 0 || local_port != 0);
+
+        if (local_ip != 0 || local_port != 0) {
+                bzero (&locaddr, sizeof (locaddr));
+                locaddr.sin_len = sizeof(struct sockaddr_in);
+                locaddr.sin_family = AF_INET;
+                locaddr.sin_port = htons (local_port);
+                locaddr.sin_addr.s_addr = (local_ip != 0) ? htonl(local_ip) : INADDR_ANY;
+                rc = -sock_bind(C2B_SOCK(sock), (struct sockaddr *)&locaddr);
+                if (rc == -EADDRINUSE) {
+                        CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                        *fatal = 0;
+                        goto out;
+                }
+                if (rc != 0) {
+                        CERROR("Error trying to bind to port %d: %d\n",
+                               local_port, rc);
+                        goto out;
+                }
+        }
+        return 0;
+out:
+        if (C2B_SOCK(sock) != NULL) 
+                sock_close(C2B_SOCK(sock));
+        FREE(sock, M_TEMP);
+        return rc;
+}
+
+int
+libcfs_sock_listen (cfs_socket_t **sockp,
+                   __u32 local_ip, int local_port, int backlog)
+{
+        cfs_socket_t    *sock;
+        int             fatal;
+        int             rc;
+
+        rc = libcfs_sock_create(&sock, &fatal, local_ip, local_port);
+        if (rc != 0)  {
+                if (!fatal)
+                        CERROR("Can't create socket: port %d already in use\n",
+                                local_port);
+                return rc;
+
+        }
+        rc = -sock_listen(C2B_SOCK(sock), backlog);
+        if (rc == 0) {
+                *sockp = sock;
+                return 0;
+        }
+
+        if (C2B_SOCK(sock) != NULL) 
+                sock_close(C2B_SOCK(sock));
+        FREE(sock, M_TEMP);
+        return rc;
+}
+
+int
+libcfs_sock_accept (cfs_socket_t **newsockp, cfs_socket_t *sock)
+{
+        cfs_socket_t   *newsock;
+        int             rc;
+
+        newsock = _MALLOC(sizeof(cfs_socket_t), M_TEMP, M_WAITOK|M_ZERO);
+        if (!newsock) {
+                CERROR("Can't allocate cfs_socket.\n");
+                return -ENOMEM;
+        }
+        /*
+         * thread will sleep in sock_accept by calling of msleep(), 
+         * it can be interrupted because msleep() use PCATCH as argument.
+         */
+        rc = -sock_accept(C2B_SOCK(sock), NULL, 0, 0, 
+                          libcfs_sock_upcall, NULL, &C2B_SOCK(newsock));
+        if (rc) {
+                if (C2B_SOCK(newsock) != NULL) sock_close(C2B_SOCK(newsock));
+                FREE(newsock, M_TEMP);
+                return rc;
+        }
+        *newsockp = newsock;
+        return 0;
+}
+
/*
 * Force a thread blocked in libcfs_sock_accept() to wake up by
 * disconnecting the listening socket.
 */
void
libcfs_sock_abort_accept (cfs_socket_t *sock)
{
        /*
         * XXX Liang: 
         *
         * we want to wakeup thread blocked by sock_accept, but we don't
         * know the address where thread is sleeping on, so we cannot 
         * wakeup it directly.
         * The thread slept in sock_accept will be waken up while:
         * 1. interrupt by signal
         * 2. new connection is coming (sonewconn)
         * 3. disconnecting of the socket (soisconnected)
         * 
         * Cause we can't send signal to a thread directly(no KPI), so the 
         * only thing can be done here is disconnect the socket (by 
         * sock_shutdown() or sth else? ).
         *
         * Shutdown request of socket with SHUT_WR or SHUT_RDWR will
         * be issured to the protocol.
         * sock_shutdown()->tcp_usr_shutdown()->tcp_usrclosed()->
         * tcp_close()->soisdisconnected(), it will wakeup thread by
         * wakeup((caddr_t)&so->so_timeo);
         */
        sock_shutdown(C2B_SOCK(sock), SHUT_RDWR);
}
+
+int
+libcfs_sock_read (cfs_socket_t *sock, void *buffer, int nob, int timeout)
+{
+        size_t          rcvlen;
+        int             rc;
+        cfs_duration_t  to = cfs_time_seconds(timeout);
+        cfs_time_t      then;
+        struct timeval  tv;
+
+        LASSERT(nob > 0);
+
+        for (;;) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct  msghdr  msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &iov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = 0,
+                };
+                cfs_duration_usec(to, &tv);
+                rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_RCVTIMEO,
+                                      &tv, sizeof(tv));
+                if (rc != 0) {
+                        CERROR("Can't set socket recv timeout "
+                                        "%ld.%06d: %d\n",
+                                        (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                        return rc;
+                }
+
+                then = cfs_time_current();
+                rc = -sock_receive(C2B_SOCK(sock), &msg, 0, &rcvlen);
+                to -= cfs_time_current() - then;
+
+                if (rc != 0) {
+                        if (rcvlen != nob && \
+                        (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+                                rc = 0;
+                        if (rc != 0)
+                                return rc;
+                }
+
+                if (rcvlen == nob)
+                        return 0;
+
+                if (to <= 0)
+                        return -EAGAIN;
+
+                buffer = ((char *)buffer) + rcvlen;
+                nob -= rcvlen;
+        }
+        return 0;
+}
+
+int
+libcfs_sock_write (cfs_socket_t *sock, void *buffer, int nob, int timeout)
+{
+        size_t          sndlen;
+        int             rc;
+        cfs_duration_t  to = cfs_time_seconds(timeout);
+        cfs_time_t      then;
+        struct timeval  tv;
+
+        LASSERT(nob > 0);
+
+        for (;;) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct  msghdr  msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &iov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = 0,
+                };
+                cfs_duration_usec(to, &tv);
+                rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_SNDTIMEO,
+                                      &tv, sizeof(tv));
+                if (rc != 0) {
+                        CERROR("Can't set socket send timeout "
+                                        "%ld.%06d: %d\n",
+                                        (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                        return rc;
+                }
+
+                then = cfs_time_current();
+                rc = -sock_send(C2B_SOCK(sock), &msg, 0, &sndlen);
+                to -= cfs_time_current() - then;
+
+                if (rc != 0) {
+                        if (sndlen != nob && \
+                        (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+                                rc = 0;
+                        if (rc != 0)
+                                return rc;
+                }
+
+                if (sndlen == nob)
+                        return 0;
+                if (to <= 0)
+                        return -EAGAIN;
+                buffer = ((char *)buffer) + sndlen;
+                nob -= sndlen;
+        }
+        return 0;
+
+}
+
+int
+libcfs_sock_getaddr (cfs_socket_t *sock, int remote, __u32 *ip, int *port)
+{
+        struct sockaddr_in sin;
+        int                rc;
+
+        if (remote != 0) 
+                /* Get remote address */
+                rc = -sock_getpeername(C2B_SOCK(sock), (struct sockaddr *)&sin, sizeof(sin));
+        else 
+                /* Get local address */
+                rc = -sock_getsockname(C2B_SOCK(sock), (struct sockaddr *)&sin, sizeof(sin));
+        if (rc != 0) {
+                CERROR ("Error %d getting sock %s IP/port\n",
+                         rc, remote ? "peer" : "local");
+                return rc;
+        }
+
+        if (ip != NULL)
+                *ip = ntohl (sin.sin_addr.s_addr);
+
+        if (port != NULL)
+                *port = ntohs (sin.sin_port);
+        return 0;
+}
+
+int
+libcfs_sock_setbuf (cfs_socket_t *sock, int txbufsize, int rxbufsize)
+{
+        int                 option;
+        int                 rc;
+        
+        if (txbufsize != 0) {
+                option = txbufsize;
+                rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_SNDBUF,
+                                     (char *)&option, sizeof (option));
+                if (rc != 0) {
+                        CERROR ("Can't set send buffer %d: %d\n",
+                                option, rc);
+                        return (rc);
+                } 
+        } 
+        
+        if (rxbufsize != 0) {
+                option = rxbufsize;
+                rc = -sock_setsockopt (C2B_SOCK(sock), SOL_SOCKET, SO_RCVBUF,
+                                      (char *)&option, sizeof (option));
+                if (rc != 0) {
+                        CERROR ("Can't set receive buffer %d: %d\n",
+                                option, rc);
+                        return (rc);
+                }
+        }
+        return 0;
+}
+
+int
+libcfs_sock_getbuf (cfs_socket_t *sock, int *txbufsize, int *rxbufsize)
+{
+        int                 option;
+        int                 optlen;
+        int                 rc; 
+        
+        if (txbufsize != NULL) {
+                optlen = sizeof(option);
+                rc = -sock_getsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_SNDBUF,
+                                (char *)&option, &optlen);
+                if (rc != 0) {
+                        CERROR ("Can't get send buffer size: %d\n", rc);
+                        return (rc);
+                }
+                *txbufsize = option;
+        } 
+        
+        if (rxbufsize != NULL) {
+                optlen = sizeof(option);
+                rc = -sock_getsockopt (C2B_SOCK(sock), SOL_SOCKET, SO_RCVBUF,
+                                (char *)&option, &optlen);
+                if (rc != 0) {
+                        CERROR ("Can't get receive buffer size: %d\n", rc);
+                        return (rc);
+                }
+                *rxbufsize = option;
+        }
+        return 0;
+}
+
+void
+libcfs_sock_release (cfs_socket_t *sock)
+{
+        if (C2B_SOCK(sock) != NULL) {
+                sock_shutdown(C2B_SOCK(sock), 2);
+                sock_close(C2B_SOCK(sock));
+        }
+        FREE(sock, M_TEMP);
+}
+
int
libcfs_sock_connect (cfs_socket_t **sockp, int *fatal,
                     __u32 local_ip, int local_port,
                     __u32 peer_ip, int peer_port)
{
        /* Create a socket bound to local_ip/local_port and connect it to
         * peer_ip/peer_port.  On success *sockp holds the new socket; on
         * failure *fatal reports whether retrying could help (address
         * temporarily unavailable is the one non-fatal case). */
        cfs_socket_t       *sock;
        struct sockaddr_in  peeraddr;
        int                 rc;

        rc = libcfs_sock_create(&sock, fatal, local_ip, local_port);
        if (rc != 0)
                return rc;

        bzero(&peeraddr, sizeof(peeraddr));
        peeraddr.sin_len         = sizeof(struct sockaddr_in);
        peeraddr.sin_family      = AF_INET;
        peeraddr.sin_port        = htons(peer_port);
        peeraddr.sin_addr.s_addr = htonl(peer_ip);

        rc = -sock_connect(C2B_SOCK(sock), (struct sockaddr *)&peeraddr, 0);
        if (rc == 0) {
                *sockp = sock;
                return 0;
        }

        *fatal = !(rc == -EADDRNOTAVAIL);
        CDEBUG(*fatal ? D_ERROR : D_NET,
               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
               HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);

        libcfs_sock_release(sock);
        return rc;
}
+
+#else   /* !__DARWIN8__ */
+
+/*
+ * To use bigger buffer for socket:
+ * 1. Increase nmbclusters (cannot be increased by sysctl because it is
+ *    read-only, so we must patch the kernel).
+ * 2. Increase net.inet.tcp.reass.maxsegments
+ * 3. Increase net.inet.tcp.sendspace
+ * 4. Increase net.inet.tcp.recvspace
+ * 5. Increase kern.ipc.maxsockbuf
+ */
+#define KSOCK_MAX_BUF        (1152*1024)
+
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+        struct socket      *so;
+        struct ifreq       ifr;
+        int                nob;
+        int                rc;
+        __u32              val;
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_NET_IN;
+        rc = socreate(PF_INET, &so, SOCK_STREAM, 0);
+        CFS_NET_EX;
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return (-rc);
+        }
+        nob = strnlen(name, IFNAMSIZ);
+        if (nob == IFNAMSIZ) {
+                CERROR("Interface name %s too long\n", name);
+                rc = -EINVAL;
+                goto out;
+        }
+
+        CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+        strcpy(ifr.ifr_name, name);
+        CFS_NET_IN;
+        rc = ifioctl(so, SIOCGIFFLAGS, (caddr_t)&ifr, current_proc());
+        CFS_NET_EX;
+
+        if (rc != 0) {
+                CERROR("Can't get flags for interface %s\n", name);
+                goto out;
+        }
+        if ((ifr.ifr_flags & IFF_UP) == 0) {
+                CDEBUG(D_NET, "Interface %s down\n", name);
+                *up = 0;
+                *ip = *mask = 0;
+                goto out;
+        }
+       
+        *up = 1;
+        strcpy(ifr.ifr_name, name);
+        *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin();
+        CFS_NET_IN;
+        rc = ifioctl(so, SIOCGIFADDR, (caddr_t)&ifr, current_proc());
+        CFS_NET_EX;
+
+        if (rc != 0) {
+                CERROR("Can't get IP address for interface %s\n", name);
+                goto out;
+        }
+
+        val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+        *ip = ntohl(val);
+
+        strcpy(ifr.ifr_name, name);
+        *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin();
+        CFS_NET_IN;
+        rc = ifioctl(so, SIOCGIFNETMASK, (caddr_t)&ifr, current_proc());
+        CFS_NET_EX;
+
+        if (rc != 0) {
+                CERROR("Can't get netmask for interface %s\n", name);
+                goto out;
+        }
+
+        val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+        *mask = ntohl(val);
+out:
+        CFS_NET_IN;
+        soclose(so);
+        CFS_NET_EX;
+        return -rc;
+}
+
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+        /* Allocate and fill in 'names', returning # interfaces/error */
+        char           **names;
+        int             toobig;
+        int             nalloc;
+        int             nfound;
+        struct socket  *so;
+        struct ifreq   *ifr;
+        struct ifconf   ifc;
+        int             rc;
+        int             nob;
+        int             i;
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_NET_IN;
+        rc = socreate(PF_INET, &so, SOCK_STREAM, 0);
+        CFS_NET_EX;
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return (-rc);
+        }
+
+        nalloc = 16;    /* first guess at max interfaces */
+        toobig = 0;
+        for (;;) {
+                if (nalloc * sizeof(*ifr) > PAGE_SIZE) {
+                        toobig = 1;
+                        nalloc = CFS_PAGE_SIZE/sizeof(*ifr);
+                        CWARN("Too many interfaces: only enumerating first %d\n",
+                              nalloc);
+                }
+
+                LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
+                if (ifr == NULL) {
+                        CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+                                rc = -ENOMEM;
+                        goto out0;
+                }
+                                
+                ifc.ifc_buf = (char *)ifr;
+                ifc.ifc_len = nalloc * sizeof(*ifr);
+                                        
+                CFS_NET_IN;
+                rc = -ifioctl(so, SIOCGIFCONF, (caddr_t)&ifc, current_proc());
+                CFS_NET_EX;
+                                
+                if (rc < 0) {
+                        CERROR ("Error %d enumerating interfaces\n", rc);
+                        goto out1;
+                }
+
+                nfound = ifc.ifc_len/sizeof(*ifr);
+                LASSERT (nfound <= nalloc);
+
+                if (nfound < nalloc || toobig)
+                        break;
+
+                LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+                nalloc *= 2;
+        }
+        if (nfound == 0)
+                goto out1;
+
+        LIBCFS_ALLOC(names, nfound * sizeof(*names));
+        if (names == NULL) {
+                rc = -ENOMEM;
+                goto out1;
+        }
+        /* NULL out all names[i] */
+        memset (names, 0, nfound * sizeof(*names));
+
+        for (i = 0; i < nfound; i++) {
+
+                nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+                if (nob == IFNAMSIZ) {
+                        /* no space for terminating NULL */
+                        CERROR("interface name %.*s too long (%d max)\n",
+                               nob, ifr[i].ifr_name, IFNAMSIZ);
+                        rc = -ENAMETOOLONG;
+                        goto out2;
+                }
+
+                LIBCFS_ALLOC(names[i], IFNAMSIZ);
+                if (names[i] == NULL) {
+                        rc = -ENOMEM;
+                        goto out2;
+                }
+
+                memcpy(names[i], ifr[i].ifr_name, nob);
+                names[i][nob] = 0;
+        }
+
+        *namesp = names;
+        rc = nfound;
+
+out2:
+        if (rc < 0)
+                libcfs_ipif_free_enumeration(names, nfound);
+out1:
+        LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+out0:
+        CFS_NET_IN;
+        soclose(so);
+        CFS_NET_EX;
+        return rc;
+}
+
+static int
+libcfs_sock_create (struct socket **sockp, int *fatal,
+                    __u32 local_ip, int local_port)
+{
+        struct sockaddr_in  locaddr;
+        struct socket      *so;
+        struct sockopt      sopt;
+        int                 option;
+        int                 rc;
+        CFS_DECL_FUNNEL_DATA;
+
+        *fatal = 1;
+        CFS_NET_IN;
+        rc = socreate(PF_INET, &so, SOCK_STREAM, 0);
+        CFS_NET_EX;
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return (-rc);
+        }
+        
+        bzero(&sopt, sizeof sopt);
+        option = 1;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_REUSEADDR;
+        sopt.sopt_val = &option;
+        sopt.sopt_valsize = sizeof(option);
+        CFS_NET_IN;
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CFS_NET_EX;
+                CERROR ("Can't set sock reuse address: %d\n", rc);
+                goto out;
+        }
+        /* can't specify a local port without a local IP */
+        LASSERT (local_ip == 0 || local_port != 0);
+
+        if (local_ip != 0 || local_port != 0) {
+                bzero (&locaddr, sizeof (locaddr));
+                locaddr.sin_len = sizeof(struct sockaddr_in);
+                locaddr.sin_family = AF_INET;
+                locaddr.sin_port = htons (local_port);
+                locaddr.sin_addr.s_addr = (local_ip != 0) ? htonl(local_ip) :
+                                                            INADDR_ANY;
+
+                rc = sobind(so, (struct sockaddr *)&locaddr);
+                if (rc == EADDRINUSE) {
+                        CFS_NET_EX;
+                        CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                        *fatal = 0;
+                        goto out;
+                }
+                if (rc != 0) {
+                        CFS_NET_EX;
+                        CERROR ("Can't bind to local IP Address %u.%u.%u.%u: %d\n",
+                        HIPQUAD(local_ip), rc);
+                        goto out;
+                }
+        }
+        *sockp = so;
+        return 0;
+out:
+        CFS_NET_IN;
+        soclose(so);
+        CFS_NET_EX;
+        return -rc;
+}
+
+int
+libcfs_sock_listen (struct socket **sockp,
+                    __u32 local_ip, int local_port, int backlog)
+{
+        int      fatal;
+        int      rc;
+        CFS_DECL_FUNNEL_DATA;
+
+        rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port);
+        if (rc != 0) {
+                if (!fatal)
+                CERROR("Can't create socket: port %d already in use\n",
+                       local_port);
+                return rc;
+        }
+        CFS_NET_IN;
+        rc = solisten(*sockp, backlog);
+        CFS_NET_EX;
+        if (rc == 0)
+                return 0;
+        CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
+        CFS_NET_IN;
+        soclose(*sockp);
+        CFS_NET_EX;
+        return -rc;
+}
+
+int
+libcfs_sock_accept (struct socket **newsockp, struct socket *sock)
+{
+        struct socket *so;
+        struct sockaddr *sa;
+        int error, s;
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_NET_IN;
+        s = splnet();
+        if ((sock->so_options & SO_ACCEPTCONN) == 0) {
+                splx(s);
+                CFS_NET_EX;
+                return (-EINVAL);
+        }
+
+        if ((sock->so_state & SS_NBIO) && sock->so_comp.tqh_first == NULL) {
+                splx(s);
+                CFS_NET_EX;
+                return (-EWOULDBLOCK);
+        }
+
+        error = 0;
+        while (TAILQ_EMPTY(&sock->so_comp) && sock->so_error == 0) {
+                if (sock->so_state & SS_CANTRCVMORE) {
+                        sock->so_error = ECONNABORTED;
+                        break;
+                }
+                error = tsleep((caddr_t)&sock->so_timeo, PSOCK | PCATCH,
+                                "accept", 0);
+                if (error) {
+                        splx(s);
+                        CFS_NET_EX;
+                        return (-error);
+                }
+        }
+        if (sock->so_error) {
+                error = sock->so_error;
+                sock->so_error = 0;
+                splx(s);
+                CFS_NET_EX;
+                return (-error);
+        }
+
+        /*
+         * At this point we know that there is at least one connection
+         * ready to be accepted. Remove it from the queue prior to
+         * allocating the file descriptor for it since falloc() may
+         * block allowing another process to accept the connection
+         * instead.
+         */
+        so = TAILQ_FIRST(&sock->so_comp);
+        TAILQ_REMOVE(&sock->so_comp, so, so_list);
+        sock->so_qlen--;
+
+        so->so_state &= ~SS_COMP;
+        so->so_head = NULL;
+        sa = 0;
+        (void) soaccept(so, &sa);
+
+        *newsockp = so;
+        FREE(sa, M_SONAME);
+        splx(s);
+        CFS_NET_EX;
+        return (-error);
+}
+
+void
+libcfs_sock_abort_accept (struct socket *sock)
+{
+        wakeup(&sock->so_timeo);
+}
+
+/*
+ * XXX Liang: timeout for write is not supported yet.
+ */
+int
+libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
+{
+        int            rc;
+        CFS_DECL_NET_DATA;
+
+        while (nob > 0) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct  uio suio = {
+                        .uio_iov        = &iov,
+                        .uio_iovcnt     = 1,
+                        .uio_offset     = 0,
+                        .uio_resid      = nob,
+                        .uio_segflg     = UIO_SYSSPACE,
+                        .uio_rw         = UIO_WRITE,
+                        .uio_procp      = NULL
+                };
+                                
+                CFS_NET_IN;
+                rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, 0);
+                CFS_NET_EX;
+                                
+                if (rc != 0) {
+                        if ( suio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\
+                             rc == EWOULDBLOCK))
+                        rc = 0;
+                        if ( rc != 0 )
+                                return -rc;
+                        rc = nob - suio.uio_resid;
+                        buffer = ((char *)buffer) + rc;
+                        nob = suio.uio_resid;
+                        continue;
+                }
+                break;
+        }
+        return (0);
+}
+
+/*
+ * XXX Liang: timeout for read is not supported yet.
+ */
+int
+libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
+{
+        int            rc;
+        CFS_DECL_NET_DATA;
+
+        while (nob > 0) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct uio  ruio = {
+                        .uio_iov        = &iov,
+                        .uio_iovcnt     = 1,
+                        .uio_offset     = 0,
+                        .uio_resid      = nob,
+                        .uio_segflg     = UIO_SYSSPACE,
+                        .uio_rw         = UIO_READ,
+                        .uio_procp      = NULL
+                };
+                
+                CFS_NET_IN;
+                rc = soreceive(sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, (int *)0);
+                CFS_NET_EX;
+                
+                if (rc != 0) {
+                        if ( ruio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\
+                                rc == EWOULDBLOCK))
+                                rc = 0;
+                        if (rc != 0)
+                                return -rc;
+                        rc = nob - ruio.uio_resid;
+                        buffer = ((char *)buffer) + rc;
+                        nob = ruio.uio_resid;
+                        continue;
+                }
+                break;
+        }
+        return (0);
+}
+
+int
+libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
+{
+        struct sockopt  sopt;
+        int             rc = 0;
+        int             option;
+        CFS_DECL_NET_DATA;
+
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_SET;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_val = &option;
+        sopt.sopt_valsize = sizeof(option);
+
+        if (txbufsize != 0) {
+                option = txbufsize;
+                if (option > KSOCK_MAX_BUF)
+                        option = KSOCK_MAX_BUF;
+        
+                sopt.sopt_name = SO_SNDBUF;
+                CFS_NET_IN;
+                rc = sosetopt(sock, &sopt);
+                CFS_NET_EX;
+                if (rc != 0) {
+                        CERROR ("Can't set send buffer %d: %d\n",
+                                option, rc);
+                        
+                        return -rc;
+                }
+        }
+                
+        if (rxbufsize != 0) {
+                option = rxbufsize;
+                sopt.sopt_name = SO_RCVBUF;
+                CFS_NET_IN;
+                rc = sosetopt(sock, &sopt);
+                CFS_NET_EX;
+                if (rc != 0) {
+                        CERROR ("Can't set receive buffer %d: %d\n",
+                                option, rc);
+                        return -rc;
+                }
+        }
+        return 0;
+}
+
+int
+libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port)
+{
+        struct sockaddr_in *sin;
+        struct sockaddr    *sa = NULL;
+        int                rc;
+        CFS_DECL_NET_DATA;
+
+        if (remote != 0) {
+                CFS_NET_IN;
+                rc = sock->so_proto->pr_usrreqs->pru_peeraddr(sock, &sa);
+                CFS_NET_EX;
+
+                if (rc != 0) {
+                        if (sa) FREE(sa, M_SONAME);
+                        CERROR ("Error %d getting sock peer IP\n", rc);
+                        return -rc;
+                }
+        } else {
+                CFS_NET_IN;
+                rc = sock->so_proto->pr_usrreqs->pru_sockaddr(sock, &sa);
+                CFS_NET_EX;
+                if (rc != 0) {
+                        if (sa) FREE(sa, M_SONAME);
+                        CERROR ("Error %d getting sock local IP\n", rc);
+                        return -rc;
+                }
+        }
+        if (sa != NULL) {
+                sin = (struct sockaddr_in *)sa;
+                if (ip != NULL)
+                        *ip = ntohl (sin->sin_addr.s_addr);
+                if (port != NULL)
+                        *port = ntohs (sin->sin_port);
+                if (sa) 
+                        FREE(sa, M_SONAME);
+        }
+        return 0;
+}
+
+int
+libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
+{
+        struct sockopt  sopt;
+        int rc;
+        CFS_DECL_NET_DATA;
+
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_GET;
+        sopt.sopt_level = SOL_SOCKET;
+
+        if (txbufsize != NULL) {
+                sopt.sopt_val = txbufsize;
+                sopt.sopt_valsize = sizeof(*txbufsize);
+                sopt.sopt_name = SO_SNDBUF;
+                CFS_NET_IN;
+                rc = sogetopt(sock, &sopt);
+                CFS_NET_EX;
+                if (rc != 0) {
+                        CERROR ("Can't get send buffer size: %d\n", rc);
+                        return -rc;
+                }
+        }
+
+        if (rxbufsize != NULL) {
+                sopt.sopt_val = rxbufsize;
+                sopt.sopt_valsize = sizeof(*rxbufsize);
+                sopt.sopt_name = SO_RCVBUF;
+                CFS_NET_IN;
+                rc = sogetopt(sock, &sopt);
+                CFS_NET_EX;
+                if (rc != 0) {
+                        CERROR ("Can't get receive buffer size: %d\n", rc);
+                        return -rc;
+                }
+        }
+        return 0;
+}
+
+int
+libcfs_sock_connect (struct socket **sockp, int *fatal,
+                     __u32 local_ip, int local_port,
+                     __u32 peer_ip, int peer_port)
+{
+        struct sockaddr_in  srvaddr;
+        struct socket      *so;
+        int                 s;
+        int                 rc; 
+        CFS_DECL_FUNNEL_DATA;
+        
+        rc = libcfs_sock_create(sockp, fatal, local_ip, local_port);
+        if (rc != 0)
+                return rc;
+        so = *sockp;
+        bzero(&srvaddr, sizeof(srvaddr));
+        srvaddr.sin_len = sizeof(struct sockaddr_in);
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons (peer_port);
+        srvaddr.sin_addr.s_addr = htonl (peer_ip);
+
+        CFS_NET_IN;
+        rc = soconnect(so, (struct sockaddr *)&srvaddr);
+        if (rc != 0) {
+                CFS_NET_EX;
+                if (rc != EADDRNOTAVAIL && rc != EADDRINUSE)
+                        CDEBUG(*fatal ? D_ERROR : D_NET,
+                               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+                               HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+                goto out;
+        }
+        s = splnet();
+        while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
+                CDEBUG(D_NET, "ksocknal sleep for waiting auto_connect.\n");
+                (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz);
+        }
+        if ((rc = so->so_error) != 0) {
+                so->so_error = 0;
+                splx(s);
+                CFS_NET_EX;
+                CDEBUG(*fatal ? D_ERROR : D_NET,
+                       "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+                       HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+                goto out;
+        }
+        LASSERT(so->so_state & SS_ISCONNECTED);
+        splx(s);
+        CFS_NET_EX;
+        if (sockp)
+                *sockp = so;
+        return (0);
+out:
+        CFS_NET_IN;
+        soshutdown(so, 2);
+        soclose(so);
+        CFS_NET_EX;
+        return (-rc);
+}
+
+void
+libcfs_sock_release (struct socket *sock)
+{
+        CFS_DECL_FUNNEL_DATA;
+        CFS_NET_IN;
+        soshutdown(sock, 0);
+        CFS_NET_EX;
+}
+
+#endif
index 0c6253e..c56d253 100644 (file)
 extern union trace_data_union trace_data[NR_CPUS];
 extern char *tracefile;
 extern long long tracefile_size;
-extern struct rw_semaphore tracefile_sem;
 extern int trace_start_thread(void);
 extern void trace_stop_thread(void);
 
 long max_debug_mb = M_TCD_MAX_PAGES;
 static long max_permit_mb = (64 * 1024);
 
-inline struct trace_cpu_data *
-__trace_get_tcd (unsigned long *flags)
+spinlock_t trace_cpu_serializer;
+
+/*
+ * thread currently executing tracefile code or NULL if none does. Used to
+ * detect recursive calls to libcfs_debug_msg().
+ */
+static thread_t trace_owner = NULL;
+
+extern int get_preemption_level(void);
+extern atomic_t tage_allocated;
+
+struct rw_semaphore tracefile_sem;
+
+void tracefile_lock_init() {
+    init_rwsem(&tracefile_sem);
+}
+
+void tracefile_read_lock() {
+    down_read(&tracefile_sem);
+}
+
+void tracefile_read_unlock() {
+    up_read(&tracefile_sem);
+}
+
+void tracefile_write_lock() {
+    down_write(&tracefile_sem);
+}
+
+void tracefile_write_unlock() {
+    up_write(&tracefile_sem);
+}
+
+inline struct trace_cpu_data *__trace_get_tcd(unsigned long *flags)
 {
-       return &trace_data[0].tcd;
+       struct trace_cpu_data *tcd;
+       int nr_pages;
+       struct list_head pages;
+
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
+       /*
+        * debugging check for recursive call to libcfs_debug_msg()
+        */
+       if (trace_owner == current_thread()) {
+                /*
+                 * Cannot assert here.
+                 */
+               printk(KERN_EMERG "recursive call to %s", __FUNCTION__);
+               /*
+                 * "The death of God left the angels in a strange position."
+                */
+               cfs_enter_debugger();
+       }
+       tcd = &trace_data[0].tcd;
+        CFS_INIT_LIST_HEAD(&pages);
+       if (get_preemption_level() == 0)
+               nr_pages = trace_refill_stock(tcd, CFS_ALLOC_STD, &pages);
+       else
+               nr_pages = 0;
+       spin_lock(&trace_cpu_serializer);
+       trace_owner = current_thread();
+       tcd->tcd_cur_stock_pages += nr_pages;
+       list_splice(&pages, &tcd->tcd_stock_pages);
+       return tcd;
+}
+
+extern void raw_page_death_row_clean(void);
+
+inline void __trace_put_tcd(struct trace_cpu_data *tcd, unsigned long flags)
+{
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+       LASSERT(trace_owner == current_thread());
+       trace_owner = NULL;
+       spin_unlock(&trace_cpu_serializer);
+       if (get_preemption_level() == 0)
+               /* purge all pending pages */
+               raw_page_death_row_clean();
 }
 
int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage)
{
	/*
	 * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
	 * from here: this will lead to infinite recursion.
	 */
	/* XNU has a single global tcd, so every trace page belongs to it */
	return 1;
}
 
 void
-set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, 
+set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask,
                    const int line, unsigned long stack)
-{ 
-       struct timeval tv; 
+{
+       struct timeval tv;
        
-       do_gettimeofday(&tv); 
-       header->ph_subsys = subsys; 
-       header->ph_mask = mask; 
-       header->ph_cpu_id = smp_processor_id(); 
-       header->ph_sec = (__u32)tv.tv_sec; 
-       header->ph_usec = tv.tv_usec; 
-       header->ph_stack = stack; 
-       header->ph_pid = 0; 
-       header->ph_line_num = line; 
-       header->ph_extern_pid = 0;
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+       do_gettimeofday(&tv);
+       header->ph_subsys = subsys;
+       header->ph_mask = mask;
+       header->ph_cpu_id = smp_processor_id();
+       header->ph_sec = (__u32)tv.tv_sec;
+       header->ph_usec = tv.tv_usec;
+       header->ph_stack = stack;
+       header->ph_pid = cfs_curproc_pid();
+       header->ph_line_num = line;
+       header->ph_extern_pid = (__u32)current_thread();
 }
 
-void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, 
-                            int len, char *file, const char *fn)
-{ 
-       char *prefix = NULL, *ptype = NULL;
-                       
-       if ((mask & D_EMERG) != 0) { 
-               prefix = "LustreError"; 
-               ptype = KERN_EMERG; 
-       } else if ((mask & D_ERROR) != 0) { 
-               prefix = "LustreError"; 
-               ptype = KERN_ERR; 
-       } else if ((mask & D_WARNING) != 0) { 
-               prefix = "Lustre"; 
-               ptype = KERN_WARNING; 
+void print_to_console(struct ptldebug_header *hdr, int mask, char *buf,
+                     int len, char *file, const char *fn)
+{
+       char *prefix = "Lustre", *ptype = KERN_INFO;
+
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+       if ((mask & D_EMERG) != 0) {
+               prefix = "LustreError";
+               ptype = KERN_EMERG;
+       } else if ((mask & D_ERROR) != 0) {
+               prefix = "LustreError";
+               ptype = KERN_ERR;
+       } else if ((mask & D_WARNING) != 0) {
+               prefix = "Lustre";
+               ptype = KERN_WARNING;
        } else if (libcfs_printk != 0 || (mask & D_CONSOLE)) {
-               prefix = "Lustre"; 
-               ptype = KERN_INFO; 
-       } 
+               prefix = "Lustre";
+               ptype = KERN_INFO;
+       }
 
        if ((mask & D_CONSOLE) != 0) {
                printk("%s%s: %.*s", ptype, prefix, len, buf);
        } else {
-               printk("%s%s: %d:%d:(%s:%d:%s()) %*s", ptype, prefix, hdr->ph_pid, 
-                      hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
+               printk("%s%s: %d:%d:(%s:%d:%s()) %*s",
+                      ptype, prefix, hdr->ph_pid, hdr->ph_extern_pid,
+                      file, hdr->ph_line_num, fn, len, buf);
        }
 }
 
@@ -89,34 +181,34 @@ int cfs_trace_daemon SYSCTL_HANDLER_ARGS
        MALLOC(name, char *, req->newlen + 1, M_TEMP, M_WAITOK | M_ZERO);
        if (name == NULL)
                return -ENOMEM;
-       down_write(&tracefile_sem);
+       tracefile_write_lock();
        error = sysctl_handle_string(oidp, name, req->newlen + 1, req);
-       if (!error || req->newptr != NULL) {
+       if (!error || !req->newptr) {
                /* write */
                if (strcmp(name, "stop") == 0) {
                        /* stop tracefile daemon */
                        tracefile = NULL;
                        trace_stop_thread();
-                       goto out; 
-               }else if (strncmp(name, "size=", 5) == 0) { 
-                       tracefile_size = simple_strtoul(name + 5, NULL, 0); 
-                       if (tracefile_size < 10 || tracefile_size > 20480) 
-                               tracefile_size = TRACEFILE_SIZE; 
-                       else 
-                               tracefile_size <<= 20; 
+                       goto out;
+               }else if (strncmp(name, "size=", 5) == 0) {
+                       tracefile_size = simple_strtoul(name + 5, NULL, 0);
+                       if (tracefile_size < 10 || tracefile_size > 20480)
+                               tracefile_size = TRACEFILE_SIZE;
+                       else
+                               tracefile_size <<= 20;
                        goto out;
 
                }
-               if (name[0] != '/') { 
-                       error = -EINVAL; 
-                       goto out; 
-               } 
-               if (tracefile != NULL) 
+               if (name[0] != '/') {
+                       error = -EINVAL;
+                       goto out;
+               }
+               if (tracefile != NULL)
                        cfs_free(tracefile);
-               tracefile = name; 
-               name = NULL; 
+               tracefile = name;
+               name = NULL;
                trace_start_thread();
-       } else if (req->newptr != NULL) {
+       } else if (!req->newptr) {
                /* Something was wrong with the write request */
                printf("sysctl debug daemon failed: %d.\n", error);
                goto out;
@@ -125,9 +217,9 @@ int cfs_trace_daemon SYSCTL_HANDLER_ARGS
                SYSCTL_OUT(req, tracefile, sizeof(tracefile));
        }
 out:
-       if (name != NULL) 
+       if (name != NULL)
                FREE(name, M_TEMP);
-       up_write(&tracefile_sem);
+       tracefile_write_unlock();
        return error;
 }
 
@@ -138,20 +230,20 @@ int cfs_debug_mb SYSCTL_HANDLER_ARGS
        int error = 0;
 
        error = sysctl_handle_long(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
-       if (!error && req->newptr != NULL) {
+       if (!error && !req->newptr) {
                /* We have a new value stored in the standard location */
                if (max_debug_mb <= 0)
                        return -EINVAL;
                if (max_debug_mb > max_permit_mb) {
                        printf("sysctl debug_mb is too big: %d.\n", max_debug_mb);
                        return 0;
-               } 
-               for (i = 0; i < NR_CPUS; i++) { 
-                       struct trace_cpu_data *tcd; 
-                       tcd = &trace_data[i].tcd; 
+               }
+               for (i = 0; i < NR_CPUS; i++) {
+                       struct trace_cpu_data *tcd;
+                       tcd = &trace_data[i].tcd;
                        tcd->tcd_max_pages = max_debug_mb;
                }
-       } else if (req->newptr != NULL) {
+       } else if (!req->newptr) {
                /* Something was wrong with the write request */
                printf ("sysctl debug_mb fault: %d.\n", error);
        } else {
index 7d75f50..cfd7a2d 100644 (file)
 #include <sys/fcntl.h>
 #include <lnet/types.h>
 
+#include <libcfs/kp30.h>
+
 #ifndef isspace
 inline int
 isspace(char c)
-{ 
+{
         return (c == ' ' || c == '\t' || c == '\n' || c == '\12');
 }
 #endif
@@ -98,12 +100,12 @@ strstr(const char *in, const char *str)
 
 char *
 strrchr(const char *p, int ch)
-{ 
-        const char *end = p + strlen(p); 
-        do { 
-                if (*end == (char)ch) 
-                        return (char *)end; 
-        } while (--end >= p); 
+{
+        const char *end = p + strlen(p);
+        do {
+                if (*end == (char)ch)
+                        return (char *)end;
+        } while (--end >= p);
         return NULL;
 }
 
@@ -273,7 +275,7 @@ int convert_server_error(__u64 ecode)
        int sign;
        int code;
 
-        static int errno_xlate[] = {
+       static int errno_xlate[] = {
                /* success is always success */
                [0]                     = 0,
                [LINUX_EPERM]           = EPERM,
@@ -358,7 +360,8 @@ int convert_server_error(__u64 ecode)
                [LINUX_ELIBMAX]         = EINVAL /* ELIBMAX */,
                [LINUX_ELIBEXEC]        = EINVAL /* ELIBEXEC */,
                [LINUX_EILSEQ]          = EILSEQ,
-               [LINUX_ERESTART]        = ERESTART,
+               [LINUX_ERESTART]        = EINVAL /* because ERESTART is
+                                                  * negative in XNU */,
                [LINUX_ESTRPIPE]        = EINVAL /* ESTRPIPE */,
                [LINUX_EUSERS]          = EUSERS,
                [LINUX_ENOTSOCK]        = ENOTSOCK,
@@ -398,22 +401,19 @@ int convert_server_error(__u64 ecode)
                [LINUX_EDQUOT]          = EDQUOT,
                [LINUX_ENOMEDIUM]       = EINVAL /* ENOMEDIUM */,
                [LINUX_EMEDIUMTYPE]     = EINVAL /* EMEDIUMTYPE */,
-        };
+       };
        code = (int)ecode;
-        if (code >= 0) {
+       if (code >= 0) {
                sign = +1;
        } else {
                sign = -1;
                code = -code;
        }
-       if (code < (sizeof errno_xlate) / (sizeof errno_xlate[0]))
+       if (code < (sizeof errno_xlate) / (sizeof errno_xlate[0])) {
                code = errno_xlate[code];
-       else
-               /*
-                * Unknown error. Reserved for the future.
-                */
-               code = EINVAL;
-        return sign * code;
+               LASSERT(code >= 0);
+        }
+       return sign * code;
 }
 
 enum {
@@ -448,7 +448,7 @@ static inline void obit_convert(int *cflag, int *sflag,
  */
 int convert_client_oflag(int cflag, int *result)
 {
-       int sflag;
+       int sflag = 0;
 
        cflag = 0;
        obit_convert(&cflag, &sflag, O_RDONLY,   LINUX_O_RDONLY);
@@ -480,3 +480,99 @@ int convert_client_oflag(int cflag, int *result)
        } else
                return -EINVAL;
 }
+
+#ifdef __DARWIN8__
+#else /* !__DARWIN8__ */
+extern int unix_syscall();
+extern int unix_syscall_return();
+
+extern int ktrsysret();
+extern int ktrace();
+
+extern int ast_taken();
+extern int ast_check();
+
+extern int trap();
+extern int syscall_trace();
+
+static int is_addr_in_range(void *addr, void *start, void *end)
+{
+       return start <= addr && addr <= end;
+}
+
+extern void cfs_thread_agent (void);
+
+static int is_last_frame(void *addr)
+{
+       if (addr == NULL)
+               return 1;
+       else if (is_addr_in_range(addr, unix_syscall, unix_syscall_return))
+               return 1;
+       else if (is_addr_in_range(addr, ktrsysret, ktrace))
+               return 1;
+       else if (is_addr_in_range(addr, ast_taken, ast_check))
+               return 1;
+       else if (is_addr_in_range(addr, trap, syscall_trace))
+               return 1;
+       else if (is_addr_in_range(addr, cfs_thread_agent, cfs_kernel_thread))
+               return 1;
+       else
+               return 0;
+}
+
+static void *get_frame(int i)
+{
+       void *result;
+
+#define CASE(i) case (i): result = __builtin_return_address(i); break
+       switch (i + 1) {
+               CASE(1);
+               CASE(2);
+               CASE(3);
+               CASE(4);
+               CASE(5);
+               CASE(6);
+               CASE(7);
+               CASE(8);
+               CASE(9);
+               CASE(10);
+               CASE(11);
+               CASE(12);
+               CASE(13);
+               CASE(14);
+               CASE(15);
+               CASE(16);
+               CASE(17);
+               CASE(18);
+               CASE(19);
+               CASE(20);
+       default:
+               panic("impossible frame number: %d\n", i);
+               result = NULL;
+       }
+       return result;
+}
+
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{
+       int i;
+
+       memset(trace, 0, sizeof *trace);
+       for (i = 0; i < sizeof_array(trace->frame); ++ i) {
+               void *addr;
+
+               addr = get_frame(i);
+               trace->frame[i] = addr;
+               if (is_last_frame(addr))
+                       break;
+       }
+}
+
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+        if (0 <= frame_no && frame_no < sizeof_array(trace->frame))
+                return trace->frame[frame_no];
+        else
+                return NULL;
+}
+#endif /* !__DARWIN8__ */
index 53ec89e..0fa4008 100644 (file)
 
 # define DEBUG_SUBSYSTEM S_LNET
 
-#ifdef __KERNEL__
 #include <libcfs/kp30.h>
 #include <libcfs/libcfs.h>
 #include "tracefile.h"
-#else
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdarg.h>
-#include <sys/time.h>
-#include <libcfs/libcfs.h>
-#endif
 
 #ifdef __KERNEL__
 unsigned int libcfs_subsystem_debug = ~0 - (S_LNET | S_LND);
@@ -52,9 +42,15 @@ EXPORT_SYMBOL(libcfs_debug);
 unsigned int libcfs_printk;
 EXPORT_SYMBOL(libcfs_printk);
 
+unsigned int libcfs_debug_binary = 1;
+EXPORT_SYMBOL(libcfs_debug_binary);
+
 unsigned int libcfs_stack;
 EXPORT_SYMBOL(libcfs_stack);
 
+unsigned int portal_enter_debugger = 0;
+EXPORT_SYMBOL(portal_enter_debugger);
+
 unsigned int libcfs_catastrophe;
 EXPORT_SYMBOL(libcfs_catastrophe);
 
@@ -82,7 +78,7 @@ void libcfs_debug_dumplog_internal(void *arg)
 
 int libcfs_debug_dumplog_thread(void *arg)
 {
-        libcfs_daemonize("");
+        cfs_daemonize("");
         reparent_to_init();
         libcfs_debug_dumplog_internal(arg);
         cfs_waitq_signal(&debug_ctlwq);
@@ -105,11 +101,11 @@ void libcfs_debug_dumplog(void)
         rc = cfs_kernel_thread(libcfs_debug_dumplog_thread,
                                (void *)(long)cfs_curproc_pid(),
                                CLONE_VM | CLONE_FS | CLONE_FILES);
-        if (rc < 0)
+        if (rc < 0) 
                 printk(KERN_ERR "LustreError: cannot start log dump thread: "
                        "%d\n", rc);
         else
-                schedule();
+                cfs_waitq_wait(&wait, CFS_TASK_INTERRUPTIBLE);
 
         /* be sure to teardown if kernel_thread() failed */
         cfs_waitq_del(&debug_ctlwq, &wait);
index d3c8f7a..10837ea 100644 (file)
@@ -114,6 +114,27 @@ void libcfs_run_lbug_upcall(char *file, const char *fn, const int line)
         libcfs_run_upcall (argv);
 }
 
+#ifdef __arch_um__
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+        CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n");
+        libcfs_debug_dumplog();
+        libcfs_run_lbug_upcall(file, func, line);
+        panic("LBUG");
+}
+#else
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+        CEMERG("LBUG\n");
+        libcfs_debug_dumpstack(NULL);
+        libcfs_debug_dumplog();
+        libcfs_run_lbug_upcall(file, func, line);
+        set_task_state(current, TASK_UNINTERRUPTIBLE);
+        while (1)
+                schedule();
+}
+#endif /* __arch_um__ */
+
 #ifdef __KERNEL__
 
 void libcfs_debug_dumpstack(struct task_struct *tsk)
@@ -148,3 +169,4 @@ EXPORT_SYMBOL(libcfs_current);
 
 EXPORT_SYMBOL(libcfs_run_upcall);
 EXPORT_SYMBOL(libcfs_run_lbug_upcall);
+EXPORT_SYMBOL(lbug_with_loc);
index ef351a0..6cb50b1 100644 (file)
@@ -1,6 +1,7 @@
 # define DEBUG_SUBSYSTEM S_LNET
 
 #include <linux/fs.h>
+#include <linux/kdev_t.h>
 #include <linux/ctype.h>
 #include <asm/uaccess.h>
 
@@ -14,12 +15,12 @@ cfs_filp_open (const char *name, int flags, int mode, int *err)
         */
        cfs_file_t      *filp = NULL;
 
-       filp = filp_open(name, flags, mode); 
-       if (IS_ERR(filp)) { 
+       filp = filp_open(name, flags, mode);
+       if (IS_ERR(filp)) {
                int rc;
 
-               rc = PTR_ERR(filp); 
-               printk(KERN_ERR "LustreError: can't open %s file: err %d\n", 
+               rc = PTR_ERR(filp);
+               printk(KERN_ERR "LustreError: can't open %s file: err %d\n",
                                name, rc);
                if (err)
                        *err = rc;
@@ -34,7 +35,7 @@ cfs_user_write (cfs_file_t *filp, const char *buf, size_t count, loff_t *offset)
        mm_segment_t fs;
        ssize_t size = 0;
 
-       fs = get_fs();
+       fs = get_fs(); 
        set_fs(KERNEL_DS);
        while (count > 0) {
                size = filp->f_op->write(filp, (char *)buf, count, offset);
@@ -44,9 +45,69 @@ cfs_user_write (cfs_file_t *filp, const char *buf, size_t count, loff_t *offset)
                size = 0;
        }
        set_fs(fs);
-
+       
        return size;
 }
 
+cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor)
+{
+        return MKDEV(major, minor);
+}
+
+cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev)
+{
+        return MAJOR(rdev);
+}
+
+cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev)
+{
+        return MINOR(rdev);
+}
+
+#if !(CFS_O_CREAT == O_CREAT && CFS_O_EXCL == O_EXCL &&        \
+     CFS_O_TRUNC == O_TRUNC && CFS_O_APPEND == O_APPEND &&\
+     CFS_O_NONBLOCK == O_NONBLOCK && CFS_O_NDELAY == O_NDELAY &&\
+     CFS_O_SYNC == O_SYNC && CFS_O_ASYNC == FASYNC &&\
+     CFS_O_DIRECT == O_DIRECT && CFS_O_LARGEFILE == O_LARGEFILE &&\
+     CFS_O_DIRECTORY == O_DIRECTORY && CFS_O_NOFOLLOW == O_NOFOLLOW)
+
+int cfs_oflags2univ(int flags)
+{
+       int f; 
+       
+       f = flags & O_ACCMODE;
+       f |= (flags & O_CREAT) ? CFS_O_CREAT: 0;
+       f |= (flags & O_EXCL) ? CFS_O_EXCL: 0;
+       f |= (flags & O_NOCTTY) ? CFS_O_NOCTTY: 0;
+       f |= (flags & O_TRUNC) ? CFS_O_TRUNC: 0;
+       f |= (flags & O_APPEND) ? CFS_O_APPEND: 0;
+       f |= (flags & O_NONBLOCK) ? CFS_O_NONBLOCK: 0;
+       f |= (flags & O_SYNC)? CFS_O_SYNC: 0;
+       f |= (flags & FASYNC)? CFS_O_ASYNC: 0;
+       f |= (flags & O_DIRECTORY)? CFS_O_DIRECTORY: 0;
+       f |= (flags & O_DIRECT)? CFS_O_DIRECT: 0;
+       f |= (flags & O_LARGEFILE)? CFS_O_LARGEFILE: 0;
+       f |= (flags & O_NOFOLLOW)? CFS_O_NOFOLLOW: 0;
+       f |= (flags & O_NOATIME)? CFS_O_NOATIME: 0;
+       return f;
+}
+#else
+
+int cfs_oflags2univ(int flags)
+{
+       return (flags);
+}
+#endif
+
+/* 
+ * XXX Liang: we don't need cfs_univ2oflags() now.
+ */
+int cfs_univ2oflags(int flags)
+{
+       return (flags);
+}
+
 EXPORT_SYMBOL(cfs_filp_open);
 EXPORT_SYMBOL(cfs_user_write);
+EXPORT_SYMBOL(cfs_oflags2univ);
+EXPORT_SYMBOL(cfs_univ2oflags);
index 0662f49..17f102e 100644 (file)
 void *
 cfs_alloc(size_t nr_bytes, u_int32_t flags)
 {
-       void *ptr = NULL;
-       unsigned int mflags = 0;
+        void *ptr = NULL;
+        unsigned int mflags = 0;
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-       if (flags & CFS_ALLOC_ATOMIC)
-               mflags |= __GFP_HIGH;
+        if (flags & CFS_ALLOC_ATOMIC)
+                mflags |= __GFP_HIGH;
         else if (flags & CFS_ALLOC_WAIT)
                 mflags |= __GFP_WAIT;
-       else
-               mflags |= (__GFP_HIGH | __GFP_WAIT);
+        else
+                mflags |= (__GFP_HIGH | __GFP_WAIT);
 
-       if (flags & CFS_ALLOC_FS)
-               mflags |= __GFP_FS;
-       if (flags & CFS_ALLOC_IO)
-               mflags |= __GFP_IO | __GFP_HIGHIO;
+        if (flags & CFS_ALLOC_FS)
+                mflags |= __GFP_FS;
+        if (flags & CFS_ALLOC_IO)
+                mflags |= __GFP_IO | __GFP_HIGHIO;
 #else
         if (flags & CFS_ALLOC_ATOMIC)
                 mflags |= __GFP_HIGH;
@@ -55,47 +55,50 @@ cfs_alloc(size_t nr_bytes, u_int32_t flags)
                 mflags |= __GFP_IO;
 #endif
 
-       ptr = kmalloc(nr_bytes, mflags);
-       if (ptr != NULL && (flags & CFS_ALLOC_ZERO))
-               memset(ptr, 0, nr_bytes);
-       return ptr;
+        ptr = kmalloc(nr_bytes, mflags);
+        if (ptr != NULL && (flags & CFS_ALLOC_ZERO))
+                memset(ptr, 0, nr_bytes);
+        return ptr;
 }
 
 void
 cfs_free(void *addr)
 {
-       kfree(addr);
+        kfree(addr);
 }
 
 void *
 cfs_alloc_large(size_t nr_bytes)
 {
-       return vmalloc(nr_bytes);
+        return vmalloc(nr_bytes);
 }
 
 void
 cfs_free_large(void *addr)
 {
-       vfree(addr);
+        vfree(addr);
 }
 
-cfs_page_t *
-cfs_alloc_pages(unsigned int flags, unsigned int order)
+cfs_page_t *cfs_alloc_page(unsigned int flags)
 {
         unsigned int mflags = 0;
 
+        /*
+         * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-       if (flags & CFS_ALLOC_ATOMIC)
-               mflags |= __GFP_HIGH;
+        if (flags & CFS_ALLOC_ATOMIC)
+                mflags |= __GFP_HIGH;
         else if (flags & CFS_ALLOC_WAIT)
                 mflags |= __GFP_WAIT;
-       else
-               mflags |= (__GFP_HIGH | __GFP_WAIT);
+        else
+                mflags |= (__GFP_HIGH | __GFP_WAIT);
 
-       if (flags & CFS_ALLOC_FS)
-               mflags |= __GFP_FS;
-       if (flags & CFS_ALLOC_IO)
-               mflags |= __GFP_IO | __GFP_HIGHIO;
+        if (flags & CFS_ALLOC_FS)
+                mflags |= __GFP_FS;
+        if (flags & CFS_ALLOC_IO)
+                mflags |= __GFP_IO | __GFP_HIGHIO;
         if (flags & CFS_ALLOC_HIGH)
                 mflags |=  __GFP_HIGHMEM;
 #else
@@ -111,15 +114,14 @@ cfs_alloc_pages(unsigned int flags, unsigned int order)
                 mflags |=  __GFP_HIGHMEM;
 #endif
 
-        return alloc_pages(mflags, order);
+        return alloc_pages(mflags, 0);
 }
 
 cfs_mem_cache_t *
 cfs_mem_cache_create (const char *name, size_t size, size_t offset,
-                      unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
-                      void (*dtor)(void*, cfs_mem_cache_t *, unsigned long))
+                      unsigned long flags)
 {
-        return kmem_cache_create(name, size, offset, flags, ctor, dtor);
+        return kmem_cache_create(name, size, offset, flags, NULL, NULL);
 }
 
 int
@@ -133,25 +135,26 @@ cfs_mem_cache_alloc(cfs_mem_cache_t *cachep, int flags)
 {
         unsigned int mflags = 0;
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-       if (flags & CFS_SLAB_ATOMIC)
-               mflags |= __GFP_HIGH;
+        if (flags & CFS_ALLOC_ATOMIC)
+                mflags |= __GFP_HIGH;
         else if (flags & CFS_ALLOC_WAIT)
                 mflags |= __GFP_WAIT;
-       else
-               mflags |= (__GFP_HIGH | __GFP_WAIT);
+        else
+                mflags |= (__GFP_HIGH | __GFP_WAIT);
 
-       if (flags & CFS_SLAB_FS)
-               mflags |= __GFP_FS;
-       if (flags & CFS_SLAB_IO)
-               mflags |= __GFP_IO | __GFP_HIGHIO;
+        if (flags & CFS_ALLOC_FS)
+                mflags |= __GFP_FS;
+        if (flags & CFS_ALLOC_IO)
+                mflags |= __GFP_IO | __GFP_HIGHIO;
 #else
-        if (flags & CFS_SLAB_ATOMIC)
+
+        if (flags & CFS_ALLOC_ATOMIC)
                 mflags |= __GFP_HIGH;
         else
                 mflags |= __GFP_WAIT;
-        if (flags & CFS_SLAB_FS)
+        if (flags & CFS_ALLOC_FS)
                 mflags |= __GFP_FS;
-        if (flags & CFS_SLAB_IO)
+        if (flags & CFS_ALLOC_IO)
                 mflags |= __GFP_IO;
 #endif
 
@@ -168,7 +171,7 @@ EXPORT_SYMBOL(cfs_alloc);
 EXPORT_SYMBOL(cfs_free);
 EXPORT_SYMBOL(cfs_alloc_large);
 EXPORT_SYMBOL(cfs_free_large);
-EXPORT_SYMBOL(cfs_alloc_pages);
+EXPORT_SYMBOL(cfs_alloc_page);
 EXPORT_SYMBOL(cfs_mem_cache_create);
 EXPORT_SYMBOL(cfs_mem_cache_destroy);
 EXPORT_SYMBOL(cfs_mem_cache_alloc);
index 0150296..b643161 100644 (file)
@@ -5,26 +5,25 @@
 
 #define LNET_MINOR 240
 
-
 void
 libcfs_daemonize (char *str)
 {
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63)) 
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63))
        daemonize(str);
-#else 
-       daemonize(); 
+#else
+       daemonize();
        snprintf (current->comm, sizeof (current->comm), "%s", str);
 #endif
 }
 
 void
 libcfs_blockallsigs ()
-{ 
-       unsigned long  flags; 
-       
-       SIGNAL_MASK_LOCK(current, flags); 
-       sigfillset(&current->blocked); 
-       RECALC_SIGPENDING; 
+{
+       unsigned long  flags;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       sigfillset(&current->blocked);
+       RECALC_SIGPENDING;
        SIGNAL_MASK_UNLOCK(current, flags);
 }
 
@@ -79,13 +78,13 @@ int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
 
 extern struct cfs_psdev_ops          libcfs_psdev_ops;
 
-static int 
+static int
 libcfs_psdev_open(struct inode * inode, struct file * file)
-{ 
+{
        struct libcfs_device_userstate **pdu = NULL;
        int    rc = 0;
 
-       if (!inode) 
+       if (!inode)
                return (-EINVAL);
        pdu = (struct libcfs_device_userstate **)&file->private_data;
        if (libcfs_psdev_ops.p_open != NULL)
@@ -96,13 +95,13 @@ libcfs_psdev_open(struct inode * inode, struct file * file)
 }
 
 /* called when closing /dev/device */
-static int 
+static int
 libcfs_psdev_release(struct inode * inode, struct file * file)
 {
        struct libcfs_device_userstate *pdu;
        int    rc = 0;
 
-       if (!inode) 
+       if (!inode)
                return (-EINVAL);
        pdu = file->private_data;
        if (libcfs_psdev_ops.p_close != NULL)
@@ -112,55 +111,55 @@ libcfs_psdev_release(struct inode * inode, struct file * file)
        return rc;
 }
 
-static int 
-libcfs_ioctl(struct inode *inode, struct file *file, 
+static int
+libcfs_ioctl(struct inode *inode, struct file *file,
             unsigned int cmd, unsigned long arg)
-{ 
+{
        struct cfs_psdev_file    pfile;
        int    rc = 0;
 
-       if (current->fsuid != 0) 
-               return -EACCES; 
-       
-       if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || 
-            _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR  || 
-            _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) { 
-               CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", 
-                      _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); 
-               return (-EINVAL); 
-       } 
-       
+       if (current->fsuid != 0)
+               return -EACCES;
+
+       if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE ||
+            _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR  ||
+            _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) {
+               CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+                      _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+               return (-EINVAL);
+       }
+
        /* Handle platform-dependent IOC requests */
-       switch (cmd) { 
-       case IOC_LIBCFS_PANIC: 
-               if (!capable (CAP_SYS_BOOT)) 
-                       return (-EPERM); 
-               panic("debugctl-invoked panic"); 
+       switch (cmd) {
+       case IOC_LIBCFS_PANIC:
+               if (!capable (CAP_SYS_BOOT))
+                       return (-EPERM);
+               panic("debugctl-invoked panic");
                return (0);
-       case IOC_LIBCFS_MEMHOG: 
-               if (!capable (CAP_SYS_ADMIN)) 
+       case IOC_LIBCFS_MEMHOG:
+               if (!capable (CAP_SYS_ADMIN))
                        return -EPERM;
                /* go thought */
        }
 
        pfile.off = 0;
        pfile.private_data = file->private_data;
-       if (libcfs_psdev_ops.p_ioctl != NULL) 
-               rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); 
+       if (libcfs_psdev_ops.p_ioctl != NULL)
+               rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
        else
                rc = -EPERM;
        return (rc);
 }
 
-static struct file_operations libcfs_fops = { 
-       ioctl:   libcfs_ioctl, 
-       open:    libcfs_psdev_open, 
+static struct file_operations libcfs_fops = {
+       ioctl:   libcfs_ioctl,
+       open:    libcfs_psdev_open,
        release: libcfs_psdev_release
 };
 
-cfs_psdev_t libcfs_dev = { 
-       LNET_MINOR, 
-       "lnet", 
+cfs_psdev_t libcfs_dev = {
+       LNET_MINOR,
+       "lnet",
        &libcfs_fops
 };
 
index 28a7a25..d00edaf 100644 (file)
@@ -1,19 +1,79 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
 #define DEBUG_SUBSYSTEM S_LNET
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <libcfs/libcfs.h>
+
+void cfs_enter_debugger(void)
+{
+#if defined(CONFIG_KGDB)
+        extern void breakpoint(void);
+        breakpoint();
+#elif defined(__arch_um__)
+        //asm("int $3");
+#else
+        /* nothing */
+#endif
+}
+
+void cfs_daemonize(char *str) {
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63))
+        daemonize(str);
+#else
+        daemonize();
+        snprintf (current->comm, sizeof (current->comm), "%s", str);
+#endif
+}
+
+sigset_t cfs_get_blocked_sigs(void)
+{
+        unsigned long   flags;
+        sigset_t        old;
+
+        SIGNAL_MASK_LOCK(current, flags);
+        old = current->blocked;
+        SIGNAL_MASK_UNLOCK(current, flags);
+        return old;
+}
+
+void cfs_block_allsigs(void)
+{
+        unsigned long   flags;
+
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+}
+
+void cfs_block_sigs(sigset_t bits)
+{
+        unsigned long  flags;
+
+        SIGNAL_MASK_LOCK(current, flags);
+        current->blocked = bits;
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+}
 
 int
 libcfs_arch_init(void)
-{ 
-       return 0;
+{
+        return 0;
 }
 
 void
 libcfs_arch_cleanup(void)
 {
-       return; 
+        return;
 }
 
 EXPORT_SYMBOL(libcfs_arch_init);
 EXPORT_SYMBOL(libcfs_arch_cleanup);
+EXPORT_SYMBOL(cfs_daemonize);
+EXPORT_SYMBOL(cfs_block_allsigs);
+EXPORT_SYMBOL(cfs_block_sigs);
+EXPORT_SYMBOL(cfs_get_blocked_sigs);
index 3cac8eb..dc68ff7 100644 (file)
@@ -41,14 +41,14 @@ libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
                CERROR ("Can't create socket: %d\n", rc);
                return rc;
        }
-       
+
        nob = strnlen(name, IFNAMSIZ);
        if (nob == IFNAMSIZ) {
                CERROR("Interface name %s too long\n", name);
                rc = -EINVAL;
                goto out;
        }
-       
+
        CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
 
        strcpy(ifr.ifr_name, name);
@@ -80,7 +80,7 @@ libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
                CERROR("Can't get IP address for interface %s\n", name);
                goto out;
        }
-       
+
        val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
        *ip = ntohl(val);
 
@@ -89,12 +89,12 @@ libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
        set_fs(KERNEL_DS);
        rc = sock->ops->ioctl(sock, SIOCGIFNETMASK, (unsigned long)&ifr);
        set_fs(oldmm);
-       
+
        if (rc != 0) {
                CERROR("Can't get netmask for interface %s\n", name);
                goto out;
        }
-       
+
        val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr;
        *mask = ntohl(val);
 
@@ -143,10 +143,10 @@ libcfs_ipif_enumerate (char ***namesp)
                        rc = -ENOMEM;
                        goto out0;
                }
-               
+
                ifc.ifc_buf = (char *)ifr;
                ifc.ifc_len = nalloc * sizeof(*ifr);
-               
+
                set_fs(KERNEL_DS);
                rc = sock->ops->ioctl(sock, SIOCGIFCONF, (unsigned long)&ifc);
                set_fs(oldmm);
@@ -155,7 +155,7 @@ libcfs_ipif_enumerate (char ***namesp)
                        CERROR ("Error %d enumerating interfaces\n", rc);
                        goto out1;
                }
-               
+
                LASSERT (rc == 0);
 
                nfound = ifc.ifc_len/sizeof(*ifr);
@@ -178,13 +178,13 @@ libcfs_ipif_enumerate (char ***namesp)
         }
         /* NULL out all names[i] */
         memset (names, 0, nfound * sizeof(*names));
-                
+
        for (i = 0; i < nfound; i++) {
 
                nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
                if (nob == IFNAMSIZ) {
                        /* no space for terminating NULL */
-                       CERROR("interface name %.*s too long (%d max)\n", 
+                       CERROR("interface name %.*s too long (%d max)\n",
                               nob, ifr[i].ifr_name, IFNAMSIZ);
                        rc = -ENAMETOOLONG;
                         goto out2;
@@ -202,7 +202,7 @@ libcfs_ipif_enumerate (char ***namesp)
 
         *namesp = names;
        rc = nfound;
-        
+
  out2:
         if (rc < 0)
                 libcfs_ipif_free_enumeration(names, nfound);
@@ -238,7 +238,7 @@ libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
        long           ticks = timeout * HZ;
        unsigned long  then;
        struct timeval tv;
-       
+
        LASSERT (nob > 0);
        /* Caller may pass a zero timeout if she thinks the socket buffer is
         * empty enough to take the whole message immediately */
@@ -270,7 +270,7 @@ libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
                        set_fs(oldmm);
                        if (rc != 0) {
                                CERROR("Can't set socket send timeout "
-                                      "%ld.%06d: %d\n", 
+                                      "%ld.%06d: %d\n",
                                        (long)tv.tv_sec, (int)tv.tv_usec, rc);
                                return rc;
                        }
@@ -279,7 +279,7 @@ libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
                 set_fs (KERNEL_DS);
                 then = jiffies;
                 rc = sock_sendmsg (sock, &msg, iov.iov_len);
-                ticks -= then - jiffies;
+                ticks -= jiffies - then;
                 set_fs (oldmm);
 
                if (rc == nob)
@@ -295,14 +295,13 @@ libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
 
                if (ticks <= 0)
                        return -EAGAIN;
-               
+
                 buffer = ((char *)buffer) + rc;
                 nob -= rc;
         }
 
         return (0);
 }
-
 EXPORT_SYMBOL(libcfs_sock_write);
 
 int
@@ -373,7 +372,7 @@ libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
 EXPORT_SYMBOL(libcfs_sock_read);
 
 static int
-libcfs_sock_create (struct socket **sockp, int *fatal, 
+libcfs_sock_create (struct socket **sockp, int *fatal,
                     __u32 local_ip, int local_port)
 {
         struct sockaddr_in  locaddr;
@@ -409,10 +408,10 @@ libcfs_sock_create (struct socket **sockp, int *fatal,
                 memset(&locaddr, 0, sizeof(locaddr));
                 locaddr.sin_family = AF_INET;
                 locaddr.sin_port = htons(local_port);
-                locaddr.sin_addr.s_addr = (local_ip == 0) ? 
+                locaddr.sin_addr.s_addr = (local_ip == 0) ?
                                           INADDR_ANY : htonl(local_ip);
-                
-                rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr, 
+
+                rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr,
                                      sizeof(locaddr));
                 if (rc == -EADDRINUSE) {
                         CDEBUG(D_NET, "Port %d already in use\n", local_port);
@@ -425,7 +424,7 @@ libcfs_sock_create (struct socket **sockp, int *fatal,
                         goto failed;
                 }
         }
-        
+
        return 0;
 
  failed:
@@ -447,12 +446,12 @@ libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
                                      (char *)&option, sizeof (option));
                 set_fs (oldmm);
                 if (rc != 0) {
-                        CERROR ("Can't set send buffer %d: %d\n", 
+                        CERROR ("Can't set send buffer %d: %d\n",
                                 option, rc);
                         return (rc);
                 }
         }
-        
+
         if (rxbufsize != 0) {
                 option = rxbufsize;
                 set_fs (KERNEL_DS);
@@ -460,12 +459,12 @@ libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
                                       (char *)&option, sizeof (option));
                 set_fs (oldmm);
                 if (rc != 0) {
-                        CERROR ("Can't set receive buffer %d: %d\n", 
+                        CERROR ("Can't set receive buffer %d: %d\n",
                                 option, rc);
                         return (rc);
                 }
         }
-        
+
         return 0;
 }
 
@@ -517,7 +516,7 @@ libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
                 }
                 *txbufsize = option;
         }
-        
+
         if (rxbufsize != NULL) {
                 optlen = sizeof(option);
                 set_fs (KERNEL_DS);
@@ -530,14 +529,14 @@ libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
                 }
                 *rxbufsize = option;
         }
-        
+
         return 0;
 }
 
 EXPORT_SYMBOL(libcfs_sock_getbuf);
 
 int
-libcfs_sock_listen (struct socket **sockp, 
+libcfs_sock_listen (struct socket **sockp,
                     __u32 local_ip, int local_port, int backlog)
 {
         int      fatal;
@@ -550,11 +549,11 @@ libcfs_sock_listen (struct socket **sockp,
                                local_port);
                 return rc;
         }
-        
+
        rc = (*sockp)->ops->listen(*sockp, backlog);
        if (rc == 0)
                return 0;
-       
+
        CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
        sock_release(*sockp);
        return rc;
@@ -601,14 +600,14 @@ libcfs_sock_accept (struct socket **newsockp, struct socket *sock)
 
        set_current_state(TASK_INTERRUPTIBLE);
        add_wait_queue(sock->sk->sk_sleep, &wait);
-       
+
        rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
        if (rc == -EAGAIN) {
                /* Nothing ready, so wait for activity */
                schedule();
                rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
        }
-       
+
        remove_wait_queue(sock->sk->sk_sleep, &wait);
        set_current_state(TASK_RUNNING);
 
index a23c247..7b8e220 100644 (file)
 extern union trace_data_union trace_data[NR_CPUS];
 extern char *tracefile;
 extern long long tracefile_size;
-extern struct rw_semaphore tracefile_sem;
+
+struct rw_semaphore tracefile_sem;
+
+void tracefile_lock_init()
+{
+       init_rwsem(&tracefile_sem);
+}
+
+void tracefile_read_lock()
+{
+       down_read(&tracefile_sem);
+}
+
+void tracefile_read_unlock()
+{
+       up_read(&tracefile_sem);
+}
+
+void tracefile_write_lock()
+{
+       down_write(&tracefile_sem);
+}
+
+void tracefile_write_unlock()
+{
+       up_write(&tracefile_sem);
+}
 
 inline struct trace_cpu_data *
-__trace_get_tcd(unsigned long *flags) 
+__trace_get_tcd(unsigned long *flags)
 {
-       struct trace_cpu_data *ret;           
+       struct trace_cpu_data *ret;
 
-       int cpu = get_cpu();                
-       local_irq_save(*flags);               
-       ret = &trace_data[cpu].tcd;     
+       int cpu = get_cpu();
+       local_irq_save(*flags);
+       ret = &trace_data[cpu].tcd;
 
-       return ret;                             
+       return ret;
 }
 
-inline void 
+inline void
 trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags)
 {
-       local_irq_restore(flags); 
-       put_cpu();               
+       local_irq_restore(flags);
+       put_cpu();
+}
+
+int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage)
+{
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+       return tcd->tcd_cpu == tage->cpu;
 }
 
 void
-set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, 
+set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask,
                    const int line, unsigned long stack)
-{ 
-       struct timeval tv; 
-       
-       do_gettimeofday(&tv); 
-       
-       header->ph_subsys = subsys; 
-       header->ph_mask = mask; 
-       header->ph_cpu_id = smp_processor_id(); 
-       header->ph_sec = (__u32)tv.tv_sec; 
-       header->ph_usec = tv.tv_usec; 
-       header->ph_stack = stack; 
-       header->ph_pid = current->pid; 
-       header->ph_line_num = line; 
-#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) 
+{
+       struct timeval tv;
+
+       do_gettimeofday(&tv);
+
+       header->ph_subsys = subsys;
+       header->ph_mask = mask;
+       header->ph_cpu_id = smp_processor_id();
+       header->ph_sec = (__u32)tv.tv_sec;
+       header->ph_usec = tv.tv_usec;
+       header->ph_stack = stack;
+       header->ph_pid = current->pid;
+       header->ph_line_num = line;
+#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
        header->ph_extern_pid = current->thread.extern_pid;
-#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) 
+#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
        header->ph_extern_pid = current->thread.mode.tt.extern_pid;
-#else 
+#else
        header->ph_extern_pid = 0;
 #endif
        return;
 }
 
-void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, 
+void print_to_console(struct ptldebug_header *hdr, int mask, char *buf,
                             int len, char *file, const char *fn)
-{ 
-       char *prefix = NULL, *ptype = NULL; 
-       
-       if ((mask & D_EMERG) != 0) { 
-               prefix = "LustreError"; 
-               ptype = KERN_EMERG; 
-       } else if ((mask & D_ERROR) != 0) { 
-               prefix = "LustreError"; 
-               ptype = KERN_ERR; 
-       } else if ((mask & D_WARNING) != 0) { 
-               prefix = "Lustre"; 
-               ptype = KERN_WARNING; 
+{
+       char *prefix = "Lustre", *ptype = NULL;
+
+       if ((mask & D_EMERG) != 0) {
+               prefix = "LustreError";
+               ptype = KERN_EMERG;
+       } else if ((mask & D_ERROR) != 0) {
+               prefix = "LustreError";
+               ptype = KERN_ERR;
+       } else if ((mask & D_WARNING) != 0) {
+               prefix = "Lustre";
+               ptype = KERN_WARNING;
        } else if (libcfs_printk != 0 || (mask & D_CONSOLE)) {
-               prefix = "Lustre"; 
-               ptype = KERN_INFO; 
-       } 
+               prefix = "Lustre";
+               ptype = KERN_INFO;
+       }
 
        if ((mask & D_CONSOLE) != 0) {
                printk("%s%s: %.*s", ptype, prefix, len, buf);
        } else {
-               printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, 
+               printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid,
                       hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
        }
        return;
 }
 
-int trace_write_daemon_file(struct file *file, const char *buffer, 
+int trace_write_daemon_file(struct file *file, const char *buffer,
                            unsigned long count, void *data)
-{ 
-       char *name; 
-       unsigned long off; 
-       int rc; 
-       
-       name = kmalloc(count + 1, GFP_KERNEL); 
-       if (name == NULL) 
-               return -ENOMEM; 
-       
-       if (copy_from_user(name, buffer, count)) { 
-               rc = -EFAULT; 
-               goto out; 
-       } 
-       
-       /* be nice and strip out trailing '\n' */ 
-       for (off = count ; off > 2 && isspace(name[off - 1]); off--) 
-               ; 
-       
-       name[off] = '\0'; 
-       
-       down_write(&tracefile_sem); 
-       if (strcmp(name, "stop") == 0) { 
-               tracefile = NULL; 
-               trace_stop_thread(); 
-               goto out_sem; 
-       } else if (strncmp(name, "size=", 5) == 0) { 
-               tracefile_size = simple_strtoul(name + 5, NULL, 0); 
-               if (tracefile_size < 10 || tracefile_size > 20480) 
-                       tracefile_size = TRACEFILE_SIZE; 
-               else 
-                       tracefile_size <<= 20; 
-               goto out_sem; 
-       } 
-       
-       if (name[0] != '/') { 
-               rc = -EINVAL; 
-               goto out_sem; 
-       } 
-       
-       if (tracefile != NULL) 
-               kfree(tracefile); 
-       
-       tracefile = name; 
-       name = NULL; 
-       printk(KERN_INFO "Lustre: debug daemon will attempt to start writing " 
-              "to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10)); 
-       
-       trace_start_thread(); 
-out_sem: 
-       up_write(&tracefile_sem); 
-out: 
+{
+       char *name;
+       unsigned long off;
+       int rc;
+
+       name = kmalloc(count + 1, GFP_KERNEL);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name, buffer, count)) {
+               rc = -EFAULT;
+               goto out;
+       }
+
+       /* be nice and strip out trailing '\n' */
+       for (off = count ; off > 2 && isspace(name[off - 1]); off--)
+               ;
+
+       name[off] = '\0';
+
+       tracefile_write_lock();
+       if (strcmp(name, "stop") == 0) {
+               tracefile = NULL;
+               trace_stop_thread();
+               goto out_sem;
+       } else if (strncmp(name, "size=", 5) == 0) {
+               tracefile_size = simple_strtoul(name + 5, NULL, 0);
+               if (tracefile_size < 10 || tracefile_size > 20480)
+                       tracefile_size = TRACEFILE_SIZE;
+               else
+                       tracefile_size <<= 20;
+               goto out_sem;
+       }
+
+       if (name[0] != '/') {
+               rc = -EINVAL;
+               goto out_sem;
+       }
+
+       if (tracefile != NULL)
+               kfree(tracefile);
+
+       tracefile = name;
+       name = NULL;
+       printk(KERN_INFO "Lustre: debug daemon will attempt to start writing "
+              "to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10));
+
+       trace_start_thread();
+out_sem:
+       tracefile_write_unlock();
+out:
        kfree(name);
        return count;
 }
 
-int trace_read_daemon_file(char *page, char **start, off_t off, int count, 
+int trace_read_daemon_file(char *page, char **start, off_t off, int count,
                           int *eof, void *data)
-{ 
-       int rc; 
-       
-       down_read(&tracefile_sem); 
-       rc = snprintf(page, count, "%s", tracefile); 
-       up_read(&tracefile_sem); 
+{
+       int rc;
+
+       tracefile_read_lock();
+       rc = snprintf(page, count, "%s", tracefile);
+       tracefile_read_unlock();
 
        return rc;
 }
 
-int trace_write_debug_mb(struct file *file, const char *buffer, 
+int trace_write_debug_mb(struct file *file, const char *buffer,
                         unsigned long count, void *data)
-{ 
-       char string[32]; 
-       int i; 
-       unsigned max; 
-       
-       if (count >= sizeof(string)) { 
-               printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n", 
-                      count); 
-               return -EOVERFLOW; 
-       } 
-       
-       if (copy_from_user(string, buffer, count)) 
-               return -EFAULT; 
-       
-       max = simple_strtoul(string, NULL, 0); 
-       if (max == 0) 
+{
+       char string[32];
+       int i;
+       unsigned max;
+
+       if (count >= sizeof(string)) {
+               printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n",
+                      count);
+               return -EOVERFLOW;
+       }
+
+       if (copy_from_user(string, buffer, count))
+               return -EFAULT;
+
+       max = simple_strtoul(string, NULL, 0);
+       if (max == 0)
+               return -EINVAL;
+
+       if (max > (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5 || max >= 512) {
+               printk(KERN_ERR "Lustre: Refusing to set debug buffer size to "
+                      "%dMB, which is more than 80%% of available RAM (%lu)\n",
+                      max, (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5);
                return -EINVAL;
+       }
+
+       max /= smp_num_cpus;
 
-       if (max > (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5 || max >= 512) { 
-               printk(KERN_ERR "Lustre: Refusing to set debug buffer size to " 
-                      "%dMB, which is more than 80%% of available RAM (%lu)\n", 
-                      max, (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5); 
-               return -EINVAL; 
-       } 
-
-       max /= smp_num_cpus; 
-       
-       for (i = 0; i < NR_CPUS; i++) { 
-               struct trace_cpu_data *tcd; 
-               tcd = &trace_data[i].tcd; 
-               tcd->tcd_max_pages = max << (20 - PAGE_SHIFT); 
-       } 
+       for (i = 0; i < NR_CPUS; i++) {
+               struct trace_cpu_data *tcd;
+               tcd = &trace_data[i].tcd;
+               tcd->tcd_max_pages = max << (20 - PAGE_SHIFT);
+       }
        return count;
 }
 
 int trace_read_debug_mb(char *page, char **start, off_t off, int count,
                                        int *eof, void *data)
-{ 
-       struct trace_cpu_data *tcd; 
-       unsigned long flags; 
+{
+       struct trace_cpu_data *tcd;
+       unsigned long flags;
        int rc;
-                                       
-       tcd = trace_get_tcd(flags); 
-       rc = snprintf(page, count, "%lu\n", 
-                     (tcd->tcd_max_pages >> (20 - PAGE_SHIFT)) * smp_num_cpus); 
-       trace_put_tcd(tcd, flags); 
+
+       tcd = trace_get_tcd(flags);
+       rc = snprintf(page, count, "%lu\n",
+                     (tcd->tcd_max_pages >> (20 - PAGE_SHIFT)) * smp_num_cpus);
+       trace_put_tcd(tcd, flags);
        return rc;
 }
 
index 6220541..065d54a 100644 (file)
@@ -35,13 +35,25 @@ int convert_server_error(__u64 ecode)
 {
        return ecode;
 }
+EXPORT_SYMBOL(convert_server_error);
 
 /*
  * convert <fcntl.h> flag from client to server.
  */
-int convert_client_oflag(int cflag)
+int convert_client_oflag(int cflag, int *result)
 {
-       return cflag;
+        *result = cflag;
+       return 0;
 }
+EXPORT_SYMBOL(convert_client_oflag);
 
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{}
+EXPORT_SYMBOL(cfs_stack_trace_fill);
+
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+        return NULL;
+}
+EXPORT_SYMBOL(cfs_stack_trace_frame);
 
diff --git a/lnet/libcfs/misc.c b/lnet/libcfs/misc.c
new file mode 100644 (file)
index 0000000..0ace40d
--- /dev/null
@@ -0,0 +1,53 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc.
+ *   Author: Nikita Danilov <nikita@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+
+/*
+ * On-wire format is native kdev_t format of Linux kernel 2.6
+ */
+enum {
+       WIRE_RDEV_MINORBITS = 20,
+       WIRE_RDEV_MINORMASK = ((1U << WIRE_RDEV_MINORBITS) - 1)
+};
+
+cfs_wire_rdev_t cfs_wire_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor)
+{
+        return (major << WIRE_RDEV_MINORBITS) | minor;
+}
+
+cfs_major_nr_t  cfs_wire_rdev_major(cfs_wire_rdev_t rdev)
+{
+        return rdev >> WIRE_RDEV_MINORBITS;
+}
+
+cfs_minor_nr_t  cfs_wire_rdev_minor(cfs_wire_rdev_t rdev)
+{
+        return rdev & WIRE_RDEV_MINORMASK;
+}
+
index 9fd8cff..4975a58 100644 (file)
@@ -27,6 +27,7 @@
 #include <lnet/lib-lnet.h>
 #include <lnet/lnet.h>
 #include <libcfs/kp30.h>
+#include "tracefile.h"
 
 void
 kportal_memhog_free (struct libcfs_device_userstate *ldu)
index e72e047..2d7f8db 100644 (file)
@@ -100,53 +100,53 @@ struct netstrfns {
 };
 
 static struct netstrfns  libcfs_netstrfns[] = {
-        {.nf_type     = LOLND,
-         .nf_name     = "lo",
-         .nf_modname  = "klolnd",
-         .nf_addr2str = libcfs_decnum_addr2str,
-         .nf_str2addr = libcfs_lo_str2addr},
-        {.nf_type     = SOCKLND,
-         .nf_name     = "tcp",
-         .nf_modname  = "ksocklnd",
-         .nf_addr2str = libcfs_ip_addr2str,
-         .nf_str2addr = libcfs_ip_str2addr},
-        {.nf_type     = OPENIBLND,
-         .nf_name     = "openib",
-         .nf_modname  = "kopeniblnd",
-         .nf_addr2str = libcfs_ip_addr2str,
-         .nf_str2addr = libcfs_ip_str2addr},
-        {.nf_type     = IIBLND,
-         .nf_name     = "iib",
-         .nf_modname  = "kiiblnd",
-         .nf_addr2str = libcfs_ip_addr2str,
-         .nf_str2addr = libcfs_ip_str2addr},
-        {.nf_type     = VIBLND,
-         .nf_name     = "vib",
-         .nf_modname  = "kviblnd",
-         .nf_addr2str = libcfs_ip_addr2str,
-         .nf_str2addr = libcfs_ip_str2addr},
-        {.nf_type     = RALND,
-         .nf_name     = "ra",
-         .nf_modname  = "kralnd",
-         .nf_addr2str = libcfs_ip_addr2str,
-         .nf_str2addr = libcfs_ip_str2addr},
-        {.nf_type     = QSWLND,
-         .nf_name     = "elan",
-         .nf_modname  = "kqswlnd",
-         .nf_addr2str = libcfs_decnum_addr2str,
-         .nf_str2addr = libcfs_num_str2addr},
-        {.nf_type     = GMLND,
-         .nf_name     = "gm",
-         .nf_modname  = "kgmlnd",
-         .nf_addr2str = libcfs_hexnum_addr2str,
-         .nf_str2addr = libcfs_num_str2addr},
-        {.nf_type     = PTLLND,
-         .nf_name     = "ptl",
-         .nf_modname  = "kptllnd",
-         .nf_addr2str = libcfs_decnum_addr2str,
-         .nf_str2addr = libcfs_num_str2addr},
+        {/* .nf_type      */  LOLND,
+         /* .nf_name      */  "lo",
+         /* .nf_modname   */  "klolnd",
+         /* .nf_addr2str  */  libcfs_decnum_addr2str,
+         /* .nf_str2addr  */  libcfs_lo_str2addr},
+        {/* .nf_type      */  SOCKLND,
+         /* .nf_name      */  "tcp",
+         /* .nf_modname   */  "ksocklnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  OPENIBLND,
+         /* .nf_name      */  "openib",
+         /* .nf_modname   */  "kopeniblnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  IIBLND,
+         /* .nf_name      */  "iib",
+         /* .nf_modname   */  "kiiblnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  VIBLND,
+         /* .nf_name      */  "vib",
+         /* .nf_modname   */  "kviblnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  RALND,
+         /* .nf_name      */  "ra",
+         /* .nf_modname   */  "kralnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  QSWLND,
+         /* .nf_name      */  "elan",
+         /* .nf_modname   */  "kqswlnd",
+         /* .nf_addr2str  */  libcfs_decnum_addr2str,
+         /* .nf_str2addr  */  libcfs_num_str2addr},
+        {/* .nf_type      */  GMLND,
+         /* .nf_name      */  "gm",
+         /* .nf_modname   */  "kgmlnd",
+         /* .nf_addr2str  */  libcfs_hexnum_addr2str,
+         /* .nf_str2addr  */  libcfs_num_str2addr},
+        {/* .nf_type      */  PTLLND,
+         /* .nf_name      */  "ptl",
+         /* .nf_modname   */  "kptllnd",
+         /* .nf_addr2str  */  libcfs_decnum_addr2str,
+         /* .nf_str2addr  */  libcfs_num_str2addr},
         /* placeholder for net0 alias.  It MUST BE THE LAST ENTRY */
-        {.nf_type     = -1},
+        {/* .nf_type      */  -1},
 };
 
 const int libcfs_nnetstrfns = sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]);
@@ -383,7 +383,7 @@ libcfs_str2net_internal(char *str, __u32 *net)
         struct netstrfns *nf;
         int               nob;
         int               netnum;
-        unsigned int      i;
+        int               i;
 
         for (i = 0; i < libcfs_nnetstrfns; i++) {
                 nf = &libcfs_netstrfns[i];
@@ -397,7 +397,7 @@ libcfs_str2net_internal(char *str, __u32 *net)
 
         nob = strlen(nf->nf_name);
 
-        if (strlen(str) == nob) {
+        if (strlen(str) == (unsigned int)nob) {
                 netnum = 0;
         } else {
                 if (nf->nf_type == LOLND) /* net number not allowed */
@@ -406,7 +406,7 @@ libcfs_str2net_internal(char *str, __u32 *net)
                 str += nob;
                 i = strlen(str);
                 if (sscanf(str, "%u%n", &netnum, &i) < 1 ||
-                    i != strlen(str))
+                    i != (int)strlen(str))
                         return NULL;
         }
 
index 77adaae..189b880 100644 (file)
 /* XXX move things up to the top, comment */
 union trace_data_union trace_data[NR_CPUS] __cacheline_aligned;
 
-struct rw_semaphore tracefile_sem;
 char *tracefile = NULL;
-long long tracefile_size = TRACEFILE_SIZE;
+int64_t tracefile_size = TRACEFILE_SIZE;
 static struct tracefiled_ctl trace_tctl;
 struct semaphore trace_thread_sem;
 static int thread_running = 0;
 
-static void put_pages_on_daemon_list_on_cpu(void *info);
+atomic_t tage_allocated = ATOMIC_INIT(0);
+
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+                                         struct trace_cpu_data *tcd);
 
 static inline struct trace_page *tage_from_list(struct list_head *list)
 {
@@ -62,6 +64,7 @@ static struct trace_page *tage_alloc(int gfp)
         }
         
         tage->page = page;
+        atomic_inc(&tage_allocated);
         return tage;
 }
 
@@ -72,6 +75,7 @@ static void tage_free(struct trace_page *tage)
 
         cfs_free_page(tage->page);
         cfs_free(tage);
+        atomic_dec(&tage_allocated);
 }
 
 static void tage_to_tail(struct trace_page *tage, struct list_head *queue)
@@ -92,30 +96,53 @@ static void LASSERT_TAGE_INVARIANT(struct trace_page *tage)
                  cfs_page_count(tage->page));
 }
 
-/* return a page that has 'len' bytes left at the end */
-static struct trace_page *trace_get_tage(struct trace_cpu_data *tcd,
-                                         unsigned long len)
+int trace_refill_stock(struct trace_cpu_data *tcd, int gfp,
+                       struct list_head *stock)
 {
-        struct trace_page *tage;
+        int i;
 
-        if (len > CFS_PAGE_SIZE) {
-                printk(KERN_ERR "cowardly refusing to write %lu bytes in a "
-                       "page\n", len);
-                return NULL;
+        /*
+         * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
+
+        for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++ i) {
+                struct trace_page *tage;
+
+                tage = tage_alloc(gfp);
+                if (tage == NULL)
+                        break;
+                list_add_tail(&tage->linkage, stock);
         }
+        return i;
+}
+
+/* return a page that has 'len' bytes left at the end */
+static struct trace_page *trace_get_tage_try(struct trace_cpu_data *tcd,
+                                             unsigned long len)
+{
+        struct trace_page *tage;
 
-        if (!list_empty(&tcd->tcd_pages)) {
+        if (tcd->tcd_cur_pages > 0) {
+                LASSERT(!list_empty(&tcd->tcd_pages));
                 tage = tage_from_list(tcd->tcd_pages.prev);
                 if (tage->used + len <= CFS_PAGE_SIZE)
                         return tage;
         }
 
         if (tcd->tcd_cur_pages < tcd->tcd_max_pages) {
-                tage = tage_alloc(CFS_ALLOC_ATOMIC);
-                if (tage == NULL) {
-                        /* the kernel should print a message for us.  fall back
-                         * to using the last page in the ring buffer. */
-                        goto ring_buffer;
+                if (tcd->tcd_cur_stock_pages > 0) {
+                        tage = tage_from_list(tcd->tcd_stock_pages.prev);
+                        -- tcd->tcd_cur_stock_pages;
+                        list_del_init(&tage->linkage);
+                } else {
+                        tage = tage_alloc(CFS_ALLOC_ATOMIC);
+                        if (tage == NULL) {
+                                printk(KERN_WARNING
+                                       "failure to allocate a tage (%ld)\n",
+                                       tcd->tcd_cur_pages);
+                                return NULL;
+                        }
                 }
 
                 tage->used = 0;
@@ -125,43 +152,71 @@ static struct trace_page *trace_get_tage(struct trace_cpu_data *tcd,
 
                 if (tcd->tcd_cur_pages > 8 && thread_running) {
                         struct tracefiled_ctl *tctl = &trace_tctl;
+                        /*
+                         * wake up tracefiled to process some pages.
+                         */
                         cfs_waitq_signal(&tctl->tctl_waitq);
                 }
                 return tage;
         }
+        return NULL;
+}
 
- ring_buffer:
-        if (thread_running) {
-                int pgcount = tcd->tcd_cur_pages / 10;
-                struct page_collection pc;
-                struct trace_page *tage;
-                struct trace_page *tmp;
+static void tcd_shrink(struct trace_cpu_data *tcd)
+{
+        int pgcount = tcd->tcd_cur_pages / 10;
+        struct page_collection pc;
+        struct trace_page *tage;
+        struct trace_page *tmp;
 
-                printk(KERN_WARNING "debug daemon buffer overflowed; discarding"
-                       " 10%% of pages (%d)\n", pgcount + 1);
+        /*
+         * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
 
-                CFS_INIT_LIST_HEAD(&pc.pc_pages);
-                spin_lock_init(&pc.pc_lock);
+        printk(KERN_WARNING "debug daemon buffer overflowed; discarding"
+               " 10%% of pages (%d of %ld)\n", pgcount + 1, tcd->tcd_cur_pages);
 
-                list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
-                        if (pgcount-- == 0)
-                                break;
+        CFS_INIT_LIST_HEAD(&pc.pc_pages);
+        spin_lock_init(&pc.pc_lock);
 
-                        list_move_tail(&tage->linkage, &pc.pc_pages);
-                        tcd->tcd_cur_pages--;
-                }
-                put_pages_on_daemon_list_on_cpu(&pc);
+        list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
+                if (pgcount-- == 0)
+                        break;
 
-                LASSERT(!list_empty(&tcd->tcd_pages));
+                list_move_tail(&tage->linkage, &pc.pc_pages);
+                tcd->tcd_cur_pages--;
         }
+        put_pages_on_tcd_daemon_list(&pc, tcd);
+}
 
-        if (list_empty(&tcd->tcd_pages))
-                return NULL;
+/* return a page that has 'len' bytes left at the end */
+static struct trace_page *trace_get_tage(struct trace_cpu_data *tcd,
+                                         unsigned long len)
+{
+        struct trace_page *tage;
 
-        tage = tage_from_list(tcd->tcd_pages.next);
-        tage->used = 0;
-        tage_to_tail(tage, &tcd->tcd_pages);
+        /*
+         * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
 
+        if (len > CFS_PAGE_SIZE) {
+                printk(KERN_ERR
+                       "cowardly refusing to write %lu bytes in a page\n", len);
+                return NULL;
+        }
+
+        tage = trace_get_tage_try(tcd, len);
+        if (tage != NULL)
+                return tage;
+        if (thread_running)
+                tcd_shrink(tcd);
+        if (tcd->tcd_cur_pages > 0) {
+                tage = tage_from_list(tcd->tcd_pages.next);
+                tage->used = 0;
+                tage_to_tail(tage, &tcd->tcd_pages);
+        }
         return tage;
 }
 
@@ -171,10 +226,12 @@ void libcfs_debug_msg(int subsys, int mask, char *file, const char *fn,
         struct trace_cpu_data *tcd;
         struct ptldebug_header header;
         struct trace_page *tage;
-        char *debug_buf = format;
+        char *string_buf = format;
+        char *debug_buf;
         int known_size, needed = 85 /* average message length */, max_nob;
         va_list       ap;
         unsigned long flags;
+        int depth;
 
         if (strchr(file, '/'))
                 file = strrchr(file, '/') + 1;
@@ -188,19 +245,24 @@ void libcfs_debug_msg(int subsys, int mask, char *file, const char *fn,
                 goto out;
 
         set_ptldebug_header(&header, subsys, mask, line, stack);
-        known_size = sizeof(header) + strlen(file) + strlen(fn) + 2; // nulls
+
+        depth = __current_nesting_level();
+        known_size = strlen(file) + 1 + depth;
+        if (fn) 
+                known_size += strlen(fn) + 1;
+
+        if (libcfs_debug_binary)
+                known_size += sizeof(header);
 
  retry:
         tage = trace_get_tage(tcd, needed + known_size);
         if (tage == NULL) {
-                debug_buf = format;
                 if (needed + known_size > CFS_PAGE_SIZE)
                         mask |= D_ERROR;
-                needed = strlen(format);
                 goto out;
         }
 
-        debug_buf = cfs_page_address(tage->page) + tage->used + known_size;
+        string_buf = (char *)cfs_page_address(tage->page) + tage->used + known_size;
 
         max_nob = CFS_PAGE_SIZE - tage->used - known_size;
         if (max_nob <= 0) {
@@ -211,26 +273,38 @@ void libcfs_debug_msg(int subsys, int mask, char *file, const char *fn,
                 goto out;
         }
         va_start(ap, format);
-        needed = vsnprintf(debug_buf, max_nob, format, ap);
+        needed = vsnprintf(string_buf, max_nob, format, ap);
         va_end(ap);
 
         if (needed > max_nob) /* overflow.  oh poop. */
                 goto retry;
 
         header.ph_len = known_size + needed;
-        debug_buf = cfs_page_address(tage->page) + tage->used;
+        debug_buf = (char *)cfs_page_address(tage->page) + tage->used;
 
-        memcpy(debug_buf, &header, sizeof(header));
-        tage->used += sizeof(header);
-        debug_buf += sizeof(header);
+        if (libcfs_debug_binary) {
+                memcpy(debug_buf, &header, sizeof(header));
+                tage->used += sizeof(header);
+                debug_buf += sizeof(header);
+        }
+
+        /* indent message according to the nesting level */
+        while (depth-- > 0) {
+                *(debug_buf++) = '.';
+                ++ tage->used;
+        }
 
         strcpy(debug_buf, file);
         tage->used += strlen(file) + 1;
         debug_buf += strlen(file) + 1;
 
-        strcpy(debug_buf, fn);
-        tage->used += strlen(fn) + 1;
-        debug_buf += strlen(fn) + 1;
+        if (fn) {
+                strcpy(debug_buf, fn);
+                tage->used += strlen(fn) + 1;
+                debug_buf += strlen(fn) + 1;
+        }
+
+        LASSERT(debug_buf == string_buf);
 
         tage->used += needed;
         if (tage->used > CFS_PAGE_SIZE)
@@ -239,7 +313,7 @@ void libcfs_debug_msg(int subsys, int mask, char *file, const char *fn,
 
  out:
         if ((mask & (D_EMERG | D_ERROR | D_WARNING | D_CONSOLE)) || libcfs_printk)
-                print_to_console(&header, mask, debug_buf, needed, file, fn);
+                print_to_console(&header, mask, string_buf, needed, file, fn);
 
         trace_put_tcd(tcd, flags);
 }
@@ -325,15 +399,11 @@ static void put_pages_back(struct page_collection *pc)
  * we have a good amount of data at all times for dumping during an LBUG, even
  * if we have been steadily writing (and otherwise discarding) pages via the
  * debug daemon. */
-static void put_pages_on_daemon_list_on_cpu(void *info)
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+                                         struct trace_cpu_data *tcd)
 {
-        struct page_collection *pc = info;
-        struct trace_cpu_data *tcd;
         struct trace_page *tage;
         struct trace_page *tmp;
-        unsigned long flags;
-
-        tcd = trace_get_tcd(flags);
 
         spin_lock(&pc->pc_lock);
         list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) {
@@ -360,7 +430,15 @@ static void put_pages_on_daemon_list_on_cpu(void *info)
                 }
         }
         spin_unlock(&pc->pc_lock);
+}
+
+static void put_pages_on_daemon_list_on_cpu(void *info)
+{
+        struct trace_cpu_data *tcd;
+        unsigned long flags;
 
+        tcd = trace_get_tcd(flags);
+        put_pages_on_tcd_daemon_list(info, tcd);
         trace_put_tcd(tcd, flags);
 }
 
@@ -378,6 +456,7 @@ void trace_debug_print(void)
 
         spin_lock_init(&pc.pc_lock);
 
+        pc.pc_want_daemon_pages = 1;
         collect_pages(&pc);
         list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
                 char *p, *file, *fn;
@@ -412,10 +491,11 @@ int tracefile_dump_all_pages(char *filename)
         cfs_file_t *filp;
         struct trace_page *tage;
         struct trace_page *tmp;
-        CFS_DECL_MMSPACE;
         int rc;
 
-        down_write(&tracefile_sem);
+        CFS_DECL_MMSPACE;
+
+        tracefile_write_lock();
 
         filp = cfs_filp_open(filename,
                              O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600, &rc);
@@ -442,10 +522,11 @@ int tracefile_dump_all_pages(char *filename)
 
                 rc = cfs_filp_write(filp, cfs_page_address(tage->page),
                                     tage->used, cfs_filp_poff(filp));
-                if (rc != tage->used) {
+                if (rc != (int)tage->used) {
                         printk(KERN_WARNING "wanted to write %u but wrote "
                                "%d\n", tage->used, rc);
                         put_pages_back(&pc);
+                        LASSERT(list_empty(&pc.pc_pages));
                         break;
                 }
                 list_del(&tage->linkage);
@@ -458,7 +539,7 @@ int tracefile_dump_all_pages(char *filename)
  close:
         cfs_filp_close(filp);
  out:
-        up_write(&tracefile_sem);
+        tracefile_write_unlock();
         return rc;
 }
 
@@ -470,6 +551,7 @@ void trace_flush_pages(void)
 
         spin_lock_init(&pc.pc_lock);
 
+        pc.pc_want_daemon_pages = 1;
         collect_pages(&pc);
         list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
 
@@ -491,15 +573,17 @@ int trace_dk(struct file *file, const char *buffer, unsigned long count,
         if (name == NULL)
                 return -ENOMEM;
 
-        if (copy_from_user(name, buffer, count)) {
+        if (copy_from_user((void *)name, (void *)buffer, count)) {
                 rc = -EFAULT;
                 goto out;
         }
 
+#if !defined(__WINNT__)
         if (name[0] != '/') {
                 rc = -EINVAL;
                 goto out;
         }
+#endif
 
         /* be nice and strip out trailing '\n' */
         for (off = count ; off > 2 && isspace(name[off - 1]); off--)
@@ -522,12 +606,13 @@ static int tracefiled(void *arg)
         struct trace_page *tmp;
         struct ptldebug_header *hdr;
         cfs_file_t *filp;
-        CFS_DECL_MMSPACE;
         int rc;
 
+        CFS_DECL_MMSPACE;
+
         /* we're started late enough that we pick up init's fs context */
         /* this is so broken in uml?  what on earth is going on? */
-        libcfs_daemonize("ktracefiled");
+        cfs_daemonize("ktracefiled");
         reparent_to_init();
 
         spin_lock_init(&pc.pc_lock);
@@ -539,7 +624,8 @@ static int tracefiled(void *arg)
                 cfs_waitlink_init(&__wait);
                 cfs_waitq_add(&tctl->tctl_waitq, &__wait);
                 set_current_state(TASK_INTERRUPTIBLE);
-                cfs_waitq_timedwait(&__wait, cfs_time_seconds(1));
+                cfs_waitq_timedwait(&__wait, CFS_TASK_INTERRUPTIBLE, 
+                                    cfs_time_seconds(1));
                 cfs_waitq_del(&tctl->tctl_waitq, &__wait);
 
                 if (atomic_read(&tctl->tctl_shutdown))
@@ -551,16 +637,17 @@ static int tracefiled(void *arg)
                         continue;
 
                 filp = NULL;
-                down_read(&tracefile_sem);
+                tracefile_read_lock();
                 if (tracefile != NULL) {
                         filp = cfs_filp_open(tracefile, O_CREAT|O_RDWR|O_LARGEFILE,
                                         0600, &rc);
                         if (!(filp))
                                 printk("couldn't open %s: %d\n", tracefile, rc);
                 }
-                up_read(&tracefile_sem);
+                tracefile_read_unlock();
                 if (filp == NULL) {
                         put_pages_on_daemon_list(&pc);
+                        LASSERT(list_empty(&pc.pc_pages));
                         continue;
                 }
 
@@ -578,23 +665,25 @@ static int tracefiled(void *arg)
 
                         LASSERT_TAGE_INVARIANT(tage);
 
-                        if (f_pos >= tracefile_size)
+                        if (f_pos >= (off_t)tracefile_size)
                                 f_pos = 0;
                         else if (f_pos > cfs_filp_size(filp))
                                 f_pos = cfs_filp_size(filp);
 
                         rc = cfs_filp_write(filp, cfs_page_address(tage->page),
                                             tage->used, &f_pos);
-                        if (rc != tage->used) {
+                        if (rc != (int)tage->used) {
                                 printk(KERN_WARNING "wanted to write %u but "
                                        "wrote %d\n", tage->used, rc);
                                 put_pages_back(&pc);
+                                LASSERT(list_empty(&pc.pc_pages));
                         }
                 }
                 CFS_MMSPACE_CLOSE;
 
                 cfs_filp_close(filp);
                 put_pages_on_daemon_list(&pc);
+                LASSERT(list_empty(&pc.pc_pages));
         }
         complete(&tctl->tctl_stop);
         return 0;
@@ -645,14 +734,18 @@ int tracefile_init(void)
         struct trace_cpu_data *tcd;
         int i;
 
+        tracefile_lock_init();
         for (i = 0; i < NR_CPUS; i++) {
                 tcd = &trace_data[i].tcd;
                 CFS_INIT_LIST_HEAD(&tcd->tcd_pages);
+                CFS_INIT_LIST_HEAD(&tcd->tcd_stock_pages);
                 CFS_INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
                 tcd->tcd_cur_pages = 0;
+                tcd->tcd_cur_stock_pages = 0;
                 tcd->tcd_cur_daemon_pages = 0;
                 tcd->tcd_max_pages = TCD_MAX_PAGES;
                 tcd->tcd_shutting_down = 0;
+                tcd->tcd_cpu = i;
         }
         return 0;
 }
index 4e7fdde..79eeeb2 100644 (file)
@@ -3,6 +3,14 @@
 
 #include <libcfs/libcfs.h>
 
+/* trace file lock routines */
+
+void tracefile_lock_init(void);
+void tracefile_read_lock(void);
+void tracefile_read_unlock(void);
+void tracefile_write_lock(void);
+void tracefile_write_unlock(void);
+
 int tracefile_dump_all_pages(char *filename);
 void trace_debug_print(void);
 void trace_flush_pages(void);
@@ -25,34 +33,99 @@ int trace_dk(struct file *file, const char *buffer, unsigned long count,
 /*
  * Private declare for tracefile
  */
-#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT))
+#define TCD_MAX_PAGES (5 << (20 - CFS_PAGE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
 
 #define TRACEFILE_SIZE (500 << 20)
 
 union trace_data_union {
        struct trace_cpu_data {
+               /*
+                * pages with trace records not yet processed by tracefiled.
+                */
                struct list_head        tcd_pages;
+               /* number of pages on ->tcd_pages */
                unsigned long           tcd_cur_pages;
 
+               /*
+                * pages with trace records already processed by
+                * tracefiled. These pages are kept in memory, so that some
+                * portion of log can be written in the event of LBUG. This
+                * list is maintained in LRU order.
+                *
+                * Pages are moved to ->tcd_daemon_pages by tracefiled()
+                * (put_pages_on_daemon_list()). LRU pages from this list are
+                * discarded when list grows too large.
+                */
                struct list_head        tcd_daemon_pages;
+               /* number of pages on ->tcd_cur_daemon_pages */
                unsigned long           tcd_cur_daemon_pages;
 
+               /*
+                * Maximal number of pages allowed on ->tcd_pages and
+                * ->tcd_daemon_pages each. Always TCD_MAX_PAGES in current
+                * implementation.
+                */
                unsigned long           tcd_max_pages;
+
+               /*
+                * preallocated pages to write trace records into. Pages from
+                * ->tcd_stock_pages are moved to ->tcd_pages by
+                * portals_debug_msg().
+                *
+                * This list is necessary, because on some platforms it's
+                * impossible to perform efficient atomic page allocation in a
+                * non-blockable context.
+                *
+                * Such platforms fill ->tcd_stock_pages "on occasion", when
+                * tracing code is entered in blockable context.
+                *
+                * trace_get_tage_try() tries to get a page from
+                * ->tcd_stock_pages first and resorts to atomic page
+                * allocation only if this queue is empty. ->tcd_stock_pages
+                * is replenished when tracing code is entered in blocking
+                * context (darwin-tracefile.c:__trace_get_tcd()). We try to
+                * maintain TCD_STOCK_PAGES (40 by default) pages in this
+                * queue. Atomic allocation is only required if more than
+                * TCD_STOCK_PAGES pagesful are consumed by trace records all
+                * emitted in non-blocking contexts. Which is quite unlikely.
+                */
+               struct list_head        tcd_stock_pages;
+               /* number of pages on ->tcd_stock_pages */
+               unsigned long           tcd_cur_stock_pages;
+
                int                     tcd_shutting_down;
+               int                     tcd_cpu;
        } tcd;
        char __pad[SMP_CACHE_BYTES];
 };
 
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
 struct page_collection {
        struct list_head        pc_pages;
+       /*
+        * spin-lock protecting ->pc_pages. It is taken by smp_call_function()
+        * call-back functions. XXX nikita: Which is horrible: all processors
+        * receive NMI at the same time only to be serialized by this
+        * lock. Probably ->pc_pages should be replaced with an array of
+        * NR_CPUS elements accessed locklessly.
+        */
        spinlock_t              pc_lock;
+       /*
+        * if this flag is set, collect_pages() will spill both
+        * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise,
+        * only ->tcd_pages are spilled.
+        */
        int                     pc_want_daemon_pages;
 };
 
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
 struct tracefiled_ctl {
        struct completion       tctl_start;
        struct completion       tctl_stop;
-       cfs_waitq_t             tctl_waitq; 
+       cfs_waitq_t             tctl_waitq;
        pid_t                   tctl_pid;
        atomic_t                tctl_shutdown;
 };
@@ -60,6 +133,8 @@ struct tracefiled_ctl {
 /*
  * small data-structure for each page owned by tracefiled.
  */
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
 struct trace_page {
        /*
         * page itself
@@ -87,10 +162,15 @@ extern void print_to_console(struct ptldebug_header *hdr, int mask,
                             char *buf, int len, char *file, const char *fn);
 extern struct trace_cpu_data * __trace_get_tcd (unsigned long *flags);
 extern void __trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags);
+int trace_refill_stock(struct trace_cpu_data *tcd, int gfp,
+                      struct list_head *stock);
+
 
 #define trace_get_tcd(f)       __trace_get_tcd(&(f))
 #define trace_put_tcd(t, f)    __trace_put_tcd(t, f)
 
+int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage);
+
 #endif /* LUSTRE_TRACEFILE_PRIVATE */
 
 #endif /* __PORTALS_TRACEFILE_H */
index 99dcd7f..a1a6779 100644 (file)
 
 /*
  * liblustre is single-threaded, so most "synchronization" APIs are trivial.
+ *
+ * XXX Liang: There are several branches share lnet with b_hd_newconfig,
+ * if we define lock APIs at here, there will be conflict with liblustre
+ * in other branches.
  */
 
 #ifndef __KERNEL__
 
+#include <stdlib.h>
+#include <libcfs/libcfs.h>
 /*
  * Optional debugging (magic stamping and checking ownership) can be added.
  */
 
+#if 0
 /*
  * spin_lock
  *
@@ -89,19 +96,6 @@ void spin_unlock_bh(spinlock_t *lock)
         (void)lock;
 }
 
-void spin_lock_irqsave(spinlock_t *lock, unsigned long flags)
-{
-        LASSERT(lock != NULL);
-        (void)lock;
-}
-
-void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
-{
-        LASSERT(lock != NULL);
-        (void)lock;
-}
-
-
 /*
  * Semaphore
  *
@@ -227,6 +221,7 @@ void up_write(struct rw_semaphore *s)
         LASSERT(s != NULL);
         (void)s;
 }
+#endif
 
 /* !__KERNEL__ */
 #endif
index ddc994c..a9cc1db 100644 (file)
@@ -46,6 +46,7 @@
 
 #include <libcfs/libcfs.h>
 
+#define LASSERT(a)      do {} while (0)
 /*
  * Sleep channel. No-op implementation.
  */
@@ -98,6 +99,7 @@ int cfs_waitq_active(struct cfs_waitq *waitq)
 {
         LASSERT(waitq != NULL);
         (void)waitq;
+        return 0;
 }
 
 void cfs_waitq_signal(struct cfs_waitq *waitq)
@@ -112,7 +114,7 @@ void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr)
         (void)waitq;
 }
 
-void cfs_waitq_broadcast(struct cfs_waitq *waitq)
+void cfs_waitq_broadcast(struct cfs_waitq *waitq, int state)
 {
         LASSERT(waitq != NULL);
         (void)waitq;
@@ -124,27 +126,24 @@ void cfs_waitq_wait(struct cfs_waitlink *link)
         (void)link;
 }
 
-int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int64_t timeout)
+int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int state, int64_t timeout)
 {
         LASSERT(link != NULL);
         (void)link;
+        return 0;
 }
 
 /*
  * Allocator
  */
 
-cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order)
+cfs_page_t *cfs_alloc_page(unsigned int flags)
 {
         cfs_page_t *pg = malloc(sizeof(*pg));
 
         if (!pg)
                 return NULL;
-#if 0 //#ifdef MAP_ANONYMOUS
-        pg->addr = mmap(0, PAGE_SIZE << order, PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
-#else
-        pg->addr = malloc(PAGE_SIZE << order);
-#endif
+        pg->addr = malloc(PAGE_SIZE);
 
         if (!pg->addr) {
                 free(pg);
@@ -153,26 +152,12 @@ cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order)
         return pg;
 }
 
-void cfs_free_pages(struct page *pg, int what)
+void cfs_free_page(cfs_page_t *pg)
 {
-#if 0 //#ifdef MAP_ANONYMOUS
-        munmap(pg->addr, PAGE_SIZE);
-#else
         free(pg->addr);
-#endif
         free(pg);
 }
 
-cfs_page_t *cfs_alloc_page(unsigned int flags)
-{
-        return cfs_alloc_pages(flags, 0);
-}
-
-void cfs_free_page(cfs_page_t *pg, int what)
-{
-        cfs_free_page(pg, what);
-}
-
 void *cfs_page_address(cfs_page_t *pg)
 {
         return pg->addr;
@@ -188,40 +173,11 @@ void cfs_kunmap(cfs_page_t *pg)
 }
 
 /*
- * Memory allocator
- */
-void *cfs_alloc(size_t nr_bytes, u_int32_t flags)
-{
-        void *result;
-
-        result = malloc(nr_bytes);
-        if (result != NULL && (flags & CFS_ALLOC_ZERO))
-               memset(result, 0, nr_bytes);
-}
-
-void cfs_free(void *addr)
-{
-        free(addr);
-}
-
-void *cfs_alloc_large(size_t nr_bytes)
-{
-        return cfs_alloc(nr_bytes, 0);
-}
-
-void  cfs_free_large(void *addr)
-{
-        return cfs_free(addr);
-}
-
-/*
  * SLAB allocator
  */
 
 cfs_mem_cache_t *
-cfs_mem_cache_create(const char *, size_t, size_t, unsigned long,
-                     void (*)(void *, cfs_mem_cache_t *, unsigned long),
-                     void (*)(void *, cfs_mem_cache_t *, unsigned long))
+cfs_mem_cache_create(const char *name, size_t objsize, size_t off, unsigned long flags)
 {
         cfs_mem_cache_t *c;
 
@@ -243,7 +199,7 @@ int cfs_mem_cache_destroy(cfs_mem_cache_t *c)
 
 void *cfs_mem_cache_alloc(cfs_mem_cache_t *c, int gfp)
 {
-        return cfs_alloc(c, gfp);
+        return cfs_alloc(c->size, gfp);
 }
 
 void cfs_mem_cache_free(cfs_mem_cache_t *c, void *addr)
@@ -251,6 +207,102 @@ void cfs_mem_cache_free(cfs_mem_cache_t *c, void *addr)
         cfs_free(addr);
 }
 
+/*
+ * This uses user-visible declarations from <linux/kdev_t.h>
+ */
+#ifdef __LINUX__
+#include <linux/kdev_t.h>
+#endif
+
+#ifndef MKDEV
+
+#define MAJOR(dev)      ((dev)>>8)
+#define MINOR(dev)      ((dev) & 0xff)
+#define MKDEV(ma,mi)    ((ma)<<8 | (mi))
+
+#endif
+
+cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor)
+{
+        return MKDEV(major, minor);
+}
+
+cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev)
+{
+        return MAJOR(rdev);
+}
+
+cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev)
+{
+        return MINOR(rdev);
+}
+
+void cfs_enter_debugger(void)
+{
+        /*
+         * nothing for now.
+         */
+}
+
+void cfs_daemonize(char *str)
+{
+        return;
+}
+
+void cfs_block_allsigs()
+{
+}
+
+cfs_sigset_t cfs_get_blocked_sigs()
+{
+        cfs_sigset_t    s;
+        memset(&s, 0, sizeof(s));
+        return s;
+}
+
+void cfs_block_sigs(cfs_sigset_t blocks)
+{
+}
+
+#ifdef __LINUX__
+
+/*
+ * In glibc (NOT in Linux, so check above is not right), implement
+ * stack-back-tracing through backtrace() function.
+ */
+#include <execinfo.h>
+
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{
+        backtrace(trace->frame, sizeof_array(trace->frame));
+}
+
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+        if (0 <= frame_no && frame_no < sizeof_array(trace->frame))
+                return trace->frame[frame_no];
+        else
+                return NULL;
+}
+
+#else
+
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{}
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+        return NULL;
+}
+
+/* __LINUX__ */
+#endif
+
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+        CEMERG("LBUG\n");
+        abort();
+}
+
 
 /* !__KERNEL__ */
 #endif
index 143ce43..dea8fee 100644 (file)
 
 #include <libcfs/kp30.h>
 #include <libcfs/libcfs.h>
-#include <libcfs/linux/portals_compat25.h>
-
-
 
 struct lc_watchdog {
-        struct timer_list lcw_timer; /* kernel timer */
+        cfs_timer_t       lcw_timer; /* kernel timer */
         struct list_head  lcw_list;
         struct timeval    lcw_last_touched;
-        struct task_struct *lcw_task;
+        cfs_task_t       *lcw_task;
 
         void (*lcw_callback)(struct lc_watchdog *,
-                            struct task_struct *,
+                            cfs_task_t *,
                             void *data);
         void *lcw_data;
 
@@ -49,6 +46,7 @@ struct lc_watchdog {
         } lcw_state;
 };
 
+#ifdef WITH_WATCHDOG
 /*
  * The dispatcher will complete lcw_start_completion when it starts,
  * and lcw_stop_completion when it exits.
@@ -82,9 +80,9 @@ static spinlock_t       lcw_pending_timers_lock = SPIN_LOCK_UNLOCKED;
 static struct list_head lcw_pending_timers = \
         LIST_HEAD_INIT(lcw_pending_timers);
 
-static struct task_struct *lcw_lookup_task(struct lc_watchdog *lcw)
+static cfs_task_t *lcw_lookup_task(struct lc_watchdog *lcw)
 {
-        struct task_struct *tsk;
+        cfs_task_t *tsk;
         unsigned long flags;
         ENTRY;
 
@@ -118,8 +116,9 @@ static void lcw_cb(unsigned long data)
 
         lcw->lcw_state = LC_WATCHDOG_EXPIRED;
 
-        CWARN("Watchdog triggered for pid %d: it was inactive for %dms\n",
-              lcw->lcw_pid, (lcw->lcw_time * 1000) / HZ);
+        CWARN("Watchdog triggered for pid %d: it was inactive for %ldms\n",
+              lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time) * 1000);
+
 
         tsk = lcw_lookup_task(lcw);
         if (tsk != NULL)
@@ -158,7 +157,7 @@ static int lcw_dispatch_main(void *data)
 
         ENTRY;
 
-        libcfs_daemonize("lc_watchdogd");
+        cfs_daemonize("lc_watchdogd");
 
         SIGNAL_MASK_LOCK(current, flags);
         sigfillset(&current->blocked);
@@ -271,7 +270,7 @@ struct lc_watchdog *lc_watchdog_add(int timeout_ms,
 
         lcw->lcw_task = cfs_current();
         lcw->lcw_pid = cfs_curproc_pid();
-        lcw->lcw_time = (timeout_ms * HZ) / 1000;
+        lcw->lcw_time = cfs_time_seconds(timeout_ms / 1000);
         lcw->lcw_callback = callback ? callback : lc_watchdog_dumplog;
         lcw->lcw_data = data;
         lcw->lcw_state = LC_WATCHDOG_DISABLED;
@@ -298,26 +297,19 @@ struct lc_watchdog *lc_watchdog_add(int timeout_ms,
 }
 EXPORT_SYMBOL(lc_watchdog_add);
 
-static long
-timeval_sub(struct timeval *large, struct timeval *small)
-{
-        return (large->tv_sec - small->tv_sec) * 1000000 +
-                (large->tv_usec - small->tv_usec);
-}
-
 static void lcw_update_time(struct lc_watchdog *lcw, const char *message)
 {
         struct timeval newtime;
-        unsigned long timediff;
+        struct timeval timediff;
 
         do_gettimeofday(&newtime);
         if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) {
-                timediff = timeval_sub(&newtime, &lcw->lcw_last_touched);
+                cfs_timeval_sub(&newtime, &lcw->lcw_last_touched, &timediff);
                 CWARN("Expired watchdog for pid %d %s after %lu.%.4lus\n",
                       lcw->lcw_pid,
                       message,
-                      timediff / 1000000,
-                      (timediff % 1000000) / 100);
+                      timediff.tv_sec,
+                      timediff.tv_usec / 100);
         }
         lcw->lcw_last_touched = newtime;
 }
@@ -400,3 +392,34 @@ void lc_watchdog_dumplog(struct lc_watchdog *lcw,
         libcfs_debug_dumplog_internal((void *)(long)tsk->pid);
 }
 EXPORT_SYMBOL(lc_watchdog_dumplog);
+
+#else   /* !defined(WITH_WATCHDOG) */
+
+struct lc_watchdog *lc_watchdog_add(int timeout_ms,
+                                    void (*callback)(struct lc_watchdog *,
+                                                     cfs_task_t *,
+                                                     void *),
+                                    void *data)
+{
+        static struct lc_watchdog      watchdog;
+        return &watchdog;
+}
+EXPORT_SYMBOL(lc_watchdog_add);
+
+void lc_watchdog_touch(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_touch);
+
+void lc_watchdog_disable(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_disable);
+
+void lc_watchdog_delete(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_delete);
+
+#endif
+
diff --git a/lnet/libcfs/winnt/winnt-curproc.c b/lnet/libcfs/winnt/winnt-curproc.c
new file mode 100644 (file)
index 0000000..09f6d83
--- /dev/null
@@ -0,0 +1,453 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ *
+ * Impletion of winnt curproc routines.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+
+/*
+ * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h)
+ * for Linux kernel.
+ */
+
+cfs_task_t this_task = 
+    { 0, 0, 0, 0, 0, 0, 0, 
+      0, 0, 0, 0,  1, 0,  0, 0, 0,
+      "sysetm\0" };
+
+
+uid_t  cfs_curproc_uid(void)
+{
+    return this_task.uid;
+}
+
+gid_t  cfs_curproc_gid(void)
+{
+    return this_task.gid;
+}
+
+uid_t  cfs_curproc_fsuid(void)
+{
+    return this_task.fsuid;
+}
+
+gid_t cfs_curproc_fsgid(void)
+{
+    return this_task.fsgid;
+}
+
+pid_t cfs_curproc_pid(void)
+{
+    return cfs_current()->pid;
+}
+
+int cfs_curproc_groups_nr(void)
+{
+    return this_task.ngroups;
+}
+
+void cfs_curproc_groups_dump(gid_t *array, int size)
+{
+    LASSERT(size <= NGROUPS);
+    size = min_t(int, size, this_task.ngroups);
+    memcpy(array, this_task.groups, size * sizeof(__u32));
+}
+
+int cfs_curproc_is_in_groups(gid_t gid)
+{
+    return in_group_p(gid);
+}
+
+mode_t cfs_curproc_umask(void)
+{
+    return this_task.umask;
+}
+
+char  *cfs_curproc_comm(void)
+{
+    return this_task.comm;
+}
+
+cfs_kernel_cap_t cfs_curproc_cap_get(void)
+{
+    return this_task.cap_effective;
+}
+
+void cfs_curproc_cap_set(cfs_kernel_cap_t cap)
+{
+    this_task.cap_effective = cap;
+}
+
+
+/*
+ * Implementation of linux task management routines
+ */
+
+
+/* global of the task manager structure */
+
+TASK_MAN TaskMan;
+
+
+/*
+ *  task slot routiens
+ */
+
+PTASK_SLOT
+alloc_task_slot()
+{
+    PTASK_SLOT task = NULL;
+
+    if (TaskMan.slab) {
+        task = cfs_mem_cache_alloc(TaskMan.slab, 0);
+    } else {
+        task = cfs_alloc(sizeof(TASK_SLOT), 0);
+    }
+
+    return task;
+}
+
+void
+init_task_slot(PTASK_SLOT task)
+{
+    memset(task, 0, sizeof(TASK_SLOT));
+    task->Magic = TASKSLT_MAGIC;
+    task->task  = this_task;
+    task->task.pid = (pid_t)PsGetCurrentThreadId();
+    cfs_init_event(&task->Event, TRUE, FALSE);
+}
+
+
+void
+cleanup_task_slot(PTASK_SLOT task)
+{
+    if (TaskMan.slab) {
+        cfs_mem_cache_free(TaskMan.slab, task);
+    } else {
+        cfs_free(task);
+    }
+}
+
+/*
+ *  task manager related routines
+ */
+
+VOID
+task_manager_notify(
+    IN HANDLE   ProcessId,
+    IN HANDLE   ThreadId,
+    IN BOOLEAN  Create
+    )
+{
+    PLIST_ENTRY ListEntry = NULL; 
+    PTASK_SLOT  TaskSlot  = NULL;
+
+    spin_lock(&(TaskMan.Lock));
+
+    ListEntry = TaskMan.TaskList.Flink;
+
+    while (ListEntry != (&(TaskMan.TaskList))) {
+
+        TaskSlot = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link);
+
+        if (TaskSlot->Pid == ProcessId && TaskSlot->Tid == ThreadId) {
+
+            if (Create) {
+/*
+                DbgPrint("task_manager_notify: Pid=%xh Tid %xh resued (TaskSlot->Tet = %xh)...\n",
+                         ProcessId, ThreadId, TaskSlot->Tet);
+*/
+            } else {
+                /* remove the taskslot */
+                RemoveEntryList(&(TaskSlot->Link));
+                TaskMan.NumOfTasks--;
+
+                /* now free the task slot */
+                cleanup_task_slot(TaskSlot);
+            }
+        }
+
+        ListEntry = ListEntry->Flink;
+    }
+
+    spin_unlock(&(TaskMan.Lock));
+}
+
+int
+init_task_manager()
+{
+    NTSTATUS    status;
+
+    /* initialize the content and magic */
+    memset(&TaskMan, 0, sizeof(TASK_MAN));
+    TaskMan.Magic = TASKMAN_MAGIC;
+
+    /* initialize the spinlock protection */
+    spin_lock_init(&TaskMan.Lock);
+
+    /* create slab memory cache */
+    TaskMan.slab = cfs_mem_cache_create(
+        "TSLT", sizeof(TASK_SLOT), 0, 0);
+
+    /* intialize the list header */
+    InitializeListHead(&(TaskMan.TaskList));
+
+    /* set the thread creation/destruction notify routine */
+    status = PsSetCreateThreadNotifyRoutine(task_manager_notify);
+
+    if (!NT_SUCCESS(status)) {
+        cfs_enter_debugger();
+    }
+
+    return 0;
+}
+
+void
+cleanup_task_manager()
+{
+    PLIST_ENTRY ListEntry = NULL; 
+    PTASK_SLOT  TaskSlot  = NULL;
+
+    /* we must stay in system since we succeed to register the
+       CreateThreadNotifyRoutine: task_manager_notify */
+    cfs_enter_debugger();
+
+
+    /* cleanup all the taskslots attached to the list */
+    spin_lock(&(TaskMan.Lock));
+
+    while (!IsListEmpty(&(TaskMan.TaskList))) {
+
+        ListEntry = TaskMan.TaskList.Flink;
+        TaskSlot = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link);
+
+        RemoveEntryList(ListEntry);
+        cleanup_task_slot(TaskSlot);
+    }
+
+    spin_unlock(&TaskMan.Lock);
+
+    /* destroy the taskslot cache slab */
+    cfs_mem_cache_destroy(TaskMan.slab);
+    memset(&TaskMan, 0, sizeof(TASK_MAN));
+}
+
+
+/*
+ * schedule routines (task slot list)
+ */
+
+
+/*
+ * cfs_current
+ *   Return the cfs_task_t for the calling thread.  The TASK_SLOTs are
+ *   kept on TaskMan.TaskList sorted ascending by (Pid, Tid); if no
+ *   slot exists for the current (Pid, Tid) a new one is allocated and
+ *   inserted at the sorted position found during the search.
+ *
+ * Return Value:
+ *   cfs_task_t * - task of the current thread, or NULL if a new
+ *                  TASK_SLOT could not be allocated.
+ *
+ * NOTES:
+ *   The whole lookup/insert/verify sequence runs under TaskMan.Lock.
+ */
+cfs_task_t *
+cfs_current()
+{
+    HANDLE      Pid = PsGetCurrentProcessId();
+    HANDLE      Tid = PsGetCurrentThreadId();
+    PETHREAD    Tet = PsGetCurrentThread();
+
+    PLIST_ENTRY ListEntry = NULL; 
+    PTASK_SLOT  TaskSlot  = NULL;
+
+    spin_lock(&(TaskMan.Lock));
+
+    ListEntry = TaskMan.TaskList.Flink;
+
+    /* search the sorted list; on exit either TaskSlot points at the
+       matching slot, or TaskSlot is NULL and ListEntry is the sorted
+       insertion point (possibly the list head). */
+    while (ListEntry != (&(TaskMan.TaskList))) {
+
+        TaskSlot = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link);
+
+        if (TaskSlot->Pid == Pid && TaskSlot->Tid == Tid) {
+            if (TaskSlot->Tet != Tet) {
+
+/*
+                DbgPrint("cfs_current: Pid=%xh Tid %xh Tet = %xh reused (TaskSlot->Tet = %xh)...\n",
+                         Pid, Tid, Tet, TaskSlot->Tet);
+*/
+                //
+                // The old thread has already exited.  This must be a
+                // new thread that was assigned the same Tid as the
+                // previous one, so rebind the slot to it.
+                //
+
+                TaskSlot->Tet = Tet;
+            }
+            break;
+
+        } else {
+
+            /* list is sorted by (Pid, Tid): once the current entry is
+               greater, the search can stop at the insertion point */
+            if ((ULONG)TaskSlot->Pid > (ULONG)Pid) {
+                TaskSlot = NULL;
+                break;
+            } else if ((ULONG)TaskSlot->Pid == (ULONG)Pid) {
+                if ((ULONG)TaskSlot->Tid > (ULONG)Tid) {
+                    TaskSlot = NULL;
+                    break;
+                }
+            }
+
+            TaskSlot =  NULL;
+        }
+
+        ListEntry = ListEntry->Flink;
+    }
+
+    if (!TaskSlot) {
+
+        TaskSlot = alloc_task_slot();
+
+        if (!TaskSlot) {
+            cfs_enter_debugger();
+            goto errorout;
+        }
+
+        init_task_slot(TaskSlot);
+
+        TaskSlot->Pid = Pid;
+        TaskSlot->Tid = Tid;
+        TaskSlot->Tet = Tet;
+
+        if (ListEntry == (&(TaskMan.TaskList))) {
+            //
+            // Empty list, or every entry was smaller: append at the
+            // tail.
+            //
+            InsertTailList(&(TaskMan.TaskList), &(TaskSlot->Link));
+        } else {
+            //
+            // ListEntry is the first entry greater than (Pid, Tid):
+            // insert the new slot just before it to keep the order.
+            //
+            InsertHeadList(ListEntry->Blink, &(TaskSlot->Link));
+        }
+
+        TaskMan.NumOfTasks++;
+    }
+
+    //
+    // Sanity check: verify the task slots are still sorted ascending
+    // by (Pid, Tid); break into the debugger on any inversion.
+    //
+
+    {
+        PTASK_SLOT  Prev = NULL, Curr = NULL;
+        
+        ListEntry = TaskMan.TaskList.Flink;
+
+        while (ListEntry != (&(TaskMan.TaskList))) {
+
+            Curr = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link);
+            ListEntry = ListEntry->Flink;
+
+            if (Prev) {
+                if ((ULONG)Prev->Pid > (ULONG)Curr->Pid) {
+                    cfs_enter_debugger();
+                } else if ((ULONG)Prev->Pid == (ULONG)Curr->Pid) {
+                    if ((ULONG)Prev->Tid > (ULONG)Curr->Tid) {
+                        cfs_enter_debugger();
+                    }
+                }
+            }
+
+            Prev = Curr;
+        }
+    }
+
+errorout:
+
+    spin_unlock(&(TaskMan.Lock));
+
+    if (!TaskSlot) {
+        cfs_enter_debugger();
+        return NULL;
+    }
+
+    return (&(TaskSlot->task));
+}
+
+/*
+ * schedule_timeout
+ *   Block the current thread on its TASK_SLOT event for the given
+ *   time.
+ *
+ * Arguments:
+ *   time - timeout value; MAX_SCHEDULE_TIMEOUT is mapped to 0 before
+ *          the wait (presumably 0 means "no timeout" to
+ *          cfs_wait_event - TODO confirm)
+ *
+ * Return Value:
+ *   int - nonzero if cfs_wait_event returned nonzero (event
+ *         signalled), 0 otherwise or if no task slot was available.
+ */
+int
+schedule_timeout(int64_t time)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        cfs_enter_debugger();
+        return 0;
+    }
+
+    /* recover the enclosing TASK_SLOT from the embedded task and
+       sanity-check its magic */
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    cfs_assert(slot->Magic == TASKSLT_MAGIC);
+
+    if (time == MAX_SCHEDULE_TIMEOUT) {
+        time = 0;
+    }
+
+    return (cfs_wait_event(&(slot->Event), time) != 0);
+}
+
+/*
+ * schedule
+ *   Yield the current thread: equivalent to schedule_timeout(0).
+ */
+int
+schedule()
+{
+    return schedule_timeout(0);
+}
+
+/*
+ * wake_up_process
+ *   Signal the event of the given task's TASK_SLOT, waking a thread
+ *   blocked in schedule_timeout()/schedule().
+ *
+ * Arguments:
+ *   task - task to wake (must be embedded in a valid TASK_SLOT)
+ *
+ * Return Value:
+ *   int - TRUE on success, 0 if task is NULL.
+ */
+int
+wake_up_process(
+    cfs_task_t * task
+    )
+{
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        cfs_enter_debugger();
+        return 0;
+    }
+
+    /* recover the enclosing TASK_SLOT and sanity-check its magic */
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    cfs_assert(slot->Magic == TASKSLT_MAGIC);
+
+    cfs_wake_event(&(slot->Event));
+
+    return TRUE;
+}
+
+/*
+ * sleep_on
+ *   Block the caller on the given wait queue: add a wait link, wait
+ *   interruptibly until woken, then remove the link.
+ */
+void
+sleep_on(
+    cfs_waitq_t *waitq
+    )
+{
+       cfs_waitlink_t link;
+       
+       cfs_waitlink_init(&link);
+       cfs_waitq_add(waitq, &link);
+       cfs_waitq_wait(&link, CFS_TASK_INTERRUPTIBLE);
+       cfs_waitq_del(waitq, &link);
+}
+
+/* export the cfs_curproc_* accessors for use by other kernel modules */
+EXPORT_SYMBOL(cfs_curproc_uid);
+EXPORT_SYMBOL(cfs_curproc_pid);
+EXPORT_SYMBOL(cfs_curproc_gid);
+EXPORT_SYMBOL(cfs_curproc_fsuid);
+EXPORT_SYMBOL(cfs_curproc_fsgid);
+EXPORT_SYMBOL(cfs_curproc_umask);
+EXPORT_SYMBOL(cfs_curproc_comm);
+EXPORT_SYMBOL(cfs_curproc_groups_nr);
+EXPORT_SYMBOL(cfs_curproc_groups_dump);
+EXPORT_SYMBOL(cfs_curproc_is_in_groups);
+EXPORT_SYMBOL(cfs_curproc_cap_get);
+EXPORT_SYMBOL(cfs_curproc_cap_set);
diff --git a/lnet/libcfs/winnt/winnt-debug.c b/lnet/libcfs/winnt/winnt-debug.c
new file mode 100644 (file)
index 0000000..237ba24
--- /dev/null
@@ -0,0 +1,1056 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <libcfs/kp30.h>
+#include <libcfs/libcfs.h>
+#include "tracefile.h"
+
+/* stack dumping is not implemented on winnt: no-op (tsk is ignored) */
+void portals_debug_dumpstack(cfs_task_t *tsk)
+{ 
+       return;
+}
+
+/* return the task of the calling thread (wrapper around cfs_current) */
+cfs_task_t *portals_current(void)
+{ 
+       return cfs_current();
+}
+
+/* no arch-specific debug setup needed on winnt; bufsize is ignored */
+int portals_arch_debug_init(unsigned long bufsize)
+{
+       return 0;
+}
+
+/* no arch-specific debug teardown needed on winnt */
+int portals_arch_debug_cleanup(void)
+{
+       return 0;
+}
+
+/* LBUG upcall is a no-op on winnt; all arguments are ignored */
+void portals_run_lbug_upcall(char *file, const char *fn, const int line)
+{
+}
+
+/*
+ * lbug_with_loc
+ *   Report an LBUG (Lustre bug) for the current pid/thread.
+ *   NOTE(review): file/func/line are currently unused and the dumplog/
+ *   upcall calls are commented out, so this only emits the CEMERG
+ *   message - it does not stop the thread.
+ */
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+        CEMERG("LBUG: pid: %u thread: %#x\n",
+              (unsigned)cfs_curproc_pid(), (unsigned)PsGetCurrentThread());
+        // portals_debug_dumplog();
+        // portals_run_lbug_upcall(file, func, line);
+}
+
+#if KS_DEBUG
+
+/*
+ * Definitions
+ */
+
+LONG  KsDebugLevel = 0x5;
+
+
+/*
+ * Routines
+ */
+
+
+/*
+ * KsNtStatusToString
+ *   Get the error message for a specified nt status
+ *
+ * Arguments:
+ *   Status - nt status code
+ *
+ * Return Value:
+ *   PUCHAR - message string for the status code
+ *
+ * NOTES: 
+ *   N/A
+ */
+
+PUCHAR
+KsNtStatusToString (IN NTSTATUS Status)
+{
+    switch (Status) {
+
+    case 0x00000000: return "STATUS_SUCCESS";
+    case 0x00000001: return "STATUS_WAIT_1";
+    case 0x00000002: return "STATUS_WAIT_2";
+    case 0x00000003: return "STATUS_WAIT_3";
+    case 0x0000003F: return "STATUS_WAIT_63";
+    case 0x00000080: return "STATUS_ABANDONED_WAIT_0";
+    case 0x000000BF: return "STATUS_ABANDONED_WAIT_63";
+    case 0x000000C0: return "STATUS_USER_APC";
+    case 0x00000100: return "STATUS_KERNEL_APC";
+    case 0x00000101: return "STATUS_ALERTED";
+    case 0x00000102: return "STATUS_TIMEOUT";
+    case 0x00000103: return "STATUS_PENDING";
+    case 0x00000104: return "STATUS_REPARSE";
+    case 0x00000105: return "STATUS_MORE_ENTRIES";
+    case 0x00000106: return "STATUS_NOT_ALL_ASSIGNED";
+    case 0x00000107: return "STATUS_SOME_NOT_MAPPED";
+    case 0x00000108: return "STATUS_OPLOCK_BREAK_IN_PROGRESS";
+    case 0x00000109: return "STATUS_VOLUME_MOUNTED";
+    case 0x0000010A: return "STATUS_RXACT_COMMITTED";
+    case 0x0000010B: return "STATUS_NOTIFY_CLEANUP";
+    case 0x0000010C: return "STATUS_NOTIFY_ENUM_DIR";
+    case 0x0000010D: return "STATUS_NO_QUOTAS_FOR_ACCOUNT";
+    case 0x0000010E: return "STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED";
+    case 0x00000110: return "STATUS_PAGE_FAULT_TRANSITION";
+    case 0x00000111: return "STATUS_PAGE_FAULT_DEMAND_ZERO";
+    case 0x00000112: return "STATUS_PAGE_FAULT_COPY_ON_WRITE";
+    case 0x00000113: return "STATUS_PAGE_FAULT_GUARD_PAGE";
+    case 0x00000114: return "STATUS_PAGE_FAULT_PAGING_FILE";
+    case 0x00000115: return "STATUS_CACHE_PAGE_LOCKED";
+    case 0x00000116: return "STATUS_CRASH_DUMP";
+    case 0x00000117: return "STATUS_BUFFER_ALL_ZEROS";
+    case 0x00000118: return "STATUS_REPARSE_OBJECT";
+    case 0x00000119: return "STATUS_RESOURCE_REQUIREMENTS_CHANGED";
+    case 0x00000120: return "STATUS_TRANSLATION_COMPLETE";
+    case 0x00000121: return "STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY";
+    case 0x00010001: return "DBG_EXCEPTION_HANDLED";
+    case 0x00010002: return "DBG_CONTINUE";
+    case 0x40000000: return "STATUS_OBJECT_NAME_EXISTS";
+    case 0x40000001: return "STATUS_THREAD_WAS_SUSPENDED";
+    case 0x40000002: return "STATUS_WORKING_SET_LIMIT_RANGE";
+    case 0x40000003: return "STATUS_IMAGE_NOT_AT_BASE";
+    case 0x40000004: return "STATUS_RXACT_STATE_CREATED";
+    case 0x40000005: return "STATUS_SEGMENT_NOTIFICATION";
+    case 0x40000006: return "STATUS_LOCAL_USER_SESSION_KEY";
+    case 0x40000007: return "STATUS_BAD_CURRENT_DIRECTORY";
+    case 0x40000008: return "STATUS_SERIAL_MORE_WRITES";
+    case 0x40000009: return "STATUS_REGISTRY_RECOVERED";
+    case 0x4000000A: return "STATUS_FT_READ_RECOVERY_FROM_BACKUP";
+    case 0x4000000B: return "STATUS_FT_WRITE_RECOVERY";
+    case 0x4000000C: return "STATUS_SERIAL_COUNTER_TIMEOUT";
+    case 0x4000000D: return "STATUS_NULL_LM_PASSWORD";
+    case 0x4000000E: return "STATUS_IMAGE_MACHINE_TYPE_MISMATCH";
+    case 0x4000000F: return "STATUS_RECEIVE_PARTIAL";
+    case 0x40000010: return "STATUS_RECEIVE_EXPEDITED";
+    case 0x40000011: return "STATUS_RECEIVE_PARTIAL_EXPEDITED";
+    case 0x40000012: return "STATUS_EVENT_DONE";
+    case 0x40000013: return "STATUS_EVENT_PENDING";
+    case 0x40000014: return "STATUS_CHECKING_FILE_SYSTEM";
+    case 0x40000015: return "STATUS_FATAL_APP_EXIT";
+    case 0x40000016: return "STATUS_PREDEFINED_HANDLE";
+    case 0x40000017: return "STATUS_WAS_UNLOCKED";
+    case 0x40000018: return "STATUS_SERVICE_NOTIFICATION";
+    case 0x40000019: return "STATUS_WAS_LOCKED";
+    case 0x4000001A: return "STATUS_LOG_HARD_ERROR";
+    case 0x4000001B: return "STATUS_ALREADY_WIN32";
+    case 0x4000001C: return "STATUS_WX86_UNSIMULATE";
+    case 0x4000001D: return "STATUS_WX86_CONTINUE";
+    case 0x4000001E: return "STATUS_WX86_SINGLE_STEP";
+    case 0x4000001F: return "STATUS_WX86_BREAKPOINT";
+    case 0x40000020: return "STATUS_WX86_EXCEPTION_CONTINUE";
+    case 0x40000021: return "STATUS_WX86_EXCEPTION_LASTCHANCE";
+    case 0x40000022: return "STATUS_WX86_EXCEPTION_CHAIN";
+    case 0x40000023: return "STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE";
+    case 0x40000024: return "STATUS_NO_YIELD_PERFORMED";
+    case 0x40000025: return "STATUS_TIMER_RESUME_IGNORED";
+    case 0x40000026: return "STATUS_ARBITRATION_UNHANDLED";
+    case 0x40000027: return "STATUS_CARDBUS_NOT_SUPPORTED";
+    case 0x40000028: return "STATUS_WX86_CREATEWX86TIB";
+    case 0x40000029: return "STATUS_MP_PROCESSOR_MISMATCH";
+    case 0x40010001: return "DBG_REPLY_LATER";
+    case 0x40010002: return "DBG_UNABLE_TO_PROVIDE_HANDLE";
+    case 0x40010003: return "DBG_TERMINATE_THREAD";
+    case 0x40010004: return "DBG_TERMINATE_PROCESS";
+    case 0x40010005: return "DBG_CONTROL_C";
+    case 0x40010006: return "DBG_PRINTEXCEPTION_C";
+    case 0x40010007: return "DBG_RIPEXCEPTION";
+    case 0x40010008: return "DBG_CONTROL_BREAK";
+    case 0x80000001: return "STATUS_GUARD_PAGE_VIOLATION";
+    case 0x80000002: return "STATUS_DATATYPE_MISALIGNMENT";
+    case 0x80000003: return "STATUS_BREAKPOINT";
+    case 0x80000004: return "STATUS_SINGLE_STEP";
+    case 0x80000005: return "STATUS_BUFFER_OVERFLOW";
+    case 0x80000006: return "STATUS_NO_MORE_FILES";
+    case 0x80000007: return "STATUS_WAKE_SYSTEM_DEBUGGER";
+    case 0x8000000A: return "STATUS_HANDLES_CLOSED";
+    case 0x8000000B: return "STATUS_NO_INHERITANCE";
+    case 0x8000000C: return "STATUS_GUID_SUBSTITUTION_MADE";
+    case 0x8000000D: return "STATUS_PARTIAL_COPY";
+    case 0x8000000E: return "STATUS_DEVICE_PAPER_EMPTY";
+    case 0x8000000F: return "STATUS_DEVICE_POWERED_OFF";
+    case 0x80000010: return "STATUS_DEVICE_OFF_LINE";
+    case 0x80000011: return "STATUS_DEVICE_BUSY";
+    case 0x80000012: return "STATUS_NO_MORE_EAS";
+    case 0x80000013: return "STATUS_INVALID_EA_NAME";
+    case 0x80000014: return "STATUS_EA_LIST_INCONSISTENT";
+    case 0x80000015: return "STATUS_INVALID_EA_FLAG";
+    case 0x80000016: return "STATUS_VERIFY_REQUIRED";
+    case 0x80000017: return "STATUS_EXTRANEOUS_INFORMATION";
+    case 0x80000018: return "STATUS_RXACT_COMMIT_NECESSARY";
+    case 0x8000001A: return "STATUS_NO_MORE_ENTRIES";
+    case 0x8000001B: return "STATUS_FILEMARK_DETECTED";
+    case 0x8000001C: return "STATUS_MEDIA_CHANGED";
+    case 0x8000001D: return "STATUS_BUS_RESET";
+    case 0x8000001E: return "STATUS_END_OF_MEDIA";
+    case 0x8000001F: return "STATUS_BEGINNING_OF_MEDIA";
+    case 0x80000020: return "STATUS_MEDIA_CHECK";
+    case 0x80000021: return "STATUS_SETMARK_DETECTED";
+    case 0x80000022: return "STATUS_NO_DATA_DETECTED";
+    case 0x80000023: return "STATUS_REDIRECTOR_HAS_OPEN_HANDLES";
+    case 0x80000024: return "STATUS_SERVER_HAS_OPEN_HANDLES";
+    case 0x80000025: return "STATUS_ALREADY_DISCONNECTED";
+    case 0x80000026: return "STATUS_LONGJUMP";
+    case 0x80010001: return "DBG_EXCEPTION_NOT_HANDLED";
+    case 0xC0000001: return "STATUS_UNSUCCESSFUL";
+    case 0xC0000002: return "STATUS_NOT_IMPLEMENTED";
+    case 0xC0000003: return "STATUS_INVALID_INFO_CLASS";
+    case 0xC0000004: return "STATUS_INFO_LENGTH_MISMATCH";
+    case 0xC0000005: return "STATUS_ACCESS_VIOLATION";
+    case 0xC0000006: return "STATUS_IN_PAGE_ERROR";
+    case 0xC0000007: return "STATUS_PAGEFILE_QUOTA";
+    case 0xC0000008: return "STATUS_INVALID_HANDLE";
+    case 0xC0000009: return "STATUS_BAD_INITIAL_STACK";
+    case 0xC000000A: return "STATUS_BAD_INITIAL_PC";
+    case 0xC000000B: return "STATUS_INVALID_CID";
+    case 0xC000000C: return "STATUS_TIMER_NOT_CANCELED";
+    case 0xC000000D: return "STATUS_INVALID_PARAMETER";
+    case 0xC000000E: return "STATUS_NO_SUCH_DEVICE";
+    case 0xC000000F: return "STATUS_NO_SUCH_FILE";
+    case 0xC0000010: return "STATUS_INVALID_DEVICE_REQUEST";
+    case 0xC0000011: return "STATUS_END_OF_FILE";
+    case 0xC0000012: return "STATUS_WRONG_VOLUME";
+    case 0xC0000013: return "STATUS_NO_MEDIA_IN_DEVICE";
+    case 0xC0000014: return "STATUS_UNRECOGNIZED_MEDIA";
+    case 0xC0000015: return "STATUS_NONEXISTENT_SECTOR";
+    case 0xC0000016: return "STATUS_MORE_PROCESSING_REQUIRED";
+    case 0xC0000017: return "STATUS_NO_MEMORY";
+    case 0xC0000018: return "STATUS_CONFLICTING_ADDRESSES";
+    case 0xC0000019: return "STATUS_NOT_MAPPED_VIEW";
+    case 0xC000001A: return "STATUS_UNABLE_TO_FREE_VM";
+    case 0xC000001B: return "STATUS_UNABLE_TO_DELETE_SECTION";
+    case 0xC000001C: return "STATUS_INVALID_SYSTEM_SERVICE";
+    case 0xC000001D: return "STATUS_ILLEGAL_INSTRUCTION";
+    case 0xC000001E: return "STATUS_INVALID_LOCK_SEQUENCE";
+    case 0xC000001F: return "STATUS_INVALID_VIEW_SIZE";
+    case 0xC0000020: return "STATUS_INVALID_FILE_FOR_SECTION";
+    case 0xC0000021: return "STATUS_ALREADY_COMMITTED";
+    case 0xC0000022: return "STATUS_ACCESS_DENIED";
+    case 0xC0000023: return "STATUS_BUFFER_TOO_SMALL";
+    case 0xC0000024: return "STATUS_OBJECT_TYPE_MISMATCH";
+    case 0xC0000025: return "STATUS_NONCONTINUABLE_EXCEPTION";
+    case 0xC0000026: return "STATUS_INVALID_DISPOSITION";
+    case 0xC0000027: return "STATUS_UNWIND";
+    case 0xC0000028: return "STATUS_BAD_STACK";
+    case 0xC0000029: return "STATUS_INVALID_UNWIND_TARGET";
+    case 0xC000002A: return "STATUS_NOT_LOCKED";
+    case 0xC000002B: return "STATUS_PARITY_ERROR";
+    case 0xC000002C: return "STATUS_UNABLE_TO_DECOMMIT_VM";
+    case 0xC000002D: return "STATUS_NOT_COMMITTED";
+    case 0xC000002E: return "STATUS_INVALID_PORT_ATTRIBUTES";
+    case 0xC000002F: return "STATUS_PORT_MESSAGE_TOO_LONG";
+    case 0xC0000030: return "STATUS_INVALID_PARAMETER_MIX";
+    case 0xC0000031: return "STATUS_INVALID_QUOTA_LOWER";
+    case 0xC0000032: return "STATUS_DISK_CORRUPT_ERROR";
+    case 0xC0000033: return "STATUS_OBJECT_NAME_INVALID";
+    case 0xC0000034: return "STATUS_OBJECT_NAME_NOT_FOUND";
+    case 0xC0000035: return "STATUS_OBJECT_NAME_COLLISION";
+    case 0xC0000037: return "STATUS_PORT_DISCONNECTED";
+    case 0xC0000038: return "STATUS_DEVICE_ALREADY_ATTACHED";
+    case 0xC0000039: return "STATUS_OBJECT_PATH_INVALID";
+    case 0xC000003A: return "STATUS_OBJECT_PATH_NOT_FOUND";
+    case 0xC000003B: return "STATUS_OBJECT_PATH_SYNTAX_BAD";
+    case 0xC000003C: return "STATUS_DATA_OVERRUN";
+    case 0xC000003D: return "STATUS_DATA_LATE_ERROR";
+    case 0xC000003E: return "STATUS_DATA_ERROR";
+    case 0xC000003F: return "STATUS_CRC_ERROR";
+    case 0xC0000040: return "STATUS_SECTION_TOO_BIG";
+    case 0xC0000041: return "STATUS_PORT_CONNECTION_REFUSED";
+    case 0xC0000042: return "STATUS_INVALID_PORT_HANDLE";
+    case 0xC0000043: return "STATUS_SHARING_VIOLATION";
+    case 0xC0000044: return "STATUS_QUOTA_EXCEEDED";
+    case 0xC0000045: return "STATUS_INVALID_PAGE_PROTECTION";
+    case 0xC0000046: return "STATUS_MUTANT_NOT_OWNED";
+    case 0xC0000047: return "STATUS_SEMAPHORE_LIMIT_EXCEEDED";
+    case 0xC0000048: return "STATUS_PORT_ALREADY_SET";
+    case 0xC0000049: return "STATUS_SECTION_NOT_IMAGE";
+    case 0xC000004A: return "STATUS_SUSPEND_COUNT_EXCEEDED";
+    case 0xC000004B: return "STATUS_THREAD_IS_TERMINATING";
+    case 0xC000004C: return "STATUS_BAD_WORKING_SET_LIMIT";
+    case 0xC000004D: return "STATUS_INCOMPATIBLE_FILE_MAP";
+    case 0xC000004E: return "STATUS_SECTION_PROTECTION";
+    case 0xC000004F: return "STATUS_EAS_NOT_SUPPORTED";
+    case 0xC0000050: return "STATUS_EA_TOO_LARGE";
+    case 0xC0000051: return "STATUS_NONEXISTENT_EA_ENTRY";
+    case 0xC0000052: return "STATUS_NO_EAS_ON_FILE";
+    case 0xC0000053: return "STATUS_EA_CORRUPT_ERROR";
+    case 0xC0000054: return "STATUS_FILE_LOCK_CONFLICT";
+    case 0xC0000055: return "STATUS_LOCK_NOT_GRANTED";
+    case 0xC0000056: return "STATUS_DELETE_PENDING";
+    case 0xC0000057: return "STATUS_CTL_FILE_NOT_SUPPORTED";
+    case 0xC0000058: return "STATUS_UNKNOWN_REVISION";
+    case 0xC0000059: return "STATUS_REVISION_MISMATCH";
+    case 0xC000005A: return "STATUS_INVALID_OWNER";
+    case 0xC000005B: return "STATUS_INVALID_PRIMARY_GROUP";
+    case 0xC000005C: return "STATUS_NO_IMPERSONATION_TOKEN";
+    case 0xC000005D: return "STATUS_CANT_DISABLE_MANDATORY";
+    case 0xC000005E: return "STATUS_NO_LOGON_SERVERS";
+    case 0xC000005F: return "STATUS_NO_SUCH_LOGON_SESSION";
+    case 0xC0000060: return "STATUS_NO_SUCH_PRIVILEGE";
+    case 0xC0000061: return "STATUS_PRIVILEGE_NOT_HELD";
+    case 0xC0000062: return "STATUS_INVALID_ACCOUNT_NAME";
+    case 0xC0000063: return "STATUS_USER_EXISTS";
+    case 0xC0000064: return "STATUS_NO_SUCH_USER";
+    case 0xC0000065: return "STATUS_GROUP_EXISTS";
+    case 0xC0000066: return "STATUS_NO_SUCH_GROUP";
+    case 0xC0000067: return "STATUS_MEMBER_IN_GROUP";
+    case 0xC0000068: return "STATUS_MEMBER_NOT_IN_GROUP";
+    case 0xC0000069: return "STATUS_LAST_ADMIN";
+    case 0xC000006A: return "STATUS_WRONG_PASSWORD";
+    case 0xC000006B: return "STATUS_ILL_FORMED_PASSWORD";
+    case 0xC000006C: return "STATUS_PASSWORD_RESTRICTION";
+    case 0xC000006D: return "STATUS_LOGON_FAILURE";
+    case 0xC000006E: return "STATUS_ACCOUNT_RESTRICTION";
+    case 0xC000006F: return "STATUS_INVALID_LOGON_HOURS";
+    case 0xC0000070: return "STATUS_INVALID_WORKSTATION";
+    case 0xC0000071: return "STATUS_PASSWORD_EXPIRED";
+    case 0xC0000072: return "STATUS_ACCOUNT_DISABLED";
+    case 0xC0000073: return "STATUS_NONE_MAPPED";
+    case 0xC0000074: return "STATUS_TOO_MANY_LUIDS_REQUESTED";
+    case 0xC0000075: return "STATUS_LUIDS_EXHAUSTED";
+    case 0xC0000076: return "STATUS_INVALID_SUB_AUTHORITY";
+    case 0xC0000077: return "STATUS_INVALID_ACL";
+    case 0xC0000078: return "STATUS_INVALID_SID";
+    case 0xC0000079: return "STATUS_INVALID_SECURITY_DESCR";
+    case 0xC000007A: return "STATUS_PROCEDURE_NOT_FOUND";
+    case 0xC000007B: return "STATUS_INVALID_IMAGE_FORMAT";
+    case 0xC000007C: return "STATUS_NO_TOKEN";
+    case 0xC000007D: return "STATUS_BAD_INHERITANCE_ACL";
+    case 0xC000007E: return "STATUS_RANGE_NOT_LOCKED";
+    case 0xC000007F: return "STATUS_DISK_FULL";
+    case 0xC0000080: return "STATUS_SERVER_DISABLED";
+    case 0xC0000081: return "STATUS_SERVER_NOT_DISABLED";
+    case 0xC0000082: return "STATUS_TOO_MANY_GUIDS_REQUESTED";
+    case 0xC0000083: return "STATUS_GUIDS_EXHAUSTED";
+    case 0xC0000084: return "STATUS_INVALID_ID_AUTHORITY";
+    case 0xC0000085: return "STATUS_AGENTS_EXHAUSTED";
+    case 0xC0000086: return "STATUS_INVALID_VOLUME_LABEL";
+    case 0xC0000087: return "STATUS_SECTION_NOT_EXTENDED";
+    case 0xC0000088: return "STATUS_NOT_MAPPED_DATA";
+    case 0xC0000089: return "STATUS_RESOURCE_DATA_NOT_FOUND";
+    case 0xC000008A: return "STATUS_RESOURCE_TYPE_NOT_FOUND";
+    case 0xC000008B: return "STATUS_RESOURCE_NAME_NOT_FOUND";
+    case 0xC000008C: return "STATUS_ARRAY_BOUNDS_EXCEEDED";
+    case 0xC000008D: return "STATUS_FLOAT_DENORMAL_OPERAND";
+    case 0xC000008E: return "STATUS_FLOAT_DIVIDE_BY_ZERO";
+    case 0xC000008F: return "STATUS_FLOAT_INEXACT_RESULT";
+    case 0xC0000090: return "STATUS_FLOAT_INVALID_OPERATION";
+    case 0xC0000091: return "STATUS_FLOAT_OVERFLOW";
+    case 0xC0000092: return "STATUS_FLOAT_STACK_CHECK";
+    case 0xC0000093: return "STATUS_FLOAT_UNDERFLOW";
+    case 0xC0000094: return "STATUS_INTEGER_DIVIDE_BY_ZERO";
+    case 0xC0000095: return "STATUS_INTEGER_OVERFLOW";
+    case 0xC0000096: return "STATUS_PRIVILEGED_INSTRUCTION";
+    case 0xC0000097: return "STATUS_TOO_MANY_PAGING_FILES";
+    case 0xC0000098: return "STATUS_FILE_INVALID";
+    case 0xC0000099: return "STATUS_ALLOTTED_SPACE_EXCEEDED";
+    case 0xC000009A: return "STATUS_INSUFFICIENT_RESOURCES";
+    case 0xC000009B: return "STATUS_DFS_EXIT_PATH_FOUND";
+    case 0xC000009C: return "STATUS_DEVICE_DATA_ERROR";
+    case 0xC000009D: return "STATUS_DEVICE_NOT_CONNECTED";
+    case 0xC000009E: return "STATUS_DEVICE_POWER_FAILURE";
+    case 0xC000009F: return "STATUS_FREE_VM_NOT_AT_BASE";
+    case 0xC00000A0: return "STATUS_MEMORY_NOT_ALLOCATED";
+    case 0xC00000A1: return "STATUS_WORKING_SET_QUOTA";
+    case 0xC00000A2: return "STATUS_MEDIA_WRITE_PROTECTED";
+    case 0xC00000A3: return "STATUS_DEVICE_NOT_READY";
+    case 0xC00000A4: return "STATUS_INVALID_GROUP_ATTRIBUTES";
+    case 0xC00000A5: return "STATUS_BAD_IMPERSONATION_LEVEL";
+    case 0xC00000A6: return "STATUS_CANT_OPEN_ANONYMOUS";
+    case 0xC00000A7: return "STATUS_BAD_VALIDATION_CLASS";
+    case 0xC00000A8: return "STATUS_BAD_TOKEN_TYPE";
+    case 0xC00000A9: return "STATUS_BAD_MASTER_BOOT_RECORD";
+    case 0xC00000AA: return "STATUS_INSTRUCTION_MISALIGNMENT";
+    case 0xC00000AB: return "STATUS_INSTANCE_NOT_AVAILABLE";
+    case 0xC00000AC: return "STATUS_PIPE_NOT_AVAILABLE";
+    case 0xC00000AD: return "STATUS_INVALID_PIPE_STATE";
+    case 0xC00000AE: return "STATUS_PIPE_BUSY";
+    case 0xC00000AF: return "STATUS_ILLEGAL_FUNCTION";
+    case 0xC00000B0: return "STATUS_PIPE_DISCONNECTED";
+    case 0xC00000B1: return "STATUS_PIPE_CLOSING";
+    case 0xC00000B2: return "STATUS_PIPE_CONNECTED";
+    case 0xC00000B3: return "STATUS_PIPE_LISTENING";
+    case 0xC00000B4: return "STATUS_INVALID_READ_MODE";
+    case 0xC00000B5: return "STATUS_IO_TIMEOUT";
+    case 0xC00000B6: return "STATUS_FILE_FORCED_CLOSED";
+    case 0xC00000B7: return "STATUS_PROFILING_NOT_STARTED";
+    case 0xC00000B8: return "STATUS_PROFILING_NOT_STOPPED";
+    case 0xC00000B9: return "STATUS_COULD_NOT_INTERPRET";
+    case 0xC00000BA: return "STATUS_FILE_IS_A_DIRECTORY";
+    case 0xC00000BB: return "STATUS_NOT_SUPPORTED";
+    case 0xC00000BC: return "STATUS_REMOTE_NOT_LISTENING";
+    case 0xC00000BD: return "STATUS_DUPLICATE_NAME";
+    case 0xC00000BE: return "STATUS_BAD_NETWORK_PATH";
+    case 0xC00000BF: return "STATUS_NETWORK_BUSY";
+    case 0xC00000C0: return "STATUS_DEVICE_DOES_NOT_EXIST";
+    case 0xC00000C1: return "STATUS_TOO_MANY_COMMANDS";
+    case 0xC00000C2: return "STATUS_ADAPTER_HARDWARE_ERROR";
+    case 0xC00000C3: return "STATUS_INVALID_NETWORK_RESPONSE";
+    case 0xC00000C4: return "STATUS_UNEXPECTED_NETWORK_ERROR";
+    case 0xC00000C5: return "STATUS_BAD_REMOTE_ADAPTER";
+    case 0xC00000C6: return "STATUS_PRINT_QUEUE_FULL";
+    case 0xC00000C7: return "STATUS_NO_SPOOL_SPACE";
+    case 0xC00000C8: return "STATUS_PRINT_CANCELLED";
+    case 0xC00000C9: return "STATUS_NETWORK_NAME_DELETED";
+    case 0xC00000CA: return "STATUS_NETWORK_ACCESS_DENIED";
+    case 0xC00000CB: return "STATUS_BAD_DEVICE_TYPE";
+    case 0xC00000CC: return "STATUS_BAD_NETWORK_NAME";
+    case 0xC00000CD: return "STATUS_TOO_MANY_NAMES";
+    case 0xC00000CE: return "STATUS_TOO_MANY_SESSIONS";
+    case 0xC00000CF: return "STATUS_SHARING_PAUSED";
+    case 0xC00000D0: return "STATUS_REQUEST_NOT_ACCEPTED";
+    case 0xC00000D1: return "STATUS_REDIRECTOR_PAUSED";
+    case 0xC00000D2: return "STATUS_NET_WRITE_FAULT";
+    case 0xC00000D3: return "STATUS_PROFILING_AT_LIMIT";
+    case 0xC00000D4: return "STATUS_NOT_SAME_DEVICE";
+    case 0xC00000D5: return "STATUS_FILE_RENAMED";
+    case 0xC00000D6: return "STATUS_VIRTUAL_CIRCUIT_CLOSED";
+    case 0xC00000D7: return "STATUS_NO_SECURITY_ON_OBJECT";
+    case 0xC00000D8: return "STATUS_CANT_WAIT";
+    case 0xC00000D9: return "STATUS_PIPE_EMPTY";
+    case 0xC00000DA: return "STATUS_CANT_ACCESS_DOMAIN_INFO";
+    case 0xC00000DB: return "STATUS_CANT_TERMINATE_SELF";
+    case 0xC00000DC: return "STATUS_INVALID_SERVER_STATE";
+    case 0xC00000DD: return "STATUS_INVALID_DOMAIN_STATE";
+    case 0xC00000DE: return "STATUS_INVALID_DOMAIN_ROLE";
+    case 0xC00000DF: return "STATUS_NO_SUCH_DOMAIN";
+    case 0xC00000E0: return "STATUS_DOMAIN_EXISTS";
+    case 0xC00000E1: return "STATUS_DOMAIN_LIMIT_EXCEEDED";
+    case 0xC00000E2: return "STATUS_OPLOCK_NOT_GRANTED";
+    case 0xC00000E3: return "STATUS_INVALID_OPLOCK_PROTOCOL";
+    case 0xC00000E4: return "STATUS_INTERNAL_DB_CORRUPTION";
+    case 0xC00000E5: return "STATUS_INTERNAL_ERROR";
+    case 0xC00000E6: return "STATUS_GENERIC_NOT_MAPPED";
+    case 0xC00000E7: return "STATUS_BAD_DESCRIPTOR_FORMAT";
+    case 0xC00000E8: return "STATUS_INVALID_USER_BUFFER";
+    case 0xC00000E9: return "STATUS_UNEXPECTED_IO_ERROR";
+    case 0xC00000EA: return "STATUS_UNEXPECTED_MM_CREATE_ERR";
+    case 0xC00000EB: return "STATUS_UNEXPECTED_MM_MAP_ERROR";
+    case 0xC00000EC: return "STATUS_UNEXPECTED_MM_EXTEND_ERR";
+    case 0xC00000ED: return "STATUS_NOT_LOGON_PROCESS";
+    case 0xC00000EE: return "STATUS_LOGON_SESSION_EXISTS";
+    case 0xC00000EF: return "STATUS_INVALID_PARAMETER_1";
+    case 0xC00000F0: return "STATUS_INVALID_PARAMETER_2";
+    case 0xC00000F1: return "STATUS_INVALID_PARAMETER_3";
+    case 0xC00000F2: return "STATUS_INVALID_PARAMETER_4";
+    case 0xC00000F3: return "STATUS_INVALID_PARAMETER_5";
+    case 0xC00000F4: return "STATUS_INVALID_PARAMETER_6";
+    case 0xC00000F5: return "STATUS_INVALID_PARAMETER_7";
+    case 0xC00000F6: return "STATUS_INVALID_PARAMETER_8";
+    case 0xC00000F7: return "STATUS_INVALID_PARAMETER_9";
+    case 0xC00000F8: return "STATUS_INVALID_PARAMETER_10";
+    case 0xC00000F9: return "STATUS_INVALID_PARAMETER_11";
+    case 0xC00000FA: return "STATUS_INVALID_PARAMETER_12";
+    case 0xC00000FB: return "STATUS_REDIRECTOR_NOT_STARTED";
+    case 0xC00000FC: return "STATUS_REDIRECTOR_STARTED";
+    case 0xC00000FD: return "STATUS_STACK_OVERFLOW";
+    case 0xC00000FE: return "STATUS_NO_SUCH_PACKAGE";
+    case 0xC00000FF: return "STATUS_BAD_FUNCTION_TABLE";
+    case 0xC0000100: return "STATUS_VARIABLE_NOT_FOUND";
+    case 0xC0000101: return "STATUS_DIRECTORY_NOT_EMPTY";
+    case 0xC0000102: return "STATUS_FILE_CORRUPT_ERROR";
+    case 0xC0000103: return "STATUS_NOT_A_DIRECTORY";
+    case 0xC0000104: return "STATUS_BAD_LOGON_SESSION_STATE";
+    case 0xC0000105: return "STATUS_LOGON_SESSION_COLLISION";
+    case 0xC0000106: return "STATUS_NAME_TOO_LONG";
+    case 0xC0000107: return "STATUS_FILES_OPEN";
+    case 0xC0000108: return "STATUS_CONNECTION_IN_USE";
+    case 0xC0000109: return "STATUS_MESSAGE_NOT_FOUND";
+    case 0xC000010A: return "STATUS_PROCESS_IS_TERMINATING";
+    case 0xC000010B: return "STATUS_INVALID_LOGON_TYPE";
+    case 0xC000010C: return "STATUS_NO_GUID_TRANSLATION";
+    case 0xC000010D: return "STATUS_CANNOT_IMPERSONATE";
+    case 0xC000010E: return "STATUS_IMAGE_ALREADY_LOADED";
+    case 0xC000010F: return "STATUS_ABIOS_NOT_PRESENT";
+    case 0xC0000110: return "STATUS_ABIOS_LID_NOT_EXIST";
+    case 0xC0000111: return "STATUS_ABIOS_LID_ALREADY_OWNED";
+    case 0xC0000112: return "STATUS_ABIOS_NOT_LID_OWNER";
+    case 0xC0000113: return "STATUS_ABIOS_INVALID_COMMAND";
+    case 0xC0000114: return "STATUS_ABIOS_INVALID_LID";
+    case 0xC0000115: return "STATUS_ABIOS_SELECTOR_NOT_AVAILABLE";
+    case 0xC0000116: return "STATUS_ABIOS_INVALID_SELECTOR";
+    case 0xC0000117: return "STATUS_NO_LDT";
+    case 0xC0000118: return "STATUS_INVALID_LDT_SIZE";
+    case 0xC0000119: return "STATUS_INVALID_LDT_OFFSET";
+    case 0xC000011A: return "STATUS_INVALID_LDT_DESCRIPTOR";
+    case 0xC000011B: return "STATUS_INVALID_IMAGE_NE_FORMAT";
+    case 0xC000011C: return "STATUS_RXACT_INVALID_STATE";
+    case 0xC000011D: return "STATUS_RXACT_COMMIT_FAILURE";
+    case 0xC000011E: return "STATUS_MAPPED_FILE_SIZE_ZERO";
+    case 0xC000011F: return "STATUS_TOO_MANY_OPENED_FILES";
+    case 0xC0000120: return "STATUS_CANCELLED";
+    case 0xC0000121: return "STATUS_CANNOT_DELETE";
+    case 0xC0000122: return "STATUS_INVALID_COMPUTER_NAME";
+    case 0xC0000123: return "STATUS_FILE_DELETED";
+    case 0xC0000124: return "STATUS_SPECIAL_ACCOUNT";
+    case 0xC0000125: return "STATUS_SPECIAL_GROUP";
+    case 0xC0000126: return "STATUS_SPECIAL_USER";
+    case 0xC0000127: return "STATUS_MEMBERS_PRIMARY_GROUP";
+    case 0xC0000128: return "STATUS_FILE_CLOSED";
+    case 0xC0000129: return "STATUS_TOO_MANY_THREADS";
+    case 0xC000012A: return "STATUS_THREAD_NOT_IN_PROCESS";
+    case 0xC000012B: return "STATUS_TOKEN_ALREADY_IN_USE";
+    case 0xC000012C: return "STATUS_PAGEFILE_QUOTA_EXCEEDED";
+    case 0xC000012D: return "STATUS_COMMITMENT_LIMIT";
+    case 0xC000012E: return "STATUS_INVALID_IMAGE_LE_FORMAT";
+    case 0xC000012F: return "STATUS_INVALID_IMAGE_NOT_MZ";
+    case 0xC0000130: return "STATUS_INVALID_IMAGE_PROTECT";
+    case 0xC0000131: return "STATUS_INVALID_IMAGE_WIN_16";
+    case 0xC0000132: return "STATUS_LOGON_SERVER_CONFLICT";
+    case 0xC0000133: return "STATUS_TIME_DIFFERENCE_AT_DC";
+    case 0xC0000134: return "STATUS_SYNCHRONIZATION_REQUIRED";
+    case 0xC0000135: return "STATUS_DLL_NOT_FOUND";
+    case 0xC0000136: return "STATUS_OPEN_FAILED";
+    case 0xC0000137: return "STATUS_IO_PRIVILEGE_FAILED";
+    case 0xC0000138: return "STATUS_ORDINAL_NOT_FOUND";
+    case 0xC0000139: return "STATUS_ENTRYPOINT_NOT_FOUND";
+    case 0xC000013A: return "STATUS_CONTROL_C_EXIT";
+    case 0xC000013B: return "STATUS_LOCAL_DISCONNECT";
+    case 0xC000013C: return "STATUS_REMOTE_DISCONNECT";
+    case 0xC000013D: return "STATUS_REMOTE_RESOURCES";
+    case 0xC000013E: return "STATUS_LINK_FAILED";
+    case 0xC000013F: return "STATUS_LINK_TIMEOUT";
+    case 0xC0000140: return "STATUS_INVALID_CONNECTION";
+    case 0xC0000141: return "STATUS_INVALID_ADDRESS";
+    case 0xC0000142: return "STATUS_DLL_INIT_FAILED";
+    case 0xC0000143: return "STATUS_MISSING_SYSTEMFILE";
+    case 0xC0000144: return "STATUS_UNHANDLED_EXCEPTION";
+    case 0xC0000145: return "STATUS_APP_INIT_FAILURE";
+    case 0xC0000146: return "STATUS_PAGEFILE_CREATE_FAILED";
+    case 0xC0000147: return "STATUS_NO_PAGEFILE";
+    case 0xC0000148: return "STATUS_INVALID_LEVEL";
+    case 0xC0000149: return "STATUS_WRONG_PASSWORD_CORE";
+    case 0xC000014A: return "STATUS_ILLEGAL_FLOAT_CONTEXT";
+    case 0xC000014B: return "STATUS_PIPE_BROKEN";
+    case 0xC000014C: return "STATUS_REGISTRY_CORRUPT";
+    case 0xC000014D: return "STATUS_REGISTRY_IO_FAILED";
+    case 0xC000014E: return "STATUS_NO_EVENT_PAIR";
+    case 0xC000014F: return "STATUS_UNRECOGNIZED_VOLUME";
+    case 0xC0000150: return "STATUS_SERIAL_NO_DEVICE_INITED";
+    case 0xC0000151: return "STATUS_NO_SUCH_ALIAS";
+    case 0xC0000152: return "STATUS_MEMBER_NOT_IN_ALIAS";
+    case 0xC0000153: return "STATUS_MEMBER_IN_ALIAS";
+    case 0xC0000154: return "STATUS_ALIAS_EXISTS";
+    case 0xC0000155: return "STATUS_LOGON_NOT_GRANTED";
+    case 0xC0000156: return "STATUS_TOO_MANY_SECRETS";
+    case 0xC0000157: return "STATUS_SECRET_TOO_LONG";
+    case 0xC0000158: return "STATUS_INTERNAL_DB_ERROR";
+    case 0xC0000159: return "STATUS_FULLSCREEN_MODE";
+    case 0xC000015A: return "STATUS_TOO_MANY_CONTEXT_IDS";
+    case 0xC000015B: return "STATUS_LOGON_TYPE_NOT_GRANTED";
+    case 0xC000015C: return "STATUS_NOT_REGISTRY_FILE";
+    case 0xC000015D: return "STATUS_NT_CROSS_ENCRYPTION_REQUIRED";
+    case 0xC000015E: return "STATUS_DOMAIN_CTRLR_CONFIG_ERROR";
+    case 0xC000015F: return "STATUS_FT_MISSING_MEMBER";
+    case 0xC0000160: return "STATUS_ILL_FORMED_SERVICE_ENTRY";
+    case 0xC0000161: return "STATUS_ILLEGAL_CHARACTER";
+    case 0xC0000162: return "STATUS_UNMAPPABLE_CHARACTER";
+    case 0xC0000163: return "STATUS_UNDEFINED_CHARACTER";
+    case 0xC0000164: return "STATUS_FLOPPY_VOLUME";
+    case 0xC0000165: return "STATUS_FLOPPY_ID_MARK_NOT_FOUND";
+    case 0xC0000166: return "STATUS_FLOPPY_WRONG_CYLINDER";
+    case 0xC0000167: return "STATUS_FLOPPY_UNKNOWN_ERROR";
+    case 0xC0000168: return "STATUS_FLOPPY_BAD_REGISTERS";
+    case 0xC0000169: return "STATUS_DISK_RECALIBRATE_FAILED";
+    case 0xC000016A: return "STATUS_DISK_OPERATION_FAILED";
+    case 0xC000016B: return "STATUS_DISK_RESET_FAILED";
+    case 0xC000016C: return "STATUS_SHARED_IRQ_BUSY";
+    case 0xC000016D: return "STATUS_FT_ORPHANING";
+    case 0xC000016E: return "STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT";
+    case 0xC0000172: return "STATUS_PARTITION_FAILURE";
+    case 0xC0000173: return "STATUS_INVALID_BLOCK_LENGTH";
+    case 0xC0000174: return "STATUS_DEVICE_NOT_PARTITIONED";
+    case 0xC0000175: return "STATUS_UNABLE_TO_LOCK_MEDIA";
+    case 0xC0000176: return "STATUS_UNABLE_TO_UNLOAD_MEDIA";
+    case 0xC0000177: return "STATUS_EOM_OVERFLOW";
+    case 0xC0000178: return "STATUS_NO_MEDIA";
+    case 0xC000017A: return "STATUS_NO_SUCH_MEMBER";
+    case 0xC000017B: return "STATUS_INVALID_MEMBER";
+    case 0xC000017C: return "STATUS_KEY_DELETED";
+    case 0xC000017D: return "STATUS_NO_LOG_SPACE";
+    case 0xC000017E: return "STATUS_TOO_MANY_SIDS";
+    case 0xC000017F: return "STATUS_LM_CROSS_ENCRYPTION_REQUIRED";
+    case 0xC0000180: return "STATUS_KEY_HAS_CHILDREN";
+    case 0xC0000181: return "STATUS_CHILD_MUST_BE_VOLATILE";
+    case 0xC0000182: return "STATUS_DEVICE_CONFIGURATION_ERROR";
+    case 0xC0000183: return "STATUS_DRIVER_INTERNAL_ERROR";
+    case 0xC0000184: return "STATUS_INVALID_DEVICE_STATE";
+    case 0xC0000185: return "STATUS_IO_DEVICE_ERROR";
+    case 0xC0000186: return "STATUS_DEVICE_PROTOCOL_ERROR";
+    case 0xC0000187: return "STATUS_BACKUP_CONTROLLER";
+    case 0xC0000188: return "STATUS_LOG_FILE_FULL";
+    case 0xC0000189: return "STATUS_TOO_LATE";
+    case 0xC000018A: return "STATUS_NO_TRUST_LSA_SECRET";
+    case 0xC000018B: return "STATUS_NO_TRUST_SAM_ACCOUNT";
+    case 0xC000018C: return "STATUS_TRUSTED_DOMAIN_FAILURE";
+    case 0xC000018D: return "STATUS_TRUSTED_RELATIONSHIP_FAILURE";
+    case 0xC000018E: return "STATUS_EVENTLOG_FILE_CORRUPT";
+    case 0xC000018F: return "STATUS_EVENTLOG_CANT_START";
+    case 0xC0000190: return "STATUS_TRUST_FAILURE";
+    case 0xC0000191: return "STATUS_MUTANT_LIMIT_EXCEEDED";
+    case 0xC0000192: return "STATUS_NETLOGON_NOT_STARTED";
+    case 0xC0000193: return "STATUS_ACCOUNT_EXPIRED";
+    case 0xC0000194: return "STATUS_POSSIBLE_DEADLOCK";
+    case 0xC0000195: return "STATUS_NETWORK_CREDENTIAL_CONFLICT";
+    case 0xC0000196: return "STATUS_REMOTE_SESSION_LIMIT";
+    case 0xC0000197: return "STATUS_EVENTLOG_FILE_CHANGED";
+    case 0xC0000198: return "STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT";
+    case 0xC0000199: return "STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT";
+    case 0xC000019A: return "STATUS_NOLOGON_SERVER_TRUST_ACCOUNT";
+    case 0xC000019B: return "STATUS_DOMAIN_TRUST_INCONSISTENT";
+    case 0xC000019C: return "STATUS_FS_DRIVER_REQUIRED";
+    case 0xC0000202: return "STATUS_NO_USER_SESSION_KEY";
+    case 0xC0000203: return "STATUS_USER_SESSION_DELETED";
+    case 0xC0000204: return "STATUS_RESOURCE_LANG_NOT_FOUND";
+    case 0xC0000205: return "STATUS_INSUFF_SERVER_RESOURCES";
+    case 0xC0000206: return "STATUS_INVALID_BUFFER_SIZE";
+    case 0xC0000207: return "STATUS_INVALID_ADDRESS_COMPONENT";
+    case 0xC0000208: return "STATUS_INVALID_ADDRESS_WILDCARD";
+    case 0xC0000209: return "STATUS_TOO_MANY_ADDRESSES";
+    case 0xC000020A: return "STATUS_ADDRESS_ALREADY_EXISTS";
+    case 0xC000020B: return "STATUS_ADDRESS_CLOSED";
+    case 0xC000020C: return "STATUS_CONNECTION_DISCONNECTED";
+    case 0xC000020D: return "STATUS_CONNECTION_RESET";
+    case 0xC000020E: return "STATUS_TOO_MANY_NODES";
+    case 0xC000020F: return "STATUS_TRANSACTION_ABORTED";
+    case 0xC0000210: return "STATUS_TRANSACTION_TIMED_OUT";
+    case 0xC0000211: return "STATUS_TRANSACTION_NO_RELEASE";
+    case 0xC0000212: return "STATUS_TRANSACTION_NO_MATCH";
+    case 0xC0000213: return "STATUS_TRANSACTION_RESPONDED";
+    case 0xC0000214: return "STATUS_TRANSACTION_INVALID_ID";
+    case 0xC0000215: return "STATUS_TRANSACTION_INVALID_TYPE";
+    case 0xC0000216: return "STATUS_NOT_SERVER_SESSION";
+    case 0xC0000217: return "STATUS_NOT_CLIENT_SESSION";
+    case 0xC0000218: return "STATUS_CANNOT_LOAD_REGISTRY_FILE";
+    case 0xC0000219: return "STATUS_DEBUG_ATTACH_FAILED";
+    case 0xC000021A: return "STATUS_SYSTEM_PROCESS_TERMINATED";
+    case 0xC000021B: return "STATUS_DATA_NOT_ACCEPTED";
+    case 0xC000021C: return "STATUS_NO_BROWSER_SERVERS_FOUND";
+    case 0xC000021D: return "STATUS_VDM_HARD_ERROR";
+    case 0xC000021E: return "STATUS_DRIVER_CANCEL_TIMEOUT";
+    case 0xC000021F: return "STATUS_REPLY_MESSAGE_MISMATCH";
+    case 0xC0000220: return "STATUS_MAPPED_ALIGNMENT";
+    case 0xC0000221: return "STATUS_IMAGE_CHECKSUM_MISMATCH";
+    case 0xC0000222: return "STATUS_LOST_WRITEBEHIND_DATA";
+    case 0xC0000223: return "STATUS_CLIENT_SERVER_PARAMETERS_INVALID";
+    case 0xC0000224: return "STATUS_PASSWORD_MUST_CHANGE";
+    case 0xC0000225: return "STATUS_NOT_FOUND";
+    case 0xC0000226: return "STATUS_NOT_TINY_STREAM";
+    case 0xC0000227: return "STATUS_RECOVERY_FAILURE";
+    case 0xC0000228: return "STATUS_STACK_OVERFLOW_READ";
+    case 0xC0000229: return "STATUS_FAIL_CHECK";
+    case 0xC000022A: return "STATUS_DUPLICATE_OBJECTID";
+    case 0xC000022B: return "STATUS_OBJECTID_EXISTS";
+    case 0xC000022C: return "STATUS_CONVERT_TO_LARGE";
+    case 0xC000022D: return "STATUS_RETRY";
+    case 0xC000022E: return "STATUS_FOUND_OUT_OF_SCOPE";
+    case 0xC000022F: return "STATUS_ALLOCATE_BUCKET";
+    case 0xC0000230: return "STATUS_PROPSET_NOT_FOUND";
+    case 0xC0000231: return "STATUS_MARSHALL_OVERFLOW";
+    case 0xC0000232: return "STATUS_INVALID_VARIANT";
+    case 0xC0000233: return "STATUS_DOMAIN_CONTROLLER_NOT_FOUND";
+    case 0xC0000234: return "STATUS_ACCOUNT_LOCKED_OUT";
+    case 0xC0000235: return "STATUS_HANDLE_NOT_CLOSABLE";
+    case 0xC0000236: return "STATUS_CONNECTION_REFUSED";
+    case 0xC0000237: return "STATUS_GRACEFUL_DISCONNECT";
+    case 0xC0000238: return "STATUS_ADDRESS_ALREADY_ASSOCIATED";
+    case 0xC0000239: return "STATUS_ADDRESS_NOT_ASSOCIATED";
+    case 0xC000023A: return "STATUS_CONNECTION_INVALID";
+    case 0xC000023B: return "STATUS_CONNECTION_ACTIVE";
+    case 0xC000023C: return "STATUS_NETWORK_UNREACHABLE";
+    case 0xC000023D: return "STATUS_HOST_UNREACHABLE";
+    case 0xC000023E: return "STATUS_PROTOCOL_UNREACHABLE";
+    case 0xC000023F: return "STATUS_PORT_UNREACHABLE";
+    case 0xC0000240: return "STATUS_REQUEST_ABORTED";
+    case 0xC0000241: return "STATUS_CONNECTION_ABORTED";
+    case 0xC0000242: return "STATUS_BAD_COMPRESSION_BUFFER";
+    case 0xC0000243: return "STATUS_USER_MAPPED_FILE";
+    case 0xC0000244: return "STATUS_AUDIT_FAILED";
+    case 0xC0000245: return "STATUS_TIMER_RESOLUTION_NOT_SET";
+    case 0xC0000246: return "STATUS_CONNECTION_COUNT_LIMIT";
+    case 0xC0000247: return "STATUS_LOGIN_TIME_RESTRICTION";
+    case 0xC0000248: return "STATUS_LOGIN_WKSTA_RESTRICTION";
+    case 0xC0000249: return "STATUS_IMAGE_MP_UP_MISMATCH";
+    case 0xC0000250: return "STATUS_INSUFFICIENT_LOGON_INFO";
+    case 0xC0000251: return "STATUS_BAD_DLL_ENTRYPOINT";
+    case 0xC0000252: return "STATUS_BAD_SERVICE_ENTRYPOINT";
+    case 0xC0000253: return "STATUS_LPC_REPLY_LOST";
+    case 0xC0000254: return "STATUS_IP_ADDRESS_CONFLICT1";
+    case 0xC0000255: return "STATUS_IP_ADDRESS_CONFLICT2";
+    case 0xC0000256: return "STATUS_REGISTRY_QUOTA_LIMIT";
+    case 0xC0000257: return "STATUS_PATH_NOT_COVERED";
+    case 0xC0000258: return "STATUS_NO_CALLBACK_ACTIVE";
+    case 0xC0000259: return "STATUS_LICENSE_QUOTA_EXCEEDED";
+    case 0xC000025A: return "STATUS_PWD_TOO_SHORT";
+    case 0xC000025B: return "STATUS_PWD_TOO_RECENT";
+    case 0xC000025C: return "STATUS_PWD_HISTORY_CONFLICT";
+    case 0xC000025E: return "STATUS_PLUGPLAY_NO_DEVICE";
+    case 0xC000025F: return "STATUS_UNSUPPORTED_COMPRESSION";
+    case 0xC0000260: return "STATUS_INVALID_HW_PROFILE";
+    case 0xC0000261: return "STATUS_INVALID_PLUGPLAY_DEVICE_PATH";
+    case 0xC0000262: return "STATUS_DRIVER_ORDINAL_NOT_FOUND";
+    case 0xC0000263: return "STATUS_DRIVER_ENTRYPOINT_NOT_FOUND";
+    case 0xC0000264: return "STATUS_RESOURCE_NOT_OWNED";
+    case 0xC0000265: return "STATUS_TOO_MANY_LINKS";
+    case 0xC0000266: return "STATUS_QUOTA_LIST_INCONSISTENT";
+    case 0xC0000267: return "STATUS_FILE_IS_OFFLINE";
+    case 0xC0000268: return "STATUS_EVALUATION_EXPIRATION";
+    case 0xC0000269: return "STATUS_ILLEGAL_DLL_RELOCATION";
+    case 0xC000026A: return "STATUS_LICENSE_VIOLATION";
+    case 0xC000026B: return "STATUS_DLL_INIT_FAILED_LOGOFF";
+    case 0xC000026C: return "STATUS_DRIVER_UNABLE_TO_LOAD";
+    case 0xC000026D: return "STATUS_DFS_UNAVAILABLE";
+    case 0xC000026E: return "STATUS_VOLUME_DISMOUNTED";
+    case 0xC000026F: return "STATUS_WX86_INTERNAL_ERROR";
+    case 0xC0000270: return "STATUS_WX86_FLOAT_STACK_CHECK";
+    case 0xC0000271: return "STATUS_VALIDATE_CONTINUE";
+    case 0xC0000272: return "STATUS_NO_MATCH";
+    case 0xC0000273: return "STATUS_NO_MORE_MATCHES";
+    case 0xC0000275: return "STATUS_NOT_A_REPARSE_POINT";
+    case 0xC0000276: return "STATUS_IO_REPARSE_TAG_INVALID";
+    case 0xC0000277: return "STATUS_IO_REPARSE_TAG_MISMATCH";
+    case 0xC0000278: return "STATUS_IO_REPARSE_DATA_INVALID";
+    case 0xC0000279: return "STATUS_IO_REPARSE_TAG_NOT_HANDLED";
+    case 0xC0000280: return "STATUS_REPARSE_POINT_NOT_RESOLVED";
+    case 0xC0000281: return "STATUS_DIRECTORY_IS_A_REPARSE_POINT";
+    case 0xC0000282: return "STATUS_RANGE_LIST_CONFLICT";
+    case 0xC0000283: return "STATUS_SOURCE_ELEMENT_EMPTY";
+    case 0xC0000284: return "STATUS_DESTINATION_ELEMENT_FULL";
+    case 0xC0000285: return "STATUS_ILLEGAL_ELEMENT_ADDRESS";
+    case 0xC0000286: return "STATUS_MAGAZINE_NOT_PRESENT";
+    case 0xC0000287: return "STATUS_REINITIALIZATION_NEEDED";
+    case 0x80000288: return "STATUS_DEVICE_REQUIRES_CLEANING";
+    case 0x80000289: return "STATUS_DEVICE_DOOR_OPEN";
+    case 0xC000028A: return "STATUS_ENCRYPTION_FAILED";
+    case 0xC000028B: return "STATUS_DECRYPTION_FAILED";
+    case 0xC000028C: return "STATUS_RANGE_NOT_FOUND";
+    case 0xC000028D: return "STATUS_NO_RECOVERY_POLICY";
+    case 0xC000028E: return "STATUS_NO_EFS";
+    case 0xC000028F: return "STATUS_WRONG_EFS";
+    case 0xC0000290: return "STATUS_NO_USER_KEYS";
+    case 0xC0000291: return "STATUS_FILE_NOT_ENCRYPTED";
+    case 0xC0000292: return "STATUS_NOT_EXPORT_FORMAT";
+    case 0xC0000293: return "STATUS_FILE_ENCRYPTED";
+    case 0x40000294: return "STATUS_WAKE_SYSTEM";
+    case 0xC0000295: return "STATUS_WMI_GUID_NOT_FOUND";
+    case 0xC0000296: return "STATUS_WMI_INSTANCE_NOT_FOUND";
+    case 0xC0000297: return "STATUS_WMI_ITEMID_NOT_FOUND";
+    case 0xC0000298: return "STATUS_WMI_TRY_AGAIN";
+    case 0xC0000299: return "STATUS_SHARED_POLICY";
+    case 0xC000029A: return "STATUS_POLICY_OBJECT_NOT_FOUND";
+    case 0xC000029B: return "STATUS_POLICY_ONLY_IN_DS";
+    case 0xC000029C: return "STATUS_VOLUME_NOT_UPGRADED";
+    case 0xC000029D: return "STATUS_REMOTE_STORAGE_NOT_ACTIVE";
+    case 0xC000029E: return "STATUS_REMOTE_STORAGE_MEDIA_ERROR";
+    case 0xC000029F: return "STATUS_NO_TRACKING_SERVICE";
+    case 0xC00002A0: return "STATUS_SERVER_SID_MISMATCH";
+    case 0xC00002A1: return "STATUS_DS_NO_ATTRIBUTE_OR_VALUE";
+    case 0xC00002A2: return "STATUS_DS_INVALID_ATTRIBUTE_SYNTAX";
+    case 0xC00002A3: return "STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED";
+    case 0xC00002A4: return "STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS";
+    case 0xC00002A5: return "STATUS_DS_BUSY";
+    case 0xC00002A6: return "STATUS_DS_UNAVAILABLE";
+    case 0xC00002A7: return "STATUS_DS_NO_RIDS_ALLOCATED";
+    case 0xC00002A8: return "STATUS_DS_NO_MORE_RIDS";
+    case 0xC00002A9: return "STATUS_DS_INCORRECT_ROLE_OWNER";
+    case 0xC00002AA: return "STATUS_DS_RIDMGR_INIT_ERROR";
+    case 0xC00002AB: return "STATUS_DS_OBJ_CLASS_VIOLATION";
+    case 0xC00002AC: return "STATUS_DS_CANT_ON_NON_LEAF";
+    case 0xC00002AD: return "STATUS_DS_CANT_ON_RDN";
+    case 0xC00002AE: return "STATUS_DS_CANT_MOD_OBJ_CLASS";
+    case 0xC00002AF: return "STATUS_DS_CROSS_DOM_MOVE_FAILED";
+    case 0xC00002B0: return "STATUS_DS_GC_NOT_AVAILABLE";
+    case 0xC00002B1: return "STATUS_DIRECTORY_SERVICE_REQUIRED";
+    case 0xC00002B2: return "STATUS_REPARSE_ATTRIBUTE_CONFLICT";
+    case 0xC00002B3: return "STATUS_CANT_ENABLE_DENY_ONLY";
+    case 0xC00002B4: return "STATUS_FLOAT_MULTIPLE_FAULTS";
+    case 0xC00002B5: return "STATUS_FLOAT_MULTIPLE_TRAPS";
+    case 0xC00002B6: return "STATUS_DEVICE_REMOVED";
+    case 0xC00002B7: return "STATUS_JOURNAL_DELETE_IN_PROGRESS";
+    case 0xC00002B8: return "STATUS_JOURNAL_NOT_ACTIVE";
+    case 0xC00002B9: return "STATUS_NOINTERFACE";
+    case 0xC00002C1: return "STATUS_DS_ADMIN_LIMIT_EXCEEDED";
+    case 0xC00002C2: return "STATUS_DRIVER_FAILED_SLEEP";
+    case 0xC00002C3: return "STATUS_MUTUAL_AUTHENTICATION_FAILED";
+    case 0xC00002C4: return "STATUS_CORRUPT_SYSTEM_FILE";
+    case 0xC00002C5: return "STATUS_DATATYPE_MISALIGNMENT_ERROR";
+    case 0xC00002C6: return "STATUS_WMI_READ_ONLY";
+    case 0xC00002C7: return "STATUS_WMI_SET_FAILURE";
+    case 0xC00002C8: return "STATUS_COMMITMENT_MINIMUM";
+    case 0xC00002C9: return "STATUS_REG_NAT_CONSUMPTION";
+    case 0xC00002CA: return "STATUS_TRANSPORT_FULL";
+    case 0xC00002CB: return "STATUS_DS_SAM_INIT_FAILURE";
+    case 0xC00002CC: return "STATUS_ONLY_IF_CONNECTED";
+    case 0xC00002CD: return "STATUS_DS_SENSITIVE_GROUP_VIOLATION";
+    case 0xC00002CE: return "STATUS_PNP_RESTART_ENUMERATION";
+    case 0xC00002CF: return "STATUS_JOURNAL_ENTRY_DELETED";
+    case 0xC00002D0: return "STATUS_DS_CANT_MOD_PRIMARYGROUPID";
+    case 0xC00002D1: return "STATUS_SYSTEM_IMAGE_BAD_SIGNATURE";
+    case 0xC00002D2: return "STATUS_PNP_REBOOT_REQUIRED";
+    case 0xC00002D3: return "STATUS_POWER_STATE_INVALID";
+    case 0xC00002D4: return "STATUS_DS_INVALID_GROUP_TYPE";
+    case 0xC00002D5: return "STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN";
+    case 0xC00002D6: return "STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN";
+    case 0xC00002D7: return "STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER";
+    case 0xC00002D8: return "STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER";
+    case 0xC00002D9: return "STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER";
+    case 0xC00002DA: return "STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER";
+    case 0xC00002DB: return "STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER";
+    case 0xC00002DC: return "STATUS_DS_HAVE_PRIMARY_MEMBERS";
+    case 0xC00002DD: return "STATUS_WMI_NOT_SUPPORTED";
+    case 0xC00002DE: return "STATUS_INSUFFICIENT_POWER";
+    case 0xC00002DF: return "STATUS_SAM_NEED_BOOTKEY_PASSWORD";
+    case 0xC00002E0: return "STATUS_SAM_NEED_BOOTKEY_FLOPPY";
+    case 0xC00002E1: return "STATUS_DS_CANT_START";
+    case 0xC00002E2: return "STATUS_DS_INIT_FAILURE";
+    case 0xC00002E3: return "STATUS_SAM_INIT_FAILURE";
+    case 0xC00002E4: return "STATUS_DS_GC_REQUIRED";
+    case 0xC00002E5: return "STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY";
+    case 0xC00002E6: return "STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS";
+    case 0xC00002E7: return "STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED";
+    case 0xC00002E8: return "STATUS_MULTIPLE_FAULT_VIOLATION";
+    case 0xC0000300: return "STATUS_NOT_SUPPORTED_ON_SBS";
+    case 0xC0009898: return "STATUS_WOW_ASSERTION";
+    case 0xC0010001: return "DBG_NO_STATE_CHANGE";
+    case 0xC0010002: return "DBG_APP_NOT_IDLE";
+    case 0xC0020001: return "RPC_NT_INVALID_STRING_BINDING";
+    case 0xC0020002: return "RPC_NT_WRONG_KIND_OF_BINDING";
+    case 0xC0020003: return "RPC_NT_INVALID_BINDING";
+    case 0xC0020004: return "RPC_NT_PROTSEQ_NOT_SUPPORTED";
+    case 0xC0020005: return "RPC_NT_INVALID_RPC_PROTSEQ";
+    case 0xC0020006: return "RPC_NT_INVALID_STRING_UUID";
+    case 0xC0020007: return "RPC_NT_INVALID_ENDPOINT_FORMAT";
+    case 0xC0020008: return "RPC_NT_INVALID_NET_ADDR";
+    case 0xC0020009: return "RPC_NT_NO_ENDPOINT_FOUND";
+    case 0xC002000A: return "RPC_NT_INVALID_TIMEOUT";
+    case 0xC002000B: return "RPC_NT_OBJECT_NOT_FOUND";
+    case 0xC002000C: return "RPC_NT_ALREADY_REGISTERED";
+    case 0xC002000D: return "RPC_NT_TYPE_ALREADY_REGISTERED";
+    case 0xC002000E: return "RPC_NT_ALREADY_LISTENING";
+    case 0xC002000F: return "RPC_NT_NO_PROTSEQS_REGISTERED";
+    case 0xC0020010: return "RPC_NT_NOT_LISTENING";
+    case 0xC0020011: return "RPC_NT_UNKNOWN_MGR_TYPE";
+    case 0xC0020012: return "RPC_NT_UNKNOWN_IF";
+    case 0xC0020013: return "RPC_NT_NO_BINDINGS";
+    case 0xC0020014: return "RPC_NT_NO_PROTSEQS";
+    case 0xC0020015: return "RPC_NT_CANT_CREATE_ENDPOINT";
+    case 0xC0020016: return "RPC_NT_OUT_OF_RESOURCES";
+    case 0xC0020017: return "RPC_NT_SERVER_UNAVAILABLE";
+    case 0xC0020018: return "RPC_NT_SERVER_TOO_BUSY";
+    case 0xC0020019: return "RPC_NT_INVALID_NETWORK_OPTIONS";
+    case 0xC002001A: return "RPC_NT_NO_CALL_ACTIVE";
+    case 0xC002001B: return "RPC_NT_CALL_FAILED";
+    case 0xC002001C: return "RPC_NT_CALL_FAILED_DNE";
+    case 0xC002001D: return "RPC_NT_PROTOCOL_ERROR";
+    case 0xC002001F: return "RPC_NT_UNSUPPORTED_TRANS_SYN";
+    case 0xC0020021: return "RPC_NT_UNSUPPORTED_TYPE";
+    case 0xC0020022: return "RPC_NT_INVALID_TAG";
+    case 0xC0020023: return "RPC_NT_INVALID_BOUND";
+    case 0xC0020024: return "RPC_NT_NO_ENTRY_NAME";
+    case 0xC0020025: return "RPC_NT_INVALID_NAME_SYNTAX";
+    case 0xC0020026: return "RPC_NT_UNSUPPORTED_NAME_SYNTAX";
+    case 0xC0020028: return "RPC_NT_UUID_NO_ADDRESS";
+    case 0xC0020029: return "RPC_NT_DUPLICATE_ENDPOINT";
+    case 0xC002002A: return "RPC_NT_UNKNOWN_AUTHN_TYPE";
+    case 0xC002002B: return "RPC_NT_MAX_CALLS_TOO_SMALL";
+    case 0xC002002C: return "RPC_NT_STRING_TOO_LONG";
+    case 0xC002002D: return "RPC_NT_PROTSEQ_NOT_FOUND";
+    case 0xC002002E: return "RPC_NT_PROCNUM_OUT_OF_RANGE";
+    case 0xC002002F: return "RPC_NT_BINDING_HAS_NO_AUTH";
+    case 0xC0020030: return "RPC_NT_UNKNOWN_AUTHN_SERVICE";
+    case 0xC0020031: return "RPC_NT_UNKNOWN_AUTHN_LEVEL";
+    case 0xC0020032: return "RPC_NT_INVALID_AUTH_IDENTITY";
+    case 0xC0020033: return "RPC_NT_UNKNOWN_AUTHZ_SERVICE";
+    case 0xC0020034: return "EPT_NT_INVALID_ENTRY";
+    case 0xC0020035: return "EPT_NT_CANT_PERFORM_OP";
+    case 0xC0020036: return "EPT_NT_NOT_REGISTERED";
+    case 0xC0020037: return "RPC_NT_NOTHING_TO_EXPORT";
+    case 0xC0020038: return "RPC_NT_INCOMPLETE_NAME";
+    case 0xC0020039: return "RPC_NT_INVALID_VERS_OPTION";
+    case 0xC002003A: return "RPC_NT_NO_MORE_MEMBERS";
+    case 0xC002003B: return "RPC_NT_NOT_ALL_OBJS_UNEXPORTED";
+    case 0xC002003C: return "RPC_NT_INTERFACE_NOT_FOUND";
+    case 0xC002003D: return "RPC_NT_ENTRY_ALREADY_EXISTS";
+    case 0xC002003E: return "RPC_NT_ENTRY_NOT_FOUND";
+    case 0xC002003F: return "RPC_NT_NAME_SERVICE_UNAVAILABLE";
+    case 0xC0020040: return "RPC_NT_INVALID_NAF_ID";
+    case 0xC0020041: return "RPC_NT_CANNOT_SUPPORT";
+    case 0xC0020042: return "RPC_NT_NO_CONTEXT_AVAILABLE";
+    case 0xC0020043: return "RPC_NT_INTERNAL_ERROR";
+    case 0xC0020044: return "RPC_NT_ZERO_DIVIDE";
+    case 0xC0020045: return "RPC_NT_ADDRESS_ERROR";
+    case 0xC0020046: return "RPC_NT_FP_DIV_ZERO";
+    case 0xC0020047: return "RPC_NT_FP_UNDERFLOW";
+    case 0xC0020048: return "RPC_NT_FP_OVERFLOW";
+    case 0xC0030001: return "RPC_NT_NO_MORE_ENTRIES";
+    case 0xC0030002: return "RPC_NT_SS_CHAR_TRANS_OPEN_FAIL";
+    case 0xC0030003: return "RPC_NT_SS_CHAR_TRANS_SHORT_FILE";
+    case 0xC0030004: return "RPC_NT_SS_IN_NULL_CONTEXT";
+    case 0xC0030005: return "RPC_NT_SS_CONTEXT_MISMATCH";
+    case 0xC0030006: return "RPC_NT_SS_CONTEXT_DAMAGED";
+    case 0xC0030007: return "RPC_NT_SS_HANDLES_MISMATCH";
+    case 0xC0030008: return "RPC_NT_SS_CANNOT_GET_CALL_HANDLE";
+    case 0xC0030009: return "RPC_NT_NULL_REF_POINTER";
+    case 0xC003000A: return "RPC_NT_ENUM_VALUE_OUT_OF_RANGE";
+    case 0xC003000B: return "RPC_NT_BYTE_COUNT_TOO_SMALL";
+    case 0xC003000C: return "RPC_NT_BAD_STUB_DATA";
+    case 0xC0020049: return "RPC_NT_CALL_IN_PROGRESS";
+    case 0xC002004A: return "RPC_NT_NO_MORE_BINDINGS";
+    case 0xC002004B: return "RPC_NT_GROUP_MEMBER_NOT_FOUND";
+    case 0xC002004C: return "EPT_NT_CANT_CREATE";
+    case 0xC002004D: return "RPC_NT_INVALID_OBJECT";
+    case 0xC002004F: return "RPC_NT_NO_INTERFACES";
+    case 0xC0020050: return "RPC_NT_CALL_CANCELLED";
+    case 0xC0020051: return "RPC_NT_BINDING_INCOMPLETE";
+    case 0xC0020052: return "RPC_NT_COMM_FAILURE";
+    case 0xC0020053: return "RPC_NT_UNSUPPORTED_AUTHN_LEVEL";
+    case 0xC0020054: return "RPC_NT_NO_PRINC_NAME";
+    case 0xC0020055: return "RPC_NT_NOT_RPC_ERROR";
+    case 0x40020056: return "RPC_NT_UUID_LOCAL_ONLY";
+    case 0xC0020057: return "RPC_NT_SEC_PKG_ERROR";
+    case 0xC0020058: return "RPC_NT_NOT_CANCELLED";
+    case 0xC0030059: return "RPC_NT_INVALID_ES_ACTION";
+    case 0xC003005A: return "RPC_NT_WRONG_ES_VERSION";
+    case 0xC003005B: return "RPC_NT_WRONG_STUB_VERSION";
+    case 0xC003005C: return "RPC_NT_INVALID_PIPE_OBJECT";
+    case 0xC003005D: return "RPC_NT_INVALID_PIPE_OPERATION";
+    case 0xC003005E: return "RPC_NT_WRONG_PIPE_VERSION";
+    case 0xC003005F: return "RPC_NT_PIPE_CLOSED";
+    case 0xC0030060: return "RPC_NT_PIPE_DISCIPLINE_ERROR";
+    case 0xC0030061: return "RPC_NT_PIPE_EMPTY";
+    case 0xC0020062: return "RPC_NT_INVALID_ASYNC_HANDLE";
+    case 0xC0020063: return "RPC_NT_INVALID_ASYNC_CALL";
+    case 0x400200AF: return "RPC_NT_SEND_INCOMPLETE";
+    case 0xC0140001: return "STATUS_ACPI_INVALID_OPCODE";
+    case 0xC0140002: return "STATUS_ACPI_STACK_OVERFLOW";
+    case 0xC0140003: return "STATUS_ACPI_ASSERT_FAILED";
+    case 0xC0140004: return "STATUS_ACPI_INVALID_INDEX";
+    case 0xC0140005: return "STATUS_ACPI_INVALID_ARGUMENT";
+    case 0xC0140006: return "STATUS_ACPI_FATAL";
+    case 0xC0140007: return "STATUS_ACPI_INVALID_SUPERNAME";
+    case 0xC0140008: return "STATUS_ACPI_INVALID_ARGTYPE";
+    case 0xC0140009: return "STATUS_ACPI_INVALID_OBJTYPE";
+    case 0xC014000A: return "STATUS_ACPI_INVALID_TARGETTYPE";
+    case 0xC014000B: return "STATUS_ACPI_INCORRECT_ARGUMENT_COUNT";
+    case 0xC014000C: return "STATUS_ACPI_ADDRESS_NOT_MAPPED";
+    case 0xC014000D: return "STATUS_ACPI_INVALID_EVENTTYPE";
+    case 0xC014000E: return "STATUS_ACPI_HANDLER_COLLISION";
+    case 0xC014000F: return "STATUS_ACPI_INVALID_DATA";
+    case 0xC0140010: return "STATUS_ACPI_INVALID_REGION";
+    case 0xC0140011: return "STATUS_ACPI_INVALID_ACCESS_SIZE";
+    case 0xC0140012: return "STATUS_ACPI_ACQUIRE_GLOBAL_LOCK";
+    case 0xC0140013: return "STATUS_ACPI_ALREADY_INITIALIZED";
+    case 0xC0140014: return "STATUS_ACPI_NOT_INITIALIZED";
+    case 0xC0140015: return "STATUS_ACPI_INVALID_MUTEX_LEVEL";
+    case 0xC0140016: return "STATUS_ACPI_MUTEX_NOT_OWNED";
+    case 0xC0140017: return "STATUS_ACPI_MUTEX_NOT_OWNER";
+    case 0xC0140018: return "STATUS_ACPI_RS_ACCESS";
+    case 0xC0140019: return "STATUS_ACPI_INVALID_TABLE";
+    case 0xC0140020: return "STATUS_ACPI_REG_HANDLER_FAILED";
+    case 0xC0140021: return "STATUS_ACPI_POWER_REQUEST_FAILED";
+    case 0xC00A0001: return "STATUS_CTX_WINSTATION_NAME_INVALID";
+    case 0xC00A0002: return "STATUS_CTX_INVALID_PD";
+    case 0xC00A0003: return "STATUS_CTX_PD_NOT_FOUND";
+    case 0x400A0004: return "STATUS_CTX_CDM_CONNECT";
+    case 0x400A0005: return "STATUS_CTX_CDM_DISCONNECT";
+    case 0xC00A0006: return "STATUS_CTX_CLOSE_PENDING";
+    case 0xC00A0007: return "STATUS_CTX_NO_OUTBUF";
+    case 0xC00A0008: return "STATUS_CTX_MODEM_INF_NOT_FOUND";
+    case 0xC00A0009: return "STATUS_CTX_INVALID_MODEMNAME";
+    case 0xC00A000A: return "STATUS_CTX_RESPONSE_ERROR";
+    case 0xC00A000B: return "STATUS_CTX_MODEM_RESPONSE_TIMEOUT";
+    case 0xC00A000C: return "STATUS_CTX_MODEM_RESPONSE_NO_CARRIER";
+    case 0xC00A000D: return "STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE";
+    case 0xC00A000E: return "STATUS_CTX_MODEM_RESPONSE_BUSY";
+    case 0xC00A000F: return "STATUS_CTX_MODEM_RESPONSE_VOICE";
+    case 0xC00A0010: return "STATUS_CTX_TD_ERROR";
+    case 0xC00A0012: return "STATUS_CTX_LICENSE_CLIENT_INVALID";
+    case 0xC00A0013: return "STATUS_CTX_LICENSE_NOT_AVAILABLE";
+    case 0xC00A0014: return "STATUS_CTX_LICENSE_EXPIRED";
+    case 0xC00A0015: return "STATUS_CTX_WINSTATION_NOT_FOUND";
+    case 0xC00A0016: return "STATUS_CTX_WINSTATION_NAME_COLLISION";
+    case 0xC00A0017: return "STATUS_CTX_WINSTATION_BUSY";
+    case 0xC00A0018: return "STATUS_CTX_BAD_VIDEO_MODE";
+    case 0xC00A0022: return "STATUS_CTX_GRAPHICS_INVALID";
+    case 0xC00A0024: return "STATUS_CTX_NOT_CONSOLE";
+    case 0xC00A0026: return "STATUS_CTX_CLIENT_QUERY_TIMEOUT";
+    case 0xC00A0027: return "STATUS_CTX_CONSOLE_DISCONNECT";
+    case 0xC00A0028: return "STATUS_CTX_CONSOLE_CONNECT";
+    case 0xC00A002A: return "STATUS_CTX_SHADOW_DENIED";
+    case 0xC00A002B: return "STATUS_CTX_WINSTATION_ACCESS_DENIED";
+    case 0xC00A002E: return "STATUS_CTX_INVALID_WD";
+    case 0xC00A002F: return "STATUS_CTX_WD_NOT_FOUND";
+    case 0xC00A0030: return "STATUS_CTX_SHADOW_INVALID";
+    case 0xC00A0031: return "STATUS_CTX_SHADOW_DISABLED";
+    case 0xC00A0032: return "STATUS_RDP_PROTOCOL_ERROR";
+    case 0xC00A0033: return "STATUS_CTX_CLIENT_LICENSE_NOT_SET";
+    case 0xC00A0034: return "STATUS_CTX_CLIENT_LICENSE_IN_USE";
+    case 0xC0040035: return "STATUS_PNP_BAD_MPS_TABLE";
+    case 0xC0040036: return "STATUS_PNP_TRANSLATION_FAILED";
+    case 0xC0040037: return "STATUS_PNP_IRQ_TRANSLATION_FAILED";
+    default:         return "STATUS_UNKNOWN";
+    }
+}
+
+
+/*
+ * KsPrintf
+ *   This function is a variable-argument, level-sensitive debug print routine.
+ *   If the specified debug level for the print statement is lower or equal
+ *   to the current debug level, the message will be printed.
+ *
+ * Arguments:
+ *   DebugPrintLevel - Specifies at which debugging level the string should
+ *                     be printed
+ *   DebugMessage - Variable argument ascii c string
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES: 
+ *   N/A
+ */
+
+VOID
+KsPrintf(
+    LONG  DebugPrintLevel,
+    PCHAR DebugMessage,
+    ...
+    )
+{
+    va_list  ap;
+
+    va_start(ap, DebugMessage);
+
+    if (DebugPrintLevel <= KsDebugLevel)
+    {
+        CHAR buffer[0x200] = {0};  /* zero-init: final byte stays NUL even if output is truncated */
+
+        _vsnprintf(buffer, sizeof(buffer) - 1, DebugMessage, ap); /* bounded: vsprintf could overflow this stack buffer */
+
+        KdPrint(("TID:%8.8x: %s", (ULONG)(ULONG_PTR)PsGetCurrentThread(), buffer)); /* PsGetCurrentThread() returns a pointer; cast to match %x */
+    }
+
+    va_end(ap);
+
+} // KsPrintf()
+
+#endif
\ No newline at end of file
diff --git a/lnet/libcfs/winnt/winnt-fs.c b/lnet/libcfs/winnt/winnt-fs.c
new file mode 100644 (file)
index 0000000..47c3346
--- /dev/null
@@ -0,0 +1,541 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <libcfs/libcfs.h>
+
+const CHAR *dos_file_prefix = "\\??\\";
+
+/*
+ * cfs_filp_open
+ *     To open or create a file in kernel mode
+ *
+ * Arguments:
+ *   name:  name of the file to be opened or created, no dos path prefix
+ *   flags: open/creation attribute options
+ *   mode:  access mode/permission to open or create
+ *   err:   error code
+ *
+ * Return Value:
+ *   the pointer to the cfs_file_t or NULL if it fails
+ *
+ * Notes: 
+ *   N/A
+ */
+
+cfs_file_t *cfs_filp_open(const char *name, int flags, int mode, int *err)
+{
+    cfs_file_t *        fp = NULL;
+
+    NTSTATUS            Status;
+
+    OBJECT_ATTRIBUTES   ObjectAttributes;
+    HANDLE              FileHandle;
+    IO_STATUS_BLOCK     IoStatus;
+    ACCESS_MASK         DesiredAccess;
+    ULONG               CreateDisposition;
+    ULONG               ShareAccess;
+    ULONG               CreateOptions;
+
+    USHORT              NameLength = 0;
+    USHORT              PrefixLength = 0;
+
+    UNICODE_STRING      UnicodeName;
+    PWCHAR              UnicodeString = NULL;
+
+    ANSI_STRING         AnsiName;
+    PUCHAR              AnsiString = NULL;
+
+    /* Map the O_* flag settings onto NT access/share/disposition */
+
+    if (cfs_is_flag_set(flags, O_WRONLY)) {
+        DesiredAccess = (GENERIC_WRITE | SYNCHRONIZE);
+        ShareAccess = 0;
+    }  else if (cfs_is_flag_set(flags, O_RDWR)) {
+        DesiredAccess = (GENERIC_READ | GENERIC_WRITE | SYNCHRONIZE);
+        ShareAccess = FILE_SHARE_READ | FILE_SHARE_WRITE;
+    } else {
+        DesiredAccess = (GENERIC_READ | SYNCHRONIZE);
+        ShareAccess = FILE_SHARE_READ;
+    }
+
+    if (cfs_is_flag_set(flags, O_CREAT)) {
+        if (cfs_is_flag_set(flags, O_EXCL)) {
+            CreateDisposition = FILE_CREATE;
+        } else {
+            CreateDisposition = FILE_OPEN_IF;
+        }
+    } else {
+        CreateDisposition = FILE_OPEN;
+    }
+
+    if (cfs_is_flag_set(flags, O_TRUNC)) {
+        if (cfs_is_flag_set(flags, O_EXCL)) {
+            CreateDisposition = FILE_OVERWRITE;
+        } else {
+            CreateDisposition = FILE_OVERWRITE_IF;
+        }
+    }
+
+    /* fix: cfs_filp_read/cfs_filp_write issue ZwReadFile/ZwWriteFile
+     * with no event and no APC, so the handle must be opened for
+     * synchronous I/O or those calls could return STATUS_PENDING */
+    CreateOptions = FILE_SYNCHRONOUS_IO_NONALERT;
+
+    if (cfs_is_flag_set(flags, O_DIRECTORY)) {
+        cfs_set_flag(CreateOptions,  FILE_DIRECTORY_FILE);
+    }
+
+    if (cfs_is_flag_set(flags, O_SYNC)) {
+         cfs_set_flag(CreateOptions, FILE_WRITE_THROUGH);
+    }
+
+    if (cfs_is_flag_set(flags, O_DIRECT)) {
+         cfs_set_flag(CreateOptions, FILE_NO_INTERMEDIATE_BUFFERING);
+    }
+
+    /* Initialize the unicode path name for the specified file: a
+       relative name gets the "\??\" dos device prefix prepended */
+
+    NameLength = (USHORT)strlen(name);
+
+    if (name[0] != '\\') {
+        PrefixLength = (USHORT)strlen(dos_file_prefix);
+    }
+
+    AnsiString = cfs_alloc( sizeof(CHAR) * (NameLength + PrefixLength + 1),
+                            CFS_ALLOC_ZERO);
+    if (NULL == AnsiString) {
+        if (err) *err = -ENOMEM;
+        return NULL;
+    }
+
+    UnicodeString = cfs_alloc( sizeof(WCHAR) * (NameLength + PrefixLength + 1),
+                               CFS_ALLOC_ZERO);
+
+    if (NULL == UnicodeString) {
+        if (err) *err = -ENOMEM;
+        cfs_free(AnsiString);
+        return NULL;
+    }
+
+    if (PrefixLength) {
+        RtlCopyMemory(&AnsiString[0], dos_file_prefix , PrefixLength);
+    }
+
+    RtlCopyMemory(&AnsiString[PrefixLength], name, NameLength);
+    NameLength += PrefixLength;
+
+    AnsiName.MaximumLength = NameLength + 1;
+    AnsiName.Length = NameLength;
+    AnsiName.Buffer = AnsiString;
+
+    UnicodeName.MaximumLength = (NameLength + 1) * sizeof(WCHAR);
+    UnicodeName.Length = 0;
+    UnicodeName.Buffer = (PWSTR)UnicodeString;
+
+    RtlAnsiStringToUnicodeString(&UnicodeName, &AnsiName, FALSE);
+
+    /* Setup the object attributes structure for the file. */
+
+    InitializeObjectAttributes(
+            &ObjectAttributes,
+            &UnicodeName,
+            OBJ_CASE_INSENSITIVE |
+            OBJ_KERNEL_HANDLE,
+            NULL,
+            NULL );
+
+    /* Now open or create the file */
+
+    Status = ZwCreateFile(
+            &FileHandle,
+            DesiredAccess,
+            &ObjectAttributes,
+            &IoStatus,
+            0,
+            FILE_ATTRIBUTE_NORMAL,
+            ShareAccess,
+            CreateDisposition,
+            CreateOptions,
+            NULL,
+            0 );
+
+    /* fix: test the API status itself -- when ZwCreateFile fails
+     * before reaching the file system the IO_STATUS_BLOCK may never
+     * have been written; also guard the err out-parameter like the
+     * allocation-failure paths above do */
+
+    if (!NT_SUCCESS(Status)) {
+        if (err) *err = cfs_error_code(Status);
+        cfs_free(UnicodeString);
+        cfs_free(AnsiString);
+        return NULL;
+    }
+
+    /* Allocate the cfs_file_t: libcfs file object (trailing space
+       holds the NUL-terminated copy of the caller's name) */
+
+    fp = cfs_alloc(sizeof(cfs_file_t) + NameLength, CFS_ALLOC_ZERO);
+
+    if (NULL == fp) {
+        Status = ZwClose(FileHandle);
+        ASSERT(NT_SUCCESS(Status));
+        if (err) *err = -ENOMEM;
+        cfs_free(UnicodeString);
+        cfs_free(AnsiString);
+        return NULL;
+    }
+
+    fp->f_handle = FileHandle;
+    strcpy(fp->f_name, name);
+    fp->f_flags = flags;
+    fp->f_mode  = (mode_t)mode;
+    fp->f_count = 1;
+    if (err) *err = 0;
+
+    /* free the memory of temporary name strings */
+    cfs_free(UnicodeString);
+    cfs_free(AnsiString);
+
+    return fp;
+}
+
+
+/*
+ * cfs_filp_close
+ *     To close the opened file and release the filp structure
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t strcture
+ *
+ * Return Value:
+ *   ZERO: on success
+ *   Non-Zero: on failure
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_filp_close(cfs_file_t *fp)
+{
+    NTSTATUS    Status;
+
+    ASSERT(NULL != fp);
+    ASSERT(NULL != fp->f_handle);
+
+    /* drop the kernel file handle first, then the wrapper itself */
+    Status = ZwClose(fp->f_handle);
+    ASSERT(NT_SUCCESS(Status));
+
+    cfs_free(fp);
+
+    return 0;
+}
+
+
+/*
+ * cfs_filp_read
+ *     To read data from the opened file
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t strcture
+ *   buf:  pointer to the buffer to contain the data
+ *   nbytes: size in bytes to be read from the file
+ *   pos:  offset in file where reading starts, if pos
+ *         NULL, then read from current file offset
+ *
+ * Return Value:
+ *   Actual size read into the buffer in success case
+ *   Error code in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_filp_read(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+    LARGE_INTEGER   address;
+    NTSTATUS        Status;
+    IO_STATUS_BLOCK IoStatus;
+
+    int             rc = 0;
+
+    /* read at *pos when supplied, else at the cached file cursor */
+
+    if (pos != NULL) {
+        address.QuadPart = *pos;
+    } else {
+        address.QuadPart = fp->f_pos;
+    }
+
+    Status = ZwReadFile( fp->f_handle,
+                         0,
+                         NULL,
+                         NULL,
+                         &IoStatus,
+                         buf,
+                         nbytes,
+                         &address,
+                         NULL );
+
+    /* fix: test the returned Status (as cfs_filp_write does) -- when
+     * ZwReadFile fails outright the IO_STATUS_BLOCK may never have
+     * been written, so IoStatus.Status would be stack garbage */
+    if (!NT_SUCCESS(Status)) {
+        rc = cfs_error_code(Status);
+    } else {
+        /* Information carries the number of bytes actually read */
+        rc = (int)IoStatus.Information;
+        fp->f_pos = address.QuadPart + rc;
+        if (pos != NULL) {
+            *pos = fp->f_pos;
+        }
+    }
+
+    return rc;
+}
+
+
+/*
+ * cfs_filp_wrtie
+ *     To write specified data to the opened file
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t strcture
+ *   buf:  pointer to the buffer containing the data
+ *   nbytes: size in bytes to be written to the file
+ *   pos:  offset in file where writing starts, if pos
+ *         NULL, then write to current file offset
+ *
+ * Return Value:
+ *   Actual size written into the buffer in success case
+ *   Error code in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_filp_write(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+    LARGE_INTEGER   address;
+    NTSTATUS        Status;
+    IO_STATUS_BLOCK IoStatus;
+    int             rc = 0;
+
+    /* Write user specified data into the file */
+
+    /* write at *pos when supplied, else at the cached file cursor */
+    if (pos != NULL) {
+        address.QuadPart = *pos;
+    } else {
+        address.QuadPart = fp->f_pos;
+    }
+
+    Status = ZwWriteFile( fp->f_handle,
+                         0,
+                         NULL,
+                         NULL,
+                         &IoStatus,
+                         buf,
+                         nbytes,
+                         &address,
+                         NULL );
+
+    if (!NT_SUCCESS(Status)) {
+        rc =  cfs_error_code(Status);
+    } else {
+        /* Information carries the number of bytes actually written;
+         * advance the cached position past them and report back */
+        rc = (int)IoStatus.Information;
+        fp->f_pos = address.QuadPart + rc;
+        if (pos != NULL) {
+            *pos = fp->f_pos;
+        }   
+    }
+
+    return rc;
+}
+
+
+/*
+ * I/O completion callback used by cfs_filp_fsync: records the result,
+ * wakes the waiter and frees the hand-built IRP.
+ */
+NTSTATUS
+CompletionRoutine(
+    PDEVICE_OBJECT DeviceObject,
+    PIRP Irp,
+    PVOID Context)
+{
+    /* copy the IoStatus result back into the issuer's IO_STATUS_BLOCK */
+    *Irp->UserIosb = Irp->IoStatus;
+    
+    /* signal the event the issuer is blocked on */
+    KeSetEvent(Irp->UserEvent, 0, FALSE);
+   
+    /* free the Irp we allocated */
+    IoFreeIrp(Irp);
+    
+    /* tell the I/O manager to stop processing: the IRP is already freed */
+    return STATUS_MORE_PROCESSING_REQUIRED;
+}
+
+
+/*
+ * cfs_filp_fsync
+ *     To sync the dirty data of the file to disk
+ *
+ * Arguments:
+ *   fp: the pointer of the cfs_file_t strcture
+ *
+ * Return Value:
+ *   Zero:  in success case
+ *   Error code: in failure case
+ *
+ * Notes: 
+ *   Nt kernel doesn't export such a routine to flush a file,
+ *   we must allocate our own Irp and issue it to the file
+ *   system driver.
+ */
+
+int cfs_filp_fsync(cfs_file_t *fp)
+{
+
+    PFILE_OBJECT            FileObject;
+    PDEVICE_OBJECT          DeviceObject;
+
+    NTSTATUS                Status;
+    PIRP                    Irp;
+    KEVENT                  Event;
+    IO_STATUS_BLOCK         IoSb;
+    PIO_STACK_LOCATION      IrpSp;
+
+    /* get the FileObject and the DeviceObject */
+
+    Status = ObReferenceObjectByHandle(
+                fp->f_handle,
+                FILE_WRITE_DATA,
+                NULL,
+                KernelMode,
+                (PVOID*)&FileObject,
+                NULL );
+
+    if (!NT_SUCCESS(Status)) {
+        return cfs_error_code(Status);
+    }
+
+    DeviceObject = IoGetRelatedDeviceObject(FileObject);
+
+    /* allocate a new Irp sized for the whole device stack */
+
+    Irp = IoAllocateIrp(DeviceObject->StackSize, FALSE);
+
+    if (!Irp) {
+
+        ObDereferenceObject(FileObject);
+        return -ENOMEM;
+    }
+
+    /* intialize the event that CompletionRoutine will signal */
+    KeInitializeEvent(&Event, SynchronizationEvent, FALSE);
+
+    /* setup the Irp: UserIosb/UserEvent are consumed by
+     * CompletionRoutine when the flush finishes */
+    Irp->UserEvent = &Event;
+    Irp->UserIosb = &IoSb;
+    Irp->RequestorMode = KernelMode;
+
+    Irp->Tail.Overlay.Thread = PsGetCurrentThread();
+    Irp->Tail.Overlay.OriginalFileObject = FileObject;
+
+    /* setup the Irp stack location as an IRP_MJ_FLUSH_BUFFERS request */
+    IrpSp = IoGetNextIrpStackLocation(Irp);
+
+    IrpSp->MajorFunction = IRP_MJ_FLUSH_BUFFERS;
+    IrpSp->DeviceObject = DeviceObject;
+    IrpSp->FileObject = FileObject;
+
+    /* invoke on success, error AND cancel so the event always fires */
+    IoSetCompletionRoutine(Irp, CompletionRoutine, 0, TRUE, TRUE, TRUE);
+
+
+    /* issue the Irp to the underlying file system driver */
+    IoCallDriver(DeviceObject, Irp);
+
+    /* wait until it is finished; the IRP itself is freed by
+     * CompletionRoutine, only IoSb survives */
+    KeWaitForSingleObject(&Event, Executive, KernelMode, TRUE, 0);
+
+    /* cleanup our reference on it */
+    ObDereferenceObject(FileObject);
+
+    Status = IoSb.Status;
+
+    return cfs_error_code(Status);
+}
+
+/*
+ * cfs_get_file
+ *     To increase the reference of the file object
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t strcture
+ *
+ * Return Value:
+ *   Zero:  in success case
+ *   Non-Zero: in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_get_file(cfs_file_t *fp)
+{
+    /* atomically bump the reference count; always succeeds */
+    InterlockedIncrement(&fp->f_count);
+
+    return 0;
+}
+
+
+/*
+ * cfs_put_file
+ *     To decrease the reference of the file object
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t strcture
+ *
+ * Return Value:
+ *   Zero:  in success case
+ *   Non-Zero: in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_put_file(cfs_file_t *fp)
+{
+    /* close and free the file once the last reference is dropped */
+    if (0 == InterlockedDecrement(&fp->f_count)) {
+        cfs_filp_close(fp);
+    }
+
+    return 0;
+}
+
+
+/*
+ * cfs_file_count
+ *   To query the reference count of the file object
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t strcture
+ *
+ * Return Value:
+ *   the reference count of the file object
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_file_count(cfs_file_t *fp)
+{
+    /* plain snapshot of the current reference count */
+    return (int)fp->f_count;
+}
diff --git a/lnet/libcfs/winnt/winnt-lock.c b/lnet/libcfs/winnt/winnt-lock.c
new file mode 100644 (file)
index 0000000..b77eb95
--- /dev/null
@@ -0,0 +1,353 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <libcfs/libcfs.h>
+
+
+#if _X86_
+
+/* x86 implementations: naked FASTCALL functions, so the arguments
+ * arrive in ECX/EDX and the body is pure inline assembly using the
+ * bus-locked forms of add/sub/inc/dec. */
+
+void __declspec (naked) FASTCALL
+atomic_add(
+    int i,
+    atomic_t *v
+    )
+{
+    // ECX = i
+    // EDX = v ; [EDX][0] = v->counter
+
+    __asm {
+        lock add dword ptr [edx][0], ecx
+        ret
+    }
+}
+
+void __declspec (naked) FASTCALL
+atomic_sub(
+    int i,
+    atomic_t *v
+   ) 
+{
+    // ECX = i
+    // EDX = v ; [EDX][0] = v->counter
+
+    __asm {
+        lock sub dword ptr [edx][0], ecx
+        ret
+    }
+}
+
+void __declspec (naked) FASTCALL
+atomic_inc(
+    atomic_t *v
+    )
+{
+    //InterlockedIncrement((PULONG)(&((v)->counter)));
+
+    //` ECX = v ; [ECX][0] = v->counter
+
+    __asm {
+        lock inc dword ptr [ecx][0]
+        ret
+    }
+}
+
+void __declspec (naked) FASTCALL
+atomic_dec(
+    atomic_t *v
+    )
+{
+    // ECX = v ; [ECX][0] = v->counter
+
+    __asm {
+        lock dec dword ptr [ecx][0]
+        ret
+    }
+}
+
+/* the *_and_test variants return nonzero (AL via SETE) when the
+ * operation leaves the counter at exactly zero */
+
+int __declspec (naked) FASTCALL 
+atomic_sub_and_test(
+    int i,
+    atomic_t *v
+    )
+{
+
+    // ECX = i
+    // EDX = v ; [EDX][0] = v->counter
+
+    __asm {
+        xor eax, eax
+        lock sub dword ptr [edx][0], ecx
+        sete al
+        ret
+    }
+}
+
+int __declspec (naked) FASTCALL
+atomic_inc_and_test(
+    atomic_t *v
+    )
+{
+    // ECX = v ; [ECX][0] = v->counter
+
+    __asm {
+        xor eax, eax
+        lock inc dword ptr [ecx][0]
+        sete al
+        ret
+    }
+}
+
+int __declspec (naked) FASTCALL
+atomic_dec_and_test(
+    atomic_t *v
+    )
+{
+    // ECX = v ; [ECX][0] = v->counter
+
+    __asm {
+        xor eax, eax
+        lock dec dword ptr [ecx][0]
+        sete al
+        ret
+    }
+}
+
+#else
+
+/* portable (non-x86) implementations built on the Interlocked* API */
+
+void FASTCALL
+atomic_add(
+    int i,
+    atomic_t *v
+    )
+{
+    InterlockedExchangeAdd( (PULONG)(&((v)->counter)) , (LONG) (i));
+}
+
+void FASTCALL
+atomic_sub(
+    int i,
+    atomic_t *v
+   ) 
+{
+    /* subtraction expressed as adding the negated delta */
+    InterlockedExchangeAdd( (PULONG)(&((v)->counter)) , (LONG) (-1*i));
+}
+
+void FASTCALL
+atomic_inc(
+    atomic_t *v
+    )
+{
+   InterlockedIncrement((PULONG)(&((v)->counter)));
+}
+
+void FASTCALL
+atomic_dec(
+    atomic_t *v
+    )
+{
+    InterlockedDecrement((PULONG)(&((v)->counter)));
+}
+
+/* compare-and-swap loop: retry until the counter is unchanged between
+ * the read and the exchange; returns nonzero when the new value is 0 */
+int FASTCALL 
+atomic_sub_and_test(
+    int i,
+    atomic_t *v
+    )
+{
+    int counter, result;
+
+    do {
+
+        counter = v->counter;
+        result = counter - i;
+
+    } while ( InterlockedCompareExchange(
+                &(v->counter),
+                result,
+                counter) !=  counter);
+
+    return (result == 0);
+}
+
+int FASTCALL
+atomic_inc_and_test(
+    atomic_t *v
+    )
+{
+    int counter, result;
+
+    do {
+
+        counter = v->counter;
+        result = counter + 1;
+
+    } while ( InterlockedCompareExchange(
+                &(v->counter),
+                result,
+                counter) !=  counter);
+
+    return (result == 0);
+}
+
+/* atomically decrement *v; returns nonzero when the result is zero */
+int FASTCALL
+atomic_dec_and_test(
+    atomic_t *v
+    )
+{
+    int counter, result;
+
+    do {
+
+        counter = v->counter;
+        /* fix: decrement -- the original computed counter + 1, making
+         * this an accidental duplicate of atomic_inc_and_test */
+        result = counter - 1;
+
+    } while ( InterlockedCompareExchange(
+                &(v->counter),
+                result,
+                counter) !=  counter);
+
+    return (result == 0);
+}
+
+#endif
+
+
+/*
+ * rw spinlock
+ */
+
+
+void
+rwlock_init(rwlock_t * rwlock)
+{
+    /* guard serializes count updates; count: 0 = free, >0 = number of
+     * readers, -1 = held by a writer (see read_lock/write_lock) */
+    spin_lock_init(&rwlock->guard);
+    rwlock->count = 0;
+}
+
+void
+rwlock_fini(rwlock_t * rwlock)
+{
+    /* nothing to tear down: the guard spinlock holds no resources */
+}
+
+void
+read_lock(rwlock_t * rwlock)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+   
+    /* raise to DPC level for the duration of the lock; the saved IRQL
+     * is restored by read_unlock() */
+    slot->irql = KeRaiseIrqlToDpcLevel();
+
+    /* spin until no writer holds the lock (count < 0 means writer) */
+    while (TRUE) {
+	   spin_lock(&rwlock->guard);
+        if (rwlock->count >= 0)
+            break;
+        spin_unlock(&rwlock->guard);
+    }
+
+    /* register ourselves as one more reader */
+	rwlock->count++;
+	spin_unlock(&rwlock->guard);
+}
+
+/* release a reader reference taken by read_lock() and restore the
+ * caller's IRQL that read_lock() saved in its task slot */
+void
+read_unlock(rwlock_t * rwlock)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+
+    spin_lock(&rwlock->guard);
+    ASSERT(rwlock->count > 0);
+    rwlock->count--;
+    /* fix: the original tested the pointer itself ("rwlock < 0")
+     * instead of the reader count when checking for underflow */
+    if (rwlock->count < 0) {
+        cfs_enter_debugger();
+    }
+    spin_unlock(&rwlock->guard);
+
+    KeLowerIrql(slot->irql);
+}
+
+void
+write_lock(rwlock_t * rwlock)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+   
+    /* raise to DPC level; the saved IRQL is restored by write_unlock() */
+    slot->irql = KeRaiseIrqlToDpcLevel();
+
+    /* spin until the lock is completely free (no readers, no writer) */
+    while (TRUE) {
+	   spin_lock(&rwlock->guard);
+        if (rwlock->count == 0)
+            break;
+        spin_unlock(&rwlock->guard);
+    }
+
+    /* count == -1 marks exclusive writer ownership */
+	rwlock->count = -1;
+	spin_unlock(&rwlock->guard);
+}
+
+void
+write_unlock(rwlock_t * rwlock)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+   
+    /* drop exclusive ownership (-1) back to free (0) */
+    spin_lock(&rwlock->guard);
+	ASSERT(rwlock->count == -1);
+    rwlock->count = 0;
+	spin_unlock(&rwlock->guard);
+
+    /* restore the IRQL saved by write_lock() */
+    KeLowerIrql(slot->irql);
+}
\ No newline at end of file
diff --git a/lnet/libcfs/winnt/winnt-lwt.c b/lnet/libcfs/winnt/winnt-lwt.c
new file mode 100644 (file)
index 0000000..ed09d99
--- /dev/null
@@ -0,0 +1,20 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
diff --git a/lnet/libcfs/winnt/winnt-mem.c b/lnet/libcfs/winnt/winnt-mem.c
new file mode 100644 (file)
index 0000000..9c4bebd
--- /dev/null
@@ -0,0 +1,332 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <libcfs/libcfs.h>
+
+
+cfs_mem_cache_t *cfs_page_t_slab = NULL;
+cfs_mem_cache_t *cfs_page_p_slab = NULL;
+
+/*
+ * cfs_alloc_page
+ *   To allocate the cfs_page_t and also 1 page of memory
+ *
+ * Arguments:
+ *   flags:  the allocation options
+ *
+ * Return Value:
+ *   pointer to the cfs_page_t strcture in success or
+ *   NULL in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+cfs_page_t * cfs_alloc_page(int flags)
+{
+    cfs_page_t *pg;
+    /* page descriptor comes from its dedicated slab */
+    pg = cfs_mem_cache_alloc(cfs_page_t_slab, 0);
+    
+    if (NULL == pg) {
+        cfs_enter_debugger();
+        return NULL;
+    }
+
+    memset(pg, 0, sizeof(cfs_page_t));
+    /* backing page memory comes from the page-sized slab */
+    pg->addr = cfs_mem_cache_alloc(cfs_page_p_slab, 0);
+    atomic_set(&pg->count, 1);
+
+    if (pg->addr) {
+        if (cfs_is_flag_set(flags, CFS_ALLOC_ZERO)) {
+            memset(pg->addr, 0, CFS_PAGE_SIZE);
+        }
+    } else {
+        /* page memory exhausted: roll back the descriptor allocation */
+        cfs_enter_debugger();
+        cfs_mem_cache_free(cfs_page_t_slab, pg);
+        pg = NULL;
+    }
+
+    return pg;
+}
+
+/*
+ * cfs_free_page
+ *   To free the cfs_page_t including the page
+ *
+ * Arguments:
+ *   pg:  pointer to the cfs_page_t strcture
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+void cfs_free_page(cfs_page_t *pg)
+{
+    ASSERT(pg != NULL);
+    ASSERT(pg->addr  != NULL);
+    /* caller must hold the last (or only) reference */
+    ASSERT(atomic_read(&pg->count) <= 1);
+
+    /* return the backing memory and the descriptor to their slabs */
+    cfs_mem_cache_free(cfs_page_p_slab, pg->addr);
+    cfs_mem_cache_free(cfs_page_t_slab, pg);
+}
+
+
+/*
+ * cfs_alloc
+ *   To allocate memory from system pool
+ *
+ * Arguments:
+ *   nr_bytes:  length in bytes of the requested buffer
+ *   flags:     flags indiction
+ *
+ * Return Value:
+ *   NULL: if there's no enough memory space in system
+ *   the address of the allocated memory in success.
+ *
+ * Notes: 
+ *   This operation can be treated as atomic.
+ */
+
+void *
+cfs_alloc(size_t nr_bytes, u_int32_t flags)
+{
+	void *ptr;
+
+    /* Ignore the flags: always allcoate from NonPagedPool */
+
+	ptr = ExAllocatePoolWithTag(NonPagedPool, nr_bytes, 'Lufs');
+
+    /* honour CFS_ALLOC_ZERO by clearing the buffer ourselves */
+	if (ptr != NULL && (flags & CFS_ALLOC_ZERO)) {
+		memset(ptr, 0, nr_bytes);
+    }
+
+    if (!ptr) {
+        cfs_enter_debugger();
+    }
+
+	return ptr;
+}
+
+/*
+ * cfs_free
+ *   To free the sepcified memory to system pool
+ *
+ * Arguments:
+ *   addr:   pointer to the buffer to be freed
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *    This operation can be treated as atomic.
+ */
+
+void
+cfs_free(void *addr)
+{
+    /* return the buffer allocated by cfs_alloc() to the system pool */
+    ExFreePool(addr);
+}
+
+/*
+ * cfs_alloc_large
+ *   To allocate large block of memory from system pool
+ *
+ * Arguments:
+ *   nr_bytes:  length in bytes of the requested buffer
+ *
+ * Return Value:
+ *   NULL: if there's no enough memory space in system
+ *   the address of the allocated memory in success.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void *
+cfs_alloc_large(size_t nr_bytes)
+{
+    /* no distinct large-block path on winnt: delegate to cfs_alloc */
+    return cfs_alloc(nr_bytes, 0);
+}
+
+/*
+ * cfs_free_large
+ *   To free the sepcified memory to system pool
+ *
+ * Arguments:
+ *   addr:   pointer to the buffer to be freed
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void
+cfs_free_large(void *addr)
+{
+    /* counterpart of cfs_alloc_large: plain cfs_free */
+    cfs_free(addr);
+}
+
+
+/*
+ * cfs_mem_cache_create
+ *   To create a SLAB cache
+ *
+ * Arguments:
+ *   name:   name string of the SLAB cache to be created
+ *   size:   size in bytes of SLAB entry buffer
+ *   offset: offset in the page
+ *   flags:  SLAB creation flags
+*
+ * Return Value:
+ *   The poitner of cfs_memory_cache structure in success.
+ *   NULL pointer in failure case.
+ *
+ * Notes: 
+ *   1, offset won't be used here.
+ *   2, it could be better to induce a lock to protect the access of the
+ *       SLAB structure on SMP if there's not outside lock protection.
+ *   3, parameters C/D are removed.
+ */
+
+cfs_mem_cache_t *
+cfs_mem_cache_create(
+    const char * name,
+    size_t size,
+    size_t offset,
+    unsigned long flags
+    )
+{
+    cfs_mem_cache_t * kmc = NULL;
+
+    /*  The name of the SLAB could not exceed 20 chars */
+
+    /* presumably kmc->name is a 20-byte array, hence the strcpy below
+     * is safe only under this guard -- TODO confirm against the
+     * cfs_mem_cache_t declaration */
+    if (name && strlen(name) >= 20) {
+        goto errorout;
+    }
+
+    /* Allocate and initialize the SLAB strcture */
+
+    kmc = cfs_alloc (sizeof(cfs_mem_cache_t), 0);
+
+    if (NULL == kmc) {
+        goto errorout;
+    }
+
+    memset(kmc, 0, sizeof(cfs_mem_cache_t));
+
+    kmc->flags = flags;
+
+    if (name) {
+        strcpy(&kmc->name[0], name);
+    }
+
+    /* Initialize the corresponding LookAside list: non-paged entries
+     * of the requested size, no custom allocate/free callbacks */
+
+    ExInitializeNPagedLookasideList(
+            &(kmc->npll),
+            NULL,
+            NULL,
+            0,
+            size,
+            'pnmk',
+            0);
+errorout:
+
+    return kmc;
+}
+
+/*
+ * cfs_mem_cache_destroy
+ *   To destroy the unused SLAB cache
+ *
+ * Arguments:
+ *   kmc: the SLAB cache to be destroied.
+ *
+ * Return Value:
+ *   0: in success case.
+ *   1: in failure case.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_mem_cache_destroy (cfs_mem_cache_t * kmc)
+{
+    ASSERT(NULL != kmc);
+
+    /* tear down the lookaside list backing the cache, then release
+       the cache descriptor itself */
+    ExDeleteNPagedLookasideList(&kmc->npll);
+    cfs_free(kmc);
+
+    return 0;
+}
+
+/*
+ * cfs_mem_cache_alloc
+ *   To allocate an object (LookAside entry) from the SLAB
+ *
+ * Arguments:
+ *   kmc:   the SLAB cache to be allocated from.
+ *   flags: flags for allocation options
+ *
+ * Return Value:
+ *   object buffer address: in success case.
+ *   NULL: in failure case.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void *cfs_mem_cache_alloc(cfs_mem_cache_t * kmc, int flags)
+{
+    /* flags are ignored: entries always come from the non-paged
+       lookaside list configured at cache creation */
+    return ExAllocateFromNPagedLookasideList(&kmc->npll);
+}
+
+/*
+ * cfs_mem_cache_free
+ *   To free an object (LookAside entry) to the SLAB cache
+ *
+ * Arguments:
+ *   kmc: the SLAB cache to be freed to.
+ *   buf: the pointer to the object to be freed.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_mem_cache_free(cfs_mem_cache_t * kmc, void * buf)
+{
+    /* hand the entry back to the cache's lookaside list */
+    ExFreeToNPagedLookasideList(&kmc->npll, buf);
+}
diff --git a/lnet/libcfs/winnt/winnt-module.c b/lnet/libcfs/winnt/winnt-module.c
new file mode 100644 (file)
index 0000000..30eb729
--- /dev/null
@@ -0,0 +1,172 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+
+#define DEBUG_SUBSYSTEM S_LIBCFS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+#define LIBCFS_MINOR 240
+
+int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
+{
+        struct libcfs_ioctl_hdr *hdr;
+        struct libcfs_ioctl_data *data;
+        int err;
+        ENTRY;
+
+        hdr = (struct libcfs_ioctl_hdr *)buf;
+        data = (struct libcfs_ioctl_data *)buf;
+
+        /* first copy just the header to learn the full payload size */
+        err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
+        if (err)
+                RETURN(err);
+
+        if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
+                CERROR(("LIBCFS: version mismatch kernel vs application\n"));
+                RETURN(-EINVAL);
+        }
+
+        /* reject payloads that would not fit between buf and end */
+        if (hdr->ioc_len + buf >= end) {
+                CERROR(("LIBCFS: user buffer exceeds kernel buffer\n"));
+                RETURN(-EINVAL);
+        }
+
+        if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
+                CERROR(("LIBCFS: user buffer too small for ioctl\n"));
+                RETURN(-EINVAL);
+        }
+
+        /* now copy the complete, size-validated request */
+        err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
+        if (err)
+                RETURN(err);
+
+        if (libcfs_ioctl_is_invalid(data)) {
+                CERROR(("LIBCFS: ioctl not correctly formatted\n"));
+                RETURN(-EINVAL);
+        }
+
+        /* rebuild the inline-buffer pointers, which are meaningless
+         * user-space addresses after the copy */
+        if (data->ioc_inllen1)
+                data->ioc_inlbuf1 = &data->ioc_bulk[0];
+
+        if (data->ioc_inllen2)
+                data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+                        size_round(data->ioc_inllen1);
+
+        RETURN(0);
+}
+                                                                                                                                                                        
+extern struct cfs_psdev_ops          libcfs_psdev_ops;
+
+static int 
+libcfs_psdev_open(cfs_file_t * file)
+{ 
+	struct libcfs_device_userstate **pdu = NULL;
+	int    rc = 0;
+
+	/* let the registered psdev handler allocate per-open state,
+	 * stored in the file's private_data */
+	pdu = (struct libcfs_device_userstate **)&file->private_data;
+	if (libcfs_psdev_ops.p_open != NULL)
+		rc = libcfs_psdev_ops.p_open(0, (void *)pdu);
+	else
+		return (-EPERM);
+	return rc;
+}
+
+/* called when closing /dev/device: hands the per-open state saved by
+ * libcfs_psdev_open() to the registered close handler */
+static int 
+libcfs_psdev_release(cfs_file_t * file)
+{
+	/* fix: struct tag typo -- was "libcfss_device_userstate" */
+	struct libcfs_device_userstate *pdu;
+	int    rc = 0;
+
+	pdu = file->private_data;
+	if (libcfs_psdev_ops.p_close != NULL)
+		rc = libcfs_psdev_ops.p_close(0, (void *)pdu);
+	else
+		rc = -EPERM;
+	return rc;
+}
+
+static int 
+libcfs_ioctl(cfs_file_t * file, unsigned int cmd, ulong_ptr arg)
+{ 
+	struct cfs_psdev_file    pfile;
+	int    rc = 0;
+
+	/* reject commands outside the libcfs ioctl number space */
+	if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || 
+	     _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR  || 
+	     _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) { 
+		CDEBUG(D_IOCTL, ("invalid ioctl ( type %d, nr %d, size %d )\n", 
+		       _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd))); 
+		return (-EINVAL); 
+	} 
+	
+	/* Handle platform-dependent IOC requests */
+	switch (cmd) { 
+	case IOC_LIBCFS_PANIC: 
+		if (!capable (CAP_SYS_BOOT)) 
+			return (-EPERM); 
+		CERROR(("debugctl-invoked panic"));
+        /* deliberate bugcheck: the winnt analogue of a kernel panic */
+        KeBugCheckEx('LUFS', (ULONG_PTR)libcfs_ioctl, (ULONG_PTR)NULL, (ULONG_PTR)NULL, (ULONG_PTR)NULL);
+
+		return (0);
+	case IOC_LIBCFS_MEMHOG:
+
+		if (!capable (CAP_SYS_ADMIN)) 
+			return -EPERM;
+        break;
+	}
+
+	/* everything else is forwarded to the registered psdev handler */
+	pfile.off = 0;
+	pfile.private_data = file->private_data;
+	if (libcfs_psdev_ops.p_ioctl != NULL) 
+		rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); 
+	else
+		rc = -EPERM;
+	return (rc);
+}
+
+static struct file_operations libcfs_fops = {
+    /* lseek: */  NULL,
+    /* read: */   NULL,
+    /* write: */  NULL,
+    /* ioctl: */  libcfs_ioctl,
+    /* open: */   libcfs_psdev_open,
+    /* release:*/ libcfs_psdev_release
+};
+
+cfs_psdev_t libcfs_dev = { 
+       LIBCFS_MINOR, 
+       "libcfs", 
+       &libcfs_fops
+};
+
+void
+libcfs_daemonize (char *str)
+{
+	/* no-op on winnt -- system threads need no detaching; just log
+	 * the request for diagnostic purposes */
+	printk("Daemonize request: %s.\n", str);
+}
+
+void 
+libcfs_blockallsigs(void)
+{
+	/* no-op on winnt: there are no unix-style signals to block */
+}
diff --git a/lnet/libcfs/winnt/winnt-prim.c b/lnet/libcfs/winnt/winnt-prim.c
new file mode 100644 (file)
index 0000000..a7d4fba
--- /dev/null
@@ -0,0 +1,636 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+
+/*
+ *  Thread routines
+ */
+
+/*
+ * cfs_thread_proc
+ *   Lustre thread procedure wrapper routine (It's an internal routine)
+ *
+ * Arguments:
+ *   context:  a structure of cfs_thread_context_t, containing
+ *             all the necessary parameters
+ *
+ * Return Value:
+ *   void: N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void
+cfs_thread_proc(
+    void * context
+    )
+{
+    cfs_thread_context_t * thread_context = 
+        (cfs_thread_context_t *) context;
+
+    /* Execute the specified function ... */
+
+    if (thread_context->func) {
+        (thread_context->func)(thread_context->arg);
+    }
+
+    /* Free the context memory (allocated by cfs_kernel_thread) */
+   
+    cfs_free(context);
+
+    /* Terminate this system thread -- this call never returns */
+
+    PsTerminateSystemThread(STATUS_SUCCESS);
+}
+
+/*
+ * cfs_kernel_thread
+ *   Create a system thread to execute the routine specified
+ *
+ * Arguments:
+ *   func:  function to be executed in the thread
+ *   arg:   argument transferred to func function
+ *   flag:  thread creation flags.
+ *
+ * Return Value:
+ *   int:   0 on success or error codes
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_kernel_thread(int (*func)(void *), void *arg, int flag)
+{
+    cfs_handle_t  thread = NULL;
+    NTSTATUS      status;
+    cfs_thread_context_t * context = NULL;
+
+    /* Allocate the context to be transferred to system thread;
+       ownership passes to cfs_thread_proc, which frees it on exit */
+
+    context = cfs_alloc(sizeof(cfs_thread_context_t), CFS_ALLOC_ZERO);
+
+    if (!context) {
+        return -ENOMEM;
+    }
+
+    context->func  = func;
+    context->arg   = arg;
+
+    /* Create system thread with the cfs_thread_proc wrapper */
+
+    status = PsCreateSystemThread(
+                &thread,
+                (ACCESS_MASK)0L,
+                0, 0, 0,
+                cfs_thread_proc,
+                context);
+
+    if (!NT_SUCCESS(status)) {
+
+
+        cfs_free(context);
+
+        /* We need translate the nt status to linux error code */
+
+        return cfs_error_code(status);
+    }
+
+    //
+    //  Close the handle so the thread object is not pinned by this
+    //  reference; the thread itself keeps running detached.
+    //
+
+    ZwClose(thread);
+
+    return 0;
+}
+
+
+/*
+ * Symbols routines
+ */
+
+
+static CFS_DECL_RWSEM(cfs_symbol_lock);
+CFS_LIST_HEAD(cfs_symbol_list);
+
+int MPSystem = FALSE;
+
+/*
+ * cfs_symbol_get
+ *   To query the specified symbol from the symbol table
+ *
+ * Arguments:
+ *   name:  the symbol name to be queried
+ *
+ * Return Value:
+ *   If the symbol is in the table, return the address of it.
+ *   If not, return NULL.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/*
+ * cfs_symbol_get
+ *   Look up @name in the global symbol list and take a reference.
+ *
+ * Returns the symbol's value on a match, NULL when not registered.
+ *
+ * Fix: the original returned sym->value whenever the list was
+ * non-empty, because sym still pointed at the last visited entry after
+ * an unsuccessful scan; track the match explicitly instead.
+ *
+ * NOTE(review): sym->ref is modified under the read lock, so two
+ * concurrent gets may race on the counter -- confirm whether the
+ * write lock should be taken here.
+ */
+void *
+cfs_symbol_get(const char *name)
+{
+    struct list_head    *walker;
+    struct cfs_symbol   *sym;
+    struct cfs_symbol   *found = NULL;
+
+    down_read(&cfs_symbol_lock);
+    list_for_each(walker, &cfs_symbol_list) {
+        sym = list_entry (walker, struct cfs_symbol, sym_list);
+        if (!strcmp(sym->name, name)) {
+            sym->ref ++;
+            found = sym;
+            break;
+        } 
+    } 
+    up_read(&cfs_symbol_lock);
+
+    /* only a real match may be returned */
+    if (found != NULL) 
+        return found->value;
+
+    return NULL;
+}
+
+/*
+ * cfs_symbol_put
+ *   To decrease the reference of  the specified symbol
+ *
+ * Arguments:
+ *   name:  the symbol name to be dereferred
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/*
+ * cfs_symbol_put
+ *   Drop one reference on the symbol registered under @name.
+ *
+ * Fix: the original asserted LASSERT(sym != NULL), which passes for
+ * ANY non-empty list (sym points at the last visited entry after an
+ * unsuccessful scan); assert that the name actually matched instead.
+ */
+void
+cfs_symbol_put(const char *name)
+{
+    struct list_head    *walker;
+    struct cfs_symbol   *sym;
+    struct cfs_symbol   *found = NULL;
+
+    down_read(&cfs_symbol_lock);
+    list_for_each(walker, &cfs_symbol_list) {
+        sym = list_entry (walker, struct cfs_symbol, sym_list);
+        if (!strcmp(sym->name, name)) {
+            LASSERT(sym->ref > 0);
+            sym->ref--;
+            found = sym;
+            break;
+        } 
+    } 
+    up_read(&cfs_symbol_lock);
+
+    /* putting a symbol that was never registered is a caller bug */
+    LASSERT(found != NULL);
+}
+
+
+/*
+ * cfs_symbol_register
+ *   To register the specified symbol infromation
+ *
+ * Arguments:
+ *   name:  the symbol name to be dereferred
+ *   value: the value that the symbol stands for
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   Zero: Succeed to register
+ *   Non-Zero: Fail to register the symbol
+ */
+
+/*
+ * cfs_symbol_register
+ *   Register @name -> @value in the global symbol list.
+ *
+ * Returns 0 on success (including the already-registered case, where
+ * the new node is discarded) or -ENOMEM on allocation failure.
+ *
+ * Fix: copy at most CFS_SYMBOL_LEN - 1 bytes so the stored name is
+ * always NUL-terminated; strncpy() does not terminate on truncation
+ * (the node is zero-filled by CFS_ALLOC_ZERO, so the last byte stays 0).
+ */
+int
+cfs_symbol_register(const char *name, const void *value)
+{
+    struct list_head    *walker;
+    struct cfs_symbol   *sym = NULL;
+    struct cfs_symbol   *new = NULL;
+
+    new = cfs_alloc(sizeof(struct cfs_symbol), CFS_ALLOC_ZERO);
+    if (!new) {
+        return (-ENOMEM);
+    }
+    strncpy(new->name, name, CFS_SYMBOL_LEN - 1);
+    new->value = (void *)value;
+    new->ref = 0;
+    CFS_INIT_LIST_HEAD(&new->sym_list);
+
+    down_write(&cfs_symbol_lock);
+    list_for_each(walker, &cfs_symbol_list) {
+        sym = list_entry (walker, struct cfs_symbol, sym_list);
+        if (!strcmp(sym->name, name)) {
+            up_write(&cfs_symbol_lock);
+            cfs_free(new);
+            return 0; // already registered
+        }
+    }
+    list_add_tail(&new->sym_list, &cfs_symbol_list);
+    up_write(&cfs_symbol_lock);
+
+    return 0;
+}
+
+/*
+ * cfs_symbol_unregister
+ *   To unregister/remove the specified symbol
+ *
+ * Arguments:
+ *   name:  the symbol name to be dereferred
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void
+cfs_symbol_unregister(const char *name)
+{
+    struct list_head    *walker;
+    struct list_head    *nxt;
+    struct cfs_symbol   *sym = NULL;
+
+    /* safe iterator required: the matched node is freed in the loop */
+    down_write(&cfs_symbol_lock);
+    list_for_each_safe(walker, nxt, &cfs_symbol_list) {
+        sym = list_entry (walker, struct cfs_symbol, sym_list);
+        if (!strcmp(sym->name, name)) {
+            LASSERT(sym->ref == 0);
+            list_del (&sym->sym_list);
+            cfs_free(sym);
+            break;
+        }
+    }
+    up_write(&cfs_symbol_lock);
+}
+
+/*
+ * cfs_symbol_clean
+ *   To clean all the symbols
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/*
+ * cfs_symbol_clean
+ *   Remove and free every symbol in the global list (module unload).
+ *
+ * Fix: the original used list_for_each while freeing each node inside
+ * the loop, so the iterator advanced through freed memory; use
+ * list_for_each_safe, which caches the next pointer before the body
+ * runs (same pattern as cfs_symbol_unregister above).
+ */
+void
+cfs_symbol_clean()
+{
+    struct list_head    *walker;
+    struct list_head    *nxt;
+    struct cfs_symbol   *sym = NULL;
+
+    down_write(&cfs_symbol_lock);
+    list_for_each_safe(walker, nxt, &cfs_symbol_list) {
+        sym = list_entry (walker, struct cfs_symbol, sym_list);
+        LASSERT(sym->ref == 0);
+        list_del (&sym->sym_list);
+        cfs_free(sym);
+    }
+    up_write(&cfs_symbol_lock);
+    return;
+}
+
+
+
+/*
+ * Timer routines
+ */
+
+
+/* Timer dpc procedure: runs at DISPATCH_LEVEL when the kernel timer
+ * expires; clears the armed flag, then invokes the user callback. */
+static void
+cfs_timer_dpc_proc (
+    IN PKDPC Dpc,
+    IN PVOID DeferredContext,
+    IN PVOID SystemArgument1,
+    IN PVOID SystemArgument2)
+{
+    cfs_timer_t *   timer;
+    KIRQL           Irql;
+
+    timer = (cfs_timer_t *) DeferredContext;
+
+    /* clear the flag: the timer is no longer pending */
+    KeAcquireSpinLock(&(timer->Lock), &Irql);
+    cfs_clear_flag(timer->Flags, CFS_TIMER_FLAG_TIMERED);
+    KeReleaseSpinLock(&(timer->Lock), Irql);
+
+    /* call the user specified timer procedure */
+    timer->proc((unsigned long)(timer->arg));
+}
+
+/*
+ * cfs_timer_init
+ *   To initialize the cfs_timer_t
+ *
+ * Arguments:
+ *   timer:  the cfs_timer to be initialized
+ *   func:   the timer callback procedure
+ *   arg:    argument for the callback proc
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_timer_init(cfs_timer_t *timer, void (*func)(unsigned long), void *arg)
+{
+    memset(timer, 0, sizeof(cfs_timer_t));
+
+    timer->proc = func;
+    timer->arg  = arg;
+
+    /* the DPC dispatches cfs_timer_dpc_proc with @timer as context */
+    KeInitializeSpinLock(&(timer->Lock));
+    KeInitializeTimer(&timer->Timer);
+    KeInitializeDpc (&timer->Dpc, cfs_timer_dpc_proc, timer);
+
+    cfs_set_flag(timer->Flags, CFS_TIMER_FLAG_INITED);
+}
+
+/*
+ * cfs_timer_done
+ *   To finialize the cfs_timer_t (unused)
+ *
+ * Arguments:
+ *   timer:  the cfs_timer to be cleaned up
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_timer_done(cfs_timer_t *timer)
+{
+    /* nothing to tear down: KTIMER/KDPC need no explicit cleanup */
+    return;
+}
+
+/*
+ * cfs_timer_arm
+ *   To schedule the timer while touching @deadline
+ *
+ * Arguments:
+ *   timer:  the cfs_timer to be freed
+ *   dealine: timeout value to wake up the timer
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_timer_arm(cfs_timer_t *timer, cfs_time_t deadline)
+{
+    LARGE_INTEGER   timeout;
+    KIRQL           Irql;
+
+    KeAcquireSpinLock(&(timer->Lock), &Irql);
+    if (!cfs_is_flag_set(timer->Flags, CFS_TIMER_FLAG_TIMERED)){
+
+        /* negative QuadPart = relative due time in 100ns units; this
+         * converts HZ ticks to 100ns.  NOTE(review): @deadline is
+         * treated as a RELATIVE tick count here -- confirm callers
+         * do not pass an absolute time. */
+        timeout.QuadPart = (LONGLONG)-1*1000*1000*10/HZ*deadline;
+
+        if (KeSetTimer(&timer->Timer, timeout, &timer->Dpc )) {
+            cfs_set_flag(timer->Flags, CFS_TIMER_FLAG_TIMERED);
+        }
+
+        timer->deadline = deadline;
+    }
+
+    KeReleaseSpinLock(&(timer->Lock), Irql);
+}
+
+/*
+ * cfs_timer_disarm
+ *   To discard the timer to be scheduled
+ *
+ * Arguments:
+ *   timer:  the cfs_timer to be discarded
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_timer_disarm(cfs_timer_t *timer)
+{
+    KIRQL   Irql;
+
+    /* cancel the pending timer (no-op if it already fired) and mark
+     * it as not armed */
+    KeAcquireSpinLock(&(timer->Lock), &Irql);
+    KeCancelTimer(&(timer->Timer));
+    cfs_clear_flag(timer->Flags, CFS_TIMER_FLAG_TIMERED);
+    KeReleaseSpinLock(&(timer->Lock), Irql);
+}
+
+
+/*
+ * cfs_timer_is_armed
+ *   To check the timer is scheduled or not
+ *
+ * Arguments:
+ *   timer:  the cfs_timer to be checked
+ *
+ * Return Value:
+ *   1:  if it's armed.
+ *   0:  if it's not.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_timer_is_armed(cfs_timer_t *timer)
+{
+    int     rc = 0;
+    KIRQL   Irql;
+
+    /* snapshot under the lock; the answer may be stale on return */
+    KeAcquireSpinLock(&(timer->Lock), &Irql);
+    if (cfs_is_flag_set(timer->Flags, CFS_TIMER_FLAG_TIMERED)) {
+        rc = 1;
+    }
+    KeReleaseSpinLock(&(timer->Lock), Irql);
+
+    return rc;
+}
+
+/*
+ * cfs_timer_deadline
+ *   To query the deadline of the timer
+ *
+ * Arguments:
+ *   timer:  the cfs_timer to be queried
+ *
+ * Return Value:
+ *   the deadline value
+ *
+ * Notes: 
+ *   N/A
+ */
+
+cfs_time_t cfs_timer_deadline(cfs_timer_t * timer)
+{
+    /* unlocked read of the last value passed to cfs_timer_arm() */
+    return timer->deadline;
+}
+
+/*
+ * daemonize routine stub
+ */
+
+void cfs_daemonize(char *str)
+{
+    /* no-op on winnt: system threads have no user context to drop */
+    return;
+}
+
+/*
+ *  routines related to signals
+ */
+
+/* Signal handling is not emulated on winnt: the routines below are
+ * no-op stubs kept for API compatibility with the other platforms. */
+cfs_sigset_t cfs_get_blocked_sigs(cfs_task_t *t)
+{
+        return 0;
+}
+
+void cfs_block_allsigs(cfs_task_t *t)
+{
+        return;
+}
+
+void cfs_block_sigs(cfs_task_t *t, sigset_t bit)
+{
+        return;
+}
+
+/**
+ **  Initialize routines 
+ **/
+
+/*
+ * libcfs_arch_init
+ *   Platform (winnt) initialization: detects MP vs UP build, creates
+ *   the page allocator slab caches, then brings up the task manager,
+ *   the /proc emulation, and the tdi (network) data, unwinding each
+ *   earlier stage on failure.
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * Fix: the error path destroyed the slab caches but left the global
+ * pointers set, so a later cfs_mem_cache_destroy() on the same globals
+ * (e.g. from libcfs_arch_cleanup) would operate on freed caches; reset
+ * them to NULL after destroying.
+ */
+int
+libcfs_arch_init(void)
+{ 
+    int         rc;
+    spinlock_t  lock;
+
+    /* Workaround to detect MP vs UP build: on an MP build spin_lock
+       really takes the lock (lock.lock becomes non-zero); on a UP
+       build it only raises the IRQL to DISPATCH_LEVEL */
+    spin_lock_init(&lock);
+    spin_lock(&lock);
+    MPSystem = (int)lock.lock;
+    spin_unlock(&lock);
+
+    /* create slab memory caches for page alloctors */
+    cfs_page_t_slab = cfs_mem_cache_create(
+        "CPGT", sizeof(cfs_page_t), 0, 0 );
+
+    cfs_page_p_slab = cfs_mem_cache_create(
+        "CPGP", CFS_PAGE_SIZE, 0, 0 );
+
+    if ( cfs_page_t_slab == NULL ||
+         cfs_page_p_slab == NULL ){
+        rc = -ENOMEM;
+        goto errorout;
+    }
+
+    rc = init_task_manager();
+
+    if (rc != 0) {
+        cfs_enter_debugger();
+        KdPrint(("winnt-prim.c:libcfs_arch_init: error initializing task manager ...\n"));
+        goto errorout;
+    }
+
+    /* initialize the proc file system */
+    rc = proc_init_fs();
+
+    if (rc != 0) {
+        cfs_enter_debugger();
+        KdPrint(("winnt-prim.c:libcfs_arch_init: error initializing proc fs ...\n"));
+        cleanup_task_manager();
+        goto errorout;
+    }
+
+    /* initialize the tdi data */
+    rc = ksocknal_init_tdi_data();
+
+    if (rc != 0) {
+        cfs_enter_debugger();
+        KdPrint(("winnt-prim.c:libcfs_arch_init: error initializing tdi ...\n"));
+        proc_destroy_fs();
+        cleanup_task_manager();
+        goto errorout;
+    }
+
+errorout:
+
+    if (rc != 0) {
+        /* destroy the page cache slabs and reset the globals so that
+         * a later cleanup cannot destroy them a second time */
+        if (cfs_page_t_slab) {
+            cfs_mem_cache_destroy(cfs_page_t_slab);
+            cfs_page_t_slab = NULL;
+        }
+        if (cfs_page_p_slab) {
+            cfs_mem_cache_destroy(cfs_page_p_slab);
+            cfs_page_p_slab = NULL;
+        }
+    }
+
+    return rc;
+}
+
+void
+libcfs_arch_cleanup(void)
+{
+    /* finalize the tdi (network) data */
+    ksocknal_fini_tdi_data();
+
+    /* destroy the whole proc fs tree and nodes */
+    proc_destroy_fs();
+
+    /* destroy the page allocator slab caches created in
+     * libcfs_arch_init */
+    if (cfs_page_t_slab) {
+        cfs_mem_cache_destroy(cfs_page_t_slab);
+    }
+
+    if (cfs_page_p_slab) {
+        cfs_mem_cache_destroy(cfs_page_p_slab);
+    }
+
+       return; 
+}
+
+EXPORT_SYMBOL(libcfs_arch_init);
+EXPORT_SYMBOL(libcfs_arch_cleanup);
diff --git a/lnet/libcfs/winnt/winnt-proc.c b/lnet/libcfs/winnt/winnt-proc.c
new file mode 100644 (file)
index 0000000..ea7f270
--- /dev/null
@@ -0,0 +1,2173 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+#include "tracefile.h"
+
+#ifdef __KERNEL__
+
+
+/*
+ *  /proc emulator routines ...
+ */
+
+/* The root node of the proc fs emulation: /proc */
+cfs_proc_entry_t *              proc_fs_root = NULL;
+
+
+/* The sys root: /proc/sys */
+cfs_proc_entry_t *              proc_sys_root = NULL;
+
+
+/* The sys root: /proc/dev | to implement misc device */
+
+cfs_proc_entry_t *              proc_dev_root = NULL;
+
+
+/* SLAB object for cfs_proc_entry_t allocation */
+
+cfs_mem_cache_t *               proc_entry_cache = NULL;
+
+/* root node for sysctl table */
+
+cfs_sysctl_table_header_t       root_table_header;
+
+/* The global lock to protect all the access */
+
+#if LIBCFS_PROCFS_SPINLOCK
+spinlock_t                      proc_fs_lock;
+
+#define INIT_PROCFS_LOCK()      spin_lock_init(&proc_fs_lock)
+#define LOCK_PROCFS()           spin_lock(&proc_fs_lock)
+#define UNLOCK_PROCFS()         spin_unlock(&proc_fs_lock)
+
+#else
+
+mutex_t                         proc_fs_lock;
+
+#define INIT_PROCFS_LOCK()      init_mutex(&proc_fs_lock)
+#define LOCK_PROCFS()           mutex_down(&proc_fs_lock)
+#define UNLOCK_PROCFS()         mutex_up(&proc_fs_lock)
+
+#endif
+
+/* Read handler for emulated /proc files: repeatedly calls the entry's
+ * read_proc callback into a scratch page and copies the result out.
+ * NOTE(review): @buf is declared const but is the destination of
+ * copy_to_user -- it is really an output buffer; confirm signature. */
+static ssize_t
+proc_file_read(struct file * file, const char * buf, size_t nbytes, loff_t *ppos)
+{
+    char    *page;
+    ssize_t retval=0;
+    int eof=0;
+    ssize_t n, count;
+    char    *start;
+    cfs_proc_entry_t * dp;
+
+    dp = (cfs_proc_entry_t  *) file->private_data;
+    /* scratch page the read_proc callback renders into */
+    if (!(page = (char*) cfs_alloc(PAGE_SIZE, 0)))
+        return -ENOMEM;
+
+    while ((nbytes > 0) && !eof) {
+
+        count = min_t(size_t, PROC_BLOCK_SIZE, nbytes);
+
+        start = NULL;
+        if (dp->read_proc) {
+            n = dp->read_proc( page, &start, (long)*ppos,
+                               count, &eof, dp->data);
+        } else
+            break;
+
+        if (!start) {
+            /*
+             * For proc files that are less than 4k
+             */
+            start = page + *ppos;
+            n -= (ssize_t)(*ppos);
+            if (n <= 0)
+                break;
+            if (n > count)
+                n = count;
+        }
+        if (n == 0)
+            break;  /* End of file */
+        if (n < 0) {
+            if (retval == 0)
+                retval = n;
+            break;
+        }
+        
+        /* copy_to_user returns the number of bytes NOT copied */
+        n -= copy_to_user((void *)buf, start, n);
+        if (n == 0) {
+            if (retval == 0)
+                retval = -EFAULT;
+            break;
+        }
+
+        *ppos += n;
+        nbytes -= n;
+        buf += n;
+        retval += n;
+    }
+    cfs_free(page);
+
+    return retval;
+}
+
+/* Write handler for emulated /proc files: forwards the user buffer to
+ * the entry's write_proc callback; -EIO when the entry is read-only. */
+static ssize_t
+proc_file_write(struct file * file, const char * buffer,
+                size_t count, loff_t *ppos)
+{
+    cfs_proc_entry_t  * dp;
+    
+    dp = (cfs_proc_entry_t *) file->private_data;
+
+    if (!dp->write_proc)
+        return -EIO;
+
+    /* FIXME: does this routine need ppos?  probably... */
+    return dp->write_proc(file, buffer, count, dp->data);
+}
+
+/* default file operations for regular emulated /proc entries */
+struct file_operations proc_file_operations = {
+    /*lseek:*/      NULL, //proc_file_lseek,
+    /*read:*/       proc_file_read,
+    /*write:*/      proc_file_write,
+    /*ioctl:*/      NULL,
+    /*open:*/       NULL,
+    /*release:*/    NULL
+};
+
+/* allocate proc entry block */
+
+cfs_proc_entry_t *
+proc_alloc_entry()
+{
+    cfs_proc_entry_t * entry = NULL;
+
+    entry = cfs_mem_cache_alloc(proc_entry_cache, 0);
+    if (!entry) {
+        return NULL;
+    }
+
+    /* zero-fill so name[] is NUL-terminated and all flags start clear */
+    memset(entry, 0, sizeof(cfs_proc_entry_t));
+
+    entry->magic = CFS_PROC_ENTRY_MAGIC;
+    RtlInitializeSplayLinks(&(entry->s_link));
+    entry->proc_fops = &proc_file_operations;
+
+    return entry;
+}
+
+/* free the proc entry block */
+
+void
+proc_free_entry(cfs_proc_entry_t * entry)
+
+{
+    /* caller must have detached the entry from any splay tree first */
+    ASSERT(entry->magic == CFS_PROC_ENTRY_MAGIC);
+
+    cfs_mem_cache_free(proc_entry_cache, entry);
+}
+
+/* dissect the path string for a given full proc path */
+
+void
+proc_dissect_name(
+    char *path,
+    char **first,
+    int  *first_len,
+    char **remain
+    )
+{
+    int i = 0, j = 0, len = 0;
+
+    *first = *remain = NULL;
+    *first_len = 0;
+
+    len = strlen(path);
+
+    /* skip any leading '/' separators */
+    while (i < len && (path[i] == '/')) i++;
+
+    if (i < len) {
+
+        /* first component runs to the next '/'; it is NOT
+         * NUL-terminated -- *first_len gives its length */
+        *first = path + i;
+        while (i < len && (path[i] != '/')) i++;
+        *first_len = (path + i - *first);
+
+        /* everything after the separator is the remaining path */
+        if (i + 1 < len) {
+            *remain = path + i + 1;
+        }
+    }
+}
+
+/* search the children entries of the parent entry */
+
+cfs_proc_entry_t *
+proc_search_splay (
+    cfs_proc_entry_t *  parent,
+    char *              name
+    )
+{
+    cfs_proc_entry_t *  node;
+    PRTL_SPLAY_LINKS    link;
+
+    ASSERT(parent->magic == CFS_PROC_ENTRY_MAGIC);
+    ASSERT(cfs_is_flag_set(parent->flags, CFS_PROC_FLAG_DIRECTORY));
+
+    link = parent->root;
+
+    while (link) {
+
+        ANSI_STRING ename,nname;
+        long        result;
+
+        node = CONTAINING_RECORD(link, cfs_proc_entry_t, s_link);
+
+        ASSERT(node->magic == CFS_PROC_ENTRY_MAGIC);
+
+        /*  Compare the prefix in the tree with the full name
+            (case-insensitive: last RtlCompareString argument) */
+
+        RtlInitAnsiString(&ename, name);
+        RtlInitAnsiString(&nname, node->name);
+
+        result = RtlCompareString(&nname, &ename,TRUE);
+
+        if (result > 0) {
+
+            /*  The prefix is greater than the full name
+                so we go down the left child          */
+
+            link = RtlLeftChild(link);
+
+        } else if (result < 0) {
+
+            /*  The prefix is less than the full name
+                so we go down the right child      */
+            //
+
+            link = RtlRightChild(link);
+
+        } else {
+
+            /*  We got the entry in the splay tree and
+                make it root node instead           */
+
+            parent->root = RtlSplay(link);
+
+            return node;
+        }
+
+        /* we need continue searching down the tree ... */
+    }
+
+    /*  There's no such entry in the splay tree */
+
+    return NULL;
+}
+
+int
+proc_insert_splay (
+    cfs_proc_entry_t * parent,
+    cfs_proc_entry_t * child
+    )
+{
+    cfs_proc_entry_t * entry;
+
+    ASSERT(parent != NULL && child != NULL);
+    ASSERT(parent->magic == CFS_PROC_ENTRY_MAGIC);
+    ASSERT(child->magic == CFS_PROC_ENTRY_MAGIC);
+    ASSERT(cfs_is_flag_set(parent->flags, CFS_PROC_FLAG_DIRECTORY));
+
+    if (!parent->root) {
+        /* empty tree: the child simply becomes the root */
+        parent->root = &(child->s_link);
+    } else {
+        entry = CONTAINING_RECORD(parent->root, cfs_proc_entry_t, s_link);
+        while (TRUE) {
+            long        result;
+            ANSI_STRING ename, cname;
+
+            ASSERT(entry->magic == CFS_PROC_ENTRY_MAGIC);
+
+            RtlInitAnsiString(&ename, entry->name);
+            RtlInitAnsiString(&cname, child->name);
+
+            result = RtlCompareString(&ename, &cname,TRUE);
+
+            if (result == 0) {
+                /* duplicate name: refuse the insert (re-inserting the
+                 * very same node is tolerated as a no-op) */
+                cfs_enter_debugger();
+                if (entry == child) {
+                    break;
+                }
+                return FALSE;
+            }
+
+            if (result > 0) {
+                if (RtlLeftChild(&entry->s_link) == NULL) {
+                    RtlInsertAsLeftChild(&entry->s_link, &child->s_link);
+                    break;
+                } else {
+                    entry = CONTAINING_RECORD( RtlLeftChild(&entry->s_link),
+                                               cfs_proc_entry_t, s_link);
+                }
+            } else {
+                if (RtlRightChild(&entry->s_link) == NULL) {
+                    RtlInsertAsRightChild(&entry->s_link, &child->s_link);
+                    break;
+                } else {
+                    entry = CONTAINING_RECORD( RtlRightChild(&entry->s_link),
+                                               cfs_proc_entry_t, s_link );
+                }
+            }
+        }
+    }
+
+    cfs_set_flag(child->flags, CFS_PROC_FLAG_ATTACHED);
+    parent->nlink++;
+
+    return TRUE;
+}
+
+
+/* remove a child entry from the splay tree */
+int
+proc_remove_splay (
+    cfs_proc_entry_t *  parent,
+    cfs_proc_entry_t *  child
+    )
+{
+    cfs_proc_entry_t * entry = NULL;
+
+    ASSERT(parent != NULL && child != NULL);
+    ASSERT(parent->magic == CFS_PROC_ENTRY_MAGIC);
+    ASSERT(child->magic == CFS_PROC_ENTRY_MAGIC);
+    ASSERT(cfs_is_flag_set(parent->flags, CFS_PROC_FLAG_DIRECTORY));
+    ASSERT(cfs_is_flag_set(child->flags, CFS_PROC_FLAG_ATTACHED));
+
+    /* re-locate the child by name; this also splays it towards root */
+    entry = proc_search_splay(parent, child->name);
+
+    if (entry) {
+        ASSERT(entry == child);
+        parent->root = RtlDelete(&(entry->s_link));
+        parent->nlink--;
+    } else {
+        cfs_enter_debugger();
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+
+/* search a node inside the proc fs tree */
+
+cfs_proc_entry_t *
+proc_search_entry(
+    char *              name,
+    cfs_proc_entry_t *  root
+    )
+{
+    cfs_proc_entry_t *  entry;
+    cfs_proc_entry_t *  parent;
+    char *first, *remain;
+    int   flen;
+    char *ename = NULL;
+
+    parent = root;
+    entry = NULL;
+
+    /* scratch buffer for one NUL-terminated path component;
+     * components are limited to 0x20 - 1 characters */
+    ename = cfs_alloc(0x21, CFS_ALLOC_ZERO);
+
+    if (ename == NULL) {
+        goto errorout;
+    }
+
+again:
+
+    /* dissect the file name string */
+    proc_dissect_name(name, &first, &flen, &remain);
+
+    if (first) {
+
+        if (flen >= 0x20) {
+            cfs_enter_debugger();
+            entry = NULL;
+            goto errorout;
+        }
+
+        memset(ename, 0, 0x20);
+        memcpy(ename, first, flen);
+
+        entry = proc_search_splay(parent, ename);
+
+        if (!entry) {
+            goto errorout;
+        }
+
+        /* descend one tree level per path component */
+        if (remain) {
+            name = remain;
+            parent = entry;
+
+            goto again;
+        }
+    }
+
+errorout:
+
+    if (ename) {
+        cfs_free(ename);
+    }
+
+    return entry;   
+}
+
+/* insert the path nodes to the proc fs tree */
+
+/*
+ * proc_insert_entry
+ *   Walk @name component by component under @root, creating any
+ *   missing nodes (intermediate components become directories);
+ *   returns the final entry or NULL on failure.
+ *
+ * Fix: the original copied into entry->name BEFORE checking the
+ * proc_alloc_entry() result, dereferencing NULL on allocation failure.
+ */
+cfs_proc_entry_t *
+proc_insert_entry(
+    char *              name,
+    cfs_proc_entry_t *  root
+    )
+{
+    cfs_proc_entry_t *entry;
+    cfs_proc_entry_t *parent;
+    char *first, *remain;
+    int flen;
+    char ename[0x20];
+
+    parent = root;
+    entry = NULL;
+
+again:
+
+    proc_dissect_name(name, &first, &flen, &remain);
+
+    if (first) {
+
+        if (flen >= 0x20) {
+            return NULL;
+        }
+
+        memset(ename, 0, 0x20);
+        memcpy(ename, first, flen);
+
+        entry = proc_search_splay(parent, ename);
+
+        if (!entry) {
+            entry = proc_alloc_entry();
+            /* check the allocation before touching entry->name */
+            if (entry) {
+                memcpy(entry->name, ename, flen);
+                if(!proc_insert_splay(parent, entry)) {
+                    proc_free_entry(entry);
+                    entry = NULL;
+                }
+            }
+        }
+
+        if (!entry) {
+            return NULL;
+        }
+
+        if (remain) {
+            /* intermediate components are directories */
+            entry->mode |= S_IFDIR | S_IRUGO | S_IXUGO;
+            cfs_set_flag(entry->flags, CFS_PROC_FLAG_DIRECTORY);
+            name = remain;
+            parent = entry;
+            goto again;
+        }
+    }
+
+    return entry;   
+}
+
+/* remove the path nodes from the proc fs tree */
+
+/*
+ * proc_remove_entry
+ *   Walk @name under @root, recursively removing the named leaf and
+ *   any directories left empty (nlink == 0) on the way back up.
+ *
+ * Fix: the original copied the first path component into ename[0x20]
+ * without the flen bound check that proc_search_entry and
+ * proc_insert_entry both perform -- an oversized component overflowed
+ * the stack buffer.
+ */
+void
+proc_remove_entry(
+    char *              name,
+    cfs_proc_entry_t *  root
+    )
+{
+    cfs_proc_entry_t *entry;
+    char *first, *remain;
+    int  flen;
+    char ename[0x20];
+
+    entry  = NULL;
+
+    proc_dissect_name(name, &first, &flen, &remain);
+
+    if (first) {
+
+        /* reject oversized components, like the sibling routines do */
+        if (flen >= 0x20) {
+            cfs_enter_debugger();
+            return;
+        }
+
+        memset(ename, 0, 0x20);
+        memcpy(ename, first, flen);
+
+        entry = proc_search_splay(root, ename);
+
+        if (entry) {
+
+            if (remain) {
+                ASSERT(S_ISDIR(entry->mode));
+                proc_remove_entry(remain, entry);
+            }
+
+            /* drop the node once it has no children left */
+            if (!entry->nlink) {
+                proc_remove_splay(root, entry);
+                proc_free_entry(entry);
+            }
+        }
+    } else {
+        cfs_enter_debugger();
+    }
+}
+
+/* create proc entry and insert it into the proc fs */
+
+cfs_proc_entry_t *
+create_proc_entry (
+    char *              name,
+    mode_t              mode,
+    cfs_proc_entry_t *  root
+    )
+{
+    cfs_proc_entry_t *parent = root;
+    cfs_proc_entry_t *entry  = NULL;
+
+    /* fill in default type/permission bits when the caller gave none
+     * (the directory branch body is the single mode |= statement,
+     * despite its indentation) */
+    if (S_ISDIR(mode)) {
+        if ((mode & S_IALLUGO) == 0)
+        mode |= S_IRUGO | S_IXUGO;
+    } else {
+        if ((mode & S_IFMT) == 0)
+            mode |= S_IFREG;
+        if ((mode & S_IALLUGO) == 0)
+            mode |= S_IRUGO;
+    }
+
+    LOCK_PROCFS();
+
+    ASSERT(NULL != proc_fs_root);
+
+    /* NULL root means "create under /proc" */
+    if (!parent) {
+        parent = proc_fs_root;
+    }
+
+    entry = proc_search_entry(name, parent);
+
+    if (!entry) {
+        entry = proc_insert_entry(name, parent);
+        if (!entry) {
+            /* Failed to create/insert the splay node ... */
+            cfs_enter_debugger();
+            goto errorout;
+        }
+        /* Initializing entry ... */
+        entry->mode = mode;
+
+        if (S_ISDIR(mode)) {
+            cfs_set_flag(entry->flags, CFS_PROC_FLAG_DIRECTORY);
+        }
+    }
+
+errorout:
+
+    UNLOCK_PROCFS();
+
+    return entry;
+}
+
+
+/* search the specified entry from the proc fs */
+
+cfs_proc_entry_t *
+search_proc_entry(
+    char *              name,
+    cfs_proc_entry_t *  root
+    )
+{
+    cfs_proc_entry_t * entry;
+
+    /* locked wrapper around proc_search_entry; NULL root = /proc */
+    LOCK_PROCFS();
+    if (root == NULL) {
+        root = proc_fs_root;
+    }
+    entry = proc_search_entry(name, root);
+    UNLOCK_PROCFS();
+
+    return entry;    
+}
+
+/* remove the entry from the proc fs */
+
+void
+remove_proc_entry(
+    char *              name,
+    cfs_proc_entry_t *  parent
+    )
+{
+    /* locked wrapper around proc_remove_entry; NULL parent = /proc */
+    LOCK_PROCFS();
+    if (parent == NULL) {
+        parent = proc_fs_root;
+    }
+    proc_remove_entry(name, parent);
+    UNLOCK_PROCFS();
+}
+
+
+/* recursively free @entry and, for directories, every child still
+ * linked in its splay tree */
+void proc_destroy_splay(cfs_proc_entry_t * entry)
+{
+    cfs_proc_entry_t * node;
+
+    if (S_ISDIR(entry->mode)) {
+
+        while (entry->root) {
+            node = CONTAINING_RECORD(entry->root, cfs_proc_entry_t, s_link);
+            entry->root = RtlDelete(&(node->s_link));
+            proc_destroy_splay(node);
+        }
+    }
+
+    proc_free_entry(entry);
+}
+
+
+/* destroy the whole proc fs tree */
+
+void proc_destroy_fs()
+{
+    LOCK_PROCFS();
+
+    if (proc_fs_root) {
+        proc_destroy_splay(proc_fs_root);
+    }
+
+    if (proc_entry_cache) {
+        cfs_mem_cache_destroy(proc_entry_cache);
+    }
+    /* NOTE(review): proc_fs_root / proc_entry_cache are left pointing
+     * at freed objects here -- the proc emulation must not be
+     * re-entered after this call; verify shutdown ordering. */
+   
+    UNLOCK_PROCFS();
+}
+
+/* initialize / build the proc fs tree */
+
+int proc_init_fs()
+{
+    cfs_proc_entry_t * root = NULL;
+
+    /* the sysctl root header starts as an empty list head */
+    memset(&(root_table_header), 0, sizeof(struct ctl_table_header));
+    INIT_LIST_HEAD(&(root_table_header.ctl_entry));
+
+    INIT_PROCFS_LOCK();
+    proc_entry_cache = cfs_mem_cache_create(
+                            NULL,
+                            sizeof(cfs_proc_entry_t),
+                            0,
+                            0
+                            );
+
+    if (!proc_entry_cache) {
+        return (-ENOMEM);
+    }
+
+    root = proc_alloc_entry();
+
+    if (!root) {
+        proc_destroy_fs();
+        return (-ENOMEM);
+    }
+
+    root->magic = CFS_PROC_ENTRY_MAGIC;
+    root->flags = CFS_PROC_FLAG_DIRECTORY;
+    root->mode  = S_IFDIR | S_IRUGO | S_IXUGO;
+    root->nlink = 3; // root should never be deleted.
+
+    /* name[] was zero-filled in proc_alloc_entry, so "proc" stays
+     * NUL-terminated */
+    root->name[0]='p';
+    root->name[1]='r';
+    root->name[2]='o';
+    root->name[3]='c';
+
+    proc_fs_root = root;
+
+    proc_sys_root = create_proc_entry("sys", S_IFDIR, root);
+
+    if (!proc_sys_root) {
+        /* unwind: free what was built, then drop the slab cache */
+        proc_free_entry(root);
+        proc_fs_root = NULL;
+        proc_destroy_fs();
+        return (-ENOMEM);
+    }
+
+    proc_sys_root->nlink = 1;
+
+    proc_dev_root = create_proc_entry("dev", S_IFDIR, root);
+
+    if (!proc_dev_root) {
+        proc_free_entry(proc_sys_root);
+        proc_sys_root = NULL;
+        proc_free_entry(proc_fs_root);
+        proc_fs_root = NULL;
+        proc_destroy_fs();
+        return (-ENOMEM);
+    }
+
+    proc_dev_root->nlink = 1;
+   
+    return 0;
+}
+
+
+/* Common dispatcher for /proc/sys reads and writes: looks up the
+ * sysctl table attached to the entry and invokes its proc_handler.
+ * @write selects the direction (1 = write, 0 = read). */
+static ssize_t do_rw_proc(int write, struct file * file, char * buf,
+              size_t count, loff_t *ppos)
+{
+    int op;
+    cfs_proc_entry_t *de;
+    struct ctl_table *table;
+    size_t res;
+    ssize_t error;
+    
+    de = (cfs_proc_entry_t *) file->proc_dentry; 
+
+    if (!de || !de->data)
+        return -ENOTDIR;
+    table = (struct ctl_table *) de->data;
+    if (!table || !table->proc_handler)
+        return -ENOTDIR;
+    /* access mode bits for a (currently disabled) permission check */
+    op = (write ? 002 : 004);
+
+//  if (ctl_perm(table, op))
+//      return -EPERM;
+    
+    res = count;
+
+    /*
+     * FIXME: we need to pass on ppos to the handler.
+     */
+
+    error = (*table->proc_handler) (table, write, file, buf, &res);
+    if (error)
+        return error;
+    return res;
+}
+
+/* read path: dispatch to the sysctl proc_handler (write flag = 0) */
+static ssize_t proc_readsys(struct file * file, char * buf,
+                size_t count, loff_t *ppos)
+{
+    return do_rw_proc(0, file, buf, count, ppos);
+}
+
+/* write path: dispatch to the sysctl proc_handler (write flag = 1) */
+static ssize_t proc_writesys(struct file * file, const char * buf,
+                 size_t count, loff_t *ppos)
+{
+    return do_rw_proc(1, file, (char *) buf, count, ppos);
+}
+
+
+/* file operations for /proc/sys entries backed by sysctl handlers */
+struct file_operations proc_sys_file_operations = {
+    /*lseek:*/      NULL,
+    /*read:*/       proc_readsys,
+    /*write:*/      proc_writesys,
+    /*ioctl:*/      NULL,
+    /*open:*/       NULL,
+    /*release:*/    NULL
+};
+
+
+/* Scan the sysctl entries in table and add them all into /proc */
+void register_proc_table(cfs_sysctl_table_t * table, cfs_proc_entry_t * root)
+{
+    cfs_proc_entry_t * de;
+    int len;
+    mode_t mode;
+    
+    for (; table->ctl_name; table++) {
+        /* Can't do anything without a proc name. */
+        if (!table->procname)
+            continue;
+        /* Maybe we can't do anything with it... */
+        if (!table->proc_handler && !table->child) {
+            printk(KERN_WARNING "SYSCTL: Can't register %s\n",
+                table->procname);
+            continue;
+        }
+
+        len = strlen(table->procname);
+        mode = table->mode;
+
+        de = NULL;
+        if (table->proc_handler)
+            mode |= S_IFREG;
+        else {
+            de = search_proc_entry(table->procname, root);
+            if (de) {
+                break;
+            }
+            /* If the subdir exists already, de is non-NULL */
+        }
+
+        if (!de) {
+
+            de = create_proc_entry((char *)table->procname, mode, root);
+            if (!de)
+                continue;
+            de->data = (void *) table;
+            if (table->proc_handler) {
+                de->proc_fops = &proc_sys_file_operations;
+            }
+        }
+        table->de = de;
+        if (de->mode & S_IFDIR)
+            register_proc_table(table->child, de);
+    }
+}
+
+
+/*
+ * Unregister a /proc sysctl table and any subdirectories.
+ */
+void unregister_proc_table(cfs_sysctl_table_t * table, cfs_proc_entry_t *root)
+{
+    cfs_proc_entry_t *de;
+    for (; table->ctl_name; table++) {
+        if (!(de = table->de))
+            continue;
+        if (de->mode & S_IFDIR) {
+            if (!table->child) {
+                printk (KERN_ALERT "Help - malformed sysctl tree on free\n");
+                continue;
+            }
+            unregister_proc_table(table->child, de);
+
+            /* Don't unregister directories which still have entries.. */
+            if (de->nlink)
+                continue;
+        }
+
+        /* Don't unregister proc entries that are still being used.. */
+        if (de->nlink)
+            continue;
+
+        table->de = NULL;
+        remove_proc_entry((char *)table->procname, root);
+    }
+}
+
/* The generic string strategy routine: */
/*
 * sysctl(2) strategy for string-valued entries.
 *
 * oldval/oldlenp: when both are supplied, copy out at most *oldlenp
 * (clamped to the stored string length and table->maxlen) bytes of
 * the current value, NUL-terminate the user copy, and write the
 * copied length back through oldlenp.
 * newval/newlen: when both are supplied, copy in at most
 * table->maxlen bytes and NUL-terminate the stored string.
 *
 * Returns 0 on success, -ENOTDIR when the entry has no backing
 * data/maxlen, or -EFAULT on a user-copy failure.
 */
int sysctl_string(cfs_sysctl_table_t *table, int *name, int nlen,
          void *oldval, size_t *oldlenp,
          void *newval, size_t newlen, void **context)
{
    int l, len;

    if (!table->data || !table->maxlen)
        return -ENOTDIR;

    if (oldval && oldlenp) {
        if(get_user(len, oldlenp))
            return -EFAULT;
        if (len) {
            l = strlen(table->data);
            if (len > l) len = l;
            if (len >= table->maxlen)
                len = table->maxlen;
            if(copy_to_user(oldval, table->data, len))
                return -EFAULT;
            if(put_user(0, ((char *) oldval) + len))
                return -EFAULT;
            if(put_user(len, oldlenp))
                return -EFAULT;
        }
    }
    if (newval && newlen) {
        len = newlen;
        if (len > table->maxlen)
            len = table->maxlen;
        if(copy_from_user(table->data, newval, len))
            return -EFAULT;
        /* always keep room for the terminating NUL */
        if (len == table->maxlen)
            len--;
        ((char *) table->data)[len] = 0;
    }
    return 0;
}
+
+/**
+ * simple_strtoul - convert a string to an unsigned long
+ * @cp: The start of the string
+ * @endp: A pointer to the end of the parsed string will be placed here
+ * @base: The number base to use
+ */
+unsigned long simple_strtoul(const char *cp,char **endp,unsigned int base)
+{
+    unsigned long result = 0, value;
+
+    if (!base) {
+        base = 10;
+        if (*cp == '0') {
+            base = 8;
+            cp++;
+            if ((*cp == 'x') && isxdigit(cp[1])) {
+                cp++;
+                base = 16;
+            }
+        }
+    }
+    while (isxdigit(*cp) &&
+           (value = isdigit(*cp) ? *cp-'0' : toupper(*cp)-'A'+10) < base) {
+        result = result*base + value;
+        cp++;
+    }
+    if (endp)
+        *endp = (char *)cp;
+    return result;
+}
+
/* operations for do_proc_dointvec(): how a value written by the user
 * is combined with the integer already stored in the table */
#define OP_SET  0
#define OP_AND  1
#define OP_OR   2
#define OP_MAX  3
#define OP_MIN  4


/*
 * Worker for the proc_dointvec() family.
 *
 * On write: skips leading whitespace, parses up to
 * table->maxlen/sizeof(int) integer values (each multiplied by
 * 'conv') from the user buffer and merges them into table->data
 * according to 'op'.
 * On read: prints the stored values tab-separated, each divided by
 * 'conv', followed by a trailing newline when room remains.
 *
 * *lenp is updated to the number of bytes consumed/produced and
 * filp->f_pos is advanced by the same amount.
 * Returns 0 on success, -EFAULT on user-copy failure, or -EINVAL
 * when a write contained no parsable value.
 */
static int do_proc_dointvec(cfs_sysctl_table_t *table, int write, struct file *filp,
          void *buffer, size_t *lenp, int conv, int op)
{
    int *i, vleft, first=1, neg, val;
    size_t left, len;

    #define TMPBUFLEN 20
    char buf[TMPBUFLEN], *p;

    if (!table->data || !table->maxlen || !*lenp)
    {
        *lenp = 0;
        return 0;
    }

    i = (int *) table->data;
    vleft = table->maxlen / sizeof(int);
    left = *lenp;

    for (; left && vleft--; i++, first=0) {
        if (write) {
            /* skip leading whitespace in the user buffer */
            while (left) {
                char c;
                if(get_user(c,(char *) buffer))
                    return -EFAULT;
                if (!isspace(c))
                    break;
                left--;
                /* NOTE(review): incrementing a cast lvalue is a
                 * compiler extension, kept as-is for this port */
                ((char *) buffer)++;
            }
            if (!left)
                break;
            neg = 0;
            /* stage the next token into a bounded kernel buffer */
            len = left;
            if (len > TMPBUFLEN-1)
                len = TMPBUFLEN-1;
            if(copy_from_user(buf, buffer, len))
                return -EFAULT;
            buf[len] = 0;
            p = buf;
            if (*p == '-' && left > 1) {
                neg = 1;
                left--, p++;
            }
            if (*p < '0' || *p > '9')
                break;
            val = simple_strtoul(p, &p, 0) * conv;
            len = p-buf;
            /* reject tokens that end mid-buffer on a non-separator */
            if ((len < left) && *p && !isspace(*p))
                break;
            if (neg)
                val = -val;
            (char *)buffer += len;
            left -= len;
            switch(op) {
            case OP_SET:    *i = val; break;
            case OP_AND:    *i &= val; break;
            case OP_OR: *i |= val; break;
            case OP_MAX:    if(*i < val)
                        *i = val;
                    break;
            case OP_MIN:    if(*i > val)
                        *i = val;
                    break;
            }
        } else {
            /* read: format the stored value, tab-separated */
            p = buf;
            if (!first)
                *p++ = '\t';
            sprintf(p, "%d", (*i) / conv);
            len = strlen(buf);
            if (len > left)
                len = left;
            if(copy_to_user(buffer, buf, len))
                return -EFAULT;
            left -= len;
            (char *)buffer += len;
        }
    }

    if (!write && !first && left) {
        if(put_user('\n', (char *) buffer))
            return -EFAULT;
        left--, ((char *)buffer)++;
    }
    if (write) {
        /* consume any trailing whitespace after the last value */
        p = (char *) buffer;
        while (left) {
            char c;
            if(get_user(c, p++))
                return -EFAULT;
            if (!isspace(c))
                break;
            left--;
        }
    }
    if (write && first)
        return -EINVAL;
    *lenp -= left;
    /* reset f_pos to zero, then advance it by the bytes handled */
    memset(&(filp->f_pos) , 0, sizeof(loff_t));
    filp->f_pos += (loff_t)(*lenp);
    return 0;
}
+
+/**
+ * proc_dointvec - read a vector of integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string. 
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec(cfs_sysctl_table_t *table, int write, struct file *filp,
+             void *buffer, size_t *lenp)
+{
+    return do_proc_dointvec(table,write,filp,buffer,lenp,1,OP_SET);
+}
+
+
+/**
+ * proc_dostring - read a string sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ *
+ * Reads/writes a string from/to the user buffer. If the kernel
+ * buffer provided is not large enough to hold the string, the
+ * string is truncated. The copied string is %NULL-terminated.
+ * If the string is being read by the user process, it is copied
+ * and a newline '\n' is added. It is truncated if the buffer is
+ * not large enough.
+ *
+ * Returns 0 on success.
+ */
+int proc_dostring(cfs_sysctl_table_t *table, int write, struct file *filp,
+          void *buffer, size_t *lenp)
+{
+    size_t len;
+    char *p, c;
+    
+    if (!table->data || !table->maxlen || !*lenp ||
+        (filp->f_pos && !write)) {
+        *lenp = 0;
+        return 0;
+    }
+    
+    if (write) {
+        len = 0;
+        p = buffer;
+        while (len < *lenp) {
+            if(get_user(c, p++))
+                return -EFAULT;
+            if (c == 0 || c == '\n')
+                break;
+            len++;
+        }
+        if (len >= (size_t)table->maxlen)
+            len = (size_t)table->maxlen-1;
+        if(copy_from_user(table->data, buffer, len))
+            return -EFAULT;
+        ((char *) table->data)[len] = 0;
+        filp->f_pos += *lenp;
+    } else {
+        len = (size_t)strlen(table->data);
+        if (len > (size_t)table->maxlen)
+            len = (size_t)table->maxlen;
+        if (len > *lenp)
+            len = *lenp;
+        if (len)
+            if(copy_to_user(buffer, table->data, len))
+                return -EFAULT;
+        if (len < *lenp) {
+            if(put_user('\n', ((char *) buffer) + len))
+                return -EFAULT;
+            len++;
+        }
+        *lenp = len;
+        filp->f_pos += len;
+    }
+    return 0;
+}
+
/* Perform the actual read/write of a sysctl table entry. */
/*
 * Invokes the entry's strategy routine first (negative result is an
 * error, positive means the strategy already did the transfer); if
 * the strategy is absent or returns 0, falls back to a raw copy of
 * table->data bounded by table->maxlen.
 *
 * Returns 0 on success, a negative errno otherwise.
 */
int do_sysctl_strategy (cfs_sysctl_table_t *table,
            int *name, int nlen,
            void *oldval, size_t *oldlenp,
            void *newval, size_t newlen, void **context)
{
    int op = 0, rc;
    size_t len;

    /* op mirrors the read(004)/write(002) permission bits; it is
     * computed but no permission check is performed here */
    if (oldval)
        op |= 004;
    if (newval)
        op |= 002;

    if (table->strategy) {
        rc = table->strategy(table, name, nlen, oldval, oldlenp,
                     newval, newlen, context);
        if (rc < 0)
            return rc;
        if (rc > 0)
            return 0;
    }

    /* If there is no strategy routine, or if the strategy returns
     * zero, proceed with automatic r/w */
    if (table->data && table->maxlen) {
        if (oldval && oldlenp) {
            /* NOTE(review): get_user()'s result is unchecked here,
             * unlike in sysctl_string(); a faulting oldlenp would
             * leave 'len' uninitialized -- confirm intent */
            get_user(len, oldlenp);
            if (len) {
                if (len > (size_t)table->maxlen)
                    len = (size_t)table->maxlen;
                if(copy_to_user(oldval, table->data, len))
                    return -EFAULT;
                if(put_user(len, oldlenp))
                    return -EFAULT;
            }
        }
        if (newval && newlen) {
            len = newlen;
            if (len > (size_t)table->maxlen)
                len = (size_t)table->maxlen;
            if(copy_from_user(table->data, newval, len))
                return -EFAULT;
        }
    }
    return 0;
}
+
/*
 * Resolve a sysctl(2) binary name vector against a table hierarchy.
 *
 * Walks one table level per name component: a match on a row with a
 * child table descends (after giving the row's strategy a chance to
 * intercept); a match on a leaf row performs the transfer via
 * do_sysctl_strategy().
 *
 * Returns the transfer result, or -ENOTDIR when the path does not
 * resolve, or -EFAULT on a bad name pointer.
 */
static int parse_table(int *name, int nlen,
               void *oldval, size_t *oldlenp,
               void *newval, size_t newlen,
               cfs_sysctl_table_t *table, void **context)
{
    int n;

repeat:

    if (!nlen)
        return -ENOTDIR;
    if (get_user(n, name))
        return -EFAULT;
    for ( ; table->ctl_name; table++) {
        /* CTL_ANY rows match every component at this level */
        if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
            int error;
            if (table->child) {
/*
                if (ctl_perm(table, 001))
                    return -EPERM;
*/
                if (table->strategy) {
                    error = table->strategy(
                        table, name, nlen,
                        oldval, oldlenp,
                        newval, newlen, context);
                    if (error)
                        return error;
                }
                /* descend into the child table with the next name */
                name++;
                nlen--;
                table = table->child;
                goto repeat;
            }
            error = do_sysctl_strategy(table, name, nlen,
                           oldval, oldlenp,
                           newval, newlen, context);
            return error;
        }
    }
    return -ENOTDIR;
}
+
/*
 * Top-level sysctl(2) dispatcher: validates the request, then tries
 * each registered table header (starting from root_table_header)
 * until one resolves the name vector.
 *
 * Returns the result of the first header that does not report
 * -ENOTDIR, or -ENOTDIR when no header matches.
 */
int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp,
           void *newval, size_t newlen)
{
    struct list_head *tmp;

    if (nlen <= 0 || nlen >= CTL_MAXNAME)
        return -ENOTDIR;
    if (oldval) {
        /* probe oldlenp for accessibility before doing any work */
        int old_len;
        if (!oldlenp || get_user(old_len, oldlenp))
            return -EFAULT;
    }
    tmp = &root_table_header.ctl_entry;
    do {
        struct ctl_table_header *head =
            list_entry(tmp, struct ctl_table_header, ctl_entry);
        void *context = NULL;
        int error = parse_table(name, nlen, oldval, oldlenp,
                    newval, newlen, head->ctl_table,
                    &context);
        if (context)
            cfs_free(context);
        if (error != -ENOTDIR)
            return error;
        tmp = tmp->next;
    } while (tmp != &root_table_header.ctl_entry);
    return -ENOTDIR;
}
+
+/**
+ * register_sysctl_table - register a sysctl heirarchy
+ * @table: the top-level table structure
+ * @insert_at_head: whether the entry should be inserted in front or at the end
+ *
+ * Register a sysctl table heirarchy. @table should be a filled in ctl_table
+ * array. An entry with a ctl_name of 0 terminates the table. 
+ *
+ * The members of the &ctl_table structure are used as follows:
+ *
+ * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
+ *            must be unique within that level of sysctl
+ *
+ * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
+ *            enter a sysctl file
+ *
+ * data - a pointer to data for use by proc_handler
+ *
+ * maxlen - the maximum size in bytes of the data
+ *
+ * mode - the file permissions for the /proc/sys file, and for sysctl(2)
+ *
+ * child - a pointer to the child sysctl table if this entry is a directory, or
+ *         %NULL.
+ *
+ * proc_handler - the text handler routine (described below)
+ *
+ * strategy - the strategy routine (described below)
+ *
+ * de - for internal use by the sysctl routines
+ *
+ * extra1, extra2 - extra pointers usable by the proc handler routines
+ *
+ * Leaf nodes in the sysctl tree will be represented by a single file
+ * under /proc; non-leaf nodes will be represented by directories.
+ *
+ * sysctl(2) can automatically manage read and write requests through
+ * the sysctl table.  The data and maxlen fields of the ctl_table
+ * struct enable minimal validation of the values being written to be
+ * performed, and the mode field allows minimal authentication.
+ *
+ * More sophisticated management can be enabled by the provision of a
+ * strategy routine with the table entry.  This will be called before
+ * any automatic read or write of the data is performed.
+ *
+ * The strategy routine may return
+ *
+ * < 0 - Error occurred (error is passed to user process)
+ *
+ * 0   - OK - proceed with automatic read or write.
+ *
+ * > 0 - OK - read or write has been done by the strategy routine, so
+ *       return immediately.
+ *
+ * There must be a proc_handler routine for any terminal nodes
+ * mirrored under /proc/sys (non-terminals are handled by a built-in
+ * directory handler).  Several default handlers are available to
+ * cover common cases -
+ *
+ * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
+ * proc_dointvec_minmax(), proc_doulongvec_ms_jiffies_minmax(),
+ * proc_doulongvec_minmax()
+ *
+ * It is the handler's job to read the input buffer from user memory
+ * and process it. The handler should return 0 on success.
+ *
+ * This routine returns %NULL on a failure to register, and a pointer
+ * to the table header on success.
+ */
+struct ctl_table_header *register_sysctl_table(cfs_sysctl_table_t * table, 
+                           int insert_at_head)
+{
+    struct ctl_table_header *tmp;
+    tmp = cfs_alloc(sizeof(struct ctl_table_header), 0);
+    if (!tmp)
+        return NULL;
+    tmp->ctl_table = table;
+
+    INIT_LIST_HEAD(&tmp->ctl_entry);
+    if (insert_at_head)
+        list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
+    else
+        list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
+#ifdef CONFIG_PROC_FS
+    register_proc_table(table, proc_sys_root);
+#endif
+    return tmp;
+}
+
+/**
+ * unregister_sysctl_table - unregister a sysctl table heirarchy
+ * @header: the header returned from register_sysctl_table
+ *
+ * Unregisters the sysctl table and all children. proc entries may not
+ * actually be removed until they are no longer used by anyone.
+ */
+void unregister_sysctl_table(struct ctl_table_header * header)
+{
+    list_del(&header->ctl_entry);
+#ifdef CONFIG_PROC_FS
+    unregister_proc_table(header->ctl_table, proc_sys_root);
+#endif
+    cfs_free(header);
+}
+
+
+int cfs_psdev_register(cfs_psdev_t * psdev)
+{
+    cfs_proc_entry_t *  entry;
+
+    entry = create_proc_entry (
+                (char *)psdev->name,
+                S_IFREG,
+                proc_dev_root
+            );
+
+    if (!entry) {
+        return -ENOMEM;
+    }
+
+    entry->flags |= CFS_PROC_FLAG_MISCDEV;
+
+    entry->proc_fops = psdev->fops;
+    entry->data = (void *)psdev;
+
+    return 0;
+}
+
+int cfs_psdev_deregister(cfs_psdev_t * psdev)
+{
+    cfs_proc_entry_t *  entry;
+
+    entry = search_proc_entry (
+                (char *)psdev->name,
+                proc_dev_root
+            );
+
+    if (entry) {
+
+        ASSERT(entry->data == (void *)psdev);
+        ASSERT(entry->flags & CFS_PROC_FLAG_MISCDEV);
+
+        remove_proc_entry(
+            (char *)psdev->name,
+            proc_dev_root
+            );
+    }
+
+    return 0;
+}
+
+
+
/* destination path for debug log dumps, defined in the tracing code */
extern char debug_file_path[1024];

#define PSDEV_LIBCFS  (0x100)     /* sysctl ctl_name of the libcfs subtree */
enum {
        PSDEV_DEBUG = 1,          /* control debugging */
        PSDEV_SUBSYSTEM_DEBUG,    /* control debugging */
        PSDEV_PRINTK,             /* force all errors to console */
        PSDEV_CONSOLE,            /* allow _any_ messages to console */
        PSDEV_DEBUG_PATH,         /* crashdump log location */
        PSDEV_DEBUG_DUMP_PATH,    /* crashdump tracelog location */
        PSDEV_LIBCFS_MEMUSED,     /* bytes currently PORTAL_ALLOCated */
};
+
/*
 * libcfs sysctl entries.  Positional ctl_table initializers:
 * {ctl_name, procname, data, maxlen, mode, child, proc_handler,
 * strategy}.  Note that PSDEV_CONSOLE and PSDEV_DEBUG_DUMP_PATH have
 * no rows here.
 */
static struct ctl_table libcfs_table[] = {
        {PSDEV_DEBUG, "debug", &libcfs_debug, sizeof(int), 0644, NULL,
         &proc_dointvec},
        {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &libcfs_subsystem_debug,
         sizeof(int), 0644, NULL, &proc_dointvec},
        {PSDEV_PRINTK, "printk", &libcfs_printk, sizeof(int), 0644, NULL,
         &proc_dointvec},
        {PSDEV_DEBUG_PATH, "debug_path", debug_file_path,
         sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string},
/*
        {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall,
         sizeof(portals_upcall), 0644, NULL, &proc_dostring,
         &sysctl_string},
*/
        {PSDEV_LIBCFS_MEMUSED, "memused", (int *)&libcfs_kmemory.counter,
         sizeof(int), 0644, NULL, &proc_dointvec},
        {0}
};
+
/* top-level table: mounts the libcfs subtree at /proc/sys/libcfs */
static struct ctl_table top_table[2] = {
        {PSDEV_LIBCFS, "libcfs", NULL, 0, 0555, libcfs_table},
        {0}
};
+
+
+#ifdef PORTALS_PROFILING
+/*
+ * profiling stuff.  we do this statically for now 'cause its simple,
+ * but we could do some tricks with elf sections to have this array
+ * automatically built.
+ */
/* expands to a designated initializer populating the PROF__FOO slot
 * with its name string and zeroed counters */
#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, }

/* static table of profiling counters, indexed by the PROF__* enum;
 * its size must match MAX_PROFS (checked at runtime in insert_proc()) */
struct prof_ent prof_ents[] = {
        def_prof(our_recvmsg),
        def_prof(our_sendmsg),
        def_prof(socknal_recv),
        def_prof(lib_parse),
        def_prof(conn_list_walk),
        def_prof(memcpy),
        def_prof(lib_finalize),
        def_prof(pingcli_time),
        def_prof(gmnal_send),
        def_prof(gmnal_recv),
};

EXPORT_SYMBOL(prof_ents);
+
+/*
+ * this function is as crazy as the proc filling api
+ * requires.
+ *
+ * buffer: page allocated for us to scribble in.  the
+ *  data returned to the user will be taken from here.
+ * *start: address of the pointer that will tell the 
+ *  caller where in buffer the data the user wants is.
+ * ppos: offset in the entire /proc file that the user
+ *  currently wants.
+ * wanted: the amount of data the user wants.
+ *
+ * while going, 'curpos' is the offset in the entire
+ * file where we currently are.  We only actually
+ * start filling buffer when we get to a place in
+ * the file that the user cares about.
+ *
+ * we take care to only sprintf when the user cares because
+ * we're holding a lock while we do this.
+ *
+ * we're smart and know that we generate fixed size lines.
+ * we only start writing to the buffer when the user cares.
+ * This is unpredictable because we don't snapshot the
+ * list between calls that are filling in a file from
+ * the list.  The list could change mid read and the
+ * output will look very weird indeed.  oh well.
+ */
+
/*
 * /proc read_proc callback that renders prof_ents[] as fixed-width
 * text lines (see the long protocol description above).  Returns the
 * number of bytes made available between *start and buffer+len,
 * capped at 'wanted'; sets *eof after the last entry.
 */
static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted,
                          int *eof, void *data)
{
        int len = 0, i;
        int curpos;
        char *header = "Interval        Cycles_per (Starts Finishes Total)\n";
        int header_len = strlen(header);
        char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)";
        /* every data line is padded to exactly this width (incl. '\n') */
        int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1);

        *start = buffer;

        /* serve (the remainder of) the header first */
        if (ppos < header_len) {
                int diff = MIN(header_len, wanted);
                memcpy(buffer, header + ppos, diff);
                len += diff;
                ppos += diff;
        }

        if (len >= wanted)
                goto out;

        curpos = header_len;

        for ( i = 0; i < MAX_PROFS ; i++) {
                int copied;
                struct prof_ent *pe = &prof_ents[i];
                long long cycles_per;
                /*
                 * find the part of the array that the buffer wants
                 */
                if (ppos >= (curpos + line_len))  {
                        curpos += line_len;
                        continue;
                }
                /* the clever caller split a line */
                if (ppos > curpos) {
                        *start = buffer + (ppos - curpos);
                }

                if (pe->finishes == 0)
                        cycles_per = 0;
                else
                {
                        /* 64-bit division via do_div for 32-bit hosts */
                        cycles_per = pe->total_cycles;
                        do_div (cycles_per, pe->finishes);
                }

                copied = sprintf(buffer + len, format, pe->str, cycles_per,
                                 pe->starts, pe->finishes, pe->total_cycles);

                len += copied;

                /* pad to line len, -1 for \n */
                if ((copied < line_len-1)) {
                        int diff = (line_len-1) - copied;
                        memset(buffer + len, ' ', diff);
                        len += diff;
                        copied += diff;
                }

                buffer[len++]= '\n';

                /* bail if we have enough */
                if (((buffer + len) - *start) >= wanted)
                        break;

                curpos += line_len;
        }

        /* lameness */
        if (i == MAX_PROFS)
                *eof = 1;
 out:

        return MIN(((buffer + len) - *start), wanted);
}
+
+/*
+ * all kids love /proc :/
+ */
+static unsigned char basedir[]="net/portals";
+#endif /* PORTALS_PROFILING */
+
+
+int insert_proc(void)
+{
+        cfs_proc_entry_t *ent;
+#if PORTALS_PROFILING
+        unsigned char dir[128];
+
+        if (ARRAY_SIZE(prof_ents) != MAX_PROFS) {
+                CERROR("profiling enum and array are out of sync.\n");
+                return -1;
+        }
+
+        /*
+         * This is pretty lame.  assuming that failure just
+         * means that they already existed.
+         */
+        strcat(dir, basedir);
+        create_proc_entry(dir, S_IFDIR, 0);
+
+        strcat(dir, "/cycles");
+        ent = create_proc_entry(dir, 0, 0);
+        if (!ent) {
+                CERROR("couldn't register %s?\n", dir);
+                return -1;
+        }
+
+        ent->data = NULL;
+        ent->read_proc = prof_read_proc;
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (!portals_table_header)
+                portals_table_header = register_sysctl_table(top_table, 0);
+#endif
+
+        ent = create_proc_entry("sys/portals/dump_kernel", 0, NULL);
+        if (ent == NULL) {
+                CERROR(("couldn't register dump_kernel\n"));
+                return -1;
+        }
+        ent->write_proc = trace_dk;
+
+        ent = create_proc_entry("sys/portals/daemon_file", 0, NULL);
+        if (ent == NULL) {
+                CERROR(("couldn't register daemon_file\n"));
+                return -1;
+        }
+        ent->write_proc = trace_write_daemon_file;
+        ent->read_proc = trace_read_daemon_file;
+
+        ent = create_proc_entry("sys/portals/debug_mb", 0, NULL);
+        if (ent == NULL) {
+                CERROR(("couldn't register debug_mb\n"));
+                return -1;
+        }
+        ent->write_proc = trace_write_debug_mb;
+        ent->read_proc = trace_read_debug_mb;
+
+        return 0;
+}
+
/*
 * Tear down everything insert_proc() created: the profiling entries
 * (when enabled), the tracing control files, and the sysctl table.
 */
void remove_proc(void)
{
#if PORTALS_PROFILING
        unsigned char dir[128];
        int end;

        dir[0]='\0';
        strcat(dir, basedir);

        /* remember where the base path ends so it can be restored */
        end = strlen(dir);

        strcat(dir, "/cycles");
        remove_proc_entry(dir, 0);

        dir[end] = '\0';
        remove_proc_entry(dir, 0);
#endif /* PORTALS_PROFILING */

        remove_proc_entry("sys/portals/dump_kernel", NULL);
        remove_proc_entry("sys/portals/daemon_file", NULL);
        remove_proc_entry("sys/portals/debug_mb", NULL);

#ifdef CONFIG_SYSCTL
        if (portals_table_header)
                unregister_sysctl_table(portals_table_header);
        portals_table_header = NULL;
#endif
}
+
+
+/*
+ *  proc process routines of kernel space
+ */
+
+cfs_file_t *
+lustre_open_file(char * filename)
+{
+    int rc = 0;
+    cfs_file_t * fh = NULL;
+    cfs_proc_entry_t * fp = NULL;
+
+    fp = search_proc_entry(filename, proc_fs_root);
+
+    if (!fp) {
+        rc =  -ENOENT;
+        return NULL;
+    }
+
+    fh = cfs_alloc(sizeof(cfs_file_t), CFS_ALLOC_ZERO);
+
+    if (!fh) {
+        rc =  -ENOMEM;
+        return NULL;
+    }
+
+    fh->private_data = (void *)fp;
+    fh->f_op = fp->proc_fops;
+
+    if (fh->f_op->open) {
+        rc = (fh->f_op->open)(fh);
+    } else {
+        fp->nlink++;
+    }
+
+    if (0 != rc) {
+        cfs_free(fh);
+        return NULL;
+    }
+
+    return fh;
+}
+
+int
+lustre_close_file(cfs_file_t * fh)
+{
+    int rc = 0;
+    cfs_proc_entry_t * fp = NULL;
+
+    fp = (cfs_proc_entry_t *) fh->private_data;
+
+    if (fh->f_op->release) {
+        rc = (fh->f_op->release)(fh);
+    } else {
+        fp->nlink--;
+    }
+
+    cfs_free(fh);
+
+    return rc;
+}
+
+int
+lustre_do_ioctl( cfs_file_t * fh,
+                 unsigned long cmd,
+                 ulong_ptr arg )
+{
+    int rc = 0;
+
+    if (fh->f_op->ioctl) {
+        rc = (fh->f_op->ioctl)(fh, cmd, arg);
+    }
+
+    if (rc != 0) {
+        printk("lustre_do_ioctl: fialed: cmd = %xh arg = %xh rc = %d\n",
+                cmd, arg, rc);
+    }
+
+    return rc;
+}
+    
/*
 * Handle a CFS_PROC_IOCTL request: the ioctl payload is packed
 * immediately after the CFS_PROC_IOCTL header, so compute its
 * address and forward it to lustre_do_ioctl().
 */
int
lustre_ioctl_file(cfs_file_t * fh, PCFS_PROC_IOCTL devctl)
{
    int         rc = 0;
    ulong_ptr   data;

    /* payload starts right after the ioctl header */
    data = (ulong_ptr)devctl + sizeof(CFS_PROC_IOCTL);

    /* obd ioctl code */
    if (_IOC_TYPE(devctl->cmd) == 'f') {
#if 0
        /* disabled: rebase obd_ioctl_data's embedded buffer pointers
         * onto the packed payload for non-BRW commands */
        struct obd_ioctl_data * obd = (struct obd_ioctl_data *) data;

        if ( devctl->cmd != (ULONG)OBD_IOC_BRW_WRITE  &&
             devctl->cmd != (ULONG)OBD_IOC_BRW_READ ) {

            unsigned long off = obd->ioc_len;

            if (obd->ioc_pbuf1) {
                obd->ioc_pbuf1 = (char *)(data + off);
                off += size_round(obd->ioc_plen1);
            }

            if (obd->ioc_pbuf2) {
                obd->ioc_pbuf2 = (char *)(data + off);
            }
        }
 #endif
   }

    rc = lustre_do_ioctl(fh, devctl->cmd, data);

    return rc;
}
+
+
+size_t
+lustre_read_file(
+    cfs_file_t *    fh,
+    loff_t          off,
+    size_t          size,
+    char *          buf
+    )
+{
+    size_t rc = 0;
+
+    if (fh->f_op->read) {
+        rc = (fh->f_op->read) (fh, buf, size, &off);
+    }
+
+    return rc;
+}
+
+size_t
+lustre_write_file(
+    cfs_file_t *    fh,
+    loff_t          off,
+    size_t          size,
+    char *          buf
+    )
+{
+    size_t rc = 0;
+
+    if (fh->f_op->write) {
+        rc = (fh->f_op->write)(fh, buf, size, &off);
+    }
+
+    return rc;
+}  
+
+#else /* !__KERNEL__ */
+
+#include <lnet/api-support.h>
+#include <liblustre.h>
+#include <lustre_lib.h>
+
+/*
+ * proc process routines of user space
+ */
+
/*
 * User-space open of a libcfs /proc or /dev path, implemented on top
 * of the kernel driver via ZwCreateFile on LUSTRE_PROC_SYMLNK.  The
 * /proc-relative entry name is smuggled to the driver inside an
 * extended-attribute (EA) record.
 *
 * oflag: POSIX-style O_* flags, translated to NT access/disposition.
 *
 * NOTE(review): on every error path this returns FileHandle, which is
 * still INVALID_HANDLE_VALUE; the computed errno in 'rc' is discarded
 * -- confirm callers only test the handle.
 */
HANDLE cfs_proc_open (char * filename, int oflag)
{
    NTSTATUS            status;
    IO_STATUS_BLOCK     iosb;
    int                 rc;

    HANDLE              FileHandle = INVALID_HANDLE_VALUE;
    OBJECT_ATTRIBUTES   ObjectAttributes;
    ACCESS_MASK         DesiredAccess;
    ULONG               CreateDisposition;
    ULONG               ShareAccess;
    ULONG               CreateOptions;
    UNICODE_STRING      UnicodeName;
    USHORT              NameLength;

    PFILE_FULL_EA_INFORMATION Ea = NULL;
    ULONG               EaLength;
    UCHAR               EaBuffer[EA_MAX_LENGTH];

    /* Check the filename: should start with "/proc" or "/dev" */
    NameLength = (USHORT)strlen(filename);
    if (NameLength > 0x05) {
        if (_strnicmp(filename, "/proc/", 6) == 0) {
            filename += 6;
            NameLength -=6;
            /* NameLength is unsigned (USHORT), so this only
             * catches the empty-name (== 0) case */
            if (NameLength <= 0) {
                rc = -EINVAL;
                goto errorout;
            }
        } else if (_strnicmp(filename, "/dev/", 5) == 0) {
        } else {
            rc = -EINVAL;
            goto errorout;
        }
    } else {
        rc = -EINVAL;
        goto errorout;
    }

    /* Analyze the flags settings */

    if (cfs_is_flag_set(oflag, O_WRONLY)) {
        DesiredAccess = (GENERIC_WRITE | SYNCHRONIZE);
        ShareAccess = 0;
    }  else if (cfs_is_flag_set(oflag, O_RDWR)) {
        DesiredAccess = (GENERIC_READ | GENERIC_WRITE | SYNCHRONIZE);
        ShareAccess = FILE_SHARE_READ | FILE_SHARE_WRITE;
    } else {
        DesiredAccess = (GENERIC_READ | SYNCHRONIZE);
        ShareAccess = FILE_SHARE_READ;
    }

    if (cfs_is_flag_set(oflag, O_CREAT)) {
        if (cfs_is_flag_set(oflag, O_EXCL)) {
            /* exclusive create is rejected as unsupported; the
             * disposition assignment above the return is dead */
            CreateDisposition = FILE_CREATE;
            rc = -EINVAL;
            goto errorout;
        } else {
            CreateDisposition = FILE_OPEN_IF;
        }
    } else {
        CreateDisposition = FILE_OPEN;
    }

    if (cfs_is_flag_set(oflag, O_TRUNC)) {
        if (cfs_is_flag_set(oflag, O_EXCL)) {
            CreateDisposition = FILE_OVERWRITE;
        } else {
            CreateDisposition = FILE_OVERWRITE_IF;
        }
    }

    CreateOptions = 0;

    if (cfs_is_flag_set(oflag, O_DIRECTORY)) {
        cfs_set_flag(CreateOptions,  FILE_DIRECTORY_FILE);
    }

    if (cfs_is_flag_set(oflag, O_SYNC)) {
         cfs_set_flag(CreateOptions, FILE_WRITE_THROUGH);
    }

    if (cfs_is_flag_set(oflag, O_DIRECT)) {
         cfs_set_flag(CreateOptions, FILE_NO_INTERMEDIATE_BUFFERING);
    }

    /* Initialize the unicode path name for the specified file */
    RtlInitUnicodeString(&UnicodeName, LUSTRE_PROC_SYMLNK);

    /* Setup the object attributes structure for the file. */
    InitializeObjectAttributes(
            &ObjectAttributes,
            &UnicodeName,
            OBJ_CASE_INSENSITIVE,
            NULL,
            NULL );

    /* building EA for the proc entry ...  */
    Ea = (PFILE_FULL_EA_INFORMATION)EaBuffer;
    Ea->NextEntryOffset = 0;
    Ea->Flags = 0;
    Ea->EaNameLength = (UCHAR)NameLength;
    Ea->EaValueLength = 0;
    /* copy the entry name including its terminating NUL */
    RtlCopyMemory(
        &(Ea->EaName),
        filename,
        NameLength + 1
        );
    EaLength = sizeof(FILE_FULL_EA_INFORMATION) - 1 +
                               Ea->EaNameLength + 1;

    /* Now to open or create the file now */
    status = ZwCreateFile(
                &FileHandle,
                DesiredAccess,
                &ObjectAttributes,
                &iosb,
                0,
                FILE_ATTRIBUTE_NORMAL,
                ShareAccess,
                CreateDisposition,
                CreateOptions,
                Ea,
                EaLength );

    /* Check the returned status of Iosb ... */

    if (!NT_SUCCESS(status)) {
        rc = cfs_error_code(status);
        goto errorout;
    }

errorout:

    return FileHandle;
}
+
+/*
+ * cfs_proc_close
+ *   Close a proc/dev handle returned by cfs_proc_open.
+ *   A NULL handle is silently ignored; always returns 0.
+ */
+int cfs_proc_close(HANDLE handle)
+{
+    if (handle) {
+        NtClose((HANDLE)handle);
+    }
+
+    return 0;
+}
+
+/*
+ * cfs_proc_read
+ *   Read up to 'count' bytes from the proc entry into 'buffer'.
+ *   Returns the number of bytes transferred on success, or a
+ *   negative errno-style code mapped from the NT status on failure.
+ */
+int cfs_proc_read(HANDLE handle, void *buffer, unsigned int count)
+{
+    NTSTATUS            status;
+    IO_STATUS_BLOCK     iosb;
+    LARGE_INTEGER       offset;
+
+
+    /* always read from the start of the entry */
+    offset.QuadPart = 0;
+
+    /* read file data */
+    status = NtReadFile(
+                (HANDLE)handle,
+                0,
+                NULL,
+                NULL,
+                &iosb,
+                buffer,
+                count,
+                &offset,
+                NULL);                     
+
+    /* check the return status */
+    if (!NT_SUCCESS(status)) {
+        printf("NtReadFile request failed 0x%0x\n", status);
+        goto errorout;
+    }
+
+errorout:
+
+    /* iosb.Information holds the byte count actually read */
+    if (NT_SUCCESS(status)) {
+        return iosb.Information;
+    }
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * cfs_proc_write
+ *   Write 'count' bytes from 'buffer' to the proc entry.
+ *   Returns the number of bytes written on success, or a negative
+ *   errno-style code mapped from the NT status on failure.
+ */
+int cfs_proc_write(HANDLE handle, void *buffer, unsigned int count)
+{
+    NTSTATUS            status;
+    IO_STATUS_BLOCK     iosb;
+    LARGE_INTEGER       offset;
+
+    /* QuadPart == -1: FILE_WRITE_TO_END_OF_FILE, i.e. append */
+    offset.QuadPart = -1;
+
+    /* write buffer to the opened file */
+    status = NtWriteFile(
+                (HANDLE)handle,
+                0,
+                NULL,
+                NULL,
+                &iosb,
+                buffer,
+                count,
+                &offset,
+                NULL);                     
+
+    /* check the return status */
+    if (!NT_SUCCESS(status)) {
+        printf("NtWriteFile request failed 0x%0x\n", status);
+        goto errorout;
+    }
+
+errorout:
+
+    /* iosb.Information holds the byte count actually written */
+    if (NT_SUCCESS(status)) {
+        return iosb.Information;
+    }
+
+    return cfs_error_code(status);
+}
+
+/*
+ * cfs_proc_ioctl
+ *   Marshal a portals/obd ioctl into a single flat buffer and submit
+ *   it to the kernel driver via IOCTL_LIBCFS_ENTRY.
+ *
+ *   Buffer layout: [CFS_PROC_IOCTL header][ioctl payload of 'length'
+ *   bytes][extra pbuf1/pbuf2 data for obd ioctls].  On success the
+ *   (possibly updated) payload is copied back into 'buffer'.
+ *
+ *   Returns 0 on success or a negative errno-style code.
+ */
+int cfs_proc_ioctl(HANDLE handle, int cmd, void *buffer)
+{
+    PUCHAR          procdat = NULL;
+    CFS_PROC_IOCTL  procctl;
+    ULONG           length = 0;
+    ULONG           extra = 0;
+
+    NTSTATUS        status;
+    IO_STATUS_BLOCK iosb;
+
+    procctl.cmd = cmd;
+
+    /* work out payload size from the ioctl class:
+     *  - portals ioctls carry their own length in ioc_len
+     *  - obd ('f') ioctls additionally need room for pbuf1/pbuf2
+     *  - 'u' ioctls carry a bare 4-byte argument */
+    if(_IOC_TYPE(cmd) == IOC_PORTAL_TYPE) {
+        struct portal_ioctl_data * portal;
+        portal = (struct portal_ioctl_data *) buffer;
+        length = portal->ioc_len;
+    } else if (_IOC_TYPE(cmd) == 'f') {
+        struct obd_ioctl_data * obd;
+        obd = (struct obd_ioctl_data *) buffer;
+        length = obd->ioc_len;
+        extra = size_round(obd->ioc_plen1) + size_round(obd->ioc_plen2);
+    } else if(_IOC_TYPE(cmd) == 'u') {
+        length = 4;
+        extra  = 0;
+    } else {
+        printf("user:winnt-proc:cfs_proc_ioctl: un-supported ioctl type ...\n");
+        cfs_enter_debugger();
+        status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    procctl.len = length + extra;
+    procdat = malloc(length + extra + sizeof(CFS_PROC_IOCTL));
+
+    if (NULL == procdat) {
+        printf("user:winnt-proc:cfs_proc_ioctl: no enough memory ...\n");
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        cfs_enter_debugger();
+        goto errorout;
+    }
+    memset(procdat, 0, length + extra + sizeof(CFS_PROC_IOCTL));
+    memcpy(procdat, &procctl, sizeof(CFS_PROC_IOCTL));
+    memcpy(&procdat[sizeof(CFS_PROC_IOCTL)], buffer, length);
+    /* from here on 'length' counts the header too */
+    length += sizeof(CFS_PROC_IOCTL);
+
+    if (_IOC_TYPE(cmd) == 'f') {
+
+        char *ptr;
+        struct obd_ioctl_data * data;
+        struct obd_ioctl_data * obd;
+
+        data = (struct obd_ioctl_data *) buffer;
+        obd  = (struct obd_ioctl_data *) (procdat + sizeof(CFS_PROC_IOCTL));
+        ptr = obd->ioc_bulk;
+
+        /* relocate the inline buffers so they point into the copied
+         * bulk area instead of the caller's address space */
+        if (data->ioc_inlbuf1) {
+                obd->ioc_inlbuf1 = ptr;
+                LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr);
+        }
+
+        if (data->ioc_inlbuf2) {
+                obd->ioc_inlbuf2 = ptr;
+                LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr);
+        }
+        if (data->ioc_inlbuf3) {
+                obd->ioc_inlbuf3 = ptr;
+                LOGL(data->ioc_inlbuf3, data->ioc_inllen3, ptr);
+        }
+        if (data->ioc_inlbuf4) {
+                obd->ioc_inlbuf4 = ptr;
+                LOGL(data->ioc_inlbuf4, data->ioc_inllen4, ptr);
+        }
+    
+        /* bulk-I/O commands pass pbuf1/pbuf2 by reference; for all
+         * other obd ioctls the pbuf data is appended after the payload */
+        if ( cmd != (ULONG)OBD_IOC_BRW_WRITE  &&
+             cmd != (ULONG)OBD_IOC_BRW_READ ) {
+
+            if (data->ioc_pbuf1 && data->ioc_plen1) {
+                obd->ioc_pbuf1 = &procdat[length];
+                memcpy(obd->ioc_pbuf1, data->ioc_pbuf1, data->ioc_plen1); 
+                length += size_round(data->ioc_plen1);
+            }
+
+            if (data->ioc_pbuf2 && data->ioc_plen2) {
+                obd->ioc_pbuf2 = &procdat[length];
+                memcpy(obd->ioc_pbuf2, data->ioc_pbuf2, data->ioc_plen2);
+                length += size_round(data->ioc_plen2);
+            }
+        }
+
+        if (obd_ioctl_is_invalid(obd)) {
+            cfs_enter_debugger();
+        }
+    }
+
+    /* same flat buffer is used for both input and output */
+    status = NtDeviceIoControlFile(
+                (HANDLE)handle,
+                NULL, NULL, NULL, &iosb,
+                IOCTL_LIBCFS_ENTRY,
+                procdat, length,
+                procdat, length );
+
+
+    /* copy the (header-less) payload back to the caller's buffer */
+    if (NT_SUCCESS(status)) {
+        memcpy(buffer, &procdat[sizeof(CFS_PROC_IOCTL)], procctl.len); 
+    }
+
+errorout:
+
+    if (procdat) {
+        free(procdat);
+    }
+
+    return cfs_error_code(status);
+}
+
+#endif /* __KERNEL__ */
\ No newline at end of file
diff --git a/lnet/libcfs/winnt/winnt-sync.c b/lnet/libcfs/winnt/winnt-sync.c
new file mode 100644 (file)
index 0000000..5094bef
--- /dev/null
@@ -0,0 +1,449 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LIBCFS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+
+/*
+ * Wait queue routines
+ */
+
+/*
+ * cfs_waitq_init
+ *   To initialize the wait queue
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitq_init(cfs_waitq_t *waitq)
+{
+    /* stamp the magic, start with an empty waiter list and a
+     * fresh guard spinlock protecting that list */
+    waitq->magic = CFS_WAITQ_MAGIC;
+    waitq->flags = 0;
+    INIT_LIST_HEAD(&(waitq->waiters));
+    spin_lock_init(&(waitq->guard));
+}
+
+/*
+ * cfs_waitlink_init
+ *   To initialize the wake link node
+ *
+ * Arguments:
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitlink_init(cfs_waitlink_t *link)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    /* the link borrows the event and hit counter from the
+     * per-task slot embedding the current task structure */
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    cfs_assert(slot->Magic == TASKSLT_MAGIC);
+
+    memset(link, 0, sizeof(cfs_waitlink_t));
+
+    link->magic = CFS_WAITLINK_MAGIC;
+    link->flags = 0;
+
+    link->event = &(slot->Event);
+    link->hits  = &(slot->hits);
+
+    /* one more outstanding waitlink for this task slot */
+    atomic_inc(&slot->count);
+
+    /* one channel entry for the normal queue, one for forwarding */
+    INIT_LIST_HEAD(&(link->waitq[0].link));
+    INIT_LIST_HEAD(&(link->waitq[1].link));
+
+    link->waitq[0].waitl = link->waitq[1].waitl = link;
+}
+
+
+/*
+ * cfs_waitlink_fini
+ *   To finalize the wake link node
+ *
+ * Arguments:
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitlink_fini(cfs_waitlink_t *link)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    cfs_assert(slot->Magic == TASKSLT_MAGIC);
+    cfs_assert(link->magic == CFS_WAITLINK_MAGIC);
+    /* the link must already be removed from both channels
+     * (see cfs_waitq_del) before it can be finalized */
+    cfs_assert(link->waitq[0].waitq == NULL);
+    cfs_assert(link->waitq[1].waitq == NULL);
+
+    /* drop the reference taken in cfs_waitlink_init */
+    atomic_dec(&slot->count);
+}
+
+
+/*
+ * cfs_waitq_add_internal
+ *   To queue the wait link node to the wait queue
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:   pointer to the cfs_waitlink_t structure
+ *   int:    queue no (Normal or Forward waitq)
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitq_add_internal(cfs_waitq_t *waitq,
+                            cfs_waitlink_t *link,
+                            __u32 waitqid )
+{ 
+    LASSERT(waitq != NULL);
+    LASSERT(link != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+    LASSERT(waitqid < CFS_WAITQ_CHANNELS);
+
+    spin_lock(&(waitq->guard));
+    /* the selected channel must not already be queued anywhere */
+    LASSERT(link->waitq[waitqid].waitq == NULL);
+    link->waitq[waitqid].waitq = waitq;
+    /* exclusive waiters go to the tail so non-exclusive waiters
+     * (at the head) are always woken first by signal_nr */
+    if (link->flags & CFS_WAITQ_EXCLUSIVE) {
+        list_add_tail(&link->waitq[waitqid].link, &waitq->waiters);
+    } else {
+        list_add(&link->waitq[waitqid].link, &waitq->waiters);
+    }
+    spin_unlock(&(waitq->guard));
+}
+/*
+ * cfs_waitq_add
+ *   To queue the wait link node to the wait queue
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitq_add(cfs_waitq_t *waitq,
+                   cfs_waitlink_t *link)
+{ 
+    /* enqueue on the normal (non-forwarding) channel */
+    cfs_waitq_add_internal(waitq, link, CFS_WAITQ_CHAN_NORMAL);
+}
+
+/*
+ * cfs_waitq_add_exclusive
+ *   To set the wait link node to exclusive mode
+ *   and queue it to the wait queue
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:  pointer to the cfs_wait_link structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitq_add_exclusive( cfs_waitq_t *waitq,
+                              cfs_waitlink_t *link)
+{
+    LASSERT(waitq != NULL);
+    LASSERT(link != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+
+    /* mark exclusive first so cfs_waitq_add places us at the tail */
+       link->flags |= CFS_WAITQ_EXCLUSIVE;
+    cfs_waitq_add(waitq, link);
+}
+
+/*
+ * cfs_waitq_forward
+ *   To be determined.
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitq_forward( cfs_waitlink_t *link,
+                        cfs_waitq_t *waitq)
+{
+    /* enqueue on the forwarding channel of the link */
+    cfs_waitq_add_internal(waitq, link, CFS_WAITQ_CHAN_FORWARD);
+}
+
+/*
+ * cfs_waitq_del
+ *   To remove the wait link node from the waitq
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_ waitq_t structure
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitq_del( cfs_waitq_t *waitq,
+                    cfs_waitlink_t *link)
+{
+    int i = 0;
+
+    LASSERT(waitq != NULL);
+    LASSERT(link != NULL);
+
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+
+    spin_lock(&(waitq->guard));
+
+    /* find which channel of the link is queued on this waitq */
+    for (i=0; i < CFS_WAITQ_CHANNELS; i++) {
+        if (link->waitq[i].waitq == waitq)
+            break;
+    }
+
+    if (i < CFS_WAITQ_CHANNELS) {
+        /* unlink it and clear the back pointer so waitlink_fini's
+         * assertions hold */
+        link->waitq[i].waitq = NULL;
+        list_del_init(&link->waitq[i].link);
+    } else {
+        /* link was not queued on this waitq at all */
+        cfs_enter_debugger();
+    }
+
+    spin_unlock(&(waitq->guard));
+}
+
+/*
+ * cfs_waitq_active
+ *   Is the waitq active (not empty) ?
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_ waitq_t structure
+ *
+ * Return Value:
+ *   Zero: the waitq is empty
+ *   Non-Zero: the waitq is active
+ *
+ * Notes: 
+ *   We always return TRUE here, the same as Darwin.
+ */
+
+int cfs_waitq_active(cfs_waitq_t *waitq)
+{
+    LASSERT(waitq != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+
+    /* conservatively report "active" unconditionally, matching
+     * the Darwin port's behaviour */
+       return (1);
+}
+
+/*
+ * cfs_waitq_signal_nr
+ *   To wake up all the non-exclusive tasks plus nr exclusive
+ *   ones in the waitq
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   nr:    number of exclusive tasks to be woken up
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+
+void cfs_waitq_signal_nr(cfs_waitq_t *waitq, int nr)
+{
+    int     result;
+    cfs_waitlink_channel_t * scan;
+
+    LASSERT(waitq != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+
+    spin_lock(&waitq->guard);
+
+    /* walk the waiter list: non-exclusive waiters sit at the head,
+     * exclusive ones at the tail (see cfs_waitq_add_internal) */
+    list_for_each_entry(scan, &waitq->waiters, cfs_waitlink_channel_t, link) {
+
+        cfs_waitlink_t *waitl = scan->waitl;
+
+        result = cfs_wake_event(waitl->event);
+        LASSERT( result == FALSE || result == TRUE );
+
+        /* record the wakeup so a waiter that has not yet blocked
+         * will consume it instead of sleeping (see cfs_waitq_wait) */
+        if (result) {
+            atomic_inc(waitl->hits);
+        }
+
+        /* stop after waking nr exclusive waiters; with nr == 0 the
+         * decrement never reaches zero here, so all are woken */
+        if ((waitl->flags & CFS_WAITQ_EXCLUSIVE) && --nr == 0)
+            break;
+    }
+
+    spin_unlock(&waitq->guard);
+    return;
+}
+
+/*
+ * cfs_waitq_signal
+ *   To wake up all the non-exclusive tasks and 1 exclusive
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitq_signal(cfs_waitq_t *waitq)
+{
+    /* wake all non-exclusive waiters plus one exclusive waiter */
+    cfs_waitq_signal_nr(waitq, 1);
+}
+
+
+/*
+ * cfs_waitq_broadcast
+ *   To wake up all the tasks in the waitq
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitq_broadcast(cfs_waitq_t *waitq)
+{
+    LASSERT(waitq != NULL);
+    LASSERT(waitq->magic ==CFS_WAITQ_MAGIC);
+
+    /* nr == 0 means "no exclusive limit": wake every waiter */
+       cfs_waitq_signal_nr(waitq, 0);
+}
+
+/*
+ * cfs_waitq_wait
+ *   To wait on the link node until it is signaled.
+ *
+ * Arguments:
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitq_wait(cfs_waitlink_t *link, cfs_task_state_t state)
+{ 
+    LASSERT(link != NULL);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+
+    /* consume a pending wakeup (recorded by cfs_waitq_signal_nr)
+     * if one exists; otherwise block on the per-task event */
+    if (atomic_read(link->hits) > 0) {
+        atomic_dec(link->hits);
+        /* sanity: the counter must not have wrapped negative */
+        LASSERT((__u32)atomic_read(link->hits) < (__u32)0xFFFFFF00);
+    } else {
+        cfs_wait_event(link->event, 0);
+    }
+}
+
+/*
+ * cfs_waitq_timedwait
+ *   To wait the link node to be signaled with a timeout limit
+ *
+ * Arguments:
+ *   link:   pointer to the cfs_waitlink_t structure
+ *   timeout: the timeout limitation
+ *
+ * Return Value:
+ *   Woken up: return the difference of the current time and
+ *             the timeout
+ *   Timeout:  return 0
+ *
+ * Notes: 
+ *   What if it happens to be woken up at the just timeout time !?
+ */
+
+cfs_duration_t cfs_waitq_timedwait( cfs_waitlink_t *link,
+                                    cfs_task_state_t state,
+                                    cfs_duration_t timeout)
+{ 
+
+    /* a pending wakeup satisfies the wait immediately */
+    if (atomic_read(link->hits) > 0) {
+        atomic_dec(link->hits);
+        /* sanity: the counter must not have wrapped negative */
+        LASSERT((__u32)atomic_read(link->hits) < (__u32)0xFFFFFF00);
+        return TRUE;
+    }
+
+    /* block with a timeout; cfs_wait_event's result is passed
+     * through (0 on timeout, non-zero when woken) */
+    return (cfs_duration_t)cfs_wait_event(link->event, timeout);
+}
+
+
diff --git a/lnet/libcfs/winnt/winnt-tcpip.c b/lnet/libcfs/winnt/winnt-tcpip.c
new file mode 100644 (file)
index 0000000..faa3f11
--- /dev/null
@@ -0,0 +1,6519 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LIBCFS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+#include <lnet/lnet.h>
+
+ks_data_t ksocknal_data;
+
+/*
+ * ksocknal_tdi_send_flags
+ *   Translate BSD-style socket send flags (MSG_*) into the
+ *   equivalent TDI_SEND_* flags for TdiBuildSend.
+ */
+ULONG
+ksocknal_tdi_send_flags(ULONG SockFlags)
+{
+    ULONG   TdiFlags = 0;
+
+    /* MSG_OOB: out-of-band / expedited data */
+    if (cfs_is_flag_set(SockFlags, MSG_OOB)) {
+        cfs_set_flag(TdiFlags, TDI_SEND_EXPEDITED);
+    }
+
+    /* MSG_MORE: more data follows, allow coalescing */
+    if (cfs_is_flag_set(SockFlags, MSG_MORE)) {
+        cfs_set_flag(TdiFlags, TDI_SEND_PARTIAL);
+    }
+
+    /* MSG_DONTWAIT: do not block in the transport */
+    if (cfs_is_flag_set(SockFlags, MSG_DONTWAIT)) {
+        cfs_set_flag(TdiFlags, TDI_SEND_NON_BLOCKING);
+    }
+
+    return TdiFlags;
+}
+
+/*
+ * KsIrpCompletionRoutine
+ *   Generic IRP completion routine: signals the event supplied in
+ *   Context (if any) and returns STATUS_MORE_PROCESSING_REQUIRED so
+ *   the I/O manager leaves the IRP to the submitter for reuse/free
+ *   (see KsSubmitTdiIrp).
+ */
+NTSTATUS
+KsIrpCompletionRoutine(
+    IN PDEVICE_OBJECT    DeviceObject,
+    IN PIRP              Irp,
+    IN PVOID             Context
+    )
+{
+    if (NULL != Context) {
+        KeSetEvent((PKEVENT)Context, IO_NETWORK_INCREMENT, FALSE);
+    }
+
+    return STATUS_MORE_PROCESSING_REQUIRED;
+
+    /* placed after the return only to silence unused-parameter
+     * warnings; never executed */
+    UNREFERENCED_PARAMETER(DeviceObject);
+    UNREFERENCED_PARAMETER(Irp);
+}
+
+
+/*
+ * KsBuildTdiIrp
+ *   Allocate a new IRP and initialize it to be issued to tdi
+ *
+ * Arguments:
+ *   DeviceObject:  device object created by the underlying
+ *                  TDI transport driver
+ *
+ * Return Value:
+ *   PRIP:   the allocated Irp in success or NULL in failure.
+ *
+ * NOTES: 
+ *   N/A
+ */
+
+PIRP
+KsBuildTdiIrp(
+    IN PDEVICE_OBJECT    DeviceObject
+    )
+{
+    PIRP                Irp;
+    PIO_STACK_LOCATION  IrpSp;
+
+    //
+    // Allocating the IRP sized for the target device's stack ...
+    //
+
+    Irp = IoAllocateIrp(DeviceObject->StackSize, FALSE);
+
+    if (NULL != Irp) {
+
+        //
+        // Getting the Next Stack Location ...
+        //
+
+        IrpSp = IoGetNextIrpStackLocation(Irp);
+
+        //
+        // Initializing Irp as a TDI internal device control; the
+        // specific minor code is filled in by the TdiBuild* macros.
+        //
+
+        IrpSp->MajorFunction = IRP_MJ_INTERNAL_DEVICE_CONTROL;
+        IrpSp->Parameters.DeviceIoControl.IoControlCode = 0;
+    }
+
+    /* NULL on allocation failure */
+    return Irp;
+}
+
+/*
+ * KsSubmitTdiIrp
+ *   Issue the Irp to the underlying tdi driver
+ *
+ * Arguments:
+ *   DeviceObject:  the device object created by TDI driver
+ *   Irp:           the I/O request packet to be processed
+ *   bSynchronous:  synchronous or not. If true, we need wait
+ *                  until the process is finished.
+ *   Information:   returned info
+ *
+ * Return Value:
+ *   NTSTATUS:      kernel status code
+ *
+ * NOTES: 
+ *   N/A
+ */
+
+NTSTATUS
+KsSubmitTdiIrp(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN BOOLEAN          bSynchronous,
+    OUT PULONG          Information
+    )
+{
+    NTSTATUS            Status;
+    KEVENT              Event;
+
+    if (bSynchronous) {
+
+        KeInitializeEvent(
+            &Event,
+            SynchronizationEvent,
+            FALSE
+            );
+
+
+        /* completion routine signals the event and claims the IRP
+         * (STATUS_MORE_PROCESSING_REQUIRED), so we free it below */
+        IoSetCompletionRoutine(
+            Irp,
+            KsIrpCompletionRoutine,
+            &Event,
+            TRUE,
+            TRUE,
+            TRUE
+            );
+    }
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    if (bSynchronous) {
+
+        if (STATUS_PENDING == Status) {
+
+            Status = KeWaitForSingleObject(
+                        &Event,
+                        Executive,
+                        KernelMode,
+                        FALSE,
+                        NULL
+                        );
+        }
+
+        /* the real completion status lives in the IRP */
+        Status = Irp->IoStatus.Status;
+
+        if (Information) {
+            *Information = (ULONG)(Irp->IoStatus.Information);
+        }
+
+        /* the IRP is ours now: detach any MDL and free it.
+         * NOTE: the caller must not touch Irp afterwards. */
+        Irp->MdlAddress = NULL;
+        IoFreeIrp(Irp);
+    }
+
+    if (!NT_SUCCESS(Status)) {
+
+        KsPrint((2, "KsSubmitTdiIrp: Error when submitting the Irp: Status = %xh (%s) ...\n",
+                    Status, KsNtStatusToString(Status)));
+    }
+
+    return (Status);
+}
+
+
+
+/*
+ * KsOpenControl
+ *   Open the Control Channel Object ...
+ *
+ * Arguments:
+ *   DeviceName:   the device name to be opened
+ *   Handle:       opened handle in success case
+ *   FileObject:   the fileobject of the device
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes: 
+ *   N/A
+ */
+
+NTSTATUS
+KsOpenControl(
+    IN PUNICODE_STRING      DeviceName,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   )
+{
+    NTSTATUS          Status = STATUS_SUCCESS;
+
+    OBJECT_ATTRIBUTES ObjectAttributes;
+    IO_STATUS_BLOCK   IoStatus;
+
+
+    /* ZwCreateFile and ObReferenceObjectByHandle require < DISPATCH_LEVEL */
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    //
+    // Initializing ...
+    //
+
+    InitializeObjectAttributes(
+        &ObjectAttributes,
+        DeviceName,
+        OBJ_CASE_INSENSITIVE |
+        OBJ_KERNEL_HANDLE,
+        NULL,
+        NULL
+        );
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    //
+    // Opening the control channel: no EA buffer distinguishes it from
+    // address / connection objects on the same TDI device.
+    //
+
+    Status = ZwCreateFile(
+                Handle,
+                FILE_READ_DATA | FILE_WRITE_DATA,
+                &ObjectAttributes,
+                &IoStatus,
+                0,
+                FILE_ATTRIBUTE_NORMAL,
+                FILE_SHARE_READ | FILE_SHARE_WRITE,
+                FILE_OPEN,
+                0,
+                NULL,
+                0
+                );
+
+
+    if (NT_SUCCESS(Status)) {
+
+        //
+        // Now Obtaining the FileObject of the Transport Address ...
+        //
+
+        Status = ObReferenceObjectByHandle(
+                    *Handle,
+                    FILE_ANY_ACCESS,
+                    NULL,
+                    KernelMode,
+                    FileObject,
+                    NULL
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            /* couldn't reference the object: don't leak the handle */
+            cfs_enter_debugger();
+            ZwClose(*Handle);
+        }
+
+    } else {
+
+        cfs_enter_debugger();
+    }
+
+    return (Status);
+}
+
+
+/*
+ * KsCloseControl
+ *   Release the Control Channel Handle and FileObject
+ *
+ * Arguments:
+ *   Handle:       the channel handle to be released
+ *   FileObject:   the fileobject to be released
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes: 
+ *   N/A
+ */
+
+NTSTATUS
+KsCloseControl(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+   )
+{
+    NTSTATUS  Status = STATUS_SUCCESS;
+
+    /* ZwClose requires < DISPATCH_LEVEL */
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    /* drop the reference taken by ObReferenceObjectByHandle */
+    if (FileObject) {
+
+        ObDereferenceObject(FileObject);
+    }
+
+    if (Handle) {
+
+        Status = ZwClose(Handle);
+    }
+
+    ASSERT(NT_SUCCESS(Status));
+
+    return (Status);
+}
+
+
+/*
+ * KsOpenAddress
+ *   Open the tdi address object
+ *
+ * Arguments:
+ *   DeviceName:   device name of the address object
+ *   pAddress:     tdi address of the address object
+ *   AddressLength: length in bytes of the tdi address
+ *   Handle:       the newly opened handle
+ *   FileObject:   the newly opened fileobject
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes: 
+ *   N/A
+ */
+
+NTSTATUS
+KsOpenAddress(
+    IN PUNICODE_STRING      DeviceName,
+    IN PTRANSPORT_ADDRESS   pAddress,
+    IN ULONG                AddressLength,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   )
+{
+    NTSTATUS          Status = STATUS_SUCCESS;
+
+    PFILE_FULL_EA_INFORMATION Ea = NULL;
+    ULONG             EaLength;
+    UCHAR             EaBuffer[EA_MAX_LENGTH];
+
+    OBJECT_ATTRIBUTES ObjectAttributes;
+    IO_STATUS_BLOCK   IoStatus;
+
+    //
+    // Building EA for the Address Object to be Opened: the EA name is
+    // the well-known TdiTransportAddress string and the EA value is the
+    // TRANSPORT_ADDRESS itself, as required by the TDI open convention.
+    //
+
+    Ea = (PFILE_FULL_EA_INFORMATION)EaBuffer;
+    Ea->NextEntryOffset = 0;
+    Ea->Flags = 0;
+    Ea->EaNameLength = TDI_TRANSPORT_ADDRESS_LENGTH;
+    Ea->EaValueLength = (USHORT)AddressLength;
+    RtlCopyMemory(
+        &(Ea->EaName),
+        TdiTransportAddress,
+        Ea->EaNameLength + 1
+        );
+    /* the EA value follows the NUL-terminated EA name */
+    RtlMoveMemory(
+        &(Ea->EaName[Ea->EaNameLength + 1]),
+        pAddress,
+        AddressLength
+        );
+    EaLength =  sizeof(FILE_FULL_EA_INFORMATION) +
+                Ea->EaNameLength + AddressLength;
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+
+    //
+    // Initializing ...
+    //
+
+    InitializeObjectAttributes(
+        &ObjectAttributes,
+        DeviceName,
+        OBJ_CASE_INSENSITIVE |
+        OBJ_KERNEL_HANDLE,
+        NULL,
+        NULL
+        );
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    //
+    // Creating the Transport Address Object ...
+    //
+
+    Status = ZwCreateFile(
+                Handle,
+                FILE_READ_DATA | FILE_WRITE_DATA,
+                &ObjectAttributes,
+                &IoStatus,
+                0,
+                FILE_ATTRIBUTE_NORMAL,
+                0, /* DON'T REUSE: FILE_SHARE_READ | FILE_SHARE_WRITE, */
+                FILE_OPEN,
+                0,
+                Ea,
+                EaLength
+                );
+
+
+    if (NT_SUCCESS(Status)) {
+
+        //
+        // Now Obtaining the FileObject of the Transport Address ...
+        //
+
+        Status = ObReferenceObjectByHandle(
+                    *Handle,
+                    FILE_ANY_ACCESS,
+                    NULL,
+                    KernelMode,
+                    FileObject,
+                    NULL
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            /* couldn't reference the object: don't leak the handle */
+            cfs_enter_debugger();
+            ZwClose(*Handle);
+        }
+
+    } else {
+
+        cfs_enter_debugger();
+    }
+
+    return (Status);
+}
+
+/*
+ * KsCloseAddress
+ *   Release the Handle and FileObject of an opened tdi
+ *   address object
+ *
+ * Arguments:
+ *   Handle:       the handle to be released
+ *   FileObject:   the fileobject to be released
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes: 
+ *   N/A
+ */
+
+NTSTATUS
+KsCloseAddress(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+)
+{
+    NTSTATUS  Status = STATUS_SUCCESS;
+
+    /* drop the reference taken by ObReferenceObjectByHandle */
+    if (FileObject) {
+
+        ObDereferenceObject(FileObject);
+    }
+
+    if (Handle) {
+
+        Status = ZwClose(Handle);
+    }
+
+    ASSERT(NT_SUCCESS(Status));
+
+    return (Status);
+}
+
+
+/*
+ * KsOpenConnection
+ *   Open a tdi connection object
+ *
+ * Arguments:
+ *   DeviceName:   device name of the connection object
+ *   ConnectionContext: the connection context
+ *   Handle:       the newly opened handle
+ *   FileObject:   the newly opened fileobject
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes: 
+ *   N/A
+ */
+
+NTSTATUS
+KsOpenConnection(
+    IN PUNICODE_STRING      DeviceName,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   )
+{
+    NTSTATUS            Status = STATUS_SUCCESS;
+
+    PFILE_FULL_EA_INFORMATION Ea = NULL;
+    ULONG               EaLength;
+    UCHAR               EaBuffer[EA_MAX_LENGTH];
+
+    OBJECT_ATTRIBUTES   ObjectAttributes;
+    IO_STATUS_BLOCK     IoStatus;
+
+    //
+    // Building EA for the Connection Object to be Opened: the EA name
+    // is the well-known TdiConnectionContext string and the EA value
+    // is the caller's CONNECTION_CONTEXT cookie.
+    //
+
+    Ea = (PFILE_FULL_EA_INFORMATION)EaBuffer;
+    Ea->NextEntryOffset = 0;
+    Ea->Flags = 0;
+    Ea->EaNameLength = TDI_CONNECTION_CONTEXT_LENGTH;
+    Ea->EaValueLength = (USHORT)sizeof(CONNECTION_CONTEXT);
+    RtlCopyMemory(
+        &(Ea->EaName),
+        TdiConnectionContext,
+        Ea->EaNameLength + 1
+        );
+    /* the EA value follows the NUL-terminated EA name */
+    RtlMoveMemory(
+        &(Ea->EaName[Ea->EaNameLength + 1]),
+        &ConnectionContext,
+        sizeof(CONNECTION_CONTEXT)
+        );
+    EaLength = sizeof(FILE_FULL_EA_INFORMATION) - 1 +
+                               Ea->EaNameLength + 1 + sizeof(CONNECTION_CONTEXT);
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+
+    //
+    // Initializing ...
+    //
+
+    InitializeObjectAttributes(
+        &ObjectAttributes,
+        DeviceName,
+        OBJ_CASE_INSENSITIVE |
+        OBJ_KERNEL_HANDLE,
+        NULL,
+        NULL
+        );
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    //
+    // Creating the Connection Object ...
+    //
+
+    Status = ZwCreateFile(
+                Handle,
+                FILE_READ_DATA | FILE_WRITE_DATA,
+                &ObjectAttributes,
+                &IoStatus,
+                NULL,
+                FILE_ATTRIBUTE_NORMAL,
+                0,
+                FILE_OPEN,
+                0,
+                Ea,
+                EaLength
+                );
+
+
+    if (NT_SUCCESS(Status)) {
+
+        //
+        // Now Obtaining the FileObject of the Transport Address ...
+        //
+
+        Status = ObReferenceObjectByHandle(
+                    *Handle,
+                    FILE_ANY_ACCESS,
+                    NULL,
+                    KernelMode,
+                    FileObject,
+                    NULL
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            /* couldn't reference the object: don't leak the handle */
+            cfs_enter_debugger();
+            ZwClose(*Handle);
+        }
+
+    } else {
+
+        cfs_enter_debugger();
+    }
+
+    return (Status);
+}
+
+/*
+ * KsCloseConnection
+ *   Release the Handle and FileObject of an opened tdi
+ *   connection object
+ *
+ * Arguments:
+ *   Handle:       the handle to be released
+ *   FileObject:   the fileobject to be released
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes: 
+ *   N/A
+ */
+
+NTSTATUS
+KsCloseConnection(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+    )
+{
+    NTSTATUS  Status = STATUS_SUCCESS;
+
+    /* drop the reference taken by ObReferenceObjectByHandle */
+    if (FileObject) {
+
+        ObDereferenceObject(FileObject);
+    }
+
+    if (Handle) {
+
+        Status = ZwClose(Handle);
+    }
+
+    ASSERT(NT_SUCCESS(Status));
+
+    return (Status);
+}
+
+
+/*
+ * KsAssociateAddress
+ *   Associate an address object with a connection object
+ *
+ * Arguments:
+ *   AddressHandle:  the handle of the address object
+ *   ConnectionObject:  the FileObject of the connection
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes: 
+ *   The Irp is submitted synchronously (KsSubmitTdiIrp with
+ *   bSynchronous = TRUE), so the association is complete
+ *   when this routine returns successfully.
+ */
+
+NTSTATUS
+KsAssociateAddress(
+    IN HANDLE           AddressHandle,
+    IN PFILE_OBJECT     ConnectionObject
+    )
+{
+    NTSTATUS            Status;
+    PDEVICE_OBJECT      DeviceObject;
+    PIRP                Irp;
+
+    //
+    // Getting the DeviceObject from Connection FileObject 
+    //
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    //
+    // Building Tdi Internal Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Associating the Address Object with the Connection Object
+        //
+
+        TdiBuildAssociateAddress(
+            Irp,
+            DeviceObject,
+            ConnectionObject,
+            NULL,
+            NULL,
+            AddressHandle
+            );
+
+        //
+        // Calling the Transport Driver with the Prepared Irp
+        //
+
+        Status = KsSubmitTdiIrp(DeviceObject, Irp, TRUE, NULL);
+    }
+    return (Status);
+}
+
+
+/*
+ * KsDisassociateAddress
+ *   Disassociate the connection object (the relationship with
+ *   the corresponding address object will be dismissed. )
+ *
+ * Arguments:
+ *   ConnectionObject:  the FileObject of the connection
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes: 
+ *   Mirror of KsAssociateAddress: the address handle is not
+ *   needed since TDI tracks the existing association.
+ */
+
+NTSTATUS
+KsDisassociateAddress(
+    IN PFILE_OBJECT     ConnectionObject
+    )
+{
+    NTSTATUS            Status;
+    PDEVICE_OBJECT      DeviceObject;
+    PIRP                   Irp;
+
+    //
+    // Getting the DeviceObject from Connection FileObject 
+    //
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    //
+    // Building Tdi Internal Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Disassociating the Address Object with the Connection Object
+        //
+
+        TdiBuildDisassociateAddress(
+            Irp,
+            DeviceObject,
+            ConnectionObject,
+            NULL,
+            NULL
+            );
+
+        //
+        // Calling the Transport Driver with the Prepared Irp
+        //
+
+        Status = KsSubmitTdiIrp(DeviceObject, Irp, TRUE, NULL);
+    }
+
+    return (Status);
+}
+
+
+/*
+
+//
+// Connection Control Event Callbacks
+//
+
+TDI_EVENT_CONNECT
+TDI_EVENT_DISCONNECT
+TDI_EVENT_ERROR
+
+//
+// Tcp Event Callbacks
+//
+
+TDI_EVENT_RECEIVE
+TDI_EVENT_RECEIVE_EXPEDITED
+TDI_EVENT_CHAINED_RECEIVE
+TDI_EVENT_CHAINED_RECEIVE_EXPEDITED
+
+//
+// Udp Event Callbacks
+//
+
+TDI_EVENT_RECEIVE_DATAGRAM
+TDI_EVENT_CHAINED_RECEIVE_DATAGRAM
+
+*/
+
+
+/*
+ * KsSetEventHandlers
+ *   Set the tdi event callbacks with an address object
+ *
+ * Arguments:
+ *   AddressObject: the FileObject of the address object
+ *   EventContext:  the parameter for the callbacks
+ *   Handlers:      the handlers indicator array
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * NOTES: 
+ *   One TDI_SET_EVENT_HANDLER Irp is issued per active slot.
+ *   Failures for TDI_EVENT_SEND_POSSIBLE and
+ *   TDI_EVENT_CHAINED_RECEIVE_EXPEDITED are tolerated because
+ *   the tcp/ip tdi provider does not support them; any other
+ *   failure aborts the loop and is returned to the caller.
+ */
+
+NTSTATUS
+KsSetEventHandlers(
+    IN PFILE_OBJECT                         AddressObject,  // Address File Object
+    IN PVOID                                EventContext,   // Context for Handlers
+    IN PKS_EVENT_HANDLERS                   Handlers        // Handlers Indicator
+   )
+{
+    NTSTATUS             Status = STATUS_SUCCESS;
+    PDEVICE_OBJECT       DeviceObject;
+    USHORT               i = 0;
+
+    DeviceObject = IoGetRelatedDeviceObject(AddressObject);
+
+    for (i=0; i < TDI_EVENT_MAXIMUM_HANDLER; i++) {
+
+        //
+        // Setup the tdi event callback handler if requested.
+        //
+
+        if (Handlers->IsActive[i]) {
+
+            PIRP            Irp;
+
+            //
+            // Building Tdi Internal Irp ...
+            //
+
+            Irp = KsBuildTdiIrp(DeviceObject);
+
+            if (NULL == Irp) {
+
+                Status = STATUS_INSUFFICIENT_RESOURCES;
+
+            } else {
+
+                //
+                // Building the Irp to set the Event Handler ...
+                //
+
+                TdiBuildSetEventHandler(
+                    Irp,
+                    DeviceObject,
+                    AddressObject,
+                    NULL,
+                    NULL,
+                    i,                      /* tdi event type */
+                    Handlers->Handler[i],   /* tdi event handler */
+                    EventContext            /* context for the handler */
+                    );
+
+                //
+                // Calling the Transport Driver with the Prepared Irp
+                //
+
+                Status = KsSubmitTdiIrp(DeviceObject, Irp, TRUE, NULL);
+
+                //
+                // tcp/ip tdi does not support these two event callbacks
+                //
+
+                if ((!NT_SUCCESS(Status)) && ( i == TDI_EVENT_SEND_POSSIBLE ||
+                     i == TDI_EVENT_CHAINED_RECEIVE_EXPEDITED )) {
+                    cfs_enter_debugger();
+                    Status = STATUS_SUCCESS;
+                }
+            }
+        
+            /* any other failure (including Irp allocation) stops
+               the loop and is reported to the caller */
+            if (!NT_SUCCESS(Status)) {
+                cfs_enter_debugger();
+                goto errorout;
+            }
+        }
+    } 
+
+
+errorout:
+
+    if (!NT_SUCCESS(Status)) {
+
+        KsPrint((2, "KsSetEventHandlers: Error Status = %xh (%s)\n",
+                    Status, KsNtStatusToString(Status) ));
+    }
+
+    return (Status);
+}
+
+
+
+/*
+ * KsQueryAddressInfo
+ *   Query the address of the FileObject specified
+ *
+ * Arguments:
+ *   FileObject:  the FileObject to be queried
+ *   AddressInfo: buffer to contain the address info
+ *   AddressSize: length of the AddressInfo buffer; on return
+ *                it receives the actual size reported by the
+ *                transport (updated by KsSubmitTdiIrp)
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes: 
+ *   Must be called at IRQL < DISPATCH_LEVEL; the synchronous
+ *   Irp submission may block.
+ */
+
+NTSTATUS
+KsQueryAddressInfo(
+    PFILE_OBJECT            FileObject,
+    PTDI_ADDRESS_INFO       AddressInfo,
+    PULONG                  AddressSize
+   )
+{
+    NTSTATUS          Status = STATUS_UNSUCCESSFUL;
+    PIRP              Irp = NULL;
+    PMDL              Mdl;
+    PDEVICE_OBJECT    DeviceObject;
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    DeviceObject = IoGetRelatedDeviceObject(FileObject);
+
+    RtlZeroMemory(AddressInfo, *(AddressSize));
+
+    //
+    // Allocating the Tdi Setting Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Locking the User Buffer / Allocating a MDL for it
+        //
+
+        Status = KsLockUserBuffer(
+                    AddressInfo,
+                    FALSE,
+                    *(AddressSize),
+                    IoModifyAccess,
+                    &Mdl
+                    );
+
+        /* on lock failure the Irp is useless: free it so the
+           query below is skipped (Irp == NULL) */
+        if (!NT_SUCCESS(Status)) {
+
+            IoFreeIrp(Irp);
+            Irp = NULL;
+        }
+    }
+
+    if (Irp) {
+
+        LASSERT(NT_SUCCESS(Status));
+
+        TdiBuildQueryInformation(
+                    Irp,
+                    DeviceObject,
+                    FileObject,
+                    NULL,
+                    NULL,
+                    TDI_QUERY_ADDRESS_INFO,
+                    Mdl
+                    );
+
+        Status = KsSubmitTdiIrp(
+                    DeviceObject,
+                    Irp,
+                    TRUE,
+                    AddressSize
+                    );
+
+        KsReleaseMdl(Mdl, FALSE);
+    }
+
+    if (!NT_SUCCESS(Status)) {
+
+        cfs_enter_debugger();
+        //TDI_BUFFER_OVERFLOW
+    }
+
+    return (Status);
+}
+
+/*
+ * KsQueryProviderInfo
+ *   Query the underlying transport device's information
+ *
+ * Arguments:
+ *   TdiDeviceName:  the transport device's name string
+ *   ProviderInfo:   TDI_PROVIDER_INFO structure
+ *
+ * Return Value:
+ *   NTSTATUS:       Nt system status code
+ *
+ * NOTES: 
+ *   Opens a tdi control channel on the named device, issues a
+ *   TDI_QUERY_PROVIDER_INFO, and closes the channel again
+ *   before returning.
+ */
+
+NTSTATUS
+KsQueryProviderInfo(
+    PWSTR               TdiDeviceName,
+    PTDI_PROVIDER_INFO  ProviderInfo
+   )
+{
+    NTSTATUS            Status = STATUS_SUCCESS;
+
+    PIRP                Irp = NULL;
+    PMDL                Mdl = NULL;
+
+    UNICODE_STRING      ControlName;
+
+    HANDLE              Handle;
+    PFILE_OBJECT        FileObject;
+    PDEVICE_OBJECT      DeviceObject;
+
+    ULONG               ProviderSize = 0;
+
+    RtlInitUnicodeString(&ControlName, TdiDeviceName);
+
+    //
+    // Open the Tdi Control Channel
+    //
+
+    Status = KsOpenControl(
+                &ControlName,
+                &Handle,
+                &FileObject
+                );
+
+    if (!NT_SUCCESS(Status)) {
+
+        KsPrint((2, "KsQueryProviderInfo: Fail to open the tdi control channel.\n"));
+        return (Status);
+    }
+
+    //
+    // Obtain The Related Device Object
+    //
+
+    DeviceObject = IoGetRelatedDeviceObject(FileObject);
+
+    ProviderSize = sizeof(TDI_PROVIDER_INFO);
+    RtlZeroMemory(ProviderInfo, ProviderSize);
+
+    //
+    // Allocating the Tdi Setting Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Locking the User Buffer / Allocating a MDL for it
+        //
+
+        Status = KsLockUserBuffer(
+                    ProviderInfo,
+                    FALSE,
+                    ProviderSize,
+                    IoModifyAccess,
+                    &Mdl
+                    );
+
+        /* on lock failure the Irp is useless: free it so the
+           query below is skipped (Irp == NULL) */
+        if (!NT_SUCCESS(Status)) {
+
+            IoFreeIrp(Irp);
+            Irp = NULL;
+        }
+    }
+
+    if (Irp) {
+
+        LASSERT(NT_SUCCESS(Status));
+
+        TdiBuildQueryInformation(
+                    Irp,
+                    DeviceObject,
+                    FileObject,
+                    NULL,
+                    NULL,
+                    TDI_QUERY_PROVIDER_INFO,
+                    Mdl
+                    );
+
+        Status = KsSubmitTdiIrp(
+                    DeviceObject,
+                    Irp,
+                    TRUE,
+                    &ProviderSize
+                    );
+
+        KsReleaseMdl(Mdl, FALSE);
+    }
+
+    if (!NT_SUCCESS(Status)) {
+
+        cfs_enter_debugger();
+        //TDI_BUFFER_OVERFLOW
+    }
+
+    /* the control channel is always closed, success or not */
+    KsCloseControl(Handle, FileObject);
+
+    return (Status);
+}
+
+/*
+ * KsQueryConnectionInfo
+ *   Query the connection info of the FileObject specified
+ *   (some statics data of the traffic)
+ *
+ * Arguments:
+ *   FileObject:     the FileObject to be queried
+ *   ConnectionInfo: buffer to contain the connection info
+ *   ConnectionSize: length of the ConnectionInfo buffer; on
+ *                   return it receives the actual size
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * NOTES: 
+ *   Must be called at IRQL < DISPATCH_LEVEL; the synchronous
+ *   Irp submission may block.
+ */
+
+NTSTATUS
+KsQueryConnectionInfo(
+    PFILE_OBJECT            ConnectionObject,
+    PTDI_CONNECTION_INFO    ConnectionInfo,
+    PULONG                  ConnectionSize
+   )
+{
+    NTSTATUS          Status = STATUS_UNSUCCESSFUL;
+    PIRP              Irp = NULL;
+    PMDL              Mdl;
+    PDEVICE_OBJECT    DeviceObject;
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    RtlZeroMemory(ConnectionInfo, *(ConnectionSize));
+
+    //
+    // Allocating the Tdi Query Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Locking the User Buffer / Allocating a MDL for it
+        //
+
+        Status = KsLockUserBuffer(
+                    ConnectionInfo,
+                    FALSE,
+                    *(ConnectionSize),
+                    IoModifyAccess,
+                    &Mdl
+                    );
+
+        /* BUG FIX: the condition was inverted (NT_SUCCESS).  As
+           written, a SUCCESSFUL buffer lock freed the Irp and the
+           query was never issued, while a FAILED lock leaked the
+           Irp into the query path with an invalid Mdl.  Free the
+           Irp only when the lock failed, exactly as the sibling
+           KsQueryAddressInfo does. */
+        if (!NT_SUCCESS(Status)) {
+
+            IoFreeIrp(Irp);
+            Irp = NULL;
+        }
+    }
+
+    if (Irp) {
+
+        LASSERT(NT_SUCCESS(Status));
+
+        TdiBuildQueryInformation(
+                    Irp,
+                    DeviceObject,
+                    ConnectionObject,
+                    NULL,
+                    NULL,
+                    TDI_QUERY_CONNECTION_INFO,
+                    Mdl
+                    );
+
+        Status = KsSubmitTdiIrp(
+                    DeviceObject,
+                    Irp,
+                    TRUE,
+                    ConnectionSize
+                    );
+
+        KsReleaseMdl(Mdl, FALSE);
+    }
+
+    return (Status);
+}
+
+
+/*
+ * KsInitializeTdiAddress
+ *   Initialize the tdi address
+ *
+ * Arguments:
+ *   pTransportAddress: tdi address to be initialized
+ *   IpAddress:         the ip address of object
+ *   IpPort:            the ip port of the object
+ *
+ * Return Value:
+ *   ULONG: the total size of the tdi address
+ *
+ * NOTES: 
+ *   IpAddress and IpPort are stored as-is; the caller supplies
+ *   them in the byte order TDI expects (network order --
+ *   TODO confirm against callers).
+ */
+
+ULONG
+KsInitializeTdiAddress(
+    IN OUT PTA_IP_ADDRESS   pTransportAddress,
+    IN ULONG                IpAddress,
+    IN USHORT               IpPort
+    )
+{
+    /* a single TDI_ADDRESS_TYPE_IP entry */
+    pTransportAddress->TAAddressCount = 1;
+    pTransportAddress->Address[ 0 ].AddressLength = TDI_ADDRESS_LENGTH_IP;
+    pTransportAddress->Address[ 0 ].AddressType   = TDI_ADDRESS_TYPE_IP;
+    pTransportAddress->Address[ 0 ].Address[ 0 ].sin_port = IpPort;
+    pTransportAddress->Address[ 0 ].Address[ 0 ].in_addr  = IpAddress;
+
+    /* total bytes used: header up to the first ip payload plus
+       the ip payload itself */
+    return (FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address) + TDI_ADDRESS_LENGTH_IP);
+}
+
+/*
+ * KsQueryTdiAddressLength
+ *   Query the total size of the tdi address
+ *
+ * Arguments:
+ *   pTransportAddress: tdi address to be queried
+ *
+ * Return Value:
+ *   ULONG: the total size of the tdi address
+ *
+ * NOTES: 
+ *   Walks all TAAddressCount entries; each entry is a
+ *   TA_ADDRESS header followed by AddressLength payload bytes.
+ *   Entries are accessed UNALIGNED since transport buffers
+ *   give no alignment guarantee.
+ */
+
+ULONG
+KsQueryTdiAddressLength(
+    PTRANSPORT_ADDRESS      pTransportAddress
+    )
+{
+    ULONG                   TotalLength = 0;
+    LONG                    i;
+
+    PTA_ADDRESS UNALIGNED   pTaAddress = NULL;
+
+    ASSERT (NULL != pTransportAddress);
+
+    /* fixed header plus one TA_ADDRESS header per entry */
+    TotalLength  = FIELD_OFFSET(TRANSPORT_ADDRESS, Address) +
+                   FIELD_OFFSET(TA_ADDRESS, Address) * pTransportAddress->TAAddressCount;
+
+    pTaAddress = (TA_ADDRESS UNALIGNED *)pTransportAddress->Address;
+
+    /* accumulate the variable payload of every entry */
+    for (i = 0; i < pTransportAddress->TAAddressCount; i++)
+    {
+        TotalLength += pTaAddress->AddressLength;
+        pTaAddress = (TA_ADDRESS UNALIGNED *)((PCHAR)pTaAddress +
+                                           FIELD_OFFSET(TA_ADDRESS,Address) +
+                                           pTaAddress->AddressLength );
+    }
+
+    return (TotalLength);
+}
+
+
+/*
+ * KsQueryIpAddress
+ *   Query the ip address of the tdi object
+ *
+ * Arguments:
+ *   FileObject: tdi object to be queried
+ *   TdiAddress: TdiAddress buffer, to store the queried
+ *               tdi ip address
+ *   AddressLength: buffer length of the TdiAddress; on
+ *               success it receives the copied length
+ *
+ * Return Value:
+ *   NTSTATUS:  STATUS_SUCCESS, STATUS_BUFFER_TOO_SMALL when
+ *              the caller's buffer cannot hold the address,
+ *              or another kernel error code
+ *
+ * NOTES: 
+ *   A MAX_ADDRESS_LENGTH scratch buffer is allocated from
+ *   NonPagedPool, filled via KsQueryAddressInfo, then copied
+ *   into the caller's buffer and freed again.
+ */
+
+NTSTATUS
+KsQueryIpAddress(
+    PFILE_OBJECT    FileObject,
+    PVOID           TdiAddress,
+    ULONG*          AddressLength
+    )
+{
+    NTSTATUS        Status;
+
+    PTDI_ADDRESS_INFO   TdiAddressInfo;
+    ULONG               Length;
+
+
+    //
+    // Maximum length of TDI_ADDRESSS_INFO with one TRANSPORT_ADDRESS
+    //
+
+    Length = MAX_ADDRESS_LENGTH;
+
+    TdiAddressInfo = (PTDI_ADDRESS_INFO)
+                        ExAllocatePoolWithTag(
+                            NonPagedPool,
+                            Length,
+                            'KSAI' );
+
+    if (NULL == TdiAddressInfo) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+
+    Status = KsQueryAddressInfo(
+        FileObject,
+        TdiAddressInfo,
+        &Length
+        );
+
+errorout:
+
+    if (NT_SUCCESS(Status))
+    {
+        if (*AddressLength < Length) {
+
+            Status = STATUS_BUFFER_TOO_SMALL;
+
+        } else {
+
+            *AddressLength = Length;
+            RtlCopyMemory(
+                TdiAddress,
+                &(TdiAddressInfo->Address),
+                Length
+                );
+
+            Status = STATUS_SUCCESS;
+        }
+
+    } else {
+
+        /* query or allocation failed: Status already holds the
+           error; nothing further to do here */
+    }
+
+
+    if (NULL != TdiAddressInfo) {
+
+        ExFreePool(TdiAddressInfo);
+    }
+
+    return Status;
+}
+
+
+/*
+ * KsErrorEventHandler
+ *   the common error event handler callback
+ *
+ * Arguments:
+ *   TdiEventContext: should be the socket
+ *   Status: the error code
+ *
+ * Return Value:
+ *   Status: STATUS_SUCCESS
+ *
+ * NOTES: 
+ *   We need not do anything in such a severe
+ *   error case. System will process it for us.
+ */
+
+NTSTATUS
+KsErrorEventHandler(
+    IN PVOID        TdiEventContext,
+    IN NTSTATUS     Status
+   )
+{
+    /* log and trap under a debugger; no recovery action taken */
+    KsPrint((2, "KsErrorEventHandler called at Irql = %xh ...\n",
+                KeGetCurrentIrql()));
+
+    cfs_enter_debugger();
+
+    return (STATUS_SUCCESS);
+}
+
+
+/*
+ * ksocknal_set_handlers
+ *   setup all the event handler callbacks
+ *
+ * Arguments:
+ *   tconn: the tdi connecton object
+ *
+ * Return Value:
+ *   int: ksocknal error code (0 on success)
+ *
+ * NOTES: 
+ *   A tconn with no opened address object, or a child tconn,
+ *   is skipped and reports success (status stays
+ *   STATUS_SUCCESS on those paths).
+ */
+
+int
+ksocknal_set_handlers(
+    ksock_tconn_t *     tconn
+    )
+{
+    NTSTATUS            status = STATUS_SUCCESS;
+    KS_EVENT_HANDLERS   handlers;
+
+    /* to make sure the address object is opened already */
+    if (tconn->kstc_addr.FileObject == NULL) {
+        goto errorout;
+    }
+
+    /* initialize the handlers indicator array. for sender and listener, 
+       there are different set of callbacks. for child, we just return. */
+
+    memset(&handlers, 0, sizeof(KS_EVENT_HANDLERS));
+
+    SetEventHandler(handlers, TDI_EVENT_ERROR, KsErrorEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_DISCONNECT, KsDisconnectEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE, KsTcpReceiveEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE_EXPEDITED, KsTcpReceiveExpeditedEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE, KsTcpChainedReceiveEventHandler);
+
+    // SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE_EXPEDITED, KsTcpChainedReceiveExpeditedEventHandler);
+
+    /* only the listener additionally receives connect events;
+       a child inherits its parent's callbacks and needs none */
+    if (tconn->kstc_type == kstt_listener) {
+        SetEventHandler(handlers, TDI_EVENT_CONNECT, KsConnectEventHandler);
+    } else if (tconn->kstc_type == kstt_child) {
+        goto errorout;
+    }
+
+    /* set all the event callbacks */
+    status = KsSetEventHandlers(
+                tconn->kstc_addr.FileObject, /* Address File Object  */
+                tconn,                       /* Event Context */
+                &handlers                    /* Event callback handlers */
+                );
+
+errorout:
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * ksocknal_reset_handlers
+ *   disable all the event handler callbacks (set to NULL)
+ *
+ * Arguments:
+ *   tconn: the tdi connecton object
+ *
+ * Return Value:
+ *   int: ksocknal error code (0 on success)
+ *
+ * NOTES: 
+ *   Exact mirror of ksocknal_set_handlers, but every handler
+ *   slot is registered as NULL to detach the callbacks.
+ */
+
+int
+ksocknal_reset_handlers(
+    ksock_tconn_t *     tconn
+    )
+{
+    NTSTATUS            status = STATUS_SUCCESS;
+    KS_EVENT_HANDLERS   handlers;
+
+    /* to make sure the address object is opened already */
+    if (tconn->kstc_addr.FileObject == NULL) {
+        goto errorout;
+    }
+
+    /* initialize the handlers indicator array. for sender and listener, 
+       there are different set of callbacks. for child, we just return. */
+
+    memset(&handlers, 0, sizeof(KS_EVENT_HANDLERS));
+
+    SetEventHandler(handlers, TDI_EVENT_ERROR, NULL);
+    SetEventHandler(handlers, TDI_EVENT_DISCONNECT, NULL);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE, NULL);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE_EXPEDITED, NULL);
+    SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE, NULL);
+    // SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE_EXPEDITED, NULL);
+
+    /* only the listener registered a connect handler to clear */
+    if (tconn->kstc_type == kstt_listener) {
+        SetEventHandler(handlers, TDI_EVENT_CONNECT, NULL);
+    } else if (tconn->kstc_type == kstt_child) {
+        goto errorout;
+    }
+
+    /* set all the event callbacks */
+    status = KsSetEventHandlers(
+                tconn->kstc_addr.FileObject, /* Address File Object  */
+                tconn,                       /* Event Context */
+                &handlers                    /* Event callback handlers */
+                );
+
+errorout:
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * KsAcceptCompletionRoutine
+ *   Irp completion routine for TdiBuildAccept (KsConnectEventHandler)
+ *
+ *   Here system gives us a chance to check the connection is built
+ *   ready or not.
+ *
+ * Arguments:
+ *   DeviceObject:  the device object of the transport driver
+ *   Irp:           the Irp is being completed.
+ *   Context:       the context we specified when issuing the Irp
+ *                  (the child tconn, referenced by the issuer)
+ *
+ * Return Value:
+ *   Nt status code (STATUS_MORE_PROCESSING_REQUIRED: the Irp
+ *   is freed here, so the I/O manager must not touch it again)
+ *
+ * Notes: 
+ *   On failure the child is recycled back to the associated,
+ *   non-busy state so it can serve another incoming connect.
+ */
+
+NTSTATUS
+KsAcceptCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    )
+{
+    ksock_tconn_t * child = (ksock_tconn_t *) Context;
+    ksock_tconn_t * parent = child->child.kstc_parent;
+
+    KsPrint((2, "KsAcceptCompletionRoutine: called at Irql: %xh\n",
+                KeGetCurrentIrql() ));
+
+    KsPrint((2, "KsAcceptCompletionRoutine: Context = %xh Status = %xh\n",
+                 Context, Irp->IoStatus.Status));
+
+    LASSERT(child->kstc_type == kstt_child);
+
+    spin_lock(&(child->kstc_lock));
+
+    LASSERT(parent->kstc_state == ksts_listening);
+    LASSERT(child->kstc_state == ksts_connecting);
+
+    if (NT_SUCCESS(Irp->IoStatus.Status)) {
+
+        child->child.kstc_accepted = TRUE;
+
+        child->kstc_state = ksts_connected;
+
+        /* wake up the daemon thread which waits on this event */
+        KeSetEvent(
+            &(parent->listener.kstc_accept_event),
+            0,
+            FALSE
+            );
+
+        spin_unlock(&(child->kstc_lock));
+
+        KsPrint((2, "KsAcceptCompletionRoutine: Get %xh now signal the event ...\n", parent));
+
+    } else {
+
+        /* re-use this child connecton  */
+        child->child.kstc_accepted = FALSE;
+        child->child.kstc_busy = FALSE;
+        child->kstc_state = ksts_associated;
+
+        spin_unlock(&(child->kstc_lock));
+    }
+
+    /* now free the Irp */
+    IoFreeIrp(Irp);
+
+    /* drop the refer count of the child */
+    ksocknal_put_tconn(child);
+
+    return (STATUS_MORE_PROCESSING_REQUIRED);
+}
+
+
+/*
+ * ksocknal_get_vacancy_backlog
+ *   Get a vacancy listening child from the backlog list
+ *
+ * Arguments:
+ *   parent: the listener daemon connection
+ *
+ * Return Value:
+ *   the child listening connection or NULL in failure
+ *
+ * Notes 
+ *   Parent's lock should be acquired before calling.
+ *   The returned child is marked busy under its own lock so no
+ *   other connect event can grab it concurrently.
+ */
+
+ksock_tconn_t *
+ksocknal_get_vacancy_backlog(
+    ksock_tconn_t *  parent
+    )
+{
+    ksock_tconn_t * child;
+
+    LASSERT(parent->kstc_type == kstt_listener);
+    LASSERT(parent->kstc_state == ksts_listening);
+
+    if (list_empty(&(parent->listener.kstc_listening.list))) {
+
+        child = NULL;
+
+    } else {
+
+        struct list_head * tmp;
+
+        /* check the listening queue and try to get a free connecton */
+
+        list_for_each(tmp, &(parent->listener.kstc_listening.list)) {
+            child = list_entry (tmp, ksock_tconn_t, child.kstc_link);
+            spin_lock(&(child->kstc_lock));
+
+            if (!child->child.kstc_busy) {
+                LASSERT(child->kstc_state == ksts_associated);
+                child->child.kstc_busy = TRUE;
+                spin_unlock(&(child->kstc_lock));
+                break;
+            } else {
+                /* busy: keep scanning; child is reset to NULL so
+                   an all-busy list yields NULL after the loop */
+                spin_unlock(&(child->kstc_lock));
+                child = NULL;
+            }
+        }
+    }
+
+    return child;
+}
+
+
+/*
+ * KsConnectEventHandler
+ *   Connect event handler event handler, called by the underlying TDI
+ *   transport in response to an incoming request to the listening daemon.
+ *   
+ *   it will grab a vacancy backlog from the children tconn list, and 
+ *   build an acception Irp with it, then transfer the Irp to TDI driver.
+ *
+ * Arguments:
+ *   TdiEventContext:  the tdi connnection object of the listening daemon 
+ *   ......
+ *
+ * Return Value:
+ *   Nt kernel status code (STATUS_MORE_PROCESSING_REQUIRED
+ *   when the accept Irp was handed back to the transport)
+ *
+ * Notes: 
+ *   On any failure *AcceptIrp / *ConnectionContext are NULLed
+ *   and all partially-acquired resources are released.
+ */
+
+NTSTATUS
+KsConnectEventHandler(
+    IN PVOID                    TdiEventContext,
+    IN LONG                     RemoteAddressLength,
+    IN PVOID                    RemoteAddress,
+    IN LONG                     UserDataLength,
+    IN PVOID                    UserData,
+    IN LONG                     OptionsLength,
+    IN PVOID                    Options,
+    OUT CONNECTION_CONTEXT *    ConnectionContext,
+    OUT PIRP *                  AcceptIrp
+    )
+{
+    ksock_tconn_t *             parent;
+    ksock_tconn_t *             child;
+
+    PFILE_OBJECT                FileObject;
+    PDEVICE_OBJECT              DeviceObject;
+    NTSTATUS                    Status;
+
+    PIRP                        Irp = NULL;
+    PTDI_CONNECTION_INFORMATION ConnectionInfo = NULL;
+
+    KsPrint((2,"KsConnectEventHandler: call at Irql: %u\n", KeGetCurrentIrql()));
+    parent = (ksock_tconn_t *) TdiEventContext;
+
+    LASSERT(parent->kstc_type == kstt_listener);
+
+    spin_lock(&(parent->kstc_lock));
+
+    if (parent->kstc_state == ksts_listening) {
+
+        /* allocate a new ConnectionInfo to backup the peer's info */
+
+        ConnectionInfo = (PTDI_CONNECTION_INFORMATION)ExAllocatePoolWithTag(
+                NonPagedPool, sizeof(TDI_CONNECTION_INFORMATION) +
+                RemoteAddressLength, 'iCsK' );
+
+        if (NULL == ConnectionInfo) {
+
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+            cfs_enter_debugger();
+            goto errorout;
+        }
+
+        /* initializing ConnectionInfo structure ... the remote
+           address is copied into the trailing bytes of the same
+           allocation */
+
+        ConnectionInfo->UserDataLength = UserDataLength;
+        ConnectionInfo->UserData = UserData;
+        ConnectionInfo->OptionsLength = OptionsLength;
+        ConnectionInfo->Options = Options;
+        ConnectionInfo->RemoteAddressLength = RemoteAddressLength;
+        ConnectionInfo->RemoteAddress = ConnectionInfo + 1;
+
+        RtlCopyMemory(
+                ConnectionInfo->RemoteAddress,
+                RemoteAddress,
+                RemoteAddressLength
+                );
+
+        /* get the vacancy listening child tdi connections */
+
+        child = ksocknal_get_vacancy_backlog(parent); 
+
+        if (child) {
+
+            spin_lock(&(child->kstc_lock));
+            child->child.kstc_info.ConnectionInfo = ConnectionInfo;
+            child->child.kstc_info.Remote = ConnectionInfo->RemoteAddress;
+            child->kstc_state = ksts_connecting;
+            spin_unlock(&(child->kstc_lock));
+
+        } else {
+
+            KsPrint((2, "KsConnectEventHandler: No enough backlogs: Refsued the connectio: %xh\n", parent));
+
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+
+            goto errorout;
+        }
+
+        FileObject = child->child.kstc_info.FileObject;
+        DeviceObject = IoGetRelatedDeviceObject (FileObject);
+                            
+        Irp = KsBuildTdiIrp(DeviceObject);
+
+        /* BUG FIX: the Irp allocation was never checked before
+           TdiBuildAccept dereferenced it.  On failure, return the
+           reserved child to the vacant state and detach it from
+           ConnectionInfo, which is freed on the error path. */
+        if (NULL == Irp) {
+
+            spin_lock(&(child->kstc_lock));
+            child->child.kstc_info.ConnectionInfo = NULL;
+            child->child.kstc_info.Remote = NULL;
+            child->child.kstc_busy = FALSE;
+            child->kstc_state = ksts_associated;
+            spin_unlock(&(child->kstc_lock));
+
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+            cfs_enter_debugger();
+            goto errorout;
+        }
+
+        TdiBuildAccept(
+                Irp,
+                DeviceObject,
+                FileObject,
+                KsAcceptCompletionRoutine,
+                child,
+                NULL,
+                NULL
+                );
+
+        IoSetNextIrpStackLocation(Irp);
+
+        /* grab the reference of the child tdi connection; it is
+           dropped in KsAcceptCompletionRoutine */
+        ksocknal_get_tconn(child);
+
+        Status = STATUS_MORE_PROCESSING_REQUIRED;
+
+        *AcceptIrp = Irp;
+        *ConnectionContext = child;
+
+    } else {
+
+        Status = STATUS_CONNECTION_REFUSED;
+        goto errorout;
+    }
+
+    spin_unlock(&(parent->kstc_lock));
+
+    return Status;
+
+errorout:
+
+    spin_unlock(&(parent->kstc_lock));
+
+    {
+        *AcceptIrp = NULL;
+        *ConnectionContext = NULL;
+
+        if (ConnectionInfo) {
+
+            ExFreePool(ConnectionInfo);
+        }
+
+        if (Irp) {
+
+            IoFreeIrp (Irp);
+        }
+    }
+
+    return Status;
+}
+
+
+
+/*
+ * KsDisconnectCompletionRoutine
+ *   the Irp completion routine for TdiBuildDisconect
+ *
+ *   We just signal the event and return MORE_PRO... to
+ *   let the caller take the responsibility of the Irp.
+ *
+ * Arguments:
+ *   DeviceObject:  the device object of the transport
+ *   Irp:           the Irp is being completed.
+ *   Context:       the event specified by the caller
+ *
+ * Return Value:
+ *   Nt status code (STATUS_MORE_PROCESSING_REQUIRED: the Irp
+ *   stays owned by the issuer, not the I/O manager)
+ *
+ * Notes: 
+ *   The function symbol itself is (mis)spelled
+ *   "KsDisconectCompletionRoutine"; it must stay that way
+ *   since callers reference it by that name.
+ */
+
+NTSTATUS
+KsDisconectCompletionRoutine (
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    )
+{
+
+    KeSetEvent((PKEVENT) Context, 0, FALSE);
+
+    return STATUS_MORE_PROCESSING_REQUIRED;
+
+    /* unreachable, but keeps the parameter "referenced" to
+       silence unused-parameter warnings */
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+
+/*
+ * KsDisconnectHelper
+ *   the routine to be executed in the WorkItem procedure
+ *   this routine is to disconnect a tdi connection
+ *
+ * Arguments:
+ *   WorkItem:  the context transferred to the workitem
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   tconn is already referred in abort_connecton ...
+ *   The extra reference taken by the queuer
+ *   (KsDisconnectEventHandler) is dropped here.
+ */
+
+VOID
+KsDisconnectHelper(PKS_DISCONNECT_WORKITEM WorkItem)
+{
+    ksock_tconn_t * tconn = WorkItem->tconn;
+
+    ksocknal_disconnect_tconn(tconn, WorkItem->Flags);
+
+    /* wake up anybody waiting for the disconnect to finish */
+    KeSetEvent(&(WorkItem->Event), 0, FALSE);
+
+    /* allow a new disconnect workitem to be queued again */
+    spin_lock(&(tconn->kstc_lock));
+    cfs_clear_flag(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY);
+    spin_unlock(&(tconn->kstc_lock));
+    ksocknal_put_tconn(tconn);
+}
+
+
+/*
+ * KsDisconnectEventHandler
+ *   Disconnect event handler event handler, called by the underlying TDI transport
+ *   in response to an incoming disconnection notification from a remote node.
+ *
+ * Arguments:
+ *   ConnectionContext:  tdi connnection object
+ *   DisconnectFlags:    specifies the nature of the disconnection
+ *   ......
+ *
+ * Return Value:
+ *   Nt kernel status code
+ *
+ * Notes: 
+ *   The actual disconnect work is deferred to a system
+ *   workitem (KsDisconnectHelper); this callback only records
+ *   the flags and queues the work once (guarded by
+ *   KS_TCONN_DISCONNECT_BUSY).
+ */
+
+
+NTSTATUS 
+KsDisconnectEventHandler(
+    IN PVOID                TdiEventContext,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN LONG                 DisconnectDataLength,
+    IN PVOID                DisconnectData,
+    IN LONG                 DisconnectInformationLength,
+    IN PVOID                DisconnectInformation,
+    IN ULONG                DisconnectFlags
+    )
+{
+    ksock_tconn_t *         tconn;
+    /* BUG FIX: Status was uninitialized; when the connection is
+       in ksts_connected state but DisconnectFlags carries
+       neither TDI_DISCONNECT_ABORT nor TDI_DISCONNECT_RELEASE,
+       an indeterminate value was returned to the transport. */
+    NTSTATUS                Status = STATUS_SUCCESS;
+    PKS_DISCONNECT_WORKITEM WorkItem;
+    
+    tconn = (ksock_tconn_t *)ConnectionContext;
+
+    KsPrint((2, "KsTcpDisconnectEventHandler: called at Irql: %xh\n",
+                KeGetCurrentIrql() ));
+
+    KsPrint((2, "tconn = %x DisconnectFlags= %xh\n",
+                 tconn, DisconnectFlags));
+
+    /* hold the tconn across the locked section */
+    ksocknal_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    WorkItem = &(tconn->kstc_disconnect);
+
+    if (tconn->kstc_state != ksts_connected) {
+
+        Status = STATUS_SUCCESS;
+
+    } else {
+
+        if (cfs_is_flag_set(DisconnectFlags, TDI_DISCONNECT_ABORT)) {
+
+            Status = STATUS_REMOTE_DISCONNECT;
+
+        } else if (cfs_is_flag_set(DisconnectFlags, TDI_DISCONNECT_RELEASE)) {
+
+            Status = STATUS_GRACEFUL_DISCONNECT;
+        }
+
+        /* queue the disconnect workitem only once; the helper
+           clears KS_TCONN_DISCONNECT_BUSY and drops the extra
+           reference taken here */
+        if (!cfs_is_flag_set(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY)) {
+
+            ksocknal_get_tconn(tconn);
+
+            WorkItem->Flags = DisconnectFlags;
+            WorkItem->tconn = tconn;
+
+            cfs_set_flag(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY);
+
+            /* queue the workitem to call */
+            ExQueueWorkItem(&(WorkItem->WorkItem), DelayedWorkQueue);
+        }
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+    ksocknal_put_tconn(tconn);
+
+    return  (Status);
+}
+
+/*
+ * KsTcpReceiveCompletionRoutine
+ *   Completion routine of the receive Irp built in
+ *   KsTcpReceiveEventHandler: clears the RECEIVING flag of the Tsdu
+ *   unit, wakes the waiting thread and the scheduler, then frees the
+ *   completion context and the Irp.
+ */
+NTSTATUS
+KsTcpReceiveCompletionRoutine(
+    IN PIRP                         Irp,
+    IN PKS_TCP_COMPLETION_CONTEXT   Context
+    )
+{
+    NTSTATUS Status = Irp->IoStatus.Status;
+
+    if (NT_SUCCESS(Status)) {
+
+        ksock_tconn_t *tconn = Context->tconn;
+
+        /* CompletionContext holds either a KS_TSDU_DAT or a KS_TSDU_BUF
+           record; the TsduType field tells which view is valid */
+        PKS_TSDU_DAT  KsTsduDat = Context->CompletionContext;
+        PKS_TSDU_BUF  KsTsduBuf = Context->CompletionContext;
+
+        KsPrint((1, "KsTcpReceiveCompletionRoutine: Total %xh bytes.\n", 
+                   Context->KsTsduMgr->TotalBytes ));
+
+        spin_lock(&(tconn->kstc_lock));
+
+        if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+            if (cfs_is_flag_set(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING)) {
+                cfs_clear_flag(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING);
+            } else {
+                cfs_enter_debugger();
+            }
+        } else {
+            ASSERT(TSDU_TYPE_BUF == KsTsduBuf->TsduType);
+            if (cfs_is_flag_set(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING)) {
+                cfs_clear_flag(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING);
+            } else {
+                cfs_enter_debugger();
+            }
+        }
+
+        spin_unlock(&(tconn->kstc_lock));
+
+        /* wake up the thread waiting for the completion of this Irp */
+        KeSetEvent(Context->Event, 0, FALSE);
+
+        /* re-active the ksocknal connection and wake up the scheduler */
+        if (tconn->kstc_conn && tconn->kstc_sched_cb) {
+            tconn->kstc_sched_cb( tconn, FALSE, NULL,
+                                  Context->KsTsduMgr->TotalBytes );
+        }
+
+    } else {
+
+        /* un-expected errors occur, we must abort the connection */
+        ksocknal_abort_tconn(Context->tconn);
+    }
+
+    /* fix: Context and Irp were both dereferenced unconditionally above,
+       so the old trailing "if (Context)" / "if (Irp)" guards were dead
+       code that only implied NULL was possible.  Free them directly. */
+    ExFreePool(Context);
+
+    IoFreeIrp(Irp);
+
+    return (Status);
+}
+
+
+/*
+ * KsTcpCompletionRoutine
+ *   the Irp completion routine for TdiBuildSend and TdiBuildReceive ...
+ *   We need to call the user's own CompletionRoutine if specified;
+ *   otherwise it is a synchronous case and we need to signal the event.
+ *
+ * Arguments:
+ *   DeviceObject:  the device object of the transport
+ *   Irp:           the Irp is being completed.
+ *   Context:       the context we specified when issuing the Irp
+ *
+ * Return Value:
+ *   Nt status code
+ *
+ * Notes: 
+ *   N/A
+ */
+
+NTSTATUS
+KsTcpCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    )
+{
+    if (Context) {
+
+        PKS_TCP_COMPLETION_CONTEXT  CompletionContext = NULL;
+        ksock_tconn_t * tconn = NULL;
+
+        CompletionContext = (PKS_TCP_COMPLETION_CONTEXT) Context;
+        tconn = CompletionContext->tconn;
+
+        /* release the chained mdl */
+        KsReleaseMdl(Irp->MdlAddress, FALSE);
+        Irp->MdlAddress = NULL;
+
+        if (CompletionContext->CompletionRoutine) {
+
+            /* for counted (multi-Irp) requests only the final completion
+               runs the user routine and frees the context; earlier Irps
+               just drop their reference and bail out */
+            if ( CompletionContext->bCounted &&
+                 InterlockedDecrement(&CompletionContext->ReferCount) != 0 ) {
+                    goto errorout;
+            }
+
+            //
+            // Giving control to user specified CompletionRoutine ...
+            //
+
+            CompletionContext->CompletionRoutine(
+                    Irp,
+                    CompletionContext
+                    );
+
+        } else {
+
+            //
+            // Signaling  the Event ...
+            //
+
+            KeSetEvent(CompletionContext->Event, 0, FALSE);
+        }
+
+        /* drop the reference count of the tconn object */
+        ksocknal_put_tconn(tconn);
+
+        /* fix: the inner "if (Context)" guard was redundant -- this code
+           is already inside the Context != NULL branch */
+        cfs_free(Context);
+
+    } else {
+
+        cfs_enter_debugger();
+    }    
+
+errorout:
+
+    /* tell the I/O manager not to process the Irp any further; the
+       user completion routine owns (and frees) it */
+    return STATUS_MORE_PROCESSING_REQUIRED;
+}
+
+/*
+ * KsTcpSendCompletionRoutine
+ *   the user specified Irp completion routine for asynchronous
+ *   data transmission requests.
+ *
+ *   It will do the cleanup job of the ksock_tx_t and wake up the
+ *   ksocknal scheduler thread
+ *
+ * Arguments:
+ *   Irp:           the Irp is being completed.
+ *   Context:       the context we specified when issuing the Irp
+ *
+ * Return Value:
+ *   Nt status code
+ *
+ * Notes: 
+ *   N/A
+ */
+
+NTSTATUS
+KsTcpSendCompletionRoutine(
+    IN PIRP                         Irp,
+    IN PKS_TCP_COMPLETION_CONTEXT   Context
+    )
+{
+    NTSTATUS        Status = Irp->IoStatus.Status;
+    /* NOTE(review): IoStatus.Information is ULONG_PTR; assigning it to a
+       ULONG truncates on 64-bit -- harmless only while a single send
+       stays below 4GB.  Confirm against the send path. */
+    ULONG           rc = Irp->IoStatus.Information;
+    ksock_tconn_t * tconn = Context->tconn;
+    PKS_TSDUMGR     KsTsduMgr = Context->KsTsduMgr;
+
+    ENTRY;
+
+    LASSERT(tconn) ;
+
+    if (NT_SUCCESS(Status)) {
+
+        if (Context->bCounted) {
+            /* asynchronous (counted) send: CompletionContext carries the
+               ksocknal tx descriptor */
+            PVOID   tx = Context->CompletionContext;
+
+            ASSERT(tconn->kstc_update_tx != NULL);
+
+            /* update the tx, rebasing the kiov or iov pointers */
+            tx = tconn->kstc_update_tx(tconn, tx, rc);
+
+            /* update the KsTsudMgr total bytes */
+            spin_lock(&tconn->kstc_lock);
+            KsTsduMgr->TotalBytes -= rc;
+            spin_unlock(&tconn->kstc_lock);
+
+            /*
+             * now it's time to re-queue the conns into the
+             * scheduler queue and wake the scheduler thread.
+             */
+
+            if (tconn->kstc_conn && tconn->kstc_sched_cb) {
+                tconn->kstc_sched_cb( tconn, TRUE, tx, 0);
+            }
+
+        } else {
+
+            /* buffered send: CompletionContext is the owning KS_TSDU,
+               CompletionContext2 the KS_TSDU_BUF / KS_TSDU_DAT record
+               inside it (type discriminated by TsduType) */
+            PKS_TSDU            KsTsdu = Context->CompletionContext;
+            PKS_TSDU_BUF        KsTsduBuf = Context->CompletionContext2;
+            PKS_TSDU_DAT        KsTsduDat = Context->CompletionContext2;
+
+            spin_lock(&tconn->kstc_lock);
+            /* This is buffered sending ... */
+            ASSERT(KsTsduBuf->StartOffset == 0);
+
+            if (KsTsduBuf->DataLength > Irp->IoStatus.Information) {
+                /* not fully sent .... we have to abort the connection */
+                spin_unlock(&tconn->kstc_lock);
+                ksocknal_abort_tconn(tconn);
+                goto errorout;
+            }
+
+            if (KsTsduBuf->TsduType  == TSDU_TYPE_BUF) {
+                /* free the buffer */
+                ExFreePool(KsTsduBuf->UserBuffer);
+                KsTsduMgr->TotalBytes -= KsTsduBuf->DataLength;
+                KsTsdu->StartOffset   += sizeof(KS_TSDU_BUF);
+            } else if (KsTsduDat->TsduType  == TSDU_TYPE_DAT) {
+                KsTsduMgr->TotalBytes -= KsTsduDat->DataLength;
+                KsTsdu->StartOffset   += KsTsduDat->TotalLength;
+            } else {
+                cfs_enter_debugger(); /* should not get here */
+            }
+
+            if (KsTsdu->StartOffset == KsTsdu->LastOffset) {
+
+                /* Tsdu fully consumed: unlink it and recycle it onto
+                   the global free list */
+                list_del(&KsTsdu->Link);
+                KsTsduMgr->NumOfTsdu--;
+                KsPutKsTsdu(KsTsdu);
+            }
+
+            spin_unlock(&tconn->kstc_lock);
+        }
+        
+    } else {
+
+        /* cfs_enter_debugger(); */
+
+        /*
+         *  for the case that the transmission is unsuccessful,
+         *  we need abort the tdi connection, but not destroy it.
+         *  the socknal conn will drop the refer count, then the
+         *  tdi connection will be freed.
+         */
+
+        ksocknal_abort_tconn(tconn);
+    }
+
+errorout:
+
+    /*
+     *  it's our duty to free the Irp.
+     */
+
+    if (Irp) {
+        IoFreeIrp(Irp);
+        Irp = NULL;
+    }
+
+    EXIT;
+
+    return Status;
+}
+
+/*
+ *  Normal receive event handler
+ *
+ *  It will move data from system Tsdu to our TsduList
+ */
+
+NTSTATUS
+KsTcpReceiveEventHandler(
+    IN PVOID                TdiEventContext, 
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+   )
+{
+    NTSTATUS            Status;
+
+    ksock_tconn_t *     tconn;
+
+    PKS_CHAIN           KsChain;
+    PKS_TSDUMGR         KsTsduMgr;
+    PKS_TSDU            KsTsdu;
+    PKS_TSDU_DAT        KsTsduDat;
+    PKS_TSDU_BUF        KsTsduBuf;
+
+    BOOLEAN             bIsExpedited;
+    BOOLEAN             bIsCompleteTsdu;
+
+    BOOLEAN             bNewTsdu = FALSE;
+    BOOLEAN             bNewBuff = FALSE;
+
+    PCHAR               Buffer = NULL;
+
+    PIRP                Irp = NULL;
+    PMDL                Mdl = NULL;
+    PFILE_OBJECT        FileObject;
+    PDEVICE_OBJECT      DeviceObject;
+
+    ULONG               BytesReceived = 0;
+
+    PKS_TCP_COMPLETION_CONTEXT context = NULL;
+
+
+    tconn = (ksock_tconn_t *) ConnectionContext;
+
+    ksocknal_get_tconn(tconn);
+
+    /* check whether the whole body of payload is received or not */
+    if ( (cfs_is_flag_set(ReceiveFlags, TDI_RECEIVE_ENTIRE_MESSAGE)) &&
+         (BytesIndicated == BytesAvailable) ) {
+        bIsCompleteTsdu = TRUE;
+    } else {
+        bIsCompleteTsdu = FALSE;
+    }
+
+    bIsExpedited = cfs_is_flag_set(ReceiveFlags, TDI_RECEIVE_EXPEDITED);
+
+    KsPrint((2, "KsTcpReceiveEventHandler BytesIndicated = %d BytesAvailable = %d ...\n", BytesIndicated, BytesAvailable));
+    KsPrint((2, "bIsCompleteTsdu = %d bIsExpedited = %d\n", bIsCompleteTsdu, bIsExpedited ));
+
+    spin_lock(&(tconn->kstc_lock));
+
+    /* bail out unless this is a connected sender or child connection */
+    if ( !((tconn->kstc_state == ksts_connected) &&
+           (tconn->kstc_type == kstt_sender || 
+            tconn->kstc_type == kstt_child))) {
+
+        /* swallow the indicated bytes so the transport drops them */
+        *BytesTaken = BytesIndicated;
+
+        spin_unlock(&(tconn->kstc_lock));
+        ksocknal_put_tconn(tconn);
+
+        return (STATUS_SUCCESS);
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    /* if the Tsdu is even larger than the biggest Tsdu, we have
+       to allocate new buffer and use TSDU_TYPE_BUF to store it */
+
+    if ( KS_TSDU_STRU_SIZE(BytesAvailable) > ksocknal_data.ksnd_tsdu_size -
+         KS_DWORD_ALIGN(sizeof(KS_TSDU))) {
+        bNewBuff = TRUE;
+    }
+
+    /* retrieve the latest Tsdu buffer form TsduMgr
+       list if the list is not empty. */
+
+    if (list_empty(&(KsTsduMgr->TsduList))) {
+
+        LASSERT(KsTsduMgr->NumOfTsdu == 0);
+        KsTsdu = NULL;
+
+    } else {
+
+        LASSERT(KsTsduMgr->NumOfTsdu > 0);
+        KsTsdu = list_entry(KsTsduMgr->TsduList.prev, KS_TSDU, Link);
+
+        /* if this Tsdu does not contain enough space, we need
+           allocate a new Tsdu queue. */
+
+        if (bNewBuff) {
+            if ( KsTsdu->LastOffset + sizeof(KS_TSDU_BUF) >
+                 KsTsdu->TotalLength )  {
+                KsTsdu = NULL;
+            }
+        } else {
+            if ( KS_TSDU_STRU_SIZE(BytesAvailable) >
+                 KsTsdu->TotalLength - KsTsdu->LastOffset ) {
+                KsTsdu = NULL;
+            }
+        }
+    }
+
+    /* allocating the buffer for TSDU_TYPE_BUF */
+    if (bNewBuff) {
+        Buffer = ExAllocatePool(NonPagedPool, BytesAvailable);
+        if (NULL == Buffer) {
+            /* there's no enough memory for us. We just try to
+               receive maximum bytes with a new Tsdu */
+            bNewBuff = FALSE;
+            KsTsdu = NULL;
+        }
+    }
+
+    /* allocate a new Tsdu in case we are not statisfied. */
+
+    if (NULL == KsTsdu) {
+
+        KsTsdu = KsAllocateKsTsdu();
+
+        if (NULL == KsTsdu) {
+            goto errorout;
+        } else {
+            bNewTsdu = TRUE;
+        }
+    }
+
+    /* both views alias the free space at the tail of the Tsdu; only
+       one of them is valid, depending on bNewBuff */
+    KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+    KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+    if (bNewBuff) {
+
+        /* setup up the KS_TSDU_BUF record */
+
+        KsTsduBuf->TsduType     = TSDU_TYPE_BUF;
+        KsTsduBuf->TsduFlags    = 0;
+        KsTsduBuf->StartOffset  = 0;
+        KsTsduBuf->UserBuffer   = Buffer;
+        KsTsduBuf->DataLength   = BytesReceived = BytesAvailable;
+
+        KsTsdu->LastOffset += sizeof(KS_TSDU_BUF);
+
+    } else {
+
+        /* setup the KS_TSDU_DATA to contain all the messages */
+
+        KsTsduDat->TsduType     =  TSDU_TYPE_DAT;
+        KsTsduDat->TsduFlags    = 0;
+
+        if ( KsTsdu->TotalLength - KsTsdu->LastOffset >=
+            KS_TSDU_STRU_SIZE(BytesAvailable) ) {
+            BytesReceived = BytesAvailable;
+        } else {
+            /* take as much as fits, rounded down to dword alignment */
+            BytesReceived = KsTsdu->TotalLength - KsTsdu->LastOffset -
+                            FIELD_OFFSET(KS_TSDU_DAT, Data);
+            BytesReceived &= (~((ULONG)3));
+        }
+        KsTsduDat->DataLength   =  BytesReceived;
+        KsTsduDat->TotalLength  =  KS_TSDU_STRU_SIZE(BytesReceived);
+        KsTsduDat->StartOffset  = 0;
+
+        Buffer = &KsTsduDat->Data[0];
+
+        KsTsdu->LastOffset += KsTsduDat->TotalLength;
+    }
+
+    KsTsduMgr->TotalBytes  +=  BytesReceived;
+
+    if (bIsCompleteTsdu) {
+
+        /* It's a complete receive, we just move all
+           the data from system to our Tsdu */
+
+        RtlMoveMemory(
+            Buffer,
+            Tsdu,
+            BytesReceived
+            );
+
+        *BytesTaken = BytesReceived;
+        Status = STATUS_SUCCESS;
+
+        if (bNewTsdu) {
+            list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+            KsTsduMgr->NumOfTsdu++;
+        }
+
+        KeSetEvent(&(KsTsduMgr->Event), 0, FALSE);
+
+        /* re-active the ksocknal connection and wake up the scheduler */
+        if (tconn->kstc_conn && tconn->kstc_sched_cb) {
+            tconn->kstc_sched_cb( tconn, FALSE, NULL,
+                                  KsTsduMgr->TotalBytes );
+        }
+        
+    } else {
+
+        /* there's still data in tdi internal queue, we need issue a new
+           Irp to receive all of them. first allocate the tcp context */
+
+        context = ExAllocatePoolWithTag(
+                        NonPagedPool,
+                        sizeof(KS_TCP_COMPLETION_CONTEXT),
+                        'cTsK');
+
+        if (!context) {
+
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+            goto errorout;
+        }
+
+        /* setup the context */
+        RtlZeroMemory(context, sizeof(KS_TCP_COMPLETION_CONTEXT));
+
+        context->tconn             = tconn;
+        context->CompletionRoutine = KsTcpReceiveCompletionRoutine;
+        context->CompletionContext = KsTsdu;
+        /* NOTE(review): the assignment above is a dead store -- it is
+           overwritten immediately by the KsTsduBuf/KsTsduDat pointer,
+           which is what KsTcpReceiveCompletionRoutine expects */
+        context->CompletionContext = bNewBuff ? (PVOID)KsTsduBuf : (PVOID)KsTsduDat;
+        context->KsTsduMgr         = KsTsduMgr;
+        context->Event             = &(KsTsduMgr->Event);
+    
+        if (tconn->kstc_type == kstt_sender) {
+            FileObject = tconn->sender.kstc_info.FileObject;
+        } else {
+            FileObject = tconn->child.kstc_info.FileObject;
+        }
+
+        DeviceObject = IoGetRelatedDeviceObject(FileObject);
+
+        /* build new tdi Irp and setup it. */
+        Irp = KsBuildTdiIrp(DeviceObject);
+
+        if (NULL == Irp) {
+            goto errorout;
+        }
+
+        Status = KsLockUserBuffer(
+                    Buffer,
+                    FALSE,
+                    BytesReceived,
+                    IoModifyAccess,
+                    &Mdl
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+            goto errorout;
+        }
+
+        TdiBuildReceive(
+            Irp,
+            DeviceObject,
+            FileObject,
+            KsTcpCompletionRoutine,
+            context,
+            Mdl,
+            ReceiveFlags & (TDI_RECEIVE_NORMAL | TDI_RECEIVE_EXPEDITED),
+            BytesReceived
+          );
+            
+        IoSetNextIrpStackLocation(Irp);
+
+        /* return the newly built Irp to transport driver,
+           it will process it to receive all the data */
+
+        *IoRequestPacket = Irp;
+        *BytesTaken = 0;
+
+        if (bNewTsdu) {
+
+            list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+            KsTsduMgr->NumOfTsdu++;
+        }
+
+        /* mark the unit busy until the Irp completes */
+        if (bNewBuff) {
+            cfs_set_flag(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING);
+        } else {
+            cfs_set_flag(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING);
+        }
+        /* extra reference dropped by KsTcpCompletionRoutine */
+        ksocknal_get_tconn(tconn);
+        Status = STATUS_MORE_PROCESSING_REQUIRED;
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+    ksocknal_put_tconn(tconn);
+
+    return (Status);
+
+errorout:
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    if (bNewTsdu && (KsTsdu != NULL)) {
+        KsFreeKsTsdu(KsTsdu);
+    }
+
+    if (Mdl) {
+        KsReleaseMdl(Mdl, FALSE);
+    }
+
+    if (Irp) {
+        IoFreeIrp(Irp);
+    }
+
+    if (context) {
+        ExFreePool(context);
+    }
+
+    ksocknal_abort_tconn(tconn);
+    ksocknal_put_tconn(tconn);
+
+    /* consume everything so the transport does not re-indicate */
+    *BytesTaken = BytesAvailable;
+    Status = STATUS_SUCCESS;
+
+    return (Status);
+}
+
+/*
+ *  Expedited receive event handler
+ */
+
+NTSTATUS
+KsTcpReceiveExpeditedEventHandler(
+    IN PVOID                TdiEventContext, 
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+    )
+{
+    /* identical to the normal path, with TDI_RECEIVE_EXPEDITED forced
+       on so the data lands on the expedited TsduMgr */
+    ULONG Flags = ReceiveFlags | TDI_RECEIVE_EXPEDITED;
+
+    return KsTcpReceiveEventHandler(TdiEventContext, ConnectionContext,
+                                    Flags, BytesIndicated, BytesAvailable,
+                                    BytesTaken, Tsdu, IoRequestPacket);
+}
+
+
+/*
+ *  Bulk receive event handler
+ *
+ *  It will queue all the system Tsdus to our TsduList.
+ *  Then later ksocknal_recv_mdl will release them.
+ */
+
+NTSTATUS
+KsTcpChainedReceiveEventHandler (
+    IN PVOID TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT ConnectionContext,
+    IN ULONG ReceiveFlags, 
+    IN ULONG ReceiveLength,
+    IN ULONG StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL  Tsdu,                  // TSDU data chain
+    IN PVOID TsduDescriptor         // for call to TdiReturnChainedReceives
+    )
+{
+
+    NTSTATUS            Status;
+
+    ksock_tconn_t *     tconn;
+
+    PKS_CHAIN           KsChain;
+    PKS_TSDUMGR         KsTsduMgr;
+    PKS_TSDU            KsTsdu;
+    PKS_TSDU_MDL        KsTsduMdl;
+
+    BOOLEAN             bIsExpedited;
+    BOOLEAN             bNewTsdu = FALSE;
+
+    tconn = (ksock_tconn_t *) ConnectionContext;
+
+    bIsExpedited = cfs_is_flag_set(ReceiveFlags, TDI_RECEIVE_EXPEDITED);
+
+    KsPrint((2, "KsTcpChainedReceive: ReceiveLength = %xh bIsExpedited = %d\n", ReceiveLength, bIsExpedited));
+
+    ksocknal_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    /* bail out unless this is a connected sender or child connection */
+    if ( !((tconn->kstc_state == ksts_connected) &&
+         (tconn->kstc_type == kstt_sender || 
+          tconn->kstc_type == kstt_child))) {
+
+        spin_unlock(&(tconn->kstc_lock));
+        ksocknal_put_tconn(tconn);
+
+        return (STATUS_SUCCESS);
+    }
+
+    /* get the latest Tsdu buffer form TsduMgr list.
+       just set NULL if the list is empty. */
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    if (list_empty(&(KsTsduMgr->TsduList))) {
+
+        LASSERT(KsTsduMgr->NumOfTsdu == 0);
+        KsTsdu = NULL;
+
+    } else {
+
+        LASSERT(KsTsduMgr->NumOfTsdu > 0);
+        KsTsdu = list_entry(KsTsduMgr->TsduList.prev, KS_TSDU, Link);
+        LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+        if (sizeof(KS_TSDU_MDL) > KsTsdu->TotalLength - KsTsdu->LastOffset) {
+            KsTsdu = NULL;
+        }
+    }
+
+    /* if there's no Tsdu or the free size is not enough for this
+       KS_TSDU_MDL structure. We need re-allocate a new Tsdu.  */
+
+    if (NULL == KsTsdu) {
+
+        KsTsdu = KsAllocateKsTsdu();
+
+        if (NULL == KsTsdu) {
+            goto errorout;
+        } else {
+            bNewTsdu = TRUE;
+        }
+    }
+
+    /* just queue the KS_TSDU_MDL to the Tsdu buffer */
+
+    KsTsduMdl = (PKS_TSDU_MDL)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+    KsTsduMdl->TsduType     =  TSDU_TYPE_MDL;
+    KsTsduMdl->DataLength   =  ReceiveLength;
+    KsTsduMdl->StartOffset  =  StartingOffset;
+    KsTsduMdl->Mdl          =  Tsdu;
+    KsTsduMdl->Descriptor   =  TsduDescriptor;
+
+    KsTsdu->LastOffset     += sizeof(KS_TSDU_MDL);
+    KsTsduMgr->TotalBytes  += ReceiveLength;
+
+    KsPrint((2, "KsTcpChainedReceiveEventHandler: Total %xh bytes.\n",
+                KsTsduMgr->TotalBytes ));
+
+    Status = STATUS_PENDING;
+
+    /* attach it to the TsduMgr list if the Tsdu is newly created. */
+    if (bNewTsdu) {
+
+        list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+        KsTsduMgr->NumOfTsdu++;
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    /* wake up the threads waiing in ksocknal_recv_mdl */
+    KeSetEvent(&(KsTsduMgr->Event), 0, FALSE);
+
+    /* NOTE(review): TotalBytes is read here after the lock is dropped;
+       confirm concurrent updates cannot matter for the callback */
+    if (tconn->kstc_conn && tconn->kstc_sched_cb) {
+        tconn->kstc_sched_cb( tconn, FALSE, NULL,
+                              KsTsduMgr->TotalBytes );
+    }
+
+    ksocknal_put_tconn(tconn);
+
+    /* Return STATUS_PENDING to system because we are still
+       owning the MDL resources. ksocknal_recv_mdl is expected
+       to free the MDL resources. */
+
+    return (Status);
+
+errorout:
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    if (bNewTsdu && (KsTsdu != NULL)) {
+        KsFreeKsTsdu(KsTsdu);
+    }
+
+    /* abort the tdi connection */
+    ksocknal_abort_tconn(tconn);
+    ksocknal_put_tconn(tconn);
+
+
+    Status = STATUS_SUCCESS;
+
+    return (Status);
+}
+
+
+/*
+ *  Expedited & Bulk receive event handler
+ */
+
+NTSTATUS
+KsTcpChainedReceiveExpeditedEventHandler (
+    IN PVOID                TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags, 
+    IN ULONG                ReceiveLength,
+    IN ULONG                StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL                 Tsdu,                  // TSDU data chain
+    IN PVOID                TsduDescriptor         // for call to TdiReturnChainedReceives
+    )
+{
+    /* same as the bulk handler, with the expedited bit forced on */
+    ULONG Flags = ReceiveFlags | TDI_RECEIVE_EXPEDITED;
+
+    return KsTcpChainedReceiveEventHandler(TdiEventContext,
+                                           ConnectionContext,
+                                           Flags,
+                                           ReceiveLength,
+                                           StartingOffset,
+                                           Tsdu,
+                                           TsduDescriptor);
+}
+
+
+/*
+ * KsPrintProviderInfo
+ *   Dump a transport provider's TDI_PROVIDER_INFO to the debug log:
+ *   fixed fields first, then one line per ServiceFlags bit set.
+ */
+VOID
+KsPrintProviderInfo(
+   PWSTR DeviceName,
+   PTDI_PROVIDER_INFO ProviderInfo
+   )
+{
+    /* table of ServiceFlags bits and the label printed for each */
+    static const struct {
+        ULONG       Flag;
+        const char *Name;
+    } FlagTab[] = {
+        { TDI_SERVICE_CONNECTION_MODE,     "CONNECTION_MODE"     },
+        { TDI_SERVICE_ORDERLY_RELEASE,     "ORDERLY_RELEASE"     },
+        { TDI_SERVICE_CONNECTIONLESS_MODE, "CONNECTIONLESS_MODE" },
+        { TDI_SERVICE_ERROR_FREE_DELIVERY, "ERROR_FREE_DELIVERY" },
+        { TDI_SERVICE_SECURITY_LEVEL,      "SECURITY_LEVEL"      },
+        { TDI_SERVICE_BROADCAST_SUPPORTED, "BROADCAST_SUPPORTED" },
+        { TDI_SERVICE_MULTICAST_SUPPORTED, "MULTICAST_SUPPORTED" },
+        { TDI_SERVICE_DELAYED_ACCEPTANCE,  "DELAYED_ACCEPTANCE"  },
+        { TDI_SERVICE_EXPEDITED_DATA,      "EXPEDITED_DATA"      },
+        { TDI_SERVICE_INTERNAL_BUFFERING,  "INTERNAL_BUFFERING"  },
+        { TDI_SERVICE_ROUTE_DIRECTED,      "ROUTE_DIRECTED"      },
+        { TDI_SERVICE_NO_ZERO_LENGTH,      "NO_ZERO_LENGTH"      },
+        { TDI_SERVICE_POINT_TO_POINT,      "POINT_TO_POINT"      },
+        { TDI_SERVICE_MESSAGE_MODE,        "MESSAGE_MODE"        },
+        { TDI_SERVICE_HALF_DUPLEX,         "HALF_DUPLEX"         }
+    };
+    ULONG i;
+
+    KsPrint((2, "%ws ProviderInfo:\n", DeviceName));
+
+    KsPrint((2, "  Version              : 0x%4.4X\n", ProviderInfo->Version ));
+    KsPrint((2, "  MaxSendSize          : %d\n", ProviderInfo->MaxSendSize ));
+    KsPrint((2, "  MaxConnectionUserData: %d\n", ProviderInfo->MaxConnectionUserData ));
+    KsPrint((2, "  MaxDatagramSize      : %d\n", ProviderInfo->MaxDatagramSize ));
+    KsPrint((2, "  ServiceFlags         : 0x%8.8X\n", ProviderInfo->ServiceFlags ));
+
+    /* one label per set flag, in the table order above */
+    for (i = 0; i < sizeof(FlagTab) / sizeof(FlagTab[0]); i++) {
+        if (ProviderInfo->ServiceFlags & FlagTab[i].Flag) {
+            KsPrint((2, "  %s\n", FlagTab[i].Name));
+        }
+    }
+
+    KsPrint((2, "  MinimumLookaheadData : %d\n", ProviderInfo->MinimumLookaheadData ));
+    KsPrint((2, "  MaximumLookaheadData : %d\n", ProviderInfo->MaximumLookaheadData ));
+    KsPrint((2, "  NumberOfResources    : %d\n", ProviderInfo->NumberOfResources ));
+}
+
+
+/*
+ * KsAllocateKsTsdu
+ *   Reuse a Tsdu from the freelist or allocate a new Tsdu
+ *   from the LookAsideList table or the NonPagedPool
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   PKS_Tsdu: the new Tsdu or NULL if it fails
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/*
+ * KsAllocateKsTsdu
+ *   Obtain a Tsdu: reuse one from the global free list when available,
+ *   otherwise carve a fresh one from the tsdu slab cache.  The Tsdu
+ *   header is (re)initialized before it is returned.
+ *   Returns NULL on allocation failure.
+ */
+PKS_TSDU
+KsAllocateKsTsdu()
+{
+    PKS_TSDU    NewTsdu;
+
+    spin_lock(&(ksocknal_data.ksnd_tsdu_lock));
+
+    if (list_empty(&(ksocknal_data.ksnd_freetsdus))) {
+
+        /* free list exhausted: fall back to the slab cache */
+        NewTsdu = (PKS_TSDU) cfs_mem_cache_alloc(
+                        ksocknal_data.ksnd_tsdu_slab, 0);
+
+    } else {
+
+        LASSERT(ksocknal_data.ksnd_nfreetsdus > 0);
+
+        NewTsdu = list_entry(ksocknal_data.ksnd_freetsdus.next,
+                             KS_TSDU, Link);
+        list_del(&(NewTsdu->Link));
+        ksocknal_data.ksnd_nfreetsdus--;
+    }
+
+    spin_unlock(&(ksocknal_data.ksnd_tsdu_lock));
+
+    /* reset the header fields before handing the Tsdu out */
+    if (NewTsdu != NULL) {
+        KsInitializeKsTsdu(NewTsdu, ksocknal_data.ksnd_tsdu_size);
+    }
+
+    return (NewTsdu);
+}
+
+
+/*
+ * KsPutKsTsdu
+ *   Move the Tsdu to the free tsdu list in ksocknal_data.
+ *
+ * Arguments:
+ *   KsTsdu: Tsdu to be moved.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/*
+ * KsPutKsTsdu
+ *   Return a Tsdu to the global free list for later reuse.
+ */
+VOID
+KsPutKsTsdu(
+    PKS_TSDU  KsTsdu
+    )
+{
+    spin_lock(&(ksocknal_data.ksnd_tsdu_lock));
+
+    list_add_tail(&(KsTsdu->Link), &(ksocknal_data.ksnd_freetsdus));
+    ksocknal_data.ksnd_nfreetsdus++;
+
+    spin_unlock(&(ksocknal_data.ksnd_tsdu_lock));
+}
+
+
+/*
+ * KsFreeKsTsdu
+ *   Release a Tsdu: uninitialize then free it.
+ *
+ * Arguments:
+ *   KsTsdu: Tsdu to be freed.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+/*
+ * KsFreeKsTsdu
+ *   Hand a Tsdu back to the slab cache it was allocated from.
+ */
+VOID
+KsFreeKsTsdu(
+    PKS_TSDU  KsTsdu
+    )
+{
+    cfs_mem_cache_free(ksocknal_data.ksnd_tsdu_slab, KsTsdu);
+}
+
+
+/*
+ * KsInitializeKsTsdu
+ *   Initialize the Tsdu buffer header
+ *
+ * Arguments:
+ *   KsTsdu: the Tsdu to be initialized
+ *   Length: the total length of the Tsdu
+ *
+ * Return Value:
+ *   VOID
+ *
+ * NOTES: 
+ *   N/A
+ */
+
+/*
+ * KsInitializeKsTsdu
+ *   Zero a Tsdu buffer of the given total length and set up its
+ *   header: magic, total length, and both offsets positioned just
+ *   past the dword-aligned KS_TSDU header.
+ */
+VOID
+KsInitializeKsTsdu(
+    PKS_TSDU    KsTsdu,
+    ULONG       Length
+    )
+{
+    ULONG   HeaderSize = KS_DWORD_ALIGN(sizeof(KS_TSDU));
+
+    RtlZeroMemory(KsTsdu, Length);
+
+    KsTsdu->Magic       = KS_TSDU_MAGIC;
+    KsTsdu->TotalLength = Length;
+    KsTsdu->StartOffset = HeaderSize;
+    KsTsdu->LastOffset  = HeaderSize;
+}
+
+
+/*
+ * KsInitializeKsTsduMgr
+ *   Initialize the management structure of
+ *   Tsdu buffers
+ *
+ * Arguments:
+ *   TsduMgr: the TsduMgr to be initialized
+ *
+ * Return Value:
+ *   VOID
+ *
+ * NOTES: 
+ *   N/A
+ */
+
+/*
+ * KsInitializeKsTsduMgr
+ *   Prepare a TsduMgr for use: notification event, empty Tsdu list,
+ *   zeroed counters.
+ */
+VOID
+KsInitializeKsTsduMgr(
+    PKS_TSDUMGR     TsduMgr
+    )
+{
+    KeInitializeEvent(&(TsduMgr->Event), NotificationEvent, FALSE);
+
+    CFS_INIT_LIST_HEAD(&(TsduMgr->TsduList));
+
+    TsduMgr->NumOfTsdu  = 0;
+    TsduMgr->TotalBytes = 0;
+}
+
+
+/*
+ * KsInitializeKsChain
+ *   Initialize the KsChain structure for receiving
+ *   or transmitting
+ *
+ * Arguments:
+ *   KsChain: the KsChain to be initialized
+ *
+ * Return Value:
+ *   VOID
+ *
+ * NOTES: 
+ *   N/A
+ */
+
+/* initialize both TsduMgrs (normal and expedited) of the chain */
+VOID
+KsInitializeKsChain(
+    PKS_CHAIN       KsChain
+    )
+{
+    KsInitializeKsTsduMgr(&(KsChain->Normal));
+    KsInitializeKsTsduMgr(&(KsChain->Expedited));
+}
+
+
+/*
+ * KsCleanupTsduMgr
+ *   Clean up all the Tsdus in the TsduMgr list
+ *
+ * Arguments:
+ *   KsTsduMgr: the Tsdu list manager
+ *
+ * Return Value:
+ *   NTSTATUS:  nt status code
+ *
+ * NOTES: 
+ *   N/A
+ */
+
+NTSTATUS
+KsCleanupTsduMgr(
+    PKS_TSDUMGR     KsTsduMgr
+    )
+{
+    PKS_TSDU        KsTsdu;
+    PKS_TSDU_DAT    KsTsduDat;
+    PKS_TSDU_BUF    KsTsduBuf;
+    PKS_TSDU_MDL    KsTsduMdl;
+
+    LASSERT(NULL != KsTsduMgr);
+
+    /* wake any thread blocked on this TsduMgr */
+    KeSetEvent(&(KsTsduMgr->Event), 0, FALSE);
+
+    while (!list_empty(&KsTsduMgr->TsduList)) {
+
+        KsTsdu = list_entry(KsTsduMgr->TsduList.next, KS_TSDU, Link);
+        LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+        if (KsTsdu->StartOffset == KsTsdu->LastOffset) {
+
+            //
+            // KsTsdu is empty now, we need free it ...
+            //
+
+            list_del(&(KsTsdu->Link));
+            KsTsduMgr->NumOfTsdu--;
+
+            KsFreeKsTsdu(KsTsdu);
+
+        } else {
+
+            /* all record types start with a common TsduType field, so
+               aliasing the same offset through each view is safe */
+            KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduMdl = (PKS_TSDU_MDL)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+    
+            if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+
+                KsTsdu->StartOffset += KsTsduDat->TotalLength;
+
+            } else if (TSDU_TYPE_BUF == KsTsduBuf->TsduType) {
+
+                ASSERT(KsTsduBuf->UserBuffer != NULL);
+
+                if (KsTsduBuf->DataLength > KsTsduBuf->StartOffset) {
+                    ExFreePool(KsTsduBuf->UserBuffer);
+                } else {
+                    cfs_enter_debugger();
+                }
+
+                KsTsdu->StartOffset += sizeof(KS_TSDU_BUF);
+
+            } else if (TSDU_TYPE_MDL == KsTsduMdl->TsduType) {
+
+                //
+                // MDL Tsdu Unit ...
+                //
+
+                TdiReturnChainedReceives(
+                    &(KsTsduMdl->Descriptor),
+                    1 );
+
+                KsTsdu->StartOffset += sizeof(KS_TSDU_MDL);
+
+            } else {
+
+                /* fix: an unrecognized TsduType previously left
+                   StartOffset untouched, spinning this loop forever on
+                   a corrupted Tsdu.  Trap in the debugger and discard
+                   the remainder of this Tsdu instead. */
+                cfs_enter_debugger();
+                KsTsdu->StartOffset = KsTsdu->LastOffset;
+            }
+        }
+    }
+
+    return STATUS_SUCCESS;
+}
+
+
+/*
+ * KsCleanupKsChain
+ *   Drain both TsduMgrs (normal and expedited) of a KsChain.
+ *
+ * Arguments:
+ *   KsChain: the chain holding the two TsduMgrs
+ *
+ * Return Value:
+ *   NTSTATUS: status of the last KsCleanupTsduMgr call
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsCleanupKsChain(
+    PKS_CHAIN   KsChain
+    )
+{
+    NTSTATUS    Status;
+
+    LASSERT(NULL != KsChain);
+
+    /* drain the normal-priority queue first ... */
+    Status = KsCleanupTsduMgr(&(KsChain->Normal));
+
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+        return Status;
+    }
+
+    /* ... then the expedited (out-of-band) queue */
+    Status = KsCleanupTsduMgr(&(KsChain->Expedited));
+
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+    }
+
+    return Status;
+}
+
+
+/*
+ * KsCleanupTsdu
+ *   Drain all the Tsdus of a tdi connected object (sender or
+ *   child); other connection types own no Tsdu chains and are
+ *   ignored.
+ *
+ * Arguments:
+ *   tconn: the tdi connection which is connected already.
+ *
+ * Return Value:
+ *   NTSTATUS: nt status code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsCleanupTsdu(
+    ksock_tconn_t * tconn
+    )
+{
+    PKS_CHAIN       KsRecv  = NULL;
+    PKS_CHAIN       KsSend  = NULL;
+    NTSTATUS        Status  = STATUS_SUCCESS;
+
+    /* pick the recv/send chains of this connection; only sender
+       and child tconns carry them */
+    if (tconn->kstc_type == kstt_sender) {
+        KsRecv = &(tconn->sender.kstc_recv);
+        KsSend = &(tconn->sender.kstc_send);
+    } else if (tconn->kstc_type == kstt_child) {
+        KsRecv = &(tconn->child.kstc_recv);
+        KsSend = &(tconn->child.kstc_send);
+    } else {
+        /* listener etc.: nothing to clean up */
+        return (Status);
+    }
+
+    Status = KsCleanupKsChain(KsRecv);
+
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+        return (Status);
+    }
+
+    Status = KsCleanupKsChain(KsSend);
+
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+    }
+
+    return (Status);
+}
+
+
+/*
+ * KsCopyMdlChainToMdlChain
+ *   Copy data from a (possibly chained) source Mdl to a (possibly
+ *   chained) destination Mdl. The Tdi library does not provide this
+ *   helper, so it is implemented here.
+ *
+ * Arguments:
+ *   SourceMdlChain:      the source mdl chain
+ *   SourceOffset:        byte offset into the source chain
+ *   DestinationMdlChain: the destination mdl chain
+ *   DestinationOffset:   byte offset into the destination chain
+ *   BytesTobecopied:     number of bytes requested to be copied
+ *   BytesCopied:         out: bytes actually copied (0 on failure)
+ *
+ * Return Value:
+ *   NTSTATUS: STATUS_SUCCESS or an error code
+ *
+ * NOTES:
+ *   The source chain must hold at least SourceOffset + BytesTobecopied
+ *   bytes, otherwise STATUS_INVALID_PARAMETER is returned.
+ */
+
+NTSTATUS
+KsCopyMdlChainToMdlChain(
+    IN PMDL     SourceMdlChain,
+    IN ULONG    SourceOffset,
+    IN PMDL     DestinationMdlChain,
+    IN ULONG    DestinationOffset,
+    IN ULONG    BytesTobecopied,
+    OUT PULONG  BytesCopied
+    )
+{
+    PMDL        SrcMdl = SourceMdlChain;
+    PMDL        DstMdl = DestinationMdlChain;
+
+    PUCHAR      SrcBuf = NULL;
+    PUCHAR      DstBuf = NULL;
+
+    ULONG       dwBytes = 0;    /* bytes copied so far */
+
+    NTSTATUS    Status = STATUS_SUCCESS;
+
+
+    while (dwBytes < BytesTobecopied) {
+
+        ULONG   Length = 0;
+
+        /* advance SrcMdl until SourceOffset falls inside it */
+        while (MmGetMdlByteCount(SrcMdl) <= SourceOffset) {
+
+            SourceOffset -= MmGetMdlByteCount(SrcMdl);
+
+            SrcMdl = SrcMdl->Next;
+
+            if (NULL == SrcMdl) {
+
+                Status = STATUS_INVALID_PARAMETER;
+                goto errorout;
+            }
+        }
+
+        /* advance DstMdl until DestinationOffset falls inside it */
+        while (MmGetMdlByteCount(DstMdl) <= DestinationOffset) {
+
+            DestinationOffset -= MmGetMdlByteCount(DstMdl);
+
+            DstMdl = DstMdl->Next;
+
+            if (NULL == DstMdl) {
+
+                Status = STATUS_INVALID_PARAMETER;
+                goto errorout;
+            }
+        }
+
+        DstBuf = (PUCHAR)KsMapMdlBuffer(DstMdl);
+
+        if (NULL == DstBuf) {
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+            goto errorout;
+        }
+
+        //
+        // Here we need skip the OVERFLOW case via RtlCopyMemory :-(
+        //
+
+        if ( KsQueryMdlsSize(SrcMdl) - SourceOffset >
+             MmGetMdlByteCount(DstMdl) - DestinationOffset ) {
+
+            /* the remaining source does not fit into the current dst
+               mdl: clamp the copy length and move the bytes by hand */
+
+            Length = BytesTobecopied - dwBytes;
+
+            if (Length > KsQueryMdlsSize(SrcMdl) - SourceOffset) {
+                Length = KsQueryMdlsSize(SrcMdl) - SourceOffset;
+            }
+
+            if (Length > MmGetMdlByteCount(DstMdl) - DestinationOffset) {
+                Length = MmGetMdlByteCount(DstMdl) - DestinationOffset;
+            }
+
+            SrcBuf = (PUCHAR)KsMapMdlBuffer(SrcMdl);
+
+            /* fix: the original tested DstBuf here (copy/paste slip),
+               so a failed mapping of SrcMdl slipped through and
+               RtlCopyMemory would fault on a NULL source pointer */
+            if (NULL == SrcBuf) {
+                Status = STATUS_INSUFFICIENT_RESOURCES;
+                goto errorout;
+            }
+
+            RtlCopyMemory(
+                DstBuf + DestinationOffset,
+                SrcBuf + SourceOffset,
+                Length
+                );
+
+        } else {
+
+            /* whole remaining source fits: let tdi do the copy */
+            Status = TdiCopyMdlToBuffer(
+                        SrcMdl,
+                        SourceOffset,
+                        DstBuf,
+                        DestinationOffset,
+                        MmGetMdlByteCount(DstMdl),
+                        &Length
+                        );
+
+            if (STATUS_BUFFER_OVERFLOW == Status) {
+                cfs_enter_debugger();
+            } else if (!NT_SUCCESS(Status)) {
+                cfs_enter_debugger();
+                goto errorout;
+            }
+        }
+
+        SourceOffset += Length;
+        DestinationOffset += Length;
+        dwBytes += Length;
+    }
+
+errorout:
+
+    if (NT_SUCCESS(Status)) {
+        *BytesCopied = dwBytes;
+    } else {
+        *BytesCopied = 0;
+    }
+
+    return Status;
+}
+
+
+
+/*
+ * KsQueryMdlsSize
+ *   Compute the total byte count of an Mdl chain.
+ *
+ * Arguments:
+ *   Mdl:  head of the (possibly chained) mdl
+ *
+ * Return Value:
+ *   ULONG: sum of the byte counts of every mdl in the chain
+ *
+ * NOTES:
+ *   N/A
+ */
+
+ULONG
+KsQueryMdlsSize (PMDL Mdl)
+{
+    ULONG   Total = 0;
+    PMDL    Current;
+
+    /* accumulate the byte count of each mdl along the chain */
+    for (Current = Mdl; Current != NULL; Current = Current->Next) {
+        Total += MmGetMdlByteCount(Current);
+    }
+
+    return (Total);
+}
+
+
+/*
+ * KsLockUserBuffer
+ *   Allocate an MDL for the buffer and make its pages resident:
+ *   pageable memory is probed and locked, nonpaged memory is just
+ *   described.
+ *
+ * Arguments:
+ *   UserBuffer:  the buffer to be locked
+ *   bPaged:      TRUE if the buffer is pageable (probe-and-lock),
+ *                FALSE if it lives in nonpaged pool
+ *   Length:      length in bytes of the buffer
+ *   Operation:   read or write access
+ *   pMdl:        out: the created mdl (NULL on failure)
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * NOTES:
+ *   MmProbeAndLockPages raises on an invalid buffer; the SEH
+ *   handler converts that into STATUS_INVALID_USER_BUFFER and
+ *   frees the mdl.
+ */
+
+NTSTATUS
+KsLockUserBuffer (
+    IN PVOID            UserBuffer,
+    IN BOOLEAN          bPaged,
+    IN ULONG            Length,
+    IN LOCK_OPERATION   Operation,
+    OUT PMDL *          pMdl
+    )
+{
+    NTSTATUS    Status;
+    PMDL        Mdl = NULL;
+
+    LASSERT(UserBuffer != NULL);
+
+    *pMdl = NULL;
+
+    /* secondary mdl: FALSE, charge quota: FALSE, no irp */
+    Mdl = IoAllocateMdl(
+                UserBuffer,
+                Length,
+                FALSE,
+                FALSE,
+                NULL
+                );
+
+    if (Mdl == NULL) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        __try {
+
+            if (bPaged) {
+                MmProbeAndLockPages(
+                    Mdl,
+                    KernelMode,
+                    Operation
+                    );
+            } else {
+                MmBuildMdlForNonPagedPool(
+                    Mdl
+                    );
+            }
+
+            Status = STATUS_SUCCESS;
+
+            *pMdl = Mdl;
+
+        } __except (EXCEPTION_EXECUTE_HANDLER) {
+
+            /* probe raised: the buffer was invalid */
+            IoFreeMdl(Mdl);
+
+            Mdl = NULL;
+
+            cfs_enter_debugger();
+
+            Status = STATUS_INVALID_USER_BUFFER;
+        }
+    }
+
+    return Status;
+}
+
+/*
+ * KsMapMdlBuffer
+ *   Map the mdl's pages into system (kernel) address space.
+ *
+ * Arguments:
+ *   Mdl:  the mdl to be mapped
+ *
+ * Return Value:
+ *   PVOID: the mapped system-space buffer, or NULL on failure
+ *
+ * NOTES:
+ *   The "safe" variant returns NULL instead of bugchecking when
+ *   system PTEs are exhausted.
+ */
+
+PVOID
+KsMapMdlBuffer (PMDL    Mdl)
+{
+    LASSERT(Mdl != NULL);
+
+    return MmGetSystemAddressForMdlSafe(Mdl, NormalPagePriority);
+}
+
+
+/*
+ * KsReleaseMdl
+ *   Walk an mdl chain, unlocking the pages (when they were probed
+ *   and locked) and freeing every mdl.
+ *
+ * Arguments:
+ *   Mdl:    head of the mdl chain to be released
+ *   Paged:  non-zero if the pages were locked with
+ *           MmProbeAndLockPages and must be unlocked first
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+VOID
+KsReleaseMdl (IN PMDL   Mdl,
+              IN int    Paged )
+{
+    PMDL    Current = Mdl;
+
+    LASSERT(Mdl != NULL);
+
+    while (Current != NULL) {
+
+        /* grab the link before the mdl is freed */
+        PMDL    Next = Current->Next;
+
+        if (Paged) {
+            MmUnlockPages(Current);
+        }
+
+        IoFreeMdl(Current);
+
+        Current = Next;
+    }
+}
+
+
+/*
+ * ksocknal_lock_buffer
+ *   Allocate an MDL for the specified buffer and make all of its
+ *   pages resident in system memory.
+ *
+ * Arguments:
+ *   buffer:  the buffer to be locked
+ *   paged:   non-zero if the buffer is pageable
+ *   length:  length in bytes of the buffer
+ *   access:  read or write access
+ *   kmdl:    out: the created mdl
+ *
+ * Return Value:
+ *   int:     ksocknal error code: 0 on success / -x on failure
+ *
+ * Notes:
+ *   Thin wrapper: KsLockUserBuffer does the work, the NTSTATUS
+ *   result is translated into a ksocknal error code.
+ */
+
+int
+ksocknal_lock_buffer (
+    void *            buffer,
+    int               paged,
+    int               length,
+    LOCK_OPERATION    access,
+    ksock_mdl_t **    kmdl
+    )
+{
+    NTSTATUS status = KsLockUserBuffer(buffer, paged != 0,
+                                       length, access, kmdl);
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * ksocknal_map_mdl
+ *   Map the mdl pages into kernel address space.
+ *
+ * Arguments:
+ *   mdl:  the mdl to be mapped
+ *
+ * Return Value:
+ *   void *: the mapped buffer, or NULL on failure
+ *
+ * Notes:
+ *   Thin wrapper over KsMapMdlBuffer.
+ */
+
+void *
+ksocknal_map_mdl (ksock_mdl_t * mdl)
+{
+    LASSERT(mdl != NULL);
+    return KsMapMdlBuffer(mdl);
+}
+
+/*
+ *  ksocknal_release_mdl
+ *   Unlock all the pages in the mdl chain and free the mdls.
+ *
+ * Arguments:
+ *   mdl:    mdl chain to be released
+ *   paged:  non-zero if the pages must be unlocked first
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   Thin wrapper over KsReleaseMdl.
+ */
+
+void
+ksocknal_release_mdl (ksock_mdl_t *mdl, int paged)
+{
+    LASSERT(mdl != NULL);
+    KsReleaseMdl(mdl, paged);
+}
+
+
+/*
+ * ksocknal_create_tconn
+ *   Allocate a new tconn structure from the slab cache,
+ *   initialize it and link it onto the global tconn list.
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   ksock_tconn_t *: the address of tconn or NULL if it fails
+ *
+ * NOTES:
+ *   The returned tconn carries one reference (taken here via
+ *   ksocknal_get_tconn).
+ */
+
+ksock_tconn_t *
+ksocknal_create_tconn()
+{
+    ksock_tconn_t * tconn = NULL;
+
+    /* allocate ksoc_tconn_t from the slab cache memory */
+
+    tconn = (ksock_tconn_t *)cfs_mem_cache_alloc(
+                ksocknal_data.ksnd_tconn_slab, CFS_ALLOC_ZERO);
+
+    if (tconn) {
+
+        /* zero tconn elements
+           NOTE(review): presumably redundant with CFS_ALLOC_ZERO
+           above — confirm the allocator flag semantics */
+        memset(tconn, 0, sizeof(ksock_tconn_t));
+
+        /* initialize the tconn ... */
+        tconn->kstc_magic = KS_TCONN_MAGIC;
+
+        /* work item + event used to run/wait for a deferred
+           disconnect (KsDisconnectHelper) */
+        ExInitializeWorkItem(
+            &(tconn->kstc_disconnect.WorkItem), 
+            KsDisconnectHelper,
+            &(tconn->kstc_disconnect)
+            );
+
+        KeInitializeEvent(
+                &(tconn->kstc_disconnect.Event),
+                SynchronizationEvent,
+                FALSE );
+
+        /* work item queued by ksocknal_put_tconn to destroy the
+           tconn once its refcount drops to zero */
+        ExInitializeWorkItem(
+            &(tconn->kstc_destroy), 
+            ksocknal_destroy_tconn,
+            tconn
+            );
+
+        spin_lock_init(&(tconn->kstc_lock));
+
+        /* the caller's reference */
+        ksocknal_get_tconn(tconn);
+
+        spin_lock(&(ksocknal_data.ksnd_tconn_lock));
+
+        /* attach it into global list in ksocknal_data */
+
+        list_add(&(tconn->kstc_list), &(ksocknal_data.ksnd_tconns));
+        ksocknal_data.ksnd_ntconns++;
+        spin_unlock(&(ksocknal_data.ksnd_tconn_lock));
+
+        /* default window sizes; overwritten by the init_* helpers */
+        tconn->kstc_rcv_wnd = tconn->kstc_snd_wnd = 0x10000;
+    }
+
+    return (tconn);
+}
+
+
+/*
+ * ksocknal_free_tconn
+ *   Unlink the tconn from the global list and return its memory
+ *   to the slab cache.
+ *
+ * Arguments:
+ *   tconn:  the tconn to be freed; its refcount must be zero
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   Signals ksnd_tconn_exit when the last tconn goes away so
+ *   module teardown can proceed.
+ */
+
+void
+ksocknal_free_tconn(ksock_tconn_t * tconn)
+{
+    LASSERT(atomic_read(&(tconn->kstc_refcount)) == 0);
+
+    spin_lock(&(ksocknal_data.ksnd_tconn_lock));
+
+    /* remove it from the global list */
+    list_del(&tconn->kstc_list);
+    ksocknal_data.ksnd_ntconns--;
+
+    /* if this is the last tconn, it would be safe for
+       ksocknal_tdi_fini_data to quit ... */
+    if (ksocknal_data.ksnd_ntconns == 0) {
+        cfs_wake_event(&ksocknal_data.ksnd_tconn_exit);
+    }
+    spin_unlock(&(ksocknal_data.ksnd_tconn_lock));
+
+    /* free the structure memory */
+    cfs_mem_cache_free(ksocknal_data.ksnd_tconn_slab, tconn);
+}
+
+
+/*
+ * ksocknal_init_listener
+ *   Initialize the tconn as a listener (daemon).
+ *
+ * Arguments:
+ *   tconn: the listener tconn
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   Leaves the tconn in state ksts_inited.
+ */
+
+void
+ksocknal_init_listener(
+    ksock_tconn_t * tconn
+    )
+{
+    /* mark this tconn as a listener bound to the tcp device */
+    tconn->kstc_type = kstt_listener;
+    RtlInitUnicodeString(&(tconn->kstc_dev), TCP_DEVICE_NAME);
+
+    /* empty queues of backlog and accepted child connections */
+    CFS_INIT_LIST_HEAD(&(tconn->listener.kstc_listening.list));
+    CFS_INIT_LIST_HEAD(&(tconn->listener.kstc_accepted.list));
+
+    /* events signalled on accept / destroy progress */
+    cfs_init_event(&(tconn->listener.kstc_accept_event), TRUE, FALSE);
+    cfs_init_event(&(tconn->listener.kstc_destroy_event), TRUE, FALSE);
+
+    tconn->kstc_state = ksts_inited;
+}
+
+
+/*
+ * ksocknal_init_sender
+ *   Initialize the tconn as a sender (outgoing connection).
+ *
+ * Arguments:
+ *   tconn: the sender tconn
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   Leaves the tconn in state ksts_inited.
+ */
+
+void
+ksocknal_init_sender(
+    ksock_tconn_t * tconn
+    )
+{
+    /* mark as sender, bound to the tcp transport device */
+    tconn->kstc_type = kstt_sender;
+    RtlInitUnicodeString(&(tconn->kstc_dev), TCP_DEVICE_NAME);
+
+    /* fresh receive / send Tsdu chains */
+    KsInitializeKsChain(&(tconn->sender.kstc_recv));
+    KsInitializeKsChain(&(tconn->sender.kstc_send));
+
+    /* default window sizes */
+    tconn->kstc_rcv_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+    tconn->kstc_snd_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+
+    tconn->kstc_state = ksts_inited;
+}
+
+/*
+ * ksocknal_init_child
+ *   Initialize the tconn as a child (accepted connection).
+ *
+ * Arguments:
+ *   tconn: the child tconn
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   Leaves the tconn in state ksts_inited.
+ */
+
+void
+ksocknal_init_child(
+    ksock_tconn_t * tconn
+    )
+{
+    /* mark as child, bound to the tcp transport device */
+    tconn->kstc_type = kstt_child;
+    RtlInitUnicodeString(&(tconn->kstc_dev), TCP_DEVICE_NAME);
+
+    /* fresh receive / send Tsdu chains */
+    KsInitializeKsChain(&(tconn->child.kstc_recv));
+    KsInitializeKsChain(&(tconn->child.kstc_send));
+
+    /* default window sizes */
+    tconn->kstc_rcv_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+    tconn->kstc_snd_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+
+    tconn->kstc_state = ksts_inited;
+}
+
+/*
+ * ksocknal_get_tconn
+ *   Take one reference on the tconn (increment kstc_refcount).
+ *
+ * Arguments:
+ *   tconn: the tdi connection to be referred
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   Paired with ksocknal_put_tconn, which destroys the tconn
+ *   when the count reaches zero.
+ */
+void
+ksocknal_get_tconn(
+    ksock_tconn_t * tconn
+    )
+{
+    atomic_inc(&(tconn->kstc_refcount));
+}
+
+/*
+ * ksocknal_put_tconn
+ *   Drop one reference on the tconn; when the count reaches zero
+ *   either abort a still-connected connection or queue the tconn
+ *   for destruction on a worker thread.
+ *
+ * Arguments:
+ *   tconn: the tdi connection to be dereferred
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   KS_TCONN_DESTROY_BUSY guards against queueing the destroy
+ *   work item twice.
+ */
+
+void
+ksocknal_put_tconn(
+    ksock_tconn_t *tconn
+    )
+{
+    if (atomic_dec_and_test(&(tconn->kstc_refcount))) {
+
+        spin_lock(&(tconn->kstc_lock));
+
+        if ( ( tconn->kstc_type == kstt_child || 
+               tconn->kstc_type == kstt_sender ) &&
+             ( tconn->kstc_state == ksts_connected ) ) {
+
+            /* still connected: drop the lock before aborting, the
+               abort path takes it again */
+            spin_unlock(&(tconn->kstc_lock));
+
+            ksocknal_abort_tconn(tconn);
+
+        } else {
+
+            /* queue destruction exactly once */
+            if (cfs_is_flag_set(tconn->kstc_flags, KS_TCONN_DESTROY_BUSY)) {
+                cfs_enter_debugger();
+            } else {
+                ExQueueWorkItem(
+                        &(tconn->kstc_destroy),
+                        DelayedWorkQueue
+                        );
+
+                cfs_set_flag(tconn->kstc_flags, KS_TCONN_DESTROY_BUSY);
+            }
+
+            spin_unlock(&(tconn->kstc_lock));
+        }
+    }
+}
+
+/*
+ * ksocknal_destroy_tconn
+ *   Tear down the tdi connection (handles, address objects,
+ *   parent bookkeeping) and free the tconn structure.
+ *
+ * Arguments:
+ *   tconn: the tdi connection to be cleaned; refcount must be 0.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   Runs as a delayed work item queued by ksocknal_put_tconn.
+ *   The teardown steps differ per tconn type (listener / child /
+ *   sender); an unknown type only traps into the debugger.
+ */
+
+void
+ksocknal_destroy_tconn(
+    ksock_tconn_t *     tconn
+    )
+{
+    LASSERT(tconn->kstc_refcount.counter == 0);
+
+    if (tconn->kstc_type == kstt_listener) {
+
+        ksocknal_reset_handlers(tconn);
+
+        /* for listener, we just need to close the address object */
+        KsCloseAddress(
+                tconn->kstc_addr.Handle,
+                tconn->kstc_addr.FileObject
+                );
+
+        tconn->kstc_state = ksts_inited;
+
+    } else if (tconn->kstc_type == kstt_child) {
+
+        /* for child tdi connections */
+
+        /* disassociate the relation between it's connection object
+           and the address object */
+
+        if (tconn->kstc_state == ksts_associated) {
+            KsDisassociateAddress(
+                tconn->child.kstc_info.FileObject
+                ); 
+        }
+
+        /* release the connection object */
+
+        KsCloseConnection(
+                tconn->child.kstc_info.Handle,
+                tconn->child.kstc_info.FileObject
+                );
+
+        /* release it's refer of it's parent's address object
+           (handle is NULL: only the FileObject reference is dropped) */
+        KsCloseAddress(
+                NULL,
+                tconn->kstc_addr.FileObject
+                );
+
+        /* lock order: parent first, then child */
+        spin_lock(&tconn->child.kstc_parent->kstc_lock);
+        spin_lock(&tconn->kstc_lock);
+
+        tconn->kstc_state = ksts_inited;
+
+        /* remove it from it's parent's queues */
+
+        if (tconn->child.kstc_queued) {
+
+            list_del(&(tconn->child.kstc_link));
+
+            /* kstc_queueno selects accepted (non-zero) vs listening
+               queue; keep the matching counter in sync */
+            if (tconn->child.kstc_queueno) {
+
+                LASSERT(tconn->child.kstc_parent->listener.kstc_accepted.num > 0);
+                tconn->child.kstc_parent->listener.kstc_accepted.num -= 1;
+
+            } else {
+
+                LASSERT(tconn->child.kstc_parent->listener.kstc_listening.num > 0);
+                tconn->child.kstc_parent->listener.kstc_listening.num -= 1;
+            }
+
+            tconn->child.kstc_queued = FALSE;
+        }
+
+        spin_unlock(&tconn->kstc_lock);
+        spin_unlock(&tconn->child.kstc_parent->kstc_lock);
+
+        /* drop the reference of the parent tconn */
+        ksocknal_put_tconn(tconn->child.kstc_parent);
+
+    } else if (tconn->kstc_type == kstt_sender) {
+
+        ksocknal_reset_handlers(tconn);
+
+        /* release the connection object */
+
+        KsCloseConnection(
+                tconn->sender.kstc_info.Handle,
+                tconn->sender.kstc_info.FileObject
+                );
+
+        /* release it's refer of it's parent's address object */
+        KsCloseAddress(
+                tconn->kstc_addr.Handle,
+                tconn->kstc_addr.FileObject
+                );
+
+        tconn->kstc_state = ksts_inited;
+
+    } else {
+        cfs_enter_debugger();
+    }
+
+    /* free the tconn structure ... */
+
+    ksocknal_free_tconn(tconn);
+}
+
+
+/*
+ * ksocknal_lock_iovs
+ *   Lock the i/o vector buffers into an MDL chain.
+ *
+ * Arguments:
+ *   iov:     the array of i/o vectors
+ *   niov:    number of i/o vectors to be locked
+ *   recving: non-zero for a receive (buffers get IoWriteAccess),
+ *            zero for a send (IoReadAccess)
+ *   len:     out: total length in bytes of the locked vectors
+ *
+ * Return Value:
+ *   ksock_mdl_t *: the head of the locked mdl chain, or
+ *         NULL pointer in failure case
+ *
+ * Notes:
+ *   On failure any mdls already built are released and *len is
+ *   left untouched.
+ */
+
+ksock_mdl_t *
+ksocknal_lock_iovs(
+    IN struct iovec  *iov,
+    IN int            niov,
+    IN int            recving,
+    IN int *          len )
+{
+    int             rc = 0;
+
+    int             i = 0;
+    int             total = 0;
+    ksock_mdl_t *   mdl = NULL;     /* head of the chain */
+    ksock_mdl_t *   tail = NULL;    /* last mdl appended */
+
+    LASSERT(iov != NULL);
+    LASSERT(niov > 0);
+    LASSERT(len != NULL);
+
+    for (i=0; i < niov; i++) {
+
+        ksock_mdl_t * Iovec = NULL;
+
+        /* lock this vector; receive buffers are written into,
+           send buffers only read from */
+        rc = ksocknal_lock_buffer(
+                iov[i].iov_base,
+                FALSE,
+                iov[i].iov_len,
+                recving ? IoWriteAccess : IoReadAccess,
+                &Iovec );
+
+        if (rc < 0) {
+            break;
+        }
+
+        /* append the new mdl to the chain */
+        if (tail) {
+            tail->Next = Iovec;
+        } else {
+            mdl = Iovec;
+        }
+
+        tail = Iovec;
+
+        total +=iov[i].iov_len;
+    }
+
+    if (rc >= 0) {
+        *len = total;
+    } else {
+        /* undo the partial chain built so far */
+        if (mdl) {
+            ksocknal_release_mdl(mdl, FALSE);
+            mdl = NULL;
+        }
+    }
+
+    return mdl;
+}
+
+/*
+ * ksocknal_query_data
+ *   Query how many bytes are pending on the tconn's receive
+ *   chain (normal or expedited queue).
+ *
+ * Arguments:
+ *   tconn:        the tdi connection (sender or child)
+ *   size:         out: TotalBytes of the selected TsduMgr
+ *   bIsExpedited: non-zero to query the expedited queue
+ *
+ * Return Value:
+ *   int: 0 on success, -EINVAL for a wrong tconn type,
+ *        -ENOTCONN if not connected
+ */
+int
+ksocknal_query_data(
+    ksock_tconn_t * tconn,
+    size_t *        size,
+    int             bIsExpedited )
+{
+    int             rc = 0;
+
+    PKS_CHAIN       KsChain;
+    PKS_TSDUMGR     KsTsduMgr;
+
+    *size = 0;
+
+    /* hold a reference while we peek at the queues */
+    ksocknal_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    /* only sender / child tconns carry receive chains */
+    if ( tconn->kstc_type != kstt_sender &&
+         tconn->kstc_type != kstt_child) {
+        rc = -EINVAL;
+        spin_unlock(&(tconn->kstc_lock));
+        goto errorout;
+    }
+
+    if (tconn->kstc_state != ksts_connected) {
+        rc = -ENOTCONN;
+        spin_unlock(&(tconn->kstc_lock));
+        goto errorout;
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    *size = KsTsduMgr->TotalBytes;
+    spin_unlock(&(tconn->kstc_lock));
+
+errorout:
+
+    ksocknal_put_tconn(tconn);
+
+    return (rc);
+}
+
+/*
+ * ksocknal_get_tcp_option
+ *   Query an option of the tcp stream connection by building an
+ *   IOCTL_TCP_QUERY_INFORMATION_EX irp and sending it to the tcp
+ *   device.
+ *
+ * Arguments:
+ *   tconn:         the tdi connection (must be connected)
+ *   ID:            option id (toi_id)
+ *   OptionValue:   buffer to store the option value
+ *   Length:        in: buffer size; out: bytes returned
+ *
+ * Return Value:
+ *   int:           ksocknal return code
+ *
+ * NOTES:
+ *   On query failure the buffer is zeroed and success is
+ *   reported anyway — callers treat the option as best-effort.
+ */
+
+int
+ksocknal_get_tcp_option (
+    ksock_tconn_t *     tconn,
+    ULONG               ID,
+    PVOID               OptionValue,
+    PULONG              Length
+    )
+{
+    NTSTATUS            Status = STATUS_SUCCESS;
+
+    IO_STATUS_BLOCK     IoStatus;
+
+    TCP_REQUEST_QUERY_INFORMATION_EX QueryInfoEx;
+
+    PFILE_OBJECT        ConnectionObject;
+    PDEVICE_OBJECT      DeviceObject = NULL;
+
+    PIRP                Irp = NULL;
+    PIO_STACK_LOCATION  IrpSp = NULL;
+
+    KEVENT              Event;
+
+    /* make sure the tdi connection is connected ? */
+
+    ksocknal_get_tconn(tconn);
+
+    if (tconn->kstc_state != ksts_connected) {
+        Status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    LASSERT(tconn->kstc_type == kstt_sender ||
+           tconn->kstc_type == kstt_child);
+
+    if (tconn->kstc_type == kstt_sender) {
+        ConnectionObject = tconn->sender.kstc_info.FileObject;
+    } else {
+        ConnectionObject = tconn->child.kstc_info.FileObject;
+    }
+
+    /* describe which per-connection option we want */
+    QueryInfoEx.ID.toi_id = ID;
+    QueryInfoEx.ID.toi_type   = INFO_TYPE_CONNECTION;
+    QueryInfoEx.ID.toi_class  = INFO_CLASS_PROTOCOL;
+    QueryInfoEx.ID.toi_entity.tei_entity   = CO_TL_ENTITY;
+    QueryInfoEx.ID.toi_entity.tei_instance = 0;
+
+    RtlZeroMemory(&(QueryInfoEx.Context), CONTEXT_SIZE);
+
+    KeInitializeEvent(&Event, NotificationEvent, FALSE);
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    Irp = IoBuildDeviceIoControlRequest(
+                IOCTL_TCP_QUERY_INFORMATION_EX,
+                DeviceObject,
+                &QueryInfoEx,
+                sizeof(TCP_REQUEST_QUERY_INFORMATION_EX),
+                OptionValue,
+                *Length,
+                FALSE,
+                &Event,
+                &IoStatus
+                );
+
+    if (Irp == NULL) {
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    /* the query must target the connection's file object, not the
+       device itself */
+    IrpSp = IoGetNextIrpStackLocation(Irp);
+
+    if (IrpSp == NULL) {
+
+        IoFreeIrp(Irp);
+        Irp = NULL;
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    IrpSp->FileObject = ConnectionObject;
+    IrpSp->DeviceObject = DeviceObject;
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    if (Status == STATUS_PENDING) {
+
+        /* wait for the irp to complete */
+        KeWaitForSingleObject(
+                &Event,
+                Executive,
+                KernelMode,
+                FALSE,
+                NULL
+                );
+
+        Status = IoStatus.Status;
+    }
+
+
+    if (NT_SUCCESS(Status)) {
+        *Length = IoStatus.Information;
+    } else {
+        /* swallow the failure: zero the result and report success */
+        cfs_enter_debugger();
+        memset(OptionValue, 0, *Length);
+        Status = STATUS_SUCCESS;
+    }
+
+errorout:
+
+    ksocknal_put_tconn(tconn);
+
+    return cfs_error_code(Status);
+}
+
+/*
+ * ksocknal_set_tcp_option
+ *   Set an option on the tcp stream connection by building an
+ *   IOCTL_TCP_SET_INFORMATION_EX irp and sending it to the tcp
+ *   device.
+ *
+ * Arguments:
+ *   tconn:     the tdi connection (must be connected)
+ *   ID:        option id (toi_id)
+ *   OptionValue: buffer containing the new option value
+ *   Length:    the length of the value
+ *
+ * Return Value:
+ *   int:       ksocknal return code
+ *
+ * NOTES:
+ *   NOTE(review): declared NTSTATUS but actually returns
+ *   cfs_error_code(Status) — an int ksocknal code, like
+ *   ksocknal_get_tcp_option; the return type is presumably meant
+ *   to be int — confirm against callers before changing.
+ *   A set failure is logged and then reported as success
+ *   (best-effort, matching the getter).
+ */
+
+NTSTATUS
+ksocknal_set_tcp_option (
+    ksock_tconn_t * tconn,
+    ULONG           ID,
+    PVOID           OptionValue,
+    ULONG           Length
+    )
+{
+    NTSTATUS            Status = STATUS_SUCCESS;
+
+    IO_STATUS_BLOCK     IoStatus;
+
+    ULONG               SetInfoExLength;
+    PTCP_REQUEST_SET_INFORMATION_EX SetInfoEx = NULL;
+
+    PFILE_OBJECT        ConnectionObject;
+    PDEVICE_OBJECT      DeviceObject = NULL;
+
+    PIRP                Irp = NULL;
+    PIO_STACK_LOCATION  IrpSp = NULL;
+
+    PKEVENT             Event;
+
+    /* make sure the tdi connection is connected ? */
+
+    ksocknal_get_tconn(tconn);
+
+    if (tconn->kstc_state != ksts_connected) {
+        Status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    LASSERT(tconn->kstc_type == kstt_sender ||
+           tconn->kstc_type == kstt_child);
+
+    if (tconn->kstc_type == kstt_sender) {
+        ConnectionObject = tconn->sender.kstc_info.FileObject;
+    } else {
+        ConnectionObject = tconn->child.kstc_info.FileObject;
+    }
+
+    /* one allocation carries the request (its Buffer[] holds the
+       option value) plus a KEVENT appended after the value */
+    SetInfoExLength =  sizeof(TCP_REQUEST_SET_INFORMATION_EX) - 1 + Length + sizeof(KEVENT);
+
+    SetInfoEx = ExAllocatePoolWithTag(
+                    NonPagedPool,
+                    SetInfoExLength,
+                    'TSSK'
+                    );
+
+    if (SetInfoEx == NULL) {
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    SetInfoEx->ID.toi_id = ID;
+
+    SetInfoEx->ID.toi_type  = INFO_TYPE_CONNECTION;
+    SetInfoEx->ID.toi_class = INFO_CLASS_PROTOCOL;
+    SetInfoEx->ID.toi_entity.tei_entity   = CO_TL_ENTITY;
+    SetInfoEx->ID.toi_entity.tei_instance = TL_INSTANCE;
+
+    SetInfoEx->BufferSize = Length;
+    RtlCopyMemory(&(SetInfoEx->Buffer[0]), OptionValue, Length);
+
+    /* the event lives right behind the copied option value */
+    Event = (PKEVENT)(&(SetInfoEx->Buffer[Length]));
+    KeInitializeEvent(Event, NotificationEvent, FALSE);
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    Irp = IoBuildDeviceIoControlRequest(
+                IOCTL_TCP_SET_INFORMATION_EX,
+                DeviceObject,
+                SetInfoEx,
+                SetInfoExLength,
+                NULL,
+                0,
+                FALSE,
+                Event,
+                &IoStatus
+                );
+
+    if (Irp == NULL) {
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    /* target the connection's file object, not the device itself */
+    IrpSp = IoGetNextIrpStackLocation(Irp);
+
+    if (IrpSp == NULL) {
+        IoFreeIrp(Irp);
+        Irp = NULL;
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    IrpSp->FileObject = ConnectionObject;
+    IrpSp->DeviceObject = DeviceObject;
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    if (Status == STATUS_PENDING) {
+
+        /* wait for the irp to complete */
+        KeWaitForSingleObject(
+                Event,
+                Executive,
+                KernelMode,
+                FALSE,
+                NULL
+                );
+
+        Status = IoStatus.Status;
+    }
+
+errorout:
+
+    if (SetInfoEx) {
+        ExFreePool(SetInfoEx);
+    }
+
+    if (!NT_SUCCESS(Status)) {
+        printk("ksocknal_set_tcp_option: error setup tcp option: ID (%d), Status = %xh\n",
+               ID, Status);
+        Status = STATUS_SUCCESS;
+    }
+
+    ksocknal_put_tconn(tconn);
+
+    return cfs_error_code(Status);
+}
+
+/*
+ * ksocknal_bind_tconn
+ *   Bind the tdi connection object with an address: a child
+ *   shares (references) its parent's address object, any other
+ *   type opens a fresh transport address.
+ *
+ * Arguments:
+ *   tconn:    tconn to be bound; must be in state ksts_inited
+ *   parent:   the parent tconn object (required for a child)
+ *   addr:     the ip address (host byte order)
+ *   port:     the port number (host byte order)
+ *
+ * Return Value:
+ *   int:   0 for success or ksocknal error codes.
+ *
+ * NOTES:
+ *   On success the tconn moves to state ksts_bind and owns
+ *   taddr (handle + referenced file object).
+ */
+
+int
+ksocknal_bind_tconn (
+    ksock_tconn_t * tconn,
+    ksock_tconn_t * parent,
+    ulong_ptr   addr,
+    unsigned short  port
+    )
+{
+    NTSTATUS            status;
+    int                 rc = 0;
+
+    ksock_tdi_addr_t    taddr;
+
+    memset(&taddr, 0, sizeof(ksock_tdi_addr_t));
+
+    if (tconn->kstc_state != ksts_inited) {
+
+        status = STATUS_INVALID_PARAMETER;
+        rc = cfs_error_code(status);
+
+        goto errorout;
+
+    } else if (tconn->kstc_type == kstt_child) {
+
+        /* a child must inherit its address from a parent */
+        if (NULL == parent) {
+            status = STATUS_INVALID_PARAMETER;
+            rc = cfs_error_code(status);
+
+            goto errorout;
+        }
+
+        /* refer it's parent's address object */
+
+        taddr = parent->kstc_addr;
+        ObReferenceObject(taddr.FileObject);
+
+        ksocknal_get_tconn(parent);
+
+    } else {
+
+        PTRANSPORT_ADDRESS TdiAddress = &(taddr.Tdi);
+        ULONG              AddrLen = 0;
+
+        /* build the tdi ip/port address in network byte order */
+
+        TdiAddress->TAAddressCount = 1;
+        TdiAddress->Address[0].AddressLength = TDI_ADDRESS_LENGTH_IP;
+        TdiAddress->Address[0].AddressType   = TDI_ADDRESS_TYPE_IP;
+
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_port = htons(port);
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->in_addr = htonl(addr);
+
+        /* sin_zero padding must be cleared */
+        memset(&(((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_zero[0]),0,8);
+
+
+        /* open the transport address object */
+
+        AddrLen = FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address) +
+                  TDI_ADDRESS_LENGTH_IP;
+
+        status = KsOpenAddress(
+                    &(tconn->kstc_dev),
+                    &(taddr.Tdi),
+                    AddrLen,
+                    &(taddr.Handle),
+                    &(taddr.FileObject)
+                    );
+
+        if (!NT_SUCCESS(status)) {
+
+            rc = cfs_error_code(status);
+            goto errorout;
+        }
+    }
+
+    if (tconn->kstc_type == kstt_child) {
+        tconn->child.kstc_parent = parent;
+    }
+
+    tconn->kstc_state = ksts_bind;
+    tconn->kstc_addr  = taddr;
+
+errorout:
+
+    return (rc);
+}
+
+/*
+ * ksocknal_build_tconn
+ *  build tcp/streaming connection to remote peer
+ *
+ * Arguments:
+ *   tconn:    tconn to be connected to the peer
+ *   addr:     the peer's ip address
+ *   port:     the peer's port number
+ *
+ * Return Value:
+ *   int:   0 for success or ksocknal error codes.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int
+ksocknal_build_tconn(
+    ksock_tconn_t *                 tconn,
+    ulong_ptr                       addr,
+    unsigned short                  port
+    )
+{
+    int                             rc = 0;
+    NTSTATUS                        status = STATUS_SUCCESS;
+
+
+    PFILE_OBJECT                    ConnectionObject = NULL;
+    PDEVICE_OBJECT                  DeviceObject = NULL;
+
+    PTDI_CONNECTION_INFORMATION     ConnectionInfo = NULL;
+    ULONG                           AddrLength;
+
+    PIRP                            Irp = NULL;
+
+    /* only a bound sender tconn may initiate a connection */
+    LASSERT(tconn->kstc_type == kstt_sender);
+    LASSERT(tconn->kstc_state == ksts_bind);
+
+    ksocknal_get_tconn(tconn);
+
+    {
+        /* set the event callbacks */
+        rc = ksocknal_set_handlers(tconn);
+
+        if (rc < 0) {
+            cfs_enter_debugger();
+            goto errorout;
+        }
+    }
+
+    /* create the connection file handle / object  */
+    status = KsOpenConnection(
+                &(tconn->kstc_dev),
+                (CONNECTION_CONTEXT)tconn,
+                &(tconn->sender.kstc_info.Handle),
+                &(tconn->sender.kstc_info.FileObject)
+                );
+
+    if (!NT_SUCCESS(status)) {
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* associate the connection with the address object of the tconn */
+
+    status = KsAssociateAddress(
+                tconn->kstc_addr.Handle,
+                tconn->sender.kstc_info.FileObject
+                );
+
+    if (!NT_SUCCESS(status)) {
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    tconn->kstc_state = ksts_associated;
+
+    /* allocate the connection info together with the remote address
+       (the TRANSPORT_ADDRESS is placed right after the structure) */
+    AddrLength = FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address)
+                 + TDI_ADDRESS_LENGTH_IP;
+
+    ConnectionInfo = (PTDI_CONNECTION_INFORMATION)ExAllocatePoolWithTag(
+    NonPagedPool, sizeof(TDI_CONNECTION_INFORMATION) + AddrLength, 'iCsK');
+
+    if (NULL == ConnectionInfo) {
+
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* Initializing ConnectionInfo ... */
+    {
+        PTRANSPORT_ADDRESS TdiAddress;
+
+        /* ConnectionInfo settings */
+
+        ConnectionInfo->UserDataLength = 0;
+        ConnectionInfo->UserData = NULL;
+        ConnectionInfo->OptionsLength = 0;
+        ConnectionInfo->Options = NULL;
+        ConnectionInfo->RemoteAddressLength = AddrLength;
+        ConnectionInfo->RemoteAddress = ConnectionInfo + 1;
+
+
+        /* initialize the tdi address of the remote peer
+           (port/address converted to network byte order) */
+
+        TdiAddress = ConnectionInfo->RemoteAddress;
+
+        TdiAddress->TAAddressCount = 1;
+        TdiAddress->Address[0].AddressLength = TDI_ADDRESS_LENGTH_IP;
+        TdiAddress->Address[0].AddressType   = TDI_ADDRESS_TYPE_IP;
+
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_port = htons(port);
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->in_addr = htonl(addr);
+
+        memset(&(((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_zero[0]),0,8);
+    }
+
+    /* Now prepare to connect the remote peer ... */
+
+    ConnectionObject = tconn->sender.kstc_info.FileObject;
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    /* allocate a new Irp */
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* setup the Irp */
+
+    TdiBuildConnect(
+            Irp,
+            DeviceObject,
+            ConnectionObject,
+            NULL,
+            NULL,
+            NULL,
+            ConnectionInfo,
+            NULL
+            );
+
+
+    /* submit the Irp to the underlying transport driver */
+    status = KsSubmitTdiIrp(
+                    DeviceObject,
+                    Irp,
+                    TRUE,
+                    NULL
+                    );
+
+    spin_lock(&(tconn->kstc_lock));
+
+    if (NT_SUCCESS(status)) {
+
+        /* Connected! the connection is built successfully. */
+
+        tconn->kstc_state = ksts_connected;
+
+        tconn->sender.kstc_info.ConnectionInfo = ConnectionInfo;
+        tconn->sender.kstc_info.Remote         = ConnectionInfo->RemoteAddress;
+
+        spin_unlock(&(tconn->kstc_lock));
+
+    } else {
+
+        /* Not connected! Abort it ... */
+
+        if (rc != 0) {
+            cfs_enter_debugger();
+        }
+
+        /* NOTE(review): the Irp is assumed to have been completed /
+           freed inside KsSubmitTdiIrp on failure — confirm; it is
+           cleared here so errorout won't free it again */
+        Irp = NULL;
+        rc = cfs_error_code(status);
+
+        tconn->kstc_state = ksts_associated;
+        spin_unlock(&(tconn->kstc_lock));
+
+        /* disassociate the connection and the address object,
+           after cleanup,  it's safe to set the state to abort ... */
+
+        if ( NT_SUCCESS(KsDisassociateAddress(
+                        tconn->sender.kstc_info.FileObject))) {
+            tconn->kstc_state = ksts_aborted;
+        }
+
+        /* reset the event callbacks.  fix: the return value used to
+           be assigned to rc, clobbering the connect error code above
+           and silently turning a failed connect into success (rc 0) */
+        (void) ksocknal_reset_handlers(tconn);
+
+        goto errorout;
+    }
+
+errorout:
+
+    if (NT_SUCCESS(status)) {
+
+        /* record the local address actually in use on the tconn */
+        ksocknal_query_local_ipaddr(tconn);
+
+    } else {
+
+        if (ConnectionInfo) {
+            ExFreePool(ConnectionInfo);
+        }
+        if (Irp) {
+            IoFreeIrp(Irp);
+        }
+    }
+
+    ksocknal_put_tconn(tconn);
+
+    return (rc);
+}
+
+
+/*
+ * ksocknal_disconnect_tconn
+ *   disconnect the tconn from a connection
+ *
+ * Arguments:
+ *   tconn: the tdi connection object, already connected
+ *   flags: flags & options for disconnecting
+ *
+ * Return Value:
+ *   int: ksocknal error code
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int
+ksocknal_disconnect_tconn(
+    ksock_tconn_t *     tconn,
+    ulong_ptr       flags
+    )
+{
+    NTSTATUS            status = STATUS_SUCCESS;
+
+    ksock_tconn_info_t * info;
+  
+    PFILE_OBJECT        ConnectionObject;
+    PDEVICE_OBJECT      DeviceObject = NULL;
+
+    PIRP                Irp = NULL;
+
+    KEVENT              Event;
+
+    ksocknal_get_tconn(tconn);
+
+    /* make sure it's connected already and it
+       must be a sender or a child ...       */
+
+    LASSERT(tconn->kstc_state == ksts_connected);
+    LASSERT( tconn->kstc_type == kstt_sender || 
+            tconn->kstc_type == kstt_child);
+
+    /* reset all the event handlers to NULL (a child shares its
+       parent's address object, so its handlers are left alone) */
+
+    if (tconn->kstc_type != kstt_child) {
+        ksocknal_reset_handlers (tconn);
+    }
+
+    /* Disconnecting to the remote peer ... */
+
+    if (tconn->kstc_type == kstt_sender) {
+        info = &(tconn->sender.kstc_info);
+    } else {
+        info = &(tconn->child.kstc_info);
+    }
+
+    ConnectionObject = info->FileObject;
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    /* allocate an Irp and setup it */
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* event signalled by KsDisconectCompletionRoutine when the
+       disconnect request completes */
+    KeInitializeEvent(
+            &Event,
+            SynchronizationEvent,
+            FALSE
+            );
+
+    TdiBuildDisconnect(
+            Irp,
+            DeviceObject,
+            ConnectionObject,
+            KsDisconectCompletionRoutine,
+            &Event,
+            NULL,
+            flags,
+            NULL,
+            NULL
+            );
+
+    /* issue the Irp to the underlying transport
+       driver to disconnect the connection    */
+
+    status = IoCallDriver(DeviceObject, Irp);
+
+    if (STATUS_PENDING == status) {
+
+        /* wait for the completion routine to fire, then pick up the
+           final status from the Irp */
+        status = KeWaitForSingleObject(
+                     &Event,
+                     Executive,
+                     KernelMode,
+                     FALSE,
+                     NULL
+                     );
+
+        status = Irp->IoStatus.Status;
+    }
+
+    KsPrint((2, "KsDisconnect: Disconnection is done with Status = %xh (%s) ...\n",
+                status, KsNtStatusToString(status)));
+
+    IoFreeIrp(Irp);
+
+    if (info->ConnectionInfo) {
+
+        /* disassociate the association between connection/address objects */
+
+        status = KsDisassociateAddress(ConnectionObject);
+
+        if (!NT_SUCCESS(status)) {
+            cfs_enter_debugger();
+        }
+
+        spin_lock(&(tconn->kstc_lock));
+
+        /* cleanup the tsdumgr Lists */
+        KsCleanupTsdu (tconn);
+
+        /* set the state of the tconn */
+        if (NT_SUCCESS(status)) {
+            tconn->kstc_state = ksts_disconnected;
+        } else {
+            tconn->kstc_state = ksts_associated;
+        }
+
+        /* free  the connection info to system pool*/
+        ExFreePool(info->ConnectionInfo);
+        info->ConnectionInfo = NULL;
+        info->Remote = NULL;
+
+        spin_unlock(&(tconn->kstc_lock));
+    }
+
+    /* the disconnect is treated as best effort: report success to
+       the caller regardless of the disassociation status above */
+    status = STATUS_SUCCESS;
+
+errorout:
+
+    ksocknal_put_tconn(tconn);
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * ksocknal_abort_tconn
+ *   The connection is broken un-expectedly. We need do
+ *   some cleanup.
+ *
+ * Arguments:
+ *   tconn: the tdi connection
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void
+ksocknal_abort_tconn(
+    ksock_tconn_t *     tconn
+    )
+{
+    PKS_DISCONNECT_WORKITEM WorkItem = NULL;
+
+    WorkItem = &(tconn->kstc_disconnect);
+
+    ksocknal_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    if (tconn->kstc_state != ksts_connected) {
+        /* nothing to abort: drop the reference taken above.
+           NOTE(review): put_tconn is invoked with kstc_lock still
+           held; confirm it can never drop the final reference here,
+           or tconn could be freed before the unlock below */
+        ksocknal_put_tconn(tconn);
+    } else {
+
+        /* queue the disconnect work item to abort the connection in
+           a system worker thread; the KS_TCONN_DISCONNECT_BUSY flag
+           guarantees it is queued at most once.  the tconn reference
+           taken above is kept for the work item — presumably released
+           by the disconnect worker; verify against its handler */
+        if (!cfs_is_flag_set(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY)) {
+
+            WorkItem->Flags = TDI_DISCONNECT_ABORT;
+            WorkItem->tconn = tconn;
+
+            cfs_set_flag(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY);
+
+            ExQueueWorkItem(
+                    &(WorkItem->WorkItem),
+                    DelayedWorkQueue
+                    );
+        }
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+}
+
+
+/*
+ * ksocknal_query_local_ipaddr
+ *   query the local connection ip address
+ *
+ * Arguments:
+ *   tconn:  the tconn which is connected
+ *
+ * Return Value:
+ *   int: ksocknal error code
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int
+ksocknal_query_local_ipaddr(
+    ksock_tconn_t *     tconn
+    )
+{
+    PFILE_OBJECT    FileObject = NULL;
+    NTSTATUS        status;
+
+    PTRANSPORT_ADDRESS TdiAddress;
+    ULONG              AddressLength;
+
+    /* only sender/child connections carry a connection file object */
+    if (tconn->kstc_type == kstt_sender) {
+        FileObject = tconn->sender.kstc_info.FileObject;
+    } else if (tconn->kstc_type == kstt_child) {
+        FileObject = tconn->child.kstc_info.FileObject;
+    } else {
+        status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    /* query the local address directly into tconn->kstc_addr */
+    TdiAddress = &(tconn->kstc_addr.Tdi);
+    AddressLength = MAX_ADDRESS_LENGTH;
+
+    status =  KsQueryIpAddress(FileObject, TdiAddress, &AddressLength);
+
+    if (NT_SUCCESS(status)) {
+
+        /* note: in_addr/sin_port are logged in network byte order */
+        KsPrint((0, "ksocknal_query_local_ipaddr: Local ip address = %xh port = %xh\n",
+                ((PTDI_ADDRESS_IP)(&(TdiAddress->Address[0].Address)))->in_addr,
+                ((PTDI_ADDRESS_IP)(&(TdiAddress->Address[0].Address)))->sin_port ));
+    } else {
+        /* fix: the message used to name a garbled non-existent routine
+           ("KsQueryonnectionIpAddress"); report this function instead */
+        KsPrint((0, "ksocknal_query_local_ipaddr: Failed to query the connection local ip address.\n"));
+    }
+
+errorout:
+
+    return cfs_error_code(status);
+}
+
+/*
+ * ksocknal_send_mdl
+ *   send MDL chain to the peer for a stream connection
+ *
+ * Arguments:
+ *   tconn: tdi connection object
+ *   tx:    the transmit context
+ *   mdl:   the mdl chain containing the data
+ *   len:   length of the data
+ *   flags: flags of the transmission
+ *
+ * Return Value:
+ *   ksocknal return code
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int
+ksocknal_send_mdl(
+    ksock_tconn_t * tconn,
+    void *          tx,
+    ksock_mdl_t *   mdl,
+    int             len,
+    int             flags
+    )
+{
+    NTSTATUS            Status;
+    int                 rc = 0;
+    ulong_ptr       length;
+    ulong_ptr       tflags;
+    ksock_tdi_tx_t *    context;
+
+    PKS_CHAIN           KsChain;
+    PKS_TSDUMGR         KsTsduMgr;
+    PKS_TSDU            KsTsdu;
+    PKS_TSDU_BUF        KsTsduBuf;
+    PKS_TSDU_DAT        KsTsduDat;
+
+    BOOLEAN             bNewTsdu = FALSE;   /* newly allocated Tsdu */
+    BOOLEAN             bNewBuff = FALSE;   /* newly allocated payload buffer */
+
+    BOOLEAN             bBuffed;            /* buffered sending */
+
+    PUCHAR              Buffer = NULL;
+    ksock_mdl_t *       NewMdl = NULL;
+
+    PIRP                Irp = NULL;
+    PFILE_OBJECT        ConnObject;
+    PDEVICE_OBJECT      DeviceObject;
+
+    BOOLEAN             bIsNonBlock;
+
+    ksocknal_get_tconn(tconn);
+
+    tflags = ksocknal_tdi_send_flags(flags);
+    bIsNonBlock  = cfs_is_flag_set(flags, MSG_DONTWAIT);
+
+    spin_lock(&tconn->kstc_lock);
+
+    LASSERT( tconn->kstc_type == kstt_sender || 
+             tconn->kstc_type == kstt_child );
+
+    if (tconn->kstc_state != ksts_connected) {
+        spin_unlock(&tconn->kstc_lock);
+        ksocknal_put_tconn(tconn);
+        return -ENOTCONN;
+    }
+
+    /* get the latest Tsdu buffer from the TsduMgr list.
+       just set NULL if the list is empty. */
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_send);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_send);
+    }
+
+    if (cfs_is_flag_set(tflags, TDI_SEND_EXPEDITED)) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    /* buffer the payload only while it still fits the send window */
+    if (KsTsduMgr->TotalBytes + len <= tconn->kstc_snd_wnd) {
+        bBuffed = TRUE;
+    } else {
+        bBuffed = FALSE;
+    }
+
+    /* do the preparation work for buffered sending */
+
+    if (bBuffed) {
+
+        /* if the data is even larger than the biggest Tsdu, we have
+           to allocate new buffer and use TSDU_TYPE_BUF to store it */
+
+        if ( KS_TSDU_STRU_SIZE((ULONG)len) > ksocknal_data.ksnd_tsdu_size
+             - KS_DWORD_ALIGN(sizeof(KS_TSDU))) {
+            bNewBuff = TRUE;
+        }
+
+        if (list_empty(&(KsTsduMgr->TsduList))) {
+
+            LASSERT(KsTsduMgr->NumOfTsdu == 0);
+            KsTsdu = NULL;
+
+        } else {
+
+            LASSERT(KsTsduMgr->NumOfTsdu > 0);
+            KsTsdu = list_entry(KsTsduMgr->TsduList.prev, KS_TSDU, Link);
+            LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+
+            /* check whether KsTsdu free space is enough, or we need alloc new Tsdu */
+            if (bNewBuff) {
+                if (sizeof(KS_TSDU_BUF) + KsTsdu->LastOffset > KsTsdu->TotalLength) {
+                    KsTsdu = NULL;
+                }
+            } else {
+                if ( KS_TSDU_STRU_SIZE((ULONG)len) >
+                     KsTsdu->TotalLength - KsTsdu->LastOffset ) {
+                    KsTsdu = NULL;
+                }
+            }
+        }
+
+        /* if there's no Tsdu or the free size is not enough for the
+           KS_TSDU_BUF or KS_TSDU_DAT. We need re-allocate a new Tsdu.  */
+
+        if (NULL == KsTsdu) {
+
+            KsTsdu = KsAllocateKsTsdu();
+
+            if (NULL == KsTsdu) {
+                /* fall back to direct (unbuffered) sending */
+                bBuffed = FALSE;
+                bNewBuff = FALSE;
+            } else {
+                bNewTsdu = TRUE;
+            }
+        }
+
+        /* process the case that a new buffer is to be allocated from system memory */
+        if (bNewBuff) {
+
+            /* now allocating internal buffer to contain the payload */
+            Buffer = ExAllocatePool(NonPagedPool, len);
+
+            if (NULL == Buffer) {
+                bBuffed = FALSE;
+            }
+        }
+    }
+
+    if (bBuffed) {
+
+        if (bNewBuff) {
+
+            /* queue a new KS_TSDU_BUF to the Tsdu buffer */
+            KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+            KsTsduBuf->TsduFlags    =  0;
+            KsTsduBuf->DataLength   =  (ULONG)len;
+            KsTsduBuf->StartOffset  =  0;
+            KsTsduBuf->UserBuffer   =  Buffer;
+        } else {
+            /* queue a new KS_TSDU_DAT to the Tsdu buffer */
+            KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+            KsTsduDat->TsduFlags    =  0;
+            KsTsduDat->DataLength   =  (ULONG)len;
+            KsTsduDat->StartOffset  =  0;
+            KsTsduDat->TotalLength  = KS_TSDU_STRU_SIZE((ULONG)len);
+
+            Buffer = &KsTsduDat->Data[0];
+        }
+
+        /* now locking the Buffer and copy user payload into the buffer */
+        ASSERT(Buffer != NULL);
+
+        rc = ksocknal_lock_buffer(Buffer, FALSE, len, IoReadAccess, &NewMdl);
+        if (rc != 0) {
+            printk("ksocknal_send_mdl: bufferred: error allocating mdl.\n");
+            bBuffed = FALSE;
+        } else {
+            ULONG BytesCopied = 0;
+            TdiCopyMdlToBuffer(mdl, 0, Buffer, 0, (ULONG)len, &BytesCopied);
+            if (BytesCopied != (ULONG) len) {
+                bBuffed = FALSE;
+            }
+        }
+
+        /* Do the finalizing job if we succeeded to lock the buffer and
+           move user data. Or we need do cleaning up ... */
+        if (bBuffed) {
+
+            if (bNewBuff) {
+                KsTsduBuf->TsduType     =  TSDU_TYPE_BUF;
+                KsTsdu->LastOffset += sizeof(KS_TSDU_BUF);
+
+            } else {
+                KsTsduDat->TsduType     =  TSDU_TYPE_DAT;
+                KsTsdu->LastOffset += KsTsduDat->TotalLength;
+            }
+
+            /* attach it to the TsduMgr list if the Tsdu is newly created. */
+            if (bNewTsdu) {
+
+                list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+                KsTsduMgr->NumOfTsdu++;
+            }
+
+        } else {
+
+            if (NewMdl) {
+                ksocknal_release_mdl(NewMdl, FALSE);
+                NewMdl = NULL;
+            }
+
+            if (bNewBuff) {
+                ExFreePool(Buffer);
+                Buffer = NULL;
+                bNewBuff = FALSE;
+            }
+        }
+    }
+
+    /* update the TotalBytes being in sending */
+    KsTsduMgr->TotalBytes += (ULONG)len;
+
+    spin_unlock(&tconn->kstc_lock);
+
+    /* cleanup the Tsdu if not successful */
+    if (!bBuffed && bNewTsdu) {
+        KsPutKsTsdu(KsTsdu);
+        bNewTsdu = FALSE;
+        KsTsdu = NULL;
+    }
+
+    /* we need allocate the ksock_tx_t structure from memory pool. */
+
+    context = cfs_alloc(sizeof(ksock_tdi_tx_t) + sizeof(KEVENT),0);
+    if (!context) {
+        /* release the chained mdl */
+        ksocknal_release_mdl(mdl, FALSE);
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        /* fix: rc was left 0 here, so the caller saw success */
+        rc = cfs_error_code(Status);
+        goto errorout;
+    }
+
+    /* initialize the TcpContext */
+
+    memset(context,0, sizeof(ksock_tdi_tx_t) + sizeof(KEVENT));
+
+    context->tconn = tconn;
+    context->Event = (PKEVENT) ((PUCHAR)context + sizeof(ksock_tdi_tx_t));
+
+    KeInitializeEvent(context->Event, SynchronizationEvent, FALSE);
+
+    if (bBuffed) {
+
+         /* for buffered transmission, we need set
+            the internal completion routine.  */
+
+        context->CompletionRoutine  = KsTcpSendCompletionRoutine;
+        context->KsTsduMgr          = KsTsduMgr;
+        context->CompletionContext  = KsTsdu;
+        context->CompletionContext2 = (bNewBuff ? (PVOID)KsTsduBuf : (PVOID)KsTsduDat);
+        context->bCounted = FALSE;
+
+    } else if (bIsNonBlock) {
+
+         /* for non-blocking transmission, we need set
+            the internal completion routine too.  */
+
+        context->CompletionRoutine = KsTcpSendCompletionRoutine;
+        context->CompletionContext = tx;
+        context->KsTsduMgr         = KsTsduMgr;
+        context->bCounted = TRUE;
+        context->ReferCount = 2;
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        ConnObject = tconn->sender.kstc_info.FileObject;
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        ConnObject = tconn->child.kstc_info.FileObject;
+    }
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnObject);
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        /* release the chained mdl */
+        ksocknal_release_mdl(mdl, FALSE);
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        /* fix: rc was left 0 here, so the caller saw success */
+        rc = cfs_error_code(Status);
+        goto errorout;
+    }
+
+    length = KsQueryMdlsSize(mdl);
+
+    LASSERT((ULONG)len <= length);
+
+    ksocknal_get_tconn(tconn);
+
+    /* fix: the last macro argument was "(ULONG)len;" — the stray
+       semicolon broke the TdiBuildSend macro invocation */
+    TdiBuildSend(
+        Irp,
+        DeviceObject,
+        ConnObject,
+        KsTcpCompletionRoutine,
+        context,
+        (bBuffed ? NewMdl : mdl),
+        (bBuffed ? (tflags | TDI_SEND_NON_BLOCKING) : tflags),
+        (ULONG)len
+        );
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    {
+        /* fix: 'context' was set to NULL right here but then
+           dereferenced below (context->ReferCount, context->Event),
+           a guaranteed NULL dereference.  keep a private pointer for
+           those accesses; clear 'context' so the errorout path won't
+           free a context now owned by the completion routine */
+        ksock_tdi_tx_t * SentContext = context;
+
+        context = NULL;
+
+        if (bBuffed) {
+            /* the Irp owns NewMdl now; the caller's mdl chain is no
+               longer needed.  forget NewMdl to avoid a double release
+               at errorout */
+            ksocknal_release_mdl(mdl, FALSE);
+            NewMdl = NULL;
+        }
+
+        if (!NT_SUCCESS(Status)) {
+            cfs_enter_debugger();
+            rc = cfs_error_code(Status);
+            goto errorout;
+        }
+
+        if (bBuffed) {
+            /* buffered data is queued: report the full length */
+            Status = STATUS_SUCCESS;
+            rc  = len;
+        } else {
+            if (bIsNonBlock) {
+                if (InterlockedDecrement(&SentContext->ReferCount) == 0) {
+                    Status = Irp->IoStatus.Status;
+                } else {
+                    Status = STATUS_PENDING;
+                }
+            } else {
+
+                if (STATUS_PENDING == Status) {
+                    /* wait for the send to complete, then pick up the
+                       final status from the Irp */
+                    Status = KeWaitForSingleObject(
+                             SentContext->Event,
+                             Executive,
+                             KernelMode,
+                             FALSE,
+                             NULL
+                             );
+
+                    if (NT_SUCCESS(Status)) {
+                        Status = Irp->IoStatus.Status;
+                    }
+                }
+            }
+
+            if (Status == STATUS_SUCCESS) {
+                rc = (int)(Irp->IoStatus.Information);
+
+                spin_lock(&tconn->kstc_lock);
+                KsTsduMgr->TotalBytes -= rc;
+                spin_unlock(&tconn->kstc_lock);
+
+            } else {
+                rc = cfs_error_code(Status);
+            }
+        }
+    }
+
+errorout:
+
+    if (bBuffed) {
+
+        if (NewMdl) {
+            ksocknal_release_mdl(NewMdl, FALSE);
+            NewMdl = NULL;
+        }
+
+        if (bNewBuff) {
+            if (!NT_SUCCESS(Status)) {
+                ExFreePool(Buffer);
+                Buffer = NULL;
+            }
+        }
+
+    } else {
+
+        if (Status != STATUS_PENDING) {
+
+            if (Irp) {
+
+                /* Freeing the Irp ... */
+
+                IoFreeIrp(Irp);
+                Irp = NULL;
+            }
+        }
+    }
+
+    if (!NT_SUCCESS(Status)) {
+
+        /* roll back the queued Tsdu data and the byte accounting */
+        spin_lock(&tconn->kstc_lock);
+
+        KsTsduMgr->TotalBytes -= (ULONG)len;
+
+        if (bBuffed) {
+
+            /* detach it from the TsduMgr list if the Tsdu was newly created. */
+            if (bNewTsdu) {
+
+                list_del(&(KsTsdu->Link));
+                KsTsduMgr->NumOfTsdu--;
+
+                KsPutKsTsdu(KsTsdu);
+            } else {
+                if (bNewBuff) {
+                    if ( (ulong_ptr)KsTsduBuf + sizeof(KS_TSDU_BUF) == 
+                         (ulong_ptr)KsTsdu + KsTsdu->LastOffset) {
+                        KsTsdu->LastOffset -= sizeof(KS_TSDU_BUF);
+                        KsTsduBuf->TsduType = 0;
+                    } else {
+                        cfs_enter_debugger();
+                        KsTsduBuf->StartOffset = KsTsduBuf->DataLength;
+                    }
+                } else {
+                    if ( (ulong_ptr)KsTsduDat + KsTsduDat->TotalLength == 
+                         (ulong_ptr)KsTsdu + KsTsdu->LastOffset) {
+                        KsTsdu->LastOffset -= KsTsduDat->TotalLength;
+                        KsTsduDat->TsduType = 0;
+                    } else {
+                        cfs_enter_debugger();
+                        KsTsduDat->StartOffset = KsTsduDat->DataLength;
+                    }
+                }
+            }
+        }
+
+        spin_unlock(&tconn->kstc_lock);
+    }
+
+    /* free the context if it was never handed to the transport */
+    if (context) {
+        cfs_free(context);
+    }
+
+    ksocknal_put_tconn(tconn);
+
+    return rc;
+}
+
+/*
+ * ksocknal_recv_mdl
+ *   Receive data from the peer for a stream connection
+ *
+ * Arguments:
+ *   tconn: tdi connection object
+ *   mdl:   the mdl chain to contain the incoming data
+ *   len:   length of the data
+ *   flags: flags of the receiving
+ *
+ * Return Value:
+ *   ksocknal return code
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int
+ksocknal_recv_mdl(
+    ksock_tconn_t * tconn,
+    ksock_mdl_t *   mdl,
+    int             size,
+    int             flags
+    )
+{
+    NTSTATUS        Status = STATUS_SUCCESS;
+    int             rc = 0;
+
+    BOOLEAN         bIsNonBlock;
+    BOOLEAN         bIsExpedited;
+
+    PKS_CHAIN       KsChain;
+    PKS_TSDUMGR     KsTsduMgr;
+    PKS_TSDU        KsTsdu;
+    PKS_TSDU_DAT    KsTsduDat;
+    PKS_TSDU_BUF    KsTsduBuf;
+    PKS_TSDU_MDL    KsTsduMdl;
+
+    PUCHAR          Buffer;
+
+    ULONG           BytesRecved = 0;
+    ULONG           RecvedOnce;
+
+    bIsNonBlock  = cfs_is_flag_set(flags, MSG_DONTWAIT);
+    bIsExpedited = cfs_is_flag_set(flags, MSG_OOB);
+
+    ksocknal_get_tconn(tconn);
+
+Again:
+
+    RecvedOnce = 0;
+
+    spin_lock(&(tconn->kstc_lock));
+
+    if ( tconn->kstc_type != kstt_sender &&
+         tconn->kstc_type != kstt_child) {
+
+        rc = -EINVAL;
+        spin_unlock(&(tconn->kstc_lock));
+
+        goto errorout;
+    }
+
+    if (tconn->kstc_state != ksts_connected) {
+
+        rc = -ENOTCONN;
+        spin_unlock(&(tconn->kstc_lock));
+
+        goto errorout;
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+NextTsdu:
+
+    if (list_empty(&(KsTsduMgr->TsduList))) {
+
+        //
+        // It's a notification event. We need reset it to
+        // un-signaled state in case there no any tsdus.
+        //
+
+        KeResetEvent(&(KsTsduMgr->Event));
+
+    } else {
+
+        KsTsdu = list_entry(KsTsduMgr->TsduList.next, KS_TSDU, Link);
+        LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+        /* remove the KsTsdu from TsduMgr list to release the lock */
+        list_del(&(KsTsdu->Link));
+        KsTsduMgr->NumOfTsdu--;
+
+        spin_unlock(&(tconn->kstc_lock));
+
+        while ((ULONG)size > BytesRecved) {
+
+            ULONG BytesCopied = 0;
+            ULONG BytesToCopy = 0;
+            ULONG StartOffset = 0;
+
+            KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduMdl = (PKS_TSDU_MDL)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+    
+            if ( TSDU_TYPE_DAT == KsTsduDat->TsduType ||
+                 TSDU_TYPE_BUF == KsTsduBuf->TsduType ) {
+
+
+                //
+                // Data Tsdu Unit ...
+                //
+
+                if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+
+                    if (cfs_is_flag_set(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING)) {
+                        /* data is not ready yet*/
+                        KeResetEvent(&(KsTsduMgr->Event));
+                        printk("ksocknal_recv_mdl: KsTsduDat (%xh) is not ready yet !!!!!!!\n", KsTsduDat);
+                        break;
+                    }
+
+                    Buffer = &KsTsduDat->Data[0];
+                    StartOffset = KsTsduDat->StartOffset;
+                    if (KsTsduDat->DataLength - KsTsduDat->StartOffset > size - BytesRecved) {
+                        /* Recvmsg requst could be statisfied ... */
+                        BytesToCopy = size - BytesRecved;
+                    } else {
+                        BytesToCopy = KsTsduDat->DataLength - KsTsduDat->StartOffset;
+                    }
+
+                } else {
+
+                    if (cfs_is_flag_set(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING)) {
+                        /* data is not ready yet*/
+                        KeResetEvent(&(KsTsduMgr->Event));
+                        DbgPrint("ksocknal_recv_mdl: KsTsduBuf (%xh) is not ready yet !!!!!!!\n", KsTsduBuf);
+                        break;
+                    }
+
+                    ASSERT(TSDU_TYPE_BUF == KsTsduBuf->TsduType);
+                    Buffer = KsTsduBuf->UserBuffer;
+                    StartOffset = KsTsduBuf->StartOffset;
+
+                    if (KsTsduBuf->DataLength - KsTsduBuf->StartOffset > size - BytesRecved) {
+                        /* Recvmsg requst could be statisfied ... */
+                        BytesToCopy = size - BytesRecved;
+                    } else {
+                        BytesToCopy = KsTsduBuf->DataLength - KsTsduBuf->StartOffset;
+                    }
+                }
+
+                if (BytesToCopy > 0) {
+                    Status = TdiCopyBufferToMdl(
+                                    Buffer,
+                                    StartOffset,
+                                    BytesToCopy,
+                                    mdl,
+                                    BytesRecved,
+                                    &BytesCopied
+                                    );
+
+                    if (NT_SUCCESS(Status)) {
+
+                        if (BytesToCopy != BytesCopied) {
+                            cfs_enter_debugger();
+                        }
+
+                        BytesRecved += BytesCopied;
+                        RecvedOnce  += BytesCopied;
+
+                    } else {
+
+                        cfs_enter_debugger();
+
+                        if (STATUS_BUFFER_OVERFLOW == Status) {
+                        }
+                    }
+                }
+
+                if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+
+                    KsTsduDat->StartOffset += BytesCopied;
+
+                    if (KsTsduDat->StartOffset == KsTsduDat->DataLength) {
+                        KsTsdu->StartOffset += KsTsduDat->TotalLength;
+                    }
+
+                } else {
+
+                    ASSERT(TSDU_TYPE_BUF == KsTsduBuf->TsduType);
+                    KsTsduBuf->StartOffset += BytesCopied;
+                    if (KsTsduBuf->StartOffset == KsTsduBuf->DataLength) {
+                        KsTsdu->StartOffset += sizeof(KS_TSDU_BUF);
+                        /* now we need release the buf to system pool */
+                        ExFreePool(KsTsduBuf->UserBuffer);
+                    }
+                }
+
+            } else if (TSDU_TYPE_MDL == KsTsduMdl->TsduType) {
+
+                //
+                // MDL Tsdu Unit ...
+                //
+
+                if (KsTsduMdl->DataLength > size - BytesRecved) {
+
+                    /* Recvmsg requst could be statisfied ... */
+
+                    BytesToCopy = size - BytesRecved;
+
+                } else {
+
+                    BytesToCopy = KsTsduMdl->DataLength;
+                }
+
+                Status = KsCopyMdlChainToMdlChain(
+                            KsTsduMdl->Mdl,
+                            KsTsduMdl->StartOffset,
+                            mdl,
+                            BytesRecved,
+                            BytesToCopy,
+                            &BytesCopied
+                            );
+
+                if (NT_SUCCESS(Status)) {
+
+                    if (BytesToCopy != BytesCopied) {
+                        cfs_enter_debugger();
+                    }
+
+                    KsTsduMdl->StartOffset += BytesCopied;
+                    KsTsduMdl->DataLength  -= BytesCopied;
+
+                    BytesRecved += BytesCopied;
+                    RecvedOnce  += BytesCopied;
+                } else {
+                    cfs_enter_debugger();
+                }
+
+                if (0 == KsTsduMdl->DataLength) {
+
+                    //
+                    // Call TdiReturnChainedReceives to release the Tsdu memory
+                    //
+
+                    TdiReturnChainedReceives(
+                        &(KsTsduMdl->Descriptor),
+                        1 );
+
+                    KsTsdu->StartOffset += sizeof(KS_TSDU_MDL);
+                }
+
+            } else {
+                printk("ksocknal_recv_mdl: unknown tsdu slot: slot = %x type = %x Start= %x\n",
+                        KsTsduDat, KsTsduDat->TsduType, KsTsduDat->StartOffset, KsTsduDat->DataLength);
+                printk("        Tsdu = %x Magic=%x: Start = %x Last = %x Length = %x",
+                        KsTsdu, KsTsdu->Magic, KsTsdu->StartOffset, KsTsdu->LastOffset, KsTsdu->TotalLength);
+                cfs_enter_debugger();
+            }
+
+            if (KsTsdu->StartOffset == KsTsdu->LastOffset) {
+
+                //
+                // KsTsdu is empty now, we need free it ...
+                //
+
+                KsPutKsTsdu(KsTsdu);
+                KsTsdu = NULL;
+
+                break;
+            }
+        }
+
+        spin_lock(&(tconn->kstc_lock));
+
+        /* we need attach the KsTsdu to the list header */
+        if (KsTsdu) {
+            KsTsduMgr->NumOfTsdu++;
+            list_add(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+        } else if ((ULONG)size > BytesRecved) {
+            goto NextTsdu;
+        }
+    }
+
+    if (KsTsduMgr->TotalBytes < RecvedOnce) {
+        cfs_enter_debugger();
+        KsTsduMgr->TotalBytes = 0;
+    } else {
+        KsTsduMgr->TotalBytes -= RecvedOnce;
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    if (NT_SUCCESS(Status)) {
+
+        if ((BytesRecved < (ulong_ptr)size) && (!bIsNonBlock)) {
+
+            KeWaitForSingleObject(
+                &(KsTsduMgr->Event),
+                Executive,
+                KernelMode,
+                FALSE,
+                NULL
+                );
+
+            goto Again;
+        }
+
+        if (bIsNonBlock && (BytesRecved == 0)) {
+            rc = -EAGAIN;
+        } else {
+            rc = BytesRecved;
+        }
+    }
+
+errorout:
+
+    ksocknal_put_tconn(tconn);
+
+    if (rc > 0) {
+        KsPrint((1, "ksocknal_recv_mdl: recvieving %d bytes ...\n", rc));
+    } else {
+        KsPrint((0, "ksocknal_recv_mdl: recvieving error code = %d Stauts = %xh ...\n", rc, Status));
+    }
+
+    /* release the chained mdl */
+    ksocknal_release_mdl(mdl, FALSE);
+
+    return (rc);
+}
+
+
+/*
+ * ksocknal_init_tdi_data
+ *   initialize the global data in ksockal_data
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   int: ksocknal error code
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int
+ksocknal_init_tdi_data()
+{
+    int rc = 0;
+
+    /* initialize tconn related globals */
+    
+    spin_lock_init(&ksocknal_data.ksnd_tconn_lock);
+    CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_tconns);
+    cfs_init_event(&ksocknal_data.ksnd_tconn_exit, TRUE, FALSE);
+
+    /* slab cache for ksock_tconn_t objects; torn down again in
+       ksocknal_fini_tdi_data() */
+    ksocknal_data.ksnd_tconn_slab = cfs_mem_cache_create(
+        "tcon", sizeof(ksock_tconn_t) , 0, 0);
+
+    if (!ksocknal_data.ksnd_tconn_slab) {
+        /* nothing else allocated yet: nothing to unwind */
+        rc = -ENOMEM;
+        goto errorout;
+    }
+
+    /* initialize tsdu related globals */
+    
+    spin_lock_init(&ksocknal_data.ksnd_tsdu_lock);
+    CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_freetsdus);
+    ksocknal_data.ksnd_tsdu_size = TDINAL_TSDU_DEFAULT_SIZE; /* 64k */
+    ksocknal_data.ksnd_tsdu_slab = cfs_mem_cache_create(
+        "tsdu", ksocknal_data.ksnd_tsdu_size, 0, 0);
+
+    if (!ksocknal_data.ksnd_tsdu_slab) {
+        rc = -ENOMEM;
+        /* unwind the tconn slab created above before failing */
+        cfs_mem_cache_destroy(ksocknal_data.ksnd_tconn_slab);
+        ksocknal_data.ksnd_tconn_slab = NULL;
+        goto errorout;
+    }
+
+    /* initialize daemon related globals */
+
+    spin_lock_init(&ksocknal_data.ksnd_daemon_lock);
+    CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_daemons);
+    cfs_init_event(&ksocknal_data.ksnd_daemon_exit, TRUE, FALSE);
+
+errorout:
+
+    return rc;
+}
+
+
+/*
+ * ksocknal_fini_tdi_data
+ *   finalize the global data in ksockal_data
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   N/A (void)
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void
+ksocknal_fini_tdi_data()
+{
+    PKS_TSDU            KsTsdu = NULL;
+    struct list_head *  list   = NULL;
+    struct list_head *  next   = NULL;
+
+    /* we need wait until all the tconn are freed */
+    spin_lock(&(ksocknal_data.ksnd_tconn_lock));
+
+    if (list_empty(&(ksocknal_data.ksnd_tconns))) {
+        cfs_wake_event(&ksocknal_data.ksnd_tconn_exit);
+    }
+    spin_unlock(&(ksocknal_data.ksnd_tconn_lock));
+
+    /* now wait on the tconn exit event */    
+    cfs_wait_event(&ksocknal_data.ksnd_tconn_exit, 0);
+
+    /* it's safe to delete the tconn slab ... */
+    cfs_mem_cache_destroy(ksocknal_data.ksnd_tconn_slab);
+    ksocknal_data.ksnd_tconn_slab = NULL;
+
+    /* clean up all the tsdu buffers in the free list.
+     * fix: list_for_each_safe is required here -- each KS_TSDU is
+     * freed inside the loop, so its embedded Link field must not be
+     * read again to advance the iteration (plain list_for_each was a
+     * use-after-free) */
+    spin_lock(&(ksocknal_data.ksnd_tsdu_lock));
+    list_for_each_safe (list, next, &ksocknal_data.ksnd_freetsdus) {
+        KsTsdu = list_entry (list, KS_TSDU, Link);
+
+        list_del (&KsTsdu->Link);
+        cfs_mem_cache_free(
+                ksocknal_data.ksnd_tsdu_slab,
+                KsTsdu );
+    }
+    spin_unlock(&(ksocknal_data.ksnd_tsdu_lock));
+
+    /* it's safe to delete the tsdu slab ... */
+    cfs_mem_cache_destroy(ksocknal_data.ksnd_tsdu_slab);
+    ksocknal_data.ksnd_tsdu_slab = NULL;
+
+    /* good! it's smooth to do the cleaning up...*/
+}
+
+/*
+ * ksocknal_create_child_tconn
+ *   Create the backlog child connection for a listener
+ *
+ * Arguments:
+ *   parent: the listener daemon connection
+ *
+ * Return Value:
+ *   the child connection or NULL in failure
+ *
+ * Notes: 
+ *   N/A
+ */
+
+ksock_tconn_t *
+ksocknal_create_child_tconn(
+    ksock_tconn_t * parent
+    )
+{
+    NTSTATUS            status;
+    ksock_tconn_t *     backlog;
+
+    /* allocate the tdi connecton object */
+    backlog = ksocknal_create_tconn();
+
+    if (!backlog) {
+        /* allocation failed: return NULL to the caller */
+        goto errorout;
+    }
+
+    /* initialize the tconn as a child */
+    ksocknal_init_child(backlog);
+
+
+    /* now bind it */
+    if (ksocknal_bind_tconn(backlog, parent, 0, 0) < 0) {
+        /* NOTE(review): this path uses ksocknal_free_tconn while the
+         * two failure paths below use ksocknal_put_tconn -- confirm
+         * both are correct for a not-yet-referenced child */
+        ksocknal_free_tconn(backlog);
+        backlog = NULL;
+        goto errorout;
+    }
+
+    /* open the connection object */
+    status = KsOpenConnection(
+                &(backlog->kstc_dev),
+                (PVOID)backlog,
+                &(backlog->child.kstc_info.Handle),
+                &(backlog->child.kstc_info.FileObject)
+                );
+            
+    if (!NT_SUCCESS(status)) {
+
+        /* drop the reference taken at creation and give up */
+        ksocknal_put_tconn(backlog);
+        backlog = NULL;
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* associate it now ... */
+    status = KsAssociateAddress(
+                backlog->kstc_addr.Handle,
+                backlog->child.kstc_info.FileObject
+                );
+
+    if (!NT_SUCCESS(status)) {
+            
+        ksocknal_put_tconn(backlog);
+        backlog = NULL;
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* the child is now bound and associated, ready to be queued */
+    backlog->kstc_state = ksts_associated;
+
+errorout:
+
+    /* NULL on any failure; a fully associated child otherwise */
+    return backlog;
+}
+
+/*
+ * ksocknal_replenish_backlogs
+ *   to replenish the backlogs listening...
+ *
+ * Arguments:
+ *   parent: the parent listening tdi connection
+ *   nbacklog: number of child connections to keep queued
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void
+ksocknal_replenish_backlogs(
+    ksock_tconn_t * parent,
+    int     nbacklog
+    )
+{
+    ksock_tconn_t * backlog;
+    int            n = 0;
+
+    /* calculate how many backlogs are needed so that listening plus
+     * already-accepted children add up to nbacklog */
+    if ( ( parent->listener.kstc_listening.num + 
+           parent->listener.kstc_accepted.num ) < nbacklog ) {
+        n = nbacklog - ( parent->listener.kstc_listening.num + 
+            parent->listener.kstc_accepted.num );
+    } else {
+        n = 0;
+    }
+
+    while (n--) {
+
+        /* create the backlog child tconn; no locks are held across
+         * this call since it allocates and performs TDI requests */
+        backlog = ksocknal_create_child_tconn(parent);
+
+        if (backlog) {
+            /* fix: the original took parent->kstc_lock once per loop
+             * iteration but released it only once after the loop,
+             * double-locking when n > 1 and unlocking a never-taken
+             * lock when n == 0; lock/unlock are now balanced per
+             * iteration */
+            spin_lock(&(parent->kstc_lock));
+            spin_lock(&backlog->kstc_lock);
+
+            /* attach it into the listening list of the daemon */
+            list_add( &backlog->child.kstc_link, 
+                      &parent->listener.kstc_listening.list );
+            parent->listener.kstc_listening.num++;
+
+            backlog->child.kstc_queued = TRUE;
+
+            spin_unlock(&backlog->kstc_lock);
+            spin_unlock(&(parent->kstc_lock));
+        } else {
+            cfs_enter_debugger();
+        }
+    }
+}
+
+/*
+ * ksocknal_start_listen
+ *   setup the listener tdi connection and make it listen
+ *    on the user specified ip address and port.
+ *
+ * Arguments:
+ *   tconn: the parent listen tdi connect
+ *   nbacklog: number of child connections to keep queued
+ *
+ * Return Value:
+ *   ksocknal error code; >= 0 on success, negative on error.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int
+ksocknal_start_listen(ksock_tconn_t *tconn, int nbacklog)
+{
+    int rc = 0;
+
+    /* now replenish the backlogs */
+    ksocknal_replenish_backlogs(tconn, nbacklog);
+
+    /* set the event callback handlers */
+    rc = ksocknal_set_handlers(tconn);
+
+    if (rc < 0) {
+        /* NOTE(review): the backlog children created above are left
+         * queued on this failure path -- confirm they are reclaimed
+         * when the listener itself is torn down */
+        return rc;
+    }
+
+    /* publish the listening state under the tconn lock so the accept
+       path (ksocknal_wait_child_tconn) sees a consistent view */
+    spin_lock(&(tconn->kstc_lock));
+    tconn->listener.nbacklog = nbacklog;
+    tconn->kstc_state = ksts_listening;
+    cfs_set_flag(tconn->kstc_flags, KS_TCONN_DAEMON_STARTED);
+    spin_unlock(&(tconn->kstc_lock));
+
+    return rc;
+}
+
+
+/*
+ * ksocknal_wait_child_tconn
+ *   accept a child connection from peer
+ *
+ * Arguments:
+ *   parent:   the daemon tdi connection listening
+ *   child:    to contain the accepted connection
+ *
+ * Return Value:
+ *   ksocknal error code;
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int
+ksocknal_wait_child_tconn(
+    ksock_tconn_t *  parent,
+    ksock_tconn_t ** child
+    )
+{
+    struct list_head * tmp;
+    ksock_tconn_t * backlog = NULL;
+
+    /* top up the pool of queued child connections first */
+    ksocknal_replenish_backlogs(parent, parent->listener.nbacklog);
+
+    spin_lock(&(parent->kstc_lock));
+
+    /* no children could be queued at all: report failure (note this
+     * returns -1, not a -errno code, unlike the libcfs_sock_* layer) */
+    if (parent->listener.kstc_listening.num <=0 ) {
+        spin_unlock(&(parent->kstc_lock));
+        return -1;
+    }
+
+/* 'again' is always entered with parent->kstc_lock held */
+again:
+
+    /* check the listening queue and search for an accepted connection */
+
+    list_for_each(tmp, &(parent->listener.kstc_listening.list)) {
+        backlog = list_entry (tmp, ksock_tconn_t, child.kstc_link);
+
+        spin_lock(&(backlog->kstc_lock));
+
+        if (backlog->child.kstc_accepted) {
+
+            LASSERT(backlog->kstc_state == ksts_connected);
+            LASSERT(backlog->child.kstc_busy);
+
+            /* move the child from the listening queue to the accepted
+             * queue; deleting mid-iteration is safe because we break
+             * out immediately afterwards */
+            list_del(&(backlog->child.kstc_link));
+            list_add(&(backlog->child.kstc_link), 
+                     &(parent->listener.kstc_accepted.list));
+            parent->listener.kstc_accepted.num++;
+            parent->listener.kstc_listening.num--;
+            backlog->child.kstc_queueno = 1;
+
+            spin_unlock(&(backlog->kstc_lock));
+
+            break;
+        } else {
+            spin_unlock(&(backlog->kstc_lock));
+            /* reset so a fruitless full scan leaves backlog == NULL */
+            backlog = NULL;
+        }
+    }
+
+    spin_unlock(&(parent->kstc_lock));
+
+    /* we need to wait until new incoming connections are requested,
+       or until the listening daemon thread is being shut down */
+    if (backlog == NULL) {
+
+        NTSTATUS    Status;
+
+        /* kstc_accept_event is signalled by the accept callback and
+           by libcfs_sock_abort_accept */
+        Status = KeWaitForSingleObject(
+                &(parent->listener.kstc_accept_event),
+                Executive,
+                KernelMode,
+                FALSE,
+                NULL
+                );
+
+        spin_lock(&(parent->kstc_lock));
+
+        /* check whether it's expected to exit ? */
+        if (!cfs_is_flag_set(parent->kstc_flags, KS_TCONN_DAEMON_STARTED)) {
+            spin_unlock(&(parent->kstc_lock));
+        } else {
+            goto again;
+        }
+    }
+
+    if (backlog) {    
+        /* query the local ip address of the connection */
+        ksocknal_query_local_ipaddr(backlog);
+    }
+
+    /* may be NULL when the daemon is shutting down; rc is still 0 */
+    *child = backlog;
+
+    return 0;
+}
+
+/* stub: interface enumeration is not implemented on the winnt port.
+ * NOTE(review): reports success without filling *up / *ip / *mask, so
+ * callers would read uninitialized values -- confirm this path is
+ * unused on winnt */
+int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask)
+{
+    return 0;
+}
+
+/* stub: reports zero interfaces; *names is never allocated */
+int libcfs_ipif_enumerate(char ***names){
+    return 0;
+}
+
+/* stub: nothing to free, since libcfs_ipif_enumerate allocates nothing */
+void libcfs_ipif_free_enumeration(char **names, int n)
+{
+}
+
+/*
+ * libcfs_sock_listen
+ *   create a listener tconn bound to ip:port and start it listening;
+ *   on success the listener is returned through *sockp.
+ */
+int libcfs_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog)
+{
+    int                     rc = 0;
+    ksock_tconn_t *         parent;
+
+    /* hand the caller NULL until we have a live listener */
+    *sockp = NULL;
+
+    parent = ksocknal_create_tconn();
+    if (!parent) {
+        rc = -ENOMEM;
+        goto errorout;
+    }
+
+    /* initialize the tconn as a listener */
+    ksocknal_init_listener(parent);
+
+    /* bind the daemon->tconn */
+    rc = ksocknal_bind_tconn(parent, NULL, ip, port);
+
+    if (rc < 0) {
+        ksocknal_free_tconn(parent);
+        goto errorout;
+    }
+
+    /* create listening children and make it to listen state */
+    rc = ksocknal_start_listen(parent, backlog);
+    if (rc < 0) {
+        /* NOTE(review): parent is not torn down here -- confirm the
+         * cleanup path for a bound-but-not-listening tconn */
+        goto errorout;
+    }
+
+    /* fix: the original never stored the listener, leaving *sockp
+     * uninitialized for the caller (e.g. the lnet acceptor) */
+    *sockp = (struct socket *)parent;
+
+errorout:
+
+    return rc;
+}
+
+int libcfs_sock_accept(struct socket **newsockp, struct socket *sock)
+{
+    int                     rc;
+    ksock_tconn_t *         child = NULL;
+
+    /* block until a child connection has been accepted, or until the
+       listener is shut down via libcfs_sock_abort_accept; child may
+       still be NULL with rc == 0 in the shutdown case */
+    rc = ksocknal_wait_child_tconn(sock, &child);
+
+    *newsockp = child;
+    return rc;
+}
+
+void libcfs_sock_abort_accept(struct socket *sock)
+{
+    spin_lock(&(sock->kstc_lock));
+
+    /* clear the daemon flag; ksocknal_wait_child_tconn re-checks it
+       after waking and exits its accept loop when it is cleared */
+    cfs_clear_flag(sock->kstc_flags, KS_TCONN_DAEMON_STARTED);
+
+    /* wake up it from the waiting on new incoming connections */
+    KeSetEvent(&sock->listener.kstc_accept_event, 0, FALSE);
+
+    spin_unlock(&(sock->kstc_lock));
+}
+
+/*
+ * libcfs_sock_connect
+ *   build a connection between the local ip/port and the peer ip/port.
+ *
+ * Arguments:
+ *   laddr: local ip address
+ *   lport: local port number
+ *   paddr: peer's ip address
+ *   pport: peer's port number
+ *
+ * Return Value:
+ *   int:   return code ...
+ *
+ * Notes: 
+ *   N/A
+ */
+
+
+int libcfs_sock_connect(struct socket **sockp, int *fatal,
+                        __u32 local_ip, int local_port,
+                        __u32 peer_ip, int peer_port)
+{
+    ksock_tconn_t * tconn = NULL;
+    int             rc = 0;
+
+    /* fix: report no socket until setup succeeds -- the original
+     * stored the already-freed tconn in *sockp on the bind-failure
+     * path, handing the caller a dangling pointer */
+    *sockp = NULL;
+
+    /* treat failures as fatal (non-retryable) by default; the
+     * original left *fatal uninitialized */
+    *fatal = 1;
+
+    /* create the tdi connecion structure */
+    tconn = ksocknal_create_tconn();
+    if (!tconn) {
+        rc = -ENOMEM;
+        goto errorout;
+    }
+
+    /* initialize the tdi sender connection */
+    ksocknal_init_sender(tconn);
+
+    /* bind the local ip address with the tconn */
+    rc = ksocknal_bind_tconn(tconn, NULL, local_ip, local_port);
+    if (rc < 0) {
+        ksocknal_free_tconn(tconn);
+        goto errorout;
+    }
+
+    /* NOTE(review): peer_ip/peer_port are never used here -- the
+     * actual TDI connect to the peer appears to be missing or done by
+     * the caller; confirm against the tdi connect helper */
+
+    *sockp = (struct socket *)tconn;
+
+errorout:
+
+    return rc;
+}
+
+/* stub: socket buffer sizing is not exposed by the TDI port; pretend
+ * success so common code proceeds with defaults */
+int libcfs_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize)
+{
+    return 0;
+}
+
+/* stub: returns success without touching *txbufsize / *rxbufsize.
+ * NOTE(review): callers would read uninitialized values -- confirm
+ * this path is unused on winnt */
+int libcfs_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize)
+{
+    return 0;
+}
+
+/*
+ * libcfs_sock_getaddr
+ *   report the remote (peer) or locally bound ip/port of a tconn in
+ *   host byte order; returns 0 on success or -ENOTCONN when no
+ *   address is available.
+ */
+int libcfs_sock_getaddr(struct socket *socket, int remote, __u32 *ip, int *port)
+{
+    PTRANSPORT_ADDRESS  taddr = NULL;
+
+    if (remote) {
+        if (socket->kstc_type == kstt_sender) {
+            taddr = socket->sender.kstc_info.Remote;
+        } else if (socket->kstc_type == kstt_child) {
+            /* fix: this branch originally re-tested kstt_sender, so
+             * an accepted (child) connection could never resolve its
+             * peer address and always returned -ENOTCONN */
+            taddr = socket->child.kstc_info.Remote;
+        }
+    } else {
+        NTSTATUS    status;
+        /* refresh the locally bound address before reporting it */
+        status = ksocknal_query_local_ipaddr(socket);
+        if (NT_SUCCESS(status)) {
+            taddr = &(socket->kstc_addr.Tdi);
+        }
+    }
+
+    if (taddr) {
+        PTDI_ADDRESS_IP addr = (PTDI_ADDRESS_IP)(&(taddr->Address[0].Address));
+        if (ip != NULL)
+            *ip = ntohl (addr->in_addr);
+        if (port != NULL)
+            *port = ntohs (addr->sin_port);
+    } else {
+        return -ENOTCONN;
+    }
+
+    return 0;
+}
+
+/* write nob bytes from buffer to the connection; returns 0 on success
+ * or a negative error. NOTE(review): the timeout argument is ignored
+ * on this port -- confirm callers can tolerate indefinite blocking */
+int libcfs_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
+{
+    int           rc;
+    ksock_mdl_t * mdl;
+
+    int           offset = 0;
+
+    while (nob > offset) {
+
+        /* lock the user buffer (probe for read access and build an
+           mdl describing the remaining bytes) */
+        rc = ksocknal_lock_buffer( (char *)buffer + offset,
+                        FALSE, nob - offset, IoReadAccess, &mdl );
+
+        if (rc < 0) {
+            return (rc);
+        }
+
+        /* send out the whole mdl; presumably ksocknal_send_mdl
+           releases the mdl on all paths (as ksocknal_recv_mdl does)
+           -- TODO confirm, else the error return below leaks it */
+        rc = ksocknal_send_mdl( sock, NULL, mdl,
+                                nob - offset, 0 );
+
+        if (rc > 0) {
+            /* partial send: loop to send the rest */
+            offset += rc;
+        } else {
+            /* 0 or negative: give up, propagating the code as-is */
+            return (rc);
+        }
+    }
+        
+    return (0);
+}
+
+/* read exactly nob bytes into buffer from the connection; returns 0 on
+ * success or a negative error. NOTE(review): the timeout argument is
+ * ignored on this port -- confirm callers tolerate blocking reads */
+int libcfs_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
+{
+    int           rc;
+    ksock_mdl_t * mdl;
+
+    int           offset = 0;
+
+    while (nob > offset) {
+
+        /* lock the user buffer (probe for write access and build an
+           mdl describing the remaining bytes) */
+        rc = ksocknal_lock_buffer( (char *)buffer + offset,
+                               FALSE, nob - offset, IoWriteAccess, &mdl );
+
+        if (rc < 0) {
+            return (rc);
+        }
+
+        /* recv the requested buffer; ksocknal_recv_mdl releases the
+           mdl itself before returning (blocking mode: last arg 0) */
+        rc = ksocknal_recv_mdl( sock, mdl, nob - offset, 0 );
+
+        if (rc > 0) {
+            /* partial read: loop to collect the rest */
+            offset += rc;
+        } else {
+            /* 0 or negative: give up, propagating the code as-is */
+            return (rc);
+        }
+    }
+
+    return (0);
+}
+
+/* stub: the tconn reference is not dropped here. NOTE(review):
+ * possible tconn leak -- confirm the reference is released by another
+ * path on this port */
+void libcfs_sock_release(struct socket *sock)
+{
+}
diff --git a/lnet/libcfs/winnt/winnt-tracefile.c b/lnet/libcfs/winnt/winnt-tracefile.c
new file mode 100644 (file)
index 0000000..42841b3
--- /dev/null
@@ -0,0 +1,258 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#define LUSTRE_TRACEFILE_PRIVATE
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+#include "tracefile.h"
+
+#ifndef get_cpu
+#define get_cpu() smp_processor_id()
+#define put_cpu() do { } while (0)
+#endif
+
+extern union trace_data_union trace_data[NR_CPUS];
+extern char *tracefile;
+extern int64_t tracefile_size;
+
+event_t     tracefile_event;
+
+void tracefile_lock_init()
+{
+    cfs_init_event(&tracefile_event, TRUE, TRUE);
+}
+
+void tracefile_read_lock()
+{
+    cfs_wait_event(&tracefile_event, 0);
+}
+
+void tracefile_read_unlock()
+{
+    cfs_wake_event(&tracefile_event);
+}
+
+void tracefile_write_lock()
+{
+    cfs_wait_event(&tracefile_event, 0);
+}
+
+void tracefile_write_unlock()
+{
+    cfs_wake_event(&tracefile_event);
+}
+
+
+inline struct trace_cpu_data *
+__trace_get_tcd(unsigned long *flags) 
+{
+       struct trace_cpu_data *ret;           
+
+       int cpu = get_cpu();                
+       local_irq_save(*flags);               
+       ret = &trace_data[cpu].tcd;     
+
+       return ret;                             
+}
+
+inline void 
+trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags)
+{
+       local_irq_restore(flags); 
+       put_cpu();               
+}
+
+void
+set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, 
+                   const int line, unsigned long stack)
+{ 
+       struct timeval tv; 
+       
+       do_gettimeofday(&tv); 
+       
+       header->ph_subsys = subsys; 
+       header->ph_mask = mask; 
+       header->ph_cpu_id = smp_processor_id(); 
+       header->ph_sec = (__u32)tv.tv_sec; 
+       header->ph_usec = tv.tv_usec; 
+       header->ph_stack = stack; 
+       header->ph_pid = current->pid; 
+       header->ph_line_num = line; 
+       header->ph_extern_pid = 0;
+       return;
+}
+
+/* emit one debug message to the system console, choosing the printk
+ * level and prefix from the message mask; messages that neither match
+ * a console class nor have printk enabled are silently dropped */
+void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, 
+                                 int len, char *file, const char *fn)
+{ 
+       char *prefix = NULL, *ptype = NULL; 
+       
+       if ((mask & D_EMERG) != 0) { 
+               prefix = "LustreError"; 
+               ptype = KERN_EMERG; 
+       } else if ((mask & D_ERROR) != 0) { 
+               prefix = "LustreError"; 
+               ptype = KERN_ERR; 
+       } else if ((mask & D_WARNING) != 0) { 
+               prefix = "Lustre"; 
+               ptype = KERN_WARNING; 
+       } else if (libcfs_printk != 0 || (mask & D_CONSOLE)) {
+               prefix = "Lustre"; 
+               ptype = KERN_INFO; 
+       } 
+
+       /* fix: when no branch above matched (printk disabled and the
+        * mask is not a console class) the original fell through and
+        * passed NULL for %s -- undefined behaviour in printk */
+       if (prefix == NULL || ptype == NULL)
+               return;
+
+       if ((mask & D_CONSOLE) != 0) {
+               printk("%s%s: %s", ptype, prefix, buf);
+       } else {
+               printk("%s%s: %d:%d:(%s:%d:%s()) %s", ptype, prefix, hdr->ph_pid, 
+                      hdr->ph_extern_pid, file, hdr->ph_line_num, fn, buf);
+       }
+       return;
+}
+
+int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage)
+{
+       return 1;
+}
+
+
+int trace_write_daemon_file(struct file *file, const char *buffer, 
+                           unsigned long count, void *data)
+{ 
+       char *name; 
+       unsigned long off; 
+       int rc; 
+       
+       name =cfs_alloc(count + 1, 0); 
+       if (name == NULL) 
+               return -ENOMEM; 
+       
+       if (copy_from_user((void *)name, (void*)buffer, count)) { 
+               rc = -EFAULT; 
+               goto out; 
+       } 
+       
+       /* be nice and strip out trailing '\n' */ 
+       for (off = count ; off > 2 && isspace(name[off - 1]); off--) 
+               ; 
+       
+       name[off] = '\0'; 
+       
+       tracefile_write_lock(); 
+       if (strcmp(name, "stop") == 0) { 
+               tracefile = NULL; 
+               trace_stop_thread(); 
+               goto out_sem; 
+       } else if (strncmp(name, "size=", 5) == 0) { 
+               tracefile_size = simple_strtoul(name + 5, NULL, 0); 
+               if (tracefile_size < 10 || tracefile_size > 20480) 
+                       tracefile_size = TRACEFILE_SIZE; 
+               else 
+                       tracefile_size <<= 20; 
+               goto out_sem; 
+       } 
+       
+#ifndef __WINNT__
+        if (name[0] != '/') {
+               rc = -EINVAL; 
+               goto out_sem; 
+       } 
+#endif
+       
+       if (tracefile != NULL) 
+               cfs_free(tracefile); 
+       
+       tracefile = name; 
+       name = NULL; 
+       printk(KERN_INFO "Lustre: debug daemon will attempt to start writing " 
+              "to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10)); 
+       
+       trace_start_thread(); 
+out_sem: 
+    tracefile_write_unlock(); 
+out:
+    if (name != NULL) 
+           cfs_free(name);
+       return count;
+}
+
+/* /proc read handler: report the current trace daemon output file */
+int trace_read_daemon_file(char *page, char **start, off_t off, int count, 
+                          int *eof, void *data)
+{ 
+       int rc; 
+       
+       tracefile_read_lock();
+       /* fix: tracefile is set to NULL when "stop" is written to the
+        * daemon file; passing NULL for %s is undefined behaviour */
+       rc = snprintf(page, count, "%s", tracefile != NULL ? tracefile : ""); 
+       tracefile_read_unlock();
+
+       return rc;
+}
+
+int trace_write_debug_mb(struct file *file, const char *buffer, 
+                        unsigned long count, void *data)
+{ 
+       char string[32]; 
+       int i; 
+       unsigned max; 
+       
+       if (count >= sizeof(string)) { 
+               printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n", 
+                      count); 
+               return -EOVERFLOW; 
+       } 
+       
+       if (copy_from_user((void *)string, (void *)buffer, count)) 
+               return -EFAULT; 
+       
+       max = simple_strtoul(string, NULL, 0); 
+       if (max == 0) 
+               return -EINVAL;
+
+       if (max > (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5 || max >= 512) { 
+               printk(KERN_ERR "Lustre: Refusing to set debug buffer size to " 
+                      "%dMB, which is more than 80%% of available RAM (%lu)\n", 
+                      max, (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5); 
+               return -EINVAL; 
+       } 
+
+       max /= smp_num_cpus; 
+       
+       for (i = 0; i < NR_CPUS; i++) { 
+               struct trace_cpu_data *tcd; 
+               tcd = &trace_data[i].tcd; 
+               tcd->tcd_max_pages = max << (20 - PAGE_SHIFT); 
+       } 
+       return count;
+}
+
+int trace_read_debug_mb(char *page, char **start, off_t off, int count,
+                                       int *eof, void *data)
+{ 
+       struct trace_cpu_data *tcd; 
+       unsigned long flags; 
+       int rc;
+                                       
+       tcd = trace_get_tcd(flags); 
+       rc = snprintf(page, count, "%lu\n", 
+                     (tcd->tcd_max_pages >> (20 - PAGE_SHIFT)) * smp_num_cpus); 
+       trace_put_tcd(tcd, flags); 
+       return rc;
+}
+
diff --git a/lnet/libcfs/winnt/winnt-usr.c b/lnet/libcfs/winnt/winnt-usr.c
new file mode 100644 (file)
index 0000000..fc6b346
--- /dev/null
@@ -0,0 +1,79 @@
+
+#ifndef __KERNEL__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <io.h>
+#include <time.h>
+#include <windows.h>
+
+void portals_debug_msg(int subsys, int mask, char *file, const char *fn,
+                              const int line, unsigned long stack,
+                              char *format, ...) {
+    }
+
+void print_last_error(char* Prefix)
+{
+    LPVOID lpMsgBuf;
+
+    FormatMessage( 
+        FORMAT_MESSAGE_ALLOCATE_BUFFER |
+        FORMAT_MESSAGE_FROM_SYSTEM |
+        FORMAT_MESSAGE_IGNORE_INSERTS,
+        NULL,
+        GetLastError(),
+        0,
+        (LPTSTR) &lpMsgBuf,
+        0,
+        NULL
+        );
+
+    printf("%s %s", Prefix, (LPTSTR) lpMsgBuf);
+
+    LocalFree(lpMsgBuf);
+}
+
+//
+// The following declarations are defined in io.h of VC
+// sys/types.h will conflict with io.h, so we need place
+// these declartions here.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+    void
+    __declspec (naked) __cdecl _chkesp(void)
+    {
+#if _X86_
+        __asm {  jz      exit_chkesp     };
+        __asm {  int     3               };
+    exit_chkesp:
+        __asm {  ret                     };
+#endif
+    }
+#ifdef __cplusplus
+}
+#endif
+
+unsigned int sleep (unsigned int seconds)
+{
+    Sleep(seconds * 1000);
+    return 0;
+}
+
+int gethostname(char * name, int namelen)
+{
+    return 0;
+}
+
+int ioctl (
+    int handle,
+    int cmd,
+    void *buffer
+    )
+{
+    printf("hello, world\n");
+    return 0;
+}
+
+#endif /* __KERNEL__ */
\ No newline at end of file
diff --git a/lnet/libcfs/winnt/winnt-utils.c b/lnet/libcfs/winnt/winnt-utils.c
new file mode 100644 (file)
index 0000000..1b7c7c5
--- /dev/null
@@ -0,0 +1,158 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+
+/*
+ * miscellaneous libcfs stuff
+ */
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <lnet/types.h>
+
+/*
+ * Convert server error code to client format. Error codes are from
+ * Linux errno.h, so for Linux client---identity.
+ */
+int convert_server_error(__u64 ecode)
+{
+       return cfs_error_code((NTSTATUS)ecode);
+}
+
+/*
+ * convert <fcntl.h> flag from client to server.
+ * 
+ * nt kernel uses several members to describe the open flags
+ * such as DesiredAccess/ShareAccess/CreateDisposition/CreateOptions
+ * so it's better to convert when using, not here.
+ */
+
+int convert_client_oflag(int cflag, int *result)
+{
+    *result = 0;
+       return 0;
+}
+
+
+int cfs_error_code(NTSTATUS Status)
+{
+    switch (Status) {
+
+        case STATUS_ACCESS_DENIED:
+            return (-EACCES);
+
+        case STATUS_ACCESS_VIOLATION:
+            return (-EFAULT);
+    
+        case STATUS_BUFFER_TOO_SMALL:
+            return (-ETOOSMALL);
+
+        case STATUS_INVALID_PARAMETER:
+            return (-EINVAL);
+
+        case STATUS_NOT_IMPLEMENTED:
+        case STATUS_NOT_SUPPORTED:
+            return (-EOPNOTSUPP);
+
+        case STATUS_INVALID_ADDRESS:
+        case STATUS_INVALID_ADDRESS_COMPONENT:
+            return (-EADDRNOTAVAIL);
+
+        case STATUS_NO_SUCH_DEVICE:
+        case STATUS_NO_SUCH_FILE:
+        case STATUS_OBJECT_NAME_NOT_FOUND:
+        case STATUS_OBJECT_PATH_NOT_FOUND:  
+        case STATUS_NETWORK_BUSY:
+        case STATUS_INVALID_NETWORK_RESPONSE:
+        case STATUS_UNEXPECTED_NETWORK_ERROR:
+            return (-ENETDOWN);
+
+        case STATUS_BAD_NETWORK_PATH:
+        case STATUS_NETWORK_UNREACHABLE:
+        case STATUS_PROTOCOL_UNREACHABLE:     
+            return (-ENETUNREACH);
+
+        case STATUS_LOCAL_DISCONNECT:
+        case STATUS_TRANSACTION_ABORTED:
+        case STATUS_CONNECTION_ABORTED:
+            return (-ECONNABORTED);
+
+        case STATUS_REMOTE_DISCONNECT:
+        case STATUS_LINK_FAILED:
+        case STATUS_CONNECTION_DISCONNECTED:
+        case STATUS_CONNECTION_RESET:
+        case STATUS_PORT_UNREACHABLE:
+            return (-ECONNRESET);
+
+        case STATUS_PAGEFILE_QUOTA:
+        case STATUS_NO_MEMORY:
+        case STATUS_CONFLICTING_ADDRESSES:
+        case STATUS_QUOTA_EXCEEDED:
+        case STATUS_TOO_MANY_PAGING_FILES:
+        case STATUS_INSUFFICIENT_RESOURCES:
+        case STATUS_WORKING_SET_QUOTA:
+        case STATUS_COMMITMENT_LIMIT:
+        case STATUS_TOO_MANY_ADDRESSES:
+        case STATUS_REMOTE_RESOURCES:
+            return (-ENOBUFS);
+
+        case STATUS_INVALID_CONNECTION:
+            return (-ENOTCONN);
+
+        case STATUS_PIPE_DISCONNECTED:
+            return (-ESHUTDOWN);
+
+        case STATUS_TIMEOUT:
+        case STATUS_IO_TIMEOUT:
+        case STATUS_LINK_TIMEOUT:
+            return (-ETIMEDOUT);
+
+        case STATUS_REMOTE_NOT_LISTENING:
+        case STATUS_CONNECTION_REFUSED:
+            return (-ECONNREFUSED);
+
+        case STATUS_HOST_UNREACHABLE:
+            return (-EHOSTUNREACH);
+
+        case STATUS_PENDING:
+        case STATUS_DEVICE_NOT_READY:
+            return (-EAGAIN);
+
+        case STATUS_CANCELLED:
+        case STATUS_REQUEST_ABORTED:
+            return (-EINTR);
+
+        case STATUS_BUFFER_OVERFLOW:
+        case STATUS_INVALID_BUFFER_SIZE:
+            return (-EMSGSIZE);
+
+    }
+
+    if (NT_SUCCESS(Status)) 
+        return 0;
+
+    return (-EINVAL);
+}
+
+
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{
+}
+
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+    return NULL;
+}
\ No newline at end of file
index 60c304b..2f00d64 100644 (file)
@@ -5,11 +5,11 @@
        <key>CFBundleDevelopmentRegion</key>
        <string>English</string>
        <key>CFBundleExecutable</key>
-       <string>portals</string>
+       <string>lnet</string>
        <key>CFBundleIconFile</key>
        <string></string>
        <key>CFBundleIdentifier</key>
-       <string>com.clusterfs.lustre.portals</string>
+       <string>com.clusterfs.lustre.lnet</string>
        <key>CFBundleInfoDictionaryVersion</key>
        <string>6.0</string>
        <key>CFBundlePackageType</key>
index 56d8308..88692ec 100644 (file)
@@ -328,8 +328,8 @@ lnet_acceptor(void *arg)
         }
 
        snprintf(name, sizeof(name), "acceptor_%03d", accept_port);
-       libcfs_daemonize(name);
-       libcfs_blockallsigs();
+       cfs_daemonize(name);
+       cfs_block_allsigs();
 
        rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock,
                                0, accept_port, accept_backlog);
index f2f31bb..9ce40fe 100644 (file)
@@ -19,16 +19,16 @@ modulenet_DATA = lnet$(KMODEXT)
 endif # LINUX
 
 if DARWIN
-macos_PROGRAMS := portals
+macos_PROGRAMS := lnet
 
-portals_SOURCES := api-errno.c api-ni.c config.c
-portals_SOURCES += lib-me.c lib-msg.c lib-eq.c lib-md.c
-portals_SOURCES += lib-move.c module.c lo.c
-portals_SOURCES += router.c acceptor.c
+lnet_SOURCES := api-errno.c api-ni.c config.c
+lnet_SOURCES += lib-me.c lib-msg.c lib-eq.c lib-md.c
+lnet_SOURCES += lib-move.c module.c lo.c router.c router_proc.c
+lnet_SOURCES += acceptor.c peer.c
 
-portals_CFLAGS := $(EXTRA_KCFLAGS)
-portals_LDFLAGS := $(EXTRA_KLDFLAGS)
-portals_LDADD := $(EXTRA_KLIBS)
+lnet_CFLAGS := $(EXTRA_KCFLAGS)
+lnet_LDFLAGS := $(EXTRA_KLDFLAGS)
+lnet_LDADD := $(EXTRA_KLIBS)
 
 plist_DATA := Info.plist
 
index f8986a5..aa4e52d 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 #define DEBUG_SUBSYSTEM S_LNET
+#include <libcfs/libcfs.h>
 #include <lnet/lib-lnet.h>
 
 typedef struct {                                /* tmp struct for parsing routes */
index cafcb86..ec0077c 100644 (file)
@@ -58,6 +58,8 @@ LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback,
                 LNET_LOCK();
                 lnet_eq_free (eq);
                 LNET_UNLOCK();
+
+                return -ENOMEM;
         }
 
         /* NB this resets all event sequence numbers to 0, to be earlier
@@ -221,13 +223,15 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
                 LNET_UNLOCK();
 
                 if (timeout_ms < 0) {
-                        cfs_waitq_wait (&wl);
+                        cfs_waitq_wait (&wl, CFS_TASK_INTERRUPTIBLE);
                 } else { 
                         struct timeval tv;
 
                         now = cfs_time_current();
-                        cfs_waitq_timedwait(&wl, cfs_time_seconds(timeout_ms)/1000);
-                        cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv); 
+                        cfs_waitq_timedwait(&wl, CFS_TASK_INTERRUPTIBLE,
+                                            cfs_time_seconds(timeout_ms)/1000);
+                        cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), 
+                                          &tv); 
                         timeout_ms -= tv.tv_sec * 1000 + tv.tv_usec / 1000;
                         if (timeout_ms < 0)
                                 timeout_ms = 0;
index 22d49cf..7941bfd 100644 (file)
@@ -310,8 +310,8 @@ lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset,
                                siov->iov_len - soffset);
                 this_nob = MIN(this_nob, nob);
 
-                memcpy (diov->iov_base + doffset,
-                        siov->iov_base + soffset, this_nob);
+                memcpy ((char *)diov->iov_base + doffset,
+                        (char *)siov->iov_base + soffset, this_nob);
                 nob -= this_nob;
 
                 if (diov->iov_len > doffset + this_nob) {
@@ -553,7 +553,7 @@ lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset
                         addr = ((char *)cfs_kmap(kiov->kiov_page)) + 
                                 kiov->kiov_offset + kiovoffset;
 
-                memcpy (iov->iov_base + iovoffset, addr, this_nob);
+                memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob);
                 nob -= this_nob;
 
                 if (iov->iov_len > iovoffset + this_nob) {
@@ -622,7 +622,7 @@ lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffs
                         addr = ((char *)cfs_kmap(kiov->kiov_page)) + 
                                 kiov->kiov_offset + kiovoffset;
 
-                memcpy (addr, iov->iov_base + iovoffset, this_nob);
+                memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob);
                 nob -= this_nob;
 
                 if (kiov->kiov_len > kiovoffset + this_nob) {
@@ -1331,8 +1331,8 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
         unsigned int      rlength = hdr->payload_length;
         unsigned int      mlength = 0;
         unsigned int      offset = 0;
-        lnet_process_id_t src = {.nid = hdr->src_nid,
-                                 .pid = hdr->src_pid};
+        lnet_process_id_t src = {/* .nid = */ hdr->src_nid,
+                                 /* .pid = */ hdr->src_pid};
         lnet_libmd_t     *md;
 
         /* Convert put fields to host byte order */
@@ -1379,8 +1379,8 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
         lnet_hdr_t        *hdr = &msg->msg_hdr;
         unsigned int       mlength = 0;
         unsigned int       offset = 0;
-        lnet_process_id_t  src = {.nid = hdr->src_nid,
-                                  .pid = hdr->src_pid};
+        lnet_process_id_t  src = {/* .nid = */ hdr->src_nid,
+                                  /* .pid = */ hdr->src_pid};
         lnet_handle_wire_t reply_wmd;
         lnet_libmd_t      *md;
         int                rc;
@@ -1444,8 +1444,8 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
 {
         void             *private = msg->msg_private;
         lnet_hdr_t       *hdr = &msg->msg_hdr;
-        lnet_process_id_t src = {.nid = hdr->src_nid,
-                                 .pid = hdr->src_pid};
+        lnet_process_id_t src = {/* .nid = */ hdr->src_nid,
+                                 /* .pid = */ hdr->src_pid};
         lnet_libmd_t     *md;
         int               rlength;
         int               mlength;
@@ -1515,8 +1515,8 @@ static int
 lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg)
 {
         lnet_hdr_t       *hdr = &msg->msg_hdr;
-        lnet_process_id_t src = {.nid = hdr->src_nid,
-                                 .pid = hdr->src_pid};
+        lnet_process_id_t src = {/* .nid = */ hdr->src_nid,
+                                 /* .pid = */ hdr->src_pid};
         lnet_libmd_t    *md;
 
         /* Convert ack fields to host byte order */
@@ -1586,10 +1586,10 @@ lnet_msgtyp2str (int type)
 void
 lnet_print_hdr(lnet_hdr_t * hdr)
 {
-        lnet_process_id_t src = {.nid = hdr->src_nid,
-                                 .pid = hdr->src_pid};
-        lnet_process_id_t dst = {.nid = hdr->dest_nid,
-                                 .pid = hdr->dest_pid};
+        lnet_process_id_t src = {/* .nid = */ hdr->src_nid,
+                                 /* .pid = */ hdr->src_pid};
+        lnet_process_id_t dst = {/* .nid = */ hdr->dest_nid,
+                                 /* .pid = */ hdr->dest_pid};
         char *type_str = lnet_msgtyp2str (hdr->type);
 
         CWARN("P3 Header at %p of type %s\n", hdr, type_str);
index 33e374f..5af6938 100644 (file)
@@ -95,10 +95,17 @@ lolnd_startup (lnet_ni_t *ni)
 }
 
 lnd_t the_lolnd = {
-        .lnd_type       = LOLND,
-        .lnd_startup    = lolnd_startup,
-        .lnd_shutdown   = lolnd_shutdown,
-        .lnd_send       = lolnd_send,
-        .lnd_recv       = lolnd_recv,
+        /* .lnd_list       = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list},
+        /* .lnd_refcount   = */ 0,
+        /* .lnd_type       = */ LOLND,
+        /* .lnd_startup    = */ lolnd_startup,
+        /* .lnd_shutdown   = */ lolnd_shutdown,
+        /* .lnd_ctl        = */ NULL, 
+        /* .lnd_send       = */ lolnd_send,
+        /* .lnd_recv       = */ lolnd_recv,
+        /* .lnd_eager_recv = */ NULL,
+        /* .lnd_notify     = */ NULL,
+        /* .lnd_accept
+           .lnd_wait       = */ NULL
 };
 
index 03dc839..ab9b9ac 100644 (file)
@@ -23,7 +23,7 @@
 
 #include <lnet/lib-lnet.h>
 
-#ifdef __KERNEL__
+#if defined(__KERNEL__) && defined(LNET_ROUTER)
 
 static char *forwarding = "";
 CFS_MODULE_PARM(forwarding, "s", charp, 0444,
@@ -55,6 +55,9 @@ void
 kpr_do_upcall (void *arg)
 {
         kpr_upcall_t *u = (kpr_upcall_t *)arg;
+
+#ifndef __WINNT__
+
         char          nidstr[36];
         char          whenstr[36];
         char         *argv[] = {
@@ -70,6 +73,8 @@ kpr_do_upcall (void *arg)
 
         libcfs_run_upcall (argv);
 
+#endif /* __WINNT__ */
+
         LIBCFS_FREE(u, sizeof(*u));
 }
 
@@ -480,7 +487,7 @@ lnet_get_route (int idx, __u32 *net, __u32 *hops,
         return -ENOENT;
 }
 
-#ifdef __KERNEL__
+#if defined(__KERNEL__) && defined(LNET_ROUTER)
 
 void
 lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
@@ -488,7 +495,7 @@ lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
         int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
 
         while (--npages >= 0)
-                __free_page(rb->rb_kiov[npages].kiov_page);
+                cfs_free_page(rb->rb_kiov[npages].kiov_page);
 
         LIBCFS_FREE(rb, sz);
 }
@@ -507,10 +514,10 @@ lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp)
         rb->rb_pool = rbp;
 
         for (i = 0; i < npages; i++) {
-                page = alloc_page(GFP_KERNEL); /* HIGH? */
+                page = cfs_alloc_page(CFS_ALLOC_ZERO /*GFP_KERNEL*/); /* HIGH? */
                 if (page == NULL) {
                         while (--i >= 0)
-                                __free_page(rb->rb_kiov[i].kiov_page);
+                                cfs_free_page(rb->rb_kiov[i].kiov_page);
 
                         LIBCFS_FREE(rb, sz);
                         return NULL;
index 1cdab53..5996319 100644 (file)
  *
  */
 
+#include <libcfs/libcfs.h>
 #include <lnet/lib-lnet.h>
 
-#ifdef __KERNEL__
+#if defined(__KERNEL__) && defined(LNET_ROUTER)
 
 #include <linux/seq_file.h>
 #include <linux/lustre_compat25.h>
@@ -81,7 +82,7 @@ lnet_router_proc_stats_write(struct file *file, const char *ubuffer,
 }
 
 typedef struct {
-        unsigned long long   lrsi_version;
+        __u64                lrsi_version;
         lnet_remotenet_t    *lrsi_net;
         lnet_route_t        *lrsi_route;
         loff_t               lrsi_off;
index 28ec36e..f714081 100644 (file)
@@ -90,8 +90,8 @@ static void pingcli_callback(lnet_event_t *ev)
 {
         int i;
         unsigned magic;
-        i = __le32_to_cpu(*(int *)(ev->md.start + ev->offset + sizeof(unsigned)));
-        magic = __le32_to_cpu(*(int *)(ev->md.start + ev->offset));
+        i = __le32_to_cpu(*(int *)((char *)ev->md.start + ev->offset + sizeof(unsigned)));
+        magic = __le32_to_cpu(*(int *)((char *)ev->md.start + ev->offset));
 
         if(magic != 0xcafebabe) {
                 CERROR("Unexpected response %x\n", magic);
diff --git a/lnet/tests/ping_cli/winnt-pingcli.c b/lnet/tests/ping_cli/winnt-pingcli.c
new file mode 100644 (file)
index 0000000..7c9a1a1
--- /dev/null
@@ -0,0 +1,634 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Matt Wu <mattwu@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+/*
+ *  Included Headers 
+ */
+
+
+#include <libcfs/libcfs.h>
+
+
+/* libcfs module init/exit routines */
+DECLARE_INIT(init_libcfs_module);
+DECLARE_EXIT(exit_libcfs_module);
+
+/* portal module init/exit routines */
+DECLARE_INIT(init_lnet);
+DECLARE_EXIT(fini_lnet);
+
+/* tdinal module init/exit routines */
+DECLARE_INIT(ksocknal_module_init);
+DECLARE_EXIT(ksocknal_module_fini);
+
+/* pingcli module init/exit routines */
+DECLARE_INIT(pingcli_init);
+DECLARE_EXIT(pingcli_cleanup);
+
+
+/* pingsrv module init/exit routines */
+DECLARE_INIT(pingsrv_init);
+DECLARE_EXIT(pingsrv_cleanup);
+
+/*
+ * structure definitions
+ */
+
+
+#define LUSTRE_PING_VERSION   0x00010000               /* ping srv/cli version: 0001.0000 */
+
+#define LUSTRE_PING_DEVICE    L"\\Device\\LNET"     /* device object name */
+#define LUSTRE_PING_SYMLNK    L"\\DosDevices\\LNET" /* user-visible name for the device*/
+
+typedef struct _DEVICE_EXTENSION
+{
+    BOOLEAN    bProcFS;
+
+} DEVICE_EXTENSION, *PDEVICE_EXTENSION;
+
+
+/*
+ *  global definitions
+ */
+
+PDEVICE_OBJECT  PingObject = NULL;  /* ping device object */
+PDEVICE_OBJECT  ProcObject = NULL;  /* procfs emulator device */
+
+
+/*
+ *  common routines
+ */
+
+
+//
+// complete Irp request ...
+//
+
+NTSTATUS
+UTCompleteIrp(
+    PIRP        Irp,
+    NTSTATUS    Status,
+    ULONG       Info
+    )
+{
+    Irp->IoStatus.Status = Status;
+    Irp->IoStatus.Information = Info;
+    IoCompleteRequest(Irp,IO_NO_INCREMENT);
+
+    return Status;
+}
+
+//
+//  Open/Create Device ...
+//
+
+NTSTATUS
+UTCreate(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    KdPrint(("UTCreate: DeviceCreate ...\n"));
+
+    return UTCompleteIrp(Irp,STATUS_SUCCESS,0);
+}
+
+//
+// Close Devcie ...
+//
+
+NTSTATUS
+UTClose(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp)
+{
+    KdPrint(("UTClose: Device Closed.\n"));
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+
+
+NTSTATUS
+UTShutdown(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    KdPrint(("UTShutdown: shuting TdiSock ...\n"));
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+//
+// driver frame Routines ...
+//
+
+
+NTSTATUS
+UTDeviceControl(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status = STATUS_INVALID_DEVICE_REQUEST;
+    PIO_STACK_LOCATION  IrpSp;
+
+    ULONG               ControlCode;
+    ULONG               InputLength;
+    ULONG               OutputLength;
+
+    PVOID               lpvInBuffer;
+
+    KdPrint(("UTDeviceControl: Device Ioctl ...\n"));
+
+    Irp->IoStatus.Information = 0;
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    ControlCode  = IrpSp->Parameters.DeviceIoControl.IoControlCode;
+    InputLength  = IrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    lpvInBuffer  = Irp->AssociatedIrp.SystemBuffer;
+
+    ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL);
+
+    switch (ControlCode)
+    {
+        case IOCTL_LIBCFS_VERSION:
+
+            *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION);
+            Irp->IoStatus.Information = sizeof(ULONG);
+            Status = STATUS_SUCCESS;
+            break;
+
+        default:
+            break;
+    }
+
+    Irp->IoStatus.Status = Status;
+
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    KdPrint(("UTDeviceControl: Device Ioctl returned.\n"));
+
+    return Status;
+}
+
+NTSTATUS
+ProcCreate(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS                    Status;
+    PIO_STACK_LOCATION          IrpSp;
+
+    FILE_FULL_EA_INFORMATION *  ea;
+    cfs_file_t *                fp;
+
+    KdPrint(("ProcCreate: Proc device is being opened ...\n"));
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+    ea = (PFILE_FULL_EA_INFORMATION) Irp->AssociatedIrp.SystemBuffer;
+
+    if (!ea) {
+        Status = STATUS_INVALID_PARAMETER;
+    } else {
+        fp = lustre_open_file(&ea->EaName[0]);
+        if (!fp) {
+            Status = STATUS_OBJECT_NAME_NOT_FOUND;
+        } else {
+            IrpSp->FileObject->FsContext = fp;
+            IrpSp->FileObject->FsContext2 = fp->private_data;
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+    return UTCompleteIrp(Irp, Status, 0);
+}
+
+//
+// Close Devcie ...
+//
+
+NTSTATUS
+ProcClose(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp)
+{
+    PIO_STACK_LOCATION          IrpSp;
+
+    cfs_file_t *                fp;
+
+    KdPrint(("ProcClose: Proc device object is to be closed.\n"));
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+    ASSERT(fp != NULL);
+    ASSERT(IrpSp->FileObject->FsContext2 == fp->private_data);
+
+    lustre_close_file(fp);
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+/*
+ * proc frame routines
+ */
+
+NTSTATUS
+ProcDeviceControl(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status = STATUS_INVALID_DEVICE_REQUEST;
+    PIO_STACK_LOCATION  IrpSp;
+
+    ULONG               ControlCode;
+    ULONG               InputLength;
+    ULONG               OutputLength;
+
+    PVOID               lpvInBuffer;
+
+    KdPrint(("ProcDeviceControl: Proc device ioctling ...\n"));
+
+    Irp->IoStatus.Information = 0;
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    ControlCode  = IrpSp->Parameters.DeviceIoControl.IoControlCode;
+    InputLength  = IrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    lpvInBuffer  = Irp->AssociatedIrp.SystemBuffer;
+
+    ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL);
+
+    switch (ControlCode)
+    {
+        case IOCTL_LIBCFS_VERSION:
+
+            *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION);
+            Irp->IoStatus.Information = sizeof(ULONG);
+
+            Status = STATUS_SUCCESS;
+
+            break;
+
+        case IOCTL_LIBCFS_ENTRY:
+        {
+            int rc = 0;
+            cfs_file_t * fp;
+
+            fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+            if (!fp) {
+                rc = -EINVAL;
+            } else {
+                rc = lustre_ioctl_file(fp, (PCFS_PROC_IOCTL) (lpvInBuffer));
+            }
+
+            if (rc == 0) {
+                Irp->IoStatus.Information = InputLength;
+                Status = STATUS_SUCCESS;
+            }
+        }    
+    }
+
+    Irp->IoStatus.Status = Status;
+
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    KdPrint(("ProcDeviceControl: Proc device ioctl returned with status = %xh.\n", Status));
+
+    return Status;
+}
+
+
+
+NTSTATUS
+ProcReadWrite (PDEVICE_OBJECT DeviceObject, PIRP Irp)
+{
+    PIO_STACK_LOCATION  IrpSp;
+    NTSTATUS            Status;
+
+    cfs_file_t *        fp;
+    int                 rc = 0;
+    PCHAR               buf;
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+    if (Irp->MdlAddress) {
+        buf = MmGetSystemAddressForMdlSafe(
+                        Irp->MdlAddress,
+                        NormalPagePriority);
+    } else {
+        buf = Irp->AssociatedIrp.SystemBuffer;
+    }
+
+    if (buf == NULL) {
+        Status = STATUS_SUCCESS;
+        rc = 0;
+    } else {
+        fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+        if (!fp) {
+            Status = STATUS_INVALID_PARAMETER;
+            goto errorout;
+        }
+
+        if (IrpSp->MajorFunction == IRP_MJ_READ) {
+            rc = lustre_read_file(
+                    fp, IrpSp->Parameters.Read.ByteOffset.LowPart,
+                    IrpSp->Parameters.Read.Length, buf);
+        } else {
+            rc = lustre_write_file(
+                    fp, IrpSp->Parameters.Write.ByteOffset.LowPart,
+                    IrpSp->Parameters.Write.Length, buf);
+        }
+        if (rc < 0) {
+            cfs_enter_debugger();
+            Status = STATUS_UNSUCCESSFUL;
+        } else {
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+errorout:
+    return UTCompleteIrp(Irp, Status, rc);
+}
+
+
+//
+//  common dispatch routines
+//
+
+NTSTATUS
+UTDispatchRequest(
+    IN PDEVICE_OBJECT DeviceObject,
+    IN PIRP           Irp
+    )
+{
+    NTSTATUS            Status;
+    PIO_STACK_LOCATION  IrpSp;
+
+    Status = STATUS_INVALID_DEVICE_REQUEST;
+
+    __try {
+
+        IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+        switch (IrpSp->MajorFunction) {
+
+            case IRP_MJ_CREATE:
+                if (DeviceObject == PingObject) {
+                    Status = UTCreate(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcCreate(DeviceObject, Irp);
+                }
+                break;
+        
+            case IRP_MJ_CLOSE:
+                if (DeviceObject == PingObject) {
+                    Status = UTClose(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcClose(DeviceObject, Irp);
+                }
+                break;
+
+            case IRP_MJ_READ:
+            case IRP_MJ_WRITE:
+                if (DeviceObject == ProcObject) {
+                    Status = ProcReadWrite(DeviceObject, Irp);
+                }
+                break;
+        
+            case IRP_MJ_DEVICE_CONTROL:
+                if (DeviceObject == PingObject) {
+                    Status = UTDeviceControl(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcDeviceControl(DeviceObject, Irp);
+                }
+                break;
+
+            case IRP_MJ_SHUTDOWN:
+                Status = UTShutdown(DeviceObject, Irp);
+                break;
+
+            default:
+
+                KdPrint(("UTDispatchRequest: Major Function: %xh is not supported.\n",
+                           IrpSp->MajorFunction));
+                UTCompleteIrp(Irp, Status, 0);
+                break;
+        }
+    }
+
+    __finally {
+    }
+
+    return Status;
+}
+
+//
+// create a device object and a dosdevice symbol link
+//
+
+PDEVICE_OBJECT
+CreateDevice(
+    IN PDRIVER_OBJECT   DriverObject,
+    IN PWCHAR           DeviceName,
+    IN PWCHAR           SymlnkName,
+    IN BOOLEAN          bProcFS
+    )
+{
+    NTSTATUS            Status;
+
+    UNICODE_STRING      NtDevName;
+    UNICODE_STRING      Win32DevName;
+
+    PDEVICE_EXTENSION   DeviceExtension;
+    PDEVICE_OBJECT      DeviceObject;
+
+    /* create the device object with the specified name */
+
+    RtlInitUnicodeString(&NtDevName, DeviceName);
+    
+    Status = IoCreateDevice(
+                    DriverObject,
+                    sizeof(DEVICE_EXTENSION),
+                    &NtDevName,
+                    FILE_DEVICE_UNKNOWN,
+                    0,
+                    FALSE,
+                    &DeviceObject );
+        
+    if (!NT_SUCCESS(Status)) {
+
+        cfs_enter_debugger();
+        return NULL;
+    }
+
+    /* create the symlink to make the device visible to user */
+
+    RtlInitUnicodeString(&Win32DevName, SymlnkName);
+        
+    Status = IoCreateSymbolicLink(&Win32DevName, &NtDevName);
+
+    if (!NT_SUCCESS(Status)) {
+
+        IoDeleteDevice(DeviceObject);
+        return NULL;
+    }
+
+    DeviceExtension = (PDEVICE_EXTENSION)DeviceObject->DeviceObjectExtension;
+    DeviceExtension->bProcFS = bProcFS;
+
+    DeviceObject->Flags |= DO_BUFFERED_IO;
+    DeviceObject->Flags &= ~DO_DEVICE_INITIALIZING;
+
+    return DeviceObject;
+}
+
+
+//
+// DriverEntry
+//
+
+NTSTATUS DriverEntry(
+    IN PDRIVER_OBJECT  DriverObject,
+    IN PUNICODE_STRING RegistryPath 
+    )
+{
+    KdPrint(("Lustre ping test: Build Time: " __DATE__ " " __TIME__ "\n"));
+    KdPrint(("Lustre ping test: DriverEntry ... \n"));
+
+    /* initialize libcfs module */
+    if (module_init_libcfs_module() != 0) {
+        KdPrint(("ping: error initialize module: libcfs ...\n"));
+        goto errorout;
+    }
+
+    /* initialize lnet module */
+    if (module_init_lnet() != 0) {
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: lnet ...\n"));
+        goto errorout;
+    }
+
+    /* initialize tdinal module */
+    if (module_ksocknal_module_init() != 0) {
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: tdilnd ...\n"));
+        goto errorout;
+    }
+
+#if defined(LUSTRE_PING_CLI)
+    /* initialize pingcli module */
+    if (module_pingcli_init() != 0) {
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: pingcli ...\n"));
+        goto errorout;
+    }
+#endif
+
+#if defined(LUSTRE_PING_SRV)
+    /* initialize pingsrv module */
+    if (module_pingsrv_init() != 0) {
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: pingsrv ...\n"));
+        goto errorout;
+    }
+#endif
+
+    /* create the ping device object */
+    PingObject = CreateDevice(
+                        DriverObject,
+                        LUSTRE_PING_DEVICE,
+                        LUSTRE_PING_SYMLNK,
+                        FALSE );
+    if (!PingObject) {
+#if defined(LUSTRE_PING_CLI)
+        module_pingcli_cleanup();
+#endif
+#if defined(LUSTRE_PING_SRV)
+        module_pingsrv_cleanup();
+#endif
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+
+    /* create the libcfs proc fs emultor device object */
+    ProcObject  = CreateDevice(
+                        DriverObject,
+                        LUSTRE_PROC_DEVICE,
+                        LUSTRE_PROC_SYMLNK,
+                        TRUE );
+    if (!ProcObject) {
+
+        IoDeleteDevice(PingObject);
+#if defined(LUSTRE_PING_CLI)
+        module_pingcli_cleanup();
+#endif
+#if defined(LUSTRE_PING_SRV)
+        module_pingsrv_cleanup();
+#endif
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+
+    /* initialize the driver callback routines */
+
+    DriverObject->MajorFunction[IRP_MJ_CREATE]          = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_CLOSE]           = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_READ]            = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_WRITE]           = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_SHUTDOWN]        = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_DEVICE_CONTROL]  = UTDispatchRequest;
+
+    return STATUS_SUCCESS;
+
+errorout:
+
+    cfs_enter_debugger();
+
+    return STATUS_UNSUCCESSFUL;
+}
index c50a749..ab5af5f 100644 (file)
@@ -98,7 +98,7 @@ int pingsrv_thread(void *arg)
                         continue;
                 }
                
-                magic =  __le32_to_cpu(*((int *)(server->evnt.md.start 
+                magic =  __le32_to_cpu(*((int *)((char *)server->evnt.md.start 
                                         + server->evnt.offset)));
                 
                 
@@ -170,9 +170,9 @@ static void pingsrv_callback(lnet_event_t *ev)
                "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n",
                libcfs_nid2str(ev->initiator.nid), 
                ev->offset, ev->rlength, ev->mlength,
-               __le32_to_cpu(*((int *)(ev->md.start + ev->offset))),
-               __le32_to_cpu(*((int *)(ev->md.start + ev->offset + sizeof(unsigned)))),
-               __le32_to_cpu(*((int *)(ev->md.start + ev->offset + 2 * 
+               __le32_to_cpu(*((int *)((char *)ev->md.start + ev->offset))),
+               __le32_to_cpu(*((int *)((char *)ev->md.start + ev->offset + sizeof(unsigned)))),
+               __le32_to_cpu(*((int *)((char *)ev->md.start + ev->offset + 2 * 
                                sizeof(unsigned)))));
         
         packets_valid++;
@@ -186,7 +186,7 @@ static struct pingsrv_data *pingsrv_setup(void)
 {
         int rc;
 
-       /* Aquire and initialize the proper nal for portals. */
+        /* Acquire and initialize the proper nal for portals. */
         rc = LNetNIInit(0);
         if (!(rc == 0 || rc == 1)) {
                 CDEBUG (D_OTHER, "LNetNIInit: error %d\n", rc);
diff --git a/lnet/tests/ping_srv/winnt-pingsrv.c b/lnet/tests/ping_srv/winnt-pingsrv.c
new file mode 100644 (file)
index 0000000..7c9a1a1
--- /dev/null
@@ -0,0 +1,634 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Matt Wu <mattwu@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+/*
+ *  Included Headers 
+ */
+
+
+#include <libcfs/libcfs.h>
+
+
+/* libcfs module init/exit routines */
+DECLARE_INIT(init_libcfs_module);
+DECLARE_EXIT(exit_libcfs_module);
+
+/* portal module init/exit routines */
+DECLARE_INIT(init_lnet);
+DECLARE_EXIT(fini_lnet);
+
+/* tdinal module init/exit routines */
+DECLARE_INIT(ksocknal_module_init);
+DECLARE_EXIT(ksocknal_module_fini);
+
+/* pingcli module init/exit routines */
+DECLARE_INIT(pingcli_init);
+DECLARE_EXIT(pingcli_cleanup);
+
+
+/* pingsrv module init/exit routines */
+DECLARE_INIT(pingsrv_init);
+DECLARE_EXIT(pingsrv_cleanup);
+
+/*
+ * structure definitions
+ */
+
+
+#define LUSTRE_PING_VERSION   0x00010000               /* ping srv/cli version: 0001.0000 */
+
+#define LUSTRE_PING_DEVICE    L"\\Device\\LNET"     /* device object name */
+#define LUSTRE_PING_SYMLNK    L"\\DosDevices\\LNET" /* user-visible name for the device*/
+
+typedef struct _DEVICE_EXTENSION
+{
+    BOOLEAN    bProcFS;
+
+} DEVICE_EXTENSION, *PDEVICE_EXTENSION;
+
+
+/*
+ *  global definitions
+ */
+
+PDEVICE_OBJECT  PingObject = NULL;  /* ping device object */
+PDEVICE_OBJECT  ProcObject = NULL;  /* procfs emulator device */
+
+
+/*
+ *  common routines
+ */
+
+
+//
+// complete Irp request ...
+//
+
+NTSTATUS
+UTCompleteIrp(
+    PIRP        Irp,
+    NTSTATUS    Status,
+    ULONG       Info
+    )
+{
+    Irp->IoStatus.Status = Status;
+    Irp->IoStatus.Information = Info;
+    IoCompleteRequest(Irp,IO_NO_INCREMENT);
+
+    return Status;
+}
+
+//
+//  Open/Create Device ...
+//
+
+NTSTATUS
+UTCreate(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    KdPrint(("UTCreate: DeviceCreate ...\n"));
+
+    return UTCompleteIrp(Irp,STATUS_SUCCESS,0);
+}
+
+//
+// Close Devcie ...
+//
+
+NTSTATUS
+UTClose(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp)
+{
+    KdPrint(("UTClose: Device Closed.\n"));
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+
+
+NTSTATUS
+UTShutdown(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    KdPrint(("UTShutdown: shuting TdiSock ...\n"));
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+//
+// driver frame Routines ...
+//
+
+
+NTSTATUS
+UTDeviceControl(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status = STATUS_INVALID_DEVICE_REQUEST;
+    PIO_STACK_LOCATION  IrpSp;
+
+    ULONG               ControlCode;
+    ULONG               InputLength;
+    ULONG               OutputLength;
+
+    PVOID               lpvInBuffer;
+
+    KdPrint(("UTDeviceControl: Device Ioctl ...\n"));
+
+    Irp->IoStatus.Information = 0;
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    ControlCode  = IrpSp->Parameters.DeviceIoControl.IoControlCode;
+    InputLength  = IrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    lpvInBuffer  = Irp->AssociatedIrp.SystemBuffer;
+
+    ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL);
+
+    switch (ControlCode)
+    {
+        case IOCTL_LIBCFS_VERSION:
+
+            *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION);
+            Irp->IoStatus.Information = sizeof(ULONG);
+            Status = STATUS_SUCCESS;
+            break;
+
+        default:
+            break;
+    }
+
+    Irp->IoStatus.Status = Status;
+
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    KdPrint(("UTDeviceControl: Device Ioctl returned.\n"));
+
+    return Status;
+}
+
+NTSTATUS
+ProcCreate(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS                    Status;
+    PIO_STACK_LOCATION          IrpSp;
+
+    FILE_FULL_EA_INFORMATION *  ea;
+    cfs_file_t *                fp;
+
+    KdPrint(("ProcCreate: Proc device is being opened ...\n"));
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+    ea = (PFILE_FULL_EA_INFORMATION) Irp->AssociatedIrp.SystemBuffer;
+
+    if (!ea) {
+        Status = STATUS_INVALID_PARAMETER;
+    } else {
+        fp = lustre_open_file(&ea->EaName[0]);
+        if (!fp) {
+            Status = STATUS_OBJECT_NAME_NOT_FOUND;
+        } else {
+            IrpSp->FileObject->FsContext = fp;
+            IrpSp->FileObject->FsContext2 = fp->private_data;
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+    return UTCompleteIrp(Irp, Status, 0);
+}
+
+//
+// Close Devcie ...
+//
+
+NTSTATUS
+ProcClose(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp)
+{
+    PIO_STACK_LOCATION          IrpSp;
+
+    cfs_file_t *                fp;
+
+    KdPrint(("ProcClose: Proc device object is to be closed.\n"));
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+    ASSERT(fp != NULL);
+    ASSERT(IrpSp->FileObject->FsContext2 == fp->private_data);
+
+    lustre_close_file(fp);
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+/*
+ * proc frame routines
+ */
+
+NTSTATUS
+ProcDeviceControl(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status = STATUS_INVALID_DEVICE_REQUEST;
+    PIO_STACK_LOCATION  IrpSp;
+
+    ULONG               ControlCode;
+    ULONG               InputLength;
+    ULONG               OutputLength;
+
+    PVOID               lpvInBuffer;
+
+    KdPrint(("ProcDeviceControl: Proc device ioctling ...\n"));
+
+    Irp->IoStatus.Information = 0;
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    ControlCode  = IrpSp->Parameters.DeviceIoControl.IoControlCode;
+    InputLength  = IrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    lpvInBuffer  = Irp->AssociatedIrp.SystemBuffer;
+
+    ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL);
+
+    switch (ControlCode)
+    {
+        case IOCTL_LIBCFS_VERSION:
+
+            *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION);
+            Irp->IoStatus.Information = sizeof(ULONG);
+
+            Status = STATUS_SUCCESS;
+
+            break;
+
+        case IOCTL_LIBCFS_ENTRY:
+        {
+            int rc = 0;
+            cfs_file_t * fp;
+
+            fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+            if (!fp) {
+                rc = -EINVAL;
+            } else {
+                rc = lustre_ioctl_file(fp, (PCFS_PROC_IOCTL) (lpvInBuffer));
+            }
+
+            if (rc == 0) {
+                Irp->IoStatus.Information = InputLength;
+                Status = STATUS_SUCCESS;
+            }
+        }    
+    }
+
+    Irp->IoStatus.Status = Status;
+
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    KdPrint(("ProcDeviceControl: Proc device ioctl returned with status = %xh.\n", Status));
+
+    return Status;
+}
+
+
+
+NTSTATUS
+ProcReadWrite (PDEVICE_OBJECT DeviceObject, PIRP Irp)
+{
+    PIO_STACK_LOCATION  IrpSp;
+    NTSTATUS            Status;
+
+    cfs_file_t *        fp;
+    int                 rc;
+    PCHAR               buf;
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+    if (Irp->MdlAddress) {
+        buf = MmGetSystemAddressForMdlSafe(
+                        Irp->MdlAddress,
+                        NormalPagePriority);
+    } else {
+        buf = Irp->AssociatedIrp.SystemBuffer;
+    }
+
+    if (buf == NULL) {
+        Status = STATUS_SUCCESS;
+        rc = 0;
+    } else {
+        fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+        if (!fp) {
+            Status = STATUS_INVALID_PARAMETER;
+            goto errorout;
+        }
+
+        if (IrpSp->MajorFunction == IRP_MJ_READ) {
+            rc = lustre_read_file(
+                    fp, IrpSp->Parameters.Read.ByteOffset.LowPart,
+                    IrpSp->Parameters.Read.Length, buf);
+        } else {
+            rc = lustre_write_file(
+                    fp, IrpSp->Parameters.Write.ByteOffset.LowPart,
+                    IrpSp->Parameters.Write.Length, buf);
+        }
+        if (rc < 0) {
+            cfs_enter_debugger();
+            Status = STATUS_UNSUCCESSFUL;
+        } else {
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+errorout:
+    return UTCompleteIrp(Irp, Status, rc);
+}
+
+
+//
+//  common dispatch routines
+//
+
+NTSTATUS
+UTDispatchRequest(
+    IN PDEVICE_OBJECT DeviceObject,
+    IN PIRP           Irp
+    )
+{
+    NTSTATUS            Status;
+    PIO_STACK_LOCATION  IrpSp;
+
+    Status = STATUS_INVALID_DEVICE_REQUEST;
+
+    __try {
+
+        IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+        switch (IrpSp->MajorFunction) {
+
+            case IRP_MJ_CREATE:
+                if (DeviceObject == PingObject) {
+                    Status = UTCreate(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcCreate(DeviceObject, Irp);
+                }
+                break;
+        
+            case IRP_MJ_CLOSE:
+                if (DeviceObject == PingObject) {
+                    Status = UTClose(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcClose(DeviceObject, Irp);
+                }
+                break;
+
+            case IRP_MJ_READ:
+            case IRP_MJ_WRITE:
+                if (DeviceObject == ProcObject) {
+                    Status = ProcReadWrite(DeviceObject, Irp);
+                }
+                break;
+        
+            case IRP_MJ_DEVICE_CONTROL:
+                if (DeviceObject == PingObject) {
+                    Status = UTDeviceControl(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcDeviceControl(DeviceObject, Irp);
+                }
+                break;
+
+            case IRP_MJ_SHUTDOWN:
+                Status = UTShutdown(DeviceObject, Irp);
+                break;
+
+            default:
+
+                KdPrint(("UTDispatchRequest: Major Function: %xh is not supported.\n",
+                           IrpSp->MajorFunction));
+                UTCompleteIrp(Irp, Status, 0);
+                break;
+        }
+    }
+
+    __finally {
+    }
+
+    return Status;
+}
+
+//
+// create a device object and a dosdevice symbol link
+//
+
+PDEVICE_OBJECT
+CreateDevice(
+    IN PDRIVER_OBJECT   DriverObject,
+    IN PWCHAR           DeviceName,
+    IN PWCHAR           SymlnkName,
+    IN BOOLEAN          bProcFS
+    )
+{
+    NTSTATUS            Status;
+
+    UNICODE_STRING      NtDevName;
+    UNICODE_STRING      Win32DevName;
+
+    PDEVICE_EXTENSION   DeviceExtension;
+    PDEVICE_OBJECT      DeviceObject;
+
+    /* create the device object with the specified name */
+
+    RtlInitUnicodeString(&NtDevName, DeviceName);
+    
+    Status = IoCreateDevice(
+                    DriverObject,
+                    sizeof(DEVICE_EXTENSION),
+                    &NtDevName,
+                    FILE_DEVICE_UNKNOWN,
+                    0,
+                    FALSE,
+                    &DeviceObject );
+        
+    if (!NT_SUCCESS(Status)) {
+
+        cfs_enter_debugger();
+        return NULL;
+    }
+
+    /* create the symlink to make the device visible to user */
+
+    RtlInitUnicodeString(&Win32DevName, SymlnkName);
+        
+    Status = IoCreateSymbolicLink(&Win32DevName, &NtDevName);
+
+    if (!NT_SUCCESS(Status)) {
+
+        IoDeleteDevice(DeviceObject);
+        return NULL;
+    }
+
+    DeviceExtension = (PDEVICE_EXTENSION)DeviceObject->DeviceObjectExtension;
+    DeviceExtension->bProcFS = bProcFS;
+
+    DeviceObject->Flags |= DO_BUFFERED_IO;
+    DeviceObject->Flags &= ~DO_DEVICE_INITIALIZING;
+
+    return DeviceObject;
+}
+
+
+//
+// DriverEntry
+//
+
+NTSTATUS DriverEntry(
+    IN PDRIVER_OBJECT  DriverObject,
+    IN PUNICODE_STRING RegistryPath 
+    )
+{
+    KdPrint(("Lustre ping test: Build Time: " __DATE__ " " __TIME__ "\n"));
+    KdPrint(("Lustre ping test: DriverEntry ... \n"));
+
+    /* initialize libcfs module */
+    if (module_init_libcfs_module() != 0) {
+        KdPrint(("ping: error initialize module: libcfs ...\n"));
+        goto errorout;
+    }
+
+    /* initialize lnet module */
+    if (module_init_lnet() != 0) {
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: lnet ...\n"));
+        goto errorout;
+    }
+
+    /* initialize tdinal module */
+    if (module_ksocknal_module_init() != 0) {
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: tdilnd ...\n"));
+        goto errorout;
+    }
+
+#if defined(LUSTRE_PING_CLI)
+    /* initialize pingcli module */
+    if (module_pingcli_init() != 0) {
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: pingcli ...\n"));
+        goto errorout;
+    }
+#endif
+
+#if defined(LUSTRE_PING_SRV)
+    /* initialize pingsrv module */
+    if (module_pingsrv_init() != 0) {
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: pingsrv ...\n"));
+        goto errorout;
+    }
+#endif
+
+    /* create the ping device object */
+    PingObject = CreateDevice(
+                        DriverObject,
+                        LUSTRE_PING_DEVICE,
+                        LUSTRE_PING_SYMLNK,
+                        FALSE );
+    if (!PingObject) {
+#if defined(LUSTRE_PING_CLI)
+        module_pingcli_cleanup();
+#endif
+#if defined(LUSTRE_PING_SRV)
+        module_pingsrv_cleanup();
+#endif
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+
+    /* create the libcfs proc fs emultor device object */
+    ProcObject  = CreateDevice(
+                        DriverObject,
+                        LUSTRE_PROC_DEVICE,
+                        LUSTRE_PROC_SYMLNK,
+                        TRUE );
+    if (!ProcObject) {
+
+        IoDeleteDevice(PingObject);
+#if defined(LUSTRE_PING_CLI)
+        module_pingcli_cleanup();
+#endif
+#if defined(LUSTRE_PING_SRV)
+        module_pingsrv_cleanup();
+#endif
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+
+    /* initialize the driver callback routines */
+
+    DriverObject->MajorFunction[IRP_MJ_CREATE]          = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_CLOSE]           = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_READ]            = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_WRITE]           = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_SHUTDOWN]        = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_DEVICE_CONTROL]  = UTDispatchRequest;
+
+    return STATUS_SUCCESS;
+
+errorout:
+
+    cfs_enter_debugger();
+
+    return STATUS_UNSUCCESSFUL;
+}
index dc60e7f..3e07242 100644 (file)
@@ -68,7 +68,7 @@ ptllnd_get_tunables(lnet_ni_t *ni)
                                       "PTLLND_PID", PTLLND_PID);
         if (rc != 0)
                 return rc;
-        plni->plni_pid = (ptl_pid_t)temp;
+        plni->plni_ptllnd_pid = (ptl_pid_t)temp;
 
         rc = ptllnd_parse_int_tunable(&plni->plni_peer_credits,
                                       "PTLLND_PEERCREDITS", PTLLND_PEERCREDITS);
@@ -111,15 +111,15 @@ ptllnd_get_tunables(lnet_ni_t *ni)
 
         plni->plni_buffer_size = plni->plni_max_msg_size * msgs_per_buffer;
 
-        PJK_UT_MSG("portal          = %d\n",plni->plni_portal);
-        PJK_UT_MSG("pid             = %d\n",plni->plni_pid);
-        PJK_UT_MSG("max_immediate   = %d\n",max_immediate);
-        PJK_UT_MSG("msgs_per_buffer = %d\n",msgs_per_buffer);
-        PJK_UT_MSG("msgs_spare      = %d\n",plni->plni_msgs_spare);
-        PJK_UT_MSG("peer_hash_size  = %d\n",plni->plni_peer_hash_size);
-        PJK_UT_MSG("eq_size         = %d\n",plni->plni_eq_size);
-        PJK_UT_MSG("max_msg_size    = %d\n",plni->plni_max_msg_size);
-        PJK_UT_MSG("buffer_size     = %d\n",plni->plni_buffer_size);
+        CDEBUG(D_NET, "portal          = %d\n",plni->plni_portal);
+        CDEBUG(D_NET, "ptllnd_pid      = %d\n",plni->plni_ptllnd_pid);
+        CDEBUG(D_NET, "max_immediate   = %d\n",max_immediate);
+        CDEBUG(D_NET, "msgs_per_buffer = %d\n",msgs_per_buffer);
+        CDEBUG(D_NET, "msgs_spare      = %d\n",plni->plni_msgs_spare);
+        CDEBUG(D_NET, "peer_hash_size  = %d\n",plni->plni_peer_hash_size);
+        CDEBUG(D_NET, "eq_size         = %d\n",plni->plni_eq_size);
+        CDEBUG(D_NET, "max_msg_size    = %d\n",plni->plni_max_msg_size);
+        CDEBUG(D_NET, "buffer_size     = %d\n",plni->plni_buffer_size);
 
         return 0;
 }
@@ -176,8 +176,8 @@ ptllnd_grow_buffers (lnet_ni_t *ni)
         int              nbufs;
         int              rc;
 
-        PJK_UT_MSG("nposted_buffers = %d (before)\n",plni->plni_nposted_buffers);
-        PJK_UT_MSG("nbuffers = %d (before)\n",plni->plni_nbuffers);
+        CDEBUG(D_NET, "nposted_buffers = %d (before)\n",plni->plni_nposted_buffers);
+        CDEBUG(D_NET, "nbuffers = %d (before)\n",plni->plni_nbuffers);
 
 
         nmsgs = plni->plni_npeers * plni->plni_peer_credits +
@@ -204,8 +204,8 @@ ptllnd_grow_buffers (lnet_ni_t *ni)
                 }
         }
 
-        PJK_UT_MSG("nposted_buffers = %d (after)\n",plni->plni_nposted_buffers);
-        PJK_UT_MSG("nbuffers = %d (after)\n",plni->plni_nbuffers);
+        CDEBUG(D_NET, "nposted_buffers = %d (after)\n",plni->plni_nposted_buffers);
+        CDEBUG(D_NET, "nbuffers = %d (after)\n",plni->plni_nbuffers);
         return 0;
 }
 
@@ -217,13 +217,13 @@ ptllnd_destroy_buffers (lnet_ni_t *ni)
         struct list_head  *tmp;
         struct list_head  *nxt;
 
-        PJK_UT_MSG("nposted_buffers = %d (before)\n",plni->plni_nposted_buffers);
-        PJK_UT_MSG("nbuffers = %d (before)\n",plni->plni_nbuffers);
+        CDEBUG(D_NET, "nposted_buffers = %d (before)\n",plni->plni_nposted_buffers);
+        CDEBUG(D_NET, "nbuffers = %d (before)\n",plni->plni_nbuffers);
 
         list_for_each_safe(tmp, nxt, &plni->plni_buffers) {
                 buf = list_entry(tmp, ptllnd_buffer_t, plb_list);
 
-                //PJK_UT_MSG("buf=%p posted=%d\n",buf,buf->plb_posted);
+                //CDEBUG(D_NET, "buf=%p posted=%d\n",buf,buf->plb_posted);
 
                 LASSERT (plni->plni_nbuffers > 0);
                 if (buf->plb_posted) {
@@ -249,8 +249,8 @@ ptllnd_destroy_buffers (lnet_ni_t *ni)
                 ptllnd_destroy_buffer(buf);
         }
 
-        PJK_UT_MSG("nposted_buffers = %d (after)\n",plni->plni_nposted_buffers);
-        PJK_UT_MSG("nbuffers = %d (after)\n",plni->plni_nbuffers);
+        CDEBUG(D_NET, "nposted_buffers = %d (after)\n",plni->plni_nposted_buffers);
+        CDEBUG(D_NET, "nbuffers = %d (after)\n",plni->plni_nbuffers);
 
         LASSERT (plni->plni_nposted_buffers == 0);
         LASSERT (plni->plni_nbuffers == 0);
@@ -300,7 +300,7 @@ ptllnd_close_peers (lnet_ni_t *ni)
         ptllnd_peer_t  *plp;
         int             i;
 
-        PJK_UT_MSG(">>> npeers=%d\n",plni->plni_npeers);
+        CDEBUG(D_NET, ">>> npeers=%d\n",plni->plni_npeers);
 
         for (i = 0; i < plni->plni_peer_hash_size; i++)
                 while (!list_empty(&plni->plni_peer_hash[i])) {
@@ -310,7 +310,7 @@ ptllnd_close_peers (lnet_ni_t *ni)
                         ptllnd_close_peer(plp);
                 }
 
-        PJK_UT_MSG("<<< npeers=%d\n",plni->plni_npeers);
+        CDEBUG(D_NET, "<<< npeers=%d\n",plni->plni_npeers);
 }
 
 __u64
@@ -329,7 +329,7 @@ ptllnd_shutdown (lnet_ni_t *ni)
         ptllnd_ni_t *plni = ni->ni_data;
         int          rc;
 
-        PJK_UT_MSG(">>>\n");
+        CDEBUG(D_NET, ">>>\n");
 
         LASSERT (ptllnd_ni_count == 1);
 
@@ -353,7 +353,7 @@ ptllnd_shutdown (lnet_ni_t *ni)
         LIBCFS_FREE(plni, sizeof(*plni));
         ptllnd_ni_count--;
 
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
 }
 
 int
@@ -362,7 +362,7 @@ ptllnd_startup (lnet_ni_t *ni)
         ptllnd_ni_t *plni;
         int          rc;
 
-        PJK_UT_MSG(">>> ni=%p\n",ni);
+        CDEBUG(D_NET, ">>> ni=%p\n",ni);
 
        /* could get limits from portals I guess... */
        ni->ni_maxtxcredits =
@@ -393,7 +393,7 @@ ptllnd_startup (lnet_ni_t *ni)
          * the lnet pid to the pid of this process.
          */
         the_lnet.ln_pid = getpid();
-        PJK_UT_MSG("Forcing LNET pid to %d\n",the_lnet.ln_pid);
+        CDEBUG(D_NET, "Forcing LNET pid to %d\n",the_lnet.ln_pid);
 
         plni->plni_stamp = ptllnd_get_timestamp();
         plni->plni_nrxs = 0;
@@ -416,14 +416,18 @@ ptllnd_startup (lnet_ni_t *ni)
         if (rc != 0)
                 goto failed1;
 
-        rc = PtlNIInit(PTL_IFACE_DEFAULT, plni->plni_pid,
+        /* NB I most probably won't get the PID I requested here.  It doesn't
+         * matter because I don't need a fixed PID (only connection acceptors
+         * need a "well known" PID). */
+
+        rc = PtlNIInit(PTL_IFACE_DEFAULT, plni->plni_ptllnd_pid,
                        NULL, NULL, &plni->plni_nih);
         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
                 CERROR("PtlNIInit failed: %d\n", rc);
                 rc = -ENODEV;
                 goto failed2;
         }
-        PJK_UT_MSG("plni->plni_nih=%x\n",plni->plni_nih);
+        CDEBUG(D_NET, "plni->plni_nih=%x\n",plni->plni_nih);
 
         rc = PtlEQAlloc(plni->plni_nih, plni->plni_eq_size,
                         PTL_EQ_HANDLER_NONE, &plni->plni_eqh);
@@ -432,7 +436,7 @@ ptllnd_startup (lnet_ni_t *ni)
                 rc = -ENODEV;
                 goto failed3;
         }
-        PJK_UT_MSG("plni->plni_eqh=%x\n",plni->plni_eqh);
+        CDEBUG(D_NET, "plni->plni_eqh=%x\n",plni->plni_eqh);
 
         /*
          * Fetch the Portals NID
@@ -443,7 +447,7 @@ ptllnd_startup (lnet_ni_t *ni)
                 goto failed4;
         }
 
-        PJK_UT_MSG("lnet nid=" LPX64 " (passed in)\n",ni->ni_nid);
+        CDEBUG(D_NET, "lnet nid=" LPX64 " (passed in)\n",ni->ni_nid);
 
         /*
          * Create the new NID.  Based on the LND network type
@@ -451,19 +455,15 @@ ptllnd_startup (lnet_ni_t *ni)
          */
         ni->ni_nid = ptl2lnetnid(ni,plni->plni_portals_id.nid);
 
-        PJK_UT_MSG("ptl  pid=" FMT_PID "\n",plni->plni_portals_id.pid);
-        PJK_UT_MSG("ptl  nid=" FMT_NID "\n",plni->plni_portals_id.nid);
-        PJK_UT_MSG("lnet nid=" LPX64 " (passed back)\n",ni->ni_nid);
-
-        CDEBUG(D_INFO,"ptl  pid=" FMT_PID "\n",plni->plni_portals_id.pid);
-        CDEBUG(D_INFO,"ptl  nid=" FMT_NID "\n",plni->plni_portals_id.nid);
-        CDEBUG(D_INFO,"lnet nid=" LPX64 "\n",ni->ni_nid);
+        CDEBUG(D_NET, "ptl  pid=" FMT_PID "\n",plni->plni_portals_id.pid);
+        CDEBUG(D_NET, "ptl  nid=" FMT_NID "\n",plni->plni_portals_id.nid);
+        CDEBUG(D_NET, "lnet nid=" LPX64 " (passed back)\n",ni->ni_nid);
 
         rc = ptllnd_grow_buffers(ni);
         if (rc != 0)
                 goto failed4;
 
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
        return 0;
 
  failed4:
@@ -477,7 +477,7 @@ ptllnd_startup (lnet_ni_t *ni)
         LIBCFS_FREE(plni, sizeof(*plni));
  failed0:
         ptllnd_ni_count--;
-        PJK_UT_MSG("<<< rc=%d\n",rc);
+        CDEBUG(D_NET, "<<< rc=%d\n",rc);
         return rc;
 }
 
index ad98b33..ffcd559 100644 (file)
@@ -37,7 +37,7 @@
 typedef struct
 {
         int                        plni_portal;
-        ptl_pid_t                  plni_pid;
+        ptl_pid_t                  plni_ptllnd_pid; /* Portals PID of peers I may connect to */
         int                        plni_peer_credits;
         int                        plni_max_msg_size;
         int                        plni_buffer_size;
@@ -214,12 +214,6 @@ lnet2ptlnid(lnet_nid_t lnet_nid)
 }
 
 /*
- * Define this to enable console debug logging
- * and simulation
- */
-//#define PJK_DEBUGGING
-
-/*
  * A note about lprintf():
  *  Normally printf() is redirected to stdout of the console
  *  from which yod launched the catamount application.  However
@@ -232,37 +226,4 @@ lnet2ptlnid(lnet_nid_t lnet_nid)
  *  cases.
  */
 
-#ifdef PJK_DEBUGGING
-
-#define PJK_UT_MSG_ALWAYS(fmt, a...)                    \
-do{                                                     \
-        lprintf("ptllnd:%-30s:",__FUNCTION__);          \
-        lprintf(fmt,## a);                              \
-}while(0)
-
-
-#define PJK_UT_MSG_SIMULATION(fmt, a...)        PJK_UT_MSG_ALWAYS(fmt, ## a )
-
-
-#if 1
-#define PJK_UT_MSG_DATA(fmt, a...)              PJK_UT_MSG_ALWAYS(fmt, ## a )
-#else
-#define PJK_UT_MSG_DATA(fmt, a...)              do{}while(0)
-#endif
-
-#if 1
-#define PJK_UT_MSG(fmt, a...)                   PJK_UT_MSG_ALWAYS(fmt, ## a )
-#else
-#define PJK_UT_MSG(fmt, a...)                   do{}while(0)
-#endif
-
-#else
-
-
-#define PJK_UT_MSG_ALWAYS(fmt, a...)            do{}while(0)
-#define PJK_UT_MSG_SIMULATION(fmt, a...)        do{}while(0)
-#define PJK_UT_MSG_DATA(fmt, a...)              do{}while(0)
-#define PJK_UT_MSG(fmt, a...)                   do{}while(0)
-
-#endif
 
index 08f5ed3..7d0d23f 100644 (file)
@@ -76,7 +76,7 @@ ptllnd_find_peer(lnet_ni_t *ni, lnet_nid_t nid, int create)
         ptllnd_tx_t       *tx;
         int                rc;
 
-        PJK_UT_MSG(">>> nid=%s\n",libcfs_nid2str(nid));
+        CDEBUG(D_NET, ">>> nid=%s\n",libcfs_nid2str(nid));
 
         LASSERT (LNET_NIDNET(nid) == LNET_NIDNET(ni->ni_nid));
 
@@ -85,7 +85,7 @@ ptllnd_find_peer(lnet_ni_t *ni, lnet_nid_t nid, int create)
 
                 if (plp->plp_nid == nid) {
                         ptllnd_peer_addref(plp);
-                        PJK_UT_MSG("<<< peer=%p FOUND\n",plp);
+                        CDEBUG(D_NET, "<<< peer=%p FOUND\n",plp);
                         return plp;
                 }
         }
@@ -109,12 +109,12 @@ ptllnd_find_peer(lnet_ni_t *ni, lnet_nid_t nid, int create)
                 return NULL;
         }
 
-        PJK_UT_MSG("new peer=%p\n",plp);
+        CDEBUG(D_NET, "new peer=%p\n",plp);
 
         plp->plp_ni = ni;
         plp->plp_nid = nid;
         plp->plp_ptlid.nid = LNET_NIDADDR(nid);
-        plp->plp_ptlid.pid = plni->plni_pid;
+        plp->plp_ptlid.pid = plni->plni_ptllnd_pid;
         plp->plp_max_credits =
         plp->plp_credits = 1; /* add more later when she gives me credits */
         plp->plp_max_msg_size = plni->plni_max_msg_size; /* until I hear from her */
@@ -143,7 +143,7 @@ ptllnd_find_peer(lnet_ni_t *ni, lnet_nid_t nid, int create)
 
         ptllnd_post_tx(tx);
 
-        PJK_UT_MSG("<<< peer=%p NEW\n",plp);
+        CDEBUG(D_NET, "<<< peer=%p NEW\n",plp);
         return plp;
 }
 
@@ -177,7 +177,7 @@ ptllnd_new_tx(ptllnd_peer_t *peer, int type, int payload_nob)
         ptllnd_tx_t *tx;
         int          msgsize;
 
-        PJK_UT_MSG("peer=%p type=%d payload=%d\n",peer,type,payload_nob);
+        CDEBUG(D_NET, "peer=%p type=%d payload=%d\n",peer,type,payload_nob);
 
         switch (type) {
         default:
@@ -215,7 +215,7 @@ ptllnd_new_tx(ptllnd_peer_t *peer, int type, int payload_nob)
 
         LASSERT (msgsize <= peer->plp_max_msg_size);
 
-        PJK_UT_MSG("msgsize=%d\n",msgsize);
+        CDEBUG(D_NET, "msgsize=%d\n",msgsize);
 
         LIBCFS_ALLOC(tx, offsetof(ptllnd_tx_t, tx_msg) + msgsize);
 
@@ -254,7 +254,7 @@ ptllnd_new_tx(ptllnd_peer_t *peer, int type, int payload_nob)
         ptllnd_peer_addref(peer);
         plni->plni_ntxs++;
 
-        PJK_UT_MSG("tx=%p\n",tx);
+        CDEBUG(D_NET, "tx=%p\n",tx);
 
         return tx;
 }
@@ -289,12 +289,12 @@ ptllnd_tx_done(ptllnd_tx_t *tx)
          * events for this tx until it's unlinked.  So I set tx_completing to
          * flag the tx is getting handled */
 
-        PJK_UT_MSG(">>> tx=%p peer=%p\n",tx,peer);
-        PJK_UT_MSG("completing=%d\n",tx->tx_completing);
-        PJK_UT_MSG("status=%d\n",tx->tx_status);
-        PJK_UT_MSG("niov=%d\n",tx->tx_niov);
-        PJK_UT_MSG("lnetreplymsg=%p\n",tx->tx_lnetreplymsg);
-        PJK_UT_MSG("lnetmsg=%p\n",tx->tx_lnetmsg);
+        CDEBUG(D_NET, ">>> tx=%p peer=%p\n",tx,peer);
+        CDEBUG(D_NET, "completing=%d\n",tx->tx_completing);
+        CDEBUG(D_NET, "status=%d\n",tx->tx_status);
+        CDEBUG(D_NET, "niov=%d\n",tx->tx_niov);
+        CDEBUG(D_NET, "lnetreplymsg=%p\n",tx->tx_lnetreplymsg);
+        CDEBUG(D_NET, "lnetmsg=%p\n",tx->tx_lnetmsg);
 
         if (tx->tx_completing)
                 return;
@@ -320,7 +320,7 @@ ptllnd_tx_done(ptllnd_tx_t *tx)
                 LASSERT (tx->tx_lnetmsg != NULL);
                 /* Simulate GET success always  */
                 lnet_finalize(ni, tx->tx_lnetmsg, 0);
-                PJK_UT_MSG("lnet_finalize(tx_lnetreplymsg=%p)\n",tx->tx_lnetreplymsg);
+                CDEBUG(D_NET, "lnet_finalize(tx_lnetreplymsg=%p)\n",tx->tx_lnetreplymsg);
                 lnet_finalize(ni, tx->tx_lnetreplymsg, tx->tx_status);
         } else if (tx->tx_lnetmsg != NULL) {
                 lnet_finalize(ni, tx->tx_lnetmsg, tx->tx_status);
@@ -332,7 +332,7 @@ ptllnd_tx_done(ptllnd_tx_t *tx)
         plni->plni_ntxs--;
         LIBCFS_FREE(tx, offsetof(ptllnd_tx_t, tx_msg) + tx->tx_msgsize);
 
-        PJK_UT_MSG("<<< tx=%p\n",tx);
+        CDEBUG(D_NET, "<<< tx=%p\n",tx);
 }
 
 void
@@ -361,9 +361,9 @@ ptllnd_set_txiov(ptllnd_tx_t *tx,
                 return 0;
         }
 
-        PJK_UT_MSG("niov  =%d\n",niov);
-        PJK_UT_MSG("offset=%d\n",offset);
-        PJK_UT_MSG("len   =%d\n",len);
+        CDEBUG(D_NET, "niov  =%d\n",niov);
+        CDEBUG(D_NET, "offset=%d\n",offset);
+        CDEBUG(D_NET, "len   =%d\n",len);
 
 
         /*
@@ -380,9 +380,9 @@ ptllnd_set_txiov(ptllnd_tx_t *tx,
                 iov++;
         }
 
-        PJK_UT_MSG("niov  =%d (after)\n",niov);
-        PJK_UT_MSG("offset=%d (after)\n",offset);
-        PJK_UT_MSG("len   =%d (after)\n",len);
+        CDEBUG(D_NET, "niov  =%d (after)\n",niov);
+        CDEBUG(D_NET, "offset=%d (after)\n",offset);
+        CDEBUG(D_NET, "len   =%d (after)\n",len);
 
         for (;;) {
                 int temp_offset = offset;
@@ -392,10 +392,10 @@ ptllnd_set_txiov(ptllnd_tx_t *tx,
                         return -ENOMEM;
 
                 for (npiov = 0;; npiov++) {
-                        PJK_UT_MSG("npiov=%d\n",npiov);
-                        PJK_UT_MSG("offset=%d\n",temp_offset);
-                        PJK_UT_MSG("len=%d\n",resid);
-                        PJK_UT_MSG("iov[npiov].iov_len=%d\n",iov[npiov].iov_len);
+                        CDEBUG(D_NET, "npiov=%d\n",npiov);
+                        CDEBUG(D_NET, "offset=%d\n",temp_offset);
+                        CDEBUG(D_NET, "len=%d\n",resid);
+                        CDEBUG(D_NET, "iov[npiov].iov_len=%d\n",iov[npiov].iov_len);
 
                         LASSERT (npiov < niov);
                         LASSERT (iov->iov_len >= temp_offset);
@@ -415,8 +415,8 @@ ptllnd_set_txiov(ptllnd_tx_t *tx,
                 if (npiov == niov) {
                         tx->tx_niov = niov;
                         tx->tx_iov = piov;
-                        PJK_UT_MSG("tx->tx_iov=%p\n",tx->tx_iov);
-                        PJK_UT_MSG("tx->tx_niov=%d\n",tx->tx_niov);
+                        CDEBUG(D_NET, "tx->tx_iov=%p\n",tx->tx_iov);
+                        CDEBUG(D_NET, "tx->tx_niov=%d\n",tx->tx_niov);
                         return 0;
                 }
 
@@ -507,15 +507,15 @@ ptllnd_check_sends(ptllnd_peer_t *peer)
         ptl_handle_md_t mdh;
         int             rc;
 
-        PJK_UT_MSG(">>> peer=%p\n",peer);
-        PJK_UT_MSG("plp_outstanding_credits=%d\n",peer->plp_outstanding_credits);
+        CDEBUG(D_NET, ">>> peer=%p\n",peer);
+        CDEBUG(D_NET, "plp_outstanding_credits=%d\n",peer->plp_outstanding_credits);
 
         if (list_empty(&peer->plp_txq) &&
             peer->plp_outstanding_credits >=
             PTLLND_CREDIT_HIGHWATER(plni)) {
 
                 tx = ptllnd_new_tx(peer, PTLLND_MSG_TYPE_NOOP, 0);
-                PJK_UT_MSG("NOOP tx=%p\n",tx);
+                CDEBUG(D_NET, "NOOP tx=%p\n",tx);
                 if (tx == NULL) {
                         CERROR("Can't return credits to %s\n",
                                libcfs_nid2str(peer->plp_nid));
@@ -527,9 +527,9 @@ ptllnd_check_sends(ptllnd_peer_t *peer)
         while (!list_empty(&peer->plp_txq)) {
                 tx = list_entry(peer->plp_txq.next, ptllnd_tx_t, tx_list);
 
-                PJK_UT_MSG("Looking at TX=%p\n",tx);
-                PJK_UT_MSG("plp_credits=%d\n",peer->plp_credits);
-                PJK_UT_MSG("plp_outstanding_credits=%d\n",peer->plp_outstanding_credits);
+                CDEBUG(D_NET, "Looking at TX=%p\n",tx);
+                CDEBUG(D_NET, "plp_credits=%d\n",peer->plp_credits);
+                CDEBUG(D_NET, "plp_outstanding_credits=%d\n",peer->plp_outstanding_credits);
 
                 LASSERT (tx->tx_msgsize > 0);
 
@@ -548,7 +548,7 @@ ptllnd_check_sends(ptllnd_peer_t *peer)
 
                 list_del_init(&tx->tx_list);
 
-                PJK_UT_MSG("Sending at TX=%p type=%s (%d)\n",tx,
+                CDEBUG(D_NET, "Sending at TX=%p type=%s (%d)\n",tx,
                         get_msg_type_string(tx->tx_type),tx->tx_type);
 
                 if (tx->tx_type == PTLLND_MSG_TYPE_NOOP &&
@@ -567,7 +567,7 @@ ptllnd_check_sends(ptllnd_peer_t *peer)
                  */
                 tx->tx_msg.ptlm_dststamp = peer->plp_stamp;
 
-                PJK_UT_MSG("Returning %d to peer\n",peer->plp_outstanding_credits);
+                CDEBUG(D_NET, "Returning %d to peer\n",peer->plp_outstanding_credits);
 
                 /*
                  * Return all the credits we have
@@ -611,7 +611,7 @@ ptllnd_check_sends(ptllnd_peer_t *peer)
                 list_add_tail(&tx->tx_list, &plni->plni_active_txs);
         }
 
-        PJK_UT_MSG("<<< peer=%p\n",peer);
+        CDEBUG(D_NET, "<<< peer=%p\n",peer);
 }
 
 int
@@ -629,9 +629,9 @@ ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg,
         int             rc;
         int             rc2;
 
-        PJK_UT_MSG(">>> peer=%p type=%s(%d) tx=%p\n",peer,
+        CDEBUG(D_NET, ">>> peer=%p type=%s(%d) tx=%p\n",peer,
                 type == PTLLND_MSG_TYPE_GET ? "GET" : "PUT/REPLY",type,tx);
-        PJK_UT_MSG("niov=%d offset=%d len=%d\n",niov,offset,len);
+        CDEBUG(D_NET, "niov=%d offset=%d len=%d\n",niov,offset,len);
 
         LASSERT (type == PTLLND_MSG_TYPE_GET ||
                  type == PTLLND_MSG_TYPE_PUT);
@@ -663,7 +663,7 @@ ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg,
         ptllnd_set_md_buffer(&md, tx);
 
         while (!peer->plp_recvd_hello) {        /* wait to validate plp_match */
-                PJK_UT_MSG("Wait For Hello\n");
+                CDEBUG(D_NET, "Wait For Hello\n");
                 if (peer->plp_closing) {
                         rc = -EIO;
                         goto failed;
@@ -674,8 +674,8 @@ ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg,
         if(peer->plp_match < PTL_RESERVED_MATCHBITS)
                 peer->plp_match = PTL_RESERVED_MATCHBITS;
         matchbits = peer->plp_match++;
-        PJK_UT_MSG("matchbits " LPX64 "\n",matchbits);
-        PJK_UT_MSG("nid " FMT_NID " pid=%d\n",peer->plp_ptlid.nid,peer->plp_ptlid.pid);
+        CDEBUG(D_NET, "matchbits " LPX64 "\n",matchbits);
+        CDEBUG(D_NET, "nid " FMT_NID " pid=%d\n",peer->plp_ptlid.nid,peer->plp_ptlid.pid);
 
         rc = PtlMEAttach(plni->plni_nih, plni->plni_portal, peer->plp_ptlid,
                          matchbits, 0, PTL_UNLINK, PTL_INS_BEFORE, &meh);
@@ -686,15 +686,13 @@ ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg,
                 goto failed;
         }
 
-/*
-        PJK_UT_MSG("md.start=%p\n",md.start);
-        PJK_UT_MSG("md.length=%d\n",md.length);
-        PJK_UT_MSG("md.threshold=%d\n",md.threshold);
-        PJK_UT_MSG("md.max_size=%d\n",md.max_size);
-        PJK_UT_MSG("md.options=0x%x\n",md.options);
-        PJK_UT_MSG("md.user_ptr=%p\n",md.user_ptr);
-        PJK_UT_MSG("md.eq_handle=%p\n",md.eq_handle);
-*/
+        CDEBUG(D_NET, "md.start=%p\n",md.start);
+        CDEBUG(D_NET, "md.length=%d\n",md.length);
+        CDEBUG(D_NET, "md.threshold=%d\n",md.threshold);
+        CDEBUG(D_NET, "md.max_size=%d\n",md.max_size);
+        CDEBUG(D_NET, "md.options=0x%x\n",md.options);
+        CDEBUG(D_NET, "md.user_ptr=%p\n",md.user_ptr);
+
         rc = PtlMDAttach(meh, md, LNET_UNLINK, &mdh);
         if (rc != PTL_OK) {
                 CERROR("PtlMDAttach for %s failed: %d\n",
@@ -728,12 +726,12 @@ ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg,
 
         tx->tx_lnetmsg = msg;
         ptllnd_post_tx(tx);
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
         return 0;
 
  failed:
         ptllnd_tx_done(tx);
-        PJK_UT_MSG("<<< rc=%d\n",rc);
+        CDEBUG(D_NET, "<<< rc=%d\n",rc);
         return rc;
 }
 
@@ -750,9 +748,9 @@ ptllnd_active_rdma(ptllnd_peer_t *peer, int type,
         ptl_handle_md_t  mdh;
         int              rc;
 
-        PJK_UT_MSG(">>> peer=%p type=%d tx=%p\n",peer,type,tx);
-        PJK_UT_MSG("niov=%u offset=%u len=%u\n",niov,offset,len);
-        PJK_UT_MSG("matchbits " LPX64 "\n",matchbits);
+        CDEBUG(D_NET, ">>> peer=%p type=%d tx=%p\n",peer,type,tx);
+        CDEBUG(D_NET, "niov=%u offset=%u len=%u\n",niov,offset,len);
+        CDEBUG(D_NET, "matchbits " LPX64 "\n",matchbits);
 
         LASSERT (type == PTLLND_RDMA_READ ||
                  type == PTLLND_RDMA_WRITE);
@@ -805,7 +803,7 @@ ptllnd_active_rdma(ptllnd_peer_t *peer, int type,
                 rc = PtlPut(mdh, PTL_NOACK_REQ, peer->plp_ptlid,
                             plni->plni_portal, 0, matchbits, 0, 0);
         if (rc == 0){
-                PJK_UT_MSG("<<<\n");
+                CDEBUG(D_NET, "<<<\n");
                 return 0;
         }
 
@@ -813,7 +811,7 @@ ptllnd_active_rdma(ptllnd_peer_t *peer, int type,
  failed:
         tx->tx_status = rc;
         ptllnd_tx_done(tx);    /* this will close peer */
-        PJK_UT_MSG("<<< rc=%d\n",rc);
+        CDEBUG(D_NET, "<<< rc=%d\n",rc);
         return rc;
 }
 
@@ -831,11 +829,11 @@ ptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg)
 
         LASSERT (msg->msg_niov <= PTL_MD_MAX_IOV); /* !!! */
 
-        PJK_UT_MSG("msg=%p nid=%s\n",msg,libcfs_nid2str(msg->msg_target.nid));
-        PJK_UT_MSG("is_target_router=%d\n",msg->msg_target_is_router);
-        PJK_UT_MSG("msg_niov=%d\n",msg->msg_niov);
-        PJK_UT_MSG("msg_offset=%d\n",msg->msg_offset);
-        PJK_UT_MSG("msg_len=%d\n",msg->msg_len);
+        CDEBUG(D_NET, "msg=%p nid=%s\n",msg,libcfs_nid2str(msg->msg_target.nid));
+        CDEBUG(D_NET, "is_target_router=%d\n",msg->msg_target_is_router);
+        CDEBUG(D_NET, "msg_niov=%d\n",msg->msg_niov);
+        CDEBUG(D_NET, "msg_offset=%d\n",msg->msg_offset);
+        CDEBUG(D_NET, "msg_len=%d\n",msg->msg_len);
 
         plp = ptllnd_find_peer(ni, msg->msg_target.nid, 1);
         if (plp == NULL)
@@ -846,13 +844,13 @@ ptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg)
                 LBUG();
 
         case LNET_MSG_ACK:
-                PJK_UT_MSG("LNET_MSG_ACK\n");
+                CDEBUG(D_NET, "LNET_MSG_ACK\n");
 
                 LASSERT (msg->msg_len == 0);
                 break;                          /* send IMMEDIATE */
 
         case LNET_MSG_GET:
-                PJK_UT_MSG("LNET_MSG_GET nob=%d\n",msg->msg_md->md_length);
+                CDEBUG(D_NET, "LNET_MSG_GET nob=%d\n",msg->msg_md->md_length);
 
                 if (msg->msg_target_is_router)
                         break;                  /* send IMMEDIATE */
@@ -868,15 +866,15 @@ ptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg)
                                          msg->msg_md->md_iov.iov,
                                          0, msg->msg_md->md_length);
                 ptllnd_peer_decref(plp);
-                PJK_UT_MSG("<<< rc=%d\n",rc);
+                CDEBUG(D_NET, "<<< rc=%d\n",rc);
                 return rc;
 
         case LNET_MSG_REPLY:
         case LNET_MSG_PUT:
-                PJK_UT_MSG("LNET_MSG_PUT nob=%d\n",msg->msg_len);
+                CDEBUG(D_NET, "LNET_MSG_PUT nob=%d\n",msg->msg_len);
                 nob = msg->msg_len;
                 nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[nob]);
-                PJK_UT_MSG("msg_size=%d max=%d\n",msg->msg_len,plp->plp_max_msg_size);
+                CDEBUG(D_NET, "msg_size=%d max=%d\n",msg->msg_len,plp->plp_max_msg_size);
                 if (nob <= plp->plp_max_msg_size)
                         break;                  /* send IMMEDIATE */
 
@@ -884,14 +882,14 @@ ptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg)
                                          msg->msg_niov, msg->msg_iov,
                                          msg->msg_offset, msg->msg_len);
                 ptllnd_peer_decref(plp);
-                PJK_UT_MSG("<<< rc=%d\n",rc);
+                CDEBUG(D_NET, "<<< rc=%d\n",rc);
                 return rc;
         }
 
         /* send IMMEDIATE
          * NB copy the payload so we don't have to do a fragmented send */
 
-        PJK_UT_MSG("IMMEDIATE len=%d\n", msg->msg_len);
+        CDEBUG(D_NET, "IMMEDIATE len=%d\n", msg->msg_len);
         tx = ptllnd_new_tx(plp, PTLLND_MSG_TYPE_IMMEDIATE, msg->msg_len);
         if (tx == NULL) {
                 CERROR("Can't allocate tx for lnet type %d to %s\n",
@@ -909,7 +907,7 @@ ptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg)
         tx->tx_lnetmsg = msg;
         ptllnd_post_tx(tx);
         ptllnd_peer_decref(plp);
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
         return 0;
 }
 
@@ -920,7 +918,7 @@ ptllnd_rx_done(ptllnd_rx_t *rx)
         lnet_ni_t     *ni = plp->plp_ni;
         ptllnd_ni_t   *plni = ni->ni_data;
 
-        PJK_UT_MSG("rx=%p\n", rx);
+        CDEBUG(D_NET, "rx=%p\n", rx);
 
         plp->plp_outstanding_credits++;
         ptllnd_check_sends(rx->rx_peer);
@@ -941,8 +939,8 @@ ptllnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg,
 
         /* Shouldn't get here; recvs only block for router buffers */
         LBUG();
-        
-        PJK_UT_MSG("rx=%p (stack)\n", stackrx);
+
+        CDEBUG(D_NET, "rx=%p (stack)\n", stackrx);
 
         /* Don't ++plni_nrxs: heaprx replaces stackrx */
 
@@ -952,7 +950,7 @@ ptllnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg,
         if (heaprx == NULL)
                 return -ENOMEM;
 
-        PJK_UT_MSG("rx=%p (new heap)\n", stackrx);
+        CDEBUG(D_NET, "rx=%p (new heap)\n", stackrx);
 
         heaprx->rx_msg = (kptl_msg_t *)heaprx->rx_space;
         memcpy(&heaprx->rx_msg, stackrx->rx_msg, stackrx->rx_nob);
@@ -973,11 +971,11 @@ ptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg,
         LASSERT (kiov == NULL);
         LASSERT (niov <= PTL_MD_MAX_IOV);       /* !!! */
 
-        PJK_UT_MSG(">>> msg=%p\n",msg);
-        PJK_UT_MSG("rx=%p rx_nob=%d\n",rx,rx->rx_nob);
-        PJK_UT_MSG("niov=%d\n",niov);
-        PJK_UT_MSG("offset=%d\n",offset);
-        PJK_UT_MSG("mlen=%d rlen=%d\n",mlen,rlen);
+        CDEBUG(D_NET, ">>> msg=%p\n",msg);
+        CDEBUG(D_NET, "rx=%p rx_nob=%d\n",rx,rx->rx_nob);
+        CDEBUG(D_NET, "niov=%d\n",niov);
+        CDEBUG(D_NET, "offset=%d\n",offset);
+        CDEBUG(D_NET, "mlen=%d rlen=%d\n",mlen,rlen);
 
         switch (rx->rx_msg->ptlm_type) {
         default:
@@ -985,7 +983,7 @@ ptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg,
 
         case PTLLND_MSG_TYPE_IMMEDIATE:
                 nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[mlen]);
-                PJK_UT_MSG("PTLLND_MSG_TYPE_IMMEDIATE nob=%d\n",nob);
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE nob=%d\n",nob);
                 if (nob > rx->rx_nob) {
                         CERROR("Immediate message from %s too big: %d(%d)\n",
                                libcfs_nid2str(rx->rx_peer->plp_nid),
@@ -1001,24 +999,24 @@ ptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg,
                 break;
 
         case PTLLND_MSG_TYPE_PUT:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_PUT offset=%d mlen=%d\n",offset,mlen);
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_PUT offset=%d mlen=%d\n",offset,mlen);
                 rc = ptllnd_active_rdma(rx->rx_peer, PTLLND_RDMA_READ, msg,
                                         rx->rx_msg->ptlm_u.req.kptlrm_matchbits,
                                         niov, iov, offset, mlen);
                 break;
 
         case PTLLND_MSG_TYPE_GET:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_GET\n");
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_GET\n");
                 if (msg != NULL) {
                         /* matched! */
-                        PJK_UT_MSG("matchbits="LPX64"\n",
+                        CDEBUG(D_NET, "matchbits="LPX64"\n",
                                    rx->rx_msg->ptlm_u.req.kptlrm_matchbits);
 
                         rc = ptllnd_active_rdma(rx->rx_peer, PTLLND_RDMA_WRITE, msg,
                                                 rx->rx_msg->ptlm_u.req.kptlrm_matchbits,
                                                 msg->msg_niov, msg->msg_iov,
                                                 msg->msg_offset, msg->msg_len);
-                        PJK_UT_MSG("<<< rc=%d\n",rc);
+                        CDEBUG(D_NET, "<<< rc=%d\n",rc);
                         break;
                 } else {
                         ptllnd_close_peer(rx->rx_peer);
@@ -1027,7 +1025,7 @@ ptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg,
         }
 
         ptllnd_rx_done(rx);
-        PJK_UT_MSG("<<< rc=%d\n",rc);
+        CDEBUG(D_NET, "<<< rc=%d\n",rc);
         return rc;
 }
 
@@ -1043,7 +1041,7 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
         int            rc;
 
 
-        PJK_UT_MSG(">>> initiator=%s nob=%d\n",ptllnd_ptlid2str(initiator),nob);
+        CDEBUG(D_NET, ">>> initiator=%s nob=%d\n",ptllnd_ptlid2str(initiator),nob);
 
         if (nob < basenob) {
                 CERROR("Short receive from %s\n",
@@ -1068,7 +1066,7 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
                 __swab64s(&msg->ptlm_seq);
         }
         
-        PJK_UT_MSG_ALWAYS("src = %s\n",libcfs_nid2str(msg->ptlm_srcnid));
+        CDEBUG(D_NET, "src = %s\n",libcfs_nid2str(msg->ptlm_srcnid));
 
         if (msg->ptlm_version != PTLLND_MSG_VERSION) {
                 CERROR("Bad version %d from %s\n", (__u32)msg->ptlm_version,
@@ -1100,7 +1098,7 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
         switch (msg->ptlm_type) {
         case PTLLND_MSG_TYPE_PUT:
         case PTLLND_MSG_TYPE_GET:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_%s\n",
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_%s\n",
                         msg->ptlm_type==PTLLND_MSG_TYPE_PUT ? "PUT" : "GET");
                 if (nob < basenob + sizeof(kptl_request_msg_t)) {
                         CERROR("Short rdma request from %s(%s)\n",
@@ -1113,7 +1111,7 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
                 break;
 
         case PTLLND_MSG_TYPE_IMMEDIATE:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_IMMEDIATE\n");
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n");
                 if (nob < offsetof(kptl_msg_t,
                                    ptlm_u.immediate.kptlim_payload)) {
                         CERROR("Short immediate from %s(%s)\n",
@@ -1124,7 +1122,7 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
                 break;
 
         case PTLLND_MSG_TYPE_HELLO:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_HELLO from %s(%s)\n",
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_HELLO from %s(%s)\n",
                                libcfs_nid2str(msg->ptlm_srcnid),
                                ptllnd_ptlid2str(initiator));
                 if (nob < basenob + sizeof(kptl_hello_msg_t)) {
@@ -1140,7 +1138,7 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
                 break;
                 
         case PTLLND_MSG_TYPE_NOOP:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_NOOP from %s(%s)\n",
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_NOOP from %s(%s)\n",
                                libcfs_nid2str(msg->ptlm_srcnid),
                                ptllnd_ptlid2str(initiator));        
                 break;
@@ -1168,9 +1166,9 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
                         return;
                 }
 
-                PJK_UT_MSG("kptlhm_max_msg_size=%d\n",msg->ptlm_u.hello.kptlhm_max_msg_size);
-                PJK_UT_MSG("kptlhm_matchbits="LPX64"\n",msg->ptlm_u.hello.kptlhm_matchbits);
-                PJK_UT_MSG("ptlm_srcstamp="LPX64"\n",msg->ptlm_srcstamp);
+                CDEBUG(D_NET, "kptlhm_max_msg_size=%d\n",msg->ptlm_u.hello.kptlhm_max_msg_size);
+                CDEBUG(D_NET, "kptlhm_matchbits="LPX64"\n",msg->ptlm_u.hello.kptlhm_matchbits);
+                CDEBUG(D_NET, "ptlm_srcstamp="LPX64"\n",msg->ptlm_srcstamp);
 
                 plp->plp_max_msg_size = MAX(plni->plni_max_msg_size,
                         msg->ptlm_u.hello.kptlhm_max_msg_size);
@@ -1179,7 +1177,7 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
                 plp->plp_max_credits += msg->ptlm_credits;
                 plp->plp_recvd_hello = 1;
 
-                PJK_UT_MSG("plp_max_msg_size=%d\n",plp->plp_max_msg_size);
+                CDEBUG(D_NET, "plp_max_msg_size=%d\n",plp->plp_max_msg_size);
 
         } else if (!plp->plp_recvd_hello) {
 
@@ -1198,7 +1196,7 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
         }
 
         if (msg->ptlm_credits > 0) {
-                PJK_UT_MSG("Getting back %d credits from peer\n",msg->ptlm_credits);
+                CDEBUG(D_NET, "Getting back %d credits from peer\n",msg->ptlm_credits);
                 if (plp->plp_credits + msg->ptlm_credits >
                     plp->plp_max_credits) {
                         CWARN("Too many credits from %s: %d + %d > %d\n",
@@ -1219,7 +1217,7 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
         rx.rx_nob       = nob;
         plni->plni_nrxs++;
 
-        PJK_UT_MSG("rx=%p type=%d\n",&rx,msg->ptlm_type);
+        CDEBUG(D_NET, "rx=%p type=%d\n",&rx,msg->ptlm_type);
 
         switch (msg->ptlm_type) {
         default: /* message types have been checked already */
@@ -1228,20 +1226,20 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
 
         case PTLLND_MSG_TYPE_PUT:
         case PTLLND_MSG_TYPE_GET:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_%s\n",
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_%s\n",
                         msg->ptlm_type==PTLLND_MSG_TYPE_PUT ? "PUT" : "GET");
                 rc = lnet_parse(ni, &msg->ptlm_u.req.kptlrm_hdr,
                                 msg->ptlm_srcnid, &rx, 1);
-                PJK_UT_MSG("lnet_parse rc=%d\n",rc);
+                CDEBUG(D_NET, "lnet_parse rc=%d\n",rc);
                 if (rc < 0)
                         ptllnd_rx_done(&rx);
                 break;
 
         case PTLLND_MSG_TYPE_IMMEDIATE:
-                PJK_UT_MSG("PTLLND_MSG_TYPE_IMMEDIATE\n");
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n");
                 rc = lnet_parse(ni, &msg->ptlm_u.immediate.kptlim_hdr,
                                 msg->ptlm_srcnid, &rx, 0);
-                PJK_UT_MSG("lnet_parse rc=%d\n",rc, 0);
+                CDEBUG(D_NET, "lnet_parse rc=%d\n",rc);
                 if (rc < 0)
                         ptllnd_rx_done(&rx);
                 break;
@@ -1249,7 +1247,7 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
 
         ptllnd_peer_decref(plp);
 
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
 }
 
 void
@@ -1265,7 +1263,7 @@ ptllnd_buf_event (lnet_ni_t *ni, ptl_event_t *event)
         LASSERT (event->type == PTL_EVENT_PUT_END ||
                  event->type == PTL_EVENT_UNLINK);
 
-        PJK_UT_MSG("buf=%p event=%d\n",buf,event->type);
+        CDEBUG(D_NET, "buf=%p event=%d\n",buf,event->type);
 
         if (event->type == PTL_EVENT_PUT_END)
                 ptllnd_parse_request(ni, event->initiator,
@@ -1281,7 +1279,7 @@ ptllnd_buf_event (lnet_ni_t *ni, ptl_event_t *event)
         repost = (event->type == PTL_EVENT_UNLINK);
 #endif
 
-        PJK_UT_MSG("repost=%d unlinked=%d\n",repost,unlinked);
+        CDEBUG(D_NET, "repost=%d unlinked=%d\n",repost,unlinked);
 
         if(unlinked){
                 LASSERT(buf->plb_posted);
@@ -1309,13 +1307,13 @@ ptllnd_tx_event (lnet_ni_t *ni, ptl_event_t *event)
 
         LASSERT (!PtlHandleIsEqual(event->md_handle, PTL_INVALID_HANDLE));
 
-        PJK_UT_MSG("tx=%p type=%s (%d)\n",tx,
+        CDEBUG(D_NET, "tx=%p type=%s (%d)\n",tx,
                 get_msg_type_string(tx->tx_type),tx->tx_type);
-        PJK_UT_MSG("unlinked=%d\n",unlinked);
-        PJK_UT_MSG("error=%d\n",error);
+        CDEBUG(D_NET, "unlinked=%d\n",unlinked);
+        CDEBUG(D_NET, "error=%d\n",error);
 
         isreq = PtlHandleIsEqual(event->md_handle, tx->tx_reqmdh);
-        PJK_UT_MSG("isreq=%d\n",isreq);
+        CDEBUG(D_NET, "isreq=%d\n",isreq);
         if (isreq) {
                 LASSERT (event->md.start == (void *)&tx->tx_msg);
                 if (unlinked)
@@ -1324,7 +1322,7 @@ ptllnd_tx_event (lnet_ni_t *ni, ptl_event_t *event)
 
 
         isbulk = PtlHandleIsEqual(event->md_handle, tx->tx_bulkmdh);
-        PJK_UT_MSG("isbulk=%d\n",isbulk);
+        CDEBUG(D_NET, "isbulk=%d\n",isbulk);
         if ( isbulk && unlinked )
                 tx->tx_bulkmdh = PTL_INVALID_HANDLE;
 
@@ -1381,7 +1379,7 @@ ptllnd_tx_event (lnet_ni_t *ni, ptl_event_t *event)
                         tx->tx_status = -EIO;
                 list_del(&tx->tx_list);
                 list_add_tail(&tx->tx_list, &plni->plni_zombie_txs);
-                PJK_UT_MSG("tx=%p ONTO ZOMBIE LIST\n",tx);
+                CDEBUG(D_NET, "tx=%p ONTO ZOMBIE LIST\n",tx);
         }
 }
 
@@ -1397,7 +1395,7 @@ ptllnd_wait (lnet_ni_t *ni, int milliseconds)
         int            found = 0;
         int            timeout = 0;
 
-        PJK_UT_MSG(">>> ms=%d\n",milliseconds);
+        CDEBUG(D_NET, ">>> ms=%d\n",milliseconds);
 
         /* Handle any currently queued events, returning immediately if any.
          * Otherwise block for the timeout and handle all events queued
@@ -1406,13 +1404,13 @@ ptllnd_wait (lnet_ni_t *ni, int milliseconds)
         for (;;) {
                 rc = PtlEQPoll(&plni->plni_eqh, 1, timeout, &event, &which);
                 timeout = 0;
-                PJK_UT_MSG("PtlEQPoll rc=%d\n",rc);
+                CDEBUG(D_NET, "PtlEQPoll rc=%d\n",rc);
 
                 if (rc == PTL_EQ_EMPTY) {
                         if (found ||            /* handled some events */
                             milliseconds == 0 || /* just checking */
                             blocked){            /* blocked already */
-                                PJK_UT_MSG("found=%d blocked=%d\n",found,blocked);
+                                CDEBUG(D_NET, "found=%d blocked=%d\n",found,blocked);
                                 break;
                             }
 
@@ -1427,7 +1425,7 @@ ptllnd_wait (lnet_ni_t *ni, int milliseconds)
                         CERROR("Event queue: size %d is too small\n",
                                plni->plni_eq_size);
 
-                PJK_UT_MSG("event.type=%s(%d)\n",
+                CDEBUG(D_NET, "event.type=%s(%d)\n",
                         get_ev_type_string(event.type),event.type);
 
                 found = 1;
@@ -1448,9 +1446,9 @@ ptllnd_wait (lnet_ni_t *ni, int milliseconds)
         while (!list_empty(&plni->plni_zombie_txs)) {
                 tx = list_entry(plni->plni_zombie_txs.next,
                                 ptllnd_tx_t, tx_list);
-                PJK_UT_MSG("Process ZOMBIE tx=%p\n",tx);
+                CDEBUG(D_NET, "Process ZOMBIE tx=%p\n",tx);
                 ptllnd_tx_done(tx);
         }
 
-        PJK_UT_MSG("<<<\n");
+        CDEBUG(D_NET, "<<<\n");
 }
index ef5deda..e6ddfbf 100644 (file)
@@ -620,6 +620,7 @@ int jt_dbg_clear_debug_buf(int argc, char **argv)
 
 int jt_dbg_mark_debug_buf(int argc, char **argv)
 {
+        char scratch[MAX_MARK_SIZE];
         int rc, max_size = MAX_MARK_SIZE-1;
         struct libcfs_ioctl_data data;
         char *text;
@@ -627,7 +628,7 @@ int jt_dbg_mark_debug_buf(int argc, char **argv)
 
         if (argc > 1) {
                 int counter;
-                text = malloc(MAX_MARK_SIZE);
+                text = scratch;
                 strncpy(text, argv[1], max_size);
                 max_size-=strlen(argv[1]);
                 for(counter = 2; (counter < argc) && (max_size > 0) ; counter++){
@@ -709,12 +710,11 @@ static struct mod_paths {
         {"mds_ext3", "lustre/mds"},
         {"mds_extN", "lustre/mds"},
         {"ptlbd", "lustre/ptlbd"},
-        {"mgmt_svc", "lustre/mgmt"},
-        {"mgmt_cli", "lustre/mgmt"},
         {"cobd", "lustre/cobd"},
         {"cmobd", "lustre/cmobd"},
-        {"confobd", "lustre/obdclass"},
         {"lquota", "lustre/quota"},
+        {"mgs", "lustre/mgs"},
+        {"mgc", "lustre/mgc"},
         {NULL, NULL}
 };