Whamcloud - gitweb
i=maxim,b=18460,b=20171:
author isaac <isaac>
Tue, 1 Dec 2009 15:00:55 +0000 (15:00 +0000)
committer isaac <isaac>
Tue, 1 Dec 2009 15:00:55 +0000 (15:00 +0000)
- avoid asymmetrical router failures; monotonic timing source for router checker.

19 files changed:
lnet/ChangeLog
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/klnds/mxlnd/mxlnd_cb.c
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/klnds/ptllnd/ptllnd.c
lnet/klnds/ptllnd/ptllnd.h
lnet/klnds/ptllnd/ptllnd_peer.c
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/lnet/api-ni.c
lnet/lnet/config.c
lnet/lnet/lib-md.c
lnet/lnet/lib-move.c
lnet/lnet/peer.c
lnet/lnet/router.c
lnet/lnet/router_proc.c

index 84f18b2..8f7a42d 100644 (file)
@@ -17,6 +17,15 @@ Bugzilla   :
 Description: 
 Details    : 
 
 Description: 
 Details    : 
 
+Severity   : normal
+Bugzilla   : 20171
+Description: router checker stops working when system wall clock goes backward
+Details    : use monotonic timing source instead of system wall clock time.
+
+Severity   : enhancement
+Bugzilla   : 18460
+Description: avoid asymmetrical router failures
+
 Severity   : enhancement
 Bugzilla   : 19735
 Description: multiple-instance support for kptllnd
 Severity   : enhancement
 Bugzilla   : 19735
 Description: multiple-instance support for kptllnd
index 87a29c7..5332664 100644 (file)
@@ -537,8 +537,8 @@ lnet_net2ni (__u32 net)
         return ni;
 }
 
         return ni;
 }
 
-int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, time_t when);
-void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, time_t when);
+int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, cfs_time_t when);
+void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when);
 int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid);
 int lnet_check_routes(void);
 int lnet_del_route(__u32 net, lnet_nid_t gw_nid);
 int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid);
 int lnet_check_routes(void);
 int lnet_del_route(__u32 net, lnet_nid_t gw_nid);
@@ -673,8 +673,14 @@ void lnet_acceptor_stop(void);
 void lnet_get_tunables(void);
 int lnet_peers_start_down(void);
 int lnet_peer_buffer_credits(lnet_ni_t *ni);
 void lnet_get_tunables(void);
 int lnet_peers_start_down(void);
 int lnet_peer_buffer_credits(lnet_ni_t *ni);
+
+extern int router_ping_timeout;
+extern int dead_router_check_interval;
+extern int live_router_check_interval;
 int lnet_router_checker_start(void);
 void lnet_router_checker_stop(void);
 int lnet_router_checker_start(void);
 void lnet_router_checker_stop(void);
+void lnet_swap_pinginfo(lnet_ping_info_t *info);
+int lnet_router_down_ni(lnet_peer_t *rtr, __u32 net);
 
 int lnet_ping_target_init(void);
 void lnet_ping_target_fini(void);
 
 int lnet_ping_target_init(void);
 void lnet_ping_target_fini(void);
index 8e5f1a0..9efa5da 100644 (file)
@@ -358,7 +358,7 @@ typedef struct lnet_lnd
         void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive);
 
         /* query of peer aliveness */
         void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive);
 
         /* query of peer aliveness */
-        void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, time_t *when);
+        void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when);
 
 #if defined(__KERNEL__) || defined(HAVE_LIBPTHREAD)
         /* accept a new connection */
 
 #if defined(__KERNEL__) || defined(HAVE_LIBPTHREAD)
         /* accept a new connection */
@@ -374,6 +374,15 @@ typedef struct lnet_lnd
 #endif
 } lnd_t;
 
 #endif
 } lnd_t;
 
+#define LNET_NI_STATUS_UP      0x15aac0de
+#define LNET_NI_STATUS_DOWN    0xdeadface
+#define LNET_NI_STATUS_INVALID 0x00000000
+typedef struct {
+        lnet_nid_t ns_nid;
+        __u32      ns_status;
+        __u32      ns_unused;
+} WIRE_ATTR lnet_ni_status_t;
+
 #define LNET_MAX_INTERFACES   16
 
 typedef struct lnet_ni {
 #define LNET_MAX_INTERFACES   16
 
 typedef struct lnet_ni {
@@ -389,9 +398,31 @@ typedef struct lnet_ni {
         void             *ni_data;              /* instance-specific data */
         lnd_t            *ni_lnd;               /* procedural interface */
         int               ni_refcount;          /* reference count */
         void             *ni_data;              /* instance-specific data */
         lnd_t            *ni_lnd;               /* procedural interface */
         int               ni_refcount;          /* reference count */
+        cfs_time_t        ni_last_alive;        /* when I was last alive */
+        lnet_ni_status_t *ni_status;            /* my health status */
         char             *ni_interfaces[LNET_MAX_INTERFACES]; /* equivalent interfaces to use */
 } lnet_ni_t;
 
         char             *ni_interfaces[LNET_MAX_INTERFACES]; /* equivalent interfaces to use */
 } lnet_ni_t;
 
+#define LNET_PROTO_PING_MATCHBITS     0x8000000000000000LL
+#define LNET_PROTO_PING_VERSION       2
+#define LNET_PROTO_PING_VERSION1      1
+typedef struct {
+        __u32            pi_magic;
+        __u32            pi_version;
+        lnet_pid_t       pi_pid;
+        __u32            pi_nnis;
+        lnet_ni_status_t pi_ni[0];
+} WIRE_ATTR lnet_ping_info_t;
+
+/* router checker data, per router */
+#define LNET_MAX_RTR_NIS   16
+#define LNET_PINGINFO_SIZE offsetof(lnet_ping_info_t, pi_ni[LNET_MAX_RTR_NIS])
+typedef struct {
+        struct list_head  rcd_list;             /* chain on the_lnet.ln_zombie_rcd */
+        lnet_handle_md_t  rcd_mdh;              /* ping buffer MD */
+        lnet_ping_info_t *rcd_pinginfo;         /* ping buffer */
+} lnet_rc_data_t;
+
 typedef struct lnet_peer {
         struct list_head  lp_hashlist;          /* chain on peer hash */
         struct list_head  lp_txq;               /* messages blocking for tx credits */
 typedef struct lnet_peer {
         struct list_head  lp_hashlist;          /* chain on peer hash */
         struct list_head  lp_txq;               /* messages blocking for tx credits */
@@ -408,15 +439,16 @@ typedef struct lnet_peer {
         unsigned int      lp_ping_notsent;      /* SEND event outstanding from ping */
         int               lp_alive_count;       /* # times router went dead<->alive */
         long              lp_txqnob;            /* bytes queued for sending */
         unsigned int      lp_ping_notsent;      /* SEND event outstanding from ping */
         int               lp_alive_count;       /* # times router went dead<->alive */
         long              lp_txqnob;            /* bytes queued for sending */
-        time_t            lp_timestamp;         /* time of last aliveness news */
-        time_t            lp_last_alive;        /* when I was last alive */
-        time_t            lp_last_query;        /* when LND was queried last time */
-        time_t            lp_ping_timestamp;    /* time of last ping attempt */
-        time_t            lp_ping_deadline;     /* != 0 if ping reply expected */
+        cfs_time_t        lp_timestamp;         /* time of last aliveness news */
+        cfs_time_t        lp_ping_timestamp;    /* time of last ping attempt */
+        cfs_time_t        lp_ping_deadline;     /* != 0 if ping reply expected */
+        cfs_time_t        lp_last_alive;        /* when I was last alive */
+        cfs_time_t        lp_last_query;        /* when lp_ni was queried last time */
         lnet_ni_t        *lp_ni;                /* interface peer is on */
         lnet_nid_t        lp_nid;               /* peer's NID */
         int               lp_refcount;          /* # refs */
         int               lp_rtr_refcount;      /* # refs from lnet_route_t::lr_gateway */
         lnet_ni_t        *lp_ni;                /* interface peer is on */
         lnet_nid_t        lp_nid;               /* peer's NID */
         int               lp_refcount;          /* # refs */
         int               lp_rtr_refcount;      /* # refs from lnet_route_t::lr_gateway */
+        lnet_rc_data_t   *lp_rcd;               /* router checker state */
 } lnet_peer_t;
 
 typedef struct {
 } lnet_peer_t;
 
 typedef struct {
@@ -466,16 +498,6 @@ typedef struct {
 
 #define LNET_NRBPOOLS         3                 /* # different router buffer pools */
 
 
 #define LNET_NRBPOOLS         3                 /* # different router buffer pools */
 
-#define LNET_PROTO_PING_MATCHBITS     0x8000000000000000LL
-#define LNET_PROTO_PING_VERSION       1
-typedef struct {
-        __u32          pi_magic;
-        __u32          pi_version;
-        lnet_pid_t     pi_pid;
-        __u32          pi_nnids;
-        lnet_nid_t     pi_nid[0];
-} WIRE_ATTR lnet_ping_info_t;
-
 /* Options for lnet_portal_t::ptl_options */
 #define LNET_PTL_LAZY               (1 << 0)
 typedef struct {
 /* Options for lnet_portal_t::ptl_options */
 #define LNET_PTL_LAZY               (1 << 0)
 typedef struct {
@@ -576,6 +598,7 @@ typedef struct
         int                ln_rc_state;         /* router checker startup/shutdown state */
         lnet_handle_eq_t   ln_rc_eqh;           /* router checker's event queue */
         lnet_handle_md_t   ln_rc_mdh;
         int                ln_rc_state;         /* router checker startup/shutdown state */
         lnet_handle_eq_t   ln_rc_eqh;           /* router checker's event queue */
         lnet_handle_md_t   ln_rc_mdh;
+        struct list_head   ln_zombie_rcd;
 
 #ifdef LNET_USE_LIB_FREELIST
         lnet_freelist_t    ln_free_mes;
 
 #ifdef LNET_USE_LIB_FREELIST
         lnet_freelist_t    ln_free_mes;
index 14b49ed..6dee555 100644 (file)
@@ -484,8 +484,7 @@ mxlnd_conn_disconnect(struct kmx_conn *conn, int mx_dis, int send_bye)
         }
 
         if (kmxlnd_data.kmx_shutdown != 1) {
         }
 
         if (kmxlnd_data.kmx_shutdown != 1) {
-                time_t          last_alive      = 0;
-                unsigned long   last_msg        = 0;
+                unsigned long last_msg = 0;
 
                 /* notify LNET that we are giving up on this peer */
                 if (time_after(conn->mxk_last_rx, conn->mxk_last_tx))
 
                 /* notify LNET that we are giving up on this peer */
                 if (time_after(conn->mxk_last_rx, conn->mxk_last_tx))
@@ -493,9 +492,7 @@ mxlnd_conn_disconnect(struct kmx_conn *conn, int mx_dis, int send_bye)
                 else
                         last_msg = conn->mxk_last_tx;
 
                 else
                         last_msg = conn->mxk_last_tx;
 
-                last_alive = cfs_time_current_sec() -
-                             cfs_duration_sec(cfs_time_current() - last_msg);
-                lnet_notify(kmxlnd_data.kmx_ni, conn->mxk_peer->mxp_nid, 0, last_alive);
+                lnet_notify(kmxlnd_data.kmx_ni, conn->mxk_peer->mxp_nid, 0, last_msg);
 
                 if (mx_dis && valid)
                         mx_disconnect(kmxlnd_data.kmx_endpt, epa);
 
                 if (mx_dis && valid)
                         mx_disconnect(kmxlnd_data.kmx_endpt, epa);
index 419448d..d10041a 100644 (file)
@@ -1075,7 +1075,7 @@ kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 }
 
 void
 }
 
 void
-kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when)
+kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
 {
         cfs_time_t     last_alive = 0;
         rwlock_t      *glock = &kiblnd_data.kib_global_lock;
 {
         cfs_time_t     last_alive = 0;
         rwlock_t      *glock = &kiblnd_data.kib_global_lock;
@@ -1095,8 +1095,7 @@ kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when)
         read_unlock_irqrestore(glock, flags);
 
         if (last_alive != 0)
         read_unlock_irqrestore(glock, flags);
 
         if (last_alive != 0)
-                *when = cfs_time_current_sec() -
-                        cfs_duration_sec(cfs_time_current() - last_alive);
+                *when = last_alive;
 
         /* peer is not persistent in hash, trigger peer creation
          * and connection establishment with a NULL tx */
 
         /* peer is not persistent in hash, trigger peer creation
          * and connection establishment with a NULL tx */
index 7d81190..8ccb4df 100644 (file)
@@ -942,7 +942,7 @@ void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr);
 int  kiblnd_startup (lnet_ni_t *ni);
 void kiblnd_shutdown (lnet_ni_t *ni);
 int  kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
 int  kiblnd_startup (lnet_ni_t *ni);
 void kiblnd_shutdown (lnet_ni_t *ni);
 int  kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
-void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, time_t *when);
+void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
 
 int  kiblnd_tunables_init(void);
 void kiblnd_tunables_fini(void);
 
 int  kiblnd_tunables_init(void);
 void kiblnd_tunables_fini(void);
index ec34090..01c4621 100644 (file)
@@ -1724,8 +1724,8 @@ kiblnd_peer_alive (kib_peer_t *peer)
 void
 kiblnd_peer_notify (kib_peer_t *peer)
 {
 void
 kiblnd_peer_notify (kib_peer_t *peer)
 {
-        time_t        last_alive = 0;
         int           error = 0;
         int           error = 0;
+        cfs_time_t    last_alive = 0;
         unsigned long flags;
 
         read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
         unsigned long flags;
 
         read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
@@ -1737,9 +1737,7 @@ kiblnd_peer_notify (kib_peer_t *peer)
                 error = peer->ibp_error;
                 peer->ibp_error = 0;
 
                 error = peer->ibp_error;
                 peer->ibp_error = 0;
 
-                last_alive = cfs_time_current_sec() -
-                             cfs_duration_sec(cfs_time_current() -
-                                              peer->ibp_last_alive);
+                last_alive = peer->ibp_last_alive;
         }
 
         read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
         }
 
         read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
index 6562561..0abaa37 100755 (executable)
@@ -482,7 +482,7 @@ kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 }
 
 void
 }
 
 void
-kptllnd_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when)
+kptllnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
 {
         kptl_net_t        *net = ni->ni_data;
         kptl_peer_t       *peer = NULL;
 {
         kptl_net_t        *net = ni->ni_data;
         kptl_peer_t       *peer = NULL;
@@ -495,9 +495,7 @@ kptllnd_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when)
 
         spin_lock_irqsave(&peer->peer_lock, flags);
         if (peer->peer_last_alive != 0)
 
         spin_lock_irqsave(&peer->peer_lock, flags);
         if (peer->peer_last_alive != 0)
-                *when = cfs_time_current_sec() -
-                        cfs_duration_sec(cfs_time_current() -
-                                         peer->peer_last_alive);
+                *when = peer->peer_last_alive;
         spin_unlock_irqrestore(&peer->peer_lock, flags);
         kptllnd_peer_decref(peer);
         return;
         spin_unlock_irqrestore(&peer->peer_lock, flags);
         kptllnd_peer_decref(peer);
         return;
index e747812..49b90d3 100755 (executable)
@@ -362,7 +362,7 @@ kptllnd_schedule_ptltrace_dump (void)
 int  kptllnd_startup(lnet_ni_t *ni);
 void kptllnd_shutdown(lnet_ni_t *ni);
 int  kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
 int  kptllnd_startup(lnet_ni_t *ni);
 void kptllnd_shutdown(lnet_ni_t *ni);
 int  kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
-void kptllnd_query (struct lnet_ni *ni, lnet_nid_t nid, time_t *when);
+void kptllnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
 int  kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
 int  kptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
                   int delayed, unsigned int niov, 
 int  kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
 int  kptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
                   int delayed, unsigned int niov, 
index 5d659d8..62ff41f 100644 (file)
@@ -289,17 +289,14 @@ kptllnd_peer_notify (kptl_peer_t *peer)
         int           i = 0;
         int           nnets = 0;
         int           error = 0;
         int           i = 0;
         int           nnets = 0;
         int           error = 0;
-        time_t        last_alive = 0;
+        cfs_time_t    last_alive = 0;
         
         spin_lock_irqsave(&peer->peer_lock, flags);
 
         if (peer->peer_error != 0) {
                 error = peer->peer_error;
                 peer->peer_error = 0;
         
         spin_lock_irqsave(&peer->peer_lock, flags);
 
         if (peer->peer_error != 0) {
                 error = peer->peer_error;
                 peer->peer_error = 0;
-                
-                last_alive = cfs_time_current_sec() - 
-                             cfs_duration_sec(cfs_time_current() - 
-                                              peer->peer_last_alive);
+                last_alive = peer->peer_last_alive;
         }
         
         spin_unlock_irqrestore(&peer->peer_lock, flags);
         }
         
         spin_unlock_irqrestore(&peer->peer_lock, flags);
index a616be1..15bb8cf 100644 (file)
@@ -1495,8 +1495,8 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
 void
 ksocknal_peer_failed (ksock_peer_t *peer)
 {
 void
 ksocknal_peer_failed (ksock_peer_t *peer)
 {
-        time_t    last_alive = 0;
-        int       notify = 0;
+        int        notify = 0;
+        cfs_time_t last_alive = 0;
 
         /* There has been a connection failure or comms error; but I'll only
          * tell LNET I think the peer is dead if it's to another kernel and
 
         /* There has been a connection failure or comms error; but I'll only
          * tell LNET I think the peer is dead if it's to another kernel and
@@ -1509,9 +1509,7 @@ ksocknal_peer_failed (ksock_peer_t *peer)
             peer->ksnp_accepting == 0 &&
             ksocknal_find_connecting_route_locked(peer) == NULL) {
                 notify = 1;
             peer->ksnp_accepting == 0 &&
             ksocknal_find_connecting_route_locked(peer) == NULL) {
                 notify = 1;
-                last_alive = (time_t) (cfs_time_current_sec() -
-                        cfs_duration_sec(cfs_time_current() -
-                                         peer->ksnp_last_alive));
+                last_alive = peer->ksnp_last_alive;
         }
 
         cfs_read_unlock (&ksocknal_data.ksnd_global_lock);
         }
 
         cfs_read_unlock (&ksocknal_data.ksnd_global_lock);
@@ -1792,7 +1790,7 @@ ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive)
 }
 
 void
 }
 
 void
-ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when)
+ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
 {
         int                connect = 1;
         cfs_time_t         last_alive = 0;
 {
         int                connect = 1;
         cfs_time_t         last_alive = 0;
@@ -1829,8 +1827,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when)
         read_unlock(glock);
 
         if (last_alive != 0)
         read_unlock(glock);
 
         if (last_alive != 0)
-                *when = cfs_time_current_sec() -
-                        cfs_duration_sec(cfs_time_current() - last_alive);
+                *when = last_alive;
 
         if (!connect)
                 return;
 
         if (!connect)
                 return;
index e4386ab..a4cec42 100644 (file)
@@ -548,7 +548,7 @@ extern void ksocknal_next_tx_carrier(ksock_conn_t *conn);
 extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
 extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error);
 extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
 extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
 extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error);
 extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
-extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, time_t *when);
+extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
 extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
 extern void ksocknal_thread_fini (void);
 extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer);
 extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
 extern void ksocknal_thread_fini (void);
 extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer);
index 86541a7..75cbbcc 100644 (file)
@@ -1120,9 +1120,11 @@ LNetInit(void)
         memset(&the_lnet, 0, sizeof(the_lnet));
 
         lnet_init_locks();
         memset(&the_lnet, 0, sizeof(the_lnet));
 
         lnet_init_locks();
-        CFS_INIT_LIST_HEAD(&the_lnet.ln_lnds);
         the_lnet.ln_refcount = 0;
         the_lnet.ln_init = 1;
         the_lnet.ln_refcount = 0;
         the_lnet.ln_init = 1;
+        LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
+        CFS_INIT_LIST_HEAD(&the_lnet.ln_lnds);
+        CFS_INIT_LIST_HEAD(&the_lnet.ln_zombie_rcd);
 
 #ifdef __KERNEL__
         /* All LNDs apart from the LOLND are in separate modules.  They
 
 #ifdef __KERNEL__
         /* All LNDs apart from the LOLND are in separate modules.  They
@@ -1207,11 +1209,13 @@ LNetNIInit(lnet_pid_t requested_pid)
         the_lnet.ln_refcount = 1;
         /* Now I may use my own API functions... */
 
         the_lnet.ln_refcount = 1;
         /* Now I may use my own API functions... */
 
-        rc = lnet_router_checker_start();
+        /* NB router checker needs the_lnet.ln_ping_info in
+         * lnet_router_checker -> lnet_update_ni_status */
+        rc = lnet_ping_target_init();
         if (rc != 0)
                 goto failed3;
 
         if (rc != 0)
                 goto failed3;
 
-        rc = lnet_ping_target_init();
+        rc = lnet_router_checker_start();
         if (rc != 0)
                 goto failed4;
 
         if (rc != 0)
                 goto failed4;
 
@@ -1219,7 +1223,7 @@ LNetNIInit(lnet_pid_t requested_pid)
         goto out;
 
  failed4:
         goto out;
 
  failed4:
-        lnet_router_checker_stop();
+        lnet_ping_target_fini();
  failed3:
         the_lnet.ln_refcount = 0;
         lnet_acceptor_stop();
  failed3:
         the_lnet.ln_refcount = 0;
         lnet_acceptor_stop();
@@ -1249,8 +1253,8 @@ LNetNIFini()
                 LASSERT (!the_lnet.ln_niinit_self);
 
                 lnet_proc_fini();
                 LASSERT (!the_lnet.ln_niinit_self);
 
                 lnet_proc_fini();
-                lnet_ping_target_fini();
                 lnet_router_checker_stop();
                 lnet_router_checker_stop();
+                lnet_ping_target_fini();
 
                 /* Teardown fns that use my own API functions BEFORE here */
                 the_lnet.ln_refcount = 0;
 
                 /* Teardown fns that use my own API functions BEFORE here */
                 the_lnet.ln_refcount = 0;
@@ -1299,7 +1303,9 @@ LNetCtl(unsigned int cmd, void *arg)
                                       &data->ioc_nid, &data->ioc_flags);
         case IOC_LIBCFS_NOTIFY_ROUTER:
                 return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
                                       &data->ioc_nid, &data->ioc_flags);
         case IOC_LIBCFS_NOTIFY_ROUTER:
                 return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
-                                   (time_t)data->ioc_u64[0]);
+                                   cfs_time_current() -
+                                   cfs_time_seconds(cfs_time_current_sec() -
+                                                    (time_t)data->ioc_u64[0]));
 
         case IOC_LIBCFS_PORTALS_COMPATIBILITY:
                 /* This can be removed once lustre stops calling it */
 
         case IOC_LIBCFS_PORTALS_COMPATIBILITY:
                 /* This can be removed once lustre stops calling it */
@@ -1404,18 +1410,16 @@ LNetSnprintHandle(char *str, int len, lnet_handle_any_t h)
         snprintf(str, len, LPX64, h.cookie);
 }
 
         snprintf(str, len, LPX64, h.cookie);
 }
 
-
-int
-lnet_ping_target_init(void)
+static int
+lnet_create_ping_info(void)
 {
 {
-        lnet_handle_me_t  meh;
-        lnet_process_id_t id;
-        lnet_md_t         md = {0};
-        int               rc;
-        int               rc2;
+        int               i;
         int               n;
         int               n;
+        int               rc;
         unsigned int      infosz;
         unsigned int      infosz;
-        int               i;
+        lnet_ni_t        *ni;
+        lnet_process_id_t id;
+        lnet_ping_info_t *pinfo;
 
         for (n = 0; ; n++) {
                 rc = LNetGetId(n, &id);
 
         for (n = 0; ; n++) {
                 rc = LNetGetId(n, &id);
@@ -1425,24 +1429,76 @@ lnet_ping_target_init(void)
                 LASSERT (rc == 0);
         }
 
                 LASSERT (rc == 0);
         }
 
-        infosz = offsetof(lnet_ping_info_t, pi_nid[n]);
-        LIBCFS_ALLOC(the_lnet.ln_ping_info, infosz);
-        if (the_lnet.ln_ping_info == NULL) {
+        infosz = offsetof(lnet_ping_info_t, pi_ni[n]);
+        LIBCFS_ALLOC(pinfo, infosz);
+        if (pinfo == NULL) {
                 CERROR("Can't allocate ping info[%d]\n", n);
                 return -ENOMEM;
         }
 
                 CERROR("Can't allocate ping info[%d]\n", n);
                 return -ENOMEM;
         }
 
-        the_lnet.ln_ping_info->pi_magic   = LNET_PROTO_PING_MAGIC;
-        the_lnet.ln_ping_info->pi_version = LNET_PROTO_PING_VERSION;
-        the_lnet.ln_ping_info->pi_pid     = the_lnet.ln_pid;
-        the_lnet.ln_ping_info->pi_nnids   = n;
+        pinfo->pi_nnis    = n;
+        pinfo->pi_pid     = the_lnet.ln_pid;
+        pinfo->pi_magic   = LNET_PROTO_PING_MAGIC;
+        pinfo->pi_version = LNET_PROTO_PING_VERSION;
 
         for (i = 0; i < n; i++) {
 
         for (i = 0; i < n; i++) {
+                lnet_ni_status_t *ns = &pinfo->pi_ni[i];
+
                 rc = LNetGetId(i, &id);
                 LASSERT (rc == 0);
                 rc = LNetGetId(i, &id);
                 LASSERT (rc == 0);
-                the_lnet.ln_ping_info->pi_nid[i] = id.nid;
+
+                ns->ns_nid    = id.nid;
+                ns->ns_status = LNET_NI_STATUS_UP;
+
+                LNET_LOCK();
+
+                ni = lnet_nid2ni_locked(id.nid);
+                LASSERT (ni != NULL);
+                LASSERT (ni->ni_status == NULL);
+                ni->ni_status = ns;
+                lnet_ni_decref_locked(ni);
+
+                LNET_UNLOCK();
         }
 
         }
 
+        the_lnet.ln_ping_info = pinfo;
+        return 0;
+}
+
+static void
+lnet_destroy_ping_info(void)
+{
+        lnet_ni_t *ni;
+
+        LNET_LOCK();
+
+        list_for_each_entry (ni, &the_lnet.ln_nis, ni_list) {
+                ni->ni_status = NULL;
+        }
+
+        LNET_UNLOCK();
+
+        LIBCFS_FREE(the_lnet.ln_ping_info,
+                    offsetof(lnet_ping_info_t,
+                             pi_ni[the_lnet.ln_ping_info->pi_nnis]));
+        the_lnet.ln_ping_info = NULL;
+        return;
+}
+
+int
+lnet_ping_target_init(void)
+{
+        lnet_md_t         md = {0};
+        lnet_handle_me_t  meh;
+        lnet_process_id_t id;
+        int               rc;
+        int               rc2;
+        int               infosz;
+
+        rc = lnet_create_ping_info();
+        if (rc != 0)
+                return rc;
+
         /* We can have a tiny EQ since we only need to see the unlink event on
          * teardown, which by definition is the last one! */
         rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
         /* We can have a tiny EQ since we only need to see the unlink event on
          * teardown, which by definition is the last one! */
         rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
@@ -1465,6 +1521,8 @@ lnet_ping_target_init(void)
         }
 
         /* initialize md content */
         }
 
         /* initialize md content */
+        infosz = offsetof(lnet_ping_info_t,
+                          pi_ni[the_lnet.ln_ping_info->pi_nnis]);
         md.start     = the_lnet.ln_ping_info;
         md.length    = infosz;
         md.threshold = LNET_MD_THRESH_INF;
         md.start     = the_lnet.ln_ping_info;
         md.length    = infosz;
         md.threshold = LNET_MD_THRESH_INF;
@@ -1491,8 +1549,7 @@ lnet_ping_target_init(void)
         rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
         LASSERT (rc2 == 0);
  failed_0:
         rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
         LASSERT (rc2 == 0);
  failed_0:
-        LIBCFS_FREE(the_lnet.ln_ping_info, infosz);
-
+        lnet_destroy_ping_info();
         return rc;
 }
 
         return rc;
 }
 
@@ -1529,11 +1586,7 @@ lnet_ping_target_fini(void)
 
         rc = LNetEQFree(the_lnet.ln_ping_target_eq);
         LASSERT (rc == 0);
 
         rc = LNetEQFree(the_lnet.ln_ping_target_eq);
         LASSERT (rc == 0);
-
-        LIBCFS_FREE(the_lnet.ln_ping_info,
-                    offsetof(lnet_ping_info_t,
-                             pi_nid[the_lnet.ln_ping_info->pi_nnids]));
-
+        lnet_destroy_ping_info();
         cfs_restore_sigs(blocked);
 }
 
         cfs_restore_sigs(blocked);
 }
 
@@ -1548,7 +1601,7 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i
         int                  unlinked = 0;
         int                  replied = 0;
         const int            a_long_time = 60000; /* mS */
         int                  unlinked = 0;
         int                  replied = 0;
         const int            a_long_time = 60000; /* mS */
-        int                  infosz = offsetof(lnet_ping_info_t, pi_nid[n_ids]);
+        int                  infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]);
         lnet_ping_info_t    *info;
         lnet_process_id_t    tmpid;
         int                  i;
         lnet_ping_info_t    *info;
         lnet_process_id_t    tmpid;
         int                  i;
@@ -1642,7 +1695,6 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i
                                 CWARN("ping %s: late network completion\n",
                                       libcfs_id2str(id));
                         }
                                 CWARN("ping %s: late network completion\n",
                                       libcfs_id2str(id));
                         }
-
                 } else if (event.type == LNET_EVENT_REPLY) {
                         replied = 1;
                         rc = event.mlength;
                 } else if (event.type == LNET_EVENT_REPLY) {
                         replied = 1;
                         rc = event.mlength;
@@ -1671,14 +1723,7 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i
         }
 
         if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
         }
 
         if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
-                /* NB I might be swabbing garbage until I check below, but it
-                 * doesn't matter */
-                __swab32s(&info->pi_version);
-                __swab32s(&info->pi_pid);
-                __swab32s(&info->pi_nnids);
-                for (i = 0; i < (int)info->pi_nnids && i < (int)n_ids; i++)
-                        __swab64s(&info->pi_nid[i]);
-
+                lnet_swap_pinginfo(info);
         } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
                 CERROR("%s: Unexpected magic %08x\n", 
                        libcfs_id2str(id), info->pi_magic);
         } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
                 CERROR("%s: Unexpected magic %08x\n", 
                        libcfs_id2str(id), info->pi_magic);
@@ -1691,18 +1736,18 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i
                 goto out_1;
         }
 
                 goto out_1;
         }
 
-        if (nob < (int)offsetof(lnet_ping_info_t, pi_nid[0])) {
+        if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) {
                 CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id),
                 CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id),
-                       nob, (int)offsetof(lnet_ping_info_t, pi_nid[0]));
+                       nob, (int)offsetof(lnet_ping_info_t, pi_ni[0]));
                 goto out_1;
         }
 
                 goto out_1;
         }
 
-        if ((int) info->pi_nnids < n_ids)
-                n_ids = info->pi_nnids;
+        if (info->pi_nnis < n_ids)
+                n_ids = info->pi_nnis;
 
 
-        if (nob < (int)offsetof(lnet_ping_info_t, pi_nid[n_ids])) {
+        if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) {
                 CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id),
                 CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id),
-                       nob, (int)offsetof(lnet_ping_info_t, pi_nid[n_ids]));
+                       nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids]));
                 goto out_1;
         }
 
                 goto out_1;
         }
 
@@ -1710,7 +1755,7 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i
 
         for (i = 0; i < n_ids; i++) {
                 tmpid.pid = info->pi_pid;
 
         for (i = 0; i < n_ids; i++) {
                 tmpid.pid = info->pi_pid;
-                tmpid.nid = info->pi_nid[i];
+                tmpid.nid = info->pi_ni[i].ns_nid;
 #ifdef __KERNEL__
                 if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
                         goto out_1;
 #ifdef __KERNEL__
                 if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
                         goto out_1;
@@ -1718,7 +1763,7 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i
                 ids[i] = tmpid;
 #endif
         }
                 ids[i] = tmpid;
 #endif
         }
-        rc = info->pi_nnids;
+        rc = info->pi_nnis;
 
  out_1:
         rc2 = LNetEQFree(eqh);
 
  out_1:
         rc2 = LNetEQFree(eqh);
index 45bd89f..f8d6dea 100644 (file)
@@ -117,7 +117,7 @@ lnet_net_unique(__u32 net, struct list_head *nilist)
                 if (LNET_NIDNET(ni->ni_nid) == net)
                         return 0;
         }
                 if (LNET_NIDNET(ni->ni_nid) == net)
                         return 0;
         }
-        
+
         return 1;
 }
 
         return 1;
 }
 
@@ -131,20 +131,21 @@ lnet_new_ni(__u32 net, struct list_head *nilist)
                                    libcfs_net2str(net));
                 return NULL;
         }
                                    libcfs_net2str(net));
                 return NULL;
         }
-        
+
         LIBCFS_ALLOC(ni, sizeof(*ni));
         if (ni == NULL) {
                 CERROR("Out of memory creating network %s\n",
                        libcfs_net2str(net));
                 return NULL;
         }
         LIBCFS_ALLOC(ni, sizeof(*ni));
         if (ni == NULL) {
                 CERROR("Out of memory creating network %s\n",
                        libcfs_net2str(net));
                 return NULL;
         }
-        
+
         /* zero counters/flags, NULL pointers... */
         memset(ni, 0, sizeof(*ni));
 
         /* LND will fill in the address part of the NID */
         ni->ni_nid = LNET_MKNID(net, 0);
         CFS_INIT_LIST_HEAD(&ni->ni_txq);
         /* zero counters/flags, NULL pointers... */
         memset(ni, 0, sizeof(*ni));
 
         /* LND will fill in the address part of the NID */
         ni->ni_nid = LNET_MKNID(net, 0);
         CFS_INIT_LIST_HEAD(&ni->ni_txq);
+        ni->ni_last_alive = cfs_time_current();
 
         list_add_tail(&ni->ni_list, nilist);
         return ni;
 
         list_add_tail(&ni->ni_list, nilist);
         return ni;
index 6c988f2..6e64ad2 100644 (file)
@@ -213,6 +213,24 @@ lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd)
 }
 
 int
 }
 
 int
+lnet_md_validate(lnet_md_t *umd)
+{
+        /* Sanity-check a caller-supplied MD descriptor.
+         * Returns 0 when it is acceptable, -EINVAL (after logging the
+         * reason) when it is not. */
+        int vectored;
+
+        /* Nonzero when the MD is flagged as a kiov/iovec vector; only
+         * then is umd->length compared against LNET_MAX_IOV. */
+        vectored = (umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0;
+
+        if (umd->start == NULL) {
+                CERROR("MD start pointer can not be NULL\n");
+                return -EINVAL;
+        }
+
+        if (!vectored || umd->length <= LNET_MAX_IOV)
+                return 0;
+
+        CERROR("Invalid option: too many fragments %d, %d max\n",
+               umd->length, LNET_MAX_IOV);
+        return -EINVAL;
+}
+
+int
 LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
              lnet_unlink_t unlink, lnet_handle_md_t *handle)
 {
 LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
              lnet_unlink_t unlink, lnet_handle_md_t *handle)
 {
@@ -223,12 +241,13 @@ LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
         LASSERT (the_lnet.ln_init);
         LASSERT (the_lnet.ln_refcount > 0);
 
         LASSERT (the_lnet.ln_init);
         LASSERT (the_lnet.ln_refcount > 0);
 
-        if ((umd.options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 &&
-            umd.length > LNET_MAX_IOV) /* too many fragments */
+        if (lnet_md_validate(&umd) != 0)
                 return -EINVAL;
 
                 return -EINVAL;
 
-        if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0)
+        if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) {
+                CERROR("Invalid option: no MD_OP set\n");
                 return -EINVAL;
                 return -EINVAL;
+        }
 
         md = lnet_md_alloc(&umd);
         if (md == NULL)
 
         md = lnet_md_alloc(&umd);
         if (md == NULL)
@@ -274,12 +293,13 @@ LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle)
         LASSERT (the_lnet.ln_init);
         LASSERT (the_lnet.ln_refcount > 0);
 
         LASSERT (the_lnet.ln_init);
         LASSERT (the_lnet.ln_refcount > 0);
 
-        if ((umd.options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 &&
-            umd.length > LNET_MAX_IOV) /* too many fragments */
+        if (lnet_md_validate(&umd) != 0)
                 return -EINVAL;
 
                 return -EINVAL;
 
-        if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0)
+        if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) {
+                CERROR("Invalid option: GET|PUT illegal on active MDs\n");
                 return -EINVAL;
                 return -EINVAL;
+        }
 
         md = lnet_md_alloc(&umd);
         if (md == NULL)
 
         md = lnet_md_alloc(&umd);
         if (md == NULL)
index f31b0fe..743146c 100644 (file)
@@ -903,7 +903,7 @@ lnet_eager_recv_locked(lnet_msg_t *msg)
 void
 lnet_ni_peer_alive(lnet_peer_t *lp)
 {
 void
 lnet_ni_peer_alive(lnet_peer_t *lp)
 {
-        time_t      last_alive = 0;
+        cfs_time_t  last_alive = 0;
         lnet_ni_t  *ni = lp->lp_ni;
 
         LASSERT (ni != NULL);
         lnet_ni_t  *ni = lp->lp_ni;
 
         LASSERT (ni != NULL);
@@ -914,7 +914,7 @@ lnet_ni_peer_alive(lnet_peer_t *lp)
         (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
         LNET_LOCK();
 
         (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
         LNET_LOCK();
 
-        lp->lp_last_query = cfs_time_current_sec();
+        lp->lp_last_query = cfs_time_current();
 
         if (last_alive != 0) /* NI has updated timestamp */
                 lp->lp_last_alive = last_alive;
 
         if (last_alive != 0) /* NI has updated timestamp */
                 lp->lp_last_alive = last_alive;
@@ -923,29 +923,34 @@ lnet_ni_peer_alive(lnet_peer_t *lp)
 
 /* NB: always called with LNET_LOCK held */
 static inline int
 
 /* NB: always called with LNET_LOCK held */
 static inline int
-lnet_peer_is_alive (lnet_peer_t *lp, time_t now)
+lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
 {
         lnet_ni_t  *ni = lp->lp_ni;
 {
         lnet_ni_t  *ni = lp->lp_ni;
-        time_t      deadline;
+        cfs_time_t  deadline;
         int         alive;
 
         LASSERT (ni != NULL);
         LASSERT (ni->ni_peertimeout > 0);
 
         int         alive;
 
         LASSERT (ni != NULL);
         LASSERT (ni->ni_peertimeout > 0);
 
+        /* Trust lnet_notify() if it has more recent aliveness news, but
+         * ignore the initial assumed death (see lnet_peers_start_down()).
+         */
         if (!lp->lp_alive && lp->lp_alive_count > 0 &&
             cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
         if (!lp->lp_alive && lp->lp_alive_count > 0 &&
             cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
-                        return 0;
+                return 0;
 
 
-        deadline = cfs_time_add(lp->lp_last_alive, ni->ni_peertimeout);
+        deadline = cfs_time_add(lp->lp_last_alive,
+                                cfs_time_seconds(ni->ni_peertimeout));
         alive = cfs_time_after(deadline, now);
         alive = cfs_time_after(deadline, now);
-        if (alive && !lp->lp_alive) /* update obsolete lp_alive */
+
+        /* Update obsolete lp_alive */
+        if (alive && !lp->lp_alive && lp->lp_timestamp != 0 &&
+            cfs_time_before(lp->lp_timestamp, lp->lp_last_alive))
                 lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
 
         return alive;
 }
 
                 lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
 
         return alive;
 }
 
-/* don't query LND about aliveness of a dead peer more frequently than: */
-static int lnet_queryinterval = 1; /* 1 second */
 
 /* NB: returns 1 when alive, 0 when dead, negative when error;
  *     may drop the LNET_LOCK */
 
 /* NB: returns 1 when alive, 0 when dead, negative when error;
  *     may drop the LNET_LOCK */
@@ -953,7 +958,7 @@ int
 lnet_peer_alive_locked (lnet_peer_t *lp)
 {
         lnet_ni_t  *ni = lp->lp_ni;
 lnet_peer_alive_locked (lnet_peer_t *lp)
 {
         lnet_ni_t  *ni = lp->lp_ni;
-        time_t      now = cfs_time_current_sec();
+        cfs_time_t  now = cfs_time_current();
 
         LASSERT (ni != NULL);
 
 
         LASSERT (ni != NULL);
 
@@ -963,24 +968,27 @@ lnet_peer_alive_locked (lnet_peer_t *lp)
         if (lnet_peer_is_alive(lp, now))
                 return 1;
 
         if (lnet_peer_is_alive(lp, now))
                 return 1;
 
-        /* peer appears dead, should we query right now? */
+        /* Peer appears dead, but we should avoid frequent NI queries (at
+         * most once per lnet_queryinterval seconds). */
         if (lp->lp_last_query != 0) {
         if (lp->lp_last_query != 0) {
-                time_t deadline =
-                        cfs_time_add(lp->lp_last_query,
-                                     lnet_queryinterval);
+                static const int lnet_queryinterval = 1;
 
 
-                if (cfs_time_before(now, deadline)) {
+                cfs_time_t next_query =
+                           cfs_time_add(lp->lp_last_query,
+                                        cfs_time_seconds(lnet_queryinterval));
+
+                if (cfs_time_before(now, next_query)) {
                         if (lp->lp_alive)
                                 CWARN("Unexpected aliveness of peer %s: "
                                       "%d < %d (%d/%d)\n",
                                       libcfs_nid2str(lp->lp_nid),
                         if (lp->lp_alive)
                                 CWARN("Unexpected aliveness of peer %s: "
                                       "%d < %d (%d/%d)\n",
                                       libcfs_nid2str(lp->lp_nid),
-                                      (int)now, (int)deadline,
+                                      (int)now, (int)next_query,
                                       lnet_queryinterval, ni->ni_peertimeout);
                         return 0;
                 }
         }
 
                                       lnet_queryinterval, ni->ni_peertimeout);
                         return 0;
                 }
         }
 
-        /* query LND for latest aliveness news */
+        /* query NI for latest aliveness news */
         lnet_ni_peer_alive(lp);
 
         if (lnet_peer_is_alive(lp, now))
         lnet_ni_peer_alive(lp);
 
         if (lnet_peer_is_alive(lp, now))
@@ -1392,6 +1400,7 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
                         lp2 = route->lr_gateway;
 
                         if (lp2->lp_alive &&
                         lp2 = route->lr_gateway;
 
                         if (lp2->lp_alive &&
+                            lnet_router_down_ni(lp2, rnet->lrn_net) <= 0 &&
                             (src_ni == NULL || lp2->lp_ni == src_ni) &&
                             (lp == NULL || lnet_compare_routers(lp2, lp) > 0)) {
                                 best_route = route;
                             (src_ni == NULL || lp2->lp_ni == src_ni) &&
                             (lp == NULL || lnet_compare_routers(lp2, lp) > 0)) {
                                 best_route = route;
@@ -2097,7 +2106,6 @@ lnet_print_hdr(lnet_hdr_t * hdr)
 
 }
 
 
 }
 
-
 int
 lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, 
            void *private, int rdma_req)
 int
 lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, 
            void *private, int rdma_req)
@@ -2154,6 +2162,19 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
                 return -EPROTO;
         }
 
                 return -EPROTO;
         }
 
+        if (the_lnet.ln_routing) {
+                cfs_time_t now = cfs_time_current();
+
+                LNET_LOCK();
+
+                ni->ni_last_alive = now;
+                if (ni->ni_status != NULL &&
+                    ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+                        ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+
+                LNET_UNLOCK();
+        }
+
         /* Regard a bad destination NID as a protocol error.  Senders should
          * know what they're doing; if they don't they're misconfigured, buggy
          * or malicious so we chop them off at the knees :) */
         /* Regard a bad destination NID as a protocol error.  Senders should
          * know what they're doing; if they don't they're misconfigured, buggy
          * or malicious so we chop them off at the knees :) */
index d39507b..fd7d31c 100644 (file)
@@ -122,6 +122,7 @@ lnet_destroy_peer_locked (lnet_peer_t *lp)
         LASSERT (lp->lp_rtr_refcount == 0);
        LASSERT (list_empty(&lp->lp_txq));
         LASSERT (lp->lp_txqnob == 0);
         LASSERT (lp->lp_rtr_refcount == 0);
        LASSERT (list_empty(&lp->lp_txq));
         LASSERT (lp->lp_txqnob == 0);
+        LASSERT (lp->lp_rcd == NULL);
 
        LIBCFS_FREE(lp, sizeof(*lp));
 
 
        LIBCFS_FREE(lp, sizeof(*lp));
 
@@ -186,8 +187,8 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid)
         lp->lp_alive_count = 0;
         lp->lp_timestamp = 0;
         lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
         lp->lp_alive_count = 0;
         lp->lp_timestamp = 0;
         lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
-        lp->lp_last_alive = cfs_time_current_sec(); /* assumes alive */
-        lp->lp_last_query = 0; /* didn't ask LND yet */
+        lp->lp_last_alive = cfs_time_current(); /* assumes alive */
+        lp->lp_last_query = 0; /* haven't asked NI yet */
         lp->lp_ping_timestamp = 0;
         lp->lp_nid = nid;
         lp->lp_refcount = 2;                    /* 1 for caller; 1 for hash */
         lp->lp_ping_timestamp = 0;
         lp->lp_nid = nid;
         lp->lp_refcount = 2;                    /* 1 for caller; 1 for hash */
index 9714c21..b89c5f3 100644 (file)
@@ -77,15 +77,19 @@ static int check_routers_before_use = 0;
 CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
                 "Assume routers are down and ping them before use");
 
 CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
                 "Assume routers are down and ping them before use");
 
-static int dead_router_check_interval = 0;
+static int avoid_asym_router_failure = 0;
+CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0444,
+                "Avoid asymmetrical failures: reserved, use at your own risk");
+
+int dead_router_check_interval = 0;
 CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0444,
                 "Seconds between dead router health checks (<= 0 to disable)");
 
 CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0444,
                 "Seconds between dead router health checks (<= 0 to disable)");
 
-static int live_router_check_interval = 0;
+int live_router_check_interval = 0;
 CFS_MODULE_PARM(live_router_check_interval, "i", int, 0444,
                 "Seconds between live router health checks (<= 0 to disable)");
 
 CFS_MODULE_PARM(live_router_check_interval, "i", int, 0444,
                 "Seconds between live router health checks (<= 0 to disable)");
 
-static int router_ping_timeout = 50;
+int router_ping_timeout = 50;
 CFS_MODULE_PARM(router_ping_timeout, "i", int, 0444,
                 "Seconds to wait for the reply to a router health query");
 
 CFS_MODULE_PARM(router_ping_timeout, "i", int, 0444,
                 "Seconds to wait for the reply to a router health query");
 
@@ -96,9 +100,9 @@ lnet_peers_start_down(void)
 }
 
 void
 }
 
 void
-lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, time_t when)
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
 {
 {
-        if (when < lp->lp_timestamp) {          /* out of date information */
+        if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */
                 CDEBUG(D_NET, "Out of date\n");
                 return;
         }
                 CDEBUG(D_NET, "Out of date\n");
                 return;
         }
@@ -201,6 +205,12 @@ lnet_rtr_decref_locked(lnet_peer_t *lp)
 
         lp->lp_rtr_refcount--;
         if (lp->lp_rtr_refcount == 0) {
 
         lp->lp_rtr_refcount--;
         if (lp->lp_rtr_refcount == 0) {
+                if (lp->lp_rcd != NULL) {
+                        list_add(&lp->lp_rcd->rcd_list,
+                                 &the_lnet.ln_zombie_rcd);
+                        lp->lp_rcd = NULL;
+                }
+
                 list_del(&lp->lp_rtr_list);
                 /* decref for the_lnet.ln_routers */
                 lnet_peer_decref_locked(lp);
                 list_del(&lp->lp_rtr_list);
                 /* decref for the_lnet.ln_routers */
                 lnet_peer_decref_locked(lp);
@@ -496,6 +506,103 @@ lnet_get_route (int idx, __u32 *net, __u32 *hops,
 }
 
 void
 }
 
 void
+lnet_swap_pinginfo(lnet_ping_info_t *info)
+{
+        /* Byte-swap a ping reply in place: the header words and then
+         * every NI status entry the reply claims to carry, capped at
+         * LNET_MAX_RTR_NIS so a bogus pi_nnis cannot walk off the
+         * buffer. */
+        int               i;
+        lnet_ni_status_t *stat;
+
+        /* Swab the magic as well.  Callers decide whether to call us by
+         * testing pi_magic against the swabbed magic value; if the magic
+         * were left untouched, a second pass over the same buffer would
+         * swap the already-converted fields back again. */
+        __swab32s(&info->pi_magic);
+        __swab32s(&info->pi_version);
+        __swab32s(&info->pi_pid);
+        __swab32s(&info->pi_nnis);
+        for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+                stat = &info->pi_ni[i];
+                __swab64s(&stat->ns_nid);
+                __swab32s(&stat->ns_status);
+        }
+}
+
+/* Inspect the ping info last received from router 'rtr' and report how
+ * many of its NIs are down.
+ *
+ * Returns:
+ *   >0       number of down NIs (all PTLLND NIs together count as at
+ *            most one, and only when none of them is up);
+ *   0        nothing is down, or an NI on the destination network 'net'
+ *            is up (downed NIs seen so far are then ignored);
+ *   -ENOENT  avoid_asym_router_failure is off, or the router replied
+ *            with ping version 1, which has no NI status;
+ *   -EINVAL  no ping data is attached to the router, or the router is
+ *            not marked alive;
+ *   -EPROTO  the ping info has an unexpected magic, version, NID or
+ *            status value. */
+int
+lnet_router_down_ni(lnet_peer_t *rtr, __u32 net)
+{
+        int               i;
+        int               down = 0;
+        int               ptl_up = 0;
+        int               ptl_down = 0;
+        lnet_ping_info_t *info;
+
+        /* feature disabled by module parameter */
+        if (!avoid_asym_router_failure)
+                return -ENOENT;
+
+        /* no router-checker data attached to this peer */
+        if (rtr->lp_rcd == NULL)
+                return -EINVAL;
+
+        if (!rtr->lp_alive)
+                return -EINVAL;  /* stale lp_rcd */
+
+        info = rtr->lp_rcd->rcd_pinginfo;
+        LASSERT (info != NULL);
+
+        /* NB always racing with network! */
+        /* Swap the reply in place when its magic shows the opposite byte
+         * order; any other magic value is a protocol error. */
+        if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+                lnet_swap_pinginfo(info);
+        } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+                CDEBUG(D_NETERROR, "%s: Unexpected magic %08x\n",
+                       libcfs_nid2str(rtr->lp_nid), info->pi_magic);
+                return -EPROTO;
+        }
+
+        if (info->pi_version == LNET_PROTO_PING_VERSION1)
+                return -ENOENT;  /* v1 doesn't carry NI status info */
+
+        if (info->pi_version != LNET_PROTO_PING_VERSION) {
+                CDEBUG(D_NETERROR, "%s: Unexpected version 0x%x\n",
+                       libcfs_nid2str(rtr->lp_nid), info->pi_version);
+                return -EPROTO;
+        }
+
+        /* Walk the reported NIs, never past LNET_MAX_RTR_NIS entries. */
+        for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+                lnet_ni_status_t *stat = &info->pi_ni[i];
+                lnet_nid_t        nid = stat->ns_nid;
+
+                if (nid == LNET_NID_ANY) {
+                        CDEBUG(D_NETERROR, "%s: unexpected LNET_NID_ANY\n",
+                               libcfs_nid2str(rtr->lp_nid));
+                        return -EPROTO;
+                }
+
+                /* loopback NIs are not counted either way */
+                if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+                        continue;
+
+                if (stat->ns_status == LNET_NI_STATUS_DOWN) {
+                        /* PTLLND NIs are only flagged here; they are
+                         * folded into the result after the loop */
+                        if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+                                ptl_down = 1;
+                        else
+                                down++;
+                        continue;
+                }
+
+                /* anything that is neither DOWN nor UP is invalid */
+                if (stat->ns_status != LNET_NI_STATUS_UP) {
+                        CDEBUG(D_NETERROR, "%s: Unexpected status 0x%x\n",
+                               libcfs_nid2str(rtr->lp_nid), stat->ns_status);
+                        return -EPROTO;
+                }
+
+                /* ignore downed NIs if there's a NI up for dest network */
+                if (LNET_NIDNET(nid) == net)
+                        return 0;
+
+                if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+                        ptl_up = 1;
+        }
+
+        /* ptl NIs are considered down only when they're all down */
+        return down + (ptl_up ? 0 : ptl_down);
+}
+
+void
 lnet_wait_known_routerstate(void)
 {
         lnet_peer_t         *rtr;
 lnet_wait_known_routerstate(void)
 {
         lnet_peer_t         *rtr;
@@ -535,11 +642,17 @@ lnet_router_checker_event (lnet_event_t *event)
         /* CAVEAT EMPTOR: I'm called with LNET_LOCKed and I'm not allowed to
          * drop it (that's how come I see _every_ event, even ones that would
          * overflow my EQ) */
         /* CAVEAT EMPTOR: I'm called with LNET_LOCKed and I'm not allowed to
          * drop it (that's how come I see _every_ event, even ones that would
          * overflow my EQ) */
-        lnet_peer_t   *lp;
-        lnet_nid_t     nid;
+        lnet_rc_data_t *rcd = event->md.user_ptr;
+        lnet_peer_t    *lp;
+        lnet_nid_t      nid;
 
         if (event->unlinked) {
 
         if (event->unlinked) {
-                /* The router checker thread has unlinked the rc_md
+                if (rcd != NULL) {
+                        LNetInvalidateHandle(&rcd->rcd_mdh);
+                        return;
+                }
+
+                /* The router checker thread has unlinked the default rc_md
                  * and exited. */
                 LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKING);
                 the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKED;
                  * and exited. */
                 LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKING);
                 the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKED;
@@ -575,7 +688,7 @@ lnet_router_checker_event (lnet_event_t *event)
                  * apps get burned). */
 
                 lnet_notify_locked(lp, 1, (event->status == 0),
                  * apps get burned). */
 
                 lnet_notify_locked(lp, 1, (event->status == 0),
-                                   cfs_time_current_sec());
+                                   cfs_time_current());
 
                 /* The router checker will wake up very shortly and do the
                  * actual notification.  
 
                 /* The router checker will wake up very shortly and do the
                  * actual notification.  
@@ -591,6 +704,100 @@ lnet_router_checker_event (lnet_event_t *event)
         lnet_peer_decref_locked(lp);
 }
 
         lnet_peer_decref_locked(lp);
 }
 
+/* Refresh the up/down status published for every local NI, based on how
+ * long ago each NI was last seen alive.  Must only run while routing is
+ * enabled; takes and releases LNET_LOCK itself. */
+void
+lnet_update_ni_status(void)
+{
+        cfs_time_t  now = cfs_time_current();
+        lnet_ni_t  *ni;
+        int         timeout;
+
+        LASSERT (the_lnet.ln_routing);
+
+        /* seconds of silence after which an NI is reported down */
+        timeout = router_ping_timeout +
+                  MAX(live_router_check_interval, dead_router_check_interval);
+
+        LNET_LOCK();
+
+        list_for_each_entry (ni, &the_lnet.ln_nis, ni_list) {
+                lnet_ni_status_t *ns = ni->ni_status;
+                int               status;
+
+                LASSERT (ns != NULL);
+
+                if (ni->ni_lnd->lnd_type == LOLND) {
+                        /* @lo forever alive */
+                        status = LNET_NI_STATUS_UP;
+                } else {
+                        cfs_time_t deadline =
+                                cfs_time_add(ni->ni_last_alive,
+                                             cfs_time_seconds(timeout));
+
+                        status = cfs_time_after(now, deadline) ?
+                                 LNET_NI_STATUS_DOWN : LNET_NI_STATUS_UP;
+                }
+
+                /* nothing to record unless the status flipped */
+                if (ns->ns_status == status)
+                        continue;
+
+                ns->ns_status = status;
+                CDEBUG(D_NET, "NI(%s:%d) status changed to %s\n",
+                       libcfs_nid2str(ni->ni_nid), timeout,
+                       status == LNET_NI_STATUS_UP ? "up" : "down");
+        }
+
+        LNET_UNLOCK();
+}
+
+/* Free a router-checker data block together with its ping-info buffer.
+ * The caller must already have taken 'rcd' off any list and its MD
+ * handle must be invalid; both conditions are asserted. */
+void
+lnet_destroy_rc_data (lnet_rc_data_t *rcd)
+{
+        LASSERT (list_empty(&rcd->rcd_list));
+        /* detached from network */
+        LASSERT (LNetHandleIsInvalid(rcd->rcd_mdh));
+
+        /* release the ping buffer first, then the descriptor itself */
+        LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
+        LIBCFS_FREE(rcd, sizeof(*rcd));
+        return;
+}
+
+/* Allocate a router-checker data block: a ping-info buffer with every
+ * NI slot preset to "invalid", bound to an MD on the router checker's
+ * event queue.  Returns the new block, or NULL when an allocation or
+ * the MD bind fails (everything allocated so far is released). */
+lnet_rc_data_t *
+lnet_create_rc_data (void)
+{
+        int               i;
+        int               rc;
+        lnet_md_t         md;
+        lnet_ping_info_t *pi;
+        lnet_rc_data_t   *rcd;
+
+        LIBCFS_ALLOC(rcd, sizeof(*rcd));
+        if (rcd == NULL)
+                return NULL;
+
+        LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
+        if (pi == NULL) {
+                LIBCFS_FREE(rcd, sizeof(*rcd));
+                return NULL;
+        }
+
+        /* start from a clean buffer whose slots all read as "no NI" */
+        memset(pi, 0, LNET_PINGINFO_SIZE);
+        for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
+                lnet_ni_status_t *slot = &pi->pi_ni[i];
+
+                slot->ns_nid    = LNET_NID_ANY;
+                slot->ns_status = LNET_NI_STATUS_INVALID;
+        }
+
+        rcd->rcd_pinginfo = pi;
+        LNetInvalidateHandle(&rcd->rcd_mdh);
+        CFS_INIT_LIST_HEAD(&rcd->rcd_list);
+
+        LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
+
+        /* describe the buffer; fields not set below stay zero */
+        memset(&md, 0, sizeof(md));
+        md.start     = pi;
+        md.user_ptr  = rcd;
+        md.length    = LNET_PINGINFO_SIZE;
+        md.threshold = LNET_MD_THRESH_INF;
+        md.options   = LNET_MD_TRUNCATE;
+        md.eq_handle = the_lnet.ln_rc_eqh;
+
+        rc = LNetMDBind(md, LNET_UNLINK, &rcd->rcd_mdh);
+        if (rc < 0) {
+                CERROR("Can't bind MD: %d\n", rc);
+                lnet_destroy_rc_data(rcd);
+                return NULL;
+        }
+        LASSERT (rc == 0);
+        return rcd;
+}
+
 static int
 lnet_router_check_interval (lnet_peer_t *rtr)
 {
 static int
 lnet_router_check_interval (lnet_peer_t *rtr)
 {
@@ -607,23 +814,42 @@ lnet_router_check_interval (lnet_peer_t *rtr)
 static void
 lnet_ping_router_locked (lnet_peer_t *rtr)
 {
 static void
 lnet_ping_router_locked (lnet_peer_t *rtr)
 {
-        lnet_process_id_t id;
-        int               secs;
-        time_t            now = cfs_time_current_sec();
+        int             newrcd = 0;
+        lnet_rc_data_t *rcd = NULL;
+        cfs_time_t      now = cfs_time_current();
+        int             secs;
 
         lnet_peer_addref_locked(rtr);
 
         if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
 
         lnet_peer_addref_locked(rtr);
 
         if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
-            now > rtr->lp_ping_deadline)
+            cfs_time_after(now, rtr->lp_ping_deadline))
                 lnet_notify_locked(rtr, 1, 0, now);
 
                 lnet_notify_locked(rtr, 1, 0, now);
 
+        if (avoid_asym_router_failure && rtr->lp_rcd == NULL)
+                newrcd = 1;
+
         LNET_UNLOCK();
 
         /* Run any outstanding notifications */
         lnet_do_notify(rtr);
 
         LNET_UNLOCK();
 
         /* Run any outstanding notifications */
         lnet_do_notify(rtr);
 
+        if (newrcd)
+                rcd = lnet_create_rc_data();
+
         LNET_LOCK();
 
         LNET_LOCK();
 
+        if (!lnet_isrouter(rtr)) {
+                lnet_peer_decref_locked(rtr);
+                if (rcd != NULL)
+                        list_add(&rcd->rcd_list, &the_lnet.ln_zombie_rcd);
+                return; /* router table changed! */
+        }
+
+        if (rcd != NULL) {
+                LASSERT (rtr->lp_rcd == NULL);
+                rtr->lp_rcd = rcd;
+        }
+
         secs = lnet_router_check_interval(rtr);
 
         CDEBUG(D_NET,
         secs = lnet_router_check_interval(rtr);
 
         CDEBUG(D_NET,
@@ -634,23 +860,32 @@ lnet_ping_router_locked (lnet_peer_t *rtr)
                rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
 
         if (secs != 0 && !rtr->lp_ping_notsent &&
                rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
 
         if (secs != 0 && !rtr->lp_ping_notsent &&
-            now > rtr->lp_ping_timestamp + secs) {
+            cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+                                             cfs_time_seconds(secs)))) {
+                int               rc;
+                lnet_process_id_t id;
+                lnet_handle_md_t  mdh;
+
                 id.nid = rtr->lp_nid;
                 id.pid = LUSTRE_SRV_LNET_PID;
                 CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
 
                 rtr->lp_ping_notsent   = 1;
                 rtr->lp_ping_timestamp = now;
                 id.nid = rtr->lp_nid;
                 id.pid = LUSTRE_SRV_LNET_PID;
                 CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
 
                 rtr->lp_ping_notsent   = 1;
                 rtr->lp_ping_timestamp = now;
+                mdh = (rtr->lp_rcd == NULL) ? the_lnet.ln_rc_mdh :
+                                              rtr->lp_rcd->rcd_mdh;
 
                 if (rtr->lp_ping_deadline == 0)
 
                 if (rtr->lp_ping_deadline == 0)
-                        rtr->lp_ping_deadline = now + router_ping_timeout;
+                        rtr->lp_ping_deadline = cfs_time_shift(router_ping_timeout);
 
                 LNET_UNLOCK();
 
 
                 LNET_UNLOCK();
 
-                LNetGet(LNET_NID_ANY, the_lnet.ln_rc_mdh, id,
-                        LNET_RESERVED_PORTAL, LNET_PROTO_PING_MATCHBITS, 0);
+                rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
+                             LNET_PROTO_PING_MATCHBITS, 0);
 
                 LNET_LOCK();
 
                 LNET_LOCK();
+                if (rc != 0)
+                        rtr->lp_ping_notsent = 0; /* no event pending */
         }
 
         lnet_peer_decref_locked(rtr);
         }
 
         lnet_peer_decref_locked(rtr);
@@ -732,7 +967,8 @@ lnet_router_checker_start(void)
                 return -EINVAL;
         }
 
                 return -EINVAL;
         }
 
-        if (live_router_check_interval <= 0 &&
+        if (!the_lnet.ln_routing &&
+            live_router_check_interval <= 0 &&
             dead_router_check_interval <= 0)
                 return 0;
 
             dead_router_check_interval <= 0)
                 return 0;
 
@@ -753,6 +989,7 @@ lnet_router_checker_start(void)
         }
 
         memset(&md, 0, sizeof(md));
         }
 
         memset(&md, 0, sizeof(md));
+        md.user_ptr  = NULL;
         md.start     = &pinginfo;
         md.length    = sizeof(pinginfo);
         md.options   = LNET_MD_TRUNCATE;
         md.start     = &pinginfo;
         md.length    = sizeof(pinginfo);
         md.options   = LNET_MD_TRUNCATE;
@@ -824,6 +1061,68 @@ lnet_router_checker_stop (void)
 
 #if defined(__KERNEL__) && defined(LNET_ROUTER)
 
 
 #if defined(__KERNEL__) && defined(LNET_ROUTER)
 
+static void
+lnet_prune_zombie_rcd (int wait_unlink)
+{       /* Unlink and free the rc data entries queued on the_lnet.ln_zombie_rcd. */
+        lnet_rc_data_t   *rcd;
+        lnet_rc_data_t   *tmp;
+        struct list_head  free_rcd; /* entries found unlinked; destroyed after the lock is dropped */
+        int               i;        /* wait-loop pass counter; only selects the log level */
+        __u64             version;  /* snapshot of the_lnet.ln_routers_version taken under the lock */
+        /* wait_unlink != 0: keep retrying until ln_zombie_rcd is empty; 0: one scan only. */
+        CFS_INIT_LIST_HEAD(&free_rcd);
+        /* Scan: entries whose MD handle is invalid move to free_rcd, the rest get LNetMDUnlink(). */
+        LNET_LOCK();
+rescan:
+        version = the_lnet.ln_routers_version;
+        list_for_each_entry_safe (rcd, tmp, &the_lnet.ln_zombie_rcd, rcd_list) {
+                if (LNetHandleIsInvalid(rcd->rcd_mdh)) {
+                        list_del(&rcd->rcd_list);
+                        list_add(&rcd->rcd_list, &free_rcd);
+                        continue;
+                }
+                /* LNetMDUnlink() is called with the LNET lock dropped. */
+                LNET_UNLOCK();
+
+                LNetMDUnlink(rcd->rcd_mdh);
+                /* Retake the lock; restart the scan if ln_routers_version changed meanwhile. */
+                LNET_LOCK();
+                if (version != the_lnet.ln_routers_version)
+                        goto rescan;
+        }
+        /* Wait phase (only when wait_unlink is set): poll the list head until the list drains. */
+        i = 2;
+        while (wait_unlink && !list_empty(&the_lnet.ln_zombie_rcd)) {
+                rcd = list_entry(the_lnet.ln_zombie_rcd.next,
+                                 lnet_rc_data_t, rcd_list);
+                if (LNetHandleIsInvalid(rcd->rcd_mdh)) {
+                        list_del(&rcd->rcd_list);
+                        list_add(&rcd->rcd_list, &free_rcd);
+                        continue;
+                }
+
+                LNET_UNLOCK();
+
+                LNetMDUnlink(rcd->rcd_mdh);
+                /* (i & -i) == i only when i is a power of two: D_WARNING then, D_NET otherwise. */
+                i++;
+                CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                       "Waiting for rc buffers to unlink\n");
+                cfs_pause(cfs_time_seconds(1));
+                /* Retake the lock before re-examining the list head. */
+                LNET_LOCK();
+        }
+
+        LNET_UNLOCK();
+        /* Destroy the collected entries with the LNET lock released. */
+        while (!list_empty(&free_rcd)) {
+                rcd = list_entry(free_rcd.next, lnet_rc_data_t, rcd_list);
+                list_del_init(&rcd->rcd_list);
+                lnet_destroy_rc_data(rcd);
+        }
+        return;
+}
+
 static int
 lnet_router_checker(void *arg)
 {
 static int
 lnet_router_checker(void *arg)
 {
@@ -859,6 +1158,11 @@ rescan:
 
                 LNET_UNLOCK();
 
 
                 LNET_UNLOCK();
 
+                if (the_lnet.ln_routing)
+                        lnet_update_ni_status();
+
+                lnet_prune_zombie_rcd(0); /* don't wait for UNLINK */
+
                 /* Call cfs_pause() here always adds 1 to load average 
                  * because kernel counts # active tasks as nr_running 
                  * + nr_uninterruptible. */
                 /* Call cfs_pause() here always adds 1 to load average 
                  * because kernel counts # active tasks as nr_running 
                  * + nr_uninterruptible. */
@@ -866,6 +1170,23 @@ rescan:
                                      cfs_time_seconds(1));
         }
 
                                      cfs_time_seconds(1));
         }
 
+        LNET_LOCK();
+
+        list_for_each (entry, &the_lnet.ln_routers) {
+                rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+                if (rtr->lp_rcd == NULL)
+                        continue;
+
+                LASSERT (list_empty(&rtr->lp_rcd->rcd_list));
+                list_add(&rtr->lp_rcd->rcd_list, &the_lnet.ln_zombie_rcd);
+                rtr->lp_rcd = NULL;
+        }
+
+        LNET_UNLOCK();
+
+        lnet_prune_zombie_rcd(1); /* wait for UNLINK */
+
         LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_STOPTHREAD);
         the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING;
 
         LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_STOPTHREAD);
         the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING;
 
@@ -1079,10 +1400,10 @@ lnet_alloc_rtrpools(int im_a_router)
 }
 
 int
 }
 
 int
-lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when)
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
 {
 {
-        lnet_peer_t         *lp = NULL;
-        time_t               now = cfs_time_current_sec();
+        lnet_peer_t *lp = NULL;
+        cfs_time_t   now = cfs_time_current();
 
         LASSERT (!in_interrupt ());
 
 
         LASSERT (!in_interrupt ());
 
@@ -1100,12 +1421,12 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when)
         }
 
         /* can't do predictions... */
         }
 
         /* can't do predictions... */
-        if (when > now) {
+        if (cfs_time_after(when, now)) {
                 CWARN ("Ignoring prediction from %s of %s %s "
                        "%ld seconds in the future\n",
                        (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
                        libcfs_nid2str(nid), alive ? "up" : "down",
                 CWARN ("Ignoring prediction from %s of %s %s "
                        "%ld seconds in the future\n",
                        (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
                        libcfs_nid2str(nid), alive ? "up" : "down",
-                       when - now);
+                       cfs_duration_sec(cfs_time_sub(when, now)));
                 return -EINVAL;
         }
 
                 return -EINVAL;
         }
 
@@ -1156,7 +1477,7 @@ lnet_get_tunables (void)
 #else
 
 int
 #else
 
 int
-lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when)
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
 {
         return -EOPNOTSUPP;
 }
 {
         return -EOPNOTSUPP;
 }
index 82a82af..f04ccc9 100644 (file)
@@ -235,9 +235,9 @@ int LL_PROC_PROTO(proc_lnet_routers)
 
         if (*ppos == 0) {
                 s += snprintf(s, tmpstr + tmpsiz - s,
 
         if (*ppos == 0) {
                 s += snprintf(s, tmpstr + tmpsiz - s,
-                              "%-4s %7s %9s %6s %12s %s\n",
-                              "ref", "rtr_ref", "alive_cnt", "state",
-                              "last_ping", "router");
+                              "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n",
+                              "ref", "rtr_ref", "alive_cnt", "state", "last_ping",
+                              "ping_sent", "deadline", "down_ni", "router");
                 LASSERT (tmpstr + tmpsiz - s > 0);
 
                 LNET_LOCK();
                 LASSERT (tmpstr + tmpsiz - s > 0);
 
                 LNET_LOCK();
@@ -272,18 +272,32 @@ int LL_PROC_PROTO(proc_lnet_routers)
                 }
 
                 if (peer != NULL) {
                 }
 
                 if (peer != NULL) {
-                        int        nrefs     = peer->lp_refcount;
-                        int        nrtrrefs  = peer->lp_rtr_refcount;
-                        int        alive_cnt = peer->lp_alive_count;
-                        int        alive     = peer->lp_alive;
-                        time_t     last_ping = peer->lp_ping_timestamp;
-                        lnet_nid_t nid       = peer->lp_nid;
-
-                        s += snprintf(s, tmpstr + tmpsiz - s,
-                                      "%-4d %7d %9d %6s %12lu %s\n",
-                                      nrefs, nrtrrefs,
-                                      alive_cnt, alive ? "up" : "down",
-                                      last_ping, libcfs_nid2str(nid));
+                        lnet_nid_t nid = peer->lp_nid;
+                        cfs_time_t now = cfs_time_current();
+                        cfs_time_t deadline = peer->lp_ping_deadline;
+                        int nrefs     = peer->lp_refcount;
+                        int nrtrrefs  = peer->lp_rtr_refcount;
+                        int alive_cnt = peer->lp_alive_count;
+                        int alive     = peer->lp_alive;
+                        int pingsent  = !peer->lp_ping_notsent;
+                        int last_ping = cfs_duration_sec(now - peer->lp_ping_timestamp);
+                        int down_ni   = lnet_router_down_ni(peer, LNET_NIDNET(LNET_NID_ANY));
+
+                        if (deadline == 0)
+                                s += snprintf(s, tmpstr + tmpsiz - s,
+                                              "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n",
+                                              nrefs, nrtrrefs, alive_cnt,
+                                              alive ? "up" : "down", last_ping,
+                                              pingsent, "NA", down_ni,
+                                              libcfs_nid2str(nid));
+                        else
+                                s += snprintf(s, tmpstr + tmpsiz - s,
+                                              "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n",
+                                              nrefs, nrtrrefs, alive_cnt,
+                                              alive ? "up" : "down", last_ping,
+                                              pingsent,
+                                              cfs_duration_sec(deadline - now),
+                                              down_ni, libcfs_nid2str(nid));
                         LASSERT (tmpstr + tmpsiz - s > 0);
                 }
 
                         LASSERT (tmpstr + tmpsiz - s > 0);
                 }
 
@@ -539,9 +553,9 @@ int LL_PROC_PROTO(proc_lnet_nis)
 
         if (*ppos == 0) {
                 s += snprintf(s, tmpstr + tmpsiz - s,
 
         if (*ppos == 0) {
                 s += snprintf(s, tmpstr + tmpsiz - s,
-                              "%-24s %4s %4s %4s %5s %5s %5s\n",
-                              "nid", "refs", "peer", "rtr", "max",
-                              "tx", "min");
+                              "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n",
+                              "nid", "status", "alive", "refs", "peer",
+                              "rtr", "max", "tx", "min");
                 LASSERT (tmpstr + tmpsiz - s > 0);
         } else {
                 struct list_head  *n;
                 LASSERT (tmpstr + tmpsiz - s > 0);
         } else {
                 struct list_head  *n;
@@ -565,6 +579,7 @@ int LL_PROC_PROTO(proc_lnet_nis)
                 }
 
                 if (ni != NULL) {
                 }
 
                 if (ni != NULL) {
+                        cfs_time_t now = cfs_time_current();
                         int        maxtxcr = ni->ni_maxtxcredits;
                         int        txcr = ni->ni_txcredits;
                         int        mintxcr = ni->ni_mintxcredits;
                         int        maxtxcr = ni->ni_maxtxcredits;
                         int        txcr = ni->ni_txcredits;
                         int        mintxcr = ni->ni_mintxcredits;
@@ -572,10 +587,21 @@ int LL_PROC_PROTO(proc_lnet_nis)
                         int        npeerrtrcr = ni->ni_peerrtrcredits;
                         lnet_nid_t nid = ni->ni_nid;
                         int        nref = ni->ni_refcount;
                         int        npeerrtrcr = ni->ni_peerrtrcredits;
                         lnet_nid_t nid = ni->ni_nid;
                         int        nref = ni->ni_refcount;
+                        int        last_alive;
+                        char      *stat;
+
+                        last_alive = (the_lnet.ln_routing) ?
+                                 cfs_duration_sec(now - ni->ni_last_alive) : -1;
+                        if (ni->ni_lnd->lnd_type == LOLND)  /* @lo forever alive */
+                                last_alive = 0;
+
+                        LASSERT (ni->ni_status != NULL);
+                        stat = (ni->ni_status->ns_status == LNET_NI_STATUS_UP) ?
+                                                                  "up" : "down";
 
                         s += snprintf(s, tmpstr + tmpsiz - s,
 
                         s += snprintf(s, tmpstr + tmpsiz - s,
-                                      "%-24s %4d %4d %4d %5d %5d %5d\n",
-                                      libcfs_nid2str(nid), nref,
+                                      "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n",
+                                      libcfs_nid2str(nid), stat, last_alive, nref,
                                       npeertxcr, npeerrtrcr, maxtxcr,
                                       txcr, mintxcr);
                         LASSERT (tmpstr + tmpsiz - s > 0);
                                       npeertxcr, npeerrtrcr, maxtxcr,
                                       txcr, mintxcr);
                         LASSERT (tmpstr + tmpsiz - s > 0);