Whamcloud - gitweb
i=liangzhen:
author isaac <isaac>
Wed, 4 Jul 2007 16:45:05 +0000 (16:45 +0000)
committer isaac <isaac>
Wed, 4 Jul 2007 16:45:05 +0000 (16:45 +0000)
-   collect a session-specific ping error counter.
-   set brw_inject_errors and session_timeout via environment variables
    when in userland.
-   removed tsu_error and renamed tsi_stop_onerr to tsi_stoptsu_onerr.

lnet/include/lnet/lnetst.h
lnet/selftest/brw_test.c
lnet/selftest/framework.c
lnet/selftest/ping_test.c
lnet/selftest/selftest.h

index 768de14..b5e69cb 100644 (file)
@@ -441,6 +441,7 @@ typedef struct {
         __u32 active_batches;
         __u32 zombie_sessions;
         __u32 brw_errors;
+        __u32 ping_errors;
 } sfw_counters_t;
 
 #endif
index da9d150..78bed3c 100644 (file)
@@ -10,9 +10,7 @@
 #include "selftest.h"
 
 
-static int brw_inject_errors = 0;
-CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644,
-                "# data errors to inject randomly");
+extern int brw_inject_errors;
 
 static void
 brw_client_fini (sfw_test_instance_t *tsi)
@@ -20,6 +18,8 @@ brw_client_fini (sfw_test_instance_t *tsi)
         srpc_bulk_t     *bulk;
         sfw_test_unit_t *tsu;
 
+        LASSERT (tsi->tsi_is_client);
+
         list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
                 bulk = tsu->tsu_private;
                 if (bulk == NULL) continue;
@@ -38,6 +38,8 @@ brw_client_init (sfw_test_instance_t *tsi)
         srpc_bulk_t      *bulk;
         sfw_test_unit_t  *tsu;
 
+        LASSERT (tsi->tsi_is_client);
+
         if (npg > LNET_MAX_IOV || npg <= 0)
                 return -EINVAL;
 
@@ -91,8 +93,7 @@ brw_fill_page (cfs_page_t *pg, int pattern, __u64 magic)
 
         LASSERT (addr != NULL);
 
-        if (pattern == LST_BRW_CHECK_NONE)
-                return;
+        if (pattern == LST_BRW_CHECK_NONE) return;
 
         if (magic == BRW_MAGIC)
                 magic += brw_inject_one_error();
@@ -231,17 +232,20 @@ brw_client_prep_rpc (sfw_test_unit_t *tsu,
 static void
 brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
 {
-        __u64             magic = BRW_MAGIC;
-        srpc_msg_t       *msg = &rpc->crpc_replymsg;
-        srpc_brw_reply_t *reply = &msg->msg_body.brw_reply;
-        srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
-        sfw_session_t    *sn = tsu->tsu_instance->tsi_batch->bat_session;
+        __u64                magic = BRW_MAGIC;
+        sfw_test_instance_t *tsi = tsu->tsu_instance;
+        sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+        srpc_msg_t          *msg = &rpc->crpc_replymsg;
+        srpc_brw_reply_t    *reply = &msg->msg_body.brw_reply;
+        srpc_brw_reqst_t    *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
 
         LASSERT (sn != NULL);
 
         if (rpc->crpc_status != 0) {
                 CERROR ("BRW RPC to %s failed with %d\n",
                         libcfs_id2str(rpc->crpc_dest), rpc->crpc_status);
+                if (!tsi->tsi_stopping) /* rpc could have been aborted */
+                        atomic_inc(&sn->sn_brw_errors);
                 goto out;
         }
 
@@ -250,15 +254,13 @@ brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
                 __swab32s(&reply->brw_status);
         }
 
-        if (tsu->tsu_error == 0)
-                tsu->tsu_error = -reply->brw_status;
-
         CDEBUG (reply->brw_status ? D_WARNING : D_NET,
                 "BRW RPC to %s finished with brw_status: %d\n",
                 libcfs_id2str(rpc->crpc_dest), reply->brw_status);
 
         if (reply->brw_status != 0) {
                 atomic_inc(&sn->sn_brw_errors);
+                rpc->crpc_status = -reply->brw_status;
                 goto out;
         }
 
@@ -267,8 +269,8 @@ brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
         if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) {
                 CERROR ("Bulk data from %s is corrupted!\n",
                         libcfs_id2str(rpc->crpc_dest));
-                tsu->tsu_error = -EBADMSG;
                 atomic_inc(&sn->sn_brw_errors);
+                rpc->crpc_status = -EBADMSG;
         }
 
 out:
index 7e79455..fc6eaaa 100644 (file)
 #include "selftest.h"
 
 
+int brw_inject_errors = 0;
+CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644,
+                "# data errors to inject randomly, zero by default");
+
 static int session_timeout = 100;
 CFS_MODULE_PARM(session_timeout, "i", int, 0444,
                 "test session timeout in seconds (100 by default, 0 == never)");
@@ -41,6 +45,7 @@ do {                                    \
 #define sfw_unpack_fw_counters(fc)        \
 do {                                      \
         __swab32s(&(fc).brw_errors);      \
+        __swab32s(&(fc).ping_errors);     \
         __swab32s(&(fc).active_tests);    \
         __swab32s(&(fc).active_batches);  \
         __swab32s(&(fc).zombie_sessions); \
@@ -179,8 +184,7 @@ sfw_deactivate_session (void)
         int            nactive = 0;
         sfw_batch_t   *tsb;
 
-        if (sn == NULL)
-                return;
+        if (sn == NULL) return;
 
         LASSERT (!sn->sn_timer_active);
 
@@ -215,12 +219,6 @@ sfw_session_removed (void)
         return (sfw_data.fw_session == NULL) ? 1 : 0;
 }
 
-void
-sfw_set_session_timeout (int timeout)
-{
-        session_timeout = timeout;
-}
-
 #endif
 
 void
@@ -253,6 +251,7 @@ sfw_init_session (sfw_session_t *sn, lst_sid_t sid, const char *name)
         CFS_INIT_LIST_HEAD(&sn->sn_list);
         CFS_INIT_LIST_HEAD(&sn->sn_batches);
         atomic_set(&sn->sn_brw_errors, 0);
+        atomic_set(&sn->sn_ping_errors, 0);
         strncpy(&sn->sn_name[0], name, LST_NAME_SIZE);
 
         sn->sn_timer_active = 0;
@@ -378,6 +377,7 @@ sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply)
         srpc_get_counters(&reply->str_rpc);
 
         cnt->brw_errors      = atomic_read(&sn->sn_brw_errors);
+        cnt->ping_errors     = atomic_read(&sn->sn_ping_errors);
         cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies);
 
         cnt->active_tests = cnt->active_batches = 0;
@@ -538,8 +538,7 @@ sfw_destroy_test_instance (sfw_test_instance_t *tsi)
         srpc_client_rpc_t *rpc;
         sfw_test_unit_t   *tsu;
 
-        if (!tsi->tsi_is_client)
-                goto clean;
+        if (!tsi->tsi_is_client) goto clean;
 
         tsi->tsi_ops->tso_fini(tsi);
 
@@ -666,13 +665,13 @@ sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc)
         CFS_INIT_LIST_HEAD(&tsi->tsi_free_rpcs);
         CFS_INIT_LIST_HEAD(&tsi->tsi_active_rpcs);
 
-        tsi->tsi_stopping   = 0;
-        tsi->tsi_batch      = tsb;
-        tsi->tsi_loop       = req->tsr_loop;
-        tsi->tsi_concur     = req->tsr_concur;
-        tsi->tsi_service    = req->tsr_service;
-        tsi->tsi_is_client  = !!(req->tsr_is_client);
-        tsi->tsi_stop_onerr = !!(req->tsr_stop_onerr);
+        tsi->tsi_stopping      = 0;
+        tsi->tsi_batch         = tsb;
+        tsi->tsi_loop          = req->tsr_loop;
+        tsi->tsi_concur        = req->tsr_concur;
+        tsi->tsi_service       = req->tsr_service;
+        tsi->tsi_is_client     = !!(req->tsr_is_client);
+        tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr);
 
         rc = sfw_load_test(tsi);
         if (rc != 0) {
@@ -791,10 +790,6 @@ sfw_test_rpc_done (srpc_client_rpc_t *rpc)
         sfw_test_instance_t *tsi = tsu->tsu_instance;
         int                  done = 0;
 
-        if (rpc->crpc_status != 0 && tsu->tsu_error == 0 &&
-            (rpc->crpc_status != -EINTR || !tsi->tsi_stopping))
-                tsu->tsu_error = rpc->crpc_status;
-
         tsi->tsi_ops->tso_done_rpc(tsu, rpc);
                       
         spin_lock(&tsi->tsi_lock);
@@ -807,7 +802,7 @@ sfw_test_rpc_done (srpc_client_rpc_t *rpc)
         /* batch is stopping or loop is done or get error */
         if (tsi->tsi_stopping ||
             tsu->tsu_loop == 0 ||
-            (tsu->tsu_error != 0 && tsi->tsi_stop_onerr))
+            (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr))
                 done = 1;
 
         /* dec ref for poster */
@@ -871,8 +866,7 @@ sfw_run_test (swi_workitem_t *wi)
 
         LASSERT (wi == &tsu->tsu_worker);
 
-        tsu->tsu_error = tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc);
-        if (tsu->tsu_error != 0) {
+        if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) {
                 LASSERT (rpc == NULL);
                 goto test_done;
         }
@@ -937,10 +931,7 @@ sfw_run_batch (sfw_batch_t *tsb)
 
                 list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
                         atomic_inc(&tsi->tsi_nactive);
-
-                        tsu->tsu_error = 0;
-                        tsu->tsu_loop  = tsi->tsi_loop;
-
+                        tsu->tsu_loop = tsi->tsi_loop;
                         wi = &tsu->tsu_worker;
                         swi_init_workitem(wi, tsu, sfw_run_test);
                         swi_schedule_workitem(wi);
@@ -1517,6 +1508,16 @@ sfw_startup (void)
         srpc_service_t  *sv;
         sfw_test_case_t *tsc;
 
+#ifndef __KERNEL__
+        char *s;
+
+        s = getenv("SESSION_TIMEOUT");
+        session_timeout = s != NULL ? atoi(s) : session_timeout;
+
+        s = getenv("BRW_INJECT_ERRORS");
+        brw_inject_errors = s != NULL ? atoi(s) : brw_inject_errors;
+#endif
+
         if (session_timeout < 0) {
                 CERROR ("Session timeout must be non-negative: %d\n",
                         session_timeout);
@@ -1525,7 +1526,7 @@ sfw_startup (void)
 
         if (session_timeout == 0)
                 CWARN ("Zero session_timeout specified "
-                       "- test sessions never timeout.\n");
+                       "- test sessions never expire.\n");
 
         memset(&sfw_data, 0, sizeof(struct smoketest_framework));
 
index 93fc9cf..6f0334b 100644 (file)
@@ -15,7 +15,6 @@
 typedef struct {
         spinlock_t      pnd_lock;       /* serialize */
         int             pnd_counter;    /* sequence counter */
-        int             pnd_err_count;  /* error count */
 } lst_ping_data_t;
 
 static lst_ping_data_t  lst_ping_data;
@@ -23,18 +22,28 @@ static lst_ping_data_t  lst_ping_data;
 static int
 ping_client_init(sfw_test_instance_t *tsi)
 {
+        LASSERT (tsi->tsi_is_client);
+
         spin_lock_init(&lst_ping_data.pnd_lock);
-        lst_ping_data.pnd_counter   = 0;
-        lst_ping_data.pnd_err_count = 0;
+        lst_ping_data.pnd_counter = 0;
 
         return 0;
 }
 
 static void
-ping_client_fini(sfw_test_instance_t *tsi)
+ping_client_fini (sfw_test_instance_t *tsi)
 {
-        CWARN("Total ping %d, failed ping: %d\n",
-              lst_ping_data.pnd_counter, lst_ping_data.pnd_err_count);
+        sfw_session_t *sn = tsi->tsi_batch->bat_session;
+        int            errors;
+
+        LASSERT (sn != NULL);
+        LASSERT (tsi->tsi_is_client);
+
+        errors = atomic_read(&sn->sn_ping_errors);
+        if (errors)
+                CWARN ("%d pings have failed.\n", errors);
+        else
+                CDEBUG (D_NET, "Ping test finished OK.\n");
 }
 
 static int
@@ -65,49 +74,53 @@ ping_client_prep_rpc(sfw_test_unit_t *tsu,
 }
 
 static void
-ping_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
 {
-        srpc_ping_reqst_t *req;
-        srpc_ping_reply_t *rep;
-        struct timeval     tv;
-
-        req = &rpc->crpc_reqstmsg.msg_body.ping_reqst;
-        rep = &rpc->crpc_replymsg.msg_body.ping_reply;
+        sfw_test_instance_t *tsi = tsu->tsu_instance;
+        sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+        srpc_ping_reqst_t   *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst;
+        srpc_ping_reply_t   *reply = &rpc->crpc_replymsg.msg_body.ping_reply;
+        struct timeval       tv;
 
-        if (rpc->crpc_status == 0 &&
-            rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) {
-                __swab32s(&rep->pnr_seq);
-                __swab32s(&rep->pnr_magic);
-                __swab32s(&rep->pnr_status);
-        }
+        LASSERT (sn != NULL);
 
         if (rpc->crpc_status != 0) {
+                if (!tsi->tsi_stopping) /* rpc could have been aborted */
+                        atomic_inc(&sn->sn_ping_errors);
                 CERROR ("Unable to ping %s (%d): %d\n",
                         libcfs_id2str(rpc->crpc_dest),
-                        req->pnr_seq, rpc->crpc_status);
-        } else if (rep->pnr_magic != LST_PING_TEST_MAGIC) {
-                tsu->tsu_error = -EBADMSG;
+                        reqst->pnr_seq, rpc->crpc_status);
+                return;
+        } 
+
+        if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) {
+                __swab32s(&reply->pnr_seq);
+                __swab32s(&reply->pnr_magic);
+                __swab32s(&reply->pnr_status);
+        }
+        
+        if (reply->pnr_magic != LST_PING_TEST_MAGIC) {
+                rpc->crpc_status = -EBADMSG;
+                atomic_inc(&sn->sn_ping_errors);
                 CERROR ("Bad magic %u from %s, %u expected.\n",
-                        rep->pnr_magic, libcfs_id2str(rpc->crpc_dest),
+                        reply->pnr_magic, libcfs_id2str(rpc->crpc_dest),
                         LST_PING_TEST_MAGIC);
-        } else if (rep->pnr_seq != req->pnr_seq) {
-                tsu->tsu_error = -EBADMSG;
+                return;
+        } 
+        
+        if (reply->pnr_seq != reqst->pnr_seq) {
+                rpc->crpc_status = -EBADMSG;
+                atomic_inc(&sn->sn_ping_errors);
                 CERROR ("Bad seq %u from %s, %u expected.\n",
-                        rep->pnr_seq, libcfs_id2str(rpc->crpc_dest),
-                        req->pnr_seq);
-        }
-
-        if (tsu->tsu_error != 0) {
-                spin_lock(&lst_ping_data.pnd_lock);
-                lst_ping_data.pnd_err_count++;
-                spin_unlock(&lst_ping_data.pnd_lock);
+                        reply->pnr_seq, libcfs_id2str(rpc->crpc_dest),
+                        reqst->pnr_seq);
                 return;
         }
 
         cfs_fs_timeval(&tv);
-        CDEBUG (D_NET, "%d reply in %u usec\n", rep->pnr_seq,
-                (unsigned)((tv.tv_sec - (unsigned)req->pnr_time_sec) * 1000000 +
-                           (tv.tv_usec - req->pnr_time_usec)));
+        CDEBUG (D_NET, "%d reply in %u usec\n", reply->pnr_seq,
+                (unsigned)((tv.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000
+                           + (tv.tv_usec - reqst->pnr_time_usec)));
         return;
 }
 
index b759e6a..63179ff 100644 (file)
@@ -316,6 +316,7 @@ typedef struct {
         struct list_head  sn_batches; /* list of batches */
         char              sn_name[LST_NAME_SIZE];
         atomic_t          sn_brw_errors;
+        atomic_t          sn_ping_errors;
 } sfw_session_t;
 
 #define sfw_sid_equal(sid0, sid1)     ((sid0).ses_nid == (sid1).ses_nid && \
@@ -347,10 +348,10 @@ typedef struct sfw_test_instance {
         sfw_test_client_ops_t  *tsi_ops;          /* test client operations */
 
         /* public parameter for all test units */
-        int                     tsi_is_client:1;  /* is test client */
-        int                     tsi_stop_onerr:1; /* stop on error */
-        int                     tsi_concur;       /* concurrency */
-        int                     tsi_loop;         /* loop count */
+        int                     tsi_is_client:1;     /* is test client */
+        int                     tsi_stoptsu_onerr:1; /* stop tsu on error */
+        int                     tsi_concur;          /* concurrency */
+        int                     tsi_loop;            /* loop count */
 
         /* status of test instance */
         spinlock_t              tsi_lock;         /* serialize */
@@ -377,7 +378,6 @@ typedef struct sfw_test_unit {
         struct list_head        tsu_list;         /* chain on lst_test_instance */
         lnet_process_id_t       tsu_dest;         /* id of dest node */
         int                     tsu_loop;         /* loop count of the test */
-        int                     tsu_error;        /* error code */
         sfw_test_instance_t    *tsu_instance;     /* pointer to test instance */
         void                   *tsu_private;      /* private data */
         swi_workitem_t          tsu_worker;       /* workitem of the test unit */
@@ -517,7 +517,6 @@ swi_state2str (int state)
 
 int stt_poll_interval(void);
 int sfw_session_removed(void);
-void sfw_set_session_timeout(int timeout);
 
 int stt_check_events(void);
 int swi_check_events(void);