From: isaac Date: Wed, 4 Jul 2007 16:45:05 +0000 (+0000) Subject: i=liangzhen: X-Git-Tag: v1_7_100~34 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=e38aad0021f4ccd43f5a6659097c45eecb1ecf35 i=liangzhen: - collect session-specific ping error counter. - set brw_inject_errors and session_timeout via environment variables when in userland. - removed tsu_error and renamed tsi_stop_onerr as tsi_stoptsu_onerr. --- diff --git a/lnet/include/lnet/lnetst.h b/lnet/include/lnet/lnetst.h index 768de14..b5e69cb 100644 --- a/lnet/include/lnet/lnetst.h +++ b/lnet/include/lnet/lnetst.h @@ -441,6 +441,7 @@ typedef struct { __u32 active_batches; __u32 zombie_sessions; __u32 brw_errors; + __u32 ping_errors; } sfw_counters_t; #endif diff --git a/lnet/selftest/brw_test.c b/lnet/selftest/brw_test.c index da9d150..78bed3c 100644 --- a/lnet/selftest/brw_test.c +++ b/lnet/selftest/brw_test.c @@ -10,9 +10,7 @@ #include "selftest.h" -static int brw_inject_errors = 0; -CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644, - "# data errors to inject randomly"); +extern int brw_inject_errors; static void brw_client_fini (sfw_test_instance_t *tsi) @@ -20,6 +18,8 @@ brw_client_fini (sfw_test_instance_t *tsi) srpc_bulk_t *bulk; sfw_test_unit_t *tsu; + LASSERT (tsi->tsi_is_client); + list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) { bulk = tsu->tsu_private; if (bulk == NULL) continue; @@ -38,6 +38,8 @@ brw_client_init (sfw_test_instance_t *tsi) srpc_bulk_t *bulk; sfw_test_unit_t *tsu; + LASSERT (tsi->tsi_is_client); + if (npg > LNET_MAX_IOV || npg <= 0) return -EINVAL; @@ -91,8 +93,7 @@ brw_fill_page (cfs_page_t *pg, int pattern, __u64 magic) LASSERT (addr != NULL); - if (pattern == LST_BRW_CHECK_NONE) - return; + if (pattern == LST_BRW_CHECK_NONE) return; if (magic == BRW_MAGIC) magic += brw_inject_one_error(); @@ -231,17 +232,20 @@ brw_client_prep_rpc (sfw_test_unit_t *tsu, static void brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) { - __u64 magic = BRW_MAGIC; - srpc_msg_t *msg = &rpc->crpc_replymsg; - srpc_brw_reply_t *reply = &msg->msg_body.brw_reply; - srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; - sfw_session_t *sn = tsu->tsu_instance->tsi_batch->bat_session; + __u64 magic = BRW_MAGIC; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_msg_t *msg = &rpc->crpc_replymsg; + srpc_brw_reply_t *reply = &msg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; LASSERT (sn != NULL); if (rpc->crpc_status != 0) { CERROR ("BRW RPC to %s failed with %d\n", libcfs_id2str(rpc->crpc_dest), rpc->crpc_status); + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_brw_errors); goto out; } @@ -250,15 +254,13 @@ brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) __swab32s(&reply->brw_status); } - if (tsu->tsu_error == 0) - tsu->tsu_error = -reply->brw_status; - CDEBUG (reply->brw_status ? D_WARNING : D_NET, "BRW RPC to %s finished with brw_status: %d\n", libcfs_id2str(rpc->crpc_dest), reply->brw_status); if (reply->brw_status != 0) { atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -reply->brw_status; goto out; } @@ -267,8 +269,8 @@ brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) { CERROR ("Bulk data from %s is corrupted!\n", libcfs_id2str(rpc->crpc_dest)); - tsu->tsu_error = -EBADMSG; atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -EBADMSG; } out: diff --git a/lnet/selftest/framework.c b/lnet/selftest/framework.c index 7e79455..fc6eaaa 100644 --- a/lnet/selftest/framework.c +++ b/lnet/selftest/framework.c @@ -15,6 +15,10 @@ #include "selftest.h" +int brw_inject_errors = 0; +CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644, + "# data errors to inject randomly, zero by default"); + static int session_timeout = 100; CFS_MODULE_PARM(session_timeout, "i", int, 0444, "test session timeout in seconds (100 by default, 0 == never)"); @@ -41,6 +45,7 @@ do { \ #define sfw_unpack_fw_counters(fc) \ do { \ __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ __swab32s(&(fc).active_tests); \ __swab32s(&(fc).active_batches); \ __swab32s(&(fc).zombie_sessions); \ @@ -179,8 +184,7 @@ sfw_deactivate_session (void) int nactive = 0; sfw_batch_t *tsb; - if (sn == NULL) - return; + if (sn == NULL) return; LASSERT (!sn->sn_timer_active); @@ -215,12 +219,6 @@ sfw_session_removed (void) return (sfw_data.fw_session == NULL) ? 1 : 0; } -void -sfw_set_session_timeout (int timeout) -{ - session_timeout = timeout; -} - #endif void @@ -253,6 +251,7 @@ sfw_init_session (sfw_session_t *sn, lst_sid_t sid, const char *name) CFS_INIT_LIST_HEAD(&sn->sn_list); CFS_INIT_LIST_HEAD(&sn->sn_batches); atomic_set(&sn->sn_brw_errors, 0); + atomic_set(&sn->sn_ping_errors, 0); strncpy(&sn->sn_name[0], name, LST_NAME_SIZE); sn->sn_timer_active = 0; @@ -378,6 +377,7 @@ sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) srpc_get_counters(&reply->str_rpc); cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); cnt->active_tests = cnt->active_batches = 0; @@ -538,8 +538,7 @@ sfw_destroy_test_instance (sfw_test_instance_t *tsi) srpc_client_rpc_t *rpc; sfw_test_unit_t *tsu; - if (!tsi->tsi_is_client) - goto clean; + if (!tsi->tsi_is_client) goto clean; tsi->tsi_ops->tso_fini(tsi); @@ -666,13 +665,13 @@ sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc) CFS_INIT_LIST_HEAD(&tsi->tsi_free_rpcs); CFS_INIT_LIST_HEAD(&tsi->tsi_active_rpcs); - tsi->tsi_stopping = 0; - tsi->tsi_batch = tsb; - tsi->tsi_loop = req->tsr_loop; - tsi->tsi_concur = req->tsr_concur; - tsi->tsi_service = req->tsr_service; - tsi->tsi_is_client = !!(req->tsr_is_client); - tsi->tsi_stop_onerr = !!(req->tsr_stop_onerr); + tsi->tsi_stopping = 0; + tsi->tsi_batch = tsb; + tsi->tsi_loop = req->tsr_loop; + tsi->tsi_concur = req->tsr_concur; + tsi->tsi_service = req->tsr_service; + tsi->tsi_is_client = !!(req->tsr_is_client); + tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr); rc = sfw_load_test(tsi); if (rc != 0) { @@ -791,10 +790,6 @@ sfw_test_rpc_done (srpc_client_rpc_t *rpc) sfw_test_instance_t *tsi = tsu->tsu_instance; int done = 0; - if (rpc->crpc_status != 0 && tsu->tsu_error == 0 && - (rpc->crpc_status != -EINTR || !tsi->tsi_stopping)) - tsu->tsu_error = rpc->crpc_status; - tsi->tsi_ops->tso_done_rpc(tsu, rpc); spin_lock(&tsi->tsi_lock); @@ -807,7 +802,7 @@ sfw_test_rpc_done (srpc_client_rpc_t *rpc) /* batch is stopping or loop is done or get error */ if (tsi->tsi_stopping || tsu->tsu_loop == 0 || - (tsu->tsu_error != 0 && tsi->tsi_stop_onerr)) + (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr)) done = 1; /* dec ref for poster */ @@ -871,8 +866,7 @@ sfw_run_test (swi_workitem_t *wi) LASSERT (wi == &tsu->tsu_worker); - tsu->tsu_error = tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc); - if (tsu->tsu_error != 0) { + if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) { LASSERT (rpc == NULL); goto test_done; } @@ -937,10 +931,7 @@ sfw_run_batch (sfw_batch_t *tsb) list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) { atomic_inc(&tsi->tsi_nactive); - - tsu->tsu_error = 0; - tsu->tsu_loop = tsi->tsi_loop; - + tsu->tsu_loop = tsi->tsi_loop; wi = &tsu->tsu_worker; swi_init_workitem(wi, tsu, sfw_run_test); swi_schedule_workitem(wi); @@ -1517,6 +1508,16 @@ sfw_startup (void) srpc_service_t *sv; sfw_test_case_t *tsc; +#ifndef __KERNEL__ + char *s; + + s = getenv("SESSION_TIMEOUT"); + session_timeout = s != NULL ? atoi(s) : session_timeout; + + s = getenv("BRW_INJECT_ERRORS"); + brw_inject_errors = s != NULL ? atoi(s) : brw_inject_errors; +#endif + if (session_timeout < 0) { CERROR ("Session timeout must be non-negative: %d\n", session_timeout); @@ -1525,7 +1526,7 @@ sfw_startup (void) if (session_timeout == 0) CWARN ("Zero session_timeout specified " - "- test sessions never timeout.\n"); + "- test sessions never expire.\n"); memset(&sfw_data, 0, sizeof(struct smoketest_framework)); diff --git a/lnet/selftest/ping_test.c b/lnet/selftest/ping_test.c index 93fc9cf..6f0334b 100644 --- a/lnet/selftest/ping_test.c +++ b/lnet/selftest/ping_test.c @@ -15,7 +15,6 @@ typedef struct { spinlock_t pnd_lock; /* serialize */ int pnd_counter; /* sequence counter */ - int pnd_err_count; /* error count */ } lst_ping_data_t; static lst_ping_data_t lst_ping_data; @@ -23,18 +22,28 @@ static lst_ping_data_t lst_ping_data; static int ping_client_init(sfw_test_instance_t *tsi) { + LASSERT (tsi->tsi_is_client); + spin_lock_init(&lst_ping_data.pnd_lock); - lst_ping_data.pnd_counter = 0; - lst_ping_data.pnd_err_count = 0; + lst_ping_data.pnd_counter = 0; return 0; } static void -ping_client_fini(sfw_test_instance_t *tsi) +ping_client_fini (sfw_test_instance_t *tsi) { - CWARN("Total ping %d, failed ping: %d\n", - lst_ping_data.pnd_counter, lst_ping_data.pnd_err_count); + sfw_session_t *sn = tsi->tsi_batch->bat_session; + int errors; + + LASSERT (sn != NULL); + LASSERT (tsi->tsi_is_client); + + errors = atomic_read(&sn->sn_ping_errors); + if (errors) + CWARN ("%d pings have failed.\n", errors); + else + CDEBUG (D_NET, "Ping test finished OK.\n"); } static int @@ -65,49 +74,53 @@ ping_client_prep_rpc(sfw_test_unit_t *tsu, } static void -ping_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) +ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) { - srpc_ping_reqst_t *req; - srpc_ping_reply_t *rep; - struct timeval tv; - - req = &rpc->crpc_reqstmsg.msg_body.ping_reqst; - rep = &rpc->crpc_replymsg.msg_body.ping_reply; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_ping_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + srpc_ping_reply_t *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + struct timeval tv; - if (rpc->crpc_status == 0 && - rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) { - __swab32s(&rep->pnr_seq); - __swab32s(&rep->pnr_magic); - __swab32s(&rep->pnr_status); - } + LASSERT (sn != NULL); if (rpc->crpc_status != 0) { + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_ping_errors); CERROR ("Unable to ping %s (%d): %d\n", libcfs_id2str(rpc->crpc_dest), - req->pnr_seq, rpc->crpc_status); - } else if (rep->pnr_magic != LST_PING_TEST_MAGIC) { - tsu->tsu_error = -EBADMSG; + reqst->pnr_seq, rpc->crpc_status); + return; + } + + if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) { + __swab32s(&reply->pnr_seq); + __swab32s(&reply->pnr_magic); + __swab32s(&reply->pnr_status); + } + + if (reply->pnr_magic != LST_PING_TEST_MAGIC) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); CERROR ("Bad magic %u from %s, %u expected.\n", - rep->pnr_magic, libcfs_id2str(rpc->crpc_dest), + reply->pnr_magic, libcfs_id2str(rpc->crpc_dest), LST_PING_TEST_MAGIC); - } else if (rep->pnr_seq != req->pnr_seq) { - tsu->tsu_error = -EBADMSG; + return; + } + + if (reply->pnr_seq != reqst->pnr_seq) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); CERROR ("Bad seq %u from %s, %u expected.\n", - rep->pnr_seq, libcfs_id2str(rpc->crpc_dest), - req->pnr_seq); - } - - if (tsu->tsu_error != 0) { - spin_lock(&lst_ping_data.pnd_lock); - lst_ping_data.pnd_err_count++; - spin_unlock(&lst_ping_data.pnd_lock); + reply->pnr_seq, libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq); return; } cfs_fs_timeval(&tv); - CDEBUG (D_NET, "%d reply in %u usec\n", rep->pnr_seq, - (unsigned)((tv.tv_sec - (unsigned)req->pnr_time_sec) * 1000000 + - (tv.tv_usec - req->pnr_time_usec))); + CDEBUG (D_NET, "%d reply in %u usec\n", reply->pnr_seq, + (unsigned)((tv.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000 + + (tv.tv_usec - reqst->pnr_time_usec))); return; } diff --git a/lnet/selftest/selftest.h b/lnet/selftest/selftest.h index b759e6a..63179ff 100644 --- a/lnet/selftest/selftest.h +++ b/lnet/selftest/selftest.h @@ -316,6 +316,7 @@ typedef struct { struct list_head sn_batches; /* list of batches */ char sn_name[LST_NAME_SIZE]; atomic_t sn_brw_errors; + atomic_t sn_ping_errors; } sfw_session_t; #define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ @@ -347,10 +348,10 @@ typedef struct sfw_test_instance { sfw_test_client_ops_t *tsi_ops; /* test client operations */ /* public parameter for all test units */ - int tsi_is_client:1; /* is test client */ - int tsi_stop_onerr:1; /* stop on error */ - int tsi_concur; /* concurrency */ - int tsi_loop; /* loop count */ + int tsi_is_client:1; /* is test client */ + int tsi_stoptsu_onerr:1; /* stop tsu on error */ + int tsi_concur; /* concurrency */ + int tsi_loop; /* loop count */ /* status of test instance */ spinlock_t tsi_lock; /* serialize */ @@ -377,7 +378,6 @@ typedef struct sfw_test_unit { struct list_head tsu_list; /* chain on lst_test_instance */ lnet_process_id_t tsu_dest; /* id of dest node */ int tsu_loop; /* loop count of the test */ - int tsu_error; /* error code */ sfw_test_instance_t *tsu_instance; /* pointer to test instance */ void *tsu_private; /* private data */ swi_workitem_t tsu_worker; /* workitem of the test unit */ @@ -517,7 +517,6 @@ swi_state2str (int state) int stt_poll_interval(void); int sfw_session_removed(void); -void sfw_set_session_timeout(int timeout); int stt_check_events(void); int swi_check_events(void);