From 7b8487f170af5ddb80ffe7a3b6e2655550006ff8 Mon Sep 17 00:00:00 2001 From: Jian Yu Date: Fri, 1 Aug 2014 01:41:17 -0700 Subject: [PATCH] LU-5443 lnet: replace direct HZ access with kernel APIs MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit On some customers’ systems, the kernel was compiled with HZ defined as 100, instead of 1000. This improves performance for HPC applications. However, to use these systems with Lustre, customers have to re-build Lustre for the kernel because Lustre directly uses the defined constant HZ. Since kernel 2.6.21, some non-HZ dependent timing APIs have become non-inline functions, which can be used in Lustre code to replace the direct HZ access. These kernel APIs include: jiffies_to_msecs(), jiffies_to_usecs(), jiffies_to_timespec(), msecs_to_jiffies(), usecs_to_jiffies(), timespec_to_jiffies(). And here are some samples of the replacement: HZ -> msecs_to_jiffies(MSEC_PER_SEC); n * HZ -> msecs_to_jiffies(n * MSEC_PER_SEC); HZ / n -> msecs_to_jiffies(MSEC_PER_SEC / n); n / HZ -> jiffies_to_msecs(n) / MSEC_PER_SEC; n / HZ * 1000 -> jiffies_to_msecs(n). This patch replaces the direct HZ access in the lnet module. 
Signed-off-by: Jian Yu Change-Id: I0be6c82636df08b0a0a763ea31dafa817c077fe1 Reviewed-on: http://review.whamcloud.com/11303 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Nathaniel Clark Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- lnet/klnds/gnilnd/gnilnd.h | 3 +-- lnet/klnds/mxlnd/mxlnd.c | 7 ++++--- lnet/klnds/mxlnd/mxlnd.h | 9 ++++++--- lnet/klnds/mxlnd/mxlnd_cb.c | 14 ++++++------- lnet/klnds/o2iblnd/o2iblnd.h | 3 ++- lnet/klnds/o2iblnd/o2iblnd_cb.c | 6 ++++-- lnet/klnds/ralnd/ralnd.c | 3 ++- lnet/klnds/ralnd/ralnd_cb.c | 44 ++++++++++++++++++++++++----------------- 8 files changed, 52 insertions(+), 37 deletions(-) diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h index f433247..3bc4a8f 100644 --- a/lnet/klnds/gnilnd/gnilnd.h +++ b/lnet/klnds/gnilnd/gnilnd.h @@ -1042,8 +1042,7 @@ do { \ atomic_inc(&kgnilnd_data.kgn_nquiesce); \ CDEBUG(D_NET, "Waiting for thread pause to be over...\n"); \ while (kgnilnd_data.kgn_quiesce_trigger) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - schedule_timeout(HZ); \ + msleep_interruptible(MSEC_PER_SEC); \ } \ /* Mom, my homework is done */ \ CDEBUG(D_NET, "Waking up from thread pause\n"); \ diff --git a/lnet/klnds/mxlnd/mxlnd.c b/lnet/klnds/mxlnd/mxlnd.c index 6c9d7b8..25a1bfd 100644 --- a/lnet/klnds/mxlnd/mxlnd.c +++ b/lnet/klnds/mxlnd/mxlnd.c @@ -345,7 +345,8 @@ mxlnd_init_mx(lnet_ni_t *ni) mx_get_endpoint_addr(kmxlnd_data.kmx_endpt, &kmxlnd_data.kmx_epa); mx_decompose_endpoint_addr(kmxlnd_data.kmx_epa, &nic_id, &ep_id); mxret = mx_connect(kmxlnd_data.kmx_endpt, nic_id, ep_id, - MXLND_MSG_MAGIC, MXLND_CONNECT_TIMEOUT/HZ*1000, + MXLND_MSG_MAGIC, + jiffies_to_msecs(MXLND_CONNECT_TIMEOUT), &kmxlnd_data.kmx_epa); if (mxret != MX_SUCCESS) { CNETERR("unable to connect to myself (%s)\n", mx_strerror(mxret)); @@ -365,7 +366,7 @@ mxlnd_init_mx(lnet_ni_t *ni) goto failed_with_endpoint; } mxret = mx_set_request_timeout(kmxlnd_data.kmx_endpt, NULL, - MXLND_COMM_TIMEOUT/HZ*1000); + 
jiffies_to_msecs(MXLND_COMM_TIMEOUT)); if (mxret != MX_SUCCESS) { CERROR("mx_set_request_timeout() failed with %s\n", mx_strerror(mxret)); @@ -455,7 +456,7 @@ mxlnd_shutdown (lnet_ni_t *ni) mx_wakeup(kmxlnd_data.kmx_endpt); up(&kmxlnd_data.kmx_tx_queue_sem); up(&kmxlnd_data.kmx_conn_sem); - mxlnd_sleep(2 * HZ); + mxlnd_sleep(msecs_to_jiffies(2 * MSEC_PER_SEC)); /* fall through */ diff --git a/lnet/klnds/mxlnd/mxlnd.h b/lnet/klnds/mxlnd/mxlnd.h index 86ddef2..c2b285c 100644 --- a/lnet/klnds/mxlnd/mxlnd.h +++ b/lnet/klnds/mxlnd/mxlnd.h @@ -133,9 +133,12 @@ #define MXLND_NDAEMONS 3 /* connd, timeoutd, tx_queued */ #define MXLND_MX_BOARD 0 /* Use the first MX NIC if more than 1 avail */ #define MXLND_MX_EP_ID 0 /* MX endpoint ID */ -#define MXLND_COMM_TIMEOUT (20 * HZ) /* timeout for send/recv (jiffies) */ -#define MXLND_WAIT_TIMEOUT HZ /* timeout for wait (jiffies) */ -#define MXLND_CONNECT_TIMEOUT (5 * HZ) /* timeout for connections (jiffies) */ +/* timeout for send/recv (jiffies) */ +#define MXLND_COMM_TIMEOUT msecs_to_jiffies(20 * MSEC_PER_SEC) +/* timeout for wait (jiffies) */ +#define MXLND_WAIT_TIMEOUT msecs_to_jiffies(MSEC_PER_SEC) +/* timeout for connections (jiffies) */ +#define MXLND_CONNECT_TIMEOUT msecs_to_jiffies(5 * MSEC_PER_SEC) #define MXLND_POLLING 1000 /* poll iterations before blocking */ #define MXLND_LOOKUP_COUNT 5 /* how many times to try to resolve MAC */ #define MXLND_MAX_PEERS 1024 /* number of nodes talking to me */ diff --git a/lnet/klnds/mxlnd/mxlnd_cb.c b/lnet/klnds/mxlnd/mxlnd_cb.c index 104820c..191af47 100644 --- a/lnet/klnds/mxlnd/mxlnd_cb.c +++ b/lnet/klnds/mxlnd/mxlnd_cb.c @@ -845,9 +845,9 @@ mxlnd_ip2nic_id(u32 ip, u64 *nic_id, int tries) break; } else if (ret == -EHOSTUNREACH && try < tries) { /* add a little backoff */ - CDEBUG(D_NET, "sleeping for %d jiffies\n", - HZ/4); - mxlnd_sleep(HZ/4); + CDEBUG(D_NET, "sleeping for %lu jiffies\n", + msecs_to_jiffies(MSEC_PER_SEC / 4)); + mxlnd_sleep(msecs_to_jiffies(MSEC_PER_SEC / 4)); } } 
} while (try++ < tries); @@ -2715,7 +2715,7 @@ mxlnd_iconnect(kmx_peer_t *peer, u8 msg_type) mxlnd_conn_decref(conn); } mx_set_request_timeout(kmxlnd_data.kmx_endpt, request, - MXLND_CONNECT_TIMEOUT/HZ*1000); + jiffies_to_msecs(MXLND_CONNECT_TIMEOUT)); return; } @@ -2765,7 +2765,7 @@ mxlnd_check_sends(kmx_peer_t *peer) #if MXLND_STATS if (cfs_time_after(jiffies, last)) { - last = jiffies + HZ; + last = jiffies + msecs_to_jiffies(MSEC_PER_SEC); CDEBUG(D_NET, "status= %s credits= %d outstanding= %d ntx_msgs= %d " "ntx_posted= %d ntx_data= %d data_posted= %d\n", mxlnd_connstatus_to_str(conn->mxk_status), conn->mxk_credits, @@ -4025,7 +4025,7 @@ mxlnd_timeoutd(void *arg) long id = (long) arg; unsigned long now = 0; unsigned long next = 0; - unsigned long delay = HZ; + unsigned long delay = msecs_to_jiffies(MSEC_PER_SEC); kmx_peer_t *peer = NULL; kmx_peer_t *temp = NULL; kmx_conn_t *conn = NULL; @@ -4067,7 +4067,7 @@ mxlnd_timeoutd(void *arg) conn->mxk_status == MXLND_CONN_FAIL) && cfs_time_after(now, conn->mxk_last_tx + - HZ)) { + msecs_to_jiffies(MSEC_PER_SEC))) { write_unlock(g_lock); mxlnd_check_sends(peer); write_lock(g_lock); diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index a3edea4..149e962 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -862,7 +862,8 @@ kiblnd_send_keepalive(kib_conn_t *conn) { return (*kiblnd_tunables.kib_keepalive > 0) && cfs_time_after(jiffies, conn->ibc_last_send + - *kiblnd_tunables.kib_keepalive*HZ); + msecs_to_jiffies(*kiblnd_tunables.kib_keepalive * + MSEC_PER_SEC)); } static inline int diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index de402e7..5b3363d 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -1187,7 +1187,9 @@ kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn) LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); tx->tx_queued = 1; - tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * 
HZ); + tx->tx_deadline = jiffies + + msecs_to_jiffies(*kiblnd_tunables.kib_timeout * + MSEC_PER_SEC); if (tx->tx_conn == NULL) { kiblnd_conn_addref(conn); @@ -3221,7 +3223,7 @@ kiblnd_connd (void *arg) kiblnd_data.kib_peer_hash_size; } - deadline += p * HZ; + deadline += msecs_to_jiffies(p * MSEC_PER_SEC); spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); } diff --git a/lnet/klnds/ralnd/ralnd.c b/lnet/klnds/ralnd/ralnd.c index a6d61e2..b931c0c 100644 --- a/lnet/klnds/ralnd/ralnd.c +++ b/lnet/klnds/ralnd/ralnd.c @@ -838,7 +838,8 @@ kranal_connect (kra_peer_t *peer) MIN(peer->rap_reconnect_interval, *kranal_tunables.kra_max_reconnect_interval); - peer->rap_reconnect_time = jiffies + peer->rap_reconnect_interval * HZ; + peer->rap_reconnect_time = jiffies + + msecs_to_jiffies(peer->rap_reconnect_interval * MSEC_PER_SEC); /* Grab all blocked packets while we have the global lock */ cfs_list_add(&zombies, &peer->rap_tx_queue); diff --git a/lnet/klnds/ralnd/ralnd_cb.c b/lnet/klnds/ralnd/ralnd_cb.c index ff437f6..b9e2bc0 100644 --- a/lnet/klnds/ralnd/ralnd_cb.c +++ b/lnet/klnds/ralnd/ralnd_cb.c @@ -919,15 +919,16 @@ kranal_check_conn_timeouts (kra_conn_t *conn) conn->rac_state == RANAL_CONN_CLOSING); if (!conn->rac_close_sent && - cfs_time_aftereq(now, conn->rac_last_tx + conn->rac_keepalive * - HZ)) { + cfs_time_aftereq(now, conn->rac_last_tx + + msecs_to_jiffies(conn->rac_keepalive * + MSEC_PER_SEC))) { /* not sent in a while; schedule conn so scheduler sends a keepalive */ CDEBUG(D_NET, "Scheduling keepalive %p->%s\n", conn, libcfs_nid2str(conn->rac_peer->rap_nid)); kranal_schedule_conn(conn); } - timeout = conn->rac_timeout * HZ; + timeout = msecs_to_jiffies(conn->rac_timeout * MSEC_PER_SEC); if (!conn->rac_close_recvd && cfs_time_aftereq(now, conn->rac_last_rx + timeout)) { @@ -935,7 +936,7 @@ kranal_check_conn_timeouts (kra_conn_t *conn) (conn->rac_state == RANAL_CONN_ESTABLISHED) ? 
"Nothing" : "CLOSE not", libcfs_nid2str(conn->rac_peer->rap_nid), - (now - conn->rac_last_rx)/HZ); + jiffies_to_msecs(now - conn->rac_last_rx)/MSEC_PER_SEC); return -ETIMEDOUT; } @@ -955,7 +956,7 @@ kranal_check_conn_timeouts (kra_conn_t *conn) spin_unlock_irqrestore(&conn->rac_lock, flags); CERROR("tx on fmaq for %s blocked %lu seconds\n", libcfs_nid2str(conn->rac_peer->rap_nid), - (now - tx->tx_qtime)/HZ); + jiffies_to_msecs(now-tx->tx_qtime)/MSEC_PER_SEC); return -ETIMEDOUT; } } @@ -967,7 +968,7 @@ kranal_check_conn_timeouts (kra_conn_t *conn) spin_unlock_irqrestore(&conn->rac_lock, flags); CERROR("tx on rdmaq for %s blocked %lu seconds\n", libcfs_nid2str(conn->rac_peer->rap_nid), - (now - tx->tx_qtime)/HZ); + jiffies_to_msecs(now-tx->tx_qtime)/MSEC_PER_SEC); return -ETIMEDOUT; } } @@ -979,7 +980,7 @@ kranal_check_conn_timeouts (kra_conn_t *conn) spin_unlock_irqrestore(&conn->rac_lock, flags); CERROR("tx on replyq for %s blocked %lu seconds\n", libcfs_nid2str(conn->rac_peer->rap_nid), - (now - tx->tx_qtime)/HZ); + jiffies_to_msecs(now-tx->tx_qtime)/MSEC_PER_SEC); return -ETIMEDOUT; } } @@ -1231,7 +1232,7 @@ kranal_reaper (void *arg) conn_index = (conn_index + 1) % conn_entries; } - next_check_time += p * HZ; + next_check_time += msecs_to_jiffies(p * MSEC_PER_SEC); spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags); @@ -1420,11 +1421,13 @@ kranal_sendmsg(kra_conn_t *conn, kra_msg_t *msg, case RAP_NOT_DONE: if (cfs_time_aftereq(jiffies, - conn->rac_last_tx + conn->rac_keepalive * - HZ)) + conn->rac_last_tx + + msecs_to_jiffies(conn->rac_keepalive * + MSEC_PER_SEC))) CWARN("EAGAIN sending %02x (idle %lu secs)\n", msg->ram_type, - (jiffies - conn->rac_last_tx)/HZ); + jiffies_to_msecs(jiffies - conn->rac_last_tx) / + MSEC_PER_SEC); return -EAGAIN; } } @@ -1456,7 +1459,8 @@ kranal_process_fmaq (kra_conn_t *conn) if (cfs_time_aftereq(jiffies, conn->rac_last_tx + - conn->rac_keepalive * HZ)) { + msecs_to_jiffies(conn->rac_keepalive * + MSEC_PER_SEC))) { 
CDEBUG(D_NET, "sending NOOP (rdma in progress)\n"); kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP); kranal_sendmsg(conn, &conn->rac_msg, NULL, 0); @@ -1495,11 +1499,13 @@ kranal_process_fmaq (kra_conn_t *conn) spin_unlock_irqrestore(&conn->rac_lock, flags); if (cfs_time_aftereq(jiffies, - conn->rac_last_tx + conn->rac_keepalive * - HZ)) { + conn->rac_last_tx + + msecs_to_jiffies(conn->rac_keepalive * + MSEC_PER_SEC))) { CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%ld))\n", libcfs_nid2str(conn->rac_peer->rap_nid), conn, - (jiffies - conn->rac_last_tx)/HZ, + jiffies_to_msecs(jiffies - conn->rac_last_tx) / + MSEC_PER_SEC, conn->rac_keepalive); kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP); kranal_sendmsg(conn, &conn->rac_msg, NULL, 0); @@ -1911,7 +1917,7 @@ int kranal_process_new_conn (kra_conn_t *conn) LASSERT (rrc == RAP_NOT_DONE); if (!cfs_time_aftereq(jiffies, conn->rac_last_tx + - conn->rac_timeout * HZ)) + msecs_to_jiffies(conn->rac_timeout*MSEC_PER_SEC))) return -EAGAIN; /* Too late */ @@ -2021,10 +2027,12 @@ kranal_scheduler (void *arg) /* retry with exponential backoff until HZ */ if (conn->rac_keepalive == 0) conn->rac_keepalive = 1; - else if (conn->rac_keepalive <= HZ) + else if (conn->rac_keepalive <= + msecs_to_jiffies(MSEC_PER_SEC)) conn->rac_keepalive *= 2; else - conn->rac_keepalive += HZ; + conn->rac_keepalive += + msecs_to_jiffies(MSEC_PER_SEC); deadline = conn->rac_last_tx + conn->rac_keepalive; spin_lock_irqsave(&dev->rad_lock, flags); -- 1.8.3.1