From 6e6dd43a957bc585ef07b2f4d1b9dc45aaa5aac3 Mon Sep 17 00:00:00 2001 From: yury Date: Thu, 6 Nov 2008 07:32:41 +0000 Subject: [PATCH] b=17310 r=johann,shadow - fixes ptlrpcd blocking on very long reply unlink waiting. To do so, a new rpc phase RQ_PHASE_UNREGISTERING is introduced, in which a request stays until reply_in_callback() is called by lnet, signaling that the reply is unlinked. All requests in this state are skipped in processing by ptlrpcd instead of waiting n * 300s on each of them. This allows ptlrpcd to process other rpcs in the set; - make sure that the inflight count is coherent with being present on the sending or delay list. That is, if we see inflight != 0, the rpc must be on one of these lists. This is very helpful in ptlrpc_invalidate_import() to show all rpcs still waiting after invalidating the import; - in ptlrpc_invalidate_import() wait for the maximal (rq_deadline - now) among all inflight rpcs instead of obd_timeout which may be much longer. If the calculated timeout is 0, obd_timeout is used. This fixes the issue that rq_deadline - now > obd_timeout (very easy to see in logs), which led to the inflight != 0 assert because inflight rpcs timed out after our wait period had finished; - in ptlrpc_invalidate_import() wait forever for rpcs in UNREGISTERING phase. In the wait-timed-out case, assert inflight == 0 only if there are no rpcs in UNREGISTERING phase. Only those in UNREGISTERING phase are allowed to stay longer than obd_timeout; - added ptlrpc_move_rqphase() function. All phase changes go through it. Add debug_req() there to track down all phase changes; - conf_sanity.sh test_45 added to emulate a very long reply unlink and also the situation when rq_deadline - now > obd_timeout; - fixed the use of rq_timedout in debug_req(); - do not wait forever in ptlrpc_unregister_reply() for the async case (using it from sets). Sync case left unchanged. 
--- lustre/obdclass/lprocfs_status.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 73be24a..2107b04 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -661,6 +661,7 @@ int lprocfs_rd_import(char *page, char **start, off_t off, int count, " target: %s@%s\n" " state: %s\n" " inflight: %u\n" + " unregistering: %u\n" " conn_cnt: %u\n" " generation: %u\n" " inval_cnt: %u\n" @@ -672,6 +673,7 @@ int lprocfs_rd_import(char *page, char **start, off_t off, int count, obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid, imp_state_name, atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), imp->imp_conn_cnt, imp->imp_generation, atomic_read(&imp->imp_inval_count), -- 1.8.3.1