From: adilger Date: Wed, 14 Apr 2004 20:29:30 +0000 (+0000) Subject: Add a common error handling function for AST errors. This fixes a case where X-Git-Tag: v1_8_0_110~486^6~6 X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=2210ce2da844357e37e1a1f92aab1b2b3adbaace;p=fs%2Flustre-release.git Add a common error handling function for AST errors. This fixes a case where we don't drop locks on failed completion ASTs and end up evicting the client. b=3145 --- diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 8cb70ba..c50c019 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -14,6 +14,7 @@ tbd Cluster File Systems, Inc. - don't dereference ERR_PTR() dentry in error handling path (3107) - fix thread race in portals_debug_dumplog() (3122) - create lprocfs device entries at setup instead of at attach (1519) + - common AST error handler, don't evict client on completion race (3145) * miscellania - allow default OST striping configuration per directory (1414) diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 964af8d..f75d8f1 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -306,7 +306,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) #endif /* __KERNEL__ */ -static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, char *ast_type) +static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,const char *ast_type) { const struct ptlrpc_connection *conn = lock->l_export->exp_connection; char str[PTL_NALFMT_SIZE]; @@ -324,6 +324,42 @@ static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, char *ast_type) ptlrpc_fail_export(lock->l_export); } +static int ldlm_handle_ast_error(struct ldlm_lock *lock, + struct ptlrpc_request *req, int rc, + const char *ast_type) +{ + if (rc == -ETIMEDOUT || rc == -EINTR || rc == -ENOTCONN) { + LASSERT(lock->l_export); + if (lock->l_export->exp_libclient) { + LDLM_DEBUG(lock, "%s AST to liblustre client (nid " + LPU64") timeout, just cancelling lock", + ast_type, req->rq_peer.peer_nid); + ldlm_lock_cancel(lock); + rc = -ERESTART; + } else { + ldlm_del_waiting_lock(lock); + ldlm_failed_ast(lock, rc, ast_type); + } + } else if (rc) { + if (rc == -EINVAL) + LDLM_DEBUG(lock, "client (nid "LPU64") returned %d " + "from %s AST - normal race", + req->rq_peer.peer_nid, + req->rq_repmsg->status, ast_type); + else + LDLM_ERROR(lock, "client (nid "LPU64") returned %d " + "from %s AST", req->rq_peer.peer_nid, + (req->rq_repmsg != NULL)? + req->rq_repmsg->status : 0, ast_type); + ldlm_lock_cancel(lock); + /* Server-side AST functions are called from ldlm_reprocess_all, + * which needs to be told to please restart its reprocessing. */ + rc = -ERESTART; + } + + return rc; +} + int ldlm_server_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, void *data, int flag) @@ -385,44 +421,8 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, req->rq_send_state = LUSTRE_IMP_FULL; req->rq_timeout = 2; /* 2 second timeout for initial AST reply */ rc = ptlrpc_queue_wait(req); - if (rc == -ETIMEDOUT || rc == -EINTR || rc == -ENOTCONN) { - LASSERT(lock->l_export); - if (lock->l_export->exp_libclient) { - CDEBUG(D_HA, "BLOCKING AST to liblustre client (nid " - LPU64") timeout, simply cancel lock 0x%p\n", - req->rq_peer.peer_nid, lock); - ldlm_lock_cancel(lock); - rc = -ERESTART; - } else { - ldlm_del_waiting_lock(lock); - ldlm_failed_ast(lock, rc, "blocking"); - } - } else if (rc) { - if (rc == -EINVAL) - CDEBUG(D_DLMTRACE, "client (nid "LPU64") returned %d " - "from blocking AST for lock %p--normal race\n", - req->rq_peer.peer_nid, - req->rq_repmsg->status, lock); - else if (rc == -ENOTCONN) - CDEBUG(D_DLMTRACE, "client (nid "LPU64") returned %d " - "from blocking AST for lock %p--this client was " - "probably rebooted while it held a lock, nothing" - " serious\n",req->rq_peer.peer_nid, - req->rq_repmsg->status, lock); - else - CDEBUG(D_ERROR, "client (nid "LPU64") returned %d " - "from blocking AST for lock %p\n", - req->rq_peer.peer_nid, - (req->rq_repmsg != NULL)? - req->rq_repmsg->status : 0, - lock); - LDLM_DEBUG(lock, "client sent rc %d rq_status %d from blocking " - "AST", rc, req->rq_status); - ldlm_lock_cancel(lock); - /* Server-side AST functions are called from ldlm_reprocess_all, - * which needs to be told to please restart its reprocessing. */ - rc = -ERESTART; - } + if (rc != 0) + rc = ldlm_handle_ast_error(lock, req, rc, "blocking"); ptlrpc_req_finished(req); @@ -492,20 +492,8 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) l_unlock(&lock->l_resource->lr_namespace->ns_lock); rc = ptlrpc_queue_wait(req); - if (rc == -ETIMEDOUT || rc == -EINTR || rc == -ENOTCONN) { - ldlm_del_waiting_lock(lock); - ldlm_failed_ast(lock, rc, "completion"); - } else if (rc == -EINVAL) { - LDLM_DEBUG(lock, "lost the race -- client no longer has this " - "lock"); - } else if (rc) { - LDLM_ERROR(lock, "client sent rc %d rq_status %d from " - "completion AST", rc, req->rq_status); - ldlm_lock_cancel(lock); - /* Server-side AST functions are called from ldlm_reprocess_all, - * which needs to be told to please restart its reprocessing. */ - rc = -ERESTART; - } + if (rc != 0) + rc = ldlm_handle_ast_error(lock, req, rc, "completion"); ptlrpc_req_finished(req); RETURN(rc); @@ -538,22 +526,13 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) req->rq_timeout = 2; /* 2 second timeout for initial AST reply */ rc = ptlrpc_queue_wait(req); - if (rc == -ETIMEDOUT || rc == -EINTR || rc == -ENOTCONN) { - ldlm_del_waiting_lock(lock); - ldlm_failed_ast(lock, rc, "glimpse"); - } else if (rc == -EINVAL) { - LDLM_DEBUG(lock, "lost the race -- client no longer has this " - "lock"); - } else if (rc == -ELDLM_NO_LOCK_DATA) { - LDLM_DEBUG(lock, "lost a race -- client has a lock, but no " - "inode"); - } else if (rc) { - LDLM_ERROR(lock, "client sent rc %d rq_status %d from " - "glimpse AST", rc, req->rq_status); - } else { + if (rc == -ELDLM_NO_LOCK_DATA) + LDLM_DEBUG(lock, "lost race - client has a lock but no inode"); + else if (rc) + rc = ldlm_handle_ast_error(lock, req, rc, "glimpse"); + else rc = res->lr_namespace->ns_lvbo->lvbo_update (res, req->rq_repmsg, 0, 1); - } ptlrpc_req_finished(req); RETURN(rc); } diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index ad953e4..2b88d3c 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -50,15 +50,15 @@ static int ldlm_proc_dump_ns(struct file *file, const char *buffer, int ldlm_proc_setup(void) { int rc; - struct lprocfs_vars list[] = { + struct lprocfs_vars list[] = { { "dump_namespaces", NULL, ldlm_proc_dump_ns, NULL }, { NULL }}; ENTRY; LASSERT(ldlm_ns_proc_dir == NULL); ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME, - proc_lustre_root, - NULL, NULL); + proc_lustre_root, + NULL, NULL); if (IS_ERR(ldlm_type_proc_dir)) { CERROR("LProcFS failed in ldlm-init\n"); rc = PTR_ERR(ldlm_type_proc_dir); @@ -87,9 +87,9 @@ int ldlm_proc_setup(void) RETURN(0); -err_ns: +err_ns: lprocfs_remove(ldlm_ns_proc_dir); -err_type: +err_type: lprocfs_remove(ldlm_type_proc_dir); err: ldlm_type_proc_dir = NULL;