From: Jinshan Xiong Date: Wed, 24 Aug 2011 23:03:53 +0000 (-0700) Subject: ORNL-28: Solve reconnecting race between IR and SR X-Git-Tag: 2.1.51~9 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=6fc2d762ef7cfc453e13468721582f3beea6fc40;hp=eb062ffc1cce793e24dd85f30011a18037b6e609 ORNL-28: Solve reconnecting race between IR and SR If a connect request is already in flight when the client import is notified by IR, the import sets the corresponding conn uuid to a higher priority and sets imp_force_verify, so that it reconnects immediately if the in-flight RPC times out. Change-Id: I77e799e1f12b49f3c0271585c1ce812d16dc1ef6 Signed-off-by: Jinshan Xiong Reviewed-on: http://review.whamcloud.com/1291 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 3bd7caf..82fe858 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -1518,7 +1518,7 @@ struct ptlrpc_svc_data { * Import API * @{ */ -int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid); +int ptlrpc_connect_import(struct obd_import *imp); int ptlrpc_init_import(struct obd_import *imp); int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); int ptlrpc_import_recovery_state_machine(struct obd_import *imp); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 4047407..ab65cb0 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -471,7 +471,7 @@ int client_connect_import(const struct lu_env *env, imp->imp_connect_flags_orig = data->ocd_connect_flags; } - rc = ptlrpc_connect_import(imp, NULL); + rc = ptlrpc_connect_import(imp); if (rc != 0) { LASSERT (imp->imp_state == LUSTRE_IMP_DISCON); GOTO(out_ldlm, rc); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index cf1ba95..d244a75 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -2060,7 +2060,7 @@ static int replay_lock_interpret(const 
struct lu_env *env, LDLM_LOCK_PUT(lock); out: if (rc != ELDLM_OK) - ptlrpc_connect_import(req->rq_import, NULL); + ptlrpc_connect_import(req->rq_import); RETURN(rc); } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index eb871fd..687cb6e 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -2590,7 +2590,7 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, if (rc != 0) /* this replay failed, so restart recovery */ - ptlrpc_connect_import(imp, NULL); + ptlrpc_connect_import(imp); RETURN(rc); } diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 0715ec6..83e4937 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -583,7 +583,7 @@ static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) * actual sending. * Returns 0 on success or error code. */ -int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) +int ptlrpc_connect_import(struct obd_import *imp) { struct obd_device *obd = imp->imp_obd; int initial_connect = 0; @@ -628,15 +628,6 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) &imp->imp_connect_data.ocd_transno); cfs_spin_unlock(&imp->imp_lock); - if (new_uuid) { - struct obd_uuid uuid; - - obd_str2uuid(&uuid, new_uuid); - rc = import_set_conn_priority(imp, &uuid); - if (rc) - GOTO(out, rc); - } - rc = import_select_connection(imp); if (rc) GOTO(out, rc); @@ -728,8 +719,14 @@ EXPORT_SYMBOL(ptlrpc_connect_import); static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) { #ifdef __KERNEL__ - /* the pinger takes care of issuing the next reconnect request */ - return; + int force_verify; + + cfs_spin_lock(&imp->imp_lock); + force_verify = imp->imp_force_verify != 0; + cfs_spin_unlock(&imp->imp_lock); + + if (force_verify) + ptlrpc_pinger_wake_up(); #else /* liblustre has no pinger thread, so we wakeup pinger anyway */ ptlrpc_pinger_wake_up(); @@ -780,6 +777,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, /* All imports are 
pingable */ imp->imp_pingable = 1; imp->imp_force_reconnect = 0; + imp->imp_force_verify = 0; if (aa->pcaa_initial_connect) { if (msg_flags & MSG_CONNECT_REPLAYABLE) { @@ -920,7 +918,7 @@ finish: "invalidating and reconnecting\n", obd2cli_tgt(imp->imp_obd), imp->imp_connection->c_remote_uuid.uuid); - ptlrpc_connect_import(imp, NULL); + ptlrpc_connect_import(imp); RETURN(0); } } else { @@ -1157,7 +1155,7 @@ static int completed_replay_interpret(const struct lu_env *env, req->rq_import->imp_obd->obd_name, req->rq_status); } - ptlrpc_connect_import(req->rq_import, NULL); + ptlrpc_connect_import(req->rq_import); } RETURN(0); diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index fade6b0..f56d16a 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -67,7 +67,7 @@ void ptlrpc_initiate_recovery(struct obd_import *imp) ENTRY; CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd)); - ptlrpc_connect_import(imp, NULL); + ptlrpc_connect_import(imp); EXIT; } @@ -223,7 +223,7 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) } /* to control recovery via lctl {disable|enable}_recovery */ if (imp->imp_deactive == 0) - ptlrpc_connect_import(imp, NULL); + ptlrpc_connect_import(imp); } /* Wait for recovery to complete and resend. If evicted, then @@ -301,15 +301,27 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) /* force import to be disconnected. 
*/ ptlrpc_set_import_discon(imp, 0); + if (new_uuid) { + struct obd_uuid uuid; + + /* instruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + GOTO(out, rc); + } + /* Check if reconnect is already in progress */ cfs_spin_lock(&imp->imp_lock); - if (imp->imp_state != LUSTRE_IMP_DISCON) + if (imp->imp_state != LUSTRE_IMP_DISCON) { + imp->imp_force_verify = 1; rc = -EALREADY; + } cfs_spin_unlock(&imp->imp_lock); if (rc) GOTO(out, rc); - rc = ptlrpc_connect_import(imp, new_uuid); + rc = ptlrpc_connect_import(imp); if (rc) GOTO(out, rc);