From 932f21b55864e210ef461ceb9a8590960f6e6a7e Mon Sep 17 00:00:00 2001 From: alex Date: Thu, 19 May 2005 21:18:22 +0000 Subject: [PATCH] b=6316 - all clients report their minimal transno to be replayed - server waits for all clients to connect in order to collect minimal transno's from clients and replay all requests in right order --- lustre/include/linux/lustre_export.h | 1 + lustre/include/linux/lustre_idl.h | 4 +- lustre/ldlm/ldlm_lib.c | 73 ++++++++++++++++++++++++++++++++---- lustre/mds/mds_fs.c | 1 + lustre/obdclass/genops.c | 1 + lustre/ptlrpc/import.c | 21 ++++++++++- 6 files changed, 91 insertions(+), 10 deletions(-) diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h index ec36e72..0a91241 100644 --- a/lustre/include/linux/lustre_export.h +++ b/lustre/include/linux/lustre_export.h @@ -86,6 +86,7 @@ struct obd_export { int exp_failed:1, exp_req_replay_needed:1, exp_lock_replay_needed:1, + exp_connected:1, exp_libclient:1, /* liblustre client? 
*/ exp_sync:1; union { diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index 0bd16ce..6a19cf2 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -225,6 +225,7 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) #define MSG_CONNECT_LIBCLIENT 0x10 #define MSG_CONNECT_INITIAL 0x20 #define MSG_CONNECT_ASYNC 0x40 +#define MSG_CONNECT_TRANSNO 0X80 /* report transno */ /* Connect flags */ @@ -243,7 +244,8 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) struct obd_connect_data { __u64 ocd_connect_flags; __u32 ocd_nllu[2]; - __u64 padding[6]; + __u64 transno; /* first transno from client to be replayed */ + __u64 padding[5]; }; extern void lustre_swab_connect(struct obd_connect_data *ocd); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index dcedbbb..e94e216 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -666,9 +666,10 @@ int target_handle_connect(struct ptlrpc_request *req) /* Tell the client if we're in recovery. */ /* If this is the first client, start the recovery timer */ - CWARN("%s: connection from %s@%s/%lu %s\n", target->obd_name, cluuid.uuid, + CWARN("%s: connection from %s@%s/%lu %st"LPU64"\n", target->obd_name, cluuid.uuid, ptlrpc_peernid2str(&req->rq_peer, peer_str), *cfp, - target->obd_recovering ? "(recovering)" : ""); + target->obd_recovering ? 
"recovering/" : "", + conn_data->transno); if (target->obd_recovering) { lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); @@ -767,8 +768,18 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = 0); } - if (target->obd_recovering) + spin_lock_bh(&target->obd_processing_task_lock); + if (target->obd_recovering && export->exp_connected == 0) { /* count each export once: exp_connected is set just below, so a repeated connect skips this branch */ + __u64 t = conn_data->transno; + export->exp_connected = 1; + if ((lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_TRANSNO) + && t < target->obd_next_recovery_transno) + target->obd_next_recovery_transno = t; /* lower the replay start to the smallest transno reported so far */ target->obd_connected_clients++; + if (target->obd_connected_clients == target->obd_max_recoverable_clients) + wake_up(&target->obd_next_transno_waitq); /* makes check_for_clients() true for the l_wait_event() in target_recovery_thread() */ + } + spin_unlock_bh(&target->obd_processing_task_lock); memcpy(&conn, lustre_msg_buf(req->rq_reqmsg, offset + 2, sizeof(conn)), sizeof(conn)); @@ -1094,7 +1105,7 @@ static int check_for_next_transno(struct obd_device *obd) max = obd->obd_max_recoverable_clients; connected = obd->obd_connected_clients; - completed = max - atomic_read(&obd->obd_req_replay_clients); + completed = max - obd->obd_recoverable_clients; /* obd_recoverable_clients is decremented in target_process_req_flags() alongside obd_req_replay_clients */ queue_len = obd->obd_requests_queued_for_recovery; next_transno = obd->obd_next_recovery_transno; @@ -1118,6 +1129,13 @@ static int check_for_next_transno(struct obd_device *obd) next_transno, queue_len, completed, max, req_transno); obd->obd_next_recovery_transno = req_transno; wake_up = 1; + } else if (queue_len == atomic_read(&obd->obd_req_replay_clients)) { + /* some clients haven't connected in time, but we need + * their requests to continue recovery. so, we abort ... 
*/ + CDEBUG(D_ERROR, "abort due to missed clients: queue: %d max: %d\n", + queue_len, max); + obd->obd_abort_recovery = 1; + wake_up = 1; } spin_unlock_bh(&obd->obd_processing_task_lock); @@ -1228,12 +1246,30 @@ static int lock_replay_done(struct obd_export *exp) return 1; } +static int connect_done(struct obd_export *exp) /* returns 1 if exp->exp_connected is set, else 0; passed to class_disconnect_stale_exports() by target_recovery_thread() */ +{ + if (exp->exp_connected) + return 1; + return 0; +} + +static int check_for_clients(struct obd_device *obd) /* returns 1 once obd_abort_recovery is set or obd_connected_clients equals obd_max_recoverable_clients, else 0 */ +{ + if (obd->obd_abort_recovery) + return 1; + LASSERT(obd->obd_connected_clients <= obd->obd_max_recoverable_clients); + if (obd->obd_connected_clients == obd->obd_max_recoverable_clients) + return 1; + return 0; +} + static int target_recovery_thread(void *arg) { struct obd_device *obd = arg; struct ptlrpc_request *req; struct target_recovery_data *trd = &obd->obd_recovery_data; char peer_str[PTL_NALFMT_SIZE]; + struct l_wait_info lwi = { 0 }; unsigned long flags; ENTRY; @@ -1251,9 +1287,29 @@ static int target_recovery_thread(void *arg) obd->obd_recovering = 1; complete(&trd->trd_starting); - /* The first stage: replay requests */ - CWARN("1: request replay stage - %d clients\n", - atomic_read(&obd->obd_req_replay_clients)); + /* first of all, we have to know the first transno to replay: wait until check_for_clients() holds, i.e. every recoverable client has connected or recovery was aborted */ + obd->obd_abort_recovery = 0; + l_wait_event(obd->obd_next_transno_waitq, + check_for_clients(obd), &lwi); + + spin_lock_bh(&obd->obd_processing_task_lock); + target_cancel_recovery_timer(obd); + spin_unlock_bh(&obd->obd_processing_task_lock); + + /* If some clients haven't connected in time, evict them and subtract them from both replay counters. NOTE(review): obd_recoverable_clients is not adjusted in this hunk - confirm whether class_disconnect_stale_exports() takes care of it */ + if (obd->obd_abort_recovery) { + int stale; + CERROR("some clients haven't connect in time, evict them ...\n"); + obd->obd_abort_recovery = 0; + stale = class_disconnect_stale_exports(obd, connect_done, 0); + atomic_sub(stale, &obd->obd_req_replay_clients); + atomic_sub(stale, &obd->obd_lock_replay_clients); + } + + /* next stage: replay requests */ + CWARN("1: request replay stage - %d clients from t"LPU64"\n", + 
atomic_read(&obd->obd_req_replay_clients), + obd->obd_next_recovery_transno); while ((req = target_next_replay_req(obd))) { LASSERT(trd->trd_processing_task == current->pid); DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s: ", @@ -1380,6 +1436,7 @@ int target_process_req_flags(struct obd_device *obd, struct ptlrpc_request *req) LASSERT(atomic_read(&obd->obd_req_replay_clients) > 0); exp->exp_req_replay_needed = 0; atomic_dec(&obd->obd_req_replay_clients); + obd->obd_recoverable_clients--; /* NOTE(review): plain int updated next to an atomic counter - confirm the caller holds a lock here */ if (atomic_read(&obd->obd_req_replay_clients) == 0) { CDEBUG(D_HA, "all clients have replayed reqs\n"); wake_up(&obd->obd_next_transno_waitq); @@ -1468,7 +1525,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req, * handled will pass through here and be processed immediately. */ spin_lock_bh(&obd->obd_processing_task_lock); - if (transno < obd->obd_next_recovery_transno) { + if (transno < obd->obd_next_recovery_transno && check_for_clients(obd)) { /* presumably: until check_for_clients() holds, target_handle_connect() may still lower obd_next_recovery_transno, so this shortcut is not taken yet */ /* Processing the queue right now, don't re-add. 
*/ LASSERT(list_empty(&req->rq_list)); spin_unlock_bh(&obd->obd_processing_task_lock); diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index 471bd3a..e083e55 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -380,6 +380,7 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file *file) spin_lock_init(&med->med_open_lock); mcd = NULL; + exp->exp_connected = 0; /* cleared here; target_handle_connect() sets it when this client connects during recovery */ exp->exp_req_replay_needed = 1; obd->obd_recoverable_clients++; obd->obd_max_recoverable_clients++; diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 596bc0b..4a7d1c6 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -499,6 +499,7 @@ struct obd_export *class_new_export(struct obd_device *obd) list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); export->exp_obd->obd_num_exports++; spin_unlock(&obd->obd_dev_lock); + export->exp_connected = 1; /* NOTE(review): set after obd_dev_lock is dropped, with the export already on obd_exports - confirm nothing reads the flag in between */ obd_init_export(export); return export; } diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index af772e7..96ad1c6 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -298,12 +298,27 @@ static int import_select_connection(struct obd_import *imp) RETURN(0); } - +/* + * returns 1 and stores the rq_transno of the first imp_replay_list entry in *transno, or returns 0 and leaves *transno untouched when the list is empty; must be called under imp_lock + */ +int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) +{ + struct ptlrpc_request *req; + struct list_head *tmp; + + if (list_empty(&imp->imp_replay_list)) + return 0; + tmp = imp->imp_replay_list.next; + req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + return 1; +} int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) { struct obd_device *obd = imp->imp_obd; int initial_connect = 0; + int set_transno = 0; int rc; __u64 committed_before_reconnect = 0; struct ptlrpc_request *request; @@ -350,6 +365,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) imp->imp_conn_cnt++; } + set_transno = ptlrpc_first_transno(imp, &imp->imp_connect_data.transno); /* 1 only when imp_replay_list is non-empty; MSG_CONNECT_TRANSNO is then added to the connect request further down */
spin_unlock_irqrestore(&imp->imp_lock, flags); @@ -399,6 +415,9 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) MSG_CONNECT_INITIAL); imp->imp_replayable = 1; } + if (set_transno) /* imp_connect_data.transno was filled in earlier, before imp_lock was released */ + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_TRANSNO); imp->imp_reqs_replayed = imp->imp_locks_replayed = 0; -- 1.8.3.1