Whamcloud - gitweb
b=6316
author alex <alex>
Thu, 19 May 2005 21:18:22 +0000 (21:18 +0000)
committer alex <alex>
Thu, 19 May 2005 21:18:22 +0000 (21:18 +0000)
 - all clients report their minimal transno to be replayed
 - the server waits for all clients to connect in order to collect
   the minimal transnos from the clients and replay all requests
   in the right order

lustre/include/linux/lustre_export.h
lustre/include/linux/lustre_idl.h
lustre/ldlm/ldlm_lib.c
lustre/mds/mds_fs.c
lustre/obdclass/genops.c
lustre/ptlrpc/import.c

index ec36e72..0a91241 100644 (file)
@@ -86,6 +86,7 @@ struct obd_export {
         int                       exp_failed:1,
                                   exp_req_replay_needed:1,
                                   exp_lock_replay_needed:1,
+                                  exp_connected:1,
                                   exp_libclient:1, /* liblustre client? */
                                   exp_sync:1;
         union {
index 0bd16ce..6a19cf2 100644 (file)
@@ -225,6 +225,7 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
 #define MSG_CONNECT_LIBCLIENT   0x10
 #define MSG_CONNECT_INITIAL     0x20
 #define MSG_CONNECT_ASYNC       0x40
+#define MSG_CONNECT_TRANSNO     0X80    /* report transno */
 
 /* Connect flags */
 
@@ -243,7 +244,8 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
 struct obd_connect_data {
         __u64 ocd_connect_flags;
         __u32 ocd_nllu[2];
-        __u64 padding[6];
+        __u64 transno;          /* first transno from client to be replayed */
+        __u64 padding[5];
 };
 
 extern void lustre_swab_connect(struct obd_connect_data *ocd);
index dcedbbb..e94e216 100644 (file)
@@ -666,9 +666,10 @@ int target_handle_connect(struct ptlrpc_request *req)
 
         /* Tell the client if we're in recovery. */
         /* If this is the first client, start the recovery timer */
-        CWARN("%s: connection from %s@%s/%lu %s\n", target->obd_name, cluuid.uuid,
+        CWARN("%s: connection from %s@%s/%lu %st"LPU64"\n", target->obd_name, cluuid.uuid,
               ptlrpc_peernid2str(&req->rq_peer, peer_str), *cfp,
-              target->obd_recovering ? "(recovering)" : "");
+              target->obd_recovering ? "recovering/" : "",
+              conn_data->transno);
 
         if (target->obd_recovering) {
                 lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING);
@@ -767,8 +768,18 @@ int target_handle_connect(struct ptlrpc_request *req)
                 GOTO(out, rc = 0);
         }
 
-        if (target->obd_recovering)
+        spin_lock_bh(&target->obd_processing_task_lock);
+        if (target->obd_recovering && export->exp_connected == 0) {
+                __u64 t = conn_data->transno;
+                export->exp_connected = 1;
+                if ((lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_TRANSNO)
+                                && t < target->obd_next_recovery_transno)
+                        target->obd_next_recovery_transno = t;
                 target->obd_connected_clients++;
+                if (target->obd_connected_clients == target->obd_max_recoverable_clients)
+                        wake_up(&target->obd_next_transno_waitq);
+        }
+        spin_unlock_bh(&target->obd_processing_task_lock);
 
         memcpy(&conn, lustre_msg_buf(req->rq_reqmsg, offset + 2, sizeof(conn)),
                sizeof(conn));
@@ -1094,7 +1105,7 @@ static int check_for_next_transno(struct obd_device *obd)
 
         max = obd->obd_max_recoverable_clients;
         connected = obd->obd_connected_clients;
-        completed = max - atomic_read(&obd->obd_req_replay_clients);
+        completed = max - obd->obd_recoverable_clients;
         queue_len = obd->obd_requests_queued_for_recovery;
         next_transno = obd->obd_next_recovery_transno;
 
@@ -1118,6 +1129,13 @@ static int check_for_next_transno(struct obd_device *obd)
                        next_transno, queue_len, completed, max, req_transno);
                 obd->obd_next_recovery_transno = req_transno;
                 wake_up = 1;
+        } else if (queue_len == atomic_read(&obd->obd_req_replay_clients)) {
+                /* some clients haven't connected in time, but we need
+                 * their requests to continue recovery. so, we abort ... */
+                CDEBUG(D_ERROR, "abort due to missed clients: queue: %d max: %d\n",
+                       queue_len, max);
+                obd->obd_abort_recovery = 1;
+                wake_up = 1;
         }
         spin_unlock_bh(&obd->obd_processing_task_lock);
         
@@ -1228,12 +1246,30 @@ static int lock_replay_done(struct obd_export *exp)
         return 1;
 }
 
+static int connect_done(struct obd_export *exp)
+{       /* passed to class_disconnect_stale_exports() as its export test */
+        if (exp->exp_connected) /* export's exp_connected bit is set */
+                return 1;
+        return 0; /* NOTE(review): presumably marks the export stale - confirm */
+}
+
+static int check_for_clients(struct obd_device *obd)
+{       /* wait condition: returns 1 when waiting for connects should stop */
+        if (obd->obd_abort_recovery) /* abort flag raised: stop waiting */
+                return 1;
+        LASSERT(obd->obd_connected_clients <= obd->obd_max_recoverable_clients);
+        if (obd->obd_connected_clients == obd->obd_max_recoverable_clients)
+                return 1; /* connected count reached the recoverable maximum */
+        return 0; /* still fewer connected clients than the maximum */
+}
+
 static int target_recovery_thread(void *arg)
 {
         struct obd_device *obd = arg;
         struct ptlrpc_request *req;
         struct target_recovery_data *trd = &obd->obd_recovery_data;
         char peer_str[PTL_NALFMT_SIZE];
+        struct l_wait_info lwi = { 0 };
         unsigned long flags;
         ENTRY;
 
@@ -1251,9 +1287,29 @@ static int target_recovery_thread(void *arg)
         obd->obd_recovering = 1;
         complete(&trd->trd_starting);
 
-        /* The first stage: replay requests */
-        CWARN("1: request replay stage - %d clients\n",
-              atomic_read(&obd->obd_req_replay_clients));
+        /* first of all, we have to know the first transno to replay */
+        obd->obd_abort_recovery = 0;
+        l_wait_event(obd->obd_next_transno_waitq,
+                     check_for_clients(obd), &lwi);
+        
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        target_cancel_recovery_timer(obd);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+        /* If some clients haven't connected in time, evict them */
+        if (obd->obd_abort_recovery) {
+                int stale;
+                CERROR("some clients haven't connect in time, evict them ...\n");
+                obd->obd_abort_recovery = 0;
+                stale = class_disconnect_stale_exports(obd, connect_done, 0);
+                atomic_sub(stale, &obd->obd_req_replay_clients);
+                atomic_sub(stale, &obd->obd_lock_replay_clients);
+        }
+
+        /* next stage: replay requests */
+        CWARN("1: request replay stage - %d clients from t"LPU64"\n",
+              atomic_read(&obd->obd_req_replay_clients),
+              obd->obd_next_recovery_transno);
         while ((req = target_next_replay_req(obd))) {
                 LASSERT(trd->trd_processing_task == current->pid);
                 DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s: ", 
@@ -1380,6 +1436,7 @@ int target_process_req_flags(struct obd_device *obd, struct ptlrpc_request *req)
                         LASSERT(atomic_read(&obd->obd_req_replay_clients) > 0);
                         exp->exp_req_replay_needed = 0;
                         atomic_dec(&obd->obd_req_replay_clients);
+                        obd->obd_recoverable_clients--;
                         if (atomic_read(&obd->obd_req_replay_clients) == 0) {
                                 CDEBUG(D_HA, "all clients have replayed reqs\n");
                                 wake_up(&obd->obd_next_transno_waitq);
@@ -1468,7 +1525,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
          * handled will pass through here and be processed immediately.
          */
         spin_lock_bh(&obd->obd_processing_task_lock);
-        if (transno < obd->obd_next_recovery_transno) {
+        if (transno < obd->obd_next_recovery_transno && check_for_clients(obd)) {
                 /* Processing the queue right now, don't re-add. */
                 LASSERT(list_empty(&req->rq_list));
                 spin_unlock_bh(&obd->obd_processing_task_lock);
index 471bd3a..e083e55 100644 (file)
@@ -380,6 +380,7 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file *file)
                 spin_lock_init(&med->med_open_lock);
 
                 mcd = NULL;
+                exp->exp_connected = 0;
                 exp->exp_req_replay_needed = 1;
                 obd->obd_recoverable_clients++;
                 obd->obd_max_recoverable_clients++;
index 596bc0b..4a7d1c6 100644 (file)
@@ -499,6 +499,7 @@ struct obd_export *class_new_export(struct obd_device *obd)
         list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
         export->exp_obd->obd_num_exports++;
         spin_unlock(&obd->obd_dev_lock);
+        export->exp_connected = 1;
         obd_init_export(export);
         return export;
 }
index af772e7..96ad1c6 100644 (file)
@@ -298,12 +298,27 @@ static int import_select_connection(struct obd_import *imp)
         RETURN(0);
 }
 
-
+/* Store the transno of the first imp_replay_list entry in *transno and
+ * return 1; return 0, leaving *transno untouched, if the list is empty.
+ * must be called under imp_lock */
+int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
+{
+        struct ptlrpc_request *req;
+        struct list_head *tmp;
+        
+        if (list_empty(&imp->imp_replay_list))
+                return 0;
+        tmp = imp->imp_replay_list.next; /* head; presumably lowest transno - confirm */
+        req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+        *transno = req->rq_transno;
+        return 1;
+}
 
 int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
 {
         struct obd_device *obd = imp->imp_obd;
         int initial_connect = 0;
+        int set_transno = 0;
         int rc;
         __u64 committed_before_reconnect = 0;
         struct ptlrpc_request *request;
@@ -350,6 +365,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
                 imp->imp_conn_cnt++;
         }
 
+        set_transno = ptlrpc_first_transno(imp, &imp->imp_connect_data.transno);
 
         spin_unlock_irqrestore(&imp->imp_lock, flags);
 
@@ -399,6 +415,9 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
                                         MSG_CONNECT_INITIAL);
                 imp->imp_replayable = 1; 
         }
+        if (set_transno)
+                lustre_msg_add_op_flags(request->rq_reqmsg, 
+                                        MSG_CONNECT_TRANSNO);
         
         imp->imp_reqs_replayed = imp->imp_locks_replayed = 0;