*/
int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
{
- struct client_obd *cli = &obddev->u.cli;
- struct obd_import *imp;
- struct obd_uuid server_uuid;
- int rq_portal, rp_portal, connect_op;
- char *name = obddev->obd_type->typ_name;
- ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN;
- int rc;
- ENTRY;
+ struct client_obd *cli = &obddev->u.cli;
+ struct obd_import *imp;
+ struct obd_uuid server_uuid;
+ int rq_portal, rp_portal, connect_op;
+ char *name = obddev->obd_type->typ_name;
+ ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN;
+ int rc;
+ char *cli_name = lustre_cfg_buf(lcfg, 0);
+ ENTRY;
/* In a more perfect world, we would hang a ptlrpc_client off of
* obd_type and just use the values from there. */
rq_portal = MDS_REQUEST_PORTAL;
rp_portal = MDC_REPLY_PORTAL;
connect_op = MDS_CONNECT;
- cli->cl_sp_me = LUSTRE_SP_CLI;
+ if (is_lwp_on_ost(cli_name))
+ cli->cl_sp_me = LUSTRE_SP_OST;
+ else if (is_lwp_on_mdt(cli_name))
+ cli->cl_sp_me = LUSTRE_SP_MDT;
+ else
+ cli->cl_sp_me = LUSTRE_SP_CLI;
cli->cl_sp_to = LUSTRE_SP_MDT;
ns_type = LDLM_NS_TYPE_MDC;
} else if (!strcmp(name, LUSTRE_OSP_NAME)) {
rq_portal = OST_REQUEST_PORTAL;
}
rp_portal = OSC_REPLY_PORTAL;
- cli->cl_sp_me = LUSTRE_SP_CLI;
+ cli->cl_sp_me = LUSTRE_SP_MDT;
} else if (!strcmp(name, LUSTRE_MGC_NAME)) {
rq_portal = MGS_REQUEST_PORTAL;
rp_portal = MGC_REPLY_PORTAL;
else
cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT;
}
+
+ spin_lock_init(&cli->cl_mod_rpcs_lock);
+ spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock);
+ cli->cl_max_mod_rpcs_in_flight = 0;
+ cli->cl_mod_rpcs_in_flight = 0;
+ cli->cl_close_rpcs_in_flight = 0;
+ init_waitqueue_head(&cli->cl_mod_rpcs_waitq);
+ cli->cl_mod_tag_bitmap = NULL;
+
+ if (connect_op == MDS_CONNECT) {
+ cli->cl_max_mod_rpcs_in_flight = cli->cl_max_rpcs_in_flight - 1;
+ OBD_ALLOC(cli->cl_mod_tag_bitmap,
+ BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long));
+ if (cli->cl_mod_tag_bitmap == NULL)
+ GOTO(err, rc = -ENOMEM);
+ }
+
rc = ldlm_get_ref();
if (rc) {
CERROR("ldlm_get_ref failed: %d\n", rc);
GOTO(err_import, rc);
}
- cli->cl_import = imp;
- /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
- cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
- cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
+ cli->cl_import = imp;
+ /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */
+ cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
GOTO(err_import, rc = -ENOMEM);
}
- cli->cl_qchk_stat = CL_NOT_QUOTACHECKED;
-
RETURN(rc);
err_import:
err_ldlm:
ldlm_put_ref();
err:
+ if (cli->cl_mod_tag_bitmap != NULL)
+ OBD_FREE(cli->cl_mod_tag_bitmap,
+ BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long));
+ cli->cl_mod_tag_bitmap = NULL;
RETURN(rc);
}
int client_obd_cleanup(struct obd_device *obddev)
{
+ struct client_obd *cli = &obddev->u.cli;
ENTRY;
ldlm_namespace_free_post(obddev->obd_namespace);
LASSERT(obddev->u.cli.cl_import == NULL);
ldlm_put_ref();
+ /* Release cl_mod_tag_bitmap if it was allocated. The size expression is
+ * the same one client_obd_setup() passes to OBD_ALLOC for this field.
+ * The pointer is cleared afterwards whether or not anything was freed. */
+ if (cli->cl_mod_tag_bitmap != NULL)
+ OBD_FREE(cli->cl_mod_tag_bitmap,
+ BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long));
+ cli->cl_mod_tag_bitmap = NULL;
+
RETURN(0);
}
EXPORT_SYMBOL(client_obd_cleanup);
struct obd_connect_data *ocd;
struct lustre_handle conn = { 0 };
int rc;
+ bool is_mdc = false;
ENTRY;
*exp = NULL;
ocd = &imp->imp_connect_data;
if (data) {
*ocd = *data;
+ is_mdc = strncmp(imp->imp_obd->obd_type->typ_name,
+ LUSTRE_MDC_NAME, 3) == 0;
+ if (is_mdc)
+ data->ocd_connect_flags |= OBD_CONNECT_MULTIMODRPCS;
imp->imp_connect_flags_orig = data->ocd_connect_flags;
}
ocd->ocd_connect_flags, "old "LPX64", new "LPX64"\n",
data->ocd_connect_flags, ocd->ocd_connect_flags);
data->ocd_connect_flags = ocd->ocd_connect_flags;
+ /* clear the flag as it was not set and is not known
+ * by upper layers */
+ if (is_mdc)
+ data->ocd_connect_flags &= ~OBD_CONNECT_MULTIMODRPCS;
}
ptlrpc_pinger_add_import(imp);
if (exp->exp_imp_reverse)
ptlrpc_cleanup_imp(exp->exp_imp_reverse);
- if (exp->exp_obd->obd_namespace != NULL)
- ldlm_cancel_locks_for_export(exp);
+ ldlm_bl_thread_wakeup();
/* complete all outstanding replies */
spin_lock(&exp->exp_lock);
char *target_start;
int target_len;
bool mds_conn = false, lw_client = false;
+ bool mds_mds_conn = false;
+ bool new_mds_mds_conn = false;
struct obd_connect_data *data, *tmpdata;
int size, tmpsize;
lnet_nid_t *client_nid = NULL;
if (rc)
GOTO(out, rc);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0)
+ /* Don't allow clients to connect that are using old 1.8 format
+ * protocol conventions (LUSTRE_MSG_MAGIC_v1, !MSGHDR_CKSUM_INCOMPAT18,
+ * ldlm_flock_policy_wire format, MDT_ATTR_xTIME_SET, etc). The
+ * FULL20 flag should be set on all connections since 2.0, but no
+ * longer affects behaviour.
+ *
+ * Later this check will be disabled and the flag can be retired
+ * completely once interop with 3.0 is no longer needed.
+ */
+ if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20))
+ GOTO(out, rc = -EPROTO);
+#endif
+
if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
if (data->ocd_version < LUSTRE_VERSION_CODE -
LUSTRE_VERSION_ALLOWED_OFFSET ||
}
}
+ /* Note: lw_client is needed in MDS-MDS failover during update log
+ * processing, so we need to allow lw_client to connect at any time,
+ * not only on the initial connection */
+ lw_client = (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0;
+
if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) {
mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS) != 0;
- lw_client = (data->ocd_connect_flags &
- OBD_CONNECT_LIGHTWEIGHT) != 0;
+ mds_mds_conn = (data->ocd_connect_flags &
+ OBD_CONNECT_MDS_MDS) != 0;
/* OBD_CONNECT_MNE_SWAB is defined as OBD_CONNECT_MDS_MDS
* for Imperative Recovery connection from MGC to MGS.
if (export == NULL) {
/* allow lightweight connections during recovery */
- if (target->obd_recovering && !lw_client) {
+ /* allow "new" MDT to be connected during recovery, since we
+ * need to retrieve recovery update records from it */
+ if (target->obd_recovering && !lw_client && !mds_mds_conn) {
cfs_time_t t;
int c; /* connected */
int i; /* in progress */
t = cfs_timer_deadline(&target->obd_recovery_timer);
t = cfs_time_sub(t, cfs_time_current());
t = cfs_duration_sec(t);
- LCONSOLE_WARN("%s: Denying connection for new client "
- "%s (at %s), waiting for all %d known "
- "clients (%d recovered, %d in progress, "
- "and %d evicted) to recover in %d:%.02d\n",
+ LCONSOLE_WARN("%s: Denying connection for new client %s"
+ "(at %s), waiting for %d known clients "
+ "(%d recovered, %d in progress, and %d "
+ "evicted) to recover in %d:%.02d\n",
target->obd_name, cluuid.uuid,
libcfs_nid2str(req->rq_peer.nid), k,
c - i, i, s, (int)t / 60,
(int)t % 60);
- rc = -EBUSY;
- } else {
+ rc = -EBUSY;
+ } else {
dont_check_exports:
- rc = obd_connect(req->rq_svc_thread->t_env,
- &export, target, &cluuid, data,
- client_nid);
+ rc = obd_connect(req->rq_svc_thread->t_env,
+ &export, target, &cluuid, data,
+ client_nid);
if (mds_conn && OBD_FAIL_CHECK(OBD_FAIL_TGT_RCVG_FLAG))
lustre_msg_add_op_flags(req->rq_repmsg,
- MSG_CONNECT_RECOVERING);
- if (rc == 0)
- conn.cookie = export->exp_handle.h_cookie;
- }
- } else {
- rc = obd_reconnect(req->rq_svc_thread->t_env,
- export, target, &cluuid, data, client_nid);
- }
+ MSG_CONNECT_RECOVERING);
+ if (rc == 0)
+ conn.cookie = export->exp_handle.h_cookie;
+
+ if (mds_mds_conn)
+ new_mds_mds_conn = true;
+ }
+ } else {
+ rc = obd_reconnect(req->rq_svc_thread->t_env,
+ export, target, &cluuid, data, client_nid);
+ }
if (rc)
GOTO(out, rc);
atomic_inc(&target->obd_req_replay_clients);
atomic_inc(&target->obd_lock_replay_clients);
+ /* Note: an MDS-MDS connection is allowed to connect during
+ * recovery, whether or not its export needs to be recovered,
+ * because we need to retrieve update logs from all other MDTs.
+ * So if the MDS-MDS export is new, obd_max_recoverable_clients
+ * also needs to be increased to match the other recovery checking
+ * conditions. */
+ if (new_mds_mds_conn)
+ target->obd_max_recoverable_clients++;
if (atomic_inc_return(&target->obd_connected_clients) ==
target->obd_max_recoverable_clients)
wake_up(&target->obd_next_transno_waitq);
* ptlrpc_handle_server_req_in->lustre_unpack_msg(). */
revimp->imp_msg_magic = req->rq_reqmsg->lm_magic;
- if ((data->ocd_connect_flags & OBD_CONNECT_AT) &&
- (revimp->imp_msg_magic != LUSTRE_MSG_MAGIC_V1))
+ if (data->ocd_connect_flags & OBD_CONNECT_AT)
revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
else
revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
- if ((data->ocd_connect_flags & OBD_CONNECT_FULL20) &&
- (revimp->imp_msg_magic != LUSTRE_MSG_MAGIC_V1))
- revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
- else
- revimp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+ revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr);
if (rc) {
__u64 transno = lustre_msg_get_transno(req->rq_reqmsg);
struct obd_export *exp = req->rq_export;
struct ptlrpc_request *reqiter;
+ struct ptlrpc_request *dup_req = NULL;
int dup = 0;
LASSERT(exp);
list_for_each_entry(reqiter, &exp->exp_req_replay_queue,
rq_replay_list) {
if (lustre_msg_get_transno(reqiter->rq_reqmsg) == transno) {
+ dup_req = reqiter;
dup = 1;
break;
}
(MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY))
CERROR("invalid flags %x of resent replay\n",
lustre_msg_get_flags(req->rq_reqmsg));
+
+ if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+ __u32 new_conn;
+
+ new_conn = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+ if (new_conn >
+ lustre_msg_get_conn_cnt(dup_req->rq_reqmsg))
+ lustre_msg_set_conn_cnt(dup_req->rq_reqmsg,
+ new_conn);
+ }
} else {
list_add_tail(&req->rq_replay_list,
&exp->exp_req_replay_queue);
spin_unlock(&req->rq_export->exp_lock);
}
-static void target_finish_recovery(struct obd_device *obd)
+static void target_finish_recovery(struct lu_target *lut)
{
+ struct obd_device *obd = lut->lut_obd;
ENTRY;
/* Only log a recovery message when recovery has occurred. */
}
spin_unlock(&obd->obd_recovery_task_lock);
+ if (lut->lut_tdtd != NULL &&
+ !list_empty(&lut->lut_tdtd->tdtd_replay_list))
+ dtrq_list_dump(lut->lut_tdtd, D_ERROR);
+
obd->obd_recovery_end = cfs_time_current_sec();
/* When recovery finished, cleanup orphans on MDS and OST. */
return;
}
obd->obd_recovering = obd->obd_abort_recovery = 0;
+ obd->obd_force_abort_recovery = 0;
spin_unlock(&obd->obd_dev_lock);
spin_lock(&obd->obd_recovery_task_lock);
return;
spin_lock(&obd->obd_dev_lock);
- if (!obd->obd_recovering || obd->obd_abort_recovery) {
+ if (!obd->obd_recovering || obd->obd_abort_recovery ||
+ obd->obd_force_abort_recovery) {
spin_unlock(&obd->obd_dev_lock);
return;
}
int to;
spin_lock(&obd->obd_dev_lock);
- if (!obd->obd_recovering || obd->obd_abort_recovery) {
+ if (!obd->obd_recovering || obd->obd_abort_recovery ||
+ obd->obd_force_abort_recovery) {
spin_unlock(&obd->obd_dev_lock);
return;
}
return (exp->exp_in_recovery && !exp->exp_lock_replay_needed);
}
-/** Checking routines for recovery */
-static int check_for_clients(struct obd_device *obd)
-{
- unsigned int clnts = atomic_read(&obd->obd_connected_clients);
-
- if (obd->obd_abort_recovery || obd->obd_recovery_expired)
- return 1;
- LASSERT(clnts <= obd->obd_max_recoverable_clients);
- return (clnts + obd->obd_stale_clients ==
- obd->obd_max_recoverable_clients);
-}
-
-static int check_for_next_transno(struct obd_device *obd)
+static int check_for_next_transno(struct lu_target *lut)
{
struct ptlrpc_request *req = NULL;
+ struct obd_device *obd = lut->lut_obd;
int wake_up = 0, connected, completed, queue_len;
- __u64 next_transno, req_transno;
+ __u64 req_transno = 0;
+ __u64 update_transno = 0;
+ __u64 next_transno = 0;
ENTRY;
spin_lock(&obd->obd_recovery_task_lock);
req = list_entry(obd->obd_req_replay_queue.next,
struct ptlrpc_request, rq_list);
req_transno = lustre_msg_get_transno(req->rq_reqmsg);
- } else {
- req_transno = 0;
+ }
+
+ if (lut->lut_tdtd != NULL) {
+ struct target_distribute_txn_data *tdtd;
+
+ tdtd = lut->lut_tdtd;
+ update_transno = distribute_txn_get_next_transno(lut->lut_tdtd);
}
connected = atomic_read(&obd->obd_connected_clients);
obd->obd_max_recoverable_clients, connected, completed,
queue_len, req_transno, next_transno);
- if (obd->obd_abort_recovery) {
+ if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
CDEBUG(D_HA, "waking for aborted recovery\n");
wake_up = 1;
} else if (obd->obd_recovery_expired) {
CDEBUG(D_HA, "waking for expired recovery\n");
wake_up = 1;
- } else if (req_transno == next_transno) {
+ } else if (req_transno == next_transno ||
+ (update_transno != 0 && update_transno <= next_transno)) {
CDEBUG(D_HA, "waking for next ("LPD64")\n", next_transno);
wake_up = 1;
} else if (queue_len > 0 &&
CDEBUG(d_lvl,
"%s: waking for gap in transno, VBR is %s (skip: "
LPD64", ql: %d, comp: %d, conn: %d, next: "LPD64
- ", last_committed: "LPD64")\n",
+ ", next_update "LPD64" last_committed: "LPD64")\n",
obd->obd_name, obd->obd_version_recov ? "ON" : "OFF",
next_transno, queue_len, completed, connected,
- req_transno, obd->obd_last_committed);
+ req_transno, update_transno, obd->obd_last_committed);
obd->obd_next_recovery_transno = req_transno;
wake_up = 1;
} else if (atomic_read(&obd->obd_req_replay_clients) == 0) {
return wake_up;
}
-static int check_for_next_lock(struct obd_device *obd)
+static int check_for_next_lock(struct lu_target *lut)
{
+ struct obd_device *obd = lut->lut_obd;
int wake_up = 0;
spin_lock(&obd->obd_recovery_task_lock);
} else if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
CDEBUG(D_HA, "waking for completed lock replay\n");
wake_up = 1;
- } else if (obd->obd_abort_recovery) {
+ } else if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
CDEBUG(D_HA, "waking for aborted recovery\n");
wake_up = 1;
} else if (obd->obd_recovery_expired) {
* check its status with help of check_routine
* evict dead clients via health_check
*/
-static int target_recovery_overseer(struct obd_device *obd,
- int (*check_routine)(struct obd_device *),
+static int target_recovery_overseer(struct lu_target *lut,
+ int (*check_routine)(struct lu_target *),
int (*health_check)(struct obd_export *))
{
+ struct obd_device *obd = lut->lut_obd;
repeat:
if ((obd->obd_recovery_start != 0) && (cfs_time_current_sec() >=
(obd->obd_recovery_start + obd->obd_recovery_time_hard))) {
}
while (wait_event_timeout(obd->obd_next_transno_waitq,
- check_routine(obd),
+ check_routine(lut),
msecs_to_jiffies(60 * MSEC_PER_SEC)) == 0)
/* wait indefinitely for event, but don't trigger watchdog */;
- if (obd->obd_abort_recovery) {
+ if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
CWARN("recovery is aborted, evict exports in recovery\n");
/** evict exports which didn't finish recovery yet */
class_disconnect_stale_exports(obd, exp_finished);
return 0;
}
-static struct ptlrpc_request *target_next_replay_req(struct obd_device *obd)
-{
- struct ptlrpc_request *req = NULL;
- ENTRY;
-
- CDEBUG(D_HA, "Waiting for transno "LPD64"\n",
- obd->obd_next_recovery_transno);
-
- CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);
- /** It is needed to extend recovery window above recovery_time_soft.
- * Extending is possible only in the end of recovery window
- * (see more details in handle_recovery_req).
- */
- CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
-
- if (target_recovery_overseer(obd, check_for_next_transno,
- exp_req_replay_healthy)) {
- abort_req_replay_queue(obd);
- abort_lock_replay_queue(obd);
- }
-
- spin_lock(&obd->obd_recovery_task_lock);
- if (!list_empty(&obd->obd_req_replay_queue)) {
- req = list_entry(obd->obd_req_replay_queue.next,
- struct ptlrpc_request, rq_list);
- list_del_init(&req->rq_list);
- obd->obd_requests_queued_for_recovery--;
- spin_unlock(&obd->obd_recovery_task_lock);
- } else {
- spin_unlock(&obd->obd_recovery_task_lock);
- LASSERT(list_empty(&obd->obd_req_replay_queue));
- LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
- /** evict exports failed VBR */
- class_disconnect_stale_exports(obd, exp_vbr_healthy);
- }
- RETURN(req);
-}
-
-static struct ptlrpc_request *target_next_replay_lock(struct obd_device *obd)
+static struct ptlrpc_request *target_next_replay_lock(struct lu_target *lut)
{
+ struct obd_device *obd = lut->lut_obd;
struct ptlrpc_request *req = NULL;
CDEBUG(D_HA, "Waiting for lock\n");
- if (target_recovery_overseer(obd, check_for_next_lock,
+ if (target_recovery_overseer(lut, check_for_next_lock,
exp_lock_replay_healthy))
abort_lock_replay_queue(obd);
EXIT;
}
+/**
+ * Checking routine for the first recovery stage.
+ *
+ * Used as the wait condition handed to target_recovery_overseer(): a
+ * non-zero return lets the recovery thread proceed, zero keeps it waiting.
+ *
+ * \param[in] lut	target under recovery
+ *
+ * \retval 1	forced abort, or the client condition is satisfied (or
+ *		bypassed by abort/expiry) and update replay, if any, is ready
+ * \retval 0	still waiting for clients or for update replay readiness
+ */
+static int check_for_recovery_ready(struct lu_target *lut)
+{
+	struct obd_device *obd = lut->lut_obd;
+	unsigned int connected = atomic_read(&obd->obd_connected_clients);
+	struct target_distribute_txn_data *tdtd;
+
+	CDEBUG(D_HA, "connected %d stale %d max_recoverable_clients %d"
+	       " abort %d expired %d\n", connected, obd->obd_stale_clients,
+	       obd->obd_max_recoverable_clients, obd->obd_abort_recovery,
+	       obd->obd_recovery_expired);
+
+	/* A forced abort short-circuits every other condition. */
+	if (obd->obd_force_abort_recovery)
+		return 1;
+
+	/* Only count clients while recovery is neither aborted nor expired. */
+	if (!(obd->obd_abort_recovery || obd->obd_recovery_expired)) {
+		LASSERT(connected <= obd->obd_max_recoverable_clients);
+		if (connected + obd->obd_stale_clients <
+		    obd->obd_max_recoverable_clients)
+			return 0;
+	}
+
+	tdtd = lut->lut_tdtd;
+	if (tdtd == NULL)
+		return 1;
+
+	if (tdtd->tdtd_replay_ready) {
+		dtrq_list_dump(tdtd, D_HA);
+		return 1;
+	}
+
+	/* Let's extend recovery timer, in case the recovery
+	 * timer expired, and some clients got evicted */
+	extend_recovery_timer(obd, obd->obd_recovery_timeout, true);
+	return 0;
+}
+
+enum { /* source of the next transno to replay, as reported through the
+ * "type" out-parameter of get_next_transno() */
+ REQUEST_RECOVERY = 1, /* taken from obd_req_replay_queue */
+ UPDATE_RECOVERY = 2, /* taken from distribute_txn_get_next_transno() */
+};
+
+/**
+ * Peek at the transno of the first request on obd_req_replay_queue.
+ *
+ * The request is not removed from the queue.
+ *
+ * \param[in] obd	device whose replay queue is inspected
+ *
+ * \retval		transno of the first queued request, or 0 when the
+ *			queue is empty
+ */
+static __u64 get_next_replay_req_transno(struct obd_device *obd)
+{
+	struct ptlrpc_request *head;
+
+	if (list_empty(&obd->obd_req_replay_queue))
+		return 0;
+
+	head = list_entry(obd->obd_req_replay_queue.next,
+			  struct ptlrpc_request, rq_list);
+
+	return lustre_msg_get_transno(head->rq_reqmsg);
+}
+/**
+ * Choose the next transno to replay and say where it comes from.
+ *
+ * The candidate from the request replay queue is used unless the target has
+ * distributed-update data and either no request is queued, or the next
+ * update transno is non-zero and not greater than the request transno
+ * (so an update wins a tie).
+ *
+ * \param[in]  lut	target under recovery
+ * \param[out] type	if not NULL, set to REQUEST_RECOVERY or
+ *			UPDATE_RECOVERY according to the source chosen
+ *
+ * \retval		the chosen transno; may be 0 when nothing is pending
+ */
+__u64 get_next_transno(struct lu_target *lut, int *type)
+{
+	struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+	__u64 req_transno;
+	__u64 upd_transno;
+	bool take_update;
+	ENTRY;
+
+	req_transno = get_next_replay_req_transno(lut->lut_obd);
+	if (type != NULL)
+		*type = REQUEST_RECOVERY;
+
+	if (tdtd == NULL)
+		RETURN(req_transno);
+
+	upd_transno = distribute_txn_get_next_transno(tdtd);
+	take_update = req_transno == 0 ||
+		      (upd_transno != 0 && upd_transno <= req_transno);
+	if (!take_update)
+		RETURN(req_transno);
+
+	if (type != NULL)
+		*type = UPDATE_RECOVERY;
+
+	RETURN(upd_transno);
+}
+
+/**
+ * Drop a duplicate replay request.
+ *
+ * Because the operation has already been replayed by update recovery, the
+ * client request with the same transno is dropped. For MDS_REINT a reply is
+ * still packed and sent, which notifies the client to send its next replay
+ * request; any other opcode is only logged as an error.
+ *
+ * \param[in] env	execution environment (not referenced by this function)
+ * \param[in] obd	failover obd device; obd_replayed_requests is incremented
+ * \param[in] req	request to be dropped; it is handed to
+ *			target_exp_dequeue_req_replay() and
+ *			target_request_copy_put() before returning
+ */
+static void drop_duplicate_replay_req(struct lu_env *env,
+				      struct obd_device *obd,
+				      struct ptlrpc_request *req)
+{
+	/* No trailing newline: matches the other DEBUG_REQ call sites in
+	 * this file, none of which terminate their format with "\n". */
+	DEBUG_REQ(D_HA, req, "remove t"LPD64" from %s because of duplicate"
+		  " update records are found.",
+		  lustre_msg_get_transno(req->rq_reqmsg),
+		  libcfs_nid2str(req->rq_peer.nid));
+
+	/* Right now, only for MDS reint operation update replay and
+	 * normal request replay can have the same transno */
+	if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT) {
+		req_capsule_set(&req->rq_pill, &RQF_MDS_REINT);
+		req->rq_status = req_capsule_server_pack(&req->rq_pill);
+		if (likely(req->rq_export))
+			target_committed_to_req(req);
+		/* NOTE(review): rq_repmsg is used even when the pack above
+		 * reported an error in rq_status - confirm it is always
+		 * valid at this point. */
+		lustre_msg_set_transno(req->rq_repmsg, req->rq_transno);
+		target_send_reply(req, req->rq_status, 0);
+	} else {
+		/* The two adjacent literals used to concatenate into
+		 * "wrong opcfrom"; keep the separating space. */
+		DEBUG_REQ(D_ERROR, req, "wrong opc from %s",
+			  libcfs_nid2str(req->rq_peer.nid));
+	}
+	target_exp_dequeue_req_replay(req);
+	target_request_copy_put(req);
+	obd->obd_replayed_requests++;
+}
+
+/**
+ * Refresh the in-memory client data of an export after an update replay
+ *
+ * Update recovery may change last_rcvd through the replayed update records
+ * themselves, which does not touch the copy kept in memory, so the last xid
+ * and last result of the matching export are refreshed here from the
+ * last_rcvd write found in the update records.
+ *
+ * \param[in] env	execution environment (not referenced by this function)
+ * \param[in] lut	target under recovery; supplies the obd device and
+ *			the last_rcvd object whose FID is searched for
+ * \param[in] dtrq	the update replay request being replayed
+ */
+static void target_update_lcd(struct lu_env *env, struct lu_target *lut,
+ struct distribute_txn_replay_req *dtrq)
+{
+ struct obd_device *obd = lut->lut_obd;
+ struct obd_export *export;
+ struct tg_export_data *ted;
+ struct distribute_txn_replay_req_sub *dtrqs;
+ struct seq_server_site *site;
+ struct update_records *ur;
+ const struct lu_fid *fid;
+ struct update_ops *ops;
+ struct update_params *params;
+ struct update_op *op;
+ __u32 mdt_index;
+ unsigned int i;
+ struct lsd_client_data *lcd = NULL;
+
+ /* If the updates have already been executed (committed) on the
+ * recovery target, they are not being executed on this target now, so
+ * there is nothing to update in memory. This is detected by dtrq
+ * already holding a sub-request for the local node id. */
+ site = lu_site2seq(obd->obd_lu_dev->ld_site);
+ mdt_index = site->ss_node_id;
+ dtrqs = dtrq_sub_lookup(dtrq, mdt_index);
+ if (dtrqs != NULL)
+ return;
+
+ if (dtrq->dtrq_lur == NULL)
+ return;
+
+ /* Find the update last_rcvd record: scan every op in the update
+ * records for an OUT_WRITE against the last_rcvd object's FID. The
+ * loop has no break, so the last op that passes all checks wins. */
+ fid = lu_object_fid(&lut->lut_last_rcvd->do_lu);
+ ur = &dtrq->dtrq_lur->lur_update_rec;
+ ops = &ur->ur_ops;
+ params = update_records_get_params(ur);
+ for (i = 0, op = &ops->uops_op[0]; i < ur->ur_update_count;
+ i++, op = update_op_next_op(op)) {
+ __u64 pos;
+ __u16 size;
+ void *buf;
+
+ if (!lu_fid_eq(&op->uop_fid, fid))
+ continue;
+
+ if (op->uop_type != OUT_WRITE)
+ continue;
+ /* second parameter is read as a little-endian 64-bit "pos"
+ * (presumably the write offset - TODO confirm) */
+ buf = update_params_get_param_buf(params, op->uop_params_off[1],
+ ur->ur_param_count, NULL);
+ if (buf == NULL)
+ continue;
+
+ pos = le64_to_cpu(*(__u64 *)buf);
+ if (pos == 0)
+ continue;
+ /* first parameter is the written buffer; it is accepted only
+ * when its size is exactly that of struct lsd_client_data */
+ buf = update_params_get_param_buf(params, op->uop_params_off[0],
+ ur->ur_param_count, &size);
+ if (buf == NULL)
+ continue;
+
+ if (size != sizeof(*lcd))
+ continue;
+ lcd = buf;
+ }
+ /* no usable client record, or one with an empty uuid */
+ if (lcd == NULL || lcd->lcd_uuid[0] == '\0')
+ return;
+
+ /* locate the export then update the exp_target_data if needed */
+ export = cfs_hash_lookup(obd->obd_uuid_hash, lcd->lcd_uuid);
+ if (export == NULL)
+ return;
+ /* only move forward: copy xid/result when the record's xid is newer */
+ ted = &export->exp_target_data;
+ if (lcd->lcd_last_xid > ted->ted_lcd->lcd_last_xid) {
+ CDEBUG(D_HA, "%s update xid from "LPU64" to "LPU64"\n",
+ lut->lut_obd->obd_name, ted->ted_lcd->lcd_last_xid,
+ lcd->lcd_last_xid);
+ ted->ted_lcd->lcd_last_xid = lcd->lcd_last_xid;
+ ted->ted_lcd->lcd_last_result = lcd->lcd_last_result;
+ }
+ class_export_put(export); /* presumably pairs with the reference taken by cfs_hash_lookup() - confirm */
+}
+
+static void replay_request_or_update(struct lu_env *env,
+ struct lu_target *lut,
+ struct target_recovery_data *trd,
+ struct ptlrpc_thread *thread)
+{
+ struct obd_device *obd = lut->lut_obd;
+ struct ptlrpc_request *req = NULL;
+ int type;
+ __u64 transno;
+ ENTRY;
+ /* Request/update replay stage of recovery. Each pass of the loop
+ * below replays or drops exactly one queued client request, or
+ * replays one distributed update taken from lut->lut_tdtd; the loop
+ * ends when get_next_transno() reports transno 0.
+ *
+ * env    passed to drop_duplicate_replay_req(), the update replay
+ *        handler and target_update_lcd()
+ * lut    target being recovered
+ * trd    supplies the request handler and the expected processing pid
+ * thread service thread handed to handle_recovery_req() */
+ CDEBUG(D_HA, "Waiting for transno "LPD64"\n",
+ obd->obd_next_recovery_transno);
+
+ /* Replay all requests and updates in transno order */
+ do {
+ struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+
+ CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);
+
+ /** It is needed to extend recovery window above
+ * recovery_time_soft. Extending is possible only in the
+ * end of recovery window (see more details in
+ * handle_recovery_req()).
+ */
+ CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
+
+ if (target_recovery_overseer(lut, check_for_next_transno,
+ exp_req_replay_healthy)) {
+ abort_req_replay_queue(obd);
+ abort_lock_replay_queue(obd);
+ }
+ /* Classify the next item under obd_recovery_task_lock. Every
+ * branch below releases the lock before doing the replay work.
+ * type is always written: a non-NULL pointer is passed. */
+ spin_lock(&obd->obd_recovery_task_lock);
+ transno = get_next_transno(lut, &type);
+ if (type == REQUEST_RECOVERY && tdtd != NULL &&
+ transno == tdtd->tdtd_last_update_transno) {
+ /* Drop replay request from client side, if the
+ * replay has been executed by update with the
+ * same transno */
+ req = list_entry(obd->obd_req_replay_queue.next,
+ struct ptlrpc_request, rq_list);
+ list_del_init(&req->rq_list);
+ obd->obd_requests_queued_for_recovery--;
+ spin_unlock(&obd->obd_recovery_task_lock);
+ drop_duplicate_replay_req(env, obd, req);
+ } else if (type == REQUEST_RECOVERY && transno != 0) {
+ req = list_entry(obd->obd_req_replay_queue.next,
+ struct ptlrpc_request, rq_list);
+ list_del_init(&req->rq_list);
+ obd->obd_requests_queued_for_recovery--;
+ spin_unlock(&obd->obd_recovery_task_lock);
+ LASSERT(trd->trd_processing_task == current_pid());
+ DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
+ lustre_msg_get_transno(req->rq_reqmsg),
+ libcfs_nid2str(req->rq_peer.nid));
+
+ handle_recovery_req(thread, req,
+ trd->trd_recovery_handler);
+ /**
+ * bz18031: increase next_recovery_transno before
+ * target_request_copy_put() will drop exp_rpc reference
+ */
+ spin_lock(&obd->obd_recovery_task_lock);
+ obd->obd_next_recovery_transno++;
+ spin_unlock(&obd->obd_recovery_task_lock);
+ target_exp_dequeue_req_replay(req);
+ target_request_copy_put(req);
+ obd->obd_replayed_requests++;
+ } else if (type == UPDATE_RECOVERY && transno != 0) {
+ struct distribute_txn_replay_req *dtrq;
+
+ spin_unlock(&obd->obd_recovery_task_lock);
+ /* NOTE(review): the handler is passed env while the context
+ * entered/exited is thread->t_env's - confirm both refer to
+ * the same environment. */
+ LASSERT(tdtd != NULL);
+ dtrq = distribute_txn_get_next_req(tdtd);
+ lu_context_enter(&thread->t_env->le_ctx);
+ tdtd->tdtd_replay_handler(env, tdtd, dtrq);
+ lu_context_exit(&thread->t_env->le_ctx);
+ extend_recovery_timer(obd, obd_timeout, true);
+ LASSERT(tdtd->tdtd_last_update_transno <= transno);
+ tdtd->tdtd_last_update_transno = transno; /* compared against by the duplicate-request branch above */
+ spin_lock(&obd->obd_recovery_task_lock);
+ if (transno > obd->obd_next_recovery_transno) /* only ever moved forward here */
+ obd->obd_next_recovery_transno = transno;
+ spin_unlock(&obd->obd_recovery_task_lock);
+ target_update_lcd(env, lut, dtrq);
+ dtrq_destroy(dtrq);
+ } else {
+ spin_unlock(&obd->obd_recovery_task_lock);
+ LASSERT(list_empty(&obd->obd_req_replay_queue));
+ LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
+ /** evict exports failed VBR */
+ class_disconnect_stale_exports(obd, exp_vbr_healthy);
+ break;
+ }
+ } while (1);
+}
+
static int target_recovery_thread(void *arg)
{
struct lu_target *lut = arg;
spin_unlock(&obd->obd_dev_lock);
complete(&trd->trd_starting);
- /* first of all, we have to know the first transno to replay */
- if (target_recovery_overseer(obd, check_for_clients,
- exp_connect_healthy)) {
- abort_req_replay_queue(obd);
- abort_lock_replay_queue(obd);
- }
+ /* first of all, we have to know the first transno to replay */
+ if (target_recovery_overseer(lut, check_for_recovery_ready,
+ exp_connect_healthy)) {
+ abort_req_replay_queue(obd);
+ abort_lock_replay_queue(obd);
+ if (lut->lut_tdtd != NULL)
+ dtrq_list_destroy(lut->lut_tdtd);
+ }
- /* next stage: replay requests */
+ /* next stage: replay requests or update */
delta = jiffies;
CDEBUG(D_INFO, "1: request replay stage - %d clients from t"LPU64"\n",
atomic_read(&obd->obd_req_replay_clients),
obd->obd_next_recovery_transno);
- while ((req = target_next_replay_req(obd))) {
- LASSERT(trd->trd_processing_task == current_pid());
- DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
- lustre_msg_get_transno(req->rq_reqmsg),
- libcfs_nid2str(req->rq_peer.nid));
- handle_recovery_req(thread, req,
- trd->trd_recovery_handler);
- /**
- * bz18031: increase next_recovery_transno before
- * target_request_copy_put() will drop exp_rpc reference
- */
- spin_lock(&obd->obd_recovery_task_lock);
- obd->obd_next_recovery_transno++;
- spin_unlock(&obd->obd_recovery_task_lock);
- target_exp_dequeue_req_replay(req);
- target_request_copy_put(req);
- obd->obd_replayed_requests++;
- }
+ replay_request_or_update(env, lut, trd, thread);
/**
* The second stage: replay locks
*/
CDEBUG(D_INFO, "2: lock replay stage - %d clients\n",
atomic_read(&obd->obd_lock_replay_clients));
- while ((req = target_next_replay_lock(obd))) {
+ while ((req = target_next_replay_lock(lut))) {
LASSERT(trd->trd_processing_task == current_pid());
DEBUG_REQ(D_HA, req, "processing lock from %s: ",
libcfs_nid2str(req->rq_peer.nid));
libcfs_debug_dumplog();
}
- target_finish_recovery(obd);
+ target_finish_recovery(lut);
lu_context_fini(&env->le_ctx);
trd->trd_processing_task = 0;
void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
{
struct obd_device *obd = lut->lut_obd;
+
if (obd->obd_max_recoverable_clients == 0) {
/** Update server last boot epoch */
tgt_boot_epoch_update(lut);
}
EXPORT_SYMBOL(target_recovery_init);
-
static int target_process_req_flags(struct obd_device *obd,
struct ptlrpc_request *req)
{
static int target_send_reply_msg(struct ptlrpc_request *req,
int rc, int fail_id)
{
- if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) {
- DEBUG_REQ(D_ERROR, req, "dropping reply");
- return (-ECOMM);
- }
+ if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) { /* fail_id check fired: log and return -ECOMM, nothing is sent */
+ DEBUG_REQ(D_ERROR, req, "dropping reply");
+ return -ECOMM;
+ }
+ if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT &&
+ OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET_REP)))
+ return -ECOMM; /* MDS_REINT-only fail check: same -ECOMM return, but without a log message */
- if (unlikely(rc)) {
- DEBUG_REQ(D_NET, req, "processing error (%d)", rc);
- req->rq_status = rc;
- return (ptlrpc_send_error(req, 1));
- } else {
- DEBUG_REQ(D_NET, req, "sending reply");
- }
+ if (unlikely(rc)) { /* non-zero rc: store it in rq_status and return ptlrpc_send_error()'s result */
+ DEBUG_REQ(D_NET, req, "processing error (%d)", rc);
+ req->rq_status = rc;
+ return ptlrpc_send_error(req, 1);
+ } else {
+ DEBUG_REQ(D_NET, req, "sending reply");
+ }
- return (ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT));
+ return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT); /* rc == 0 path: result of the send is returned to the caller */
}
void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
} else {
if (req->rq_bulk_read)
rc = sptlrpc_svc_wrap_bulk(req, desc);
+
+ if ((exp->exp_connect_data.ocd_connect_flags &
+ OBD_CONNECT_BULK_MBITS) != 0)
+ req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg);
+ else /* old version, bulk matchbits is rq_xid */
+ req->rq_mbits = req->rq_xid;
+
if (rc == 0)
rc = ptlrpc_start_bulk_transfer(desc);
}