* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2010, 2013, Intel Corporation.
+ * Copyright (c) 2010, 2014, Intel Corporation.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
#define DEBUG_SUBSYSTEM S_LDLM
+#include <linux/kthread.h>
#include <libcfs/libcfs.h>
#include <obd.h>
#include <obd_class.h>
INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
INIT_LIST_HEAD(&cli->cl_loi_write_list);
INIT_LIST_HEAD(&cli->cl_loi_read_list);
- client_obd_list_lock_init(&cli->cl_loi_list_lock);
+ spin_lock_init(&cli->cl_loi_list_lock);
atomic_set(&cli->cl_pending_w_pages, 0);
atomic_set(&cli->cl_pending_r_pages, 0);
cli->cl_r_in_flight = 0;
atomic_long_set(&cli->cl_lru_busy, 0);
atomic_long_set(&cli->cl_lru_in_list, 0);
INIT_LIST_HEAD(&cli->cl_lru_list);
- client_obd_list_lock_init(&cli->cl_lru_list_lock);
+ spin_lock_init(&cli->cl_lru_list_lock);
atomic_long_set(&cli->cl_unstable_count, 0);
init_waitqueue_head(&cli->cl_destroy_waitq);
GOTO(err_import, rc);
}
- cli->cl_import = imp;
- /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
- cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
- cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
+ cli->cl_import = imp;
+ /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */
+ cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
GOTO(err_import, rc = -ENOMEM);
}
- cli->cl_qchk_stat = CL_NOT_QUOTACHECKED;
-
RETURN(rc);
err_import:
*exp = NULL;
down_write(&cli->cl_sem);
- if (cli->cl_conn_count > 0 )
+ if (cli->cl_conn_count > 0)
GOTO(out_sem, rc = -EALREADY);
rc = class_connect(&conn, obd, cluuid);
imp = cli->cl_import;
down_write(&cli->cl_sem);
- CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name,
- cli->cl_conn_count);
+ CDEBUG(D_INFO, "disconnect %s - %zu\n", obd->obd_name,
+ cli->cl_conn_count);
- if (!cli->cl_conn_count) {
+ if (cli->cl_conn_count == 0) {
CERROR("disconnecting disconnected device (%s)\n",
obd->obd_name);
GOTO(out_disconnect, rc = -EINVAL);
}
cli->cl_conn_count--;
- if (cli->cl_conn_count)
+ if (cli->cl_conn_count != 0)
GOTO(out_disconnect, rc = 0);
/* Mark import deactivated now, so we don't try to reconnect if any
spin_unlock(&exp->exp_lock);
class_export_cb_put(exp);
}
-EXPORT_SYMBOL(target_client_add_cb);
static void
check_and_start_recovery_timer(struct obd_device *obd,
char *target_start;
int target_len;
bool mds_conn = false, lw_client = false;
+ bool mds_mds_conn = false;
+ bool new_mds_mds_conn = false;
struct obd_connect_data *data, *tmpdata;
int size, tmpsize;
lnet_nid_t *client_nid = NULL;
if (rc)
GOTO(out, rc);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0)
+ /* Don't allow clients to connect that are using old 1.8 format
+ * protocol conventions (LUSTRE_MSG_MAGIC_v1, !MSGHDR_CKSUM_INCOMPAT18,
+ * ldlm_flock_policy_wire format, MDT_ATTR_xTIME_SET, etc). The
+ * FULL20 flag should be set on all connections since 2.0, but no
+ * longer affects behaviour.
+ *
+ * Later this check will be disabled and the flag can be retired
+ * completely once interop with 3.0 is no longer needed.
+ */
+ if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20))
+ GOTO(out, rc = -EPROTO);
+#endif
+
if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
if (data->ocd_version < LUSTRE_VERSION_CODE -
LUSTRE_VERSION_ALLOWED_OFFSET ||
}
}
+ /* Note: lw_client is needed in MDS-MDS failover during update log
+ * processing, so we need to allow lw_client to connect at any
+ * time, instead of only on the initial connection */
+ lw_client = (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0;
+
if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) {
mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS) != 0;
- lw_client = (data->ocd_connect_flags &
- OBD_CONNECT_LIGHTWEIGHT) != 0;
+ mds_mds_conn = (data->ocd_connect_flags &
+ OBD_CONNECT_MDS_MDS) != 0;
+
+ /* OBD_CONNECT_MNE_SWAB is defined as OBD_CONNECT_MDS_MDS
+ * for Imperative Recovery connection from MGC to MGS.
+ *
+ * By checking OBD_CONNECT_FID, we can distinguish whether
+ * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from
+ * MGC or MDT. */
+ if (!lw_client &&
+ (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) &&
+ (data->ocd_connect_flags & OBD_CONNECT_FID) &&
+ (data->ocd_connect_flags & OBD_CONNECT_VERSION)) {
+ __u32 major = OBD_OCD_VERSION_MAJOR(data->ocd_version);
+ __u32 minor = OBD_OCD_VERSION_MINOR(data->ocd_version);
+ __u32 patch = OBD_OCD_VERSION_PATCH(data->ocd_version);
+
+ /* We do not support MDT-MDT interoperation between MDTs
+ * of different versions because of protocol changes. */
+ if (unlikely(major != LUSTRE_MAJOR ||
+ minor != LUSTRE_MINOR ||
+ abs(patch - LUSTRE_PATCH) > 3)) {
+ LCONSOLE_WARN("%s (%u.%u.%u.%u) refused the "
+ "connection from different version MDT "
+ "(%d.%d.%d.%d) %s %s\n",
+ target->obd_name, LUSTRE_MAJOR,
+ LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX,
+ major, minor, patch,
+ OBD_OCD_VERSION_FIX(data->ocd_version),
+ libcfs_nid2str(req->rq_peer.nid), str);
+
+ GOTO(out, rc = -EPROTO);
+ }
+ }
}
/* lctl gets a backstage, all-access pass. */
if (export == NULL) {
/* allow lightweight connections during recovery */
- if (target->obd_recovering && !lw_client) {
+ /* allow "new" MDT to be connected during recovery, since we
+ * need to retrieve recovery update records from it */
+ if (target->obd_recovering && !lw_client && !mds_mds_conn) {
cfs_time_t t;
int c; /* connected */
int i; /* in progress */
t = cfs_timer_deadline(&target->obd_recovery_timer);
t = cfs_time_sub(t, cfs_time_current());
t = cfs_duration_sec(t);
- LCONSOLE_WARN("%s: Denying connection for new client "
- "%s (at %s), waiting for all %d known "
- "clients (%d recovered, %d in progress, "
- "and %d evicted) to recover in %d:%.02d\n",
+ LCONSOLE_WARN("%s: Denying connection for new client %s"
+ "(at %s), waiting for %d known clients "
+ "(%d recovered, %d in progress, and %d "
+ "evicted) to recover in %d:%.02d\n",
target->obd_name, cluuid.uuid,
libcfs_nid2str(req->rq_peer.nid), k,
c - i, i, s, (int)t / 60,
(int)t % 60);
- rc = -EBUSY;
- } else {
+ rc = -EBUSY;
+ } else {
dont_check_exports:
- rc = obd_connect(req->rq_svc_thread->t_env,
- &export, target, &cluuid, data,
- client_nid);
+ rc = obd_connect(req->rq_svc_thread->t_env,
+ &export, target, &cluuid, data,
+ client_nid);
if (mds_conn && OBD_FAIL_CHECK(OBD_FAIL_TGT_RCVG_FLAG))
lustre_msg_add_op_flags(req->rq_repmsg,
- MSG_CONNECT_RECOVERING);
- if (rc == 0)
- conn.cookie = export->exp_handle.h_cookie;
- }
- } else {
- rc = obd_reconnect(req->rq_svc_thread->t_env,
- export, target, &cluuid, data, client_nid);
- }
- if (rc)
- GOTO(out, rc);
+ MSG_CONNECT_RECOVERING);
+ if (rc == 0)
+ conn.cookie = export->exp_handle.h_cookie;
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 6, 53, 0)
- /* 2.2.0 clients always swab nidtbl entries due to a bug, so server
- * will do the swabbing for if the client is using the same endianness.
- *
- * This fixup is version-limited, because we don't want to carry the
- * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we need
- * interop with unpatched 2.2 clients. For newer clients, servers
- * will never do MNE swabbing, let the client handle that. LU-1644 */
- export->exp_need_mne_swab = !ptlrpc_req_need_swab(req) &&
- !(data->ocd_connect_flags & OBD_CONNECT_MNE_SWAB);
-#endif
+ if (mds_mds_conn)
+ new_mds_mds_conn = true;
+ }
+ } else {
+ rc = obd_reconnect(req->rq_svc_thread->t_env,
+ export, target, &cluuid, data, client_nid);
+ }
+ if (rc)
+ GOTO(out, rc);
- LASSERT(target->u.obt.obt_magic == OBT_MAGIC);
- data->ocd_instance = target->u.obt.obt_instance;
+ LASSERT(target->u.obt.obt_magic == OBT_MAGIC);
+ data->ocd_instance = target->u.obt.obt_instance;
/* Return only the parts of obd_connect_data that we understand, so the
* client knows that we don't understand the rest. */
atomic_inc(&target->obd_req_replay_clients);
atomic_inc(&target->obd_lock_replay_clients);
+ /* Note: an MDS-MDS connection is allowed to connect during
+ * recovery, no matter whether the export needs to be recovered,
+ * because we need to retrieve update logs from all other MDTs.
+ * So if the MDS-MDS export is new, obd_max_recoverable_clients
+ * also needs to be increased to match the other recovery
+ * checking conditions. */
+ if (new_mds_mds_conn)
+ target->obd_max_recoverable_clients++;
if (atomic_inc_return(&target->obd_connected_clients) ==
target->obd_max_recoverable_clients)
wake_up(&target->obd_next_transno_waitq);
* ptlrpc_handle_server_req_in->lustre_unpack_msg(). */
revimp->imp_msg_magic = req->rq_reqmsg->lm_magic;
- if ((data->ocd_connect_flags & OBD_CONNECT_AT) &&
- (revimp->imp_msg_magic != LUSTRE_MSG_MAGIC_V1))
+ if (data->ocd_connect_flags & OBD_CONNECT_AT)
revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
else
revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
- if ((data->ocd_connect_flags & OBD_CONNECT_FULL20) &&
- (revimp->imp_msg_magic != LUSTRE_MSG_MAGIC_V1))
- revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
- else
- revimp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+ revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr);
if (rc) {
req->rq_status = rc;
RETURN(rc);
}
-EXPORT_SYMBOL(target_handle_connect);
int target_handle_disconnect(struct ptlrpc_request *req)
{
RETURN(0);
}
-EXPORT_SYMBOL(target_handle_disconnect);
void target_destroy_export(struct obd_export *exp)
{
spin_unlock(&req->rq_export->exp_lock);
}
-static void target_finish_recovery(struct obd_device *obd)
+static void target_finish_recovery(struct lu_target *lut)
{
+ struct obd_device *obd = lut->lut_obd;
ENTRY;
/* Only log a recovery message when recovery has occurred. */
}
spin_unlock(&obd->obd_recovery_task_lock);
+ if (lut->lut_tdtd != NULL &&
+ !list_empty(&lut->lut_tdtd->tdtd_replay_list))
+ dtrq_list_dump(lut->lut_tdtd, D_ERROR);
+
obd->obd_recovery_end = cfs_time_current_sec();
/* When recovery finished, cleanup orphans on MDS and OST. */
return;
}
obd->obd_recovering = obd->obd_abort_recovery = 0;
+ obd->obd_force_abort_recovery = 0;
spin_unlock(&obd->obd_dev_lock);
spin_lock(&obd->obd_recovery_task_lock);
spin_unlock(&obd->obd_recovery_task_lock);
list_for_each_entry_safe(req, n, &clean_list, rq_list) {
- LASSERT(req->rq_reply_state == 0);
+ LASSERT(req->rq_reply_state == NULL);
target_exp_dequeue_req_replay(req);
target_request_copy_put(req);
}
spin_unlock(&obd->obd_recovery_task_lock);
list_for_each_entry_safe(req, n, &clean_list, rq_list) {
- LASSERT(req->rq_reply_state == 0);
+ LASSERT(req->rq_reply_state == NULL);
target_request_copy_put(req);
}
CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name);
cfs_timer_disarm(&obd->obd_recovery_timer);
}
-EXPORT_SYMBOL(target_cancel_recovery_timer);
static void target_start_recovery_timer(struct obd_device *obd)
{
return;
spin_lock(&obd->obd_dev_lock);
- if (!obd->obd_recovering || obd->obd_abort_recovery) {
+ if (!obd->obd_recovering || obd->obd_abort_recovery ||
+ obd->obd_force_abort_recovery) {
spin_unlock(&obd->obd_dev_lock);
return;
}
int to;
spin_lock(&obd->obd_dev_lock);
- if (!obd->obd_recovering || obd->obd_abort_recovery) {
+ if (!obd->obd_recovering || obd->obd_abort_recovery ||
+ obd->obd_force_abort_recovery) {
spin_unlock(&obd->obd_dev_lock);
return;
}
to += drt - left;
} else if (!extend && (drt > to)) {
to = drt;
- /* reduce drt by already passed time */
- drt -= obd->obd_recovery_timeout - left;
}
if (to > obd->obd_recovery_time_hard)
to = obd->obd_recovery_time_hard;
- if (obd->obd_recovery_timeout < to ||
- obd->obd_recovery_timeout == obd->obd_recovery_time_hard) {
+ if (obd->obd_recovery_timeout < to) {
obd->obd_recovery_timeout = to;
- cfs_timer_arm(&obd->obd_recovery_timer,
- cfs_time_shift(drt));
+ end = obd->obd_recovery_start + to;
+ cfs_timer_arm(&obd->obd_recovery_timer,
+ cfs_time_shift(end - now));
}
spin_unlock(&obd->obd_dev_lock);
CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n",
- obd->obd_name, (unsigned)drt);
+ obd->obd_name, (unsigned)cfs_time_sub(end, now));
}
/* Reset the timer with each new client connection */
return (exp->exp_in_recovery && !exp->exp_lock_replay_needed);
}
-/** Checking routines for recovery */
-static int check_for_clients(struct obd_device *obd)
-{
- unsigned int clnts = atomic_read(&obd->obd_connected_clients);
-
- if (obd->obd_abort_recovery || obd->obd_recovery_expired)
- return 1;
- LASSERT(clnts <= obd->obd_max_recoverable_clients);
- return (clnts + obd->obd_stale_clients ==
- obd->obd_max_recoverable_clients);
-}
-
-static int check_for_next_transno(struct obd_device *obd)
+static int check_for_next_transno(struct lu_target *lut)
{
struct ptlrpc_request *req = NULL;
+ struct obd_device *obd = lut->lut_obd;
int wake_up = 0, connected, completed, queue_len;
- __u64 next_transno, req_transno;
+ __u64 req_transno = 0;
+ __u64 update_transno = 0;
+ __u64 next_transno = 0;
ENTRY;
spin_lock(&obd->obd_recovery_task_lock);
req = list_entry(obd->obd_req_replay_queue.next,
struct ptlrpc_request, rq_list);
req_transno = lustre_msg_get_transno(req->rq_reqmsg);
- } else {
- req_transno = 0;
+ }
+
+	if (lut->lut_tdtd != NULL) {
+		/* Store into the function-scope update_transno. Declaring a
+		 * second variable of the same name in this block would shadow
+		 * it, leaving the outer value at 0 so that the wake-up test
+		 * and the debug message below never see the real next update
+		 * transno. */
+		update_transno = distribute_txn_get_next_transno(lut->lut_tdtd);
+	}
connected = atomic_read(&obd->obd_connected_clients);
obd->obd_max_recoverable_clients, connected, completed,
queue_len, req_transno, next_transno);
- if (obd->obd_abort_recovery) {
+ if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
CDEBUG(D_HA, "waking for aborted recovery\n");
wake_up = 1;
} else if (obd->obd_recovery_expired) {
CDEBUG(D_HA, "waking for expired recovery\n");
wake_up = 1;
- } else if (req_transno == next_transno) {
+ } else if (req_transno == next_transno || (update_transno != 0 &&
+ update_transno <= next_transno)) {
CDEBUG(D_HA, "waking for next ("LPD64")\n", next_transno);
wake_up = 1;
} else if (queue_len > 0 &&
CDEBUG(d_lvl,
"%s: waking for gap in transno, VBR is %s (skip: "
LPD64", ql: %d, comp: %d, conn: %d, next: "LPD64
- ", last_committed: "LPD64")\n",
+ ", next_update "LPD64" last_committed: "LPD64")\n",
obd->obd_name, obd->obd_version_recov ? "ON" : "OFF",
next_transno, queue_len, completed, connected,
- req_transno, obd->obd_last_committed);
+ req_transno, update_transno, obd->obd_last_committed);
obd->obd_next_recovery_transno = req_transno;
wake_up = 1;
} else if (atomic_read(&obd->obd_req_replay_clients) == 0) {
return wake_up;
}
-static int check_for_next_lock(struct obd_device *obd)
+static int check_for_next_lock(struct lu_target *lut)
{
+ struct obd_device *obd = lut->lut_obd;
int wake_up = 0;
spin_lock(&obd->obd_recovery_task_lock);
} else if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
CDEBUG(D_HA, "waking for completed lock replay\n");
wake_up = 1;
- } else if (obd->obd_abort_recovery) {
+ } else if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
CDEBUG(D_HA, "waking for aborted recovery\n");
wake_up = 1;
} else if (obd->obd_recovery_expired) {
* check its status with help of check_routine
* evict dead clients via health_check
*/
-static int target_recovery_overseer(struct obd_device *obd,
- int (*check_routine)(struct obd_device *),
+static int target_recovery_overseer(struct lu_target *lut,
+ int (*check_routine)(struct lu_target *),
int (*health_check)(struct obd_export *))
{
+ struct obd_device *obd = lut->lut_obd;
repeat:
- wait_event(obd->obd_next_transno_waitq, check_routine(obd));
- if (obd->obd_abort_recovery) {
+ if ((obd->obd_recovery_start != 0) && (cfs_time_current_sec() >=
+ (obd->obd_recovery_start + obd->obd_recovery_time_hard))) {
+ CWARN("recovery is aborted by hard timeout\n");
+ obd->obd_abort_recovery = 1;
+ }
+
+ while (wait_event_timeout(obd->obd_next_transno_waitq,
+ check_routine(lut),
+ msecs_to_jiffies(60 * MSEC_PER_SEC)) == 0)
+ /* wait indefinitely for event, but don't trigger watchdog */;
+
+ if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
CWARN("recovery is aborted, evict exports in recovery\n");
/** evict exports which didn't finish recovery yet */
class_disconnect_stale_exports(obd, exp_finished);
return 0;
}
-static struct ptlrpc_request *target_next_replay_req(struct obd_device *obd)
-{
- struct ptlrpc_request *req = NULL;
- ENTRY;
-
- CDEBUG(D_HA, "Waiting for transno "LPD64"\n",
- obd->obd_next_recovery_transno);
-
- CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);
-
- if (target_recovery_overseer(obd, check_for_next_transno,
- exp_req_replay_healthy)) {
- abort_req_replay_queue(obd);
- abort_lock_replay_queue(obd);
- }
-
- spin_lock(&obd->obd_recovery_task_lock);
- if (!list_empty(&obd->obd_req_replay_queue)) {
- req = list_entry(obd->obd_req_replay_queue.next,
- struct ptlrpc_request, rq_list);
- list_del_init(&req->rq_list);
- obd->obd_requests_queued_for_recovery--;
- spin_unlock(&obd->obd_recovery_task_lock);
- } else {
- spin_unlock(&obd->obd_recovery_task_lock);
- LASSERT(list_empty(&obd->obd_req_replay_queue));
- LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
- /** evict exports failed VBR */
- class_disconnect_stale_exports(obd, exp_vbr_healthy);
- }
- RETURN(req);
-}
-
-static struct ptlrpc_request *target_next_replay_lock(struct obd_device *obd)
+static struct ptlrpc_request *target_next_replay_lock(struct lu_target *lut)
{
+ struct obd_device *obd = lut->lut_obd;
struct ptlrpc_request *req = NULL;
CDEBUG(D_HA, "Waiting for lock\n");
- if (target_recovery_overseer(obd, check_for_next_lock,
+ if (target_recovery_overseer(lut, check_for_next_lock,
exp_lock_replay_healthy))
abort_lock_replay_queue(obd);
EXIT;
}
+/**
+ * Checking routine for the first recovery stage: decide whether the
+ * recovery thread may stop waiting for clients.
+ *
+ * \param[in] lut	target being recovered
+ *
+ * \retval 1	stop waiting: either abort was forced, or the client wait is
+ *		over and any distributed-transaction data is replay-ready
+ * \retval 0	keep waiting
+ */
+static int check_for_recovery_ready(struct lu_target *lut)
+{
+	struct obd_device *dev = lut->lut_obd;
+	struct target_distribute_txn_data *tdtd;
+	unsigned int connected = atomic_read(&dev->obd_connected_clients);
+
+	CDEBUG(D_HA, "connected %d stale %d max_recoverable_clients %d"
+	       " abort %d expired %d\n", connected, dev->obd_stale_clients,
+	       dev->obd_max_recoverable_clients, dev->obd_abort_recovery,
+	       dev->obd_recovery_expired);
+
+	/* A forced abort bypasses every other condition. */
+	if (dev->obd_force_abort_recovery)
+		return 1;
+
+	/* Unless recovery was aborted or has expired, every recoverable
+	 * client must be either connected or counted as stale. */
+	if (!(dev->obd_abort_recovery || dev->obd_recovery_expired)) {
+		LASSERT(connected <= dev->obd_max_recoverable_clients);
+		if (connected + dev->obd_stale_clients <
+		    dev->obd_max_recoverable_clients)
+			return 0;
+	}
+
+	tdtd = lut->lut_tdtd;
+	if (tdtd == NULL)
+		return 1;
+
+	if (!tdtd->tdtd_replay_ready) {
+		/* Let's extend recovery timer, in case the recovery
+		 * timer expired, and some clients got evicted */
+		extend_recovery_timer(dev, dev->obd_recovery_timeout, true);
+		return 0;
+	}
+
+	dtrq_list_dump(tdtd, D_HA);
+	return 1;
+}
+
+enum { /* source of the transno reported by get_next_transno() */
+ REQUEST_RECOVERY = 1, /* taken from obd_req_replay_queue */
+ UPDATE_RECOVERY = 2, /* taken from distribute_txn_get_next_transno() */
+};
+
+/**
+ * Return the transno of the request at the head of the replay queue.
+ *
+ * \param[in] obd	device whose obd_req_replay_queue is inspected
+ *
+ * \retval		transno of the first queued request, or 0 when the
+ *			queue is empty
+ */
+static __u64 get_next_replay_req_transno(struct obd_device *obd)
+{
+	struct ptlrpc_request *head;
+
+	if (list_empty(&obd->obd_req_replay_queue))
+		return 0;
+
+	head = list_entry(obd->obd_req_replay_queue.next,
+			  struct ptlrpc_request, rq_list);
+	return lustre_msg_get_transno(head->rq_reqmsg);
+}
+/**
+ * Pick the next transno to replay and report which source supplies it.
+ *
+ * The head of the request replay queue is used unless the target has
+ * distributed-transaction data and either no request is queued or a
+ * non-zero update transno is not greater than the request transno.
+ *
+ * \param[in] lut	target under recovery
+ * \param[out] type	if not NULL, set to REQUEST_RECOVERY or
+ *			UPDATE_RECOVERY according to the source chosen
+ *
+ * \retval		the chosen transno; 0 when the chosen source is empty
+ */
+__u64 get_next_transno(struct lu_target *lut, int *type)
+{
+	struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+	__u64 req_next;
+	__u64 upd_next;
+	ENTRY;
+
+	req_next = get_next_replay_req_transno(lut->lut_obd);
+	if (type != NULL)
+		*type = REQUEST_RECOVERY;
+
+	/* Without distributed-transaction data only requests are replayed. */
+	if (tdtd == NULL)
+		RETURN(req_next);
+
+	upd_next = distribute_txn_get_next_transno(tdtd);
+
+	/* A queued request stays first only while no update is pending or
+	 * the pending update has a strictly larger transno. */
+	if (req_next != 0 && (upd_next == 0 || upd_next > req_next))
+		RETURN(req_next);
+
+	if (type != NULL)
+		*type = UPDATE_RECOVERY;
+	RETURN(upd_next);
+}
+
+/**
+ * Drop a duplicate replay request.
+ *
+ * Because the operation has already been replayed by update recovery, the
+ * request with the same transno is dropped. For MDS_REINT requests a reply
+ * is still packed and sent so that the client goes on to send its next
+ * replay request; any other opcode is only logged as an error.
+ *
+ * \param[in] env	execution environment (not referenced in the body)
+ * \param[in] obd	failover obd device; its obd_replayed_requests
+ *			counter is incremented
+ * \param[in] req	request to be dropped; it is removed from the
+ *			export replay queue and its copy reference is put
+ */
+static void drop_duplicate_replay_req(struct lu_env *env,
+				      struct obd_device *obd,
+				      struct ptlrpc_request *req)
+{
+	DEBUG_REQ(D_HA, req, "remove t"LPD64" from %s because of duplicate"
+		  " update records are found.\n",
+		  lustre_msg_get_transno(req->rq_reqmsg),
+		  libcfs_nid2str(req->rq_peer.nid));
+
+	/* Right now, only for MDS reint operation update replay and
+	 * normal request replay can have the same transno */
+	if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT) {
+		req_capsule_set(&req->rq_pill, &RQF_MDS_REINT);
+		req->rq_status = req_capsule_server_pack(&req->rq_pill);
+		if (likely(req->rq_export))
+			target_committed_to_req(req);
+		lustre_msg_set_transno(req->rq_repmsg, req->rq_transno);
+		target_send_reply(req, req->rq_status, 0);
+	} else {
+		/* The two adjacent literals used here previously were
+		 * concatenated without a separating space, producing
+		 * "wrong opcfrom"; use a single, correctly spaced string. */
+		DEBUG_REQ(D_ERROR, req, "wrong opc from %s\n",
+			  libcfs_nid2str(req->rq_peer.nid));
+	}
+	target_exp_dequeue_req_replay(req);
+	target_request_copy_put(req);
+	obd->obd_replayed_requests++;
+}
+
+/**
+ * Refresh the in-memory last_rcvd client data after replaying an update
+ *
+ * Update recovery may write last_rcvd through the replayed updates
+ * themselves, which does not refresh the last_rcvd information kept in
+ * memory, so that information is refreshed here after update recovery.
+ *
+ * The update records of \a dtrq are scanned for OUT_WRITE operations on
+ * the object referenced by lut_last_rcvd whose payload has the size of
+ * struct lsd_client_data; the last such record wins. If an export with
+ * the recorded UUID is found and the record carries a larger
+ * lcd_last_xid, the export's lcd_last_xid and lcd_last_result are
+ * overwritten.
+ *
+ * \param[in] env execution environment (not referenced in the body)
+ * \param[in] lut target under recovery
+ * \param[in] dtrq the update replay request being replayed
+ */
+static void target_update_lcd(struct lu_env *env, struct lu_target *lut,
+ struct distribute_txn_replay_req *dtrq)
+{
+ struct obd_device *obd = lut->lut_obd;
+ struct obd_export *export;
+ struct tg_export_data *ted;
+ struct distribute_txn_replay_req_sub *dtrqs;
+ struct seq_server_site *site;
+ struct update_records *ur;
+ const struct lu_fid *fid;
+ struct update_ops *ops;
+ struct update_params *params;
+ struct update_op *op;
+ __u32 mdt_index;
+ unsigned int i;
+ struct lsd_client_data *lcd = NULL;
+
+ /* If the updates have already been executed (committed) on the
+ * recovery target, i.e. the updates are not being executed on the
+ * target now, we do not need to update anything in memory.
+ * This is detected by a sub-request existing for this node's index. */
+ site = lu_site2seq(obd->obd_lu_dev->ld_site);
+ mdt_index = site->ss_node_id;
+ dtrqs = dtrq_sub_lookup(dtrq, mdt_index);
+ if (dtrqs != NULL)
+ return;
+
+ if (dtrq->dtrq_lur == NULL) /* no update records attached: nothing to scan */
+ return;
+
+ /* Find the update last_rcvd record: walk every update op and keep
+ * the payload of those that write to the lut_last_rcvd object */
+ fid = lu_object_fid(&lut->lut_last_rcvd->do_lu);
+ ur = &dtrq->dtrq_lur->lur_update_rec;
+ ops = &ur->ur_ops;
+ params = update_records_get_params(ur);
+ for (i = 0, op = &ops->uops_op[0]; i < ur->ur_update_count;
+ i++, op = update_op_next_op(op)) {
+ __u64 pos;
+ __u16 size;
+ void *buf;
+
+ if (!lu_fid_eq(&op->uop_fid, fid)) /* op targets another object */
+ continue;
+
+ if (op->uop_type != OUT_WRITE) /* only writes are of interest */
+ continue;
+
+ buf = update_params_get_param_buf(params, op->uop_params_off[1],
+ ur->ur_param_count, NULL);
+ if (buf == NULL)
+ continue;
+
+ pos = le64_to_cpu(*(__u64 *)buf); /* parameter 1 read as a little-endian 64-bit value */
+ if (pos == 0) /* presumably position 0 is not a client record - TODO confirm */
+ continue;
+
+ buf = update_params_get_param_buf(params, op->uop_params_off[0],
+ ur->ur_param_count, &size);
+ if (buf == NULL)
+ continue;
+
+ if (size != sizeof(*lcd)) /* payload is not a struct lsd_client_data */
+ continue;
+ lcd = buf; /* no break: a later matching op replaces this one */
+ }
+
+ if (lcd == NULL || lcd->lcd_uuid[0] == '\0')
+ return;
+
+ /* locate the export then update the exp_target_data if needed */
+ export = cfs_hash_lookup(obd->obd_uuid_hash, lcd->lcd_uuid);
+ if (export == NULL)
+ return;
+
+ ted = &export->exp_target_data;
+ if (lcd->lcd_last_xid > ted->ted_lcd->lcd_last_xid) {
+ CDEBUG(D_HA, "%s update xid from "LPU64" to "LPU64"\n",
+ lut->lut_obd->obd_name, ted->ted_lcd->lcd_last_xid,
+ lcd->lcd_last_xid);
+ ted->ted_lcd->lcd_last_xid = lcd->lcd_last_xid;
+ ted->ted_lcd->lcd_last_result = lcd->lcd_last_result;
+ }
+ class_export_put(export); /* NOTE(review): presumably drops the reference taken by cfs_hash_lookup() - confirm */
+}
+/**
+ * Replay queued client requests and distributed-transaction updates until
+ * get_next_transno() reports nothing left to replay.
+ *
+ * Each pass waits in target_recovery_overseer() with
+ * check_for_next_transno(); a non-zero result aborts both the request and
+ * the lock replay queues. The next item is then chosen under
+ * obd_recovery_task_lock and handled in one of four ways:
+ * - a request whose transno equals tdtd_last_update_transno is dropped as
+ * a duplicate of an already replayed update;
+ * - any other request is passed to handle_recovery_req();
+ * - an update is passed to tdtd_replay_handler();
+ * - otherwise stale exports are disconnected and the loop ends.
+ *
+ * \param[in] env environment passed to tdtd_replay_handler() and to the
+ * duplicate-drop and lcd-refresh helpers
+ * \param[in] lut target under recovery
+ * \param[in] trd recovery data supplying trd_recovery_handler and the
+ * expected trd_processing_task
+ * \param[in] thread service thread passed to handle_recovery_req(); its
+ * t_env context is entered around update replay
+ *
+ * NOTE(review): ENTRY has no matching EXIT on the way out - confirm that
+ * this is intended.
+ */
+static void replay_request_or_update(struct lu_env *env,
+ struct lu_target *lut,
+ struct target_recovery_data *trd,
+ struct ptlrpc_thread *thread)
+{
+ struct obd_device *obd = lut->lut_obd;
+ struct ptlrpc_request *req = NULL;
+ int type;
+ __u64 transno;
+ ENTRY;
+
+ CDEBUG(D_HA, "Waiting for transno "LPD64"\n",
+ obd->obd_next_recovery_transno);
+
+ /* Replay all requests and updates in transno order */
+ do {
+ struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+
+ CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);
+
+ /** It is needed to extend recovery window above
+ * recovery_time_soft. Extending is possible only in the
+ * end of recovery window (see more details in
+ * handle_recovery_req()).
+ */
+ CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
+
+ if (target_recovery_overseer(lut, check_for_next_transno,
+ exp_req_replay_healthy)) {
+ abort_req_replay_queue(obd);
+ abort_lock_replay_queue(obd);
+ }
+
+ spin_lock(&obd->obd_recovery_task_lock);
+ transno = get_next_transno(lut, &type); /* chosen while holding obd_recovery_task_lock */
+ if (type == REQUEST_RECOVERY && tdtd != NULL &&
+ transno == tdtd->tdtd_last_update_transno) {
+ /* Drop replay request from client side, if the
+ * replay has been executed by update with the
+ * same transno */
+ req = list_entry(obd->obd_req_replay_queue.next,
+ struct ptlrpc_request, rq_list);
+ list_del_init(&req->rq_list);
+ obd->obd_requests_queued_for_recovery--;
+ spin_unlock(&obd->obd_recovery_task_lock);
+ drop_duplicate_replay_req(env, obd, req);
+ } else if (type == REQUEST_RECOVERY && transno != 0) {
+ req = list_entry(obd->obd_req_replay_queue.next,
+ struct ptlrpc_request, rq_list);
+ list_del_init(&req->rq_list);
+ obd->obd_requests_queued_for_recovery--;
+ spin_unlock(&obd->obd_recovery_task_lock);
+ LASSERT(trd->trd_processing_task == current_pid());
+ DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
+ lustre_msg_get_transno(req->rq_reqmsg),
+ libcfs_nid2str(req->rq_peer.nid));
+
+ handle_recovery_req(thread, req,
+ trd->trd_recovery_handler);
+ /**
+ * bz18031: increase next_recovery_transno before
+ * target_request_copy_put() will drop exp_rpc reference
+ */
+ spin_lock(&obd->obd_recovery_task_lock);
+ obd->obd_next_recovery_transno++;
+ spin_unlock(&obd->obd_recovery_task_lock);
+ target_exp_dequeue_req_replay(req);
+ target_request_copy_put(req);
+ obd->obd_replayed_requests++;
+ } else if (type == UPDATE_RECOVERY && transno != 0) {
+ struct distribute_txn_replay_req *dtrq;
+
+ spin_unlock(&obd->obd_recovery_task_lock);
+
+ LASSERT(tdtd != NULL);
+ dtrq = distribute_txn_get_next_req(tdtd); /* NOTE(review): used without a NULL check - confirm it cannot be NULL here */
+ lu_context_enter(&thread->t_env->le_ctx);
+ tdtd->tdtd_replay_handler(env, tdtd, dtrq);
+ lu_context_exit(&thread->t_env->le_ctx);
+ extend_recovery_timer(obd, obd_timeout, true);
+ LASSERT(tdtd->tdtd_last_update_transno <= transno);
+ tdtd->tdtd_last_update_transno = transno; /* compared against request transnos to detect duplicates */
+ spin_lock(&obd->obd_recovery_task_lock);
+ if (transno > obd->obd_next_recovery_transno) /* only ever moves forward here */
+ obd->obd_next_recovery_transno = transno;
+ spin_unlock(&obd->obd_recovery_task_lock);
+ target_update_lcd(env, lut, dtrq);
+ dtrq_destory(dtrq);
+ } else {
+ spin_unlock(&obd->obd_recovery_task_lock);
+ LASSERT(list_empty(&obd->obd_req_replay_queue));
+ LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
+ /** evict exports that failed VBR */
+ class_disconnect_stale_exports(obd, exp_vbr_healthy);
+ break; /* nothing left to replay: leave the loop */
+ }
+ } while (1);
+}
+
static int target_recovery_thread(void *arg)
{
struct lu_target *lut = arg;
spin_unlock(&obd->obd_dev_lock);
complete(&trd->trd_starting);
- /* first of all, we have to know the first transno to replay */
- if (target_recovery_overseer(obd, check_for_clients,
- exp_connect_healthy)) {
- abort_req_replay_queue(obd);
- abort_lock_replay_queue(obd);
- }
+ /* first of all, we have to know the first transno to replay */
+ if (target_recovery_overseer(lut, check_for_recovery_ready,
+ exp_connect_healthy)) {
+ abort_req_replay_queue(obd);
+ abort_lock_replay_queue(obd);
+ if (lut->lut_tdtd != NULL)
+ dtrq_list_destroy(lut->lut_tdtd);
+ }
- /* next stage: replay requests */
+ /* next stage: replay requests or update */
delta = jiffies;
CDEBUG(D_INFO, "1: request replay stage - %d clients from t"LPU64"\n",
atomic_read(&obd->obd_req_replay_clients),
obd->obd_next_recovery_transno);
- while ((req = target_next_replay_req(obd))) {
- LASSERT(trd->trd_processing_task == current_pid());
- DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
- lustre_msg_get_transno(req->rq_reqmsg),
- libcfs_nid2str(req->rq_peer.nid));
- handle_recovery_req(thread, req,
- trd->trd_recovery_handler);
- /**
- * bz18031: increase next_recovery_transno before
- * target_request_copy_put() will drop exp_rpc reference
- */
- spin_lock(&obd->obd_recovery_task_lock);
- obd->obd_next_recovery_transno++;
- spin_unlock(&obd->obd_recovery_task_lock);
- target_exp_dequeue_req_replay(req);
- target_request_copy_put(req);
- obd->obd_replayed_requests++;
- }
+ replay_request_or_update(env, lut, trd, thread);
/**
* The second stage: replay locks
*/
CDEBUG(D_INFO, "2: lock replay stage - %d clients\n",
atomic_read(&obd->obd_lock_replay_clients));
- while ((req = target_next_replay_lock(obd))) {
+ while ((req = target_next_replay_lock(lut))) {
LASSERT(trd->trd_processing_task == current_pid());
DEBUG_REQ(D_HA, req, "processing lock from %s: ",
libcfs_nid2str(req->rq_peer.nid));
* The third stage: reply on final pings, at this moment all clients
* must have request in final queue
*/
+ CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_RECONNECT, cfs_fail_val);
CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n");
/** Update server last boot epoch */
tgt_boot_epoch_update(lut);
* export is being evicted */
ptlrpc_update_export_timer(req->rq_export, 0);
target_request_copy_put(req);
- }
+ }
- delta = (jiffies - delta) / HZ;
+ delta = jiffies_to_msecs(jiffies - delta) / MSEC_PER_SEC;
CDEBUG(D_INFO,"4: recovery completed in %lus - %d/%d reqs/locks\n",
delta, obd->obd_replayed_requests, obd->obd_replayed_locks);
if (delta > OBD_RECOVERY_TIME_SOFT) {
libcfs_debug_dumplog();
}
- target_finish_recovery(obd);
+ target_finish_recovery(lut);
lu_context_fini(&env->le_ctx);
trd->trd_processing_task = 0;
void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
{
struct obd_device *obd = lut->lut_obd;
+
if (obd->obd_max_recoverable_clients == 0) {
/** Update server last boot epoch */
tgt_boot_epoch_update(lut);
}
EXPORT_SYMBOL(target_recovery_init);
-
static int target_process_req_flags(struct obd_device *obd,
struct ptlrpc_request *req)
{
wake_up(&obd->obd_next_transno_waitq);
RETURN(0);
}
-EXPORT_SYMBOL(target_queue_recovery_request);
int target_handle_ping(struct ptlrpc_request *req)
{
obd_ping(req->rq_svc_thread->t_env, req->rq_export);
return req_capsule_server_pack(&req->rq_pill);
}
-EXPORT_SYMBOL(target_handle_ping);
void target_committed_to_req(struct ptlrpc_request *req)
{
CDEBUG(D_INFO, "last_committed "LPU64", transno "LPU64", xid "LPU64"\n",
exp->exp_last_committed, req->rq_transno, req->rq_xid);
}
-EXPORT_SYMBOL(target_committed_to_req);
#endif /* HAVE_SERVER_SUPPORT */
RETURN(0);
}
-EXPORT_SYMBOL(target_pack_pool_reply);
-int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id)
+static int target_send_reply_msg(struct ptlrpc_request *req,
+ int rc, int fail_id)
{
if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) {
DEBUG_REQ(D_ERROR, req, "dropping reply");
spin_unlock(&svcpt->scp_rep_lock);
EXIT;
}
-EXPORT_SYMBOL(target_send_reply);
ldlm_mode_t lck_compat_array[] = {
[LCK_EX] = LCK_COMPAT_EX,
switch (error) {
case ELDLM_OK:
+ case ELDLM_LOCK_MATCHED:
result = 0;
break;
case ELDLM_LOCK_CHANGED:
}
return error;
}
-EXPORT_SYMBOL(ldlm_errno2error);
#if LUSTRE_TRACKS_LOCK_EXP_REFS
void ldlm_dump_export_locks(struct obd_export *exp)
RETURN(1);
}
-static inline char *bulk2type(struct ptlrpc_bulk_desc *desc)
+static inline const char *bulk2type(struct ptlrpc_request *req)
{
- return desc->bd_type == BULK_GET_SINK ? "GET" : "PUT";
+ if (req->rq_bulk_read) /* tested first, so READ wins if both flags are set */
+ return "READ";
+ if (req->rq_bulk_write)
+ return "WRITE";
+ return "UNKNOWN"; /* neither bulk flag is set on the request */
}
int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
exp->exp_conn_cnt > lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
rc = -ENOTCONN;
} else {
- if (desc->bd_type == BULK_PUT_SINK)
+ if (req->rq_bulk_read)
rc = sptlrpc_svc_wrap_bulk(req, desc);
if (rc == 0)
rc = ptlrpc_start_bulk_transfer(desc);
if (rc < 0) {
DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc %d",
- bulk2type(desc), rc);
+ bulk2type(req), rc);
RETURN(rc);
}
long timeoutl = deadline - cfs_time_current_sec();
cfs_duration_t timeout = timeoutl <= 0 ?
CFS_TICK : cfs_time_seconds(timeoutl);
+ time_t rq_deadline;
*lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
target_bulk_timeout, desc);
lwi);
LASSERT(rc == 0 || rc == -ETIMEDOUT);
/* Wait again if we changed rq_deadline. */
+ rq_deadline = ACCESS_ONCE(req->rq_deadline);
deadline = start + bulk_timeout;
- if (deadline > req->rq_deadline)
- deadline = req->rq_deadline;
+ if (deadline > rq_deadline)
+ deadline = rq_deadline;
} while ((rc == -ETIMEDOUT) &&
(deadline > cfs_time_current_sec()));
if (rc == -ETIMEDOUT) {
DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds",
- bulk2type(desc), deadline - start,
+ bulk2type(req), deadline - start,
cfs_time_current_sec() - deadline);
ptlrpc_abort_bulk(desc);
} else if (exp->exp_failed) {
DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",
- bulk2type(desc));
+ bulk2type(req));
rc = -ENOTCONN;
ptlrpc_abort_bulk(desc);
} else if (exp->exp_conn_cnt >
lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s",
- bulk2type(desc));
+ bulk2type(req));
/* We don't reply anyway. */
rc = -ETIMEDOUT;
ptlrpc_abort_bulk(desc);
- } else if (desc->bd_failure ||
- desc->bd_nob_transferred != desc->bd_nob) {
- DEBUG_REQ(D_ERROR, req, "%s bulk %s %d(%d)",
- desc->bd_failure ? "network error on" : "truncated",
- bulk2type(desc), desc->bd_nob_transferred,
- desc->bd_nob);
- /* XXX Should this be a different errno? */
+ } else if (desc->bd_failure) {
+ DEBUG_REQ(D_ERROR, req, "network error on bulk %s",
+ bulk2type(req));
+ /* XXX should this be a different errno? */
rc = -ETIMEDOUT;
- } else if (desc->bd_type == BULK_GET_SINK) {
- rc = sptlrpc_svc_unwrap_bulk(req, desc);
+ } else {
+ if (req->rq_bulk_write)
+ rc = sptlrpc_svc_unwrap_bulk(req, desc);
+ if (rc == 0 && desc->bd_nob_transferred != desc->bd_nob) {
+ DEBUG_REQ(D_ERROR, req, "truncated bulk %s %d(%d)",
+ bulk2type(req), desc->bd_nob_transferred,
+ desc->bd_nob);
+ /* XXX should this be a different errno? */
+ rc = -ETIMEDOUT;
+ }
}
RETURN(rc);