* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2011, 2012, Whamcloud, Inc.
+ * Copyright (c) 2010, 2012, Intel Corporation.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
* Lustre is a trademark of Sun Microsystems, Inc.
*/
+/**
+ * This file deals with various client/target related logic including recovery.
+ *
+ * TODO: This code more logically belongs in the ptlrpc module than in ldlm and
+ * should be moved.
+ */
+
#define DEBUG_SUBSYSTEM S_LDLM
#ifdef __KERNEL__
#include <lustre_sec.h>
#include "ldlm_internal.h"
-/* @priority: if non-zero, move the selected to the list head
- * @create: if zero, only search in existed connections
+/* @priority: If non-zero, move the selected connection to the list head.
+ * @create: If zero, only search in existing connections.
*/
static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid,
int priority, int create)
GOTO(out_free, rc = 0);
}
}
- /* not found */
+ /* No existing import connection found for \a uuid. */
if (create) {
imp_conn->oic_conn = ptlrpc_conn;
imp_conn->oic_uuid = *uuid;
continue;
LASSERT(imp_conn->oic_conn);
- /* is current conn? */
if (imp_conn == imp->imp_conn_current) {
LASSERT(imp_conn->oic_conn == imp->imp_connection);
EXPORT_SYMBOL(client_import_del_conn);
/**
- * Find conn uuid by peer nid. @peer is a server nid. This function is used
- * to find a conn uuid of @imp which can reach @peer.
+ * Find conn UUID by peer NID. \a peer is a server NID. This function is used
+ * to find a conn UUID of \a imp which can reach \a peer.
*/
int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
struct obd_uuid *uuid)
spin_lock(&imp->imp_lock);
cfs_list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
- /* check if conn uuid does have this peer nid */
+ /* Check if conn UUID does have this peer NID. */
if (class_check_uuid(&conn->oic_uuid, peer)) {
*uuid = conn->oic_uuid;
rc = 0;
void client_destroy_import(struct obd_import *imp)
{
- /* drop security policy instance after all rpc finished/aborted
- * to let all busy contexts be released. */
+ /* Drop security policy instance after all RPCs have finished/aborted
+ * to let all busy contexts be released. */
class_import_get(imp);
class_destroy_import(imp);
sptlrpc_import_sec_put(imp);
EXPORT_SYMBOL(client_destroy_import);
/**
- * check whether the osc is on MDT or not
+ * Check whether or not the OSC is on MDT.
* In the config log,
* osc on MDT
* setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID
return 0;
}
-/* configure an RPC client OBD device
+/* Configure an RPC client OBD device.
*
* lcfg parameters:
* 1 - client UUID
cli->cl_dirty = 0;
cli->cl_avail_grant = 0;
- /* FIXME: should limit this for the sum of all cl_dirty_max */
+ /* FIXME: Should limit this for the sum of all cl_dirty_max. */
cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024;
if (cli->cl_dirty_max >> CFS_PAGE_SHIFT > cfs_num_physpages / 8)
cli->cl_dirty_max = cfs_num_physpages << (CFS_PAGE_SHIFT - 3);
if (cli->cl_conn_count)
GOTO(out_disconnect, rc = 0);
- /* Mark import deactivated now, so we don't try to reconnect if any
- * of the cleanup RPCs fails (e.g. ldlm cancel, etc). We don't
- * fully deactivate the import, or that would drop all requests. */
+ /* Mark import deactivated now, so we don't try to reconnect if any
+ * of the cleanup RPCs fails (e.g. LDLM cancel, etc). We don't
+ * fully deactivate the import, or that would drop all requests. */
spin_lock(&imp->imp_lock);
imp->imp_deactive = 1;
spin_unlock(&imp->imp_lock);
ldlm_namespace_free_prior(obd->obd_namespace, imp, obd->obd_force);
}
- /*
- * there's no need to hold sem during disconnecting an import,
- * and actually it may cause deadlock in gss.
- */
+ /* There's no need to hold sem while disconnecting an import,
+ * and it may actually cause deadlock in GSS. */
up_write(&cli->cl_sem);
rc = ptlrpc_disconnect_import(imp, 0);
down_write(&cli->cl_sem);
EXIT;
- out_disconnect:
- /* use server style - class_disconnect should be always called for
- * o_disconnect */
+out_disconnect:
+ /* Use server style - class_disconnect should be always called for
+ * o_disconnect. */
err = class_disconnect(exp);
if (!rc && err)
rc = err;
int rc;
ENTRY;
- /* Disconnect early so that clients can't keep using export */
- rc = class_disconnect(exp);
- /* close import for avoid sending any requests */
- if (exp->exp_imp_reverse)
- ptlrpc_cleanup_imp(exp->exp_imp_reverse);
+ /* Disconnect early so that clients can't keep using export. */
+ rc = class_disconnect(exp);
+ /* Close import to avoid sending any requests. */
+ if (exp->exp_imp_reverse)
+ ptlrpc_cleanup_imp(exp->exp_imp_reverse);
- if (exp->exp_obd->obd_namespace != NULL)
- ldlm_cancel_locks_for_export(exp);
+ if (exp->exp_obd->obd_namespace != NULL)
+ ldlm_cancel_locks_for_export(exp);
/* complete all outstanding replies */
spin_lock(&exp->exp_lock);
GOTO(out, rc = -EAGAIN);
}
- /* Make sure the target isn't cleaned up while we're here. Yes,
- there's still a race between the above check and our incref here.
- Really, class_uuid2obd should take the ref. */
+ /* Make sure the target isn't cleaned up while we're here. Yes,
+ * there's still a race between the above check and our incref here.
+ * Really, class_uuid2obd should take the ref. */
targref = class_incref(target, __FUNCTION__, cfs_current());
target->obd_conn_inprogress++;
obd_str2uuid(&cluuid, str);
- /* XXX extract a nettype and format accordingly */
- switch (sizeof(lnet_nid_t)) {
- /* NB the casts only avoid compiler warnings */
+ /* XXX Extract a nettype and format accordingly. */
+ switch (sizeof(lnet_nid_t)) {
+ /* NB the casts only avoid compiler warnings. */
case 8:
snprintf(remote_uuid.uuid, sizeof remote_uuid,
"NET_"LPX64"_UUID", (__u64)req->rq_peer.nid);
if (!export)
goto no_export;
- /* we've found an export in the hash */
+ /* We've found an export in the hash. */
spin_lock(&export->exp_lock);
} else if (mds_conn && export->exp_connection) {
spin_unlock(&export->exp_lock);
if (req->rq_peer.nid != export->exp_connection->c_peer.nid)
- /* mds reconnected after failover */
- LCONSOLE_WARN("%s: Received MDS connection from "
- "%s, removing former export from %s\n",
- target->obd_name, libcfs_nid2str(req->rq_peer.nid),
- libcfs_nid2str(export->exp_connection->c_peer.nid));
- else
- /* new mds connection from the same nid */
+ /* MDS reconnected after failover. */
+ LCONSOLE_WARN("%s: Received MDS connection from "
+ "%s, removing former export from %s\n",
+ target->obd_name, libcfs_nid2str(req->rq_peer.nid),
+ libcfs_nid2str(export->exp_connection->c_peer.nid));
+ else
+ /* New MDS connection from the same NID. */
LCONSOLE_WARN("%s: Received new MDS connection from "
"%s, removing former export from same NID\n",
target->obd_name, libcfs_nid2str(req->rq_peer.nid));
(lustre_msg_get_op_flags(req->rq_reqmsg) &
MSG_CONNECT_INITIAL)) {
spin_unlock(&export->exp_lock);
- /* in mds failover we have static uuid but nid can be
- * changed*/
+ /* In MDS failover we have static UUID but NID can change. */
LCONSOLE_WARN("%s: Client %s seen on new nid %s when "
"existing nid %s is already connected\n",
target->obd_name, cluuid.uuid,
GOTO(out, rc = -EBUSY);
} else if (req->rq_export != NULL &&
(cfs_atomic_read(&export->exp_rpc_count) > 1)) {
- /* the current connect rpc has increased exp_rpc_count */
+ /* The current connect RPC has increased exp_rpc_count. */
LCONSOLE_WARN("%s: Client %s (at %s) refused reconnection, "
"still busy with %d active RPCs\n",
target->obd_name, cluuid.uuid,
export ? (long)export->exp_last_request_time : 0);
/* If this is the first time a client connects, reset the recovery
- * timer. Discard lightweight connections which might be local */
+ * timer. Discard lightweight connections which might be local. */
if (!lw_client && rc == 0 && target->obd_recovering)
- check_and_start_recovery_timer(target, req, export == NULL);
+ check_and_start_recovery_timer(target, req, export == NULL);
- /* We want to handle EALREADY but *not* -EALREADY from
- * target_handle_reconnect(), return reconnection state in a flag */
+ /* We want to handle EALREADY but *not* -EALREADY from
+ * target_handle_reconnect(), return reconnection state in a flag. */
if (rc == EALREADY) {
lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
rc = 0;
LASSERT(rc == 0);
}
- /* Tell the client if we support replayable requests */
+ /* Tell the client if we support replayable requests. */
if (target->obd_replayable)
lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE);
client_nid = &req->rq_peer.nid;
/* allow lightweight connections during recovery */
if (target->obd_recovering && !lw_client) {
cfs_time_t t;
- int c; /* connected */
- int i; /* in progress */
- int k; /* known */
+ int c; /* connected */
+ int i; /* in progress */
+ int k; /* known */
+ int s; /* stale/evicted */
c = cfs_atomic_read(&target->obd_connected_clients);
i = cfs_atomic_read(&target->obd_lock_replay_clients);
k = target->obd_max_recoverable_clients;
+ s = target->obd_stale_clients;
t = cfs_timer_deadline(&target->obd_recovery_timer);
t = cfs_time_sub(t, cfs_time_current());
t = cfs_duration_sec(t);
LCONSOLE_WARN("%s: Denying connection for new client "
"%s (at %s), waiting for all %d known "
"clients (%d recovered, %d in progress, "
- "and %d unseen) to recover in %d:%.02d\n",
+ "and %d evicted) to recover in %d:%.02d\n",
target->obd_name, cluuid.uuid,
libcfs_nid2str(req->rq_peer.nid), k,
- c - i, i, k - c, (int)t / 60,
+ c - i, i, s, (int)t / 60,
(int)t % 60);
rc = -EBUSY;
} else {
if (req->rq_export != NULL)
class_export_put(req->rq_export);
- /* request takes one export refcount */
+ /* Request takes one export reference. */
req->rq_export = class_export_get(export);
spin_lock(&export->exp_lock);
export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
export->exp_abort_active_req = 0;
- /* request from liblustre? Don't evict it for not pinging. */
+ /* Don't evict liblustre clients for not pinging. */
if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
export->exp_libclient = 1;
spin_unlock(&export->exp_lock);
}
if (export->exp_connection != NULL) {
- /* Check to see if connection came from another NID */
+ /* Check to see if connection came from another NID. */
if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) &&
!cfs_hlist_unhashed(&export->exp_nid_hash))
cfs_hash_del(export->exp_obd->obd_nid_hash,
if (has_transno && transno > 0 &&
transno < target->obd_next_recovery_transno &&
transno > target->obd_last_committed) {
- /* another way is to use cmpxchg() so it will be
- * lock free */
+ /* Another way is to use cmpxchg() to be lock-free. */
spin_lock(&target->obd_recovery_task_lock);
if (transno < target->obd_next_recovery_transno)
target->obd_next_recovery_transno = transno;
tmp = req_capsule_client_get(&req->rq_pill, &RMF_CONN);
conn = *tmp;
- /* for the rest part, we return -ENOTCONN in case of errors
- * in order to let client initialize connection again.
- */
+ /* Return -ENOTCONN in case of errors to let client reconnect. */
revimp = class_new_import(target);
if (revimp == NULL) {
CERROR("fail to alloc new reverse import.\n");
revimp->imp_dlm_fake = 1;
revimp->imp_state = LUSTRE_IMP_FULL;
- /* unknown versions will be caught in
- * ptlrpc_handle_server_req_in->lustre_unpack_msg() */
+ /* Unknown versions will be caught in
+ * ptlrpc_handle_server_req_in->lustre_unpack_msg(). */
revimp->imp_msg_magic = req->rq_reqmsg->lm_magic;
if ((data->ocd_connect_flags & OBD_CONNECT_AT) &&
if (rc)
RETURN(rc);
- /* keep the rq_export around so we can send the reply */
+ /* Keep the rq_export around so we can send the reply. */
req->rq_status = obd_disconnect(class_export_get(req->rq_export));
RETURN(0);
LASSERT(cfs_list_empty(&req->rq_list));
CFS_INIT_LIST_HEAD(&req->rq_replay_list);
- /* increase refcount to keep request in queue */
- cfs_atomic_inc(&req->rq_refcount);
- /** let export know it has replays to be handled */
+ /* Increase refcount to keep request in queue. */
+ cfs_atomic_inc(&req->rq_refcount);
+ /* Let export know it has replays to be handled. */
cfs_atomic_inc(&req->rq_export->exp_replay_count);
}
}
if (dup) {
- /* we expect it with RESENT and REPLAY flags */
+ /* We expect it with RESENT and REPLAY flags. */
if ((lustre_msg_get_flags(req->rq_reqmsg) &
(MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY))
CERROR("invalid flags %x of resent replay\n",
{
ENTRY;
- /* only log a recovery message when recovery has occurred */
+ /* Only log a recovery message when recovery has occurred. */
if (obd->obd_recovery_start) {
time_t elapsed_time = max_t(time_t, 1, cfs_time_current_sec() -
obd->obd_recovery_start);
obd->obd_recovery_end = cfs_time_current_sec();
- /* when recovery finished, cleanup orphans on mds and ost */
+ /* When recovery finished, cleanup orphans on MDS and OST. */
if (OBT(obd) && OBP(obd, postrecov)) {
int rc = OBP(obd, postrecov)(obd);
if (rc < 0)
if (to > obd->obd_recovery_time_hard)
to = obd->obd_recovery_time_hard;
- if (obd->obd_recovery_timeout < to) {
+ if (obd->obd_recovery_timeout < to ||
+ obd->obd_recovery_timeout == obd->obd_recovery_time_hard) {
obd->obd_recovery_timeout = to;
cfs_timer_arm(&obd->obd_recovery_timer,
cfs_time_shift(drt));
target_start_recovery_timer(obd);
- /* convert the service time to rpc timeout,
- * reuse service_time to limit stack usage */
- service_time = at_est2timeout(service_time);
+ /* Convert the service time to RPC timeout,
+ * and reuse service_time to limit stack usage. */
+ service_time = at_est2timeout(service_time);
- /* We expect other clients to timeout within service_time, then try
- * to reconnect, then try the failover server. The max delay between
- * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL */
+ /* We expect other clients to timeout within service_time, then try
+ * to reconnect, then try the failover server. The max delay between
+ * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. */
service_time += 2 * INITIAL_CONNECT_TIMEOUT;
LASSERT(obt->obt_magic == OBT_MAGIC);
if (obd->obd_abort_recovery || obd->obd_recovery_expired)
return 1;
LASSERT(clnts <= obd->obd_max_recoverable_clients);
- if (obd->obd_no_conn == 0 &&
- clnts + obd->obd_stale_clients == obd->obd_max_recoverable_clients)
- return 1;
- return 0;
+ return (clnts + obd->obd_stale_clients ==
+ obd->obd_max_recoverable_clients);
}
static int check_for_next_transno(struct obd_device *obd)
} else if (obd->obd_recovery_expired) {
CDEBUG(D_HA, "waking for expired recovery\n");
wake_up = 1;
- } else if (cfs_atomic_read(&obd->obd_req_replay_clients) == 0) {
- CDEBUG(D_HA, "waking for completed recovery\n");
- wake_up = 1;
} else if (req_transno == next_transno) {
CDEBUG(D_HA, "waking for next ("LPD64")\n", next_transno);
wake_up = 1;
- } else if (queue_len == cfs_atomic_read(&obd->obd_req_replay_clients)) {
+ } else if (queue_len > 0 &&
+ queue_len == cfs_atomic_read(&obd->obd_req_replay_clients)) {
int d_lvl = D_HA;
/** handle gaps occured due to lost reply or VBR */
LASSERTF(req_transno >= next_transno,
req_transno, obd->obd_last_committed);
obd->obd_next_recovery_transno = req_transno;
wake_up = 1;
+ } else if (cfs_atomic_read(&obd->obd_req_replay_clients) == 0) {
+ CDEBUG(D_HA, "waking for completed recovery\n");
+ wake_up = 1;
} else if (OBD_FAIL_CHECK(OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS)) {
CDEBUG(D_HA, "accepting transno gaps is explicitly allowed"
" by fail_lock, waking up ("LPD64")\n", next_transno);
struct obd_device *obd;
ENTRY;
- /*
- * Check that we still have all structures alive as this may
- * be some late rpc in shutdown time.
- */
+ /* Check that we still have all structures alive as this may
+ * be some late RPC at shutdown time. */
if (unlikely(!req->rq_export || !req->rq_export->exp_obd ||
!exp_connect_lru_resize(req->rq_export))) {
lustre_msg_set_slv(req->rq_repmsg, 0);
RETURN(0);
}
- /*
- * OBD is alive here as export is alive, which we checked above.
- */
+ /* OBD is alive here as export is alive, which we checked above. */
obd = req->rq_export->exp_obd;
read_lock(&obd->obd_pool_lock);
int rc = 0;
ENTRY;
- /* Check if there is eviction in progress, and if so, wait for
- * it to finish */
+ /* If there is eviction in progress, wait for it to finish. */
if (unlikely(cfs_atomic_read(&exp->exp_obd->obd_evict_inprogress))) {
*lwi = LWI_INTR(NULL, NULL);
rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq,
lwi);
}
- /* Check if client was evicted or tried to reconnect already */
+ /* Check if client was evicted or tried to reconnect already. */
if (exp->exp_failed || exp->exp_abort_active_req) {
rc = -ENOTCONN;
} else {
exp->exp_abort_active_req,
lwi);
LASSERT(rc == 0 || rc == -ETIMEDOUT);
- /* Wait again if we changed deadline */
+ /* Wait again if we changed deadline. */
} while ((rc == -ETIMEDOUT) &&
(req->rq_deadline > cfs_time_current_sec()));
} else if (exp->exp_abort_active_req) {
DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s",
bulk2type(desc));
- /* we don't reply anyway */
+ /* We don't reply anyway. */
rc = -ETIMEDOUT;
ptlrpc_abort_bulk(desc);
} else if (!desc->bd_success ||
bulk2type(desc),
desc->bd_nob_transferred,
desc->bd_nob);
- /* XXX should this be a different errno? */
+ /* XXX Should this be a different errno? */
rc = -ETIMEDOUT;
} else if (desc->bd_type == BULK_GET_SINK) {
rc = sptlrpc_svc_unwrap_bulk(req, desc);