*exp = NULL;
down_write(&cli->cl_sem);
+ if (cli->cl_conn_count > 0 )
+ GOTO(out_sem, rc = -EALREADY);
+
rc = class_connect(&conn, obd, cluuid);
if (rc)
GOTO(out_sem, rc);
-
- *exp = class_conn2export(&conn);
-
+
cli->cl_conn_count++;
- if (cli->cl_conn_count > 1)
- GOTO(out_sem, rc);
+ *exp = class_conn2export(&conn);
if (obd->obd_namespace != NULL)
CERROR("already have namespace!\n");
if (!cli->cl_conn_count) {
CERROR("disconnecting disconnected device (%s)\n",
obd->obd_name);
- GOTO(out_sem, rc = -EINVAL);
+ GOTO(out_disconnect, rc = -EINVAL);
}
cli->cl_conn_count--;
if (cli->cl_conn_count)
- GOTO(out_no_disconnect, rc = 0);
+ GOTO(out_disconnect, rc = 0);
/* Mark import deactivated now, so we don't try to reconnect if any
* of the cleanup RPCs fails (e.g. ldlm cancel, etc). We don't
cli->cl_import = NULL;
EXIT;
- out_no_disconnect:
+
+ out_disconnect:
+ /* Follow the server-side convention: class_disconnect() must always be
+ * called for o_disconnect, including on the error paths above. */
err = class_disconnect(exp);
if (!rc && err)
rc = err;
- out_sem:
+
up_write(&cli->cl_sem);
if (to_be_freed)
ldlm_namespace_free_post(to_be_freed);
GOTO(out, rc = -EBUSY);
} else if (req->rq_export != NULL &&
(atomic_read(&export->exp_rpc_count) > 1)) {
+ /* the current connect rpc has increased exp_rpc_count */
CWARN("%s: refuse reconnection from %s@%s to 0x%p/%d\n",
target->obd_name, cluuid.uuid,
libcfs_nid2str(req->rq_peer.nid),
- export, atomic_read(&export->exp_rpc_count));
+ export, atomic_read(&export->exp_rpc_count) - 1);
+ spin_lock(&export->exp_lock);
+ if (req->rq_export->exp_conn_cnt <
+ lustre_msg_get_conn_cnt(req->rq_reqmsg))
+ /* try to abort active requests */
+ req->rq_export->exp_abort_active_req = 1;
+ spin_unlock(&export->exp_lock);
GOTO(out, rc = -EBUSY);
} else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) {
CERROR("%s: NID %s (%s) reconnected with 1 conn_cnt; "
GOTO(out, rc = -EALREADY);
}
export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+ export->exp_abort_active_req = 0;
/* request from liblustre? Don't evict it for not pinging. */
if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
/* when recovery finished, cleanup orphans on mds and ost */
if (OBT(obd) && OBP(obd, postrecov)) {
int rc = OBP(obd, postrecov)(obd);
- LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name,
- rc < 0 ? "failed" : "complete", rc);
+ if (rc < 0)
+ LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
+ obd->obd_name, rc);
}
obd->obd_recovery_end = cfs_time_current_sec();
spin_lock_bh(&obd->obd_processing_task_lock);
list_splice_init(&obd->obd_lock_replay_queue, &abort_list);
spin_unlock_bh(&obd->obd_processing_task_lock);
- list_for_each_entry_safe(req, n, &obd->obd_lock_replay_queue, rq_list){
+ list_for_each_entry_safe(req, n, &abort_list, rq_list){
DEBUG_REQ(D_ERROR, req, "aborted:");
req->rq_status = -ENOTCONN;
if (ptlrpc_error(req)) {
spin_unlock_bh(&obd->obd_processing_task_lock);
return;
}
- CWARN("%s: starting recovery timer\n", obd->obd_name);
+ CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
obd->obd_recovery_start = cfs_time_current_sec();
/* minimum */
obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
} else if (!list_empty(&obd->obd_req_replay_queue)) {
req = list_entry(obd->obd_req_replay_queue.next,
struct ptlrpc_request, rq_list);
- target_exp_dequeue_req_replay(req);
list_del_init(&req->rq_list);
obd->obd_requests_queued_for_recovery--;
} else {
if (!req_replay_done(req->rq_export) ||
!lock_replay_done(req->rq_export))
reset_recovery_timer(class_exp2obd(req->rq_export),
- AT_OFF ? obd_timeout :
+ AT_OFF ? obd_timeout :
at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate), 1);
+
+ /**
+ * bz18031: increment next_recovery_transno before ptlrpc_free_clone()
+ * drops the exp_rpc reference.
+ */
+ if (!req_replay_done(req->rq_export)) {
+ spin_lock_bh(&req->rq_export->exp_obd->obd_processing_task_lock);
+ req->rq_export->exp_obd->obd_next_recovery_transno++;
+ spin_unlock_bh(&req->rq_export->exp_obd->obd_processing_task_lock);
+ target_exp_dequeue_req_replay(req);
+ }
ptlrpc_free_clone(req);
RETURN(0);
}
-static void resume_recovery_timer(struct obd_device *obd)
-{
- /* to be safe, make it at least OBD_RECOVERY_FACTOR * obd_timeout */
- reset_recovery_timer(obd, OBD_RECOVERY_FACTOR * obd_timeout, 1);
-}
-
static int target_recovery_thread(void *arg)
{
struct lu_target *lut = arg;
l_wait_event(obd->obd_next_transno_waitq,
check_for_clients(obd), &lwi);
- spin_lock_bh(&obd->obd_processing_task_lock);
- target_cancel_recovery_timer(obd);
- spin_unlock_bh(&obd->obd_processing_task_lock);
-
/* If some clients haven't connected in time, evict them */
if (obd->obd_connected_clients < obd->obd_max_recoverable_clients) {
CWARN("Some clients haven't connect in time (%d/%d),"
CDEBUG(D_INFO, "1: request replay stage - %d clients from t"LPU64"\n",
atomic_read(&obd->obd_req_replay_clients),
obd->obd_next_recovery_transno);
- resume_recovery_timer(obd);
while ((req = target_next_replay_req(obd))) {
LASSERT(trd->trd_processing_task == cfs_curproc_pid());
DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
handle_recovery_req(thread, req,
trd->trd_recovery_handler);
obd->obd_replayed_requests++;
- spin_lock_bh(&obd->obd_processing_task_lock);
- obd->obd_next_recovery_transno++;
- spin_unlock_bh(&obd->obd_processing_task_lock);
}
- spin_lock_bh(&obd->obd_processing_task_lock);
- target_cancel_recovery_timer(obd);
- spin_unlock_bh(&obd->obd_processing_task_lock);
-
/* If some clients haven't replayed requests in time, evict them */
if (obd->obd_abort_recovery) {
CDEBUG(D_WARNING, "req replay is aborted\n");
/* The second stage: replay locks */
CDEBUG(D_INFO, "2: lock replay stage - %d clients\n",
atomic_read(&obd->obd_lock_replay_clients));
- resume_recovery_timer(obd);
while ((req = target_next_replay_lock(obd))) {
LASSERT(trd->trd_processing_task == cfs_curproc_pid());
DEBUG_REQ(D_HA, req, "processing lock from %s: ",
obd->obd_replayed_locks++;
}
- spin_lock_bh(&obd->obd_processing_task_lock);
- target_cancel_recovery_timer(obd);
- spin_unlock_bh(&obd->obd_processing_task_lock);
/* If some clients haven't replayed requests in time, evict them */
if (obd->obd_abort_recovery) {
- int stale;
CERROR("lock replay is aborted\n");
- stale = class_disconnect_stale_exports(obd, lock_replay_done,
- exp_flags_from_obd(obd) |
- OBD_OPT_ABORT_RECOV);
+ class_disconnect_stale_exports(obd, lock_replay_done,
+ exp_flags_from_obd(obd) |
+ OBD_OPT_ABORT_RECOV);
abort_lock_replay_queue(obd);
}
LASSERT(list_empty(&obd->obd_lock_replay_queue));
* to regular mds_handle() since now */
spin_lock_bh(&obd->obd_processing_task_lock);
obd->obd_recovering = obd->obd_abort_recovery = 0;
+ target_cancel_recovery_timer(obd);
spin_unlock_bh(&obd->obd_processing_task_lock);
while ((req = target_next_final_ping(obd))) {
LASSERT(trd->trd_processing_task == cfs_curproc_pid());
static void target_recovery_expired(unsigned long castmeharder)
{
struct obd_device *obd = (struct obd_device *)castmeharder;
- LCONSOLE_WARN("%s: recovery timed out; %d clients never reconnected "
- "after %lds (%d clients did)\n",
- obd->obd_name, obd->obd_recoverable_clients,
- cfs_time_current_sec()- obd->obd_recovery_start,
- obd->obd_connected_clients);
+ CDEBUG(D_HA, "%s: recovery timed out; %d clients never reconnected "
+ "after %lds (%d clients did)\n",
+ obd->obd_name, obd->obd_recoverable_clients,
+ cfs_time_current_sec()- obd->obd_recovery_start,
+ obd->obd_connected_clients);
spin_lock_bh(&obd->obd_processing_task_lock);
obd->obd_version_recov = 1;
void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
{
struct obd_device *obd = lut->lut_obd;
- if (obd->obd_max_recoverable_clients == 0)
+ if (obd->obd_max_recoverable_clients == 0) {
+ /** Update server last boot epoch */
+ lut_boot_epoch_update(lut);
return;
+ }
CWARN("RECOVERY: service %s, %d recoverable clients, "
"last_transno "LPU64"\n", obd->obd_name,
rs->rs_xid = req->rq_xid;
rs->rs_transno = req->rq_transno;
rs->rs_export = exp;
+ rs->rs_opc = lustre_msg_get_opc(rs->rs_msg);
spin_lock(&exp->exp_uncommitted_replies_lock);
CDEBUG(D_NET, "rs transno = "LPU64", last committed = "LPU64"\n",
LASSERT(req->rq_export);
- OBD_ALLOC(qdata, sizeof(struct qunit_data));
- if (!qdata)
- RETURN(-ENOMEM);
- rc = quota_get_qdata(req, qdata, QUOTA_REQUEST, QUOTA_EXPORT);
- if (rc < 0) {
+ qdata = quota_get_qdata(req, QUOTA_REQUEST, QUOTA_EXPORT);
+ if (IS_ERR(qdata)) {
+ rc = PTR_ERR(qdata);
CDEBUG(D_ERROR, "Can't unpack qunit_data(rc: %d)\n", rc);
+ req->rq_status = rc;
GOTO(out, rc);
}
if (!obd->obd_observer || !obd->obd_observer->obd_observer) {
CERROR("Can't find the observer, it is recovering\n");
req->rq_status = -EAGAIN;
- GOTO(send_reply, rc = -EAGAIN);
+ GOTO(out, rc);
}
master_obd = obd->obd_observer->obd_observer;
CDEBUG(D_QUOTA, "quota_type not processed yet, return "
"-EAGAIN\n");
req->rq_status = -EAGAIN;
- rc = ptlrpc_reply(req);
GOTO(out, rc);
}
CDEBUG(D_QUOTA, "quota_ctxt is not ready yet, return "
"-EAGAIN\n");
req->rq_status = -EAGAIN;
- rc = ptlrpc_reply(req);
GOTO(out, rc);
}
up_read(&obt->obt_rwsem);
if (rc && rc != -EDQUOT)
CDEBUG(rc == -EBUSY ? D_QUOTA : D_ERROR,
- "dqacq failed! (rc:%d)\n", rc);
+ "dqacq/dqrel failed! (rc:%d)\n", rc);
req->rq_status = rc;
- /* there are three forms of qunit(historic causes), so we need to
- * adjust the same form to different forms slaves needed */
rc = quota_copy_qdata(req, qdata, QUOTA_REPLY, QUOTA_EXPORT);
if (rc < 0) {
- CDEBUG(D_ERROR, "Can't pack qunit_data(rc: %d)\n", rc);
+ CERROR("Can't pack qunit_data(rc: %d)\n", rc);
GOTO(out, rc);
}
/* Block the quota req. b=14840 */
OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_BLOCK_QUOTA_REQ, obd_timeout);
-send_reply:
- rc = ptlrpc_reply(req);
+ EXIT;
+
out:
- OBD_FREE(qdata, sizeof(struct qunit_data));
- RETURN(rc);
+ rc = ptlrpc_reply(req);
+ return rc;
#else
return 0;
#endif /* !__KERNEL__ */