diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c
index 44dfec0..8962690 100644
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -41,6 +41,7 @@
 #include
 #include
 #include "ptlrpc_internal.h"
+#include <linux/delay.h>
 
 /* The following are visible and mutable through /sys/module/ptlrpc */
 int test_req_buffer_pressure = 0;
@@ -449,11 +450,10 @@ static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
 		rqbd = list_entry(svcpt->scp_rqbd_idle.next,
 				  struct ptlrpc_request_buffer_desc,
 				  rqbd_list);
-		list_del(&rqbd->rqbd_list);
 
 		/* assume we will post successfully */
 		svcpt->scp_nrqbds_posted++;
-		list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
+		list_move(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
 
 		spin_unlock(&svcpt->scp_lock);
 
@@ -467,8 +467,7 @@ static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
 	spin_lock(&svcpt->scp_lock);
 
 	svcpt->scp_nrqbds_posted--;
-	list_del(&rqbd->rqbd_list);
-	list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+	list_move_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
 
 	/*
 	 * Don't complain if no request buffers are posted right now; LNET
@@ -571,12 +570,14 @@ static void ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
 		 * have too many threads no matter how many cores/HTs
 		 * there are.
 		 */
+		preempt_disable();
 		if (cpumask_weight
 		    (topology_sibling_cpumask(smp_processor_id())) > 1) {
 			/* weight is # of HTs */
 			/* depress thread factor for hyper-thread */
 			factor = factor - (factor >> 1) + (factor >> 3);
 		}
+		preempt_enable();
 
 		weight = cfs_cpt_weight(svc->srv_cptable, 0);
 
@@ -942,9 +943,7 @@ void ptlrpc_server_drop_request(struct ptlrpc_request *req)
 		refcount = --(rqbd->rqbd_refcount);
 		if (refcount == 0) {
 			/* request buffer is now idle: add to history */
-			list_del(&rqbd->rqbd_list);
-
-			list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
+			list_move_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
 			svcpt->scp_hist_nrqbds++;
 
 			/*
@@ -1026,6 +1025,30 @@ void ptlrpc_server_drop_request(struct ptlrpc_request *req)
 	}
 }
 
+static void ptlrpc_add_exp_list_nolock(struct ptlrpc_request *req,
+				       struct obd_export *export, bool hp)
+{
+	__u16 tag = lustre_msg_get_tag(req->rq_reqmsg);
+
+	if (hp)
+		list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
+	else
+		list_add(&req->rq_exp_list, &export->exp_reg_rpcs);
+	if (tag && export->exp_used_slots)
+		set_bit(tag - 1, export->exp_used_slots);
+}
+
+static void ptlrpc_del_exp_list(struct ptlrpc_request *req)
+{
+	__u16 tag = lustre_msg_get_tag(req->rq_reqmsg);
+
+	spin_lock(&req->rq_export->exp_rpc_lock);
+	list_del_init(&req->rq_exp_list);
+	if (tag && !req->rq_obsolete && req->rq_export->exp_used_slots)
+		clear_bit(tag - 1, req->rq_export->exp_used_slots);
+	spin_unlock(&req->rq_export->exp_rpc_lock);
+}
+
 /** Change request export and move hp request from old export to new */
 void ptlrpc_request_change_export(struct ptlrpc_request *req,
 				  struct obd_export *export)
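
The two helpers added above replace open-coded list manipulation at the call
sites that follow: a request is linked onto the export's high-priority or
regular RPC list and, when the client supplied a multi-slot tag, the matching
bit in exp_used_slots is set; on removal the bit is cleared unless the request
was marked obsolete, so a newer resend keeps ownership of the slot. A minimal
userspace sketch of the slot-bitmap idea (hypothetical names, a uint64_t
standing in for the exp_used_slots bitmap):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t used_slots;	/* stand-in for export->exp_used_slots */

	/* claim the slot for a (1-based) tag; tag 0 means "no tag" */
	static void slot_claim(uint16_t tag)
	{
		if (tag)
			used_slots |= 1ULL << (tag - 1);
	}

	/* release the slot unless the request was superseded by a resend */
	static void slot_release(uint16_t tag, bool obsolete)
	{
		if (tag && !obsolete)
			used_slots &= ~(1ULL << (tag - 1));
	}

	int main(void)
	{
		slot_claim(3);
		slot_release(3, true);	/* resend in flight: slot stays busy */
		printf("slot 3 busy: %d\n", !!(used_slots & (1ULL << 2)));
		slot_release(3, false);
		printf("slot 3 busy: %d\n", !!(used_slots & (1ULL << 2)));
		return 0;
	}
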
@@ -1033,19 +1056,13 @@ void ptlrpc_request_change_export(struct ptlrpc_request *req,
 	if (req->rq_export != NULL) {
 		LASSERT(!list_empty(&req->rq_exp_list));
 		/* remove rq_exp_list from last export */
-		spin_lock(&req->rq_export->exp_rpc_lock);
-		list_del_init(&req->rq_exp_list);
-		spin_unlock(&req->rq_export->exp_rpc_lock);
-		/*
-		 * export has one reference already, so it's safe to
+		ptlrpc_del_exp_list(req);
+		/* export has one reference already, so it's safe to
 		 * add req to export queue here and get another
 		 * reference for request later
 		 */
 		spin_lock(&export->exp_rpc_lock);
-		if (req->rq_ops != NULL) /* hp request */
-			list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
-		else
-			list_add(&req->rq_exp_list, &export->exp_reg_rpcs);
+		ptlrpc_add_exp_list_nolock(req, export, req->rq_ops != NULL);
 		spin_unlock(&export->exp_rpc_lock);
 
 		class_export_rpc_dec(req->rq_export);
@@ -1055,8 +1072,6 @@ void ptlrpc_request_change_export(struct ptlrpc_request *req,
 	/* request takes one export refcount */
 	req->rq_export = class_export_get(export);
 	class_export_rpc_inc(export);
-
-	return;
 }
 
 /**
@@ -1361,6 +1376,7 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
 		RETURN(0);
 
 	if (olddl < 0) {
+		/* below message is checked in replay-ost-single.sh test_9 */
 		DEBUG_REQ(D_WARNING, req,
 			  "Already past deadline (%+llds), not sending early reply. Consider increasing at_early_margin (%d)?",
 			  (s64)olddl, at_early_margin);
@@ -1419,7 +1435,8 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
 	 * we may be past adaptive_max
 	 */
 	if (req->rq_deadline >= newdl) {
-		DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%lld/%lld), not sending early reply\n",
+		DEBUG_REQ(D_WARNING, req,
+			  "Could not add any time (%lld/%lld), not sending early reply",
 			  (s64)olddl, (s64)(newdl - ktime_get_real_seconds()));
 		RETURN(-ETIMEDOUT);
 	}
@@ -1451,10 +1468,10 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
 		GOTO(out, rc = -ETIMEDOUT);
 
 	LASSERT(atomic_read(&req->rq_refcount));
-	/** if it is last refcount then early reply isn't needed */
+	/* if it is last refcount then early reply isn't needed */
 	if (atomic_read(&req->rq_refcount) == 1) {
 		DEBUG_REQ(D_ADAPTTO, reqcopy,
-			  "Normal reply already sent out, abort sending early reply\n");
+			  "Normal reply already sent, abort early reply");
 		GOTO(out, rc = -EINVAL);
 	}
 
@@ -1481,7 +1498,7 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
 		req->rq_deadline = newdl;
 		req->rq_early_count++; /* number sent, server side */
 	} else {
-		DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+		DEBUG_REQ(D_ERROR, req, "Early reply send failed: rc = %d", rc);
 	}
 
 	/*
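
The next hunk adds obsolete-request marking for servers: when a client whose
XIDs are guaranteed to increase resends a request, any older in-flight request
occupying the same slot tag is superseded and must not release the slot when it
completes. A compilable sketch of the scan (hypothetical names, a plain array
standing in for the export's RPC lists):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct rpc {
		uint16_t tag;		/* multi-slot tag, 0 = untagged */
		uint64_t xid;		/* transfer id, increasing per client */
		bool	 obsolete;	/* superseded by a newer resend */
	};

	/* mark every older request carrying the same tag as obsolete */
	static void mark_in_progress_obsolete(struct rpc *rpcs, int n,
					      uint16_t tag, uint64_t new_xid)
	{
		int i;

		if (tag == 0)
			return;
		for (i = 0; i < n; i++)
			if (rpcs[i].tag == tag && new_xid > rpcs[i].xid)
				rpcs[i].obsolete = true;
	}

	int main(void)
	{
		struct rpc rpcs[] = { { 5, 100, false }, { 5, 90, false },
				      { 6, 95, false } };

		mark_in_progress_obsolete(rpcs, 3, 5, 101);
		for (int i = 0; i < 3; i++)
			printf("tag %u xid %llu obsolete=%d\n", rpcs[i].tag,
			       (unsigned long long)rpcs[i].xid,
			       rpcs[i].obsolete);
		return 0;
	}
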
@@ -1664,6 +1681,47 @@ found:
 	return tmp;
 }
 
+#ifdef HAVE_SERVER_SUPPORT
+static void ptlrpc_server_mark_obsolete(struct ptlrpc_request *req)
+{
+	req->rq_obsolete = 1;
+}
+
+static void
+ptlrpc_server_mark_in_progress_obsolete(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request *tmp = NULL;
+	__u16 tag;
+
+	if (!tgt_is_increasing_xid_client(req->rq_export) ||
+	    req->rq_export->exp_used_slots == NULL)
+		return;
+
+	tag = lustre_msg_get_tag(req->rq_reqmsg);
+	if (tag == 0)
+		return;
+
+	if (!test_bit(tag - 1, req->rq_export->exp_used_slots))
+		return;
+
+	/* This list should not be longer than max_requests in
+	 * flight on the client, so it is not all that long.
+	 * Also we only hit this codepath for a resent request,
+	 * which makes it even more rarely hit */
+	list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs, rq_exp_list) {
+		if (tag == lustre_msg_get_tag(tmp->rq_reqmsg) &&
+		    req->rq_xid > tmp->rq_xid)
+			ptlrpc_server_mark_obsolete(tmp);
+
+	}
+	list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs, rq_exp_list) {
+		if (tag == lustre_msg_get_tag(tmp->rq_reqmsg) &&
+		    req->rq_xid > tmp->rq_xid)
+			ptlrpc_server_mark_obsolete(tmp);
+	}
+}
+#endif
+
 /**
  * Check if a request should be assigned with a high priority.
  *
@@ -1721,9 +1779,7 @@ static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req)
 		if (req->rq_ops && req->rq_ops->hpreq_fini)
 			req->rq_ops->hpreq_fini(req);
 
-		spin_lock(&req->rq_export->exp_rpc_lock);
-		list_del_init(&req->rq_exp_list);
-		spin_unlock(&req->rq_export->exp_rpc_lock);
+		ptlrpc_del_exp_list(req);
 	}
 	EXIT;
 }
@@ -1770,7 +1826,7 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
 	hp = rc > 0;
 	ptlrpc_nrs_req_initialize(svcpt, req, hp);
 
-	if (req->rq_export != NULL) {
+	while (req->rq_export != NULL) {
 		struct obd_export *exp = req->rq_export;
 
 		/*
@@ -1778,7 +1834,18 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
 		 * atomically
 		 */
 		spin_lock_bh(&exp->exp_rpc_lock);
+#ifdef HAVE_SERVER_SUPPORT
+		ptlrpc_server_mark_in_progress_obsolete(req);
+#endif
 		orig = ptlrpc_server_check_resend_in_progress(req);
+		if (orig && OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) {
+			spin_unlock_bh(&exp->exp_rpc_lock);
+
+			OBD_RACE(OBD_FAIL_PTLRPC_RESEND_RACE);
+			msleep(4 * MSEC_PER_SEC);
+			continue;
+		}
+
 		if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) {
 			bool linked;
 
@@ -1801,14 +1868,17 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
 				ptlrpc_at_add_timed(orig);
 			ptlrpc_server_drop_request(orig);
 			ptlrpc_nrs_req_finalize(req);
+
+			/* don't mark slot unused for resend in progress */
+			req->rq_obsolete = 1;
+
 			RETURN(-EBUSY);
 		}
 
-		if (hp || req->rq_ops != NULL)
-			list_add(&req->rq_exp_list, &exp->exp_hp_rpcs);
-		else
-			list_add(&req->rq_exp_list, &exp->exp_reg_rpcs);
+		ptlrpc_add_exp_list_nolock(req, exp, hp || req->rq_ops != NULL);
+
 		spin_unlock_bh(&exp->exp_rpc_lock);
+		break;
 	}
 
 	/*
@@ -2081,7 +2151,7 @@ static int ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
 		rc = sptlrpc_target_export_check(req->rq_export, req);
 		if (rc)
 			DEBUG_REQ(D_ERROR, req,
-				  "DROPPING req with illegal security flavor,");
+				  "DROPPING req with illegal security flavor");
 	}
 
 	if (rc)
@@ -2209,7 +2279,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
 	 * The deadline is increased if we send an early reply.
 	 */
 	if (ktime_get_real_seconds() > request->rq_deadline) {
-		DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline %lld:%llds ago\n",
+		DEBUG_REQ(D_ERROR, request,
+			  "Dropping timed-out request from %s: deadline %lld/%llds ago",
 			  libcfs_id2str(request->rq_peer),
 			  request->rq_deadline -
 			  request->rq_arrival_time.tv_sec,
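
Two details in the request_add hunk above are worth calling out. The `if`
became a `while` purely so the OBD_FAIL_PTLRPC_RESEND_RACE fault-injection
branch can drop exp_rpc_lock, sleep, and redo the resend check from scratch.
And the duplicate found under the lock is only reused via
atomic_inc_not_zero(): its last reference may be dropped concurrently, so a new
reference is taken only while the count is still non-zero. A compilable sketch
of that take-ref-if-live idiom, using C11 atomics in place of the kernel's
atomic_t (hypothetical names):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* take a reference only if the object is still live (count > 0) */
	static bool get_ref_not_zero(atomic_int *refcount)
	{
		int old = atomic_load(refcount);

		while (old != 0) {
			/* on failure, old is reloaded with the current value */
			if (atomic_compare_exchange_weak(refcount, &old,
							 old + 1))
				return true;	/* reference taken */
		}
		return false;	/* object already on its way to being freed */
	}

	int main(void)
	{
		atomic_int live = 1, dying = 0;

		printf("live: %d\n", get_ref_not_zero(&live));	/* 1 */
		printf("dying: %d\n", get_ref_not_zero(&dying));	/* 0 */
		return 0;
	}
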
@@ -2223,11 +2294,11 @@
 	       (request->rq_export ?
 		(char *)request->rq_export->exp_client_uuid.uuid : "0"),
 	       (request->rq_export ?
-	       atomic_read(&request->rq_export->exp_refcount) : -99),
+	       refcount_read(&request->rq_export->exp_handle.h_ref) : -99),
 	       lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
 	       libcfs_id2str(request->rq_peer),
 	       lustre_msg_get_opc(request->rq_reqmsg),
-	       lustre_msg_get_jobid(request->rq_reqmsg));
+	       lustre_msg_get_jobid(request->rq_reqmsg) ?: "");
 
 	if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
 		CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);
@@ -2248,8 +2319,7 @@
 put_conn:
 	if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) {
 		DEBUG_REQ(D_WARNING, request,
-			  "Request took longer than estimated (%lld:%llds); "
-			  "client may timeout.",
+			  "Request took longer than estimated (%lld/%llds); client may timeout",
 			  request->rq_deadline -
 			  request->rq_arrival_time.tv_sec,
 			  ktime_get_real_seconds() - request->rq_deadline);
@@ -2264,12 +2334,12 @@ put_conn:
 	       (request->rq_export ?
 		(char *)request->rq_export->exp_client_uuid.uuid : "0"),
 	       (request->rq_export ?
-	       atomic_read(&request->rq_export->exp_refcount) : -99),
+	       refcount_read(&request->rq_export->exp_handle.h_ref) : -99),
 	       lustre_msg_get_status(request->rq_reqmsg),
 	       request->rq_xid,
 	       libcfs_id2str(request->rq_peer),
 	       lustre_msg_get_opc(request->rq_reqmsg),
-	       lustre_msg_get_jobid(request->rq_reqmsg),
+	       lustre_msg_get_jobid(request->rq_reqmsg) ?: "",
 	       timediff_usecs,
 	       arrived_usecs,
 	       (request->rq_repmsg ?
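
The hunks above harden the diagnostic CDEBUGs rather than change behavior: the
export reference count is now read with refcount_read() from the handle's
refcount_t (which saturates instead of silently overflowing like atomic_t),
and lustre_msg_get_jobid() can return NULL, which would feed a NULL pointer to
a %s format, so the GNU ?: operator substitutes an empty string. A tiny
compilable sketch of the NULL-safe print (hypothetical get_jobid(); ?: is a
GCC/Clang extension):

	#include <stdio.h>

	/* stand-in for lustre_msg_get_jobid(); may legitimately return NULL */
	static const char *get_jobid(int has_jobid)
	{
		return has_jobid ? "dd.0" : NULL;
	}

	int main(void)
	{
		/* a ?: b evaluates to a if a is non-zero, else b */
		printf("jobid: '%s'\n", get_jobid(1) ?: "");
		printf("jobid: '%s'\n", get_jobid(0) ?: "");	/* no NULL deref */
		return 0;
	}
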
@@ -2584,13 +2654,16 @@ static void ptlrpc_watchdog_fire(struct work_struct *w)
 	u64 ms_lapse = ktime_ms_delta(ktime_get(), thread->t_touched);
 	u32 ms_frac = do_div(ms_lapse, MSEC_PER_SEC);
 
-	if (!__ratelimit(&watchdog_limit)) {
+	/* __ratelimit() returns true if the action is NOT ratelimited */
+	if (__ratelimit(&watchdog_limit)) {
+		/* below message is checked in sanity-quota.sh test_6,18 */
 		LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n",
 			      thread->t_task->comm, thread->t_task->pid,
 			      ms_lapse, ms_frac);
 
 		libcfs_debug_dumpstack(thread->t_task);
 	} else {
+		/* below message is checked in sanity-quota.sh test_6,18 */
 		LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. Watchdog stack traces are limited to 3 per %u seconds, skipping this one.\n",
 			      thread->t_task->comm, thread->t_task->pid,
 			      ms_lapse, ms_frac,
 			      libcfs_watchdog_ratelimit);
 	}
@@ -2776,6 +2849,9 @@ static int ptlrpc_main(void *arg)
 
 		/* reset le_ses to initial state */
 		env->le_ses = NULL;
+		/* Refill the context before execution to make sure
+		 * all thread keys are allocated */
+		lu_env_refill(env);
 		/* Process all incoming reqs before handling any */
 		if (ptlrpc_server_request_incoming(svcpt)) {
 			lu_context_enter(&env->le_ctx);
@@ -3000,7 +3076,6 @@ static int ptlrpc_start_hr_threads(void)
 
 static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
 {
-	struct l_wait_info lwi = { 0 };
 	struct ptlrpc_thread *thread;
 	struct list_head zombie;
 
@@ -3021,16 +3096,15 @@ static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
 		thread = list_entry(svcpt->scp_threads.next,
 				    struct ptlrpc_thread, t_link);
 		if (thread_is_stopped(thread)) {
-			list_del(&thread->t_link);
-			list_add(&thread->t_link, &zombie);
+			list_move(&thread->t_link, &zombie);
 			continue;
 		}
 		spin_unlock(&svcpt->scp_lock);
 
 		CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n",
 		       svcpt->scp_service->srv_thread_name, thread->t_id);
-		l_wait_event(thread->t_ctl_waitq,
-			     thread_is_stopped(thread), &lwi);
+		wait_event_idle(thread->t_ctl_waitq,
+				thread_is_stopped(thread));
 
 		spin_lock(&svcpt->scp_lock);
 	}
@@ -3098,7 +3172,6 @@ int ptlrpc_start_threads(struct ptlrpc_service *svc)
 
 int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
 {
-	struct l_wait_info lwi = { 0 };
 	struct ptlrpc_thread *thread;
 	struct ptlrpc_service *svc;
 	struct task_struct *task;
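
From here on the patch retires Lustre's private l_wait_event()/struct
l_wait_info in favor of the upstream wait_event_idle() family, which sleeps in
TASK_IDLE: uninterruptible, but not counted in the load average and not
flagged by the hung-task detector. A kernel-style fragment of the waiter/waker
pattern (hypothetical demo_* names, not code from this file):

	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);
	static int demo_done;

	/* waiter: sleeps in TASK_IDLE, so no load-average or hung-task impact */
	static void demo_wait(void)
	{
		wait_event_idle(demo_waitq, READ_ONCE(demo_done));
	}

	/* waker: set the condition first, then wake the queue */
	static void demo_complete(void)
	{
		WRITE_ONCE(demo_done, 1);
		wake_up(&demo_waitq);
	}
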
@@ -3198,9 +3271,8 @@ int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
 	if (!wait)
 		RETURN(0);
 
-	l_wait_event(thread->t_ctl_waitq,
-		     thread_is_running(thread) || thread_is_stopped(thread),
-		     &lwi);
+	wait_event_idle(thread->t_ctl_waitq,
+			thread_is_running(thread) || thread_is_stopped(thread));
 
 	rc = thread_is_stopped(thread) ? thread->t_id : 0;
 	RETURN(rc);
@@ -3230,7 +3302,9 @@ int ptlrpc_hr_init(void)
 
 	init_waitqueue_head(&ptlrpc_hr.hr_waitq);
 
+	preempt_disable();
 	weight = cpumask_weight(topology_sibling_cpumask(smp_processor_id()));
+	preempt_enable();
 
 	cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) {
 		hrp->hrp_cpt = cpt;
@@ -3294,13 +3368,10 @@ void ptlrpc_hr_fini(void)
 static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt)
 {
 	while (1) {
-		int rc;
-		struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10),
-						     NULL, NULL);
-
-		rc = l_wait_event(svcpt->scp_waitq,
-				  atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi);
-		if (rc == 0)
+		if (wait_event_idle_timeout(
+			    svcpt->scp_waitq,
+			    atomic_read(&svcpt->scp_nreps_difficult) == 0,
+			    cfs_time_seconds(10)) > 0)
 			break;
 		CWARN("Unexpectedly long timeout %s %p\n",
 		      svcpt->scp_service->srv_name, svcpt->scp_service);
@@ -3325,7 +3396,6 @@ ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
 {
 	struct ptlrpc_service_part *svcpt;
 	struct ptlrpc_request_buffer_desc *rqbd;
-	struct l_wait_info lwi;
 	int rc;
 	int i;
 
@@ -3363,18 +3433,21 @@ ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
 		 */
 		spin_lock(&svcpt->scp_lock);
 		while (svcpt->scp_nrqbds_posted != 0) {
+			int seconds = LONG_UNLINK;
+
 			spin_unlock(&svcpt->scp_lock);
 			/*
 			 * Network access will complete in finite time but
 			 * the HUGE timeout lets us CWARN for visibility
 			 * of sluggish NALs
 			 */
-			lwi = LWI_TIMEOUT_INTERVAL(
-				cfs_time_seconds(LONG_UNLINK),
-				cfs_time_seconds(1), NULL, NULL);
-			rc = l_wait_event(svcpt->scp_waitq,
-					  svcpt->scp_nrqbds_posted == 0, &lwi);
-			if (rc == -ETIMEDOUT) {
+			while (seconds > 0 &&
+			       wait_event_idle_timeout(
+				       svcpt->scp_waitq,
+				       svcpt->scp_nrqbds_posted == 0,
+				       cfs_time_seconds(1)) == 0)
+				seconds -= 1;
+			if (seconds == 0) {
 				CWARN("Service %s waiting for request buffers\n",
 				      svcpt->scp_service->srv_name);
 			}
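
The final hunk replaces LWI_TIMEOUT_INTERVAL, which combined a huge overall
timeout with 1-second wakeups, by an explicit countdown around
wait_event_idle_timeout(): poll once per second and warn if the LONG_UNLINK
budget runs out while buffers remain posted. The same shape as a compilable
userspace sketch (hypothetical names, sleep(1) standing in for the 1-second
wait, 300 mirroring LONG_UNLINK):

	#include <stdio.h>
	#include <unistd.h>

	#define LONG_UNLINK 300		/* overall budget, in seconds */

	static int nrqbds_posted = 3;	/* stand-in for scp_nrqbds_posted */

	/* one poll: returns nonzero once the condition has become true */
	static int wait_one_second_for_drain(void)
	{
		sleep(1);
		if (nrqbds_posted > 0)
			nrqbds_posted--;	/* simulate buffers unlinking */
		return nrqbds_posted == 0;
	}

	int main(void)
	{
		int seconds = LONG_UNLINK;

		while (seconds > 0 && !wait_one_second_for_drain())
			seconds -= 1;
		if (seconds == 0)
			printf("still waiting for request buffers\n");
		else
			printf("drained with %d seconds to spare\n", seconds);
		return 0;
	}
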