Whamcloud - gitweb
ca13f96d81c84f8618df897106b8e14ac683f81a
[fs/lustre-release.git] / lustre / ptlrpc / pinger.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2011, 2015, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  *
32  * lustre/ptlrpc/pinger.c
33  *
34  * Portal-RPC reconnection and replay operations, for use in recovery.
35  */
36
37 #define DEBUG_SUBSYSTEM S_RPC
38
39 #include <linux/kthread.h>
40 #include <linux/workqueue.h>
41 #include <obd_support.h>
42 #include <obd_class.h>
43 #include "ptlrpc_internal.h"
44
45 static int suppress_pings;
46 module_param(suppress_pings, int, 0644);
47 MODULE_PARM_DESC(suppress_pings, "Suppress pings");
48
49 struct mutex pinger_mutex;
50 static struct list_head pinger_imports =
51                 LIST_HEAD_INIT(pinger_imports);
52
53 int ptlrpc_pinger_suppress_pings(void)
54 {
55         return suppress_pings;
56 }
57 EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings);
58
59 struct ptlrpc_request *
60 ptlrpc_prep_ping(struct obd_import *imp)
61 {
62         struct ptlrpc_request *req;
63
64         req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
65                                         LUSTRE_OBD_VERSION, OBD_PING);
66         if (req) {
67                 ptlrpc_request_set_replen(req);
68                 req->rq_no_resend = req->rq_no_delay = 1;
69         }
70         return req;
71 }
72
73 int ptlrpc_obd_ping(struct obd_device *obd)
74 {
75         int rc;
76         struct ptlrpc_request *req;
77
78         ENTRY;
79
80         req = ptlrpc_prep_ping(obd->u.cli.cl_import);
81         if (!req)
82                 RETURN(-ENOMEM);
83
84         req->rq_send_state = LUSTRE_IMP_FULL;
85
86         rc = ptlrpc_queue_wait(req);
87
88         ptlrpc_req_finished(req);
89
90         RETURN(rc);
91 }
92 EXPORT_SYMBOL(ptlrpc_obd_ping);
93
94 static bool ptlrpc_check_import_is_idle(struct obd_import *imp)
95 {
96         struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
97         time64_t now;
98
99         if (!imp->imp_idle_timeout)
100                 return false;
101
102         if (atomic_read(&imp->imp_reqs) > 0)
103                 return false;
104
105         /* any lock increases ns_bref being a resource holder */
106         if (ns && atomic_read(&ns->ns_bref) > 0)
107                 return false;
108
109         now = ktime_get_real_seconds();
110         if (now - imp->imp_last_reply_time < imp->imp_idle_timeout)
111                 return false;
112
113         return true;
114 }
115
116 static int ptlrpc_ping(struct obd_import *imp)
117 {
118         struct ptlrpc_request *req;
119
120         ENTRY;
121
122         if (ptlrpc_check_import_is_idle(imp))
123                 RETURN(ptlrpc_disconnect_and_idle_import(imp));
124
125         req = ptlrpc_prep_ping(imp);
126         if (!req) {
127                 CERROR("OOM trying to ping %s->%s\n",
128                        imp->imp_obd->obd_uuid.uuid,
129                        obd2cli_tgt(imp->imp_obd));
130                 RETURN(-ENOMEM);
131         }
132
133         DEBUG_REQ(D_INFO, req, "pinging %s->%s",
134                   imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
135         ptlrpcd_add_req(req);
136
137         RETURN(0);
138 }
139
/**
 * Recompute imp_next_ping for \a imp.
 *
 * The base delay is PING_INTERVAL_SHORT when \a soon is non-zero and
 * PING_INTERVAL otherwise.  For an import in LUSTRE_IMP_DISCON state the
 * delay is additionally capped at the larger of CONNECTION_SWITCH_MIN and
 * the iat_net_latency estimate (taken as 0 when AT_OFF is set).
 *
 * Does nothing when CONFIG_LUSTRE_FS_PINGER is not defined.
 */
static void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
{
#ifdef CONFIG_LUSTRE_FS_PINGER
	time64_t delay = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;

	if (imp->imp_state == LUSTRE_IMP_DISCON) {
		time64_t latency = AT_OFF ? 0 :
				   at_get(&imp->imp_at.iat_net_latency);
		time64_t cap = max_t(time64_t, CONNECTION_SWITCH_MIN, latency);

		if (cap < delay)
			delay = cap;
	}

	imp->imp_next_ping = ktime_get_seconds() + delay;
#endif /* CONFIG_LUSTRE_FS_PINGER */
}
154
155 void ptlrpc_ping_import_soon(struct obd_import *imp)
156 {
157         imp->imp_next_ping = ktime_get_seconds();
158 }
159
160 static inline int imp_is_deactive(struct obd_import *imp)
161 {
162         return imp->imp_deactive ||
163                OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE);
164 }
165
166 static inline time64_t ptlrpc_next_reconnect(struct obd_import *imp)
167 {
168         if (imp->imp_server_timeout)
169                 return ktime_get_seconds() + (obd_timeout >> 1);
170         else
171                 return ktime_get_seconds() + obd_timeout;
172 }
173
174 static time64_t pinger_check_timeout(time64_t time)
175 {
176         time64_t timeout = PING_INTERVAL;
177
178         return time + timeout - ktime_get_seconds();
179 }
180
181 static bool ir_up;
182
183 void ptlrpc_pinger_ir_up(void)
184 {
185         CDEBUG(D_HA, "IR up\n");
186         ir_up = true;
187 }
188 EXPORT_SYMBOL(ptlrpc_pinger_ir_up);
189
190 void ptlrpc_pinger_ir_down(void)
191 {
192         CDEBUG(D_HA, "IR down\n");
193         ir_up = false;
194 }
195 EXPORT_SYMBOL(ptlrpc_pinger_ir_down);
196
197 static void ptlrpc_pinger_process_import(struct obd_import *imp,
198                                          time64_t this_ping)
199 {
200         int level;
201         int force;
202         int force_next;
203         int suppress;
204
205         spin_lock(&imp->imp_lock);
206
207         level = imp->imp_state;
208         force = imp->imp_force_verify;
209         force_next = imp->imp_force_next_verify;
210         /*
211          * This will be used below only if the import is "FULL".
212          */
213         suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS);
214
215         imp->imp_force_verify = 0;
216
217         if (imp->imp_next_ping - 5 >= this_ping && !force) {
218                 spin_unlock(&imp->imp_lock);
219                 return;
220         }
221
222         imp->imp_force_next_verify = 0;
223
224         CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA,
225                "%s->%s: level %s/%u force %u force_next %u deactive %u pingable %u suppress %u\n",
226                imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
227                ptlrpc_import_state_name(level), level, force, force_next,
228                imp->imp_deactive, imp->imp_pingable, suppress);
229
230         if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
231                 /* wait for a while before trying recovery again */
232                 imp->imp_next_ping = ptlrpc_next_reconnect(imp);
233                 spin_unlock(&imp->imp_lock);
234                 if (!imp->imp_no_pinger_recover ||
235                     imp->imp_connect_error == -EAGAIN)
236                         ptlrpc_initiate_recovery(imp);
237         } else if (level != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov ||
238                    imp_is_deactive(imp)) {
239                 CDEBUG(D_HA,
240                        "%s->%s: not pinging (in recovery or recovery disabled: %s)\n",
241                        imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
242                        ptlrpc_import_state_name(level));
243                 if (force)
244                         imp->imp_force_verify = 1;
245                 spin_unlock(&imp->imp_lock);
246         } else if ((imp->imp_pingable && !suppress) || force_next || force) {
247                 spin_unlock(&imp->imp_lock);
248                 ptlrpc_ping(imp);
249         } else {
250                 spin_unlock(&imp->imp_lock);
251         }
252 }
253
254 static struct workqueue_struct *pinger_wq;
255 static void ptlrpc_pinger_main(struct work_struct *ws);
256 static DECLARE_DELAYED_WORK(ping_work, ptlrpc_pinger_main);
257
258 static void ptlrpc_pinger_main(struct work_struct *ws)
259 {
260         time64_t this_ping, time_after_ping, time_to_next_wake;
261         struct obd_import *imp;
262         struct list_head *iter;
263
264         do {
265                 this_ping = ktime_get_seconds();
266
267                 mutex_lock(&pinger_mutex);
268
269                 list_for_each(iter, &pinger_imports) {
270                         imp = list_entry(iter, struct obd_import,
271                                          imp_pinger_chain);
272
273                         ptlrpc_pinger_process_import(imp, this_ping);
274                         /* obd_timeout might have changed */
275                         if (imp->imp_pingable && imp->imp_next_ping &&
276                             imp->imp_next_ping > this_ping + PING_INTERVAL)
277                                 ptlrpc_update_next_ping(imp, 0);
278                 }
279                 mutex_unlock(&pinger_mutex);
280
281                 time_after_ping = ktime_get_seconds();
282                 /* update memory usage info */
283                 obd_update_maxusage();
284
285                 if ((ktime_get_seconds() - this_ping - 3) > PING_INTERVAL)
286                         CDEBUG(D_HA, "long time to ping: %lld, %lld, %lld\n",
287                                this_ping, time_after_ping, ktime_get_seconds());
288
289                 /* Wait until the next ping time, or until we're stopped. */
290                 time_to_next_wake = pinger_check_timeout(this_ping);
291                 /*
292                  * The ping sent by ptlrpc_send_rpc may get sent out
293                  * say .01 second after this.
294                  * ptlrpc_pinger_sending_on_import will then set the
295                  * next ping time to next_ping + .01 sec, which means
296                  * we will SKIP the next ping at next_ping, and the
297                  * ping will get sent 2 timeouts from now!  Beware.
298                  */
299                 CDEBUG(D_INFO, "next wakeup in %lld (%lld)\n",
300                        time_to_next_wake, this_ping + PING_INTERVAL);
301         } while (time_to_next_wake <= 0);
302
303         queue_delayed_work(pinger_wq, &ping_work,
304                            cfs_time_seconds(max(time_to_next_wake, 1LL)));
305 }
306
/**
 * Create the pinger workqueue and queue the first pinger pass.
 *
 * Returns 0 on success (or when CONFIG_LUSTRE_FS_PINGER is not defined),
 * -EALREADY if the pinger is already running, or the error encoded in the
 * pointer returned by cfs_cpt_bind_workqueue() if the workqueue could not
 * be created.
 */
int ptlrpc_start_pinger(void)
{
#ifdef CONFIG_LUSTRE_FS_PINGER
	struct workqueue_struct *wq;

	if (pinger_wq)
		return -EALREADY;

	/*
	 * Allocate into a local first so that a failure never leaves an
	 * ERR_PTR value in pinger_wq.  Otherwise a later start would report
	 * -EALREADY, and ptlrpc_stop_pinger()/ptlrpc_pinger_wake_up() would
	 * hand the bogus pointer to the workqueue API.
	 */
	wq = cfs_cpt_bind_workqueue("ptlrpc_pinger", cfs_cpt_tab,
				    0, CFS_CPT_ANY, 1);
	if (IS_ERR(wq)) {
		CERROR("cannot start pinger workqueue\n");
		return PTR_ERR(wq);
	}

	/* Publish before queueing: the work function re-queues on pinger_wq. */
	pinger_wq = wq;
	queue_delayed_work(pinger_wq, &ping_work, 0);

	if (suppress_pings)
		CWARN("Pings will be suppressed at the request of the administrator. The configuration shall meet the additional requirements described in the manual. (Search for the \"suppress_pings\" kernel module parameter.)\n");
#endif
	return 0;
}
327
/**
 * Stop the pinger: cancel the pending work (waiting for a running pass to
 * finish), destroy the workqueue and clear pinger_wq.
 *
 * Returns 0 on success (or when CONFIG_LUSTRE_FS_PINGER is not defined),
 * -EALREADY if the pinger is not running.
 */
int ptlrpc_stop_pinger(void)
{
	int rc = 0;

#ifdef CONFIG_LUSTRE_FS_PINGER
	if (pinger_wq) {
		cancel_delayed_work_sync(&ping_work);
		destroy_workqueue(pinger_wq);
		pinger_wq = NULL;
	} else {
		rc = -EALREADY;
	}
#endif
	return rc;
}
340
/**
 * Push the next ping of \a imp out by the regular interval; equivalent to
 * ptlrpc_update_next_ping(imp, 0).
 */
void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
{
	ptlrpc_update_next_ping(imp, 0);
}
345
346 void ptlrpc_pinger_commit_expected(struct obd_import *imp)
347 {
348         ptlrpc_update_next_ping(imp, 1);
349         assert_spin_locked(&imp->imp_lock);
350         /*
351          * Avoid reading stale imp_connect_data.  When not sure if pings are
352          * expected or not on next connection, we assume they are not and force
353          * one anyway to guarantee the chance of updating
354          * imp_peer_committed_transno.
355          */
356         if (imp->imp_state != LUSTRE_IMP_FULL ||
357             OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS))
358                 imp->imp_force_next_verify = 1;
359 }
360
361 int ptlrpc_pinger_add_import(struct obd_import *imp)
362 {
363         ENTRY;
364         if (!list_empty(&imp->imp_pinger_chain))
365                 RETURN(-EALREADY);
366
367         mutex_lock(&pinger_mutex);
368         CDEBUG(D_HA, "adding pingable import %s->%s\n",
369                imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
370         /* if we add to pinger we want recovery on this import */
371         imp->imp_obd->obd_no_recov = 0;
372         ptlrpc_update_next_ping(imp, 0);
373         /* XXX sort, blah blah */
374         list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
375         class_import_get(imp);
376
377         ptlrpc_pinger_wake_up();
378         mutex_unlock(&pinger_mutex);
379
380         RETURN(0);
381 }
382 EXPORT_SYMBOL(ptlrpc_pinger_add_import);
383
384 int ptlrpc_pinger_del_import(struct obd_import *imp)
385 {
386         ENTRY;
387
388         if (list_empty(&imp->imp_pinger_chain))
389                 RETURN(-ENOENT);
390
391         mutex_lock(&pinger_mutex);
392         list_del_init(&imp->imp_pinger_chain);
393         CDEBUG(D_HA, "removing pingable import %s->%s\n",
394                imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
395         /* if we remove from pinger we don't want recovery on this import */
396         imp->imp_obd->obd_no_recov = 1;
397         class_import_put(imp);
398         mutex_unlock(&pinger_mutex);
399         RETURN(0);
400 }
401 EXPORT_SYMBOL(ptlrpc_pinger_del_import);
402
/**
 * Make the pinger run as soon as possible by re-arming its delayed work
 * with a zero delay.  Does nothing when CONFIG_LUSTRE_FS_PINGER is not
 * defined.
 */
void ptlrpc_pinger_wake_up(void)
{
#ifdef CONFIG_LUSTRE_FS_PINGER
	mod_delayed_work(pinger_wq, &ping_work, 0);
#endif
}
409
410 /* Ping evictor thread */
411 #define PET_READY     1
412 #define PET_TERMINATE 2
413
414 static int pet_refcount;
415 static int pet_state;
416 static wait_queue_head_t pet_waitq;
417 static LIST_HEAD(pet_list);
418 static DEFINE_SPINLOCK(pet_lock);
419
420 int ping_evictor_wake(struct obd_export *exp)
421 {
422         struct obd_device *obd;
423
424         spin_lock(&pet_lock);
425         if (pet_state != PET_READY) {
426                 /* eventually the new obd will call here again. */
427                 spin_unlock(&pet_lock);
428                 return 1;
429         }
430
431         obd = class_exp2obd(exp);
432         if (list_empty(&obd->obd_evict_list)) {
433                 class_incref(obd, "evictor", obd);
434                 list_add(&obd->obd_evict_list, &pet_list);
435         }
436         spin_unlock(&pet_lock);
437
438         wake_up(&pet_waitq);
439         return 0;
440 }
441
442 static int ping_evictor_main(void *arg)
443 {
444         struct obd_device *obd;
445         struct obd_export *exp;
446         time64_t expire_time;
447
448         ENTRY;
449         CDEBUG(D_HA, "Starting Ping Evictor\n");
450         pet_state = PET_READY;
451         while (1) {
452                 wait_event_idle(pet_waitq,
453                                 (!list_empty(&pet_list)) ||
454                                 (pet_state == PET_TERMINATE));
455
456                 /* loop until all obd's will be removed */
457                 if ((pet_state == PET_TERMINATE) && list_empty(&pet_list))
458                         break;
459
460                 /*
461                  * we only get here if pet_exp != NULL, and the end of this
462                  * loop is the only place which sets it NULL again, so lock
463                  * is not strictly necessary.
464                  */
465                 spin_lock(&pet_lock);
466                 obd = list_entry(pet_list.next, struct obd_device,
467                                  obd_evict_list);
468                 spin_unlock(&pet_lock);
469
470                 expire_time = ktime_get_real_seconds() - PING_EVICT_TIMEOUT;
471
472                 CDEBUG(D_HA, "evicting all exports of obd %s older than %lld\n",
473                        obd->obd_name, expire_time);
474
475                 /*
476                  * Exports can't be deleted out of the list while we hold
477                  * the obd lock (class_unlink_export), which means we can't
478                  * lose the last ref on the export.  If they've already been
479                  * removed from the list, we won't find them here.
480                  */
481                 spin_lock(&obd->obd_dev_lock);
482                 while (!list_empty(&obd->obd_exports_timed)) {
483                         exp = list_entry(obd->obd_exports_timed.next,
484                                          struct obd_export,
485                                          exp_obd_chain_timed);
486                         if (expire_time > exp->exp_last_request_time) {
487                                 struct obd_uuid *client_uuid;
488
489                                 class_export_get(exp);
490                                 client_uuid = &exp->exp_client_uuid;
491                                 spin_unlock(&obd->obd_dev_lock);
492                                 LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %lld seconds. I think it's dead, and I am evicting it. exp %p, cur %lld expire %lld last %lld\n",
493                                               obd->obd_name,
494                                               obd_uuid2str(client_uuid),
495                                               obd_export_nid2str(exp),
496                                               ktime_get_real_seconds() -
497                                               exp->exp_last_request_time,
498                                               exp, ktime_get_real_seconds(),
499                                               expire_time,
500                                               exp->exp_last_request_time);
501                                 CDEBUG(D_HA, "Last request was at %lld\n",
502                                        exp->exp_last_request_time);
503                                 class_fail_export(exp);
504                                 class_export_put(exp);
505                                 spin_lock(&obd->obd_dev_lock);
506                         } else {
507                                 /* List is sorted, so everyone below is ok */
508                                 break;
509                         }
510                 }
511                 spin_unlock(&obd->obd_dev_lock);
512
513                 spin_lock(&pet_lock);
514                 list_del_init(&obd->obd_evict_list);
515                 spin_unlock(&pet_lock);
516
517                 class_decref(obd, "evictor", obd);
518         }
519         CDEBUG(D_HA, "Exiting Ping Evictor\n");
520
521         RETURN(0);
522 }
523
524 void ping_evictor_start(void)
525 {
526         struct task_struct *task;
527
528         if (++pet_refcount > 1)
529                 return;
530
531         init_waitqueue_head(&pet_waitq);
532
533         task = kthread_run(ping_evictor_main, NULL, "ll_evictor");
534         if (IS_ERR(task)) {
535                 pet_refcount--;
536                 CERROR("Cannot start ping evictor thread: %ld\n",
537                         PTR_ERR(task));
538         }
539 }
540 EXPORT_SYMBOL(ping_evictor_start);
541
542 void ping_evictor_stop(void)
543 {
544         if (--pet_refcount > 0)
545                 return;
546
547         pet_state = PET_TERMINATE;
548         wake_up(&pet_waitq);
549 }
550 EXPORT_SYMBOL(ping_evictor_stop);