/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2014, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * libcfs/libcfs/workitem.c
 *
 * Author: Isaac Huang <isaac@clusterfs.com>
 *         Liang Zhen  <zhen.liang@sun.com>
 */

#define DEBUG_SUBSYSTEM S_LNET

#include <linux/kthread.h>
#include <libcfs/libcfs.h>

#define CFS_WS_NAME_LEN         16

typedef struct cfs_wi_sched {
        struct list_head                ws_list;        /* chain on global list */
#ifdef __KERNEL__
        /** serialised workitems */
        spinlock_t                      ws_lock;
        /** where schedulers sleep */
        wait_queue_head_t               ws_waitq;
#endif
        /** concurrent workitems */
        struct list_head                ws_runq;
        /** rescheduled running-workitems: a workitem can be rescheduled
         * while running in wi_action(), but we don't want to execute it
         * again until it returns from wi_action(), so we park it on
         * ws_rerunq while rescheduling, and move it back to ws_runq after
         * wi_action() returns */
        struct list_head                ws_rerunq;
        /** CPT-table for this scheduler */
        struct cfs_cpt_table    *ws_cptab;
        /** CPT id for affinity */
        int                     ws_cpt;
        /** number of scheduled workitems */
        int                     ws_nscheduled;
        /** number of started scheduler threads, protected by cfs_wi_data::wi_glock */
        unsigned int            ws_nthreads:30;
        /** shutting down, protected by cfs_wi_data::wi_glock */
        unsigned int            ws_stopping:1;
        /** serialize starting thread, protected by cfs_wi_data::wi_glock */
        unsigned int            ws_starting:1;
        /** scheduler name */
        char                    ws_name[CFS_WS_NAME_LEN];
} cfs_wi_sched_t;
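
/*
 * For context, a sketch of the workitem type this scheduler drives, as
 * declared in the libcfs workitem header (field layout reproduced here as
 * an illustrative sketch, not an authoritative copy):
 *
 *  typedef int (*cfs_wi_action_t) (struct cfs_workitem *);
 *  typedef struct cfs_workitem {
 *          struct list_head  wi_list;        // chain on ws_runq/ws_rerunq
 *          cfs_wi_action_t   wi_action;      // callback run by a scheduler
 *          void             *wi_data;        // caller's payload
 *          unsigned short    wi_running:1;   // wi_action() is executing
 *          unsigned short    wi_scheduled:1; // queued (or exited for good)
 *  } cfs_workitem_t;
 */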

static struct cfs_workitem_data {
        /** serialize */
        spinlock_t              wi_glock;
        /** list of all schedulers */
        struct list_head        wi_scheds;
        /** WI module is initialized */
        int                     wi_init;
        /** shutting down the whole WI module */
        int                     wi_stopping;
} cfs_wi_data;

#ifdef __KERNEL__
static inline void
cfs_wi_sched_lock(cfs_wi_sched_t *sched)
{
        spin_lock(&sched->ws_lock);
}

static inline void
cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
{
        spin_unlock(&sched->ws_lock);
}

static inline int
cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
{
        cfs_wi_sched_lock(sched);
        if (sched->ws_stopping) {
                cfs_wi_sched_unlock(sched);
                return 0;
        }

        if (!list_empty(&sched->ws_runq)) {
                cfs_wi_sched_unlock(sched);
                return 0;
        }
        cfs_wi_sched_unlock(sched);
        return 1;
}

#else /* !__KERNEL__ */

static inline void
cfs_wi_sched_lock(cfs_wi_sched_t *sched)
{
        spin_lock(&cfs_wi_data.wi_glock);
}

static inline void
cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
{
        spin_unlock(&cfs_wi_data.wi_glock);
}

#endif /* __KERNEL__ */

/* XXX:
 * 0. this only works when called from inside wi->wi_action().
 * 1. after it returns, nobody may try to schedule the workitem again.
 */
void
cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
        LASSERT(!in_interrupt()); /* because we use plain spinlock */
        LASSERT(!sched->ws_stopping);

        cfs_wi_sched_lock(sched);

#ifdef __KERNEL__
        LASSERT(wi->wi_running);
#endif
        if (wi->wi_scheduled) { /* cancel pending schedules */
                LASSERT(!list_empty(&wi->wi_list));
                list_del_init(&wi->wi_list);

                LASSERT(sched->ws_nscheduled > 0);
                sched->ws_nscheduled--;
        }

        LASSERT(list_empty(&wi->wi_list));

        /* LBUG any future schedule attempt: it will trip the LASSERT in
         * cfs_wi_schedule() because wi_list is now empty */
        wi->wi_scheduled = 1;
        cfs_wi_sched_unlock(sched);
}
EXPORT_SYMBOL(cfs_wi_exit);
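
/*
 * Usage sketch (illustrative, not from this file): the final run of an
 * action calls cfs_wi_exit() before freeing the object embedding the
 * workitem, then reports the workitem as dead by returning non-zero:
 *
 *  static int my_action(cfs_workitem_t *wi)
 *  {
 *          struct my_obj *obj = wi->wi_data;       // hypothetical container
 *
 *          if (my_obj_done(obj)) {                 // hypothetical predicate
 *                  cfs_wi_exit(obj->mo_sched, wi); // no more scheduling
 *                  my_obj_free(obj);
 *                  return 1;   // wi is dead; scheduler won't touch it
 *          }
 *          my_obj_step(obj);                       // one unit of work
 *          return 0;           // keep wi alive; it may be rescheduled
 *  }
 */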

/**
 * cancel schedule request of workitem \a wi
 */
int
cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
        int     rc;

        LASSERT(!in_interrupt()); /* because we use plain spinlock */
        LASSERT(!sched->ws_stopping);

        /*
         * return 0 if it's running already, otherwise return 1, which
         * means the workitem will not be scheduled and will not have
         * any race with wi_action.
         */
        cfs_wi_sched_lock(sched);

        rc = !(wi->wi_running);

        if (wi->wi_scheduled) { /* cancel pending schedules */
                LASSERT(!list_empty(&wi->wi_list));
                list_del_init(&wi->wi_list);

                LASSERT(sched->ws_nscheduled > 0);
                sched->ws_nscheduled--;

                wi->wi_scheduled = 0;
        }

        LASSERT(list_empty(&wi->wi_list));

        cfs_wi_sched_unlock(sched);
        return rc;
}
EXPORT_SYMBOL(cfs_wi_deschedule);
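
/*
 * Usage sketch (illustrative): a return of 1 guarantees wi_action() is
 * neither running nor about to run, so the embedding object can be torn
 * down at once; on 0 the caller must synchronise with the action itself,
 * e.g. by having it call cfs_wi_exit() as sketched above:
 *
 *  if (cfs_wi_deschedule(sched, &obj->mo_wi))
 *          my_obj_free(obj);    // hypothetical helper; wi is idle
 *  else
 *          ...                  // wi_action() still running; defer free
 */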

/*
 * Workitems scheduled on a serialised scheduler (one with a single thread)
 * are strictly serialised not only with themselves, but also with each
 * other. Independently of the thread count, a workitem rescheduled while
 * its wi_action() is still running is parked on ws_rerunq and moved back
 * to ws_runq only after wi_action() returns, so an action never runs
 * concurrently with itself.
 */
void
cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
        LASSERT(!in_interrupt()); /* because we use plain spinlock */
        LASSERT(!sched->ws_stopping);

        cfs_wi_sched_lock(sched);

        if (!wi->wi_scheduled) {
                LASSERT(list_empty(&wi->wi_list));

                wi->wi_scheduled = 1;
                sched->ws_nscheduled++;
                if (!wi->wi_running) {
                        list_add_tail(&wi->wi_list, &sched->ws_runq);
#ifdef __KERNEL__
                        wake_up(&sched->ws_waitq);
#endif
                } else {
                        list_add(&wi->wi_list, &sched->ws_rerunq);
                }
        }

        LASSERT(!list_empty(&wi->wi_list));
        cfs_wi_sched_unlock(sched);
}
EXPORT_SYMBOL(cfs_wi_schedule);
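
/*
 * Usage sketch (illustrative, assuming cfs_wi_init() from the libcfs
 * workitem header): initialise a workitem once, then schedule it as often
 * as needed; scheduling an already-queued workitem is a no-op:
 *
 *  cfs_workitem_t wi;
 *
 *  cfs_wi_init(&wi, my_data, my_action); // caller-supplied data/action
 *  cfs_wi_schedule(sched, &wi);          // queue on ws_runq, wake a thread
 *  cfs_wi_schedule(sched, &wi);          // harmless: already scheduled
 */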

#ifdef __KERNEL__

static int
cfs_wi_scheduler(void *arg)
{
        struct cfs_wi_sched     *sched = (cfs_wi_sched_t *)arg;

        cfs_block_allsigs();

        /* CPT affinity scheduler? */
        if (sched->ws_cptab != NULL)
                if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
                        CWARN("Failed to bind %s on CPT %d\n",
                                sched->ws_name, sched->ws_cpt);

        spin_lock(&cfs_wi_data.wi_glock);

        LASSERT(sched->ws_starting == 1);
        sched->ws_starting--;
        sched->ws_nthreads++;

        spin_unlock(&cfs_wi_data.wi_glock);

        cfs_wi_sched_lock(sched);

        while (!sched->ws_stopping) {
                int             nloops = 0;
                int             rc;
                cfs_workitem_t *wi;

                while (!list_empty(&sched->ws_runq) &&
                       nloops < CFS_WI_RESCHED) {
                        wi = list_entry(sched->ws_runq.next,
                                        cfs_workitem_t, wi_list);
                        LASSERT(wi->wi_scheduled && !wi->wi_running);

                        list_del_init(&wi->wi_list);

                        LASSERT(sched->ws_nscheduled > 0);
                        sched->ws_nscheduled--;

                        wi->wi_running   = 1;
                        wi->wi_scheduled = 0;

                        cfs_wi_sched_unlock(sched);
                        nloops++;

                        rc = (*wi->wi_action)(wi);

                        cfs_wi_sched_lock(sched);
                        if (rc != 0) /* WI is dead; it may even have been freed */
                                continue;

                        wi->wi_running = 0;
                        if (list_empty(&wi->wi_list))
                                continue;

                        LASSERT(wi->wi_scheduled);
                        /* wi was rescheduled while it ran, so it sits on
                         * the rerunq now; move it to the runq so its
                         * action can run again */
                        list_move_tail(&wi->wi_list, &sched->ws_runq);
                }

                if (!list_empty(&sched->ws_runq)) {
                        cfs_wi_sched_unlock(sched);
                        /* don't sleep because some workitems still
                         * expect me to come back soon */
                        cond_resched();
                        cfs_wi_sched_lock(sched);
                        continue;
                }

                cfs_wi_sched_unlock(sched);
                rc = wait_event_interruptible_exclusive(sched->ws_waitq,
                                !cfs_wi_sched_cansleep(sched));
                cfs_wi_sched_lock(sched);
        }

        cfs_wi_sched_unlock(sched);

        spin_lock(&cfs_wi_data.wi_glock);
        sched->ws_nthreads--;
        spin_unlock(&cfs_wi_data.wi_glock);

        return 0;
}

#else /* __KERNEL__ */

int
cfs_wi_check_events(void)
{
        int               n = 0;
        cfs_workitem_t   *wi;

        spin_lock(&cfs_wi_data.wi_glock);

        for (;;) {
                struct cfs_wi_sched     *sched = NULL;
                struct cfs_wi_sched     *tmp;

                /* rerunq is always empty for userspace */
                list_for_each_entry(tmp, &cfs_wi_data.wi_scheds, ws_list) {
                        if (!list_empty(&tmp->ws_runq)) {
                                sched = tmp;
                                break;
                        }
                }

                if (sched == NULL)
                        break;

                wi = list_entry(sched->ws_runq.next,
                                cfs_workitem_t, wi_list);
                list_del_init(&wi->wi_list);

                LASSERT(sched->ws_nscheduled > 0);
                sched->ws_nscheduled--;

                LASSERT(wi->wi_scheduled);
                wi->wi_scheduled = 0;
                spin_unlock(&cfs_wi_data.wi_glock);

                n++;
                (*wi->wi_action)(wi);

                spin_lock(&cfs_wi_data.wi_glock);
        }

        spin_unlock(&cfs_wi_data.wi_glock);
        return n;
}

#endif
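
/*
 * Note (an assumption drawn from the #else branch above): userspace builds
 * have no scheduler threads, so whoever owns the event loop is expected to
 * poll cfs_wi_check_events(), which returns how many workitems it ran:
 *
 *  while (!shutting_down)                  // hypothetical loop
 *          if (cfs_wi_check_events() == 0)
 *                  wait_for_work();        // hypothetical idle wait
 */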

void
cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
{
        LASSERT(cfs_wi_data.wi_init);
        LASSERT(!cfs_wi_data.wi_stopping);

        spin_lock(&cfs_wi_data.wi_glock);
        if (sched->ws_stopping) {
                CDEBUG(D_INFO, "%s is already stopping\n",
                       sched->ws_name);
                spin_unlock(&cfs_wi_data.wi_glock);
                return;
        }

        LASSERT(!list_empty(&sched->ws_list));
        sched->ws_stopping = 1;

        spin_unlock(&cfs_wi_data.wi_glock);

#ifdef __KERNEL__
        wake_up_all(&sched->ws_waitq);

        spin_lock(&cfs_wi_data.wi_glock);
        {
                int i = 2;

                while (sched->ws_nthreads > 0) {
                        CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
                               "waiting for %d threads of WI sched[%s] to "
                               "terminate\n", sched->ws_nthreads,
                               sched->ws_name);

                        spin_unlock(&cfs_wi_data.wi_glock);
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        schedule_timeout(cfs_time_seconds(1) / 20);
                        spin_lock(&cfs_wi_data.wi_glock);
                }
        }

        list_del(&sched->ws_list);

        spin_unlock(&cfs_wi_data.wi_glock);
#endif
        LASSERT(sched->ws_nscheduled == 0);

        LIBCFS_FREE(sched, sizeof(*sched));
}
EXPORT_SYMBOL(cfs_wi_sched_destroy);

int
cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
                    int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
{
        struct cfs_wi_sched     *sched;

        LASSERT(cfs_wi_data.wi_init);
        LASSERT(!cfs_wi_data.wi_stopping);
        LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
                (cpt >= 0 && cpt < cfs_cpt_number(cptab)));

        LIBCFS_ALLOC(sched, sizeof(*sched));
        if (sched == NULL)
                return -ENOMEM;

        if (strlen(name) > sizeof(sched->ws_name) - 1) {
                LIBCFS_FREE(sched, sizeof(*sched));
                return -E2BIG;
        }
        strlcpy(sched->ws_name, name, sizeof(sched->ws_name));

        sched->ws_cptab = cptab;
        sched->ws_cpt = cpt;

#ifdef __KERNEL__
        spin_lock_init(&sched->ws_lock);
        init_waitqueue_head(&sched->ws_waitq);
#endif
        INIT_LIST_HEAD(&sched->ws_runq);
        INIT_LIST_HEAD(&sched->ws_rerunq);
        INIT_LIST_HEAD(&sched->ws_list);

#ifdef __KERNEL__
        for (; nthrs > 0; nthrs--) {
                char                    name[16];
                struct task_struct      *task;

                spin_lock(&cfs_wi_data.wi_glock);
                while (sched->ws_starting > 0) {
                        spin_unlock(&cfs_wi_data.wi_glock);
                        schedule();
                        spin_lock(&cfs_wi_data.wi_glock);
                }

                sched->ws_starting++;
                spin_unlock(&cfs_wi_data.wi_glock);

                if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
                        snprintf(name, sizeof(name), "%s_%02d_%02d",
                                 sched->ws_name, sched->ws_cpt,
                                 sched->ws_nthreads);
                } else {
                        snprintf(name, sizeof(name), "%s_%02d",
                                 sched->ws_name, sched->ws_nthreads);
                }

                task = kthread_run(cfs_wi_scheduler, sched, name);
                if (IS_ERR(task)) {
                        int rc = PTR_ERR(task);

                        CERROR("Failed to create thread for "
                                "WI scheduler %s: %d\n", name, rc);

                        spin_lock(&cfs_wi_data.wi_glock);

                        /* add to the global list so that
                         * cfs_wi_sched_destroy() can find and free it */
                        list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
                        sched->ws_starting--;

                        spin_unlock(&cfs_wi_data.wi_glock);

                        cfs_wi_sched_destroy(sched);
                        return rc;
                }
        }
#endif
        spin_lock(&cfs_wi_data.wi_glock);
        list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
        spin_unlock(&cfs_wi_data.wi_glock);

        *sched_pp = sched;
        return 0;
}
EXPORT_SYMBOL(cfs_wi_sched_create);
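
/*
 * Usage sketch (illustrative; names other than the APIs above are
 * assumptions): create a scheduler with one service thread, run workitems
 * on it, then tear it down; cfs_cpt_table is the global libcfs CPT table:
 *
 *  struct cfs_wi_sched *sched;
 *  int rc;
 *
 *  rc = cfs_wi_sched_create("my_wi", cfs_cpt_table, CFS_CPT_ANY,
 *                           1, &sched);
 *  if (rc != 0)
 *          return rc;           // -ENOMEM, -E2BIG (name too long), ...
 *
 *  cfs_wi_schedule(sched, &wi);
 *  ...
 *  cfs_wi_sched_destroy(sched); // waits for the thread(s) to exit
 */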

int
cfs_wi_startup(void)
{
        memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));

        spin_lock_init(&cfs_wi_data.wi_glock);
        INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
        cfs_wi_data.wi_init = 1;

        return 0;
}

void
cfs_wi_shutdown(void)
{
        struct cfs_wi_sched     *sched;

        spin_lock(&cfs_wi_data.wi_glock);
        cfs_wi_data.wi_stopping = 1;
        spin_unlock(&cfs_wi_data.wi_glock);

#ifdef __KERNEL__
        /* nobody should contend on this list */
        list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
                sched->ws_stopping = 1;
                wake_up_all(&sched->ws_waitq);
        }

        list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
                spin_lock(&cfs_wi_data.wi_glock);

                while (sched->ws_nthreads != 0) {
                        spin_unlock(&cfs_wi_data.wi_glock);
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        schedule_timeout(cfs_time_seconds(1) / 20);
                        spin_lock(&cfs_wi_data.wi_glock);
                }
                spin_unlock(&cfs_wi_data.wi_glock);
        }
#endif
        while (!list_empty(&cfs_wi_data.wi_scheds)) {
                sched = list_entry(cfs_wi_data.wi_scheds.next,
                                   struct cfs_wi_sched, ws_list);
                list_del(&sched->ws_list);
                LIBCFS_FREE(sched, sizeof(*sched));
        }

        cfs_wi_data.wi_stopping = 0;
        cfs_wi_data.wi_init = 0;
}