/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2014, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * libcfs/libcfs/workitem.c
 *
 * Author: Isaac Huang <isaac@clusterfs.com>
 *         Liang Zhen  <zhen.liang@sun.com>
 */

#define DEBUG_SUBSYSTEM S_LNET

#include <linux/kthread.h>
#include <libcfs/libcfs.h>

#define CFS_WS_NAME_LEN         16

struct cfs_wi_sched {
        struct list_head                ws_list;        /* chain on global list */
        /** serialised workitems */
        spinlock_t                      ws_lock;
        /** where schedulers sleep */
        wait_queue_head_t               ws_waitq;
        /** concurrent workitems */
        struct list_head                ws_runq;
        /** rescheduled running-workitems: a workitem can be rescheduled
         * while it is running in wi_action(), but we don't want to execute
         * it again until it has returned from wi_action(), so we park it
         * on ws_rerunq while rescheduling, and move it back to the runq
         * once wi_action() returns */
        struct list_head                ws_rerunq;
        /** CPT-table for this scheduler */
        struct cfs_cpt_table    *ws_cptab;
        /** CPT id for affinity */
        int                     ws_cpt;
        /** number of scheduled workitems */
        int                     ws_nscheduled;
        /** started scheduler thread, protected by cfs_wi_data::wi_glock */
        unsigned int            ws_nthreads:30;
        /** shutting down, protected by cfs_wi_data::wi_glock */
        unsigned int            ws_stopping:1;
        /** serialize starting thread, protected by cfs_wi_data::wi_glock */
        unsigned int            ws_starting:1;
        /** scheduler name */
        char                    ws_name[CFS_WS_NAME_LEN];
};

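/*
 * The structures above are easiest to understand from a caller's point of
 * view.  Below is a minimal usage sketch, not code from this file: it
 * assumes the cfs_wi_init() helper from libcfs_workitem.h plus a
 * caller-defined action callback and data pointer.
 *
 *	static int my_action(struct cfs_workitem *wi)
 *	{
 *		do_one_chunk_of_work(wi->wi_data);	// hypothetical helper
 *		return 0;	// 0: wi stays alive and may be rescheduled
 *	}
 *
 *	struct cfs_wi_sched *sched;
 *	struct cfs_workitem wi;
 *
 *	cfs_wi_sched_create("test", NULL, CFS_CPT_ANY, 1, &sched);
 *	cfs_wi_init(&wi, my_data, my_action);
 *	cfs_wi_schedule(sched, &wi);	// my_action() runs in sched's thread
 */
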
static struct cfs_workitem_data {
        /** serialize */
        spinlock_t              wi_glock;
        /** list of all schedulers */
        struct list_head        wi_scheds;
        /** WI module is initialized */
        int                     wi_init;
        /** shutting down the whole WI module */
        int                     wi_stopping;
} cfs_wi_data;

static inline int
cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
{
        spin_lock(&sched->ws_lock);
        if (sched->ws_stopping) {
                spin_unlock(&sched->ws_lock);
                return 0;
        }

        if (!list_empty(&sched->ws_runq)) {
                spin_unlock(&sched->ws_lock);
                return 0;
        }
        spin_unlock(&sched->ws_lock);
        return 1;
}

/* XXX:
 * 0. this only works when called from within wi->wi_action.
 * 1. once it returns, nobody may try to schedule the workitem again.
 */
void
cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
{
        LASSERT(!in_interrupt()); /* because we use plain spinlock */
        LASSERT(!sched->ws_stopping);

        spin_lock(&sched->ws_lock);

        LASSERT(wi->wi_running);

        if (wi->wi_scheduled) { /* cancel pending schedules */
                LASSERT(!list_empty(&wi->wi_list));
                list_del_init(&wi->wi_list);

                LASSERT(sched->ws_nscheduled > 0);
                sched->ws_nscheduled--;
        }

        LASSERT(list_empty(&wi->wi_list));
        wi->wi_scheduled = 1; /* leave this set so any future schedule attempt LBUGs */
        spin_unlock(&sched->ws_lock);
}
EXPORT_SYMBOL(cfs_wi_exit);

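/*
 * For illustration, an action that finishes its work and retires its
 * workitem might look like the hypothetical sketch below: cfs_wi_exit()
 * must be called from inside the action, and returning non-zero tells the
 * scheduler never to touch wi again, so the caller may then free it.
 *
 *	static int my_final_action(struct cfs_workitem *wi)
 *	{
 *		finish_work(wi->wi_data);	// hypothetical helper
 *		cfs_wi_exit(my_sched, wi);	// no more schedules allowed
 *		return 1;	// non-zero: scheduler won't touch wi again
 *	}
 */
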
/**
 * cancel schedule request of workitem \a wi
 */
int
cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
{
        int     rc;

        LASSERT(!in_interrupt()); /* because we use plain spinlock */
        LASSERT(!sched->ws_stopping);

        /*
         * Return 0 if the workitem is currently running; otherwise return
         * 1, which means it is no longer scheduled and cannot race with
         * wi_action.
         */
        spin_lock(&sched->ws_lock);

        rc = !(wi->wi_running);

        if (wi->wi_scheduled) { /* cancel pending schedules */
                LASSERT(!list_empty(&wi->wi_list));
                list_del_init(&wi->wi_list);

                LASSERT(sched->ws_nscheduled > 0);
                sched->ws_nscheduled--;

                wi->wi_scheduled = 0;
        }

        LASSERT(list_empty(&wi->wi_list));

        spin_unlock(&sched->ws_lock);
        return rc;
}
EXPORT_SYMBOL(cfs_wi_deschedule);

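/*
 * A caller that wants to free a workitem typically loops on the return
 * value until the item is known not to be running.  A minimal sketch
 * (the busy-wait and helper name are illustrative, not from this file):
 *
 *	while (!cfs_wi_deschedule(sched, wi))
 *		schedule_timeout_uninterruptible(1);	// wi_action still running
 *	free_my_workitem(wi);	// hypothetical; safe, no race remains
 */
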
/*
 * A workitem scheduled with (serial == 1) is strictly serialised not only
 * with itself, but also with other workitems scheduled this way.
 *
 * For now there's only one static serialised queue, but in the future more
 * might be added, and even dynamic creation of serialised queues might be
 * supported.
 */
void
cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
{
        LASSERT(!in_interrupt()); /* because we use plain spinlock */
        LASSERT(!sched->ws_stopping);

        spin_lock(&sched->ws_lock);

        if (!wi->wi_scheduled) {
                LASSERT(list_empty(&wi->wi_list));

                wi->wi_scheduled = 1;
                sched->ws_nscheduled++;
                if (!wi->wi_running) {
                        list_add_tail(&wi->wi_list, &sched->ws_runq);
                        wake_up(&sched->ws_waitq);
                } else {
                        list_add(&wi->wi_list, &sched->ws_rerunq);
                }
        }

        LASSERT(!list_empty(&wi->wi_list));
        spin_unlock(&sched->ws_lock);
}
EXPORT_SYMBOL(cfs_wi_schedule);

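/*
 * Note the rerunq path above: rescheduling from inside the action is legal
 * and simply defers the next run until the action returns.  A
 * self-rescheduling action might look like this sketch (helper names are
 * hypothetical):
 *
 *	static int my_polling_action(struct cfs_workitem *wi)
 *	{
 *		if (poll_device(wi->wi_data))		// more work pending?
 *			cfs_wi_schedule(my_sched, wi);	// lands on ws_rerunq
 *		return 0;	// keep wi alive so it can run again
 *	}
 */
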
static int
cfs_wi_scheduler(void *arg)
{
        struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;

        cfs_block_allsigs();

        /* CPT affinity scheduler? */
        if (sched->ws_cptab != NULL)
                if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
                        CWARN("Unable to bind %s on CPU partition %d\n",
                                sched->ws_name, sched->ws_cpt);

        spin_lock(&cfs_wi_data.wi_glock);

        LASSERT(sched->ws_starting == 1);
        sched->ws_starting--;
        sched->ws_nthreads++;

        spin_unlock(&cfs_wi_data.wi_glock);

        spin_lock(&sched->ws_lock);

        while (!sched->ws_stopping) {
                int             nloops = 0;
                int             rc;
                struct cfs_workitem *wi;

                while (!list_empty(&sched->ws_runq) &&
                       nloops < CFS_WI_RESCHED) {
                        wi = list_entry(sched->ws_runq.next,
                                        struct cfs_workitem, wi_list);
                        LASSERT(wi->wi_scheduled && !wi->wi_running);

                        list_del_init(&wi->wi_list);

                        LASSERT(sched->ws_nscheduled > 0);
                        sched->ws_nscheduled--;

                        wi->wi_running   = 1;
                        wi->wi_scheduled = 0;

                        spin_unlock(&sched->ws_lock);
                        nloops++;

                        rc = (*wi->wi_action)(wi);

                        spin_lock(&sched->ws_lock);
                        if (rc != 0) /* wi should be dead now; it may even have been freed */
                                continue;

                        wi->wi_running = 0;
                        if (list_empty(&wi->wi_list))
                                continue;

                        LASSERT(wi->wi_scheduled);
                        /* wi was rescheduled while running, so it is on the
                         * rerunq now; move it to the runq so its action can
                         * run again */
                        list_move_tail(&wi->wi_list, &sched->ws_runq);
                }

                if (!list_empty(&sched->ws_runq)) {
                        spin_unlock(&sched->ws_lock);
                        /* don't sleep because some workitems still
                         * expect me to come back soon */
                        cond_resched();
                        spin_lock(&sched->ws_lock);
                        continue;
                }

                spin_unlock(&sched->ws_lock);
                rc = wait_event_interruptible_exclusive(sched->ws_waitq,
                                !cfs_wi_sched_cansleep(sched));
                spin_lock(&sched->ws_lock);
        }

        spin_unlock(&sched->ws_lock);

        spin_lock(&cfs_wi_data.wi_glock);
        sched->ws_nthreads--;
        spin_unlock(&cfs_wi_data.wi_glock);

        return 0;
}

void
cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
{
        LASSERT(cfs_wi_data.wi_init);
        LASSERT(!cfs_wi_data.wi_stopping);

        spin_lock(&cfs_wi_data.wi_glock);
        if (sched->ws_stopping) {
                CDEBUG(D_INFO, "%s is already stopping\n",
                       sched->ws_name);
                spin_unlock(&cfs_wi_data.wi_glock);
                return;
        }

        LASSERT(!list_empty(&sched->ws_list));
        sched->ws_stopping = 1;

        spin_unlock(&cfs_wi_data.wi_glock);

        wake_up_all(&sched->ws_waitq);

        spin_lock(&cfs_wi_data.wi_glock);
        {
                int i = 2;

                while (sched->ws_nthreads > 0) {
                        CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
                               "waiting for %d threads of WI sched[%s] to "
                               "terminate\n", sched->ws_nthreads,
                               sched->ws_name);

                        spin_unlock(&cfs_wi_data.wi_glock);
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        schedule_timeout(cfs_time_seconds(1) / 20);
                        spin_lock(&cfs_wi_data.wi_glock);
                }
        }

        list_del(&sched->ws_list);

        spin_unlock(&cfs_wi_data.wi_glock);

        LASSERT(sched->ws_nscheduled == 0);

        LIBCFS_FREE(sched, sizeof(*sched));
}
EXPORT_SYMBOL(cfs_wi_sched_destroy);


int
cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
                    int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
{
        struct cfs_wi_sched     *sched;

        LASSERT(cfs_wi_data.wi_init);
        LASSERT(!cfs_wi_data.wi_stopping);
        LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
                (cpt >= 0 && cpt < cfs_cpt_number(cptab)));

        LIBCFS_ALLOC(sched, sizeof(*sched));
        if (sched == NULL)
                return -ENOMEM;

        if (strlen(name) > sizeof(sched->ws_name) - 1) {
                LIBCFS_FREE(sched, sizeof(*sched));
                return -E2BIG;
        }
        strlcpy(sched->ws_name, name, sizeof(sched->ws_name));

        sched->ws_cptab = cptab;
        sched->ws_cpt = cpt;

        spin_lock_init(&sched->ws_lock);
        init_waitqueue_head(&sched->ws_waitq);

        INIT_LIST_HEAD(&sched->ws_runq);
        INIT_LIST_HEAD(&sched->ws_rerunq);
        INIT_LIST_HEAD(&sched->ws_list);

        for (; nthrs > 0; nthrs--) {
                char                    name[16];
                struct task_struct      *task;

                spin_lock(&cfs_wi_data.wi_glock);
                while (sched->ws_starting > 0) {
                        spin_unlock(&cfs_wi_data.wi_glock);
                        schedule();
                        spin_lock(&cfs_wi_data.wi_glock);
                }

                sched->ws_starting++;
                spin_unlock(&cfs_wi_data.wi_glock);

                if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
                        snprintf(name, sizeof(name), "%s_%02d_%02d",
                                 sched->ws_name, sched->ws_cpt,
                                 sched->ws_nthreads);
                } else {
                        snprintf(name, sizeof(name), "%s_%02d",
                                 sched->ws_name, sched->ws_nthreads);
                }

                task = kthread_run(cfs_wi_scheduler, sched, name);
                if (IS_ERR(task)) {
                        int rc = PTR_ERR(task);

                        CERROR("Failed to create thread for "
                                "WI scheduler %s: %d\n", name, rc);

                        spin_lock(&cfs_wi_data.wi_glock);

                        /* put sched on the global list so that
                         * cfs_wi_sched_destroy() can find and free it */
                        list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
                        sched->ws_starting--;

                        spin_unlock(&cfs_wi_data.wi_glock);

                        cfs_wi_sched_destroy(sched);
                        return rc;
                }
        }

        spin_lock(&cfs_wi_data.wi_glock);
        list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
        spin_unlock(&cfs_wi_data.wi_glock);

        *sched_pp = sched;
        return 0;
}
EXPORT_SYMBOL(cfs_wi_sched_create);

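/*
 * Callers that want one scheduler per CPU partition typically iterate over
 * a CPT table.  A sketch, assuming the global cfs_cpt_table and some
 * caller-side bookkeeping (MAX_CPTS and nthreads_per_cpt are hypothetical):
 *
 *	struct cfs_wi_sched *scheds[MAX_CPTS];	// hypothetical storage
 *	int cpt, rc;
 *
 *	for (cpt = 0; cpt < cfs_cpt_number(cfs_cpt_table); cpt++) {
 *		rc = cfs_wi_sched_create("my_wi", cfs_cpt_table, cpt,
 *					 nthreads_per_cpt, &scheds[cpt]);
 *		if (rc != 0)
 *			break;	// destroy already-created scheds on failure
 *	}
 */
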
int
cfs_wi_startup(void)
{
        memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data));

        spin_lock_init(&cfs_wi_data.wi_glock);
        INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
        cfs_wi_data.wi_init = 1;

        return 0;
}

void
cfs_wi_shutdown(void)
{
        struct cfs_wi_sched     *sched;

        spin_lock(&cfs_wi_data.wi_glock);
        cfs_wi_data.wi_stopping = 1;
        spin_unlock(&cfs_wi_data.wi_glock);

        /* nobody should contend on this list */
        list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
                sched->ws_stopping = 1;
                wake_up_all(&sched->ws_waitq);
        }

        list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
                spin_lock(&cfs_wi_data.wi_glock);

                while (sched->ws_nthreads != 0) {
                        spin_unlock(&cfs_wi_data.wi_glock);
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        schedule_timeout(cfs_time_seconds(1) / 20);
                        spin_lock(&cfs_wi_data.wi_glock);
                }
                spin_unlock(&cfs_wi_data.wi_glock);
        }

        while (!list_empty(&cfs_wi_data.wi_scheds)) {
                sched = list_entry(cfs_wi_data.wi_scheds.next,
                                   struct cfs_wi_sched, ws_list);
                list_del(&sched->ws_list);
                LIBCFS_FREE(sched, sizeof(*sched));
        }

        cfs_wi_data.wi_stopping = 0;
        cfs_wi_data.wi_init = 0;
}