4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2013, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
32 * libcfs/libcfs/workitem.c
34 * Author: Isaac Huang <isaac@clusterfs.com>
35 * Liang Zhen <zhen.liang@sun.com>
38 #define DEBUG_SUBSYSTEM S_LNET
40 #include <linux/kthread.h>
41 #include <libcfs/libcfs.h>
43 #define CFS_WS_NAME_LEN 16
/* Per-scheduler instance state: the run queues, the waitqueue the worker
 * threads sleep on, and thread-lifecycle bookkeeping.  Per the field
 * comments, ws_nthreads/ws_stopping/ws_starting are protected by
 * cfs_wi_data::wi_glock; the queues are protected by a ws_lock spinlock
 * (declared on a line not shown in this excerpt).
 * NOTE(review): the struct opening, ws_lock, ws_cpt and ws_nscheduled
 * declarations are on elided lines -- confirm against the full source. */
46 struct list_head ws_list; /* chain on global list */
47 /** serialised workitems */
49 /** where schedulers sleep */
50 wait_queue_head_t ws_waitq;
51 /** concurrent workitems */
52 struct list_head ws_runq;
53 /** rescheduled running-workitems, a workitem can be rescheduled
54 * while running in wi_action(), but we don't want to execute it again
55 * unless it returns from wi_action(), so we put it on ws_rerunq
56 * while rescheduling, and move it to runq after it returns
58 struct list_head ws_rerunq;
59 /** CPT-table for this scheduler */
60 struct cfs_cpt_table *ws_cptab;
61 /** CPT id for affinity */
63 /** number of scheduled workitems */
65 /** started scheduler thread, protected by cfs_wi_data::wi_glock */
66 unsigned int ws_nthreads:30;
67 /** shutting down, protected by cfs_wi_data::wi_glock */
68 unsigned int ws_stopping:1;
69 /** serialize starting thread, protected by cfs_wi_data::wi_glock */
70 unsigned int ws_starting:1;
/* human-readable scheduler name, used as the kthread name prefix below */
72 char ws_name[CFS_WS_NAME_LEN];
/* Module-global state shared by every scheduler instance.
 * NOTE(review): the wi_glock spinlock and the wi_init/wi_stopping flags
 * used throughout this file are declared on lines elided from this
 * excerpt -- confirm against the full source. */
75 static struct cfs_workitem_data {
78 /** list of all schedulers */
79 struct list_head wi_scheds;
80 /** WI module is initialized */
82 /** shutting down the whole WI module */
/* Decide, under ws_lock, whether a scheduler thread may go to sleep:
 * a scheduler that is stopping or still has queued work on ws_runq must
 * not sleep.  Used negated as the wait_event condition in
 * cfs_wi_scheduler().
 * NOTE(review): the return statements of the early-exit branches are on
 * elided lines -- the visible code only shows the lock/unlock pattern. */
87 cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
89 spin_lock(&sched->ws_lock);
90 if (sched->ws_stopping) {
91 spin_unlock(&sched->ws_lock);
95 if (!list_empty(&sched->ws_runq)) {
96 spin_unlock(&sched->ws_lock);
99 spin_unlock(&sched->ws_lock);
/* Permanently retire a workitem from within its own wi_action callback. */
104 * 0. it only works when called from wi->wi_action.
105 * 1. when it returns no one shall try to schedule the workitem.
108 cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
110 LASSERT(!in_interrupt()); /* because we use plain spinlock */
111 LASSERT(!sched->ws_stopping);
113 spin_lock(&sched->ws_lock);
/* must be called from wi_action, so the item is marked running */
115 LASSERT(wi->wi_running);
117 if (wi->wi_scheduled) { /* cancel pending schedules */
118 LASSERT(!list_empty(&wi->wi_list));
119 list_del_init(&wi->wi_list);
121 LASSERT(sched->ws_nscheduled > 0);
122 sched->ws_nscheduled--;
125 LASSERT(list_empty(&wi->wi_list));
/* leave wi_scheduled set forever so any later cfs_wi_schedule() call
 * trips its LASSERTs instead of silently re-queueing a dead item */
127 wi->wi_scheduled = 1; /* LBUG future schedule attempts */
128 spin_unlock(&sched->ws_lock);
130 EXPORT_SYMBOL(cfs_wi_exit);
133 * cancel schedule request of workitem \a wi
/* Returns non-zero when the cancel is guaranteed (item was not running),
 * zero when wi_action is already executing and may still run. */
136 cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
140 LASSERT(!in_interrupt()); /* because we use plain spinlock */
141 LASSERT(!sched->ws_stopping);
144 * return 0 if it's running already, otherwise return 1, which
145 * means the workitem will not be scheduled and will not have
146 * any race with wi_action.
148 spin_lock(&sched->ws_lock);
/* snapshot the running state under the lock; this is the return value */
150 rc = !(wi->wi_running);
152 if (wi->wi_scheduled) { /* cancel pending schedules */
153 LASSERT(!list_empty(&wi->wi_list));
154 list_del_init(&wi->wi_list);
156 LASSERT(sched->ws_nscheduled > 0);
157 sched->ws_nscheduled--;
159 wi->wi_scheduled = 0;
162 LASSERT (list_empty(&wi->wi_list));
164 spin_unlock(&sched->ws_lock);
167 EXPORT_SYMBOL(cfs_wi_deschedule);
170 * Workitem scheduled with (serial == 1) is strictly serialised not only with
171 * itself, but also with others scheduled this way.
173 * Now there's only one static serialised queue, but in the future more might
174 * be added, and even dynamic creation of serialised queues might be supported.
/* Queue \a wi for execution on \a sched.  Idempotent while already
 * scheduled (the wi_scheduled flag guards double-queueing). */
177 cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
179 LASSERT(!in_interrupt()); /* because we use plain spinlock */
180 LASSERT(!sched->ws_stopping);
182 spin_lock(&sched->ws_lock);
184 if (!wi->wi_scheduled) {
185 LASSERT (list_empty(&wi->wi_list));
187 wi->wi_scheduled = 1;
188 sched->ws_nscheduled++;
/* idle item: put it on the run queue and wake a worker thread;
 * an item currently inside wi_action goes to ws_rerunq instead and
 * is moved back to ws_runq once its action returns (see scheduler) */
189 if (!wi->wi_running) {
190 list_add_tail(&wi->wi_list, &sched->ws_runq);
191 wake_up(&sched->ws_waitq);
193 list_add(&wi->wi_list, &sched->ws_rerunq);
197 LASSERT (!list_empty(&wi->wi_list));
198 spin_unlock(&sched->ws_lock);
200 EXPORT_SYMBOL(cfs_wi_schedule);
/* Worker-thread entry point (kthread body).  Binds to its CPT when the
 * scheduler is CPT-affine, announces itself under wi_glock, then drains
 * ws_runq in batches of CFS_WI_RESCHED actions before yielding, and
 * sleeps on ws_waitq when there is no work.
 * NOTE(review): declarations of rc/nloops, the wi_running updates, and
 * several loop-control lines are elided from this excerpt. */
203 cfs_wi_scheduler(void *arg)
205 struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;
207 /* CPT affinity scheduler? */
208 if (sched->ws_cptab != NULL)
209 if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
210 CWARN("Unable to bind %s on CPU partition %d\n",
211 sched->ws_name, sched->ws_cpt);
/* register this thread: ws_starting -> ws_nthreads handshake with
 * cfs_wi_sched_create(), serialised by the global wi_glock */
213 spin_lock(&cfs_wi_data.wi_glock);
215 LASSERT(sched->ws_starting == 1);
216 sched->ws_starting--;
217 sched->ws_nthreads++;
219 spin_unlock(&cfs_wi_data.wi_glock);
221 spin_lock(&sched->ws_lock);
223 while (!sched->ws_stopping) {
226 struct cfs_workitem *wi;
/* run at most CFS_WI_RESCHED actions before re-checking fairness */
228 while (!list_empty(&sched->ws_runq) &&
229 nloops < CFS_WI_RESCHED) {
230 wi = list_entry(sched->ws_runq.next,
231 struct cfs_workitem, wi_list);
232 LASSERT(wi->wi_scheduled && !wi->wi_running);
234 list_del_init(&wi->wi_list);
236 LASSERT(sched->ws_nscheduled > 0);
237 sched->ws_nscheduled--;
240 wi->wi_scheduled = 0;
/* drop the lock across the (possibly long) user callback */
242 spin_unlock(&sched->ws_lock);
245 rc = (*wi->wi_action) (wi);
247 spin_lock(&sched->ws_lock);
248 if (rc != 0) /* WI should be dead, even be freed! */
252 if (list_empty(&wi->wi_list))
255 LASSERT(wi->wi_scheduled);
256 /* wi is rescheduled, should be on rerunq now, we
257 * move it to runq so it can run action now */
258 list_move_tail(&wi->wi_list, &sched->ws_runq);
261 if (!list_empty(&sched->ws_runq)) {
262 spin_unlock(&sched->ws_lock);
263 /* don't sleep because some workitems still
264 * expect me to come back soon */
266 spin_lock(&sched->ws_lock);
270 spin_unlock(&sched->ws_lock);
/* idle: sleep until cansleep() says there is work or a stop request */
271 rc = wait_event_interruptible_exclusive(sched->ws_waitq,
272 !cfs_wi_sched_cansleep(sched));
273 spin_lock(&sched->ws_lock);
276 spin_unlock(&sched->ws_lock);
/* deregister before exiting so sched_destroy() can observe us gone */
278 spin_lock(&cfs_wi_data.wi_glock);
279 sched->ws_nthreads--;
280 spin_unlock(&cfs_wi_data.wi_glock);
/* Tear down a scheduler: flag it stopping, wake all its worker threads,
 * wait (with periodic progress messages) until every thread has exited,
 * then unlink it from the global list and free it. */
286 cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
288 LASSERT(cfs_wi_data.wi_init);
289 LASSERT(!cfs_wi_data.wi_stopping);
291 spin_lock(&cfs_wi_data.wi_glock);
292 if (sched->ws_stopping) {
293 CDEBUG(D_INFO, "%s is in progress of stopping\n",
295 spin_unlock(&cfs_wi_data.wi_glock);
299 LASSERT(!list_empty(&sched->ws_list));
300 sched->ws_stopping = 1;
302 spin_unlock(&cfs_wi_data.wi_glock);
/* kick every sleeper so each thread sees ws_stopping and exits */
304 wake_up_all(&sched->ws_waitq);
306 spin_lock(&cfs_wi_data.wi_glock);
/* poll for thread exit; escalate the log level at power-of-two waits.
 * NOTE(review): the declaration/reset of `i` and the sleep interval's
 * closing expression are on elided lines -- confirm the /20 scaling
 * against the full source. */
310 while (sched->ws_nthreads > 0) {
311 CDEBUG(is_power_of_2(++i / 20) ? D_WARNING : D_NET,
312 "waiting %us for %d %s worker threads to exit\n",
313 i / 20, sched->ws_nthreads, sched->ws_name);
315 spin_unlock(&cfs_wi_data.wi_glock);
316 schedule_timeout_uninterruptible(cfs_time_seconds(1)
318 spin_lock(&cfs_wi_data.wi_glock);
322 list_del(&sched->ws_list);
324 spin_unlock(&cfs_wi_data.wi_glock);
/* all threads gone, so no work can remain queued */
326 LASSERT(sched->ws_nscheduled == 0);
328 LIBCFS_FREE(sched, sizeof(*sched));
330 EXPORT_SYMBOL(cfs_wi_sched_destroy);
/* Allocate and start a scheduler with \a nthrs worker threads, optionally
 * CPT-affine (\a cptab / \a cpt), returning it through \a sched_pp.
 * Thread startup is serialised via the ws_starting handshake with
 * cfs_wi_scheduler(). */
333 cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
334 int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
336 struct cfs_wi_sched *sched;
338 LASSERT(cfs_wi_data.wi_init);
339 LASSERT(!cfs_wi_data.wi_stopping);
340 LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
341 (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
343 LIBCFS_ALLOC(sched, sizeof(*sched));
/* reject names that would be truncated by the fixed-size ws_name */
347 if (strlen(name) > sizeof(sched->ws_name)-1) {
348 LIBCFS_FREE(sched, sizeof(*sched));
351 strlcpy(sched->ws_name, name, sizeof(sched->ws_name));
353 sched->ws_cptab = cptab;
356 spin_lock_init(&sched->ws_lock);
357 init_waitqueue_head(&sched->ws_waitq);
359 INIT_LIST_HEAD(&sched->ws_runq);
360 INIT_LIST_HEAD(&sched->ws_rerunq);
361 INIT_LIST_HEAD(&sched->ws_list);
363 for (; nthrs > 0; nthrs--) {
365 struct task_struct *task;
/* wait until the previously spawned thread has finished starting;
 * only one thread may be in the ws_starting state at a time */
367 spin_lock(&cfs_wi_data.wi_glock);
368 while (sched->ws_starting > 0) {
369 spin_unlock(&cfs_wi_data.wi_glock);
371 spin_lock(&cfs_wi_data.wi_glock);
374 sched->ws_starting++;
375 spin_unlock(&cfs_wi_data.wi_glock);
/* NOTE(review): `name` is a char * parameter, so sizeof(name) in the
 * snprintf calls below is the pointer size (4/8 bytes), not the buffer
 * length -- thread names get truncated and the caller's buffer size is
 * never known here.  Should format into a local char buf[] instead;
 * also, writing into the caller's `name` argument is a surprising
 * side effect worth confirming against the full source. */
377 if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
378 snprintf(name, sizeof(name), "%s_%02d_%02d",
379 sched->ws_name, sched->ws_cpt,
382 snprintf(name, sizeof(name), "%s_%02d",
383 sched->ws_name, sched->ws_nthreads);
386 task = kthread_run(cfs_wi_scheduler, sched, "%s", name);
388 int rc = PTR_ERR(task);
390 CERROR("Failed to create thread for "
391 "WI scheduler %s: %d\n", name, rc);
393 spin_lock(&cfs_wi_data.wi_glock);
395 /* make up for cfs_wi_sched_destroy */
396 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
397 sched->ws_starting--;
399 spin_unlock(&cfs_wi_data.wi_glock);
/* destroy reaps any threads that did start, then frees sched */
401 cfs_wi_sched_destroy(sched);
/* success: publish the scheduler on the global list */
406 spin_lock(&cfs_wi_data.wi_glock);
407 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
408 spin_unlock(&cfs_wi_data.wi_glock);
413 EXPORT_SYMBOL(cfs_wi_sched_create);
/* Module init: zero the global state, set up its lock and scheduler
 * list, and mark the workitem subsystem initialized.
 * NOTE(review): the function signature and return are on elided lines. */
418 memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data));
420 spin_lock_init(&cfs_wi_data.wi_glock);
421 INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
422 cfs_wi_data.wi_init = 1;
/* Module teardown: stop every scheduler, wait for all worker threads to
 * exit, then free the scheduler structures and clear the module flags. */
428 cfs_wi_shutdown (void)
430 struct cfs_wi_sched *sched;
432 spin_lock(&cfs_wi_data.wi_glock);
433 cfs_wi_data.wi_stopping = 1;
434 spin_unlock(&cfs_wi_data.wi_glock);
436 /* nobody should contend on this list */
/* pass 1: flag every scheduler stopping and wake its sleepers */
437 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
438 sched->ws_stopping = 1;
439 wake_up_all(&sched->ws_waitq);
/* pass 2: poll each scheduler until its last thread has exited */
442 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
443 spin_lock(&cfs_wi_data.wi_glock);
445 while (sched->ws_nthreads != 0) {
446 spin_unlock(&cfs_wi_data.wi_glock);
447 schedule_timeout_uninterruptible(cfs_time_seconds(1)
449 spin_lock(&cfs_wi_data.wi_glock);
451 spin_unlock(&cfs_wi_data.wi_glock);
/* pass 3: all threads gone, free every scheduler structure */
454 while (!list_empty(&cfs_wi_data.wi_scheds)) {
455 sched = list_entry(cfs_wi_data.wi_scheds.next,
456 struct cfs_wi_sched, ws_list);
457 list_del(&sched->ws_list);
458 LIBCFS_FREE(sched, sizeof(*sched));
461 cfs_wi_data.wi_stopping = 0;
462 cfs_wi_data.wi_init = 0;