Whamcloud - gitweb
obdfs/flushd.c: pupdated stopped on module unload.
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS Super operations - also used for Lustre file system
3  *
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
7  * Copyright (C) 1999 Seagate Technology Inc.
8  *
9  */
10 #define __NO_VERSION__
11 #include <linux/module.h>
12 #include <linux/sched.h>
13 #include <linux/fs.h>
14 #include <linux/malloc.h>
15 #include <linux/locks.h>
16 #include <linux/errno.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/vmalloc.h>
20 #include <linux/blkdev.h>
21 #include <linux/sysrq.h>
22 #include <linux/file.h>
23 #include <linux/init.h>
24 #include <linux/quotaops.h>
25 #include <linux/iobuf.h>
26 #include <linux/highmem.h>
27
28 #include <asm/uaccess.h>
29 #include <asm/io.h>
30 #include <asm/bitops.h>
31 #include <asm/mmu_context.h>
32
33 #include <linux/obd_support.h>
34 #include <linux/obd_class.h>
35 #include <linux/obdfs.h>
36
37
/* Tunables for the pupdated daemon, modelled on bdflush's bdf_prm table.
 * Initializer order below: nfract=40, ndirty=500, nrefill=64, nref_dirt=256,
 * interval=5s, age_buffer=30s, age_super=5s.
 */
struct {
        int nfract;  /* Percentage of buffer cache dirty to 
                        activate bdflush */
        int ndirty;  /* Maximum number of dirty blocks to write out per
                        wake-cycle */
        int nrefill; /* Number of clean buffers to try to obtain
                                each time we call refill */
        int nref_dirt; /* Dirty buffer threshold for activating bdflush
                          when trying to refill buffers. */
        int interval; /* jiffies delay between pupdate flushes */
        int age_buffer;  /* Time for normal buffer to age before we flush it */
        int age_super;  /* Time for superblock to age before we flush it */
} pupd_prm = {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ };
51
/* Called with the superblock list lock */
/* Gather up to nr_slots of inode's oldest dirty page requests into the
 * caller's parallel I/O vectors (pages/bufs/counts/offsets).
 *
 * *obdo is allocated lazily on the first page actually queued, so no obdo
 * is created for an inode with no eligible pages; on allocation failure the
 * PTR_ERR code is returned and *obdo is reset to NULL.
 *
 * Returns the number of pages placed in the vectors (possibly 0), or a
 * negative error code.  Each queued request is removed from the inode's
 * page list (obdfs_pgrq_del() also frees the request structure).
 */
static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
                               int nr_slots, struct page **pages, char **bufs,
                               obd_size *counts, obd_off *offsets,
                               obd_flag *flag, unsigned long check_time)
{
        struct list_head *page_list = obdfs_iplist(inode);
        struct list_head *tmp;
        int num = 0;

        ENTRY;

        tmp = page_list;
        /* Traverse list in reverse order, so we do FIFO, not LIFO order */
        while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
                struct obdfs_pgrq *req;
                struct page *page;
                
                req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
                page = req->rq_page;

                
                /* Requests newer than check_time are not flushed yet. */
                if (req->rq_jiffies > check_time)
                        break;          /* pages are in chronological order */

                /* Only allocate the obdo if we will actually do I/O here */
                if ( !*obdo ) {
                        OIDEBUG(inode);
                        *obdo = obdo_fromid(IID(inode), inode->i_ino,
                                            OBD_MD_FLNOTOBD);
                        if ( IS_ERR(*obdo) ) {
                                int err = PTR_ERR(*obdo);
                                *obdo = NULL;

                                EXIT;
                                return err;
                        }

                        /* FIXME revisit fromid & from_inode */
                        obdfs_from_inode(*obdo, inode);
                        *flag = OBD_BRW_CREATE;
                }

                /* Remove request from list before write to avoid conflict.
                 * Note that obdfs_pgrq_del() also deletes the request.
                 * NOTE(review): tmp still points at the unlinked entry when
                 * the loop advances via tmp->prev — this relies on list_del()
                 * leaving the removed node's pointers intact; confirm against
                 * the target kernel's list implementation.
                 */
                obdfs_pgrq_del(req);
                if ( !page ) {
                        CDEBUG(D_CACHE, "no page \n");
                        continue;
                }

                /* Fill the parallel vectors: one PAGE_SIZE chunk at the
                 * page's byte offset within the object.
                 */
                bufs[num] = (char *)page_address(page);
                pages[num] = page;
                counts[num] = PAGE_SIZE;
                offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
                CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n", 
                       inode->i_ino, page, (char *)page_address(page));
                num++;
        }

        if (!list_empty(page_list))
                CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino);
        CDEBUG(D_INFO, "added %d page(s) to vector\n", num);

        EXIT;
        return num;  
} /* obdfs_enqueue_pages */
120
121 /* Remove writeback requests for the superblock */
122 int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time)
123 {
124         struct list_head *tmp;
125         int               total_io = 0;
126         obd_count         num_io;
127         obd_count         num_obdos;
128         struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
129         struct page      *pages[MAX_IOVEC];     /* call put_page on these */
130         struct obdo      *obdos[MAX_IOVEC];
131         char             *bufs[MAX_IOVEC];
132         obd_size          counts[MAX_IOVEC];
133         obd_off           offsets[MAX_IOVEC];
134         obd_flag          flags[MAX_IOVEC];
135         obd_count         bufs_per_obdo[MAX_IOVEC];
136         int               err = 0;
137         struct obdfs_sb_info *sbi;
138
139         ENTRY;
140         if (!inode_list) {
141                 CDEBUG(D_INODE, "no list\n");
142                 EXIT;
143                 return 0;
144         }
145
146         sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
147
148         obd_down(&sbi->osi_list_mutex);
149         if ( list_empty(inode_list) ) {
150                 CDEBUG(D_CACHE, "list empty: memory %ld, inodes %d, pages %d\n",
151                        obd_memory, obd_inodes, obd_pages);
152                 obd_up(&sbi->osi_list_mutex);
153                 EXIT;
154                 return 0;
155         }
156
157         /* Add each inode's dirty pages to a write vector, and write it.
158          * Traverse list in reverse order, so we do FIFO, not LIFO order
159          */
160  again:
161         tmp = inode_list;
162         num_io = 0;
163         num_obdos = 0;
164         while ( (tmp = tmp->prev) != inode_list && total_io < pupd_prm.ndirty) {
165                 struct obdfs_inode_info *ii;
166                 struct inode *inode;
167                 int res;
168
169                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
170                 inode = list_entry(ii, struct inode, u);
171                 inodes[num_obdos] = inode;
172                 obdos[num_obdos] = NULL;
173                 CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);
174
175                 /* Make sure we reference "inode" and not "inodes[num_obdos]",
176                  * as num_obdos will change after the loop is run.
177                  */
178                 if (!list_empty(obdfs_iplist(inode))) {
179                         res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
180                                                   MAX_IOVEC - num_io,
181                                                   &pages[num_io], &bufs[num_io],
182                                                   &counts[num_io],
183                                                   &offsets[num_io],
184                                                   &flags[num_obdos],
185                                                   check_time);
186                         CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n",
187                                inode->i_ino, res);
188                         if ( res < 0 ) {
189                                 CDEBUG(D_INODE,
190                                        "fatal: unable to enqueue inode %ld (err %d)\n",
191                                        inode->i_ino, err);
192                                 /* XXX Move bad inode to end of list so we can
193                                  * continue with flushing list.  This is a
194                                  * temporary measure to avoid machine lockups.
195                                  */
196                                 list_del(tmp);
197                                 list_add(tmp, inode_list);
198                                 err = res;
199                                 EXIT;
200                                 goto BREAK;
201                         } else if (res) {
202                                 num_io += res;
203                                 total_io += res;
204                                 bufs_per_obdo[num_obdos] = res;
205                                 num_obdos++;
206                         }
207
208                         if ( num_io == MAX_IOVEC ) {
209                                 obd_up(&sbi->osi_list_mutex);
210                                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
211                                                       obdos, bufs_per_obdo,
212                                                       pages, bufs, counts,
213                                                       offsets, flags);
214                                 if ( err ) {
215                                         CDEBUG(D_INODE,
216                                                 "fatal: unable to do vec_wr (err %d)\n", err);
217                                         EXIT;
218                                         goto ERR;
219                                 }
220                                 obd_down(&sbi->osi_list_mutex);
221                                 goto again;
222                         }
223                 }
224         }
225
226 BREAK:
227         obd_up(&sbi->osi_list_mutex);
228
229         /* flush any remaining I/Os */
230         if ( num_io ) {
231                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
232                                       bufs_per_obdo, pages, bufs, counts,
233                                       offsets, flags);
234                 if (err)
235                         CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n", err);
236                 num_io = 0;
237                 num_obdos = 0;
238         }
239
240         /* Remove inode from superblock dirty list when no more pages.
241          * Make sure we don't point at the current inode with tmp
242          * when we re-init the list on the inode, or we will loop.
243          */
244         obd_down(&sbi->osi_list_mutex);
245         tmp = inode_list;
246         while ( (tmp = tmp->prev) != inode_list ) {
247                 struct obdfs_inode_info *ii;
248                 struct inode *inode;
249
250                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
251                 inode = list_entry(ii, struct inode, u);
252                 CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
253                 if (list_empty(obdfs_iplist(inode))) {
254                         CDEBUG(D_INFO, "remove inode %ld from dirty list\n",
255                                inode->i_ino);
256                         tmp = tmp->next;
257                         list_del(obdfs_islist(inode));
258                         /* decrement inode reference for page cache */
259                         inode->i_count--;
260                         INIT_LIST_HEAD(obdfs_islist(inode));
261                 }
262         }
263         obd_up(&sbi->osi_list_mutex);
264
265         CDEBUG(D_INFO, "flushed %d pages in total\n", total_io);
266         EXIT;
267 ERR:
268         return err ? err : total_io;
269 } /* obdfs_flush_reqs */
270
271
272 /* Walk all of the superblocks and write out blocks which are too old.
273  * Return the maximum number of blocks written for a single filesystem.
274  */
275 int obdfs_flush_dirty_pages(unsigned long check_time)
276 {
277         struct list_head *sl;
278         int max = 0;
279
280         ENTRY;
281         sl = &obdfs_super_list;
282         while ( (sl = sl->prev) != &obdfs_super_list ) {
283                 struct obdfs_sb_info *sbi = 
284                         list_entry(sl, struct obdfs_sb_info, osi_list);
285                 int ret;
286
287                 /* walk write requests here, use the sb, check the time */
288                 ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time);
289                 /* XXX handle error?  What to do with it? */
290
291                 max = ret > max ? ret : max;
292         }
293         EXIT;
294         return max;
295 } /* obdfs_flush_dirty_pages */
296
297
298 static struct task_struct *pupdated;
299
/* Kernel-thread body for the "pupdated" daemon: periodically flushes aged
 * dirty pages, adapting its sleep interval and page-age threshold to cache
 * pressure (same feedback scheme as bdflush/kupdate).
 *
 * Detaches from the spawning process's files and mm, blocks every signal
 * except SIGTERM, and loops until SIGTERM is delivered (see
 * obdfs_flushd_cleanup), at which point it clears the global `pupdated`
 * pointer and returns 0.
 */
static int pupdate(void *unused) 
{
        int interval = pupd_prm.interval;
        long age = pupd_prm.age_buffer;
        int wrote = 0;
        
        /* Daemonize: release inherited open files and user address space. */
        exit_files(current);
        exit_mm(current);

        pupdated = current;
        pupdated->session = 1;
        pupdated->pgrp = 1;
        strcpy(pupdated->comm, "pupdated");

        printk("pupdated activated...\n");

        /* Block all signals except SIGTERM, our shutdown request. */
        spin_lock_irq(&pupdated->sigmask_lock);
        sigfillset(&pupdated->blocked);
        siginitsetinv(&pupdated->blocked, sigmask(SIGTERM));
        recalc_sigpending(pupdated);
        spin_unlock_irq(&pupdated->sigmask_lock);

        for (;;) {
                long dirty_limit;

                /* update interval */
                /* interval == 0 means "behind on writeback, don't sleep". */
                if (interval) {
                        set_task_state(pupdated, TASK_INTERRUPTIBLE);
                        schedule_timeout(interval);
                }
                if (signal_pending(pupdated))
                {
                        int stopped = 0;
                        spin_lock_irq(&pupdated->sigmask_lock);
                        if (sigismember(&pupdated->signal, SIGTERM))
                        {
                                sigdelset(&pupdated->signal, SIGTERM);
                                stopped = 1;
                        }
                        recalc_sigpending(pupdated);
                        spin_unlock_irq(&pupdated->sigmask_lock);
                        if (stopped) {
                                /* Clear the global before returning so the
                                 * cleanup path's wait loop can terminate.
                                 */
                                printk("pupdated stopped...\n");
                                set_task_state(pupdated, TASK_STOPPED);
                                pupdated = NULL;
                                return 0;
                        }
                }
                /* asynchronous setattr etc for the future ...
                obdfs_flush_dirty_inodes(jiffies - pupd_prm.age_super);
                 */
                dirty_limit = nr_free_buffer_pages() * pupd_prm.nfract / 100;
                CDEBUG(D_CACHE, "dirty_limit %ld, cache_count %ld\n",
                       dirty_limit, obdfs_cache_count);

                if (obdfs_cache_count > dirty_limit) {
                        /* Over the limit: flush again immediately and halve
                         * the age threshold if we couldn't keep up.
                         */
                        interval = 0;
                        if ( wrote < pupd_prm.ndirty )
                                age >>= 1;
                } else {
                        int isave = interval;
                        /* NOTE(review): asave is int but age is long --
                         * potential narrowing; harmless for the default
                         * HZ-scale values but confirm.
                         */
                        int asave = age;

                        if ( wrote < pupd_prm.ndirty >> 1 )
                                interval = pupd_prm.interval;
                        else
                                interval = isave >> 1;

                        if (obdfs_cache_count > dirty_limit / 3) {
                                age = asave >> 1;
                                interval = isave >> 1;
                        } else
                                age = pupd_prm.age_buffer;
                }

                CDEBUG(D_CACHE, "age %ld, interval %d\n", age, interval);
                wrote = obdfs_flush_dirty_pages(jiffies - age);
        }

}
380
381
382 int obdfs_flushd_init(void)
383 {
384         /*
385         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
386          */
387         kernel_thread(pupdate, NULL, 0);
388         CDEBUG(D_PSDEV, __FUNCTION__ ": flushd inited\n");
389         return 0;
390 }
391
392 int obdfs_flushd_cleanup(void)
393 {
394         ENTRY;
395
396         if (pupdated) /* for debugging purposes only */
397                 CDEBUG(D_CACHE, "pupdated->state = %lx\n", pupdated->state);
398
399         /* deliver a signal to pupdated to shut it down */
400         if (pupdated && (pupdated->state == TASK_RUNNING ||
401                          pupdated->state == TASK_INTERRUPTIBLE )) {
402                 unsigned long timeout = HZ/20;
403                 unsigned long count = 0;
404                 send_sig_info(SIGTERM, (struct siginfo *)1, pupdated);
405                 while (pupdated) {
406                         if ((count % 2*HZ) == timeout)
407                                 printk(KERN_INFO "wait for pupdated to stop\n");
408                         count += timeout;
409                         set_current_state(TASK_INTERRUPTIBLE);
410                         schedule_timeout(timeout);
411                 }
412         }
413
414         EXIT;
415         /* not reached */
416         return 0;
417
418 }