Whamcloud - gitweb
flush daemon debugging/testing
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS flush daemon (periodic page writeback) - also used for Lustre file system
3  *
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
7  * Copyright (C) 1999 Seagate Technology Inc.
8  *
9  */
10 #define __NO_VERSION__
11 #include <linux/module.h>
12 #include <linux/sched.h>
13 #include <linux/fs.h>
14 #include <linux/malloc.h>
15 #include <linux/locks.h>
16 #include <linux/errno.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/vmalloc.h>
20 #include <linux/blkdev.h>
21 #include <linux/sysrq.h>
22 #include <linux/file.h>
23 #include <linux/init.h>
24 #include <linux/quotaops.h>
25 #include <linux/iobuf.h>
26 #include <linux/highmem.h>
27
28 #include <asm/uaccess.h>
29 #include <asm/io.h>
30 #include <asm/bitops.h>
31 #include <asm/mmu_context.h>
32
33 #include <linux/obd_support.h>
34 #include <linux/obd_class.h>
35 #include <linux/obdfs.h>
36
37
38
39 struct {
40         int nfract;  /* Percentage of buffer cache dirty to 
41                         activate bdflush */
42         int ndirty;  /* Maximum number of dirty blocks to write out per
43                         wake-cycle */
44         int nrefill; /* Number of clean buffers to try to obtain
45                                 each time we call refill */
46         int nref_dirt; /* Dirty buffer threshold for activating bdflush
47                           when trying to refill buffers. */
48         int interval; /* jiffies delay between kupdate flushes */
49         int age_buffer;  /* Time for normal buffer to age before we flush it */
50         int age_super;  /* Time for superblock to age before we flush it */
51 /* } pupd_prm = {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ };  */
52 } pupd_prm = {40, 500, 64, 256, 10*HZ, 30*HZ, 5*HZ }; 
53
54
55 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
56                                int nr_slots, struct page **pages, char **bufs,
57                                obd_size *counts, obd_off *offsets,
58                                obd_flag *flag, int check_time)
59 {
60         struct list_head *page_list = obdfs_iplist(inode);
61         struct list_head *tmp;
62         int num = 0;
63
64         ENTRY;
65         OIDEBUG(inode);
66
67         *obdo = obdo_fromid(IID(inode), inode->i_ino, OBD_MD_FLNOTOBD);
68         if ( IS_ERR(*obdo) ) {
69                 EXIT;
70                 return PTR_ERR(*obdo);
71         }
72
73         obdfs_from_inode(*obdo, inode); /* FIXME revisit fromid & from_inode */
74         *flag = OBD_BRW_CREATE;
75
76         tmp = page_list;
77         while ( (tmp = tmp->next) != page_list && (num < nr_slots) ) {
78                 struct obdfs_pgrq *req;
79                 struct page *page;
80                 
81                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
82                 page = req->rq_page;
83
84                 if (check_time && 
85                     (jiffies - req->rq_jiffies) < pupd_prm.age_buffer)
86                         continue;
87
88                 /* Remove request from list before write to avoid conflict.
89                  * Note that obdfs_pgrq_del() also deletes the request.
90                  */
91                 obdfs_pgrq_del(req);
92                 
93                 if ( !page ) {
94                         CDEBUG(D_INODE, "no page \n");
95                         continue;
96                 }
97
98                 CDEBUG(D_INODE, "adding page %p to vector\n", page);
99                 bufs[num] = (char *)page_address(page);
100                 pages[num] = page;
101                 counts[num] = PAGE_SIZE;
102                 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
103                 num++;
104         }
105
106         if (!list_empty(page_list))
107                 CDEBUG(D_INODE, "inode %ld list not empty\n", inode->i_ino);
108         CDEBUG(D_INODE, "added %d page(s) to vector\n", num);
109
110         EXIT;
111         return num;  
112 }
113
114
115 /* Remove writeback requests for the superblock */
116 int obdfs_flush_reqs(struct list_head *inode_list, int flush_inode,
117                      int check_time)
118 {
119         struct list_head *tmp = inode_list;
120         int               total_io = 0;
121         obd_count         num_io = 0;
122         obd_count         num_obdos = 0;
123         struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
124         struct page      *pages[MAX_IOVEC];     /* call put_page on these */
125         struct obdo      *obdos[MAX_IOVEC];
126         char             *bufs[MAX_IOVEC];
127         obd_size          counts[MAX_IOVEC];
128         obd_off           offsets[MAX_IOVEC];
129         obd_flag          flags[MAX_IOVEC];
130         obd_count         bufs_per_obdo[MAX_IOVEC];
131         int               err = 0;
132
133         ENTRY;
134
135         if (!inode_list) {
136                 CDEBUG(D_INODE, "no list\n");
137                 EXIT;
138                 return 0;
139         }
140
141         if ( list_empty(inode_list)) {
142                 CDEBUG(D_INODE, "list empty\n");
143                 EXIT;
144                 return 0;
145         }
146
147         /* add each inode's outstanding pages to a write vector, and write it */
148         while ( (tmp = tmp->next) != inode_list && total_io < pupd_prm.ndirty) {
149                 struct obdfs_inode_info *ii;
150                 struct inode *inode;
151                 int res;
152
153                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
154                 inode = list_entry(ii, struct inode, u);
155                 inodes[num_obdos] = inode;
156
157                 res = 1;
158
159                 /* Loop on this inode until we can't get more pages from it
160                  * (either no more pages, or the pages aren't old enough).
161                  * Make sure we reference "inode" and not "inodes[num_obdos]",
162                  * as num_obdos will change after the loop is run.
163                  */
164                 while (!list_empty(obdfs_iplist(inode)) && res &&
165                        total_io < pupd_prm.ndirty ) {
166                         res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
167                                                   MAX_IOVEC - num_io,
168                                                   &pages[num_io], &bufs[num_io],
169                                                   &counts[num_io],
170                                                   &offsets[num_io],
171                                                   &flags[num_obdos],1);
172                         if ( res < 0 ) {
173                                 err = res;
174                                 goto ERR;
175                         }
176                         
177                         num_io += res;
178                         total_io += res;
179                         bufs_per_obdo[num_obdos] = res;
180                         num_obdos++;
181
182                         if ( num_io == MAX_IOVEC ) {
183                                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
184                                                       obdos, bufs_per_obdo,
185                                                       pages, bufs, counts,
186                                                       offsets, flags);
187                                 if ( err ) {
188                                         EXIT;
189                                         goto ERR;
190                                 }
191                                 inodes[0] = inode;
192                                 num_io = 0;
193                                 num_obdos = 0;
194                         }
195                 }
196
197                 /* Remove inode from superblock dirty list when no more pages.
198                  * Make sure we don't point at the current inode with tmp
199                  * when we re-init the list on the inode, or we will loop.
200                  */
201                 if (list_empty(obdfs_iplist(inode))) {
202                         CDEBUG(D_INODE, "remove inode %ld from dirty list\n",
203                                inode->i_ino);
204                         tmp = tmp->prev;
205                         list_del(obdfs_islist(inode));
206                         INIT_LIST_HEAD(obdfs_islist(inode));
207                 }
208         }
209
210         /* flush any remaining I/Os */
211         if ( num_io ) {
212                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
213                                       bufs_per_obdo, pages, bufs, counts,
214                                       offsets, flags);
215         }
216         CDEBUG(D_INODE, "flushed %d pages in total\n", total_io);
217         EXIT;
218 ERR:
219         return err;
220 } /* obdfs_remove_pages_from_cache */
221
222
223 static void obdfs_flush_dirty_pages(int check_time)
224 {
225         struct list_head *sl;
226
227         sl = &obdfs_super_list;
228         while ( (sl = sl->next) != &obdfs_super_list ) {
229                 struct obdfs_sb_info *sbi = 
230                         list_entry(sl, struct obdfs_sb_info, osi_list);
231
232                 /* walk write requests here, use the sb, check the time */
233                 obdfs_flush_reqs(&sbi->osi_inodes, 0, 1);
234         }
235
236 #if 0
237         /* again, but now we wait for completion */
238         sl = &obdfs_super_list;
239         while ( (sl = sl->next) != &obdfs_super_list ) {
240                 struct obdfs_sb_info *sbi = 
241                         list_entry(sl, struct obdfs_sb_info, sl_chain);
242
243                 /* walk write requests here */
244                 obdfs_flush_reqs(&sbi->osi_pages, 0, check_time);
245         }
246 #endif
247 }
248
249
250 static struct task_struct *pupdated;
251
252
253 static int pupdate(void *unused) 
254 {
255         struct task_struct * tsk = current;
256         int interval;
257         
258         pupdated = current;
259
260         exit_files(current);
261         exit_mm(current);
262
263         tsk->session = 1;
264         tsk->pgrp = 1;
265         sprintf(tsk->comm, "pupdated");
266         pupdated = current;
267
268         printk("pupdated activated...\n");
269
270         /* sigstop and sigcont will stop and wakeup pupdate */
271         spin_lock_irq(&tsk->sigmask_lock);
272         sigfillset(&tsk->blocked);
273         siginitsetinv(&tsk->blocked, sigmask(SIGTERM));
274         recalc_sigpending(tsk);
275         spin_unlock_irq(&tsk->sigmask_lock);
276
277         for (;;) {
278                 /* update interval */
279                 interval = pupd_prm.interval;
280                 if (interval)
281                 {
282                         tsk->state = TASK_INTERRUPTIBLE;
283                         schedule_timeout(interval);
284                 }
285                 else
286                 {
287                 stop_pupdate:
288                         tsk->state = TASK_STOPPED;
289                         /* MOD_DEC_USE_COUNT; */
290                         printk("pupdated stopped...\n");
291                         return 0;
292                 }
293                 /* check for sigstop */
294                 if (signal_pending(tsk))
295                 {
296                         int stopped = 0;
297                         spin_lock_irq(&tsk->sigmask_lock);
298                         if (sigismember(&tsk->signal, SIGTERM))
299                         {
300                                 sigdelset(&tsk->signal, SIGTERM);
301                                 stopped = 1;
302                         }
303                         recalc_sigpending(tsk);
304                         spin_unlock_irq(&tsk->sigmask_lock);
305                         if (stopped)
306                                 goto stop_pupdate;
307                 }
308                 /* asynchronous setattr etc for the future ... */
309                 /* flush_inodes(); */
310                 obdfs_flush_dirty_pages(1);
311         }
312 }
313
314
315 int flushd_init(void)
316 {
317         /*
318         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
319          */
320         /* MOD_INC_USE_COUNT; */
321         kernel_thread(pupdate, NULL, 0);
322         printk("flushd inited\n");
323         return 0;
324 }
325
326 int flushd_cleanup(void)
327 {
328         /* this should deliver a signal to */
329         
330
331         /* XXX Andreas, we will do this later, for now, you must kill
332            pupdated with a SIGTERM from userland, before unloading obdfs.o
333         */
334         if (pupdated) {
335                 /* then let it run at least once, before continuing */
336
337                 /* XXX need to do something like this here:
338                 send_sig(SIGTERM, current, 0);
339                  */
340                 1;
341         }
342
343         /* not reached */
344         return 0;
345
346 }