Whamcloud - gitweb
Several bugfixes. Most notably ext2obd_brw is still totally broken,
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS Super operations - also used for Lustre file system
3  *
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
7  * Copyright (C) 1999 Seagate Technology Inc.
8  *
9  */
10 #define __NO_VERSION__
11 #include <linux/module.h>
12 #include <linux/sched.h>
13 #include <linux/fs.h>
14 #include <linux/malloc.h>
15 #include <linux/locks.h>
16 #include <linux/errno.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/vmalloc.h>
20 #include <linux/blkdev.h>
21 #include <linux/sysrq.h>
22 #include <linux/file.h>
23 #include <linux/init.h>
24 #include <linux/quotaops.h>
25 #include <linux/iobuf.h>
26 #include <linux/highmem.h>
27
28 #include <asm/uaccess.h>
29 #include <asm/io.h>
30 #include <asm/bitops.h>
31 #include <asm/mmu_context.h>
32
33 #include <linux/obd_support.h>
34 #include <linux/obd_class.h>
35 #include <linux/obdfs.h>
36
37
38
39 struct {
40         int nfract;  /* Percentage of buffer cache dirty to 
41                         activate bdflush */
42         int ndirty;  /* Maximum number of dirty blocks to write out per
43                         wake-cycle */
44         int nrefill; /* Number of clean buffers to try to obtain
45                                 each time we call refill */
46         int nref_dirt; /* Dirty buffer threshold for activating bdflush
47                           when trying to refill buffers. */
48         int interval; /* jiffies delay between kupdate flushes */
49         int age_buffer;  /* Time for normal buffer to age before we flush it */
50         int age_super;  /* Time for superblock to age before we flush it */
51 /* } pupd_prm = {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ };  */
52 } pupd_prm = {40, 500, 64, 256, 10*HZ, 30*HZ, 5*HZ }; 
53
#if 0
/*
 * Currently compiled out.  Keeps retrying TryLockPage() until it returns
 * zero, calling ___wait_on_page() after every failed attempt.
 */
static void obdfs_lock_page(struct page *page)
{
        for (;;) {
                if (!TryLockPage(page))
                        break;
                ___wait_on_page(page);
        }
}
#endif
61
62 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
63                                int nr_slots, struct page **pages, char **bufs,
64                                obd_size *counts, obd_off *offsets,
65                                obd_flag *flag, int check_time)
66 {
67         struct list_head *page_list = obdfs_iplist(inode);
68         struct list_head *tmp;
69         int num = 0;
70
71         ENTRY;
72         OIDEBUG(inode);
73
74         *obdo = obdo_fromid(IID(inode), inode->i_ino, OBD_MD_FLNOTOBD);
75         if ( IS_ERR(*obdo) ) {
76                 EXIT;
77                 return PTR_ERR(*obdo);
78         }
79
80         obdfs_from_inode(*obdo, inode); /* FIXME revisit fromid & from_inode */
81         *flag = OBD_BRW_CREATE;
82
83         tmp = page_list;
84         while ( ((tmp = tmp->next) != page_list) && (num < nr_slots) ) {
85                 struct obdfs_pgrq *req;
86                 struct page *page;
87                 
88                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
89                 page = req->rq_page;
90
91                 
92                 if (check_time && 
93                     (jiffies - req->rq_jiffies) < pupd_prm.age_buffer)
94                         continue;
95
96                 /* Remove request from list before write to avoid conflict.
97                  * Note that obdfs_pgrq_del() also deletes the request.
98                  */
99                 obdfs_pgrq_del(req);
100                 /* 
101                 obdfs_lock_page(page);
102                 */
103                 if ( !page ) {
104                         CDEBUG(D_INODE, "no page \n");
105                         continue;
106                 }
107
108                 bufs[num] = (char *)page_address(page);
109                 pages[num] = page;
110                 counts[num] = PAGE_SIZE;
111                 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
112                 CDEBUG(D_INODE, "ENQ inode %ld, page %p addr %p to vector\n", 
113                        inode->i_ino, page, (char *)page_address(page));
114                 num++;
115         }
116
117         if (!list_empty(page_list))
118                 CDEBUG(D_INODE, "inode %ld list not empty\n", inode->i_ino);
119         CDEBUG(D_INODE, "added %d page(s) to vector\n", num);
120
121         EXIT;
122         return num;  
123 }
124
125 /* dequeue requests for a dying inode */
126 void obdfs_dequeue_reqs(struct inode *inode)
127 {
128
129         struct list_head *tmp;
130
131         tmp = obdfs_islist(inode);
132         if ( list_empty(tmp) ) {
133                 EXIT;
134                 return;
135         }
136
137         /* take it out of the super list */
138         list_del(tmp);
139         INIT_LIST_HEAD(obdfs_islist(inode));
140
141         tmp = obdfs_iplist(inode);
142         while ( (tmp = tmp->next) != obdfs_iplist(inode) ) {
143                 struct obdfs_pgrq *req;
144                 struct page *page;
145                 
146                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
147                 page = req->rq_page;
148                 /* take it out of the list and free */
149                 obdfs_pgrq_del(req);
150                 /* now put the page away */
151                 put_page(page);
152         }
153
154 }
155
156
157 /* Remove writeback requests for the superblock */
158 int obdfs_flush_reqs(struct list_head *inode_list, int check_time)
159 {
160         struct list_head *tmp = inode_list;
161         int               total_io = 0;
162         obd_count         num_io = 0;
163         obd_count         num_obdos = 0;
164         struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
165         struct page      *pages[MAX_IOVEC];     /* call put_page on these */
166         struct obdo      *obdos[MAX_IOVEC];
167         char             *bufs[MAX_IOVEC];
168         obd_size          counts[MAX_IOVEC];
169         obd_off           offsets[MAX_IOVEC];
170         obd_flag          flags[MAX_IOVEC];
171         obd_count         bufs_per_obdo[MAX_IOVEC];
172         int               err = 0;
173
174         ENTRY;
175
176         if (!inode_list) {
177                 CDEBUG(D_INODE, "no list\n");
178                 EXIT;
179                 return 0;
180         }
181
182         if ( list_empty(inode_list)) {
183                 CDEBUG(D_INODE, "list empty\n");
184                 EXIT;
185                 return 0;
186         }
187
188         /* add each inode's outstanding pages to a write vector, and write it */
189         while ( (tmp = tmp->next) != inode_list && total_io < pupd_prm.ndirty) {
190                 struct obdfs_inode_info *ii;
191                 struct inode *inode;
192                 int res;
193
194                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
195                 inode = list_entry(ii, struct inode, u);
196                 inodes[num_obdos] = inode;
197
198                 res = 1;
199
200                 /* Loop on this inode until we can't get more pages from it
201                  * (either no more pages, or the pages aren't old enough).
202                  * Make sure we reference "inode" and not "inodes[num_obdos]",
203                  * as num_obdos will change after the loop is run.
204                  */
205                 while (!list_empty(obdfs_iplist(inode)) && res &&
206                        total_io < pupd_prm.ndirty ) {
207                         res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
208                                                   MAX_IOVEC - num_io,
209                                                   &pages[num_io], &bufs[num_io],
210                                                   &counts[num_io],
211                                                   &offsets[num_io],
212                                                   &flags[num_obdos],1);
213                         CDEBUG(D_INODE, "FLUSHED inode %ld, pages flushed: %d\n", 
214                                inode->i_ino, res);
215                         if ( res < 0 ) {
216                                 err = res;
217                                 goto ERR;
218                         }
219                         
220                         num_io += res;
221                         total_io += res;
222                         bufs_per_obdo[num_obdos] = res;
223                         num_obdos++;
224
225                         if ( num_io == MAX_IOVEC ) {
226                                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
227                                                       obdos, bufs_per_obdo,
228                                                       pages, bufs, counts,
229                                                       offsets, flags);
230                                 if ( err ) {
231                                         EXIT;
232                                         goto ERR;
233                                 }
234                                 inodes[0] = inode;
235                                 num_io = 0;
236                                 num_obdos = 0;
237                         }
238                 }
239
240                 /* Remove inode from superblock dirty list when no more pages.
241                  * Make sure we don't point at the current inode with tmp
242                  * when we re-init the list on the inode, or we will loop.
243                  */
244                 if (list_empty(obdfs_iplist(inode))) {
245                         CDEBUG(D_INODE, "remove inode %ld from dirty list\n",
246                                inode->i_ino);
247                         tmp = tmp->prev;
248                         list_del(obdfs_islist(inode));
249                         INIT_LIST_HEAD(obdfs_islist(inode));
250                 }
251         }
252
253         /* flush any remaining I/Os */
254         if ( num_io ) {
255                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
256                                       bufs_per_obdo, pages, bufs, counts,
257                                       offsets, flags);
258         }
259         CDEBUG(D_INODE, "flushed %d pages in total\n", total_io);
260         EXIT;
261 ERR:
262         return err;
263 } /* obdfs_remove_pages_from_cache */
264
265
266 void obdfs_flush_dirty_pages(int check_time)
267 {
268         struct list_head *sl;
269
270         sl = &obdfs_super_list;
271         while ( (sl = sl->next) != &obdfs_super_list ) {
272                 struct obdfs_sb_info *sbi = 
273                         list_entry(sl, struct obdfs_sb_info, osi_list);
274
275                 /* walk write requests here, use the sb, check the time */
276                 obdfs_flush_reqs(&sbi->osi_inodes, 0);
277         }
278
279 }
280
281
282 static struct task_struct *pupdated;
283
284
285 static int pupdate(void *unused) 
286 {
287         struct task_struct * tsk = current;
288         int interval;
289         
290         pupdated = current;
291
292         exit_files(current);
293         exit_mm(current);
294
295         tsk->session = 1;
296         tsk->pgrp = 1;
297         sprintf(tsk->comm, "pupdated");
298         pupdated = current;
299
300         printk("pupdated activated...\n");
301
302         /* sigstop and sigcont will stop and wakeup pupdate */
303         spin_lock_irq(&tsk->sigmask_lock);
304         sigfillset(&tsk->blocked);
305         siginitsetinv(&tsk->blocked, sigmask(SIGTERM));
306         recalc_sigpending(tsk);
307         spin_unlock_irq(&tsk->sigmask_lock);
308
309         for (;;) {
310                 /* update interval */
311                 interval = pupd_prm.interval;
312                 if (interval)
313                 {
314                         tsk->state = TASK_INTERRUPTIBLE;
315                         schedule_timeout(interval);
316                 }
317                 else
318                 {
319                 stop_pupdate:
320                         obdfs_flush_dirty_pages(0);
321                         tsk->state = TASK_STOPPED;
322                         /* MOD_DEC_USE_COUNT; */
323                         printk("pupdated stopped...\n");
324                         return 0;
325                 }
326                 /* check for sigstop */
327                 if (signal_pending(tsk))
328                 {
329                         int stopped = 0;
330                         spin_lock_irq(&tsk->sigmask_lock);
331                         if (sigismember(&tsk->signal, SIGTERM))
332                         {
333                                 sigdelset(&tsk->signal, SIGTERM);
334                                 stopped = 1;
335                         }
336                         recalc_sigpending(tsk);
337                         spin_unlock_irq(&tsk->sigmask_lock);
338                         if (stopped)
339                                 goto stop_pupdate;
340                 }
341                 /* asynchronous setattr etc for the future ... */
342                 /* flush_inodes(); */
343                 obdfs_flush_dirty_pages(1); 
344         }
345 }
346
347
348 int flushd_init(void)
349 {
350         /*
351         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
352          */
353         /* MOD_INC_USE_COUNT; */
354         kernel_thread(pupdate, NULL, 0);
355         printk("flushd inited\n");
356         return 0;
357 }
358
359 int flushd_cleanup(void)
360 {
361         /* this should deliver a signal to */
362         
363
364         /* XXX Andreas, we will do this later, for now, you must kill
365            pupdated with a SIGTERM from userland, before unloading obdfs.o
366         */
367         if (pupdated) {
368                 /* then let it run at least once, before continuing */
369
370                 /* XXX need to do something like this here:
371                 send_sig(SIGTERM, current, 0);
372                  */
373                 1;
374                 /*obdfs_flush_dirty_pages(0); */
375         }
376
377         /* not reached */
378         return 0;
379
380 }