Whamcloud - gitweb
c090a0c1dd74afd694432667d308d0a144df76e1
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS Super operations - also used for Lustre file system
3  *
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
7  * Copyright (C) 1999 Seagate Technology Inc.
8  *
9  */
10 #define __NO_VERSION__
11 #include <linux/module.h>
12 #include <linux/sched.h>
13 #include <linux/fs.h>
14 #include <linux/malloc.h>
15 #include <linux/locks.h>
16 #include <linux/errno.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/vmalloc.h>
20 #include <linux/blkdev.h>
21 #include <linux/sysrq.h>
22 #include <linux/file.h>
23 #include <linux/init.h>
24 #include <linux/quotaops.h>
25 #include <linux/iobuf.h>
26 #include <linux/highmem.h>
27
28 #include <asm/uaccess.h>
29 #include <asm/io.h>
30 #include <asm/bitops.h>
31 #include <asm/mmu_context.h>
32
33 #include <linux/obd_support.h>
34 #include <linux/obd_class.h>
35 #include <linux/obdfs.h>
36
37
38
/*
 * Tunables for the pupdate flush daemon.  The layout follows the
 * kernel's bdflush parameter block; within this file only ndirty,
 * interval and age_buffer are consulted.  An earlier tuning used a
 * 5*HZ interval instead of 10*HZ.
 */
struct {
        /* Percentage of buffer cache dirty to activate bdflush */
        int nfract;
        /* Maximum number of dirty blocks to write out per wake-cycle */
        int ndirty;
        /* Number of clean buffers to try to obtain each time we call refill */
        int nrefill;
        /* Dirty buffer threshold for activating bdflush when trying to
         * refill buffers. */
        int nref_dirt;
        /* jiffies delay between kupdate flushes */
        int interval;
        /* Time for normal buffer to age before we flush it */
        int age_buffer;
        /* Time for superblock to age before we flush it */
        int age_super;
} pupd_prm = {
        .nfract     = 40,
        .ndirty     = 500,
        .nrefill    = 64,
        .nref_dirt  = 256,
        .interval   = 10*HZ,
        .age_buffer = 30*HZ,
        .age_super  = 5*HZ,
};
53
54
55 /* Called with the superblock list lock */
56 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
57                                int nr_slots, struct page **pages, char **bufs,
58                                obd_size *counts, obd_off *offsets,
59                                obd_flag *flag, int check_time)
60 {
61         struct list_head *page_list = obdfs_iplist(inode);
62         struct list_head *tmp;
63         int num = 0;
64
65         ENTRY;
66         OIDEBUG(inode);
67
68         *obdo = obdo_fromid(IID(inode), inode->i_ino, OBD_MD_FLNOTOBD);
69         if ( IS_ERR(*obdo) ) {
70                 EXIT;
71                 return PTR_ERR(*obdo);
72         }
73
74         obdfs_from_inode(*obdo, inode); /* FIXME revisit fromid & from_inode */
75         *flag = OBD_BRW_CREATE;
76
77         tmp = page_list;
78         while ( ((tmp = tmp->next) != page_list) && (num < nr_slots) ) {
79                 struct obdfs_pgrq *req;
80                 struct page *page;
81                 
82                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
83                 page = req->rq_page;
84
85                 
86                 if (check_time && 
87                     (jiffies - req->rq_jiffies) < pupd_prm.age_buffer)
88                         continue;
89
90                 /* Remove request from list before write to avoid conflict.
91                  * Note that obdfs_pgrq_del() also deletes the request.
92                  */
93                 obdfs_pgrq_del(req);
94                 if ( !page ) {
95                         CDEBUG(D_INODE, "no page \n");
96                         continue;
97                 }
98
99                 bufs[num] = (char *)page_address(page);
100                 pages[num] = page;
101                 counts[num] = PAGE_SIZE;
102                 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
103                 CDEBUG(D_INODE, "ENQ inode %ld, page %p addr %p to vector\n", 
104                        inode->i_ino, page, (char *)page_address(page));
105                 num++;
106         }
107
108         if (!list_empty(page_list))
109                 CDEBUG(D_INODE, "inode %ld list not empty\n", inode->i_ino);
110         CDEBUG(D_INODE, "added %d page(s) to vector\n", num);
111
112         EXIT;
113         return num;  
114 }
115
116 /* dequeue requests for a dying inode */
117 void obdfs_dequeue_reqs(struct inode *inode)
118 {
119
120         struct list_head *tmp;
121
122         obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
123         tmp = obdfs_islist(inode);
124         if ( list_empty(tmp) ) {
125                 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
126                 EXIT;
127                 return;
128         }
129
130         /* take it out of the super list */
131         list_del(tmp);
132         INIT_LIST_HEAD(obdfs_islist(inode));
133
134         tmp = obdfs_iplist(inode);
135         while ( (tmp = tmp->next) != obdfs_iplist(inode) ) {
136                 struct obdfs_pgrq *req;
137                 struct page *page;
138                 
139                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
140                 page = req->rq_page;
141                 /* take it out of the list and free */
142                 obdfs_pgrq_del(req);
143                 /* now put the page away */
144                 put_page(page);
145         }
146         iput(inode);
147         obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
148 }
149
150 /* Remove writeback requests for the superblock */
151 int obdfs_flush_reqs(struct list_head *inode_list, int check_time)
152 {
153         struct list_head *tmp;
154         int               total_io = 0;
155         obd_count         num_io = 0;
156         obd_count         num_obdos = 0;
157         struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
158         struct page      *pages[MAX_IOVEC];     /* call put_page on these */
159         struct obdo      *obdos[MAX_IOVEC];
160         char             *bufs[MAX_IOVEC];
161         obd_size          counts[MAX_IOVEC];
162         obd_off           offsets[MAX_IOVEC];
163         obd_flag          flags[MAX_IOVEC];
164         obd_count         bufs_per_obdo[MAX_IOVEC];
165         int               err = 0;
166         struct obdfs_sb_info *sbi;
167
168
169         ENTRY;
170
171         if (!inode_list) {
172                 CDEBUG(D_INODE, "no list\n");
173                 EXIT;
174                 return 0;
175         }
176
177         sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
178
179         obd_down(&sbi->osi_list_mutex);
180         if ( list_empty(inode_list)) {
181                 CDEBUG(D_INODE, "list empty\n");
182                 obd_up(&sbi->osi_list_mutex);
183                 EXIT;
184                 return 0;
185         }
186
187         /* add each inode's dirty pages to a write vector, and write it */
188  again:
189         tmp = inode_list;
190         while ( (tmp = tmp->next) != inode_list && 
191                 total_io < pupd_prm.ndirty) {
192                 struct obdfs_inode_info *ii;
193                 struct inode *inode;
194                 int res;
195
196                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
197                 inode = list_entry(ii, struct inode, u);
198                 inodes[num_obdos] = inode;
199                 CDEBUG(D_INODE, "checking inode %ld pages\n", inode->i_ino);
200
201                 res = 1;
202
203                 /* Loop on this inode until we can't get more pages from it
204                  * (either no more pages, or the pages aren't old enough).
205                  * Make sure we reference "inode" and not "inodes[num_obdos]",
206                  * as num_obdos will change after the loop is run.
207                  */
208                 while (!list_empty(obdfs_iplist(inode)) && res &&
209                        total_io < pupd_prm.ndirty ) {
210                         res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
211                                                   MAX_IOVEC - num_io,
212                                                   &pages[num_io], &bufs[num_io],
213                                                   &counts[num_io],
214                                                   &offsets[num_io],
215                                                   &flags[num_obdos], 1);
216                         CDEBUG(D_INODE, "FLUSHED inode %ld, pages flushed: %d\n", 
217                                inode->i_ino, res);
218                         if ( res < 0 ) {
219                                 obd_up(&sbi->osi_list_mutex);
220                                 err = res;
221                                 goto ERR;
222                         }
223                         
224                         num_io += res;
225                         total_io += res;
226                         bufs_per_obdo[num_obdos] = res;
227                         num_obdos++;
228
229                         if ( num_io == MAX_IOVEC ) {
230                                 obd_up(&sbi->osi_list_mutex);
231                                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
232                                                       obdos, bufs_per_obdo,
233                                                       pages, bufs, counts,
234                                                       offsets, flags);
235                                 if ( err ) {
236                                         EXIT;
237                                         goto ERR;
238                                 }
239                                 inodes[0] = inode;
240                                 num_io = 0;
241                                 num_obdos = 0;
242                                 obd_down(&sbi->osi_list_mutex);
243                                 goto again;
244                         }
245                 }
246         }
247
248         obd_up(&sbi->osi_list_mutex);
249
250         /* flush any remaining I/Os */
251         if ( num_io ) {
252                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
253                                       bufs_per_obdo, pages, bufs, counts,
254                                       offsets, flags);
255         }
256
257         /* Remove inode from superblock dirty list when no more pages.
258          * Make sure we don't point at the current inode with tmp
259          * when we re-init the list on the inode, or we will loop.
260          */
261         obd_down(&sbi->osi_list_mutex);
262         tmp = inode_list;
263         while ( (tmp = tmp->next) != inode_list ) {
264                 struct obdfs_inode_info *ii;
265                 struct inode *inode;
266
267                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
268                 inode = list_entry(ii, struct inode, u);
269                 CDEBUG(D_INODE, "checking inode %ld empty\n", inode->i_ino);
270                 if (list_empty(obdfs_iplist(inode))) {
271                         CDEBUG(D_INODE, "remove inode %ld from dirty list\n",
272                                inode->i_ino);
273                         tmp = tmp->prev;
274                         list_del(obdfs_islist(inode));
275                         iput(inode);
276                         INIT_LIST_HEAD(obdfs_islist(inode));
277                 }
278         }
279         obd_up(&sbi->osi_list_mutex);
280
281         CDEBUG(D_INODE, "flushed %d pages in total\n", total_io);
282         EXIT;
283 ERR:
284         return err;
285 } /* obdfs_remove_pages_from_cache */
286
287
288 void obdfs_flush_dirty_pages(int check_time)
289 {
290         struct list_head *sl;
291
292         sl = &obdfs_super_list;
293         while ( (sl = sl->next) != &obdfs_super_list ) {
294                 struct obdfs_sb_info *sbi = 
295                         list_entry(sl, struct obdfs_sb_info, osi_list);
296
297                 /* walk write requests here, use the sb, check the time */
298                 obdfs_flush_reqs(&sbi->osi_inodes, 0);
299         }
300
301 }
302
303
304 static struct task_struct *pupdated;
305
306
307 static int pupdate(void *unused) 
308 {
309         struct task_struct * tsk = current;
310         int interval;
311         
312         pupdated = current;
313
314         exit_files(current);
315         exit_mm(current);
316
317         tsk->session = 1;
318         tsk->pgrp = 1;
319         sprintf(tsk->comm, "pupdated");
320         pupdated = current;
321
322         printk("pupdated activated...\n");
323
324         /* sigstop and sigcont will stop and wakeup pupdate */
325         spin_lock_irq(&tsk->sigmask_lock);
326         sigfillset(&tsk->blocked);
327         siginitsetinv(&tsk->blocked, sigmask(SIGTERM));
328         recalc_sigpending(tsk);
329         spin_unlock_irq(&tsk->sigmask_lock);
330
331         for (;;) {
332                 /* update interval */
333                 interval = pupd_prm.interval;
334                 if (interval)
335                 {
336                         tsk->state = TASK_INTERRUPTIBLE;
337                         schedule_timeout(interval);
338                 }
339                 else
340                 {
341                 stop_pupdate:
342                         obdfs_flush_dirty_pages(0);
343                         tsk->state = TASK_STOPPED;
344                         /* MOD_DEC_USE_COUNT; */
345                         printk("pupdated stopped...\n");
346                         return 0;
347                 }
348                 /* check for sigstop */
349                 if (signal_pending(tsk))
350                 {
351                         int stopped = 0;
352                         spin_lock_irq(&tsk->sigmask_lock);
353                         if (sigismember(&tsk->signal, SIGTERM))
354                         {
355                                 sigdelset(&tsk->signal, SIGTERM);
356                                 stopped = 1;
357                         }
358                         recalc_sigpending(tsk);
359                         spin_unlock_irq(&tsk->sigmask_lock);
360                         if (stopped)
361                                 goto stop_pupdate;
362                 }
363                 /* asynchronous setattr etc for the future ... */
364                 /* flush_inodes(); */
365                 obdfs_flush_dirty_pages(1); 
366         }
367 }
368
369
370 int flushd_init(void)
371 {
372         /*
373         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
374          */
375         /* MOD_INC_USE_COUNT; */
376         kernel_thread(pupdate, NULL, 0);
377         printk("flushd inited\n");
378         return 0;
379 }
380
/*
 * Shut down the flush daemon - currently a placeholder that does
 * nothing and always returns 0.
 *
 * XXX Andreas, we will do this later; for now, you must kill pupdated
 * with a SIGTERM from userland before unloading obdfs.o.
 *
 * Eventually this should deliver SIGTERM to the pupdated task and let
 * it run at least once, so that it performs its final flush, before
 * continuing.
 */
int flushd_cleanup(void)
{
        return 0;
}