Whamcloud - gitweb
obdfs/flushd.c: removed iput() from dequeue routine, as we no longer remove
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS Super operations - also used for Lustre file system
3  *
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
7  * Copyright (C) 1999 Seagate Technology Inc.
8  *
9  */
10 #define __NO_VERSION__
11 #include <linux/module.h>
12 #include <linux/sched.h>
13 #include <linux/fs.h>
14 #include <linux/malloc.h>
15 #include <linux/locks.h>
16 #include <linux/errno.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/vmalloc.h>
20 #include <linux/blkdev.h>
21 #include <linux/sysrq.h>
22 #include <linux/file.h>
23 #include <linux/init.h>
24 #include <linux/quotaops.h>
25 #include <linux/iobuf.h>
26 #include <linux/highmem.h>
27
28 #include <asm/uaccess.h>
29 #include <asm/io.h>
30 #include <asm/bitops.h>
31 #include <asm/mmu_context.h>
32
33 #include <linux/obd_support.h>
34 #include <linux/obd_class.h>
35 #include <linux/obdfs.h>
36
37
38 struct {
39         int nfract;  /* Percentage of buffer cache dirty to 
40                         activate bdflush */
41         int ndirty;  /* Maximum number of dirty blocks to write out per
42                         wake-cycle */
43         int nrefill; /* Number of clean buffers to try to obtain
44                                 each time we call refill */
45         int nref_dirt; /* Dirty buffer threshold for activating bdflush
46                           when trying to refill buffers. */
47         int interval; /* jiffies delay between kupdate flushes */
48         int age_buffer;  /* Time for normal buffer to age before we flush it */
49         int age_super;  /* Time for superblock to age before we flush it */
50 /* } pupd_prm = {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ };  */
51 } pupd_prm = {40, 500, 64, 256, 10*HZ, 30*HZ, 5*HZ }; 
52
53 /* Called with the superblock list lock */
54 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
55                                int nr_slots, struct page **pages, char **bufs,
56                                obd_size *counts, obd_off *offsets,
57                                obd_flag *flag, int check_time)
58 {
59         struct list_head *page_list = obdfs_iplist(inode);
60         struct list_head *tmp;
61         int num = 0;
62
63         ENTRY;
64         OIDEBUG(inode);
65
66         *obdo = obdo_fromid(IID(inode), inode->i_ino, OBD_MD_FLNOTOBD);
67         if ( IS_ERR(*obdo) ) {
68                 EXIT;
69                 return PTR_ERR(*obdo);
70         }
71
72         obdfs_from_inode(*obdo, inode); /* FIXME revisit fromid & from_inode */
73         *flag = OBD_BRW_CREATE;
74
75         tmp = page_list;
76         while ( ((tmp = tmp->next) != page_list) && (num < nr_slots) ) {
77                 struct obdfs_pgrq *req;
78                 struct page *page;
79                 
80                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
81                 page = req->rq_page;
82
83                 
84                 if (check_time && 
85                     (jiffies - req->rq_jiffies) < pupd_prm.age_buffer)
86                         continue;
87
88                 /* Remove request from list before write to avoid conflict.
89                  * Note that obdfs_pgrq_del() also deletes the request.
90                  */
91                 obdfs_pgrq_del(req);
92                 if ( !page ) {
93                         CDEBUG(D_INODE, "no page \n");
94                         continue;
95                 }
96
97                 bufs[num] = (char *)page_address(page);
98                 pages[num] = page;
99                 counts[num] = PAGE_SIZE;
100                 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
101                 CDEBUG(D_INODE, "ENQ inode %ld, page %p addr %p to vector\n", 
102                        inode->i_ino, page, (char *)page_address(page));
103                 num++;
104         }
105
106         if (!list_empty(page_list))
107                 CDEBUG(D_INODE, "inode %ld list not empty\n", inode->i_ino);
108         CDEBUG(D_INODE, "added %d page(s) to vector\n", num);
109
110         EXIT;
111         return num;  
112 } /* obdfs_enqueue_pages */
113
114 /* dequeue requests for a dying inode */
115 void obdfs_dequeue_reqs(struct inode *inode)
116 {
117
118         struct list_head *tmp;
119
120         obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
121         tmp = obdfs_islist(inode);
122         if ( list_empty(tmp) ) {
123                 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
124                 EXIT;
125                 return;
126         }
127
128         /* take it out of the super list */
129         list_del(tmp);
130         INIT_LIST_HEAD(obdfs_islist(inode));
131
132         tmp = obdfs_iplist(inode);
133         while ( (tmp = tmp->next) != obdfs_iplist(inode) ) {
134                 struct obdfs_pgrq *req;
135                 struct page *page;
136                 
137                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
138                 page = req->rq_page;
139                 /* take it out of the list and free */
140                 obdfs_pgrq_del(req);
141                 /* now put the page away */
142                 put_page(page);
143         }
144         obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
145 } /* obdfs_dequeue_reqs */
146
147 /* Remove writeback requests for the superblock */
148 int obdfs_flush_reqs(struct list_head *inode_list, int check_time)
149 {
150         struct list_head *tmp;
151         int               total_io = 0;
152         obd_count         num_io = 0;
153         obd_count         num_obdos = 0;
154         struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
155         struct page      *pages[MAX_IOVEC];     /* call put_page on these */
156         struct obdo      *obdos[MAX_IOVEC];
157         char             *bufs[MAX_IOVEC];
158         obd_size          counts[MAX_IOVEC];
159         obd_off           offsets[MAX_IOVEC];
160         obd_flag          flags[MAX_IOVEC];
161         obd_count         bufs_per_obdo[MAX_IOVEC];
162         int               err = 0;
163         struct obdfs_sb_info *sbi;
164
165
166         ENTRY;
167
168         if (!inode_list) {
169                 CDEBUG(D_INODE, "no list\n");
170                 EXIT;
171                 return 0;
172         }
173
174         sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
175
176         obd_down(&sbi->osi_list_mutex);
177         if ( list_empty(inode_list)) {
178                 CDEBUG(D_INODE, "list empty\n");
179                 obd_up(&sbi->osi_list_mutex);
180                 EXIT;
181                 return 0;
182         }
183
184         /* add each inode's dirty pages to a write vector, and write it */
185  again:
186         tmp = inode_list;
187         while ( (tmp = tmp->next) != inode_list && 
188                 total_io < pupd_prm.ndirty) {
189                 struct obdfs_inode_info *ii;
190                 struct inode *inode;
191                 int res;
192
193                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
194                 inode = list_entry(ii, struct inode, u);
195                 inodes[num_obdos] = inode;
196                 CDEBUG(D_INODE, "checking inode %ld pages\n", inode->i_ino);
197
198                 res = 1;
199
200                 /* Loop on this inode until we can't get more pages from it
201                  * (either no more pages, or the pages aren't old enough).
202                  * Make sure we reference "inode" and not "inodes[num_obdos]",
203                  * as num_obdos will change after the loop is run.
204                  */
205                 while (!list_empty(obdfs_iplist(inode)) && res &&
206                        total_io < pupd_prm.ndirty ) {
207                         res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
208                                                   MAX_IOVEC - num_io,
209                                                   &pages[num_io], &bufs[num_io],
210                                                   &counts[num_io],
211                                                   &offsets[num_io],
212                                                   &flags[num_obdos],
213                                                   check_time);
214                         CDEBUG(D_INODE, "FLUSHED inode %ld, pages flushed: %d\n", 
215                                inode->i_ino, res);
216                         if ( res < 0 ) {
217                                 obd_up(&sbi->osi_list_mutex);
218                                 err = res;
219                                 goto ERR;
220                         }
221                         
222                         num_io += res;
223                         total_io += res;
224                         bufs_per_obdo[num_obdos] = res;
225                         num_obdos++;
226
227                         if ( num_io == MAX_IOVEC ) {
228                                 obd_up(&sbi->osi_list_mutex);
229                                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
230                                                       obdos, bufs_per_obdo,
231                                                       pages, bufs, counts,
232                                                       offsets, flags);
233                                 if ( err ) {
234                                         EXIT;
235                                         goto ERR;
236                                 }
237                                 inodes[0] = inode;
238                                 num_io = 0;
239                                 num_obdos = 0;
240                                 obd_down(&sbi->osi_list_mutex);
241                                 goto again;
242                         }
243                 }
244         }
245
246         obd_up(&sbi->osi_list_mutex);
247
248         /* flush any remaining I/Os */
249         if ( num_io ) {
250                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
251                                       bufs_per_obdo, pages, bufs, counts,
252                                       offsets, flags);
253         }
254
255         /* Remove inode from superblock dirty list when no more pages.
256          * Make sure we don't point at the current inode with tmp
257          * when we re-init the list on the inode, or we will loop.
258          */
259         obd_down(&sbi->osi_list_mutex);
260         tmp = inode_list;
261         while ( (tmp = tmp->next) != inode_list ) {
262                 struct obdfs_inode_info *ii;
263                 struct inode *inode;
264
265                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
266                 inode = list_entry(ii, struct inode, u);
267                 CDEBUG(D_INODE, "checking inode %ld empty\n", inode->i_ino);
268                 if (list_empty(obdfs_iplist(inode))) {
269                         CDEBUG(D_INODE, "remove inode %ld from dirty list\n",
270                                inode->i_ino);
271                         tmp = tmp->prev;
272                         list_del(obdfs_islist(inode));
273                         iput(inode);
274                         INIT_LIST_HEAD(obdfs_islist(inode));
275                 }
276         }
277         obd_up(&sbi->osi_list_mutex);
278
279         CDEBUG(D_INODE, "flushed %d pages in total\n", total_io);
280         EXIT;
281 ERR:
282         return err;
283 } /* obdfs_remove_pages_from_cache */
284
285
286 void obdfs_flush_dirty_pages(int check_time)
287 {
288         struct list_head *sl;
289
290         ENTRY;
291         sl = &obdfs_super_list;
292         while ( (sl = sl->next) != &obdfs_super_list ) {
293                 struct obdfs_sb_info *sbi = 
294                         list_entry(sl, struct obdfs_sb_info, osi_list);
295
296                 /* walk write requests here, use the sb, check the time */
297                 obdfs_flush_reqs(&sbi->osi_inodes, check_time);
298         }
299         EXIT;
300 }
301
302
303 static struct task_struct *pupdated;
304
305 static int pupdate(void *unused) 
306 {
307         struct task_struct * tsk = current;
308         int interval;
309         
310         pupdated = current;
311
312         exit_files(current);
313         exit_mm(current);
314
315         tsk->session = 1;
316         tsk->pgrp = 1;
317         sprintf(tsk->comm, "pupdated");
318         pupdated = current;
319
320         MOD_INC_USE_COUNT;      /* XXX until send_sig works */
321         printk("pupdated activated...\n");
322
323         /* sigstop and sigcont will stop and wakeup pupdate */
324         spin_lock_irq(&tsk->sigmask_lock);
325         sigfillset(&tsk->blocked);
326         siginitsetinv(&tsk->blocked, sigmask(SIGTERM));
327         recalc_sigpending(tsk);
328         spin_unlock_irq(&tsk->sigmask_lock);
329
330         for (;;) {
331                 /* update interval */
332                 interval = pupd_prm.interval;
333                 if (interval)
334                 {
335                         tsk->state = TASK_INTERRUPTIBLE;
336                         schedule_timeout(interval);
337                 }
338                 else
339                 {
340                 stop_pupdate:
341                         tsk->state = TASK_STOPPED;
342                         MOD_DEC_USE_COUNT; /* XXX until send_sig works */
343                         printk("pupdated stopped...\n");
344                         return 0;
345                 }
346                 /* check for sigstop */
347                 if (signal_pending(tsk))
348                 {
349                         int stopped = 0;
350                         spin_lock_irq(&tsk->sigmask_lock);
351                         if (sigismember(&tsk->signal, SIGTERM))
352                         {
353                                 sigdelset(&tsk->signal, SIGTERM);
354                                 stopped = 1;
355                         }
356                         recalc_sigpending(tsk);
357                         spin_unlock_irq(&tsk->sigmask_lock);
358                         if (stopped)
359                                 goto stop_pupdate;
360                 }
361                 /* asynchronous setattr etc for the future ...
362                 flush_inodes();
363                  */
364                 /* we don't currently check the time on the pages
365                 obdfs_flush_dirty_pages(1); 
366                  */
367                 obdfs_flush_dirty_pages(0); 
368         }
369 }
370
371
372 int obdfs_flushd_init(void)
373 {
374         /*
375         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
376          */
377         kernel_thread(pupdate, NULL, 0);
378         CDEBUG(D_PSDEV, __FUNCTION__ ": flushd inited\n");
379         return 0;
380 }
381
382 int obdfs_flushd_cleanup(void)
383 {
384         ENTRY;
385         /* deliver a signal to pupdated to shut it down
386            XXX need to kill it from user space for now XXX
387         if (pupdated) {
388                 send_sig_info(SIGTERM, 1, pupdated);
389         }
390          */
391
392         EXIT;
393         /* not reached */
394         return 0;
395
396 }