Whamcloud - gitweb
obdfs/flushd.c: turned on page aging, and set pupdated to run more often
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS Super operations - also used for Lustre file system
3  *
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
7  * Copyright (C) 1999 Seagate Technology Inc.
8  *
9  */
10 #define __NO_VERSION__
11 #include <linux/module.h>
12 #include <linux/sched.h>
13 #include <linux/fs.h>
14 #include <linux/malloc.h>
15 #include <linux/locks.h>
16 #include <linux/errno.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/vmalloc.h>
20 #include <linux/blkdev.h>
21 #include <linux/sysrq.h>
22 #include <linux/file.h>
23 #include <linux/init.h>
24 #include <linux/quotaops.h>
25 #include <linux/iobuf.h>
26 #include <linux/highmem.h>
27
28 #include <asm/uaccess.h>
29 #include <asm/io.h>
30 #include <asm/bitops.h>
31 #include <asm/mmu_context.h>
32
33 #include <linux/obd_support.h>
34 #include <linux/obd_class.h>
35 #include <linux/obdfs.h>
36
37
38 struct {
39         int nfract;  /* Percentage of buffer cache dirty to 
40                         activate bdflush */
41         int ndirty;  /* Maximum number of dirty blocks to write out per
42                         wake-cycle */
43         int nrefill; /* Number of clean buffers to try to obtain
44                                 each time we call refill */
45         int nref_dirt; /* Dirty buffer threshold for activating bdflush
46                           when trying to refill buffers. */
47         int interval; /* jiffies delay between pupdate flushes */
48         int age_buffer;  /* Time for normal buffer to age before we flush it */
49         int age_super;  /* Time for superblock to age before we flush it */
50 } pupd_prm = {40, 500, 64, 256, 2*HZ, 30*HZ, 5*HZ };
51
52 /* Called with the superblock list lock */
53 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
54                                int nr_slots, struct page **pages, char **bufs,
55                                obd_size *counts, obd_off *offsets,
56                                obd_flag *flag, int check_time)
57 {
58         struct list_head *page_list = obdfs_iplist(inode);
59         struct list_head *tmp;
60         int num = 0;
61
62         ENTRY;
63         OIDEBUG(inode);
64
65         *obdo = obdo_fromid(IID(inode), inode->i_ino, OBD_MD_FLNOTOBD);
66         if ( IS_ERR(*obdo) ) {
67                 EXIT;
68                 return PTR_ERR(*obdo);
69         }
70
71         obdfs_from_inode(*obdo, inode); /* FIXME revisit fromid & from_inode */
72         *flag = OBD_BRW_CREATE;
73
74         tmp = page_list;
75         while ( ((tmp = tmp->next) != page_list) && (num < nr_slots) ) {
76                 struct obdfs_pgrq *req;
77                 struct page *page;
78                 
79                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
80                 page = req->rq_page;
81
82                 
83                 if (check_time && 
84                     (jiffies - req->rq_jiffies) < pupd_prm.age_buffer)
85                         continue;
86
87                 /* Remove request from list before write to avoid conflict.
88                  * Note that obdfs_pgrq_del() also deletes the request.
89                  */
90                 obdfs_pgrq_del(req);
91                 if ( !page ) {
92                         CDEBUG(D_INODE, "no page \n");
93                         continue;
94                 }
95
96                 bufs[num] = (char *)page_address(page);
97                 pages[num] = page;
98                 counts[num] = PAGE_SIZE;
99                 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
100                 CDEBUG(D_INODE, "ENQ inode %ld, page %p addr %p to vector\n", 
101                        inode->i_ino, page, (char *)page_address(page));
102                 num++;
103         }
104
105         if (!list_empty(page_list))
106                 CDEBUG(D_INODE, "inode %ld list not empty\n", inode->i_ino);
107         CDEBUG(D_INODE, "added %d page(s) to vector\n", num);
108
109         EXIT;
110         return num;  
111 } /* obdfs_enqueue_pages */
112
113 /* Remove writeback requests for the superblock */
114 int obdfs_flush_reqs(struct list_head *inode_list, int check_time)
115 {
116         struct list_head *tmp;
117         int               total_io = 0;
118         obd_count         num_io = 0;
119         obd_count         num_obdos = 0;
120         struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
121         struct page      *pages[MAX_IOVEC];     /* call put_page on these */
122         struct obdo      *obdos[MAX_IOVEC];
123         char             *bufs[MAX_IOVEC];
124         obd_size          counts[MAX_IOVEC];
125         obd_off           offsets[MAX_IOVEC];
126         obd_flag          flags[MAX_IOVEC];
127         obd_count         bufs_per_obdo[MAX_IOVEC];
128         int               err = 0;
129         struct obdfs_sb_info *sbi;
130
131         ENTRY;
132         if (!inode_list) {
133                 CDEBUG(D_INODE, "no list\n");
134                 EXIT;
135                 return 0;
136         }
137
138         sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
139
140         obd_down(&sbi->osi_list_mutex);
141         if ( list_empty(inode_list)) {
142                 CDEBUG(D_INODE, "list empty\n");
143                 obd_up(&sbi->osi_list_mutex);
144                 EXIT;
145                 return 0;
146         }
147
148         /* add each inode's dirty pages to a write vector, and write it */
149  again:
150         tmp = inode_list;
151         while ( (tmp = tmp->next) != inode_list && 
152                 total_io < pupd_prm.ndirty) {
153                 struct obdfs_inode_info *ii;
154                 struct inode *inode;
155                 int res;
156
157                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
158                 inode = list_entry(ii, struct inode, u);
159                 inodes[num_obdos] = inode;
160                 CDEBUG(D_INODE, "checking inode %ld pages\n", inode->i_ino);
161
162                 res = 1;
163
164                 /* Loop on this inode until we can't get more pages from it
165                  * (either no more pages, or the pages aren't old enough).
166                  * Make sure we reference "inode" and not "inodes[num_obdos]",
167                  * as num_obdos will change after the loop is run.
168                  */
169                 while (!list_empty(obdfs_iplist(inode)) && res &&
170                        total_io < pupd_prm.ndirty ) {
171                         res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
172                                                   MAX_IOVEC - num_io,
173                                                   &pages[num_io], &bufs[num_io],
174                                                   &counts[num_io],
175                                                   &offsets[num_io],
176                                                   &flags[num_obdos],
177                                                   check_time);
178                         CDEBUG(D_INODE, "FLUSHED inode %ld, pages flushed: %d\n", 
179                                inode->i_ino, res);
180                         if ( res < 0 ) {
181                                 obd_up(&sbi->osi_list_mutex);
182                                 err = res;
183                                 goto ERR;
184                         }
185                         
186                         num_io += res;
187                         total_io += res;
188                         bufs_per_obdo[num_obdos] = res;
189                         num_obdos++;
190
191                         if ( num_io == MAX_IOVEC ) {
192                                 obd_up(&sbi->osi_list_mutex);
193                                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
194                                                       obdos, bufs_per_obdo,
195                                                       pages, bufs, counts,
196                                                       offsets, flags);
197                                 if ( err ) {
198                                         EXIT;
199                                         goto ERR;
200                                 }
201                                 inodes[0] = inode;
202                                 num_io = 0;
203                                 num_obdos = 0;
204                                 obd_down(&sbi->osi_list_mutex);
205                                 goto again;
206                         }
207                 }
208         }
209
210         obd_up(&sbi->osi_list_mutex);
211
212         /* flush any remaining I/Os */
213         if ( num_io ) {
214                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
215                                       bufs_per_obdo, pages, bufs, counts,
216                                       offsets, flags);
217         }
218
219         /* Remove inode from superblock dirty list when no more pages.
220          * Make sure we don't point at the current inode with tmp
221          * when we re-init the list on the inode, or we will loop.
222          */
223         obd_down(&sbi->osi_list_mutex);
224         tmp = inode_list;
225         while ( (tmp = tmp->next) != inode_list ) {
226                 struct obdfs_inode_info *ii;
227                 struct inode *inode;
228
229                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
230                 inode = list_entry(ii, struct inode, u);
231                 CDEBUG(D_INODE, "checking inode %ld empty\n", inode->i_ino);
232                 if (list_empty(obdfs_iplist(inode))) {
233                         CDEBUG(D_INODE, "remove inode %ld from dirty list\n",
234                                inode->i_ino);
235                         tmp = tmp->prev;
236                         list_del(obdfs_islist(inode));
237                         /* decrement inode reference for page cache */
238                         inode->i_count--;
239                         INIT_LIST_HEAD(obdfs_islist(inode));
240                 }
241         }
242         obd_up(&sbi->osi_list_mutex);
243
244         CDEBUG(D_INODE, "flushed %d pages in total\n", total_io);
245         EXIT;
246 ERR:
247         return err;
248 } /* obdfs_flush_reqs */
249
250
251 void obdfs_flush_dirty_pages(int check_time)
252 {
253         struct list_head *sl;
254
255         ENTRY;
256         sl = &obdfs_super_list;
257         while ( (sl = sl->next) != &obdfs_super_list ) {
258                 struct obdfs_sb_info *sbi = 
259                         list_entry(sl, struct obdfs_sb_info, osi_list);
260
261                 /* walk write requests here, use the sb, check the time */
262                 obdfs_flush_reqs(&sbi->osi_inodes, check_time);
263         }
264         EXIT;
265 } /* obdfs_flush_dirty_pages */
266
267
268 static struct task_struct *pupdated;
269
270 static int pupdate(void *unused) 
271 {
272         struct task_struct * tsk = current;
273         int interval;
274         
275         pupdated = current;
276
277         exit_files(current);
278         exit_mm(current);
279
280         tsk->session = 1;
281         tsk->pgrp = 1;
282         sprintf(tsk->comm, "pupdated");
283         pupdated = current;
284
285         MOD_INC_USE_COUNT;      /* XXX until send_sig works */
286         printk("pupdated activated...\n");
287
288         /* sigstop and sigcont will stop and wakeup pupdate */
289         spin_lock_irq(&tsk->sigmask_lock);
290         sigfillset(&tsk->blocked);
291         siginitsetinv(&tsk->blocked, sigmask(SIGTERM));
292         recalc_sigpending(tsk);
293         spin_unlock_irq(&tsk->sigmask_lock);
294
295         for (;;) {
296                 /* update interval */
297                 interval = pupd_prm.interval;
298                 if (interval)
299                 {
300                         tsk->state = TASK_INTERRUPTIBLE;
301                         schedule_timeout(interval);
302                 }
303                 else
304                 {
305                 stop_pupdate:
306                         tsk->state = TASK_STOPPED;
307                         MOD_DEC_USE_COUNT; /* XXX until send_sig works */
308                         printk("pupdated stopped...\n");
309                         return 0;
310                 }
311                 /* check for sigstop */
312                 if (signal_pending(tsk))
313                 {
314                         int stopped = 0;
315                         spin_lock_irq(&tsk->sigmask_lock);
316                         if (sigismember(&tsk->signal, SIGTERM))
317                         {
318                                 sigdelset(&tsk->signal, SIGTERM);
319                                 stopped = 1;
320                         }
321                         recalc_sigpending(tsk);
322                         spin_unlock_irq(&tsk->sigmask_lock);
323                         if (stopped)
324                                 goto stop_pupdate;
325                 }
326                 /* asynchronous setattr etc for the future ...
327                 flush_inodes();
328                  */
329                 obdfs_flush_dirty_pages(1); 
330         }
331 }
332
333
334 int obdfs_flushd_init(void)
335 {
336         /*
337         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
338          */
339         kernel_thread(pupdate, NULL, 0);
340         CDEBUG(D_PSDEV, __FUNCTION__ ": flushd inited\n");
341         return 0;
342 }
343
344 int obdfs_flushd_cleanup(void)
345 {
346         ENTRY;
347         /* deliver a signal to pupdated to shut it down
348            XXX need to kill it from user space for now XXX
349         if (pupdated) {
350                 send_sig_info(SIGTERM, 1, pupdated);
351         }
352          */
353
354         EXIT;
355         /* not reached */
356         return 0;
357
358 }