Whamcloud - gitweb
Reworked to have a dirty inode list on superblock, dirty pages per inode.
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS Super operations - also used for Lustre file system
3  *
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
7  * Copyright (C) 1999 Seagate Technology Inc.
8  *
9  */
10 #define __NO_VERSION__
11 #include <linux/module.h>
12 #include <linux/sched.h>
13 #include <linux/fs.h>
14 #include <linux/malloc.h>
15 #include <linux/locks.h>
16 #include <linux/errno.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/vmalloc.h>
20 #include <linux/blkdev.h>
21 #include <linux/sysrq.h>
22 #include <linux/file.h>
23 #include <linux/init.h>
24 #include <linux/quotaops.h>
25 #include <linux/iobuf.h>
26 #include <linux/highmem.h>
27
28 #include <asm/uaccess.h>
29 #include <asm/io.h>
30 #include <asm/bitops.h>
31 #include <asm/mmu_context.h>
32
33 #include <linux/obd_support.h>
34 #include <linux/obd_class.h>
35 #include <linux/obdfs.h>
36
37
38
39 struct {
40         int nfract;  /* Percentage of buffer cache dirty to 
41                         activate bdflush */
42         int ndirty;  /* Maximum number of dirty blocks to write out per
43                         wake-cycle */
44         int nrefill; /* Number of clean buffers to try to obtain
45                                 each time we call refill */
46         int nref_dirt; /* Dirty buffer threshold for activating bdflush
47                           when trying to refill buffers. */
48         int interval; /* jiffies delay between kupdate flushes */
49         int age_buffer;  /* Time for normal buffer to age before we flush it */
50         int age_super;  /* Time for superblock to age before we flush it */
51 } pupd_prm = {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ }; 
52
53
54 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
55                                int nr_slots, struct page **pages, char **bufs,
56                                obd_size *counts, obd_off *offsets,
57                                obd_flag *flag, int check_time)
58 {
59         struct list_head *page_list = obdfs_iplist(inode);
60         struct list_head *tmp;
61         int i = 0;
62
63         ENTRY;
64         if (list_empty(obdfs_iplist(inode))) {
65                 list_del(obdfs_islist(inode));
66                 CDEBUG(D_INODE, "empty list\n");
67                 EXIT;
68                 return 0;
69         }
70
71         *obdo = obdo_fromid(IID(inode), inode->i_ino, OBD_MD_FLNOTOBD);
72         if ( IS_ERR(*obdo) ) {
73                 EXIT;
74                 return PTR_ERR(*obdo);
75         }
76
77         obdfs_from_inode(*obdo, inode);
78         *flag = OBD_BRW_CREATE;
79
80         tmp = page_list;
81         while ( (tmp = tmp->next) != page_list && (i < nr_slots) ) {
82                 struct obdfs_pgrq *req;
83                 struct page *page;
84                 
85                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
86                 /* remove request from list before write to avoid conflict */
87                 obdfs_pgrq_del(req);
88                 page = req->rq_page;
89
90                 if ( !page  ) {
91                         CDEBUG(D_INODE, "no page \n");
92                         EXIT;
93                         return 0;
94                 }
95
96                 if (check_time && 
97                     req->rq_jiffies > (jiffies - pupd_prm.age_buffer))
98                         continue;
99                 
100                 CDEBUG(D_INODE, "adding page %p to vector\n", page);
101                 bufs[i] = (char *)page_address(page);
102                 pages[i] = page;
103                 counts[i] = PAGE_SIZE;
104                 offsets[i] = ((obd_off)page->index) << PAGE_SHIFT;
105                 i++;
106         }
107
108         /* If no more pages for this inode, remove from superblock list */
109         if ( list_empty(obdfs_iplist(inode)) )
110                 list_del(obdfs_islist(inode));
111
112         EXIT;
113         return i;  
114 }
115
116
117 /* Remove writeback requests from an inode */
118 int obdfs_flush_reqs(struct list_head *inode_list, int flush_inode,
119                      int check_time)
120 {
121         struct list_head *tmp = inode_list;
122         obd_count         num_io = 0;
123         obd_count         num_obdos = 0;
124         struct inode     *inodes[MAX_IOVEC];
125         struct obdo      *obdos[MAX_IOVEC];
126         struct page      *pages[MAX_IOVEC];
127         char             *bufs[MAX_IOVEC];
128         obd_size          counts[MAX_IOVEC];
129         obd_off           offsets[MAX_IOVEC];
130         obd_flag          flags[MAX_IOVEC];
131         obd_count         bufs_per_obdo[MAX_IOVEC];
132         int               err = 0;
133         int i;
134
135         ENTRY;
136
137         if (!inode_list) {
138                 CDEBUG(D_INODE, "no list\n");
139                 EXIT;
140                 return 0;
141         }
142
143         if ( list_empty(inode_list)) {
144                 CDEBUG(D_INODE, "list empty\n");
145                 EXIT;
146                 return 0;
147         }
148
149
150         /* add all of the outstanding pages to a write vector, and write it */
151         while ( (tmp = tmp->next) != inode_list ) {
152                 struct obdfs_inode_info *ii;
153                 int res;
154
155                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
156                 inodes[num_obdos] = list_entry(ii, struct inode, u);
157
158                 res = obdfs_enqueue_pages(inodes[num_obdos], &obdos[num_obdos],
159                                           MAX_IOVEC - num_io, &pages[num_io],
160                                           &bufs[num_io], &counts[num_io],
161                                           &offsets[num_io], &flags[num_obdos],1);
162                 if ( res < 0 ) {
163                         return -EIO;
164                 }
165                 
166                 bufs_per_obdo[num_obdos] = res;
167                 num_io += res;
168                 num_obdos++;
169
170                 if ( num_io == MAX_IOVEC ) {
171                         err = obdfs_do_vec_wr(inodes[0]->i_sb, num_io,
172                                               num_obdos, obdos, bufs_per_obdo,
173                                               pages, bufs, counts, offsets,
174                                               flags);
175                         for (i = 0 ; i < num_obdos ; i++) {
176                                 obdfs_to_inode(inodes[i], obdos[i]);
177                                 obdo_free(obdos[i]);
178                         }
179                         if ( err ) {
180                                 EXIT;
181                                 goto ERR;
182                         }
183                         num_io = 0;
184                         num_obdos = 0;
185                 }
186         } 
187
188         /* flush any remaining I/Os */
189         if ( num_io ) {
190                 err = obdfs_do_vec_wr(inodes[0]->i_sb, num_io, num_obdos, 
191                                       obdos, bufs_per_obdo, pages, bufs,
192                                       counts, offsets, flags);
193                 for (i = 0 ; i < num_obdos ; i++) {
194                         obdfs_to_inode(inodes[i], obdos[i]);
195                         obdo_free(obdos[i]);
196                 }
197         }
198         EXIT;
199 ERR:
200
201         return err;
202 } /* obdfs_remove_pages_from_cache */
203
204
205 static void obdfs_flush_dirty_pages(int check_time)
206 {
207         struct list_head *sl;
208
209         sl = &obdfs_super_list;
210         while ( (sl = sl->next) != &obdfs_super_list ) {
211                 struct obdfs_sb_info *sbi = 
212                         list_entry(sl, struct obdfs_sb_info, osi_list);
213
214                 /* walk write requests here, use the sb, check the time */
215                 obdfs_flush_reqs(&sbi->osi_inodes, 0, 1);
216         }
217
218 #if 0
219         /* again, but now we wait for completion */
220         sl = &obdfs_super_list;
221         while ( (sl = sl->next) != &obdfs_super_list ) {
222                 struct obdfs_sb_info *sbi = 
223                         list_entry(sl, struct obdfs_sb_info, sl_chain);
224
225                 /* walk write requests here */
226                 obdfs_flush_reqs(&sbi->osi_pages, 0, check_time);
227         }
228 #endif
229 }
230
231
232 static struct task_struct *pupdated;
233
234 static int pupdate(void *unused) 
235 {
236         struct task_struct * tsk = current;
237         int interval;
238         
239         pupdated = current;
240
241         exit_files(current);
242         exit_mm(current);
243
244         tsk->session = 1;
245         tsk->pgrp = 1;
246         sprintf(tsk->comm, "pupdated");
247         pupdated = current;
248
249         printk("pupdated activated...\n");
250
251         /* sigstop and sigcont will stop and wakeup pupdate */
252         spin_lock_irq(&tsk->sigmask_lock);
253         sigfillset(&tsk->blocked);
254         siginitsetinv(&tsk->blocked, sigmask(SIGTERM));
255         recalc_sigpending(tsk);
256         spin_unlock_irq(&tsk->sigmask_lock);
257
258         for (;;) {
259                 /* update interval */
260                 interval = pupd_prm.interval;
261                 if (interval)
262                 {
263                         tsk->state = TASK_INTERRUPTIBLE;
264                         schedule_timeout(interval);
265                 }
266                 else
267                 {
268                 stop_pupdate:
269                         tsk->state = TASK_STOPPED;
270                         /* MOD_DEC_USE_COUNT; */
271                         printk("pupdated stopped...\n");
272                         return 0;
273                 }
274                 /* check for sigstop */
275                 if (signal_pending(tsk))
276                 {
277                         int stopped = 0;
278                         spin_lock_irq(&tsk->sigmask_lock);
279                         if (sigismember(&tsk->signal, SIGTERM))
280                         {
281                                 sigdelset(&tsk->signal, SIGTERM);
282                                 stopped = 1;
283                         }
284                         recalc_sigpending(tsk);
285                         spin_unlock_irq(&tsk->sigmask_lock);
286                         if (stopped)
287                                 goto stop_pupdate;
288                 }
289                 /* asynchronous setattr etc for the future ... */
290                 /* flush_inodes(); */
291                 CDEBUG(D_INODE, "about to flush pages...\n");
292                 obdfs_flush_dirty_pages(1);
293                 CDEBUG(D_INODE, "done flushing pages...\n");
294         }
295 }
296
297
298 int flushd_init(void)
299 {
300         /*
301         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
302          */
303         /* MOD_INC_USE_COUNT; */
304         kernel_thread(pupdate, NULL, 0);
305         printk("flushd inited\n");
306         return 0;
307 }
308
/*
 * Tear-down hook for the flush daemon.
 *
 * Currently a no-op that always returns 0: the daemon is NOT stopped here.
 * pupdated must be killed with a SIGTERM from userland before unloading
 * obdfs.o.
 *
 * XXX Eventually this should deliver SIGTERM to the pupdated task itself
 * (not to `current') and let it run at least once, so that it has exited
 * before this function returns.
 */
int flushd_cleanup(void)
{
        return 0;
}