Whamcloud - gitweb
obdfs/dir.c: fix bug when reading directories > 4096 bytes
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS Super operations - also used for Lustre file system
3  *
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
 * Copyright (C) 1999 Seagate Technology Inc.
8  *
9  */
10 #define __NO_VERSION__
11 #include <linux/module.h>
12 #include <linux/sched.h>
13 #include <linux/fs.h>
14 #include <linux/malloc.h>
15 #include <linux/locks.h>
16 #include <linux/errno.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/vmalloc.h>
20 #include <linux/blkdev.h>
21 #include <linux/sysrq.h>
22 #include <linux/file.h>
23 #include <linux/init.h>
24 #include <linux/quotaops.h>
25 #include <linux/iobuf.h>
26 #include <linux/highmem.h>
27
28 #include <asm/uaccess.h>
29 #include <asm/io.h>
30 #include <asm/bitops.h>
31 #include <asm/mmu_context.h>
32
33 #include <linux/obd_support.h>
34 #include <linux/obd_class.h>
35 #include <linux/obdfs.h>
36
37
38 struct {
39         int nfract;  /* Percentage of buffer cache dirty to 
40                         activate bdflush */
41         int ndirty;  /* Maximum number of dirty blocks to write out per
42                         wake-cycle */
43         int nrefill; /* Number of clean buffers to try to obtain
44                                 each time we call refill */
45         int nref_dirt; /* Dirty buffer threshold for activating bdflush
46                           when trying to refill buffers. */
47         int interval; /* jiffies delay between pupdate flushes */
48         int age_buffer;  /* Time for normal buffer to age before we flush it */
49         int age_super;  /* Time for superblock to age before we flush it */
50 } pupd_prm = {40, 500, 64, 256, 3*HZ, 30*HZ, 5*HZ };
51
52 /* Called with the superblock list lock */
53 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
54                                int nr_slots, struct page **pages, char **bufs,
55                                obd_size *counts, obd_off *offsets,
56                                obd_flag *flag, int check_time)
57 {
58         struct list_head *page_list = obdfs_iplist(inode);
59         struct list_head *tmp;
60         int num = 0;
61
62         ENTRY;
63
64         tmp = page_list;
65         /* Traverse list in reverse order, so we do FIFO, not LIFO order */
66         while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
67                 struct obdfs_pgrq *req;
68                 struct page *page;
69                 
70                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
71                 page = req->rq_page;
72
73                 
74                 if (check_time && 
75                     (jiffies - req->rq_jiffies) < pupd_prm.age_buffer)
76                         break;          /* pages are in chronological order */
77
78                 /* Only allocate the obdo if we will actually do I/O here */
79                 if ( !*obdo ) {
80                         OIDEBUG(inode);
81                         *obdo = obdo_fromid(IID(inode), inode->i_ino,
82                                             OBD_MD_FLNOTOBD);
83                         if ( IS_ERR(*obdo) ) {
84                                 int err = PTR_ERR(*obdo);
85                                 *obdo = NULL;
86
87                                 EXIT;
88                                 return err;
89                         }
90
91                         /* FIXME revisit fromid & from_inode */
92                         obdfs_from_inode(*obdo, inode);
93                         *flag = OBD_BRW_CREATE;
94                 }
95
96                 /* Remove request from list before write to avoid conflict.
97                  * Note that obdfs_pgrq_del() also deletes the request.
98                  */
99                 obdfs_pgrq_del(req);
100                 if ( !page ) {
101                         CDEBUG(D_CACHE, "no page \n");
102                         continue;
103                 }
104
105                 bufs[num] = (char *)page_address(page);
106                 pages[num] = page;
107                 counts[num] = PAGE_SIZE;
108                 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
109                 CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n", 
110                        inode->i_ino, page, (char *)page_address(page));
111                 num++;
112         }
113
114         if (!list_empty(page_list))
115                 CDEBUG(D_CACHE, "inode %ld list not empty\n", inode->i_ino);
116         CDEBUG(D_INFO, "added %d page(s) to vector\n", num);
117
118         EXIT;
119         return num;  
120 } /* obdfs_enqueue_pages */
121
122 /* Remove writeback requests for the superblock */
123 int obdfs_flush_reqs(struct list_head *inode_list, int check_time)
124 {
125         struct list_head *tmp;
126         int               total_io = 0;
127         obd_count         num_io;
128         obd_count         num_obdos;
129         struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
130         struct page      *pages[MAX_IOVEC];     /* call put_page on these */
131         struct obdo      *obdos[MAX_IOVEC];
132         char             *bufs[MAX_IOVEC];
133         obd_size          counts[MAX_IOVEC];
134         obd_off           offsets[MAX_IOVEC];
135         obd_flag          flags[MAX_IOVEC];
136         obd_count         bufs_per_obdo[MAX_IOVEC];
137         int               err = 0;
138         struct obdfs_sb_info *sbi;
139
140         ENTRY;
141         if (!inode_list) {
142                 CDEBUG(D_INODE, "no list\n");
143                 EXIT;
144                 return 0;
145         }
146
147         sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
148
149         obd_down(&sbi->osi_list_mutex);
150         if ( list_empty(inode_list) ) {
151                 CDEBUG(D_CACHE, "list empty\n");
152                 obd_up(&sbi->osi_list_mutex);
153                 EXIT;
154                 return 0;
155         }
156
157         /* Add each inode's dirty pages to a write vector, and write it.
158          * Traverse list in reverse order, so we do FIFO, not LIFO order
159          */
160  again:
161         tmp = inode_list;
162         num_io = 0;
163         num_obdos = 0;
164         while ( (tmp = tmp->prev) != inode_list && total_io < pupd_prm.ndirty) {
165                 struct obdfs_inode_info *ii;
166                 struct inode *inode;
167                 int res;
168
169                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
170                 inode = list_entry(ii, struct inode, u);
171                 inodes[num_obdos] = inode;
172                 obdos[num_obdos] = NULL;
173                 CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);
174
175                 /* Make sure we reference "inode" and not "inodes[num_obdos]",
176                  * as num_obdos will change after the loop is run.
177                  */
178                 if (!list_empty(obdfs_iplist(inode))) {
179                         res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
180                                                   MAX_IOVEC - num_io,
181                                                   &pages[num_io], &bufs[num_io],
182                                                   &counts[num_io],
183                                                   &offsets[num_io],
184                                                   &flags[num_obdos],
185                                                   check_time);
186                         CDEBUG(D_CACHE, "FLUSH inode %ld, pages flushed: %d\n",
187                                inode->i_ino, res);
188                         if ( res < 0 ) {
189                                 CDEBUG(D_INODE,
190                                        "fatal: unable to enqueue inode %ld (err %d)\n",
191                                        inode->i_ino, err);
192                                 /* XXX Move bad inode to end of list so we can
193                                  * continue with flushing list.  This is a
194                                  * temporary measure to avoid machine lockups.
195                                  */
196                                 list_del(tmp);
197                                 list_add(tmp, inode_list);
198                                 err = res;
199                                 EXIT;
200                                 goto BREAK;
201                         } else if (res) {
202                                 num_io += res;
203                                 total_io += res;
204                                 bufs_per_obdo[num_obdos] = res;
205                                 num_obdos++;
206                         }
207
208                         if ( num_io == MAX_IOVEC ) {
209                                 obd_up(&sbi->osi_list_mutex);
210                                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
211                                                       obdos, bufs_per_obdo,
212                                                       pages, bufs, counts,
213                                                       offsets, flags);
214                                 if ( err ) {
215                                         CDEBUG(D_INODE,
216                                                 "fatal: unable to do vec_wr (err %d)\n", err);
217                                         EXIT;
218                                         goto ERR;
219                                 }
220                                 obd_down(&sbi->osi_list_mutex);
221                                 goto again;
222                         }
223                 }
224         }
225
226 BREAK:
227         obd_up(&sbi->osi_list_mutex);
228
229         /* flush any remaining I/Os */
230         if ( num_io ) {
231                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
232                                       bufs_per_obdo, pages, bufs, counts,
233                                       offsets, flags);
234                 if (err)
235                         CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n", err);
236                 num_io = 0;
237                 num_obdos = 0;
238         }
239
240         /* Remove inode from superblock dirty list when no more pages.
241          * Make sure we don't point at the current inode with tmp
242          * when we re-init the list on the inode, or we will loop.
243          */
244         obd_down(&sbi->osi_list_mutex);
245         tmp = inode_list;
246         while ( (tmp = tmp->prev) != inode_list ) {
247                 struct obdfs_inode_info *ii;
248                 struct inode *inode;
249
250                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
251                 inode = list_entry(ii, struct inode, u);
252                 CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
253                 if (list_empty(obdfs_iplist(inode))) {
254                         CDEBUG(D_CACHE, "remove inode %ld from dirty list\n",
255                                inode->i_ino);
256                         tmp = tmp->next;
257                         list_del(obdfs_islist(inode));
258                         /* decrement inode reference for page cache */
259                         inode->i_count--;
260                         INIT_LIST_HEAD(obdfs_islist(inode));
261                 }
262         }
263         obd_up(&sbi->osi_list_mutex);
264
265         CDEBUG(D_INFO, "flushed %d pages in total\n", total_io);
266         EXIT;
267 ERR:
268         return err;
269 } /* obdfs_flush_reqs */
270
271
272 void obdfs_flush_dirty_pages(int check_time)
273 {
274         struct list_head *sl;
275
276         ENTRY;
277         sl = &obdfs_super_list;
278         while ( (sl = sl->prev) != &obdfs_super_list ) {
279                 struct obdfs_sb_info *sbi = 
280                         list_entry(sl, struct obdfs_sb_info, osi_list);
281
282                 /* walk write requests here, use the sb, check the time */
283                 obdfs_flush_reqs(&sbi->osi_inodes, check_time);
284         }
285         EXIT;
286 } /* obdfs_flush_dirty_pages */
287
288
289 static struct task_struct *pupdated;
290
291 static int pupdate(void *unused) 
292 {
293         struct task_struct * tsk = current;
294         int interval;
295         
296         pupdated = current;
297
298         exit_files(current);
299         exit_mm(current);
300
301         tsk->session = 1;
302         tsk->pgrp = 1;
303         sprintf(tsk->comm, "pupdated");
304         pupdated = current;
305
306         MOD_INC_USE_COUNT;      /* XXX until send_sig works */
307         printk("pupdated activated...\n");
308
309         /* sigstop and sigcont will stop and wakeup pupdate */
310         spin_lock_irq(&tsk->sigmask_lock);
311         sigfillset(&tsk->blocked);
312         siginitsetinv(&tsk->blocked, sigmask(SIGTERM));
313         recalc_sigpending(tsk);
314         spin_unlock_irq(&tsk->sigmask_lock);
315
316         for (;;) {
317                 /* update interval */
318                 interval = pupd_prm.interval;
319                 if (interval)
320                 {
321                         tsk->state = TASK_INTERRUPTIBLE;
322                         schedule_timeout(interval);
323                 }
324                 else
325                 {
326                 stop_pupdate:
327                         tsk->state = TASK_STOPPED;
328                         MOD_DEC_USE_COUNT; /* XXX until send_sig works */
329                         printk("pupdated stopped...\n");
330                         return 0;
331                 }
332                 /* check for sigstop */
333                 if (signal_pending(tsk))
334                 {
335                         int stopped = 0;
336                         spin_lock_irq(&tsk->sigmask_lock);
337                         if (sigismember(&tsk->signal, SIGTERM))
338                         {
339                                 sigdelset(&tsk->signal, SIGTERM);
340                                 stopped = 1;
341                         }
342                         recalc_sigpending(tsk);
343                         spin_unlock_irq(&tsk->sigmask_lock);
344                         if (stopped)
345                                 goto stop_pupdate;
346                 }
347                 /* asynchronous setattr etc for the future ...
348                 flush_inodes();
349                  */
350                 obdfs_flush_dirty_pages(1); 
351         }
352 }
353
354
355 int obdfs_flushd_init(void)
356 {
357         /*
358         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
359          */
360         kernel_thread(pupdate, NULL, 0);
361         CDEBUG(D_PSDEV, __FUNCTION__ ": flushd inited\n");
362         return 0;
363 }
364
365 int obdfs_flushd_cleanup(void)
366 {
367         ENTRY;
368         /* deliver a signal to pupdated to shut it down
369            XXX need to kill it from user space for now XXX
370         if (pupdated) {
371                 send_sig_info(SIGTERM, 1, pupdated);
372         }
373          */
374
375         EXIT;
376         /* not reached */
377         return 0;
378
379 }