Whamcloud - gitweb
Branch b1_8
[fs/lustre-release.git] / lustre / llite / dir.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/dir.c
37  *
38  * Directory code for lustre client.
39  */
40
41 #include <linux/fs.h>
42 #include <linux/pagemap.h>
43 #include <linux/mm.h>
44 #include <linux/version.h>
45 #include <linux/smp_lock.h>
46 #include <asm/uaccess.h>
47 #include <linux/buffer_head.h>   // for wait_on_buffer
48
49 #define DEBUG_SUBSYSTEM S_LLITE
50
51 #include <obd_support.h>
52 #include <obd_class.h>
53 #include <lustre_lib.h>
54 #include <lustre/lustre_idl.h>
55 #include <lustre_lite.h>
56 #include <lustre_dlm.h>
57 #include "llite_internal.h"
58
59 #ifndef HAVE_PAGE_CHECKED
60 #ifdef HAVE_PG_FS_MISC
61 #define PageChecked(page)        test_bit(PG_fs_misc, &(page)->flags)
62 #define SetPageChecked(page)     set_bit(PG_fs_misc, &(page)->flags)
63 #else
64 #error PageChecked or PageFsMisc not defined in kernel
65 #endif
66 #endif
67
68 /* returns the page unlocked, but with a reference */
69 static int ll_dir_readpage(struct file *file, struct page *page)
70 {
71         struct inode *inode = page->mapping->host;
72         struct ll_fid mdc_fid;
73         __u64 offset;
74         struct ptlrpc_request *request;
75         struct mds_body *body;
76         int rc = 0;
77         ENTRY;
78
79         offset = (__u64)page->index << CFS_PAGE_SHIFT;
80         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off "LPU64"\n",
81                inode->i_ino, inode->i_generation, inode, offset);
82
83         ll_pack_fid(&mdc_fid, inode->i_ino, inode->i_generation, S_IFDIR);
84
85         rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &mdc_fid,
86                           offset, page, &request);
87         if (!rc) {
88                 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
89                                       sizeof(*body));
90                 LASSERT(body != NULL); /* checked by mdc_readpage() */
91                 /* swabbed by mdc_readpage() */
92                 LASSERT(lustre_rep_swabbed(request, REPLY_REC_OFF));
93
94                 if (body->size != i_size_read(inode)) {
95                         ll_inode_size_lock(inode, 0);
96                         i_size_write(inode, body->size);
97                         ll_inode_size_unlock(inode, 0);
98                 }
99
100                 SetPageUptodate(page);
101         }
102         ptlrpc_req_finished(request);
103
104         unlock_page(page);
105         EXIT;
106         return rc;
107 }
108
109 struct address_space_operations ll_dir_aops = {
110         .readpage  = ll_dir_readpage,
111 };
112
113 static inline unsigned ll_dir_page_mask(struct inode *inode)
114 {
115         return ~(inode->i_sb->s_blocksize - 1);
116 }
117
118 /*
119  * Check consistency of a single entry.
120  */
121 static int ll_dir_check_entry(struct inode *dir, struct ll_dir_entry *ent,
122                               unsigned offset, unsigned rec_len, pgoff_t index)
123 {
124         const char *msg;
125
126         /*
127          * Consider adding more checks.
128          */
129
130         if (unlikely(rec_len < ll_dir_rec_len(1)))
131                 msg = "entry is too short";
132         else if (unlikely(rec_len & 3))
133                 msg = "wrong alignment";
134         else if (unlikely(rec_len < ll_dir_rec_len(ent->lde_name_len)))
135                 msg = "rec_len doesn't match name_len";
136         else if (unlikely(((offset + rec_len - 1) ^ offset) &
137                           ll_dir_page_mask(dir)))
138                 msg = "directory entry across blocks";
139         else
140                 return 0;
141         CERROR("%s: bad entry in directory %lu/%u: %s - "
142                "offset=%lu+%u, inode=%lu, rec_len=%d,"
143                " name_len=%d\n", ll_i2mdcexp(dir)->exp_obd->obd_name,
144                dir->i_ino, dir->i_generation, msg,
145                index << CFS_PAGE_SHIFT,
146                offset, (unsigned long)le32_to_cpu(ent->lde_inode),
147                rec_len, ent->lde_name_len);
148         return -EIO;
149 }
150
151 static void ll_dir_check_page(struct inode *dir, struct page *page)
152 {
153         int      err;
154         unsigned size = dir->i_sb->s_blocksize;
155         char    *addr = page_address(page);
156         unsigned off;
157         unsigned limit;
158         unsigned reclen;
159
160         struct ll_dir_entry *ent;
161
162         err = 0;
163         if ((i_size_read(dir) >> CFS_PAGE_SHIFT) == (__u64)page->index) {
164                 /*
165                  * Last page.
166                  */
167                 limit = i_size_read(dir) & ~CFS_PAGE_MASK;
168                 if (limit & (size - 1)) {
169                         CERROR("%s: dir %lu/%u size %llu doesn't match %u\n",
170                                ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
171                                dir->i_generation, i_size_read(dir), size);
172                         err++;
173                 } else {
174                         /*
175                          * Place dummy forwarding entries to streamline
176                          * ll_readdir().
177                          */
178                         for (off = limit; off < CFS_PAGE_SIZE; off += size) {
179                                 ent = ll_entry_at(addr, off);
180                                 ent->lde_rec_len = cpu_to_le16(size);
181                                 ent->lde_name_len = 0;
182                                 ent->lde_inode = 0;
183                         }
184                 }
185         } else
186                 limit = CFS_PAGE_SIZE;
187
188         for (off = 0;
189              !err && off <= limit - ll_dir_rec_len(1); off += reclen) {
190                 ent    = ll_entry_at(addr, off);
191                 reclen = le16_to_cpu(ent->lde_rec_len);
192                 err    = ll_dir_check_entry(dir, ent, off, reclen, page->index);
193         }
194
195         if (!err && off != limit) {
196                 ent = ll_entry_at(addr, off);
197                 CERROR("%s: entry in directory %lu/%u spans the page boundary "
198                        "offset="LPU64"+%u, inode=%lu\n",
199                        ll_i2mdcexp(dir)->exp_obd->obd_name,
200                        dir->i_ino, dir->i_generation,
201                        (__u64)page->index << CFS_PAGE_SHIFT,
202                        off, (unsigned long)le32_to_cpu(ent->lde_inode));
203                 err++;
204         }
205         if (err)
206                 SetPageError(page);
207         SetPageChecked(page);
208 }
209
210 struct page *ll_get_dir_page(struct inode *dir, unsigned long n)
211 {
212         struct ldlm_res_id res_id;
213         struct lustre_handle lockh;
214         struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
215         struct address_space *mapping = dir->i_mapping;
216         struct page *page;
217         ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
218         int rc;
219
220         fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
221         rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
222                              &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh);
223         if (!rc) {
224                 struct lookup_intent it = { .it_op = IT_READDIR };
225                 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CR,
226                        ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
227                 struct ptlrpc_request *request;
228                 struct mdc_op_data data = { { 0 } };
229
230                 ll_prepare_mdc_op_data(&data, dir, NULL, NULL, 0, 0, NULL);
231
232                 rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
233                                  &data, &lockh, NULL, 0, 0);
234
235                 request = (struct ptlrpc_request *)it.d.lustre.it_data;
236                 if (request)
237                         ptlrpc_req_finished(request);
238                 if (rc < 0) {
239                         CERROR("lock enqueue: rc: %d\n", rc);
240                         return ERR_PTR(rc);
241                 }
242         }
243         ldlm_lock_dump_handle(D_OTHER, &lockh);
244
245         page = read_cache_page(mapping, n,
246                                (filler_t*)mapping->a_ops->readpage, NULL);
247         if (IS_ERR(page))
248                 GOTO(out_unlock, page);
249
250         wait_on_page(page);
251         (void)kmap(page);
252         if (!PageUptodate(page))
253                 goto fail;
254         if (!PageChecked(page))
255                 ll_dir_check_page(dir, page);
256         if (PageError(page))
257                 goto fail;
258
259 out_unlock:
260         ldlm_lock_decref(&lockh, LCK_CR);
261         return page;
262
263 fail:
264         ll_put_page(page);
265         page = ERR_PTR(-EIO);
266         goto out_unlock;
267 }
268
269 static inline unsigned ll_dir_validate_entry(char *base, unsigned offset,
270                                              unsigned mask)
271 {
272         struct ll_dir_entry *de = ll_entry_at(base, offset);
273         struct ll_dir_entry *p  = ll_entry_at(base, offset & mask);
274         while (p < de && p->lde_rec_len > 0)
275                 p = ll_dir_next_entry(p);
276         return (char *)p - base;
277 }
278
/*
 * On-disk file type codes of directory entries. Kept identical to the ext2
 * ones for compatibility. LL_DIR_FT_MAX is the number of codes, not a type.
 */

enum {
        LL_DIR_FT_UNKNOWN  = 0,
        LL_DIR_FT_REG_FILE = 1,
        LL_DIR_FT_DIR      = 2,
        LL_DIR_FT_CHRDEV   = 3,
        LL_DIR_FT_BLKDEV   = 4,
        LL_DIR_FT_FIFO     = 5,
        LL_DIR_FT_SOCK     = 6,
        LL_DIR_FT_SYMLINK  = 7,
        LL_DIR_FT_MAX      = 8
};
294
295 static unsigned char ll_dir_filetype_table[LL_DIR_FT_MAX] = {
296         [LL_DIR_FT_UNKNOWN]  = DT_UNKNOWN,
297         [LL_DIR_FT_REG_FILE] = DT_REG,
298         [LL_DIR_FT_DIR]      = DT_DIR,
299         [LL_DIR_FT_CHRDEV]   = DT_CHR,
300         [LL_DIR_FT_BLKDEV]   = DT_BLK,
301         [LL_DIR_FT_FIFO]     = DT_FIFO,
302         [LL_DIR_FT_SOCK]     = DT_SOCK,
303         [LL_DIR_FT_SYMLINK]  = DT_LNK,
304 };
305
306 /*
307  * Process one page. Returns:
308  *
309  *     -ve: filldir commands readdir to stop.
310  *     +ve: number of entries submitted to filldir.
311  *       0: no live entries on this page.
312  */
313
314 static int ll_readdir_page(char *addr, __u64 base, unsigned *offset,
315                            filldir_t filldir, void *cookie)
316 {
317         struct ll_dir_entry *de;
318         char *end;
319         int nr;
320
321         de = ll_entry_at(addr, *offset);
322         end = addr + CFS_PAGE_SIZE - ll_dir_rec_len(1);
323         for (nr = 0 ;(char*)de <= end; de = ll_dir_next_entry(de)) {
324                 if (de->lde_inode != 0) {
325                         nr++;
326                         *offset = (char *)de - addr;
327                         if (filldir(cookie, de->lde_name, de->lde_name_len,
328                                     base | *offset, le32_to_cpu(de->lde_inode),
329                                     ll_dir_filetype_table[de->lde_file_type &
330                                                           (LL_DIR_FT_MAX - 1)]))
331                                 return -1;
332                 }
333         }
334         return nr;
335 }
336
337 static int ll_readdir_18(struct file *filp, void *dirent, filldir_t filldir)
338 {
339         struct inode *inode = filp->f_dentry->d_inode;
340         loff_t pos          = filp->f_pos;
341         unsigned offset     = pos & ~CFS_PAGE_MASK;
342         pgoff_t idx         = pos >> CFS_PAGE_SHIFT;
343         pgoff_t npages      = dir_pages(inode);
344         unsigned chunk_mask = ll_dir_page_mask(inode);
345         int need_revalidate = (filp->f_version != inode->i_version);
346         int rc              = 0;
347         int done; /* when this becomes negative --- stop iterating */
348
349         ENTRY;
350
351         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %llu/%llu\n",
352                inode->i_ino, inode->i_generation, inode,
353                pos, i_size_read(inode));
354
355         /*
356          * Checking ->i_size without the lock. Should be harmless, as server
357          * re-checks.
358          */
359         if (pos > i_size_read(inode) - ll_dir_rec_len(1))
360                 RETURN(0);
361
362         for (done = 0; idx < npages; idx++, offset = 0) {
363                 /*
364                  * We can assume that all blocks on this page are filled with
365                  * entries, because ll_dir_check_page() placed special dummy
366                  * entries for us.
367                  */
368
369                 char *kaddr;
370                 struct page *page;
371
372                 CDEBUG(D_EXT2,"read %lu of dir %lu/%u page %lu/%lu "
373                        "size %llu\n",
374                        CFS_PAGE_SIZE, inode->i_ino, inode->i_generation,
375                        idx, npages, i_size_read(inode));
376                 page = ll_get_dir_page(inode, idx);
377
378                 /* size might have been updated by mdc_readpage */
379                 npages = dir_pages(inode);
380
381                 if (IS_ERR(page)) {
382                         rc = PTR_ERR(page);
383                         CERROR("error reading dir %lu/%u page %lu: rc %d\n",
384                                inode->i_ino, inode->i_generation, idx, rc);
385                         continue;
386                 }
387
388                 kaddr = page_address(page);
389                 if (need_revalidate) {
390                         /*
391                          * File offset was changed by lseek() and possibly
392                          * points in the middle of an entry. Re-scan from the
393                          * beginning of the chunk.
394                          */
395                         offset = ll_dir_validate_entry(kaddr, offset,
396                                                        chunk_mask);
397                         need_revalidate = 0;
398                 }
399                 done = ll_readdir_page(kaddr, idx << CFS_PAGE_SHIFT,
400                                        &offset, filldir, dirent);
401                 ll_put_page(page);
402                 if (done > 0)
403                         /*
404                          * Some entries were sent to the user space, return
405                          * success.
406                          */
407                         rc = 0;
408                 else if (done < 0)
409                         /*
410                          * filldir is satisfied.
411                          */
412                         break;
413         }
414
415         filp->f_pos = (idx << CFS_PAGE_SHIFT) | offset;
416         filp->f_version = inode->i_version;
417         touch_atime(filp->f_vfsmnt, filp->f_dentry);
418
419         RETURN(rc);
420 }
421
/*
 * Chain of hash overflow pages.
 *
 * Placeholder only: the structure has no members yet.
 */
struct ll_dir_chain {
        /* XXX something. Later */
};
428   
/* Prepare \a chain for use. Currently a no-op. */
static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
{
        (void)chain;
}
432
/* Release \a chain. Currently a no-op. */
static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
{
        (void)chain;
}
436
/*
 * Convert between a 32-bit hash and the page-cache index it is stored under:
 * the result is the bitwise complement of \a value (0xffffffff - value), so
 * applying the function twice yields the original value.
 */
static inline __u32 hash_x_index(__u32 value)
{
        return ~value;
}
441
442 /**
443  * Layout of readdir pages, as transmitted on wire.
444  */
445 struct lu_dirent {
446         /** valid if LUDA_FID is set. */
447         struct lu_fid lde_fid;
448         /** a unique entry identifier: a hash or an offset. */
449         __u64         lde_hash;
450         /** total record length, including all attributes. */
451         __u16         lde_reclen;
452         /** name length */
453         __u16         lde_namelen;
454         /** optional variable size attributes following this entry.
455          *  taken from enum lu_dirent_attrs.
456          */
457         __u32         lde_attrs;
458         /** name is followed by the attributes indicated in ->ldp_attrs, in
459          *  their natural order. After the last attribute, padding bytes are
460          *  added to make ->lde_reclen a multiple of 8.
461          */
462         char          lde_name[0];
463 };
464
465 struct lu_dirpage {
466         __u64            ldp_hash_start;
467         __u64            ldp_hash_end;
468         __u16            ldp_flags;
469         __u16            ldp_pad0;
470         __u32            ldp_pad1;
471         struct lu_dirent ldp_entries[0];
472 };
473
474 /*
475  * Definitions of optional directory entry attributes formats.
476  *
477  * Individual attributes do not have their length encoded in a generic way. It
478  * is assumed that consumer of an attribute knows its format. This means that
479  * it is impossible to skip over an unknown attribute, except by skipping over all
480  * remaining attributes (by using ->lde_reclen), which is not too
481  * constraining, because new server versions will append new attributes at
482  * the end of an entry.
483  */
484
485 /**
486  * Fid directory attribute: a fid of an object referenced by the entry. This
487  * will be almost always requested by the client and supplied by the server.
488  *
489  * Aligned to 8 bytes.
490  */
491 /* For compatibility with 1.8, the fid is kept directly in struct lu_dirent. */
492
493 /**
494  * File type.
495  *
496  * Aligned to 2 bytes.
497  */
498 struct luda_type {
499         __u16 lt_type;
500 };
501
/* Bits of lu_dirpage::ldp_flags. */
enum lu_dirpage_flags {
        /* tested by lu_dirent_start(): the page has no entries to walk */
        LDF_EMPTY = 0x1
};
505
506 static inline int lu_dirent_calc_size(int namelen, __u16 attr)
507 {
508         int size;
509
510         if (attr & LUDA_TYPE) {
511                 const unsigned align = sizeof(struct luda_type) - 1;
512                 size = (sizeof(struct lu_dirent) + namelen + align) & ~align;
513                 size += sizeof(struct luda_type);
514         } else
515                 size = sizeof(struct lu_dirent) + namelen;
516
517         return (size + 7) & ~7;
518 }
519
520 /**
521  * return IF_* type for given lu_dirent entry.
522  * IF_* flag shld be converted to particular OS file type in
523  * platform llite module.
524  */
525 __u16 ll_dirent_type_get(struct lu_dirent *ent)
526 {
527         __u16 type = 0;
528         struct luda_type *lt;
529         int len = 0;
530
531         if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) {
532                 const unsigned align = sizeof(struct luda_type) - 1;
533
534                 len = le16_to_cpu(ent->lde_namelen);
535                 len = (len + align) & ~align;
536                 lt = (void *) ent->lde_name + len;
537                 type = CFS_IFTODT(le16_to_cpu(lt->lt_type));
538         }
539         return type;
540 }
541
542 static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
543 {
544         if (le16_to_cpu(dp->ldp_flags) & LDF_EMPTY)
545                 return NULL;
546         else
547                 return dp->ldp_entries;
548 }
549
550 static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
551 {
552         struct lu_dirent *next;
553
554         if (le16_to_cpu(ent->lde_reclen) != 0)
555                 next = ((void *)ent) + le16_to_cpu(ent->lde_reclen);
556         else
557                 next = NULL;
558
559         return next;
560 }
561
562 static inline int lu_dirent_size(struct lu_dirent *ent)
563 {
564         if (le16_to_cpu(ent->lde_reclen) == 0) {
565                 return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen),
566                                            le32_to_cpu(ent->lde_attrs));
567         }
568         return le16_to_cpu(ent->lde_reclen);
569 }
570
/* Readdir position that ll_readdir_20() treats as end-of-file. */
#define DIR_END_OFF              0xFFFFFFFFFFFFFFFEULL
572
/*
 * Read-side locking of mapping->tree_lock: read_lock_irq()/read_unlock_irq()
 * when HAVE_RW_TREE_LOCK is defined, spin_lock_irq()/spin_unlock_irq()
 * otherwise.
 */
#ifndef HAVE_RW_TREE_LOCK
# define TREE_READ_LOCK_IRQ(mapping)    spin_lock_irq(&(mapping)->tree_lock)
# define TREE_READ_UNLOCK_IRQ(mapping)  spin_unlock_irq(&(mapping)->tree_lock)
#else
# define TREE_READ_LOCK_IRQ(mapping)    read_lock_irq(&(mapping)->tree_lock)
# define TREE_READ_UNLOCK_IRQ(mapping)  read_unlock_irq(&(mapping)->tree_lock)
#endif
580
581 /* returns the page unlocked, but with a reference */
582 static int ll_dir_readpage_20(struct file *file, struct page *page)
583 {
584         struct inode *inode = page->mapping->host;
585         struct ptlrpc_request *request;
586         struct mdt_body *body;
587         struct ll_fid fid;
588         __u64 hash;
589         int rc;
590         ENTRY;
591
592         hash = hash_x_index(page->index);
593         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off %lu\n",
594                inode->i_ino, inode->i_generation, inode, (unsigned long)hash);
595
596         ll_inode2fid(&fid, inode);
597         rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &fid,
598                           hash, page, &request);
599         if (!rc) {
600                 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
601                                       sizeof(*body));
602                 /* Checked by mdc_readpage() */
603                 LASSERT(body != NULL);
604
605                 if (body->valid & OBD_MD_FLSIZE) {
606                         ll_inode_size_lock(inode, 0);
607                         i_size_write(inode, body->size);
608                         ll_inode_size_unlock(inode, 0);
609                 }
610                 SetPageUptodate(page);
611         }
612         ptlrpc_req_finished(request);
613
614         unlock_page(page);
615         EXIT;
616         return rc;
617 }
618
619
/*
 * Mark a hash-indexed directory page as checked. No validation of the page
 * contents is performed yet.
 */
static void ll_check_page(struct inode *dir, struct page *page)
{
        /* XXX: check page format later */
        SetPageChecked(page);
}
625
626
627 /*
628  * Find, kmap and return page that contains given hash.
629  */
630 static struct page *ll_dir_page_locate(struct inode *dir, unsigned long hash,
631                                        __u64 *start, __u64 *end)
632 {
633         struct address_space *mapping = dir->i_mapping;
634         /*
635          * Complement of hash is used as an index so that
636          * radix_tree_gang_lookup() can be used to find a page with starting
637          * hash _smaller_ than one we are looking for.
638          */
639         unsigned long offset = hash_x_index(hash);
640         struct page *page;
641         int found;
642         ENTRY;
643
644         TREE_READ_LOCK_IRQ(mapping);
645         found = radix_tree_gang_lookup(&mapping->page_tree,
646                                        (void **)&page, offset, 1);
647         if (found > 0) {
648                 struct lu_dirpage *dp;
649
650                 page_cache_get(page);
651                 TREE_READ_UNLOCK_IRQ(mapping);
652                 /*
653                  * In contrast to find_lock_page() we are sure that directory
654                  * page cannot be truncated (while DLM lock is held) and,
655                  * hence, can avoid restart.
656                  *
657                  * In fact, page cannot be locked here at all, because
658                  * ll_dir_readpage() does synchronous io.
659                  */
660                 wait_on_page(page);
661                 if (PageUptodate(page)) {
662                         dp = kmap(page);
663                         *start = le64_to_cpu(dp->ldp_hash_start);
664                         *end   = le64_to_cpu(dp->ldp_hash_end);
665                         LASSERT(*start <= hash);
666                         if (hash > *end || (*end != *start && hash == *end)) {
667                                 kunmap(page);
668                                 lock_page(page);
669                                 ll_truncate_complete_page(page);
670                                 unlock_page(page);
671                                 page_cache_release(page);
672                                 page = NULL;
673                         }
674                 } else {
675                         page_cache_release(page);
676                         page = ERR_PTR(-EIO);
677                 }
678
679         } else {
680                 TREE_READ_UNLOCK_IRQ(mapping);
681                 page = NULL;
682         }
683         RETURN(page);
684 }
685
686 static struct page *ll_get_dir_page_20(struct inode *dir, __u64 hash, int exact,
687                                        struct ll_dir_chain *chain)
688 {
689         struct ldlm_res_id res_id;
690         struct lustre_handle lockh;
691         struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
692         struct address_space *mapping = dir->i_mapping;
693         struct lu_dirpage *dp;
694         struct page *page;
695         ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
696         ldlm_mode_t mode;
697         int rc;
698         __u64 start = 0;
699         __u64 end = 0;
700         ENTRY;
701  
702         fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
703         mode = LCK_PR;
704         rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
705                              &res_id, LDLM_IBITS, &policy, mode, &lockh);
706         if (!rc) {
707                 struct lookup_intent it = { .it_op = IT_READDIR };
708                 struct ldlm_enqueue_info einfo = { LDLM_IBITS, mode,
709                        ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
710                 struct ptlrpc_request *request;
711                 struct mdc_op_data op_data = { { 0 } };
712
713                 ll_prepare_mdc_op_data(&op_data, dir, NULL, NULL, 0, 0, NULL);
714
715                 rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
716                                  &op_data, &lockh, NULL, 0, 0);
717
718                 request = (struct ptlrpc_request *)it.d.lustre.it_data;
719                 if (request)
720                         ptlrpc_req_finished(request);
721                 if (rc < 0) {
722                         CERROR("lock enqueue: rc: %d\n", rc);
723                         RETURN(ERR_PTR(rc));
724                 }
725         }
726         ldlm_lock_dump_handle(D_OTHER, &lockh);
727
728         page = ll_dir_page_locate(dir, hash, &start, &end);
729         if (IS_ERR(page))
730                 GOTO(out_unlock, page);
731
732         if (page != NULL) {
733                 /*
734                  * XXX nikita: not entirely correct handling of a corner case:
735                  * suppose hash chain of entries with hash value HASH crosses
736                  * border between pages P0 and P1. First both P0 and P1 are
737                  * cached, seekdir() is called for some entry from the P0 part
738                  * of the chain. Later P0 goes out of cache. telldir(HASH)
739                  * happens and finds P1, as it starts with matching hash
740                  * value. Remaining entries from P0 part of the chain are
741                  * skipped. (Is that really a bug?)
742                  *
743                  * Possible solutions: 0. don't cache P1 is such case, handle
744                  * it as an "overflow" page. 1. invalidate all pages at
745                  * once. 2. use HASH|1 as an index for P1.
746                  */
747                 if (exact && hash != start) {
748                         /*
749                          * readdir asked for a page starting _exactly_ from
750                          * given hash, but cache contains stale page, with
751                          * entries with smaller hash values. Stale page should
752                          * be invalidated, and new one fetched.
753                          */
754                         CDEBUG(D_INFO, "Stale readpage page %p: %#lx != %#lx\n",
755                               page, (unsigned long)hash, (unsigned long)start);
756                         lock_page(page);
757                         ll_truncate_complete_page(page);
758                         unlock_page(page);
759                         page_cache_release(page);
760                 } else {
761                         GOTO(hash_collision, page);
762                 }
763         }
764
765         page = read_cache_page(mapping, hash_x_index(hash),
766                                (filler_t*)ll_dir_readpage_20, NULL);
767         if (IS_ERR(page))
768                 GOTO(out_unlock, page);
769
770         wait_on_page(page);
771         (void)kmap(page);
772         if (!PageUptodate(page))
773                 goto fail;
774         if (!PageChecked(page))
775                 ll_check_page(dir, page);
776         if (PageError(page))
777                 goto fail;
778 hash_collision:
779         dp = page_address(page);
780
781         start = le64_to_cpu(dp->ldp_hash_start);
782         end   = le64_to_cpu(dp->ldp_hash_end);
783         if (end == start) {
784                 LASSERT(start == hash);
785                 CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end);
786                 /*
787                  * Fetch whole overflow chain...
788                  *
789                  * XXX not yet.
790                  */
791                 goto fail;
792         }
793 out_unlock:
794         ldlm_lock_decref(&lockh, mode);
795         RETURN(page);
796
797 fail:
798         ll_put_page(page);
799         page = ERR_PTR(-EIO);
800         goto out_unlock;
801 }
802
803 static int ll_readdir_20(struct file *filp, void *cookie, filldir_t filldir)
804 {
805         struct inode         *inode = filp->f_dentry->d_inode;
806         struct ll_sb_info    *sbi   = ll_i2sbi(inode);
807         __u64                 pos   = filp->f_pos;
808         struct page          *page;
809         struct ll_dir_chain   chain;
810         int rc;
811         int done;
812         int shift;
813         __u16 type;
814         ENTRY;
815
816         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu\n",
817                inode->i_ino, inode->i_generation, inode,
818                (unsigned long)pos, i_size_read(inode));
819
820         if (pos == DIR_END_OFF)
821                 /*
822                  * end-of-file.
823                  */
824                 RETURN(0);
825
826         rc    = 0;
827         done  = 0;
828         shift = 0;
829         ll_dir_chain_init(&chain);
830
831         page = ll_get_dir_page_20(inode, pos, 0, &chain);
832
833         while (rc == 0 && !done) {
834                 struct lu_dirpage *dp;
835                 struct lu_dirent  *ent;
836
837                 if (!IS_ERR(page)) {
838                         /* 
839                          * If page is empty (end of directoryis reached),
840                          * use this value. 
841                          */
842                         __u64 hash = DIR_END_OFF;
843                         __u64 next;
844
845                         dp = page_address(page);
846                         for (ent = lu_dirent_start(dp); ent != NULL && !done;
847                              ent = lu_dirent_next(ent)) {
848                                 char          *name;
849                                 int            namelen;
850                                 struct lu_fid  fid;
851                                 ino_t          ino;
852
853                                 hash    = le64_to_cpu(ent->lde_hash);
854                                 namelen = le16_to_cpu(ent->lde_namelen);
855
856                                 if (hash < pos)
857                                         /*
858                                          * Skip until we find target hash
859                                          * value.
860                                          */
861                                         continue;
862
863                                 if (namelen == 0)
864                                         /*
865                                          * Skip dummy record.
866                                          */
867                                         continue;
868
869                                 fid  = ent->lde_fid;
870                                 name = ent->lde_name;
871                                 fid_le_to_cpu(&fid, &fid);
872                                 ino  = ll_fid_build_ino(sbi, (struct ll_fid*)&fid);
873                                 type = ll_dirent_type_get(ent);
874                                 done = filldir(cookie, name, namelen,
875                                                (loff_t)hash, ino, type);
876                         }
877                         next = le64_to_cpu(dp->ldp_hash_end);
878                         ll_put_page(page);
879                         if (!done) {
880                                 pos = next;
881                                 if (pos == DIR_END_OFF)
882                                         /*
883                                          * End of directory reached.
884                                          */
885                                         done = 1;
886                                 else if (1 /* chain is exhausted*/)
887                                         /*
888                                          * Normal case: continue to the next
889                                          * page.
890                                          */
891                                         page = ll_get_dir_page_20(inode, pos, 1,
892                                                                   &chain);
893                                 else {
894                                         /*
895                                          * go into overflow page.
896                                          */
897                                 }
898                         } else {
899                                 pos = hash;
900                         }
901                 } else {
902                         rc = PTR_ERR(page);
903                         CERROR("error reading dir "DFID" at %lu: rc %d\n",
904                                PFID(ll_inode_lu_fid(inode)),
905                                (unsigned long)pos, rc);
906                 }
907         }
908
909         filp->f_pos = (loff_t)(__s32)pos;
910         filp->f_version = inode->i_version;
911         touch_atime(filp->f_vfsmnt, filp->f_dentry);
912
913         ll_dir_chain_fini(&chain);
914
915         RETURN(rc);
916 }
917
918 static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
919 {
920         struct inode      *inode = filp->f_dentry->d_inode;
921         struct ll_sb_info *sbi = ll_i2sbi(inode);
922
923         if (sbi->ll_mdc_exp->exp_connect_flags & OBD_CONNECT_FID) {
924                 return ll_readdir_20(filp, cookie, filldir);
925         } else {
926                 return ll_readdir_18(filp, cookie, filldir);
927         }
928 }
929
/*
 * Copy the quota control fields qc_cmd, qc_type, qc_id, qc_stat, qc_dqinfo
 * and qc_dqblk between two quotactl structures, one Q_COPY() per field.
 * Used in ll_dir_ioctl() in both directions between struct if_quotactl and
 * struct obd_quotactl.  Q_COPY() itself is defined outside this file.
 */
#define QCTL_COPY(out, in)              \
do {                                    \
        Q_COPY(out, in, qc_cmd);        \
        Q_COPY(out, in, qc_type);       \
        Q_COPY(out, in, qc_id);         \
        Q_COPY(out, in, qc_stat);       \
        Q_COPY(out, in, qc_dqinfo);     \
        Q_COPY(out, in, qc_dqblk);      \
} while (0)
939
940 static int ll_send_mgc_param(struct obd_export *mgc, char *string)
941 {
942         struct mgs_send_param *msp;
943         int rc = 0;
944
945         OBD_ALLOC_PTR(msp);
946         if (!msp)
947                 return -ENOMEM;
948
949         strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN);
950         rc = obd_set_info_async(mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
951                                 sizeof(struct mgs_send_param), msp, NULL);
952         if (rc)
953                 CERROR("Failed to set parameter: %d\n", rc);
954
955         OBD_FREE_PTR(msp);
956         return rc;
957 }
958
959 static char *ll_get_fsname(struct inode *inode)
960 {
961         struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
962         char *ptr, *fsname;
963         int len;
964
965         OBD_ALLOC(fsname, MGS_PARAM_MAXLEN);
966         len = strlen(lsi->lsi_lmd->lmd_profile);
967         ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
968         if (ptr && (strcmp(ptr, "-client") == 0))
969                 len -= 7;
970         strncpy(fsname, lsi->lsi_lmd->lmd_profile, len);
971         fsname[len] = '\0';
972
973         return fsname;
974 }
975
976 int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
977                      int set_default)
978 {
979         struct ll_sb_info *sbi = ll_i2sbi(inode);
980         struct mdc_op_data data = { { 0 } };
981         struct ptlrpc_request *req = NULL;
982         struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
983         struct obd_device *mgc = lsi->lsi_mgc;
984         char *fsname = NULL, *param = NULL;
985         int lum_size = sizeof(struct lov_user_md_v1);
986
987         struct iattr attr = { 0 };
988         int rc = 0;
989
990         if (lump->lmm_magic == LOV_USER_MAGIC_V3)
991                 lum_size = sizeof(struct lov_user_md_v3);
992         /*
993          * This is coming from userspace, so should be in
994          * local endian.  But the MDS would like it in little
995          * endian, so we swab it before we send it.
996          */
997         if ((lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) &&
998             (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))) {
999                 rc = lustre_swab_lov_user_md(lump);
1000                 if (rc) 
1001                         return rc;
1002         }
1003
1004         ll_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0, NULL);
1005
1006         /* swabbing is done in lov_setstripe() on server side */
1007         rc = mdc_setattr(sbi->ll_mdc_exp, &data,
1008                          &attr, lump, lum_size, NULL, 0, &req);
1009         if (rc) {
1010                 ptlrpc_req_finished(req);
1011                 if (rc != -EPERM && rc != -EACCES)
1012                         CERROR("mdc_setattr fails: rc = %d\n", rc);
1013                 return rc;
1014         }
1015         ptlrpc_req_finished(req);
1016
1017         /* In the following we use the fact that LOV_USER_MAGIC_V1 and
1018          LOV_USER_MAGIC_V3 have the same initial fields so we do not
1019          need the make the distiction between the 2 versions */
1020         if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
1021                 OBD_ALLOC(param, MGS_PARAM_MAXLEN);
1022
1023                 /* Get fsname and assume devname to be -MDT0000. */
1024                 fsname = ll_get_fsname(inode);
1025                 /* Set root stripesize */
1026                 sprintf(param, "%s-MDT0000.lov.stripesize=%u", fsname,
1027                         le32_to_cpu(lump->lmm_stripe_size));
1028                 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1029                 if (rc)
1030                         goto end;
1031
1032                 /* Set root stripecount */
1033                 sprintf(param, "%s-MDT0000.lov.stripecount=%u", fsname,
1034                         le16_to_cpu(lump->lmm_stripe_count));
1035                 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1036                 if (rc)
1037                         goto end;
1038
1039                 /* Set root stripeoffset */
1040                 sprintf(param, "%s-MDT0000.lov.stripeoffset=%u", fsname,
1041                         le16_to_cpu(lump->lmm_stripe_offset));
1042                 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1043                 if (rc)
1044                         goto end;
1045 end:
1046                 if (fsname)
1047                         OBD_FREE(fsname, MGS_PARAM_MAXLEN);
1048                 if (param)
1049                         OBD_FREE(param, MGS_PARAM_MAXLEN);
1050         }
1051         return rc;
1052 }
1053
1054 int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
1055                      int *lmm_size, struct ptlrpc_request **request)
1056 {
1057         struct ll_sb_info *sbi = ll_i2sbi(inode);
1058         struct ll_fid     fid;
1059         struct mds_body   *body;
1060         struct lov_mds_md *lmm = NULL;
1061         struct ptlrpc_request *req = NULL;
1062         int rc, lmmsize;
1063
1064         ll_inode2fid(&fid, inode);
1065
1066         rc = ll_get_max_mdsize(sbi, &lmmsize);
1067         if (rc)
1068                 RETURN(rc);
1069
1070         rc = mdc_getattr(sbi->ll_mdc_exp, &fid,
1071                         OBD_MD_FLEASIZE|OBD_MD_FLDIREA,
1072                         lmmsize, &req);
1073         if (rc < 0) {
1074                 CDEBUG(D_INFO, "mdc_getattr failed on inode "
1075                        "%lu/%u: rc %d\n", inode->i_ino,
1076                        inode->i_generation, rc);
1077                 GOTO(out, rc);
1078         }
1079         body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
1080                         sizeof(*body));
1081         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1082         /* swabbed by mdc_getattr_name */
1083         LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
1084
1085         lmmsize = body->eadatasize;
1086
1087         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1088             lmmsize == 0) {
1089                 GOTO(out, rc = -ENODATA);
1090         }
1091
1092         lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1093         LASSERT(lmm != NULL);
1094         LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
1095
1096         /*
1097          * This is coming from the MDS, so is probably in
1098          * little endian.  We convert it to host endian before
1099          * passing it to userspace.
1100          */
1101         /* We don't swab objects for directories */
1102         if (((le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1) ||
1103             (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3)) &&
1104             (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))) {
1105                 rc = lustre_swab_lov_user_md((struct lov_user_md*)lmm);
1106                 if (rc)
1107                         GOTO(out, rc);
1108         }
1109
1110 out:
1111         *lmmp = lmm;
1112         *lmm_size = lmmsize;
1113         *request = req;
1114         return rc;
1115 }
1116
1117 static int ll_dir_ioctl(struct inode *inode, struct file *file,
1118                         unsigned int cmd, unsigned long arg)
1119 {
1120         struct ll_sb_info *sbi = ll_i2sbi(inode);
1121         struct obd_ioctl_data *data;
1122         ENTRY;
1123
1124         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
1125                inode->i_ino, inode->i_generation, inode, cmd);
1126
1127         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1128         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1129                 return -ENOTTY;
1130
1131         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1132         switch(cmd) {
1133         case EXT3_IOC_GETFLAGS:
1134         case EXT3_IOC_SETFLAGS:
1135                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1136         case EXT3_IOC_GETVERSION_OLD:
1137         case EXT3_IOC_GETVERSION:
1138                 RETURN(put_user(inode->i_generation, (int *)arg));
1139         /* We need to special case any other ioctls we want to handle,
1140          * to send them to the MDS/OST as appropriate and to properly
1141          * network encode the arg field.
1142         case EXT3_IOC_SETVERSION_OLD:
1143         case EXT3_IOC_SETVERSION:
1144         */
1145         case IOC_MDC_LOOKUP: {
1146                 struct ptlrpc_request *request = NULL;
1147                 struct ll_fid fid;
1148                 char *buf = NULL;
1149                 char *filename;
1150                 int namelen, rc, len = 0;
1151
1152                 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1153                 if (rc)
1154                         RETURN(rc);
1155                 data = (void *)buf;
1156
1157                 filename = data->ioc_inlbuf1;
1158                 namelen = data->ioc_inllen1;
1159
1160                 if (namelen < 1) {
1161                         CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
1162                         GOTO(out, rc = -EINVAL);
1163                 }
1164
1165                 ll_inode2fid(&fid, inode);
1166                 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid, filename, namelen,
1167                                       OBD_MD_FLID, 0, &request);
1168                 if (rc < 0) {
1169                         CDEBUG(D_INFO, "mdc_getattr_name: %d\n", rc);
1170                         GOTO(out, rc);
1171                 }
1172
1173                 ptlrpc_req_finished(request);
1174
1175                 EXIT;
1176         out:
1177                 obd_ioctl_freedata(buf, len);
1178                 return rc;
1179         }
1180         case LL_IOC_LOV_SETSTRIPE: {
1181                 struct lov_user_md_v3 lumv3;
1182                 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1183                 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1184                 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1185
1186                 int rc = 0;
1187                 int set_default = 0;
1188
1189                 LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
1190                 LASSERT(sizeof(lumv3.lmm_objects[0]) ==
1191                         sizeof(lumv3p->lmm_objects[0]));
1192
1193                 /* first try with v1 which is smaller than v3 */
1194                 rc = copy_from_user(lumv1, lumv1p, sizeof(*lumv1));
1195                 if (rc)
1196                         return(-EFAULT);
1197
1198                 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1199                         rc = copy_from_user(&lumv3, lumv3p, sizeof(lumv3));
1200                         if (rc)
1201                                 RETURN(-EFAULT);
1202                 }
1203
1204                 if (inode->i_sb->s_root == file->f_dentry)
1205                         set_default = 1;
1206
1207                 /* in v1 and v3 cases lumv1 points to data */
1208                 rc = ll_dir_setstripe(inode, lumv1, set_default);
1209
1210                 return rc;
1211         }
1212         case LL_IOC_OBD_STATFS:
1213                 RETURN(ll_obd_statfs(inode, (void *)arg));
1214         case LL_IOC_LOV_GETSTRIPE:
1215         case LL_IOC_MDC_GETINFO:
1216         case IOC_MDC_GETFILEINFO:
1217         case IOC_MDC_GETFILESTRIPE: {
1218                 struct ptlrpc_request *request = NULL;
1219                 struct mds_body *body;
1220                 struct lov_user_md *lump;
1221                 struct lov_mds_md *lmm = NULL;
1222                 char *filename = NULL;
1223                 int rc, lmmsize;
1224
1225                 if (cmd == IOC_MDC_GETFILEINFO ||
1226                     cmd == IOC_MDC_GETFILESTRIPE) {
1227                         filename = getname((const char *)arg);
1228                         if (IS_ERR(filename))
1229                                 RETURN(PTR_ERR(filename));
1230
1231                         rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
1232                                                       &lmmsize, &request);
1233                 } else {
1234                         rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
1235                 }
1236
1237                 if (request) {
1238                         body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
1239                                               sizeof(*body));
1240                         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1241                         /* swabbed by mdc_getattr_name */
1242                         LASSERT(lustre_rep_swabbed(request, REPLY_REC_OFF));
1243                 } else {
1244                         GOTO(out_req, rc);
1245                 }
1246
1247                 if (rc < 0) {
1248                         if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
1249                                                cmd == LL_IOC_MDC_GETINFO))
1250                                 GOTO(skip_lmm, rc = 0);
1251                         else
1252                                 GOTO(out_req, rc);
1253                 }
1254
1255                 if (cmd == IOC_MDC_GETFILESTRIPE ||
1256                     cmd == LL_IOC_LOV_GETSTRIPE) {
1257                         lump = (struct lov_user_md *)arg;
1258                 } else {
1259                         struct lov_user_mds_data *lmdp;
1260                         lmdp = (struct lov_user_mds_data *)arg;
1261                         lump = &lmdp->lmd_lmm;
1262                 }
1263                 rc = copy_to_user(lump, lmm, lmmsize);
1264                 if (rc)
1265                         GOTO(out_lmm, rc = -EFAULT);
1266         skip_lmm:
1267                 if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
1268                         struct lov_user_mds_data *lmdp;
1269                         lstat_t st = { 0 };
1270
1271                         st.st_dev     = inode->i_sb->s_dev;
1272                         st.st_mode    = body->mode;
1273                         st.st_nlink   = body->nlink;
1274                         st.st_uid     = body->uid;
1275                         st.st_gid     = body->gid;
1276                         st.st_rdev    = body->rdev;
1277                         st.st_size    = body->size;
1278                         st.st_blksize = CFS_PAGE_SIZE;
1279                         st.st_blocks  = body->blocks;
1280                         st.st_atime   = body->atime;
1281                         st.st_mtime   = body->mtime;
1282                         st.st_ctime   = body->ctime;
1283                         st.st_ino     = body->ino;
1284
1285                         lmdp = (struct lov_user_mds_data *)arg;
1286                         rc = copy_to_user(&lmdp->lmd_st, &st, sizeof(st));
1287                         if (rc)
1288                                 GOTO(out_lmm, rc = -EFAULT);
1289                 }
1290
1291                 EXIT;
1292         out_lmm:
1293                 if (lmm && lmm->lmm_magic == LOV_MAGIC_JOIN)
1294                         OBD_FREE(lmm, lmmsize);
1295         out_req:
1296                 ptlrpc_req_finished(request);
1297                 if (filename)
1298                         putname(filename);
1299                 return rc;
1300         }
1301         case IOC_LOV_GETINFO: {
1302                 struct lov_user_mds_data *lumd;
1303                 struct lov_stripe_md *lsm;
1304                 struct lov_user_md *lum;
1305                 struct lov_mds_md *lmm;
1306                 int lmmsize;
1307                 lstat_t st;
1308                 int rc;
1309
1310                 lumd = (struct lov_user_mds_data *)arg;
1311                 lum = &lumd->lmd_lmm;
1312
1313                 rc = ll_get_max_mdsize(sbi, &lmmsize);
1314                 if (rc)
1315                         RETURN(rc);
1316
1317                 OBD_ALLOC(lmm, lmmsize);
1318                 rc = copy_from_user(lmm, lum, lmmsize);
1319                 if (rc)
1320                         GOTO(free_lmm, rc = -EFAULT);
1321
1322                 if (LOV_USER_MAGIC != cpu_to_le32(LOV_USER_MAGIC)) {
1323                         rc = lustre_swab_lov_user_md(
1324                                                 (struct lov_user_md_v1 *)lmm);
1325                         if (rc) 
1326                                 GOTO(free_lmm, rc);
1327                         rc = lustre_swab_lov_user_md_objects(
1328                                                 (struct lov_user_md*)lmm);
1329                         if (rc) 
1330                                 GOTO(free_lmm, rc);
1331                 }
1332
1333                 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
1334                 if (rc < 0)
1335                         GOTO(free_lmm, rc = -ENOMEM);
1336
1337                 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
1338                 if (rc)
1339                         GOTO(free_lsm, rc);
1340
1341                 /* Perform glimpse_size operation. */
1342                 memset(&st, 0, sizeof(st));
1343
1344                 rc = ll_glimpse_ioctl(sbi, lsm, &st);
1345                 if (rc)
1346                         GOTO(free_lsm, rc);
1347
1348                 rc = copy_to_user(&lumd->lmd_st, &st, sizeof(st));
1349                 if (rc)
1350                         GOTO(free_lsm, rc = -EFAULT);
1351
1352                 EXIT;
1353         free_lsm:
1354                 obd_free_memmd(sbi->ll_osc_exp, &lsm);
1355         free_lmm:
1356                 OBD_FREE(lmm, lmmsize);
1357                 return rc;
1358         }
1359         case OBD_IOC_LLOG_CATINFO: {
1360                 struct ptlrpc_request *req = NULL;
1361                 char *buf = NULL;
1362                 int rc, len = 0;
1363                 char *bufs[3] = { NULL }, *str;
1364                 int lens[3] = { sizeof(struct ptlrpc_body) };
1365                 int size[2] = { sizeof(struct ptlrpc_body) };
1366
1367                 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1368                 if (rc)
1369                         RETURN(rc);
1370                 data = (void *)buf;
1371
1372                 if (!data->ioc_inlbuf1) {
1373                         obd_ioctl_freedata(buf, len);
1374                         RETURN(-EINVAL);
1375                 }
1376
1377                 lens[REQ_REC_OFF] = data->ioc_inllen1;
1378                 bufs[REQ_REC_OFF] = data->ioc_inlbuf1;
1379                 if (data->ioc_inllen2) {
1380                         lens[REQ_REC_OFF + 1] = data->ioc_inllen2;
1381                         bufs[REQ_REC_OFF + 1] = data->ioc_inlbuf2;
1382                 } else {
1383                         lens[REQ_REC_OFF + 1] = 0;
1384                         bufs[REQ_REC_OFF + 1] = NULL;
1385                 }
1386
1387                 req = ptlrpc_prep_req(sbi2mdc(sbi)->cl_import,
1388                                       LUSTRE_LOG_VERSION, LLOG_CATINFO, 3, lens,
1389                                       bufs);
1390                 if (!req)
1391                         GOTO(out_catinfo, rc = -ENOMEM);
1392
1393                 size[REPLY_REC_OFF] = data->ioc_plen1;
1394                 ptlrpc_req_set_repsize(req, 2, size);
1395
1396                 rc = ptlrpc_queue_wait(req);
1397                 str = lustre_msg_string(req->rq_repmsg, REPLY_REC_OFF,
1398                                         data->ioc_plen1);
1399                 if (!rc)
1400                         rc = copy_to_user(data->ioc_pbuf1, str,data->ioc_plen1);
1401                 ptlrpc_req_finished(req);
1402         out_catinfo:
1403                 obd_ioctl_freedata(buf, len);
1404                 RETURN(rc);
1405         }
1406         case OBD_IOC_QUOTACHECK: {
1407                 struct obd_quotactl *oqctl;
1408                 int rc, error = 0;
1409
1410                 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1411                         RETURN(-EPERM);
1412
1413                 OBD_ALLOC_PTR(oqctl);
1414                 if (!oqctl)
1415                         RETURN(-ENOMEM);
1416                 oqctl->qc_type = arg;
1417                 rc = obd_quotacheck(sbi->ll_mdc_exp, oqctl);
1418                 if (rc < 0) {
1419                         CDEBUG(D_INFO, "mdc_quotacheck failed: rc %d\n", rc);
1420                         error = rc;
1421                 }
1422
1423                 rc = obd_quotacheck(sbi->ll_osc_exp, oqctl);
1424                 if (rc < 0)
1425                         CDEBUG(D_INFO, "osc_quotacheck failed: rc %d\n", rc);
1426
1427                 OBD_FREE_PTR(oqctl);
1428                 return error ?: rc;
1429         }
1430         case OBD_IOC_POLL_QUOTACHECK: {
1431                 struct if_quotacheck *check;
1432                 int rc;
1433
1434                 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1435                         RETURN(-EPERM);
1436
1437                 OBD_ALLOC_PTR(check);
1438                 if (!check)
1439                         RETURN(-ENOMEM);
1440
1441                 rc = obd_iocontrol(cmd, sbi->ll_mdc_exp, 0, (void *)check,
1442                                    NULL);
1443                 if (rc) {
1444                         CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
1445                         if (copy_to_user((void *)arg, check, sizeof(*check)))
1446                                 rc = -EFAULT;
1447                         GOTO(out_poll, rc);
1448                 }
1449
1450                 rc = obd_iocontrol(cmd, sbi->ll_osc_exp, 0, (void *)check,
1451                                    NULL);
1452                 if (rc) {
1453                         CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
1454                         if (copy_to_user((void *)arg, check, sizeof(*check)))
1455                                 rc = -EFAULT;
1456                         GOTO(out_poll, rc);
1457                 }
1458         out_poll:
1459                 OBD_FREE_PTR(check);
1460                 RETURN(rc);
1461         }
1462         case OBD_IOC_QUOTACTL: {
1463                 struct if_quotactl *qctl;
1464                 struct obd_quotactl *oqctl;
1465
1466                 int cmd, type, id, rc = 0;
1467
1468                 OBD_ALLOC_PTR(qctl);
1469                 if (!qctl)
1470                         RETURN(-ENOMEM);
1471
1472                 OBD_ALLOC_PTR(oqctl);
1473                 if (!oqctl) {
1474                         OBD_FREE_PTR(qctl);
1475                         RETURN(-ENOMEM);
1476                 }
1477                 if (copy_from_user(qctl, (void *)arg, sizeof(*qctl)))
1478                         GOTO(out_quotactl, rc = -EFAULT);
1479
1480                 cmd = qctl->qc_cmd;
1481                 type = qctl->qc_type;
1482                 id = qctl->qc_id;
1483                 switch (cmd) {
1484                 case LUSTRE_Q_INVALIDATE:
1485                 case LUSTRE_Q_FINVALIDATE:
1486                 case Q_QUOTAON:
1487                 case Q_QUOTAOFF:
1488                 case Q_SETQUOTA:
1489                 case Q_SETINFO:
1490                         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1491                                 GOTO(out_quotactl, rc = -EPERM);
1492                         break;
1493                 case Q_GETQUOTA:
1494                         if (((type == USRQUOTA && current->euid != id) ||
1495                              (type == GRPQUOTA && !in_egroup_p(id))) &&
1496                             !cfs_capable(CFS_CAP_SYS_ADMIN))
1497                                 GOTO(out_quotactl, rc = -EPERM);
1498
1499                         /* XXX: dqb_valid is borrowed as a flag to mark that
1500                          *      only mds quota is wanted */
1501                         if (qctl->qc_dqblk.dqb_valid) {
1502                                 qctl->obd_uuid = sbi->ll_mdc_exp->exp_obd->
1503                                                         u.cli.cl_target_uuid;
1504                                 qctl->qc_dqblk.dqb_valid = 0;
1505                         }
1506
1507                         break;
1508                 case Q_GETINFO:
1509                         break;
1510                 default:
1511                         CERROR("unsupported quotactl op: %#x\n", cmd);
1512                         GOTO(out_quotactl, -ENOTTY);
1513                 }
1514
1515                 QCTL_COPY(oqctl, qctl);
1516
1517                 if (qctl->obd_uuid.uuid[0]) {
1518                         struct obd_device *obd;
1519                         struct obd_uuid *uuid = &qctl->obd_uuid;
1520
1521                         obd = class_find_client_notype(uuid,
1522                                          &sbi->ll_osc_exp->exp_obd->obd_uuid);
1523                         if (!obd)
1524                                 GOTO(out_quotactl, rc = -ENOENT);
1525
1526                         if (cmd == Q_GETINFO)
1527                                 oqctl->qc_cmd = Q_GETOINFO;
1528                         else if (cmd == Q_GETQUOTA)
1529                                 oqctl->qc_cmd = Q_GETOQUOTA;
1530                         else
1531                                 GOTO(out_quotactl, rc = -EINVAL);
1532
1533                         if (sbi->ll_mdc_exp->exp_obd == obd) {
1534                                 rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
1535                         } else {
1536                                 int i;
1537                                 struct obd_export *exp;
1538                                 struct lov_obd *lov = &sbi->ll_osc_exp->
1539                                                             exp_obd->u.lov;
1540
1541                                 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1542                                         if (!lov->lov_tgts[i] ||
1543                                             !lov->lov_tgts[i]->ltd_active)
1544                                                 continue;
1545                                         exp = lov->lov_tgts[i]->ltd_exp;
1546                                         if (exp->exp_obd == obd) {
1547                                                 rc = obd_quotactl(exp, oqctl);
1548                                                 break;
1549                                         }
1550                                 }
1551                         }
1552
1553                         oqctl->qc_cmd = cmd;
1554                         QCTL_COPY(qctl, oqctl);
1555
1556                         if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1557                                 rc = -EFAULT;
1558
1559                         GOTO(out_quotactl, rc);
1560                 }
1561
1562                 rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
1563                 if (rc && rc != -EBUSY && cmd == Q_QUOTAON) {
1564                         oqctl->qc_cmd = Q_QUOTAOFF;
1565                         obd_quotactl(sbi->ll_mdc_exp, oqctl);
1566                 }
1567
1568                 QCTL_COPY(qctl, oqctl);
1569
1570                 if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1571                         rc = -EFAULT;
1572         out_quotactl:
1573                 OBD_FREE_PTR(qctl);
1574                 OBD_FREE_PTR(oqctl);
1575                 RETURN(rc);
1576         }
1577         case OBD_IOC_GETNAME_OLD:
1578         case OBD_IOC_GETNAME: {
1579                 struct obd_device *obd = class_exp2obd(sbi->ll_osc_exp);
1580                 if (!obd)
1581                         RETURN(-EFAULT);
1582                 if (copy_to_user((void *)arg, obd->obd_name,
1583                                 strlen(obd->obd_name) + 1))
1584                         RETURN (-EFAULT);
1585                 RETURN(0);
1586         }
1587         default:
1588                 RETURN(obd_iocontrol(cmd, sbi->ll_osc_exp,0,NULL,(void *)arg));
1589         }
1590 }
1591
1592 struct file_operations ll_dir_operations = {
1593         .open     = ll_file_open,
1594         .release  = ll_file_release,
1595         .read     = generic_read_dir,
1596         .readdir  = ll_readdir,
1597         .ioctl    = ll_dir_ioctl
1598 };