lustre/llite/dir.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  * GPL HEADER START
   5  *
   6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License version 2 only,
  10  * as published by the Free Software Foundation.
  11  *
  12  * This program is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * General Public License version 2 for more details (a copy is included
  16  * in the LICENSE file that accompanied this code).
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * version 2 along with this program; If not, see
  20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  21  *
  22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  23  * CA 95054 USA or visit www.sun.com if you need additional information or
  24  * have any questions.
  25  *
  26  * GPL HEADER END
  27  */
  28 /*
  29  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  30  * Use is subject to license terms.
  31  */
  32 /*
  33  * This file is part of Lustre, http://www.lustre.org/
  34  * Lustre is a trademark of Sun Microsystems, Inc.
  35  *
  36  * lustre/llite/dir.c
  37  *
  38  * Directory code for lustre client.
  39  */
  40
  41 #include <linux/fs.h>
  42 #include <linux/pagemap.h>
  43 #include <linux/mm.h>
  44 #include <linux/version.h>
  45 #include <linux/smp_lock.h>
  46 #include <asm/uaccess.h>
  47 #include <linux/buffer_head.h>   // for wait_on_buffer
  48
  49 #define DEBUG_SUBSYSTEM S_LLITE
  50
  51 #include <obd_support.h>
  52 #include <obd_class.h>
  53 #include <lustre_lib.h>
  54 #include <lustre/lustre_idl.h>
  55 #include <lustre_lite.h>
  56 #include <lustre_dlm.h>
  57 #include "llite_internal.h"
  58
  59 #ifndef HAVE_PAGE_CHECKED
  60 #ifdef HAVE_PG_FS_MISC
  61 #define PageChecked(page)        test_bit(PG_fs_misc, &(page)->flags)
  62 #define SetPageChecked(page)     set_bit(PG_fs_misc, &(page)->flags)
  63 #else
  64 #error PageChecked or PageFsMisc not defined in kernel
  65 #endif
  66 #endif
  67
  68 /* returns the page unlocked, but with a reference */
  69 static int ll_dir_readpage(struct file *file, struct page *page)
  70 {
  71         struct inode *inode = page->mapping->host;
  72         struct ll_fid mdc_fid;
  73         __u64 offset;
  74         struct ptlrpc_request *request;
  75         struct mds_body *body;
  76         int rc = 0;
  77         ENTRY;
  78
  79         offset = (__u64)page->index << CFS_PAGE_SHIFT;
  80         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off "LPU64"\n",
  81                inode->i_ino, inode->i_generation, inode, offset);
  82
  83         ll_pack_fid(&mdc_fid, inode->i_ino, inode->i_generation, S_IFDIR);
  84
  85         rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &mdc_fid,
  86                           offset, page, &request);
  87         if (!rc) {
  88                 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
  89                                       sizeof(*body));
  90                 LASSERT(body != NULL); /* checked by mdc_readpage() */
  91                 /* swabbed by mdc_readpage() */
  92                 LASSERT(lustre_rep_swabbed(request, REPLY_REC_OFF));
  93
  94                 if (body->size != i_size_read(inode)) {
  95                         ll_inode_size_lock(inode, 0);
  96                         i_size_write(inode, body->size);
  97                         ll_inode_size_unlock(inode, 0);
  98                 }
  99
 100                 SetPageUptodate(page);
 101         }
 102         ptlrpc_req_finished(request);
 103
 104         unlock_page(page);
 105         EXIT;
 106         return rc;
 107 }
 108
 109 struct address_space_operations ll_dir_aops = {
 110         .readpage  = ll_dir_readpage,
 111 };
 112
 113 static inline unsigned ll_dir_page_mask(struct inode *inode)
 114 {
 115         return ~(inode->i_sb->s_blocksize - 1);
 116 }
 117
 118 /*
 119  * Check consistency of a single entry.
 120  */
 121 static int ll_dir_check_entry(struct inode *dir, struct ll_dir_entry *ent,
 122                               unsigned offset, unsigned rec_len, pgoff_t index)
 123 {
 124         const char *msg;
 125
 126         /*
 127          * Consider adding more checks.
 128          */
 129
 130         if (unlikely(rec_len < ll_dir_rec_len(1)))
 131                 msg = "entry is too short";
 132         else if (unlikely(rec_len & 3))
 133                 msg = "wrong alignment";
 134         else if (unlikely(rec_len < ll_dir_rec_len(ent->lde_name_len)))
 135                 msg = "rec_len doesn't match name_len";
 136         else if (unlikely(((offset + rec_len - 1) ^ offset) &
 137                           ll_dir_page_mask(dir)))
 138                 msg = "directory entry across blocks";
 139         else
 140                 return 0;
 141         CERROR("%s: bad entry in directory %lu/%u: %s - "
 142                "offset=%lu+%u, inode=%lu, rec_len=%d,"
 143                " name_len=%d\n", ll_i2mdcexp(dir)->exp_obd->obd_name,
 144                dir->i_ino, dir->i_generation, msg,
 145                index << CFS_PAGE_SHIFT,
 146                offset, (unsigned long)le32_to_cpu(ent->lde_inode),
 147                rec_len, ent->lde_name_len);
 148         return -EIO;
 149 }
 150
 151 static void ll_dir_check_page(struct inode *dir, struct page *page)
 152 {
 153         int      err;
 154         unsigned size = dir->i_sb->s_blocksize;
 155         char    *addr = page_address(page);
 156         unsigned off;
 157         unsigned limit;
 158         unsigned reclen;
 159
 160         struct ll_dir_entry *ent;
 161
 162         err = 0;
 163         if ((i_size_read(dir) >> CFS_PAGE_SHIFT) == (__u64)page->index) {
 164                 /*
 165                  * Last page.
 166                  */
 167                 limit = i_size_read(dir) & ~CFS_PAGE_MASK;
 168                 if (limit & (size - 1)) {
 169                         CERROR("%s: dir %lu/%u size %llu doesn't match %u\n",
 170                                ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
 171                                dir->i_generation, i_size_read(dir), size);
 172                         err++;
 173                 } else {
 174                         /*
 175                          * Place dummy forwarding entries to streamline
 176                          * ll_readdir().
 177                          */
 178                         for (off = limit; off < CFS_PAGE_SIZE; off += size) {
 179                                 ent = ll_entry_at(addr, off);
 180                                 ent->lde_rec_len = cpu_to_le16(size);
 181                                 ent->lde_name_len = 0;
 182                                 ent->lde_inode = 0;
 183                         }
 184                 }
 185         } else
 186                 limit = CFS_PAGE_SIZE;
 187
 188         for (off = 0;
 189              !err && off <= limit - ll_dir_rec_len(1); off += reclen) {
 190                 ent    = ll_entry_at(addr, off);
 191                 reclen = le16_to_cpu(ent->lde_rec_len);
 192                 err    = ll_dir_check_entry(dir, ent, off, reclen, page->index);
 193         }
 194
 195         if (!err && off != limit) {
 196                 ent = ll_entry_at(addr, off);
 197                 CERROR("%s: entry in directory %lu/%u spans the page boundary "
 198                        "offset="LPU64"+%u, inode=%lu\n",
 199                        ll_i2mdcexp(dir)->exp_obd->obd_name,
 200                        dir->i_ino, dir->i_generation,
 201                        (__u64)page->index << CFS_PAGE_SHIFT,
 202                        off, (unsigned long)le32_to_cpu(ent->lde_inode));
 203                 err++;
 204         }
 205         if (err)
 206                 SetPageError(page);
 207         SetPageChecked(page);
 208 }
 209
 210 struct page *ll_get_dir_page(struct inode *dir, unsigned long n)
 211 {
 212         struct ldlm_res_id res_id;
 213         struct lustre_handle lockh;
 214         struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
 215         struct address_space *mapping = dir->i_mapping;
 216         struct page *page;
 217         ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
 218         int rc;
 219
 220         fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
 221         rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
 222                              &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh);
 223         if (!rc) {
 224                 struct lookup_intent it = { .it_op = IT_READDIR };
 225                 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CR,
 226                        ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
 227                 struct ptlrpc_request *request;
 228                 struct mdc_op_data data = { { 0 } };
 229
 230                 ll_prepare_mdc_op_data(&data, dir, NULL, NULL, 0, 0, NULL);
 231
 232                 rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
 233                                  &data, &lockh, NULL, 0, 0);
 234
 235                 request = (struct ptlrpc_request *)it.d.lustre.it_data;
 236                 if (request)
 237                         ptlrpc_req_finished(request);
 238                 if (rc < 0) {
 239                         CERROR("lock enqueue: rc: %d\n", rc);
 240                         return ERR_PTR(rc);
 241                 }
 242         }
 243         ldlm_lock_dump_handle(D_OTHER, &lockh);
 244
 245         page = read_cache_page(mapping, n,
 246                                (filler_t*)mapping->a_ops->readpage, NULL);
 247         if (IS_ERR(page))
 248                 GOTO(out_unlock, page);
 249
 250         wait_on_page(page);
 251         (void)kmap(page);
 252         if (!PageUptodate(page))
 253                 goto fail;
 254         if (!PageChecked(page))
 255                 ll_dir_check_page(dir, page);
 256         if (PageError(page))
 257                 goto fail;
 258
 259 out_unlock:
 260         ldlm_lock_decref(&lockh, LCK_CR);
 261         return page;
 262
 263 fail:
 264         ll_put_page(page);
 265         page = ERR_PTR(-EIO);
 266         goto out_unlock;
 267 }
 268
 269 static inline unsigned ll_dir_validate_entry(char *base, unsigned offset,
 270                                              unsigned mask)
 271 {
 272         struct ll_dir_entry *de = ll_entry_at(base, offset);
 273         struct ll_dir_entry *p  = ll_entry_at(base, offset & mask);
 274         while (p < de && p->lde_rec_len > 0)
 275                 p = ll_dir_next_entry(p);
 276         return (char *)p - base;
 277 }
 278
 279 /*
 280  * File type constants. The same as in ext2 for compatibility.
 281  */
 282
 283 enum {
 284         LL_DIR_FT_UNKNOWN,
 285         LL_DIR_FT_REG_FILE,
 286         LL_DIR_FT_DIR,
 287         LL_DIR_FT_CHRDEV,
 288         LL_DIR_FT_BLKDEV,
 289         LL_DIR_FT_FIFO,
 290         LL_DIR_FT_SOCK,
 291         LL_DIR_FT_SYMLINK,
 292         LL_DIR_FT_MAX
 293 };
 294
 295 static unsigned char ll_dir_filetype_table[LL_DIR_FT_MAX] = {
 296         [LL_DIR_FT_UNKNOWN]  = DT_UNKNOWN,
 297         [LL_DIR_FT_REG_FILE] = DT_REG,
 298         [LL_DIR_FT_DIR]      = DT_DIR,
 299         [LL_DIR_FT_CHRDEV]   = DT_CHR,
 300         [LL_DIR_FT_BLKDEV]   = DT_BLK,
 301         [LL_DIR_FT_FIFO]     = DT_FIFO,
 302         [LL_DIR_FT_SOCK]     = DT_SOCK,
 303         [LL_DIR_FT_SYMLINK]  = DT_LNK,
 304 };
 305
 306 /*
 307  * Process one page. Returns:
 308  *
 309  *     -ve: filldir commands readdir to stop.
 310  *     +ve: number of entries submitted to filldir.
 311  *       0: no live entries on this page.
 312  */
 313
 314 static int ll_readdir_page(char *addr, __u64 base, unsigned *offset,
 315                            filldir_t filldir, void *cookie)
 316 {
 317         struct ll_dir_entry *de;
 318         char *end;
 319         int nr;
 320
 321         de = ll_entry_at(addr, *offset);
 322         end = addr + CFS_PAGE_SIZE - ll_dir_rec_len(1);
 323         for (nr = 0 ;(char*)de <= end; de = ll_dir_next_entry(de)) {
 324                 if (de->lde_inode != 0) {
 325                         nr++;
 326                         *offset = (char *)de - addr;
 327                         if (filldir(cookie, de->lde_name, de->lde_name_len,
 328                                     base | *offset, le32_to_cpu(de->lde_inode),
 329                                     ll_dir_filetype_table[de->lde_file_type &
 330                                                           (LL_DIR_FT_MAX - 1)]))
 331                                 return -1;
 332                 }
 333         }
 334         return nr;
 335 }
 336
 337 static int ll_readdir_18(struct file *filp, void *dirent, filldir_t filldir)
 338 {
 339         struct inode *inode = filp->f_dentry->d_inode;
 340         loff_t pos          = filp->f_pos;
 341         unsigned offset     = pos & ~CFS_PAGE_MASK;
 342         pgoff_t idx         = pos >> CFS_PAGE_SHIFT;
 343         pgoff_t npages      = dir_pages(inode);
 344         unsigned chunk_mask = ll_dir_page_mask(inode);
 345         int need_revalidate = (filp->f_version != inode->i_version);
 346         int rc              = 0;
 347         int done; /* when this becomes negative --- stop iterating */
 348
 349         ENTRY;
 350
 351         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %llu/%llu\n",
 352                inode->i_ino, inode->i_generation, inode,
 353                pos, i_size_read(inode));
 354
 355         /*
 356          * Checking ->i_size without the lock. Should be harmless, as server
 357          * re-checks.
 358          */
 359         if (pos > i_size_read(inode) - ll_dir_rec_len(1))
 360                 RETURN(0);
 361
 362         for (done = 0; idx < npages; idx++, offset = 0) {
 363                 /*
 364                  * We can assume that all blocks on this page are filled with
 365                  * entries, because ll_dir_check_page() placed special dummy
 366                  * entries for us.
 367                  */
 368
 369                 char *kaddr;
 370                 struct page *page;
 371
 372                 CDEBUG(D_EXT2,"read %lu of dir %lu/%u page %lu/%lu "
 373                        "size %llu\n",
 374                        CFS_PAGE_SIZE, inode->i_ino, inode->i_generation,
 375                        idx, npages, i_size_read(inode));
 376                 page = ll_get_dir_page(inode, idx);
 377
 378                 /* size might have been updated by mdc_readpage */
 379                 npages = dir_pages(inode);
 380
 381                 if (IS_ERR(page)) {
 382                         rc = PTR_ERR(page);
 383                         CERROR("error reading dir %lu/%u page %lu: rc %d\n",
 384                                inode->i_ino, inode->i_generation, idx, rc);
 385                         continue;
 386                 }
 387
 388                 kaddr = page_address(page);
 389                 if (need_revalidate) {
 390                         /*
 391                          * File offset was changed by lseek() and possibly
 392                          * points in the middle of an entry. Re-scan from the
 393                          * beginning of the chunk.
 394                          */
 395                         offset = ll_dir_validate_entry(kaddr, offset,
 396                                                        chunk_mask);
 397                         need_revalidate = 0;
 398                 }
 399                 done = ll_readdir_page(kaddr, idx << CFS_PAGE_SHIFT,
 400                                        &offset, filldir, dirent);
 401                 ll_put_page(page);
 402                 if (done > 0)
 403                         /*
 404                          * Some entries were sent to the user space, return
 405                          * success.
 406                          */
 407                         rc = 0;
 408                 else if (done < 0)
 409                         /*
 410                          * filldir is satisfied.
 411                          */
 412                         break;
 413         }
 414
 415         filp->f_pos = (idx << CFS_PAGE_SHIFT) | offset;
 416         filp->f_version = inode->i_version;
 417         touch_atime(filp->f_vfsmnt, filp->f_dentry);
 418
 419         RETURN(rc);
 420 }
 421
 422 /*
 423  * Chain of hash overflow pages.
 424  */
 425 struct ll_dir_chain {
 426         /* XXX something. Later */
 427 };
 428
 429 static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
 430 {
 431 }
 432
 433 static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
 434 {
 435 }
 436
 437 static inline __u32 hash_x_index(__u32 value)
 438 {
 439         return ((__u32)~0) - value;
 440 }
 441
 442 /**
 443  * Layout of readdir pages, as transmitted on wire.
 444  */
 445 struct lu_dirent {
 446         /** valid if LUDA_FID is set. */
 447         struct lu_fid lde_fid;
 448         /** a unique entry identifier: a hash or an offset. */
 449         __u64         lde_hash;
 450         /** total record length, including all attributes. */
 451         __u16         lde_reclen;
 452         /** name length */
 453         __u16         lde_namelen;
 454         /** optional variable size attributes following this entry.
 455          *  taken from enum lu_dirent_attrs.
 456          */
 457         __u32         lde_attrs;
 458         /** name is followed by the attributes indicated in ->ldp_attrs, in
 459          *  their natural order. After the last attribute, padding bytes are
 460          *  added to make ->lde_reclen a multiple of 8.
 461          */
 462         char          lde_name[0];
 463 };
 464
 465 struct lu_dirpage {
 466         __u64            ldp_hash_start;
 467         __u64            ldp_hash_end;
 468         __u16            ldp_flags;
 469         __u16            ldp_pad0;
 470         __u32            ldp_pad1;
 471         struct lu_dirent ldp_entries[0];
 472 };
 473
 474 /*
 475  * Definitions of optional directory entry attributes formats.
 476  *
 477  * Individual attributes do not have their length encoded in a generic way. It
 478  * is assumed that consumer of an attribute knows its format. This means that
 479  * it is impossible to skip over an unknown attribute, except by skipping over all
 480  * remaining attributes (by using ->lde_reclen), which is not too
 481  * constraining, because new server versions will append new attributes at
 482  * the end of an entry.
 483  */
 484
 485 /**
 486  * Fid directory attribute: a fid of an object referenced by the entry. This
 487  * will be almost always requested by the client and supplied by the server.
 488  *
 489  * Aligned to 8 bytes.
 490  */
 491 /* To have compatibility with 1.8, lets have fid in lu_dirent struct. */
 492
 493 /**
 494  * File type.
 495  *
 496  * Aligned to 2 bytes.
 497  */
 498 struct luda_type {
 499         __u16 lt_type;
 500 };
 501
 502 enum lu_dirpage_flags {
 503         LDF_EMPTY = 1 << 0
 504 };
 505
 506 static inline int lu_dirent_calc_size(int namelen, __u16 attr)
 507 {
 508         int size;
 509
 510         if (attr & LUDA_TYPE) {
 511                 const unsigned align = sizeof(struct luda_type) - 1;
 512                 size = (sizeof(struct lu_dirent) + namelen + align) & ~align;
 513                 size += sizeof(struct luda_type);
 514         } else
 515                 size = sizeof(struct lu_dirent) + namelen;
 516
 517         return (size + 7) & ~7;
 518 }
 519
 520 /**
 521  * return IF_* type for given lu_dirent entry.
 522  * IF_* flag shld be converted to particular OS file type in
 523  * platform llite module.
 524  */
 525 __u16 ll_dirent_type_get(struct lu_dirent *ent)
 526 {
 527         __u16 type = 0;
 528         struct luda_type *lt;
 529         int len = 0;
 530
 531         if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) {
 532                 const unsigned align = sizeof(struct luda_type) - 1;
 533
 534                 len = le16_to_cpu(ent->lde_namelen);
 535                 len = (len + align) & ~align;
 536                 lt = (void *) ent->lde_name + len;
 537                 type = CFS_IFTODT(le16_to_cpu(lt->lt_type));
 538         }
 539         return type;
 540 }
 541
 542 static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
 543 {
 544         if (le16_to_cpu(dp->ldp_flags) & LDF_EMPTY)
 545                 return NULL;
 546         else
 547                 return dp->ldp_entries;
 548 }
 549
 550 static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
 551 {
 552         struct lu_dirent *next;
 553
 554         if (le16_to_cpu(ent->lde_reclen) != 0)
 555                 next = ((void *)ent) + le16_to_cpu(ent->lde_reclen);
 556         else
 557                 next = NULL;
 558
 559         return next;
 560 }
 561
 562 static inline int lu_dirent_size(struct lu_dirent *ent)
 563 {
 564         if (le16_to_cpu(ent->lde_reclen) == 0) {
 565                 return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen),
 566                                            le32_to_cpu(ent->lde_attrs));
 567         }
 568         return le16_to_cpu(ent->lde_reclen);
 569 }
 570
 571 #define DIR_END_OFF              0xfffffffffffffffeULL
 572
 573 #ifdef HAVE_RW_TREE_LOCK
 574 #define TREE_READ_LOCK_IRQ(mapping)     read_lock_irq(&(mapping)->tree_lock)
 575 #define TREE_READ_UNLOCK_IRQ(mapping) read_unlock_irq(&(mapping)->tree_lock)
 576 #else
 577 #define TREE_READ_LOCK_IRQ(mapping) spin_lock_irq(&(mapping)->tree_lock)
 578 #define TREE_READ_UNLOCK_IRQ(mapping) spin_unlock_irq(&(mapping)->tree_lock)
 579 #endif
 580
 581 /* returns the page unlocked, but with a reference */
 582 static int ll_dir_readpage_20(struct file *file, struct page *page)
 583 {
 584         struct inode *inode = page->mapping->host;
 585         struct ptlrpc_request *request;
 586         struct mdt_body *body;
 587         struct ll_fid fid;
 588         __u64 hash;
 589         int rc;
 590         ENTRY;
 591
 592         hash = hash_x_index(page->index);
 593         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off %lu\n",
 594                inode->i_ino, inode->i_generation, inode, (unsigned long)hash);
 595
 596         ll_inode2fid(&fid, inode);
 597         rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &fid,
 598                           hash, page, &request);
 599         if (!rc) {
 600                 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
 601                                       sizeof(*body));
 602                 /* Checked by mdc_readpage() */
 603                 LASSERT(body != NULL);
 604
 605                 if (body->valid & OBD_MD_FLSIZE) {
 606                         ll_inode_size_lock(inode, 0);
 607                         i_size_write(inode, body->size);
 608                         ll_inode_size_unlock(inode, 0);
 609                 }
 610                 SetPageUptodate(page);
 611         }
 612         ptlrpc_req_finished(request);
 613
 614         unlock_page(page);
 615         EXIT;
 616         return rc;
 617 }
 618
 619
 620 static void ll_check_page(struct inode *dir, struct page *page)
 621 {
 622         /* XXX: check page format later */
 623         SetPageChecked(page);
 624 }
 625
 626
 627 /*
 628  * Find, kmap and return page that contains given hash.
 629  */
 630 static struct page *ll_dir_page_locate(struct inode *dir, unsigned long hash,
 631                                        __u64 *start, __u64 *end)
 632 {
 633         struct address_space *mapping = dir->i_mapping;
 634         /*
 635          * Complement of hash is used as an index so that
 636          * radix_tree_gang_lookup() can be used to find a page with starting
 637          * hash _smaller_ than one we are looking for.
 638          */
 639         unsigned long offset = hash_x_index(hash);
 640         struct page *page;
 641         int found;
 642         ENTRY;
 643
 644         TREE_READ_LOCK_IRQ(mapping);
 645         found = radix_tree_gang_lookup(&mapping->page_tree,
 646                                        (void **)&page, offset, 1);
 647         if (found > 0) {
 648                 struct lu_dirpage *dp;
 649
 650                 page_cache_get(page);
 651                 TREE_READ_UNLOCK_IRQ(mapping);
 652                 /*
 653                  * In contrast to find_lock_page() we are sure that directory
 654                  * page cannot be truncated (while DLM lock is held) and,
 655                  * hence, can avoid restart.
 656                  *
 657                  * In fact, page cannot be locked here at all, because
 658                  * ll_dir_readpage() does synchronous io.
 659                  */
 660                 wait_on_page(page);
 661                 if (PageUptodate(page)) {
 662                         dp = kmap(page);
 663                         *start = le64_to_cpu(dp->ldp_hash_start);
 664                         *end   = le64_to_cpu(dp->ldp_hash_end);
 665                         LASSERT(*start <= hash);
 666                         if (hash > *end || (*end != *start && hash == *end)) {
 667                                 kunmap(page);
 668                                 lock_page(page);
 669                                 ll_truncate_complete_page(page);
 670                                 unlock_page(page);
 671                                 page_cache_release(page);
 672                                 page = NULL;
 673                         }
 674                 } else {
 675                         page_cache_release(page);
 676                         page = ERR_PTR(-EIO);
 677                 }
 678
 679         } else {
 680                 TREE_READ_UNLOCK_IRQ(mapping);
 681                 page = NULL;
 682         }
 683         RETURN(page);
 684 }
 685
 686 static struct page *ll_get_dir_page_20(struct inode *dir, __u64 hash, int exact,
 687                                        struct ll_dir_chain *chain)
 688 {
 689         struct ldlm_res_id res_id;
 690         struct lustre_handle lockh;
 691         struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
 692         struct address_space *mapping = dir->i_mapping;
 693         struct lu_dirpage *dp;
 694         struct page *page;
 695         ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
 696         ldlm_mode_t mode;
 697         int rc;
 698         __u64 start = 0;
 699         __u64 end = 0;
 700         ENTRY;
 701
 702         fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
 703         mode = LCK_PR;
 704         rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
 705                              &res_id, LDLM_IBITS, &policy, mode, &lockh);
 706         if (!rc) {
 707                 struct lookup_intent it = { .it_op = IT_READDIR };
 708                 struct ldlm_enqueue_info einfo = { LDLM_IBITS, mode,
 709                        ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
 710                 struct ptlrpc_request *request;
 711                 struct mdc_op_data op_data = { { 0 } };
 712
 713                 ll_prepare_mdc_op_data(&op_data, dir, NULL, NULL, 0, 0, NULL);
 714
 715                 rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
 716                                  &op_data, &lockh, NULL, 0, 0);
 717
 718                 request = (struct ptlrpc_request *)it.d.lustre.it_data;
 719                 if (request)
 720                         ptlrpc_req_finished(request);
 721                 if (rc < 0) {
 722                         CERROR("lock enqueue: rc: %d\n", rc);
 723                         RETURN(ERR_PTR(rc));
 724                 }
 725         }
 726         ldlm_lock_dump_handle(D_OTHER, &lockh);
 727
 728         page = ll_dir_page_locate(dir, hash, &start, &end);
 729         if (IS_ERR(page))
 730                 GOTO(out_unlock, page);
 731
 732         if (page != NULL) {
 733                 /*
 734                  * XXX nikita: not entirely correct handling of a corner case:
 735                  * suppose hash chain of entries with hash value HASH crosses
 736                  * border between pages P0 and P1. First both P0 and P1 are
 737                  * cached, seekdir() is called for some entry from the P0 part
 738                  * of the chain. Later P0 goes out of cache. telldir(HASH)
 739                  * happens and finds P1, as it starts with matching hash
 740                  * value. Remaining entries from P0 part of the chain are
 741                  * skipped. (Is that really a bug?)
 742                  *
 743                  * Possible solutions: 0. don't cache P1 is such case, handle
 744                  * it as an "overflow" page. 1. invalidate all pages at
 745                  * once. 2. use HASH|1 as an index for P1.
 746                  */
 747                 if (exact && hash != start) {
 748                         /*
 749                          * readdir asked for a page starting _exactly_ from
 750                          * given hash, but cache contains stale page, with
 751                          * entries with smaller hash values. Stale page should
 752                          * be invalidated, and new one fetched.
 753                          */
 754                         CDEBUG(D_INFO, "Stale readpage page %p: %#lx != %#lx\n",
 755                               page, (unsigned long)hash, (unsigned long)start);
 756                         lock_page(page);
 757                         ll_truncate_complete_page(page);
 758                         unlock_page(page);
 759                         page_cache_release(page);
 760                 } else {
 761                         GOTO(hash_collision, page);
 762                 }
 763         }
 764
 765         page = read_cache_page(mapping, hash_x_index(hash),
 766                                (filler_t*)ll_dir_readpage_20, NULL);
 767         if (IS_ERR(page))
 768                 GOTO(out_unlock, page);
 769
 770         wait_on_page(page);
 771         (void)kmap(page);
 772         if (!PageUptodate(page))
 773                 goto fail;
 774         if (!PageChecked(page))
 775                 ll_check_page(dir, page);
 776         if (PageError(page))
 777                 goto fail;
 778 hash_collision:
 779         dp = page_address(page);
 780
 781         start = le64_to_cpu(dp->ldp_hash_start);
 782         end   = le64_to_cpu(dp->ldp_hash_end);
 783         if (end == start) {
 784                 LASSERT(start == hash);
 785                 CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end);
 786                 /*
 787                  * Fetch whole overflow chain...
 788                  *
 789                  * XXX not yet.
 790                  */
 791                 goto fail;
 792         }
 793 out_unlock:
 794         ldlm_lock_decref(&lockh, mode);
 795         RETURN(page);
 796
 797 fail:
 798         ll_put_page(page);
 799         page = ERR_PTR(-EIO);
 800         goto out_unlock;
 801 }
 802
 803 static int ll_readdir_20(struct file *filp, void *cookie, filldir_t filldir)
 804 {
 805         struct inode         *inode = filp->f_dentry->d_inode;
 806         __u64                 pos   = filp->f_pos;
 807         struct ll_sb_info    *sbi   = ll_i2sbi(inode);
 808         struct page          *page;
 809         struct ll_dir_chain   chain;
 810         int rc;
 811         int done;
 812         int shift,need_32bit;
 813         __u16 type;
 814         ENTRY;
 815
 816         need_32bit = ll_need_32bit_api(sbi);
 817
 818         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu 32bit_api %d\n",
 819                inode->i_ino, inode->i_generation, inode,
 820                (unsigned long)pos, i_size_read(inode), need_32bit);
 821
 822         if (pos == DIR_END_OFF)
 823                 /*
 824                  * end-of-file.
 825                  */
 826                 RETURN(0);
 827
 828         rc    = 0;
 829         done  = 0;
 830         shift = 0;
 831         ll_dir_chain_init(&chain);
 832
 833         page = ll_get_dir_page_20(inode, pos, 0, &chain);
 834
 835         while (rc == 0 && !done) {
 836                 struct lu_dirpage *dp;
 837                 struct lu_dirent  *ent;
 838
 839                 if (!IS_ERR(page)) {
 840                         /*
 841                          * If page is empty (end of directoryis reached),
 842                          * use this value.
 843                          */
 844                         __u64 hash = DIR_END_OFF;
 845                         __u64 next;
 846
 847                         dp = page_address(page);
 848                         for (ent = lu_dirent_start(dp); ent != NULL && !done;
 849                              ent = lu_dirent_next(ent)) {
 850                                 char          *name;
 851                                 int            namelen;
 852                                 struct lu_fid  fid;
 853                                 __u64          ino;
 854
 855                                 hash    = le64_to_cpu(ent->lde_hash);
 856                                 namelen = le16_to_cpu(ent->lde_namelen);
 857
 858                                 if (hash < pos)
 859                                         /*
 860                                          * Skip until we find target hash
 861                                          * value.
 862                                          */
 863                                         continue;
 864
 865                                 if (namelen == 0)
 866                                         /*
 867                                          * Skip dummy record.
 868                                          */
 869                                         continue;
 870
 871                                 fid  = ent->lde_fid;
 872                                 name = ent->lde_name;
 873                                 fid_le_to_cpu(&fid, &fid);
 874                                 if (need_32bit)
 875                                         ino = ll_fid_build_ino32((struct ll_fid *)&fid);
 876                                 else
 877                                         ino = ll_fid_build_ino((struct ll_fid *)&fid);
 878
 879                                 type = ll_dirent_type_get(ent);
 880                                 done = filldir(cookie, name, namelen,
 881                                                (loff_t)hash, ino, type);
 882                         }
 883                         next = le64_to_cpu(dp->ldp_hash_end);
 884                         ll_put_page(page);
 885                         if (!done) {
 886                                 pos = next;
 887                                 if (pos == DIR_END_OFF)
 888                                         /*
 889                                          * End of directory reached.
 890                                          */
 891                                         done = 1;
 892                                 else if (1 /* chain is exhausted*/)
 893                                         /*
 894                                          * Normal case: continue to the next
 895                                          * page.
 896                                          */
 897                                         page = ll_get_dir_page_20(inode, pos, 1,
 898                                                                   &chain);
 899                                 else {
 900                                         /*
 901                                          * go into overflow page.
 902                                          */
 903                                 }
 904                         } else {
 905                                 pos = hash;
 906                         }
 907                 } else {
 908                         rc = PTR_ERR(page);
 909                         CERROR("error reading dir "DFID" at %lu: rc %d\n",
 910                                PFID(ll_inode_lu_fid(inode)),
 911                                (unsigned long)pos, rc);
 912                 }
 913         }
 914
 915         filp->f_pos = (loff_t)(__s32)pos;
 916         filp->f_version = inode->i_version;
 917         touch_atime(filp->f_vfsmnt, filp->f_dentry);
 918
 919         ll_dir_chain_fini(&chain);
 920
 921         RETURN(rc);
 922 }
 923
 924 static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
 925 {
 926         struct inode      *inode = filp->f_dentry->d_inode;
 927         struct ll_sb_info *sbi = ll_i2sbi(inode);
 928
 929         if (sbi->ll_mdc_exp->exp_connect_flags & OBD_CONNECT_FID) {
 930                 return ll_readdir_20(filp, cookie, filldir);
 931         } else {
 932                 return ll_readdir_18(filp, cookie, filldir);
 933         }
 934 }
 935
 936 #define QCTL_COPY(out, in)              \
 937 do {                                    \
 938         Q_COPY(out, in, qc_cmd);        \
 939         Q_COPY(out, in, qc_type);       \
 940         Q_COPY(out, in, qc_id);         \
 941         Q_COPY(out, in, qc_stat);       \
 942         Q_COPY(out, in, qc_dqinfo);     \
 943         Q_COPY(out, in, qc_dqblk);      \
 944 } while (0)
 945
 946 static int ll_send_mgc_param(struct obd_export *mgc, char *string)
 947 {
 948         struct mgs_send_param *msp;
 949         int rc = 0;
 950
 951         OBD_ALLOC_PTR(msp);
 952         if (!msp)
 953                 return -ENOMEM;
 954
 955         strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN);
 956         rc = obd_set_info_async(mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
 957                                 sizeof(struct mgs_send_param), msp, NULL);
 958         if (rc)
 959                 CERROR("Failed to set parameter: %d\n", rc);
 960
 961         OBD_FREE_PTR(msp);
 962         return rc;
 963 }
 964
 965 static char *ll_get_fsname(struct inode *inode)
 966 {
 967         struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
 968         char *ptr, *fsname;
 969         int len;
 970
 971         OBD_ALLOC(fsname, MGS_PARAM_MAXLEN);
 972         len = strlen(lsi->lsi_lmd->lmd_profile);
 973         ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
 974         if (ptr && (strcmp(ptr, "-client") == 0))
 975                 len -= 7;
 976         strncpy(fsname, lsi->lsi_lmd->lmd_profile, len);
 977         fsname[len] = '\0';
 978
 979         return fsname;
 980 }
 981
 982 int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
 983                      int set_default)
 984 {
 985         struct ll_sb_info *sbi = ll_i2sbi(inode);
 986         struct mdc_op_data data = { { 0 } };
 987         struct ptlrpc_request *req = NULL;
 988         struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
 989         struct obd_device *mgc = lsi->lsi_mgc;
 990         char *fsname = NULL, *param = NULL;
 991         struct iattr attr = { 0 };
 992         int lum_size = 0, rc = 0;
 993
 994         if (lump != NULL) {
 995                 if (lump->lmm_magic == LOV_USER_MAGIC_V3)
 996                         lum_size = sizeof(struct lov_user_md_v3);
 997                 else
 998                         lum_size = sizeof(struct lov_user_md_v1);
 999                 /*
1000                  * This is coming from userspace, so should be in
1001                  * local endian.  But the MDS would like it in little
1002                  * endian, so we swab it before we send it.
1003                  */
1004                 if ((lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) &&
1005                     (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))) {
1006                         rc = lustre_swab_lov_user_md(lump);
1007                         if (rc)
1008                                 return rc;
1009                 }
1010         } else { /* NULL value means remove LOV EA */
1011                 lum_size = sizeof(struct lov_user_md_v1);
1012         }
1013
1014         ll_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0, NULL);
1015
1016         /* swabbing is done in lov_setstripe() on server side */
1017         rc = mdc_setattr(sbi->ll_mdc_exp, &data,
1018                          &attr, lump, lum_size, NULL, 0, &req);
1019         if (rc) {
1020                 ptlrpc_req_finished(req);
1021                 if (rc != -EPERM && rc != -EACCES)
1022                         CERROR("mdc_setattr fails: rc = %d\n", rc);
1023                 return rc;
1024         }
1025         ptlrpc_req_finished(req);
1026
1027         /* In the following we use the fact that LOV_USER_MAGIC_V1 and
1028          LOV_USER_MAGIC_V3 have the same initial fields so we do not
1029          need the make the distiction between the 2 versions */
1030         if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
1031                 OBD_ALLOC(param, MGS_PARAM_MAXLEN);
1032
1033                 /* Get fsname and assume devname to be -MDT0000. */
1034                 fsname = ll_get_fsname(inode);
1035                 /* Set root stripesize */
1036                 sprintf(param, "%s-MDT0000.lov.stripesize=%u", fsname,
1037                         lump ? le32_to_cpu(lump->lmm_stripe_size) : 0);
1038                 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1039                 if (rc)
1040                         goto end;
1041
1042                 /* Set root stripecount */
1043                 sprintf(param, "%s-MDT0000.lov.stripecount=%u", fsname,
1044                         lump ? le16_to_cpu(lump->lmm_stripe_count) : 0);
1045                 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1046                 if (rc)
1047                         goto end;
1048
1049                 /* Set root stripeoffset */
1050                 sprintf(param, "%s-MDT0000.lov.stripeoffset=%u", fsname,
1051                         lump ? le16_to_cpu(lump->lmm_stripe_offset) :
1052                         (typeof(lump->lmm_stripe_offset))(-1));
1053                 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1054                 if (rc)
1055                         goto end;
1056 end:
1057                 if (fsname)
1058                         OBD_FREE(fsname, MGS_PARAM_MAXLEN);
1059                 if (param)
1060                         OBD_FREE(param, MGS_PARAM_MAXLEN);
1061         }
1062         return rc;
1063 }
1064
1065 int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
1066                      int *lmm_size, struct ptlrpc_request **request)
1067 {
1068         struct ll_sb_info *sbi = ll_i2sbi(inode);
1069         struct ll_fid     fid;
1070         struct mds_body   *body;
1071         struct lov_mds_md *lmm = NULL;
1072         struct ptlrpc_request *req = NULL;
1073         int rc, lmmsize;
1074
1075         ll_inode2fid(&fid, inode);
1076
1077         rc = ll_get_max_mdsize(sbi, &lmmsize);
1078         if (rc)
1079                 RETURN(rc);
1080
1081         rc = mdc_getattr(sbi->ll_mdc_exp, &fid,
1082                         OBD_MD_FLEASIZE|OBD_MD_FLDIREA,
1083                         lmmsize, &req);
1084         if (rc < 0) {
1085                 CDEBUG(D_INFO, "mdc_getattr failed on inode "
1086                        "%lu/%u: rc %d\n", inode->i_ino,
1087                        inode->i_generation, rc);
1088                 GOTO(out, rc);
1089         }
1090         body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
1091                         sizeof(*body));
1092         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1093         /* swabbed by mdc_getattr_name */
1094         LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
1095
1096         lmmsize = body->eadatasize;
1097
1098         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1099             lmmsize == 0) {
1100                 GOTO(out, rc = -ENODATA);
1101         }
1102
1103         lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1104         LASSERT(lmm != NULL);
1105         LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
1106
1107         /*
1108          * This is coming from the MDS, so is probably in
1109          * little endian.  We convert it to host endian before
1110          * passing it to userspace.
1111          */
1112         /* We don't swab objects for directories */
1113         if (((le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1) ||
1114             (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3)) &&
1115             (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))) {
1116                 rc = lustre_swab_lov_user_md((struct lov_user_md*)lmm);
1117                 if (rc)
1118                         GOTO(out, rc);
1119         }
1120
1121 out:
1122         *lmmp = lmm;
1123         *lmm_size = lmmsize;
1124         *request = req;
1125         return rc;
1126 }
1127
1128 static int ll_dir_ioctl(struct inode *inode, struct file *file,
1129                         unsigned int cmd, unsigned long arg)
1130 {
1131         struct ll_sb_info *sbi = ll_i2sbi(inode);
1132         struct obd_ioctl_data *data;
1133         ENTRY;
1134
1135         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
1136                inode->i_ino, inode->i_generation, inode, cmd);
1137
1138         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1139         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1140                 return -ENOTTY;
1141
1142         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1143         switch(cmd) {
1144         case FSFILT_IOC_GETFLAGS:
1145         case FSFILT_IOC_SETFLAGS:
1146                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1147         case FSFILT_IOC_GETVERSION_OLD:
1148         case FSFILT_IOC_GETVERSION:
1149                 RETURN(put_user(inode->i_generation, (int *)arg));
1150         /* We need to special case any other ioctls we want to handle,
1151          * to send them to the MDS/OST as appropriate and to properly
1152          * network encode the arg field.
1153         case EXT3_IOC_SETVERSION_OLD:
1154         case EXT3_IOC_SETVERSION:
1155         */
1156         case IOC_MDC_LOOKUP: {
1157                 struct ptlrpc_request *request = NULL;
1158                 struct ll_fid fid;
1159                 char *buf = NULL;
1160                 char *filename;
1161                 int namelen, rc, len = 0;
1162
1163                 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1164                 if (rc)
1165                         RETURN(rc);
1166                 data = (void *)buf;
1167
1168                 filename = data->ioc_inlbuf1;
1169                 namelen = data->ioc_inllen1;
1170
1171                 if (namelen < 1) {
1172                         CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
1173                         GOTO(out, rc = -EINVAL);
1174                 }
1175
1176                 ll_inode2fid(&fid, inode);
1177                 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid, filename, namelen,
1178                                       OBD_MD_FLID, 0, &request);
1179                 if (rc < 0) {
1180                         CDEBUG(D_INFO, "mdc_getattr_name: %d\n", rc);
1181                         GOTO(out, rc);
1182                 }
1183
1184                 ptlrpc_req_finished(request);
1185
1186                 EXIT;
1187         out:
1188                 obd_ioctl_freedata(buf, len);
1189                 return rc;
1190         }
1191         case LL_IOC_LOV_SETSTRIPE: {
1192                 struct lov_user_md_v3 lumv3;
1193                 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1194                 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1195                 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1196
1197                 int rc = 0;
1198                 int set_default = 0;
1199
1200                 LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
1201                 LASSERT(sizeof(lumv3.lmm_objects[0]) ==
1202                         sizeof(lumv3p->lmm_objects[0]));
1203
1204                 /* first try with v1 which is smaller than v3 */
1205                 if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1)))
1206                         RETURN(-EFAULT);
1207
1208                 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1209                         if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3)))
1210                                 RETURN(-EFAULT);
1211                 }
1212
1213                 if (inode->i_sb->s_root == file->f_dentry)
1214                         set_default = 1;
1215
1216                 /* in v1 and v3 cases lumv1 points to data */
1217                 rc = ll_dir_setstripe(inode, lumv1, set_default);
1218
1219                 return rc;
1220         }
1221         case LL_IOC_OBD_STATFS:
1222                 RETURN(ll_obd_statfs(inode, (void *)arg));
1223         case LL_IOC_LOV_GETSTRIPE:
1224         case LL_IOC_MDC_GETINFO:
1225         case IOC_MDC_GETFILEINFO:
1226         case IOC_MDC_GETFILESTRIPE: {
1227                 struct ptlrpc_request *request = NULL;
1228                 struct mds_body *body;
1229                 struct lov_user_md *lump;
1230                 struct lov_mds_md *lmm = NULL;
1231                 char *filename = NULL;
1232                 int rc, lmmsize;
1233
1234                 if (cmd == IOC_MDC_GETFILEINFO ||
1235                     cmd == IOC_MDC_GETFILESTRIPE) {
1236                         filename = getname((const char *)arg);
1237                         if (IS_ERR(filename))
1238                                 RETURN(PTR_ERR(filename));
1239
1240                         rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
1241                                                       &lmmsize, &request);
1242                 } else {
1243                         rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
1244                 }
1245
1246                 if (request) {
1247                         body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
1248                                               sizeof(*body));
1249                         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1250                         /* swabbed by mdc_getattr_name */
1251                         LASSERT(lustre_rep_swabbed(request, REPLY_REC_OFF));
1252                 } else {
1253                         GOTO(out_req, rc);
1254                 }
1255
1256                 if (rc < 0) {
1257                         if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
1258                                                cmd == LL_IOC_MDC_GETINFO))
1259                                 GOTO(skip_lmm, rc = 0);
1260                         else
1261                                 GOTO(out_req, rc);
1262                 }
1263
1264                 if (cmd == IOC_MDC_GETFILESTRIPE ||
1265                     cmd == LL_IOC_LOV_GETSTRIPE) {
1266                         lump = (struct lov_user_md *)arg;
1267                 } else {
1268                         struct lov_user_mds_data *lmdp;
1269                         lmdp = (struct lov_user_mds_data *)arg;
1270                         lump = &lmdp->lmd_lmm;
1271                 }
1272                 if (copy_to_user(lump, lmm, lmmsize) != 0) {
1273                         if (copy_to_user(lump, lmm, sizeof(*lump)) != 0)
1274                                 GOTO(out_lmm, rc = -EFAULT);
1275                         rc = -EOVERFLOW;
1276                 }
1277         skip_lmm:
1278                 if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
1279                         struct lov_user_mds_data *lmdp;
1280                         lstat_t st = { 0 };
1281
1282                         st.st_dev     = inode->i_sb->s_dev;
1283                         st.st_mode    = body->mode;
1284                         st.st_nlink   = body->nlink;
1285                         st.st_uid     = body->uid;
1286                         st.st_gid     = body->gid;
1287                         st.st_rdev    = body->rdev;
1288                         st.st_size    = body->size;
1289                         st.st_blksize = CFS_PAGE_SIZE;
1290                         st.st_blocks  = body->blocks;
1291                         st.st_atime   = body->atime;
1292                         st.st_mtime   = body->mtime;
1293                         st.st_ctime   = body->ctime;
1294                         st.st_ino     = body->ino;
1295
1296                         lmdp = (struct lov_user_mds_data *)arg;
1297                         if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st)))
1298                                 GOTO(out_lmm, rc = -EFAULT);
1299                 }
1300
1301                 EXIT;
1302         out_lmm:
1303                 if (lmm && lmm->lmm_magic == LOV_MAGIC_JOIN)
1304                         OBD_FREE(lmm, lmmsize);
1305         out_req:
1306                 ptlrpc_req_finished(request);
1307                 if (filename)
1308                         putname(filename);
1309                 return rc;
1310         }
1311         case IOC_LOV_GETINFO: {
1312                 struct lov_user_mds_data *lumd;
1313                 struct lov_stripe_md *lsm;
1314                 struct lov_user_md *lum;
1315                 struct lov_mds_md *lmm;
1316                 int lmmsize;
1317                 lstat_t st;
1318                 int rc;
1319
1320                 lumd = (struct lov_user_mds_data *)arg;
1321                 lum = &lumd->lmd_lmm;
1322
1323                 rc = ll_get_max_mdsize(sbi, &lmmsize);
1324                 if (rc)
1325                         RETURN(rc);
1326
1327                 OBD_ALLOC(lmm, lmmsize);
1328                 if (copy_from_user(lmm, lum, lmmsize))
1329                         GOTO(free_lmm, rc = -EFAULT);
1330
1331                 if (LOV_USER_MAGIC != cpu_to_le32(LOV_USER_MAGIC)) {
1332                         rc = lustre_swab_lov_user_md(
1333                                                 (struct lov_user_md_v1 *)lmm);
1334                         if (rc)
1335                                 GOTO(free_lmm, rc);
1336                         rc = lustre_swab_lov_user_md_objects(
1337                                                 (struct lov_user_md*)lmm);
1338                         if (rc)
1339                                 GOTO(free_lmm, rc);
1340                 }
1341
1342                 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
1343                 if (rc < 0)
1344                         GOTO(free_lmm, rc = -ENOMEM);
1345
1346                 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
1347                 if (rc)
1348                         GOTO(free_lsm, rc);
1349
1350                 /* Perform glimpse_size operation. */
1351                 memset(&st, 0, sizeof(st));
1352
1353                 rc = ll_glimpse_ioctl(sbi, lsm, &st);
1354                 if (rc)
1355                         GOTO(free_lsm, rc);
1356
1357                 if (copy_to_user(&lumd->lmd_st, &st, sizeof(st)))
1358                         GOTO(free_lsm, rc = -EFAULT);
1359
1360                 EXIT;
1361         free_lsm:
1362                 obd_free_memmd(sbi->ll_osc_exp, &lsm);
1363         free_lmm:
1364                 OBD_FREE(lmm, lmmsize);
1365                 return rc;
1366         }
1367         case OBD_IOC_LLOG_CATINFO: {
1368                 struct ptlrpc_request *req = NULL;
1369                 char *buf = NULL;
1370                 int rc, len = 0;
1371                 char *bufs[3] = { NULL }, *str;
1372                 int lens[3] = { sizeof(struct ptlrpc_body) };
1373                 int size[2] = { sizeof(struct ptlrpc_body) };
1374
1375                 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1376                 if (rc)
1377                         RETURN(rc);
1378                 data = (void *)buf;
1379
1380                 if (!data->ioc_inlbuf1) {
1381                         obd_ioctl_freedata(buf, len);
1382                         RETURN(-EINVAL);
1383                 }
1384
1385                 lens[REQ_REC_OFF] = data->ioc_inllen1;
1386                 bufs[REQ_REC_OFF] = data->ioc_inlbuf1;
1387                 if (data->ioc_inllen2) {
1388                         lens[REQ_REC_OFF + 1] = data->ioc_inllen2;
1389                         bufs[REQ_REC_OFF + 1] = data->ioc_inlbuf2;
1390                 } else {
1391                         lens[REQ_REC_OFF + 1] = 0;
1392                         bufs[REQ_REC_OFF + 1] = NULL;
1393                 }
1394
1395                 req = ptlrpc_prep_req(sbi2mdc(sbi)->cl_import,
1396                                       LUSTRE_LOG_VERSION, LLOG_CATINFO, 3, lens,
1397                                       bufs);
1398                 if (!req)
1399                         GOTO(out_catinfo, rc = -ENOMEM);
1400
1401                 size[REPLY_REC_OFF] = data->ioc_plen1;
1402                 ptlrpc_req_set_repsize(req, 2, size);
1403
1404                 rc = ptlrpc_queue_wait(req);
1405                 str = lustre_msg_string(req->rq_repmsg, REPLY_REC_OFF,
1406                                         data->ioc_plen1);
1407                 if (!rc)
1408                         if (copy_to_user(data->ioc_pbuf1, str,data->ioc_plen1))
1409                                 rc = -EFAULT;
1410                 ptlrpc_req_finished(req);
1411         out_catinfo:
1412                 obd_ioctl_freedata(buf, len);
1413                 RETURN(rc);
1414         }
1415         case OBD_IOC_QUOTACHECK: {
1416                 struct obd_quotactl *oqctl;
1417                 int rc, error = 0;
1418
1419                 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1420                         RETURN(-EPERM);
1421
1422                 OBD_ALLOC_PTR(oqctl);
1423                 if (!oqctl)
1424                         RETURN(-ENOMEM);
1425                 oqctl->qc_type = arg;
1426                 rc = obd_quotacheck(sbi->ll_mdc_exp, oqctl);
1427                 if (rc < 0) {
1428                         CDEBUG(D_INFO, "mdc_quotacheck failed: rc %d\n", rc);
1429                         error = rc;
1430                 }
1431
1432                 rc = obd_quotacheck(sbi->ll_osc_exp, oqctl);
1433                 if (rc < 0)
1434                         CDEBUG(D_INFO, "osc_quotacheck failed: rc %d\n", rc);
1435
1436                 OBD_FREE_PTR(oqctl);
1437                 return error ?: rc;
1438         }
1439         case OBD_IOC_POLL_QUOTACHECK: {
1440                 struct if_quotacheck *check;
1441                 int rc;
1442
1443                 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1444                         RETURN(-EPERM);
1445
1446                 OBD_ALLOC_PTR(check);
1447                 if (!check)
1448                         RETURN(-ENOMEM);
1449
1450                 rc = obd_iocontrol(cmd, sbi->ll_mdc_exp, 0, (void *)check,
1451                                    NULL);
1452                 if (rc) {
1453                         CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
1454                         if (copy_to_user((void *)arg, check, sizeof(*check)))
1455                                 CDEBUG(D_QUOTA, "copy_to_user failed\n");
1456                         GOTO(out_poll, rc);
1457                 }
1458
1459                 rc = obd_iocontrol(cmd, sbi->ll_osc_exp, 0, (void *)check,
1460                                    NULL);
1461                 if (rc) {
1462                         CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
1463                         if (copy_to_user((void *)arg, check, sizeof(*check)))
1464                                 CDEBUG(D_QUOTA, "copy_to_user failed\n");
1465                         GOTO(out_poll, rc);
1466                 }
1467         out_poll:
1468                 OBD_FREE_PTR(check);
1469                 RETURN(rc);
1470         }
1471         case OBD_IOC_QUOTACTL: {
1472                 struct if_quotactl *qctl;
1473                 struct obd_quotactl *oqctl;
1474
1475                 int cmd, type, id, rc = 0;
1476
1477                 OBD_ALLOC_PTR(qctl);
1478                 if (!qctl)
1479                         RETURN(-ENOMEM);
1480
1481                 OBD_ALLOC_PTR(oqctl);
1482                 if (!oqctl) {
1483                         OBD_FREE_PTR(qctl);
1484                         RETURN(-ENOMEM);
1485                 }
1486                 if (copy_from_user(qctl, (void *)arg, sizeof(*qctl)))
1487                         GOTO(out_quotactl, rc = -EFAULT);
1488
1489                 cmd = qctl->qc_cmd;
1490                 type = qctl->qc_type;
1491                 id = qctl->qc_id;
1492                 switch (cmd) {
1493                 case LUSTRE_Q_INVALIDATE:
1494                 case LUSTRE_Q_FINVALIDATE:
1495                 case Q_QUOTAON:
1496                 case Q_QUOTAOFF:
1497                 case Q_SETQUOTA:
1498                 case Q_SETINFO:
1499                         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1500                                 GOTO(out_quotactl, rc = -EPERM);
1501                         break;
1502                 case Q_GETQUOTA:
1503                         if (((type == USRQUOTA && cfs_curproc_euid() != id) ||
1504                              (type == GRPQUOTA && !in_egroup_p(id))) &&
1505                             !cfs_capable(CFS_CAP_SYS_ADMIN))
1506                                 GOTO(out_quotactl, rc = -EPERM);
1507
1508                         /* XXX: dqb_valid is borrowed as a flag to mark that
1509                          *      only mds quota is wanted */
1510                         if (qctl->qc_dqblk.dqb_valid) {
1511                                 qctl->obd_uuid = sbi->ll_mdc_exp->exp_obd->
1512                                                         u.cli.cl_target_uuid;
1513                                 qctl->qc_dqblk.dqb_valid = 0;
1514                         }
1515
1516                         break;
1517                 case Q_GETINFO:
1518                         break;
1519                 default:
1520                         CERROR("unsupported quotactl op: %#x\n", cmd);
1521                         GOTO(out_quotactl, -ENOTTY);
1522                 }
1523
1524                 QCTL_COPY(oqctl, qctl);
1525
1526                 if (qctl->obd_uuid.uuid[0]) {
1527                         struct obd_device *obd;
1528                         struct obd_uuid *uuid = &qctl->obd_uuid;
1529
1530                         obd = class_find_client_notype(uuid,
1531                                          &sbi->ll_osc_exp->exp_obd->obd_uuid);
1532                         if (!obd)
1533                                 GOTO(out_quotactl, rc = -ENOENT);
1534
1535                         if (cmd == Q_GETINFO)
1536                                 oqctl->qc_cmd = Q_GETOINFO;
1537                         else if (cmd == Q_GETQUOTA)
1538                                 oqctl->qc_cmd = Q_GETOQUOTA;
1539                         else
1540                                 GOTO(out_quotactl, rc = -EINVAL);
1541
1542                         if (sbi->ll_mdc_exp->exp_obd == obd) {
1543                                 rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
1544                         } else {
1545                                 int i;
1546                                 struct obd_export *exp;
1547                                 struct lov_obd *lov = &sbi->ll_osc_exp->
1548                                                             exp_obd->u.lov;
1549
1550                                 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1551                                         if (!lov->lov_tgts[i] ||
1552                                             !lov->lov_tgts[i]->ltd_active)
1553                                                 continue;
1554                                         exp = lov->lov_tgts[i]->ltd_exp;
1555                                         if (exp->exp_obd == obd) {
1556                                                 rc = obd_quotactl(exp, oqctl);
1557                                                 break;
1558                                         }
1559                                 }
1560                         }
1561
1562                         oqctl->qc_cmd = cmd;
1563                         QCTL_COPY(qctl, oqctl);
1564
1565                         if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1566                                 rc = -EFAULT;
1567
1568                         GOTO(out_quotactl, rc);
1569                 }
1570
1571                 rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
1572                 if (rc && rc != -EBUSY && cmd == Q_QUOTAON) {
1573                         oqctl->qc_cmd = Q_QUOTAOFF;
1574                         obd_quotactl(sbi->ll_mdc_exp, oqctl);
1575                 }
1576
1577                 QCTL_COPY(qctl, oqctl);
1578
1579                 if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1580                         rc = -EFAULT;
1581         out_quotactl:
1582                 OBD_FREE_PTR(qctl);
1583                 OBD_FREE_PTR(oqctl);
1584                 RETURN(rc);
1585         }
1586         case OBD_IOC_GETNAME_OLD:
1587         case OBD_IOC_GETNAME: {
1588                 struct obd_device *obd = class_exp2obd(sbi->ll_osc_exp);
1589                 if (!obd)
1590                         RETURN(-EFAULT);
1591                 if (copy_to_user((void *)arg, obd->obd_name,
1592                                 strlen(obd->obd_name) + 1))
1593                         RETURN (-EFAULT);
1594                 RETURN(0);
1595         }
1596         case LL_IOC_PATH2FID: {
1597                 if (copy_to_user((void *)arg, ll_inode_lu_fid(inode),
1598                                  sizeof(struct lu_fid)))
1599                         RETURN(-EFAULT);
1600
1601                 RETURN(0);
1602         }
1603         case LL_IOC_GET_CONNECT_FLAGS: {
1604                 if (copy_to_user((void *)arg,
1605                                  &sbi->ll_mdc_exp->exp_connect_flags,
1606                                  sizeof(__u64)))
1607                         RETURN(-EFAULT);
1608                 RETURN(0);
1609         }
1610         default:
1611                 RETURN(obd_iocontrol(cmd, sbi->ll_osc_exp,0,NULL,(void *)arg));
1612         }
1613 }
1614
1615 struct file_operations ll_dir_operations = {
1616         .open     = ll_file_open,
1617         .release  = ll_file_release,
1618         .read     = generic_read_dir,
1619         .readdir  = ll_readdir,
1620         .ioctl    = ll_dir_ioctl
1621 };