lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19  *
  20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21  * CA 95054 USA or visit www.sun.com if you need additional information or
  22  * have any questions.
  23  *
  24  * GPL HEADER END
  25  */
  26 /*
  27  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  28  * Use is subject to license terms.
  29  *
  30  * Copyright (c) 2011, 2012, Intel Corporation.
  31  */
  32 /*
  33  * This file is part of Lustre, http://www.lustre.org/
  34  * Lustre is a trademark of Sun Microsystems, Inc.
  35  *
  36  * lustre/llite/file.c
  37  *
  38  * Author: Peter Braam <braam@clusterfs.com>
  39  * Author: Phil Schwan <phil@clusterfs.com>
  40  * Author: Andreas Dilger <adilger@clusterfs.com>
  41  */
  42
  43 #define DEBUG_SUBSYSTEM S_LLITE
  44 #include <lustre_dlm.h>
  45 #include <lustre_lite.h>
  46 #include <linux/pagemap.h>
  47 #include <linux/file.h>
  48 #include "llite_internal.h"
  49 #include <lustre/ll_fiemap.h>
  50
  51 #include "cl_object.h"
  52
  53 struct ll_file_data *ll_file_data_get(void)
  54 {
  55         struct ll_file_data *fd;
  56
  57         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
  58         fd->fd_write_failed = false;
  59         return fd;
  60 }
  61
  62 static void ll_file_data_put(struct ll_file_data *fd)
  63 {
  64         if (fd != NULL)
  65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  66 }
  67
  68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  69                           struct lustre_handle *fh)
  70 {
  71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  72         op_data->op_attr.ia_mode = inode->i_mode;
  73         op_data->op_attr.ia_atime = inode->i_atime;
  74         op_data->op_attr.ia_mtime = inode->i_mtime;
  75         op_data->op_attr.ia_ctime = inode->i_ctime;
  76         op_data->op_attr.ia_size = i_size_read(inode);
  77         op_data->op_attr_blocks = inode->i_blocks;
  78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
  79                                         ll_inode_to_ext_flags(inode->i_flags);
  80         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  81         if (fh)
  82                 op_data->op_handle = *fh;
  83         op_data->op_capa1 = ll_mdscapa_get(inode);
  84
  85         if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
  86                 op_data->op_bias |= MDS_DATA_MODIFIED;
  87 }
  88
  89 /**
  90  * Closes the IO epoch and packs all the attributes into @op_data for
  91  * the CLOSE rpc.
  92  */
  93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  94                              struct obd_client_handle *och)
  95 {
  96         ENTRY;
  97
  98         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
  99                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
 100
 101         if (!(och->och_flags & FMODE_WRITE))
 102                 goto out;
 103
 104         if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
 105                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 106         else
 107                 ll_ioepoch_close(inode, op_data, &och, 0);
 108
 109 out:
 110         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
 111         ll_prep_md_op_data(op_data, inode, NULL, NULL,
 112                            0, 0, LUSTRE_OPC_ANY, NULL);
 113         EXIT;
 114 }
 115
 116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
 117                                      struct inode *inode,
 118                                      struct obd_client_handle *och)
 119 {
 120         struct obd_export *exp = ll_i2mdexp(inode);
 121         struct md_op_data *op_data;
 122         struct ptlrpc_request *req = NULL;
 123         struct obd_device *obd = class_exp2obd(exp);
 124         int epoch_close = 1;
 125         int rc;
 126         ENTRY;
 127
 128         if (obd == NULL) {
 129                 /*
 130                  * XXX: in case of LMV, is this correct to access
 131                  * ->exp_handle?
 132                  */
 133                 CERROR("Invalid MDC connection handle "LPX64"\n",
 134                        ll_i2mdexp(inode)->exp_handle.h_cookie);
 135                 GOTO(out, rc = 0);
 136         }
 137
 138         OBD_ALLOC_PTR(op_data);
 139         if (op_data == NULL)
 140                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
 141
 142         ll_prepare_close(inode, op_data, och);
 143         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
 144         rc = md_close(md_exp, op_data, och->och_mod, &req);
 145         if (rc == -EAGAIN) {
 146                 /* This close must have the epoch closed. */
 147                 LASSERT(epoch_close);
 148                 /* MDS has instructed us to obtain Size-on-MDS attribute from
 149                  * OSTs and send setattr to back to MDS. */
 150                 rc = ll_som_update(inode, op_data);
 151                 if (rc) {
 152                         CERROR("inode %lu mdc Size-on-MDS update failed: "
 153                                "rc = %d\n", inode->i_ino, rc);
 154                         rc = 0;
 155                 }
 156         } else if (rc) {
 157                 CERROR("inode %lu mdc close failed: rc = %d\n",
 158                        inode->i_ino, rc);
 159         }
 160
 161         /* DATA_MODIFIED flag was successfully sent on close, cancel data
 162          * modification flag. */
 163         if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
 164                 struct ll_inode_info *lli = ll_i2info(inode);
 165
 166                 spin_lock(&lli->lli_lock);
 167                 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
 168                 spin_unlock(&lli->lli_lock);
 169         }
 170
 171         ll_finish_md_op_data(op_data);
 172
 173         if (rc == 0) {
 174                 rc = ll_objects_destroy(req, inode);
 175                 if (rc)
 176                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
 177                                inode->i_ino, rc);
 178         }
 179
 180         EXIT;
 181 out:
 182
 183         if (exp_connect_som(exp) && !epoch_close &&
 184             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
 185                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
 186         } else {
 187                 md_clear_open_replay_data(md_exp, och);
 188                 /* Free @och if it is not waiting for DONE_WRITING. */
 189                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 190                 OBD_FREE_PTR(och);
 191         }
 192         if (req) /* This is close request */
 193                 ptlrpc_req_finished(req);
 194         return rc;
 195 }
 196
 197 int ll_md_real_close(struct inode *inode, int flags)
 198 {
 199         struct ll_inode_info *lli = ll_i2info(inode);
 200         struct obd_client_handle **och_p;
 201         struct obd_client_handle *och;
 202         __u64 *och_usecount;
 203         int rc = 0;
 204         ENTRY;
 205
 206         if (flags & FMODE_WRITE) {
 207                 och_p = &lli->lli_mds_write_och;
 208                 och_usecount = &lli->lli_open_fd_write_count;
 209         } else if (flags & FMODE_EXEC) {
 210                 och_p = &lli->lli_mds_exec_och;
 211                 och_usecount = &lli->lli_open_fd_exec_count;
 212         } else {
 213                 LASSERT(flags & FMODE_READ);
 214                 och_p = &lli->lli_mds_read_och;
 215                 och_usecount = &lli->lli_open_fd_read_count;
 216         }
 217
 218         mutex_lock(&lli->lli_och_mutex);
 219         if (*och_usecount) { /* There are still users of this handle, so
 220                                 skip freeing it. */
 221                 mutex_unlock(&lli->lli_och_mutex);
 222                 RETURN(0);
 223         }
 224         och=*och_p;
 225         *och_p = NULL;
 226         mutex_unlock(&lli->lli_och_mutex);
 227
 228         if (och) { /* There might be a race and somebody have freed this och
 229                       already */
 230                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 231                                                inode, och);
 232         }
 233
 234         RETURN(rc);
 235 }
 236
 237 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 238                 struct file *file)
 239 {
 240         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 241         struct ll_inode_info *lli = ll_i2info(inode);
 242         int rc = 0;
 243         ENTRY;
 244
 245         /* clear group lock, if present */
 246         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 247                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
 248
 249         /* Let's see if we have good enough OPEN lock on the file and if
 250            we can skip talking to MDS */
 251         if (file->f_dentry->d_inode) { /* Can this ever be false? */
 252                 int lockmode;
 253                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 254                 struct lustre_handle lockh;
 255                 struct inode *inode = file->f_dentry->d_inode;
 256                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
 257
 258                 mutex_lock(&lli->lli_och_mutex);
 259                 if (fd->fd_omode & FMODE_WRITE) {
 260                         lockmode = LCK_CW;
 261                         LASSERT(lli->lli_open_fd_write_count);
 262                         lli->lli_open_fd_write_count--;
 263                 } else if (fd->fd_omode & FMODE_EXEC) {
 264                         lockmode = LCK_PR;
 265                         LASSERT(lli->lli_open_fd_exec_count);
 266                         lli->lli_open_fd_exec_count--;
 267                 } else {
 268                         lockmode = LCK_CR;
 269                         LASSERT(lli->lli_open_fd_read_count);
 270                         lli->lli_open_fd_read_count--;
 271                 }
 272                 mutex_unlock(&lli->lli_och_mutex);
 273
 274                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 275                                    LDLM_IBITS, &policy, lockmode,
 276                                    &lockh)) {
 277                         rc = ll_md_real_close(file->f_dentry->d_inode,
 278                                               fd->fd_omode);
 279                 }
 280         } else {
 281                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
 282                        file, file->f_dentry, file->f_dentry->d_name.name);
 283         }
 284
 285         LUSTRE_FPRIVATE(file) = NULL;
 286         ll_file_data_put(fd);
 287         ll_capa_close(inode);
 288
 289         RETURN(rc);
 290 }
 291
 292 /* While this returns an error code, fput() the caller does not, so we need
 293  * to make every effort to clean up all of our state here.  Also, applications
 294  * rarely check close errors and even if an error is returned they will not
 295  * re-try the close call.
 296  */
 297 int ll_file_release(struct inode *inode, struct file *file)
 298 {
 299         struct ll_file_data *fd;
 300         struct ll_sb_info *sbi = ll_i2sbi(inode);
 301         struct ll_inode_info *lli = ll_i2info(inode);
 302         int rc;
 303         ENTRY;
 304
 305         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 306                inode->i_generation, inode);
 307
 308 #ifdef CONFIG_FS_POSIX_ACL
 309         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
 310             inode == inode->i_sb->s_root->d_inode) {
 311                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 312
 313                 LASSERT(fd != NULL);
 314                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
 315                         fd->fd_flags &= ~LL_FILE_RMTACL;
 316                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
 317                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
 318                 }
 319         }
 320 #endif
 321
 322         if (inode->i_sb->s_root != file->f_dentry)
 323                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 324         fd = LUSTRE_FPRIVATE(file);
 325         LASSERT(fd != NULL);
 326
 327         /* The last ref on @file, maybe not the the owner pid of statahead.
 328          * Different processes can open the same dir, "ll_opendir_key" means:
 329          * it is me that should stop the statahead thread. */
 330         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
 331             lli->lli_opendir_pid != 0)
 332                 ll_stop_statahead(inode, lli->lli_opendir_key);
 333
 334         if (inode->i_sb->s_root == file->f_dentry) {
 335                 LUSTRE_FPRIVATE(file) = NULL;
 336                 ll_file_data_put(fd);
 337                 RETURN(0);
 338         }
 339
 340         if (!S_ISDIR(inode->i_mode)) {
 341                 lov_read_and_clear_async_rc(lli->lli_clob);
 342                 lli->lli_async_rc = 0;
 343         }
 344
 345         rc = ll_md_close(sbi->ll_md_exp, inode, file);
 346
 347         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 348                 libcfs_debug_dumplog();
 349
 350         RETURN(rc);
 351 }
 352
 353 static int ll_intent_file_open(struct file *file, void *lmm,
 354                                int lmmsize, struct lookup_intent *itp)
 355 {
 356         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
 357         struct dentry *parent = file->f_dentry->d_parent;
 358         const char *name = file->f_dentry->d_name.name;
 359         const int len = file->f_dentry->d_name.len;
 360         struct md_op_data *op_data;
 361         struct ptlrpc_request *req;
 362         __u32 opc = LUSTRE_OPC_ANY;
 363         int rc;
 364         ENTRY;
 365
 366         if (!parent)
 367                 RETURN(-ENOENT);
 368
 369         /* Usually we come here only for NFSD, and we want open lock.
 370            But we can also get here with pre 2.6.15 patchless kernels, and in
 371            that case that lock is also ok */
 372         /* We can also get here if there was cached open handle in revalidate_it
 373          * but it disappeared while we were getting from there to ll_file_open.
 374          * But this means this file was closed and immediatelly opened which
 375          * makes a good candidate for using OPEN lock */
 376         /* If lmmsize & lmm are not 0, we are just setting stripe info
 377          * parameters. No need for the open lock */
 378         if (lmm == NULL && lmmsize == 0) {
 379                 itp->it_flags |= MDS_OPEN_LOCK;
 380                 if (itp->it_flags & FMODE_WRITE)
 381                         opc = LUSTRE_OPC_CREATE;
 382         }
 383
 384         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
 385                                       file->f_dentry->d_inode, name, len,
 386                                       O_RDWR, opc, NULL);
 387         if (IS_ERR(op_data))
 388                 RETURN(PTR_ERR(op_data));
 389
 390         itp->it_flags |= MDS_OPEN_BY_FID;
 391         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 392                             0 /*unused */, &req, ll_md_blocking_ast, 0);
 393         ll_finish_md_op_data(op_data);
 394         if (rc == -ESTALE) {
 395                 /* reason for keep own exit path - don`t flood log
 396                 * with messages with -ESTALE errors.
 397                 */
 398                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 399                      it_open_error(DISP_OPEN_OPEN, itp))
 400                         GOTO(out, rc);
 401                 ll_release_openhandle(file->f_dentry, itp);
 402                 GOTO(out, rc);
 403         }
 404
 405         if (it_disposition(itp, DISP_LOOKUP_NEG))
 406                 GOTO(out, rc = -ENOENT);
 407
 408         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 409                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 410                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 411                 GOTO(out, rc);
 412         }
 413
 414         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
 415         if (!rc && itp->d.lustre.it_lock_mode)
 416                 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
 417                                  itp, NULL);
 418
 419 out:
 420         ptlrpc_req_finished(itp->d.lustre.it_data);
 421         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
 422         ll_intent_drop_lock(itp);
 423
 424         RETURN(rc);
 425 }
 426
 427 /**
 428  * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 429  * not believe attributes if a few ioepoch holders exist. Attributes for
 430  * previous ioepoch if new one is opened are also skipped by MDS.
 431  */
 432 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
 433 {
 434         if (ioepoch && lli->lli_ioepoch != ioepoch) {
 435                 lli->lli_ioepoch = ioepoch;
 436                 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 437                        ioepoch, PFID(&lli->lli_fid));
 438         }
 439 }
 440
 441 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
 442                        struct lookup_intent *it, struct obd_client_handle *och)
 443 {
 444         struct ptlrpc_request *req = it->d.lustre.it_data;
 445         struct mdt_body *body;
 446
 447         LASSERT(och);
 448
 449         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 450         LASSERT(body != NULL);                      /* reply already checked out */
 451
 452         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
 453         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 454         och->och_fid = lli->lli_fid;
 455         och->och_flags = it->it_flags;
 456         ll_ioepoch_open(lli, body->ioepoch);
 457
 458         return md_set_open_replay_data(md_exp, och, req);
 459 }
 460
 461 int ll_local_open(struct file *file, struct lookup_intent *it,
 462                   struct ll_file_data *fd, struct obd_client_handle *och)
 463 {
 464         struct inode *inode = file->f_dentry->d_inode;
 465         struct ll_inode_info *lli = ll_i2info(inode);
 466         ENTRY;
 467
 468         LASSERT(!LUSTRE_FPRIVATE(file));
 469
 470         LASSERT(fd != NULL);
 471
 472         if (och) {
 473                 struct ptlrpc_request *req = it->d.lustre.it_data;
 474                 struct mdt_body *body;
 475                 int rc;
 476
 477                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
 478                 if (rc)
 479                         RETURN(rc);
 480
 481                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 482                 if ((it->it_flags & FMODE_WRITE) &&
 483                     (body->valid & OBD_MD_FLSIZE))
 484                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 485                                lli->lli_ioepoch, PFID(&lli->lli_fid));
 486         }
 487
 488         LUSTRE_FPRIVATE(file) = fd;
 489         ll_readahead_init(inode, &fd->fd_ras);
 490         fd->fd_omode = it->it_flags;
 491         RETURN(0);
 492 }
 493
 494 /* Open a file, and (for the very first open) create objects on the OSTs at
 495  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 496  * creation or open until ll_lov_setstripe() ioctl is called.
 497  *
 498  * If we already have the stripe MD locally then we don't request it in
 499  * md_open(), by passing a lmm_size = 0.
 500  *
 501  * It is up to the application to ensure no other processes open this file
 502  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 503  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 504  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 505  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 506  */
 507 int ll_file_open(struct inode *inode, struct file *file)
 508 {
 509         struct ll_inode_info *lli = ll_i2info(inode);
 510         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 511                                           .it_flags = file->f_flags };
 512         struct obd_client_handle **och_p = NULL;
 513         __u64 *och_usecount = NULL;
 514         struct ll_file_data *fd;
 515         int rc = 0, opendir_set = 0;
 516         ENTRY;
 517
 518         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
 519                inode->i_generation, inode, file->f_flags);
 520
 521         it = file->private_data; /* XXX: compat macro */
 522         file->private_data = NULL; /* prevent ll_local_open assertion */
 523
 524         fd = ll_file_data_get();
 525         if (fd == NULL)
 526                 GOTO(out_och_free, rc = -ENOMEM);
 527
 528         fd->fd_file = file;
 529         if (S_ISDIR(inode->i_mode)) {
 530                 spin_lock(&lli->lli_sa_lock);
 531                 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
 532                     lli->lli_opendir_pid == 0) {
 533                         lli->lli_opendir_key = fd;
 534                         lli->lli_opendir_pid = cfs_curproc_pid();
 535                         opendir_set = 1;
 536                 }
 537                 spin_unlock(&lli->lli_sa_lock);
 538         }
 539
 540         if (inode->i_sb->s_root == file->f_dentry) {
 541                 LUSTRE_FPRIVATE(file) = fd;
 542                 RETURN(0);
 543         }
 544
 545         if (!it || !it->d.lustre.it_disposition) {
 546                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 547                  * because everything but O_ACCMODE mask was stripped from
 548                  * there */
 549                 if ((oit.it_flags + 1) & O_ACCMODE)
 550                         oit.it_flags++;
 551                 if (file->f_flags & O_TRUNC)
 552                         oit.it_flags |= FMODE_WRITE;
 553
 554                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 555                  * dentry_open after call to open_namei that checks permissions.
 556                  * Only nfsd_open call dentry_open directly without checking
 557                  * permissions and because of that this code below is safe. */
 558                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 559                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 560
 561                 /* We do not want O_EXCL here, presumably we opened the file
 562                  * already? XXX - NFS implications? */
 563                 oit.it_flags &= ~O_EXCL;
 564
 565                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 566                  * created if necessary, then "IT_CREAT" should be set to keep
 567                  * consistent with it */
 568                 if (oit.it_flags & O_CREAT)
 569                         oit.it_op |= IT_CREAT;
 570
 571                 it = &oit;
 572         }
 573
 574 restart:
 575         /* Let's see if we have file open on MDS already. */
 576         if (it->it_flags & FMODE_WRITE) {
 577                 och_p = &lli->lli_mds_write_och;
 578                 och_usecount = &lli->lli_open_fd_write_count;
 579         } else if (it->it_flags & FMODE_EXEC) {
 580                 och_p = &lli->lli_mds_exec_och;
 581                 och_usecount = &lli->lli_open_fd_exec_count;
 582          } else {
 583                 och_p = &lli->lli_mds_read_och;
 584                 och_usecount = &lli->lli_open_fd_read_count;
 585         }
 586
 587         mutex_lock(&lli->lli_och_mutex);
 588         if (*och_p) { /* Open handle is present */
 589                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 590                         /* Well, there's extra open request that we do not need,
 591                            let's close it somehow. This will decref request. */
 592                         rc = it_open_error(DISP_OPEN_OPEN, it);
 593                         if (rc) {
 594                                 mutex_unlock(&lli->lli_och_mutex);
 595                                 GOTO(out_openerr, rc);
 596                         }
 597
 598                         ll_release_openhandle(file->f_dentry, it);
 599                 }
 600                 (*och_usecount)++;
 601
 602                 rc = ll_local_open(file, it, fd, NULL);
 603                 if (rc) {
 604                         (*och_usecount)--;
 605                         mutex_unlock(&lli->lli_och_mutex);
 606                         GOTO(out_openerr, rc);
 607                 }
 608         } else {
 609                 LASSERT(*och_usecount == 0);
 610                 if (!it->d.lustre.it_disposition) {
 611                         /* We cannot just request lock handle now, new ELC code
 612                            means that one of other OPEN locks for this file
 613                            could be cancelled, and since blocking ast handler
 614                            would attempt to grab och_mutex as well, that would
 615                            result in a deadlock */
 616                         mutex_unlock(&lli->lli_och_mutex);
 617                         it->it_create_mode |= M_CHECK_STALE;
 618                         rc = ll_intent_file_open(file, NULL, 0, it);
 619                         it->it_create_mode &= ~M_CHECK_STALE;
 620                         if (rc)
 621                                 GOTO(out_openerr, rc);
 622
 623                         goto restart;
 624                 }
 625                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 626                 if (!*och_p)
 627                         GOTO(out_och_free, rc = -ENOMEM);
 628
 629                 (*och_usecount)++;
 630
 631                 /* md_intent_lock() didn't get a request ref if there was an
 632                  * open error, so don't do cleanup on the request here
 633                  * (bug 3430) */
 634                 /* XXX (green): Should not we bail out on any error here, not
 635                  * just open error? */
 636                 rc = it_open_error(DISP_OPEN_OPEN, it);
 637                 if (rc)
 638                         GOTO(out_och_free, rc);
 639
 640                 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
 641
 642                 rc = ll_local_open(file, it, fd, *och_p);
 643                 if (rc)
 644                         GOTO(out_och_free, rc);
 645         }
 646         mutex_unlock(&lli->lli_och_mutex);
 647         fd = NULL;
 648
 649         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 650            different kind of OPEN lock for this same inode gets cancelled
 651            by ldlm_cancel_lru */
 652         if (!S_ISREG(inode->i_mode))
 653                 GOTO(out_och_free, rc);
 654
 655         ll_capa_open(inode);
 656
 657         if (!lli->lli_has_smd) {
 658                 if (file->f_flags & O_LOV_DELAY_CREATE ||
 659                     !(file->f_mode & FMODE_WRITE)) {
 660                         CDEBUG(D_INODE, "object creation was delayed\n");
 661                         GOTO(out_och_free, rc);
 662                 }
 663         }
 664         file->f_flags &= ~O_LOV_DELAY_CREATE;
 665         GOTO(out_och_free, rc);
 666
 667 out_och_free:
 668         if (rc) {
 669                 if (och_p && *och_p) {
 670                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 671                         *och_p = NULL; /* OBD_FREE writes some magic there */
 672                         (*och_usecount)--;
 673                 }
 674                 mutex_unlock(&lli->lli_och_mutex);
 675
 676 out_openerr:
 677                 if (opendir_set != 0)
 678                         ll_stop_statahead(inode, lli->lli_opendir_key);
 679                 if (fd != NULL)
 680                         ll_file_data_put(fd);
 681         } else {
 682                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 683         }
 684
 685         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 686                 ptlrpc_req_finished(it->d.lustre.it_data);
 687                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 688         }
 689
 690         return rc;
 691 }
 692
 693 /* Fills the obdo with the attributes for the lsm */
 694 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
 695                           struct obd_capa *capa, struct obdo *obdo,
 696                           __u64 ioepoch, int sync)
 697 {
 698         struct ptlrpc_request_set *set;
 699         struct obd_info            oinfo = { { { 0 } } };
 700         int                        rc;
 701
 702         ENTRY;
 703
 704         LASSERT(lsm != NULL);
 705
 706         oinfo.oi_md = lsm;
 707         oinfo.oi_oa = obdo;
 708         oinfo.oi_oa->o_id = lsm->lsm_object_id;
 709         oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
 710         oinfo.oi_oa->o_mode = S_IFREG;
 711         oinfo.oi_oa->o_ioepoch = ioepoch;
 712         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 713                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 714                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 715                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 716                                OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
 717                                OBD_MD_FLDATAVERSION;
 718         oinfo.oi_capa = capa;
 719         if (sync) {
 720                 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
 721                 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
 722         }
 723
 724         set = ptlrpc_prep_set();
 725         if (set == NULL) {
 726                 CERROR("can't allocate ptlrpc set\n");
 727                 rc = -ENOMEM;
 728         } else {
 729                 rc = obd_getattr_async(exp, &oinfo, set);
 730                 if (rc == 0)
 731                         rc = ptlrpc_set_wait(set);
 732                 ptlrpc_set_destroy(set);
 733         }
 734         if (rc == 0)
 735                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 736                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
 737                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE |
 738                                          OBD_MD_FLDATAVERSION);
 739         RETURN(rc);
 740 }
 741
 742 /**
 743   * Performs the getattr on the inode and updates its fields.
 744   * If @sync != 0, perform the getattr under the server-side lock.
 745   */
 746 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
 747                      __u64 ioepoch, int sync)
 748 {
 749         struct obd_capa      *capa = ll_mdscapa_get(inode);
 750         struct lov_stripe_md *lsm;
 751         int rc;
 752         ENTRY;
 753
 754         lsm = ccc_inode_lsm_get(inode);
 755         rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
 756                             capa, obdo, ioepoch, sync);
 757         capa_put(capa);
 758         if (rc == 0) {
 759                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
 760                 CDEBUG(D_INODE,
 761                        "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
 762                        lsm ? lsm->lsm_object_id : 0, i_size_read(inode),
 763                        (unsigned long long)inode->i_blocks,
 764                        (unsigned long)ll_inode_blksize(inode));
 765         }
 766         ccc_inode_lsm_put(inode, lsm);
 767         RETURN(rc);
 768 }
 769
 770 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
 771 {
 772         struct ll_inode_info *lli = ll_i2info(inode);
 773         struct cl_object *obj = lli->lli_clob;
 774         struct cl_attr *attr = ccc_env_thread_attr(env);
 775         struct ost_lvb lvb;
 776         int rc = 0;
 777
 778         ENTRY;
 779
 780         ll_inode_size_lock(inode);
 781         /* merge timestamps the most recently obtained from mds with
 782            timestamps obtained from osts */
 783         LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
 784         LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
 785         LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
 786         inode_init_lvb(inode, &lvb);
 787
 788         cl_object_attr_lock(obj);
 789         rc = cl_object_attr_get(env, obj, attr);
 790         cl_object_attr_unlock(obj);
 791
 792         if (rc == 0) {
 793                 if (lvb.lvb_atime < attr->cat_atime)
 794                         lvb.lvb_atime = attr->cat_atime;
 795                 if (lvb.lvb_ctime < attr->cat_ctime)
 796                         lvb.lvb_ctime = attr->cat_ctime;
 797                 if (lvb.lvb_mtime < attr->cat_mtime)
 798                         lvb.lvb_mtime = attr->cat_mtime;
 799
 800                 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
 801                                 PFID(&lli->lli_fid), attr->cat_size);
 802                 cl_isize_write_nolock(inode, attr->cat_size);
 803
 804                 inode->i_blocks = attr->cat_blocks;
 805
 806                 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
 807                 LTIME_S(inode->i_atime) = lvb.lvb_atime;
 808                 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
 809         }
 810         ll_inode_size_unlock(inode);
 811
 812         RETURN(rc);
 813 }
 814
 815 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
 816                      lstat_t *st)
 817 {
 818         struct obdo obdo = { 0 };
 819         int rc;
 820
 821         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
 822         if (rc == 0) {
 823                 st->st_size   = obdo.o_size;
 824                 st->st_blocks = obdo.o_blocks;
 825                 st->st_mtime  = obdo.o_mtime;
 826                 st->st_atime  = obdo.o_atime;
 827                 st->st_ctime  = obdo.o_ctime;
 828         }
 829         return rc;
 830 }
 831
 832 void ll_io_init(struct cl_io *io, const struct file *file, int write)
 833 {
 834         struct inode *inode = file->f_dentry->d_inode;
 835
 836         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
 837         if (write) {
 838                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
 839                 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
 840                                       file->f_flags & O_DIRECT ||
 841                                       IS_SYNC(inode);
 842         }
 843         io->ci_obj     = ll_i2info(inode)->lli_clob;
 844         io->ci_lockreq = CILR_MAYBE;
 845         if (ll_file_nolock(file)) {
 846                 io->ci_lockreq = CILR_NEVER;
 847                 io->ci_no_srvlock = 1;
 848         } else if (file->f_flags & O_APPEND) {
 849                 io->ci_lockreq = CILR_MANDATORY;
 850         }
 851 }
 852
 853 static ssize_t
 854 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
 855                    struct file *file, enum cl_io_type iot,
 856                    loff_t *ppos, size_t count)
 857 {
 858         struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
 859         struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
 860         struct cl_io         *io;
 861         ssize_t               result;
 862         ENTRY;
 863
 864 restart:
 865         io = ccc_env_thread_io(env);
 866         ll_io_init(io, file, iot == CIT_WRITE);
 867
 868         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
 869                 struct vvp_io *vio = vvp_env_io(env);
 870                 struct ccc_io *cio = ccc_env_io(env);
 871                 int write_mutex_locked = 0;
 872
 873                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
 874                 vio->cui_io_subtype = args->via_io_subtype;
 875
 876                 switch (vio->cui_io_subtype) {
 877                 case IO_NORMAL:
 878                         cio->cui_iov = args->u.normal.via_iov;
 879                         cio->cui_nrsegs = args->u.normal.via_nrsegs;
 880                         cio->cui_tot_nrsegs = cio->cui_nrsegs;
 881 #ifndef HAVE_FILE_WRITEV
 882                         cio->cui_iocb = args->u.normal.via_iocb;
 883 #endif
 884                         if ((iot == CIT_WRITE) &&
 885                             !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
 886                                 if (mutex_lock_interruptible(&lli->
 887                                                                lli_write_mutex))
 888                                         GOTO(out, result = -ERESTARTSYS);
 889                                 write_mutex_locked = 1;
 890                         } else if (iot == CIT_READ) {
 891                                 down_read(&lli->lli_trunc_sem);
 892                         }
 893                         break;
 894                 case IO_SENDFILE:
 895                         vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
 896                         vio->u.sendfile.cui_target = args->u.sendfile.via_target;
 897                         break;
 898                 case IO_SPLICE:
 899                         vio->u.splice.cui_pipe = args->u.splice.via_pipe;
 900                         vio->u.splice.cui_flags = args->u.splice.via_flags;
 901                         break;
 902                 default:
 903                         CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
 904                         LBUG();
 905                 }
 906                 result = cl_io_loop(env, io);
 907                 if (write_mutex_locked)
 908                         mutex_unlock(&lli->lli_write_mutex);
 909                 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
 910                         up_read(&lli->lli_trunc_sem);
 911         } else {
 912                 /* cl_io_rw_init() handled IO */
 913                 result = io->ci_result;
 914         }
 915
 916         if (io->ci_nob > 0) {
 917                 result = io->ci_nob;
 918                 *ppos = io->u.ci_wr.wr.crw_pos;
 919         }
 920         GOTO(out, result);
 921 out:
 922         cl_io_fini(env, io);
 923         /* If any bit been read/written (result != 0), we just return
 924          * short read/write instead of restart io. */
 925         if (result == 0 && io->ci_need_restart) {
 926                 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
 927                        iot == CIT_READ ? "read" : "write",
 928                        file->f_dentry->d_name.name, *ppos, count);
 929                 LASSERTF(io->u.ci_rw.crw_count == count, "%zd != %zd\n",
 930                          io->u.ci_rw.crw_count, count);
 931                 goto restart;
 932         }
 933
 934         if (iot == CIT_READ) {
 935                 if (result >= 0)
 936                         ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
 937                                            LPROC_LL_READ_BYTES, result);
 938         } else if (iot == CIT_WRITE) {
 939                 if (result >= 0) {
 940                         ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
 941                                            LPROC_LL_WRITE_BYTES, result);
 942                         fd->fd_write_failed = false;
 943                 } else if (result != -ERESTARTSYS) {
 944                         fd->fd_write_failed = true;
 945                 }
 946         }
 947
 948         return result;
 949 }
 950
 951
 952 /*
 953  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 954  */
 955 static int ll_file_get_iov_count(const struct iovec *iov,
 956                                  unsigned long *nr_segs, size_t *count)
 957 {
 958         size_t cnt = 0;
 959         unsigned long seg;
 960
 961         for (seg = 0; seg < *nr_segs; seg++) {
 962                 const struct iovec *iv = &iov[seg];
 963
 964                 /*
 965                  * If any segment has a negative length, or the cumulative
 966                  * length ever wraps negative then return -EINVAL.
 967                  */
 968                 cnt += iv->iov_len;
 969                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
 970                         return -EINVAL;
 971                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
 972                         continue;
 973                 if (seg == 0)
 974                         return -EFAULT;
 975                 *nr_segs = seg;
 976                 cnt -= iv->iov_len;   /* This segment is no good */
 977                 break;
 978         }
 979         *count = cnt;
 980         return 0;
 981 }
 982
 983 #ifdef HAVE_FILE_READV
 984 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
 985                               unsigned long nr_segs, loff_t *ppos)
 986 {
 987         struct lu_env      *env;
 988         struct vvp_io_args *args;
 989         size_t              count;
 990         ssize_t             result;
 991         int                 refcheck;
 992         ENTRY;
 993
 994         result = ll_file_get_iov_count(iov, &nr_segs, &count);
 995         if (result)
 996                 RETURN(result);
 997
 998         env = cl_env_get(&refcheck);
 999         if (IS_ERR(env))
1000                 RETURN(PTR_ERR(env));
1001
1002         args = vvp_env_args(env, IO_NORMAL);
1003         args->u.normal.via_iov = (struct iovec *)iov;
1004         args->u.normal.via_nrsegs = nr_segs;
1005
1006         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1007         cl_env_put(env, &refcheck);
1008         RETURN(result);
1009 }
1010
1011 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1012                             loff_t *ppos)
1013 {
1014         struct lu_env *env;
1015         struct iovec  *local_iov;
1016         ssize_t        result;
1017         int            refcheck;
1018         ENTRY;
1019
1020         env = cl_env_get(&refcheck);
1021         if (IS_ERR(env))
1022                 RETURN(PTR_ERR(env));
1023
1024         local_iov = &vvp_env_info(env)->vti_local_iov;
1025         local_iov->iov_base = (void __user *)buf;
1026         local_iov->iov_len = count;
1027         result = ll_file_readv(file, local_iov, 1, ppos);
1028         cl_env_put(env, &refcheck);
1029         RETURN(result);
1030 }
1031
1032 #else
1033 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1034                                 unsigned long nr_segs, loff_t pos)
1035 {
1036         struct lu_env      *env;
1037         struct vvp_io_args *args;
1038         size_t              count;
1039         ssize_t             result;
1040         int                 refcheck;
1041         ENTRY;
1042
1043         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1044         if (result)
1045                 RETURN(result);
1046
1047         env = cl_env_get(&refcheck);
1048         if (IS_ERR(env))
1049                 RETURN(PTR_ERR(env));
1050
1051         args = vvp_env_args(env, IO_NORMAL);
1052         args->u.normal.via_iov = (struct iovec *)iov;
1053         args->u.normal.via_nrsegs = nr_segs;
1054         args->u.normal.via_iocb = iocb;
1055
1056         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1057                                     &iocb->ki_pos, count);
1058         cl_env_put(env, &refcheck);
1059         RETURN(result);
1060 }
1061
1062 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1063                             loff_t *ppos)
1064 {
1065         struct lu_env *env;
1066         struct iovec  *local_iov;
1067         struct kiocb  *kiocb;
1068         ssize_t        result;
1069         int            refcheck;
1070         ENTRY;
1071
1072         env = cl_env_get(&refcheck);
1073         if (IS_ERR(env))
1074                 RETURN(PTR_ERR(env));
1075
1076         local_iov = &vvp_env_info(env)->vti_local_iov;
1077         kiocb = &vvp_env_info(env)->vti_kiocb;
1078         local_iov->iov_base = (void __user *)buf;
1079         local_iov->iov_len = count;
1080         init_sync_kiocb(kiocb, file);
1081         kiocb->ki_pos = *ppos;
1082         kiocb->ki_left = count;
1083
1084         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1085         *ppos = kiocb->ki_pos;
1086
1087         cl_env_put(env, &refcheck);
1088         RETURN(result);
1089 }
1090 #endif
1091
1092 /*
1093  * Write to a file (through the page cache).
1094  */
1095 #ifdef HAVE_FILE_WRITEV
1096 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1097                               unsigned long nr_segs, loff_t *ppos)
1098 {
1099         struct lu_env      *env;
1100         struct vvp_io_args *args;
1101         size_t              count;
1102         ssize_t             result;
1103         int                 refcheck;
1104         ENTRY;
1105
1106         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1107         if (result)
1108                 RETURN(result);
1109
1110         env = cl_env_get(&refcheck);
1111         if (IS_ERR(env))
1112                 RETURN(PTR_ERR(env));
1113
1114         args = vvp_env_args(env, IO_NORMAL);
1115         args->u.normal.via_iov = (struct iovec *)iov;
1116         args->u.normal.via_nrsegs = nr_segs;
1117
1118         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1119         cl_env_put(env, &refcheck);
1120         RETURN(result);
1121 }
1122
1123 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1124                              loff_t *ppos)
1125 {
1126         struct lu_env    *env;
1127         struct iovec     *local_iov;
1128         ssize_t           result;
1129         int               refcheck;
1130         ENTRY;
1131
1132         env = cl_env_get(&refcheck);
1133         if (IS_ERR(env))
1134                 RETURN(PTR_ERR(env));
1135
1136         local_iov = &vvp_env_info(env)->vti_local_iov;
1137         local_iov->iov_base = (void __user *)buf;
1138         local_iov->iov_len = count;
1139
1140         result = ll_file_writev(file, local_iov, 1, ppos);
1141         cl_env_put(env, &refcheck);
1142         RETURN(result);
1143 }
1144
1145 #else /* AIO stuff */
1146 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1147                                  unsigned long nr_segs, loff_t pos)
1148 {
1149         struct lu_env      *env;
1150         struct vvp_io_args *args;
1151         size_t              count;
1152         ssize_t             result;
1153         int                 refcheck;
1154         ENTRY;
1155
1156         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1157         if (result)
1158                 RETURN(result);
1159
1160         env = cl_env_get(&refcheck);
1161         if (IS_ERR(env))
1162                 RETURN(PTR_ERR(env));
1163
1164         args = vvp_env_args(env, IO_NORMAL);
1165         args->u.normal.via_iov = (struct iovec *)iov;
1166         args->u.normal.via_nrsegs = nr_segs;
1167         args->u.normal.via_iocb = iocb;
1168
1169         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1170                                   &iocb->ki_pos, count);
1171         cl_env_put(env, &refcheck);
1172         RETURN(result);
1173 }
1174
1175 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1176                              loff_t *ppos)
1177 {
1178         struct lu_env *env;
1179         struct iovec  *local_iov;
1180         struct kiocb  *kiocb;
1181         ssize_t        result;
1182         int            refcheck;
1183         ENTRY;
1184
1185         env = cl_env_get(&refcheck);
1186         if (IS_ERR(env))
1187                 RETURN(PTR_ERR(env));
1188
1189         local_iov = &vvp_env_info(env)->vti_local_iov;
1190         kiocb = &vvp_env_info(env)->vti_kiocb;
1191         local_iov->iov_base = (void __user *)buf;
1192         local_iov->iov_len = count;
1193         init_sync_kiocb(kiocb, file);
1194         kiocb->ki_pos = *ppos;
1195         kiocb->ki_left = count;
1196
1197         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1198         *ppos = kiocb->ki_pos;
1199
1200         cl_env_put(env, &refcheck);
1201         RETURN(result);
1202 }
1203 #endif
1204
1205
1206 #ifdef HAVE_KERNEL_SENDFILE
1207 /*
1208  * Send file content (through pagecache) somewhere with helper
1209  */
1210 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1211                                 read_actor_t actor, void *target)
1212 {
1213         struct lu_env      *env;
1214         struct vvp_io_args *args;
1215         ssize_t             result;
1216         int                 refcheck;
1217         ENTRY;
1218
1219         env = cl_env_get(&refcheck);
1220         if (IS_ERR(env))
1221                 RETURN(PTR_ERR(env));
1222
1223         args = vvp_env_args(env, IO_SENDFILE);
1224         args->u.sendfile.via_target = target;
1225         args->u.sendfile.via_actor = actor;
1226
1227         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1228         cl_env_put(env, &refcheck);
1229         RETURN(result);
1230 }
1231 #endif
1232
1233 #ifdef HAVE_KERNEL_SPLICE_READ
1234 /*
1235  * Send file content (through pagecache) somewhere with helper
1236  */
1237 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1238                                    struct pipe_inode_info *pipe, size_t count,
1239                                    unsigned int flags)
1240 {
1241         struct lu_env      *env;
1242         struct vvp_io_args *args;
1243         ssize_t             result;
1244         int                 refcheck;
1245         ENTRY;
1246
1247         env = cl_env_get(&refcheck);
1248         if (IS_ERR(env))
1249                 RETURN(PTR_ERR(env));
1250
1251         args = vvp_env_args(env, IO_SPLICE);
1252         args->u.splice.via_pipe = pipe;
1253         args->u.splice.via_flags = flags;
1254
1255         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1256         cl_env_put(env, &refcheck);
1257         RETURN(result);
1258 }
1259 #endif
1260
1261 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1262                            obd_count ost_idx)
1263 {
1264         struct obd_export *exp = ll_i2dtexp(inode);
1265         struct obd_trans_info oti = { 0 };
1266         struct obdo *oa = NULL;
1267         int lsm_size;
1268         int rc = 0;
1269         struct lov_stripe_md *lsm = NULL, *lsm2;
1270         ENTRY;
1271
1272         OBDO_ALLOC(oa);
1273         if (oa == NULL)
1274                 RETURN(-ENOMEM);
1275
1276         lsm = ccc_inode_lsm_get(inode);
1277         if (lsm == NULL)
1278                 GOTO(out, rc = -ENOENT);
1279
1280         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1281                    (lsm->lsm_stripe_count));
1282
1283         OBD_ALLOC_LARGE(lsm2, lsm_size);
1284         if (lsm2 == NULL)
1285                 GOTO(out, rc = -ENOMEM);
1286
1287         oa->o_id = id;
1288         oa->o_seq = seq;
1289         oa->o_nlink = ost_idx;
1290         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1291         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1292         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1293                                    OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1294         obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1295         memcpy(lsm2, lsm, lsm_size);
1296         ll_inode_size_lock(inode);
1297         rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1298         ll_inode_size_unlock(inode);
1299
1300         OBD_FREE_LARGE(lsm2, lsm_size);
1301         GOTO(out, rc);
1302 out:
1303         ccc_inode_lsm_put(inode, lsm);
1304         OBDO_FREE(oa);
1305         return rc;
1306 }
1307
1308 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1309 {
1310         struct ll_recreate_obj ucreat;
1311         ENTRY;
1312
1313         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1314                 RETURN(-EPERM);
1315
1316         if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1317                            sizeof(ucreat)))
1318                 RETURN(-EFAULT);
1319
1320         RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1321                                ucreat.lrc_ost_idx));
1322 }
1323
1324 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1325 {
1326         struct lu_fid   fid;
1327         obd_id          id;
1328         obd_count       ost_idx;
1329         ENTRY;
1330
1331         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1332                 RETURN(-EPERM);
1333
1334         if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1335                 RETURN(-EFAULT);
1336
1337         id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1338         ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1339         RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
1340 }
1341
1342 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1343                              int flags, struct lov_user_md *lum, int lum_size)
1344 {
1345         struct lov_stripe_md *lsm = NULL;
1346         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1347         int rc = 0;
1348         ENTRY;
1349
1350         lsm = ccc_inode_lsm_get(inode);
1351         if (lsm != NULL) {
1352                 ccc_inode_lsm_put(inode, lsm);
1353                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1354                        inode->i_ino);
1355                 RETURN(-EEXIST);
1356         }
1357
1358         ll_inode_size_lock(inode);
1359         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1360         if (rc)
1361                 GOTO(out, rc);
1362         rc = oit.d.lustre.it_status;
1363         if (rc < 0)
1364                 GOTO(out_req_free, rc);
1365
1366         ll_release_openhandle(file->f_dentry, &oit);
1367
1368  out:
1369         ll_inode_size_unlock(inode);
1370         ll_intent_release(&oit);
1371         ccc_inode_lsm_put(inode, lsm);
1372         RETURN(rc);
1373 out_req_free:
1374         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1375         goto out;
1376 }
1377
1378 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1379                              struct lov_mds_md **lmmp, int *lmm_size,
1380                              struct ptlrpc_request **request)
1381 {
1382         struct ll_sb_info *sbi = ll_i2sbi(inode);
1383         struct mdt_body  *body;
1384         struct lov_mds_md *lmm = NULL;
1385         struct ptlrpc_request *req = NULL;
1386         struct md_op_data *op_data;
1387         int rc, lmmsize;
1388
1389         rc = ll_get_max_mdsize(sbi, &lmmsize);
1390         if (rc)
1391                 RETURN(rc);
1392
1393         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1394                                      strlen(filename), lmmsize,
1395                                      LUSTRE_OPC_ANY, NULL);
1396         if (IS_ERR(op_data))
1397                 RETURN(PTR_ERR(op_data));
1398
1399         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1400         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1401         ll_finish_md_op_data(op_data);
1402         if (rc < 0) {
1403                 CDEBUG(D_INFO, "md_getattr_name failed "
1404                        "on %s: rc %d\n", filename, rc);
1405                 GOTO(out, rc);
1406         }
1407
1408         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1409         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1410
1411         lmmsize = body->eadatasize;
1412
1413         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1414                         lmmsize == 0) {
1415                 GOTO(out, rc = -ENODATA);
1416         }
1417
1418         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1419         LASSERT(lmm != NULL);
1420
1421         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1422             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1423                 GOTO(out, rc = -EPROTO);
1424         }
1425
1426         /*
1427          * This is coming from the MDS, so is probably in
1428          * little endian.  We convert it to host endian before
1429          * passing it to userspace.
1430          */
1431         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1432                 /* if function called for directory - we should
1433                  * avoid swab not existent lsm objects */
1434                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1435                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1436                         if (S_ISREG(body->mode))
1437                                 lustre_swab_lov_user_md_objects(
1438                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1439                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1440                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1441                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1442                         if (S_ISREG(body->mode))
1443                                 lustre_swab_lov_user_md_objects(
1444                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1445                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1446                 }
1447         }
1448
1449 out:
1450         *lmmp = lmm;
1451         *lmm_size = lmmsize;
1452         *request = req;
1453         return rc;
1454 }
1455
1456 static int ll_lov_setea(struct inode *inode, struct file *file,
1457                             unsigned long arg)
1458 {
1459         int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1460         struct lov_user_md      *lump;
1461         int                      lum_size = sizeof(struct lov_user_md) +
1462                                             sizeof(struct lov_user_ost_data);
1463         int                      rc;
1464         ENTRY;
1465
1466         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1467                 RETURN(-EPERM);
1468
1469         OBD_ALLOC_LARGE(lump, lum_size);
1470         if (lump == NULL)
1471                 RETURN(-ENOMEM);
1472
1473         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1474                 OBD_FREE_LARGE(lump, lum_size);
1475                 RETURN(-EFAULT);
1476         }
1477
1478         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1479
1480         OBD_FREE_LARGE(lump, lum_size);
1481         RETURN(rc);
1482 }
1483
1484 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1485                             unsigned long arg)
1486 {
1487         struct lov_user_md_v3    lumv3;
1488         struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1489         struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1490         struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1491         int                      lum_size, rc;
1492         int                      flags = FMODE_WRITE;
1493         ENTRY;
1494
1495         /* first try with v1 which is smaller than v3 */
1496         lum_size = sizeof(struct lov_user_md_v1);
1497         if (copy_from_user(lumv1, lumv1p, lum_size))
1498                 RETURN(-EFAULT);
1499
1500         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1501                 lum_size = sizeof(struct lov_user_md_v3);
1502                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1503                         RETURN(-EFAULT);
1504         }
1505
1506         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1507         if (rc == 0) {
1508                 struct lov_stripe_md *lsm;
1509                 __u32 gen;
1510
1511                 put_user(0, &lumv1p->lmm_stripe_count);
1512
1513                 ll_layout_refresh(inode, &gen);
1514                 lsm = ccc_inode_lsm_get(inode);
1515                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1516                                    0, lsm, (void *)arg);
1517                 ccc_inode_lsm_put(inode, lsm);
1518         }
1519         RETURN(rc);
1520 }
1521
1522 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1523 {
1524         struct lov_stripe_md *lsm;
1525         int rc = -ENODATA;
1526         ENTRY;
1527
1528         lsm = ccc_inode_lsm_get(inode);
1529         if (lsm != NULL)
1530                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1531                                    lsm, (void *)arg);
1532         ccc_inode_lsm_put(inode, lsm);
1533         RETURN(rc);
1534 }
1535
1536 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1537 {
1538         struct ll_inode_info   *lli = ll_i2info(inode);
1539         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1540         struct ccc_grouplock    grouplock;
1541         int                     rc;
1542         ENTRY;
1543
1544         if (ll_file_nolock(file))
1545                 RETURN(-EOPNOTSUPP);
1546
1547         spin_lock(&lli->lli_lock);
1548         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1549                 CWARN("group lock already existed with gid %lu\n",
1550                       fd->fd_grouplock.cg_gid);
1551                 spin_unlock(&lli->lli_lock);
1552                 RETURN(-EINVAL);
1553         }
1554         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1555         spin_unlock(&lli->lli_lock);
1556
1557         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1558                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1559         if (rc)
1560                 RETURN(rc);
1561
1562         spin_lock(&lli->lli_lock);
1563         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1564                 spin_unlock(&lli->lli_lock);
1565                 CERROR("another thread just won the race\n");
1566                 cl_put_grouplock(&grouplock);
1567                 RETURN(-EINVAL);
1568         }
1569
1570         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1571         fd->fd_grouplock = grouplock;
1572         spin_unlock(&lli->lli_lock);
1573
1574         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1575         RETURN(0);
1576 }
1577
1578 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1579 {
1580         struct ll_inode_info   *lli = ll_i2info(inode);
1581         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1582         struct ccc_grouplock    grouplock;
1583         ENTRY;
1584
1585         spin_lock(&lli->lli_lock);
1586         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1587                 spin_unlock(&lli->lli_lock);
1588                 CWARN("no group lock held\n");
1589                 RETURN(-EINVAL);
1590         }
1591         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1592
1593         if (fd->fd_grouplock.cg_gid != arg) {
1594                 CWARN("group lock %lu doesn't match current id %lu\n",
1595                        arg, fd->fd_grouplock.cg_gid);
1596                 spin_unlock(&lli->lli_lock);
1597                 RETURN(-EINVAL);
1598         }
1599
1600         grouplock = fd->fd_grouplock;
1601         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1602         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1603         spin_unlock(&lli->lli_lock);
1604
1605         cl_put_grouplock(&grouplock);
1606         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1607         RETURN(0);
1608 }
1609
1610 /**
1611  * Close inode open handle
1612  *
1613  * \param dentry [in]     dentry which contains the inode
1614  * \param it     [in,out] intent which contains open info and result
1615  *
1616  * \retval 0     success
1617  * \retval <0    failure
1618  */
1619 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1620 {
1621         struct inode *inode = dentry->d_inode;
1622         struct obd_client_handle *och;
1623         int rc;
1624         ENTRY;
1625
1626         LASSERT(inode);
1627
1628         /* Root ? Do nothing. */
1629         if (dentry->d_inode->i_sb->s_root == dentry)
1630                 RETURN(0);
1631
1632         /* No open handle to close? Move away */
1633         if (!it_disposition(it, DISP_OPEN_OPEN))
1634                 RETURN(0);
1635
1636         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1637
1638         OBD_ALLOC(och, sizeof(*och));
1639         if (!och)
1640                 GOTO(out, rc = -ENOMEM);
1641
1642         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1643                     ll_i2info(inode), it, och);
1644
1645         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1646                                        inode, och);
1647  out:
1648         /* this one is in place of ll_file_open */
1649         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1650                 ptlrpc_req_finished(it->d.lustre.it_data);
1651                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1652         }
1653         RETURN(rc);
1654 }
1655
1656 /**
1657  * Get size for inode for which FIEMAP mapping is requested.
1658  * Make the FIEMAP get_info call and returns the result.
1659  */
1660 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1661               int num_bytes)
1662 {
1663         struct obd_export *exp = ll_i2dtexp(inode);
1664         struct lov_stripe_md *lsm = NULL;
1665         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1666         int vallen = num_bytes;
1667         int rc;
1668         ENTRY;
1669
1670         /* Checks for fiemap flags */
1671         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1672                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1673                 return -EBADR;
1674         }
1675
1676         /* Check for FIEMAP_FLAG_SYNC */
1677         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1678                 rc = filemap_fdatawrite(inode->i_mapping);
1679                 if (rc)
1680                         return rc;
1681         }
1682
1683         lsm = ccc_inode_lsm_get(inode);
1684         if (lsm == NULL)
1685                 return -ENOENT;
1686
1687         /* If the stripe_count > 1 and the application does not understand
1688          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1689          */
1690         if (lsm->lsm_stripe_count > 1 &&
1691             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1692                 GOTO(out, rc = -EOPNOTSUPP);
1693
1694         fm_key.oa.o_id = lsm->lsm_object_id;
1695         fm_key.oa.o_seq = lsm->lsm_object_seq;
1696         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1697
1698         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1699         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1700         /* If filesize is 0, then there would be no objects for mapping */
1701         if (fm_key.oa.o_size == 0) {
1702                 fiemap->fm_mapped_extents = 0;
1703                 GOTO(out, rc = 0);
1704         }
1705
1706         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1707
1708         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1709                           fiemap, lsm);
1710         if (rc)
1711                 CERROR("obd_get_info failed: rc = %d\n", rc);
1712
1713 out:
1714         ccc_inode_lsm_put(inode, lsm);
1715         RETURN(rc);
1716 }
1717
1718 int ll_fid2path(struct inode *inode, void *arg)
1719 {
1720         struct obd_export       *exp = ll_i2mdexp(inode);
1721         struct getinfo_fid2path *gfout, *gfin;
1722         int                      outsize, rc;
1723         ENTRY;
1724
1725         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1726             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1727                 RETURN(-EPERM);
1728
1729         /* Need to get the buflen */
1730         OBD_ALLOC_PTR(gfin);
1731         if (gfin == NULL)
1732                 RETURN(-ENOMEM);
1733         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1734                 OBD_FREE_PTR(gfin);
1735                 RETURN(-EFAULT);
1736         }
1737
1738         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1739         OBD_ALLOC(gfout, outsize);
1740         if (gfout == NULL) {
1741                 OBD_FREE_PTR(gfin);
1742                 RETURN(-ENOMEM);
1743         }
1744         memcpy(gfout, gfin, sizeof(*gfout));
1745         OBD_FREE_PTR(gfin);
1746
1747         /* Call mdc_iocontrol */
1748         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1749         if (rc)
1750                 GOTO(gf_free, rc);
1751
1752         if (copy_to_user(arg, gfout, outsize))
1753                 rc = -EFAULT;
1754
1755 gf_free:
1756         OBD_FREE(gfout, outsize);
1757         RETURN(rc);
1758 }
1759
1760 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1761 {
1762         struct ll_user_fiemap *fiemap_s;
1763         size_t num_bytes, ret_bytes;
1764         unsigned int extent_count;
1765         int rc = 0;
1766
1767         /* Get the extent count so we can calculate the size of
1768          * required fiemap buffer */
1769         if (get_user(extent_count,
1770             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1771                 RETURN(-EFAULT);
1772         num_bytes = sizeof(*fiemap_s) + (extent_count *
1773                                          sizeof(struct ll_fiemap_extent));
1774
1775         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1776         if (fiemap_s == NULL)
1777                 RETURN(-ENOMEM);
1778
1779         /* get the fiemap value */
1780         if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1781                            sizeof(*fiemap_s)))
1782                 GOTO(error, rc = -EFAULT);
1783
1784         /* If fm_extent_count is non-zero, read the first extent since
1785          * it is used to calculate end_offset and device from previous
1786          * fiemap call. */
1787         if (extent_count) {
1788                 if (copy_from_user(&fiemap_s->fm_extents[0],
1789                     (char __user *)arg + sizeof(*fiemap_s),
1790                     sizeof(struct ll_fiemap_extent)))
1791                         GOTO(error, rc = -EFAULT);
1792         }
1793
1794         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1795         if (rc)
1796                 GOTO(error, rc);
1797
1798         ret_bytes = sizeof(struct ll_user_fiemap);
1799
1800         if (extent_count != 0)
1801                 ret_bytes += (fiemap_s->fm_mapped_extents *
1802                                  sizeof(struct ll_fiemap_extent));
1803
1804         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1805                 rc = -EFAULT;
1806
1807 error:
1808         OBD_FREE_LARGE(fiemap_s, num_bytes);
1809         RETURN(rc);
1810 }
1811
1812 /*
1813  * Read the data_version for inode.
1814  *
1815  * This value is computed using stripe object version on OST.
1816  * Version is computed using server side locking.
1817  *
1818  * @param extent_lock  Take extent lock. Not needed if a process is already
1819  *                     holding the OST object group locks.
1820  */
1821 int ll_data_version(struct inode *inode, __u64 *data_version,
1822                     int extent_lock)
1823 {
1824         struct lov_stripe_md    *lsm = NULL;
1825         struct ll_sb_info       *sbi = ll_i2sbi(inode);
1826         struct obdo             *obdo = NULL;
1827         int                      rc;
1828         ENTRY;
1829
1830         /* If no stripe, we consider version is 0. */
1831         lsm = ccc_inode_lsm_get(inode);
1832         if (lsm == NULL) {
1833                 *data_version = 0;
1834                 CDEBUG(D_INODE, "No object for inode\n");
1835                 RETURN(0);
1836         }
1837
1838         OBD_ALLOC_PTR(obdo);
1839         if (obdo == NULL) {
1840                 ccc_inode_lsm_put(inode, lsm);
1841                 RETURN(-ENOMEM);
1842         }
1843
1844         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1845         if (!rc) {
1846                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1847                         rc = -EOPNOTSUPP;
1848                 else
1849                         *data_version = obdo->o_data_version;
1850         }
1851
1852         OBD_FREE_PTR(obdo);
1853         ccc_inode_lsm_put(inode, lsm);
1854
1855         RETURN(rc);
1856 }
1857
1858 struct ll_swap_stack {
1859         struct iattr             ia1, ia2;
1860         __u64                    dv1, dv2;
1861         struct inode            *inode1, *inode2;
1862         bool                     check_dv1, check_dv2;
1863 };
1864
1865 static int ll_swap_layouts(struct file *file1, struct file *file2,
1866                            struct lustre_swap_layouts *lsl)
1867 {
1868         struct mdc_swap_layouts  msl;
1869         struct md_op_data       *op_data;
1870         __u32                    gid;
1871         __u64                    dv;
1872         struct ll_swap_stack    *llss = NULL;
1873         int                      rc, rc1;
1874
1875         OBD_ALLOC_PTR(llss);
1876         if (llss == NULL)
1877                 RETURN(-ENOMEM);
1878
1879         llss->inode1 = file1->f_dentry->d_inode;
1880         llss->inode2 = file2->f_dentry->d_inode;
1881
1882         if (!S_ISREG(llss->inode2->i_mode))
1883                 GOTO(free, rc = -EINVAL);
1884
1885         if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1886             ll_permission(llss->inode2, MAY_WRITE, NULL))
1887                 GOTO(free, rc = -EPERM);
1888
1889         if (llss->inode2->i_sb != llss->inode1->i_sb)
1890                 GOTO(free, rc = -EXDEV);
1891
1892         /* we use 2 bool because it is easier to swap than 2 bits */
1893         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1894                 llss->check_dv1 = true;
1895
1896         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1897                 llss->check_dv2 = true;
1898
1899         /* we cannot use lsl->sl_dvX directly because we may swap them */
1900         llss->dv1 = lsl->sl_dv1;
1901         llss->dv2 = lsl->sl_dv2;
1902
1903         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1904         if (rc == 0) /* same file, done! */
1905                 GOTO(free, rc = 0);
1906
1907         if (rc < 0) { /* sequentialize it */
1908                 swap(llss->inode1, llss->inode2);
1909                 swap(file1, file2);
1910                 swap(llss->dv1, llss->dv2);
1911                 swap(llss->check_dv1, llss->check_dv2);
1912         }
1913
1914         gid = lsl->sl_gid;
1915         if (gid != 0) { /* application asks to flush dirty cache */
1916                 rc = ll_get_grouplock(llss->inode1, file1, gid);
1917                 if (rc < 0)
1918                         GOTO(free, rc);
1919
1920                 rc = ll_get_grouplock(llss->inode2, file2, gid);
1921                 if (rc < 0) {
1922                         ll_put_grouplock(llss->inode1, file1, gid);
1923                         GOTO(free, rc);
1924                 }
1925         }
1926
1927         /* to be able to restore mtime and atime after swap
1928          * we need to first save them */
1929         if (lsl->sl_flags &
1930             (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1931                 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1932                 llss->ia1.ia_atime = llss->inode1->i_atime;
1933                 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1934                 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1935                 llss->ia2.ia_atime = llss->inode2->i_atime;
1936                 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1937         }
1938
1939         /* ultimate check, before swaping the layouts we check if
1940          * dataversion has changed (if requested) */
1941         if (llss->check_dv1) {
1942                 rc = ll_data_version(llss->inode1, &dv, 0);
1943                 if (rc)
1944                         GOTO(putgl, rc);
1945                 if (dv != llss->dv1)
1946                         GOTO(putgl, rc = -EAGAIN);
1947         }
1948
1949         if (llss->check_dv2) {
1950                 rc = ll_data_version(llss->inode2, &dv, 0);
1951                 if (rc)
1952                         GOTO(putgl, rc);
1953                 if (dv != llss->dv2)
1954                         GOTO(putgl, rc = -EAGAIN);
1955         }
1956
1957         /* struct md_op_data is used to send the swap args to the mdt
1958          * only flags is missing, so we use struct mdc_swap_layouts
1959          * through the md_op_data->op_data */
1960         /* flags from user space have to be converted before they are send to
1961          * server, no flag is sent today, they are only used on the client */
1962         msl.msl_flags = 0;
1963         rc = -ENOMEM;
1964         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1965                                      0, LUSTRE_OPC_ANY, &msl);
1966         if (op_data != NULL) {
1967                 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
1968                                    ll_i2mdexp(llss->inode1),
1969                                    sizeof(*op_data), op_data, NULL);
1970                 ll_finish_md_op_data(op_data);
1971         }
1972
1973 putgl:
1974         if (gid != 0) {
1975                 ll_put_grouplock(llss->inode2, file2, gid);
1976                 ll_put_grouplock(llss->inode1, file1, gid);
1977         }
1978
1979         /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1980         if (rc != 0)
1981                 GOTO(free, rc);
1982
1983         /* clear useless flags */
1984         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1985                 llss->ia1.ia_valid &= ~ATTR_MTIME;
1986                 llss->ia2.ia_valid &= ~ATTR_MTIME;
1987         }
1988
1989         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1990                 llss->ia1.ia_valid &= ~ATTR_ATIME;
1991                 llss->ia2.ia_valid &= ~ATTR_ATIME;
1992         }
1993
1994         /* update time if requested */
1995         rc = rc1 = 0;
1996         if (llss->ia2.ia_valid != 0)
1997                 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1998
1999         if (llss->ia1.ia_valid != 0)
2000                 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2001
2002 free:
2003         if (llss != NULL)
2004                 OBD_FREE_PTR(llss);
2005
2006         RETURN(rc);
2007 }
2008
2009 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2010 {
2011         struct inode            *inode = file->f_dentry->d_inode;
2012         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2013         int                      flags, rc;
2014         ENTRY;
2015
2016         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2017                inode->i_generation, inode, cmd);
2018         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2019
2020         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2021         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2022                 RETURN(-ENOTTY);
2023
2024         switch(cmd) {
2025         case LL_IOC_GETFLAGS:
2026                 /* Get the current value of the file flags */
2027                 return put_user(fd->fd_flags, (int *)arg);
2028         case LL_IOC_SETFLAGS:
2029         case LL_IOC_CLRFLAGS:
2030                 /* Set or clear specific file flags */
2031                 /* XXX This probably needs checks to ensure the flags are
2032                  *     not abused, and to handle any flag side effects.
2033                  */
2034                 if (get_user(flags, (int *) arg))
2035                         RETURN(-EFAULT);
2036
2037                 if (cmd == LL_IOC_SETFLAGS) {
2038                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2039                             !(file->f_flags & O_DIRECT)) {
2040                                 CERROR("%s: unable to disable locking on "
2041                                        "non-O_DIRECT file\n", current->comm);
2042                                 RETURN(-EINVAL);
2043                         }
2044
2045                         fd->fd_flags |= flags;
2046                 } else {
2047                         fd->fd_flags &= ~flags;
2048                 }
2049                 RETURN(0);
2050         case LL_IOC_LOV_SETSTRIPE:
2051                 RETURN(ll_lov_setstripe(inode, file, arg));
2052         case LL_IOC_LOV_SETEA:
2053                 RETURN(ll_lov_setea(inode, file, arg));
2054         case LL_IOC_LOV_SWAP_LAYOUTS: {
2055                 struct file *file2;
2056                 struct lustre_swap_layouts lsl;
2057
2058                 if (cfs_copy_from_user(&lsl, (char *)arg,
2059                                        sizeof(struct lustre_swap_layouts)))
2060                         RETURN(-EFAULT);
2061
2062                 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2063                         RETURN(-EPERM);
2064
2065                 file2 = fget(lsl.sl_fd);
2066                 if (file2 == NULL)
2067                         RETURN(-EBADF);
2068
2069                 rc = -EPERM;
2070                 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2071                         rc = ll_swap_layouts(file, file2, &lsl);
2072                 fput(file2);
2073                 RETURN(rc);
2074         }
2075         case LL_IOC_LOV_GETSTRIPE:
2076                 RETURN(ll_lov_getstripe(inode, arg));
2077         case LL_IOC_RECREATE_OBJ:
2078                 RETURN(ll_lov_recreate_obj(inode, arg));
2079         case LL_IOC_RECREATE_FID:
2080                 RETURN(ll_lov_recreate_fid(inode, arg));
2081         case FSFILT_IOC_FIEMAP:
2082                 RETURN(ll_ioctl_fiemap(inode, arg));
2083         case FSFILT_IOC_GETFLAGS:
2084         case FSFILT_IOC_SETFLAGS:
2085                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2086         case FSFILT_IOC_GETVERSION_OLD:
2087         case FSFILT_IOC_GETVERSION:
2088                 RETURN(put_user(inode->i_generation, (int *)arg));
2089         case LL_IOC_GROUP_LOCK:
2090                 RETURN(ll_get_grouplock(inode, file, arg));
2091         case LL_IOC_GROUP_UNLOCK:
2092                 RETURN(ll_put_grouplock(inode, file, arg));
2093         case IOC_OBD_STATFS:
2094                 RETURN(ll_obd_statfs(inode, (void *)arg));
2095
2096         /* We need to special case any other ioctls we want to handle,
2097          * to send them to the MDS/OST as appropriate and to properly
2098          * network encode the arg field.
2099         case FSFILT_IOC_SETVERSION_OLD:
2100         case FSFILT_IOC_SETVERSION:
2101         */
2102         case LL_IOC_FLUSHCTX:
2103                 RETURN(ll_flush_ctx(inode));
2104         case LL_IOC_PATH2FID: {
2105                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2106                                  sizeof(struct lu_fid)))
2107                         RETURN(-EFAULT);
2108
2109                 RETURN(0);
2110         }
2111         case OBD_IOC_FID2PATH:
2112                 RETURN(ll_fid2path(inode, (void *)arg));
2113         case LL_IOC_DATA_VERSION: {
2114                 struct ioc_data_version idv;
2115                 int                     rc;
2116
2117                 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2118                         RETURN(-EFAULT);
2119
2120                 rc = ll_data_version(inode, &idv.idv_version,
2121                                 !(idv.idv_flags & LL_DV_NOFLUSH));
2122
2123                 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2124                         RETURN(-EFAULT);
2125
2126                 RETURN(rc);
2127         }
2128
2129         case LL_IOC_GET_MDTIDX: {
2130                 int mdtidx;
2131
2132                 mdtidx = ll_get_mdt_idx(inode);
2133                 if (mdtidx < 0)
2134                         RETURN(mdtidx);
2135
2136                 if (put_user((int)mdtidx, (int*)arg))
2137                         RETURN(-EFAULT);
2138
2139                 RETURN(0);
2140         }
2141         case OBD_IOC_GETDTNAME:
2142         case OBD_IOC_GETMDNAME:
2143                 RETURN(ll_get_obd_name(inode, cmd, arg));
2144         case LL_IOC_HSM_STATE_GET: {
2145                 struct md_op_data       *op_data;
2146                 struct hsm_user_state   *hus;
2147                 int                      rc;
2148
2149                 OBD_ALLOC_PTR(hus);
2150                 if (hus == NULL)
2151                         RETURN(-ENOMEM);
2152
2153                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2154                                              LUSTRE_OPC_ANY, hus);
2155                 if (op_data == NULL) {
2156                         OBD_FREE_PTR(hus);
2157                         RETURN(-ENOMEM);
2158                 }
2159
2160                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2161                                    op_data, NULL);
2162
2163                 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2164                         rc = -EFAULT;
2165
2166                 ll_finish_md_op_data(op_data);
2167                 OBD_FREE_PTR(hus);
2168                 RETURN(rc);
2169         }
2170         case LL_IOC_HSM_STATE_SET: {
2171                 struct md_op_data       *op_data;
2172                 struct hsm_state_set    *hss;
2173                 int                      rc;
2174
2175                 OBD_ALLOC_PTR(hss);
2176                 if (hss == NULL)
2177                         RETURN(-ENOMEM);
2178                 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2179                         OBD_FREE_PTR(hss);
2180                         RETURN(-EFAULT);
2181                 }
2182
2183                 /* Non-root users are forbidden to set or clear flags which are
2184                  * NOT defined in HSM_USER_MASK. */
2185                 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2186                     && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2187                         OBD_FREE_PTR(hss);
2188                         RETURN(-EPERM);
2189                 }
2190
2191                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2192                                              LUSTRE_OPC_ANY, hss);
2193                 if (op_data == NULL) {
2194                         OBD_FREE_PTR(hss);
2195                         RETURN(-ENOMEM);
2196                 }
2197
2198                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2199                                    op_data, NULL);
2200
2201                 ll_finish_md_op_data(op_data);
2202
2203                 OBD_FREE_PTR(hss);
2204                 RETURN(rc);
2205         }
2206         case LL_IOC_HSM_ACTION: {
2207                 struct md_op_data               *op_data;
2208                 struct hsm_current_action       *hca;
2209                 int                              rc;
2210
2211                 OBD_ALLOC_PTR(hca);
2212                 if (hca == NULL)
2213                         RETURN(-ENOMEM);
2214
2215                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2216                                              LUSTRE_OPC_ANY, hca);
2217                 if (op_data == NULL) {
2218                         OBD_FREE_PTR(hca);
2219                         RETURN(-ENOMEM);
2220                 }
2221
2222                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2223                                    op_data, NULL);
2224
2225                 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2226                         rc = -EFAULT;
2227
2228                 ll_finish_md_op_data(op_data);
2229                 OBD_FREE_PTR(hca);
2230                 RETURN(rc);
2231         }
2232         default: {
2233                 int err;
2234
2235                 if (LLIOC_STOP ==
2236                      ll_iocontrol_call(inode, file, cmd, arg, &err))
2237                         RETURN(err);
2238
2239                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2240                                      (void *)arg));
2241         }
2242         }
2243 }
2244
2245 #ifndef HAVE_FILE_LLSEEK_SIZE
2246 static inline loff_t
2247 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2248 {
2249         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2250                 return -EINVAL;
2251         if (offset > maxsize)
2252                 return -EINVAL;
2253
2254         if (offset != file->f_pos) {
2255                 file->f_pos = offset;
2256                 file->f_version = 0;
2257         }
2258         return offset;
2259 }
2260
2261 static loff_t
2262 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2263                 loff_t maxsize, loff_t eof)
2264 {
2265         struct inode *inode = file->f_dentry->d_inode;
2266
2267         switch (origin) {
2268         case SEEK_END:
2269                 offset += eof;
2270                 break;
2271         case SEEK_CUR:
2272                 /*
2273                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2274                  * position-querying operation.  Avoid rewriting the "same"
2275                  * f_pos value back to the file because a concurrent read(),
2276                  * write() or lseek() might have altered it
2277                  */
2278                 if (offset == 0)
2279                         return file->f_pos;
2280                 /*
2281                  * f_lock protects against read/modify/write race with other
2282                  * SEEK_CURs. Note that parallel writes and reads behave
2283                  * like SEEK_SET.
2284                  */
2285                 mutex_lock(&inode->i_mutex);
2286                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2287                 mutex_unlock(&inode->i_mutex);
2288                 return offset;
2289         case SEEK_DATA:
2290                 /*
2291                  * In the generic case the entire file is data, so as long as
2292                  * offset isn't at the end of the file then the offset is data.
2293                  */
2294                 if (offset >= eof)
2295                         return -ENXIO;
2296                 break;
2297         case SEEK_HOLE:
2298                 /*
2299                  * There is a virtual hole at the end of the file, so as long as
2300                  * offset isn't i_size or larger, return i_size.
2301                  */
2302                 if (offset >= eof)
2303                         return -ENXIO;
2304                 offset = eof;
2305                 break;
2306         }
2307
2308         return llseek_execute(file, offset, maxsize);
2309 }
2310 #endif
2311
2312 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2313 {
2314         struct inode *inode = file->f_dentry->d_inode;
2315         loff_t retval, eof = 0;
2316
2317         ENTRY;
2318         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2319                            (origin == SEEK_CUR) ? file->f_pos : 0);
2320         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2321                inode->i_ino, inode->i_generation, inode, retval, retval,
2322                origin);
2323         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2324
2325         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2326                 retval = ll_glimpse_size(inode);
2327                 if (retval != 0)
2328                         RETURN(retval);
2329                 eof = i_size_read(inode);
2330         }
2331
2332         retval = ll_generic_file_llseek_size(file, offset, origin,
2333                                           ll_file_maxbytes(inode), eof);
2334         RETURN(retval);
2335 }
2336
2337 int ll_flush(struct file *file, fl_owner_t id)
2338 {
2339         struct inode *inode = file->f_dentry->d_inode;
2340         struct ll_inode_info *lli = ll_i2info(inode);
2341         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2342         int rc, err;
2343
2344         LASSERT(!S_ISDIR(inode->i_mode));
2345
2346         /* catch async errors that were recorded back when async writeback
2347          * failed for pages in this mapping. */
2348         rc = lli->lli_async_rc;
2349         lli->lli_async_rc = 0;
2350         err = lov_read_and_clear_async_rc(lli->lli_clob);
2351         if (rc == 0)
2352                 rc = err;
2353
2354         /* The application has been told write failure already.
2355          * Do not report failure again. */
2356         if (fd->fd_write_failed)
2357                 return 0;
2358         return rc ? -EIO : 0;
2359 }
2360
2361 /**
2362  * Called to make sure a portion of file has been written out.
2363  * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2364  *
2365  * Return how many pages have been written.
2366  */
2367 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2368                        enum cl_fsync_mode mode)
2369 {
2370         struct cl_env_nest nest;
2371         struct lu_env *env;
2372         struct cl_io *io;
2373         struct obd_capa *capa = NULL;
2374         struct cl_fsync_io *fio;
2375         int result;
2376         ENTRY;
2377
2378         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2379             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2380                 RETURN(-EINVAL);
2381
2382         env = cl_env_nested_get(&nest);
2383         if (IS_ERR(env))
2384                 RETURN(PTR_ERR(env));
2385
2386         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2387
2388         io = ccc_env_thread_io(env);
2389         io->ci_obj = cl_i2info(inode)->lli_clob;
2390         io->ci_ignore_layout = 1;
2391
2392         /* initialize parameters for sync */
2393         fio = &io->u.ci_fsync;
2394         fio->fi_capa = capa;
2395         fio->fi_start = start;
2396         fio->fi_end = end;
2397         fio->fi_fid = ll_inode2fid(inode);
2398         fio->fi_mode = mode;
2399         fio->fi_nr_written = 0;
2400
2401         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2402                 result = cl_io_loop(env, io);
2403         else
2404                 result = io->ci_result;
2405         if (result == 0)
2406                 result = fio->fi_nr_written;
2407         cl_io_fini(env, io);
2408         cl_env_nested_put(&nest, env);
2409
2410         capa_put(capa);
2411
2412         RETURN(result);
2413 }
2414
2415 #ifdef HAVE_FILE_FSYNC_4ARGS
2416 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2417 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2418 int ll_fsync(struct file *file, int data)
2419 #else
2420 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2421 #endif
2422 {
2423         struct inode *inode = file->f_dentry->d_inode;
2424         struct ll_inode_info *lli = ll_i2info(inode);
2425         struct ptlrpc_request *req;
2426         struct obd_capa *oc;
2427         int rc, err;
2428         ENTRY;
2429
2430         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2431                inode->i_generation, inode);
2432         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2433
2434 #ifdef HAVE_FILE_FSYNC_4ARGS
2435         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2436         mutex_lock(&inode->i_mutex);
2437 #else
2438         /* fsync's caller has already called _fdata{sync,write}, we want
2439          * that IO to finish before calling the osc and mdc sync methods */
2440         rc = filemap_fdatawait(inode->i_mapping);
2441 #endif
2442
2443         /* catch async errors that were recorded back when async writeback
2444          * failed for pages in this mapping. */
2445         if (!S_ISDIR(inode->i_mode)) {
2446                 err = lli->lli_async_rc;
2447                 lli->lli_async_rc = 0;
2448                 if (rc == 0)
2449                         rc = err;
2450                 err = lov_read_and_clear_async_rc(lli->lli_clob);
2451                 if (rc == 0)
2452                         rc = err;
2453         }
2454
2455         oc = ll_mdscapa_get(inode);
2456         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2457                       &req);
2458         capa_put(oc);
2459         if (!rc)
2460                 rc = err;
2461         if (!err)
2462                 ptlrpc_req_finished(req);
2463
2464         if (data) {
2465                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2466
2467                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2468                                 CL_FSYNC_ALL);
2469                 if (rc == 0 && err < 0)
2470                         rc = err;
2471                 if (rc < 0)
2472                         fd->fd_write_failed = true;
2473                 else
2474                         fd->fd_write_failed = false;
2475         }
2476
2477 #ifdef HAVE_FILE_FSYNC_4ARGS
2478         mutex_unlock(&inode->i_mutex);
2479 #endif
2480         RETURN(rc);
2481 }
2482
/**
 * Handle a flock or POSIX lock request by enqueueing an LDLM_FLOCK lock
 * through md_enqueue() and then registering the result with the local VFS
 * lock lists.
 *
 * \param file      [IN] file the lock applies to
 * \param cmd       [IN] F_SETLK, F_SETLKW or F_GETLK (and their 64-bit
 *                       variants where defined)
 * \param file_lock [IN] lock description; fl_type is overwritten with the
 *                       requested ldlm mode for F_GETLK
 *
 * \retval -EINVAL   fl_flags has neither FL_FLOCK nor FL_POSIX, or \a cmd
 *                   is unknown
 * \retval -ENOTSUPP fl_type is not F_RDLCK, F_WRLCK or F_UNLCK
 * \retval other     PTR_ERR() of the op_data allocation, the md_enqueue()
 *                   result, or the local lock registration error when that
 *                   step failed for a non-unlock request
 */
int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
                                           .ei_cb_cp =ldlm_flock_completion_ast,
                                           .ei_cbdata = file_lock };
        struct md_op_data *op_data;
        struct lustre_handle lockh = {0};
        ldlm_policy_data_t flock = {{0}};
        int flags = 0;
        int rc;
        int rc2 = 0;
        ENTRY;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
               inode->i_ino, file_lock);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

        /* translate the VFS lock description into the ldlm flock policy */
        if (file_lock->fl_flags & FL_FLOCK) {
                LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
                /* flocks are whole-file locks */
                flock.l_flock.end = OFFSET_MAX;
                /* For flocks owner is determined by the local file descriptor*/
                flock.l_flock.owner = (unsigned long)file_lock->fl_file;
        } else if (file_lock->fl_flags & FL_POSIX) {
                flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
                flock.l_flock.start = file_lock->fl_start;
                flock.l_flock.end = file_lock->fl_end;
        } else {
                RETURN(-EINVAL);
        }
        flock.l_flock.pid = file_lock->fl_pid;

        /* Somewhat ugly workaround for svc lockd.
         * lockd installs custom fl_lmops->lm_compare_owner that checks
         * for the fl_owner to be the same (which it always is on local node
         * I guess between lockd processes) and then compares pid.
         * As such we assign pid to the owner field to make it all work,
         * conflict with normal locks is unlikely since pid space and
         * pointer space for current->files are not intersecting */
        if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
                flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

        /* map the fcntl lock type onto an ldlm lock mode */
        switch (file_lock->fl_type) {
        case F_RDLCK:
                einfo.ei_mode = LCK_PR;
                break;
        case F_UNLCK:
                /* An unlock request may or may not have any relation to
                 * existing locks so we may not be able to pass a lock handle
                 * via a normal ldlm_lock_cancel() request. The request may even
                 * unlock a byte range in the middle of an existing lock. In
                 * order to process an unlock request we need all of the same
                 * information that is given with a normal read or write record
                 * lock request. To avoid creating another ldlm unlock (cancel)
                 * message we'll treat a LCK_NL flock request as an unlock. */
                einfo.ei_mode = LCK_NL;
                break;
        case F_WRLCK:
                einfo.ei_mode = LCK_PW;
                break;
        default:
                CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
                        file_lock->fl_type);
                RETURN (-ENOTSUPP);
        }

        /* map the fcntl command onto ldlm enqueue flags */
        switch (cmd) {
        case F_SETLKW:
#ifdef F_SETLKW64
        case F_SETLKW64:
#endif
                flags = 0;
                break;
        case F_SETLK:
#ifdef F_SETLK64
        case F_SETLK64:
#endif
                flags = LDLM_FL_BLOCK_NOWAIT;
                break;
        case F_GETLK:
#ifdef F_GETLK64
        case F_GETLK64:
#endif
                flags = LDLM_FL_TEST_LOCK;
                /* Save the old mode so that if the mode in the lock changes we
                 * can decrement the appropriate reader or writer refcount. */
                file_lock->fl_type = einfo.ei_mode;
                break;
        default:
                CERROR("unknown fcntl lock command: %d\n", cmd);
                RETURN (-EINVAL);
        }

        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
               "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
               flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);

        rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
                        op_data, &lockh, &flock, 0, NULL /* req */, flags);

        /* record the lock locally when the enqueue succeeded; unlocks are
         * applied locally regardless of the enqueue result.  Test-only POSIX
         * requests (F_GETLK) are not recorded. */
        if ((file_lock->fl_flags & FL_FLOCK) &&
            (rc == 0 || file_lock->fl_type == F_UNLCK))
                rc2  = flock_lock_file_wait(file, file_lock);
        if ((file_lock->fl_flags & FL_POSIX) &&
            (rc == 0 || file_lock->fl_type == F_UNLCK) &&
            !(flags & LDLM_FL_TEST_LOCK))
                rc2  = posix_lock_file_wait(file, file_lock);

        /* local registration failed for a lock request: enqueue an LCK_NL
         * (unlock) request for the same range and report the local error;
         * the result of this second enqueue is not checked */
        if (rc2 && file_lock->fl_type != F_UNLCK) {
                einfo.ei_mode = LCK_NL;
                md_enqueue(sbi->ll_md_exp, &einfo, NULL,
                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
                rc = rc2;
        }

        ll_finish_md_op_data(op_data);

        RETURN(rc);
}
2610
2611 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2612 {
2613         ENTRY;
2614
2615         RETURN(-ENOSYS);
2616 }
2617
2618 /**
2619  * test if some locks matching bits and l_req_mode are acquired
2620  * - bits can be in different locks
2621  * - if found clear the common lock bits in *bits
2622  * - the bits not found, are kept in *bits
2623  * \param inode [IN]
2624  * \param bits [IN] searched lock bits [IN]
2625  * \param l_req_mode [IN] searched lock mode
2626  * \retval boolean, true iff all bits are found
2627  */
2628 int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2629 {
2630         struct lustre_handle lockh;
2631         ldlm_policy_data_t policy;
2632         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2633                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2634         struct lu_fid *fid;
2635         __u64 flags;
2636         int i;
2637         ENTRY;
2638
2639         if (!inode)
2640                RETURN(0);
2641
2642         fid = &ll_i2info(inode)->lli_fid;
2643         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2644                ldlm_lockname[mode]);
2645
2646         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2647         for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2648                 policy.l_inodebits.bits = *bits & (1 << i);
2649                 if (policy.l_inodebits.bits == 0)
2650                         continue;
2651
2652                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2653                                   &policy, mode, &lockh)) {
2654                         struct ldlm_lock *lock;
2655
2656                         lock = ldlm_handle2lock(&lockh);
2657                         if (lock) {
2658                                 *bits &=
2659                                       ~(lock->l_policy_data.l_inodebits.bits);
2660                                 LDLM_LOCK_PUT(lock);
2661                         } else {
2662                                 *bits &= ~policy.l_inodebits.bits;
2663                         }
2664                 }
2665         }
2666         RETURN(*bits == 0);
2667 }
2668
2669 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2670                             struct lustre_handle *lockh, __u64 flags)
2671 {
2672         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2673         struct lu_fid *fid;
2674         ldlm_mode_t rc;
2675         ENTRY;
2676
2677         fid = &ll_i2info(inode)->lli_fid;
2678         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2679
2680         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2681                            fid, LDLM_IBITS, &policy,
2682                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2683         RETURN(rc);
2684 }
2685
2686 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2687 {
2688         /* Already unlinked. Just update nlink and return success */
2689         if (rc == -ENOENT) {
2690                 clear_nlink(inode);
2691                 /* This path cannot be hit for regular files unless in
2692                  * case of obscure races, so no need to to validate
2693                  * size. */
2694                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2695                         return 0;
2696         } else if (rc != 0) {
2697                 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2698                        ll_get_fsname(inode->i_sb, NULL, 0),
2699                        PFID(ll_inode2fid(inode)), rc);
2700         }
2701
2702         return rc;
2703 }
2704
/**
 * Refresh the metadata of the inode behind \a dentry.
 *
 * Two strategies are used depending on the MD export:
 * - with OBD_CONNECT_ATTRFID, an IT_GETATTR (or IT_LOOKUP when \a ibits is
 *   exactly MDS_INODELOCK_LOOKUP) intent lock is taken by FID;
 * - otherwise, and only if ll_have_md_lock() does not already find locks
 *   covering \a ibits, a plain md_getattr() is issued and the reply applied
 *   with ll_prep_inode().
 *
 * \param dentry [IN] dentry to revalidate; its inode must not be NULL
 * \param it     [IN] not referenced by this function
 * \param ibits  [IN] inodebits the caller needs to be valid
 *
 * \retval 0 on success, otherwise the error from the failing step (errors
 *         from the MD calls are filtered by ll_inode_revalidate_fini())
 */
int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
                             __u64 ibits)
{
        struct inode *inode = dentry->d_inode;
        struct ptlrpc_request *req = NULL;
        struct obd_export *exp;
        int rc = 0;
        ENTRY;

        LASSERT(inode != NULL);

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
               inode->i_ino, inode->i_generation, inode, dentry->d_name.name);

        exp = ll_i2mdexp(inode);

        /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
         *      But under CMD case, it caused some lock issues, should be fixed
         *      with new CMD ibits lock. See bug 12718 */
        if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
                struct lookup_intent oit = { .it_op = IT_GETATTR };
                struct md_op_data *op_data;

                if (ibits == MDS_INODELOCK_LOOKUP)
                        oit.it_op = IT_LOOKUP;

                /* Call getattr by fid, so do not provide name at all. */
                op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
                                             dentry->d_inode, NULL, 0, 0,
                                             LUSTRE_OPC_ANY, NULL);
                if (IS_ERR(op_data))
                        RETURN(PTR_ERR(op_data));

                /* M_CHECK_STALE is set only for the duration of the intent
                 * call and cleared again right after it */
                oit.it_create_mode |= M_CHECK_STALE;
                rc = md_intent_lock(exp, op_data, NULL, 0,
                                    /* we are not interested in name
                                       based lookup */
                                    &oit, 0, &req,
                                    ll_md_blocking_ast, 0);
                ll_finish_md_op_data(op_data);
                oit.it_create_mode &= ~M_CHECK_STALE;
                if (rc < 0) {
                        rc = ll_inode_revalidate_fini(inode, rc);
                        GOTO (out, rc);
                }

                rc = ll_revalidate_it_finish(req, &oit, dentry);
                if (rc != 0) {
                        ll_intent_release(&oit);
                        GOTO(out, rc);
                }

                /* Unlinked? Unhash dentry, so it is not picked up later by
                   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
                   here to preserve get_cwd functionality on 2.6.
                   Bug 10503 */
                if (!dentry->d_inode->i_nlink)
                        d_lustre_invalidate(dentry);

                ll_lookup_finish_locks(&oit, dentry);
        } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
                struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
                obd_valid valid = OBD_MD_FLGETATTR;
                struct md_op_data *op_data;
                int ealen = 0;

                /* regular files additionally ask for the EA, sized by
                 * ll_get_max_mdsize() */
                if (S_ISREG(inode->i_mode)) {
                        rc = ll_get_max_mdsize(sbi, &ealen);
                        if (rc)
                                RETURN(rc);
                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
                }

                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
                                             0, ealen, LUSTRE_OPC_ANY,
                                             NULL);
                if (IS_ERR(op_data))
                        RETURN(PTR_ERR(op_data));

                op_data->op_valid = valid;
                /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
                 * capa for this inode. Because we only keep capas of dirs
                 * fresh. */
                rc = md_getattr(sbi->ll_md_exp, op_data, &req);
                ll_finish_md_op_data(op_data);
                if (rc) {
                        rc = ll_inode_revalidate_fini(inode, rc);
                        RETURN(rc);
                }

                rc = ll_prep_inode(&inode, req, NULL, NULL);
        }
out:
        /* req is still NULL here when no request was obtained */
        ptlrpc_req_finished(req);
        return rc;
}
2801
2802 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2803                            __u64 ibits)
2804 {
2805         struct inode *inode = dentry->d_inode;
2806         int rc;
2807         ENTRY;
2808
2809         rc = __ll_inode_revalidate_it(dentry, it, ibits);
2810         if (rc != 0)
2811                 RETURN(rc);
2812
2813         /* if object isn't regular file, don't validate size */
2814         if (!S_ISREG(inode->i_mode)) {
2815                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2816                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2817                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2818         } else {
2819                 rc = ll_glimpse_size(inode);
2820         }
2821         RETURN(rc);
2822 }
2823
2824 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2825                   struct lookup_intent *it, struct kstat *stat)
2826 {
2827         struct inode *inode = de->d_inode;
2828         struct ll_sb_info *sbi = ll_i2sbi(inode);
2829         struct ll_inode_info *lli = ll_i2info(inode);
2830         int res = 0;
2831
2832         res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2833                                              MDS_INODELOCK_LOOKUP);
2834         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2835
2836         if (res)
2837                 return res;
2838
2839         stat->dev = inode->i_sb->s_dev;
2840         if (ll_need_32bit_api(sbi))
2841                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2842         else
2843                 stat->ino = inode->i_ino;
2844         stat->mode = inode->i_mode;
2845         stat->nlink = inode->i_nlink;
2846         stat->uid = inode->i_uid;
2847         stat->gid = inode->i_gid;
2848         stat->rdev = inode->i_rdev;
2849         stat->atime = inode->i_atime;
2850         stat->mtime = inode->i_mtime;
2851         stat->ctime = inode->i_ctime;
2852         stat->blksize = 1 << inode->i_blkbits;
2853
2854         stat->size = i_size_read(inode);
2855         stat->blocks = inode->i_blocks;
2856
2857         return 0;
2858 }
2859 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2860 {
2861         struct lookup_intent it = { .it_op = IT_GETATTR };
2862
2863         return ll_getattr_it(mnt, de, &it, stat);
2864 }
2865
#ifdef HAVE_LINUX_FIEMAP_H
/**
 * ->fiemap() handler: translate the VFS fiemap request into a
 * struct ll_user_fiemap, run ll_do_fiemap() and copy the results back.
 *
 * \param inode   [IN]     inode being mapped
 * \param fieinfo [IN/OUT] VFS request; fi_flags, fi_extents_mapped and the
 *                         extent array are updated from the result
 * \param start   [IN]     first byte of the range to map
 * \param len     [IN]     length of the range to map
 *
 * \retval -ENOMEM the temporary request buffer could not be allocated
 * \retval other   the result of ll_do_fiemap()
 */
int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len)
{
        int rc;
        size_t num_bytes;
        struct ll_user_fiemap *fiemap;
        unsigned int extent_count = fieinfo->fi_extents_max;
        unsigned int nr_copy;

        num_bytes = sizeof(*fiemap) + (extent_count *
                                       sizeof(struct ll_fiemap_extent));
        OBD_ALLOC_LARGE(fiemap, num_bytes);

        /* plain return: this function has no ENTRY to pair a RETURN with */
        if (fiemap == NULL)
                return -ENOMEM;

        fiemap->fm_flags = fieinfo->fi_flags;
        fiemap->fm_extent_count = fieinfo->fi_extents_max;
        fiemap->fm_start = start;
        fiemap->fm_length = len;
        /* Seed the first extent from the caller's array, but only when the
         * buffer has room for one: with a zero extent count the allocation
         * ends right after the header and there is nothing to read from.
         * NOTE(review): fi_extents_start is presumably a user-space pointer
         * in the kernel fiemap API - confirm whether copy_from_user() /
         * copy_to_user() should replace the memcpy() calls here. */
        if (extent_count > 0)
                memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
                       sizeof(struct ll_fiemap_extent));

        rc = ll_do_fiemap(inode, fiemap, num_bytes);

        fieinfo->fi_flags = fiemap->fm_flags;
        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;

        /* never copy back more extents than either buffer can hold */
        nr_copy = fiemap->fm_mapped_extents;
        if (nr_copy > extent_count)
                nr_copy = extent_count;
        if (nr_copy > 0)
                memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
                       nr_copy * sizeof(struct ll_fiemap_extent));

        OBD_FREE_LARGE(fiemap, num_bytes);
        return rc;
}
#endif
2900
2901 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2902 {
2903         struct ll_inode_info *lli = ll_i2info(inode);
2904         struct posix_acl *acl = NULL;
2905         ENTRY;
2906
2907         spin_lock(&lli->lli_lock);
2908         /* VFS' acl_permission_check->check_acl will release the refcount */
2909         acl = posix_acl_dup(lli->lli_posix_acl);
2910         spin_unlock(&lli->lli_lock);
2911
2912         RETURN(acl);
2913 }
2914
2915 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
2916 static int
2917 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2918 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2919 # else
2920 ll_check_acl(struct inode *inode, int mask)
2921 # endif
2922 {
2923 # ifdef CONFIG_FS_POSIX_ACL
2924         struct posix_acl *acl;
2925         int rc;
2926         ENTRY;
2927
2928 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
2929         if (flags & IPERM_FLAG_RCU)
2930                 return -ECHILD;
2931 #  endif
2932         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2933
2934         if (!acl)
2935                 RETURN(-EAGAIN);
2936
2937         rc = posix_acl_permission(inode, acl, mask);
2938         posix_acl_release(acl);
2939
2940         RETURN(rc);
2941 # else /* !CONFIG_FS_POSIX_ACL */
2942         return -EAGAIN;
2943 # endif /* CONFIG_FS_POSIX_ACL */
2944 }
2945 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2946
2947 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2948 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2949 #else
2950 # ifdef HAVE_INODE_PERMISION_2ARGS
2951 int ll_inode_permission(struct inode *inode, int mask)
2952 # else
2953 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2954 # endif
2955 #endif
2956 {
2957         int rc = 0;
2958         ENTRY;
2959
2960 #ifdef MAY_NOT_BLOCK
2961         if (mask & MAY_NOT_BLOCK)
2962                 return -ECHILD;
2963 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2964         if (flags & IPERM_FLAG_RCU)
2965                 return -ECHILD;
2966 #endif
2967
2968        /* as root inode are NOT getting validated in lookup operation,
2969         * need to do it before permission check. */
2970
2971         if (inode == inode->i_sb->s_root->d_inode) {
2972                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2973
2974                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2975                                               MDS_INODELOCK_LOOKUP);
2976                 if (rc)
2977                         RETURN(rc);
2978         }
2979
2980         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2981                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2982
2983         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2984                 return lustre_check_remote_perm(inode, mask);
2985
2986         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2987         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
2988
2989         RETURN(rc);
2990 }
2991
/* Select the vectored read/write member names of struct file_operations and
 * the llite functions assigned to them: readv/writev when HAVE_FILE_READV is
 * defined, aio_read/aio_write otherwise.  The file_operations tables below
 * use these macros so they can be written once for both cases. */
#ifdef HAVE_FILE_READV
#define READ_METHOD readv
#define READ_FUNCTION ll_file_readv
#define WRITE_METHOD writev
#define WRITE_FUNCTION ll_file_writev
#else
#define READ_METHOD aio_read
#define READ_FUNCTION ll_file_aio_read
#define WRITE_METHOD aio_write
#define WRITE_FUNCTION ll_file_aio_write
#endif
3003
/* -o localflock - only provides locally consistent flock locks.
 * This table installs no .flock/.lock handlers of its own. */
struct file_operations ll_file_operations = {
        .read           = ll_file_read,
        .READ_METHOD    = READ_FUNCTION,
        .write          = ll_file_write,
        .WRITE_METHOD   = WRITE_FUNCTION,
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
#ifdef HAVE_KERNEL_SENDFILE
        .sendfile       = ll_file_sendfile,
#endif
#ifdef HAVE_KERNEL_SPLICE_READ
        .splice_read    = ll_file_splice_read,
#endif
        .fsync          = ll_fsync,
        .flush          = ll_flush
};
3024
/* Same methods as ll_file_operations, plus .flock and .lock both routed to
 * ll_file_flock().  Presumably selected by the "-o flock" mount option -
 * TODO confirm against the mount code. */
struct file_operations ll_file_operations_flock = {
        .read           = ll_file_read,
        .READ_METHOD    = READ_FUNCTION,
        .write          = ll_file_write,
        .WRITE_METHOD   = WRITE_FUNCTION,
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
#ifdef HAVE_KERNEL_SENDFILE
        .sendfile       = ll_file_sendfile,
#endif
#ifdef HAVE_KERNEL_SPLICE_READ
        .splice_read    = ll_file_splice_read,
#endif
        .fsync          = ll_fsync,
        .flush          = ll_flush,
        .flock          = ll_file_flock,
        .lock           = ll_file_flock
};
3046
/* These are for -o noflock - to return ENOSYS on flock calls.
 * Same methods as ll_file_operations, plus .flock and .lock both routed to
 * ll_file_noflock(). */
struct file_operations ll_file_operations_noflock = {
        .read           = ll_file_read,
        .READ_METHOD    = READ_FUNCTION,
        .write          = ll_file_write,
        .WRITE_METHOD   = WRITE_FUNCTION,
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
#ifdef HAVE_KERNEL_SENDFILE
        .sendfile       = ll_file_sendfile,
#endif
#ifdef HAVE_KERNEL_SPLICE_READ
        .splice_read    = ll_file_splice_read,
#endif
        .fsync          = ll_fsync,
        .flush          = ll_flush,
        .flock          = ll_file_noflock,
        .lock           = ll_file_noflock
};
3069
/* Inode method table: attribute, permission and extended-attribute
 * handlers, plus fiemap and get_acl where the kernel provides the
 * corresponding hooks. */
struct inode_operations ll_file_inode_operations = {
        .setattr        = ll_setattr,
        .getattr        = ll_getattr,
        .permission     = ll_inode_permission,
        .setxattr       = ll_setxattr,
        .getxattr       = ll_getxattr,
        .listxattr      = ll_listxattr,
        .removexattr    = ll_removexattr,
#ifdef  HAVE_LINUX_FIEMAP_H
        .fiemap         = ll_fiemap,
#endif
#ifdef HAVE_IOP_GET_ACL
        .get_acl        = ll_get_acl,
#endif
};
3085
3086 /* dynamic ioctl number support routins */
3087 static struct llioc_ctl_data {
3088         struct rw_semaphore     ioc_sem;
3089         cfs_list_t              ioc_head;
3090 } llioc = {
3091         __RWSEM_INITIALIZER(llioc.ioc_sem),
3092         CFS_LIST_HEAD_INIT(llioc.ioc_head)
3093 };
3094
3095
3096 struct llioc_data {
3097         cfs_list_t              iocd_list;
3098         unsigned int            iocd_size;
3099         llioc_callback_t        iocd_cb;
3100         unsigned int            iocd_count;
3101         unsigned int            iocd_cmd[0];
3102 };
3103
3104 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3105 {
3106         unsigned int size;
3107         struct llioc_data *in_data = NULL;
3108         ENTRY;
3109
3110         if (cb == NULL || cmd == NULL ||
3111             count > LLIOC_MAX_CMD || count < 0)
3112                 RETURN(NULL);
3113
3114         size = sizeof(*in_data) + count * sizeof(unsigned int);
3115         OBD_ALLOC(in_data, size);
3116         if (in_data == NULL)
3117                 RETURN(NULL);
3118
3119         memset(in_data, 0, sizeof(*in_data));
3120         in_data->iocd_size = size;
3121         in_data->iocd_cb = cb;
3122         in_data->iocd_count = count;
3123         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3124
3125         down_write(&llioc.ioc_sem);
3126         cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3127         up_write(&llioc.ioc_sem);
3128
3129         RETURN(in_data);
3130 }
3131
3132 void ll_iocontrol_unregister(void *magic)
3133 {
3134         struct llioc_data *tmp;
3135
3136         if (magic == NULL)
3137                 return;
3138
3139         down_write(&llioc.ioc_sem);
3140         cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3141                 if (tmp == magic) {
3142                         unsigned int size = tmp->iocd_size;
3143
3144                         cfs_list_del(&tmp->iocd_list);
3145                         up_write(&llioc.ioc_sem);
3146
3147                         OBD_FREE(tmp, size);
3148                         return;
3149                 }
3150         }
3151         up_write(&llioc.ioc_sem);
3152
3153         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3154 }
3155
/* export the dynamic ioctl registration interface to other modules */
EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
3158
3159 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3160                         unsigned int cmd, unsigned long arg, int *rcp)
3161 {
3162         enum llioc_iter ret = LLIOC_CONT;
3163         struct llioc_data *data;
3164         int rc = -EINVAL, i;
3165
3166         down_read(&llioc.ioc_sem);
3167         cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3168                 for (i = 0; i < data->iocd_count; i++) {
3169                         if (cmd != data->iocd_cmd[i])
3170                                 continue;
3171
3172                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3173                         break;
3174                 }
3175
3176                 if (ret == LLIOC_STOP)
3177                         break;
3178         }
3179         up_read(&llioc.ioc_sem);
3180
3181         if (rcp)
3182                 *rcp = rc;
3183         return ret;
3184 }
3185
3186 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3187 {
3188         struct ll_inode_info *lli = ll_i2info(inode);
3189         struct cl_env_nest nest;
3190         struct lu_env *env;
3191         int result;
3192         ENTRY;
3193
3194         if (lli->lli_clob == NULL)
3195                 RETURN(0);
3196
3197         env = cl_env_nested_get(&nest);
3198         if (IS_ERR(env))
3199                 RETURN(PTR_ERR(env));
3200
3201         result = cl_conf_set(env, lli->lli_clob, conf);
3202         cl_env_nested_put(&nest, env);
3203
3204         if (conf->coc_opc == OBJECT_CONF_SET) {
3205                 struct ldlm_lock *lock = conf->coc_lock;
3206
3207                 LASSERT(lock != NULL);
3208                 LASSERT(ldlm_has_layout(lock));
3209                 if (result == 0) {
3210                         /* it can only be allowed to match after layout is
3211                          * applied to inode otherwise false layout would be
3212                          * seen. Applying layout shoud happen before dropping
3213                          * the intent lock. */
3214                         ldlm_lock_allow_match(lock);
3215                 }
3216         }
3217         RETURN(result);
3218 }
3219
3220 /**
3221  * Apply the layout to the inode. Layout lock is held and will be released
3222  * in this function.
3223  */
3224 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3225                                 struct inode *inode, __u32 *gen, bool reconf)
3226 {
3227         struct ll_inode_info *lli = ll_i2info(inode);
3228         struct ll_sb_info    *sbi = ll_i2sbi(inode);
3229         struct ldlm_lock *lock;
3230         struct lustre_md md = { NULL };
3231         struct cl_object_conf conf;
3232         int rc = 0;
3233         bool lvb_ready;
3234         ENTRY;
3235
3236         LASSERT(lustre_handle_is_used(lockh));
3237
3238         lock = ldlm_handle2lock(lockh);
3239         LASSERT(lock != NULL);
3240         LASSERT(ldlm_has_layout(lock));
3241
3242         LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3243                 inode, PFID(&lli->lli_fid), reconf);
3244
3245         lock_res_and_lock(lock);
3246         lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3247         unlock_res_and_lock(lock);
3248         /* checking lvb_ready is racy but this is okay. The worst case is
3249          * that multi processes may configure the file on the same time. */
3250         if (lvb_ready || !reconf) {
3251                 LDLM_LOCK_PUT(lock);
3252
3253                 rc = -ENODATA;
3254                 if (lvb_ready) {
3255                         /* layout_gen must be valid if layout lock is not
3256                          * cancelled and stripe has already set */
3257                         *gen = lli->lli_layout_gen;
3258                         rc = 0;
3259                 }
3260                 ldlm_lock_decref(lockh, mode);
3261                 RETURN(rc);
3262         }
3263
3264         /* for layout lock, lmm is returned in lock's lvb.
3265          * lvb_data is immutable if the lock is held so it's safe to access it
3266          * without res lock. See the description in ldlm_lock_decref_internal()
3267          * for the condition to free lvb_data of layout lock */
3268         if (lock->l_lvb_data != NULL) {
3269                 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3270                                   lock->l_lvb_data, lock->l_lvb_len);
3271                 if (rc >= 0) {
3272                         *gen = LL_LAYOUT_GEN_EMPTY;
3273                         if (md.lsm != NULL)
3274                                 *gen = md.lsm->lsm_layout_gen;
3275                         rc = 0;
3276                 } else {
3277                         CERROR("%s: file "DFID" unpackmd error: %d\n",
3278                                 ll_get_fsname(inode->i_sb, NULL, 0),
3279                                 PFID(&lli->lli_fid), rc);
3280                 }
3281         }
3282         if (rc < 0) {
3283                 LDLM_LOCK_PUT(lock);
3284                 ldlm_lock_decref(lockh, mode);
3285                 RETURN(rc);
3286         }
3287
3288         /* set layout to file. Unlikely this will fail as old layout was
3289          * surely eliminated */
3290         memset(&conf, 0, sizeof conf);
3291         conf.coc_opc = OBJECT_CONF_SET;
3292         conf.coc_inode = inode;
3293         conf.coc_lock = lock;
3294         conf.u.coc_md = &md;
3295         rc = ll_layout_conf(inode, &conf);
3296         LDLM_LOCK_PUT(lock);
3297
3298         ldlm_lock_decref(lockh, mode);
3299
3300         if (md.lsm != NULL)
3301                 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3302
3303         /* wait for IO to complete if it's still being used. */
3304         if (rc == -EBUSY) {
3305                 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3306                         ll_get_fsname(inode->i_sb, NULL, 0),
3307                         inode, PFID(&lli->lli_fid));
3308
3309                 memset(&conf, 0, sizeof conf);
3310                 conf.coc_opc = OBJECT_CONF_WAIT;
3311                 conf.coc_inode = inode;
3312                 rc = ll_layout_conf(inode, &conf);
3313                 if (rc == 0)
3314                         rc = -EAGAIN;
3315
3316                 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3317                         PFID(&lli->lli_fid), rc);
3318         }
3319
3320         RETURN(rc);
3321 }
3322
3323 /**
3324  * This function checks if there exists a LAYOUT lock on the client side,
3325  * or enqueues it if it doesn't have one in cache.
3326  *
3327  * This function will not hold layout lock so it may be revoked any time after
3328  * this function returns. Any operations depend on layout should be redone
3329  * in that case.
3330  *
3331  * This function should be called before lov_io_init() to get an uptodate
3332  * layout version, the caller should save the version number and after IO
3333  * is finished, this function should be called again to verify that layout
3334  * is not changed during IO time.
3335  */
3336 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3337 {
3338         struct ll_inode_info  *lli = ll_i2info(inode);
3339         struct ll_sb_info     *sbi = ll_i2sbi(inode);
3340         struct md_op_data     *op_data;
3341         struct lookup_intent   it;
3342         struct lustre_handle   lockh;
3343         ldlm_mode_t            mode;
3344         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3345                                            .ei_mode = LCK_CR,
3346                                            .ei_cb_bl = ll_md_blocking_ast,
3347                                            .ei_cb_cp = ldlm_completion_ast,
3348                                            .ei_cbdata = NULL };
3349         int rc;
3350         ENTRY;
3351
3352         *gen = LL_LAYOUT_GEN_NONE;
3353         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3354                 RETURN(0);
3355
3356         /* sanity checks */
3357         LASSERT(fid_is_sane(ll_inode2fid(inode)));
3358         LASSERT(S_ISREG(inode->i_mode));
3359
3360         /* mostly layout lock is caching on the local side, so try to match
3361          * it before grabbing layout lock mutex. */
3362         mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3363         if (mode != 0) { /* hit cached lock */
3364                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3365                 if (rc == 0)
3366                         RETURN(0);
3367
3368                 /* better hold lli_layout_mutex to try again otherwise
3369                  * it will have starvation problem. */
3370         }
3371
3372         /* take layout lock mutex to enqueue layout lock exclusively. */
3373         mutex_lock(&lli->lli_layout_mutex);
3374
3375 again:
3376         /* try again. Maybe somebody else has done this. */
3377         mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3378         if (mode != 0) { /* hit cached lock */
3379                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3380                 if (rc == -EAGAIN)
3381                         goto again;
3382
3383                 mutex_unlock(&lli->lli_layout_mutex);
3384                 RETURN(rc);
3385         }
3386
3387         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3388                         0, 0, LUSTRE_OPC_ANY, NULL);
3389         if (IS_ERR(op_data)) {
3390                 mutex_unlock(&lli->lli_layout_mutex);
3391                 RETURN(PTR_ERR(op_data));
3392         }
3393
3394         /* have to enqueue one */
3395         memset(&it, 0, sizeof(it));
3396         it.it_op = IT_LAYOUT;
3397         lockh.cookie = 0ULL;
3398
3399         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3400                         ll_get_fsname(inode->i_sb, NULL, 0), inode,
3401                         PFID(&lli->lli_fid));
3402
3403         rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3404                         NULL, 0, NULL, 0);
3405         if (it.d.lustre.it_data != NULL)
3406                 ptlrpc_req_finished(it.d.lustre.it_data);
3407         it.d.lustre.it_data = NULL;
3408
3409         ll_finish_md_op_data(op_data);
3410
3411         md_set_lock_data(sbi->ll_md_exp, &it.d.lustre.it_lock_handle, inode, NULL);
3412
3413         mode = it.d.lustre.it_lock_mode;
3414         it.d.lustre.it_lock_mode = 0;
3415         ll_intent_drop_lock(&it);
3416
3417         if (rc == 0) {
3418                 /* set lock data in case this is a new lock */
3419                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3420                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3421                 if (rc == -EAGAIN)
3422                         goto again;
3423         }
3424         mutex_unlock(&lli->lli_layout_mutex);
3425
3426         RETURN(rc);
3427 }