4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/* Allocate a per-open ll_file_data from the slab cache.
 * The fd_write_failed flag starts clear; it is set later if an async
 * write fails so that close can report the error. */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
	/* No write error recorded yet for this open. */
58 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache. */
62 static void ll_file_data_put(struct ll_file_data *fd)
65 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes (mode, times, size, blocks,
 * flags), its IO epoch, the MDS capability and the open handle @fh
 * into @op_data for transmission to the MDS. */
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69 struct lustre_handle *fh)
71 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72 op_data->op_attr.ia_mode = inode->i_mode;
73 op_data->op_attr.ia_atime = inode->i_atime;
74 op_data->op_attr.ia_mtime = inode->i_mtime;
75 op_data->op_attr.ia_ctime = inode->i_ctime;
76 op_data->op_attr.ia_size = i_size_read(inode);
77 op_data->op_attr_blocks = inode->i_blocks;
	/* Translate VFS inode flags to the on-wire ext-style flag bits. */
78 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
79 ll_inode_to_ext_flags(inode->i_flags);
80 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
82 op_data->op_handle = *fh;
83 op_data->op_capa1 = ll_mdscapa_get(inode);
	/* Tell the MDS the data was modified so it can update SOM state. */
85 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
86 op_data->op_bias |= MDS_DATA_MODIFIED;
90 * Closes the IO epoch and packs all the attributes into @op_data for
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
	/* Always send mode and timestamps on close. */
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
99 ATTR_MTIME_SET | ATTR_CTIME_SET;
101 if (!(och->och_flags & FMODE_WRITE))
	/* Without Size-on-MDS support (or for non-regular files) the MDS
	 * needs size/blocks from the client on close. */
104 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
105 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
107 ll_ioepoch_close(inode, op_data, &och, 0);
110 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
111 ll_prep_md_op_data(op_data, inode, NULL, NULL,
112 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send the MDS close RPC for an open handle @och on @inode, perform a
 * Size-on-MDS update if the server requests one, destroy any orphan
 * objects named in the close reply, and invalidate the handle. */
116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
118 struct obd_client_handle *och)
120 struct obd_export *exp = ll_i2mdexp(inode);
121 struct md_op_data *op_data;
122 struct ptlrpc_request *req = NULL;
123 struct obd_device *obd = class_exp2obd(exp);
130 * XXX: in case of LMV, is this correct to access
133 CERROR("Invalid MDC connection handle "LPX64"\n",
134 ll_i2mdexp(inode)->exp_handle.h_cookie);
138 OBD_ALLOC_PTR(op_data);
140 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
142 ll_prepare_close(inode, op_data, och);
143 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
144 rc = md_close(md_exp, op_data, och->och_mod, &req);
146 /* This close must have the epoch closed. */
147 LASSERT(epoch_close);
148 /* MDS has instructed us to obtain Size-on-MDS attribute from
149 * OSTs and send setattr back to MDS. */
150 rc = ll_som_update(inode, op_data);
152 CERROR("inode %lu mdc Size-on-MDS update failed: "
153 "rc = %d\n", inode->i_ino, rc);
157 CERROR("inode %lu mdc close failed: rc = %d\n",
161 /* DATA_MODIFIED flag was successfully sent on close, cancel data
162 * modification flag. */
163 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
164 struct ll_inode_info *lli = ll_i2info(inode);
	/* lli_flags is shared state; clear the bit under lli_lock. */
166 spin_lock(&lli->lli_lock);
167 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
168 spin_unlock(&lli->lli_lock);
171 ll_finish_md_op_data(op_data);
	/* Destroy OST objects the close reply marked as orphans. */
174 rc = ll_objects_destroy(req, inode);
176 CERROR("inode %lu ll_objects destroy: rc = %d\n",
	/* If SOM is in use and the epoch is still open on a write handle,
	 * queue DONE_WRITING so the epoch gets closed later. */
183 if (exp_connect_som(exp) && !epoch_close &&
184 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
185 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
187 md_clear_open_replay_data(md_exp, och);
188 /* Free @och if it is not waiting for DONE_WRITING. */
189 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
192 if (req) /* This is close request */
193 ptlrpc_req_finished(req);
/* Drop the cached MDS open handle of the kind selected by @flags
 * (write/exec/read) if no other local opens still use it, and send
 * the close RPC for it. */
197 int ll_md_real_close(struct inode *inode, int flags)
199 struct ll_inode_info *lli = ll_i2info(inode);
200 struct obd_client_handle **och_p;
201 struct obd_client_handle *och;
	/* Pick the handle slot and use-counter matching the open mode. */
206 if (flags & FMODE_WRITE) {
207 och_p = &lli->lli_mds_write_och;
208 och_usecount = &lli->lli_open_fd_write_count;
209 } else if (flags & FMODE_EXEC) {
210 och_p = &lli->lli_mds_exec_och;
211 och_usecount = &lli->lli_open_fd_exec_count;
213 LASSERT(flags & FMODE_READ);
214 och_p = &lli->lli_mds_read_och;
215 och_usecount = &lli->lli_open_fd_read_count;
218 mutex_lock(&lli->lli_och_mutex);
219 if (*och_usecount) { /* There are still users of this handle, so
221 mutex_unlock(&lli->lli_och_mutex);
226 mutex_unlock(&lli->lli_och_mutex);
	/* och may already have been freed by a racing closer. */
228 if (och) { /* There might be a race and somebody have freed this och
230 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: release any group lock, drop this fd's
 * reference on the per-mode open count and, when an MDS OPEN lock is
 * not cached, do the real MDS close.  Frees the ll_file_data. */
237 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
240 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
241 struct ll_inode_info *lli = ll_i2info(inode);
245 /* clear group lock, if present */
246 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
247 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
249 /* Let's see if we have good enough OPEN lock on the file and if
250 we can skip talking to MDS */
251 if (file->f_dentry->d_inode) { /* Can this ever be false? */
	/* TEST_LOCK: only check for a matching granted lock, do not take it. */
253 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
254 struct lustre_handle lockh;
255 struct inode *inode = file->f_dentry->d_inode;
256 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
	/* Drop this fd's contribution to the per-mode open count. */
258 mutex_lock(&lli->lli_och_mutex);
259 if (fd->fd_omode & FMODE_WRITE) {
261 LASSERT(lli->lli_open_fd_write_count);
262 lli->lli_open_fd_write_count--;
263 } else if (fd->fd_omode & FMODE_EXEC) {
265 LASSERT(lli->lli_open_fd_exec_count);
266 lli->lli_open_fd_exec_count--;
269 LASSERT(lli->lli_open_fd_read_count);
270 lli->lli_open_fd_read_count--;
272 mutex_unlock(&lli->lli_och_mutex);
	/* No cached OPEN lock -> must tell the MDS about the close now. */
274 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
275 LDLM_IBITS, &policy, lockmode,
277 rc = ll_md_real_close(file->f_dentry->d_inode,
281 CERROR("Releasing a file %p with negative dentry %p. Name %s",
282 file, file->f_dentry, file->f_dentry->d_name.name);
285 LUSTRE_FPRIVATE(file) = NULL;
286 ll_file_data_put(fd);
287 ll_capa_close(inode);
292 /* While this returns an error code, fput() the caller does not, so we need
293 * to make every effort to clean up all of our state here. Also, applications
294 * rarely check close errors and even if an error is returned they will not
295 * re-try the close call.
297 int ll_file_release(struct inode *inode, struct file *file)
299 struct ll_file_data *fd;
300 struct ll_sb_info *sbi = ll_i2sbi(inode);
301 struct ll_inode_info *lli = ll_i2info(inode);
305 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
306 inode->i_generation, inode);
	/* Remote-client ACL bookkeeping only applies to the root inode. */
308 #ifdef CONFIG_FS_POSIX_ACL
309 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
310 inode == inode->i_sb->s_root->d_inode) {
311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
314 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
315 fd->fd_flags &= ~LL_FILE_RMTACL;
316 rct_del(&sbi->ll_rct, cfs_curproc_pid());
317 et_search_free(&sbi->ll_et, cfs_curproc_pid());
	/* Don't count releases of the root dentry in the stats. */
322 if (inode->i_sb->s_root != file->f_dentry)
323 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
324 fd = LUSTRE_FPRIVATE(file);
327 /* The last ref on @file, maybe not the owner pid of statahead.
328 * Different processes can open the same dir, "ll_opendir_key" means:
329 * it is me that should stop the statahead thread. */
330 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
331 lli->lli_opendir_pid != 0)
332 ll_stop_statahead(inode, lli->lli_opendir_key);
	/* The root is never really opened on the MDS; just drop fd. */
334 if (inode->i_sb->s_root == file->f_dentry) {
335 LUSTRE_FPRIVATE(file) = NULL;
336 ll_file_data_put(fd);
	/* Collect any async write error recorded against the clio object. */
340 if (!S_ISDIR(inode->i_mode)) {
341 lov_read_and_clear_async_rc(lli->lli_clob);
342 lli->lli_async_rc = 0;
345 rc = ll_md_close(sbi->ll_md_exp, inode, file);
347 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
348 libcfs_debug_dumplog();
/* Enqueue an OPEN intent lock on the MDS for @file, optionally sending
 * striping info (@lmm/@lmmsize) when setting stripes.  On success the
 * dentry's inode is (re)filled from the reply and the lock data set. */
353 static int ll_intent_file_open(struct file *file, void *lmm,
354 int lmmsize, struct lookup_intent *itp)
356 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
357 struct dentry *parent = file->f_dentry->d_parent;
358 const char *name = file->f_dentry->d_name.name;
359 const int len = file->f_dentry->d_name.len;
360 struct md_op_data *op_data;
361 struct ptlrpc_request *req;
362 __u32 opc = LUSTRE_OPC_ANY;
369 /* Usually we come here only for NFSD, and we want open lock.
370 But we can also get here with pre 2.6.15 patchless kernels, and in
371 that case that lock is also ok */
372 /* We can also get here if there was cached open handle in revalidate_it
373 * but it disappeared while we were getting from there to ll_file_open.
374 * But this means this file was closed and immediately opened which
375 * makes a good candidate for using OPEN lock */
376 /* If lmmsize & lmm are not 0, we are just setting stripe info
377 * parameters. No need for the open lock */
378 if (lmm == NULL && lmmsize == 0) {
379 itp->it_flags |= MDS_OPEN_LOCK;
380 if (itp->it_flags & FMODE_WRITE)
381 opc = LUSTRE_OPC_CREATE;
384 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
385 file->f_dentry->d_inode, name, len,
388 RETURN(PTR_ERR(op_data));
	/* The inode already exists; open it by FID rather than by name. */
390 itp->it_flags |= MDS_OPEN_BY_FID;
391 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
392 0 /*unused */, &req, ll_md_blocking_ast, 0);
393 ll_finish_md_op_data(op_data);
395 /* reason for keep own exit path - don't flood log
396 * with messages with -ESTALE errors.
398 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
399 it_open_error(DISP_OPEN_OPEN, itp))
401 ll_release_openhandle(file->f_dentry, itp);
405 if (it_disposition(itp, DISP_LOOKUP_NEG))
406 GOTO(out, rc = -ENOENT);
408 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
409 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
410 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
	/* Refresh the inode from the intent reply and attach lock data. */
414 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
415 if (!rc && itp->d.lustre.it_lock_mode)
416 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
420 ptlrpc_req_finished(itp->d.lustre.it_data);
421 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
422 ll_intent_drop_lock(itp);
428 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
429 * not believe attributes if a few ioepoch holders exist. Attributes for
430 * previous ioepoch if new one is opened are also skipped by MDS.
432 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
	/* Only record a non-zero epoch that differs from the current one. */
434 if (ioepoch && lli->lli_ioepoch != ioepoch) {
435 lli->lli_ioepoch = ioepoch;
436 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
437 ioepoch, PFID(&lli->lli_fid));
/* Fill an obd_client_handle from the MDT reply carried by intent @it:
 * copy the server file handle, FID and open flags, record the IO epoch
 * and register the open for replay. */
441 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
442 struct lookup_intent *it, struct obd_client_handle *och)
444 struct ptlrpc_request *req = it->d.lustre.it_data;
445 struct mdt_body *body;
449 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
450 LASSERT(body != NULL); /* reply already checked out */
452 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
453 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
454 och->och_fid = lli->lli_fid;
455 och->och_flags = it->it_flags;
456 ll_ioepoch_open(lli, body->ioepoch);
	/* Keep the open request so it can be replayed after MDS recovery. */
458 return md_set_open_replay_data(md_exp, och, req);
/* Complete an open locally: fill @och from the intent reply (when a
 * new MDS handle was obtained), attach @fd as the file's private data
 * and initialize readahead state. */
461 int ll_local_open(struct file *file, struct lookup_intent *it,
462 struct ll_file_data *fd, struct obd_client_handle *och)
464 struct inode *inode = file->f_dentry->d_inode;
465 struct ll_inode_info *lli = ll_i2info(inode);
	/* f_private must not already be set; ll_file_open cleared it. */
468 LASSERT(!LUSTRE_FPRIVATE(file));
473 struct ptlrpc_request *req = it->d.lustre.it_data;
474 struct mdt_body *body;
477 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
481 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
482 if ((it->it_flags & FMODE_WRITE) &&
483 (body->valid & OBD_MD_FLSIZE))
484 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
485 lli->lli_ioepoch, PFID(&lli->lli_fid));
488 LUSTRE_FPRIVATE(file) = fd;
489 ll_readahead_init(inode, &fd->fd_ras);
	/* Remember the open mode for accounting at close time. */
490 fd->fd_omode = it->it_flags;
494 /* Open a file, and (for the very first open) create objects on the OSTs at
495 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
496 * creation or open until ll_lov_setstripe() ioctl is called.
498 * If we already have the stripe MD locally then we don't request it in
499 * md_open(), by passing a lmm_size = 0.
501 * It is up to the application to ensure no other processes open this file
502 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
503 * used. We might be able to avoid races of that sort by getting lli_open_sem
504 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
505 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
507 int ll_file_open(struct inode *inode, struct file *file)
509 struct ll_inode_info *lli = ll_i2info(inode);
510 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
511 .it_flags = file->f_flags };
512 struct obd_client_handle **och_p = NULL;
513 __u64 *och_usecount = NULL;
514 struct ll_file_data *fd;
515 int rc = 0, opendir_set = 0;
518 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
519 inode->i_generation, inode, file->f_flags);
	/* An intent stashed in private_data means lookup already did the
	 * MDS open; take it over and clear the field for ll_local_open. */
521 it = file->private_data; /* XXX: compat macro */
522 file->private_data = NULL; /* prevent ll_local_open assertion */
524 fd = ll_file_data_get();
526 GOTO(out_och_free, rc = -ENOMEM);
	/* First opener of a directory claims the statahead key. */
529 if (S_ISDIR(inode->i_mode)) {
530 spin_lock(&lli->lli_sa_lock);
531 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
532 lli->lli_opendir_pid == 0) {
533 lli->lli_opendir_key = fd;
534 lli->lli_opendir_pid = cfs_curproc_pid();
537 spin_unlock(&lli->lli_sa_lock);
	/* The filesystem root is never opened on the MDS. */
540 if (inode->i_sb->s_root == file->f_dentry) {
541 LUSTRE_FPRIVATE(file) = fd;
	/* No usable intent: build our own OPEN intent from f_flags. */
545 if (!it || !it->d.lustre.it_disposition) {
546 /* Convert f_flags into access mode. We cannot use file->f_mode,
547 * because everything but O_ACCMODE mask was stripped from
549 if ((oit.it_flags + 1) & O_ACCMODE)
551 if (file->f_flags & O_TRUNC)
552 oit.it_flags |= FMODE_WRITE;
554 /* kernel only call f_op->open in dentry_open. filp_open calls
555 * dentry_open after call to open_namei that checks permissions.
556 * Only nfsd_open call dentry_open directly without checking
557 * permissions and because of that this code below is safe. */
558 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
559 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
561 /* We do not want O_EXCL here, presumably we opened the file
562 * already? XXX - NFS implications? */
563 oit.it_flags &= ~O_EXCL;
565 /* bug20584, if "it_flags" contains O_CREAT, the file will be
566 * created if necessary, then "IT_CREAT" should be set to keep
567 * consistent with it */
568 if (oit.it_flags & O_CREAT)
569 oit.it_op |= IT_CREAT;
575 /* Let's see if we have file open on MDS already. */
576 if (it->it_flags & FMODE_WRITE) {
577 och_p = &lli->lli_mds_write_och;
578 och_usecount = &lli->lli_open_fd_write_count;
579 } else if (it->it_flags & FMODE_EXEC) {
580 och_p = &lli->lli_mds_exec_och;
581 och_usecount = &lli->lli_open_fd_exec_count;
583 och_p = &lli->lli_mds_read_och;
584 och_usecount = &lli->lli_open_fd_read_count;
587 mutex_lock(&lli->lli_och_mutex);
588 if (*och_p) { /* Open handle is present */
589 if (it_disposition(it, DISP_OPEN_OPEN)) {
590 /* Well, there's extra open request that we do not need,
591 let's close it somehow. This will decref request. */
592 rc = it_open_error(DISP_OPEN_OPEN, it);
594 mutex_unlock(&lli->lli_och_mutex);
595 GOTO(out_openerr, rc);
598 ll_release_openhandle(file->f_dentry, it);
	/* Reuse the cached handle; och == NULL means "existing". */
602 rc = ll_local_open(file, it, fd, NULL);
605 mutex_unlock(&lli->lli_och_mutex);
606 GOTO(out_openerr, rc);
609 LASSERT(*och_usecount == 0);
610 if (!it->d.lustre.it_disposition) {
611 /* We cannot just request lock handle now, new ELC code
612 means that one of other OPEN locks for this file
613 could be cancelled, and since blocking ast handler
614 would attempt to grab och_mutex as well, that would
615 result in a deadlock */
616 mutex_unlock(&lli->lli_och_mutex);
617 it->it_create_mode |= M_CHECK_STALE;
618 rc = ll_intent_file_open(file, NULL, 0, it);
619 it->it_create_mode &= ~M_CHECK_STALE;
621 GOTO(out_openerr, rc);
625 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
627 GOTO(out_och_free, rc = -ENOMEM);
631 /* md_intent_lock() didn't get a request ref if there was an
632 * open error, so don't do cleanup on the request here
634 /* XXX (green): Should not we bail out on any error here, not
635 * just open error? */
636 rc = it_open_error(DISP_OPEN_OPEN, it);
638 GOTO(out_och_free, rc);
640 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
642 rc = ll_local_open(file, it, fd, *och_p);
644 GOTO(out_och_free, rc);
646 mutex_unlock(&lli->lli_och_mutex);
649 /* Must do this outside lli_och_mutex lock to prevent deadlock where
650 different kind of OPEN lock for this same inode gets cancelled
651 by ldlm_cancel_lru */
652 if (!S_ISREG(inode->i_mode))
653 GOTO(out_och_free, rc);
	/* No stripe metadata yet: either delay creation (per flags) or
	 * fall through and clear the delay flag. */
657 if (!lli->lli_has_smd) {
658 if (file->f_flags & O_LOV_DELAY_CREATE ||
659 !(file->f_mode & FMODE_WRITE)) {
660 CDEBUG(D_INODE, "object creation was delayed\n");
661 GOTO(out_och_free, rc);
664 file->f_flags &= ~O_LOV_DELAY_CREATE;
665 GOTO(out_och_free, rc);
	/* Error/common exit: free a handle we allocated but did not use. */
669 if (och_p && *och_p) {
670 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
671 *och_p = NULL; /* OBD_FREE writes some magic there */
674 mutex_unlock(&lli->lli_och_mutex);
	/* Undo statahead claim and drop the fd on the error path. */
677 if (opendir_set != 0)
678 ll_stop_statahead(inode, lli->lli_opendir_key);
680 ll_file_data_put(fd);
682 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
685 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
686 ptlrpc_req_finished(it->d.lustre.it_data);
687 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
693 /* Fills the obdo with the attributes for the lsm */
694 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
695 struct obd_capa *capa, struct obdo *obdo,
696 __u64 ioepoch, int sync)
698 struct ptlrpc_request_set *set;
699 struct obd_info oinfo = { { { 0 } } };
704 LASSERT(lsm != NULL);
	/* Identify the object and request the full attribute set. */
708 oinfo.oi_oa->o_id = lsm->lsm_object_id;
709 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
710 oinfo.oi_oa->o_mode = S_IFREG;
711 oinfo.oi_oa->o_ioepoch = ioepoch;
712 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
713 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
714 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
715 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
716 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
717 OBD_MD_FLDATAVERSION;
718 oinfo.oi_capa = capa;
	/* @sync requests the getattr under a server-side lock. */
720 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
721 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
	/* Fire the getattr asynchronously and wait for the set. */
724 set = ptlrpc_prep_set();
726 CERROR("can't allocate ptlrpc set\n");
729 rc = obd_getattr_async(exp, &oinfo, set);
731 rc = ptlrpc_set_wait(set);
732 ptlrpc_set_destroy(set);
	/* Keep only the attribute bits callers are allowed to trust. */
735 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
736 OBD_MD_FLATIME | OBD_MD_FLMTIME |
737 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
738 OBD_MD_FLDATAVERSION);
743 * Performs the getattr on the inode and updates its fields.
744 * If @sync != 0, perform the getattr under the server-side lock.
746 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
747 __u64 ioepoch, int sync)
749 struct obd_capa *capa = ll_mdscapa_get(inode);
750 struct lov_stripe_md *lsm;
	/* Take a reference on the stripe MD for the duration of the call. */
754 lsm = ccc_inode_lsm_get(inode);
755 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
756 capa, obdo, ioepoch, sync);
	/* Push the returned attributes into the VFS inode. */
759 obdo_refresh_inode(inode, obdo, obdo->o_valid);
761 "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
762 lsm ? lsm->lsm_object_id : 0, i_size_read(inode),
763 (unsigned long long)inode->i_blocks,
764 (unsigned long)ll_inode_blksize(inode));
766 ccc_inode_lsm_put(inode, lsm);
/* Merge the MDS-provided timestamps cached in lli_lvb with the
 * attributes obtained from the OSTs via the cl_object layer, taking
 * the most recent of each, and update i_size/i_blocks accordingly. */
770 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
772 struct ll_inode_info *lli = ll_i2info(inode);
773 struct cl_object *obj = lli->lli_clob;
774 struct cl_attr *attr = ccc_env_thread_attr(env);
780 ll_inode_size_lock(inode);
781 /* merge timestamps the most recently obtained from mds with
782 timestamps obtained from osts */
783 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
784 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
785 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
786 inode_init_lvb(inode, &lvb);
	/* Fetch OST-side attributes under the object attribute lock. */
788 cl_object_attr_lock(obj);
789 rc = cl_object_attr_get(env, obj, attr);
790 cl_object_attr_unlock(obj);
	/* Keep whichever timestamp is newer, MDS or OST. */
793 if (lvb.lvb_atime < attr->cat_atime)
794 lvb.lvb_atime = attr->cat_atime;
795 if (lvb.lvb_ctime < attr->cat_ctime)
796 lvb.lvb_ctime = attr->cat_ctime;
797 if (lvb.lvb_mtime < attr->cat_mtime)
798 lvb.lvb_mtime = attr->cat_mtime;
800 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
801 PFID(&lli->lli_fid), attr->cat_size);
802 cl_isize_write_nolock(inode, attr->cat_size);
804 inode->i_blocks = attr->cat_blocks;
806 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
807 LTIME_S(inode->i_atime) = lvb.lvb_atime;
808 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
810 ll_inode_size_unlock(inode);
/* Glimpse helper for ioctls: fetch size/blocks/timestamps for @lsm
 * from the OSTs and copy them into the user-visible stat structure. */
815 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
818 struct obdo obdo = { 0 };
821 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
823 st->st_size = obdo.o_size;
824 st->st_blocks = obdo.o_blocks;
825 st->st_mtime = obdo.o_mtime;
826 st->st_atime = obdo.o_atime;
827 st->st_ctime = obdo.o_ctime;
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT semantics
 * and choose the DLM locking policy. */
832 void ll_io_init(struct cl_io *io, const struct file *file, int write)
834 struct inode *inode = file->f_dentry->d_inode;
836 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
838 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
839 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
840 file->f_flags & O_DIRECT ||
843 io->ci_obj = ll_i2info(inode)->lli_clob;
844 io->ci_lockreq = CILR_MAYBE;
	/* nolock mounts skip client locks and ask for server-side locking;
	 * O_APPEND writes must be serialized with a mandatory lock. */
845 if (ll_file_nolock(file)) {
846 io->ci_lockreq = CILR_NEVER;
847 io->ci_no_srvlock = 1;
848 } else if (file->f_flags & O_APPEND) {
849 io->ci_lockreq = CILR_MANDATORY;
/* Common driver for all read/write entry points (read/write, aio,
 * sendfile, splice): set up the cl_io, dispatch per-subtype arguments,
 * run the IO loop, update *ppos and per-mount IO statistics. */
854 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
855 struct file *file, enum cl_io_type iot,
856 loff_t *ppos, size_t count)
858 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
859 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
865 io = ccc_env_thread_io(env);
866 ll_io_init(io, file, iot == CIT_WRITE);
868 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
869 struct vvp_io *vio = vvp_env_io(env);
870 struct ccc_io *cio = ccc_env_io(env);
871 int write_mutex_locked = 0;
873 cio->cui_fd = LUSTRE_FPRIVATE(file);
874 vio->cui_io_subtype = args->via_io_subtype;
	/* Copy subtype-specific arguments into the per-thread IO state. */
876 switch (vio->cui_io_subtype) {
878 cio->cui_iov = args->u.normal.via_iov;
879 cio->cui_nrsegs = args->u.normal.via_nrsegs;
880 cio->cui_tot_nrsegs = cio->cui_nrsegs;
881 #ifndef HAVE_FILE_WRITEV
882 cio->cui_iocb = args->u.normal.via_iocb;
	/* Writes (outside a group lock) take lli_write_mutex; reads
	 * only take the truncate semaphore shared. */
884 if ((iot == CIT_WRITE) &&
885 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
886 if (mutex_lock_interruptible(&lli->
888 GOTO(out, result = -ERESTARTSYS);
889 write_mutex_locked = 1;
890 } else if (iot == CIT_READ) {
891 down_read(&lli->lli_trunc_sem);
895 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
896 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
899 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
900 vio->u.splice.cui_flags = args->u.splice.via_flags;
903 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
906 result = cl_io_loop(env, io);
907 if (write_mutex_locked)
908 mutex_unlock(&lli->lli_write_mutex);
909 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
910 up_read(&lli->lli_trunc_sem);
912 /* cl_io_rw_init() handled IO */
913 result = io->ci_result;
	/* Advance the file position by the bytes actually transferred. */
916 if (io->ci_nob > 0) {
918 *ppos = io->u.ci_wr.wr.crw_pos;
923 if (result == 0 && io->ci_need_restart) /* need to restart whole IO */
	/* Account transferred bytes; track write failures for close(). */
926 if (iot == CIT_READ) {
928 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
929 LPROC_LL_READ_BYTES, result);
930 } else if (iot == CIT_WRITE) {
932 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
933 LPROC_LL_WRITE_BYTES, result);
934 fd->fd_write_failed = false;
935 } else if (result != -ERESTARTSYS) {
936 fd->fd_write_failed = true;
945 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
947 static int ll_file_get_iov_count(const struct iovec *iov,
948 unsigned long *nr_segs, size_t *count)
953 for (seg = 0; seg < *nr_segs; seg++) {
954 const struct iovec *iv = &iov[seg];
957 * If any segment has a negative length, or the cumulative
958 * length ever wraps negative then return -EINVAL.
961 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
	/* Verify the user buffer is readable before accepting it. */
963 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
968 cnt -= iv->iov_len; /* This segment is no good */
975 #ifdef HAVE_FILE_READV
/* Vectored read entry point (pre-aio kernels): validate the iovec,
 * then run the generic IO path with a normal-IO argument block. */
976 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
977 unsigned long nr_segs, loff_t *ppos)
980 struct vvp_io_args *args;
986 result = ll_file_get_iov_count(iov, &nr_segs, &count);
990 env = cl_env_get(&refcheck);
992 RETURN(PTR_ERR(env));
994 args = vvp_env_args(env, IO_NORMAL);
995 args->u.normal.via_iov = (struct iovec *)iov;
996 args->u.normal.via_nrsegs = nr_segs;
998 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
999 cl_env_put(env, &refcheck);
/* Single-buffer read (readv-based variant): wrap (@buf, @count) in a
 * one-element iovec and delegate to ll_file_readv(). */
1003 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1007 struct iovec *local_iov;
1012 env = cl_env_get(&refcheck);
1014 RETURN(PTR_ERR(env));
	/* Use the per-thread scratch iovec to avoid a stack allocation. */
1016 local_iov = &vvp_env_info(env)->vti_local_iov;
1017 local_iov->iov_base = (void __user *)buf;
1018 local_iov->iov_len = count;
1019 result = ll_file_readv(file, local_iov, 1, ppos);
1020 cl_env_put(env, &refcheck);
/* aio read entry point: validate the iovec, then run the generic IO
 * path; the position is taken from and written back to iocb->ki_pos. */
1025 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1026 unsigned long nr_segs, loff_t pos)
1029 struct vvp_io_args *args;
1035 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1039 env = cl_env_get(&refcheck);
1041 RETURN(PTR_ERR(env));
1043 args = vvp_env_args(env, IO_NORMAL);
1044 args->u.normal.via_iov = (struct iovec *)iov;
1045 args->u.normal.via_nrsegs = nr_segs;
1046 args->u.normal.via_iocb = iocb;
1048 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1049 &iocb->ki_pos, count);
1050 cl_env_put(env, &refcheck);
/* Single-buffer read (aio-based variant): build a synchronous kiocb
 * and one-element iovec, delegate to ll_file_aio_read(), then report
 * the new position back through *ppos. */
1054 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1058 struct iovec *local_iov;
1059 struct kiocb *kiocb;
1064 env = cl_env_get(&refcheck);
1066 RETURN(PTR_ERR(env));
	/* Per-thread scratch iovec/kiocb from the environment info. */
1068 local_iov = &vvp_env_info(env)->vti_local_iov;
1069 kiocb = &vvp_env_info(env)->vti_kiocb;
1070 local_iov->iov_base = (void __user *)buf;
1071 local_iov->iov_len = count;
1072 init_sync_kiocb(kiocb, file);
1073 kiocb->ki_pos = *ppos;
1074 kiocb->ki_left = count;
1076 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1077 *ppos = kiocb->ki_pos;
1079 cl_env_put(env, &refcheck);
1085 * Write to a file (through the page cache).
1087 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (pre-aio kernels): validate the iovec,
 * then run the generic IO path as CIT_WRITE. */
1088 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1089 unsigned long nr_segs, loff_t *ppos)
1092 struct vvp_io_args *args;
1098 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1102 env = cl_env_get(&refcheck);
1104 RETURN(PTR_ERR(env));
1106 args = vvp_env_args(env, IO_NORMAL);
1107 args->u.normal.via_iov = (struct iovec *)iov;
1108 args->u.normal.via_nrsegs = nr_segs;
1110 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1111 cl_env_put(env, &refcheck);
/* Single-buffer write (writev-based variant): wrap (@buf, @count) in
 * a one-element iovec and delegate to ll_file_writev(). */
1115 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1119 struct iovec *local_iov;
1124 env = cl_env_get(&refcheck);
1126 RETURN(PTR_ERR(env));
1128 local_iov = &vvp_env_info(env)->vti_local_iov;
1129 local_iov->iov_base = (void __user *)buf;
1130 local_iov->iov_len = count;
1132 result = ll_file_writev(file, local_iov, 1, ppos);
1133 cl_env_put(env, &refcheck);
1137 #else /* AIO stuff */
/* aio write entry point: validate the iovec, then run the generic IO
 * path; the position is taken from and written back to iocb->ki_pos. */
1138 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1139 unsigned long nr_segs, loff_t pos)
1142 struct vvp_io_args *args;
1148 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1152 env = cl_env_get(&refcheck);
1154 RETURN(PTR_ERR(env));
1156 args = vvp_env_args(env, IO_NORMAL);
1157 args->u.normal.via_iov = (struct iovec *)iov;
1158 args->u.normal.via_nrsegs = nr_segs;
1159 args->u.normal.via_iocb = iocb;
1161 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1162 &iocb->ki_pos, count);
1163 cl_env_put(env, &refcheck);
/* Single-buffer write (aio-based variant): build a synchronous kiocb
 * and one-element iovec, delegate to ll_file_aio_write(), then report
 * the new position back through *ppos. */
1167 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1171 struct iovec *local_iov;
1172 struct kiocb *kiocb;
1177 env = cl_env_get(&refcheck);
1179 RETURN(PTR_ERR(env));
	/* Per-thread scratch iovec/kiocb from the environment info. */
1181 local_iov = &vvp_env_info(env)->vti_local_iov;
1182 kiocb = &vvp_env_info(env)->vti_kiocb;
1183 local_iov->iov_base = (void __user *)buf;
1184 local_iov->iov_len = count;
1185 init_sync_kiocb(kiocb, file);
1186 kiocb->ki_pos = *ppos;
1187 kiocb->ki_left = count;
1189 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1190 *ppos = kiocb->ki_pos;
1192 cl_env_put(env, &refcheck);
1198 #ifdef HAVE_KERNEL_SENDFILE
1200 * Send file content (through pagecache) somewhere with helper
1202 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1203 read_actor_t actor, void *target)
1206 struct vvp_io_args *args;
1211 env = cl_env_get(&refcheck);
1213 RETURN(PTR_ERR(env));
	/* sendfile uses the IO_SENDFILE subtype with the caller's actor. */
1215 args = vvp_env_args(env, IO_SENDFILE);
1216 args->u.sendfile.via_target = target;
1217 args->u.sendfile.via_actor = actor;
1219 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1220 cl_env_put(env, &refcheck);
1225 #ifdef HAVE_KERNEL_SPLICE_READ
1227 * Send file content (through pagecache) somewhere with helper
1229 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1230 struct pipe_inode_info *pipe, size_t count,
1234 struct vvp_io_args *args;
1239 env = cl_env_get(&refcheck);
1241 RETURN(PTR_ERR(env));
	/* splice uses the IO_SPLICE subtype with the destination pipe. */
1243 args = vvp_env_args(env, IO_SPLICE);
1244 args->u.splice.via_pipe = pipe;
1245 args->u.splice.via_flags = flags;
1247 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1248 cl_env_put(env, &refcheck);
/* Re-create a lost OST object (@id/@seq) on OST index @ost_idx for
 * @inode: clone the stripe MD, mark the obdo with RECREATE_OBJS and
 * call obd_create() under the inode size lock. */
1253 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1256 struct obd_export *exp = ll_i2dtexp(inode);
1257 struct obd_trans_info oti = { 0 };
1258 struct obdo *oa = NULL;
1261 struct lov_stripe_md *lsm = NULL, *lsm2;
1268 lsm = ccc_inode_lsm_get(inode);
1270 GOTO(out, rc = -ENOENT);
	/* Size of the MD clone: header plus one lov_oinfo per stripe. */
1272 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1273 (lsm->lsm_stripe_count));
1275 OBD_ALLOC_LARGE(lsm2, lsm_size);
1277 GOTO(out, rc = -ENOMEM);
	/* o_nlink carries the target OST index for object recreation. */
1281 oa->o_nlink = ost_idx;
1282 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1283 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1284 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1285 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1286 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1287 memcpy(lsm2, lsm, lsm_size);
1288 ll_inode_size_lock(inode);
1289 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1290 ll_inode_size_unlock(inode);
1292 OBD_FREE_LARGE(lsm2, lsm_size);
1295 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_recreate_obj(): ioctl handler (LL_IOC_RECREATE_OBJ). Copies a
 * struct ll_recreate_obj from userspace and recreates the named object
 * via ll_lov_recreate(). Root (CAP_SYS_ADMIN) only.
 * NOTE(review): excerpt — error-return lines are elided.
 */
1300 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1302         struct ll_recreate_obj ucreat;
1305         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1308         if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1312         RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1313                                ucreat.lrc_ost_idx));
/*
 * ll_lov_recreate_fid(): ioctl handler (LL_IOC_RECREATE_FID). Decodes
 * an object id and OST index packed into a lu_fid supplied by userspace
 * and recreates that object. Root (CAP_SYS_ADMIN) only.
 */
1316 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1323         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1326         if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
/* id = oid | low 16 bits of seq << 32; ost index = next 16 bits of seq */
1329         id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1330         ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1331         RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/*
 * ll_lov_setstripe_ea_info(): set the striping EA on a file by doing an
 * intent open carrying the lov_user_md. Fails if the file already has
 * stripe metadata (striping can only be set once, at creation).
 * NOTE(review): excerpt — several declarations/branches are elided.
 */
1334 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1335                       int flags, struct lov_user_md *lum, int lum_size)
1337         struct lov_stripe_md *lsm = NULL;
1338         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1342         lsm = ccc_inode_lsm_get(inode);
/* stripe md already present — cannot restripe an existing file */
1344                 ccc_inode_lsm_put(inode, lsm);
1345                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1350         ll_inode_size_lock(inode);
1351         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1354         rc = oit.d.lustre.it_status;
1356                 GOTO(out_req_free, rc);
/* the open handle opened for the setstripe intent is no longer needed */
1358         ll_release_openhandle(file->f_dentry, &oit);
1361         ll_inode_size_unlock(inode);
1362         ll_intent_release(&oit);
1363         ccc_inode_lsm_put(inode, lsm);
1366         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * ll_lov_getstripe_ea_info(): fetch the LOV EA (striping metadata) of
 * @filename from the MDS via md_getattr_name(). On success *lmmp points
 * into the reply buffer (caller must keep *request until done) and
 * *lmm_size is set. Swabs the EA to host endianness when needed.
 * NOTE(review): excerpt — some declarations and error paths are elided.
 */
1370 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1371                       struct lov_mds_md **lmmp, int *lmm_size,
1372                       struct ptlrpc_request **request)
1374         struct ll_sb_info *sbi = ll_i2sbi(inode);
1375         struct mdt_body *body;
1376         struct lov_mds_md *lmm = NULL;
1377         struct ptlrpc_request *req = NULL;
1378         struct md_op_data *op_data;
/* size the getattr reply buffer for the largest possible EA */
1381         rc = ll_get_max_mdsize(sbi, &lmmsize);
1385         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1386                                      strlen(filename), lmmsize,
1387                                      LUSTRE_OPC_ANY, NULL);
1388         if (IS_ERR(op_data))
1389                 RETURN(PTR_ERR(op_data));
1391         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1392         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1393         ll_finish_md_op_data(op_data);
1395                 CDEBUG(D_INFO, "md_getattr_name failed "
1396                        "on %s: rc %d\n", filename, rc);
1400         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1401         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1403         lmmsize = body->eadatasize;
/* no EA data in the reply means the file has no striping info */
1405         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1407                 GOTO(out, rc = -ENODATA);
1410         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1411         LASSERT(lmm != NULL);
1413         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1414             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1415                 GOTO(out, rc = -EPROTO);
1419          * This is coming from the MDS, so is probably in
1420          * little endian.  We convert it to host endian before
1421          * passing it to userspace.
/* swab only on big-endian hosts (LOV_MAGIC differs from its LE form) */
1423         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1424                 /* if function called for directory - we should
1425                  * avoid swab not existent lsm objects */
1426                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1427                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1428                         if (S_ISREG(body->mode))
1429                                 lustre_swab_lov_user_md_objects(
1430                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1431                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1432                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1433                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1434                         if (S_ISREG(body->mode))
1435                                 lustre_swab_lov_user_md_objects(
1436                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1437                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1443         *lmm_size = lmmsize;
/*
 * ll_lov_setea(): ioctl handler (LL_IOC_LOV_SETEA). Copies a raw
 * lov_user_md (with one ost_data slot) from userspace and applies it
 * with MDS_OPEN_HAS_OBJS, i.e. the EA references pre-existing objects.
 * Root (CAP_SYS_ADMIN) only.
 */
1448 static int ll_lov_setea(struct inode *inode, struct file *file,
1451         int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1452         struct lov_user_md      *lump;
1453         int                      lum_size = sizeof(struct lov_user_md) +
1454                                             sizeof(struct lov_user_ost_data);
1458         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1461         OBD_ALLOC_LARGE(lump, lum_size);
1465         if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1466                 OBD_FREE_LARGE(lump, lum_size);
1470         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1472         OBD_FREE_LARGE(lump, lum_size);
/*
 * ll_lov_setstripe(): ioctl handler (LL_IOC_LOV_SETSTRIPE). Reads a v1
 * lov_user_md first (the common, smaller case) and re-reads as v3 when
 * the magic says so, then applies the striping. On success it reports
 * the resulting layout back through the user's buffer.
 * NOTE(review): excerpt — some error returns/branch ends are elided.
 */
1476 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1479         struct lov_user_md_v3    lumv3;
1480         struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1481         struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1482         struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1484         int                      flags = FMODE_WRITE;
1487         /* first try with v1 which is smaller than v3 */
1488         lum_size = sizeof(struct lov_user_md_v1);
1489         if (copy_from_user(lumv1, lumv1p, lum_size))
1492         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1493                 lum_size = sizeof(struct lov_user_md_v3);
1494                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1498         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1500                 struct lov_stripe_md *lsm;
/* tell userspace the stripe count field is to be re-read below */
1503                 put_user(0, &lumv1p->lmm_stripe_count);
1505                 ll_layout_refresh(inode, &gen);
1506                 lsm = ccc_inode_lsm_get(inode);
1507                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1508                                    0, lsm, (void *)arg);
1509                 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_getstripe(): ioctl handler (LL_IOC_LOV_GETSTRIPE). Grabs the
 * inode's stripe md and lets the LOV layer copy the layout out to the
 * userspace buffer at @arg.
 * NOTE(review): excerpt — the no-lsm error path is elided.
 */
1514 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1516         struct lov_stripe_md *lsm;
1520         lsm = ccc_inode_lsm_get(inode);
1522                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1524         ccc_inode_lsm_put(inode, lsm);
/*
 * ll_get_grouplock(): take a Lustre group lock (gid = @arg) on the
 * file. fd_flags/fd_grouplock are protected by lli_lock; the lock is
 * dropped around the (blocking) cl_get_grouplock() call, so a losing
 * racer must release the grouplock it obtained.
 */
1528 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1530         struct ll_inode_info   *lli = ll_i2info(inode);
1531         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1532         struct ccc_grouplock    grouplock;
/* group locks and nolock mounts are mutually exclusive */
1536         if (ll_file_nolock(file))
1537                 RETURN(-EOPNOTSUPP);
1539         spin_lock(&lli->lli_lock);
1540         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1541                 CWARN("group lock already existed with gid %lu\n",
1542                       fd->fd_grouplock.cg_gid);
1543                 spin_unlock(&lli->lli_lock);
1546         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1547         spin_unlock(&lli->lli_lock);
/* may block; must not hold lli_lock across it */
1549         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1550                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1554         spin_lock(&lli->lli_lock);
1555         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1556                 spin_unlock(&lli->lli_lock);
1557                 CERROR("another thread just won the race\n");
1558                 cl_put_grouplock(&grouplock);
1562         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1563         fd->fd_grouplock = grouplock;
1564         spin_unlock(&lli->lli_lock);
1566         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * ll_put_grouplock(): release the group lock previously taken on this
 * file descriptor. Verifies (under lli_lock) that a group lock is held
 * and that its gid matches @arg before clearing the per-fd state and
 * dropping the cl-layer lock outside the spinlock.
 */
1570 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1572         struct ll_inode_info   *lli = ll_i2info(inode);
1573         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1574         struct ccc_grouplock    grouplock;
1577         spin_lock(&lli->lli_lock);
1578         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1579                 spin_unlock(&lli->lli_lock);
1580                 CWARN("no group lock held\n");
1583         LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* wrong gid — refuse to drop somebody else's lock */
1585         if (fd->fd_grouplock.cg_gid != arg) {
1586                 CWARN("group lock %lu doesn't match current id %lu\n",
1587                       arg, fd->fd_grouplock.cg_gid);
1588                 spin_unlock(&lli->lli_lock);
/* detach state under the spinlock, release the lock outside it */
1592         grouplock = fd->fd_grouplock;
1593         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1594         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1595         spin_unlock(&lli->lli_lock);
1597         cl_put_grouplock(&grouplock);
1598         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1603  * Close inode open handle
1605  * \param dentry [in]     dentry which contains the inode
1606  * \param it     [in,out] intent which contains open info and result
1609  * \retval <0 failure
/*
 * ll_release_openhandle(): close the MDS open handle carried by an
 * intent (e.g. one created as a side effect of a setstripe open).
 * No-op for the fs root or when the intent holds no open handle.
 * NOTE(review): excerpt — some cleanup lines are elided.
 */
1611 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1613         struct inode *inode = dentry->d_inode;
1614         struct obd_client_handle *och;
1620         /* Root ? Do nothing. */
1621         if (dentry->d_inode->i_sb->s_root == dentry)
1624         /* No open handle to close? Move away */
1625         if (!it_disposition(it, DISP_OPEN_OPEN))
1628         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1630         OBD_ALLOC(och, sizeof(*och));
1632                 GOTO(out, rc = -ENOMEM);
1634         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1635                     ll_i2info(inode), it, och);
1637         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1640         /* this one is in place of ll_file_open */
1641         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1642                 ptlrpc_req_finished(it->d.lustre.it_data);
1643                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1649  * Get size for inode for which FIEMAP mapping is requested.
1650  * Make the FIEMAP get_info call and returns the result.
/*
 * ll_do_fiemap(): validate fiemap flags, optionally flush dirty pages
 * (FIEMAP_FLAG_SYNC), then pack a ll_fiemap_info_key and ask the OSTs
 * for the extent mapping via obd_get_info(KEY_FIEMAP).
 * NOTE(review): excerpt — some declarations and exits are elided.
 */
1652 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1655         struct obd_export *exp = ll_i2dtexp(inode);
1656         struct lov_stripe_md *lsm = NULL;
1657         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1658         int vallen = num_bytes;
1662         /* Checks for fiemap flags */
1663         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report the unsupported flags back to the caller */
1664                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1668         /* Check for FIEMAP_FLAG_SYNC */
1669         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1670                 rc = filemap_fdatawrite(inode->i_mapping);
1675         lsm = ccc_inode_lsm_get(inode);
1679         /* If the stripe_count > 1 and the application does not understand
1680          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1682         if (lsm->lsm_stripe_count > 1 &&
1683             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1684                 GOTO(out, rc = -EOPNOTSUPP);
1686         fm_key.oa.o_id = lsm->lsm_object_id;
1687         fm_key.oa.o_seq = lsm->lsm_object_seq;
1688         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1690         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1691         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1692         /* If filesize is 0, then there would be no objects for mapping */
1693         if (fm_key.oa.o_size == 0) {
1694                 fiemap->fm_mapped_extents = 0;
1698         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1700         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1703                 CERROR("obd_get_info failed: rc = %d\n", rc);
1706         ccc_inode_lsm_put(inode, lsm);
/*
 * ll_fid2path(): ioctl handler (OBD_IOC_FID2PATH). Reads the caller's
 * getinfo_fid2path header to learn the path buffer length, allocates a
 * correspondingly sized output structure, asks the MDC to resolve the
 * FID to a path, and copies the result back to userspace. Restricted to
 * CAP_DAC_READ_SEARCH unless the fs allows user fid2path.
 * NOTE(review): excerpt — allocation-failure branches are elided.
 */
1710 int ll_fid2path(struct inode *inode, void *arg)
1712         struct obd_export       *exp = ll_i2mdexp(inode);
1713         struct getinfo_fid2path *gfout, *gfin;
1717         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1718             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1721         /* Need to get the buflen */
1722         OBD_ALLOC_PTR(gfin);
1725         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1730         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1731         OBD_ALLOC(gfout, outsize);
1732         if (gfout == NULL) {
1736         memcpy(gfout, gfin, sizeof(*gfout));
1739         /* Call mdc_iocontrol */
1740         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1743         if (copy_to_user(arg, gfout, outsize))
1747         OBD_FREE(gfout, outsize);
/*
 * ll_ioctl_fiemap(): ioctl front end for FSFILT_IOC_FIEMAP. Sizes and
 * allocates a kernel fiemap buffer from the user-supplied extent count,
 * copies the request (and, if extents were requested, the first extent
 * used as the continuation cookie) in, runs ll_do_fiemap(), and copies
 * header plus mapped extents back out.
 * NOTE(review): excerpt — some error returns are elided.
 */
1751 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1753         struct ll_user_fiemap *fiemap_s;
1754         size_t num_bytes, ret_bytes;
1755         unsigned int extent_count;
1758         /* Get the extent count so we can calculate the size of
1759          * required fiemap buffer */
1760         if (get_user(extent_count,
1761             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1763         num_bytes = sizeof(*fiemap_s) + (extent_count *
1764                                          sizeof(struct ll_fiemap_extent));
1766         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1767         if (fiemap_s == NULL)
1770         /* get the fiemap value */
1771         if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1773                 GOTO(error, rc = -EFAULT);
1775         /* If fm_extent_count is non-zero, read the first extent since
1776          * it is used to calculate end_offset and device from previous
1779                 if (copy_from_user(&fiemap_s->fm_extents[0],
1780                     (char __user *)arg + sizeof(*fiemap_s),
1781                     sizeof(struct ll_fiemap_extent)))
1782                         GOTO(error, rc = -EFAULT);
1785         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1789         ret_bytes = sizeof(struct ll_user_fiemap);
/* only copy extent records back if the caller asked for extents */
1791         if (extent_count != 0)
1792                 ret_bytes += (fiemap_s->fm_mapped_extents *
1793                                  sizeof(struct ll_fiemap_extent));
1795         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1799         OBD_FREE_LARGE(fiemap_s, num_bytes);
1804  * Read the data_version for inode.
1806  * This value is computed using stripe object version on OST.
1807  * Version is computed using server side locking.
1809  * @param extent_lock  Take extent lock. Not needed if a process is already
1810  *                     holding the OST object group locks.
/*
 * ll_data_version(): fetch the file's data version from the OSTs via
 * ll_lsm_getattr(). A file without stripe objects reports version 0.
 * NOTE(review): excerpt — some exits/frees are elided.
 */
1812 int ll_data_version(struct inode *inode, __u64 *data_version,
1815         struct lov_stripe_md    *lsm = NULL;
1816         struct ll_sb_info       *sbi = ll_i2sbi(inode);
1817         struct obdo             *obdo = NULL;
1821         /* If no stripe, we consider version is 0. */
1822         lsm = ccc_inode_lsm_get(inode);
1825                 CDEBUG(D_INODE, "No object for inode\n");
1829         OBD_ALLOC_PTR(obdo);
1831                 ccc_inode_lsm_put(inode, lsm);
1835         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* server did not return a data version — treat as an error case */
1837                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1840                         *data_version = obdo->o_data_version;
1844         ccc_inode_lsm_put(inode, lsm);
/*
 * ll_swap_layout(): swap the layouts of two regular files on the same
 * filesystem (LL_IOC_LOV_SWAP_LAYOUTS back end). Orders the two inodes
 * by FID to serialize concurrent swaps, optionally takes group locks on
 * both files (gid != 0) to flush dirty cache, and sends the swap to the
 * MDT with the mdc_swap_layouts flags packed in md_op_data.
 * NOTE(review): excerpt — gid setup and some error paths are elided.
 */
1849 static int ll_swap_layout(struct file *file, struct file *file2,
1850                           struct lustre_swap_layouts *lsl)
1852         struct mdc_swap_layouts  msl = { .msl_flags = lsl->sl_flags };
1853         struct md_op_data       *op_data;
1854         struct inode            *inode = file->f_dentry->d_inode;
1855         struct inode            *inode2 = file2->f_dentry->d_inode;
1859         if (!S_ISREG(inode2->i_mode))
1862         if (inode_permission(inode, MAY_WRITE) ||
1863             inode_permission(inode2, MAY_WRITE))
1866         if (inode2->i_sb != inode->i_sb)
/* order by FID so two concurrent swaps cannot deadlock */
1869         rc = lu_fid_cmp(ll_inode2fid(inode), ll_inode2fid(inode2));
1870         if (rc == 0) /* same file, done! */
1873         if (rc < 0) { /* sequentialize it */
1874                 swap(inode, inode2);
1879         if (gid != 0) { /* application asks to flush dirty cache */
1880                 rc = ll_get_grouplock(inode, file, gid);
1884                 rc = ll_get_grouplock(inode2, file2, gid);
1886                         ll_put_grouplock(inode, file, gid);
1891         /* struct md_op_data is used to send the swap args to the mdt
1892          * only flags is missing, so we use struct mdc_swap_layouts
1893          * through the md_op_data->op_data */
1895         op_data = ll_prep_md_op_data(NULL, inode, inode2, NULL, 0, 0,
1896                                      LUSTRE_OPC_ANY, &msl);
1897         if (op_data != NULL) {
1898                 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(inode),
1899                                    sizeof(*op_data), op_data, NULL);
1900                 ll_finish_md_op_data(op_data);
/* release group locks in reverse order of acquisition */
1904                 ll_put_grouplock(inode2, file2, gid);
1905                 ll_put_grouplock(inode, file, gid);
/*
 * ll_file_ioctl(): the llite per-file ioctl dispatcher. Handles the
 * LL_IOC_*, FSFILT_IOC_* and OBD_IOC_* commands visible below, falling
 * through to ll_iocontrol_call()/obd_iocontrol() for anything else.
 * NOTE(review): excerpt — many case bodies and error returns are
 * elided from this view; comments describe only the visible lines.
 */
1911 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1913         struct inode            *inode = file->f_dentry->d_inode;
1914         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
1918         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1919                inode->i_generation, inode, cmd);
1920         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1922         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1923         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1927         case LL_IOC_GETFLAGS:
1928                 /* Get the current value of the file flags */
1929                 return put_user(fd->fd_flags, (int *)arg);
1930         case LL_IOC_SETFLAGS:
1931         case LL_IOC_CLRFLAGS:
1932                 /* Set or clear specific file flags */
1933                 /* XXX This probably needs checks to ensure the flags are
1934                  *     not abused, and to handle any flag side effects.
1936                 if (get_user(flags, (int *) arg))
/* LL_FILE_IGNORE_LOCK is only safe on O_DIRECT file descriptors */
1939                 if (cmd == LL_IOC_SETFLAGS) {
1940                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1941                             !(file->f_flags & O_DIRECT)) {
1942                                 CERROR("%s: unable to disable locking on "
1943                                        "non-O_DIRECT file\n", current->comm);
1947                         fd->fd_flags |= flags;
1949                         fd->fd_flags &= ~flags;
1952         case LL_IOC_LOV_SETSTRIPE:
1953                 RETURN(ll_lov_setstripe(inode, file, arg));
1954         case LL_IOC_LOV_SETEA:
1955                 RETURN(ll_lov_setea(inode, file, arg));
1956         case LL_IOC_LOV_SWAP_LAYOUTS: {
1958                 struct lustre_swap_layouts lsl;
1960                 if (cfs_copy_from_user(&lsl, (char *)arg,
1961                                        sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
1964                 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
1967                 file2 = cfs_get_fd(lsl.sl_fd);
1972                 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
1973                         rc = ll_swap_layout(file, file2, &lsl);
1974                 cfs_put_file(file2);
1977         case LL_IOC_LOV_GETSTRIPE:
1978                 RETURN(ll_lov_getstripe(inode, arg));
1979         case LL_IOC_RECREATE_OBJ:
1980                 RETURN(ll_lov_recreate_obj(inode, arg));
1981         case LL_IOC_RECREATE_FID:
1982                 RETURN(ll_lov_recreate_fid(inode, arg));
1983         case FSFILT_IOC_FIEMAP:
1984                 RETURN(ll_ioctl_fiemap(inode, arg));
1985         case FSFILT_IOC_GETFLAGS:
1986         case FSFILT_IOC_SETFLAGS:
1987                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1988         case FSFILT_IOC_GETVERSION_OLD:
1989         case FSFILT_IOC_GETVERSION:
1990                 RETURN(put_user(inode->i_generation, (int *)arg));
1991         case LL_IOC_GROUP_LOCK:
1992                 RETURN(ll_get_grouplock(inode, file, arg));
1993         case LL_IOC_GROUP_UNLOCK:
1994                 RETURN(ll_put_grouplock(inode, file, arg));
1995         case IOC_OBD_STATFS:
1996                 RETURN(ll_obd_statfs(inode, (void *)arg));
1998         /* We need to special case any other ioctls we want to handle,
1999          * to send them to the MDS/OST as appropriate and to properly
2000          * network encode the arg field.
2001         case FSFILT_IOC_SETVERSION_OLD:
2002         case FSFILT_IOC_SETVERSION:
2004         case LL_IOC_FLUSHCTX:
2005                 RETURN(ll_flush_ctx(inode));
2006         case LL_IOC_PATH2FID: {
2007                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2008                                  sizeof(struct lu_fid)))
2013         case OBD_IOC_FID2PATH:
2014                 RETURN(ll_fid2path(inode, (void *)arg));
2015         case LL_IOC_DATA_VERSION: {
2016                 struct ioc_data_version idv;
2019                 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2022                 rc = ll_data_version(inode, &idv.idv_version,
2023                                 !(idv.idv_flags & LL_DV_NOFLUSH));
2025                 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2031         case LL_IOC_GET_MDTIDX: {
2034                 mdtidx = ll_get_mdt_idx(inode);
2038                 if (put_user((int)mdtidx, (int*)arg))
2043         case OBD_IOC_GETDTNAME:
2044         case OBD_IOC_GETMDNAME:
2045                 RETURN(ll_get_obd_name(inode, cmd, arg));
2046         case LL_IOC_HSM_STATE_GET: {
2047                 struct md_op_data       *op_data;
2048                 struct hsm_user_state   *hus;
2055                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2056                                              LUSTRE_OPC_ANY, hus);
2057                 if (op_data == NULL) {
/* HSM state is queried through the MDC iocontrol path */
2062                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2065                 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2068                 ll_finish_md_op_data(op_data);
2072         case LL_IOC_HSM_STATE_SET: {
2073                 struct md_op_data       *op_data;
2074                 struct hsm_state_set    *hss;
2080                 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2085                 /* Non-root users are forbidden to set or clear flags which are
2086                  * NOT defined in HSM_USER_MASK. */
2087                 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2088                     && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2093                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2094                                              LUSTRE_OPC_ANY, hss);
2095                 if (op_data == NULL) {
2100                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2103                 ll_finish_md_op_data(op_data);
2108         case LL_IOC_HSM_ACTION: {
2109                 struct md_op_data               *op_data;
2110                 struct hsm_current_action       *hca;
2117                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2118                                              LUSTRE_OPC_ANY, hca);
2119                 if (op_data == NULL) {
2124                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2127                 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2130                 ll_finish_md_op_data(op_data);
/* unrecognized command: try registered handlers, then the OSC layer */
2138                     ll_iocontrol_call(inode, file, cmd, arg, &err))
2141                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2147 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * llseek_execute(): local backport of the kernel helper — validate the
 * new offset against sign/maxsize rules and commit it to f_pos,
 * resetting f_version when the position actually changes.
 * NOTE(review): excerpt — the error/return lines are elided.
 */
2148 static inline loff_t
2149 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2151         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2153         if (offset > maxsize)
2156         if (offset != file->f_pos) {
2157                 file->f_pos = offset;
2158                 file->f_version = 0;
/*
 * generic_file_llseek_size(): backport of the upstream llseek helper
 * for kernels lacking it (guarded by the #ifndef above). Handles the
 * SEEK_CUR fast path without taking i_mutex for offset 0, and treats
 * everything past @eof as a virtual hole for SEEK_DATA/SEEK_HOLE.
 * NOTE(review): excerpt — the switch statement and several branches
 * are elided; comments reflect only the visible lines.
 */
2164 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2165                 loff_t maxsize, loff_t eof)
2167         struct inode *inode = file->f_dentry->d_inode;
2175                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2176                  * position-querying operation.  Avoid rewriting the "same"
2177                  * f_pos value back to the file because a concurrent read(),
2178                  * write() or lseek() might have altered it
2183                  * f_lock protects against read/modify/write race with other
2184                  * SEEK_CURs. Note that parallel writes and reads behave
2187                 mutex_lock(&inode->i_mutex);
2188                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2189                 mutex_unlock(&inode->i_mutex);
2193                  * In the generic case the entire file is data, so as long as
2194                  * offset isn't at the end of the file then the offset is data.
2201                  * There is a virtual hole at the end of the file, so as long as
2202                  * offset isn't i_size or larger, return i_size.
2210         return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek(): llite llseek implementation. For SEEK_END (and the
 * SEEK_HOLE/SEEK_DATA variants) a glimpse is needed first so i_size is
 * current, then the generic size-aware llseek helper does the rest.
 * NOTE(review): excerpt — some declarations/returns are elided.
 */
2214 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2216         struct inode *inode = file->f_dentry->d_inode;
2217         loff_t retval, eof = 0;
2220         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2221                            (origin == SEEK_CUR) ? file->f_pos : 0);
2222         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2223                inode->i_ino, inode->i_generation, inode, retval, retval,
2225         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* refresh i_size from the OSTs before any size-relative seek */
2227         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2228                 retval = ll_glimpse_size(inode);
2231                 eof = i_size_read(inode);
2234         retval = generic_file_llseek_size(file, offset, origin,
2235                                           ll_file_maxbytes(inode), eof);
/*
 * ll_flush(): .flush file operation — surface any async writeback
 * errors recorded on this inode to the closing process, unless that
 * error was already reported to this fd (fd_write_failed).
 * NOTE(review): excerpt — a line combining err into rc is elided.
 */
2239 int ll_flush(struct file *file, fl_owner_t id)
2241         struct inode *inode = file->f_dentry->d_inode;
2242         struct ll_inode_info *lli = ll_i2info(inode);
2243         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2246         LASSERT(!S_ISDIR(inode->i_mode));
2248         /* catch async errors that were recorded back when async writeback
2249          * failed for pages in this mapping. */
2250         rc = lli->lli_async_rc;
2251         lli->lli_async_rc = 0;
2252         err = lov_read_and_clear_async_rc(lli->lli_clob);
2256         /* The application has been told write failure already.
2257          * Do not report failure again. */
2258         if (fd->fd_write_failed)
2260         return rc ? -EIO : 0;
2264  * Called to make sure a portion of file has been written out.
2265  * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2267  * Return how many pages have been written.
/*
 * cl_sync_file_range(): run a CIT_FSYNC cl_io over [start, end] with
 * the given fsync mode. On success the number of pages written
 * (fi_nr_written) becomes the result.
 * NOTE(review): excerpt — some declarations and the final RETURN are
 * elided from this view.
 */
2269 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2270                        enum cl_fsync_mode mode)
2272         struct cl_env_nest nest;
2275         struct obd_capa *capa = NULL;
2276         struct cl_fsync_io *fio;
/* reject any mode outside the known fsync modes */
2280         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2281             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2284         env = cl_env_nested_get(&nest);
2286                 RETURN(PTR_ERR(env));
2288         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2290         io = ccc_env_thread_io(env);
2291         io->ci_obj = cl_i2info(inode)->lli_clob;
/* fsync must proceed even during a layout change */
2292         io->ci_ignore_layout = 1;
2294         /* initialize parameters for sync */
2295         fio = &io->u.ci_fsync;
2296         fio->fi_capa = capa;
2297         fio->fi_start = start;
2299         fio->fi_fid = ll_inode2fid(inode);
2300         fio->fi_mode = mode;
2301         fio->fi_nr_written = 0;
2303         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2304                 result = cl_io_loop(env, io);
2306                 result = io->ci_result;
2308                 result = fio->fi_nr_written;
2309         cl_io_fini(env, io);
2310         cl_env_nested_put(&nest, env);
/*
 * ll_fsync(): .fsync file operation. Signature varies with the kernel
 * API (4-arg range fsync, 2-arg, or legacy dentry form). Waits for
 * in-flight page I/O, collects recorded async writeback errors, syncs
 * the MDS inode (md_sync) and, for regular files, pushes dirty data to
 * the OSTs through cl_sync_file_range(), tracking fd_write_failed.
 * NOTE(review): excerpt — several lines (declarations, rc merging,
 * capa release) are elided from this view.
 */
2317 #ifdef HAVE_FILE_FSYNC_4ARGS
2318 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2319 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2320 int ll_fsync(struct file *file, int data)
2322 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2325         struct inode *inode = file->f_dentry->d_inode;
2326         struct ll_inode_info *lli = ll_i2info(inode);
2327         struct ptlrpc_request *req;
2328         struct obd_capa *oc;
2332         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2333                inode->i_generation, inode);
2334         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2336 #ifdef HAVE_FILE_FSYNC_4ARGS
/* the 4-arg API expects us to write/wait the range and take i_mutex */
2337         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2338         mutex_lock(&inode->i_mutex);
2340         /* fsync's caller has already called _fdata{sync,write}, we want
2341          * that IO to finish before calling the osc and mdc sync methods */
2342         rc = filemap_fdatawait(inode->i_mapping);
2345         /* catch async errors that were recorded back when async writeback
2346          * failed for pages in this mapping. */
2347         if (!S_ISDIR(inode->i_mode)) {
2348                 err = lli->lli_async_rc;
2349                 lli->lli_async_rc = 0;
2352                         err = lov_read_and_clear_async_rc(lli->lli_clob);
2357         oc = ll_mdscapa_get(inode);
2358         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2364                 ptlrpc_req_finished(req);
2367                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2369                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2371                 if (rc == 0 && err < 0)
/* remember the outcome so ll_flush() won't re-report the same error */
2374                         fd->fd_write_failed = true;
2376                         fd->fd_write_failed = false;
2379 #ifdef HAVE_FILE_FSYNC_4ARGS
2380         mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock(): .flock/.lock file operation — translate a VFS
 * file_lock (POSIX fcntl lock or BSD flock) into an LDLM_FLOCK enqueue
 * on the MDS, then mirror the result into the local VFS lock lists.
 * If the local step fails, the remote lock is unwound with an LCK_NL
 * (unlock) enqueue.
 * NOTE(review): excerpt — the cmd switch labels and several returns
 * are elided from this view.
 */
2385 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2387         struct inode *inode = file->f_dentry->d_inode;
2388         struct ll_sb_info *sbi = ll_i2sbi(inode);
2389         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2390                                            .ei_cb_cp =ldlm_flock_completion_ast,
2391                                            .ei_cbdata = file_lock };
2392         struct md_op_data *op_data;
2393         struct lustre_handle lockh = {0};
2394         ldlm_policy_data_t flock = {{0}};
2400         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2401                inode->i_ino, file_lock);
2403         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2405         if (file_lock->fl_flags & FL_FLOCK) {
2406                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2407                 /* flocks are whole-file locks */
2408                 flock.l_flock.end = OFFSET_MAX;
2409                 /* For flocks owner is determined by the local file desctiptor*/
2410                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2411         } else if (file_lock->fl_flags & FL_POSIX) {
2412                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2413                 flock.l_flock.start = file_lock->fl_start;
2414                 flock.l_flock.end = file_lock->fl_end;
2418         flock.l_flock.pid = file_lock->fl_pid;
2420         /* Somewhat ugly workaround for svc lockd.
2421          * lockd installs custom fl_lmops->lm_compare_owner that checks
2422          * for the fl_owner to be the same (which it always is on local node
2423          * I guess between lockd processes) and then compares pid.
2424          * As such we assign pid to the owner field to make it all work,
2425          * conflict with normal locks is unlikely since pid space and
2426          * pointer space for current->files are not intersecting */
2427         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2428                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the VFS lock type to an LDLM lock mode */
2430         switch (file_lock->fl_type) {
2432                 einfo.ei_mode = LCK_PR;
2435                 /* An unlock request may or may not have any relation to
2436                  * existing locks so we may not be able to pass a lock handle
2437                  * via a normal ldlm_lock_cancel() request. The request may even
2438                  * unlock a byte range in the middle of an existing lock. In
2439                  * order to process an unlock request we need all of the same
2440                  * information that is given with a normal read or write record
2441                  * lock request. To avoid creating another ldlm unlock (cancel)
2442                  * message we'll treat a LCK_NL flock request as an unlock. */
2443                 einfo.ei_mode = LCK_NL;
2446                 einfo.ei_mode = LCK_PW;
2449                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2450                        file_lock->fl_type);
2465                 flags = LDLM_FL_BLOCK_NOWAIT;
2471                 flags = LDLM_FL_TEST_LOCK;
2472                 /* Save the old mode so that if the mode in the lock changes we
2473                  * can decrement the appropriate reader or writer refcount. */
2474                 file_lock->fl_type = einfo.ei_mode;
2477                 CERROR("unknown fcntl lock command: %d\n", cmd);
2481         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2482                                      LUSTRE_OPC_ANY, NULL);
2483         if (IS_ERR(op_data))
2484                 RETURN(PTR_ERR(op_data));
2486         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2487                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2488                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2490         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2491                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror the server's decision into the local VFS lock lists */
2493         if ((file_lock->fl_flags & FL_FLOCK) &&
2494             (rc == 0 || file_lock->fl_type == F_UNLCK))
2495                 rc2  = flock_lock_file_wait(file, file_lock);
2496         if ((file_lock->fl_flags & FL_POSIX) &&
2497             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2498             !(flags & LDLM_FL_TEST_LOCK))
2499                 rc2  = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed — undo the server-side lock with LCK_NL */
2501         if (rc2 && file_lock->fl_type != F_UNLCK) {
2502                 einfo.ei_mode = LCK_NL;
2503                 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2504                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2508         ll_finish_md_op_data(op_data);
2513 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2521  * test if some locks matching bits and l_req_mode are acquired
2522  * - bits can be in different locks
2523  * - if found clear the common lock bits in *bits
2524  * - the bits not found, are kept in *bits
2526  * \param bits [IN] searched lock bits [IN]
2527  * \param l_req_mode [IN] searched lock mode
2528  * \retval boolean, true iff all bits are found
/*
 * ll_have_md_lock(): probe (LDLM_FL_TEST_LOCK, no references taken)
 * for cached MDS inodebits locks covering *bits; each bit found is
 * cleared from *bits, so on return *bits holds only the missing ones.
 * NOTE(review): excerpt — some declarations and the return are elided.
 */
2530 int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2532         struct lustre_handle lockh;
2533         ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode" — match the union of all modes */
2534         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2535                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2544         fid = &ll_i2info(inode)->lli_fid;
2545         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2546                ldlm_lockname[mode]);
2548         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2549         for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2550                 policy.l_inodebits.bits = *bits & (1 << i);
2551                 if (policy.l_inodebits.bits == 0)
2554                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2555                                   &policy, mode, &lockh)) {
2556                         struct ldlm_lock *lock;
2558                         lock = ldlm_handle2lock(&lockh);
2561                                         ~(lock->l_policy_data.l_inodebits.bits);
2562                                 LDLM_LOCK_PUT(lock);
2564                                 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock(): match (and reference) a cached MDS inodebits lock
 * covering @bits in any of CR/CW/PR/PW modes; the matched handle is
 * returned through @lockh and the granted mode is the result.
 * NOTE(review): excerpt — some declarations and the return are elided.
 */
2571 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2572                             struct lustre_handle *lockh, __u64 flags)
2574         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2579         fid = &ll_i2info(inode)->lli_fid;
2580         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2582         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2583                            fid, LDLM_IBITS, &policy,
2584                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * ll_inode_revalidate_fini(): post-process a revalidation result.
 * -ENOENT on a (non-regular, non-directory) inode is treated as the
 * object having been unlinked; any other error is logged with the FID.
 * NOTE(review): excerpt — the success/return lines are elided.
 */
2588 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2590         /* Already unlinked. Just update nlink and return success */
2591         if (rc == -ENOENT) {
2593                 /* This path cannot be hit for regular files unless in
2594                  * case of obscure races, so no need to to validate
2596                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2598         } else if (rc != 0) {
2599                 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2600                        ll_get_fsname(inode->i_sb, NULL, 0),
2601                        PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate_it(): refresh inode attributes from the MDS.
 * With OBD_CONNECT_ATTRFID, issue a getattr-by-fid intent lock
 * (IT_GETATTR, or IT_LOOKUP for LOOKUP-only ibits) and finish via the
 * revalidate path; otherwise, if no matching md lock is cached, do a
 * plain md_getattr() (requesting EA size for regular files) and
 * rebuild the inode with ll_prep_inode().
 * NOTE(review): excerpt — many error-handling and RETURN lines are
 * elided from this view.
 */
2607 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2610         struct inode *inode = dentry->d_inode;
2611         struct ptlrpc_request *req = NULL;
2612         struct obd_export *exp;
2616         LASSERT(inode != NULL);
2618         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2619                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2621         exp = ll_i2mdexp(inode);
2623         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2624          *      But under CMD case, it caused some lock issues, should be fixed
2625          *      with new CMD ibits lock. See bug 12718 */
2626         if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2627                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2628                 struct md_op_data *op_data;
2630                 if (ibits == MDS_INODELOCK_LOOKUP)
2631                         oit.it_op = IT_LOOKUP;
2633                 /* Call getattr by fid, so do not provide name at all. */
2634                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2635                                              dentry->d_inode, NULL, 0, 0,
2636                                              LUSTRE_OPC_ANY, NULL);
2637                 if (IS_ERR(op_data))
2638                         RETURN(PTR_ERR(op_data));
2640                 oit.it_create_mode |= M_CHECK_STALE;
2641                 rc = md_intent_lock(exp, op_data, NULL, 0,
2642                                     /* we are not interested in name
2645                                     ll_md_blocking_ast, 0);
2646                 ll_finish_md_op_data(op_data);
2647                 oit.it_create_mode &= ~M_CHECK_STALE;
2649                         rc = ll_inode_revalidate_fini(inode, rc);
2653                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2655                         ll_intent_release(&oit);
2659                 /* Unlinked? Unhash dentry, so it is not picked up later by
2660                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2661                    here to preserve get_cwd functionality on 2.6.
2663                 if (!dentry->d_inode->i_nlink)
2664                         d_lustre_invalidate(dentry);
2666                 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID support: plain getattr, but only if no cached md lock */
2667         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2668                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2669                 obd_valid valid = OBD_MD_FLGETATTR;
2670                 struct md_op_data *op_data;
/* regular files also need their EA (layout) size in the reply */
2673                 if (S_ISREG(inode->i_mode)) {
2674                         rc = ll_get_max_mdsize(sbi, &ealen);
2677                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2680                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2681                                              0, ealen, LUSTRE_OPC_ANY,
2683                 if (IS_ERR(op_data))
2684                         RETURN(PTR_ERR(op_data));
2686                 op_data->op_valid = valid;
2687                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2688                  * capa for this inode. Because we only keep capas of dirs
2690                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2691                 ll_finish_md_op_data(op_data);
2693                         rc = ll_inode_revalidate_fini(inode, rc);
2697                 rc = ll_prep_inode(&inode, req, NULL, NULL);
2700         ptlrpc_req_finished(req);
/*
 * Revalidate attributes, then bring the size/time attributes up to date:
 * non-regular objects copy atime/mtime/ctime from the cached LVB, while
 * (presumably, on the elided else path) regular files glimpse their size
 * from the OSTs.  NOTE(review): error-handling lines are elided here.
 */
2704 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2707 struct inode *inode = dentry->d_inode;
2711 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2715 /* if object isn't regular file, don't validate size */
2716 if (!S_ISREG(inode->i_mode)) {
2717 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2718 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2719 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2721 rc = ll_glimpse_size(inode);
/*
 * ->getattr() worker: revalidate UPDATE|LOOKUP ibits against the MDS,
 * then populate *stat from the (now fresh) in-core inode.
 *
 * \param mnt   vfsmount of the caller (unused in the visible body)
 * \param de    dentry being stat'ed
 * \param it    lookup intent forwarded to revalidation
 * \param stat  output kstat, filled on success
 */
2726 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2727 struct lookup_intent *it, struct kstat *stat)
2729 struct inode *inode = de->d_inode;
2730 struct ll_sb_info *sbi = ll_i2sbi(inode);
2731 struct ll_inode_info *lli = ll_i2info(inode);
2734 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2735 MDS_INODELOCK_LOOKUP);
2736 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2741 stat->dev = inode->i_sb->s_dev;
2742 /* 32-bit API callers get a squashed inode number built from the FID. */
2742 if (ll_need_32bit_api(sbi))
2743 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2745 stat->ino = inode->i_ino;
2746 stat->mode = inode->i_mode;
2747 stat->nlink = inode->i_nlink;
2748 stat->uid = inode->i_uid;
2749 stat->gid = inode->i_gid;
2750 stat->rdev = inode->i_rdev;
2751 stat->atime = inode->i_atime;
2752 stat->mtime = inode->i_mtime;
2753 stat->ctime = inode->i_ctime;
2754 stat->blksize = 1 << inode->i_blkbits;
2756 stat->size = i_size_read(inode);
2757 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: wrap ll_getattr_it() with an IT_GETATTR
 * intent. */
2761 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2763 struct lookup_intent it = { .it_op = IT_GETATTR };
2765 return ll_getattr_it(mnt, de, &it, stat);
2768 #ifdef HAVE_LINUX_FIEMAP_H
2769 /*
2769  * VFS ->fiemap handler: marshal the kernel's fiemap_extent_info into a
2769  * Lustre ll_user_fiemap, run the OST mapping via ll_do_fiemap(), and
2769  * copy the mapped extents back to userspace's extent array.
2769  * NOTE(review): allocation-failure and rc checks are elided in this
2769  * excerpt.
2769  */
2769 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2770 __u64 start, __u64 len)
2774 struct ll_user_fiemap *fiemap;
2775 unsigned int extent_count = fieinfo->fi_extents_max;
2777 num_bytes = sizeof(*fiemap) + (extent_count *
2778 sizeof(struct ll_fiemap_extent));
2779 OBD_ALLOC_LARGE(fiemap, num_bytes);
2784 fiemap->fm_flags = fieinfo->fi_flags;
2785 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2786 fiemap->fm_start = start;
2787 fiemap->fm_length = len;
2788 /* Only the first extent is copied in: presumably it seeds a resumed
2788  * mapping (fm_extents[0] as the continuation hint) — TODO confirm. */
2788 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2789 sizeof(struct ll_fiemap_extent));
2791 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2793 fieinfo->fi_flags = fiemap->fm_flags;
2794 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2795 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2796 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2798 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * VFS ->get_acl: return a referenced copy of the cached POSIX ACL.
 * The cache (lli_posix_acl) is read under lli_lock; the caller releases
 * the reference taken by posix_acl_dup().
 */
2803 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2805 struct ll_inode_info *lli = ll_i2info(inode);
2806 struct posix_acl *acl = NULL;
2809 spin_lock(&lli->lli_lock);
2810 /* VFS' acl_permission_check->check_acl will release the refcount */
2811 acl = posix_acl_dup(lli->lli_posix_acl);
2812 spin_unlock(&lli->lli_lock);
2817 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
2819 /*
2819  * check_acl callback for ll_generic_permission(): evaluate the cached
2819  * POSIX ACL against `mask`.  Signature varies with kernel version
2819  * (4-arg generic_permission passes IPERM flags).
2819  * NOTE(review): return statements and #else/#endif lines are elided.
2819  */
2819 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2820 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2822 ll_check_acl(struct inode *inode, int mask)
2825 # ifdef CONFIG_FS_POSIX_ACL
2826 struct posix_acl *acl;
2830 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2831 /* RCU walk cannot block on ACL retrieval. */
2831 if (flags & IPERM_FLAG_RCU)
2834 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2839 rc = posix_acl_permission(inode, acl, mask);
2840 posix_acl_release(acl);
2843 # else /* !CONFIG_FS_POSIX_ACL */
2845 # endif /* CONFIG_FS_POSIX_ACL */
2847 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2849 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2850 /*
2850  * VFS ->permission: revalidate the root inode if needed, honor remote
2850  * (RMT_CLIENT) permission checking, then fall through to the generic
2850  * permission helper with our ACL callback.  Signature depends on kernel
2850  * version.  NOTE(review): several lines are elided in this excerpt.
2850  */
2850 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2852 # ifdef HAVE_INODE_PERMISION_2ARGS
2853 int ll_inode_permission(struct inode *inode, int mask)
2855 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2862 #ifdef MAY_NOT_BLOCK
2863 /* RCU-walk lookup: we may need to block, so ask VFS to retry. */
2863 if (mask & MAY_NOT_BLOCK)
2865 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2866 if (flags & IPERM_FLAG_RCU)
2870 /* as root inode are NOT getting validated in lookup operation,
2871 * need to do it before permission check. */
2873 if (inode == inode->i_sb->s_root->d_inode) {
2874 struct lookup_intent it = { .it_op = IT_LOOKUP };
2876 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2877 MDS_INODELOCK_LOOKUP);
2882 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2883 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2885 /* Remote-client setups check permission against the MDT. */
2885 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2886 return lustre_check_remote_perm(inode, mask);
2888 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2889 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
2894 /* Map kernel-version-specific vectored I/O file_operations members:
2894  * old kernels use ->readv/->writev, newer ones ->aio_read/->aio_write.
2894  * NOTE(review): the #else line between the two groups is elided. */
2894 #ifdef HAVE_FILE_READV
2895 #define READ_METHOD readv
2896 #define READ_FUNCTION ll_file_readv
2897 #define WRITE_METHOD writev
2898 #define WRITE_FUNCTION ll_file_writev
2900 #define READ_METHOD aio_read
2901 #define READ_FUNCTION ll_file_aio_read
2902 #define WRITE_METHOD aio_write
2903 #define WRITE_FUNCTION ll_file_aio_write
2906 /* -o localflock - only provides locally consistent flock locks */
2907 struct file_operations ll_file_operations = {
2908 .read = ll_file_read,
2909 .READ_METHOD = READ_FUNCTION,
2910 .write = ll_file_write,
2911 .WRITE_METHOD = WRITE_FUNCTION,
2912 .unlocked_ioctl = ll_file_ioctl,
2913 .open = ll_file_open,
2914 .release = ll_file_release,
2915 .mmap = ll_file_mmap,
2916 .llseek = ll_file_seek,
2917 #ifdef HAVE_KERNEL_SENDFILE
2918 .sendfile = ll_file_sendfile,
2920 #ifdef HAVE_KERNEL_SPLICE_READ
2921 .splice_read = ll_file_splice_read,
2927 /* Default file_operations table: same as ll_file_operations but with
2927  * cluster-coherent ->flock/->lock handlers (ll_file_flock). */
2927 struct file_operations ll_file_operations_flock = {
2928 .read = ll_file_read,
2929 .READ_METHOD = READ_FUNCTION,
2930 .write = ll_file_write,
2931 .WRITE_METHOD = WRITE_FUNCTION,
2932 .unlocked_ioctl = ll_file_ioctl,
2933 .open = ll_file_open,
2934 .release = ll_file_release,
2935 .mmap = ll_file_mmap,
2936 .llseek = ll_file_seek,
2937 #ifdef HAVE_KERNEL_SENDFILE
2938 .sendfile = ll_file_sendfile,
2940 #ifdef HAVE_KERNEL_SPLICE_READ
2941 .splice_read = ll_file_splice_read,
2945 .flock = ll_file_flock,
2946 .lock = ll_file_flock
2949 /* These are for -o noflock - to return ENOSYS on flock calls */
2950 struct file_operations ll_file_operations_noflock = {
2951 .read = ll_file_read,
2952 .READ_METHOD = READ_FUNCTION,
2953 .write = ll_file_write,
2954 .WRITE_METHOD = WRITE_FUNCTION,
2955 .unlocked_ioctl = ll_file_ioctl,
2956 .open = ll_file_open,
2957 .release = ll_file_release,
2958 .mmap = ll_file_mmap,
2959 .llseek = ll_file_seek,
2960 #ifdef HAVE_KERNEL_SENDFILE
2961 .sendfile = ll_file_sendfile,
2963 #ifdef HAVE_KERNEL_SPLICE_READ
2964 .splice_read = ll_file_splice_read,
2968 .flock = ll_file_noflock,
2969 .lock = ll_file_noflock
2972 /* inode_operations for regular files: attribute, xattr, ACL and fiemap
2972  * entry points (conditional members gated on kernel features). */
2972 struct inode_operations ll_file_inode_operations = {
2973 .setattr = ll_setattr,
2974 .getattr = ll_getattr,
2975 .permission = ll_inode_permission,
2976 .setxattr = ll_setxattr,
2977 .getxattr = ll_getxattr,
2978 .listxattr = ll_listxattr,
2979 .removexattr = ll_removexattr,
2980 #ifdef HAVE_LINUX_FIEMAP_H
2981 .fiemap = ll_fiemap,
2983 #ifdef HAVE_IOP_GET_ACL
2984 .get_acl = ll_get_acl,
2988 /* dynamic ioctl number support routins */
2989 /* Global registry of dynamically registered ioctl handlers, protected by
2989  * an rwsem: readers iterate in ll_iocontrol_call(), writers register/
2989  * unregister blocks. */
2989 static struct llioc_ctl_data {
2990 struct rw_semaphore ioc_sem;
2991 cfs_list_t ioc_head;
2993 __RWSEM_INITIALIZER(llioc.ioc_sem),
2994 CFS_LIST_HEAD_INIT(llioc.ioc_head)
2999 /* One registered handler: callback plus the array of ioctl cmd numbers
2999  * it serves (flexible trailing array iocd_cmd). */
2999 cfs_list_t iocd_list;
3000 unsigned int iocd_size;
3001 llioc_callback_t iocd_cb;
3002 unsigned int iocd_count;
3003 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler.
 *
 * \param cb     callback invoked when one of `cmd` is seen
 * \param count  number of entries in `cmd` (bounded by LLIOC_MAX_CMD)
 * \param cmd    array of ioctl command numbers to claim
 * \return opaque cookie for ll_iocontrol_unregister() — presumably the
 *         in_data pointer itself; the return line is elided here.
 */
3006 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3009 struct llioc_data *in_data = NULL;
3012 if (cb == NULL || cmd == NULL ||
3013 count > LLIOC_MAX_CMD || count < 0)
3016 size = sizeof(*in_data) + count * sizeof(unsigned int);
3017 OBD_ALLOC(in_data, size);
3018 if (in_data == NULL)
3021 memset(in_data, 0, sizeof(*in_data));
3022 in_data->iocd_size = size;
3023 in_data->iocd_cb = cb;
3024 in_data->iocd_count = count;
3025 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3027 down_write(&llioc.ioc_sem);
3028 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3029 up_write(&llioc.ioc_sem);
/*
 * Remove and free a handler previously returned by
 * ll_iocontrol_register().  Warns if `magic` is not found in the list.
 * NOTE(review): the comparison of list entries against `magic` and the
 * return after freeing are elided in this excerpt.
 */
3034 void ll_iocontrol_unregister(void *magic)
3036 struct llioc_data *tmp;
3041 down_write(&llioc.ioc_sem);
3042 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3044 unsigned int size = tmp->iocd_size;
3046 cfs_list_del(&tmp->iocd_list);
3047 up_write(&llioc.ioc_sem);
3049 OBD_FREE(tmp, size);
3053 up_write(&llioc.ioc_sem);
3055 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3058 EXPORT_SYMBOL(ll_iocontrol_register);
3059 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to the registered dynamic handlers.
 * Walks the registry under a read lock; the first handler claiming `cmd`
 * runs its callback, and iteration stops if it returns LLIOC_STOP.
 * *rcp receives the handler's result (default -EINVAL).
 */
3061 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3062 unsigned int cmd, unsigned long arg, int *rcp)
3064 enum llioc_iter ret = LLIOC_CONT;
3065 struct llioc_data *data;
3066 int rc = -EINVAL, i;
3068 down_read(&llioc.ioc_sem);
3069 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3070 for (i = 0; i < data->iocd_count; i++) {
3071 if (cmd != data->iocd_cmd[i])
3074 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3078 if (ret == LLIOC_STOP)
3081 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for this inode.
 * For OBJECT_CONF_SET, once the layout is applied the layout lock is
 * allowed to match (ldlm_lock_allow_match) so other threads never see a
 * matchable lock with an unapplied layout.
 */
3088 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3090 struct ll_inode_info *lli = ll_i2info(inode);
3091 struct cl_env_nest nest;
3096 if (lli->lli_clob == NULL)
3099 env = cl_env_nested_get(&nest);
3101 RETURN(PTR_ERR(env));
3103 result = cl_conf_set(env, lli->lli_clob, conf);
3104 cl_env_nested_put(&nest, env);
3106 if (conf->coc_opc == OBJECT_CONF_SET) {
3107 struct ldlm_lock *lock = conf->coc_lock;
3109 LASSERT(lock != NULL);
3110 LASSERT(ldlm_has_layout(lock));
3112 /* it can only be allowed to match after layout is
3113 * applied to inode otherwise false layout would be
3114 * seen. Applying layout shoud happen before dropping
3115 * the intent lock. */
3116 ldlm_lock_allow_match(lock);
3123 * Apply the layout to the inode. Layout lock is held and will be released
3126 /*
3126  * \param lockh   held layout lock handle (decref'ed on all paths)
3126  * \param mode    ldlm mode of that lock
3126  * \param inode   target inode
3126  * \param gen     out: layout generation after (re)configuration
3126  * \param reconf  true when re-applying over an existing layout
3126  * NOTE(review): many branch/return lines are elided in this excerpt.
3126  */
3126 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3127 struct inode *inode, __u32 *gen, bool reconf)
3129 struct ll_inode_info *lli = ll_i2info(inode);
3130 struct ll_sb_info *sbi = ll_i2sbi(inode);
3131 struct ldlm_lock *lock;
3132 struct lustre_md md = { NULL };
3133 struct cl_object_conf conf;
3138 LASSERT(lustre_handle_is_used(lockh));
3140 lock = ldlm_handle2lock(lockh);
3141 LASSERT(lock != NULL);
3142 LASSERT(ldlm_has_layout(lock));
3144 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3145 inode, PFID(&lli->lli_fid), reconf);
3147 lock_res_and_lock(lock);
3148 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3149 unlock_res_and_lock(lock);
3150 /* checking lvb_ready is racy but this is okay. The worst case is
3151 * that multi processes may configure the file on the same time. */
3152 if (lvb_ready || !reconf) {
3153 LDLM_LOCK_PUT(lock);
3157 /* layout_gen must be valid if layout lock is not
3158 * cancelled and stripe has already set */
3159 *gen = lli->lli_layout_gen;
3162 ldlm_lock_decref(lockh, mode);
3166 /* for layout lock, lmm is returned in lock's lvb.
3167 * lvb_data is immutable if the lock is held so it's safe to access it
3168 * without res lock. See the description in ldlm_lock_decref_internal()
3169 * for the condition to free lvb_data of layout lock */
3170 if (lock->l_lvb_data != NULL) {
3171 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3172 lock->l_lvb_data, lock->l_lvb_len);
3174 /* No striping (rc path elided): empty-generation layout. */
3174 *gen = LL_LAYOUT_GEN_EMPTY;
3176 *gen = md.lsm->lsm_layout_gen;
3179 CERROR("%s: file "DFID" unpackmd error: %d\n",
3180 ll_get_fsname(inode->i_sb, NULL, 0),
3181 PFID(&lli->lli_fid), rc);
3185 LDLM_LOCK_PUT(lock);
3186 ldlm_lock_decref(lockh, mode);
3190 /* set layout to file. Unlikely this will fail as old layout was
3191 * surely eliminated */
3192 memset(&conf, 0, sizeof conf);
3193 conf.coc_opc = OBJECT_CONF_SET;
3194 conf.coc_inode = inode;
3195 conf.coc_lock = lock;
3196 conf.u.coc_md = &md;
3197 rc = ll_layout_conf(inode, &conf);
3198 LDLM_LOCK_PUT(lock);
3200 ldlm_lock_decref(lockh, mode);
3203 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3205 /* wait for IO to complete if it's still being used. */
3207 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3208 ll_get_fsname(inode->i_sb, NULL, 0),
3209 inode, PFID(&lli->lli_fid));
3211 memset(&conf, 0, sizeof conf);
3212 conf.coc_opc = OBJECT_CONF_WAIT;
3213 conf.coc_inode = inode;
3214 rc = ll_layout_conf(inode, &conf);
3218 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3219 PFID(&lli->lli_fid), rc);
3226 * This function checks if there exists a LAYOUT lock on the client side,
3227 * or enqueues it if it doesn't have one in cache.
3229 * This function will not hold layout lock so it may be revoked any time after
3230 * this function returns. Any operations depend on layout should be redone
3233 * This function should be called before lov_io_init() to get an uptodate
3234 * layout version, the caller should save the version number and after IO
3235 * is finished, this function should be called again to verify that layout
3236 * is not changed during IO time.
3238 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3240 struct ll_inode_info *lli = ll_i2info(inode);
3241 struct ll_sb_info *sbi = ll_i2sbi(inode);
3242 struct md_op_data *op_data;
3243 struct lookup_intent it;
3244 struct lustre_handle lockh;
3246 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3248 .ei_cb_bl = ll_md_blocking_ast,
3249 .ei_cb_cp = ldlm_completion_ast,
3250 .ei_cbdata = inode };
3254 *gen = LL_LAYOUT_GEN_NONE;
3255 /* Nothing to do when the server doesn't support layout locks. */
3255 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3259 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3260 LASSERT(S_ISREG(inode->i_mode));
3262 /* mostly layout lock is caching on the local side, so try to match
3263 * it before grabbing layout lock mutex. */
3264 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3265 if (mode != 0) { /* hit cached lock */
3266 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3270 /* better hold lli_layout_mutex to try again otherwise
3271 * it will have starvation problem. */
3274 /* take layout lock mutex to enqueue layout lock exclusively. */
3275 mutex_lock(&lli->lli_layout_mutex);
3278 /* try again. Maybe somebody else has done this. */
3279 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3280 if (mode != 0) { /* hit cached lock */
3281 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3285 mutex_unlock(&lli->lli_layout_mutex);
3289 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3290 0, 0, LUSTRE_OPC_ANY, NULL);
3291 if (IS_ERR(op_data)) {
3292 mutex_unlock(&lli->lli_layout_mutex);
3293 RETURN(PTR_ERR(op_data));
3296 /* have to enqueue one */
3297 memset(&it, 0, sizeof(it));
3298 it.it_op = IT_LAYOUT;
3299 lockh.cookie = 0ULL;
3301 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3302 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3303 PFID(&lli->lli_fid));
3305 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3307 /* The RPC reply is not needed once the lock handle is recorded. */
3307 if (it.d.lustre.it_data != NULL)
3308 ptlrpc_req_finished(it.d.lustre.it_data);
3309 it.d.lustre.it_data = NULL;
3311 ll_finish_md_op_data(op_data);
3313 /* Transfer lock ownership out of the intent before dropping it. */
3313 mode = it.d.lustre.it_lock_mode;
3314 it.d.lustre.it_lock_mode = 0;
3315 ll_intent_drop_lock(&it);
3318 /* set lock data in case this is a new lock */
3319 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3320 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3324 mutex_unlock(&lli->lli_layout_mutex);