lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19  *
  20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21  * CA 95054 USA or visit www.sun.com if you need additional information or
  22  * have any questions.
  23  *
  24  * GPL HEADER END
  25  */
  26 /*
  27  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  28  * Use is subject to license terms.
  29  *
  30  * Copyright (c) 2011, 2012, Intel Corporation.
  31  */
  32 /*
  33  * This file is part of Lustre, http://www.lustre.org/
  34  * Lustre is a trademark of Sun Microsystems, Inc.
  35  *
  36  * lustre/llite/file.c
  37  *
  38  * Author: Peter Braam <braam@clusterfs.com>
  39  * Author: Phil Schwan <phil@clusterfs.com>
  40  * Author: Andreas Dilger <adilger@clusterfs.com>
  41  */
  42
  43 #define DEBUG_SUBSYSTEM S_LLITE
  44 #include <lustre_dlm.h>
  45 #include <lustre_lite.h>
  46 #include <linux/pagemap.h>
  47 #include <linux/file.h>
  48 #include "llite_internal.h"
  49 #include <lustre/ll_fiemap.h>
  50
  51 #include "cl_object.h"
  52
  53 struct ll_file_data *ll_file_data_get(void)
  54 {
  55         struct ll_file_data *fd;
  56
  57         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
  58         fd->fd_write_failed = false;
  59         return fd;
  60 }
  61
  62 static void ll_file_data_put(struct ll_file_data *fd)
  63 {
  64         if (fd != NULL)
  65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  66 }
  67
  68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  69                           struct lustre_handle *fh)
  70 {
  71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  72         op_data->op_attr.ia_mode = inode->i_mode;
  73         op_data->op_attr.ia_atime = inode->i_atime;
  74         op_data->op_attr.ia_mtime = inode->i_mtime;
  75         op_data->op_attr.ia_ctime = inode->i_ctime;
  76         op_data->op_attr.ia_size = i_size_read(inode);
  77         op_data->op_attr_blocks = inode->i_blocks;
  78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
  79                                         ll_inode_to_ext_flags(inode->i_flags);
  80         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  81         if (fh)
  82                 op_data->op_handle = *fh;
  83         op_data->op_capa1 = ll_mdscapa_get(inode);
  84
  85         if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
  86                 op_data->op_bias |= MDS_DATA_MODIFIED;
  87 }
  88
  89 /**
  90  * Closes the IO epoch and packs all the attributes into @op_data for
  91  * the CLOSE rpc.
  92  */
  93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  94                              struct obd_client_handle *och)
  95 {
  96         ENTRY;
  97
  98         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
  99                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
 100
 101         if (!(och->och_flags & FMODE_WRITE))
 102                 goto out;
 103
 104         if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
 105                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 106         else
 107                 ll_ioepoch_close(inode, op_data, &och, 0);
 108
 109 out:
 110         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
 111         ll_prep_md_op_data(op_data, inode, NULL, NULL,
 112                            0, 0, LUSTRE_OPC_ANY, NULL);
 113         EXIT;
 114 }
 115
 116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
 117                                      struct inode *inode,
 118                                      struct obd_client_handle *och)
 119 {
 120         struct obd_export *exp = ll_i2mdexp(inode);
 121         struct md_op_data *op_data;
 122         struct ptlrpc_request *req = NULL;
 123         struct obd_device *obd = class_exp2obd(exp);
 124         int epoch_close = 1;
 125         int rc;
 126         ENTRY;
 127
 128         if (obd == NULL) {
 129                 /*
 130                  * XXX: in case of LMV, is this correct to access
 131                  * ->exp_handle?
 132                  */
 133                 CERROR("Invalid MDC connection handle "LPX64"\n",
 134                        ll_i2mdexp(inode)->exp_handle.h_cookie);
 135                 GOTO(out, rc = 0);
 136         }
 137
 138         OBD_ALLOC_PTR(op_data);
 139         if (op_data == NULL)
 140                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
 141
 142         ll_prepare_close(inode, op_data, och);
 143         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
 144         rc = md_close(md_exp, op_data, och->och_mod, &req);
 145         if (rc == -EAGAIN) {
 146                 /* This close must have the epoch closed. */
 147                 LASSERT(epoch_close);
 148                 /* MDS has instructed us to obtain Size-on-MDS attribute from
 149                  * OSTs and send setattr to back to MDS. */
 150                 rc = ll_som_update(inode, op_data);
 151                 if (rc) {
 152                         CERROR("inode %lu mdc Size-on-MDS update failed: "
 153                                "rc = %d\n", inode->i_ino, rc);
 154                         rc = 0;
 155                 }
 156         } else if (rc) {
 157                 CERROR("inode %lu mdc close failed: rc = %d\n",
 158                        inode->i_ino, rc);
 159         }
 160
 161         /* DATA_MODIFIED flag was successfully sent on close, cancel data
 162          * modification flag. */
 163         if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
 164                 struct ll_inode_info *lli = ll_i2info(inode);
 165
 166                 spin_lock(&lli->lli_lock);
 167                 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
 168                 spin_unlock(&lli->lli_lock);
 169         }
 170
 171         ll_finish_md_op_data(op_data);
 172
 173         if (rc == 0) {
 174                 rc = ll_objects_destroy(req, inode);
 175                 if (rc)
 176                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
 177                                inode->i_ino, rc);
 178         }
 179
 180         EXIT;
 181 out:
 182
 183         if (exp_connect_som(exp) && !epoch_close &&
 184             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
 185                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
 186         } else {
 187                 md_clear_open_replay_data(md_exp, och);
 188                 /* Free @och if it is not waiting for DONE_WRITING. */
 189                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 190                 OBD_FREE_PTR(och);
 191         }
 192         if (req) /* This is close request */
 193                 ptlrpc_req_finished(req);
 194         return rc;
 195 }
 196
 197 int ll_md_real_close(struct inode *inode, int flags)
 198 {
 199         struct ll_inode_info *lli = ll_i2info(inode);
 200         struct obd_client_handle **och_p;
 201         struct obd_client_handle *och;
 202         __u64 *och_usecount;
 203         int rc = 0;
 204         ENTRY;
 205
 206         if (flags & FMODE_WRITE) {
 207                 och_p = &lli->lli_mds_write_och;
 208                 och_usecount = &lli->lli_open_fd_write_count;
 209         } else if (flags & FMODE_EXEC) {
 210                 och_p = &lli->lli_mds_exec_och;
 211                 och_usecount = &lli->lli_open_fd_exec_count;
 212         } else {
 213                 LASSERT(flags & FMODE_READ);
 214                 och_p = &lli->lli_mds_read_och;
 215                 och_usecount = &lli->lli_open_fd_read_count;
 216         }
 217
 218         mutex_lock(&lli->lli_och_mutex);
 219         if (*och_usecount) { /* There are still users of this handle, so
 220                                 skip freeing it. */
 221                 mutex_unlock(&lli->lli_och_mutex);
 222                 RETURN(0);
 223         }
 224         och=*och_p;
 225         *och_p = NULL;
 226         mutex_unlock(&lli->lli_och_mutex);
 227
 228         if (och) { /* There might be a race and somebody have freed this och
 229                       already */
 230                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 231                                                inode, och);
 232         }
 233
 234         RETURN(rc);
 235 }
 236
 237 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 238                 struct file *file)
 239 {
 240         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 241         struct ll_inode_info *lli = ll_i2info(inode);
 242         int rc = 0;
 243         ENTRY;
 244
 245         /* clear group lock, if present */
 246         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 247                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
 248
 249         /* Let's see if we have good enough OPEN lock on the file and if
 250            we can skip talking to MDS */
 251         if (file->f_dentry->d_inode) { /* Can this ever be false? */
 252                 int lockmode;
 253                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 254                 struct lustre_handle lockh;
 255                 struct inode *inode = file->f_dentry->d_inode;
 256                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
 257
 258                 mutex_lock(&lli->lli_och_mutex);
 259                 if (fd->fd_omode & FMODE_WRITE) {
 260                         lockmode = LCK_CW;
 261                         LASSERT(lli->lli_open_fd_write_count);
 262                         lli->lli_open_fd_write_count--;
 263                 } else if (fd->fd_omode & FMODE_EXEC) {
 264                         lockmode = LCK_PR;
 265                         LASSERT(lli->lli_open_fd_exec_count);
 266                         lli->lli_open_fd_exec_count--;
 267                 } else {
 268                         lockmode = LCK_CR;
 269                         LASSERT(lli->lli_open_fd_read_count);
 270                         lli->lli_open_fd_read_count--;
 271                 }
 272                 mutex_unlock(&lli->lli_och_mutex);
 273
 274                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 275                                    LDLM_IBITS, &policy, lockmode,
 276                                    &lockh)) {
 277                         rc = ll_md_real_close(file->f_dentry->d_inode,
 278                                               fd->fd_omode);
 279                 }
 280         } else {
 281                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
 282                        file, file->f_dentry, file->f_dentry->d_name.name);
 283         }
 284
 285         LUSTRE_FPRIVATE(file) = NULL;
 286         ll_file_data_put(fd);
 287         ll_capa_close(inode);
 288
 289         RETURN(rc);
 290 }
 291
 292 /* While this returns an error code, fput() the caller does not, so we need
 293  * to make every effort to clean up all of our state here.  Also, applications
 294  * rarely check close errors and even if an error is returned they will not
 295  * re-try the close call.
 296  */
 297 int ll_file_release(struct inode *inode, struct file *file)
 298 {
 299         struct ll_file_data *fd;
 300         struct ll_sb_info *sbi = ll_i2sbi(inode);
 301         struct ll_inode_info *lli = ll_i2info(inode);
 302         int rc;
 303         ENTRY;
 304
 305         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 306                inode->i_generation, inode);
 307
 308 #ifdef CONFIG_FS_POSIX_ACL
 309         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
 310             inode == inode->i_sb->s_root->d_inode) {
 311                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 312
 313                 LASSERT(fd != NULL);
 314                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
 315                         fd->fd_flags &= ~LL_FILE_RMTACL;
 316                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
 317                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
 318                 }
 319         }
 320 #endif
 321
 322         if (inode->i_sb->s_root != file->f_dentry)
 323                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 324         fd = LUSTRE_FPRIVATE(file);
 325         LASSERT(fd != NULL);
 326
 327         /* The last ref on @file, maybe not the the owner pid of statahead.
 328          * Different processes can open the same dir, "ll_opendir_key" means:
 329          * it is me that should stop the statahead thread. */
 330         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
 331             lli->lli_opendir_pid != 0)
 332                 ll_stop_statahead(inode, lli->lli_opendir_key);
 333
 334         if (inode->i_sb->s_root == file->f_dentry) {
 335                 LUSTRE_FPRIVATE(file) = NULL;
 336                 ll_file_data_put(fd);
 337                 RETURN(0);
 338         }
 339
 340         if (!S_ISDIR(inode->i_mode)) {
 341                 lov_read_and_clear_async_rc(lli->lli_clob);
 342                 lli->lli_async_rc = 0;
 343         }
 344
 345         rc = ll_md_close(sbi->ll_md_exp, inode, file);
 346
 347         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 348                 libcfs_debug_dumplog();
 349
 350         RETURN(rc);
 351 }
 352
 353 static int ll_intent_file_open(struct file *file, void *lmm,
 354                                int lmmsize, struct lookup_intent *itp)
 355 {
 356         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
 357         struct dentry *parent = file->f_dentry->d_parent;
 358         const char *name = file->f_dentry->d_name.name;
 359         const int len = file->f_dentry->d_name.len;
 360         struct md_op_data *op_data;
 361         struct ptlrpc_request *req;
 362         __u32 opc = LUSTRE_OPC_ANY;
 363         int rc;
 364         ENTRY;
 365
 366         if (!parent)
 367                 RETURN(-ENOENT);
 368
 369         /* Usually we come here only for NFSD, and we want open lock.
 370            But we can also get here with pre 2.6.15 patchless kernels, and in
 371            that case that lock is also ok */
 372         /* We can also get here if there was cached open handle in revalidate_it
 373          * but it disappeared while we were getting from there to ll_file_open.
 374          * But this means this file was closed and immediatelly opened which
 375          * makes a good candidate for using OPEN lock */
 376         /* If lmmsize & lmm are not 0, we are just setting stripe info
 377          * parameters. No need for the open lock */
 378         if (lmm == NULL && lmmsize == 0) {
 379                 itp->it_flags |= MDS_OPEN_LOCK;
 380                 if (itp->it_flags & FMODE_WRITE)
 381                         opc = LUSTRE_OPC_CREATE;
 382         }
 383
 384         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
 385                                       file->f_dentry->d_inode, name, len,
 386                                       O_RDWR, opc, NULL);
 387         if (IS_ERR(op_data))
 388                 RETURN(PTR_ERR(op_data));
 389
 390         itp->it_flags |= MDS_OPEN_BY_FID;
 391         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 392                             0 /*unused */, &req, ll_md_blocking_ast, 0);
 393         ll_finish_md_op_data(op_data);
 394         if (rc == -ESTALE) {
 395                 /* reason for keep own exit path - don`t flood log
 396                 * with messages with -ESTALE errors.
 397                 */
 398                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 399                      it_open_error(DISP_OPEN_OPEN, itp))
 400                         GOTO(out, rc);
 401                 ll_release_openhandle(file->f_dentry, itp);
 402                 GOTO(out, rc);
 403         }
 404
 405         if (it_disposition(itp, DISP_LOOKUP_NEG))
 406                 GOTO(out, rc = -ENOENT);
 407
 408         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 409                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 410                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 411                 GOTO(out, rc);
 412         }
 413
 414         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
 415         if (!rc && itp->d.lustre.it_lock_mode)
 416                 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
 417                                  itp, NULL);
 418
 419 out:
 420         ptlrpc_req_finished(itp->d.lustre.it_data);
 421         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
 422         ll_intent_drop_lock(itp);
 423
 424         RETURN(rc);
 425 }
 426
 427 /**
 428  * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 429  * not believe attributes if a few ioepoch holders exist. Attributes for
 430  * previous ioepoch if new one is opened are also skipped by MDS.
 431  */
 432 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
 433 {
 434         if (ioepoch && lli->lli_ioepoch != ioepoch) {
 435                 lli->lli_ioepoch = ioepoch;
 436                 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 437                        ioepoch, PFID(&lli->lli_fid));
 438         }
 439 }
 440
 441 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
 442                        struct lookup_intent *it, struct obd_client_handle *och)
 443 {
 444         struct ptlrpc_request *req = it->d.lustre.it_data;
 445         struct mdt_body *body;
 446
 447         LASSERT(och);
 448
 449         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 450         LASSERT(body != NULL);                      /* reply already checked out */
 451
 452         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
 453         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 454         och->och_fid = lli->lli_fid;
 455         och->och_flags = it->it_flags;
 456         ll_ioepoch_open(lli, body->ioepoch);
 457
 458         return md_set_open_replay_data(md_exp, och, req);
 459 }
 460
 461 int ll_local_open(struct file *file, struct lookup_intent *it,
 462                   struct ll_file_data *fd, struct obd_client_handle *och)
 463 {
 464         struct inode *inode = file->f_dentry->d_inode;
 465         struct ll_inode_info *lli = ll_i2info(inode);
 466         ENTRY;
 467
 468         LASSERT(!LUSTRE_FPRIVATE(file));
 469
 470         LASSERT(fd != NULL);
 471
 472         if (och) {
 473                 struct ptlrpc_request *req = it->d.lustre.it_data;
 474                 struct mdt_body *body;
 475                 int rc;
 476
 477                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
 478                 if (rc)
 479                         RETURN(rc);
 480
 481                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 482                 if ((it->it_flags & FMODE_WRITE) &&
 483                     (body->valid & OBD_MD_FLSIZE))
 484                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 485                                lli->lli_ioepoch, PFID(&lli->lli_fid));
 486         }
 487
 488         LUSTRE_FPRIVATE(file) = fd;
 489         ll_readahead_init(inode, &fd->fd_ras);
 490         fd->fd_omode = it->it_flags;
 491         RETURN(0);
 492 }
 493
 494 /* Open a file, and (for the very first open) create objects on the OSTs at
 495  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 496  * creation or open until ll_lov_setstripe() ioctl is called.
 497  *
 498  * If we already have the stripe MD locally then we don't request it in
 499  * md_open(), by passing a lmm_size = 0.
 500  *
 501  * It is up to the application to ensure no other processes open this file
 502  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 503  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 504  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 505  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 506  */
 507 int ll_file_open(struct inode *inode, struct file *file)
 508 {
 509         struct ll_inode_info *lli = ll_i2info(inode);
 510         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 511                                           .it_flags = file->f_flags };
 512         struct obd_client_handle **och_p = NULL;
 513         __u64 *och_usecount = NULL;
 514         struct ll_file_data *fd;
 515         int rc = 0, opendir_set = 0;
 516         ENTRY;
 517
 518         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
 519                inode->i_generation, inode, file->f_flags);
 520
 521         it = file->private_data; /* XXX: compat macro */
 522         file->private_data = NULL; /* prevent ll_local_open assertion */
 523
 524         fd = ll_file_data_get();
 525         if (fd == NULL)
 526                 GOTO(out_och_free, rc = -ENOMEM);
 527
 528         fd->fd_file = file;
 529         if (S_ISDIR(inode->i_mode)) {
 530                 spin_lock(&lli->lli_sa_lock);
 531                 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
 532                     lli->lli_opendir_pid == 0) {
 533                         lli->lli_opendir_key = fd;
 534                         lli->lli_opendir_pid = cfs_curproc_pid();
 535                         opendir_set = 1;
 536                 }
 537                 spin_unlock(&lli->lli_sa_lock);
 538         }
 539
 540         if (inode->i_sb->s_root == file->f_dentry) {
 541                 LUSTRE_FPRIVATE(file) = fd;
 542                 RETURN(0);
 543         }
 544
 545         if (!it || !it->d.lustre.it_disposition) {
 546                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 547                  * because everything but O_ACCMODE mask was stripped from
 548                  * there */
 549                 if ((oit.it_flags + 1) & O_ACCMODE)
 550                         oit.it_flags++;
 551                 if (file->f_flags & O_TRUNC)
 552                         oit.it_flags |= FMODE_WRITE;
 553
 554                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 555                  * dentry_open after call to open_namei that checks permissions.
 556                  * Only nfsd_open call dentry_open directly without checking
 557                  * permissions and because of that this code below is safe. */
 558                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 559                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 560
 561                 /* We do not want O_EXCL here, presumably we opened the file
 562                  * already? XXX - NFS implications? */
 563                 oit.it_flags &= ~O_EXCL;
 564
 565                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 566                  * created if necessary, then "IT_CREAT" should be set to keep
 567                  * consistent with it */
 568                 if (oit.it_flags & O_CREAT)
 569                         oit.it_op |= IT_CREAT;
 570
 571                 it = &oit;
 572         }
 573
 574 restart:
 575         /* Let's see if we have file open on MDS already. */
 576         if (it->it_flags & FMODE_WRITE) {
 577                 och_p = &lli->lli_mds_write_och;
 578                 och_usecount = &lli->lli_open_fd_write_count;
 579         } else if (it->it_flags & FMODE_EXEC) {
 580                 och_p = &lli->lli_mds_exec_och;
 581                 och_usecount = &lli->lli_open_fd_exec_count;
 582          } else {
 583                 och_p = &lli->lli_mds_read_och;
 584                 och_usecount = &lli->lli_open_fd_read_count;
 585         }
 586
 587         mutex_lock(&lli->lli_och_mutex);
 588         if (*och_p) { /* Open handle is present */
 589                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 590                         /* Well, there's extra open request that we do not need,
 591                            let's close it somehow. This will decref request. */
 592                         rc = it_open_error(DISP_OPEN_OPEN, it);
 593                         if (rc) {
 594                                 mutex_unlock(&lli->lli_och_mutex);
 595                                 GOTO(out_openerr, rc);
 596                         }
 597
 598                         ll_release_openhandle(file->f_dentry, it);
 599                 }
 600                 (*och_usecount)++;
 601
 602                 rc = ll_local_open(file, it, fd, NULL);
 603                 if (rc) {
 604                         (*och_usecount)--;
 605                         mutex_unlock(&lli->lli_och_mutex);
 606                         GOTO(out_openerr, rc);
 607                 }
 608         } else {
 609                 LASSERT(*och_usecount == 0);
 610                 if (!it->d.lustre.it_disposition) {
 611                         /* We cannot just request lock handle now, new ELC code
 612                            means that one of other OPEN locks for this file
 613                            could be cancelled, and since blocking ast handler
 614                            would attempt to grab och_mutex as well, that would
 615                            result in a deadlock */
 616                         mutex_unlock(&lli->lli_och_mutex);
 617                         it->it_create_mode |= M_CHECK_STALE;
 618                         rc = ll_intent_file_open(file, NULL, 0, it);
 619                         it->it_create_mode &= ~M_CHECK_STALE;
 620                         if (rc)
 621                                 GOTO(out_openerr, rc);
 622
 623                         goto restart;
 624                 }
 625                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 626                 if (!*och_p)
 627                         GOTO(out_och_free, rc = -ENOMEM);
 628
 629                 (*och_usecount)++;
 630
 631                 /* md_intent_lock() didn't get a request ref if there was an
 632                  * open error, so don't do cleanup on the request here
 633                  * (bug 3430) */
 634                 /* XXX (green): Should not we bail out on any error here, not
 635                  * just open error? */
 636                 rc = it_open_error(DISP_OPEN_OPEN, it);
 637                 if (rc)
 638                         GOTO(out_och_free, rc);
 639
 640                 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
 641
 642                 rc = ll_local_open(file, it, fd, *och_p);
 643                 if (rc)
 644                         GOTO(out_och_free, rc);
 645         }
 646         mutex_unlock(&lli->lli_och_mutex);
 647         fd = NULL;
 648
 649         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 650            different kind of OPEN lock for this same inode gets cancelled
 651            by ldlm_cancel_lru */
 652         if (!S_ISREG(inode->i_mode))
 653                 GOTO(out_och_free, rc);
 654
 655         ll_capa_open(inode);
 656
 657         if (!lli->lli_has_smd) {
 658                 if (file->f_flags & O_LOV_DELAY_CREATE ||
 659                     !(file->f_mode & FMODE_WRITE)) {
 660                         CDEBUG(D_INODE, "object creation was delayed\n");
 661                         GOTO(out_och_free, rc);
 662                 }
 663         }
 664         file->f_flags &= ~O_LOV_DELAY_CREATE;
 665         GOTO(out_och_free, rc);
 666
 667 out_och_free:
 668         if (rc) {
 669                 if (och_p && *och_p) {
 670                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 671                         *och_p = NULL; /* OBD_FREE writes some magic there */
 672                         (*och_usecount)--;
 673                 }
 674                 mutex_unlock(&lli->lli_och_mutex);
 675
 676 out_openerr:
 677                 if (opendir_set != 0)
 678                         ll_stop_statahead(inode, lli->lli_opendir_key);
 679                 if (fd != NULL)
 680                         ll_file_data_put(fd);
 681         } else {
 682                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 683         }
 684
 685         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 686                 ptlrpc_req_finished(it->d.lustre.it_data);
 687                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 688         }
 689
 690         return rc;
 691 }
 692
 693 /* Fills the obdo with the attributes for the lsm */
 694 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
 695                           struct obd_capa *capa, struct obdo *obdo,
 696                           __u64 ioepoch, int sync)
 697 {
 698         struct ptlrpc_request_set *set;
 699         struct obd_info            oinfo = { { { 0 } } };
 700         int                        rc;
 701
 702         ENTRY;
 703
 704         LASSERT(lsm != NULL);
 705
 706         oinfo.oi_md = lsm;
 707         oinfo.oi_oa = obdo;
 708         oinfo.oi_oa->o_oi = lsm->lsm_oi;
 709         oinfo.oi_oa->o_mode = S_IFREG;
 710         oinfo.oi_oa->o_ioepoch = ioepoch;
 711         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 712                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 713                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 714                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 715                                OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
 716                                OBD_MD_FLDATAVERSION;
 717         oinfo.oi_capa = capa;
 718         if (sync) {
 719                 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
 720                 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
 721         }
 722
 723         set = ptlrpc_prep_set();
 724         if (set == NULL) {
 725                 CERROR("can't allocate ptlrpc set\n");
 726                 rc = -ENOMEM;
 727         } else {
 728                 rc = obd_getattr_async(exp, &oinfo, set);
 729                 if (rc == 0)
 730                         rc = ptlrpc_set_wait(set);
 731                 ptlrpc_set_destroy(set);
 732         }
 733         if (rc == 0)
 734                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 735                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
 736                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE |
 737                                          OBD_MD_FLDATAVERSION);
 738         RETURN(rc);
 739 }
 740
 741 /**
 742   * Performs the getattr on the inode and updates its fields.
 743   * If @sync != 0, perform the getattr under the server-side lock.
 744   */
 745 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
 746                      __u64 ioepoch, int sync)
 747 {
 748         struct obd_capa      *capa = ll_mdscapa_get(inode);
 749         struct lov_stripe_md *lsm;
 750         int rc;
 751         ENTRY;
 752
 753         lsm = ccc_inode_lsm_get(inode);
 754         rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
 755                             capa, obdo, ioepoch, sync);
 756         capa_put(capa);
 757         if (rc == 0) {
 758                 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
 759
 760                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
 761                 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
 762                        " blksize %lu\n", POSTID(oi), i_size_read(inode),
 763                        (unsigned long long)inode->i_blocks,
 764                        (unsigned long)ll_inode_blksize(inode));
 765         }
 766         ccc_inode_lsm_put(inode, lsm);
 767         RETURN(rc);
 768 }
 769
 770 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
 771 {
 772         struct ll_inode_info *lli = ll_i2info(inode);
 773         struct cl_object *obj = lli->lli_clob;
 774         struct cl_attr *attr = ccc_env_thread_attr(env);
 775         struct ost_lvb lvb;
 776         int rc = 0;
 777
 778         ENTRY;
 779
 780         ll_inode_size_lock(inode);
 781         /* merge timestamps the most recently obtained from mds with
 782            timestamps obtained from osts */
 783         LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
 784         LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
 785         LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
 786         inode_init_lvb(inode, &lvb);
 787
 788         cl_object_attr_lock(obj);
 789         rc = cl_object_attr_get(env, obj, attr);
 790         cl_object_attr_unlock(obj);
 791
 792         if (rc == 0) {
 793                 if (lvb.lvb_atime < attr->cat_atime)
 794                         lvb.lvb_atime = attr->cat_atime;
 795                 if (lvb.lvb_ctime < attr->cat_ctime)
 796                         lvb.lvb_ctime = attr->cat_ctime;
 797                 if (lvb.lvb_mtime < attr->cat_mtime)
 798                         lvb.lvb_mtime = attr->cat_mtime;
 799
 800                 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
 801                                 PFID(&lli->lli_fid), attr->cat_size);
 802                 cl_isize_write_nolock(inode, attr->cat_size);
 803
 804                 inode->i_blocks = attr->cat_blocks;
 805
 806                 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
 807                 LTIME_S(inode->i_atime) = lvb.lvb_atime;
 808                 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
 809         }
 810         ll_inode_size_unlock(inode);
 811
 812         RETURN(rc);
 813 }
 814
 815 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
 816                      lstat_t *st)
 817 {
 818         struct obdo obdo = { 0 };
 819         int rc;
 820
 821         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
 822         if (rc == 0) {
 823                 st->st_size   = obdo.o_size;
 824                 st->st_blocks = obdo.o_blocks;
 825                 st->st_mtime  = obdo.o_mtime;
 826                 st->st_atime  = obdo.o_atime;
 827                 st->st_ctime  = obdo.o_ctime;
 828         }
 829         return rc;
 830 }
 831
 832 void ll_io_init(struct cl_io *io, const struct file *file, int write)
 833 {
 834         struct inode *inode = file->f_dentry->d_inode;
 835
 836         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
 837         if (write) {
 838                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
 839                 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
 840                                       file->f_flags & O_DIRECT ||
 841                                       IS_SYNC(inode);
 842         }
 843         io->ci_obj     = ll_i2info(inode)->lli_clob;
 844         io->ci_lockreq = CILR_MAYBE;
 845         if (ll_file_nolock(file)) {
 846                 io->ci_lockreq = CILR_NEVER;
 847                 io->ci_no_srvlock = 1;
 848         } else if (file->f_flags & O_APPEND) {
 849                 io->ci_lockreq = CILR_MANDATORY;
 850         }
 851 }
 852
 853 static ssize_t
 854 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
 855                    struct file *file, enum cl_io_type iot,
 856                    loff_t *ppos, size_t count)
 857 {
 858         struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
 859         struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
 860         struct cl_io         *io;
 861         ssize_t               result;
 862         ENTRY;
 863
 864 restart:
 865         io = ccc_env_thread_io(env);
 866         ll_io_init(io, file, iot == CIT_WRITE);
 867
 868         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
 869                 struct vvp_io *vio = vvp_env_io(env);
 870                 struct ccc_io *cio = ccc_env_io(env);
 871                 int write_mutex_locked = 0;
 872
 873                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
 874                 vio->cui_io_subtype = args->via_io_subtype;
 875
 876                 switch (vio->cui_io_subtype) {
 877                 case IO_NORMAL:
 878                         cio->cui_iov = args->u.normal.via_iov;
 879                         cio->cui_nrsegs = args->u.normal.via_nrsegs;
 880                         cio->cui_tot_nrsegs = cio->cui_nrsegs;
 881 #ifndef HAVE_FILE_WRITEV
 882                         cio->cui_iocb = args->u.normal.via_iocb;
 883 #endif
 884                         if ((iot == CIT_WRITE) &&
 885                             !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
 886                                 if (mutex_lock_interruptible(&lli->
 887                                                                lli_write_mutex))
 888                                         GOTO(out, result = -ERESTARTSYS);
 889                                 write_mutex_locked = 1;
 890                         } else if (iot == CIT_READ) {
 891                                 down_read(&lli->lli_trunc_sem);
 892                         }
 893                         break;
 894                 case IO_SENDFILE:
 895                         vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
 896                         vio->u.sendfile.cui_target = args->u.sendfile.via_target;
 897                         break;
 898                 case IO_SPLICE:
 899                         vio->u.splice.cui_pipe = args->u.splice.via_pipe;
 900                         vio->u.splice.cui_flags = args->u.splice.via_flags;
 901                         break;
 902                 default:
 903                         CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
 904                         LBUG();
 905                 }
 906                 result = cl_io_loop(env, io);
 907                 if (write_mutex_locked)
 908                         mutex_unlock(&lli->lli_write_mutex);
 909                 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
 910                         up_read(&lli->lli_trunc_sem);
 911         } else {
 912                 /* cl_io_rw_init() handled IO */
 913                 result = io->ci_result;
 914         }
 915
 916         if (io->ci_nob > 0) {
 917                 result = io->ci_nob;
 918                 *ppos = io->u.ci_wr.wr.crw_pos;
 919         }
 920         GOTO(out, result);
 921 out:
 922         cl_io_fini(env, io);
 923         /* If any bit been read/written (result != 0), we just return
 924          * short read/write instead of restart io. */
 925         if (result == 0 && io->ci_need_restart) {
 926                 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
 927                        iot == CIT_READ ? "read" : "write",
 928                        file->f_dentry->d_name.name, *ppos, count);
 929                 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
 930                 goto restart;
 931         }
 932
 933         if (iot == CIT_READ) {
 934                 if (result >= 0)
 935                         ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
 936                                            LPROC_LL_READ_BYTES, result);
 937         } else if (iot == CIT_WRITE) {
 938                 if (result >= 0) {
 939                         ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
 940                                            LPROC_LL_WRITE_BYTES, result);
 941                         fd->fd_write_failed = false;
 942                 } else if (result != -ERESTARTSYS) {
 943                         fd->fd_write_failed = true;
 944                 }
 945         }
 946
 947         return result;
 948 }
 949
 950
 951 /*
 952  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 953  */
 954 static int ll_file_get_iov_count(const struct iovec *iov,
 955                                  unsigned long *nr_segs, size_t *count)
 956 {
 957         size_t cnt = 0;
 958         unsigned long seg;
 959
 960         for (seg = 0; seg < *nr_segs; seg++) {
 961                 const struct iovec *iv = &iov[seg];
 962
 963                 /*
 964                  * If any segment has a negative length, or the cumulative
 965                  * length ever wraps negative then return -EINVAL.
 966                  */
 967                 cnt += iv->iov_len;
 968                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
 969                         return -EINVAL;
 970                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
 971                         continue;
 972                 if (seg == 0)
 973                         return -EFAULT;
 974                 *nr_segs = seg;
 975                 cnt -= iv->iov_len;   /* This segment is no good */
 976                 break;
 977         }
 978         *count = cnt;
 979         return 0;
 980 }
 981
 982 #ifdef HAVE_FILE_READV
 983 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
 984                               unsigned long nr_segs, loff_t *ppos)
 985 {
 986         struct lu_env      *env;
 987         struct vvp_io_args *args;
 988         size_t              count;
 989         ssize_t             result;
 990         int                 refcheck;
 991         ENTRY;
 992
 993         result = ll_file_get_iov_count(iov, &nr_segs, &count);
 994         if (result)
 995                 RETURN(result);
 996
 997         env = cl_env_get(&refcheck);
 998         if (IS_ERR(env))
 999                 RETURN(PTR_ERR(env));
1000
1001         args = vvp_env_args(env, IO_NORMAL);
1002         args->u.normal.via_iov = (struct iovec *)iov;
1003         args->u.normal.via_nrsegs = nr_segs;
1004
1005         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1006         cl_env_put(env, &refcheck);
1007         RETURN(result);
1008 }
1009
1010 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1011                             loff_t *ppos)
1012 {
1013         struct lu_env *env;
1014         struct iovec  *local_iov;
1015         ssize_t        result;
1016         int            refcheck;
1017         ENTRY;
1018
1019         env = cl_env_get(&refcheck);
1020         if (IS_ERR(env))
1021                 RETURN(PTR_ERR(env));
1022
1023         local_iov = &vvp_env_info(env)->vti_local_iov;
1024         local_iov->iov_base = (void __user *)buf;
1025         local_iov->iov_len = count;
1026         result = ll_file_readv(file, local_iov, 1, ppos);
1027         cl_env_put(env, &refcheck);
1028         RETURN(result);
1029 }
1030
1031 #else
1032 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1033                                 unsigned long nr_segs, loff_t pos)
1034 {
1035         struct lu_env      *env;
1036         struct vvp_io_args *args;
1037         size_t              count;
1038         ssize_t             result;
1039         int                 refcheck;
1040         ENTRY;
1041
1042         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1043         if (result)
1044                 RETURN(result);
1045
1046         env = cl_env_get(&refcheck);
1047         if (IS_ERR(env))
1048                 RETURN(PTR_ERR(env));
1049
1050         args = vvp_env_args(env, IO_NORMAL);
1051         args->u.normal.via_iov = (struct iovec *)iov;
1052         args->u.normal.via_nrsegs = nr_segs;
1053         args->u.normal.via_iocb = iocb;
1054
1055         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1056                                     &iocb->ki_pos, count);
1057         cl_env_put(env, &refcheck);
1058         RETURN(result);
1059 }
1060
1061 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1062                             loff_t *ppos)
1063 {
1064         struct lu_env *env;
1065         struct iovec  *local_iov;
1066         struct kiocb  *kiocb;
1067         ssize_t        result;
1068         int            refcheck;
1069         ENTRY;
1070
1071         env = cl_env_get(&refcheck);
1072         if (IS_ERR(env))
1073                 RETURN(PTR_ERR(env));
1074
1075         local_iov = &vvp_env_info(env)->vti_local_iov;
1076         kiocb = &vvp_env_info(env)->vti_kiocb;
1077         local_iov->iov_base = (void __user *)buf;
1078         local_iov->iov_len = count;
1079         init_sync_kiocb(kiocb, file);
1080         kiocb->ki_pos = *ppos;
1081         kiocb->ki_left = count;
1082
1083         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1084         *ppos = kiocb->ki_pos;
1085
1086         cl_env_put(env, &refcheck);
1087         RETURN(result);
1088 }
1089 #endif
1090
1091 /*
1092  * Write to a file (through the page cache).
1093  */
1094 #ifdef HAVE_FILE_WRITEV
1095 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1096                               unsigned long nr_segs, loff_t *ppos)
1097 {
1098         struct lu_env      *env;
1099         struct vvp_io_args *args;
1100         size_t              count;
1101         ssize_t             result;
1102         int                 refcheck;
1103         ENTRY;
1104
1105         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1106         if (result)
1107                 RETURN(result);
1108
1109         env = cl_env_get(&refcheck);
1110         if (IS_ERR(env))
1111                 RETURN(PTR_ERR(env));
1112
1113         args = vvp_env_args(env, IO_NORMAL);
1114         args->u.normal.via_iov = (struct iovec *)iov;
1115         args->u.normal.via_nrsegs = nr_segs;
1116
1117         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1118         cl_env_put(env, &refcheck);
1119         RETURN(result);
1120 }
1121
1122 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1123                              loff_t *ppos)
1124 {
1125         struct lu_env    *env;
1126         struct iovec     *local_iov;
1127         ssize_t           result;
1128         int               refcheck;
1129         ENTRY;
1130
1131         env = cl_env_get(&refcheck);
1132         if (IS_ERR(env))
1133                 RETURN(PTR_ERR(env));
1134
1135         local_iov = &vvp_env_info(env)->vti_local_iov;
1136         local_iov->iov_base = (void __user *)buf;
1137         local_iov->iov_len = count;
1138
1139         result = ll_file_writev(file, local_iov, 1, ppos);
1140         cl_env_put(env, &refcheck);
1141         RETURN(result);
1142 }
1143
1144 #else /* AIO stuff */
1145 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1146                                  unsigned long nr_segs, loff_t pos)
1147 {
1148         struct lu_env      *env;
1149         struct vvp_io_args *args;
1150         size_t              count;
1151         ssize_t             result;
1152         int                 refcheck;
1153         ENTRY;
1154
1155         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1156         if (result)
1157                 RETURN(result);
1158
1159         env = cl_env_get(&refcheck);
1160         if (IS_ERR(env))
1161                 RETURN(PTR_ERR(env));
1162
1163         args = vvp_env_args(env, IO_NORMAL);
1164         args->u.normal.via_iov = (struct iovec *)iov;
1165         args->u.normal.via_nrsegs = nr_segs;
1166         args->u.normal.via_iocb = iocb;
1167
1168         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1169                                   &iocb->ki_pos, count);
1170         cl_env_put(env, &refcheck);
1171         RETURN(result);
1172 }
1173
1174 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1175                              loff_t *ppos)
1176 {
1177         struct lu_env *env;
1178         struct iovec  *local_iov;
1179         struct kiocb  *kiocb;
1180         ssize_t        result;
1181         int            refcheck;
1182         ENTRY;
1183
1184         env = cl_env_get(&refcheck);
1185         if (IS_ERR(env))
1186                 RETURN(PTR_ERR(env));
1187
1188         local_iov = &vvp_env_info(env)->vti_local_iov;
1189         kiocb = &vvp_env_info(env)->vti_kiocb;
1190         local_iov->iov_base = (void __user *)buf;
1191         local_iov->iov_len = count;
1192         init_sync_kiocb(kiocb, file);
1193         kiocb->ki_pos = *ppos;
1194         kiocb->ki_left = count;
1195
1196         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1197         *ppos = kiocb->ki_pos;
1198
1199         cl_env_put(env, &refcheck);
1200         RETURN(result);
1201 }
1202 #endif
1203
1204
1205 #ifdef HAVE_KERNEL_SENDFILE
1206 /*
1207  * Send file content (through pagecache) somewhere with helper
1208  */
1209 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1210                                 read_actor_t actor, void *target)
1211 {
1212         struct lu_env      *env;
1213         struct vvp_io_args *args;
1214         ssize_t             result;
1215         int                 refcheck;
1216         ENTRY;
1217
1218         env = cl_env_get(&refcheck);
1219         if (IS_ERR(env))
1220                 RETURN(PTR_ERR(env));
1221
1222         args = vvp_env_args(env, IO_SENDFILE);
1223         args->u.sendfile.via_target = target;
1224         args->u.sendfile.via_actor = actor;
1225
1226         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1227         cl_env_put(env, &refcheck);
1228         RETURN(result);
1229 }
1230 #endif
1231
1232 #ifdef HAVE_KERNEL_SPLICE_READ
1233 /*
1234  * Send file content (through pagecache) somewhere with helper
1235  */
1236 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1237                                    struct pipe_inode_info *pipe, size_t count,
1238                                    unsigned int flags)
1239 {
1240         struct lu_env      *env;
1241         struct vvp_io_args *args;
1242         ssize_t             result;
1243         int                 refcheck;
1244         ENTRY;
1245
1246         env = cl_env_get(&refcheck);
1247         if (IS_ERR(env))
1248                 RETURN(PTR_ERR(env));
1249
1250         args = vvp_env_args(env, IO_SPLICE);
1251         args->u.splice.via_pipe = pipe;
1252         args->u.splice.via_flags = flags;
1253
1254         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1255         cl_env_put(env, &refcheck);
1256         RETURN(result);
1257 }
1258 #endif
1259
1260 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1261                            obd_count ost_idx)
1262 {
1263         struct obd_export *exp = ll_i2dtexp(inode);
1264         struct obd_trans_info oti = { 0 };
1265         struct obdo *oa = NULL;
1266         int lsm_size;
1267         int rc = 0;
1268         struct lov_stripe_md *lsm = NULL, *lsm2;
1269         ENTRY;
1270
1271         OBDO_ALLOC(oa);
1272         if (oa == NULL)
1273                 RETURN(-ENOMEM);
1274
1275         lsm = ccc_inode_lsm_get(inode);
1276         if (lsm == NULL)
1277                 GOTO(out, rc = -ENOENT);
1278
1279         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1280                    (lsm->lsm_stripe_count));
1281
1282         OBD_ALLOC_LARGE(lsm2, lsm_size);
1283         if (lsm2 == NULL)
1284                 GOTO(out, rc = -ENOMEM);
1285
1286         oa->o_id = id;
1287         oa->o_seq = seq;
1288         oa->o_nlink = ost_idx;
1289         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1290         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1291         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1292                                    OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1293         obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1294         memcpy(lsm2, lsm, lsm_size);
1295         ll_inode_size_lock(inode);
1296         rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1297         ll_inode_size_unlock(inode);
1298
1299         OBD_FREE_LARGE(lsm2, lsm_size);
1300         GOTO(out, rc);
1301 out:
1302         ccc_inode_lsm_put(inode, lsm);
1303         OBDO_FREE(oa);
1304         return rc;
1305 }
1306
1307 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1308 {
1309         struct ll_recreate_obj ucreat;
1310         ENTRY;
1311
1312         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1313                 RETURN(-EPERM);
1314
1315         if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1316                            sizeof(ucreat)))
1317                 RETURN(-EFAULT);
1318
1319         RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1320                                ucreat.lrc_ost_idx));
1321 }
1322
1323 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1324 {
1325         struct lu_fid   fid;
1326         obd_id          id;
1327         obd_count       ost_idx;
1328         ENTRY;
1329
1330         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1331                 RETURN(-EPERM);
1332
1333         if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1334                 RETURN(-EFAULT);
1335
1336         id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1337         ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1338         RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
1339 }
1340
1341 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1342                              int flags, struct lov_user_md *lum, int lum_size)
1343 {
1344         struct lov_stripe_md *lsm = NULL;
1345         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1346         int rc = 0;
1347         ENTRY;
1348
1349         lsm = ccc_inode_lsm_get(inode);
1350         if (lsm != NULL) {
1351                 ccc_inode_lsm_put(inode, lsm);
1352                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1353                        inode->i_ino);
1354                 RETURN(-EEXIST);
1355         }
1356
1357         ll_inode_size_lock(inode);
1358         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1359         if (rc)
1360                 GOTO(out, rc);
1361         rc = oit.d.lustre.it_status;
1362         if (rc < 0)
1363                 GOTO(out_req_free, rc);
1364
1365         ll_release_openhandle(file->f_dentry, &oit);
1366
1367  out:
1368         ll_inode_size_unlock(inode);
1369         ll_intent_release(&oit);
1370         ccc_inode_lsm_put(inode, lsm);
1371         RETURN(rc);
1372 out_req_free:
1373         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1374         goto out;
1375 }
1376
1377 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1378                              struct lov_mds_md **lmmp, int *lmm_size,
1379                              struct ptlrpc_request **request)
1380 {
1381         struct ll_sb_info *sbi = ll_i2sbi(inode);
1382         struct mdt_body  *body;
1383         struct lov_mds_md *lmm = NULL;
1384         struct ptlrpc_request *req = NULL;
1385         struct md_op_data *op_data;
1386         int rc, lmmsize;
1387
1388         rc = ll_get_max_mdsize(sbi, &lmmsize);
1389         if (rc)
1390                 RETURN(rc);
1391
1392         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1393                                      strlen(filename), lmmsize,
1394                                      LUSTRE_OPC_ANY, NULL);
1395         if (IS_ERR(op_data))
1396                 RETURN(PTR_ERR(op_data));
1397
1398         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1399         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1400         ll_finish_md_op_data(op_data);
1401         if (rc < 0) {
1402                 CDEBUG(D_INFO, "md_getattr_name failed "
1403                        "on %s: rc %d\n", filename, rc);
1404                 GOTO(out, rc);
1405         }
1406
1407         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1408         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1409
1410         lmmsize = body->eadatasize;
1411
1412         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1413                         lmmsize == 0) {
1414                 GOTO(out, rc = -ENODATA);
1415         }
1416
1417         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1418         LASSERT(lmm != NULL);
1419
1420         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1421             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1422                 GOTO(out, rc = -EPROTO);
1423         }
1424
1425         /*
1426          * This is coming from the MDS, so is probably in
1427          * little endian.  We convert it to host endian before
1428          * passing it to userspace.
1429          */
1430         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1431                 /* if function called for directory - we should
1432                  * avoid swab not existent lsm objects */
1433                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1434                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1435                         if (S_ISREG(body->mode))
1436                                 lustre_swab_lov_user_md_objects(
1437                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1438                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1439                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1440                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1441                         if (S_ISREG(body->mode))
1442                                 lustre_swab_lov_user_md_objects(
1443                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1444                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1445                 }
1446         }
1447
1448 out:
1449         *lmmp = lmm;
1450         *lmm_size = lmmsize;
1451         *request = req;
1452         return rc;
1453 }
1454
1455 static int ll_lov_setea(struct inode *inode, struct file *file,
1456                             unsigned long arg)
1457 {
1458         int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1459         struct lov_user_md      *lump;
1460         int                      lum_size = sizeof(struct lov_user_md) +
1461                                             sizeof(struct lov_user_ost_data);
1462         int                      rc;
1463         ENTRY;
1464
1465         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1466                 RETURN(-EPERM);
1467
1468         OBD_ALLOC_LARGE(lump, lum_size);
1469         if (lump == NULL)
1470                 RETURN(-ENOMEM);
1471
1472         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1473                 OBD_FREE_LARGE(lump, lum_size);
1474                 RETURN(-EFAULT);
1475         }
1476
1477         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1478
1479         OBD_FREE_LARGE(lump, lum_size);
1480         RETURN(rc);
1481 }
1482
1483 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1484                             unsigned long arg)
1485 {
1486         struct lov_user_md_v3    lumv3;
1487         struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1488         struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1489         struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1490         int                      lum_size, rc;
1491         int                      flags = FMODE_WRITE;
1492         ENTRY;
1493
1494         /* first try with v1 which is smaller than v3 */
1495         lum_size = sizeof(struct lov_user_md_v1);
1496         if (copy_from_user(lumv1, lumv1p, lum_size))
1497                 RETURN(-EFAULT);
1498
1499         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1500                 lum_size = sizeof(struct lov_user_md_v3);
1501                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1502                         RETURN(-EFAULT);
1503         }
1504
1505         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1506         if (rc == 0) {
1507                 struct lov_stripe_md *lsm;
1508                 __u32 gen;
1509
1510                 put_user(0, &lumv1p->lmm_stripe_count);
1511
1512                 ll_layout_refresh(inode, &gen);
1513                 lsm = ccc_inode_lsm_get(inode);
1514                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1515                                    0, lsm, (void *)arg);
1516                 ccc_inode_lsm_put(inode, lsm);
1517         }
1518         RETURN(rc);
1519 }
1520
1521 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1522 {
1523         struct lov_stripe_md *lsm;
1524         int rc = -ENODATA;
1525         ENTRY;
1526
1527         lsm = ccc_inode_lsm_get(inode);
1528         if (lsm != NULL)
1529                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1530                                    lsm, (void *)arg);
1531         ccc_inode_lsm_put(inode, lsm);
1532         RETURN(rc);
1533 }
1534
1535 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1536 {
1537         struct ll_inode_info   *lli = ll_i2info(inode);
1538         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1539         struct ccc_grouplock    grouplock;
1540         int                     rc;
1541         ENTRY;
1542
1543         if (ll_file_nolock(file))
1544                 RETURN(-EOPNOTSUPP);
1545
1546         spin_lock(&lli->lli_lock);
1547         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1548                 CWARN("group lock already existed with gid %lu\n",
1549                       fd->fd_grouplock.cg_gid);
1550                 spin_unlock(&lli->lli_lock);
1551                 RETURN(-EINVAL);
1552         }
1553         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1554         spin_unlock(&lli->lli_lock);
1555
1556         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1557                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1558         if (rc)
1559                 RETURN(rc);
1560
1561         spin_lock(&lli->lli_lock);
1562         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1563                 spin_unlock(&lli->lli_lock);
1564                 CERROR("another thread just won the race\n");
1565                 cl_put_grouplock(&grouplock);
1566                 RETURN(-EINVAL);
1567         }
1568
1569         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1570         fd->fd_grouplock = grouplock;
1571         spin_unlock(&lli->lli_lock);
1572
1573         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1574         RETURN(0);
1575 }
1576
1577 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1578 {
1579         struct ll_inode_info   *lli = ll_i2info(inode);
1580         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1581         struct ccc_grouplock    grouplock;
1582         ENTRY;
1583
1584         spin_lock(&lli->lli_lock);
1585         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1586                 spin_unlock(&lli->lli_lock);
1587                 CWARN("no group lock held\n");
1588                 RETURN(-EINVAL);
1589         }
1590         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1591
1592         if (fd->fd_grouplock.cg_gid != arg) {
1593                 CWARN("group lock %lu doesn't match current id %lu\n",
1594                        arg, fd->fd_grouplock.cg_gid);
1595                 spin_unlock(&lli->lli_lock);
1596                 RETURN(-EINVAL);
1597         }
1598
1599         grouplock = fd->fd_grouplock;
1600         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1601         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1602         spin_unlock(&lli->lli_lock);
1603
1604         cl_put_grouplock(&grouplock);
1605         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1606         RETURN(0);
1607 }
1608
1609 /**
1610  * Close inode open handle
1611  *
1612  * \param dentry [in]     dentry which contains the inode
1613  * \param it     [in,out] intent which contains open info and result
1614  *
1615  * \retval 0     success
1616  * \retval <0    failure
1617  */
1618 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1619 {
1620         struct inode *inode = dentry->d_inode;
1621         struct obd_client_handle *och;
1622         int rc;
1623         ENTRY;
1624
1625         LASSERT(inode);
1626
1627         /* Root ? Do nothing. */
1628         if (dentry->d_inode->i_sb->s_root == dentry)
1629                 RETURN(0);
1630
1631         /* No open handle to close? Move away */
1632         if (!it_disposition(it, DISP_OPEN_OPEN))
1633                 RETURN(0);
1634
1635         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1636
1637         OBD_ALLOC(och, sizeof(*och));
1638         if (!och)
1639                 GOTO(out, rc = -ENOMEM);
1640
1641         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1642                     ll_i2info(inode), it, och);
1643
1644         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1645                                        inode, och);
1646  out:
1647         /* this one is in place of ll_file_open */
1648         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1649                 ptlrpc_req_finished(it->d.lustre.it_data);
1650                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1651         }
1652         RETURN(rc);
1653 }
1654
1655 /**
1656  * Get size for inode for which FIEMAP mapping is requested.
1657  * Make the FIEMAP get_info call and returns the result.
1658  */
1659 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1660               int num_bytes)
1661 {
1662         struct obd_export *exp = ll_i2dtexp(inode);
1663         struct lov_stripe_md *lsm = NULL;
1664         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1665         int vallen = num_bytes;
1666         int rc;
1667         ENTRY;
1668
1669         /* Checks for fiemap flags */
1670         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1671                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1672                 return -EBADR;
1673         }
1674
1675         /* Check for FIEMAP_FLAG_SYNC */
1676         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1677                 rc = filemap_fdatawrite(inode->i_mapping);
1678                 if (rc)
1679                         return rc;
1680         }
1681
1682         lsm = ccc_inode_lsm_get(inode);
1683         if (lsm == NULL)
1684                 return -ENOENT;
1685
1686         /* If the stripe_count > 1 and the application does not understand
1687          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1688          */
1689         if (lsm->lsm_stripe_count > 1 &&
1690             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1691                 GOTO(out, rc = -EOPNOTSUPP);
1692
1693         fm_key.oa.o_oi = lsm->lsm_oi;
1694         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1695
1696         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1697         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1698         /* If filesize is 0, then there would be no objects for mapping */
1699         if (fm_key.oa.o_size == 0) {
1700                 fiemap->fm_mapped_extents = 0;
1701                 GOTO(out, rc = 0);
1702         }
1703
1704         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1705
1706         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1707                           fiemap, lsm);
1708         if (rc)
1709                 CERROR("obd_get_info failed: rc = %d\n", rc);
1710
1711 out:
1712         ccc_inode_lsm_put(inode, lsm);
1713         RETURN(rc);
1714 }
1715
1716 int ll_fid2path(struct inode *inode, void *arg)
1717 {
1718         struct obd_export       *exp = ll_i2mdexp(inode);
1719         struct getinfo_fid2path *gfout, *gfin;
1720         int                      outsize, rc;
1721         ENTRY;
1722
1723         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1724             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1725                 RETURN(-EPERM);
1726
1727         /* Need to get the buflen */
1728         OBD_ALLOC_PTR(gfin);
1729         if (gfin == NULL)
1730                 RETURN(-ENOMEM);
1731         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1732                 OBD_FREE_PTR(gfin);
1733                 RETURN(-EFAULT);
1734         }
1735
1736         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1737         OBD_ALLOC(gfout, outsize);
1738         if (gfout == NULL) {
1739                 OBD_FREE_PTR(gfin);
1740                 RETURN(-ENOMEM);
1741         }
1742         memcpy(gfout, gfin, sizeof(*gfout));
1743         OBD_FREE_PTR(gfin);
1744
1745         /* Call mdc_iocontrol */
1746         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1747         if (rc)
1748                 GOTO(gf_free, rc);
1749
1750         if (copy_to_user(arg, gfout, outsize))
1751                 rc = -EFAULT;
1752
1753 gf_free:
1754         OBD_FREE(gfout, outsize);
1755         RETURN(rc);
1756 }
1757
1758 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1759 {
1760         struct ll_user_fiemap *fiemap_s;
1761         size_t num_bytes, ret_bytes;
1762         unsigned int extent_count;
1763         int rc = 0;
1764
1765         /* Get the extent count so we can calculate the size of
1766          * required fiemap buffer */
1767         if (get_user(extent_count,
1768             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1769                 RETURN(-EFAULT);
1770         num_bytes = sizeof(*fiemap_s) + (extent_count *
1771                                          sizeof(struct ll_fiemap_extent));
1772
1773         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1774         if (fiemap_s == NULL)
1775                 RETURN(-ENOMEM);
1776
1777         /* get the fiemap value */
1778         if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1779                            sizeof(*fiemap_s)))
1780                 GOTO(error, rc = -EFAULT);
1781
1782         /* If fm_extent_count is non-zero, read the first extent since
1783          * it is used to calculate end_offset and device from previous
1784          * fiemap call. */
1785         if (extent_count) {
1786                 if (copy_from_user(&fiemap_s->fm_extents[0],
1787                     (char __user *)arg + sizeof(*fiemap_s),
1788                     sizeof(struct ll_fiemap_extent)))
1789                         GOTO(error, rc = -EFAULT);
1790         }
1791
1792         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1793         if (rc)
1794                 GOTO(error, rc);
1795
1796         ret_bytes = sizeof(struct ll_user_fiemap);
1797
1798         if (extent_count != 0)
1799                 ret_bytes += (fiemap_s->fm_mapped_extents *
1800                                  sizeof(struct ll_fiemap_extent));
1801
1802         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1803                 rc = -EFAULT;
1804
1805 error:
1806         OBD_FREE_LARGE(fiemap_s, num_bytes);
1807         RETURN(rc);
1808 }
1809
1810 /*
1811  * Read the data_version for inode.
1812  *
1813  * This value is computed using stripe object version on OST.
1814  * Version is computed using server side locking.
1815  *
1816  * @param extent_lock  Take extent lock. Not needed if a process is already
1817  *                     holding the OST object group locks.
1818  */
1819 int ll_data_version(struct inode *inode, __u64 *data_version,
1820                     int extent_lock)
1821 {
1822         struct lov_stripe_md    *lsm = NULL;
1823         struct ll_sb_info       *sbi = ll_i2sbi(inode);
1824         struct obdo             *obdo = NULL;
1825         int                      rc;
1826         ENTRY;
1827
1828         /* If no stripe, we consider version is 0. */
1829         lsm = ccc_inode_lsm_get(inode);
1830         if (lsm == NULL) {
1831                 *data_version = 0;
1832                 CDEBUG(D_INODE, "No object for inode\n");
1833                 RETURN(0);
1834         }
1835
1836         OBD_ALLOC_PTR(obdo);
1837         if (obdo == NULL) {
1838                 ccc_inode_lsm_put(inode, lsm);
1839                 RETURN(-ENOMEM);
1840         }
1841
1842         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1843         if (!rc) {
1844                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1845                         rc = -EOPNOTSUPP;
1846                 else
1847                         *data_version = obdo->o_data_version;
1848         }
1849
1850         OBD_FREE_PTR(obdo);
1851         ccc_inode_lsm_put(inode, lsm);
1852
1853         RETURN(rc);
1854 }
1855
1856 struct ll_swap_stack {
1857         struct iattr             ia1, ia2;
1858         __u64                    dv1, dv2;
1859         struct inode            *inode1, *inode2;
1860         bool                     check_dv1, check_dv2;
1861 };
1862
1863 static int ll_swap_layouts(struct file *file1, struct file *file2,
1864                            struct lustre_swap_layouts *lsl)
1865 {
1866         struct mdc_swap_layouts  msl;
1867         struct md_op_data       *op_data;
1868         __u32                    gid;
1869         __u64                    dv;
1870         struct ll_swap_stack    *llss = NULL;
1871         int                      rc, rc1;
1872
1873         OBD_ALLOC_PTR(llss);
1874         if (llss == NULL)
1875                 RETURN(-ENOMEM);
1876
1877         llss->inode1 = file1->f_dentry->d_inode;
1878         llss->inode2 = file2->f_dentry->d_inode;
1879
1880         if (!S_ISREG(llss->inode2->i_mode))
1881                 GOTO(free, rc = -EINVAL);
1882
1883         if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1884             ll_permission(llss->inode2, MAY_WRITE, NULL))
1885                 GOTO(free, rc = -EPERM);
1886
1887         if (llss->inode2->i_sb != llss->inode1->i_sb)
1888                 GOTO(free, rc = -EXDEV);
1889
1890         /* we use 2 bool because it is easier to swap than 2 bits */
1891         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1892                 llss->check_dv1 = true;
1893
1894         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1895                 llss->check_dv2 = true;
1896
1897         /* we cannot use lsl->sl_dvX directly because we may swap them */
1898         llss->dv1 = lsl->sl_dv1;
1899         llss->dv2 = lsl->sl_dv2;
1900
1901         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1902         if (rc == 0) /* same file, done! */
1903                 GOTO(free, rc = 0);
1904
1905         if (rc < 0) { /* sequentialize it */
1906                 swap(llss->inode1, llss->inode2);
1907                 swap(file1, file2);
1908                 swap(llss->dv1, llss->dv2);
1909                 swap(llss->check_dv1, llss->check_dv2);
1910         }
1911
1912         gid = lsl->sl_gid;
1913         if (gid != 0) { /* application asks to flush dirty cache */
1914                 rc = ll_get_grouplock(llss->inode1, file1, gid);
1915                 if (rc < 0)
1916                         GOTO(free, rc);
1917
1918                 rc = ll_get_grouplock(llss->inode2, file2, gid);
1919                 if (rc < 0) {
1920                         ll_put_grouplock(llss->inode1, file1, gid);
1921                         GOTO(free, rc);
1922                 }
1923         }
1924
1925         /* to be able to restore mtime and atime after swap
1926          * we need to first save them */
1927         if (lsl->sl_flags &
1928             (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1929                 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1930                 llss->ia1.ia_atime = llss->inode1->i_atime;
1931                 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1932                 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1933                 llss->ia2.ia_atime = llss->inode2->i_atime;
1934                 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1935         }
1936
1937         /* ultimate check, before swaping the layouts we check if
1938          * dataversion has changed (if requested) */
1939         if (llss->check_dv1) {
1940                 rc = ll_data_version(llss->inode1, &dv, 0);
1941                 if (rc)
1942                         GOTO(putgl, rc);
1943                 if (dv != llss->dv1)
1944                         GOTO(putgl, rc = -EAGAIN);
1945         }
1946
1947         if (llss->check_dv2) {
1948                 rc = ll_data_version(llss->inode2, &dv, 0);
1949                 if (rc)
1950                         GOTO(putgl, rc);
1951                 if (dv != llss->dv2)
1952                         GOTO(putgl, rc = -EAGAIN);
1953         }
1954
1955         /* struct md_op_data is used to send the swap args to the mdt
1956          * only flags is missing, so we use struct mdc_swap_layouts
1957          * through the md_op_data->op_data */
1958         /* flags from user space have to be converted before they are send to
1959          * server, no flag is sent today, they are only used on the client */
1960         msl.msl_flags = 0;
1961         rc = -ENOMEM;
1962         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1963                                      0, LUSTRE_OPC_ANY, &msl);
1964         if (op_data != NULL) {
1965                 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
1966                                    ll_i2mdexp(llss->inode1),
1967                                    sizeof(*op_data), op_data, NULL);
1968                 ll_finish_md_op_data(op_data);
1969         }
1970
1971 putgl:
1972         if (gid != 0) {
1973                 ll_put_grouplock(llss->inode2, file2, gid);
1974                 ll_put_grouplock(llss->inode1, file1, gid);
1975         }
1976
1977         /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1978         if (rc != 0)
1979                 GOTO(free, rc);
1980
1981         /* clear useless flags */
1982         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1983                 llss->ia1.ia_valid &= ~ATTR_MTIME;
1984                 llss->ia2.ia_valid &= ~ATTR_MTIME;
1985         }
1986
1987         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1988                 llss->ia1.ia_valid &= ~ATTR_ATIME;
1989                 llss->ia2.ia_valid &= ~ATTR_ATIME;
1990         }
1991
1992         /* update time if requested */
1993         rc = rc1 = 0;
1994         if (llss->ia2.ia_valid != 0)
1995                 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1996
1997         if (llss->ia1.ia_valid != 0)
1998                 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
1999
2000 free:
2001         if (llss != NULL)
2002                 OBD_FREE_PTR(llss);
2003
2004         RETURN(rc);
2005 }
2006
2007 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2008 {
2009         struct inode            *inode = file->f_dentry->d_inode;
2010         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2011         int                      flags, rc;
2012         ENTRY;
2013
2014         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2015                inode->i_generation, inode, cmd);
2016         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2017
2018         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2019         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2020                 RETURN(-ENOTTY);
2021
2022         switch(cmd) {
2023         case LL_IOC_GETFLAGS:
2024                 /* Get the current value of the file flags */
2025                 return put_user(fd->fd_flags, (int *)arg);
2026         case LL_IOC_SETFLAGS:
2027         case LL_IOC_CLRFLAGS:
2028                 /* Set or clear specific file flags */
2029                 /* XXX This probably needs checks to ensure the flags are
2030                  *     not abused, and to handle any flag side effects.
2031                  */
2032                 if (get_user(flags, (int *) arg))
2033                         RETURN(-EFAULT);
2034
2035                 if (cmd == LL_IOC_SETFLAGS) {
2036                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2037                             !(file->f_flags & O_DIRECT)) {
2038                                 CERROR("%s: unable to disable locking on "
2039                                        "non-O_DIRECT file\n", current->comm);
2040                                 RETURN(-EINVAL);
2041                         }
2042
2043                         fd->fd_flags |= flags;
2044                 } else {
2045                         fd->fd_flags &= ~flags;
2046                 }
2047                 RETURN(0);
2048         case LL_IOC_LOV_SETSTRIPE:
2049                 RETURN(ll_lov_setstripe(inode, file, arg));
2050         case LL_IOC_LOV_SETEA:
2051                 RETURN(ll_lov_setea(inode, file, arg));
2052         case LL_IOC_LOV_SWAP_LAYOUTS: {
2053                 struct file *file2;
2054                 struct lustre_swap_layouts lsl;
2055
2056                 if (cfs_copy_from_user(&lsl, (char *)arg,
2057                                        sizeof(struct lustre_swap_layouts)))
2058                         RETURN(-EFAULT);
2059
2060                 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2061                         RETURN(-EPERM);
2062
2063                 file2 = fget(lsl.sl_fd);
2064                 if (file2 == NULL)
2065                         RETURN(-EBADF);
2066
2067                 rc = -EPERM;
2068                 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2069                         rc = ll_swap_layouts(file, file2, &lsl);
2070                 fput(file2);
2071                 RETURN(rc);
2072         }
2073         case LL_IOC_LOV_GETSTRIPE:
2074                 RETURN(ll_lov_getstripe(inode, arg));
2075         case LL_IOC_RECREATE_OBJ:
2076                 RETURN(ll_lov_recreate_obj(inode, arg));
2077         case LL_IOC_RECREATE_FID:
2078                 RETURN(ll_lov_recreate_fid(inode, arg));
2079         case FSFILT_IOC_FIEMAP:
2080                 RETURN(ll_ioctl_fiemap(inode, arg));
2081         case FSFILT_IOC_GETFLAGS:
2082         case FSFILT_IOC_SETFLAGS:
2083                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2084         case FSFILT_IOC_GETVERSION_OLD:
2085         case FSFILT_IOC_GETVERSION:
2086                 RETURN(put_user(inode->i_generation, (int *)arg));
2087         case LL_IOC_GROUP_LOCK:
2088                 RETURN(ll_get_grouplock(inode, file, arg));
2089         case LL_IOC_GROUP_UNLOCK:
2090                 RETURN(ll_put_grouplock(inode, file, arg));
2091         case IOC_OBD_STATFS:
2092                 RETURN(ll_obd_statfs(inode, (void *)arg));
2093
2094         /* We need to special case any other ioctls we want to handle,
2095          * to send them to the MDS/OST as appropriate and to properly
2096          * network encode the arg field.
2097         case FSFILT_IOC_SETVERSION_OLD:
2098         case FSFILT_IOC_SETVERSION:
2099         */
2100         case LL_IOC_FLUSHCTX:
2101                 RETURN(ll_flush_ctx(inode));
2102         case LL_IOC_PATH2FID: {
2103                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2104                                  sizeof(struct lu_fid)))
2105                         RETURN(-EFAULT);
2106
2107                 RETURN(0);
2108         }
2109         case OBD_IOC_FID2PATH:
2110                 RETURN(ll_fid2path(inode, (void *)arg));
2111         case LL_IOC_DATA_VERSION: {
2112                 struct ioc_data_version idv;
2113                 int                     rc;
2114
2115                 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2116                         RETURN(-EFAULT);
2117
2118                 rc = ll_data_version(inode, &idv.idv_version,
2119                                 !(idv.idv_flags & LL_DV_NOFLUSH));
2120
2121                 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2122                         RETURN(-EFAULT);
2123
2124                 RETURN(rc);
2125         }
2126
2127         case LL_IOC_GET_MDTIDX: {
2128                 int mdtidx;
2129
2130                 mdtidx = ll_get_mdt_idx(inode);
2131                 if (mdtidx < 0)
2132                         RETURN(mdtidx);
2133
2134                 if (put_user((int)mdtidx, (int*)arg))
2135                         RETURN(-EFAULT);
2136
2137                 RETURN(0);
2138         }
2139         case OBD_IOC_GETDTNAME:
2140         case OBD_IOC_GETMDNAME:
2141                 RETURN(ll_get_obd_name(inode, cmd, arg));
2142         case LL_IOC_HSM_STATE_GET: {
2143                 struct md_op_data       *op_data;
2144                 struct hsm_user_state   *hus;
2145                 int                      rc;
2146
2147                 OBD_ALLOC_PTR(hus);
2148                 if (hus == NULL)
2149                         RETURN(-ENOMEM);
2150
2151                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2152                                              LUSTRE_OPC_ANY, hus);
2153                 if (op_data == NULL) {
2154                         OBD_FREE_PTR(hus);
2155                         RETURN(-ENOMEM);
2156                 }
2157
2158                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2159                                    op_data, NULL);
2160
2161                 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2162                         rc = -EFAULT;
2163
2164                 ll_finish_md_op_data(op_data);
2165                 OBD_FREE_PTR(hus);
2166                 RETURN(rc);
2167         }
2168         case LL_IOC_HSM_STATE_SET: {
2169                 struct md_op_data       *op_data;
2170                 struct hsm_state_set    *hss;
2171                 int                      rc;
2172
2173                 OBD_ALLOC_PTR(hss);
2174                 if (hss == NULL)
2175                         RETURN(-ENOMEM);
2176                 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2177                         OBD_FREE_PTR(hss);
2178                         RETURN(-EFAULT);
2179                 }
2180
2181                 /* Non-root users are forbidden to set or clear flags which are
2182                  * NOT defined in HSM_USER_MASK. */
2183                 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2184                     && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2185                         OBD_FREE_PTR(hss);
2186                         RETURN(-EPERM);
2187                 }
2188
2189                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2190                                              LUSTRE_OPC_ANY, hss);
2191                 if (op_data == NULL) {
2192                         OBD_FREE_PTR(hss);
2193                         RETURN(-ENOMEM);
2194                 }
2195
2196                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2197                                    op_data, NULL);
2198
2199                 ll_finish_md_op_data(op_data);
2200
2201                 OBD_FREE_PTR(hss);
2202                 RETURN(rc);
2203         }
2204         case LL_IOC_HSM_ACTION: {
2205                 struct md_op_data               *op_data;
2206                 struct hsm_current_action       *hca;
2207                 int                              rc;
2208
2209                 OBD_ALLOC_PTR(hca);
2210                 if (hca == NULL)
2211                         RETURN(-ENOMEM);
2212
2213                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2214                                              LUSTRE_OPC_ANY, hca);
2215                 if (op_data == NULL) {
2216                         OBD_FREE_PTR(hca);
2217                         RETURN(-ENOMEM);
2218                 }
2219
2220                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2221                                    op_data, NULL);
2222
2223                 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2224                         rc = -EFAULT;
2225
2226                 ll_finish_md_op_data(op_data);
2227                 OBD_FREE_PTR(hca);
2228                 RETURN(rc);
2229         }
2230         default: {
2231                 int err;
2232
2233                 if (LLIOC_STOP ==
2234                      ll_iocontrol_call(inode, file, cmd, arg, &err))
2235                         RETURN(err);
2236
2237                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2238                                      (void *)arg));
2239         }
2240         }
2241 }
2242
2243 #ifndef HAVE_FILE_LLSEEK_SIZE
2244 static inline loff_t
2245 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2246 {
2247         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2248                 return -EINVAL;
2249         if (offset > maxsize)
2250                 return -EINVAL;
2251
2252         if (offset != file->f_pos) {
2253                 file->f_pos = offset;
2254                 file->f_version = 0;
2255         }
2256         return offset;
2257 }
2258
2259 static loff_t
2260 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2261                 loff_t maxsize, loff_t eof)
2262 {
2263         struct inode *inode = file->f_dentry->d_inode;
2264
2265         switch (origin) {
2266         case SEEK_END:
2267                 offset += eof;
2268                 break;
2269         case SEEK_CUR:
2270                 /*
2271                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2272                  * position-querying operation.  Avoid rewriting the "same"
2273                  * f_pos value back to the file because a concurrent read(),
2274                  * write() or lseek() might have altered it
2275                  */
2276                 if (offset == 0)
2277                         return file->f_pos;
2278                 /*
2279                  * f_lock protects against read/modify/write race with other
2280                  * SEEK_CURs. Note that parallel writes and reads behave
2281                  * like SEEK_SET.
2282                  */
2283                 mutex_lock(&inode->i_mutex);
2284                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2285                 mutex_unlock(&inode->i_mutex);
2286                 return offset;
2287         case SEEK_DATA:
2288                 /*
2289                  * In the generic case the entire file is data, so as long as
2290                  * offset isn't at the end of the file then the offset is data.
2291                  */
2292                 if (offset >= eof)
2293                         return -ENXIO;
2294                 break;
2295         case SEEK_HOLE:
2296                 /*
2297                  * There is a virtual hole at the end of the file, so as long as
2298                  * offset isn't i_size or larger, return i_size.
2299                  */
2300                 if (offset >= eof)
2301                         return -ENXIO;
2302                 offset = eof;
2303                 break;
2304         }
2305
2306         return llseek_execute(file, offset, maxsize);
2307 }
2308 #endif
2309
2310 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2311 {
2312         struct inode *inode = file->f_dentry->d_inode;
2313         loff_t retval, eof = 0;
2314
2315         ENTRY;
2316         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2317                            (origin == SEEK_CUR) ? file->f_pos : 0);
2318         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2319                inode->i_ino, inode->i_generation, inode, retval, retval,
2320                origin);
2321         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2322
2323         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2324                 retval = ll_glimpse_size(inode);
2325                 if (retval != 0)
2326                         RETURN(retval);
2327                 eof = i_size_read(inode);
2328         }
2329
2330         retval = ll_generic_file_llseek_size(file, offset, origin,
2331                                           ll_file_maxbytes(inode), eof);
2332         RETURN(retval);
2333 }
2334
2335 int ll_flush(struct file *file, fl_owner_t id)
2336 {
2337         struct inode *inode = file->f_dentry->d_inode;
2338         struct ll_inode_info *lli = ll_i2info(inode);
2339         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2340         int rc, err;
2341
2342         LASSERT(!S_ISDIR(inode->i_mode));
2343
2344         /* catch async errors that were recorded back when async writeback
2345          * failed for pages in this mapping. */
2346         rc = lli->lli_async_rc;
2347         lli->lli_async_rc = 0;
2348         err = lov_read_and_clear_async_rc(lli->lli_clob);
2349         if (rc == 0)
2350                 rc = err;
2351
2352         /* The application has been told write failure already.
2353          * Do not report failure again. */
2354         if (fd->fd_write_failed)
2355                 return 0;
2356         return rc ? -EIO : 0;
2357 }
2358
2359 /**
2360  * Called to make sure a portion of file has been written out.
2361  * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2362  *
2363  * Return how many pages have been written.
2364  */
2365 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2366                        enum cl_fsync_mode mode)
2367 {
2368         struct cl_env_nest nest;
2369         struct lu_env *env;
2370         struct cl_io *io;
2371         struct obd_capa *capa = NULL;
2372         struct cl_fsync_io *fio;
2373         int result;
2374         ENTRY;
2375
2376         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2377             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2378                 RETURN(-EINVAL);
2379
2380         env = cl_env_nested_get(&nest);
2381         if (IS_ERR(env))
2382                 RETURN(PTR_ERR(env));
2383
2384         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2385
2386         io = ccc_env_thread_io(env);
2387         io->ci_obj = cl_i2info(inode)->lli_clob;
2388         io->ci_ignore_layout = 1;
2389
2390         /* initialize parameters for sync */
2391         fio = &io->u.ci_fsync;
2392         fio->fi_capa = capa;
2393         fio->fi_start = start;
2394         fio->fi_end = end;
2395         fio->fi_fid = ll_inode2fid(inode);
2396         fio->fi_mode = mode;
2397         fio->fi_nr_written = 0;
2398
2399         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2400                 result = cl_io_loop(env, io);
2401         else
2402                 result = io->ci_result;
2403         if (result == 0)
2404                 result = fio->fi_nr_written;
2405         cl_io_fini(env, io);
2406         cl_env_nested_put(&nest, env);
2407
2408         capa_put(capa);
2409
2410         RETURN(result);
2411 }
2412
2413 /*
2414  * When dentry is provided (the 'else' case), *file->f_dentry may be
2415  * null and dentry must be used directly rather than pulled from
2416  * *file->f_dentry as is done otherwise.
2417  */
2418
2419 #ifdef HAVE_FILE_FSYNC_4ARGS
2420 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2421 {
2422         struct dentry *dentry = file->f_dentry;
2423 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2424 int ll_fsync(struct file *file, int data)
2425 {
2426         struct dentry *dentry = file->f_dentry;
2427 #else
2428 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2429 {
2430 #endif
2431         struct inode *inode = dentry->d_inode;
2432         struct ll_inode_info *lli = ll_i2info(inode);
2433         struct ptlrpc_request *req;
2434         struct obd_capa *oc;
2435         int rc, err;
2436         ENTRY;
2437
2438         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2439                inode->i_generation, inode);
2440         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2441
2442 #ifdef HAVE_FILE_FSYNC_4ARGS
2443         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2444         mutex_lock(&inode->i_mutex);
2445 #else
2446         /* fsync's caller has already called _fdata{sync,write}, we want
2447          * that IO to finish before calling the osc and mdc sync methods */
2448         rc = filemap_fdatawait(inode->i_mapping);
2449 #endif
2450
2451         /* catch async errors that were recorded back when async writeback
2452          * failed for pages in this mapping. */
2453         if (!S_ISDIR(inode->i_mode)) {
2454                 err = lli->lli_async_rc;
2455                 lli->lli_async_rc = 0;
2456                 if (rc == 0)
2457                         rc = err;
2458                 err = lov_read_and_clear_async_rc(lli->lli_clob);
2459                 if (rc == 0)
2460                         rc = err;
2461         }
2462
2463         oc = ll_mdscapa_get(inode);
2464         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2465                       &req);
2466         capa_put(oc);
2467         if (!rc)
2468                 rc = err;
2469         if (!err)
2470                 ptlrpc_req_finished(req);
2471
2472         if (data) {
2473                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2474
2475                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2476                                 CL_FSYNC_ALL);
2477                 if (rc == 0 && err < 0)
2478                         rc = err;
2479                 if (rc < 0)
2480                         fd->fd_write_failed = true;
2481                 else
2482                         fd->fd_write_failed = false;
2483         }
2484
2485 #ifdef HAVE_FILE_FSYNC_4ARGS
2486         mutex_unlock(&inode->i_mutex);
2487 #endif
2488         RETURN(rc);
2489 }
2490
2491 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2492 {
2493         struct inode *inode = file->f_dentry->d_inode;
2494         struct ll_sb_info *sbi = ll_i2sbi(inode);
2495         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2496                                            .ei_cb_cp =ldlm_flock_completion_ast,
2497                                            .ei_cbdata = file_lock };
2498         struct md_op_data *op_data;
2499         struct lustre_handle lockh = {0};
2500         ldlm_policy_data_t flock = {{0}};
2501         int flags = 0;
2502         int rc;
2503         int rc2 = 0;
2504         ENTRY;
2505
2506         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2507                inode->i_ino, file_lock);
2508
2509         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2510
2511         if (file_lock->fl_flags & FL_FLOCK) {
2512                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2513                 /* flocks are whole-file locks */
2514                 flock.l_flock.end = OFFSET_MAX;
2515                 /* For flocks owner is determined by the local file desctiptor*/
2516                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2517         } else if (file_lock->fl_flags & FL_POSIX) {
2518                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2519                 flock.l_flock.start = file_lock->fl_start;
2520                 flock.l_flock.end = file_lock->fl_end;
2521         } else {
2522                 RETURN(-EINVAL);
2523         }
2524         flock.l_flock.pid = file_lock->fl_pid;
2525
2526         /* Somewhat ugly workaround for svc lockd.
2527          * lockd installs custom fl_lmops->lm_compare_owner that checks
2528          * for the fl_owner to be the same (which it always is on local node
2529          * I guess between lockd processes) and then compares pid.
2530          * As such we assign pid to the owner field to make it all work,
2531          * conflict with normal locks is unlikely since pid space and
2532          * pointer space for current->files are not intersecting */
2533         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2534                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2535
2536         switch (file_lock->fl_type) {
2537         case F_RDLCK:
2538                 einfo.ei_mode = LCK_PR;
2539                 break;
2540         case F_UNLCK:
2541                 /* An unlock request may or may not have any relation to
2542                  * existing locks so we may not be able to pass a lock handle
2543                  * via a normal ldlm_lock_cancel() request. The request may even
2544                  * unlock a byte range in the middle of an existing lock. In
2545                  * order to process an unlock request we need all of the same
2546                  * information that is given with a normal read or write record
2547                  * lock request. To avoid creating another ldlm unlock (cancel)
2548                  * message we'll treat a LCK_NL flock request as an unlock. */
2549                 einfo.ei_mode = LCK_NL;
2550                 break;
2551         case F_WRLCK:
2552                 einfo.ei_mode = LCK_PW;
2553                 break;
2554         default:
2555                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2556                         file_lock->fl_type);
2557                 RETURN (-ENOTSUPP);
2558         }
2559
2560         switch (cmd) {
2561         case F_SETLKW:
2562 #ifdef F_SETLKW64
2563         case F_SETLKW64:
2564 #endif
2565                 flags = 0;
2566                 break;
2567         case F_SETLK:
2568 #ifdef F_SETLK64
2569         case F_SETLK64:
2570 #endif
2571                 flags = LDLM_FL_BLOCK_NOWAIT;
2572                 break;
2573         case F_GETLK:
2574 #ifdef F_GETLK64
2575         case F_GETLK64:
2576 #endif
2577                 flags = LDLM_FL_TEST_LOCK;
2578                 /* Save the old mode so that if the mode in the lock changes we
2579                  * can decrement the appropriate reader or writer refcount. */
2580                 file_lock->fl_type = einfo.ei_mode;
2581                 break;
2582         default:
2583                 CERROR("unknown fcntl lock command: %d\n", cmd);
2584                 RETURN (-EINVAL);
2585         }
2586
2587         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2588                                      LUSTRE_OPC_ANY, NULL);
2589         if (IS_ERR(op_data))
2590                 RETURN(PTR_ERR(op_data));
2591
2592         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2593                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2594                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2595
2596         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2597                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2598
2599         if ((file_lock->fl_flags & FL_FLOCK) &&
2600             (rc == 0 || file_lock->fl_type == F_UNLCK))
2601                 rc2  = flock_lock_file_wait(file, file_lock);
2602         if ((file_lock->fl_flags & FL_POSIX) &&
2603             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2604             !(flags & LDLM_FL_TEST_LOCK))
2605                 rc2  = posix_lock_file_wait(file, file_lock);
2606
2607         if (rc2 && file_lock->fl_type != F_UNLCK) {
2608                 einfo.ei_mode = LCK_NL;
2609                 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2610                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2611                 rc = rc2;
2612         }
2613
2614         ll_finish_md_op_data(op_data);
2615
2616         RETURN(rc);
2617 }
2618
2619 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2620 {
2621         ENTRY;
2622
2623         RETURN(-ENOSYS);
2624 }
2625
2626 /**
2627  * test if some locks matching bits and l_req_mode are acquired
2628  * - bits can be in different locks
2629  * - if found clear the common lock bits in *bits
2630  * - the bits not found, are kept in *bits
2631  * \param inode [IN]
2632  * \param bits [IN] searched lock bits [IN]
2633  * \param l_req_mode [IN] searched lock mode
2634  * \retval boolean, true iff all bits are found
2635  */
2636 int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2637 {
2638         struct lustre_handle lockh;
2639         ldlm_policy_data_t policy;
2640         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2641                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2642         struct lu_fid *fid;
2643         __u64 flags;
2644         int i;
2645         ENTRY;
2646
2647         if (!inode)
2648                RETURN(0);
2649
2650         fid = &ll_i2info(inode)->lli_fid;
2651         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2652                ldlm_lockname[mode]);
2653
2654         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2655         for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2656                 policy.l_inodebits.bits = *bits & (1 << i);
2657                 if (policy.l_inodebits.bits == 0)
2658                         continue;
2659
2660                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2661                                   &policy, mode, &lockh)) {
2662                         struct ldlm_lock *lock;
2663
2664                         lock = ldlm_handle2lock(&lockh);
2665                         if (lock) {
2666                                 *bits &=
2667                                       ~(lock->l_policy_data.l_inodebits.bits);
2668                                 LDLM_LOCK_PUT(lock);
2669                         } else {
2670                                 *bits &= ~policy.l_inodebits.bits;
2671                         }
2672                 }
2673         }
2674         RETURN(*bits == 0);
2675 }
2676
2677 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2678                             struct lustre_handle *lockh, __u64 flags)
2679 {
2680         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2681         struct lu_fid *fid;
2682         ldlm_mode_t rc;
2683         ENTRY;
2684
2685         fid = &ll_i2info(inode)->lli_fid;
2686         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2687
2688         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2689                            fid, LDLM_IBITS, &policy,
2690                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2691         RETURN(rc);
2692 }
2693
2694 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2695 {
2696         /* Already unlinked. Just update nlink and return success */
2697         if (rc == -ENOENT) {
2698                 clear_nlink(inode);
2699                 /* This path cannot be hit for regular files unless in
2700                  * case of obscure races, so no need to to validate
2701                  * size. */
2702                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2703                         return 0;
2704         } else if (rc != 0) {
2705                 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2706                        ll_get_fsname(inode->i_sb, NULL, 0),
2707                        PFID(ll_inode2fid(inode)), rc);
2708         }
2709
2710         return rc;
2711 }
2712
2713 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2714                              __u64 ibits)
2715 {
2716         struct inode *inode = dentry->d_inode;
2717         struct ptlrpc_request *req = NULL;
2718         struct obd_export *exp;
2719         int rc = 0;
2720         ENTRY;
2721
2722         LASSERT(inode != NULL);
2723
2724         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2725                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2726
2727         exp = ll_i2mdexp(inode);
2728
2729         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2730          *      But under CMD case, it caused some lock issues, should be fixed
2731          *      with new CMD ibits lock. See bug 12718 */
2732         if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2733                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2734                 struct md_op_data *op_data;
2735
2736                 if (ibits == MDS_INODELOCK_LOOKUP)
2737                         oit.it_op = IT_LOOKUP;
2738
2739                 /* Call getattr by fid, so do not provide name at all. */
2740                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2741                                              dentry->d_inode, NULL, 0, 0,
2742                                              LUSTRE_OPC_ANY, NULL);
2743                 if (IS_ERR(op_data))
2744                         RETURN(PTR_ERR(op_data));
2745
2746                 oit.it_create_mode |= M_CHECK_STALE;
2747                 rc = md_intent_lock(exp, op_data, NULL, 0,
2748                                     /* we are not interested in name
2749                                        based lookup */
2750                                     &oit, 0, &req,
2751                                     ll_md_blocking_ast, 0);
2752                 ll_finish_md_op_data(op_data);
2753                 oit.it_create_mode &= ~M_CHECK_STALE;
2754                 if (rc < 0) {
2755                         rc = ll_inode_revalidate_fini(inode, rc);
2756                         GOTO (out, rc);
2757                 }
2758
2759                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2760                 if (rc != 0) {
2761                         ll_intent_release(&oit);
2762                         GOTO(out, rc);
2763                 }
2764
2765                 /* Unlinked? Unhash dentry, so it is not picked up later by
2766                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2767                    here to preserve get_cwd functionality on 2.6.
2768                    Bug 10503 */
2769                 if (!dentry->d_inode->i_nlink)
2770                         d_lustre_invalidate(dentry);
2771
2772                 ll_lookup_finish_locks(&oit, dentry);
2773         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2774                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2775                 obd_valid valid = OBD_MD_FLGETATTR;
2776                 struct md_op_data *op_data;
2777                 int ealen = 0;
2778
2779                 if (S_ISREG(inode->i_mode)) {
2780                         rc = ll_get_max_mdsize(sbi, &ealen);
2781                         if (rc)
2782                                 RETURN(rc);
2783                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2784                 }
2785
2786                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2787                                              0, ealen, LUSTRE_OPC_ANY,
2788                                              NULL);
2789                 if (IS_ERR(op_data))
2790                         RETURN(PTR_ERR(op_data));
2791
2792                 op_data->op_valid = valid;
2793                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2794                  * capa for this inode. Because we only keep capas of dirs
2795                  * fresh. */
2796                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2797                 ll_finish_md_op_data(op_data);
2798                 if (rc) {
2799                         rc = ll_inode_revalidate_fini(inode, rc);
2800                         RETURN(rc);
2801                 }
2802
2803                 rc = ll_prep_inode(&inode, req, NULL, NULL);
2804         }
2805 out:
2806         ptlrpc_req_finished(req);
2807         return rc;
2808 }
2809
2810 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2811                            __u64 ibits)
2812 {
2813         struct inode *inode = dentry->d_inode;
2814         int rc;
2815         ENTRY;
2816
2817         rc = __ll_inode_revalidate_it(dentry, it, ibits);
2818         if (rc != 0)
2819                 RETURN(rc);
2820
2821         /* if object isn't regular file, don't validate size */
2822         if (!S_ISREG(inode->i_mode)) {
2823                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2824                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2825                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2826         } else {
2827                 rc = ll_glimpse_size(inode);
2828         }
2829         RETURN(rc);
2830 }
2831
2832 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2833                   struct lookup_intent *it, struct kstat *stat)
2834 {
2835         struct inode *inode = de->d_inode;
2836         struct ll_sb_info *sbi = ll_i2sbi(inode);
2837         struct ll_inode_info *lli = ll_i2info(inode);
2838         int res = 0;
2839
2840         res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2841                                              MDS_INODELOCK_LOOKUP);
2842         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2843
2844         if (res)
2845                 return res;
2846
2847         stat->dev = inode->i_sb->s_dev;
2848         if (ll_need_32bit_api(sbi))
2849                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2850         else
2851                 stat->ino = inode->i_ino;
2852         stat->mode = inode->i_mode;
2853         stat->nlink = inode->i_nlink;
2854         stat->uid = inode->i_uid;
2855         stat->gid = inode->i_gid;
2856         stat->rdev = inode->i_rdev;
2857         stat->atime = inode->i_atime;
2858         stat->mtime = inode->i_mtime;
2859         stat->ctime = inode->i_ctime;
2860         stat->blksize = 1 << inode->i_blkbits;
2861
2862         stat->size = i_size_read(inode);
2863         stat->blocks = inode->i_blocks;
2864
2865         return 0;
2866 }
2867 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2868 {
2869         struct lookup_intent it = { .it_op = IT_GETATTR };
2870
2871         return ll_getattr_it(mnt, de, &it, stat);
2872 }
2873
2874 #ifdef HAVE_LINUX_FIEMAP_H
2875 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2876                 __u64 start, __u64 len)
2877 {
2878         int rc;
2879         size_t num_bytes;
2880         struct ll_user_fiemap *fiemap;
2881         unsigned int extent_count = fieinfo->fi_extents_max;
2882
2883         num_bytes = sizeof(*fiemap) + (extent_count *
2884                                        sizeof(struct ll_fiemap_extent));
2885         OBD_ALLOC_LARGE(fiemap, num_bytes);
2886
2887         if (fiemap == NULL)
2888                 RETURN(-ENOMEM);
2889
2890         fiemap->fm_flags = fieinfo->fi_flags;
2891         fiemap->fm_extent_count = fieinfo->fi_extents_max;
2892         fiemap->fm_start = start;
2893         fiemap->fm_length = len;
2894         memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2895                sizeof(struct ll_fiemap_extent));
2896
2897         rc = ll_do_fiemap(inode, fiemap, num_bytes);
2898
2899         fieinfo->fi_flags = fiemap->fm_flags;
2900         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2901         memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2902                fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2903
2904         OBD_FREE_LARGE(fiemap, num_bytes);
2905         return rc;
2906 }
2907 #endif
2908
2909 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2910 {
2911         struct ll_inode_info *lli = ll_i2info(inode);
2912         struct posix_acl *acl = NULL;
2913         ENTRY;
2914
2915         spin_lock(&lli->lli_lock);
2916         /* VFS' acl_permission_check->check_acl will release the refcount */
2917         acl = posix_acl_dup(lli->lli_posix_acl);
2918         spin_unlock(&lli->lli_lock);
2919
2920         RETURN(acl);
2921 }
2922
2923 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
2924 static int
2925 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2926 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2927 # else
2928 ll_check_acl(struct inode *inode, int mask)
2929 # endif
2930 {
2931 # ifdef CONFIG_FS_POSIX_ACL
2932         struct posix_acl *acl;
2933         int rc;
2934         ENTRY;
2935
2936 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
2937         if (flags & IPERM_FLAG_RCU)
2938                 return -ECHILD;
2939 #  endif
2940         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2941
2942         if (!acl)
2943                 RETURN(-EAGAIN);
2944
2945         rc = posix_acl_permission(inode, acl, mask);
2946         posix_acl_release(acl);
2947
2948         RETURN(rc);
2949 # else /* !CONFIG_FS_POSIX_ACL */
2950         return -EAGAIN;
2951 # endif /* CONFIG_FS_POSIX_ACL */
2952 }
2953 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2954
2955 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2956 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2957 #else
2958 # ifdef HAVE_INODE_PERMISION_2ARGS
2959 int ll_inode_permission(struct inode *inode, int mask)
2960 # else
2961 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2962 # endif
2963 #endif
2964 {
2965         int rc = 0;
2966         ENTRY;
2967
2968 #ifdef MAY_NOT_BLOCK
2969         if (mask & MAY_NOT_BLOCK)
2970                 return -ECHILD;
2971 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2972         if (flags & IPERM_FLAG_RCU)
2973                 return -ECHILD;
2974 #endif
2975
2976        /* as root inode are NOT getting validated in lookup operation,
2977         * need to do it before permission check. */
2978
2979         if (inode == inode->i_sb->s_root->d_inode) {
2980                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2981
2982                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2983                                               MDS_INODELOCK_LOOKUP);
2984                 if (rc)
2985                         RETURN(rc);
2986         }
2987
2988         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2989                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2990
2991         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2992                 return lustre_check_remote_perm(inode, mask);
2993
2994         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2995         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
2996
2997         RETURN(rc);
2998 }
2999
/*
 * Name of the vectored read/write members of struct file_operations, and
 * of the functions installed there: readv/writev when HAVE_FILE_READV is
 * defined, aio_read/aio_write otherwise.
 */
#ifndef HAVE_FILE_READV
# define READ_METHOD    aio_read
# define READ_FUNCTION  ll_file_aio_read
# define WRITE_METHOD   aio_write
# define WRITE_FUNCTION ll_file_aio_write
#else /* HAVE_FILE_READV */
# define READ_METHOD    readv
# define READ_FUNCTION  ll_file_readv
# define WRITE_METHOD   writev
# define WRITE_FUNCTION ll_file_writev
#endif /* !HAVE_FILE_READV */
3011
3012 /* -o localflock - only provides locally consistent flock locks */
3013 struct file_operations ll_file_operations = {
3014         .read           = ll_file_read,
3015         .READ_METHOD    = READ_FUNCTION,
3016         .write          = ll_file_write,
3017         .WRITE_METHOD   = WRITE_FUNCTION,
3018         .unlocked_ioctl = ll_file_ioctl,
3019         .open           = ll_file_open,
3020         .release        = ll_file_release,
3021         .mmap           = ll_file_mmap,
3022         .llseek         = ll_file_seek,
3023 #ifdef HAVE_KERNEL_SENDFILE
3024         .sendfile       = ll_file_sendfile,
3025 #endif
3026 #ifdef HAVE_KERNEL_SPLICE_READ
3027         .splice_read    = ll_file_splice_read,
3028 #endif
3029         .fsync          = ll_fsync,
3030         .flush          = ll_flush
3031 };
3032
3033 struct file_operations ll_file_operations_flock = {
3034         .read           = ll_file_read,
3035         .READ_METHOD    = READ_FUNCTION,
3036         .write          = ll_file_write,
3037         .WRITE_METHOD   = WRITE_FUNCTION,
3038         .unlocked_ioctl = ll_file_ioctl,
3039         .open           = ll_file_open,
3040         .release        = ll_file_release,
3041         .mmap           = ll_file_mmap,
3042         .llseek         = ll_file_seek,
3043 #ifdef HAVE_KERNEL_SENDFILE
3044         .sendfile       = ll_file_sendfile,
3045 #endif
3046 #ifdef HAVE_KERNEL_SPLICE_READ
3047         .splice_read    = ll_file_splice_read,
3048 #endif
3049         .fsync          = ll_fsync,
3050         .flush          = ll_flush,
3051         .flock          = ll_file_flock,
3052         .lock           = ll_file_flock
3053 };
3054
3055 /* These are for -o noflock - to return ENOSYS on flock calls */
3056 struct file_operations ll_file_operations_noflock = {
3057         .read           = ll_file_read,
3058         .READ_METHOD    = READ_FUNCTION,
3059         .write          = ll_file_write,
3060         .WRITE_METHOD   = WRITE_FUNCTION,
3061         .unlocked_ioctl = ll_file_ioctl,
3062         .open           = ll_file_open,
3063         .release        = ll_file_release,
3064         .mmap           = ll_file_mmap,
3065         .llseek         = ll_file_seek,
3066 #ifdef HAVE_KERNEL_SENDFILE
3067         .sendfile       = ll_file_sendfile,
3068 #endif
3069 #ifdef HAVE_KERNEL_SPLICE_READ
3070         .splice_read    = ll_file_splice_read,
3071 #endif
3072         .fsync          = ll_fsync,
3073         .flush          = ll_flush,
3074         .flock          = ll_file_noflock,
3075         .lock           = ll_file_noflock
3076 };
3077
3078 struct inode_operations ll_file_inode_operations = {
3079         .setattr        = ll_setattr,
3080         .getattr        = ll_getattr,
3081         .permission     = ll_inode_permission,
3082         .setxattr       = ll_setxattr,
3083         .getxattr       = ll_getxattr,
3084         .listxattr      = ll_listxattr,
3085         .removexattr    = ll_removexattr,
3086 #ifdef  HAVE_LINUX_FIEMAP_H
3087         .fiemap         = ll_fiemap,
3088 #endif
3089 #ifdef HAVE_IOP_GET_ACL
3090         .get_acl        = ll_get_acl,
3091 #endif
3092 };
3093
3094 /* dynamic ioctl number support routins */
3095 static struct llioc_ctl_data {
3096         struct rw_semaphore     ioc_sem;
3097         cfs_list_t              ioc_head;
3098 } llioc = {
3099         __RWSEM_INITIALIZER(llioc.ioc_sem),
3100         CFS_LIST_HEAD_INIT(llioc.ioc_head)
3101 };
3102
3103
3104 struct llioc_data {
3105         cfs_list_t              iocd_list;
3106         unsigned int            iocd_size;
3107         llioc_callback_t        iocd_cb;
3108         unsigned int            iocd_count;
3109         unsigned int            iocd_cmd[0];
3110 };
3111
3112 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3113 {
3114         unsigned int size;
3115         struct llioc_data *in_data = NULL;
3116         ENTRY;
3117
3118         if (cb == NULL || cmd == NULL ||
3119             count > LLIOC_MAX_CMD || count < 0)
3120                 RETURN(NULL);
3121
3122         size = sizeof(*in_data) + count * sizeof(unsigned int);
3123         OBD_ALLOC(in_data, size);
3124         if (in_data == NULL)
3125                 RETURN(NULL);
3126
3127         memset(in_data, 0, sizeof(*in_data));
3128         in_data->iocd_size = size;
3129         in_data->iocd_cb = cb;
3130         in_data->iocd_count = count;
3131         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3132
3133         down_write(&llioc.ioc_sem);
3134         cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3135         up_write(&llioc.ioc_sem);
3136
3137         RETURN(in_data);
3138 }
3139
3140 void ll_iocontrol_unregister(void *magic)
3141 {
3142         struct llioc_data *tmp;
3143
3144         if (magic == NULL)
3145                 return;
3146
3147         down_write(&llioc.ioc_sem);
3148         cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3149                 if (tmp == magic) {
3150                         unsigned int size = tmp->iocd_size;
3151
3152                         cfs_list_del(&tmp->iocd_list);
3153                         up_write(&llioc.ioc_sem);
3154
3155                         OBD_FREE(tmp, size);
3156                         return;
3157                 }
3158         }
3159         up_write(&llioc.ioc_sem);
3160
3161         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3162 }
3163
3164 EXPORT_SYMBOL(ll_iocontrol_register);
3165 EXPORT_SYMBOL(ll_iocontrol_unregister);
3166
3167 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3168                         unsigned int cmd, unsigned long arg, int *rcp)
3169 {
3170         enum llioc_iter ret = LLIOC_CONT;
3171         struct llioc_data *data;
3172         int rc = -EINVAL, i;
3173
3174         down_read(&llioc.ioc_sem);
3175         cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3176                 for (i = 0; i < data->iocd_count; i++) {
3177                         if (cmd != data->iocd_cmd[i])
3178                                 continue;
3179
3180                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3181                         break;
3182                 }
3183
3184                 if (ret == LLIOC_STOP)
3185                         break;
3186         }
3187         up_read(&llioc.ioc_sem);
3188
3189         if (rcp)
3190                 *rcp = rc;
3191         return ret;
3192 }
3193
3194 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3195 {
3196         struct ll_inode_info *lli = ll_i2info(inode);
3197         struct cl_env_nest nest;
3198         struct lu_env *env;
3199         int result;
3200         ENTRY;
3201
3202         if (lli->lli_clob == NULL)
3203                 RETURN(0);
3204
3205         env = cl_env_nested_get(&nest);
3206         if (IS_ERR(env))
3207                 RETURN(PTR_ERR(env));
3208
3209         result = cl_conf_set(env, lli->lli_clob, conf);
3210         cl_env_nested_put(&nest, env);
3211
3212         if (conf->coc_opc == OBJECT_CONF_SET) {
3213                 struct ldlm_lock *lock = conf->coc_lock;
3214
3215                 LASSERT(lock != NULL);
3216                 LASSERT(ldlm_has_layout(lock));
3217                 if (result == 0) {
3218                         /* it can only be allowed to match after layout is
3219                          * applied to inode otherwise false layout would be
3220                          * seen. Applying layout shoud happen before dropping
3221                          * the intent lock. */
3222                         ldlm_lock_allow_match(lock);
3223                 }
3224         }
3225         RETURN(result);
3226 }
3227
3228 /**
3229  * Apply the layout to the inode. Layout lock is held and will be released
3230  * in this function.
3231  */
3232 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3233                                 struct inode *inode, __u32 *gen, bool reconf)
3234 {
3235         struct ll_inode_info *lli = ll_i2info(inode);
3236         struct ll_sb_info    *sbi = ll_i2sbi(inode);
3237         struct ldlm_lock *lock;
3238         struct lustre_md md = { NULL };
3239         struct cl_object_conf conf;
3240         int rc = 0;
3241         bool lvb_ready;
3242         ENTRY;
3243
3244         LASSERT(lustre_handle_is_used(lockh));
3245
3246         lock = ldlm_handle2lock(lockh);
3247         LASSERT(lock != NULL);
3248         LASSERT(ldlm_has_layout(lock));
3249
3250         LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3251                 inode, PFID(&lli->lli_fid), reconf);
3252
3253         lock_res_and_lock(lock);
3254         lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3255         unlock_res_and_lock(lock);
3256         /* checking lvb_ready is racy but this is okay. The worst case is
3257          * that multi processes may configure the file on the same time. */
3258         if (lvb_ready || !reconf) {
3259                 LDLM_LOCK_PUT(lock);
3260
3261                 rc = -ENODATA;
3262                 if (lvb_ready) {
3263                         /* layout_gen must be valid if layout lock is not
3264                          * cancelled and stripe has already set */
3265                         *gen = lli->lli_layout_gen;
3266                         rc = 0;
3267                 }
3268                 ldlm_lock_decref(lockh, mode);
3269                 RETURN(rc);
3270         }
3271
3272         /* for layout lock, lmm is returned in lock's lvb.
3273          * lvb_data is immutable if the lock is held so it's safe to access it
3274          * without res lock. See the description in ldlm_lock_decref_internal()
3275          * for the condition to free lvb_data of layout lock */
3276         if (lock->l_lvb_data != NULL) {
3277                 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3278                                   lock->l_lvb_data, lock->l_lvb_len);
3279                 if (rc >= 0) {
3280                         *gen = LL_LAYOUT_GEN_EMPTY;
3281                         if (md.lsm != NULL)
3282                                 *gen = md.lsm->lsm_layout_gen;
3283                         rc = 0;
3284                 } else {
3285                         CERROR("%s: file "DFID" unpackmd error: %d\n",
3286                                 ll_get_fsname(inode->i_sb, NULL, 0),
3287                                 PFID(&lli->lli_fid), rc);
3288                 }
3289         }
3290         if (rc < 0) {
3291                 LDLM_LOCK_PUT(lock);
3292                 ldlm_lock_decref(lockh, mode);
3293                 RETURN(rc);
3294         }
3295
3296         /* set layout to file. Unlikely this will fail as old layout was
3297          * surely eliminated */
3298         memset(&conf, 0, sizeof conf);
3299         conf.coc_opc = OBJECT_CONF_SET;
3300         conf.coc_inode = inode;
3301         conf.coc_lock = lock;
3302         conf.u.coc_md = &md;
3303         rc = ll_layout_conf(inode, &conf);
3304         LDLM_LOCK_PUT(lock);
3305
3306         ldlm_lock_decref(lockh, mode);
3307
3308         if (md.lsm != NULL)
3309                 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3310
3311         /* wait for IO to complete if it's still being used. */
3312         if (rc == -EBUSY) {
3313                 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3314                         ll_get_fsname(inode->i_sb, NULL, 0),
3315                         inode, PFID(&lli->lli_fid));
3316
3317                 memset(&conf, 0, sizeof conf);
3318                 conf.coc_opc = OBJECT_CONF_WAIT;
3319                 conf.coc_inode = inode;
3320                 rc = ll_layout_conf(inode, &conf);
3321                 if (rc == 0)
3322                         rc = -EAGAIN;
3323
3324                 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3325                         PFID(&lli->lli_fid), rc);
3326         }
3327
3328         RETURN(rc);
3329 }
3330
3331 /**
3332  * This function checks if there exists a LAYOUT lock on the client side,
3333  * or enqueues it if it doesn't have one in cache.
3334  *
3335  * This function will not hold layout lock so it may be revoked any time after
3336  * this function returns. Any operations depend on layout should be redone
3337  * in that case.
3338  *
3339  * This function should be called before lov_io_init() to get an uptodate
3340  * layout version, the caller should save the version number and after IO
3341  * is finished, this function should be called again to verify that layout
3342  * is not changed during IO time.
3343  */
3344 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3345 {
3346         struct ll_inode_info  *lli = ll_i2info(inode);
3347         struct ll_sb_info     *sbi = ll_i2sbi(inode);
3348         struct md_op_data     *op_data;
3349         struct lookup_intent   it;
3350         struct lustre_handle   lockh;
3351         ldlm_mode_t            mode;
3352         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3353                                            .ei_mode = LCK_CR,
3354                                            .ei_cb_bl = ll_md_blocking_ast,
3355                                            .ei_cb_cp = ldlm_completion_ast,
3356                                            .ei_cbdata = NULL };
3357         int rc;
3358         ENTRY;
3359
3360         *gen = LL_LAYOUT_GEN_NONE;
3361         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3362                 RETURN(0);
3363
3364         /* sanity checks */
3365         LASSERT(fid_is_sane(ll_inode2fid(inode)));
3366         LASSERT(S_ISREG(inode->i_mode));
3367
3368         /* mostly layout lock is caching on the local side, so try to match
3369          * it before grabbing layout lock mutex. */
3370         mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3371         if (mode != 0) { /* hit cached lock */
3372                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3373                 if (rc == 0)
3374                         RETURN(0);
3375
3376                 /* better hold lli_layout_mutex to try again otherwise
3377                  * it will have starvation problem. */
3378         }
3379
3380         /* take layout lock mutex to enqueue layout lock exclusively. */
3381         mutex_lock(&lli->lli_layout_mutex);
3382
3383 again:
3384         /* try again. Maybe somebody else has done this. */
3385         mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3386         if (mode != 0) { /* hit cached lock */
3387                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3388                 if (rc == -EAGAIN)
3389                         goto again;
3390
3391                 mutex_unlock(&lli->lli_layout_mutex);
3392                 RETURN(rc);
3393         }
3394
3395         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3396                         0, 0, LUSTRE_OPC_ANY, NULL);
3397         if (IS_ERR(op_data)) {
3398                 mutex_unlock(&lli->lli_layout_mutex);
3399                 RETURN(PTR_ERR(op_data));
3400         }
3401
3402         /* have to enqueue one */
3403         memset(&it, 0, sizeof(it));
3404         it.it_op = IT_LAYOUT;
3405         lockh.cookie = 0ULL;
3406
3407         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3408                         ll_get_fsname(inode->i_sb, NULL, 0), inode,
3409                         PFID(&lli->lli_fid));
3410
3411         rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3412                         NULL, 0, NULL, 0);
3413         if (it.d.lustre.it_data != NULL)
3414                 ptlrpc_req_finished(it.d.lustre.it_data);
3415         it.d.lustre.it_data = NULL;
3416
3417         ll_finish_md_op_data(op_data);
3418
3419         md_set_lock_data(sbi->ll_md_exp, &it.d.lustre.it_lock_handle, inode, NULL);
3420
3421         mode = it.d.lustre.it_lock_mode;
3422         it.d.lustre.it_lock_mode = 0;
3423         ll_intent_drop_lock(&it);
3424
3425         if (rc == 0) {
3426                 /* set lock data in case this is a new lock */
3427                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3428                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3429                 if (rc == -EAGAIN)
3430                         goto again;
3431         }
3432         mutex_unlock(&lli->lli_layout_mutex);
3433
3434         RETURN(rc);
3435 }