lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19  *
  20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21  * CA 95054 USA or visit www.sun.com if you need additional information or
  22  * have any questions.
  23  *
  24  * GPL HEADER END
  25  */
  26 /*
  27  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  28  * Use is subject to license terms.
  29  *
  30  * Copyright (c) 2011, 2012, Intel Corporation.
  31  */
  32 /*
  33  * This file is part of Lustre, http://www.lustre.org/
  34  * Lustre is a trademark of Sun Microsystems, Inc.
  35  *
  36  * lustre/llite/file.c
  37  *
  38  * Author: Peter Braam <braam@clusterfs.com>
  39  * Author: Phil Schwan <phil@clusterfs.com>
  40  * Author: Andreas Dilger <adilger@clusterfs.com>
  41  */
  42
  43 #define DEBUG_SUBSYSTEM S_LLITE
  44 #include <lustre_dlm.h>
  45 #include <lustre_lite.h>
  46 #include <linux/pagemap.h>
  47 #include <linux/file.h>
  48 #include "llite_internal.h"
  49 #include <lustre/ll_fiemap.h>
  50
  51 #include "cl_object.h"
  52
  53 struct ll_file_data *ll_file_data_get(void)
  54 {
  55         struct ll_file_data *fd;
  56
  57         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
  58         fd->fd_write_failed = false;
  59         return fd;
  60 }
  61
  62 static void ll_file_data_put(struct ll_file_data *fd)
  63 {
  64         if (fd != NULL)
  65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  66 }
  67
  68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  69                           struct lustre_handle *fh)
  70 {
  71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  72         op_data->op_attr.ia_mode = inode->i_mode;
  73         op_data->op_attr.ia_atime = inode->i_atime;
  74         op_data->op_attr.ia_mtime = inode->i_mtime;
  75         op_data->op_attr.ia_ctime = inode->i_ctime;
  76         op_data->op_attr.ia_size = i_size_read(inode);
  77         op_data->op_attr_blocks = inode->i_blocks;
  78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
  79                                         ll_inode_to_ext_flags(inode->i_flags);
  80         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  81         if (fh)
  82                 op_data->op_handle = *fh;
  83         op_data->op_capa1 = ll_mdscapa_get(inode);
  84
  85         if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
  86                 op_data->op_bias |= MDS_DATA_MODIFIED;
  87 }
  88
  89 /**
  90  * Closes the IO epoch and packs all the attributes into @op_data for
  91  * the CLOSE rpc.
  92  */
  93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  94                              struct obd_client_handle *och)
  95 {
  96         ENTRY;
  97
  98         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
  99                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
 100
 101         if (!(och->och_flags & FMODE_WRITE))
 102                 goto out;
 103
 104         if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
 105                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 106         else
 107                 ll_ioepoch_close(inode, op_data, &och, 0);
 108
 109 out:
 110         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
 111         ll_prep_md_op_data(op_data, inode, NULL, NULL,
 112                            0, 0, LUSTRE_OPC_ANY, NULL);
 113         EXIT;
 114 }
 115
 116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
 117                                      struct inode *inode,
 118                                      struct obd_client_handle *och)
 119 {
 120         struct obd_export *exp = ll_i2mdexp(inode);
 121         struct md_op_data *op_data;
 122         struct ptlrpc_request *req = NULL;
 123         struct obd_device *obd = class_exp2obd(exp);
 124         int epoch_close = 1;
 125         int rc;
 126         ENTRY;
 127
 128         if (obd == NULL) {
 129                 /*
 130                  * XXX: in case of LMV, is this correct to access
 131                  * ->exp_handle?
 132                  */
 133                 CERROR("Invalid MDC connection handle "LPX64"\n",
 134                        ll_i2mdexp(inode)->exp_handle.h_cookie);
 135                 GOTO(out, rc = 0);
 136         }
 137
 138         OBD_ALLOC_PTR(op_data);
 139         if (op_data == NULL)
 140                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
 141
 142         ll_prepare_close(inode, op_data, och);
 143         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
 144         rc = md_close(md_exp, op_data, och->och_mod, &req);
 145         if (rc == -EAGAIN) {
 146                 /* This close must have the epoch closed. */
 147                 LASSERT(epoch_close);
 148                 /* MDS has instructed us to obtain Size-on-MDS attribute from
 149                  * OSTs and send setattr to back to MDS. */
 150                 rc = ll_som_update(inode, op_data);
 151                 if (rc) {
 152                         CERROR("inode %lu mdc Size-on-MDS update failed: "
 153                                "rc = %d\n", inode->i_ino, rc);
 154                         rc = 0;
 155                 }
 156         } else if (rc) {
 157                 CERROR("inode %lu mdc close failed: rc = %d\n",
 158                        inode->i_ino, rc);
 159         }
 160
 161         /* DATA_MODIFIED flag was successfully sent on close, cancel data
 162          * modification flag. */
 163         if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
 164                 struct ll_inode_info *lli = ll_i2info(inode);
 165
 166                 spin_lock(&lli->lli_lock);
 167                 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
 168                 spin_unlock(&lli->lli_lock);
 169         }
 170
 171         ll_finish_md_op_data(op_data);
 172
 173         if (rc == 0) {
 174                 rc = ll_objects_destroy(req, inode);
 175                 if (rc)
 176                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
 177                                inode->i_ino, rc);
 178         }
 179
 180         EXIT;
 181 out:
 182
 183         if (exp_connect_som(exp) && !epoch_close &&
 184             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
 185                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
 186         } else {
 187                 md_clear_open_replay_data(md_exp, och);
 188                 /* Free @och if it is not waiting for DONE_WRITING. */
 189                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 190                 OBD_FREE_PTR(och);
 191         }
 192         if (req) /* This is close request */
 193                 ptlrpc_req_finished(req);
 194         return rc;
 195 }
 196
 197 int ll_md_real_close(struct inode *inode, int flags)
 198 {
 199         struct ll_inode_info *lli = ll_i2info(inode);
 200         struct obd_client_handle **och_p;
 201         struct obd_client_handle *och;
 202         __u64 *och_usecount;
 203         int rc = 0;
 204         ENTRY;
 205
 206         if (flags & FMODE_WRITE) {
 207                 och_p = &lli->lli_mds_write_och;
 208                 och_usecount = &lli->lli_open_fd_write_count;
 209         } else if (flags & FMODE_EXEC) {
 210                 och_p = &lli->lli_mds_exec_och;
 211                 och_usecount = &lli->lli_open_fd_exec_count;
 212         } else {
 213                 LASSERT(flags & FMODE_READ);
 214                 och_p = &lli->lli_mds_read_och;
 215                 och_usecount = &lli->lli_open_fd_read_count;
 216         }
 217
 218         mutex_lock(&lli->lli_och_mutex);
 219         if (*och_usecount) { /* There are still users of this handle, so
 220                                 skip freeing it. */
 221                 mutex_unlock(&lli->lli_och_mutex);
 222                 RETURN(0);
 223         }
 224         och=*och_p;
 225         *och_p = NULL;
 226         mutex_unlock(&lli->lli_och_mutex);
 227
 228         if (och) { /* There might be a race and somebody have freed this och
 229                       already */
 230                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 231                                                inode, och);
 232         }
 233
 234         RETURN(rc);
 235 }
 236
 237 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 238                 struct file *file)
 239 {
 240         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 241         struct ll_inode_info *lli = ll_i2info(inode);
 242         int rc = 0;
 243         ENTRY;
 244
 245         /* clear group lock, if present */
 246         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 247                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
 248
 249         /* Let's see if we have good enough OPEN lock on the file and if
 250            we can skip talking to MDS */
 251         if (file->f_dentry->d_inode) { /* Can this ever be false? */
 252                 int lockmode;
 253                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 254                 struct lustre_handle lockh;
 255                 struct inode *inode = file->f_dentry->d_inode;
 256                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
 257
 258                 mutex_lock(&lli->lli_och_mutex);
 259                 if (fd->fd_omode & FMODE_WRITE) {
 260                         lockmode = LCK_CW;
 261                         LASSERT(lli->lli_open_fd_write_count);
 262                         lli->lli_open_fd_write_count--;
 263                 } else if (fd->fd_omode & FMODE_EXEC) {
 264                         lockmode = LCK_PR;
 265                         LASSERT(lli->lli_open_fd_exec_count);
 266                         lli->lli_open_fd_exec_count--;
 267                 } else {
 268                         lockmode = LCK_CR;
 269                         LASSERT(lli->lli_open_fd_read_count);
 270                         lli->lli_open_fd_read_count--;
 271                 }
 272                 mutex_unlock(&lli->lli_och_mutex);
 273
 274                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 275                                    LDLM_IBITS, &policy, lockmode,
 276                                    &lockh)) {
 277                         rc = ll_md_real_close(file->f_dentry->d_inode,
 278                                               fd->fd_omode);
 279                 }
 280         } else {
 281                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
 282                        file, file->f_dentry, file->f_dentry->d_name.name);
 283         }
 284
 285         LUSTRE_FPRIVATE(file) = NULL;
 286         ll_file_data_put(fd);
 287         ll_capa_close(inode);
 288
 289         RETURN(rc);
 290 }
 291
 292 /* While this returns an error code, fput() the caller does not, so we need
 293  * to make every effort to clean up all of our state here.  Also, applications
 294  * rarely check close errors and even if an error is returned they will not
 295  * re-try the close call.
 296  */
 297 int ll_file_release(struct inode *inode, struct file *file)
 298 {
 299         struct ll_file_data *fd;
 300         struct ll_sb_info *sbi = ll_i2sbi(inode);
 301         struct ll_inode_info *lli = ll_i2info(inode);
 302         int rc;
 303         ENTRY;
 304
 305         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 306                inode->i_generation, inode);
 307
 308 #ifdef CONFIG_FS_POSIX_ACL
 309         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
 310             inode == inode->i_sb->s_root->d_inode) {
 311                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 312
 313                 LASSERT(fd != NULL);
 314                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
 315                         fd->fd_flags &= ~LL_FILE_RMTACL;
 316                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
 317                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
 318                 }
 319         }
 320 #endif
 321
 322         if (inode->i_sb->s_root != file->f_dentry)
 323                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 324         fd = LUSTRE_FPRIVATE(file);
 325         LASSERT(fd != NULL);
 326
 327         /* The last ref on @file, maybe not the the owner pid of statahead.
 328          * Different processes can open the same dir, "ll_opendir_key" means:
 329          * it is me that should stop the statahead thread. */
 330         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
 331             lli->lli_opendir_pid != 0)
 332                 ll_stop_statahead(inode, lli->lli_opendir_key);
 333
 334         if (inode->i_sb->s_root == file->f_dentry) {
 335                 LUSTRE_FPRIVATE(file) = NULL;
 336                 ll_file_data_put(fd);
 337                 RETURN(0);
 338         }
 339
 340         if (!S_ISDIR(inode->i_mode)) {
 341                 lov_read_and_clear_async_rc(lli->lli_clob);
 342                 lli->lli_async_rc = 0;
 343         }
 344
 345         rc = ll_md_close(sbi->ll_md_exp, inode, file);
 346
 347         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 348                 libcfs_debug_dumplog();
 349
 350         RETURN(rc);
 351 }
 352
 353 static int ll_intent_file_open(struct file *file, void *lmm,
 354                                int lmmsize, struct lookup_intent *itp)
 355 {
 356         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
 357         struct dentry *parent = file->f_dentry->d_parent;
 358         const char *name = file->f_dentry->d_name.name;
 359         const int len = file->f_dentry->d_name.len;
 360         struct md_op_data *op_data;
 361         struct ptlrpc_request *req;
 362         __u32 opc = LUSTRE_OPC_ANY;
 363         int rc;
 364         ENTRY;
 365
 366         if (!parent)
 367                 RETURN(-ENOENT);
 368
 369         /* Usually we come here only for NFSD, and we want open lock.
 370            But we can also get here with pre 2.6.15 patchless kernels, and in
 371            that case that lock is also ok */
 372         /* We can also get here if there was cached open handle in revalidate_it
 373          * but it disappeared while we were getting from there to ll_file_open.
 374          * But this means this file was closed and immediatelly opened which
 375          * makes a good candidate for using OPEN lock */
 376         /* If lmmsize & lmm are not 0, we are just setting stripe info
 377          * parameters. No need for the open lock */
 378         if (lmm == NULL && lmmsize == 0) {
 379                 itp->it_flags |= MDS_OPEN_LOCK;
 380                 if (itp->it_flags & FMODE_WRITE)
 381                         opc = LUSTRE_OPC_CREATE;
 382         }
 383
 384         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
 385                                       file->f_dentry->d_inode, name, len,
 386                                       O_RDWR, opc, NULL);
 387         if (IS_ERR(op_data))
 388                 RETURN(PTR_ERR(op_data));
 389
 390         itp->it_flags |= MDS_OPEN_BY_FID;
 391         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 392                             0 /*unused */, &req, ll_md_blocking_ast, 0);
 393         ll_finish_md_op_data(op_data);
 394         if (rc == -ESTALE) {
 395                 /* reason for keep own exit path - don`t flood log
 396                 * with messages with -ESTALE errors.
 397                 */
 398                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 399                      it_open_error(DISP_OPEN_OPEN, itp))
 400                         GOTO(out, rc);
 401                 ll_release_openhandle(file->f_dentry, itp);
 402                 GOTO(out, rc);
 403         }
 404
 405         if (it_disposition(itp, DISP_LOOKUP_NEG))
 406                 GOTO(out, rc = -ENOENT);
 407
 408         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 409                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 410                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 411                 GOTO(out, rc);
 412         }
 413
 414         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
 415         if (!rc && itp->d.lustre.it_lock_mode)
 416                 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
 417                                  itp, NULL);
 418
 419 out:
 420         ptlrpc_req_finished(itp->d.lustre.it_data);
 421         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
 422         ll_intent_drop_lock(itp);
 423
 424         RETURN(rc);
 425 }
 426
 427 /**
 428  * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 429  * not believe attributes if a few ioepoch holders exist. Attributes for
 430  * previous ioepoch if new one is opened are also skipped by MDS.
 431  */
 432 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
 433 {
 434         if (ioepoch && lli->lli_ioepoch != ioepoch) {
 435                 lli->lli_ioepoch = ioepoch;
 436                 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 437                        ioepoch, PFID(&lli->lli_fid));
 438         }
 439 }
 440
 441 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
 442                        struct lookup_intent *it, struct obd_client_handle *och)
 443 {
 444         struct ptlrpc_request *req = it->d.lustre.it_data;
 445         struct mdt_body *body;
 446
 447         LASSERT(och);
 448
 449         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 450         LASSERT(body != NULL);                      /* reply already checked out */
 451
 452         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
 453         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 454         och->och_fid = lli->lli_fid;
 455         och->och_flags = it->it_flags;
 456         ll_ioepoch_open(lli, body->ioepoch);
 457
 458         return md_set_open_replay_data(md_exp, och, req);
 459 }
 460
 461 int ll_local_open(struct file *file, struct lookup_intent *it,
 462                   struct ll_file_data *fd, struct obd_client_handle *och)
 463 {
 464         struct inode *inode = file->f_dentry->d_inode;
 465         struct ll_inode_info *lli = ll_i2info(inode);
 466         ENTRY;
 467
 468         LASSERT(!LUSTRE_FPRIVATE(file));
 469
 470         LASSERT(fd != NULL);
 471
 472         if (och) {
 473                 struct ptlrpc_request *req = it->d.lustre.it_data;
 474                 struct mdt_body *body;
 475                 int rc;
 476
 477                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
 478                 if (rc)
 479                         RETURN(rc);
 480
 481                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 482                 if ((it->it_flags & FMODE_WRITE) &&
 483                     (body->valid & OBD_MD_FLSIZE))
 484                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 485                                lli->lli_ioepoch, PFID(&lli->lli_fid));
 486         }
 487
 488         LUSTRE_FPRIVATE(file) = fd;
 489         ll_readahead_init(inode, &fd->fd_ras);
 490         fd->fd_omode = it->it_flags;
 491         RETURN(0);
 492 }
 493
 494 /* Open a file, and (for the very first open) create objects on the OSTs at
 495  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 496  * creation or open until ll_lov_setstripe() ioctl is called.
 497  *
 498  * If we already have the stripe MD locally then we don't request it in
 499  * md_open(), by passing a lmm_size = 0.
 500  *
 501  * It is up to the application to ensure no other processes open this file
 502  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 503  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 504  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 505  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 506  */
 507 int ll_file_open(struct inode *inode, struct file *file)
 508 {
 509         struct ll_inode_info *lli = ll_i2info(inode);
 510         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 511                                           .it_flags = file->f_flags };
 512         struct obd_client_handle **och_p = NULL;
 513         __u64 *och_usecount = NULL;
 514         struct ll_file_data *fd;
 515         int rc = 0, opendir_set = 0;
 516         ENTRY;
 517
 518         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
 519                inode->i_generation, inode, file->f_flags);
 520
 521         it = file->private_data; /* XXX: compat macro */
 522         file->private_data = NULL; /* prevent ll_local_open assertion */
 523
 524         fd = ll_file_data_get();
 525         if (fd == NULL)
 526                 GOTO(out_och_free, rc = -ENOMEM);
 527
 528         fd->fd_file = file;
 529         if (S_ISDIR(inode->i_mode)) {
 530                 spin_lock(&lli->lli_sa_lock);
 531                 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
 532                     lli->lli_opendir_pid == 0) {
 533                         lli->lli_opendir_key = fd;
 534                         lli->lli_opendir_pid = cfs_curproc_pid();
 535                         opendir_set = 1;
 536                 }
 537                 spin_unlock(&lli->lli_sa_lock);
 538         }
 539
 540         if (inode->i_sb->s_root == file->f_dentry) {
 541                 LUSTRE_FPRIVATE(file) = fd;
 542                 RETURN(0);
 543         }
 544
 545         if (!it || !it->d.lustre.it_disposition) {
 546                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 547                  * because everything but O_ACCMODE mask was stripped from
 548                  * there */
 549                 if ((oit.it_flags + 1) & O_ACCMODE)
 550                         oit.it_flags++;
 551                 if (file->f_flags & O_TRUNC)
 552                         oit.it_flags |= FMODE_WRITE;
 553
 554                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 555                  * dentry_open after call to open_namei that checks permissions.
 556                  * Only nfsd_open call dentry_open directly without checking
 557                  * permissions and because of that this code below is safe. */
 558                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 559                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 560
 561                 /* We do not want O_EXCL here, presumably we opened the file
 562                  * already? XXX - NFS implications? */
 563                 oit.it_flags &= ~O_EXCL;
 564
 565                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 566                  * created if necessary, then "IT_CREAT" should be set to keep
 567                  * consistent with it */
 568                 if (oit.it_flags & O_CREAT)
 569                         oit.it_op |= IT_CREAT;
 570
 571                 it = &oit;
 572         }
 573
 574 restart:
 575         /* Let's see if we have file open on MDS already. */
 576         if (it->it_flags & FMODE_WRITE) {
 577                 och_p = &lli->lli_mds_write_och;
 578                 och_usecount = &lli->lli_open_fd_write_count;
 579         } else if (it->it_flags & FMODE_EXEC) {
 580                 och_p = &lli->lli_mds_exec_och;
 581                 och_usecount = &lli->lli_open_fd_exec_count;
 582          } else {
 583                 och_p = &lli->lli_mds_read_och;
 584                 och_usecount = &lli->lli_open_fd_read_count;
 585         }
 586
 587         mutex_lock(&lli->lli_och_mutex);
 588         if (*och_p) { /* Open handle is present */
 589                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 590                         /* Well, there's extra open request that we do not need,
 591                            let's close it somehow. This will decref request. */
 592                         rc = it_open_error(DISP_OPEN_OPEN, it);
 593                         if (rc) {
 594                                 mutex_unlock(&lli->lli_och_mutex);
 595                                 GOTO(out_openerr, rc);
 596                         }
 597
 598                         ll_release_openhandle(file->f_dentry, it);
 599                 }
 600                 (*och_usecount)++;
 601
 602                 rc = ll_local_open(file, it, fd, NULL);
 603                 if (rc) {
 604                         (*och_usecount)--;
 605                         mutex_unlock(&lli->lli_och_mutex);
 606                         GOTO(out_openerr, rc);
 607                 }
 608         } else {
 609                 LASSERT(*och_usecount == 0);
 610                 if (!it->d.lustre.it_disposition) {
 611                         /* We cannot just request lock handle now, new ELC code
 612                            means that one of other OPEN locks for this file
 613                            could be cancelled, and since blocking ast handler
 614                            would attempt to grab och_mutex as well, that would
 615                            result in a deadlock */
 616                         mutex_unlock(&lli->lli_och_mutex);
 617                         it->it_create_mode |= M_CHECK_STALE;
 618                         rc = ll_intent_file_open(file, NULL, 0, it);
 619                         it->it_create_mode &= ~M_CHECK_STALE;
 620                         if (rc)
 621                                 GOTO(out_openerr, rc);
 622
 623                         goto restart;
 624                 }
 625                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 626                 if (!*och_p)
 627                         GOTO(out_och_free, rc = -ENOMEM);
 628
 629                 (*och_usecount)++;
 630
 631                 /* md_intent_lock() didn't get a request ref if there was an
 632                  * open error, so don't do cleanup on the request here
 633                  * (bug 3430) */
 634                 /* XXX (green): Should not we bail out on any error here, not
 635                  * just open error? */
 636                 rc = it_open_error(DISP_OPEN_OPEN, it);
 637                 if (rc)
 638                         GOTO(out_och_free, rc);
 639
 640                 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
 641
 642                 rc = ll_local_open(file, it, fd, *och_p);
 643                 if (rc)
 644                         GOTO(out_och_free, rc);
 645         }
 646         mutex_unlock(&lli->lli_och_mutex);
 647         fd = NULL;
 648
 649         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 650            different kind of OPEN lock for this same inode gets cancelled
 651            by ldlm_cancel_lru */
 652         if (!S_ISREG(inode->i_mode))
 653                 GOTO(out_och_free, rc);
 654
 655         ll_capa_open(inode);
 656
 657         if (!lli->lli_has_smd) {
 658                 if (file->f_flags & O_LOV_DELAY_CREATE ||
 659                     !(file->f_mode & FMODE_WRITE)) {
 660                         CDEBUG(D_INODE, "object creation was delayed\n");
 661                         GOTO(out_och_free, rc);
 662                 }
 663         }
 664         file->f_flags &= ~O_LOV_DELAY_CREATE;
 665         GOTO(out_och_free, rc);
 666
 667 out_och_free:
 668         if (rc) {
 669                 if (och_p && *och_p) {
 670                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 671                         *och_p = NULL; /* OBD_FREE writes some magic there */
 672                         (*och_usecount)--;
 673                 }
 674                 mutex_unlock(&lli->lli_och_mutex);
 675
 676 out_openerr:
 677                 if (opendir_set != 0)
 678                         ll_stop_statahead(inode, lli->lli_opendir_key);
 679                 if (fd != NULL)
 680                         ll_file_data_put(fd);
 681         } else {
 682                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 683         }
 684
 685         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 686                 ptlrpc_req_finished(it->d.lustre.it_data);
 687                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 688         }
 689
 690         return rc;
 691 }
 692
 693 /* Fills the obdo with the attributes for the lsm */
 694 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
 695                           struct obd_capa *capa, struct obdo *obdo,
 696                           __u64 ioepoch, int sync)
 697 {
 698         struct ptlrpc_request_set *set;
 699         struct obd_info            oinfo = { { { 0 } } };
 700         int                        rc;
 701
 702         ENTRY;
 703
 704         LASSERT(lsm != NULL);
 705
 706         oinfo.oi_md = lsm;
 707         oinfo.oi_oa = obdo;
 708         oinfo.oi_oa->o_id = lsm->lsm_object_id;
 709         oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
 710         oinfo.oi_oa->o_mode = S_IFREG;
 711         oinfo.oi_oa->o_ioepoch = ioepoch;
 712         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 713                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 714                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 715                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 716                                OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
 717                                OBD_MD_FLDATAVERSION;
 718         oinfo.oi_capa = capa;
 719         if (sync) {
 720                 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
 721                 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
 722         }
 723
 724         set = ptlrpc_prep_set();
 725         if (set == NULL) {
 726                 CERROR("can't allocate ptlrpc set\n");
 727                 rc = -ENOMEM;
 728         } else {
 729                 rc = obd_getattr_async(exp, &oinfo, set);
 730                 if (rc == 0)
 731                         rc = ptlrpc_set_wait(set);
 732                 ptlrpc_set_destroy(set);
 733         }
 734         if (rc == 0)
 735                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 736                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
 737                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE |
 738                                          OBD_MD_FLDATAVERSION);
 739         RETURN(rc);
 740 }
 741
 742 /**
 743   * Performs the getattr on the inode and updates its fields.
 744   * If @sync != 0, perform the getattr under the server-side lock.
 745   */
 746 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
 747                      __u64 ioepoch, int sync)
 748 {
 749         struct obd_capa      *capa = ll_mdscapa_get(inode);
 750         struct lov_stripe_md *lsm;
 751         int rc;
 752         ENTRY;
 753
 754         lsm = ccc_inode_lsm_get(inode);
 755         rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
 756                             capa, obdo, ioepoch, sync);
 757         capa_put(capa);
 758         if (rc == 0) {
 759                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
 760                 CDEBUG(D_INODE,
 761                        "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
 762                        lsm ? lsm->lsm_object_id : 0, i_size_read(inode),
 763                        (unsigned long long)inode->i_blocks,
 764                        (unsigned long)ll_inode_blksize(inode));
 765         }
 766         ccc_inode_lsm_put(inode, lsm);
 767         RETURN(rc);
 768 }
 769
 770 int ll_merge_lvb(struct inode *inode)
 771 {
 772         struct ll_inode_info *lli = ll_i2info(inode);
 773         struct ll_sb_info *sbi = ll_i2sbi(inode);
 774         struct lov_stripe_md *lsm;
 775         struct ost_lvb lvb;
 776         int rc = 0;
 777
 778         ENTRY;
 779
 780         lsm = ccc_inode_lsm_get(inode);
 781         ll_inode_size_lock(inode);
 782         inode_init_lvb(inode, &lvb);
 783
 784         /* merge timestamps the most resently obtained from mds with
 785            timestamps obtained from osts */
 786         lvb.lvb_atime = lli->lli_lvb.lvb_atime;
 787         lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
 788         lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
 789         if (lsm != NULL) {
 790                 rc = obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
 791                 cl_isize_write_nolock(inode, lvb.lvb_size);
 792
 793                 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
 794                                 PFID(&lli->lli_fid), lvb.lvb_size);
 795                 inode->i_blocks = lvb.lvb_blocks;
 796         }
 797         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
 798         LTIME_S(inode->i_atime) = lvb.lvb_atime;
 799         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
 800         ll_inode_size_unlock(inode);
 801         ccc_inode_lsm_put(inode, lsm);
 802
 803         RETURN(rc);
 804 }
 805
 806 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
 807                      lstat_t *st)
 808 {
 809         struct obdo obdo = { 0 };
 810         int rc;
 811
 812         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
 813         if (rc == 0) {
 814                 st->st_size   = obdo.o_size;
 815                 st->st_blocks = obdo.o_blocks;
 816                 st->st_mtime  = obdo.o_mtime;
 817                 st->st_atime  = obdo.o_atime;
 818                 st->st_ctime  = obdo.o_ctime;
 819         }
 820         return rc;
 821 }
 822
 823 void ll_io_init(struct cl_io *io, const struct file *file, int write)
 824 {
 825         struct inode *inode = file->f_dentry->d_inode;
 826
 827         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
 828         if (write) {
 829                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
 830                 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || IS_SYNC(inode);
 831         }
 832         io->ci_obj     = ll_i2info(inode)->lli_clob;
 833         io->ci_lockreq = CILR_MAYBE;
 834         if (ll_file_nolock(file)) {
 835                 io->ci_lockreq = CILR_NEVER;
 836                 io->ci_no_srvlock = 1;
 837         } else if (file->f_flags & O_APPEND) {
 838                 io->ci_lockreq = CILR_MANDATORY;
 839         }
 840 }
 841
 842 static ssize_t
 843 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
 844                    struct file *file, enum cl_io_type iot,
 845                    loff_t *ppos, size_t count)
 846 {
 847         struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
 848         struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
 849         struct cl_io         *io;
 850         ssize_t               result;
 851         ENTRY;
 852
 853         io = ccc_env_thread_io(env);
 854         ll_io_init(io, file, iot == CIT_WRITE);
 855
 856         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
 857                 struct vvp_io *vio = vvp_env_io(env);
 858                 struct ccc_io *cio = ccc_env_io(env);
 859                 int write_mutex_locked = 0;
 860
 861                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
 862                 vio->cui_io_subtype = args->via_io_subtype;
 863
 864                 switch (vio->cui_io_subtype) {
 865                 case IO_NORMAL:
 866                         cio->cui_iov = args->u.normal.via_iov;
 867                         cio->cui_nrsegs = args->u.normal.via_nrsegs;
 868                         cio->cui_tot_nrsegs = cio->cui_nrsegs;
 869 #ifndef HAVE_FILE_WRITEV
 870                         cio->cui_iocb = args->u.normal.via_iocb;
 871 #endif
 872                         if ((iot == CIT_WRITE) &&
 873                             !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
 874                                 if (mutex_lock_interruptible(&lli->
 875                                                                lli_write_mutex))
 876                                         GOTO(out, result = -ERESTARTSYS);
 877                                 write_mutex_locked = 1;
 878                         } else if (iot == CIT_READ) {
 879                                 down_read(&lli->lli_trunc_sem);
 880                         }
 881                         break;
 882                 case IO_SENDFILE:
 883                         vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
 884                         vio->u.sendfile.cui_target = args->u.sendfile.via_target;
 885                         break;
 886                 case IO_SPLICE:
 887                         vio->u.splice.cui_pipe = args->u.splice.via_pipe;
 888                         vio->u.splice.cui_flags = args->u.splice.via_flags;
 889                         break;
 890                 default:
 891                         CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
 892                         LBUG();
 893                 }
 894                 result = cl_io_loop(env, io);
 895                 if (write_mutex_locked)
 896                         mutex_unlock(&lli->lli_write_mutex);
 897                 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
 898                         up_read(&lli->lli_trunc_sem);
 899         } else {
 900                 /* cl_io_rw_init() handled IO */
 901                 result = io->ci_result;
 902         }
 903
 904         if (io->ci_nob > 0) {
 905                 result = io->ci_nob;
 906                 *ppos = io->u.ci_wr.wr.crw_pos;
 907         }
 908         GOTO(out, result);
 909 out:
 910         cl_io_fini(env, io);
 911
 912         if (iot == CIT_READ) {
 913                 if (result >= 0)
 914                         ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
 915                                            LPROC_LL_READ_BYTES, result);
 916         } else if (iot == CIT_WRITE) {
 917                 if (result >= 0) {
 918                         ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
 919                                            LPROC_LL_WRITE_BYTES, result);
 920                         fd->fd_write_failed = false;
 921                 } else {
 922                         fd->fd_write_failed = true;
 923                 }
 924         }
 925
 926         return result;
 927 }
 928
 929
 930 /*
 931  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 932  */
 933 static int ll_file_get_iov_count(const struct iovec *iov,
 934                                  unsigned long *nr_segs, size_t *count)
 935 {
 936         size_t cnt = 0;
 937         unsigned long seg;
 938
 939         for (seg = 0; seg < *nr_segs; seg++) {
 940                 const struct iovec *iv = &iov[seg];
 941
 942                 /*
 943                  * If any segment has a negative length, or the cumulative
 944                  * length ever wraps negative then return -EINVAL.
 945                  */
 946                 cnt += iv->iov_len;
 947                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
 948                         return -EINVAL;
 949                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
 950                         continue;
 951                 if (seg == 0)
 952                         return -EFAULT;
 953                 *nr_segs = seg;
 954                 cnt -= iv->iov_len;   /* This segment is no good */
 955                 break;
 956         }
 957         *count = cnt;
 958         return 0;
 959 }
 960
 961 #ifdef HAVE_FILE_READV
 962 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
 963                               unsigned long nr_segs, loff_t *ppos)
 964 {
 965         struct lu_env      *env;
 966         struct vvp_io_args *args;
 967         size_t              count;
 968         ssize_t             result;
 969         int                 refcheck;
 970         ENTRY;
 971
 972         result = ll_file_get_iov_count(iov, &nr_segs, &count);
 973         if (result)
 974                 RETURN(result);
 975
 976         env = cl_env_get(&refcheck);
 977         if (IS_ERR(env))
 978                 RETURN(PTR_ERR(env));
 979
 980         args = vvp_env_args(env, IO_NORMAL);
 981         args->u.normal.via_iov = (struct iovec *)iov;
 982         args->u.normal.via_nrsegs = nr_segs;
 983
 984         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
 985         cl_env_put(env, &refcheck);
 986         RETURN(result);
 987 }
 988
 989 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
 990                             loff_t *ppos)
 991 {
 992         struct lu_env *env;
 993         struct iovec  *local_iov;
 994         ssize_t        result;
 995         int            refcheck;
 996         ENTRY;
 997
 998         env = cl_env_get(&refcheck);
 999         if (IS_ERR(env))
1000                 RETURN(PTR_ERR(env));
1001
1002         local_iov = &vvp_env_info(env)->vti_local_iov;
1003         local_iov->iov_base = (void __user *)buf;
1004         local_iov->iov_len = count;
1005         result = ll_file_readv(file, local_iov, 1, ppos);
1006         cl_env_put(env, &refcheck);
1007         RETURN(result);
1008 }
1009
1010 #else
1011 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1012                                 unsigned long nr_segs, loff_t pos)
1013 {
1014         struct lu_env      *env;
1015         struct vvp_io_args *args;
1016         size_t              count;
1017         ssize_t             result;
1018         int                 refcheck;
1019         ENTRY;
1020
1021         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1022         if (result)
1023                 RETURN(result);
1024
1025         env = cl_env_get(&refcheck);
1026         if (IS_ERR(env))
1027                 RETURN(PTR_ERR(env));
1028
1029         args = vvp_env_args(env, IO_NORMAL);
1030         args->u.normal.via_iov = (struct iovec *)iov;
1031         args->u.normal.via_nrsegs = nr_segs;
1032         args->u.normal.via_iocb = iocb;
1033
1034         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1035                                     &iocb->ki_pos, count);
1036         cl_env_put(env, &refcheck);
1037         RETURN(result);
1038 }
1039
1040 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1041                             loff_t *ppos)
1042 {
1043         struct lu_env *env;
1044         struct iovec  *local_iov;
1045         struct kiocb  *kiocb;
1046         ssize_t        result;
1047         int            refcheck;
1048         ENTRY;
1049
1050         env = cl_env_get(&refcheck);
1051         if (IS_ERR(env))
1052                 RETURN(PTR_ERR(env));
1053
1054         local_iov = &vvp_env_info(env)->vti_local_iov;
1055         kiocb = &vvp_env_info(env)->vti_kiocb;
1056         local_iov->iov_base = (void __user *)buf;
1057         local_iov->iov_len = count;
1058         init_sync_kiocb(kiocb, file);
1059         kiocb->ki_pos = *ppos;
1060         kiocb->ki_left = count;
1061
1062         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1063         *ppos = kiocb->ki_pos;
1064
1065         cl_env_put(env, &refcheck);
1066         RETURN(result);
1067 }
1068 #endif
1069
1070 /*
1071  * Write to a file (through the page cache).
1072  */
1073 #ifdef HAVE_FILE_WRITEV
1074 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1075                               unsigned long nr_segs, loff_t *ppos)
1076 {
1077         struct lu_env      *env;
1078         struct vvp_io_args *args;
1079         size_t              count;
1080         ssize_t             result;
1081         int                 refcheck;
1082         ENTRY;
1083
1084         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1085         if (result)
1086                 RETURN(result);
1087
1088         env = cl_env_get(&refcheck);
1089         if (IS_ERR(env))
1090                 RETURN(PTR_ERR(env));
1091
1092         args = vvp_env_args(env, IO_NORMAL);
1093         args->u.normal.via_iov = (struct iovec *)iov;
1094         args->u.normal.via_nrsegs = nr_segs;
1095
1096         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1097         cl_env_put(env, &refcheck);
1098         RETURN(result);
1099 }
1100
1101 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1102                              loff_t *ppos)
1103 {
1104         struct lu_env    *env;
1105         struct iovec     *local_iov;
1106         ssize_t           result;
1107         int               refcheck;
1108         ENTRY;
1109
1110         env = cl_env_get(&refcheck);
1111         if (IS_ERR(env))
1112                 RETURN(PTR_ERR(env));
1113
1114         local_iov = &vvp_env_info(env)->vti_local_iov;
1115         local_iov->iov_base = (void __user *)buf;
1116         local_iov->iov_len = count;
1117
1118         result = ll_file_writev(file, local_iov, 1, ppos);
1119         cl_env_put(env, &refcheck);
1120         RETURN(result);
1121 }
1122
1123 #else /* AIO stuff */
1124 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1125                                  unsigned long nr_segs, loff_t pos)
1126 {
1127         struct lu_env      *env;
1128         struct vvp_io_args *args;
1129         size_t              count;
1130         ssize_t             result;
1131         int                 refcheck;
1132         ENTRY;
1133
1134         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1135         if (result)
1136                 RETURN(result);
1137
1138         env = cl_env_get(&refcheck);
1139         if (IS_ERR(env))
1140                 RETURN(PTR_ERR(env));
1141
1142         args = vvp_env_args(env, IO_NORMAL);
1143         args->u.normal.via_iov = (struct iovec *)iov;
1144         args->u.normal.via_nrsegs = nr_segs;
1145         args->u.normal.via_iocb = iocb;
1146
1147         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1148                                   &iocb->ki_pos, count);
1149         cl_env_put(env, &refcheck);
1150         RETURN(result);
1151 }
1152
1153 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1154                              loff_t *ppos)
1155 {
1156         struct lu_env *env;
1157         struct iovec  *local_iov;
1158         struct kiocb  *kiocb;
1159         ssize_t        result;
1160         int            refcheck;
1161         ENTRY;
1162
1163         env = cl_env_get(&refcheck);
1164         if (IS_ERR(env))
1165                 RETURN(PTR_ERR(env));
1166
1167         local_iov = &vvp_env_info(env)->vti_local_iov;
1168         kiocb = &vvp_env_info(env)->vti_kiocb;
1169         local_iov->iov_base = (void __user *)buf;
1170         local_iov->iov_len = count;
1171         init_sync_kiocb(kiocb, file);
1172         kiocb->ki_pos = *ppos;
1173         kiocb->ki_left = count;
1174
1175         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1176         *ppos = kiocb->ki_pos;
1177
1178         cl_env_put(env, &refcheck);
1179         RETURN(result);
1180 }
1181 #endif
1182
1183
1184 #ifdef HAVE_KERNEL_SENDFILE
1185 /*
1186  * Send file content (through pagecache) somewhere with helper
1187  */
1188 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1189                                 read_actor_t actor, void *target)
1190 {
1191         struct lu_env      *env;
1192         struct vvp_io_args *args;
1193         ssize_t             result;
1194         int                 refcheck;
1195         ENTRY;
1196
1197         env = cl_env_get(&refcheck);
1198         if (IS_ERR(env))
1199                 RETURN(PTR_ERR(env));
1200
1201         args = vvp_env_args(env, IO_SENDFILE);
1202         args->u.sendfile.via_target = target;
1203         args->u.sendfile.via_actor = actor;
1204
1205         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1206         cl_env_put(env, &refcheck);
1207         RETURN(result);
1208 }
1209 #endif
1210
1211 #ifdef HAVE_KERNEL_SPLICE_READ
1212 /*
1213  * Send file content (through pagecache) somewhere with helper
1214  */
1215 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1216                                    struct pipe_inode_info *pipe, size_t count,
1217                                    unsigned int flags)
1218 {
1219         struct lu_env      *env;
1220         struct vvp_io_args *args;
1221         ssize_t             result;
1222         int                 refcheck;
1223         ENTRY;
1224
1225         env = cl_env_get(&refcheck);
1226         if (IS_ERR(env))
1227                 RETURN(PTR_ERR(env));
1228
1229         args = vvp_env_args(env, IO_SPLICE);
1230         args->u.splice.via_pipe = pipe;
1231         args->u.splice.via_flags = flags;
1232
1233         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1234         cl_env_put(env, &refcheck);
1235         RETURN(result);
1236 }
1237 #endif
1238
1239 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1240                            obd_count ost_idx)
1241 {
1242         struct obd_export *exp = ll_i2dtexp(inode);
1243         struct obd_trans_info oti = { 0 };
1244         struct obdo *oa = NULL;
1245         int lsm_size;
1246         int rc = 0;
1247         struct lov_stripe_md *lsm = NULL, *lsm2;
1248         ENTRY;
1249
1250         OBDO_ALLOC(oa);
1251         if (oa == NULL)
1252                 RETURN(-ENOMEM);
1253
1254         lsm = ccc_inode_lsm_get(inode);
1255         if (lsm == NULL)
1256                 GOTO(out, rc = -ENOENT);
1257
1258         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1259                    (lsm->lsm_stripe_count));
1260
1261         OBD_ALLOC_LARGE(lsm2, lsm_size);
1262         if (lsm2 == NULL)
1263                 GOTO(out, rc = -ENOMEM);
1264
1265         oa->o_id = id;
1266         oa->o_seq = seq;
1267         oa->o_nlink = ost_idx;
1268         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1269         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1270         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1271                                    OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1272         obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1273         memcpy(lsm2, lsm, lsm_size);
1274         ll_inode_size_lock(inode);
1275         rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1276         ll_inode_size_unlock(inode);
1277
1278         OBD_FREE_LARGE(lsm2, lsm_size);
1279         GOTO(out, rc);
1280 out:
1281         ccc_inode_lsm_put(inode, lsm);
1282         OBDO_FREE(oa);
1283         return rc;
1284 }
1285
1286 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1287 {
1288         struct ll_recreate_obj ucreat;
1289         ENTRY;
1290
1291         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1292                 RETURN(-EPERM);
1293
1294         if (cfs_copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1295                                sizeof(struct ll_recreate_obj)))
1296                 RETURN(-EFAULT);
1297
1298         RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1299                                ucreat.lrc_ost_idx));
1300 }
1301
1302 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1303 {
1304         struct lu_fid fid;
1305         obd_id id;
1306         obd_count ost_idx;
1307         ENTRY;
1308
1309         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1310                 RETURN(-EPERM);
1311
1312         if (cfs_copy_from_user(&fid, (struct lu_fid *)arg,
1313                                sizeof(struct lu_fid)))
1314                 RETURN(-EFAULT);
1315
1316         id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1317         ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1318         RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
1319 }
1320
1321 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1322                              int flags, struct lov_user_md *lum, int lum_size)
1323 {
1324         struct lov_stripe_md *lsm = NULL;
1325         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1326         int rc = 0;
1327         ENTRY;
1328
1329         lsm = ccc_inode_lsm_get(inode);
1330         if (lsm != NULL) {
1331                 ccc_inode_lsm_put(inode, lsm);
1332                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1333                        inode->i_ino);
1334                 RETURN(-EEXIST);
1335         }
1336
1337         ll_inode_size_lock(inode);
1338         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1339         if (rc)
1340                 GOTO(out, rc);
1341         rc = oit.d.lustre.it_status;
1342         if (rc < 0)
1343                 GOTO(out_req_free, rc);
1344
1345         ll_release_openhandle(file->f_dentry, &oit);
1346
1347  out:
1348         ll_inode_size_unlock(inode);
1349         ll_intent_release(&oit);
1350         ccc_inode_lsm_put(inode, lsm);
1351         RETURN(rc);
1352 out_req_free:
1353         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1354         goto out;
1355 }
1356
1357 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1358                              struct lov_mds_md **lmmp, int *lmm_size,
1359                              struct ptlrpc_request **request)
1360 {
1361         struct ll_sb_info *sbi = ll_i2sbi(inode);
1362         struct mdt_body  *body;
1363         struct lov_mds_md *lmm = NULL;
1364         struct ptlrpc_request *req = NULL;
1365         struct md_op_data *op_data;
1366         int rc, lmmsize;
1367
1368         rc = ll_get_max_mdsize(sbi, &lmmsize);
1369         if (rc)
1370                 RETURN(rc);
1371
1372         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1373                                      strlen(filename), lmmsize,
1374                                      LUSTRE_OPC_ANY, NULL);
1375         if (IS_ERR(op_data))
1376                 RETURN(PTR_ERR(op_data));
1377
1378         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1379         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1380         ll_finish_md_op_data(op_data);
1381         if (rc < 0) {
1382                 CDEBUG(D_INFO, "md_getattr_name failed "
1383                        "on %s: rc %d\n", filename, rc);
1384                 GOTO(out, rc);
1385         }
1386
1387         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1388         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1389
1390         lmmsize = body->eadatasize;
1391
1392         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1393                         lmmsize == 0) {
1394                 GOTO(out, rc = -ENODATA);
1395         }
1396
1397         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1398         LASSERT(lmm != NULL);
1399
1400         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1401             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1402                 GOTO(out, rc = -EPROTO);
1403         }
1404
1405         /*
1406          * This is coming from the MDS, so is probably in
1407          * little endian.  We convert it to host endian before
1408          * passing it to userspace.
1409          */
1410         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1411                 /* if function called for directory - we should
1412                  * avoid swab not existent lsm objects */
1413                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1414                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1415                         if (S_ISREG(body->mode))
1416                                 lustre_swab_lov_user_md_objects(
1417                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1418                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1419                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1420                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1421                         if (S_ISREG(body->mode))
1422                                 lustre_swab_lov_user_md_objects(
1423                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1424                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1425                 }
1426         }
1427
1428 out:
1429         *lmmp = lmm;
1430         *lmm_size = lmmsize;
1431         *request = req;
1432         return rc;
1433 }
1434
1435 static int ll_lov_setea(struct inode *inode, struct file *file,
1436                             unsigned long arg)
1437 {
1438         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1439         struct lov_user_md  *lump;
1440         int lum_size = sizeof(struct lov_user_md) +
1441                        sizeof(struct lov_user_ost_data);
1442         int rc;
1443         ENTRY;
1444
1445         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1446                 RETURN(-EPERM);
1447
1448         OBD_ALLOC_LARGE(lump, lum_size);
1449         if (lump == NULL) {
1450                 RETURN(-ENOMEM);
1451         }
1452         if (cfs_copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1453                 OBD_FREE_LARGE(lump, lum_size);
1454                 RETURN(-EFAULT);
1455         }
1456
1457         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1458
1459         OBD_FREE_LARGE(lump, lum_size);
1460         RETURN(rc);
1461 }
1462
1463 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1464                             unsigned long arg)
1465 {
1466         struct lov_user_md_v3 lumv3;
1467         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1468         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1469         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1470         int lum_size;
1471         int rc;
1472         int flags = FMODE_WRITE;
1473         ENTRY;
1474
1475         /* first try with v1 which is smaller than v3 */
1476         lum_size = sizeof(struct lov_user_md_v1);
1477         if (cfs_copy_from_user(lumv1, lumv1p, lum_size))
1478                 RETURN(-EFAULT);
1479
1480         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1481                 lum_size = sizeof(struct lov_user_md_v3);
1482                 if (cfs_copy_from_user(&lumv3, lumv3p, lum_size))
1483                         RETURN(-EFAULT);
1484         }
1485
1486         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1487         if (rc == 0) {
1488                 struct lov_stripe_md *lsm;
1489                 __u32 gen;
1490
1491                 put_user(0, &lumv1p->lmm_stripe_count);
1492
1493                 ll_layout_refresh(inode, &gen);
1494                 lsm = ccc_inode_lsm_get(inode);
1495                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1496                                    0, lsm, (void *)arg);
1497                 ccc_inode_lsm_put(inode, lsm);
1498         }
1499         RETURN(rc);
1500 }
1501
1502 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1503 {
1504         struct lov_stripe_md *lsm;
1505         int rc = -ENODATA;
1506         ENTRY;
1507
1508         lsm = ccc_inode_lsm_get(inode);
1509         if (lsm != NULL)
1510                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1511                                    lsm, (void *)arg);
1512         ccc_inode_lsm_put(inode, lsm);
1513         RETURN(rc);
1514 }
1515
1516 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1517 {
1518         struct ll_inode_info   *lli = ll_i2info(inode);
1519         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1520         struct ccc_grouplock    grouplock;
1521         int                     rc;
1522         ENTRY;
1523
1524         if (ll_file_nolock(file))
1525                 RETURN(-EOPNOTSUPP);
1526
1527         spin_lock(&lli->lli_lock);
1528         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1529                 CWARN("group lock already existed with gid %lu\n",
1530                       fd->fd_grouplock.cg_gid);
1531                 spin_unlock(&lli->lli_lock);
1532                 RETURN(-EINVAL);
1533         }
1534         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1535         spin_unlock(&lli->lli_lock);
1536
1537         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1538                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1539         if (rc)
1540                 RETURN(rc);
1541
1542         spin_lock(&lli->lli_lock);
1543         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1544                 spin_unlock(&lli->lli_lock);
1545                 CERROR("another thread just won the race\n");
1546                 cl_put_grouplock(&grouplock);
1547                 RETURN(-EINVAL);
1548         }
1549
1550         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1551         fd->fd_grouplock = grouplock;
1552         spin_unlock(&lli->lli_lock);
1553
1554         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1555         RETURN(0);
1556 }
1557
1558 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1559 {
1560         struct ll_inode_info   *lli = ll_i2info(inode);
1561         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1562         struct ccc_grouplock    grouplock;
1563         ENTRY;
1564
1565         spin_lock(&lli->lli_lock);
1566         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1567                 spin_unlock(&lli->lli_lock);
1568                 CWARN("no group lock held\n");
1569                 RETURN(-EINVAL);
1570         }
1571         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1572
1573         if (fd->fd_grouplock.cg_gid != arg) {
1574                 CWARN("group lock %lu doesn't match current id %lu\n",
1575                        arg, fd->fd_grouplock.cg_gid);
1576                 spin_unlock(&lli->lli_lock);
1577                 RETURN(-EINVAL);
1578         }
1579
1580         grouplock = fd->fd_grouplock;
1581         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1582         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1583         spin_unlock(&lli->lli_lock);
1584
1585         cl_put_grouplock(&grouplock);
1586         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1587         RETURN(0);
1588 }
1589
1590 /**
1591  * Close inode open handle
1592  *
1593  * \param dentry [in]     dentry which contains the inode
1594  * \param it     [in,out] intent which contains open info and result
1595  *
1596  * \retval 0     success
1597  * \retval <0    failure
1598  */
1599 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1600 {
1601         struct inode *inode = dentry->d_inode;
1602         struct obd_client_handle *och;
1603         int rc;
1604         ENTRY;
1605
1606         LASSERT(inode);
1607
1608         /* Root ? Do nothing. */
1609         if (dentry->d_inode->i_sb->s_root == dentry)
1610                 RETURN(0);
1611
1612         /* No open handle to close? Move away */
1613         if (!it_disposition(it, DISP_OPEN_OPEN))
1614                 RETURN(0);
1615
1616         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1617
1618         OBD_ALLOC(och, sizeof(*och));
1619         if (!och)
1620                 GOTO(out, rc = -ENOMEM);
1621
1622         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1623                     ll_i2info(inode), it, och);
1624
1625         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1626                                        inode, och);
1627  out:
1628         /* this one is in place of ll_file_open */
1629         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1630                 ptlrpc_req_finished(it->d.lustre.it_data);
1631                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1632         }
1633         RETURN(rc);
1634 }
1635
1636 /**
1637  * Get size for inode for which FIEMAP mapping is requested.
1638  * Make the FIEMAP get_info call and returns the result.
1639  */
1640 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1641               int num_bytes)
1642 {
1643         struct obd_export *exp = ll_i2dtexp(inode);
1644         struct lov_stripe_md *lsm = NULL;
1645         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1646         int vallen = num_bytes;
1647         int rc;
1648         ENTRY;
1649
1650         /* Checks for fiemap flags */
1651         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1652                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1653                 return -EBADR;
1654         }
1655
1656         /* Check for FIEMAP_FLAG_SYNC */
1657         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1658                 rc = filemap_fdatawrite(inode->i_mapping);
1659                 if (rc)
1660                         return rc;
1661         }
1662
1663         lsm = ccc_inode_lsm_get(inode);
1664         if (lsm == NULL)
1665                 return -ENOENT;
1666
1667         /* If the stripe_count > 1 and the application does not understand
1668          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1669          */
1670         if (lsm->lsm_stripe_count > 1 &&
1671             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1672                 GOTO(out, rc = -EOPNOTSUPP);
1673
1674         fm_key.oa.o_id = lsm->lsm_object_id;
1675         fm_key.oa.o_seq = lsm->lsm_object_seq;
1676         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1677
1678         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1679         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1680         /* If filesize is 0, then there would be no objects for mapping */
1681         if (fm_key.oa.o_size == 0) {
1682                 fiemap->fm_mapped_extents = 0;
1683                 GOTO(out, rc = 0);
1684         }
1685
1686         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1687
1688         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1689                           fiemap, lsm);
1690         if (rc)
1691                 CERROR("obd_get_info failed: rc = %d\n", rc);
1692
1693 out:
1694         ccc_inode_lsm_put(inode, lsm);
1695         RETURN(rc);
1696 }
1697
1698 int ll_fid2path(struct inode *inode, void *arg)
1699 {
1700         struct obd_export *exp = ll_i2mdexp(inode);
1701         struct getinfo_fid2path *gfout, *gfin;
1702         int outsize, rc;
1703         ENTRY;
1704
1705         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1706             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1707                 RETURN(-EPERM);
1708
1709         /* Need to get the buflen */
1710         OBD_ALLOC_PTR(gfin);
1711         if (gfin == NULL)
1712                 RETURN(-ENOMEM);
1713         if (cfs_copy_from_user(gfin, arg, sizeof(*gfin))) {
1714                 OBD_FREE_PTR(gfin);
1715                 RETURN(-EFAULT);
1716         }
1717
1718         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1719         OBD_ALLOC(gfout, outsize);
1720         if (gfout == NULL) {
1721                 OBD_FREE_PTR(gfin);
1722                 RETURN(-ENOMEM);
1723         }
1724         memcpy(gfout, gfin, sizeof(*gfout));
1725         OBD_FREE_PTR(gfin);
1726
1727         /* Call mdc_iocontrol */
1728         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1729         if (rc)
1730                 GOTO(gf_free, rc);
1731         if (cfs_copy_to_user(arg, gfout, outsize))
1732                 rc = -EFAULT;
1733
1734 gf_free:
1735         OBD_FREE(gfout, outsize);
1736         RETURN(rc);
1737 }
1738
1739 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1740 {
1741         struct ll_user_fiemap *fiemap_s;
1742         size_t num_bytes, ret_bytes;
1743         unsigned int extent_count;
1744         int rc = 0;
1745
1746         /* Get the extent count so we can calculate the size of
1747          * required fiemap buffer */
1748         if (get_user(extent_count,
1749             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1750                 RETURN(-EFAULT);
1751         num_bytes = sizeof(*fiemap_s) + (extent_count *
1752                                          sizeof(struct ll_fiemap_extent));
1753
1754         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1755         if (fiemap_s == NULL)
1756                 RETURN(-ENOMEM);
1757
1758         /* get the fiemap value */
1759         if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1760                            sizeof(*fiemap_s)))
1761                 GOTO(error, rc = -EFAULT);
1762
1763         /* If fm_extent_count is non-zero, read the first extent since
1764          * it is used to calculate end_offset and device from previous
1765          * fiemap call. */
1766         if (extent_count) {
1767                 if (copy_from_user(&fiemap_s->fm_extents[0],
1768                     (char __user *)arg + sizeof(*fiemap_s),
1769                     sizeof(struct ll_fiemap_extent)))
1770                         GOTO(error, rc = -EFAULT);
1771         }
1772
1773         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1774         if (rc)
1775                 GOTO(error, rc);
1776
1777         ret_bytes = sizeof(struct ll_user_fiemap);
1778
1779         if (extent_count != 0)
1780                 ret_bytes += (fiemap_s->fm_mapped_extents *
1781                                  sizeof(struct ll_fiemap_extent));
1782
1783         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1784                 rc = -EFAULT;
1785
1786 error:
1787         OBD_FREE_LARGE(fiemap_s, num_bytes);
1788         RETURN(rc);
1789 }
1790
1791 /*
1792  * Read the data_version for inode.
1793  *
1794  * This value is computed using stripe object version on OST.
1795  * Version is computed using server side locking.
1796  *
1797  * @param extent_lock  Take extent lock. Not needed if a process is already
1798  *                     holding the OST object group locks.
1799  */
1800 static int ll_data_version(struct inode *inode, __u64 *data_version,
1801                            int extent_lock)
1802 {
1803         struct lov_stripe_md *lsm = NULL;
1804         struct ll_sb_info    *sbi = ll_i2sbi(inode);
1805         struct obdo          *obdo = NULL;
1806         int                   rc;
1807         ENTRY;
1808
1809         /* If no stripe, we consider version is 0. */
1810         lsm = ccc_inode_lsm_get(inode);
1811         if (lsm == NULL) {
1812                 *data_version = 0;
1813                 CDEBUG(D_INODE, "No object for inode\n");
1814                 RETURN(0);
1815         }
1816
1817         OBD_ALLOC_PTR(obdo);
1818         if (obdo == NULL) {
1819                 ccc_inode_lsm_put(inode, lsm);
1820                 RETURN(-ENOMEM);
1821         }
1822
1823         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1824         if (!rc) {
1825                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1826                         rc = -EOPNOTSUPP;
1827                 else
1828                         *data_version = obdo->o_data_version;
1829         }
1830
1831         OBD_FREE_PTR(obdo);
1832         ccc_inode_lsm_put(inode, lsm);
1833
1834         RETURN(rc);
1835 }
1836
1837 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1838 {
1839         struct inode *inode = file->f_dentry->d_inode;
1840         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1841         int flags;
1842
1843         ENTRY;
1844
1845         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1846                inode->i_generation, inode, cmd);
1847         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1848
1849         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1850         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1851                 RETURN(-ENOTTY);
1852
1853         switch(cmd) {
1854         case LL_IOC_GETFLAGS:
1855                 /* Get the current value of the file flags */
1856                 return put_user(fd->fd_flags, (int *)arg);
1857         case LL_IOC_SETFLAGS:
1858         case LL_IOC_CLRFLAGS:
1859                 /* Set or clear specific file flags */
1860                 /* XXX This probably needs checks to ensure the flags are
1861                  *     not abused, and to handle any flag side effects.
1862                  */
1863                 if (get_user(flags, (int *) arg))
1864                         RETURN(-EFAULT);
1865
1866                 if (cmd == LL_IOC_SETFLAGS) {
1867                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1868                             !(file->f_flags & O_DIRECT)) {
1869                                 CERROR("%s: unable to disable locking on "
1870                                        "non-O_DIRECT file\n", current->comm);
1871                                 RETURN(-EINVAL);
1872                         }
1873
1874                         fd->fd_flags |= flags;
1875                 } else {
1876                         fd->fd_flags &= ~flags;
1877                 }
1878                 RETURN(0);
1879         case LL_IOC_LOV_SETSTRIPE:
1880                 RETURN(ll_lov_setstripe(inode, file, arg));
1881         case LL_IOC_LOV_SETEA:
1882                 RETURN(ll_lov_setea(inode, file, arg));
1883         case LL_IOC_LOV_GETSTRIPE:
1884                 RETURN(ll_lov_getstripe(inode, arg));
1885         case LL_IOC_RECREATE_OBJ:
1886                 RETURN(ll_lov_recreate_obj(inode, arg));
1887         case LL_IOC_RECREATE_FID:
1888                 RETURN(ll_lov_recreate_fid(inode, arg));
1889         case FSFILT_IOC_FIEMAP:
1890                 RETURN(ll_ioctl_fiemap(inode, arg));
1891         case FSFILT_IOC_GETFLAGS:
1892         case FSFILT_IOC_SETFLAGS:
1893                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1894         case FSFILT_IOC_GETVERSION_OLD:
1895         case FSFILT_IOC_GETVERSION:
1896                 RETURN(put_user(inode->i_generation, (int *)arg));
1897         case LL_IOC_GROUP_LOCK:
1898                 RETURN(ll_get_grouplock(inode, file, arg));
1899         case LL_IOC_GROUP_UNLOCK:
1900                 RETURN(ll_put_grouplock(inode, file, arg));
1901         case IOC_OBD_STATFS:
1902                 RETURN(ll_obd_statfs(inode, (void *)arg));
1903
1904         /* We need to special case any other ioctls we want to handle,
1905          * to send them to the MDS/OST as appropriate and to properly
1906          * network encode the arg field.
1907         case FSFILT_IOC_SETVERSION_OLD:
1908         case FSFILT_IOC_SETVERSION:
1909         */
1910         case LL_IOC_FLUSHCTX:
1911                 RETURN(ll_flush_ctx(inode));
1912         case LL_IOC_PATH2FID: {
1913                 if (cfs_copy_to_user((void *)arg, ll_inode2fid(inode),
1914                                      sizeof(struct lu_fid)))
1915                         RETURN(-EFAULT);
1916
1917                 RETURN(0);
1918         }
1919         case OBD_IOC_FID2PATH:
1920                 RETURN(ll_fid2path(inode, (void *)arg));
1921         case LL_IOC_DATA_VERSION: {
1922                 struct ioc_data_version idv;
1923                 int rc;
1924
1925                 if (cfs_copy_from_user(&idv, (char *)arg, sizeof(idv)))
1926                         RETURN(-EFAULT);
1927
1928                 rc = ll_data_version(inode, &idv.idv_version,
1929                                      !(idv.idv_flags & LL_DV_NOFLUSH));
1930
1931                 if (rc == 0 &&
1932                     cfs_copy_to_user((char *) arg, &idv, sizeof(idv)))
1933                         RETURN(-EFAULT);
1934
1935                 RETURN(rc);
1936         }
1937
1938         case LL_IOC_GET_MDTIDX: {
1939                 int mdtidx;
1940
1941                 mdtidx = ll_get_mdt_idx(inode);
1942                 if (mdtidx < 0)
1943                         RETURN(mdtidx);
1944
1945                 if (put_user((int)mdtidx, (int*)arg))
1946                         RETURN(-EFAULT);
1947
1948                 RETURN(0);
1949         }
1950         case OBD_IOC_GETDTNAME:
1951         case OBD_IOC_GETMDNAME:
1952                 RETURN(ll_get_obd_name(inode, cmd, arg));
1953         default: {
1954                 int err;
1955
1956                 if (LLIOC_STOP ==
1957                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1958                         RETURN(err);
1959
1960                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1961                                      (void *)arg));
1962         }
1963         }
1964 }
1965
1966 #ifndef HAVE_FILE_LLSEEK_SIZE
1967 static inline loff_t
1968 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
1969 {
1970         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
1971                 return -EINVAL;
1972         if (offset > maxsize)
1973                 return -EINVAL;
1974
1975         if (offset != file->f_pos) {
1976                 file->f_pos = offset;
1977                 file->f_version = 0;
1978         }
1979         return offset;
1980 }
1981
1982 static loff_t
1983 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
1984                 loff_t maxsize, loff_t eof)
1985 {
1986         struct inode *inode = file->f_dentry->d_inode;
1987
1988         switch (origin) {
1989         case SEEK_END:
1990                 offset += eof;
1991                 break;
1992         case SEEK_CUR:
1993                 /*
1994                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
1995                  * position-querying operation.  Avoid rewriting the "same"
1996                  * f_pos value back to the file because a concurrent read(),
1997                  * write() or lseek() might have altered it
1998                  */
1999                 if (offset == 0)
2000                         return file->f_pos;
2001                 /*
2002                  * f_lock protects against read/modify/write race with other
2003                  * SEEK_CURs. Note that parallel writes and reads behave
2004                  * like SEEK_SET.
2005                  */
2006                 mutex_lock(&inode->i_mutex);
2007                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2008                 mutex_unlock(&inode->i_mutex);
2009                 return offset;
2010         case SEEK_DATA:
2011                 /*
2012                  * In the generic case the entire file is data, so as long as
2013                  * offset isn't at the end of the file then the offset is data.
2014                  */
2015                 if (offset >= eof)
2016                         return -ENXIO;
2017                 break;
2018         case SEEK_HOLE:
2019                 /*
2020                  * There is a virtual hole at the end of the file, so as long as
2021                  * offset isn't i_size or larger, return i_size.
2022                  */
2023                 if (offset >= eof)
2024                         return -ENXIO;
2025                 offset = eof;
2026                 break;
2027         }
2028
2029         return llseek_execute(file, offset, maxsize);
2030 }
2031 #endif
2032
2033 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2034 {
2035         struct inode *inode = file->f_dentry->d_inode;
2036         loff_t retval, eof = 0;
2037
2038         ENTRY;
2039         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2040                            (origin == SEEK_CUR) ? file->f_pos : 0);
2041         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2042                inode->i_ino, inode->i_generation, inode, retval, retval,
2043                origin);
2044         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2045
2046         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2047                 retval = ll_glimpse_size(inode);
2048                 if (retval != 0)
2049                         RETURN(retval);
2050                 eof = i_size_read(inode);
2051         }
2052
2053         retval = generic_file_llseek_size(file, offset, origin,
2054                                           ll_file_maxbytes(inode), eof);
2055         RETURN(retval);
2056 }
2057
2058 int ll_flush(struct file *file, fl_owner_t id)
2059 {
2060         struct inode *inode = file->f_dentry->d_inode;
2061         struct ll_inode_info *lli = ll_i2info(inode);
2062         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2063         int rc, err;
2064
2065         LASSERT(!S_ISDIR(inode->i_mode));
2066
2067         /* catch async errors that were recorded back when async writeback
2068          * failed for pages in this mapping. */
2069         rc = lli->lli_async_rc;
2070         lli->lli_async_rc = 0;
2071         err = lov_read_and_clear_async_rc(lli->lli_clob);
2072         if (rc == 0)
2073                 rc = err;
2074
2075         /* The application has been told write failure already.
2076          * Do not report failure again. */
2077         if (fd->fd_write_failed)
2078                 return 0;
2079         return rc ? -EIO : 0;
2080 }
2081
2082 /**
2083  * Called to make sure a portion of file has been written out.
2084  * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2085  *
2086  * Return how many pages have been written.
2087  */
2088 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2089                        enum cl_fsync_mode mode)
2090 {
2091         struct cl_env_nest nest;
2092         struct lu_env *env;
2093         struct cl_io *io;
2094         struct obd_capa *capa = NULL;
2095         struct cl_fsync_io *fio;
2096         int result;
2097         ENTRY;
2098
2099         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2100             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2101                 RETURN(-EINVAL);
2102
2103         env = cl_env_nested_get(&nest);
2104         if (IS_ERR(env))
2105                 RETURN(PTR_ERR(env));
2106
2107         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2108
2109         io = ccc_env_thread_io(env);
2110         io->ci_obj = cl_i2info(inode)->lli_clob;
2111         io->ci_ignore_layout = 1;
2112
2113         /* initialize parameters for sync */
2114         fio = &io->u.ci_fsync;
2115         fio->fi_capa = capa;
2116         fio->fi_start = start;
2117         fio->fi_end = end;
2118         fio->fi_fid = ll_inode2fid(inode);
2119         fio->fi_mode = mode;
2120         fio->fi_nr_written = 0;
2121
2122         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2123                 result = cl_io_loop(env, io);
2124         else
2125                 result = io->ci_result;
2126         if (result == 0)
2127                 result = fio->fi_nr_written;
2128         cl_io_fini(env, io);
2129         cl_env_nested_put(&nest, env);
2130
2131         capa_put(capa);
2132
2133         RETURN(result);
2134 }
2135
2136 #ifdef HAVE_FILE_FSYNC_4ARGS
2137 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2138 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2139 int ll_fsync(struct file *file, int data)
2140 #else
2141 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2142 #endif
2143 {
2144         struct inode *inode = file->f_dentry->d_inode;
2145         struct ll_inode_info *lli = ll_i2info(inode);
2146         struct ptlrpc_request *req;
2147         struct obd_capa *oc;
2148         int rc, err;
2149         ENTRY;
2150
2151         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2152                inode->i_generation, inode);
2153         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2154
2155 #ifdef HAVE_FILE_FSYNC_4ARGS
2156         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2157         mutex_lock(&inode->i_mutex);
2158 #else
2159         /* fsync's caller has already called _fdata{sync,write}, we want
2160          * that IO to finish before calling the osc and mdc sync methods */
2161         rc = filemap_fdatawait(inode->i_mapping);
2162 #endif
2163
2164         /* catch async errors that were recorded back when async writeback
2165          * failed for pages in this mapping. */
2166         if (!S_ISDIR(inode->i_mode)) {
2167                 err = lli->lli_async_rc;
2168                 lli->lli_async_rc = 0;
2169                 if (rc == 0)
2170                         rc = err;
2171                 err = lov_read_and_clear_async_rc(lli->lli_clob);
2172                 if (rc == 0)
2173                         rc = err;
2174         }
2175
2176         oc = ll_mdscapa_get(inode);
2177         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2178                       &req);
2179         capa_put(oc);
2180         if (!rc)
2181                 rc = err;
2182         if (!err)
2183                 ptlrpc_req_finished(req);
2184
2185         if (data) {
2186                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2187
2188                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2189                                 CL_FSYNC_ALL);
2190                 if (rc == 0 && err < 0)
2191                         rc = err;
2192                 if (rc < 0)
2193                         fd->fd_write_failed = true;
2194                 else
2195                         fd->fd_write_failed = false;
2196         }
2197
2198 #ifdef HAVE_FILE_FSYNC_4ARGS
2199         mutex_unlock(&inode->i_mutex);
2200 #endif
2201         RETURN(rc);
2202 }
2203
2204 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2205 {
2206         struct inode *inode = file->f_dentry->d_inode;
2207         struct ll_sb_info *sbi = ll_i2sbi(inode);
2208         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2209                                            .ei_cb_cp =ldlm_flock_completion_ast,
2210                                            .ei_cbdata = file_lock };
2211         struct md_op_data *op_data;
2212         struct lustre_handle lockh = {0};
2213         ldlm_policy_data_t flock = {{0}};
2214         int flags = 0;
2215         int rc;
2216         ENTRY;
2217
2218         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2219                inode->i_ino, file_lock);
2220
2221         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2222
2223         if (file_lock->fl_flags & FL_FLOCK) {
2224                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2225                 /* flocks are whole-file locks */
2226                 flock.l_flock.end = OFFSET_MAX;
2227                 /* For flocks owner is determined by the local file desctiptor*/
2228                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2229         } else if (file_lock->fl_flags & FL_POSIX) {
2230                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2231                 flock.l_flock.start = file_lock->fl_start;
2232                 flock.l_flock.end = file_lock->fl_end;
2233         } else {
2234                 RETURN(-EINVAL);
2235         }
2236         flock.l_flock.pid = file_lock->fl_pid;
2237
2238         /* Somewhat ugly workaround for svc lockd.
2239          * lockd installs custom fl_lmops->lm_compare_owner that checks
2240          * for the fl_owner to be the same (which it always is on local node
2241          * I guess between lockd processes) and then compares pid.
2242          * As such we assign pid to the owner field to make it all work,
2243          * conflict with normal locks is unlikely since pid space and
2244          * pointer space for current->files are not intersecting */
2245         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2246                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2247
2248         switch (file_lock->fl_type) {
2249         case F_RDLCK:
2250                 einfo.ei_mode = LCK_PR;
2251                 break;
2252         case F_UNLCK:
2253                 /* An unlock request may or may not have any relation to
2254                  * existing locks so we may not be able to pass a lock handle
2255                  * via a normal ldlm_lock_cancel() request. The request may even
2256                  * unlock a byte range in the middle of an existing lock. In
2257                  * order to process an unlock request we need all of the same
2258                  * information that is given with a normal read or write record
2259                  * lock request. To avoid creating another ldlm unlock (cancel)
2260                  * message we'll treat a LCK_NL flock request as an unlock. */
2261                 einfo.ei_mode = LCK_NL;
2262                 break;
2263         case F_WRLCK:
2264                 einfo.ei_mode = LCK_PW;
2265                 break;
2266         default:
2267                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2268                         file_lock->fl_type);
2269                 RETURN (-ENOTSUPP);
2270         }
2271
2272         switch (cmd) {
2273         case F_SETLKW:
2274 #ifdef F_SETLKW64
2275         case F_SETLKW64:
2276 #endif
2277                 flags = 0;
2278                 break;
2279         case F_SETLK:
2280 #ifdef F_SETLK64
2281         case F_SETLK64:
2282 #endif
2283                 flags = LDLM_FL_BLOCK_NOWAIT;
2284                 break;
2285         case F_GETLK:
2286 #ifdef F_GETLK64
2287         case F_GETLK64:
2288 #endif
2289                 flags = LDLM_FL_TEST_LOCK;
2290                 /* Save the old mode so that if the mode in the lock changes we
2291                  * can decrement the appropriate reader or writer refcount. */
2292                 file_lock->fl_type = einfo.ei_mode;
2293                 break;
2294         default:
2295                 CERROR("unknown fcntl lock command: %d\n", cmd);
2296                 RETURN (-EINVAL);
2297         }
2298
2299         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2300                                      LUSTRE_OPC_ANY, NULL);
2301         if (IS_ERR(op_data))
2302                 RETURN(PTR_ERR(op_data));
2303
2304         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2305                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2306                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2307
2308         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2309                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2310
2311         ll_finish_md_op_data(op_data);
2312
2313         if ((file_lock->fl_flags & FL_FLOCK) &&
2314             (rc == 0 || file_lock->fl_type == F_UNLCK))
2315                 flock_lock_file_wait(file, file_lock);
2316         if ((file_lock->fl_flags & FL_POSIX) &&
2317             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2318             !(flags & LDLM_FL_TEST_LOCK))
2319                 posix_lock_file_wait(file, file_lock);
2320
2321         RETURN(rc);
2322 }
2323
2324 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2325 {
2326         ENTRY;
2327
2328         RETURN(-ENOSYS);
2329 }
2330
2331 /**
2332  * test if some locks matching bits and l_req_mode are acquired
2333  * - bits can be in different locks
2334  * - if found clear the common lock bits in *bits
2335  * - the bits not found, are kept in *bits
2336  * \param inode [IN]
2337  * \param bits [IN] searched lock bits [IN]
2338  * \param l_req_mode [IN] searched lock mode
2339  * \retval boolean, true iff all bits are found
2340  */
2341 int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2342 {
2343         struct lustre_handle lockh;
2344         ldlm_policy_data_t policy;
2345         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2346                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2347         struct lu_fid *fid;
2348         __u64 flags;
2349         int i;
2350         ENTRY;
2351
2352         if (!inode)
2353                RETURN(0);
2354
2355         fid = &ll_i2info(inode)->lli_fid;
2356         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2357                ldlm_lockname[mode]);
2358
2359         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2360         for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2361                 policy.l_inodebits.bits = *bits & (1 << i);
2362                 if (policy.l_inodebits.bits == 0)
2363                         continue;
2364
2365                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2366                                   &policy, mode, &lockh)) {
2367                         struct ldlm_lock *lock;
2368
2369                         lock = ldlm_handle2lock(&lockh);
2370                         if (lock) {
2371                                 *bits &=
2372                                       ~(lock->l_policy_data.l_inodebits.bits);
2373                                 LDLM_LOCK_PUT(lock);
2374                         } else {
2375                                 *bits &= ~policy.l_inodebits.bits;
2376                         }
2377                 }
2378         }
2379         RETURN(*bits == 0);
2380 }
2381
2382 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2383                             struct lustre_handle *lockh, __u64 flags)
2384 {
2385         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2386         struct lu_fid *fid;
2387         ldlm_mode_t rc;
2388         ENTRY;
2389
2390         fid = &ll_i2info(inode)->lli_fid;
2391         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2392
2393         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2394                            fid, LDLM_IBITS, &policy,
2395                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2396         RETURN(rc);
2397 }
2398
2399 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2400 {
2401         /* Already unlinked. Just update nlink and return success */
2402         if (rc == -ENOENT) {
2403                 clear_nlink(inode);
2404                 /* This path cannot be hit for regular files unless in
2405                  * case of obscure races, so no need to to validate
2406                  * size. */
2407                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2408                         return 0;
2409         } else if (rc != 0) {
2410                 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2411                        ll_get_fsname(inode->i_sb, NULL, 0),
2412                        PFID(ll_inode2fid(inode)), rc);
2413         }
2414
2415         return rc;
2416 }
2417
2418 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2419                              __u64 ibits)
2420 {
2421         struct inode *inode = dentry->d_inode;
2422         struct ptlrpc_request *req = NULL;
2423         struct obd_export *exp;
2424         int rc = 0;
2425         ENTRY;
2426
2427         LASSERT(inode != NULL);
2428
2429         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2430                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2431
2432         exp = ll_i2mdexp(inode);
2433
2434         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2435          *      But under CMD case, it caused some lock issues, should be fixed
2436          *      with new CMD ibits lock. See bug 12718 */
2437         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2438                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2439                 struct md_op_data *op_data;
2440
2441                 if (ibits == MDS_INODELOCK_LOOKUP)
2442                         oit.it_op = IT_LOOKUP;
2443
2444                 /* Call getattr by fid, so do not provide name at all. */
2445                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2446                                              dentry->d_inode, NULL, 0, 0,
2447                                              LUSTRE_OPC_ANY, NULL);
2448                 if (IS_ERR(op_data))
2449                         RETURN(PTR_ERR(op_data));
2450
2451                 oit.it_create_mode |= M_CHECK_STALE;
2452                 rc = md_intent_lock(exp, op_data, NULL, 0,
2453                                     /* we are not interested in name
2454                                        based lookup */
2455                                     &oit, 0, &req,
2456                                     ll_md_blocking_ast, 0);
2457                 ll_finish_md_op_data(op_data);
2458                 oit.it_create_mode &= ~M_CHECK_STALE;
2459                 if (rc < 0) {
2460                         rc = ll_inode_revalidate_fini(inode, rc);
2461                         GOTO (out, rc);
2462                 }
2463
2464                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2465                 if (rc != 0) {
2466                         ll_intent_release(&oit);
2467                         GOTO(out, rc);
2468                 }
2469
2470                 /* Unlinked? Unhash dentry, so it is not picked up later by
2471                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2472                    here to preserve get_cwd functionality on 2.6.
2473                    Bug 10503 */
2474                 if (!dentry->d_inode->i_nlink)
2475                         d_lustre_invalidate(dentry);
2476
2477                 ll_lookup_finish_locks(&oit, dentry);
2478         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2479                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2480                 obd_valid valid = OBD_MD_FLGETATTR;
2481                 struct md_op_data *op_data;
2482                 int ealen = 0;
2483
2484                 if (S_ISREG(inode->i_mode)) {
2485                         rc = ll_get_max_mdsize(sbi, &ealen);
2486                         if (rc)
2487                                 RETURN(rc);
2488                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2489                 }
2490
2491                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2492                                              0, ealen, LUSTRE_OPC_ANY,
2493                                              NULL);
2494                 if (IS_ERR(op_data))
2495                         RETURN(PTR_ERR(op_data));
2496
2497                 op_data->op_valid = valid;
2498                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2499                  * capa for this inode. Because we only keep capas of dirs
2500                  * fresh. */
2501                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2502                 ll_finish_md_op_data(op_data);
2503                 if (rc) {
2504                         rc = ll_inode_revalidate_fini(inode, rc);
2505                         RETURN(rc);
2506                 }
2507
2508                 rc = ll_prep_inode(&inode, req, NULL, NULL);
2509         }
2510 out:
2511         ptlrpc_req_finished(req);
2512         return rc;
2513 }
2514
2515 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2516                            __u64 ibits)
2517 {
2518         struct inode *inode = dentry->d_inode;
2519         int rc;
2520         ENTRY;
2521
2522         rc = __ll_inode_revalidate_it(dentry, it, ibits);
2523         if (rc != 0)
2524                 RETURN(rc);
2525
2526         /* if object isn't regular file, don't validate size */
2527         if (!S_ISREG(inode->i_mode)) {
2528                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2529                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2530                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2531         } else {
2532                 rc = ll_glimpse_size(inode);
2533         }
2534         RETURN(rc);
2535 }
2536
2537 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2538                   struct lookup_intent *it, struct kstat *stat)
2539 {
2540         struct inode *inode = de->d_inode;
2541         struct ll_sb_info *sbi = ll_i2sbi(inode);
2542         struct ll_inode_info *lli = ll_i2info(inode);
2543         int res = 0;
2544
2545         res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2546                                              MDS_INODELOCK_LOOKUP);
2547         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2548
2549         if (res)
2550                 return res;
2551
2552         stat->dev = inode->i_sb->s_dev;
2553         if (ll_need_32bit_api(sbi))
2554                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2555         else
2556                 stat->ino = inode->i_ino;
2557         stat->mode = inode->i_mode;
2558         stat->nlink = inode->i_nlink;
2559         stat->uid = inode->i_uid;
2560         stat->gid = inode->i_gid;
2561         stat->rdev = inode->i_rdev;
2562         stat->atime = inode->i_atime;
2563         stat->mtime = inode->i_mtime;
2564         stat->ctime = inode->i_ctime;
2565         stat->blksize = 1 << inode->i_blkbits;
2566
2567         stat->size = i_size_read(inode);
2568         stat->blocks = inode->i_blocks;
2569
2570         return 0;
2571 }
2572 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2573 {
2574         struct lookup_intent it = { .it_op = IT_GETATTR };
2575
2576         return ll_getattr_it(mnt, de, &it, stat);
2577 }
2578
#ifdef HAVE_LINUX_FIEMAP_H
/**
 * fiemap inode operation: translate the VFS fiemap_extent_info request into
 * a ll_user_fiemap buffer, run ll_do_fiemap() on it and copy the results
 * back.
 *
 * \param inode    inode to map
 * \param fieinfo  VFS request/result descriptor
 * \param start    start of the range, stored in fm_start
 * \param len      length of the range, stored in fm_length
 *
 * \retval -ENOMEM if the temporary buffer cannot be allocated
 * \retval the return value of ll_do_fiemap() otherwise
 */
int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len)
{
        struct ll_user_fiemap *fiemap;
        unsigned int extent_count = fieinfo->fi_extents_max;
        unsigned int copy_count;
        size_t num_bytes;
        int rc;
        ENTRY;

        num_bytes = sizeof(*fiemap) +
                    (extent_count * sizeof(struct ll_fiemap_extent));
        OBD_ALLOC_LARGE(fiemap, num_bytes);
        if (fiemap == NULL)
                RETURN(-ENOMEM);

        fiemap->fm_flags = fieinfo->fi_flags;
        fiemap->fm_extent_count = extent_count;
        fiemap->fm_start = start;
        fiemap->fm_length = len;

        /* The buffer only has room for fm_extents[0] when the caller asked
         * for at least one extent; with a zero count the copy would write
         * past the allocation, so it is skipped.
         * NOTE(review): fi_extents_start is presumably a user-space pointer
         * in the VFS fiemap API - confirm whether copy_from_user() and
         * copy_to_user() are required instead of memcpy(). */
        if (extent_count > 0)
                memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
                       sizeof(struct ll_fiemap_extent));

        rc = ll_do_fiemap(inode, fiemap, num_bytes);

        fieinfo->fi_flags = fiemap->fm_flags;
        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;

        /* Never copy back more extents than were allocated above, whatever
         * count ll_do_fiemap() left in fm_mapped_extents. */
        copy_count = fiemap->fm_mapped_extents;
        if (copy_count > extent_count)
                copy_count = extent_count;
        if (copy_count > 0)
                memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
                       copy_count * sizeof(struct ll_fiemap_extent));

        OBD_FREE_LARGE(fiemap, num_bytes);
        RETURN(rc);
}
#endif /* HAVE_LINUX_FIEMAP_H */
2613
2614 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2615 {
2616         struct ll_inode_info *lli = ll_i2info(inode);
2617         struct posix_acl *acl = NULL;
2618         ENTRY;
2619
2620         spin_lock(&lli->lli_lock);
2621         /* VFS' acl_permission_check->check_acl will release the refcount */
2622         acl = posix_acl_dup(lli->lli_posix_acl);
2623         spin_unlock(&lli->lli_lock);
2624
2625         RETURN(acl);
2626 }
2627
2628 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
2629 static int
2630 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2631 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2632 # else
2633 ll_check_acl(struct inode *inode, int mask)
2634 # endif
2635 {
2636 # ifdef CONFIG_FS_POSIX_ACL
2637         struct posix_acl *acl;
2638         int rc;
2639         ENTRY;
2640
2641 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
2642         if (flags & IPERM_FLAG_RCU)
2643                 return -ECHILD;
2644 #  endif
2645         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2646
2647         if (!acl)
2648                 RETURN(-EAGAIN);
2649
2650         rc = posix_acl_permission(inode, acl, mask);
2651         posix_acl_release(acl);
2652
2653         RETURN(rc);
2654 # else /* !CONFIG_FS_POSIX_ACL */
2655         return -EAGAIN;
2656 # endif /* CONFIG_FS_POSIX_ACL */
2657 }
2658 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2659
2660 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2661 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2662 #else
2663 # ifdef HAVE_INODE_PERMISION_2ARGS
2664 int ll_inode_permission(struct inode *inode, int mask)
2665 # else
2666 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2667 # endif
2668 #endif
2669 {
2670         int rc = 0;
2671         ENTRY;
2672
2673 #ifdef MAY_NOT_BLOCK
2674         if (mask & MAY_NOT_BLOCK)
2675                 return -ECHILD;
2676 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2677         if (flags & IPERM_FLAG_RCU)
2678                 return -ECHILD;
2679 #endif
2680
2681        /* as root inode are NOT getting validated in lookup operation,
2682         * need to do it before permission check. */
2683
2684         if (inode == inode->i_sb->s_root->d_inode) {
2685                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2686
2687                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2688                                               MDS_INODELOCK_LOOKUP);
2689                 if (rc)
2690                         RETURN(rc);
2691         }
2692
2693         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2694                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2695
2696         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2697                 return lustre_check_remote_perm(inode, mask);
2698
2699         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2700         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
2701
2702         RETURN(rc);
2703 }
2704
/*
 * Names of the vectored read/write file_operations members and of the
 * llite handlers assigned to them: readv/writev when HAVE_FILE_READV is
 * defined, aio_read/aio_write otherwise.
 */
#ifndef HAVE_FILE_READV
# define READ_METHOD aio_read
# define READ_FUNCTION ll_file_aio_read
# define WRITE_METHOD aio_write
# define WRITE_FUNCTION ll_file_aio_write
#else /* HAVE_FILE_READV */
# define READ_METHOD readv
# define READ_FUNCTION ll_file_readv
# define WRITE_METHOD writev
# define WRITE_FUNCTION ll_file_writev
#endif /* HAVE_FILE_READV */
2716
2717 /* -o localflock - only provides locally consistent flock locks */
2718 struct file_operations ll_file_operations = {
2719         .read           = ll_file_read,
2720         .READ_METHOD    = READ_FUNCTION,
2721         .write          = ll_file_write,
2722         .WRITE_METHOD   = WRITE_FUNCTION,
2723         .unlocked_ioctl = ll_file_ioctl,
2724         .open           = ll_file_open,
2725         .release        = ll_file_release,
2726         .mmap           = ll_file_mmap,
2727         .llseek         = ll_file_seek,
2728 #ifdef HAVE_KERNEL_SENDFILE
2729         .sendfile       = ll_file_sendfile,
2730 #endif
2731 #ifdef HAVE_KERNEL_SPLICE_READ
2732         .splice_read    = ll_file_splice_read,
2733 #endif
2734         .fsync          = ll_fsync,
2735         .flush          = ll_flush
2736 };
2737
2738 struct file_operations ll_file_operations_flock = {
2739         .read           = ll_file_read,
2740         .READ_METHOD    = READ_FUNCTION,
2741         .write          = ll_file_write,
2742         .WRITE_METHOD   = WRITE_FUNCTION,
2743         .unlocked_ioctl = ll_file_ioctl,
2744         .open           = ll_file_open,
2745         .release        = ll_file_release,
2746         .mmap           = ll_file_mmap,
2747         .llseek         = ll_file_seek,
2748 #ifdef HAVE_KERNEL_SENDFILE
2749         .sendfile       = ll_file_sendfile,
2750 #endif
2751 #ifdef HAVE_KERNEL_SPLICE_READ
2752         .splice_read    = ll_file_splice_read,
2753 #endif
2754         .fsync          = ll_fsync,
2755         .flush          = ll_flush,
2756         .flock          = ll_file_flock,
2757         .lock           = ll_file_flock
2758 };
2759
2760 /* These are for -o noflock - to return ENOSYS on flock calls */
2761 struct file_operations ll_file_operations_noflock = {
2762         .read           = ll_file_read,
2763         .READ_METHOD    = READ_FUNCTION,
2764         .write          = ll_file_write,
2765         .WRITE_METHOD   = WRITE_FUNCTION,
2766         .unlocked_ioctl = ll_file_ioctl,
2767         .open           = ll_file_open,
2768         .release        = ll_file_release,
2769         .mmap           = ll_file_mmap,
2770         .llseek         = ll_file_seek,
2771 #ifdef HAVE_KERNEL_SENDFILE
2772         .sendfile       = ll_file_sendfile,
2773 #endif
2774 #ifdef HAVE_KERNEL_SPLICE_READ
2775         .splice_read    = ll_file_splice_read,
2776 #endif
2777         .fsync          = ll_fsync,
2778         .flush          = ll_flush,
2779         .flock          = ll_file_noflock,
2780         .lock           = ll_file_noflock
2781 };
2782
2783 struct inode_operations ll_file_inode_operations = {
2784         .setattr        = ll_setattr,
2785         .getattr        = ll_getattr,
2786         .permission     = ll_inode_permission,
2787         .setxattr       = ll_setxattr,
2788         .getxattr       = ll_getxattr,
2789         .listxattr      = ll_listxattr,
2790         .removexattr    = ll_removexattr,
2791 #ifdef  HAVE_LINUX_FIEMAP_H
2792         .fiemap         = ll_fiemap,
2793 #endif
2794 #ifdef HAVE_IOP_GET_ACL
2795         .get_acl        = ll_get_acl,
2796 #endif
2797 };
2798
2799 /* dynamic ioctl number support routins */
2800 static struct llioc_ctl_data {
2801         struct rw_semaphore     ioc_sem;
2802         cfs_list_t              ioc_head;
2803 } llioc = {
2804         __RWSEM_INITIALIZER(llioc.ioc_sem),
2805         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2806 };
2807
2808
2809 struct llioc_data {
2810         cfs_list_t              iocd_list;
2811         unsigned int            iocd_size;
2812         llioc_callback_t        iocd_cb;
2813         unsigned int            iocd_count;
2814         unsigned int            iocd_cmd[0];
2815 };
2816
2817 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2818 {
2819         unsigned int size;
2820         struct llioc_data *in_data = NULL;
2821         ENTRY;
2822
2823         if (cb == NULL || cmd == NULL ||
2824             count > LLIOC_MAX_CMD || count < 0)
2825                 RETURN(NULL);
2826
2827         size = sizeof(*in_data) + count * sizeof(unsigned int);
2828         OBD_ALLOC(in_data, size);
2829         if (in_data == NULL)
2830                 RETURN(NULL);
2831
2832         memset(in_data, 0, sizeof(*in_data));
2833         in_data->iocd_size = size;
2834         in_data->iocd_cb = cb;
2835         in_data->iocd_count = count;
2836         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2837
2838         down_write(&llioc.ioc_sem);
2839         cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2840         up_write(&llioc.ioc_sem);
2841
2842         RETURN(in_data);
2843 }
2844
2845 void ll_iocontrol_unregister(void *magic)
2846 {
2847         struct llioc_data *tmp;
2848
2849         if (magic == NULL)
2850                 return;
2851
2852         down_write(&llioc.ioc_sem);
2853         cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2854                 if (tmp == magic) {
2855                         unsigned int size = tmp->iocd_size;
2856
2857                         cfs_list_del(&tmp->iocd_list);
2858                         up_write(&llioc.ioc_sem);
2859
2860                         OBD_FREE(tmp, size);
2861                         return;
2862                 }
2863         }
2864         up_write(&llioc.ioc_sem);
2865
2866         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2867 }
2868
2869 EXPORT_SYMBOL(ll_iocontrol_register);
2870 EXPORT_SYMBOL(ll_iocontrol_unregister);
2871
2872 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2873                         unsigned int cmd, unsigned long arg, int *rcp)
2874 {
2875         enum llioc_iter ret = LLIOC_CONT;
2876         struct llioc_data *data;
2877         int rc = -EINVAL, i;
2878
2879         down_read(&llioc.ioc_sem);
2880         cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2881                 for (i = 0; i < data->iocd_count; i++) {
2882                         if (cmd != data->iocd_cmd[i])
2883                                 continue;
2884
2885                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2886                         break;
2887                 }
2888
2889                 if (ret == LLIOC_STOP)
2890                         break;
2891         }
2892         up_read(&llioc.ioc_sem);
2893
2894         if (rcp)
2895                 *rcp = rc;
2896         return ret;
2897 }
2898
2899 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
2900 {
2901         struct ll_inode_info *lli = ll_i2info(inode);
2902         struct cl_env_nest nest;
2903         struct lu_env *env;
2904         int result;
2905         ENTRY;
2906
2907         if (lli->lli_clob == NULL)
2908                 RETURN(0);
2909
2910         env = cl_env_nested_get(&nest);
2911         if (IS_ERR(env))
2912                 RETURN(PTR_ERR(env));
2913
2914         result = cl_conf_set(env, lli->lli_clob, conf);
2915         cl_env_nested_put(&nest, env);
2916
2917         if (conf->coc_opc == OBJECT_CONF_SET) {
2918                 struct ldlm_lock *lock = conf->coc_lock;
2919
2920                 LASSERT(lock != NULL);
2921                 LASSERT(ldlm_has_layout(lock));
2922                 if (result == 0) {
2923                         /* it can only be allowed to match after layout is
2924                          * applied to inode otherwise false layout would be
2925                          * seen. Applying layout shoud happen before dropping
2926                          * the intent lock. */
2927                         ldlm_lock_allow_match(lock);
2928                 }
2929         }
2930         RETURN(result);
2931 }
2932
2933 /**
2934  * Apply the layout to the inode. Layout lock is held and will be released
2935  * in this function.
2936  */
2937 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
2938                                 struct inode *inode, __u32 *gen, bool reconf)
2939 {
2940         struct ll_inode_info *lli = ll_i2info(inode);
2941         struct ll_sb_info    *sbi = ll_i2sbi(inode);
2942         struct ldlm_lock *lock;
2943         struct lustre_md md = { NULL };
2944         struct cl_object_conf conf;
2945         int rc = 0;
2946         bool lvb_ready;
2947         ENTRY;
2948
2949         LASSERT(lustre_handle_is_used(lockh));
2950
2951         lock = ldlm_handle2lock(lockh);
2952         LASSERT(lock != NULL);
2953         LASSERT(ldlm_has_layout(lock));
2954
2955         LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
2956                 inode, PFID(&lli->lli_fid), reconf);
2957
2958         lock_res_and_lock(lock);
2959         lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
2960         unlock_res_and_lock(lock);
2961         /* checking lvb_ready is racy but this is okay. The worst case is
2962          * that multi processes may configure the file on the same time. */
2963         if (lvb_ready || !reconf) {
2964                 LDLM_LOCK_PUT(lock);
2965
2966                 rc = -ENODATA;
2967                 if (lvb_ready) {
2968                         /* layout_gen must be valid if layout lock is not
2969                          * cancelled and stripe has already set */
2970                         *gen = lli->lli_layout_gen;
2971                         rc = 0;
2972                 }
2973                 ldlm_lock_decref(lockh, mode);
2974                 RETURN(rc);
2975         }
2976
2977         /* for layout lock, lmm is returned in lock's lvb.
2978          * lvb_data is immutable if the lock is held so it's safe to access it
2979          * without res lock. See the description in ldlm_lock_decref_internal()
2980          * for the condition to free lvb_data of layout lock */
2981         if (lock->l_lvb_data != NULL) {
2982                 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
2983                                   lock->l_lvb_data, lock->l_lvb_len);
2984                 if (rc >= 0) {
2985                         if (md.lsm != NULL)
2986                                 *gen = md.lsm->lsm_layout_gen;
2987                         rc = 0;
2988                 } else {
2989                         CERROR("%s: file "DFID" unpackmd error: %d\n",
2990                                 ll_get_fsname(inode->i_sb, NULL, 0),
2991                                 PFID(&lli->lli_fid), rc);
2992                 }
2993         }
2994         if (rc < 0) {
2995                 LDLM_LOCK_PUT(lock);
2996                 ldlm_lock_decref(lockh, mode);
2997                 RETURN(rc);
2998         }
2999
3000         /* set layout to file. Unlikely this will fail as old layout was
3001          * surely eliminated */
3002         memset(&conf, 0, sizeof conf);
3003         conf.coc_opc = OBJECT_CONF_SET;
3004         conf.coc_inode = inode;
3005         conf.coc_lock = lock;
3006         conf.u.coc_md = &md;
3007         rc = ll_layout_conf(inode, &conf);
3008         LDLM_LOCK_PUT(lock);
3009
3010         ldlm_lock_decref(lockh, mode);
3011
3012         if (md.lsm != NULL)
3013                 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3014
3015         /* wait for IO to complete if it's still being used. */
3016         if (rc == -EBUSY) {
3017                 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3018                         ll_get_fsname(inode->i_sb, NULL, 0),
3019                         inode, PFID(&lli->lli_fid));
3020
3021                 memset(&conf, 0, sizeof conf);
3022                 conf.coc_opc = OBJECT_CONF_WAIT;
3023                 conf.coc_inode = inode;
3024                 rc = ll_layout_conf(inode, &conf);
3025                 if (rc == 0)
3026                         rc = -EAGAIN;
3027
3028                 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3029                         PFID(&lli->lli_fid), rc);
3030         }
3031
3032         RETURN(rc);
3033 }
3034
3035 /**
3036  * This function checks if there exists a LAYOUT lock on the client side,
3037  * or enqueues it if it doesn't have one in cache.
3038  *
3039  * This function will not hold layout lock so it may be revoked any time after
3040  * this function returns. Any operations depend on layout should be redone
3041  * in that case.
3042  *
3043  * This function should be called before lov_io_init() to get an uptodate
3044  * layout version, the caller should save the version number and after IO
3045  * is finished, this function should be called again to verify that layout
3046  * is not changed during IO time.
3047  */
3048 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3049 {
3050         struct ll_inode_info  *lli = ll_i2info(inode);
3051         struct ll_sb_info     *sbi = ll_i2sbi(inode);
3052         struct md_op_data     *op_data;
3053         struct lookup_intent   it;
3054         struct lustre_handle   lockh;
3055         ldlm_mode_t            mode;
3056         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3057                                            .ei_mode = LCK_CR,
3058                                            .ei_cb_bl = ll_md_blocking_ast,
3059                                            .ei_cb_cp = ldlm_completion_ast,
3060                                            .ei_cbdata = inode };
3061         int rc;
3062         ENTRY;
3063
3064         *gen = LL_LAYOUT_GEN_ZERO;
3065         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3066                 RETURN(0);
3067
3068         /* sanity checks */
3069         LASSERT(fid_is_sane(ll_inode2fid(inode)));
3070         LASSERT(S_ISREG(inode->i_mode));
3071
3072         /* mostly layout lock is caching on the local side, so try to match
3073          * it before grabbing layout lock mutex. */
3074         mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3075         if (mode != 0) { /* hit cached lock */
3076                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3077                 if (rc == 0)
3078                         RETURN(0);
3079
3080                 /* better hold lli_layout_mutex to try again otherwise
3081                  * it will have starvation problem. */
3082         }
3083
3084         /* take layout lock mutex to enqueue layout lock exclusively. */
3085         mutex_lock(&lli->lli_layout_mutex);
3086
3087 again:
3088         /* try again. Maybe somebody else has done this. */
3089         mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3090         if (mode != 0) { /* hit cached lock */
3091                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3092                 if (rc == -EAGAIN)
3093                         goto again;
3094
3095                 mutex_unlock(&lli->lli_layout_mutex);
3096                 RETURN(rc);
3097         }
3098
3099         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3100                         0, 0, LUSTRE_OPC_ANY, NULL);
3101         if (IS_ERR(op_data)) {
3102                 mutex_unlock(&lli->lli_layout_mutex);
3103                 RETURN(PTR_ERR(op_data));
3104         }
3105
3106         /* have to enqueue one */
3107         memset(&it, 0, sizeof(it));
3108         it.it_op = IT_LAYOUT;
3109         lockh.cookie = 0ULL;
3110
3111         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3112                         ll_get_fsname(inode->i_sb, NULL, 0), inode,
3113                         PFID(&lli->lli_fid));
3114
3115         rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3116                         NULL, 0, NULL, 0);
3117         if (it.d.lustre.it_data != NULL)
3118                 ptlrpc_req_finished(it.d.lustre.it_data);
3119         it.d.lustre.it_data = NULL;
3120
3121         ll_finish_md_op_data(op_data);
3122
3123         mode = it.d.lustre.it_lock_mode;
3124         it.d.lustre.it_lock_mode = 0;
3125         ll_intent_drop_lock(&it);
3126
3127         if (rc == 0) {
3128                 /* set lock data in case this is a new lock */
3129                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3130                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3131                 if (rc == -EAGAIN)
3132                         goto again;
3133         }
3134         mutex_unlock(&lli->lli_layout_mutex);
3135
3136         RETURN(rc);
3137 }